aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks202
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/acct.c3
-rw-r--r--kernel/audit_tree.c13
-rw-r--r--kernel/auditsc.c1
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/capability.c15
-rw-r--r--kernel/cgroup.c16
-rw-r--r--kernel/cpu.c41
-rw-r--r--kernel/cpuset.c45
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/exit.c73
-rw-r--r--kernel/fork.c42
-rw-r--r--kernel/futex.c137
-rw-r--r--kernel/hrtimer.c170
-rw-r--r--kernel/hung_task.c2
-rw-r--r--kernel/hw_breakpoint.c493
-rw-r--r--kernel/irq/autoprobe.c20
-rw-r--r--kernel/irq/chip.c92
-rw-r--r--kernel/irq/handle.c22
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/manage.c52
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/numa_migrate.c8
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/proc.c44
-rw-r--r--kernel/irq/spurious.c32
-rw-r--r--kernel/itimer.c7
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kexec.c65
-rw-r--r--kernel/kfifo.c410
-rw-r--r--kernel/kgdb.c67
-rw-r--r--kernel/kmod.c20
-rw-r--r--kernel/kprobes.c72
-rw-r--r--kernel/ksysfs.c21
-rw-r--r--kernel/kthread.c14
-rw-r--r--kernel/lockdep.c71
-rw-r--r--kernel/module.c205
-rw-r--r--kernel/mutex-debug.h12
-rw-r--r--kernel/mutex.c4
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/params.c25
-rw-r--r--kernel/perf_event.c903
-rw-r--r--kernel/pid.c12
-rw-r--r--kernel/pm_qos_params.c20
-rw-r--r--kernel/posix-cpu-timers.c5
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/console.c7
-rw-r--r--kernel/power/hibernate.c41
-rw-r--r--kernel/power/main.c1
-rw-r--r--kernel/power/process.c14
-rw-r--r--kernel/power/suspend_test.c5
-rw-r--r--kernel/power/swap.c146
-rw-r--r--kernel/power/swsusp.c130
-rw-r--r--kernel/printk.c127
-rw-r--r--kernel/rcupdate.c122
-rw-r--r--kernel/rcutiny.c282
-rw-r--r--kernel/rcutorture.c73
-rw-r--r--kernel/rcutree.c493
-rw-r--r--kernel/rcutree.h84
-rw-r--r--kernel/rcutree_plugin.h343
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/resource.c30
-rw-r--r--kernel/rtmutex-debug.c4
-rw-r--r--kernel/rtmutex.c106
-rw-r--r--kernel/sched.c1016
-rw-r--r--kernel/sched_clock.c23
-rw-r--r--kernel/sched_cpupri.c10
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c21
-rw-r--r--kernel/sched_fair.c317
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_idletask.c6
-rw-r--r--kernel/sched_rt.c127
-rw-r--r--kernel/signal.c139
-rw-r--r--kernel/slow-work-debugfs.c227
-rw-r--r--kernel/slow-work.c519
-rw-r--r--kernel/slow-work.h72
-rw-r--r--kernel/smp.c91
-rw-r--r--kernel/softirq.c21
-rw-r--r--kernel/softlockup.c69
-rw-r--r--kernel/spinlock.c448
-rw-r--r--kernel/srcu.c74
-rw-r--r--kernel/sys.c70
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c934
-rw-r--r--kernel/sysctl_binary.c1543
-rw-r--r--kernel/sysctl_check.c1378
-rw-r--r--kernel/time.c30
-rw-r--r--kernel/time/clockevents.c46
-rw-r--r--kernel/time/clocksource.c123
-rw-r--r--kernel/time/tick-broadcast.c42
-rw-r--r--kernel/time/tick-common.c20
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-oneshot.c4
-rw-r--r--kernel/time/tick-sched.c141
-rw-r--r--kernel/time/timecompare.c8
-rw-r--r--kernel/time/timekeeping.c104
-rw-r--r--kernel/time/timer_list.c25
-rw-r--r--kernel/time/timer_stats.c18
-rw-r--r--kernel/timer.c5
-rw-r--r--kernel/trace/Kconfig144
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/ftrace.c419
-rw-r--r--kernel/trace/power-traces.c2
-rw-r--r--kernel/trace/ring_buffer.c109
-rw-r--r--kernel/trace/ring_buffer_benchmark.c85
-rw-r--r--kernel/trace/trace.c333
-rw-r--r--kernel/trace/trace.h107
-rw-r--r--kernel/trace/trace_clock.c16
-rw-r--r--kernel/trace/trace_entries.h16
-rw-r--r--kernel/trace/trace_event_profile.c49
-rw-r--r--kernel/trace/trace_events.c228
-rw-r--r--kernel/trace/trace_events_filter.c441
-rw-r--r--kernel/trace/trace_export.c54
-rw-r--r--kernel/trace/trace_functions_graph.c169
-rw-r--r--kernel/trace/trace_hw_branches.c51
-rw-r--r--kernel/trace/trace_irqsoff.c2
-rw-r--r--kernel/trace/trace_kprobe.c1553
-rw-r--r--kernel/trace/trace_ksym.c519
-rw-r--r--kernel/trace/trace_output.c80
-rw-r--r--kernel/trace/trace_sched_wakeup.c16
-rw-r--r--kernel/trace/trace_selftest.c59
-rw-r--r--kernel/trace/trace_stack.c40
-rw-r--r--kernel/trace/trace_syscalls.c235
-rw-r--r--kernel/trace/trace_sysprof.c1
-rw-r--r--kernel/user-return-notifier.c44
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/utsname_sysctl.c31
-rw-r--r--kernel/workqueue.c166
132 files changed, 12188 insertions, 5977 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 00000000000..88c92fb4461
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,202 @@
1#
2# The ARCH_INLINE foo is necessary because select ignores "depends on"
3#
4config ARCH_INLINE_SPIN_TRYLOCK
5 bool
6
7config ARCH_INLINE_SPIN_TRYLOCK_BH
8 bool
9
10config ARCH_INLINE_SPIN_LOCK
11 bool
12
13config ARCH_INLINE_SPIN_LOCK_BH
14 bool
15
16config ARCH_INLINE_SPIN_LOCK_IRQ
17 bool
18
19config ARCH_INLINE_SPIN_LOCK_IRQSAVE
20 bool
21
22config ARCH_INLINE_SPIN_UNLOCK
23 bool
24
25config ARCH_INLINE_SPIN_UNLOCK_BH
26 bool
27
28config ARCH_INLINE_SPIN_UNLOCK_IRQ
29 bool
30
31config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
32 bool
33
34
35config ARCH_INLINE_READ_TRYLOCK
36 bool
37
38config ARCH_INLINE_READ_LOCK
39 bool
40
41config ARCH_INLINE_READ_LOCK_BH
42 bool
43
44config ARCH_INLINE_READ_LOCK_IRQ
45 bool
46
47config ARCH_INLINE_READ_LOCK_IRQSAVE
48 bool
49
50config ARCH_INLINE_READ_UNLOCK
51 bool
52
53config ARCH_INLINE_READ_UNLOCK_BH
54 bool
55
56config ARCH_INLINE_READ_UNLOCK_IRQ
57 bool
58
59config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
60 bool
61
62
63config ARCH_INLINE_WRITE_TRYLOCK
64 bool
65
66config ARCH_INLINE_WRITE_LOCK
67 bool
68
69config ARCH_INLINE_WRITE_LOCK_BH
70 bool
71
72config ARCH_INLINE_WRITE_LOCK_IRQ
73 bool
74
75config ARCH_INLINE_WRITE_LOCK_IRQSAVE
76 bool
77
78config ARCH_INLINE_WRITE_UNLOCK
79 bool
80
81config ARCH_INLINE_WRITE_UNLOCK_BH
82 bool
83
84config ARCH_INLINE_WRITE_UNLOCK_IRQ
85 bool
86
87config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
88 bool
89
90#
91# lock_* functions are inlined when:
92# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
93#
94# trylock_* functions are inlined when:
95# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
96#
97# unlock and unlock_irq functions are inlined when:
98# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
99# or
100# - DEBUG_SPINLOCK=n and PREEMPT=n
101#
102# unlock_bh and unlock_irqrestore functions are inlined when:
103# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
104#
105
106config INLINE_SPIN_TRYLOCK
107 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
108
109config INLINE_SPIN_TRYLOCK_BH
110 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
111
112config INLINE_SPIN_LOCK
113 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
114
115config INLINE_SPIN_LOCK_BH
116 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
117 ARCH_INLINE_SPIN_LOCK_BH
118
119config INLINE_SPIN_LOCK_IRQ
120 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
121 ARCH_INLINE_SPIN_LOCK_IRQ
122
123config INLINE_SPIN_LOCK_IRQSAVE
124 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
125 ARCH_INLINE_SPIN_LOCK_IRQSAVE
126
127config INLINE_SPIN_UNLOCK
128 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
129
130config INLINE_SPIN_UNLOCK_BH
131 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
132
133config INLINE_SPIN_UNLOCK_IRQ
134 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ)
135
136config INLINE_SPIN_UNLOCK_IRQRESTORE
137 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
138
139
140config INLINE_READ_TRYLOCK
141 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
142
143config INLINE_READ_LOCK
144 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
145
146config INLINE_READ_LOCK_BH
147 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
148 ARCH_INLINE_READ_LOCK_BH
149
150config INLINE_READ_LOCK_IRQ
151 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
152 ARCH_INLINE_READ_LOCK_IRQ
153
154config INLINE_READ_LOCK_IRQSAVE
155 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
156 ARCH_INLINE_READ_LOCK_IRQSAVE
157
158config INLINE_READ_UNLOCK
159 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
160
161config INLINE_READ_UNLOCK_BH
162 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
163
164config INLINE_READ_UNLOCK_IRQ
165 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ)
166
167config INLINE_READ_UNLOCK_IRQRESTORE
168 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
169
170
171config INLINE_WRITE_TRYLOCK
172 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
173
174config INLINE_WRITE_LOCK
175 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
176
177config INLINE_WRITE_LOCK_BH
178 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
179 ARCH_INLINE_WRITE_LOCK_BH
180
181config INLINE_WRITE_LOCK_IRQ
182 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
183 ARCH_INLINE_WRITE_LOCK_IRQ
184
185config INLINE_WRITE_LOCK_IRQSAVE
186 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
187 ARCH_INLINE_WRITE_LOCK_IRQSAVE
188
189config INLINE_WRITE_UNLOCK
190 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
191
192config INLINE_WRITE_UNLOCK_BH
193 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
194
195config INLINE_WRITE_UNLOCK_IRQ
196 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ)
197
198config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200
201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b..864ff75d65f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
24endif 25endif
25 26
26obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
@@ -82,6 +83,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
82obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
83obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
84obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
86obj-$(CONFIG_TINY_RCU) += rcutiny.o
85obj-$(CONFIG_RELAY) += relay.o 87obj-$(CONFIG_RELAY) += relay.o
86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 88obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 89obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -94,7 +96,10 @@ obj-$(CONFIG_X86_DS) += trace/
94obj-$(CONFIG_RING_BUFFER) += trace/ 96obj-$(CONFIG_RING_BUFFER) += trace/
95obj-$(CONFIG_SMP) += sched_cpupri.o 97obj-$(CONFIG_SMP) += sched_cpupri.o
96obj-$(CONFIG_SLOW_WORK) += slow-work.o 98obj-$(CONFIG_SLOW_WORK) += slow-work.o
99obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
97obj-$(CONFIG_PERF_EVENTS) += perf_event.o 100obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
98 103
99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 9a4715a2f6b..a6605ca921b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -536,7 +536,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
536 do_div(elapsed, AHZ); 536 do_div(elapsed, AHZ);
537 ac.ac_btime = get_seconds() - elapsed; 537 ac.ac_btime = get_seconds() - elapsed;
538 /* we really need to bite the bullet and change layout */ 538 /* we really need to bite the bullet and change layout */
539 current_uid_gid(&ac.ac_uid, &ac.ac_gid); 539 ac.ac_uid = orig_cred->uid;
540 ac.ac_gid = orig_cred->gid;
540#if ACCT_VERSION==2 541#if ACCT_VERSION==2
541 ac.ac_ahz = AHZ; 542 ac.ac_ahz = AHZ;
542#endif 543#endif
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2451dc6f328..4b05bd9479d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -277,7 +277,7 @@ static void untag_chunk(struct node *p)
277 owner->root = NULL; 277 owner->root = NULL;
278 } 278 }
279 279
280 for (i = j = 0; i < size; i++, j++) { 280 for (i = j = 0; j <= size; i++, j++) {
281 struct audit_tree *s; 281 struct audit_tree *s;
282 if (&chunk->owners[j] == p) { 282 if (&chunk->owners[j] == p) {
283 list_del_init(&p->list); 283 list_del_init(&p->list);
@@ -290,7 +290,7 @@ static void untag_chunk(struct node *p)
290 if (!s) /* result of earlier fallback */ 290 if (!s) /* result of earlier fallback */
291 continue; 291 continue;
292 get_tree(s); 292 get_tree(s);
293 list_replace_init(&chunk->owners[i].list, &new->owners[j].list); 293 list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
294 } 294 }
295 295
296 list_replace_rcu(&chunk->hash, &new->hash); 296 list_replace_rcu(&chunk->hash, &new->hash);
@@ -373,15 +373,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
373 for (n = 0; n < old->count; n++) { 373 for (n = 0; n < old->count; n++) {
374 if (old->owners[n].owner == tree) { 374 if (old->owners[n].owner == tree) {
375 spin_unlock(&hash_lock); 375 spin_unlock(&hash_lock);
376 put_inotify_watch(watch); 376 put_inotify_watch(&old->watch);
377 return 0; 377 return 0;
378 } 378 }
379 } 379 }
380 spin_unlock(&hash_lock); 380 spin_unlock(&hash_lock);
381 381
382 chunk = alloc_chunk(old->count + 1); 382 chunk = alloc_chunk(old->count + 1);
383 if (!chunk) 383 if (!chunk) {
384 put_inotify_watch(&old->watch);
384 return -ENOMEM; 385 return -ENOMEM;
386 }
385 387
386 mutex_lock(&inode->inotify_mutex); 388 mutex_lock(&inode->inotify_mutex);
387 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 389 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
@@ -425,7 +427,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
425 spin_unlock(&hash_lock); 427 spin_unlock(&hash_lock);
426 inotify_evict_watch(&old->watch); 428 inotify_evict_watch(&old->watch);
427 mutex_unlock(&inode->inotify_mutex); 429 mutex_unlock(&inode->inotify_mutex);
428 put_inotify_watch(&old->watch); 430 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
431 put_inotify_watch(&old->watch); /* and kill it */
429 return 0; 432 return 0;
430} 433}
431 434
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 267e484f019..fc0f928167e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -250,7 +250,6 @@ struct audit_context {
250#endif 250#endif
251}; 251};
252 252
253#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
254static inline int open_arg(int flags, int mask) 253static inline int open_arg(int flags, int mask)
255{ 254{
256 int n = ACC_MODE(flags); 255 int n = ACC_MODE(flags);
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 3c530138183..98a51f26c13 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -12,7 +12,7 @@
12 12
13void foo(void) 13void foo(void)
14{ 14{
15 /* The enum constants to put into include/linux/bounds.h */ 15 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
18 /* End of constants */ 18 /* End of constants */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f..7f876e60521 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
29EXPORT_SYMBOL(__cap_full_set); 29EXPORT_SYMBOL(__cap_full_set);
30EXPORT_SYMBOL(__cap_init_eff_set); 30EXPORT_SYMBOL(__cap_init_eff_set);
31 31
32#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
33int file_caps_enabled = 1; 32int file_caps_enabled = 1;
34 33
35static int __init file_caps_disable(char *str) 34static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
38 return 1; 37 return 1;
39} 38}
40__setup("no_file_caps", file_caps_disable); 39__setup("no_file_caps", file_caps_disable);
41#endif
42 40
43/* 41/*
44 * More recent versions of libcap are available from: 42 * More recent versions of libcap are available from:
@@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
169 kernel_cap_t pE, pI, pP; 167 kernel_cap_t pE, pI, pP;
170 168
171 ret = cap_validate_magic(header, &tocopy); 169 ret = cap_validate_magic(header, &tocopy);
172 if (ret != 0) 170 if ((dataptr == NULL) || (ret != 0))
173 return ret; 171 return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
174 172
175 if (get_user(pid, &header->pid)) 173 if (get_user(pid, &header->pid))
176 return -EFAULT; 174 return -EFAULT;
@@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
238SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) 236SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
239{ 237{
240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 238 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
241 unsigned i, tocopy; 239 unsigned i, tocopy, copybytes;
242 kernel_cap_t inheritable, permitted, effective; 240 kernel_cap_t inheritable, permitted, effective;
243 struct cred *new; 241 struct cred *new;
244 int ret; 242 int ret;
@@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
255 if (pid != 0 && pid != task_pid_vnr(current)) 253 if (pid != 0 && pid != task_pid_vnr(current))
256 return -EPERM; 254 return -EPERM;
257 255
258 if (copy_from_user(&kdata, data, 256 copybytes = tocopy * sizeof(struct __user_cap_data_struct);
259 tocopy * sizeof(struct __user_cap_data_struct))) 257 if (copybytes > sizeof(kdata))
258 return -EFAULT;
259
260 if (copy_from_user(&kdata, data, copybytes))
260 return -EFAULT; 261 return -EFAULT;
261 262
262 for (i = 0; i < tocopy; i++) { 263 for (i = 0; i < tocopy; i++) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ca83b73fba1..aa3bee56644 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1710 return -EFAULT; 1710 return -EFAULT;
1711 1711
1712 buffer[nbytes] = 0; /* nul-terminate */ 1712 buffer[nbytes] = 0; /* nul-terminate */
1713 strstrip(buffer);
1714 if (cft->write_u64) { 1713 if (cft->write_u64) {
1715 u64 val = simple_strtoull(buffer, &end, 0); 1714 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
1716 if (*end) 1715 if (*end)
1717 return -EINVAL; 1716 return -EINVAL;
1718 retval = cft->write_u64(cgrp, cft, val); 1717 retval = cft->write_u64(cgrp, cft, val);
1719 } else { 1718 } else {
1720 s64 val = simple_strtoll(buffer, &end, 0); 1719 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
1721 if (*end) 1720 if (*end)
1722 return -EINVAL; 1721 return -EINVAL;
1723 retval = cft->write_s64(cgrp, cft, val); 1722 retval = cft->write_s64(cgrp, cft, val);
@@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1753 } 1752 }
1754 1753
1755 buffer[nbytes] = 0; /* nul-terminate */ 1754 buffer[nbytes] = 0; /* nul-terminate */
1756 strstrip(buffer); 1755 retval = cft->write_string(cgrp, cft, strstrip(buffer));
1757 retval = cft->write_string(cgrp, cft, buffer);
1758 if (!retval) 1756 if (!retval)
1759 retval = nbytes; 1757 retval = nbytes;
1760out: 1758out:
@@ -2470,7 +2468,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2470 /* make sure l doesn't vanish out from under us */ 2468 /* make sure l doesn't vanish out from under us */
2471 down_write(&l->mutex); 2469 down_write(&l->mutex);
2472 mutex_unlock(&cgrp->pidlist_mutex); 2470 mutex_unlock(&cgrp->pidlist_mutex);
2473 l->use_count++;
2474 return l; 2471 return l;
2475 } 2472 }
2476 } 2473 }
@@ -2939,14 +2936,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2939 2936
2940 for_each_subsys(root, ss) { 2937 for_each_subsys(root, ss) {
2941 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2939
2942 if (IS_ERR(css)) { 2940 if (IS_ERR(css)) {
2943 err = PTR_ERR(css); 2941 err = PTR_ERR(css);
2944 goto err_destroy; 2942 goto err_destroy;
2945 } 2943 }
2946 init_cgroup_css(css, ss, cgrp); 2944 init_cgroup_css(css, ss, cgrp);
2947 if (ss->use_id) 2945 if (ss->use_id) {
2948 if (alloc_css_id(ss, parent, cgrp)) 2946 err = alloc_css_id(ss, parent, cgrp);
2947 if (err)
2949 goto err_destroy; 2948 goto err_destroy;
2949 }
2950 /* At error, ->destroy() callback has to free assigned ID. */ 2950 /* At error, ->destroy() callback has to free assigned ID. */
2951 } 2951 }
2952 2952
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb21..677f25376a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu)
151 151
152 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 153 for_each_process(p) {
154 if (task_cpu(p) == cpu && 154 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 155 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 156 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 158 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 159 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 160 p->state, p->flags);
161 } 161 }
162 write_unlock_irq(&tasklist_lock); 162 write_unlock_irq(&tasklist_lock);
163} 163}
@@ -209,9 +209,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
209 return -ENOMEM; 209 return -ENOMEM;
210 210
211 cpu_hotplug_begin(); 211 cpu_hotplug_begin();
212 set_cpu_active(cpu, false);
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 214 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 215 if (err == NOTIFY_BAD) {
216 set_cpu_active(cpu, true);
217
215 nr_calls--; 218 nr_calls--;
216 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 219 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
217 hcpu, nr_calls, NULL); 220 hcpu, nr_calls, NULL);
@@ -223,11 +226,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 226
224 /* Ensure that we are not runnable on dying cpu */ 227 /* Ensure that we are not runnable on dying cpu */
225 cpumask_copy(old_allowed, &current->cpus_allowed); 228 cpumask_copy(old_allowed, &current->cpus_allowed);
226 set_cpus_allowed_ptr(current, 229 set_cpus_allowed_ptr(current, cpu_active_mask);
227 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
228 230
229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 231 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
230 if (err) { 232 if (err) {
233 set_cpu_active(cpu, true);
231 /* CPU didn't die: tell everyone. Can't complain. */ 234 /* CPU didn't die: tell everyone. Can't complain. */
232 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
233 hcpu) == NOTIFY_BAD) 236 hcpu) == NOTIFY_BAD)
@@ -278,23 +281,8 @@ int __ref cpu_down(unsigned int cpu)
278 goto out; 281 goto out;
279 } 282 }
280 283
281 set_cpu_active(cpu, false);
282
283 /*
284 * Make sure the all cpus did the reschedule and are not
285 * using stale version of the cpu_active_mask.
286 * This is not strictly necessary becuase stop_machine()
287 * that we run down the line already provides the required
288 * synchronization. But it's really a side effect and we do not
289 * want to depend on the innards of the stop_machine here.
290 */
291 synchronize_sched();
292
293 err = _cpu_down(cpu, 0); 284 err = _cpu_down(cpu, 0);
294 285
295 if (cpu_online(cpu))
296 set_cpu_active(cpu, true);
297
298out: 286out:
299 cpu_maps_update_done(); 287 cpu_maps_update_done();
300 stop_machine_destroy(); 288 stop_machine_destroy();
@@ -383,19 +371,20 @@ int disable_nonboot_cpus(void)
383 return error; 371 return error;
384 cpu_maps_update_begin(); 372 cpu_maps_update_begin();
385 first_cpu = cpumask_first(cpu_online_mask); 373 first_cpu = cpumask_first(cpu_online_mask);
386 /* We take down all of the non-boot CPUs in one shot to avoid races 374 /*
375 * We take down all of the non-boot CPUs in one shot to avoid races
387 * with the userspace trying to use the CPU hotplug at the same time 376 * with the userspace trying to use the CPU hotplug at the same time
388 */ 377 */
389 cpumask_clear(frozen_cpus); 378 cpumask_clear(frozen_cpus);
379
390 printk("Disabling non-boot CPUs ...\n"); 380 printk("Disabling non-boot CPUs ...\n");
391 for_each_online_cpu(cpu) { 381 for_each_online_cpu(cpu) {
392 if (cpu == first_cpu) 382 if (cpu == first_cpu)
393 continue; 383 continue;
394 error = _cpu_down(cpu, 1); 384 error = _cpu_down(cpu, 1);
395 if (!error) { 385 if (!error)
396 cpumask_set_cpu(cpu, frozen_cpus); 386 cpumask_set_cpu(cpu, frozen_cpus);
397 printk("CPU%d is down\n", cpu); 387 else {
398 } else {
399 printk(KERN_ERR "Error taking CPU%d down: %d\n", 388 printk(KERN_ERR "Error taking CPU%d down: %d\n",
400 cpu, error); 389 cpu, error);
401 break; 390 break;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b5cb469d254..ba401fab459 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
537 * element of the partition (one sched domain) to be passed to 537 * element of the partition (one sched domain) to be passed to
538 * partition_sched_domains(). 538 * partition_sched_domains().
539 */ 539 */
540/* FIXME: see the FIXME in partition_sched_domains() */ 540static int generate_sched_domains(cpumask_var_t **domains,
541static int generate_sched_domains(struct cpumask **domains,
542 struct sched_domain_attr **attributes) 541 struct sched_domain_attr **attributes)
543{ 542{
544 LIST_HEAD(q); /* queue of cpusets to be scanned */ 543 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
546 struct cpuset **csa; /* array of all cpuset ptrs */ 545 struct cpuset **csa; /* array of all cpuset ptrs */
547 int csn; /* how many cpuset ptrs in csa so far */ 546 int csn; /* how many cpuset ptrs in csa so far */
548 int i, j, k; /* indices for partition finding loops */ 547 int i, j, k; /* indices for partition finding loops */
549 struct cpumask *doms; /* resulting partition; i.e. sched domains */ 548 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
550 struct sched_domain_attr *dattr; /* attributes for custom domains */ 549 struct sched_domain_attr *dattr; /* attributes for custom domains */
551 int ndoms = 0; /* number of sched domains in result */ 550 int ndoms = 0; /* number of sched domains in result */
552 int nslot; /* next empty doms[] struct cpumask slot */ 551 int nslot; /* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
557 556
558 /* Special case for the 99% of systems with one, full, sched domain */ 557 /* Special case for the 99% of systems with one, full, sched domain */
559 if (is_sched_load_balance(&top_cpuset)) { 558 if (is_sched_load_balance(&top_cpuset)) {
560 doms = kmalloc(cpumask_size(), GFP_KERNEL); 559 ndoms = 1;
560 doms = alloc_sched_domains(ndoms);
561 if (!doms) 561 if (!doms)
562 goto done; 562 goto done;
563 563
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
566 *dattr = SD_ATTR_INIT; 566 *dattr = SD_ATTR_INIT;
567 update_domain_attr_tree(dattr, &top_cpuset); 567 update_domain_attr_tree(dattr, &top_cpuset);
568 } 568 }
569 cpumask_copy(doms, top_cpuset.cpus_allowed); 569 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
570 570
571 ndoms = 1;
572 goto done; 571 goto done;
573 } 572 }
574 573
@@ -636,7 +635,7 @@ restart:
636 * Now we know how many domains to create. 635 * Now we know how many domains to create.
637 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 636 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
638 */ 637 */
639 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); 638 doms = alloc_sched_domains(ndoms);
640 if (!doms) 639 if (!doms)
641 goto done; 640 goto done;
642 641
@@ -656,7 +655,7 @@ restart:
656 continue; 655 continue;
657 } 656 }
658 657
659 dp = doms + nslot; 658 dp = doms[nslot];
660 659
661 if (nslot == ndoms) { 660 if (nslot == ndoms) {
662 static int warnings = 10; 661 static int warnings = 10;
@@ -718,7 +717,7 @@ done:
718static void do_rebuild_sched_domains(struct work_struct *unused) 717static void do_rebuild_sched_domains(struct work_struct *unused)
719{ 718{
720 struct sched_domain_attr *attr; 719 struct sched_domain_attr *attr;
721 struct cpumask *doms; 720 cpumask_var_t *doms;
722 int ndoms; 721 int ndoms;
723 722
724 get_online_cpus(); 723 get_online_cpus();
@@ -738,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
738{ 737{
739} 738}
740 739
741static int generate_sched_domains(struct cpumask **domains, 740static int generate_sched_domains(cpumask_var_t **domains,
742 struct sched_domain_attr **attributes) 741 struct sched_domain_attr **attributes)
743{ 742{
744 *domains = NULL; 743 *domains = NULL;
@@ -873,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
873 if (retval < 0) 872 if (retval < 0)
874 return retval; 873 return retval;
875 874
876 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) 875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
877 return -EINVAL; 876 return -EINVAL;
878 } 877 }
879 retval = validate_change(cs, trialcs); 878 retval = validate_change(cs, trialcs);
@@ -2011,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2011 } 2010 }
2012 2011
2013 /* Continue past cpusets with all cpus, mems online */ 2012 /* Continue past cpusets with all cpus, mems online */
2014 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && 2013 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2015 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2016 continue; 2015 continue;
2017 2016
@@ -2020,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2020 /* Remove offline cpus and mems from this cpuset. */ 2019 /* Remove offline cpus and mems from this cpuset. */
2021 mutex_lock(&callback_mutex); 2020 mutex_lock(&callback_mutex);
2022 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2023 cpu_online_mask); 2022 cpu_active_mask);
2024 nodes_and(cp->mems_allowed, cp->mems_allowed, 2023 nodes_and(cp->mems_allowed, cp->mems_allowed,
2025 node_states[N_HIGH_MEMORY]); 2024 node_states[N_HIGH_MEMORY]);
2026 mutex_unlock(&callback_mutex); 2025 mutex_unlock(&callback_mutex);
@@ -2052,14 +2051,16 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2052 unsigned long phase, void *unused_cpu) 2051 unsigned long phase, void *unused_cpu)
2053{ 2052{
2054 struct sched_domain_attr *attr; 2053 struct sched_domain_attr *attr;
2055 struct cpumask *doms; 2054 cpumask_var_t *doms;
2056 int ndoms; 2055 int ndoms;
2057 2056
2058 switch (phase) { 2057 switch (phase) {
2059 case CPU_ONLINE: 2058 case CPU_ONLINE:
2060 case CPU_ONLINE_FROZEN: 2059 case CPU_ONLINE_FROZEN:
2061 case CPU_DEAD: 2060 case CPU_DOWN_PREPARE:
2062 case CPU_DEAD_FROZEN: 2061 case CPU_DOWN_PREPARE_FROZEN:
2062 case CPU_DOWN_FAILED:
2063 case CPU_DOWN_FAILED_FROZEN:
2063 break; 2064 break;
2064 2065
2065 default: 2066 default:
@@ -2068,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2068 2069
2069 cgroup_lock(); 2070 cgroup_lock();
2070 mutex_lock(&callback_mutex); 2071 mutex_lock(&callback_mutex);
2071 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2072 mutex_unlock(&callback_mutex); 2073 mutex_unlock(&callback_mutex);
2073 scan_for_empty_cpusets(&top_cpuset); 2074 scan_for_empty_cpusets(&top_cpuset);
2074 ndoms = generate_sched_domains(&doms, &attr); 2075 ndoms = generate_sched_domains(&doms, &attr);
@@ -2115,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2115 2116
2116void __init cpuset_init_smp(void) 2117void __init cpuset_init_smp(void)
2117{ 2118{
2118 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2119 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2119 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2120 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2120 2121
2121 hotcpu_notifier(cpuset_track_online_cpus, 0); 2122 hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2537,15 +2538,9 @@ const struct file_operations proc_cpuset_operations = {
2537}; 2538};
2538#endif /* CONFIG_PROC_PID_CPUSET */ 2539#endif /* CONFIG_PROC_PID_CPUSET */
2539 2540
2540/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2541/* Display task mems_allowed in /proc/<pid>/status file. */
2541void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2542void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2542{ 2543{
2543 seq_printf(m, "Cpus_allowed:\t");
2544 seq_cpumask(m, &task->cpus_allowed);
2545 seq_printf(m, "\n");
2546 seq_printf(m, "Cpus_allowed_list:\t");
2547 seq_cpumask_list(m, &task->cpus_allowed);
2548 seq_printf(m, "\n");
2549 seq_printf(m, "Mems_allowed:\t"); 2544 seq_printf(m, "Mems_allowed:\t");
2550 seq_nodemask(m, &task->mems_allowed); 2545 seq_nodemask(m, &task->mems_allowed);
2551 seq_printf(m, "\n"); 2546 seq_printf(m, "\n");
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b..1ed8ca18790 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 226 if (!new->tgcred) {
227 kfree(new); 227 kmem_cache_free(cred_jar, new);
228 return NULL; 228 return NULL;
229 } 229 }
230 atomic_set(&new->tgcred->usage, 1); 230 atomic_set(&new->tgcred->usage, 1);
diff --git a/kernel/exit.c b/kernel/exit.c
index e61891f8012..546774a31a6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -67,10 +68,10 @@ static void __unhash_process(struct task_struct *p)
67 detach_pid(p, PIDTYPE_SID); 68 detach_pid(p, PIDTYPE_SID);
68 69
69 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling);
70 __get_cpu_var(process_counts)--; 72 __get_cpu_var(process_counts)--;
71 } 73 }
72 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
73 list_del_init(&p->sibling);
74} 75}
75 76
76/* 77/*
@@ -110,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk)
110 * We won't ever get here for the group leader, since it 111 * We won't ever get here for the group leader, since it
111 * will have been the last reference on the signal_struct. 112 * will have been the last reference on the signal_struct.
112 */ 113 */
113 sig->utime = cputime_add(sig->utime, task_utime(tsk)); 114 sig->utime = cputime_add(sig->utime, tsk->utime);
114 sig->stime = cputime_add(sig->stime, task_stime(tsk)); 115 sig->stime = cputime_add(sig->stime, tsk->stime);
115 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 116 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
116 sig->min_flt += tsk->min_flt; 117 sig->min_flt += tsk->min_flt;
117 sig->maj_flt += tsk->maj_flt; 118 sig->maj_flt += tsk->maj_flt;
118 sig->nvcsw += tsk->nvcsw; 119 sig->nvcsw += tsk->nvcsw;
@@ -359,10 +360,8 @@ void __set_special_pids(struct pid *pid)
359{ 360{
360 struct task_struct *curr = current->group_leader; 361 struct task_struct *curr = current->group_leader;
361 362
362 if (task_session(curr) != pid) { 363 if (task_session(curr) != pid)
363 change_pid(curr, PIDTYPE_SID, pid); 364 change_pid(curr, PIDTYPE_SID, pid);
364 proc_sid_connector(curr);
365 }
366 365
367 if (task_pgrp(curr) != pid) 366 if (task_pgrp(curr) != pid)
368 change_pid(curr, PIDTYPE_PGID, pid); 367 change_pid(curr, PIDTYPE_PGID, pid);
@@ -737,12 +736,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
737/* 736/*
738* Any that need to be release_task'd are put on the @dead list. 737* Any that need to be release_task'd are put on the @dead list.
739 */ 738 */
740static void reparent_thread(struct task_struct *father, struct task_struct *p, 739static void reparent_leader(struct task_struct *father, struct task_struct *p,
741 struct list_head *dead) 740 struct list_head *dead)
742{ 741{
743 if (p->pdeath_signal)
744 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
745
746 list_move_tail(&p->sibling, &p->real_parent->children); 742 list_move_tail(&p->sibling, &p->real_parent->children);
747 743
748 if (task_detached(p)) 744 if (task_detached(p))
@@ -781,12 +777,18 @@ static void forget_original_parent(struct task_struct *father)
781 reaper = find_new_reaper(father); 777 reaper = find_new_reaper(father);
782 778
783 list_for_each_entry_safe(p, n, &father->children, sibling) { 779 list_for_each_entry_safe(p, n, &father->children, sibling) {
784 p->real_parent = reaper; 780 struct task_struct *t = p;
785 if (p->parent == father) { 781 do {
786 BUG_ON(task_ptrace(p)); 782 t->real_parent = reaper;
787 p->parent = p->real_parent; 783 if (t->parent == father) {
788 } 784 BUG_ON(task_ptrace(t));
789 reparent_thread(father, p, &dead_children); 785 t->parent = t->real_parent;
786 }
787 if (t->pdeath_signal)
788 group_send_sig_info(t->pdeath_signal,
789 SEND_SIG_NOINFO, t);
790 } while_each_thread(p, t);
791 reparent_leader(father, p, &dead_children);
790 } 792 }
791 write_unlock_irq(&tasklist_lock); 793 write_unlock_irq(&tasklist_lock);
792 794
@@ -934,7 +936,7 @@ NORET_TYPE void do_exit(long code)
934 * an exiting task cleaning up the robust pi futexes. 936 * an exiting task cleaning up the robust pi futexes.
935 */ 937 */
936 smp_mb(); 938 smp_mb();
937 spin_unlock_wait(&tsk->pi_lock); 939 raw_spin_unlock_wait(&tsk->pi_lock);
938 940
939 if (unlikely(in_atomic())) 941 if (unlikely(in_atomic()))
940 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 942 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -972,7 +974,7 @@ NORET_TYPE void do_exit(long code)
972 exit_thread(); 974 exit_thread();
973 cgroup_exit(tsk, 1); 975 cgroup_exit(tsk, 1);
974 976
975 if (group_dead && tsk->signal->leader) 977 if (group_dead)
976 disassociate_ctty(1); 978 disassociate_ctty(1);
977 979
978 module_put(task_thread_info(tsk)->exec_domain->module); 980 module_put(task_thread_info(tsk)->exec_domain->module);
@@ -980,6 +982,10 @@ NORET_TYPE void do_exit(long code)
980 proc_exit_connector(tsk); 982 proc_exit_connector(tsk);
981 983
982 /* 984 /*
985 * FIXME: do that only when needed, using sched_exit tracepoint
986 */
987 flush_ptrace_hw_breakpoint(tsk);
988 /*
983 * Flush inherited counters to the parent - before the parent 989 * Flush inherited counters to the parent - before the parent
984 * gets woken up by child-exit notifications. 990 * gets woken up by child-exit notifications.
985 */ 991 */
@@ -1006,7 +1012,7 @@ NORET_TYPE void do_exit(long code)
1006 tsk->flags |= PF_EXITPIDONE; 1012 tsk->flags |= PF_EXITPIDONE;
1007 1013
1008 if (tsk->io_context) 1014 if (tsk->io_context)
1009 exit_io_context(); 1015 exit_io_context(tsk);
1010 1016
1011 if (tsk->splice_pipe) 1017 if (tsk->splice_pipe)
1012 __free_pipe_info(tsk->splice_pipe); 1018 __free_pipe_info(tsk->splice_pipe);
@@ -1207,6 +1213,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1207 struct signal_struct *psig; 1213 struct signal_struct *psig;
1208 struct signal_struct *sig; 1214 struct signal_struct *sig;
1209 unsigned long maxrss; 1215 unsigned long maxrss;
1216 cputime_t tgutime, tgstime;
1210 1217
1211 /* 1218 /*
1212 * The resource counters for the group leader are in its 1219 * The resource counters for the group leader are in its
@@ -1222,20 +1229,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1222 * need to protect the access to parent->signal fields, 1229 * need to protect the access to parent->signal fields,
1223 * as other threads in the parent group can be right 1230 * as other threads in the parent group can be right
1224 * here reaping other children at the same time. 1231 * here reaping other children at the same time.
1232 *
1233 * We use thread_group_times() to get times for the thread
1234 * group, which consolidates times for all threads in the
1235 * group including the group leader.
1225 */ 1236 */
1237 thread_group_times(p, &tgutime, &tgstime);
1226 spin_lock_irq(&p->real_parent->sighand->siglock); 1238 spin_lock_irq(&p->real_parent->sighand->siglock);
1227 psig = p->real_parent->signal; 1239 psig = p->real_parent->signal;
1228 sig = p->signal; 1240 sig = p->signal;
1229 psig->cutime = 1241 psig->cutime =
1230 cputime_add(psig->cutime, 1242 cputime_add(psig->cutime,
1231 cputime_add(p->utime, 1243 cputime_add(tgutime,
1232 cputime_add(sig->utime, 1244 sig->cutime));
1233 sig->cutime)));
1234 psig->cstime = 1245 psig->cstime =
1235 cputime_add(psig->cstime, 1246 cputime_add(psig->cstime,
1236 cputime_add(p->stime, 1247 cputime_add(tgstime,
1237 cputime_add(sig->stime, 1248 sig->cstime));
1238 sig->cstime)));
1239 psig->cgtime = 1249 psig->cgtime =
1240 cputime_add(psig->cgtime, 1250 cputime_add(psig->cgtime,
1241 cputime_add(p->gtime, 1251 cputime_add(p->gtime,
@@ -1544,14 +1554,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1544 struct task_struct *p; 1554 struct task_struct *p;
1545 1555
1546 list_for_each_entry(p, &tsk->children, sibling) { 1556 list_for_each_entry(p, &tsk->children, sibling) {
1547 /* 1557 int ret = wait_consider_task(wo, 0, p);
1548 * Do not consider detached threads. 1558 if (ret)
1549 */ 1559 return ret;
1550 if (!task_detached(p)) {
1551 int ret = wait_consider_task(wo, 0, p);
1552 if (ret)
1553 return ret;
1554 }
1555 } 1560 }
1556 1561
1557 return 0; 1562 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4c20fff8c13..f88bd984df3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,7 @@
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h>
67 68
68#include <asm/pgtable.h> 69#include <asm/pgtable.h>
69#include <asm/pgalloc.h> 70#include <asm/pgalloc.h>
@@ -91,7 +92,7 @@ int nr_processes(void)
91 int cpu; 92 int cpu;
92 int total = 0; 93 int total = 0;
93 94
94 for_each_online_cpu(cpu) 95 for_each_possible_cpu(cpu)
95 total += per_cpu(process_counts, cpu); 96 total += per_cpu(process_counts, cpu);
96 97
97 return total; 98 return total;
@@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
249 goto out; 250 goto out;
250 251
251 setup_thread_stack(tsk, orig); 252 setup_thread_stack(tsk, orig);
253 clear_user_return_notifier(tsk);
252 stackend = end_of_stack(tsk); 254 stackend = end_of_stack(tsk);
253 *stackend = STACK_END_MAGIC; /* for overflow detection */ 255 *stackend = STACK_END_MAGIC; /* for overflow detection */
254 256
@@ -884,6 +886,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
884 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 886 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
885 sig->gtime = cputime_zero; 887 sig->gtime = cputime_zero;
886 sig->cgtime = cputime_zero; 888 sig->cgtime = cputime_zero;
889#ifndef CONFIG_VIRT_CPU_ACCOUNTING
890 sig->prev_utime = sig->prev_stime = cputime_zero;
891#endif
887 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 892 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
888 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 893 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
889 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 894 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -934,9 +939,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
934 939
935static void rt_mutex_init_task(struct task_struct *p) 940static void rt_mutex_init_task(struct task_struct *p)
936{ 941{
937 spin_lock_init(&p->pi_lock); 942 raw_spin_lock_init(&p->pi_lock);
938#ifdef CONFIG_RT_MUTEXES 943#ifdef CONFIG_RT_MUTEXES
939 plist_head_init(&p->pi_waiters, &p->pi_lock); 944 plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
940 p->pi_blocked_on = NULL; 945 p->pi_blocked_on = NULL;
941#endif 946#endif
942} 947}
@@ -1066,8 +1071,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1066 p->gtime = cputime_zero; 1071 p->gtime = cputime_zero;
1067 p->utimescaled = cputime_zero; 1072 p->utimescaled = cputime_zero;
1068 p->stimescaled = cputime_zero; 1073 p->stimescaled = cputime_zero;
1074#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1069 p->prev_utime = cputime_zero; 1075 p->prev_utime = cputime_zero;
1070 p->prev_stime = cputime_zero; 1076 p->prev_stime = cputime_zero;
1077#endif
1071 1078
1072 p->default_timer_slack_ns = current->timer_slack_ns; 1079 p->default_timer_slack_ns = current->timer_slack_ns;
1073 1080
@@ -1120,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1120#ifdef CONFIG_DEBUG_MUTEXES 1127#ifdef CONFIG_DEBUG_MUTEXES
1121 p->blocked_on = NULL; /* not blocked yet */ 1128 p->blocked_on = NULL; /* not blocked yet */
1122#endif 1129#endif
1130#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1131 p->memcg_batch.do_batch = 0;
1132 p->memcg_batch.memcg = NULL;
1133#endif
1123 1134
1124 p->bts = NULL; 1135 p->bts = NULL;
1125 1136
@@ -1199,9 +1210,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1199 p->sas_ss_sp = p->sas_ss_size = 0; 1210 p->sas_ss_sp = p->sas_ss_size = 0;
1200 1211
1201 /* 1212 /*
1202 * Syscall tracing should be turned off in the child regardless 1213 * Syscall tracing and stepping should be turned off in the
1203 * of CLONE_PTRACE. 1214 * child regardless of CLONE_PTRACE.
1204 */ 1215 */
1216 user_disable_single_step(p);
1205 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1217 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1206#ifdef TIF_SYSCALL_EMU 1218#ifdef TIF_SYSCALL_EMU
1207 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1219 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
@@ -1229,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1229 /* Need tasklist lock for parent etc handling! */ 1241 /* Need tasklist lock for parent etc handling! */
1230 write_lock_irq(&tasklist_lock); 1242 write_lock_irq(&tasklist_lock);
1231 1243
1232 /*
1233 * The task hasn't been attached yet, so its cpus_allowed mask will
1234 * not be changed, nor will its assigned CPU.
1235 *
1236 * The cpus_allowed mask of the parent may have changed after it was
1237 * copied first time - so re-copy it here, then check the child's CPU
1238 * to ensure it is on a valid CPU (and if not, just force it back to
1239 * parent's CPU). This avoids alot of nasty races.
1240 */
1241 p->cpus_allowed = current->cpus_allowed;
1242 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1243 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1244 !cpu_online(task_cpu(p))))
1245 set_task_cpu(p, smp_processor_id());
1246
1247 /* CLONE_PARENT re-uses the old parent */ 1244 /* CLONE_PARENT re-uses the old parent */
1248 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1245 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1249 p->real_parent = current->real_parent; 1246 p->real_parent = current->real_parent;
@@ -1279,7 +1276,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1279 } 1276 }
1280 1277
1281 if (likely(p->pid)) { 1278 if (likely(p->pid)) {
1282 list_add_tail(&p->sibling, &p->real_parent->children);
1283 tracehook_finish_clone(p, clone_flags, trace); 1279 tracehook_finish_clone(p, clone_flags, trace);
1284 1280
1285 if (thread_group_leader(p)) { 1281 if (thread_group_leader(p)) {
@@ -1291,6 +1287,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1291 p->signal->tty = tty_kref_get(current->signal->tty); 1287 p->signal->tty = tty_kref_get(current->signal->tty);
1292 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1288 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1293 attach_pid(p, PIDTYPE_SID, task_session(current)); 1289 attach_pid(p, PIDTYPE_SID, task_session(current));
1290 list_add_tail(&p->sibling, &p->real_parent->children);
1294 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1291 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1295 __get_cpu_var(process_counts)++; 1292 __get_cpu_var(process_counts)++;
1296 } 1293 }
@@ -1310,7 +1307,8 @@ bad_fork_free_pid:
1310 if (pid != &init_struct_pid) 1307 if (pid != &init_struct_pid)
1311 free_pid(pid); 1308 free_pid(pid);
1312bad_fork_cleanup_io: 1309bad_fork_cleanup_io:
1313 put_io_context(p->io_context); 1310 if (p->io_context)
1311 exit_io_context(p);
1314bad_fork_cleanup_namespaces: 1312bad_fork_cleanup_namespaces:
1315 exit_task_namespaces(p); 1313 exit_task_namespaces(p);
1316bad_fork_cleanup_mm: 1314bad_fork_cleanup_mm:
diff --git a/kernel/futex.c b/kernel/futex.c
index 4949d336d88..e7a35f1039e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
150 */ 150 */
151static inline int match_futex(union futex_key *key1, union futex_key *key2) 151static inline int match_futex(union futex_key *key1, union futex_key *key2)
152{ 152{
153 return (key1->both.word == key2->both.word 153 return (key1 && key2
154 && key1->both.word == key2->both.word
154 && key1->both.ptr == key2->both.ptr 155 && key1->both.ptr == key2->both.ptr
155 && key1->both.offset == key2->both.offset); 156 && key1->both.offset == key2->both.offset);
156} 157}
@@ -202,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
202 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
203 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
204 * @key: address where result is stored. 205 * @key: address where result is stored.
205 * @rw: mapping needs to be read/write (values: VERIFY_READ,
206 * VERIFY_WRITE)
207 * 206 *
208 * Returns a negative error code or 0 207 * Returns a negative error code or 0
209 * The key words are stored in *key on success. 208 * The key words are stored in *key on success.
@@ -215,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
215 * lock_page() might sleep, the caller should not hold a spinlock. 214 * lock_page() might sleep, the caller should not hold a spinlock.
216 */ 215 */
217static int 216static int
218get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 217get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
219{ 218{
220 unsigned long address = (unsigned long)uaddr; 219 unsigned long address = (unsigned long)uaddr;
221 struct mm_struct *mm = current->mm; 220 struct mm_struct *mm = current->mm;
@@ -238,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
238 * but access_ok() should be faster than find_vma() 237 * but access_ok() should be faster than find_vma()
239 */ 238 */
240 if (!fshared) { 239 if (!fshared) {
241 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 240 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
242 return -EFAULT; 241 return -EFAULT;
243 key->private.mm = mm; 242 key->private.mm = mm;
244 key->private.address = address; 243 key->private.address = address;
@@ -247,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
247 } 246 }
248 247
249again: 248again:
250 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); 249 err = get_user_pages_fast(address, 1, 1, &page);
251 if (err < 0) 250 if (err < 0)
252 return err; 251 return err;
253 252
@@ -303,8 +302,14 @@ void put_futex_key(int fshared, union futex_key *key)
303 */ 302 */
304static int fault_in_user_writeable(u32 __user *uaddr) 303static int fault_in_user_writeable(u32 __user *uaddr)
305{ 304{
306 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, 305 struct mm_struct *mm = current->mm;
307 1, 1, 0, NULL, NULL); 306 int ret;
307
308 down_read(&mm->mmap_sem);
309 ret = get_user_pages(current, mm, (unsigned long)uaddr,
310 1, 1, 0, NULL, NULL);
311 up_read(&mm->mmap_sem);
312
308 return ret < 0 ? ret : 0; 313 return ret < 0 ? ret : 0;
309} 314}
310 315
@@ -396,9 +401,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
396 * and has cleaned up the pi_state already 401 * and has cleaned up the pi_state already
397 */ 402 */
398 if (pi_state->owner) { 403 if (pi_state->owner) {
399 spin_lock_irq(&pi_state->owner->pi_lock); 404 raw_spin_lock_irq(&pi_state->owner->pi_lock);
400 list_del_init(&pi_state->list); 405 list_del_init(&pi_state->list);
401 spin_unlock_irq(&pi_state->owner->pi_lock); 406 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
402 407
403 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 408 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
404 } 409 }
@@ -463,18 +468,18 @@ void exit_pi_state_list(struct task_struct *curr)
463 * pi_state_list anymore, but we have to be careful 468 * pi_state_list anymore, but we have to be careful
464 * versus waiters unqueueing themselves: 469 * versus waiters unqueueing themselves:
465 */ 470 */
466 spin_lock_irq(&curr->pi_lock); 471 raw_spin_lock_irq(&curr->pi_lock);
467 while (!list_empty(head)) { 472 while (!list_empty(head)) {
468 473
469 next = head->next; 474 next = head->next;
470 pi_state = list_entry(next, struct futex_pi_state, list); 475 pi_state = list_entry(next, struct futex_pi_state, list);
471 key = pi_state->key; 476 key = pi_state->key;
472 hb = hash_futex(&key); 477 hb = hash_futex(&key);
473 spin_unlock_irq(&curr->pi_lock); 478 raw_spin_unlock_irq(&curr->pi_lock);
474 479
475 spin_lock(&hb->lock); 480 spin_lock(&hb->lock);
476 481
477 spin_lock_irq(&curr->pi_lock); 482 raw_spin_lock_irq(&curr->pi_lock);
478 /* 483 /*
479 * We dropped the pi-lock, so re-check whether this 484 * We dropped the pi-lock, so re-check whether this
480 * task still owns the PI-state: 485 * task still owns the PI-state:
@@ -488,15 +493,15 @@ void exit_pi_state_list(struct task_struct *curr)
488 WARN_ON(list_empty(&pi_state->list)); 493 WARN_ON(list_empty(&pi_state->list));
489 list_del_init(&pi_state->list); 494 list_del_init(&pi_state->list);
490 pi_state->owner = NULL; 495 pi_state->owner = NULL;
491 spin_unlock_irq(&curr->pi_lock); 496 raw_spin_unlock_irq(&curr->pi_lock);
492 497
493 rt_mutex_unlock(&pi_state->pi_mutex); 498 rt_mutex_unlock(&pi_state->pi_mutex);
494 499
495 spin_unlock(&hb->lock); 500 spin_unlock(&hb->lock);
496 501
497 spin_lock_irq(&curr->pi_lock); 502 raw_spin_lock_irq(&curr->pi_lock);
498 } 503 }
499 spin_unlock_irq(&curr->pi_lock); 504 raw_spin_unlock_irq(&curr->pi_lock);
500} 505}
501 506
502static int 507static int
@@ -525,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
525 return -EINVAL; 530 return -EINVAL;
526 531
527 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
528 WARN_ON(pid && pi_state->owner && 533
529 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
530 552
531 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
532 *ps = pi_state; 554 *ps = pi_state;
@@ -551,7 +573,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
551 * change of the task flags, we do this protected by 573 * change of the task flags, we do this protected by
552 * p->pi_lock: 574 * p->pi_lock:
553 */ 575 */
554 spin_lock_irq(&p->pi_lock); 576 raw_spin_lock_irq(&p->pi_lock);
555 if (unlikely(p->flags & PF_EXITING)) { 577 if (unlikely(p->flags & PF_EXITING)) {
556 /* 578 /*
557 * The task is on the way out. When PF_EXITPIDONE is 579 * The task is on the way out. When PF_EXITPIDONE is
@@ -560,7 +582,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
560 */ 582 */
561 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 583 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
562 584
563 spin_unlock_irq(&p->pi_lock); 585 raw_spin_unlock_irq(&p->pi_lock);
564 put_task_struct(p); 586 put_task_struct(p);
565 return ret; 587 return ret;
566 } 588 }
@@ -579,7 +601,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
579 WARN_ON(!list_empty(&pi_state->list)); 601 WARN_ON(!list_empty(&pi_state->list));
580 list_add(&pi_state->list, &p->pi_state_list); 602 list_add(&pi_state->list, &p->pi_state_list);
581 pi_state->owner = p; 603 pi_state->owner = p;
582 spin_unlock_irq(&p->pi_lock); 604 raw_spin_unlock_irq(&p->pi_lock);
583 605
584 put_task_struct(p); 606 put_task_struct(p);
585 607
@@ -753,7 +775,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
753 if (!pi_state) 775 if (!pi_state)
754 return -EINVAL; 776 return -EINVAL;
755 777
756 spin_lock(&pi_state->pi_mutex.wait_lock); 778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
757 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
758 787
759 /* 788 /*
@@ -782,23 +811,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
782 else if (curval != uval) 811 else if (curval != uval)
783 ret = -EINVAL; 812 ret = -EINVAL;
784 if (ret) { 813 if (ret) {
785 spin_unlock(&pi_state->pi_mutex.wait_lock); 814 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
786 return ret; 815 return ret;
787 } 816 }
788 } 817 }
789 818
790 spin_lock_irq(&pi_state->owner->pi_lock); 819 raw_spin_lock_irq(&pi_state->owner->pi_lock);
791 WARN_ON(list_empty(&pi_state->list)); 820 WARN_ON(list_empty(&pi_state->list));
792 list_del_init(&pi_state->list); 821 list_del_init(&pi_state->list);
793 spin_unlock_irq(&pi_state->owner->pi_lock); 822 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
794 823
795 spin_lock_irq(&new_owner->pi_lock); 824 raw_spin_lock_irq(&new_owner->pi_lock);
796 WARN_ON(!list_empty(&pi_state->list)); 825 WARN_ON(!list_empty(&pi_state->list));
797 list_add(&pi_state->list, &new_owner->pi_state_list); 826 list_add(&pi_state->list, &new_owner->pi_state_list);
798 pi_state->owner = new_owner; 827 pi_state->owner = new_owner;
799 spin_unlock_irq(&new_owner->pi_lock); 828 raw_spin_unlock_irq(&new_owner->pi_lock);
800 829
801 spin_unlock(&pi_state->pi_mutex.wait_lock); 830 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
802 rt_mutex_unlock(&pi_state->pi_mutex); 831 rt_mutex_unlock(&pi_state->pi_mutex);
803 832
804 return 0; 833 return 0;
@@ -860,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
860 if (!bitset) 889 if (!bitset)
861 return -EINVAL; 890 return -EINVAL;
862 891
863 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); 892 ret = get_futex_key(uaddr, fshared, &key);
864 if (unlikely(ret != 0)) 893 if (unlikely(ret != 0))
865 goto out; 894 goto out;
866 895
@@ -906,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
906 int ret, op_ret; 935 int ret, op_ret;
907 936
908retry: 937retry:
909 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 938 ret = get_futex_key(uaddr1, fshared, &key1);
910 if (unlikely(ret != 0)) 939 if (unlikely(ret != 0))
911 goto out; 940 goto out;
912 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 941 ret = get_futex_key(uaddr2, fshared, &key2);
913 if (unlikely(ret != 0)) 942 if (unlikely(ret != 0))
914 goto out_put_key1; 943 goto out_put_key1;
915 944
@@ -1003,7 +1032,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1003 plist_add(&q->list, &hb2->chain); 1032 plist_add(&q->list, &hb2->chain);
1004 q->lock_ptr = &hb2->lock; 1033 q->lock_ptr = &hb2->lock;
1005#ifdef CONFIG_DEBUG_PI_LIST 1034#ifdef CONFIG_DEBUG_PI_LIST
1006 q->list.plist.lock = &hb2->lock; 1035 q->list.plist.spinlock = &hb2->lock;
1007#endif 1036#endif
1008 } 1037 }
1009 get_futex_key_refs(key2); 1038 get_futex_key_refs(key2);
@@ -1028,7 +1057,6 @@ static inline
1028void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, 1057void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1029 struct futex_hash_bucket *hb) 1058 struct futex_hash_bucket *hb)
1030{ 1059{
1031 drop_futex_key_refs(&q->key);
1032 get_futex_key_refs(key); 1060 get_futex_key_refs(key);
1033 q->key = *key; 1061 q->key = *key;
1034 1062
@@ -1040,7 +1068,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1040 1068
1041 q->lock_ptr = &hb->lock; 1069 q->lock_ptr = &hb->lock;
1042#ifdef CONFIG_DEBUG_PI_LIST 1070#ifdef CONFIG_DEBUG_PI_LIST
1043 q->list.plist.lock = &hb->lock; 1071 q->list.plist.spinlock = &hb->lock;
1044#endif 1072#endif
1045 1073
1046 wake_up_state(q->task, TASK_NORMAL); 1074 wake_up_state(q->task, TASK_NORMAL);
@@ -1169,11 +1197,10 @@ retry:
1169 pi_state = NULL; 1197 pi_state = NULL;
1170 } 1198 }
1171 1199
1172 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1200 ret = get_futex_key(uaddr1, fshared, &key1);
1173 if (unlikely(ret != 0)) 1201 if (unlikely(ret != 0))
1174 goto out; 1202 goto out;
1175 ret = get_futex_key(uaddr2, fshared, &key2, 1203 ret = get_futex_key(uaddr2, fshared, &key2);
1176 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1177 if (unlikely(ret != 0)) 1204 if (unlikely(ret != 0))
1178 goto out_put_key1; 1205 goto out_put_key1;
1179 1206
@@ -1226,6 +1253,7 @@ retry_private:
1226 */ 1253 */
1227 if (ret == 1) { 1254 if (ret == 1) {
1228 WARN_ON(pi_state); 1255 WARN_ON(pi_state);
1256 drop_count++;
1229 task_count++; 1257 task_count++;
1230 ret = get_futex_value_locked(&curval2, uaddr2); 1258 ret = get_futex_value_locked(&curval2, uaddr2);
1231 if (!ret) 1259 if (!ret)
@@ -1304,6 +1332,7 @@ retry_private:
1304 if (ret == 1) { 1332 if (ret == 1) {
1305 /* We got the lock. */ 1333 /* We got the lock. */
1306 requeue_pi_wake_futex(this, &key2, hb2); 1334 requeue_pi_wake_futex(this, &key2, hb2);
1335 drop_count++;
1307 continue; 1336 continue;
1308 } else if (ret) { 1337 } else if (ret) {
1309 /* -EDEADLK */ 1338 /* -EDEADLK */
@@ -1386,7 +1415,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1386 1415
1387 plist_node_init(&q->list, prio); 1416 plist_node_init(&q->list, prio);
1388#ifdef CONFIG_DEBUG_PI_LIST 1417#ifdef CONFIG_DEBUG_PI_LIST
1389 q->list.plist.lock = &hb->lock; 1418 q->list.plist.spinlock = &hb->lock;
1390#endif 1419#endif
1391 plist_add(&q->list, &hb->chain); 1420 plist_add(&q->list, &hb->chain);
1392 q->task = current; 1421 q->task = current;
@@ -1521,18 +1550,18 @@ retry:
1521 * itself. 1550 * itself.
1522 */ 1551 */
1523 if (pi_state->owner != NULL) { 1552 if (pi_state->owner != NULL) {
1524 spin_lock_irq(&pi_state->owner->pi_lock); 1553 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1525 WARN_ON(list_empty(&pi_state->list)); 1554 WARN_ON(list_empty(&pi_state->list));
1526 list_del_init(&pi_state->list); 1555 list_del_init(&pi_state->list);
1527 spin_unlock_irq(&pi_state->owner->pi_lock); 1556 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1528 } 1557 }
1529 1558
1530 pi_state->owner = newowner; 1559 pi_state->owner = newowner;
1531 1560
1532 spin_lock_irq(&newowner->pi_lock); 1561 raw_spin_lock_irq(&newowner->pi_lock);
1533 WARN_ON(!list_empty(&pi_state->list)); 1562 WARN_ON(!list_empty(&pi_state->list));
1534 list_add(&pi_state->list, &newowner->pi_state_list); 1563 list_add(&pi_state->list, &newowner->pi_state_list);
1535 spin_unlock_irq(&newowner->pi_lock); 1564 raw_spin_unlock_irq(&newowner->pi_lock);
1536 return 0; 1565 return 0;
1537 1566
1538 /* 1567 /*
@@ -1730,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1730 */ 1759 */
1731retry: 1760retry:
1732 q->key = FUTEX_KEY_INIT; 1761 q->key = FUTEX_KEY_INIT;
1733 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); 1762 ret = get_futex_key(uaddr, fshared, &q->key);
1734 if (unlikely(ret != 0)) 1763 if (unlikely(ret != 0))
1735 return ret; 1764 return ret;
1736 1765
@@ -1791,6 +1820,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1791 current->timer_slack_ns); 1820 current->timer_slack_ns);
1792 } 1821 }
1793 1822
1823retry:
1794 /* Prepare to wait on uaddr. */ 1824 /* Prepare to wait on uaddr. */
1795 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1825 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1796 if (ret) 1826 if (ret)
@@ -1808,9 +1838,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1808 goto out_put_key; 1838 goto out_put_key;
1809 1839
1810 /* 1840 /*
1811 * We expect signal_pending(current), but another thread may 1841 * We expect signal_pending(current), but we might be the
1812 * have handled it for us already. 1842 * victim of a spurious wakeup as well.
1813 */ 1843 */
1844 if (!signal_pending(current)) {
1845 put_futex_key(fshared, &q.key);
1846 goto retry;
1847 }
1848
1814 ret = -ERESTARTSYS; 1849 ret = -ERESTARTSYS;
1815 if (!abs_time) 1850 if (!abs_time)
1816 goto out_put_key; 1851 goto out_put_key;
@@ -1890,7 +1925,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1890 q.requeue_pi_key = NULL; 1925 q.requeue_pi_key = NULL;
1891retry: 1926retry:
1892 q.key = FUTEX_KEY_INIT; 1927 q.key = FUTEX_KEY_INIT;
1893 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1928 ret = get_futex_key(uaddr, fshared, &q.key);
1894 if (unlikely(ret != 0)) 1929 if (unlikely(ret != 0))
1895 goto out; 1930 goto out;
1896 1931
@@ -1960,7 +1995,7 @@ retry_private:
1960 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1961 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1962 1997
1963 goto out; 1998 goto out_put_key;
1964 1999
1965out_unlock_put_key: 2000out_unlock_put_key:
1966 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
@@ -2009,7 +2044,7 @@ retry:
2009 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2044 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2010 return -EPERM; 2045 return -EPERM;
2011 2046
2012 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); 2047 ret = get_futex_key(uaddr, fshared, &key);
2013 if (unlikely(ret != 0)) 2048 if (unlikely(ret != 0))
2014 goto out; 2049 goto out;
2015 2050
@@ -2118,9 +2153,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2118 */ 2153 */
2119 plist_del(&q->list, &q->list.plist); 2154 plist_del(&q->list, &q->list.plist);
2120 2155
2156 /* Handle spurious wakeups gracefully */
2157 ret = -EWOULDBLOCK;
2121 if (timeout && !timeout->task) 2158 if (timeout && !timeout->task)
2122 ret = -ETIMEDOUT; 2159 ret = -ETIMEDOUT;
2123 else 2160 else if (signal_pending(current))
2124 ret = -ERESTARTNOINTR; 2161 ret = -ERESTARTNOINTR;
2125 } 2162 }
2126 return ret; 2163 return ret;
@@ -2199,7 +2236,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2199 rt_waiter.task = NULL; 2236 rt_waiter.task = NULL;
2200 2237
2201 key2 = FUTEX_KEY_INIT; 2238 key2 = FUTEX_KEY_INIT;
2202 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2239 ret = get_futex_key(uaddr2, fshared, &key2);
2203 if (unlikely(ret != 0)) 2240 if (unlikely(ret != 0))
2204 goto out; 2241 goto out;
2205 2242
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3e1c36e7998..0086628b6e9 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -127,11 +127,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
127 for (;;) { 127 for (;;) {
128 base = timer->base; 128 base = timer->base;
129 if (likely(base != NULL)) { 129 if (likely(base != NULL)) {
130 spin_lock_irqsave(&base->cpu_base->lock, *flags); 130 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
131 if (likely(base == timer->base)) 131 if (likely(base == timer->base))
132 return base; 132 return base;
133 /* The timer has migrated to another CPU: */ 133 /* The timer has migrated to another CPU: */
134 spin_unlock_irqrestore(&base->cpu_base->lock, *flags); 134 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
135 } 135 }
136 cpu_relax(); 136 cpu_relax();
137 } 137 }
@@ -208,13 +208,13 @@ again:
208 208
209 /* See the comment in lock_timer_base() */ 209 /* See the comment in lock_timer_base() */
210 timer->base = NULL; 210 timer->base = NULL;
211 spin_unlock(&base->cpu_base->lock); 211 raw_spin_unlock(&base->cpu_base->lock);
212 spin_lock(&new_base->cpu_base->lock); 212 raw_spin_lock(&new_base->cpu_base->lock);
213 213
214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { 214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
215 cpu = this_cpu; 215 cpu = this_cpu;
216 spin_unlock(&new_base->cpu_base->lock); 216 raw_spin_unlock(&new_base->cpu_base->lock);
217 spin_lock(&base->cpu_base->lock); 217 raw_spin_lock(&base->cpu_base->lock);
218 timer->base = base; 218 timer->base = base;
219 goto again; 219 goto again;
220 } 220 }
@@ -230,7 +230,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
230{ 230{
231 struct hrtimer_clock_base *base = timer->base; 231 struct hrtimer_clock_base *base = timer->base;
232 232
233 spin_lock_irqsave(&base->cpu_base->lock, *flags); 233 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
234 234
235 return base; 235 return base;
236} 236}
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
557static int hrtimer_reprogram(struct hrtimer *timer, 557static int hrtimer_reprogram(struct hrtimer *timer,
558 struct hrtimer_clock_base *base) 558 struct hrtimer_clock_base *base)
559{ 559{
560 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 560 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
562 int res; 562 int res;
563 563
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
582 if (expires.tv64 < 0) 582 if (expires.tv64 < 0)
583 return -ETIME; 583 return -ETIME;
584 584
585 if (expires.tv64 >= expires_next->tv64) 585 if (expires.tv64 >= cpu_base->expires_next.tv64)
586 return 0;
587
588 /*
589 * If a hang was detected in the last timer interrupt then we
590 * do not schedule a timer which is earlier than the expiry
591 * which we enforced in the hang detection. We want the system
592 * to make progress.
593 */
594 if (cpu_base->hang_detected)
586 return 0; 595 return 0;
587 596
588 /* 597 /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
590 */ 599 */
591 res = tick_program_event(expires, 0); 600 res = tick_program_event(expires, 0);
592 if (!IS_ERR_VALUE(res)) 601 if (!IS_ERR_VALUE(res))
593 *expires_next = expires; 602 cpu_base->expires_next = expires;
594 return res; 603 return res;
595} 604}
596 605
@@ -619,12 +628,12 @@ static void retrigger_next_event(void *arg)
619 base = &__get_cpu_var(hrtimer_bases); 628 base = &__get_cpu_var(hrtimer_bases);
620 629
621 /* Adjust CLOCK_REALTIME offset */ 630 /* Adjust CLOCK_REALTIME offset */
622 spin_lock(&base->lock); 631 raw_spin_lock(&base->lock);
623 base->clock_base[CLOCK_REALTIME].offset = 632 base->clock_base[CLOCK_REALTIME].offset =
624 timespec_to_ktime(realtime_offset); 633 timespec_to_ktime(realtime_offset);
625 634
626 hrtimer_force_reprogram(base, 0); 635 hrtimer_force_reprogram(base, 0);
627 spin_unlock(&base->lock); 636 raw_spin_unlock(&base->lock);
628} 637}
629 638
630/* 639/*
@@ -685,9 +694,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
685{ 694{
686 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 695 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
687 if (wakeup) { 696 if (wakeup) {
688 spin_unlock(&base->cpu_base->lock); 697 raw_spin_unlock(&base->cpu_base->lock);
689 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 698 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
690 spin_lock(&base->cpu_base->lock); 699 raw_spin_lock(&base->cpu_base->lock);
691 } else 700 } else
692 __raise_softirq_irqoff(HRTIMER_SOFTIRQ); 701 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
693 702
@@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
747 756
748#endif /* CONFIG_HIGH_RES_TIMERS */ 757#endif /* CONFIG_HIGH_RES_TIMERS */
749 758
750#ifdef CONFIG_TIMER_STATS 759static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
751void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
752{ 760{
761#ifdef CONFIG_TIMER_STATS
753 if (timer->start_site) 762 if (timer->start_site)
754 return; 763 return;
755 764 timer->start_site = __builtin_return_address(0);
756 timer->start_site = addr;
757 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 765 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
758 timer->start_pid = current->pid; 766 timer->start_pid = current->pid;
767#endif
759} 768}
769
770static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
771{
772#ifdef CONFIG_TIMER_STATS
773 timer->start_site = NULL;
774#endif
775}
776
777static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
778{
779#ifdef CONFIG_TIMER_STATS
780 if (likely(!timer_stats_active))
781 return;
782 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
783 timer->function, timer->start_comm, 0);
760#endif 784#endif
785}
761 786
762/* 787/*
763 * Counterpart to lock_hrtimer_base above: 788 * Counterpart to lock_hrtimer_base above:
@@ -765,7 +790,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
765static inline 790static inline
766void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 791void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
767{ 792{
768 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); 793 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
769} 794}
770 795
771/** 796/**
@@ -1098,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void)
1098 unsigned long flags; 1123 unsigned long flags;
1099 int i; 1124 int i;
1100 1125
1101 spin_lock_irqsave(&cpu_base->lock, flags); 1126 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1102 1127
1103 if (!hrtimer_hres_active()) { 1128 if (!hrtimer_hres_active()) {
1104 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
@@ -1115,7 +1140,7 @@ ktime_t hrtimer_get_next_event(void)
1115 } 1140 }
1116 } 1141 }
1117 1142
1118 spin_unlock_irqrestore(&cpu_base->lock, flags); 1143 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1119 1144
1120 if (mindelta.tv64 < 0) 1145 if (mindelta.tv64 < 0)
1121 mindelta.tv64 = 0; 1146 mindelta.tv64 = 0;
@@ -1197,11 +1222,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1197 * they get migrated to another cpu, therefore its safe to unlock 1222 * they get migrated to another cpu, therefore its safe to unlock
1198 * the timer base. 1223 * the timer base.
1199 */ 1224 */
1200 spin_unlock(&cpu_base->lock); 1225 raw_spin_unlock(&cpu_base->lock);
1201 trace_hrtimer_expire_entry(timer, now); 1226 trace_hrtimer_expire_entry(timer, now);
1202 restart = fn(timer); 1227 restart = fn(timer);
1203 trace_hrtimer_expire_exit(timer); 1228 trace_hrtimer_expire_exit(timer);
1204 spin_lock(&cpu_base->lock); 1229 raw_spin_lock(&cpu_base->lock);
1205 1230
1206 /* 1231 /*
1207 * Note: We clear the CALLBACK bit after enqueue_hrtimer and 1232 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
@@ -1217,29 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1217 1242
1218#ifdef CONFIG_HIGH_RES_TIMERS 1243#ifdef CONFIG_HIGH_RES_TIMERS
1219 1244
1220static int force_clock_reprogram;
1221
1222/*
1223 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1224 * is hanging, which could happen with something that slows the interrupt
1225 * such as the tracing. Then we force the clock reprogramming for each future
1226 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1227 * threshold that we will overwrite.
1228 * The next tick event will be scheduled to 3 times we currently spend on
1229 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1230 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1231 * let it running without serious starvation.
1232 */
1233
1234static inline void
1235hrtimer_interrupt_hanging(struct clock_event_device *dev,
1236 ktime_t try_time)
1237{
1238 force_clock_reprogram = 1;
1239 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1240 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1241 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1242}
1243/* 1245/*
1244 * High resolution timer interrupt 1246 * High resolution timer interrupt
1245 * Called with interrupts disabled 1247 * Called with interrupts disabled
@@ -1248,24 +1250,18 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1248{ 1250{
1249 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1251 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1250 struct hrtimer_clock_base *base; 1252 struct hrtimer_clock_base *base;
1251 ktime_t expires_next, now; 1253 ktime_t expires_next, now, entry_time, delta;
1252 int nr_retries = 0; 1254 int i, retries = 0;
1253 int i;
1254 1255
1255 BUG_ON(!cpu_base->hres_active); 1256 BUG_ON(!cpu_base->hres_active);
1256 cpu_base->nr_events++; 1257 cpu_base->nr_events++;
1257 dev->next_event.tv64 = KTIME_MAX; 1258 dev->next_event.tv64 = KTIME_MAX;
1258 1259
1259 retry: 1260 entry_time = now = ktime_get();
1260 /* 5 retries is enough to notice a hang */ 1261retry:
1261 if (!(++nr_retries % 5))
1262 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1263
1264 now = ktime_get();
1265
1266 expires_next.tv64 = KTIME_MAX; 1262 expires_next.tv64 = KTIME_MAX;
1267 1263
1268 spin_lock(&cpu_base->lock); 1264 raw_spin_lock(&cpu_base->lock);
1269 /* 1265 /*
1270 * We set expires_next to KTIME_MAX here with cpu_base->lock 1266 * We set expires_next to KTIME_MAX here with cpu_base->lock
1271 * held to prevent that a timer is enqueued in our queue via 1267 * held to prevent that a timer is enqueued in our queue via
@@ -1321,13 +1317,51 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1321 * against it. 1317 * against it.
1322 */ 1318 */
1323 cpu_base->expires_next = expires_next; 1319 cpu_base->expires_next = expires_next;
1324 spin_unlock(&cpu_base->lock); 1320 raw_spin_unlock(&cpu_base->lock);
1325 1321
1326 /* Reprogramming necessary ? */ 1322 /* Reprogramming necessary ? */
1327 if (expires_next.tv64 != KTIME_MAX) { 1323 if (expires_next.tv64 == KTIME_MAX ||
1328 if (tick_program_event(expires_next, force_clock_reprogram)) 1324 !tick_program_event(expires_next, 0)) {
1329 goto retry; 1325 cpu_base->hang_detected = 0;
1326 return;
1330 } 1327 }
1328
1329 /*
1330 * The next timer was already expired due to:
1331 * - tracing
1332 * - long lasting callbacks
1333 * - being scheduled away when running in a VM
1334 *
1335 * We need to prevent that we loop forever in the hrtimer
1336 * interrupt routine. We give it 3 attempts to avoid
1337 * overreacting on some spurious event.
1338 */
1339 now = ktime_get();
1340 cpu_base->nr_retries++;
1341 if (++retries < 3)
1342 goto retry;
1343 /*
1344 * Give the system a chance to do something else than looping
1345 * here. We stored the entry time, so we know exactly how long
1346 * we spent here. We schedule the next event this amount of
1347 * time away.
1348 */
1349 cpu_base->nr_hangs++;
1350 cpu_base->hang_detected = 1;
1351 delta = ktime_sub(now, entry_time);
1352 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1353 cpu_base->max_hang_time = delta;
1354 /*
1355 * Limit it to a sensible value as we enforce a longer
1356 * delay. Give the CPU at most 100ms to catch up.
1357 */
1358 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1359 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1360 else
1361 expires_next = ktime_add(now, delta);
1362 tick_program_event(expires_next, 1);
1363 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1364 ktime_to_ns(delta));
1331} 1365}
1332 1366
1333/* 1367/*
@@ -1423,7 +1457,7 @@ void hrtimer_run_queues(void)
1423 gettime = 0; 1457 gettime = 0;
1424 } 1458 }
1425 1459
1426 spin_lock(&cpu_base->lock); 1460 raw_spin_lock(&cpu_base->lock);
1427 1461
1428 while ((node = base->first)) { 1462 while ((node = base->first)) {
1429 struct hrtimer *timer; 1463 struct hrtimer *timer;
@@ -1435,7 +1469,7 @@ void hrtimer_run_queues(void)
1435 1469
1436 __run_hrtimer(timer, &base->softirq_time); 1470 __run_hrtimer(timer, &base->softirq_time);
1437 } 1471 }
1438 spin_unlock(&cpu_base->lock); 1472 raw_spin_unlock(&cpu_base->lock);
1439 } 1473 }
1440} 1474}
1441 1475
@@ -1591,7 +1625,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1625 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1592 int i; 1626 int i;
1593 1627
1594 spin_lock_init(&cpu_base->lock); 1628 raw_spin_lock_init(&cpu_base->lock);
1595 1629
1596 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1630 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1597 cpu_base->clock_base[i].cpu_base = cpu_base; 1631 cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1649,16 +1683,16 @@ static void migrate_hrtimers(int scpu)
1649 * The caller is globally serialized and nobody else 1683 * The caller is globally serialized and nobody else
1650 * takes two locks at once, deadlock is not possible. 1684 * takes two locks at once, deadlock is not possible.
1651 */ 1685 */
1652 spin_lock(&new_base->lock); 1686 raw_spin_lock(&new_base->lock);
1653 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1687 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1654 1688
1655 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1689 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1656 migrate_hrtimer_list(&old_base->clock_base[i], 1690 migrate_hrtimer_list(&old_base->clock_base[i],
1657 &new_base->clock_base[i]); 1691 &new_base->clock_base[i]);
1658 } 1692 }
1659 1693
1660 spin_unlock(&old_base->lock); 1694 raw_spin_unlock(&old_base->lock);
1661 spin_unlock(&new_base->lock); 1695 raw_spin_unlock(&new_base->lock);
1662 1696
1663 /* Check, if we got expired work to do */ 1697 /* Check, if we got expired work to do */
1664 __hrtimer_peek_ahead_timers(); 1698 __hrtimer_peek_ahead_timers();
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d4e84174740..0c642d51aac 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
144 144
145 rcu_read_lock(); 145 rcu_read_lock();
146 do_each_thread(g, t) { 146 do_each_thread(g, t) {
147 if (!--max_count) 147 if (!max_count--)
148 goto unlock; 148 goto unlock;
149 if (!--batch_count) { 149 if (!--batch_count) {
150 batch_count = HUNG_TASK_BATCHING; 150 batch_count = HUNG_TASK_BATCHING;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 00000000000..967e66143e1
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,493 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) IBM Corporation, 2009
18 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Thanks to Ingo Molnar for his many suggestions.
21 *
22 * Authors: Alan Stern <stern@rowland.harvard.edu>
23 * K.Prasad <prasad@linux.vnet.ibm.com>
24 * Frederic Weisbecker <fweisbec@gmail.com>
25 */
26
27/*
28 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
29 * using the CPU's debug registers.
30 * This file contains the arch-independent routines.
31 */
32
33#include <linux/irqflags.h>
34#include <linux/kallsyms.h>
35#include <linux/notifier.h>
36#include <linux/kprobes.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/percpu.h>
41#include <linux/sched.h>
42#include <linux/init.h>
43#include <linux/cpu.h>
44#include <linux/smp.h>
45
46#include <linux/hw_breakpoint.h>
47
48/*
49 * Constraints data
50 */
51
52/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
54
55/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
57
58/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
60
61/* Gather the number of total pinned and un-pinned bp in a cpuset */
62struct bp_busy_slots {
63 unsigned int pinned;
64 unsigned int flexible;
65};
66
67/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex);
69
70/*
71 * Report the maximum number of pinned breakpoints a task
72 * have in this cpu
73 */
74static unsigned int max_task_bp_pinned(int cpu)
75{
76 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
78
79 for (i = HBP_NUM -1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0)
81 return i + 1;
82 }
83
84 return 0;
85}
86
87static int task_bp_pinned(struct task_struct *tsk)
88{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list;
91 struct perf_event *bp;
92 unsigned long flags;
93 int count = 0;
94
95 if (WARN_ONCE(!ctx, "No perf context for this task"))
96 return 0;
97
98 list = &ctx->event_list;
99
100 raw_spin_lock_irqsave(&ctx->lock, flags);
101
102 /*
103 * The current breakpoint counter is not included in the list
104 * at the open() callback time
105 */
106 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++;
109 }
110
111 raw_spin_unlock_irqrestore(&ctx->lock, flags);
112
113 return count;
114}
115
116/*
117 * Report the number of pinned/un-pinned breakpoints we have in
118 * a given cpu (cpu > -1) or in all of them (cpu = -1).
119 */
120static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
122{
123 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task;
125
126 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
128 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu);
130 else
131 slots->pinned += task_bp_pinned(tsk);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu);
133
134 return;
135 }
136
137 for_each_online_cpu(cpu) {
138 unsigned int nr;
139
140 nr = per_cpu(nr_cpu_bp_pinned, cpu);
141 if (!tsk)
142 nr += max_task_bp_pinned(cpu);
143 else
144 nr += task_bp_pinned(tsk);
145
146 if (nr > slots->pinned)
147 slots->pinned = nr;
148
149 nr = per_cpu(nr_bp_flexible, cpu);
150
151 if (nr > slots->flexible)
152 slots->flexible = nr;
153 }
154}
155
156/*
157 * Add a pinned breakpoint for the given task in our constraint table
158 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
160{
161 unsigned int *tsk_pinned;
162 int count = 0;
163
164 count = task_bp_pinned(tsk);
165
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
167 if (enable) {
168 tsk_pinned[count]++;
169 if (count > 0)
170 tsk_pinned[count-1]--;
171 } else {
172 tsk_pinned[count]--;
173 if (count > 0)
174 tsk_pinned[count-1]++;
175 }
176}
177
178/*
179 * Add/remove the given breakpoint in our constraint table
180 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable)
182{
183 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task;
185
186 /* Pinned counter task profiling */
187 if (tsk) {
188 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable);
190 return;
191 }
192
193 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable);
195 return;
196 }
197
198 /* Pinned counter cpu profiling */
199 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
201 else
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
203}
204
205/*
206 * Constraints to check before allowing this new breakpoint counter:
207 *
208 * == Non-pinned counter == (Considered as pinned for now)
209 *
210 * - If attached to a single cpu, check:
211 *
212 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
213 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
214 *
215 * -> If there are already non-pinned counters in this cpu, it means
216 * there is already a free slot for them.
217 * Otherwise, we check that the maximum number of per task
218 * breakpoints (for this cpu) plus the number of per cpu breakpoint
219 * (for this cpu) doesn't cover every register.
220 *
221 * - If attached to every cpus, check:
222 *
223 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
224 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
225 *
226 * -> This is roughly the same, except we check the number of per cpu
227 * bp for every cpu and we keep the max one. Same for the per tasks
228 * breakpoints.
229 *
230 *
231 * == Pinned counter ==
232 *
233 * - If attached to a single cpu, check:
234 *
235 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
236 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
237 *
238 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
239 * one register at least (or they will never be fed).
240 *
241 * - If attached to every cpus, check:
242 *
243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
245 */
246static int __reserve_bp_slot(struct perf_event *bp)
247{
248 struct bp_busy_slots slots = {0};
249
250 fetch_bp_busy_slots(&slots, bp);
251
252 /* Flexible counters need to keep at least one slot */
253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
254 return -ENOSPC;
255
256 toggle_bp_slot(bp, true);
257
258 return 0;
259}
260
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
269 mutex_unlock(&nr_bp_mutex);
270
271 return ret;
272}
273
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
279void release_bp_slot(struct perf_event *bp)
280{
281 mutex_lock(&nr_bp_mutex);
282
283 __release_bp_slot(bp);
284
285 mutex_unlock(&nr_bp_mutex);
286}
287
288/*
289 * Allow the kernel debugger to reserve breakpoint slots without
290 * taking a lock using the dbg_* variant of for the reserve and
291 * release breakpoint slots.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
307
308 return 0;
309}
310
311int register_perf_hw_breakpoint(struct perf_event *bp)
312{
313 int ret;
314
315 ret = reserve_bp_slot(bp);
316 if (ret)
317 return ret;
318
319 /*
320 * Ptrace breakpoints can be temporary perf events only
321 * meant to reserve a slot. In this case, it is created disabled and
322 * we don't want to check the params right now (as we put a null addr)
323 * But perf tools create events as disabled and we want to check
324 * the params for them.
325 * This is a quick hack that will be removed soon, once we remove
326 * the tmp breakpoints from ptrace
327 */
328 if (!bp->attr.disabled || !bp->overflow_handler)
329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
330
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret)
333 release_bp_slot(bp);
334
335 return ret;
336}
337
338/**
339 * register_user_hw_breakpoint - register a hardware breakpoint for user space
340 * @attr: breakpoint attributes
341 * @triggered: callback to trigger when we hit the breakpoint
342 * @tsk: pointer to 'task_struct' of the process to which the address belongs
343 */
344struct perf_event *
345register_user_hw_breakpoint(struct perf_event_attr *attr,
346 perf_overflow_handler_t triggered,
347 struct task_struct *tsk)
348{
349 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
350}
351EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
352
353/**
354 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
355 * @bp: the breakpoint structure to modify
356 * @attr: new breakpoint attributes
357 * @triggered: callback to trigger when we hit the breakpoint
358 * @tsk: pointer to 'task_struct' of the process to which the address belongs
359 */
360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
361{
362 u64 old_addr = bp->attr.bp_addr;
363 u64 old_len = bp->attr.bp_len;
364 int old_type = bp->attr.bp_type;
365 int err = 0;
366
367 perf_event_disable(bp);
368
369 bp->attr.bp_addr = attr->bp_addr;
370 bp->attr.bp_type = attr->bp_type;
371 bp->attr.bp_len = attr->bp_len;
372
373 if (attr->disabled)
374 goto end;
375
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
377 if (!err)
378 perf_event_enable(bp);
379
380 if (err) {
381 bp->attr.bp_addr = old_addr;
382 bp->attr.bp_type = old_type;
383 bp->attr.bp_len = old_len;
384 if (!bp->attr.disabled)
385 perf_event_enable(bp);
386
387 return err;
388 }
389
390end:
391 bp->attr.disabled = attr->disabled;
392
393 return 0;
394}
395EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
396
/**
 * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
 * @bp: the breakpoint structure to unregister
 *
 * A NULL @bp is accepted and ignored.
 */
void unregister_hw_breakpoint(struct perf_event *bp)
{
	if (bp)
		perf_event_release_kernel(bp);
}
EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
408
409/**
410 * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
411 * @attr: breakpoint attributes
412 * @triggered: callback to trigger when we hit the breakpoint
413 *
414 * @return a set of per_cpu pointers to perf events
415 */
416struct perf_event **
417register_wide_hw_breakpoint(struct perf_event_attr *attr,
418 perf_overflow_handler_t triggered)
419{
420 struct perf_event **cpu_events, **pevent, *bp;
421 long err;
422 int cpu;
423
424 cpu_events = alloc_percpu(typeof(*cpu_events));
425 if (!cpu_events)
426 return ERR_PTR(-ENOMEM);
427
428 get_online_cpus();
429 for_each_online_cpu(cpu) {
430 pevent = per_cpu_ptr(cpu_events, cpu);
431 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
432
433 *pevent = bp;
434
435 if (IS_ERR(bp)) {
436 err = PTR_ERR(bp);
437 goto fail;
438 }
439 }
440 put_online_cpus();
441
442 return cpu_events;
443
444fail:
445 for_each_online_cpu(cpu) {
446 pevent = per_cpu_ptr(cpu_events, cpu);
447 if (IS_ERR(*pevent))
448 break;
449 unregister_hw_breakpoint(*pevent);
450 }
451 put_online_cpus();
452
453 free_percpu(cpu_events);
454 return ERR_PTR(err);
455}
456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
457
458/**
459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
460 * @cpu_events: the per cpu set of events to unregister
461 */
462void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
463{
464 int cpu;
465 struct perf_event **pevent;
466
467 for_each_possible_cpu(cpu) {
468 pevent = per_cpu_ptr(cpu_events, cpu);
469 unregister_hw_breakpoint(*pevent);
470 }
471 free_percpu(cpu_events);
472}
473EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
474
475static struct notifier_block hw_breakpoint_exceptions_nb = {
476 .notifier_call = hw_breakpoint_exceptions_notify,
477 /* we need to be notified first */
478 .priority = 0x7fffffff
479};
480
481static int __init init_hw_breakpoint(void)
482{
483 return register_die_notifier(&hw_breakpoint_exceptions_nb);
484}
485core_initcall(init_hw_breakpoint);
486
487
488struct pmu perf_ops_bp = {
489 .enable = arch_install_hw_breakpoint,
490 .disable = arch_uninstall_hw_breakpoint,
491 .read = hw_breakpoint_pmu_read,
492 .unthrottle = hw_breakpoint_pmu_unthrottle
493};
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 1de9700f416..2295a31ef11 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -45,7 +45,7 @@ unsigned long probe_irq_on(void)
45 * flush such a longstanding irq before considering it as spurious. 45 * flush such a longstanding irq before considering it as spurious.
46 */ 46 */
47 for_each_irq_desc_reverse(i, desc) { 47 for_each_irq_desc_reverse(i, desc) {
48 spin_lock_irq(&desc->lock); 48 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
50 /* 50 /*
51 * An old-style architecture might still have 51 * An old-style architecture might still have
@@ -61,7 +61,7 @@ unsigned long probe_irq_on(void)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->chip->set_type(i, IRQ_TYPE_PROBE);
62 desc->chip->startup(i); 62 desc->chip->startup(i);
63 } 63 }
64 spin_unlock_irq(&desc->lock); 64 raw_spin_unlock_irq(&desc->lock);
65 } 65 }
66 66
67 /* Wait for longstanding interrupts to trigger. */ 67 /* Wait for longstanding interrupts to trigger. */
@@ -73,13 +73,13 @@ unsigned long probe_irq_on(void)
73 * happened in the previous stage, it may have masked itself) 73 * happened in the previous stage, it may have masked itself)
74 */ 74 */
75 for_each_irq_desc_reverse(i, desc) { 75 for_each_irq_desc_reverse(i, desc) {
76 spin_lock_irq(&desc->lock); 76 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 79 if (desc->chip->startup(i))
80 desc->status |= IRQ_PENDING; 80 desc->status |= IRQ_PENDING;
81 } 81 }
82 spin_unlock_irq(&desc->lock); 82 raw_spin_unlock_irq(&desc->lock);
83 } 83 }
84 84
85 /* 85 /*
@@ -91,7 +91,7 @@ unsigned long probe_irq_on(void)
91 * Now filter out any obviously spurious interrupts 91 * Now filter out any obviously spurious interrupts
92 */ 92 */
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 spin_lock_irq(&desc->lock); 94 raw_spin_lock_irq(&desc->lock);
95 status = desc->status; 95 status = desc->status;
96 96
97 if (status & IRQ_AUTODETECT) { 97 if (status & IRQ_AUTODETECT) {
@@ -103,7 +103,7 @@ unsigned long probe_irq_on(void)
103 if (i < 32) 103 if (i < 32)
104 mask |= 1 << i; 104 mask |= 1 << i;
105 } 105 }
106 spin_unlock_irq(&desc->lock); 106 raw_spin_unlock_irq(&desc->lock);
107 } 107 }
108 108
109 return mask; 109 return mask;
@@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val)
129 int i; 129 int i;
130 130
131 for_each_irq_desc(i, desc) { 131 for_each_irq_desc(i, desc) {
132 spin_lock_irq(&desc->lock); 132 raw_spin_lock_irq(&desc->lock);
133 status = desc->status; 133 status = desc->status;
134 134
135 if (status & IRQ_AUTODETECT) { 135 if (status & IRQ_AUTODETECT) {
@@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val)
139 desc->status = status & ~IRQ_AUTODETECT; 139 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 140 desc->chip->shutdown(i);
141 } 141 }
142 spin_unlock_irq(&desc->lock); 142 raw_spin_unlock_irq(&desc->lock);
143 } 143 }
144 mutex_unlock(&probing_active); 144 mutex_unlock(&probing_active);
145 145
@@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val)
171 unsigned int status; 171 unsigned int status;
172 172
173 for_each_irq_desc(i, desc) { 173 for_each_irq_desc(i, desc) {
174 spin_lock_irq(&desc->lock); 174 raw_spin_lock_irq(&desc->lock);
175 status = desc->status; 175 status = desc->status;
176 176
177 if (status & IRQ_AUTODETECT) { 177 if (status & IRQ_AUTODETECT) {
@@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val)
183 desc->status = status & ~IRQ_AUTODETECT; 183 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 184 desc->chip->shutdown(i);
185 } 185 }
186 spin_unlock_irq(&desc->lock); 186 raw_spin_unlock_irq(&desc->lock);
187 } 187 }
188 mutex_unlock(&probing_active); 188 mutex_unlock(&probing_active);
189 189
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d11..ecc3fa28f66 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq)
34 } 34 }
35 35
36 /* Ensure we don't have left over values from a previous use of this irq */ 36 /* Ensure we don't have left over values from a previous use of this irq */
37 spin_lock_irqsave(&desc->lock, flags); 37 raw_spin_lock_irqsave(&desc->lock, flags);
38 desc->status = IRQ_DISABLED; 38 desc->status = IRQ_DISABLED;
39 desc->chip = &no_irq_chip; 39 desc->chip = &no_irq_chip;
40 desc->handle_irq = handle_bad_irq; 40 desc->handle_irq = handle_bad_irq;
@@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq)
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
52#endif 52#endif
53#endif 53#endif
54 spin_unlock_irqrestore(&desc->lock, flags); 54 raw_spin_unlock_irqrestore(&desc->lock, flags);
55} 55}
56 56
57/** 57/**
@@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq)
68 return; 68 return;
69 } 69 }
70 70
71 spin_lock_irqsave(&desc->lock, flags); 71 raw_spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 72 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 73 raw_spin_unlock_irqrestore(&desc->lock, flags);
74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", 74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 75 irq);
76 return; 76 return;
@@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq)
82 desc->chip = &no_irq_chip; 82 desc->chip = &no_irq_chip;
83 desc->name = NULL; 83 desc->name = NULL;
84 clear_kstat_irqs(desc); 84 clear_kstat_irqs(desc);
85 spin_unlock_irqrestore(&desc->lock, flags); 85 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 86}
87 87
88 88
@@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
104 if (!chip) 104 if (!chip)
105 chip = &no_irq_chip; 105 chip = &no_irq_chip;
106 106
107 spin_lock_irqsave(&desc->lock, flags); 107 raw_spin_lock_irqsave(&desc->lock, flags);
108 irq_chip_set_defaults(chip); 108 irq_chip_set_defaults(chip);
109 desc->chip = chip; 109 desc->chip = chip;
110 spin_unlock_irqrestore(&desc->lock, flags); 110 raw_spin_unlock_irqrestore(&desc->lock, flags);
111 111
112 return 0; 112 return 0;
113} 113}
@@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type)
133 if (type == IRQ_TYPE_NONE) 133 if (type == IRQ_TYPE_NONE)
134 return 0; 134 return 0;
135 135
136 spin_lock_irqsave(&desc->lock, flags); 136 raw_spin_lock_irqsave(&desc->lock, flags);
137 ret = __irq_set_trigger(desc, irq, type); 137 ret = __irq_set_trigger(desc, irq, type);
138 spin_unlock_irqrestore(&desc->lock, flags); 138 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return ret; 139 return ret;
140} 140}
141EXPORT_SYMBOL(set_irq_type); 141EXPORT_SYMBOL(set_irq_type);
@@ -158,19 +158,19 @@ int set_irq_data(unsigned int irq, void *data)
158 return -EINVAL; 158 return -EINVAL;
159 } 159 }
160 160
161 spin_lock_irqsave(&desc->lock, flags); 161 raw_spin_lock_irqsave(&desc->lock, flags);
162 desc->handler_data = data; 162 desc->handler_data = data;
163 spin_unlock_irqrestore(&desc->lock, flags); 163 raw_spin_unlock_irqrestore(&desc->lock, flags);
164 return 0; 164 return 0;
165} 165}
166EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
167 167
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_msi - set MSI descriptor data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @entry: Pointer to MSI descriptor data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the MSI descriptor entry for an irq
174 */ 174 */
175int set_irq_msi(unsigned int irq, struct msi_desc *entry) 175int set_irq_msi(unsigned int irq, struct msi_desc *entry)
176{ 176{
@@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
183 return -EINVAL; 183 return -EINVAL;
184 } 184 }
185 185
186 spin_lock_irqsave(&desc->lock, flags); 186 raw_spin_lock_irqsave(&desc->lock, flags);
187 desc->msi_desc = entry; 187 desc->msi_desc = entry;
188 if (entry) 188 if (entry)
189 entry->irq = irq; 189 entry->irq = irq;
190 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return 0; 191 return 0;
192} 192}
193 193
@@ -214,9 +214,9 @@ int set_irq_chip_data(unsigned int irq, void *data)
214 return -EINVAL; 214 return -EINVAL;
215 } 215 }
216 216
217 spin_lock_irqsave(&desc->lock, flags); 217 raw_spin_lock_irqsave(&desc->lock, flags);
218 desc->chip_data = data; 218 desc->chip_data = data;
219 spin_unlock_irqrestore(&desc->lock, flags); 219 raw_spin_unlock_irqrestore(&desc->lock, flags);
220 220
221 return 0; 221 return 0;
222} 222}
@@ -241,12 +241,12 @@ void set_irq_nested_thread(unsigned int irq, int nest)
241 if (!desc) 241 if (!desc)
242 return; 242 return;
243 243
244 spin_lock_irqsave(&desc->lock, flags); 244 raw_spin_lock_irqsave(&desc->lock, flags);
245 if (nest) 245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD; 246 desc->status |= IRQ_NESTED_THREAD;
247 else 247 else
248 desc->status &= ~IRQ_NESTED_THREAD; 248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags); 249 raw_spin_unlock_irqrestore(&desc->lock, flags);
250} 250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread); 251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252 252
@@ -343,7 +343,7 @@ void handle_nested_irq(unsigned int irq)
343 343
344 might_sleep(); 344 might_sleep();
345 345
346 spin_lock_irq(&desc->lock); 346 raw_spin_lock_irq(&desc->lock);
347 347
348 kstat_incr_irqs_this_cpu(irq, desc); 348 kstat_incr_irqs_this_cpu(irq, desc);
349 349
@@ -352,17 +352,17 @@ void handle_nested_irq(unsigned int irq)
352 goto out_unlock; 352 goto out_unlock;
353 353
354 desc->status |= IRQ_INPROGRESS; 354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock); 355 raw_spin_unlock_irq(&desc->lock);
356 356
357 action_ret = action->thread_fn(action->irq, action->dev_id); 357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug) 358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret); 359 note_interrupt(irq, desc, action_ret);
360 360
361 spin_lock_irq(&desc->lock); 361 raw_spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS; 362 desc->status &= ~IRQ_INPROGRESS;
363 363
364out_unlock: 364out_unlock:
365 spin_unlock_irq(&desc->lock); 365 raw_spin_unlock_irq(&desc->lock);
366} 366}
367EXPORT_SYMBOL_GPL(handle_nested_irq); 367EXPORT_SYMBOL_GPL(handle_nested_irq);
368 368
@@ -384,7 +384,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
384 struct irqaction *action; 384 struct irqaction *action;
385 irqreturn_t action_ret; 385 irqreturn_t action_ret;
386 386
387 spin_lock(&desc->lock); 387 raw_spin_lock(&desc->lock);
388 388
389 if (unlikely(desc->status & IRQ_INPROGRESS)) 389 if (unlikely(desc->status & IRQ_INPROGRESS))
390 goto out_unlock; 390 goto out_unlock;
@@ -396,16 +396,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
396 goto out_unlock; 396 goto out_unlock;
397 397
398 desc->status |= IRQ_INPROGRESS; 398 desc->status |= IRQ_INPROGRESS;
399 spin_unlock(&desc->lock); 399 raw_spin_unlock(&desc->lock);
400 400
401 action_ret = handle_IRQ_event(irq, action); 401 action_ret = handle_IRQ_event(irq, action);
402 if (!noirqdebug) 402 if (!noirqdebug)
403 note_interrupt(irq, desc, action_ret); 403 note_interrupt(irq, desc, action_ret);
404 404
405 spin_lock(&desc->lock); 405 raw_spin_lock(&desc->lock);
406 desc->status &= ~IRQ_INPROGRESS; 406 desc->status &= ~IRQ_INPROGRESS;
407out_unlock: 407out_unlock:
408 spin_unlock(&desc->lock); 408 raw_spin_unlock(&desc->lock);
409} 409}
410 410
411/** 411/**
@@ -424,7 +424,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
424 struct irqaction *action; 424 struct irqaction *action;
425 irqreturn_t action_ret; 425 irqreturn_t action_ret;
426 426
427 spin_lock(&desc->lock); 427 raw_spin_lock(&desc->lock);
428 mask_ack_irq(desc, irq); 428 mask_ack_irq(desc, irq);
429 429
430 if (unlikely(desc->status & IRQ_INPROGRESS)) 430 if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -441,13 +441,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
441 goto out_unlock; 441 goto out_unlock;
442 442
443 desc->status |= IRQ_INPROGRESS; 443 desc->status |= IRQ_INPROGRESS;
444 spin_unlock(&desc->lock); 444 raw_spin_unlock(&desc->lock);
445 445
446 action_ret = handle_IRQ_event(irq, action); 446 action_ret = handle_IRQ_event(irq, action);
447 if (!noirqdebug) 447 if (!noirqdebug)
448 note_interrupt(irq, desc, action_ret); 448 note_interrupt(irq, desc, action_ret);
449 449
450 spin_lock(&desc->lock); 450 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
452 452
453 if (unlikely(desc->status & IRQ_ONESHOT)) 453 if (unlikely(desc->status & IRQ_ONESHOT))
@@ -455,7 +455,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
457out_unlock: 457out_unlock:
458 spin_unlock(&desc->lock); 458 raw_spin_unlock(&desc->lock);
459} 459}
460EXPORT_SYMBOL_GPL(handle_level_irq); 460EXPORT_SYMBOL_GPL(handle_level_irq);
461 461
@@ -475,7 +475,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
475 struct irqaction *action; 475 struct irqaction *action;
476 irqreturn_t action_ret; 476 irqreturn_t action_ret;
477 477
478 spin_lock(&desc->lock); 478 raw_spin_lock(&desc->lock);
479 479
480 if (unlikely(desc->status & IRQ_INPROGRESS)) 480 if (unlikely(desc->status & IRQ_INPROGRESS))
481 goto out; 481 goto out;
@@ -497,18 +497,18 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
497 497
498 desc->status |= IRQ_INPROGRESS; 498 desc->status |= IRQ_INPROGRESS;
499 desc->status &= ~IRQ_PENDING; 499 desc->status &= ~IRQ_PENDING;
500 spin_unlock(&desc->lock); 500 raw_spin_unlock(&desc->lock);
501 501
502 action_ret = handle_IRQ_event(irq, action); 502 action_ret = handle_IRQ_event(irq, action);
503 if (!noirqdebug) 503 if (!noirqdebug)
504 note_interrupt(irq, desc, action_ret); 504 note_interrupt(irq, desc, action_ret);
505 505
506 spin_lock(&desc->lock); 506 raw_spin_lock(&desc->lock);
507 desc->status &= ~IRQ_INPROGRESS; 507 desc->status &= ~IRQ_INPROGRESS;
508out: 508out:
509 desc->chip->eoi(irq); 509 desc->chip->eoi(irq);
510 510
511 spin_unlock(&desc->lock); 511 raw_spin_unlock(&desc->lock);
512} 512}
513 513
514/** 514/**
@@ -530,7 +530,7 @@ out:
530void 530void
531handle_edge_irq(unsigned int irq, struct irq_desc *desc) 531handle_edge_irq(unsigned int irq, struct irq_desc *desc)
532{ 532{
533 spin_lock(&desc->lock); 533 raw_spin_lock(&desc->lock);
534 534
535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
536 536
@@ -576,21 +576,21 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
576 } 576 }
577 577
578 desc->status &= ~IRQ_PENDING; 578 desc->status &= ~IRQ_PENDING;
579 spin_unlock(&desc->lock); 579 raw_spin_unlock(&desc->lock);
580 action_ret = handle_IRQ_event(irq, action); 580 action_ret = handle_IRQ_event(irq, action);
581 if (!noirqdebug) 581 if (!noirqdebug)
582 note_interrupt(irq, desc, action_ret); 582 note_interrupt(irq, desc, action_ret);
583 spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
584 584
585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
586 586
587 desc->status &= ~IRQ_INPROGRESS; 587 desc->status &= ~IRQ_INPROGRESS;
588out_unlock: 588out_unlock:
589 spin_unlock(&desc->lock); 589 raw_spin_unlock(&desc->lock);
590} 590}
591 591
592/** 592/**
593 * handle_percpu_IRQ - Per CPU local irq handler 593 * handle_percpu_irq - Per CPU local irq handler
594 * @irq: the interrupt number 594 * @irq: the interrupt number
595 * @desc: the interrupt description structure for this irq 595 * @desc: the interrupt description structure for this irq
596 * 596 *
@@ -643,7 +643,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
643 } 643 }
644 644
645 chip_bus_lock(irq, desc); 645 chip_bus_lock(irq, desc);
646 spin_lock_irqsave(&desc->lock, flags); 646 raw_spin_lock_irqsave(&desc->lock, flags);
647 647
648 /* Uninstall? */ 648 /* Uninstall? */
649 if (handle == handle_bad_irq) { 649 if (handle == handle_bad_irq) {
@@ -661,7 +661,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
661 desc->depth = 0; 661 desc->depth = 0;
662 desc->chip->startup(irq); 662 desc->chip->startup(irq);
663 } 663 }
664 spin_unlock_irqrestore(&desc->lock, flags); 664 raw_spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc); 665 chip_bus_sync_unlock(irq, desc);
666} 666}
667EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -692,9 +692,9 @@ void __init set_irq_noprobe(unsigned int irq)
692 return; 692 return;
693 } 693 }
694 694
695 spin_lock_irqsave(&desc->lock, flags); 695 raw_spin_lock_irqsave(&desc->lock, flags);
696 desc->status |= IRQ_NOPROBE; 696 desc->status |= IRQ_NOPROBE;
697 spin_unlock_irqrestore(&desc->lock, flags); 697 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 698}
699 699
700void __init set_irq_probe(unsigned int irq) 700void __init set_irq_probe(unsigned int irq)
@@ -707,7 +707,7 @@ void __init set_irq_probe(unsigned int irq)
707 return; 707 return;
708 } 708 }
709 709
710 spin_lock_irqsave(&desc->lock, flags); 710 raw_spin_lock_irqsave(&desc->lock, flags);
711 desc->status &= ~IRQ_NOPROBE; 711 desc->status &= ~IRQ_NOPROBE;
712 spin_unlock_irqrestore(&desc->lock, flags); 712 raw_spin_unlock_irqrestore(&desc->lock, flags);
713} 713}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 17c71bb565c..814940e7f48 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -80,7 +80,7 @@ static struct irq_desc irq_desc_init = {
80 .chip = &no_irq_chip, 80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq, 81 .handle_irq = handle_bad_irq,
82 .depth = 1, 82 .depth = 1,
83 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84}; 84};
85 85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) 86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
@@ -108,7 +108,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
108{ 108{
109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
110 110
111 spin_lock_init(&desc->lock); 111 raw_spin_lock_init(&desc->lock);
112 desc->irq = irq; 112 desc->irq = irq;
113#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
114 desc->node = node; 114 desc->node = node;
@@ -130,7 +130,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
130/* 130/*
131 * Protect the sparse_irqs: 131 * Protect the sparse_irqs:
132 */ 132 */
133DEFINE_SPINLOCK(sparse_irq_lock); 133DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 134
135struct irq_desc **irq_desc_ptrs __read_mostly; 135struct irq_desc **irq_desc_ptrs __read_mostly;
136 136
@@ -141,7 +141,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
141 .chip = &no_irq_chip, 141 .chip = &no_irq_chip,
142 .handle_irq = handle_bad_irq, 142 .handle_irq = handle_bad_irq,
143 .depth = 1, 143 .depth = 1,
144 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 144 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
145 } 145 }
146}; 146};
147 147
@@ -212,7 +212,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
212 if (desc) 212 if (desc)
213 return desc; 213 return desc;
214 214
215 spin_lock_irqsave(&sparse_irq_lock, flags); 215 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 216
217 /* We have to check it to avoid races with another CPU */ 217 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 218 desc = irq_desc_ptrs[irq];
@@ -234,7 +234,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
234 irq_desc_ptrs[irq] = desc; 234 irq_desc_ptrs[irq] = desc;
235 235
236out_unlock: 236out_unlock:
237 spin_unlock_irqrestore(&sparse_irq_lock, flags); 237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
238 238
239 return desc; 239 return desc;
240} 240}
@@ -247,7 +247,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
247 .chip = &no_irq_chip, 247 .chip = &no_irq_chip,
248 .handle_irq = handle_bad_irq, 248 .handle_irq = handle_bad_irq,
249 .depth = 1, 249 .depth = 1,
250 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 250 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
251 } 251 }
252}; 252};
253 253
@@ -473,7 +473,7 @@ unsigned int __do_IRQ(unsigned int irq)
473 return 1; 473 return 1;
474 } 474 }
475 475
476 spin_lock(&desc->lock); 476 raw_spin_lock(&desc->lock);
477 if (desc->chip->ack) 477 if (desc->chip->ack)
478 desc->chip->ack(irq); 478 desc->chip->ack(irq);
479 /* 479 /*
@@ -517,13 +517,13 @@ unsigned int __do_IRQ(unsigned int irq)
517 for (;;) { 517 for (;;) {
518 irqreturn_t action_ret; 518 irqreturn_t action_ret;
519 519
520 spin_unlock(&desc->lock); 520 raw_spin_unlock(&desc->lock);
521 521
522 action_ret = handle_IRQ_event(irq, action); 522 action_ret = handle_IRQ_event(irq, action);
523 if (!noirqdebug) 523 if (!noirqdebug)
524 note_interrupt(irq, desc, action_ret); 524 note_interrupt(irq, desc, action_ret);
525 525
526 spin_lock(&desc->lock); 526 raw_spin_lock(&desc->lock);
527 if (likely(!(desc->status & IRQ_PENDING))) 527 if (likely(!(desc->status & IRQ_PENDING)))
528 break; 528 break;
529 desc->status &= ~IRQ_PENDING; 529 desc->status &= ~IRQ_PENDING;
@@ -536,7 +536,7 @@ out:
536 * disabled while the handler was running. 536 * disabled while the handler was running.
537 */ 537 */
538 desc->chip->end(irq); 538 desc->chip->end(irq);
539 spin_unlock(&desc->lock); 539 raw_spin_unlock(&desc->lock);
540 540
541 return 1; 541 return 1;
542} 542}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 1b5d742c6a7..b2821f070a3 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24/* irq_desc_ptrs allocated at boot time */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24..eb6078ca60c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq)
46 cpu_relax(); 46 cpu_relax();
47 47
48 /* Ok, that indicated we're done: double-check carefully. */ 48 /* Ok, that indicated we're done: double-check carefully. */
49 spin_lock_irqsave(&desc->lock, flags); 49 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 50 status = desc->status;
51 spin_unlock_irqrestore(&desc->lock, flags); 51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 52
53 /* Oops, that failed? */ 53 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 54 } while (status & IRQ_INPROGRESS);
@@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
114 if (!desc->chip->set_affinity) 114 if (!desc->chip->set_affinity)
115 return -EINVAL; 115 return -EINVAL;
116 116
117 spin_lock_irqsave(&desc->lock, flags); 117 raw_spin_lock_irqsave(&desc->lock, flags);
118 118
119#ifdef CONFIG_GENERIC_PENDING_IRQ 119#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
@@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
134 } 134 }
135#endif 135#endif
136 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
137 spin_unlock_irqrestore(&desc->lock, flags); 137 raw_spin_unlock_irqrestore(&desc->lock, flags);
138 return 0; 138 return 0;
139} 139}
140 140
@@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq)
181 unsigned long flags; 181 unsigned long flags;
182 int ret; 182 int ret;
183 183
184 spin_lock_irqsave(&desc->lock, flags); 184 raw_spin_lock_irqsave(&desc->lock, flags);
185 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
186 if (!ret) 186 if (!ret)
187 irq_set_thread_affinity(desc); 187 irq_set_thread_affinity(desc);
188 spin_unlock_irqrestore(&desc->lock, flags); 188 raw_spin_unlock_irqrestore(&desc->lock, flags);
189 189
190 return ret; 190 return ret;
191} 191}
@@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc); 233 chip_bus_lock(irq, desc);
234 spin_lock_irqsave(&desc->lock, flags); 234 raw_spin_lock_irqsave(&desc->lock, flags);
235 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
236 spin_unlock_irqrestore(&desc->lock, flags); 236 raw_spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc); 237 chip_bus_sync_unlock(irq, desc);
238} 238}
239EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
@@ -308,9 +308,9 @@ void enable_irq(unsigned int irq)
308 return; 308 return;
309 309
310 chip_bus_lock(irq, desc); 310 chip_bus_lock(irq, desc);
311 spin_lock_irqsave(&desc->lock, flags); 311 raw_spin_lock_irqsave(&desc->lock, flags);
312 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
313 spin_unlock_irqrestore(&desc->lock, flags); 313 raw_spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc); 314 chip_bus_sync_unlock(irq, desc);
315} 315}
316EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
@@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
347 /* wakeup-capable irqs can be shared between drivers that 347 /* wakeup-capable irqs can be shared between drivers that
348 * don't need to have the same sleep mode behaviors. 348 * don't need to have the same sleep mode behaviors.
349 */ 349 */
350 spin_lock_irqsave(&desc->lock, flags); 350 raw_spin_lock_irqsave(&desc->lock, flags);
351 if (on) { 351 if (on) {
352 if (desc->wake_depth++ == 0) { 352 if (desc->wake_depth++ == 0) {
353 ret = set_irq_wake_real(irq, on); 353 ret = set_irq_wake_real(irq, on);
@@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
368 } 368 }
369 } 369 }
370 370
371 spin_unlock_irqrestore(&desc->lock, flags); 371 raw_spin_unlock_irqrestore(&desc->lock, flags);
372 return ret; 372 return ret;
373} 373}
374EXPORT_SYMBOL(set_irq_wake); 374EXPORT_SYMBOL(set_irq_wake);
@@ -484,12 +484,12 @@ static int irq_wait_for_interrupt(struct irqaction *action)
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 485{
486 chip_bus_lock(irq, desc); 486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock); 487 raw_spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 490 desc->chip->unmask(irq);
491 } 491 }
492 spin_unlock_irq(&desc->lock); 492 raw_spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc); 493 chip_bus_sync_unlock(irq, desc);
494} 494}
495 495
@@ -514,9 +514,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
514 return; 514 return;
515 } 515 }
516 516
517 spin_lock_irq(&desc->lock); 517 raw_spin_lock_irq(&desc->lock);
518 cpumask_copy(mask, desc->affinity); 518 cpumask_copy(mask, desc->affinity);
519 spin_unlock_irq(&desc->lock); 519 raw_spin_unlock_irq(&desc->lock);
520 520
521 set_cpus_allowed_ptr(current, mask); 521 set_cpus_allowed_ptr(current, mask);
522 free_cpumask_var(mask); 522 free_cpumask_var(mask);
@@ -545,7 +545,7 @@ static int irq_thread(void *data)
545 545
546 atomic_inc(&desc->threads_active); 546 atomic_inc(&desc->threads_active);
547 547
548 spin_lock_irq(&desc->lock); 548 raw_spin_lock_irq(&desc->lock);
549 if (unlikely(desc->status & IRQ_DISABLED)) { 549 if (unlikely(desc->status & IRQ_DISABLED)) {
550 /* 550 /*
551 * CHECKME: We might need a dedicated 551 * CHECKME: We might need a dedicated
@@ -555,9 +555,9 @@ static int irq_thread(void *data)
555 * retriggers the interrupt itself --- tglx 555 * retriggers the interrupt itself --- tglx
556 */ 556 */
557 desc->status |= IRQ_PENDING; 557 desc->status |= IRQ_PENDING;
558 spin_unlock_irq(&desc->lock); 558 raw_spin_unlock_irq(&desc->lock);
559 } else { 559 } else {
560 spin_unlock_irq(&desc->lock); 560 raw_spin_unlock_irq(&desc->lock);
561 561
562 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563 563
@@ -679,7 +679,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
679 /* 679 /*
680 * The following block of code has to be executed atomically 680 * The following block of code has to be executed atomically
681 */ 681 */
682 spin_lock_irqsave(&desc->lock, flags); 682 raw_spin_lock_irqsave(&desc->lock, flags);
683 old_ptr = &desc->action; 683 old_ptr = &desc->action;
684 old = *old_ptr; 684 old = *old_ptr;
685 if (old) { 685 if (old) {
@@ -775,7 +775,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
775 __enable_irq(desc, irq, false); 775 __enable_irq(desc, irq, false);
776 } 776 }
777 777
778 spin_unlock_irqrestore(&desc->lock, flags); 778 raw_spin_unlock_irqrestore(&desc->lock, flags);
779 779
780 /* 780 /*
781 * Strictly no need to wake it up, but hung_task complains 781 * Strictly no need to wake it up, but hung_task complains
@@ -802,7 +802,7 @@ mismatch:
802 ret = -EBUSY; 802 ret = -EBUSY;
803 803
804out_thread: 804out_thread:
805 spin_unlock_irqrestore(&desc->lock, flags); 805 raw_spin_unlock_irqrestore(&desc->lock, flags);
806 if (new->thread) { 806 if (new->thread) {
807 struct task_struct *t = new->thread; 807 struct task_struct *t = new->thread;
808 808
@@ -844,7 +844,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
844 if (!desc) 844 if (!desc)
845 return NULL; 845 return NULL;
846 846
847 spin_lock_irqsave(&desc->lock, flags); 847 raw_spin_lock_irqsave(&desc->lock, flags);
848 848
849 /* 849 /*
850 * There can be multiple actions per IRQ descriptor, find the right 850 * There can be multiple actions per IRQ descriptor, find the right
@@ -856,7 +856,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
856 856
857 if (!action) { 857 if (!action) {
858 WARN(1, "Trying to free already-free IRQ %d\n", irq); 858 WARN(1, "Trying to free already-free IRQ %d\n", irq);
859 spin_unlock_irqrestore(&desc->lock, flags); 859 raw_spin_unlock_irqrestore(&desc->lock, flags);
860 860
861 return NULL; 861 return NULL;
862 } 862 }
@@ -884,7 +884,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 884 desc->chip->disable(irq);
885 } 885 }
886 886
887 spin_unlock_irqrestore(&desc->lock, flags); 887 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 888
889 unregister_handler_proc(irq, action); 889 unregister_handler_proc(irq, action);
890 890
@@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1067 kfree(action); 1067 kfree(action);
1068 1068
1069#ifdef CONFIG_DEBUG_SHIRQ 1069#ifdef CONFIG_DEBUG_SHIRQ
1070 if (irqflags & IRQF_SHARED) { 1070 if (!retval && (irqflags & IRQF_SHARED)) {
1071 /* 1071 /*
1072 * It's a shared IRQ -- the driver ought to be prepared for it 1072 * It's a shared IRQ -- the driver ought to be prepared for it
1073 * to happen immediately, so let's make sure.... 1073 * to happen immediately, so let's make sure....
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index fcb6c96f262..24196228083 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -27,7 +27,7 @@ void move_masked_irq(int irq)
27 if (!desc->chip->set_affinity) 27 if (!desc->chip->set_affinity)
28 return; 28 return;
29 29
30 assert_spin_locked(&desc->lock); 30 assert_raw_spin_locked(&desc->lock);
31 31
32 /* 32 /*
33 * If there was a valid mask to work with, please 33 * If there was a valid mask to work with, please
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 3fd30197da2..26bac9d8f86 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 raw_spin_lock_init(&desc->lock);
46 desc->node = node; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
@@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
67 67
68 irq = old_desc->irq; 68 irq = old_desc->irq;
69 69
70 spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_desc_ptrs[irq];
@@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 irq_desc_ptrs[irq] = desc;
94 spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
97 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
@@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
100 return desc; 100 return desc;
101 101
102out_unlock: 102out_unlock:
103 spin_unlock_irqrestore(&sparse_irq_lock, flags); 103 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
104 104
105 return desc; 105 return desc;
106} 106}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index a0bb09e7986..0d4005d85b0 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -28,9 +28,9 @@ void suspend_device_irqs(void)
28 for_each_irq_desc(irq, desc) { 28 for_each_irq_desc(irq, desc) {
29 unsigned long flags; 29 unsigned long flags;
30 30
31 spin_lock_irqsave(&desc->lock, flags); 31 raw_spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true); 32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags); 33 raw_spin_unlock_irqrestore(&desc->lock, flags);
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
@@ -56,9 +56,9 @@ void resume_device_irqs(void)
56 if (!(desc->status & IRQ_SUSPENDED)) 56 if (!(desc->status & IRQ_SUSPENDED))
57 continue; 57 continue;
58 58
59 spin_lock_irqsave(&desc->lock, flags); 59 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags); 61 raw_spin_unlock_irqrestore(&desc->lock, flags);
62 } 62 }
63} 63}
64EXPORT_SYMBOL_GPL(resume_device_irqs); 64EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 692363dd591..6f50eccc79c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -136,7 +136,7 @@ out:
136 136
137static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
138{ 138{
139 return single_open(file, default_affinity_show, NULL); 139 return single_open(file, default_affinity_show, PDE(inode)->data);
140} 140}
141 141
142static const struct file_operations default_affinity_proc_fops = { 142static const struct file_operations default_affinity_proc_fops = {
@@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = {
148}; 148};
149#endif 149#endif
150 150
151static int irq_spurious_read(char *page, char **start, off_t off, 151static int irq_spurious_proc_show(struct seq_file *m, void *v)
152 int count, int *eof, void *data)
153{ 152{
154 struct irq_desc *desc = irq_to_desc((long) data); 153 struct irq_desc *desc = irq_to_desc((long) m->private);
155 return sprintf(page, "count %u\n" 154
156 "unhandled %u\n" 155 seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
157 "last_unhandled %u ms\n", 156 desc->irq_count, desc->irqs_unhandled,
158 desc->irq_count, 157 jiffies_to_msecs(desc->last_unhandled));
159 desc->irqs_unhandled, 158 return 0;
160 jiffies_to_msecs(desc->last_unhandled)); 159}
160
161static int irq_spurious_proc_open(struct inode *inode, struct file *file)
162{
163 return single_open(file, irq_spurious_proc_show, NULL);
161} 164}
162 165
166static const struct file_operations irq_spurious_proc_fops = {
167 .open = irq_spurious_proc_open,
168 .read = seq_read,
169 .llseek = seq_lseek,
170 .release = single_release,
171};
172
163#define MAX_NAMELEN 128 173#define MAX_NAMELEN 128
164 174
165static int name_unique(unsigned int irq, struct irqaction *new_action) 175static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -169,7 +179,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
169 unsigned long flags; 179 unsigned long flags;
170 int ret = 1; 180 int ret = 1;
171 181
172 spin_lock_irqsave(&desc->lock, flags); 182 raw_spin_lock_irqsave(&desc->lock, flags);
173 for (action = desc->action ; action; action = action->next) { 183 for (action = desc->action ; action; action = action->next) {
174 if ((action != new_action) && action->name && 184 if ((action != new_action) && action->name &&
175 !strcmp(new_action->name, action->name)) { 185 !strcmp(new_action->name, action->name)) {
@@ -177,7 +187,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
177 break; 187 break;
178 } 188 }
179 } 189 }
180 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
181 return ret; 191 return ret;
182} 192}
183 193
@@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
204void register_irq_proc(unsigned int irq, struct irq_desc *desc) 214void register_irq_proc(unsigned int irq, struct irq_desc *desc)
205{ 215{
206 char name [MAX_NAMELEN]; 216 char name [MAX_NAMELEN];
207 struct proc_dir_entry *entry;
208 217
209 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 218 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
210 return; 219 return;
@@ -214,6 +223,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
214 223
215 /* create /proc/irq/1234 */ 224 /* create /proc/irq/1234 */
216 desc->dir = proc_mkdir(name, root_irq_dir); 225 desc->dir = proc_mkdir(name, root_irq_dir);
226 if (!desc->dir)
227 return;
217 228
218#ifdef CONFIG_SMP 229#ifdef CONFIG_SMP
219 /* create /proc/irq/<irq>/smp_affinity */ 230 /* create /proc/irq/<irq>/smp_affinity */
@@ -221,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
221 &irq_affinity_proc_fops, (void *)(long)irq); 232 &irq_affinity_proc_fops, (void *)(long)irq);
222#endif 233#endif
223 234
224 entry = create_proc_entry("spurious", 0444, desc->dir); 235 proc_create_data("spurious", 0444, desc->dir,
225 if (entry) { 236 &irq_spurious_proc_fops, (void *)(long)irq);
226 entry->data = (void *)(long)irq;
227 entry->read_proc = irq_spurious_read;
228 }
229} 237}
230 238
231#undef MAX_NAMELEN 239#undef MAX_NAMELEN
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760f..89fb90ae534 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
28 struct irqaction *action; 28 struct irqaction *action;
29 int ok = 0, work = 0; 29 int ok = 0, work = 0;
30 30
31 spin_lock(&desc->lock); 31 raw_spin_lock(&desc->lock);
32 /* Already running on another processor */ 32 /* Already running on another processor */
33 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
34 /* 34 /*
@@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc)
37 */ 37 */
38 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
39 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
40 spin_unlock(&desc->lock); 40 raw_spin_unlock(&desc->lock);
41 return ok; 41 return ok;
42 } 42 }
43 /* Honour the normal IRQ locking */ 43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS; 44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action; 45 action = desc->action;
46 spin_unlock(&desc->lock); 46 raw_spin_unlock(&desc->lock);
47 47
48 while (action) { 48 while (action) {
49 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
@@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
56 } 56 }
57 local_irq_disable(); 57 local_irq_disable();
58 /* Now clean up the flags */ 58 /* Now clean up the flags */
59 spin_lock(&desc->lock); 59 raw_spin_lock(&desc->lock);
60 action = desc->action; 60 action = desc->action;
61 61
62 /* 62 /*
@@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc)
68 * Perform real IRQ processing for the IRQ we deferred 68 * Perform real IRQ processing for the IRQ we deferred
69 */ 69 */
70 work = 1; 70 work = 1;
71 spin_unlock(&desc->lock); 71 raw_spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action); 72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock); 73 raw_spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING; 74 desc->status &= ~IRQ_PENDING;
75 } 75 }
76 desc->status &= ~IRQ_INPROGRESS; 76 desc->status &= ~IRQ_INPROGRESS;
@@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
80 */ 80 */
81 if (work && desc->chip && desc->chip->end) 81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq); 82 desc->chip->end(irq);
83 spin_unlock(&desc->lock); 83 raw_spin_unlock(&desc->lock);
84 84
85 return ok; 85 return ok;
86} 86}
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_all_shared_irqs(void) 107static void poll_spurious_irqs(unsigned long dummy)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -121,25 +121,15 @@ static void poll_all_shared_irqs(void)
121 if (!(status & IRQ_SPURIOUS_DISABLED)) 121 if (!(status & IRQ_SPURIOUS_DISABLED))
122 continue; 122 continue;
123 123
124 local_irq_disable();
124 try_one_irq(i, desc); 125 try_one_irq(i, desc);
126 local_irq_enable();
125 } 127 }
126}
127
128static void poll_spurious_irqs(unsigned long dummy)
129{
130 poll_all_shared_irqs();
131 128
132 mod_timer(&poll_spurious_irq_timer, 129 mod_timer(&poll_spurious_irq_timer,
133 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 130 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
134} 131}
135 132
136#ifdef CONFIG_DEBUG_SHIRQ
137void debug_poll_all_shared_irqs(void)
138{
139 poll_all_shared_irqs();
140}
141#endif
142
143/* 133/*
144 * If 99,900 of the previous 100,000 interrupts have not been handled 134 * If 99,900 of the previous 100,000 interrupts have not been handled
145 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 135 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -230,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
230 /* 220 /*
231 * If we are seeing only the odd spurious IRQ caused by 221 * If we are seeing only the odd spurious IRQ caused by
232 * bus asynchronicity then don't eventually trigger an error, 222 * bus asynchronicity then don't eventually trigger an error,
233 * otherwise the couter becomes a doomsday timer for otherwise 223 * otherwise the counter becomes a doomsday timer for otherwise
234 * working systems 224 * working systems
235 */ 225 */
236 if (time_after(jiffies, desc->last_unhandled + HZ/10)) 226 if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede52..d802883153d 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
146{ 146{
147 cputime_t cval, nval, cinterval, ninterval; 147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval; 148 s64 ns_ninterval, ns_nval;
149 u32 error, incr_error;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id]; 150 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150 151
151 nval = timeval_to_cputime(&value->it_value); 152 nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
153 ninterval = timeval_to_cputime(&value->it_interval); 154 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval); 155 ns_ninterval = timeval_to_ns(&value->it_interval);
155 156
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); 157 error = cputime_sub_ns(nval, ns_nval);
157 it->error = cputime_sub_ns(nval, ns_nval); 158 incr_error = cputime_sub_ns(ninterval, ns_ninterval);
158 159
159 spin_lock_irq(&tsk->sighand->siglock); 160 spin_lock_irq(&tsk->sighand->siglock);
160 161
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
168 } 169 }
169 it->expires = nval; 170 it->expires = nval;
170 it->incr = ninterval; 171 it->incr = ninterval;
172 it->error = error;
173 it->incr_error = incr_error;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ? 174 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval); 175 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173 176
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c6..8e5288a8a35 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
181 } 181 }
182 return module_kallsyms_lookup_name(name); 182 return module_kallsyms_lookup_name(name);
183} 183}
184EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
184 185
185int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, 186int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
186 unsigned long), 187 unsigned long),
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f336e2107f9..ef077fb7315 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,7 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <linux/utsrelease.h> 24#include <generated/utsrelease.h>
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/numa.h> 26#include <linux/numa.h>
27#include <linux/suspend.h> 27#include <linux/suspend.h>
@@ -31,6 +31,8 @@
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
34 36
35#include <asm/page.h> 37#include <asm/page.h>
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
@@ -1073,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
1073 if (mutex_trylock(&kexec_mutex)) { 1075 if (mutex_trylock(&kexec_mutex)) {
1074 if (kexec_crash_image) { 1076 if (kexec_crash_image) {
1075 struct pt_regs fixed_regs; 1077 struct pt_regs fixed_regs;
1078
1079 kmsg_dump(KMSG_DUMP_KEXEC);
1080
1076 crash_setup_regs(&fixed_regs, regs); 1081 crash_setup_regs(&fixed_regs, regs);
1077 crash_save_vmcoreinfo(); 1082 crash_save_vmcoreinfo();
1078 machine_crash_shutdown(&fixed_regs); 1083 machine_crash_shutdown(&fixed_regs);
@@ -1082,6 +1087,64 @@ void crash_kexec(struct pt_regs *regs)
1082 } 1087 }
1083} 1088}
1084 1089
1090size_t crash_get_memory_size(void)
1091{
1092 size_t size;
1093 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex);
1096 return size;
1097}
1098
1099static void free_reserved_phys_range(unsigned long begin, unsigned long end)
1100{
1101 unsigned long addr;
1102
1103 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1104 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1105 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1106 free_page((unsigned long)__va(addr));
1107 totalram_pages++;
1108 }
1109}
1110
1111int crash_shrink_memory(unsigned long new_size)
1112{
1113 int ret = 0;
1114 unsigned long start, end;
1115
1116 mutex_lock(&kexec_mutex);
1117
1118 if (kexec_crash_image) {
1119 ret = -ENOENT;
1120 goto unlock;
1121 }
1122 start = crashk_res.start;
1123 end = crashk_res.end;
1124
1125 if (new_size >= end - start + 1) {
1126 ret = -EINVAL;
1127 if (new_size == end - start + 1)
1128 ret = 0;
1129 goto unlock;
1130 }
1131
1132 start = roundup(start, PAGE_SIZE);
1133 end = roundup(start + new_size, PAGE_SIZE);
1134
1135 free_reserved_phys_range(end, crashk_res.end);
1136
1137 if (start == end) {
1138 crashk_res.end = end;
1139 release_resource(&crashk_res);
1140 } else
1141 crashk_res.end = end - 1;
1142
1143unlock:
1144 mutex_unlock(&kexec_mutex);
1145 return ret;
1146}
1147
1085static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1148static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1086 size_t data_len) 1149 size_t data_len)
1087{ 1150{
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 3765ff3c1bb..35edbe22e9a 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * A simple kernel FIFO implementation. 2 * A generic kernel FIFO implementation.
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
4 * Copyright (C) 2004 Stelian Pop <stelian@popies.net> 5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
5 * 6 *
6 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -25,50 +26,48 @@
25#include <linux/err.h> 26#include <linux/err.h>
26#include <linux/kfifo.h> 27#include <linux/kfifo.h>
27#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/uaccess.h>
30
31static void _kfifo_init(struct kfifo *fifo, void *buffer,
32 unsigned int size)
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
28 39
29/** 40/**
30 * kfifo_init - allocates a new FIFO using a preallocated buffer 41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
31 * @buffer: the preallocated buffer to be used. 43 * @buffer: the preallocated buffer to be used.
32 * @size: the size of the internal buffer, this have to be a power of 2. 44 * @size: the size of the internal buffer, this has to be a power of 2.
33 * @gfp_mask: get_free_pages mask, passed to kmalloc()
34 * @lock: the lock to be used to protect the fifo buffer
35 * 45 *
36 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
37 * &struct kfifo with kfree().
38 */ 46 */
39struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
40 gfp_t gfp_mask, spinlock_t *lock)
41{ 48{
42 struct kfifo *fifo;
43
44 /* size must be a power of 2 */ 49 /* size must be a power of 2 */
45 BUG_ON(!is_power_of_2(size)); 50 BUG_ON(!is_power_of_2(size));
46 51
47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 52 _kfifo_init(fifo, buffer, size);
48 if (!fifo)
49 return ERR_PTR(-ENOMEM);
50
51 fifo->buffer = buffer;
52 fifo->size = size;
53 fifo->in = fifo->out = 0;
54 fifo->lock = lock;
55
56 return fifo;
57} 53}
58EXPORT_SYMBOL(kfifo_init); 54EXPORT_SYMBOL(kfifo_init);
59 55
60/** 56/**
61 * kfifo_alloc - allocates a new FIFO and its internal buffer 57 * kfifo_alloc - allocates a new FIFO internal buffer
62 * @size: the size of the internal buffer to be allocated. 58 * @fifo: the fifo to assign the new buffer
59 * @size: the size of the buffer to be allocated, this has to be a power of 2.
63 * @gfp_mask: get_free_pages mask, passed to kmalloc() 60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
64 * @lock: the lock to be used to protect the fifo buffer 61 *
62 * This function dynamically allocates a new fifo internal buffer
65 * 63 *
66 * The size will be rounded-up to a power of 2. 64 * The size will be rounded-up to a power of 2.
65 * The buffer will be released with kfifo_free().
66 * Return 0 if no error, otherwise an error code
67 */ 67 */
68struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) 68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 69{
70 unsigned char *buffer; 70 unsigned char *buffer;
71 struct kfifo *ret;
72 71
73 /* 72 /*
74 * round up to the next power of 2, since our 'let the indices 73 * round up to the next power of 2, since our 'let the indices
@@ -80,48 +79,93 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
80 } 79 }
81 80
82 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
83 if (!buffer) 82 if (!buffer) {
84 return ERR_PTR(-ENOMEM); 83 _kfifo_init(fifo, NULL, 0);
85 84 return -ENOMEM;
86 ret = kfifo_init(buffer, size, gfp_mask, lock); 85 }
87 86
88 if (IS_ERR(ret)) 87 _kfifo_init(fifo, buffer, size);
89 kfree(buffer);
90 88
91 return ret; 89 return 0;
92} 90}
93EXPORT_SYMBOL(kfifo_alloc); 91EXPORT_SYMBOL(kfifo_alloc);
94 92
95/** 93/**
96 * kfifo_free - frees the FIFO 94 * kfifo_free - frees the FIFO internal buffer
97 * @fifo: the fifo to be freed. 95 * @fifo: the fifo to be freed.
98 */ 96 */
99void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
100{ 98{
101 kfree(fifo->buffer); 99 kfree(fifo->buffer);
102 kfree(fifo); 100 _kfifo_init(fifo, NULL, 0);
103} 101}
104EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
105 103
106/** 104/**
107 * __kfifo_put - puts some data into the FIFO, no locking version 105 * kfifo_skip - skip output data
108 * @fifo: the fifo to be used. 106 * @fifo: the fifo to be used.
109 * @buffer: the data to be added. 107 * @len: number of bytes to skip
110 * @len: the length of the data to be added.
111 *
112 * This function copies at most @len bytes from the @buffer into
113 * the FIFO depending on the free space, and returns the number of
114 * bytes copied.
115 *
116 * Note that with only one concurrent reader and one concurrent
117 * writer, you don't need extra locking to use these functions.
118 */ 108 */
119unsigned int __kfifo_put(struct kfifo *fifo, 109void kfifo_skip(struct kfifo *fifo, unsigned int len)
120 const unsigned char *buffer, unsigned int len) 110{
111 if (len < kfifo_len(fifo)) {
112 __kfifo_add_out(fifo, len);
113 return;
114 }
115 kfifo_reset_out(fifo);
116}
117EXPORT_SYMBOL(kfifo_skip);
118
119static inline void __kfifo_in_data(struct kfifo *fifo,
120 const void *from, unsigned int len, unsigned int off)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
124 len = min(len, fifo->size - fifo->in + fifo->out); 124 /*
125 * Ensure that we sample the fifo->out index -before- we
126 * start putting bytes into the kfifo.
127 */
128
129 smp_mb();
130
131 off = __kfifo_off(fifo, fifo->in + off);
132
133 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + off, from, l);
136
137 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, from + l, len - l);
139}
140
141static inline void __kfifo_out_data(struct kfifo *fifo,
142 void *to, unsigned int len, unsigned int off)
143{
144 unsigned int l;
145
146 /*
147 * Ensure that we sample the fifo->in index -before- we
148 * start removing bytes from the kfifo.
149 */
150
151 smp_rmb();
152
153 off = __kfifo_off(fifo, fifo->out + off);
154
155 /* first get the data from fifo->out until the end of the buffer */
156 l = min(len, fifo->size - off);
157 memcpy(to, fifo->buffer + off, l);
158
159 /* then get the rest (if any) from the beginning of the buffer */
160 memcpy(to + l, fifo->buffer, len - l);
161}
162
163static inline int __kfifo_from_user_data(struct kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout)
166{
167 unsigned int l;
168 int ret;
125 169
126 /* 170 /*
127 * Ensure that we sample the fifo->out index -before- we 171 * Ensure that we sample the fifo->out index -before- we
@@ -130,68 +174,272 @@ unsigned int __kfifo_put(struct kfifo *fifo,
130 174
131 smp_mb(); 175 smp_mb();
132 176
177 off = __kfifo_off(fifo, fifo->in + off);
178
133 /* first put the data starting from fifo->in to buffer end */ 179 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); 180 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); 181 ret = copy_from_user(fifo->buffer + off, from, l);
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
136 187
137 /* then put the rest (if any) at the beginning of the buffer */ 188 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, buffer + l, len - l); 189 ret = copy_from_user(fifo->buffer, from + l, len - l);
190 *lenout += ret ? ret : len - l;
191 return ret ? -EFAULT : 0;
192}
193
194static inline int __kfifo_to_user_data(struct kfifo *fifo,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
196{
197 unsigned int l;
198 int ret;
139 199
140 /* 200 /*
141 * Ensure that we add the bytes to the kfifo -before- 201 * Ensure that we sample the fifo->in index -before- we
142 * we update the fifo->in index. 202 * start removing bytes from the kfifo.
143 */ 203 */
144 204
145 smp_wmb(); 205 smp_rmb();
206
207 off = __kfifo_off(fifo, fifo->out + off);
208
209 /* first get the data from fifo->out until the end of the buffer */
210 l = min(len, fifo->size - off);
211 ret = copy_to_user(to, fifo->buffer + off, l);
212 *lenout = l;
213 if (unlikely(ret)) {
214 *lenout -= ret;
215 return -EFAULT;
216 }
217
218 /* then get the rest (if any) from the beginning of the buffer */
219 len -= l;
220 ret = copy_to_user(to + l, fifo->buffer, len);
221 if (unlikely(ret)) {
222 *lenout += len - ret;
223 return -EFAULT;
224 }
225 *lenout += len;
226 return 0;
227}
228
229unsigned int __kfifo_in_n(struct kfifo *fifo,
230 const void *from, unsigned int len, unsigned int recsize)
231{
232 if (kfifo_avail(fifo) < len + recsize)
233 return len + 1;
234
235 __kfifo_in_data(fifo, from, len, recsize);
236 return 0;
237}
238EXPORT_SYMBOL(__kfifo_in_n);
146 239
147 fifo->in += len; 240/**
241 * kfifo_in - puts some data into the FIFO
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{
256 len = min(kfifo_avail(fifo), len);
148 257
258 __kfifo_in_data(fifo, from, len, 0);
259 __kfifo_add_in(fifo, len);
149 return len; 260 return len;
150} 261}
151EXPORT_SYMBOL(__kfifo_put); 262EXPORT_SYMBOL(kfifo_in);
263
264unsigned int __kfifo_in_generic(struct kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize)
266{
267 return __kfifo_in_rec(fifo, from, len, recsize);
268}
269EXPORT_SYMBOL(__kfifo_in_generic);
270
271unsigned int __kfifo_out_n(struct kfifo *fifo,
272 void *to, unsigned int len, unsigned int recsize)
273{
274 if (kfifo_len(fifo) < len + recsize)
275 return len;
276
277 __kfifo_out_data(fifo, to, len, recsize);
278 __kfifo_add_out(fifo, len + recsize);
279 return 0;
280}
281EXPORT_SYMBOL(__kfifo_out_n);
152 282
153/** 283/**
154 * __kfifo_get - gets some data from the FIFO, no locking version 284 * kfifo_out - gets some data from the FIFO
155 * @fifo: the fifo to be used. 285 * @fifo: the fifo to be used.
156 * @buffer: where the data must be copied. 286 * @to: where the data must be copied.
157 * @len: the size of the destination buffer. 287 * @len: the size of the destination buffer.
158 * 288 *
159 * This function copies at most @len bytes from the FIFO into the 289 * This function copies at most @len bytes from the FIFO into the
160 * @buffer and returns the number of copied bytes. 290 * @to buffer and returns the number of copied bytes.
161 * 291 *
162 * Note that with only one concurrent reader and one concurrent 292 * Note that with only one concurrent reader and one concurrent
163 * writer, you don't need extra locking to use these functions. 293 * writer, you don't need extra locking to use these functions.
164 */ 294 */
165unsigned int __kfifo_get(struct kfifo *fifo, 295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
166 unsigned char *buffer, unsigned int len)
167{ 296{
168 unsigned int l; 297 len = min(kfifo_len(fifo), len);
169 298
170 len = min(len, fifo->in - fifo->out); 299 __kfifo_out_data(fifo, to, len, 0);
300 __kfifo_add_out(fifo, len);
171 301
172 /* 302 return len;
173 * Ensure that we sample the fifo->in index -before- we 303}
174 * start removing bytes from the kfifo. 304EXPORT_SYMBOL(kfifo_out);
175 */
176 305
177 smp_rmb(); 306/**
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it
308 * @fifo: the fifo to be used.
309 * @to: where the data must be copied.
310 * @len: the size of the destination buffer.
311 * @offset: offset into the fifo
312 *
313 * This function copies at most @len bytes at @offset from the FIFO
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
318 unsigned offset)
319{
320 len = min(kfifo_len(fifo), len + offset);
178 321
179 /* first get the data from fifo->out until the end of the buffer */ 322 __kfifo_out_data(fifo, to, len, offset);
180 l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); 323 return len;
181 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); 324}
325EXPORT_SYMBOL(kfifo_out_peek);
182 326
183 /* then get the rest (if any) from the beginning of the buffer */ 327unsigned int __kfifo_out_generic(struct kfifo *fifo,
184 memcpy(buffer + l, fifo->buffer, len - l); 328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{
331 return __kfifo_out_rec(fifo, to, len, recsize, total);
332}
333EXPORT_SYMBOL(__kfifo_out_generic);
185 334
186 /* 335unsigned int __kfifo_from_user_n(struct kfifo *fifo,
187 * Ensure that we remove the bytes from the kfifo -before- 336 const void __user *from, unsigned int len, unsigned int recsize)
188 * we update the fifo->out index. 337{
189 */ 338 unsigned total;
190 339
191 smp_mb(); 340 if (kfifo_avail(fifo) < len + recsize)
341 return len + 1;
192 342
193 fifo->out += len; 343 __kfifo_from_user_data(fifo, from, len, recsize, &total);
344 return total;
345}
346EXPORT_SYMBOL(__kfifo_from_user_n);
194 347
195 return len; 348/**
349 * kfifo_from_user - puts some data from user space into the FIFO
350 * @fifo: the fifo to be used.
351 * @from: pointer to the data to be added.
352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
354 *
355 * This function copies at most @len bytes from the @from into the
356 * FIFO depending on the free space, and returns -EFAULT/0.
357 *
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
196} 371}
197EXPORT_SYMBOL(__kfifo_get); 372EXPORT_SYMBOL(kfifo_from_user);
373
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
375 const void __user *from, unsigned int len, unsigned int recsize)
376{
377 return __kfifo_from_user_rec(fifo, from, len, recsize);
378}
379EXPORT_SYMBOL(__kfifo_from_user_generic);
380
381unsigned int __kfifo_to_user_n(struct kfifo *fifo,
382 void __user *to, unsigned int len, unsigned int reclen,
383 unsigned int recsize)
384{
385 unsigned int ret, total;
386
387 if (kfifo_len(fifo) < reclen + recsize)
388 return len;
389
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total);
391
392 if (likely(ret == 0))
393 __kfifo_add_out(fifo, reclen + recsize);
394
395 return total;
396}
397EXPORT_SYMBOL(__kfifo_to_user_n);
398
399/**
400 * kfifo_to_user - gets data from the FIFO and write it to user space
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and returns 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{
415 int ret;
416 len = min(kfifo_len(fifo), len);
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout);
418 __kfifo_add_out(fifo, *lenout);
419 return ret;
420}
421EXPORT_SYMBOL(kfifo_to_user);
422
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
424 void __user *to, unsigned int len, unsigned int recsize,
425 unsigned int *total)
426{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total);
428}
429EXPORT_SYMBOL(__kfifo_to_user_generic);
430
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
432{
433 if (recsize == 0)
434 return kfifo_avail(fifo);
435
436 return __kfifo_peek_n(fifo, recsize);
437}
438EXPORT_SYMBOL(__kfifo_peek_generic);
439
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
441{
442 __kfifo_skip_rec(fifo, recsize);
443}
444EXPORT_SYMBOL(__kfifo_skip_generic);
445
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9..761fdd2b303 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread; 129struct task_struct *kgdb_contthread;
130 130
131int kgdb_single_step; 131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
132 133
133/* Our I/O buffers. */ 134/* Our I/O buffers. */
134static char remcom_in_buffer[BUFMAX]; 135static char remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
541 */ 542 */
542 if (tid == 0 || tid == -1) 543 if (tid == 0 || tid == -1)
543 tid = -atomic_read(&kgdb_active) - 2; 544 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) { 545 if (tid < -1 && tid > -NR_CPUS - 2) {
545 if (kgdb_info[-tid - 2].task) 546 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task; 547 return kgdb_info[-tid - 2].task;
547 else 548 else
548 return idle_task(-tid - 2); 549 return idle_task(-tid - 2);
549 } 550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
550 556
551 /* 557 /*
552 * find_task_by_pid_ns() does not take the tasklist lock anymore 558 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -577,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs)
577 smp_wmb(); 583 smp_wmb();
578 atomic_set(&cpu_in_kgdb[cpu], 1); 584 atomic_set(&cpu_in_kgdb[cpu], 1);
579 585
586 /* Disable any cpu specific hw breakpoints */
587 kgdb_disable_hw_debug(regs);
588
580 /* Wait till primary CPU is done with debugging */ 589 /* Wait till primary CPU is done with debugging */
581 while (atomic_read(&passive_cpu_wait[cpu])) 590 while (atomic_read(&passive_cpu_wait[cpu]))
582 cpu_relax(); 591 cpu_relax();
@@ -590,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
590 599
591 /* Signal the primary CPU that we are done: */ 600 /* Signal the primary CPU that we are done: */
592 atomic_set(&cpu_in_kgdb[cpu], 0); 601 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog_sync();
594 clocksource_touch_watchdog(); 603 clocksource_touch_watchdog();
595 local_irq_restore(flags); 604 local_irq_restore(flags);
596} 605}
@@ -619,7 +628,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
619static int kgdb_activate_sw_breakpoints(void) 628static int kgdb_activate_sw_breakpoints(void)
620{ 629{
621 unsigned long addr; 630 unsigned long addr;
622 int error = 0; 631 int error;
632 int ret = 0;
623 int i; 633 int i;
624 634
625 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 635 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +639,16 @@ static int kgdb_activate_sw_breakpoints(void)
629 addr = kgdb_break[i].bpt_addr; 639 addr = kgdb_break[i].bpt_addr;
630 error = kgdb_arch_set_breakpoint(addr, 640 error = kgdb_arch_set_breakpoint(addr,
631 kgdb_break[i].saved_instr); 641 kgdb_break[i].saved_instr);
632 if (error) 642 if (error) {
633 return error; 643 ret = error;
644 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
645 continue;
646 }
634 647
635 kgdb_flush_swbreak_addr(addr); 648 kgdb_flush_swbreak_addr(addr);
636 kgdb_break[i].state = BP_ACTIVE; 649 kgdb_break[i].state = BP_ACTIVE;
637 } 650 }
638 return 0; 651 return ret;
639} 652}
640 653
641static int kgdb_set_sw_break(unsigned long addr) 654static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +695,8 @@ static int kgdb_set_sw_break(unsigned long addr)
682static int kgdb_deactivate_sw_breakpoints(void) 695static int kgdb_deactivate_sw_breakpoints(void)
683{ 696{
684 unsigned long addr; 697 unsigned long addr;
685 int error = 0; 698 int error;
699 int ret = 0;
686 int i; 700 int i;
687 701
688 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 702 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +705,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
691 addr = kgdb_break[i].bpt_addr; 705 addr = kgdb_break[i].bpt_addr;
692 error = kgdb_arch_remove_breakpoint(addr, 706 error = kgdb_arch_remove_breakpoint(addr,
693 kgdb_break[i].saved_instr); 707 kgdb_break[i].saved_instr);
694 if (error) 708 if (error) {
695 return error; 709 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
710 ret = error;
711 }
696 712
697 kgdb_flush_swbreak_addr(addr); 713 kgdb_flush_swbreak_addr(addr);
698 kgdb_break[i].state = BP_SET; 714 kgdb_break[i].state = BP_SET;
699 } 715 }
700 return 0; 716 return ret;
701} 717}
702 718
703static int kgdb_remove_sw_break(unsigned long addr) 719static int kgdb_remove_sw_break(unsigned long addr)
@@ -870,7 +886,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
870 886
871 /* 887 /*
872 * All threads that don't have debuggerinfo should be 888 * All threads that don't have debuggerinfo should be
873 * in __schedule() sleeping, since all other CPUs 889 * in schedule() sleeping, since all other CPUs
874 * are in kgdb_wait, and thus have debuggerinfo. 890 * are in kgdb_wait, and thus have debuggerinfo.
875 */ 891 */
876 if (local_debuggerinfo) { 892 if (local_debuggerinfo) {
@@ -1204,8 +1220,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1204 return 1; 1220 return 1;
1205 1221
1206 } else { 1222 } else {
1207 error_packet(remcom_out_buffer, -EINVAL); 1223 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1208 return 0; 1224 " and 15 (pass and disconnect)\n"
1225 "Executing a continue without signal passing\n", 0);
1226 remcom_in_buffer[0] = 'c';
1209 } 1227 }
1210 1228
1211 /* Indicate fall through */ 1229 /* Indicate fall through */
@@ -1395,6 +1413,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1395 struct kgdb_state kgdb_var; 1413 struct kgdb_state kgdb_var;
1396 struct kgdb_state *ks = &kgdb_var; 1414 struct kgdb_state *ks = &kgdb_var;
1397 unsigned long flags; 1415 unsigned long flags;
1416 int sstep_tries = 100;
1398 int error = 0; 1417 int error = 0;
1399 int i, cpu; 1418 int i, cpu;
1400 1419
@@ -1425,15 +1444,16 @@ acquirelock:
1425 cpu_relax(); 1444 cpu_relax();
1426 1445
1427 /* 1446 /*
1428 * Do not start the debugger connection on this CPU if the last 1447 * For single stepping, try to only enter on the processor
1429 * instance of the exception handler wanted to come into the 1448 * that was single stepping. To guard against a deadlock, the
1430 * debugger on a different CPU via a single step 1449 * kernel will only try for the value of sstep_tries before
1450 * giving up and continuing on.
1431 */ 1451 */
1432 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && 1452 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1453 (kgdb_info[cpu].task &&
1434 1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1435 atomic_set(&kgdb_active, -1); 1455 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog(); 1456 touch_softlockup_watchdog_sync();
1437 clocksource_touch_watchdog(); 1457 clocksource_touch_watchdog();
1438 local_irq_restore(flags); 1458 local_irq_restore(flags);
1439 1459
@@ -1524,9 +1544,16 @@ acquirelock:
1524 } 1544 }
1525 1545
1526kgdb_restore: 1546kgdb_restore:
1547 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1548 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1549 if (kgdb_info[sstep_cpu].task)
1550 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1551 else
1552 kgdb_sstep_pid = 0;
1553 }
1527 /* Free kgdb_active */ 1554 /* Free kgdb_active */
1528 atomic_set(&kgdb_active, -1); 1555 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog(); 1556 touch_softlockup_watchdog_sync();
1530 clocksource_touch_watchdog(); 1557 clocksource_touch_watchdog();
1531 local_irq_restore(flags); 1558 local_irq_restore(flags);
1532 1559
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9fcb53a11f8..bf0e231d970 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
81 static int kmod_loop_msg; 81 static int kmod_loop_msg;
82 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
87 va_start(args, fmt); 83 va_start(args, fmt);
88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 84 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
89 va_end(args); 85 va_end(args);
90 if (ret >= MODULE_NAME_LEN) 86 if (ret >= MODULE_NAME_LEN)
91 return -ENAMETOOLONG; 87 return -ENAMETOOLONG;
92 88
89 ret = security_kernel_module_request(module_name);
90 if (ret)
91 return ret;
92
93 /* If modprobe needs a service that is in a module, we get a recursive 93 /* If modprobe needs a service that is in a module, we get a recursive
94 * loop. Limit the number of running kmod threads to max_threads/2 or 94 * loop. Limit the number of running kmod threads to max_threads/2 or
95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
@@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
520 return -ENOMEM; 520 return -ENOMEM;
521 521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp); 522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) 523 if (ret < 0) {
524 goto out; 524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
525 527
526 return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
527 531
528 out:
529 call_usermodehelper_freeinfo(sub_info);
530 return ret; 532 return ret;
531} 533}
532EXPORT_SYMBOL(call_usermodehelper_pipe); 534EXPORT_SYMBOL(call_usermodehelper_pipe);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5240d75f4c6..b7df302a020 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
90 */ 90 */
91static struct kprobe_blackpoint kprobe_blacklist[] = { 91static struct kprobe_blackpoint kprobe_blacklist[] = {
92 {"preempt_schedule",}, 92 {"preempt_schedule",},
93 {"native_get_debugreg",},
94 {"irq_entries_start",},
95 {"common_interrupt",},
93 {NULL} /* Terminator */ 96 {NULL} /* Terminator */
94}; 97};
95 98
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
673 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 676 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
674} 677}
675 678
679/* Check passed kprobe is valid and return kprobe in kprobe_table. */
680static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
681{
682 struct kprobe *old_p, *list_p;
683
684 old_p = get_kprobe(p->addr);
685 if (unlikely(!old_p))
686 return NULL;
687
688 if (p != old_p) {
689 list_for_each_entry_rcu(list_p, &old_p->list, list)
690 if (list_p == p)
691 /* kprobe p is a valid probe */
692 goto valid;
693 return NULL;
694 }
695valid:
696 return old_p;
697}
698
699/* Return error if the kprobe is being re-registered */
700static inline int check_kprobe_rereg(struct kprobe *p)
701{
702 int ret = 0;
703 struct kprobe *old_p;
704
705 mutex_lock(&kprobe_mutex);
706 old_p = __get_valid_kprobe(p);
707 if (old_p)
708 ret = -EINVAL;
709 mutex_unlock(&kprobe_mutex);
710 return ret;
711}
712
676int __kprobes register_kprobe(struct kprobe *p) 713int __kprobes register_kprobe(struct kprobe *p)
677{ 714{
678 int ret = 0; 715 int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
685 return -EINVAL; 722 return -EINVAL;
686 p->addr = addr; 723 p->addr = addr;
687 724
725 ret = check_kprobe_rereg(p);
726 if (ret)
727 return ret;
728
688 preempt_disable(); 729 preempt_disable();
689 if (!kernel_text_address((unsigned long) p->addr) || 730 if (!kernel_text_address((unsigned long) p->addr) ||
690 in_kprobes_functions((unsigned long) p->addr)) { 731 in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
754} 795}
755EXPORT_SYMBOL_GPL(register_kprobe); 796EXPORT_SYMBOL_GPL(register_kprobe);
756 797
757/* Check passed kprobe is valid and return kprobe in kprobe_table. */
758static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
759{
760 struct kprobe *old_p, *list_p;
761
762 old_p = get_kprobe(p->addr);
763 if (unlikely(!old_p))
764 return NULL;
765
766 if (p != old_p) {
767 list_for_each_entry_rcu(list_p, &old_p->list, list)
768 if (list_p == p)
769 /* kprobe p is a valid probe */
770 goto valid;
771 return NULL;
772 }
773valid:
774 return old_p;
775}
776
777/* 798/*
778 * Unregister a kprobe without a scheduler synchronization. 799 * Unregister a kprobe without a scheduler synchronization.
779 */ 800 */
@@ -1014,9 +1035,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1014 /* Pre-allocate memory for max kretprobe instances */ 1035 /* Pre-allocate memory for max kretprobe instances */
1015 if (rp->maxactive <= 0) { 1036 if (rp->maxactive <= 0) {
1016#ifdef CONFIG_PREEMPT 1037#ifdef CONFIG_PREEMPT
1017 rp->maxactive = max(10, 2 * NR_CPUS); 1038 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
1018#else 1039#else
1019 rp->maxactive = NR_CPUS; 1040 rp->maxactive = num_possible_cpus();
1020#endif 1041#endif
1021 } 1042 }
1022 spin_lock_init(&rp->lock); 1043 spin_lock_init(&rp->lock);
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1141 arch_remove_kprobe(p); 1162 arch_remove_kprobe(p);
1142} 1163}
1143 1164
1165void __kprobes dump_kprobe(struct kprobe *kp)
1166{
1167 printk(KERN_WARNING "Dumping kprobe:\n");
1168 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
1169 kp->symbol_name, kp->addr, kp->offset);
1170}
1171
1144/* Module notifier call back, checking kprobes on the module */ 1172/* Module notifier call back, checking kprobes on the module */
1145static int __kprobes kprobes_module_callback(struct notifier_block *nb, 1173static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1146 unsigned long val, void *data) 1174 unsigned long val, void *data)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 528dd78e7e7..3feaf5a7451 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
100} 100}
101KERNEL_ATTR_RO(kexec_crash_loaded); 101KERNEL_ATTR_RO(kexec_crash_loaded);
102 102
103static ssize_t kexec_crash_size_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf)
105{
106 return sprintf(buf, "%zu\n", crash_get_memory_size());
107}
108static ssize_t kexec_crash_size_store(struct kobject *kobj,
109 struct kobj_attribute *attr,
110 const char *buf, size_t count)
111{
112 unsigned long cnt;
113 int ret;
114
115 if (strict_strtoul(buf, 0, &cnt))
116 return -EINVAL;
117
118 ret = crash_shrink_memory(cnt);
119 return ret < 0 ? ret : count;
120}
121KERNEL_ATTR_RW(kexec_crash_size);
122
103static ssize_t vmcoreinfo_show(struct kobject *kobj, 123static ssize_t vmcoreinfo_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf) 124 struct kobj_attribute *attr, char *buf)
105{ 125{
@@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = {
147#ifdef CONFIG_KEXEC 167#ifdef CONFIG_KEXEC
148 &kexec_loaded_attr.attr, 168 &kexec_loaded_attr.attr,
149 &kexec_crash_loaded_attr.attr, 169 &kexec_crash_loaded_attr.attr,
170 &kexec_crash_size_attr.attr,
150 &vmcoreinfo_attr.attr, 171 &vmcoreinfo_attr.attr,
151#endif 172#endif
152 NULL 173 NULL
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5fe709982ca..fbb6222fe7e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -151,24 +151,24 @@ EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu. 153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @k: thread created by kthread_create(). 154 * @p: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @k to run on. 155 * @cpu: cpu (might not be online, must be possible) for @p to run on.
156 * 156 *
157 * Description: This function is equivalent to set_cpus_allowed(), 157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be 158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()). 159 * stopped (i.e., just returned from kthread_create()).
160 */ 160 */
161void kthread_bind(struct task_struct *k, unsigned int cpu) 161void kthread_bind(struct task_struct *p, unsigned int cpu)
162{ 162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */ 163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { 164 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1); 165 WARN_ON(1);
166 return; 166 return;
167 } 167 }
168 set_task_cpu(k, cpu); 168
169 k->cpus_allowed = cpumask_of_cpu(cpu); 169 p->cpus_allowed = cpumask_of_cpu(cpu);
170 k->rt.nr_cpus_allowed = 1; 170 p->rt.nr_cpus_allowed = 1;
171 k->flags |= PF_THREAD_BOUND; 171 p->flags |= PF_THREAD_BOUND;
172} 172}
173EXPORT_SYMBOL(kthread_bind); 173EXPORT_SYMBOL(kthread_bind);
174 174
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3815ac1d58b..c62ec14609b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
49#include "lockdep_internals.h" 49#include "lockdep_internals.h"
50 50
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/lockdep.h> 52#include <trace/events/lock.h>
53 53
54#ifdef CONFIG_PROVE_LOCKING 54#ifdef CONFIG_PROVE_LOCKING
55int prove_locking = 1; 55int prove_locking = 1;
@@ -73,11 +73,11 @@ module_param(lock_stat, int, 0644);
73 * to use a raw spinlock - we really dont want the spinlock 73 * to use a raw spinlock - we really dont want the spinlock
74 * code to recurse back into the lockdep code... 74 * code to recurse back into the lockdep code...
75 */ 75 */
76static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 76static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
77 77
78static int graph_lock(void) 78static int graph_lock(void)
79{ 79{
80 __raw_spin_lock(&lockdep_lock); 80 arch_spin_lock(&lockdep_lock);
81 /* 81 /*
82 * Make sure that if another CPU detected a bug while 82 * Make sure that if another CPU detected a bug while
83 * walking the graph we dont change it (while the other 83 * walking the graph we dont change it (while the other
@@ -85,7 +85,7 @@ static int graph_lock(void)
85 * dropped already) 85 * dropped already)
86 */ 86 */
87 if (!debug_locks) { 87 if (!debug_locks) {
88 __raw_spin_unlock(&lockdep_lock); 88 arch_spin_unlock(&lockdep_lock);
89 return 0; 89 return 0;
90 } 90 }
91 /* prevent any recursions within lockdep from causing deadlocks */ 91 /* prevent any recursions within lockdep from causing deadlocks */
@@ -95,11 +95,11 @@ static int graph_lock(void)
95 95
96static inline int graph_unlock(void) 96static inline int graph_unlock(void)
97{ 97{
98 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 98 if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
99 return DEBUG_LOCKS_WARN_ON(1); 99 return DEBUG_LOCKS_WARN_ON(1);
100 100
101 current->lockdep_recursion--; 101 current->lockdep_recursion--;
102 __raw_spin_unlock(&lockdep_lock); 102 arch_spin_unlock(&lockdep_lock);
103 return 0; 103 return 0;
104} 104}
105 105
@@ -111,7 +111,7 @@ static inline int debug_locks_off_graph_unlock(void)
111{ 111{
112 int ret = debug_locks_off(); 112 int ret = debug_locks_off();
113 113
114 __raw_spin_unlock(&lockdep_lock); 114 arch_spin_unlock(&lockdep_lock);
115 115
116 return ret; 116 return ret;
117} 117}
@@ -140,7 +140,13 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
140} 140}
141 141
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
144 cpu_lock_stats);
145
146static inline u64 lockstat_clock(void)
147{
148 return cpu_clock(smp_processor_id());
149}
144 150
145static int lock_point(unsigned long points[], unsigned long ip) 151static int lock_point(unsigned long points[], unsigned long ip)
146{ 152{
@@ -158,12 +164,12 @@ static int lock_point(unsigned long points[], unsigned long ip)
158 return i; 164 return i;
159} 165}
160 166
161static void lock_time_inc(struct lock_time *lt, s64 time) 167static void lock_time_inc(struct lock_time *lt, u64 time)
162{ 168{
163 if (time > lt->max) 169 if (time > lt->max)
164 lt->max = time; 170 lt->max = time;
165 171
166 if (time < lt->min || !lt->min) 172 if (time < lt->min || !lt->nr)
167 lt->min = time; 173 lt->min = time;
168 174
169 lt->total += time; 175 lt->total += time;
@@ -172,8 +178,15 @@ static void lock_time_inc(struct lock_time *lt, s64 time)
172 178
173static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) 179static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
174{ 180{
175 dst->min += src->min; 181 if (!src->nr)
176 dst->max += src->max; 182 return;
183
184 if (src->max > dst->max)
185 dst->max = src->max;
186
187 if (src->min < dst->min || !dst->nr)
188 dst->min = src->min;
189
177 dst->total += src->total; 190 dst->total += src->total;
178 dst->nr += src->nr; 191 dst->nr += src->nr;
179} 192}
@@ -186,7 +199,7 @@ struct lock_class_stats lock_stats(struct lock_class *class)
186 memset(&stats, 0, sizeof(struct lock_class_stats)); 199 memset(&stats, 0, sizeof(struct lock_class_stats));
187 for_each_possible_cpu(cpu) { 200 for_each_possible_cpu(cpu) {
188 struct lock_class_stats *pcs = 201 struct lock_class_stats *pcs =
189 &per_cpu(lock_stats, cpu)[class - lock_classes]; 202 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
190 203
191 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 204 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
192 stats.contention_point[i] += pcs->contention_point[i]; 205 stats.contention_point[i] += pcs->contention_point[i];
@@ -213,7 +226,7 @@ void clear_lock_stats(struct lock_class *class)
213 226
214 for_each_possible_cpu(cpu) { 227 for_each_possible_cpu(cpu) {
215 struct lock_class_stats *cpu_stats = 228 struct lock_class_stats *cpu_stats =
216 &per_cpu(lock_stats, cpu)[class - lock_classes]; 229 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
217 230
218 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 231 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
219 } 232 }
@@ -223,23 +236,23 @@ void clear_lock_stats(struct lock_class *class)
223 236
224static struct lock_class_stats *get_lock_stats(struct lock_class *class) 237static struct lock_class_stats *get_lock_stats(struct lock_class *class)
225{ 238{
226 return &get_cpu_var(lock_stats)[class - lock_classes]; 239 return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
227} 240}
228 241
229static void put_lock_stats(struct lock_class_stats *stats) 242static void put_lock_stats(struct lock_class_stats *stats)
230{ 243{
231 put_cpu_var(lock_stats); 244 put_cpu_var(cpu_lock_stats);
232} 245}
233 246
234static void lock_release_holdtime(struct held_lock *hlock) 247static void lock_release_holdtime(struct held_lock *hlock)
235{ 248{
236 struct lock_class_stats *stats; 249 struct lock_class_stats *stats;
237 s64 holdtime; 250 u64 holdtime;
238 251
239 if (!lock_stat) 252 if (!lock_stat)
240 return; 253 return;
241 254
242 holdtime = sched_clock() - hlock->holdtime_stamp; 255 holdtime = lockstat_clock() - hlock->holdtime_stamp;
243 256
244 stats = get_lock_stats(hlock_class(hlock)); 257 stats = get_lock_stats(hlock_class(hlock));
245 if (hlock->read) 258 if (hlock->read)
@@ -374,7 +387,8 @@ static int save_trace(struct stack_trace *trace)
374 * complete trace that maxes out the entries provided will be reported 387 * complete trace that maxes out the entries provided will be reported
375 * as incomplete, friggin useless </rant> 388 * as incomplete, friggin useless </rant>
376 */ 389 */
377 if (trace->entries[trace->nr_entries-1] == ULONG_MAX) 390 if (trace->nr_entries != 0 &&
391 trace->entries[trace->nr_entries-1] == ULONG_MAX)
378 trace->nr_entries--; 392 trace->nr_entries--;
379 393
380 trace->max_entries = trace->nr_entries; 394 trace->max_entries = trace->nr_entries;
@@ -1156,9 +1170,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1156 this.class = class; 1170 this.class = class;
1157 1171
1158 local_irq_save(flags); 1172 local_irq_save(flags);
1159 __raw_spin_lock(&lockdep_lock); 1173 arch_spin_lock(&lockdep_lock);
1160 ret = __lockdep_count_forward_deps(&this); 1174 ret = __lockdep_count_forward_deps(&this);
1161 __raw_spin_unlock(&lockdep_lock); 1175 arch_spin_unlock(&lockdep_lock);
1162 local_irq_restore(flags); 1176 local_irq_restore(flags);
1163 1177
1164 return ret; 1178 return ret;
@@ -1183,9 +1197,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1183 this.class = class; 1197 this.class = class;
1184 1198
1185 local_irq_save(flags); 1199 local_irq_save(flags);
1186 __raw_spin_lock(&lockdep_lock); 1200 arch_spin_lock(&lockdep_lock);
1187 ret = __lockdep_count_backward_deps(&this); 1201 ret = __lockdep_count_backward_deps(&this);
1188 __raw_spin_unlock(&lockdep_lock); 1202 arch_spin_unlock(&lockdep_lock);
1189 local_irq_restore(flags); 1203 local_irq_restore(flags);
1190 1204
1191 return ret; 1205 return ret;
@@ -2133,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2133 return ret; 2147 return ret;
2134 2148
2135 return print_irq_inversion_bug(curr, &root, target_entry, 2149 return print_irq_inversion_bug(curr, &root, target_entry,
2136 this, 1, irqclass); 2150 this, 0, irqclass);
2137} 2151}
2138 2152
2139void print_irqtrace_events(struct task_struct *curr) 2153void print_irqtrace_events(struct task_struct *curr)
@@ -2792,7 +2806,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2792 hlock->references = references; 2806 hlock->references = references;
2793#ifdef CONFIG_LOCK_STAT 2807#ifdef CONFIG_LOCK_STAT
2794 hlock->waittime_stamp = 0; 2808 hlock->waittime_stamp = 0;
2795 hlock->holdtime_stamp = sched_clock(); 2809 hlock->holdtime_stamp = lockstat_clock();
2796#endif 2810#endif
2797 2811
2798 if (check == 2 && !mark_irqflags(curr, hlock)) 2812 if (check == 2 && !mark_irqflags(curr, hlock))
@@ -3322,7 +3336,7 @@ found_it:
3322 if (hlock->instance != lock) 3336 if (hlock->instance != lock)
3323 return; 3337 return;
3324 3338
3325 hlock->waittime_stamp = sched_clock(); 3339 hlock->waittime_stamp = lockstat_clock();
3326 3340
3327 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3341 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3328 contending_point = lock_point(hlock_class(hlock)->contending_point, 3342 contending_point = lock_point(hlock_class(hlock)->contending_point,
@@ -3345,8 +3359,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3345 struct held_lock *hlock, *prev_hlock; 3359 struct held_lock *hlock, *prev_hlock;
3346 struct lock_class_stats *stats; 3360 struct lock_class_stats *stats;
3347 unsigned int depth; 3361 unsigned int depth;
3348 u64 now; 3362 u64 now, waittime = 0;
3349 s64 waittime = 0;
3350 int i, cpu; 3363 int i, cpu;
3351 3364
3352 depth = curr->lockdep_depth; 3365 depth = curr->lockdep_depth;
@@ -3374,7 +3387,7 @@ found_it:
3374 3387
3375 cpu = smp_processor_id(); 3388 cpu = smp_processor_id();
3376 if (hlock->waittime_stamp) { 3389 if (hlock->waittime_stamp) {
3377 now = sched_clock(); 3390 now = lockstat_clock();
3378 waittime = now - hlock->waittime_stamp; 3391 waittime = now - hlock->waittime_stamp;
3379 hlock->holdtime_stamp = now; 3392 hlock->holdtime_stamp = now;
3380 } 3393 }
diff --git a/kernel/module.c b/kernel/module.c
index 8b7d8805819..f82386bd9ee 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module);
370 370
371#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
372 372
373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
374
375static void *percpu_modalloc(unsigned long size, unsigned long align, 373static void *percpu_modalloc(unsigned long size, unsigned long align,
376 const char *name) 374 const char *name)
377{ 375{
@@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme)
395 free_percpu(freeme); 393 free_percpu(freeme);
396} 394}
397 395
398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
399
400/* Number of blocks used and allocated. */
401static unsigned int pcpu_num_used, pcpu_num_allocated;
402/* Size of each block. -ve means used. */
403static int *pcpu_size;
404
405static int split_block(unsigned int i, unsigned short size)
406{
407 /* Reallocation required? */
408 if (pcpu_num_used + 1 > pcpu_num_allocated) {
409 int *new;
410
411 new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
412 GFP_KERNEL);
413 if (!new)
414 return 0;
415
416 pcpu_num_allocated *= 2;
417 pcpu_size = new;
418 }
419
420 /* Insert a new subblock */
421 memmove(&pcpu_size[i+1], &pcpu_size[i],
422 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
423 pcpu_num_used++;
424
425 pcpu_size[i+1] -= size;
426 pcpu_size[i] = size;
427 return 1;
428}
429
430static inline unsigned int block_size(int val)
431{
432 if (val < 0)
433 return -val;
434 return val;
435}
436
437static void *percpu_modalloc(unsigned long size, unsigned long align,
438 const char *name)
439{
440 unsigned long extra;
441 unsigned int i;
442 void *ptr;
443 int cpu;
444
445 if (align > PAGE_SIZE) {
446 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
447 name, align, PAGE_SIZE);
448 align = PAGE_SIZE;
449 }
450
451 ptr = __per_cpu_start;
452 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
453 /* Extra for alignment requirement. */
454 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
455 BUG_ON(i == 0 && extra != 0);
456
457 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
458 continue;
459
460 /* Transfer extra to previous block. */
461 if (pcpu_size[i-1] < 0)
462 pcpu_size[i-1] -= extra;
463 else
464 pcpu_size[i-1] += extra;
465 pcpu_size[i] -= extra;
466 ptr += extra;
467
468 /* Split block if warranted */
469 if (pcpu_size[i] - size > sizeof(unsigned long))
470 if (!split_block(i, size))
471 return NULL;
472
473 /* add the per-cpu scanning areas */
474 for_each_possible_cpu(cpu)
475 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
476 GFP_KERNEL);
477
478 /* Mark allocated */
479 pcpu_size[i] = -pcpu_size[i];
480 return ptr;
481 }
482
483 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
484 size);
485 return NULL;
486}
487
488static void percpu_modfree(void *freeme)
489{
490 unsigned int i;
491 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
492 int cpu;
493
494 /* First entry is core kernel percpu data. */
495 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
496 if (ptr == freeme) {
497 pcpu_size[i] = -pcpu_size[i];
498 goto free;
499 }
500 }
501 BUG();
502
503 free:
504 /* remove the per-cpu scanning areas */
505 for_each_possible_cpu(cpu)
506 kmemleak_free(freeme + per_cpu_offset(cpu));
507
508 /* Merge with previous? */
509 if (pcpu_size[i-1] >= 0) {
510 pcpu_size[i-1] += pcpu_size[i];
511 pcpu_num_used--;
512 memmove(&pcpu_size[i], &pcpu_size[i+1],
513 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
514 i--;
515 }
516 /* Merge with next? */
517 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
518 pcpu_size[i] += pcpu_size[i+1];
519 pcpu_num_used--;
520 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
521 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
522 }
523}
524
525static int percpu_modinit(void)
526{
527 pcpu_num_used = 2;
528 pcpu_num_allocated = 2;
529 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
530 GFP_KERNEL);
531 /* Static in-kernel percpu data (used). */
532 pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
533 /* Free room. */
534 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
535 if (pcpu_size[1] < 0) {
536 printk(KERN_ERR "No per-cpu room for modules.\n");
537 pcpu_num_used = 1;
538 }
539
540 return 0;
541}
542__initcall(percpu_modinit);
543
544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
545
546static unsigned int find_pcpusec(Elf_Ehdr *hdr, 396static unsigned int find_pcpusec(Elf_Ehdr *hdr,
547 Elf_Shdr *sechdrs, 397 Elf_Shdr *sechdrs,
548 const char *secstrings) 398 const char *secstrings)
@@ -1030,11 +880,23 @@ static int try_to_force_load(struct module *mod, const char *reason)
1030} 880}
1031 881
1032#ifdef CONFIG_MODVERSIONS 882#ifdef CONFIG_MODVERSIONS
883/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
884static unsigned long maybe_relocated(unsigned long crc,
885 const struct module *crc_owner)
886{
887#ifdef ARCH_RELOCATES_KCRCTAB
888 if (crc_owner == NULL)
889 return crc - (unsigned long)reloc_start;
890#endif
891 return crc;
892}
893
1033static int check_version(Elf_Shdr *sechdrs, 894static int check_version(Elf_Shdr *sechdrs,
1034 unsigned int versindex, 895 unsigned int versindex,
1035 const char *symname, 896 const char *symname,
1036 struct module *mod, 897 struct module *mod,
1037 const unsigned long *crc) 898 const unsigned long *crc,
899 const struct module *crc_owner)
1038{ 900{
1039 unsigned int i, num_versions; 901 unsigned int i, num_versions;
1040 struct modversion_info *versions; 902 struct modversion_info *versions;
@@ -1055,10 +917,10 @@ static int check_version(Elf_Shdr *sechdrs,
1055 if (strcmp(versions[i].name, symname) != 0) 917 if (strcmp(versions[i].name, symname) != 0)
1056 continue; 918 continue;
1057 919
1058 if (versions[i].crc == *crc) 920 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 921 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 922 DEBUGP("Found checksum %lX vs module %lX\n",
1061 *crc, versions[i].crc); 923 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 924 goto bad_version;
1063 } 925 }
1064 926
@@ -1081,7 +943,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1081 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 943 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1082 &crc, true, false)) 944 &crc, true, false))
1083 BUG(); 945 BUG();
1084 return check_version(sechdrs, versindex, "module_layout", mod, crc); 946 return check_version(sechdrs, versindex, "module_layout", mod, crc,
947 NULL);
1085} 948}
1086 949
1087/* First part is kernel version, which we ignore if module has crcs. */ 950/* First part is kernel version, which we ignore if module has crcs. */
@@ -1099,7 +962,8 @@ static inline int check_version(Elf_Shdr *sechdrs,
1099 unsigned int versindex, 962 unsigned int versindex,
1100 const char *symname, 963 const char *symname,
1101 struct module *mod, 964 struct module *mod,
1102 const unsigned long *crc) 965 const unsigned long *crc,
966 const struct module *crc_owner)
1103{ 967{
1104 return 1; 968 return 1;
1105} 969}
@@ -1134,8 +998,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1134 /* use_module can fail due to OOM, 998 /* use_module can fail due to OOM,
1135 or module initialization or unloading */ 999 or module initialization or unloading */
1136 if (sym) { 1000 if (sym) {
1137 if (!check_version(sechdrs, versindex, name, mod, crc) || 1001 if (!check_version(sechdrs, versindex, name, mod, crc, owner)
1138 !use_module(mod, owner)) 1002 || !use_module(mod, owner))
1139 sym = NULL; 1003 sym = NULL;
1140 } 1004 }
1141 return sym; 1005 return sym;
@@ -1146,6 +1010,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1146 * J. Corbet <corbet@lwn.net> 1010 * J. Corbet <corbet@lwn.net>
1147 */ 1011 */
1148#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1013
1014static inline bool sect_empty(const Elf_Shdr *sect)
1015{
1016 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1017}
1018
1149struct module_sect_attr 1019struct module_sect_attr
1150{ 1020{
1151 struct module_attribute mattr; 1021 struct module_attribute mattr;
@@ -1187,7 +1057,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1187 1057
1188 /* Count loaded sections and allocate structures */ 1058 /* Count loaded sections and allocate structures */
1189 for (i = 0; i < nsect; i++) 1059 for (i = 0; i < nsect; i++)
1190 if (sechdrs[i].sh_flags & SHF_ALLOC) 1060 if (!sect_empty(&sechdrs[i]))
1191 nloaded++; 1061 nloaded++;
1192 size[0] = ALIGN(sizeof(*sect_attrs) 1062 size[0] = ALIGN(sizeof(*sect_attrs)
1193 + nloaded * sizeof(sect_attrs->attrs[0]), 1063 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1205,7 +1075,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1205 sattr = &sect_attrs->attrs[0]; 1075 sattr = &sect_attrs->attrs[0];
1206 gattr = &sect_attrs->grp.attrs[0]; 1076 gattr = &sect_attrs->grp.attrs[0];
1207 for (i = 0; i < nsect; i++) { 1077 for (i = 0; i < nsect; i++) {
1208 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1078 if (sect_empty(&sechdrs[i]))
1209 continue; 1079 continue;
1210 sattr->address = sechdrs[i].sh_addr; 1080 sattr->address = sechdrs[i].sh_addr;
1211 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1081 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
@@ -1289,7 +1159,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1289 /* Count notes sections and allocate structures. */ 1159 /* Count notes sections and allocate structures. */
1290 notes = 0; 1160 notes = 0;
1291 for (i = 0; i < nsect; i++) 1161 for (i = 0; i < nsect; i++)
1292 if ((sechdrs[i].sh_flags & SHF_ALLOC) && 1162 if (!sect_empty(&sechdrs[i]) &&
1293 (sechdrs[i].sh_type == SHT_NOTE)) 1163 (sechdrs[i].sh_type == SHT_NOTE))
1294 ++notes; 1164 ++notes;
1295 1165
@@ -1305,7 +1175,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1305 notes_attrs->notes = notes; 1175 notes_attrs->notes = notes;
1306 nattr = &notes_attrs->attrs[0]; 1176 nattr = &notes_attrs->attrs[0];
1307 for (loaded = i = 0; i < nsect; ++i) { 1177 for (loaded = i = 0; i < nsect; ++i) {
1308 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1178 if (sect_empty(&sechdrs[i]))
1309 continue; 1179 continue;
1310 if (sechdrs[i].sh_type == SHT_NOTE) { 1180 if (sechdrs[i].sh_type == SHT_NOTE) {
1311 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
@@ -2043,9 +1913,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2043 unsigned int i; 1913 unsigned int i;
2044 1914
2045 /* only scan the sections containing data */ 1915 /* only scan the sections containing data */
2046 kmemleak_scan_area(mod->module_core, (unsigned long)mod - 1916 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2047 (unsigned long)mod->module_core,
2048 sizeof(struct module), GFP_KERNEL);
2049 1917
2050 for (i = 1; i < hdr->e_shnum; i++) { 1918 for (i = 1; i < hdr->e_shnum; i++) {
2051 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1919 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -2054,8 +1922,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2054 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) 1922 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2055 continue; 1923 continue;
2056 1924
2057 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - 1925 kmemleak_scan_area((void *)sechdrs[i].sh_addr,
2058 (unsigned long)mod->module_core,
2059 sechdrs[i].sh_size, GFP_KERNEL); 1926 sechdrs[i].sh_size, GFP_KERNEL);
2060 } 1927 }
2061} 1928}
@@ -2383,6 +2250,12 @@ static noinline struct module *load_module(void __user *umod,
2383 "_ftrace_events", 2250 "_ftrace_events",
2384 sizeof(*mod->trace_events), 2251 sizeof(*mod->trace_events),
2385 &mod->num_trace_events); 2252 &mod->num_trace_events);
2253 /*
2254 * This section contains pointers to allocated objects in the trace
2255 * code and not scanning it leads to false positives.
2256 */
2257 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2258 mod->num_trace_events, GFP_KERNEL);
2386#endif 2259#endif
2387#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2260#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2388 /* sechdrs[0].sh_size is always zero */ 2261 /* sechdrs[0].sh_size is always zero */
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 6b2d735846a..57d527a16f9 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock)
43 \ 43 \
44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \ 44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \
45 local_irq_save(flags); \ 45 local_irq_save(flags); \
46 __raw_spin_lock(&(lock)->raw_lock); \ 46 arch_spin_lock(&(lock)->rlock.raw_lock);\
47 DEBUG_LOCKS_WARN_ON(l->magic != l); \ 47 DEBUG_LOCKS_WARN_ON(l->magic != l); \
48 } while (0) 48 } while (0)
49 49
50#define spin_unlock_mutex(lock, flags) \ 50#define spin_unlock_mutex(lock, flags) \
51 do { \ 51 do { \
52 __raw_spin_unlock(&(lock)->raw_lock); \ 52 arch_spin_unlock(&(lock)->rlock.raw_lock); \
53 local_irq_restore(flags); \ 53 local_irq_restore(flags); \
54 preempt_check_resched(); \ 54 preempt_check_resched(); \
55 } while (0) 55 } while (0)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 947b3ad551f..632f04c57d8 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
148 148
149 preempt_disable(); 149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip); 150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ 151
152 !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) 152#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
153 /* 153 /*
154 * Optimistic spinning. 154 * Optimistic spinning.
155 * 155 *
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced..acd24e7643e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
558 558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 559static ATOMIC_NOTIFIER_HEAD(die_chain);
560 560
561int notrace notify_die(enum die_val val, const char *str, 561int notrace __kprobes notify_die(enum die_val val, const char *str,
562 struct pt_regs *regs, long err, int trap, int sig) 562 struct pt_regs *regs, long err, int trap, int sig)
563{ 563{
564 struct die_args args = { 564 struct die_args args = {
diff --git a/kernel/panic.c b/kernel/panic.c
index 96b45d0b4ba..c787333282b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/debug_locks.h> 11#include <linux/debug_locks.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/kmsg_dump.h>
13#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
14#include <linux/notifier.h> 15#include <linux/notifier.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -81,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...)
81 */ 82 */
82 crash_kexec(NULL); 83 crash_kexec(NULL);
83 84
85 kmsg_dump(KMSG_DUMP_PANIC);
86
84 /* 87 /*
85 * Note smp_send_stop is the usual smp shutdown function, which 88 * Note smp_send_stop is the usual smp shutdown function, which
86 * unfortunately means it may not be hardened to work in a panic 89 * unfortunately means it may not be hardened to work in a panic
@@ -339,6 +342,7 @@ void oops_exit(void)
339{ 342{
340 do_oops_enter_exit(); 343 do_oops_enter_exit();
341 print_oops_end_marker(); 344 print_oops_end_marker();
345 kmsg_dump(KMSG_DUMP_OOPS);
342} 346}
343 347
344#ifdef WANT_WARN_ON_SLOWPATH 348#ifdef WANT_WARN_ON_SLOWPATH
diff --git a/kernel/params.c b/kernel/params.c
index 9da58eabdcb..cf1b6918312 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,7 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
27 28
28#if 0 29#if 0
29#define DEBUGP printk 30#define DEBUGP printk
@@ -122,9 +123,7 @@ static char *next_arg(char *args, char **param, char **val)
122 next = args + i; 123 next = args + i;
123 124
124 /* Chew up trailing spaces. */ 125 /* Chew up trailing spaces. */
125 while (isspace(*next)) 126 return skip_spaces(next);
126 next++;
127 return next;
128} 127}
129 128
130/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 129/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
@@ -139,8 +138,7 @@ int parse_args(const char *name,
139 DEBUGP("Parsing ARGS: %s\n", args); 138 DEBUGP("Parsing ARGS: %s\n", args);
140 139
141 /* Chew leading spaces */ 140 /* Chew leading spaces */
142 while (isspace(*args)) 141 args = skip_spaces(args);
143 args++;
144 142
145 while (*args) { 143 while (*args) {
146 int ret; 144 int ret;
@@ -218,15 +216,11 @@ int param_set_charp(const char *val, struct kernel_param *kp)
218 return -ENOSPC; 216 return -ENOSPC;
219 } 217 }
220 218
221 if (kp->flags & KPARAM_KMALLOCED)
222 kfree(*(char **)kp->arg);
223
224 /* This is a hack. We can't need to strdup in early boot, and we 219 /* This is a hack. We can't need to strdup in early boot, and we
225 * don't need to; this mangled commandline is preserved. */ 220 * don't need to; this mangled commandline is preserved. */
226 if (slab_is_available()) { 221 if (slab_is_available()) {
227 kp->flags |= KPARAM_KMALLOCED;
228 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 222 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
229 if (!kp->arg) 223 if (!*(char **)kp->arg)
230 return -ENOMEM; 224 return -ENOMEM;
231 } else 225 } else
232 *(const char **)kp->arg = val; 226 *(const char **)kp->arg = val;
@@ -304,6 +298,7 @@ static int param_array(const char *name,
304 unsigned int min, unsigned int max, 298 unsigned int min, unsigned int max,
305 void *elem, int elemsize, 299 void *elem, int elemsize,
306 int (*set)(const char *, struct kernel_param *kp), 300 int (*set)(const char *, struct kernel_param *kp),
301 u16 flags,
307 unsigned int *num) 302 unsigned int *num)
308{ 303{
309 int ret; 304 int ret;
@@ -313,6 +308,7 @@ static int param_array(const char *name,
313 /* Get the name right for errors. */ 308 /* Get the name right for errors. */
314 kp.name = name; 309 kp.name = name;
315 kp.arg = elem; 310 kp.arg = elem;
311 kp.flags = flags;
316 312
317 /* No equals sign? */ 313 /* No equals sign? */
318 if (!val) { 314 if (!val) {
@@ -358,7 +354,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
358 unsigned int temp_num; 354 unsigned int temp_num;
359 355
360 return param_array(kp->name, val, 1, arr->max, arr->elem, 356 return param_array(kp->name, val, 1, arr->max, arr->elem,
361 arr->elemsize, arr->set, arr->num ?: &temp_num); 357 arr->elemsize, arr->set, kp->flags,
358 arr->num ?: &temp_num);
362} 359}
363 360
364int param_array_get(char *buffer, struct kernel_param *kp) 361int param_array_get(char *buffer, struct kernel_param *kp)
@@ -605,11 +602,7 @@ void module_param_sysfs_remove(struct module *mod)
605 602
606void destroy_params(const struct kernel_param *params, unsigned num) 603void destroy_params(const struct kernel_param *params, unsigned num)
607{ 604{
608 unsigned int i; 605 /* FIXME: This should free kmalloced charp parameters. It doesn't. */
609
610 for (i = 0; i < num; i++)
611 if (params[i].flags & KPARAM_KMALLOCED)
612 kfree(*(char **)params[i].arg);
613} 606}
614 607
615static void __init kernel_add_sysfs_param(const char *name, 608static void __init kernel_add_sysfs_param(const char *name,
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9d0b5c66588..2b19297742c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -28,13 +28,15 @@
28#include <linux/anon_inodes.h> 28#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h> 29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h>
32#include <linux/hw_breakpoint.h>
31 33
32#include <asm/irq_regs.h> 34#include <asm/irq_regs.h>
33 35
34/* 36/*
35 * Each CPU has a list of per CPU events: 37 * Each CPU has a list of per CPU events:
36 */ 38 */
37DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 39static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
38 40
39int perf_max_events __read_mostly = 1; 41int perf_max_events __read_mostly = 1;
40static int perf_reserved_percpu __read_mostly; 42static int perf_reserved_percpu __read_mostly;
@@ -201,14 +203,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
201 * if so. If we locked the right context, then it 203 * if so. If we locked the right context, then it
202 * can't get swapped on us any more. 204 * can't get swapped on us any more.
203 */ 205 */
204 spin_lock_irqsave(&ctx->lock, *flags); 206 raw_spin_lock_irqsave(&ctx->lock, *flags);
205 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 207 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
206 spin_unlock_irqrestore(&ctx->lock, *flags); 208 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
207 goto retry; 209 goto retry;
208 } 210 }
209 211
210 if (!atomic_inc_not_zero(&ctx->refcount)) { 212 if (!atomic_inc_not_zero(&ctx->refcount)) {
211 spin_unlock_irqrestore(&ctx->lock, *flags); 213 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
212 ctx = NULL; 214 ctx = NULL;
213 } 215 }
214 } 216 }
@@ -229,7 +231,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task
229 ctx = perf_lock_task_context(task, &flags); 231 ctx = perf_lock_task_context(task, &flags);
230 if (ctx) { 232 if (ctx) {
231 ++ctx->pin_count; 233 ++ctx->pin_count;
232 spin_unlock_irqrestore(&ctx->lock, flags); 234 raw_spin_unlock_irqrestore(&ctx->lock, flags);
233 } 235 }
234 return ctx; 236 return ctx;
235} 237}
@@ -238,12 +240,55 @@ static void perf_unpin_context(struct perf_event_context *ctx)
238{ 240{
239 unsigned long flags; 241 unsigned long flags;
240 242
241 spin_lock_irqsave(&ctx->lock, flags); 243 raw_spin_lock_irqsave(&ctx->lock, flags);
242 --ctx->pin_count; 244 --ctx->pin_count;
243 spin_unlock_irqrestore(&ctx->lock, flags); 245 raw_spin_unlock_irqrestore(&ctx->lock, flags);
244 put_ctx(ctx); 246 put_ctx(ctx);
245} 247}
246 248
249static inline u64 perf_clock(void)
250{
251 return cpu_clock(smp_processor_id());
252}
253
254/*
255 * Update the record of the current time in a context.
256 */
257static void update_context_time(struct perf_event_context *ctx)
258{
259 u64 now = perf_clock();
260
261 ctx->time += now - ctx->timestamp;
262 ctx->timestamp = now;
263}
264
265/*
266 * Update the total_time_enabled and total_time_running fields for a event.
267 */
268static void update_event_times(struct perf_event *event)
269{
270 struct perf_event_context *ctx = event->ctx;
271 u64 run_end;
272
273 if (event->state < PERF_EVENT_STATE_INACTIVE ||
274 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275 return;
276
277 if (ctx->is_active)
278 run_end = ctx->time;
279 else
280 run_end = event->tstamp_stopped;
281
282 event->total_time_enabled = run_end - event->tstamp_enabled;
283
284 if (event->state == PERF_EVENT_STATE_INACTIVE)
285 run_end = event->tstamp_stopped;
286 else
287 run_end = ctx->time;
288
289 event->total_time_running = run_end - event->tstamp_running;
290}
291
247/* 292/*
248 * Add a event from the lists for its context. 293 * Add a event from the lists for its context.
249 * Must be called with ctx->mutex and ctx->lock held. 294 * Must be called with ctx->mutex and ctx->lock held.
@@ -292,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
292 if (event->group_leader != event) 337 if (event->group_leader != event)
293 event->group_leader->nr_siblings--; 338 event->group_leader->nr_siblings--;
294 339
340 update_event_times(event);
341
342 /*
343 * If event was in error state, then keep it
344 * that way, otherwise bogus counts will be
345 * returned on read(). The only way to get out
346 * of error state is by explicit re-enabling
347 * of the event
348 */
349 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF;
351
295 /* 352 /*
296 * If this was a group event with sibling events then 353 * If this was a group event with sibling events then
297 * upgrade the siblings to singleton events by adding them 354 * upgrade the siblings to singleton events by adding them
@@ -370,7 +427,7 @@ static void __perf_event_remove_from_context(void *info)
370 if (ctx->task && cpuctx->task_ctx != ctx) 427 if (ctx->task && cpuctx->task_ctx != ctx)
371 return; 428 return;
372 429
373 spin_lock(&ctx->lock); 430 raw_spin_lock(&ctx->lock);
374 /* 431 /*
375 * Protect the list operation against NMI by disabling the 432 * Protect the list operation against NMI by disabling the
376 * events on a global level. 433 * events on a global level.
@@ -392,7 +449,7 @@ static void __perf_event_remove_from_context(void *info)
392 } 449 }
393 450
394 perf_enable(); 451 perf_enable();
395 spin_unlock(&ctx->lock); 452 raw_spin_unlock(&ctx->lock);
396} 453}
397 454
398 455
@@ -419,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
419 if (!task) { 476 if (!task) {
420 /* 477 /*
421 * Per cpu events are removed via an smp call and 478 * Per cpu events are removed via an smp call and
422 * the removal is always sucessful. 479 * the removal is always successful.
423 */ 480 */
424 smp_call_function_single(event->cpu, 481 smp_call_function_single(event->cpu,
425 __perf_event_remove_from_context, 482 __perf_event_remove_from_context,
@@ -431,12 +488,12 @@ retry:
431 task_oncpu_function_call(task, __perf_event_remove_from_context, 488 task_oncpu_function_call(task, __perf_event_remove_from_context,
432 event); 489 event);
433 490
434 spin_lock_irq(&ctx->lock); 491 raw_spin_lock_irq(&ctx->lock);
435 /* 492 /*
436 * If the context is active we need to retry the smp call. 493 * If the context is active we need to retry the smp call.
437 */ 494 */
438 if (ctx->nr_active && !list_empty(&event->group_entry)) { 495 if (ctx->nr_active && !list_empty(&event->group_entry)) {
439 spin_unlock_irq(&ctx->lock); 496 raw_spin_unlock_irq(&ctx->lock);
440 goto retry; 497 goto retry;
441 } 498 }
442 499
@@ -445,48 +502,9 @@ retry:
445 * can remove the event safely, if the call above did not 502 * can remove the event safely, if the call above did not
446 * succeed. 503 * succeed.
447 */ 504 */
448 if (!list_empty(&event->group_entry)) { 505 if (!list_empty(&event->group_entry))
449 list_del_event(event, ctx); 506 list_del_event(event, ctx);
450 } 507 raw_spin_unlock_irq(&ctx->lock);
451 spin_unlock_irq(&ctx->lock);
452}
453
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_event_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a event.
472 */
473static void update_event_times(struct perf_event *event)
474{
475 struct perf_event_context *ctx = event->ctx;
476 u64 run_end;
477
478 if (event->state < PERF_EVENT_STATE_INACTIVE ||
479 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
480 return;
481
482 event->total_time_enabled = ctx->time - event->tstamp_enabled;
483
484 if (event->state == PERF_EVENT_STATE_INACTIVE)
485 run_end = event->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 event->total_time_running = run_end - event->tstamp_running;
490} 508}
491 509
492/* 510/*
@@ -517,7 +535,7 @@ static void __perf_event_disable(void *info)
517 if (ctx->task && cpuctx->task_ctx != ctx) 535 if (ctx->task && cpuctx->task_ctx != ctx)
518 return; 536 return;
519 537
520 spin_lock(&ctx->lock); 538 raw_spin_lock(&ctx->lock);
521 539
522 /* 540 /*
523 * If the event is on, turn it off. 541 * If the event is on, turn it off.
@@ -533,7 +551,7 @@ static void __perf_event_disable(void *info)
533 event->state = PERF_EVENT_STATE_OFF; 551 event->state = PERF_EVENT_STATE_OFF;
534 } 552 }
535 553
536 spin_unlock(&ctx->lock); 554 raw_spin_unlock(&ctx->lock);
537} 555}
538 556
539/* 557/*
@@ -549,7 +567,7 @@ static void __perf_event_disable(void *info)
549 * is the current context on this CPU and preemption is disabled, 567 * is the current context on this CPU and preemption is disabled,
550 * hence we can't get into perf_event_task_sched_out for this context. 568 * hence we can't get into perf_event_task_sched_out for this context.
551 */ 569 */
552static void perf_event_disable(struct perf_event *event) 570void perf_event_disable(struct perf_event *event)
553{ 571{
554 struct perf_event_context *ctx = event->ctx; 572 struct perf_event_context *ctx = event->ctx;
555 struct task_struct *task = ctx->task; 573 struct task_struct *task = ctx->task;
@@ -566,12 +584,12 @@ static void perf_event_disable(struct perf_event *event)
566 retry: 584 retry:
567 task_oncpu_function_call(task, __perf_event_disable, event); 585 task_oncpu_function_call(task, __perf_event_disable, event);
568 586
569 spin_lock_irq(&ctx->lock); 587 raw_spin_lock_irq(&ctx->lock);
570 /* 588 /*
571 * If the event is still active, we need to retry the cross-call. 589 * If the event is still active, we need to retry the cross-call.
572 */ 590 */
573 if (event->state == PERF_EVENT_STATE_ACTIVE) { 591 if (event->state == PERF_EVENT_STATE_ACTIVE) {
574 spin_unlock_irq(&ctx->lock); 592 raw_spin_unlock_irq(&ctx->lock);
575 goto retry; 593 goto retry;
576 } 594 }
577 595
@@ -584,7 +602,7 @@ static void perf_event_disable(struct perf_event *event)
584 event->state = PERF_EVENT_STATE_OFF; 602 event->state = PERF_EVENT_STATE_OFF;
585 } 603 }
586 604
587 spin_unlock_irq(&ctx->lock); 605 raw_spin_unlock_irq(&ctx->lock);
588} 606}
589 607
590static int 608static int
@@ -752,7 +770,7 @@ static void __perf_install_in_context(void *info)
752 cpuctx->task_ctx = ctx; 770 cpuctx->task_ctx = ctx;
753 } 771 }
754 772
755 spin_lock(&ctx->lock); 773 raw_spin_lock(&ctx->lock);
756 ctx->is_active = 1; 774 ctx->is_active = 1;
757 update_context_time(ctx); 775 update_context_time(ctx);
758 776
@@ -764,6 +782,9 @@ static void __perf_install_in_context(void *info)
764 782
765 add_event_to_ctx(event, ctx); 783 add_event_to_ctx(event, ctx);
766 784
785 if (event->cpu != -1 && event->cpu != smp_processor_id())
786 goto unlock;
787
767 /* 788 /*
768 * Don't put the event on if it is disabled or if 789 * Don't put the event on if it is disabled or if
769 * it is in a group and the group isn't on. 790 * it is in a group and the group isn't on.
@@ -802,7 +823,7 @@ static void __perf_install_in_context(void *info)
802 unlock: 823 unlock:
803 perf_enable(); 824 perf_enable();
804 825
805 spin_unlock(&ctx->lock); 826 raw_spin_unlock(&ctx->lock);
806} 827}
807 828
808/* 829/*
@@ -827,7 +848,7 @@ perf_install_in_context(struct perf_event_context *ctx,
827 if (!task) { 848 if (!task) {
828 /* 849 /*
829 * Per cpu events are installed via an smp call and 850 * Per cpu events are installed via an smp call and
830 * the install is always sucessful. 851 * the install is always successful.
831 */ 852 */
832 smp_call_function_single(cpu, __perf_install_in_context, 853 smp_call_function_single(cpu, __perf_install_in_context,
833 event, 1); 854 event, 1);
@@ -838,12 +859,12 @@ retry:
838 task_oncpu_function_call(task, __perf_install_in_context, 859 task_oncpu_function_call(task, __perf_install_in_context,
839 event); 860 event);
840 861
841 spin_lock_irq(&ctx->lock); 862 raw_spin_lock_irq(&ctx->lock);
842 /* 863 /*
843 * we need to retry the smp call. 864 * we need to retry the smp call.
844 */ 865 */
845 if (ctx->is_active && list_empty(&event->group_entry)) { 866 if (ctx->is_active && list_empty(&event->group_entry)) {
846 spin_unlock_irq(&ctx->lock); 867 raw_spin_unlock_irq(&ctx->lock);
847 goto retry; 868 goto retry;
848 } 869 }
849 870
@@ -854,7 +875,7 @@ retry:
854 */ 875 */
855 if (list_empty(&event->group_entry)) 876 if (list_empty(&event->group_entry))
856 add_event_to_ctx(event, ctx); 877 add_event_to_ctx(event, ctx);
857 spin_unlock_irq(&ctx->lock); 878 raw_spin_unlock_irq(&ctx->lock);
858} 879}
859 880
860/* 881/*
@@ -899,7 +920,7 @@ static void __perf_event_enable(void *info)
899 cpuctx->task_ctx = ctx; 920 cpuctx->task_ctx = ctx;
900 } 921 }
901 922
902 spin_lock(&ctx->lock); 923 raw_spin_lock(&ctx->lock);
903 ctx->is_active = 1; 924 ctx->is_active = 1;
904 update_context_time(ctx); 925 update_context_time(ctx);
905 926
@@ -907,6 +928,9 @@ static void __perf_event_enable(void *info)
907 goto unlock; 928 goto unlock;
908 __perf_event_mark_enabled(event, ctx); 929 __perf_event_mark_enabled(event, ctx);
909 930
931 if (event->cpu != -1 && event->cpu != smp_processor_id())
932 goto unlock;
933
910 /* 934 /*
911 * If the event is in a group and isn't the group leader, 935 * If the event is in a group and isn't the group leader,
912 * then don't put it on unless the group is on. 936 * then don't put it on unless the group is on.
@@ -941,7 +965,7 @@ static void __perf_event_enable(void *info)
941 } 965 }
942 966
943 unlock: 967 unlock:
944 spin_unlock(&ctx->lock); 968 raw_spin_unlock(&ctx->lock);
945} 969}
946 970
947/* 971/*
@@ -953,7 +977,7 @@ static void __perf_event_enable(void *info)
953 * perf_event_for_each_child or perf_event_for_each as described 977 * perf_event_for_each_child or perf_event_for_each as described
954 * for perf_event_disable. 978 * for perf_event_disable.
955 */ 979 */
956static void perf_event_enable(struct perf_event *event) 980void perf_event_enable(struct perf_event *event)
957{ 981{
958 struct perf_event_context *ctx = event->ctx; 982 struct perf_event_context *ctx = event->ctx;
959 struct task_struct *task = ctx->task; 983 struct task_struct *task = ctx->task;
@@ -967,7 +991,7 @@ static void perf_event_enable(struct perf_event *event)
967 return; 991 return;
968 } 992 }
969 993
970 spin_lock_irq(&ctx->lock); 994 raw_spin_lock_irq(&ctx->lock);
971 if (event->state >= PERF_EVENT_STATE_INACTIVE) 995 if (event->state >= PERF_EVENT_STATE_INACTIVE)
972 goto out; 996 goto out;
973 997
@@ -982,10 +1006,10 @@ static void perf_event_enable(struct perf_event *event)
982 event->state = PERF_EVENT_STATE_OFF; 1006 event->state = PERF_EVENT_STATE_OFF;
983 1007
984 retry: 1008 retry:
985 spin_unlock_irq(&ctx->lock); 1009 raw_spin_unlock_irq(&ctx->lock);
986 task_oncpu_function_call(task, __perf_event_enable, event); 1010 task_oncpu_function_call(task, __perf_event_enable, event);
987 1011
988 spin_lock_irq(&ctx->lock); 1012 raw_spin_lock_irq(&ctx->lock);
989 1013
990 /* 1014 /*
991 * If the context is active and the event is still off, 1015 * If the context is active and the event is still off,
@@ -1002,7 +1026,7 @@ static void perf_event_enable(struct perf_event *event)
1002 __perf_event_mark_enabled(event, ctx); 1026 __perf_event_mark_enabled(event, ctx);
1003 1027
1004 out: 1028 out:
1005 spin_unlock_irq(&ctx->lock); 1029 raw_spin_unlock_irq(&ctx->lock);
1006} 1030}
1007 1031
1008static int perf_event_refresh(struct perf_event *event, int refresh) 1032static int perf_event_refresh(struct perf_event *event, int refresh)
@@ -1024,20 +1048,20 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1024{ 1048{
1025 struct perf_event *event; 1049 struct perf_event *event;
1026 1050
1027 spin_lock(&ctx->lock); 1051 raw_spin_lock(&ctx->lock);
1028 ctx->is_active = 0; 1052 ctx->is_active = 0;
1029 if (likely(!ctx->nr_events)) 1053 if (likely(!ctx->nr_events))
1030 goto out; 1054 goto out;
1031 update_context_time(ctx); 1055 update_context_time(ctx);
1032 1056
1033 perf_disable(); 1057 perf_disable();
1034 if (ctx->nr_active) 1058 if (ctx->nr_active) {
1035 list_for_each_entry(event, &ctx->group_list, group_entry) 1059 list_for_each_entry(event, &ctx->group_list, group_entry)
1036 group_sched_out(event, cpuctx, ctx); 1060 group_sched_out(event, cpuctx, ctx);
1037 1061 }
1038 perf_enable(); 1062 perf_enable();
1039 out: 1063 out:
1040 spin_unlock(&ctx->lock); 1064 raw_spin_unlock(&ctx->lock);
1041} 1065}
1042 1066
1043/* 1067/*
@@ -1059,8 +1083,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1059 && !ctx1->pin_count && !ctx2->pin_count; 1083 && !ctx1->pin_count && !ctx2->pin_count;
1060} 1084}
1061 1085
1062static void __perf_event_read(void *event);
1063
1064static void __perf_event_sync_stat(struct perf_event *event, 1086static void __perf_event_sync_stat(struct perf_event *event,
1065 struct perf_event *next_event) 1087 struct perf_event *next_event)
1066{ 1088{
@@ -1078,8 +1100,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1078 */ 1100 */
1079 switch (event->state) { 1101 switch (event->state) {
1080 case PERF_EVENT_STATE_ACTIVE: 1102 case PERF_EVENT_STATE_ACTIVE:
1081 __perf_event_read(event); 1103 event->pmu->read(event);
1082 break; 1104 /* fall-through */
1083 1105
1084 case PERF_EVENT_STATE_INACTIVE: 1106 case PERF_EVENT_STATE_INACTIVE:
1085 update_event_times(event); 1107 update_event_times(event);
@@ -1118,6 +1140,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1118 if (!ctx->nr_stat) 1140 if (!ctx->nr_stat)
1119 return; 1141 return;
1120 1142
1143 update_context_time(ctx);
1144
1121 event = list_first_entry(&ctx->event_list, 1145 event = list_first_entry(&ctx->event_list,
1122 struct perf_event, event_entry); 1146 struct perf_event, event_entry);
1123 1147
@@ -1161,8 +1185,6 @@ void perf_event_task_sched_out(struct task_struct *task,
1161 if (likely(!ctx || !cpuctx->task_ctx)) 1185 if (likely(!ctx || !cpuctx->task_ctx))
1162 return; 1186 return;
1163 1187
1164 update_context_time(ctx);
1165
1166 rcu_read_lock(); 1188 rcu_read_lock();
1167 parent = rcu_dereference(ctx->parent_ctx); 1189 parent = rcu_dereference(ctx->parent_ctx);
1168 next_ctx = next->perf_event_ctxp; 1190 next_ctx = next->perf_event_ctxp;
@@ -1177,8 +1199,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1177 * order we take the locks because no other cpu could 1199 * order we take the locks because no other cpu could
1178 * be trying to lock both of these tasks. 1200 * be trying to lock both of these tasks.
1179 */ 1201 */
1180 spin_lock(&ctx->lock); 1202 raw_spin_lock(&ctx->lock);
1181 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 1203 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1182 if (context_equiv(ctx, next_ctx)) { 1204 if (context_equiv(ctx, next_ctx)) {
1183 /* 1205 /*
1184 * XXX do we need a memory barrier of sorts 1206 * XXX do we need a memory barrier of sorts
@@ -1192,8 +1214,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1192 1214
1193 perf_event_sync_stat(ctx, next_ctx); 1215 perf_event_sync_stat(ctx, next_ctx);
1194 } 1216 }
1195 spin_unlock(&next_ctx->lock); 1217 raw_spin_unlock(&next_ctx->lock);
1196 spin_unlock(&ctx->lock); 1218 raw_spin_unlock(&ctx->lock);
1197 } 1219 }
1198 rcu_read_unlock(); 1220 rcu_read_unlock();
1199 1221
@@ -1235,7 +1257,7 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1235 struct perf_event *event; 1257 struct perf_event *event;
1236 int can_add_hw = 1; 1258 int can_add_hw = 1;
1237 1259
1238 spin_lock(&ctx->lock); 1260 raw_spin_lock(&ctx->lock);
1239 ctx->is_active = 1; 1261 ctx->is_active = 1;
1240 if (likely(!ctx->nr_events)) 1262 if (likely(!ctx->nr_events))
1241 goto out; 1263 goto out;
@@ -1290,7 +1312,7 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1290 } 1312 }
1291 perf_enable(); 1313 perf_enable();
1292 out: 1314 out:
1293 spin_unlock(&ctx->lock); 1315 raw_spin_unlock(&ctx->lock);
1294} 1316}
1295 1317
1296/* 1318/*
@@ -1354,11 +1376,14 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1354 struct hw_perf_event *hwc; 1376 struct hw_perf_event *hwc;
1355 u64 interrupts, freq; 1377 u64 interrupts, freq;
1356 1378
1357 spin_lock(&ctx->lock); 1379 raw_spin_lock(&ctx->lock);
1358 list_for_each_entry(event, &ctx->group_list, group_entry) { 1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1359 if (event->state != PERF_EVENT_STATE_ACTIVE) 1381 if (event->state != PERF_EVENT_STATE_ACTIVE)
1360 continue; 1382 continue;
1361 1383
1384 if (event->cpu != -1 && event->cpu != smp_processor_id())
1385 continue;
1386
1362 hwc = &event->hw; 1387 hwc = &event->hw;
1363 1388
1364 interrupts = hwc->interrupts; 1389 interrupts = hwc->interrupts;
@@ -1409,7 +1434,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1409 perf_enable(); 1434 perf_enable();
1410 } 1435 }
1411 } 1436 }
1412 spin_unlock(&ctx->lock); 1437 raw_spin_unlock(&ctx->lock);
1413} 1438}
1414 1439
1415/* 1440/*
@@ -1422,7 +1447,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1422 if (!ctx->nr_events) 1447 if (!ctx->nr_events)
1423 return; 1448 return;
1424 1449
1425 spin_lock(&ctx->lock); 1450 raw_spin_lock(&ctx->lock);
1426 /* 1451 /*
1427 * Rotate the first entry last (works just fine for group events too): 1452 * Rotate the first entry last (works just fine for group events too):
1428 */ 1453 */
@@ -1433,7 +1458,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1433 } 1458 }
1434 perf_enable(); 1459 perf_enable();
1435 1460
1436 spin_unlock(&ctx->lock); 1461 raw_spin_unlock(&ctx->lock);
1437} 1462}
1438 1463
1439void perf_event_task_tick(struct task_struct *curr, int cpu) 1464void perf_event_task_tick(struct task_struct *curr, int cpu)
@@ -1482,7 +1507,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1482 1507
1483 __perf_event_task_sched_out(ctx); 1508 __perf_event_task_sched_out(ctx);
1484 1509
1485 spin_lock(&ctx->lock); 1510 raw_spin_lock(&ctx->lock);
1486 1511
1487 list_for_each_entry(event, &ctx->group_list, group_entry) { 1512 list_for_each_entry(event, &ctx->group_list, group_entry) {
1488 if (!event->attr.enable_on_exec) 1513 if (!event->attr.enable_on_exec)
@@ -1500,7 +1525,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1500 if (enabled) 1525 if (enabled)
1501 unclone_ctx(ctx); 1526 unclone_ctx(ctx);
1502 1527
1503 spin_unlock(&ctx->lock); 1528 raw_spin_unlock(&ctx->lock);
1504 1529
1505 perf_event_task_sched_in(task, smp_processor_id()); 1530 perf_event_task_sched_in(task, smp_processor_id());
1506 out: 1531 out:
@@ -1515,7 +1540,6 @@ static void __perf_event_read(void *info)
1515 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1540 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1516 struct perf_event *event = info; 1541 struct perf_event *event = info;
1517 struct perf_event_context *ctx = event->ctx; 1542 struct perf_event_context *ctx = event->ctx;
1518 unsigned long flags;
1519 1543
1520 /* 1544 /*
1521 * If this is a task context, we need to check whether it is 1545 * If this is a task context, we need to check whether it is
@@ -1527,12 +1551,12 @@ static void __perf_event_read(void *info)
1527 if (ctx->task && cpuctx->task_ctx != ctx) 1551 if (ctx->task && cpuctx->task_ctx != ctx)
1528 return; 1552 return;
1529 1553
1530 local_irq_save(flags); 1554 raw_spin_lock(&ctx->lock);
1531 if (ctx->is_active) 1555 update_context_time(ctx);
1532 update_context_time(ctx);
1533 event->pmu->read(event);
1534 update_event_times(event); 1556 update_event_times(event);
1535 local_irq_restore(flags); 1557 raw_spin_unlock(&ctx->lock);
1558
1559 event->pmu->read(event);
1536} 1560}
1537 1561
1538static u64 perf_event_read(struct perf_event *event) 1562static u64 perf_event_read(struct perf_event *event)
@@ -1545,7 +1569,13 @@ static u64 perf_event_read(struct perf_event *event)
1545 smp_call_function_single(event->oncpu, 1569 smp_call_function_single(event->oncpu,
1546 __perf_event_read, event, 1); 1570 __perf_event_read, event, 1);
1547 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1571 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1572 struct perf_event_context *ctx = event->ctx;
1573 unsigned long flags;
1574
1575 raw_spin_lock_irqsave(&ctx->lock, flags);
1576 update_context_time(ctx);
1548 update_event_times(event); 1577 update_event_times(event);
1578 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1549 } 1579 }
1550 1580
1551 return atomic64_read(&event->count); 1581 return atomic64_read(&event->count);
@@ -1558,8 +1588,7 @@ static void
1558__perf_event_init_context(struct perf_event_context *ctx, 1588__perf_event_init_context(struct perf_event_context *ctx,
1559 struct task_struct *task) 1589 struct task_struct *task)
1560{ 1590{
1561 memset(ctx, 0, sizeof(*ctx)); 1591 raw_spin_lock_init(&ctx->lock);
1562 spin_lock_init(&ctx->lock);
1563 mutex_init(&ctx->mutex); 1592 mutex_init(&ctx->mutex);
1564 INIT_LIST_HEAD(&ctx->group_list); 1593 INIT_LIST_HEAD(&ctx->group_list);
1565 INIT_LIST_HEAD(&ctx->event_list); 1594 INIT_LIST_HEAD(&ctx->event_list);
@@ -1575,15 +1604,12 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1575 unsigned long flags; 1604 unsigned long flags;
1576 int err; 1605 int err;
1577 1606
1578 /* 1607 if (pid == -1 && cpu != -1) {
1579 * If cpu is not a wildcard then this is a percpu event:
1580 */
1581 if (cpu != -1) {
1582 /* Must be root to operate on a CPU event: */ 1608 /* Must be root to operate on a CPU event: */
1583 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 1609 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1584 return ERR_PTR(-EACCES); 1610 return ERR_PTR(-EACCES);
1585 1611
1586 if (cpu < 0 || cpu > num_possible_cpus()) 1612 if (cpu < 0 || cpu >= nr_cpumask_bits)
1587 return ERR_PTR(-EINVAL); 1613 return ERR_PTR(-EINVAL);
1588 1614
1589 /* 1615 /*
@@ -1591,7 +1617,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1591 * offline CPU and activate it when the CPU comes up, but 1617 * offline CPU and activate it when the CPU comes up, but
1592 * that's for later. 1618 * that's for later.
1593 */ 1619 */
1594 if (!cpu_isset(cpu, cpu_online_map)) 1620 if (!cpu_online(cpu))
1595 return ERR_PTR(-ENODEV); 1621 return ERR_PTR(-ENODEV);
1596 1622
1597 cpuctx = &per_cpu(perf_cpu_context, cpu); 1623 cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -1629,11 +1655,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1629 ctx = perf_lock_task_context(task, &flags); 1655 ctx = perf_lock_task_context(task, &flags);
1630 if (ctx) { 1656 if (ctx) {
1631 unclone_ctx(ctx); 1657 unclone_ctx(ctx);
1632 spin_unlock_irqrestore(&ctx->lock, flags); 1658 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1633 } 1659 }
1634 1660
1635 if (!ctx) { 1661 if (!ctx) {
1636 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1662 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1637 err = -ENOMEM; 1663 err = -ENOMEM;
1638 if (!ctx) 1664 if (!ctx)
1639 goto errout; 1665 goto errout;
@@ -1658,6 +1684,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1658 return ERR_PTR(err); 1684 return ERR_PTR(err);
1659} 1685}
1660 1686
1687static void perf_event_free_filter(struct perf_event *event);
1688
1661static void free_event_rcu(struct rcu_head *head) 1689static void free_event_rcu(struct rcu_head *head)
1662{ 1690{
1663 struct perf_event *event; 1691 struct perf_event *event;
@@ -1665,6 +1693,7 @@ static void free_event_rcu(struct rcu_head *head)
1665 event = container_of(head, struct perf_event, rcu_head); 1693 event = container_of(head, struct perf_event, rcu_head);
1666 if (event->ns) 1694 if (event->ns)
1667 put_pid_ns(event->ns); 1695 put_pid_ns(event->ns);
1696 perf_event_free_filter(event);
1668 kfree(event); 1697 kfree(event);
1669} 1698}
1670 1699
@@ -1696,16 +1725,10 @@ static void free_event(struct perf_event *event)
1696 call_rcu(&event->rcu_head, free_event_rcu); 1725 call_rcu(&event->rcu_head, free_event_rcu);
1697} 1726}
1698 1727
1699/* 1728int perf_event_release_kernel(struct perf_event *event)
1700 * Called when the last reference to the file is gone.
1701 */
1702static int perf_release(struct inode *inode, struct file *file)
1703{ 1729{
1704 struct perf_event *event = file->private_data;
1705 struct perf_event_context *ctx = event->ctx; 1730 struct perf_event_context *ctx = event->ctx;
1706 1731
1707 file->private_data = NULL;
1708
1709 WARN_ON_ONCE(ctx->parent_ctx); 1732 WARN_ON_ONCE(ctx->parent_ctx);
1710 mutex_lock(&ctx->mutex); 1733 mutex_lock(&ctx->mutex);
1711 perf_event_remove_from_context(event); 1734 perf_event_remove_from_context(event);
@@ -1720,6 +1743,19 @@ static int perf_release(struct inode *inode, struct file *file)
1720 1743
1721 return 0; 1744 return 0;
1722} 1745}
1746EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1747
1748/*
1749 * Called when the last reference to the file is gone.
1750 */
1751static int perf_release(struct inode *inode, struct file *file)
1752{
1753 struct perf_event *event = file->private_data;
1754
1755 file->private_data = NULL;
1756
1757 return perf_event_release_kernel(event);
1758}
1723 1759
1724static int perf_event_read_size(struct perf_event *event) 1760static int perf_event_read_size(struct perf_event *event)
1725{ 1761{
@@ -1746,91 +1782,94 @@ static int perf_event_read_size(struct perf_event *event)
1746 return size; 1782 return size;
1747} 1783}
1748 1784
1749static u64 perf_event_read_value(struct perf_event *event) 1785u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1750{ 1786{
1751 struct perf_event *child; 1787 struct perf_event *child;
1752 u64 total = 0; 1788 u64 total = 0;
1753 1789
1790 *enabled = 0;
1791 *running = 0;
1792
1793 mutex_lock(&event->child_mutex);
1754 total += perf_event_read(event); 1794 total += perf_event_read(event);
1755 list_for_each_entry(child, &event->child_list, child_list) 1795 *enabled += event->total_time_enabled +
1796 atomic64_read(&event->child_total_time_enabled);
1797 *running += event->total_time_running +
1798 atomic64_read(&event->child_total_time_running);
1799
1800 list_for_each_entry(child, &event->child_list, child_list) {
1756 total += perf_event_read(child); 1801 total += perf_event_read(child);
1802 *enabled += child->total_time_enabled;
1803 *running += child->total_time_running;
1804 }
1805 mutex_unlock(&event->child_mutex);
1757 1806
1758 return total; 1807 return total;
1759} 1808}
1760 1809EXPORT_SYMBOL_GPL(perf_event_read_value);
1761static int perf_event_read_entry(struct perf_event *event,
1762 u64 read_format, char __user *buf)
1763{
1764 int n = 0, count = 0;
1765 u64 values[2];
1766
1767 values[n++] = perf_event_read_value(event);
1768 if (read_format & PERF_FORMAT_ID)
1769 values[n++] = primary_event_id(event);
1770
1771 count = n * sizeof(u64);
1772
1773 if (copy_to_user(buf, values, count))
1774 return -EFAULT;
1775
1776 return count;
1777}
1778 1810
1779static int perf_event_read_group(struct perf_event *event, 1811static int perf_event_read_group(struct perf_event *event,
1780 u64 read_format, char __user *buf) 1812 u64 read_format, char __user *buf)
1781{ 1813{
1782 struct perf_event *leader = event->group_leader, *sub; 1814 struct perf_event *leader = event->group_leader, *sub;
1783 int n = 0, size = 0, err = -EFAULT; 1815 int n = 0, size = 0, ret = -EFAULT;
1784 u64 values[3]; 1816 struct perf_event_context *ctx = leader->ctx;
1817 u64 values[5];
1818 u64 count, enabled, running;
1819
1820 mutex_lock(&ctx->mutex);
1821 count = perf_event_read_value(leader, &enabled, &running);
1785 1822
1786 values[n++] = 1 + leader->nr_siblings; 1823 values[n++] = 1 + leader->nr_siblings;
1787 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1824 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1788 values[n++] = leader->total_time_enabled + 1825 values[n++] = enabled;
1789 atomic64_read(&leader->child_total_time_enabled); 1826 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1790 } 1827 values[n++] = running;
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1828 values[n++] = count;
1792 values[n++] = leader->total_time_running + 1829 if (read_format & PERF_FORMAT_ID)
1793 atomic64_read(&leader->child_total_time_running); 1830 values[n++] = primary_event_id(leader);
1794 }
1795 1831
1796 size = n * sizeof(u64); 1832 size = n * sizeof(u64);
1797 1833
1798 if (copy_to_user(buf, values, size)) 1834 if (copy_to_user(buf, values, size))
1799 return -EFAULT; 1835 goto unlock;
1800
1801 err = perf_event_read_entry(leader, read_format, buf + size);
1802 if (err < 0)
1803 return err;
1804 1836
1805 size += err; 1837 ret = size;
1806 1838
1807 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1839 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1808 err = perf_event_read_entry(sub, read_format, 1840 n = 0;
1809 buf + size); 1841
1810 if (err < 0) 1842 values[n++] = perf_event_read_value(sub, &enabled, &running);
1811 return err; 1843 if (read_format & PERF_FORMAT_ID)
1844 values[n++] = primary_event_id(sub);
1845
1846 size = n * sizeof(u64);
1847
1848 if (copy_to_user(buf + ret, values, size)) {
1849 ret = -EFAULT;
1850 goto unlock;
1851 }
1812 1852
1813 size += err; 1853 ret += size;
1814 } 1854 }
1855unlock:
1856 mutex_unlock(&ctx->mutex);
1815 1857
1816 return size; 1858 return ret;
1817} 1859}
1818 1860
1819static int perf_event_read_one(struct perf_event *event, 1861static int perf_event_read_one(struct perf_event *event,
1820 u64 read_format, char __user *buf) 1862 u64 read_format, char __user *buf)
1821{ 1863{
1864 u64 enabled, running;
1822 u64 values[4]; 1865 u64 values[4];
1823 int n = 0; 1866 int n = 0;
1824 1867
1825 values[n++] = perf_event_read_value(event); 1868 values[n++] = perf_event_read_value(event, &enabled, &running);
1826 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1869 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1827 values[n++] = event->total_time_enabled + 1870 values[n++] = enabled;
1828 atomic64_read(&event->child_total_time_enabled); 1871 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1829 } 1872 values[n++] = running;
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1831 values[n++] = event->total_time_running +
1832 atomic64_read(&event->child_total_time_running);
1833 }
1834 if (read_format & PERF_FORMAT_ID) 1873 if (read_format & PERF_FORMAT_ID)
1835 values[n++] = primary_event_id(event); 1874 values[n++] = primary_event_id(event);
1836 1875
@@ -1861,12 +1900,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861 return -ENOSPC; 1900 return -ENOSPC;
1862 1901
1863 WARN_ON_ONCE(event->ctx->parent_ctx); 1902 WARN_ON_ONCE(event->ctx->parent_ctx);
1864 mutex_lock(&event->child_mutex);
1865 if (read_format & PERF_FORMAT_GROUP) 1903 if (read_format & PERF_FORMAT_GROUP)
1866 ret = perf_event_read_group(event, read_format, buf); 1904 ret = perf_event_read_group(event, read_format, buf);
1867 else 1905 else
1868 ret = perf_event_read_one(event, read_format, buf); 1906 ret = perf_event_read_one(event, read_format, buf);
1869 mutex_unlock(&event->child_mutex);
1870 1907
1871 return ret; 1908 return ret;
1872} 1909}
@@ -1956,7 +1993,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1956 if (!value) 1993 if (!value)
1957 return -EINVAL; 1994 return -EINVAL;
1958 1995
1959 spin_lock_irq(&ctx->lock); 1996 raw_spin_lock_irq(&ctx->lock);
1960 if (event->attr.freq) { 1997 if (event->attr.freq) {
1961 if (value > sysctl_perf_event_sample_rate) { 1998 if (value > sysctl_perf_event_sample_rate) {
1962 ret = -EINVAL; 1999 ret = -EINVAL;
@@ -1969,12 +2006,13 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1969 event->hw.sample_period = value; 2006 event->hw.sample_period = value;
1970 } 2007 }
1971unlock: 2008unlock:
1972 spin_unlock_irq(&ctx->lock); 2009 raw_spin_unlock_irq(&ctx->lock);
1973 2010
1974 return ret; 2011 return ret;
1975} 2012}
1976 2013
1977int perf_event_set_output(struct perf_event *event, int output_fd); 2014static int perf_event_set_output(struct perf_event *event, int output_fd);
2015static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1978 2016
1979static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2017static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1980{ 2018{
@@ -2002,6 +2040,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2002 case PERF_EVENT_IOC_SET_OUTPUT: 2040 case PERF_EVENT_IOC_SET_OUTPUT:
2003 return perf_event_set_output(event, arg); 2041 return perf_event_set_output(event, arg);
2004 2042
2043 case PERF_EVENT_IOC_SET_FILTER:
2044 return perf_event_set_filter(event, (void __user *)arg);
2045
2005 default: 2046 default:
2006 return -ENOTTY; 2047 return -ENOTTY;
2007 } 2048 }
@@ -2174,6 +2215,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2174 perf_mmap_free_page((unsigned long)data->user_page); 2215 perf_mmap_free_page((unsigned long)data->user_page);
2175 for (i = 0; i < data->nr_pages; i++) 2216 for (i = 0; i < data->nr_pages; i++)
2176 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2217 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2218 kfree(data);
2177} 2219}
2178 2220
2179#else 2221#else
@@ -2214,6 +2256,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2214 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2256 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2215 2257
2216 vfree(base); 2258 vfree(base);
2259 kfree(data);
2217} 2260}
2218 2261
2219static void perf_mmap_data_free(struct perf_mmap_data *data) 2262static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2307,7 +2350,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2307 } 2350 }
2308 2351
2309 if (!data->watermark) 2352 if (!data->watermark)
2310 data->watermark = max_t(long, PAGE_SIZE, max_size / 2); 2353 data->watermark = max_size / 2;
2311 2354
2312 2355
2313 rcu_assign_pointer(event->data, data); 2356 rcu_assign_pointer(event->data, data);
@@ -2319,7 +2362,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2319 2362
2320 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2363 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2321 perf_mmap_data_free(data); 2364 perf_mmap_data_free(data);
2322 kfree(data);
2323} 2365}
2324 2366
2325static void perf_mmap_data_release(struct perf_event *event) 2367static void perf_mmap_data_release(struct perf_event *event)
@@ -2666,20 +2708,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2666static void perf_output_lock(struct perf_output_handle *handle) 2708static void perf_output_lock(struct perf_output_handle *handle)
2667{ 2709{
2668 struct perf_mmap_data *data = handle->data; 2710 struct perf_mmap_data *data = handle->data;
2669 int cpu; 2711 int cur, cpu = get_cpu();
2670 2712
2671 handle->locked = 0; 2713 handle->locked = 0;
2672 2714
2673 local_irq_save(handle->flags); 2715 for (;;) {
2674 cpu = smp_processor_id(); 2716 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2675 2717 if (cur == -1) {
2676 if (in_nmi() && atomic_read(&data->lock) == cpu) 2718 handle->locked = 1;
2677 return; 2719 break;
2720 }
2721 if (cur == cpu)
2722 break;
2678 2723
2679 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2680 cpu_relax(); 2724 cpu_relax();
2681 2725 }
2682 handle->locked = 1;
2683} 2726}
2684 2727
2685static void perf_output_unlock(struct perf_output_handle *handle) 2728static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2725,7 +2768,7 @@ again:
2725 if (atomic_xchg(&data->wakeup, 0)) 2768 if (atomic_xchg(&data->wakeup, 0))
2726 perf_output_wakeup(handle); 2769 perf_output_wakeup(handle);
2727out: 2770out:
2728 local_irq_restore(handle->flags); 2771 put_cpu();
2729} 2772}
2730 2773
2731void perf_output_copy(struct perf_output_handle *handle, 2774void perf_output_copy(struct perf_output_handle *handle,
@@ -3225,6 +3268,12 @@ static void perf_event_task_output(struct perf_event *event,
3225 3268
3226static int perf_event_task_match(struct perf_event *event) 3269static int perf_event_task_match(struct perf_event *event)
3227{ 3270{
3271 if (event->state != PERF_EVENT_STATE_ACTIVE)
3272 return 0;
3273
3274 if (event->cpu != -1 && event->cpu != smp_processor_id())
3275 return 0;
3276
3228 if (event->attr.comm || event->attr.mmap || event->attr.task) 3277 if (event->attr.comm || event->attr.mmap || event->attr.task)
3229 return 1; 3278 return 1;
3230 3279
@@ -3236,15 +3285,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3236{ 3285{
3237 struct perf_event *event; 3286 struct perf_event *event;
3238 3287
3239 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3240 return;
3241
3242 rcu_read_lock();
3243 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3288 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3244 if (perf_event_task_match(event)) 3289 if (perf_event_task_match(event))
3245 perf_event_task_output(event, task_event); 3290 perf_event_task_output(event, task_event);
3246 } 3291 }
3247 rcu_read_unlock();
3248} 3292}
3249 3293
3250static void perf_event_task_event(struct perf_task_event *task_event) 3294static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3252,15 +3296,14 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3252 struct perf_cpu_context *cpuctx; 3296 struct perf_cpu_context *cpuctx;
3253 struct perf_event_context *ctx = task_event->task_ctx; 3297 struct perf_event_context *ctx = task_event->task_ctx;
3254 3298
3299 rcu_read_lock();
3255 cpuctx = &get_cpu_var(perf_cpu_context); 3300 cpuctx = &get_cpu_var(perf_cpu_context);
3256 perf_event_task_ctx(&cpuctx->ctx, task_event); 3301 perf_event_task_ctx(&cpuctx->ctx, task_event);
3257 put_cpu_var(perf_cpu_context);
3258
3259 rcu_read_lock();
3260 if (!ctx) 3302 if (!ctx)
3261 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3303 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3262 if (ctx) 3304 if (ctx)
3263 perf_event_task_ctx(ctx, task_event); 3305 perf_event_task_ctx(ctx, task_event);
3306 put_cpu_var(perf_cpu_context);
3264 rcu_read_unlock(); 3307 rcu_read_unlock();
3265} 3308}
3266 3309
@@ -3337,6 +3380,12 @@ static void perf_event_comm_output(struct perf_event *event,
3337 3380
3338static int perf_event_comm_match(struct perf_event *event) 3381static int perf_event_comm_match(struct perf_event *event)
3339{ 3382{
3383 if (event->state != PERF_EVENT_STATE_ACTIVE)
3384 return 0;
3385
3386 if (event->cpu != -1 && event->cpu != smp_processor_id())
3387 return 0;
3388
3340 if (event->attr.comm) 3389 if (event->attr.comm)
3341 return 1; 3390 return 1;
3342 3391
@@ -3348,15 +3397,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3348{ 3397{
3349 struct perf_event *event; 3398 struct perf_event *event;
3350 3399
3351 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3352 return;
3353
3354 rcu_read_lock();
3355 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3400 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3356 if (perf_event_comm_match(event)) 3401 if (perf_event_comm_match(event))
3357 perf_event_comm_output(event, comm_event); 3402 perf_event_comm_output(event, comm_event);
3358 } 3403 }
3359 rcu_read_unlock();
3360} 3404}
3361 3405
3362static void perf_event_comm_event(struct perf_comm_event *comm_event) 3406static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3367,7 +3411,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3367 char comm[TASK_COMM_LEN]; 3411 char comm[TASK_COMM_LEN];
3368 3412
3369 memset(comm, 0, sizeof(comm)); 3413 memset(comm, 0, sizeof(comm));
3370 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3414 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3371 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3415 size = ALIGN(strlen(comm)+1, sizeof(u64));
3372 3416
3373 comm_event->comm = comm; 3417 comm_event->comm = comm;
@@ -3375,18 +3419,13 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3375 3419
3376 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3420 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3377 3421
3422 rcu_read_lock();
3378 cpuctx = &get_cpu_var(perf_cpu_context); 3423 cpuctx = &get_cpu_var(perf_cpu_context);
3379 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3424 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3380 put_cpu_var(perf_cpu_context);
3381
3382 rcu_read_lock();
3383 /*
3384 * doesn't really matter which of the child contexts the
3385 * events ends up in.
3386 */
3387 ctx = rcu_dereference(current->perf_event_ctxp); 3425 ctx = rcu_dereference(current->perf_event_ctxp);
3388 if (ctx) 3426 if (ctx)
3389 perf_event_comm_ctx(ctx, comm_event); 3427 perf_event_comm_ctx(ctx, comm_event);
3428 put_cpu_var(perf_cpu_context);
3390 rcu_read_unlock(); 3429 rcu_read_unlock();
3391} 3430}
3392 3431
@@ -3461,6 +3500,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3461static int perf_event_mmap_match(struct perf_event *event, 3500static int perf_event_mmap_match(struct perf_event *event,
3462 struct perf_mmap_event *mmap_event) 3501 struct perf_mmap_event *mmap_event)
3463{ 3502{
3503 if (event->state != PERF_EVENT_STATE_ACTIVE)
3504 return 0;
3505
3506 if (event->cpu != -1 && event->cpu != smp_processor_id())
3507 return 0;
3508
3464 if (event->attr.mmap) 3509 if (event->attr.mmap)
3465 return 1; 3510 return 1;
3466 3511
@@ -3472,15 +3517,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3472{ 3517{
3473 struct perf_event *event; 3518 struct perf_event *event;
3474 3519
3475 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3476 return;
3477
3478 rcu_read_lock();
3479 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3520 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3480 if (perf_event_mmap_match(event, mmap_event)) 3521 if (perf_event_mmap_match(event, mmap_event))
3481 perf_event_mmap_output(event, mmap_event); 3522 perf_event_mmap_output(event, mmap_event);
3482 } 3523 }
3483 rcu_read_unlock();
3484} 3524}
3485 3525
3486static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3526static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3536,18 +3576,13 @@ got_name:
3536 3576
3537 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3577 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3538 3578
3579 rcu_read_lock();
3539 cpuctx = &get_cpu_var(perf_cpu_context); 3580 cpuctx = &get_cpu_var(perf_cpu_context);
3540 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3581 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3541 put_cpu_var(perf_cpu_context);
3542
3543 rcu_read_lock();
3544 /*
3545 * doesn't really matter which of the child contexts the
3546 * events ends up in.
3547 */
3548 ctx = rcu_dereference(current->perf_event_ctxp); 3582 ctx = rcu_dereference(current->perf_event_ctxp);
3549 if (ctx) 3583 if (ctx)
3550 perf_event_mmap_ctx(ctx, mmap_event); 3584 perf_event_mmap_ctx(ctx, mmap_event);
3585 put_cpu_var(perf_cpu_context);
3551 rcu_read_unlock(); 3586 rcu_read_unlock();
3552 3587
3553 kfree(buf); 3588 kfree(buf);
@@ -3679,7 +3714,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3679 perf_event_disable(event); 3714 perf_event_disable(event);
3680 } 3715 }
3681 3716
3682 perf_event_output(event, nmi, data, regs); 3717 if (event->overflow_handler)
3718 event->overflow_handler(event, nmi, data, regs);
3719 else
3720 perf_event_output(event, nmi, data, regs);
3721
3683 return ret; 3722 return ret;
3684} 3723}
3685 3724
@@ -3724,16 +3763,16 @@ again:
3724 return nr; 3763 return nr;
3725} 3764}
3726 3765
3727static void perf_swevent_overflow(struct perf_event *event, 3766static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3728 int nmi, struct perf_sample_data *data, 3767 int nmi, struct perf_sample_data *data,
3729 struct pt_regs *regs) 3768 struct pt_regs *regs)
3730{ 3769{
3731 struct hw_perf_event *hwc = &event->hw; 3770 struct hw_perf_event *hwc = &event->hw;
3732 int throttle = 0; 3771 int throttle = 0;
3733 u64 overflow;
3734 3772
3735 data->period = event->hw.last_period; 3773 data->period = event->hw.last_period;
3736 overflow = perf_swevent_set_period(event); 3774 if (!overflow)
3775 overflow = perf_swevent_set_period(event);
3737 3776
3738 if (hwc->interrupts == MAX_INTERRUPTS) 3777 if (hwc->interrupts == MAX_INTERRUPTS)
3739 return; 3778 return;
@@ -3766,14 +3805,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3766 3805
3767 atomic64_add(nr, &event->count); 3806 atomic64_add(nr, &event->count);
3768 3807
3808 if (!regs)
3809 return;
3810
3769 if (!hwc->sample_period) 3811 if (!hwc->sample_period)
3770 return; 3812 return;
3771 3813
3772 if (!regs) 3814 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3815 return perf_swevent_overflow(event, 1, nmi, data, regs);
3816
3817 if (atomic64_add_negative(nr, &hwc->period_left))
3773 return; 3818 return;
3774 3819
3775 if (!atomic64_add_negative(nr, &hwc->period_left)) 3820 perf_swevent_overflow(event, 0, nmi, data, regs);
3776 perf_swevent_overflow(event, nmi, data, regs);
3777} 3821}
3778 3822
3779static int perf_swevent_is_counting(struct perf_event *event) 3823static int perf_swevent_is_counting(struct perf_event *event)
@@ -3806,25 +3850,47 @@ static int perf_swevent_is_counting(struct perf_event *event)
3806 return 1; 3850 return 1;
3807} 3851}
3808 3852
3853static int perf_tp_event_match(struct perf_event *event,
3854 struct perf_sample_data *data);
3855
3856static int perf_exclude_event(struct perf_event *event,
3857 struct pt_regs *regs)
3858{
3859 if (regs) {
3860 if (event->attr.exclude_user && user_mode(regs))
3861 return 1;
3862
3863 if (event->attr.exclude_kernel && !user_mode(regs))
3864 return 1;
3865 }
3866
3867 return 0;
3868}
3869
3809static int perf_swevent_match(struct perf_event *event, 3870static int perf_swevent_match(struct perf_event *event,
3810 enum perf_type_id type, 3871 enum perf_type_id type,
3811 u32 event_id, struct pt_regs *regs) 3872 u32 event_id,
3873 struct perf_sample_data *data,
3874 struct pt_regs *regs)
3812{ 3875{
3876 if (event->cpu != -1 && event->cpu != smp_processor_id())
3877 return 0;
3878
3813 if (!perf_swevent_is_counting(event)) 3879 if (!perf_swevent_is_counting(event))
3814 return 0; 3880 return 0;
3815 3881
3816 if (event->attr.type != type) 3882 if (event->attr.type != type)
3817 return 0; 3883 return 0;
3884
3818 if (event->attr.config != event_id) 3885 if (event->attr.config != event_id)
3819 return 0; 3886 return 0;
3820 3887
3821 if (regs) { 3888 if (perf_exclude_event(event, regs))
3822 if (event->attr.exclude_user && user_mode(regs)) 3889 return 0;
3823 return 0;
3824 3890
3825 if (event->attr.exclude_kernel && !user_mode(regs)) 3891 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3826 return 0; 3892 !perf_tp_event_match(event, data))
3827 } 3893 return 0;
3828 3894
3829 return 1; 3895 return 1;
3830} 3896}
@@ -3837,49 +3903,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3837{ 3903{
3838 struct perf_event *event; 3904 struct perf_event *event;
3839 3905
3840 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3841 return;
3842
3843 rcu_read_lock();
3844 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3906 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3845 if (perf_swevent_match(event, type, event_id, regs)) 3907 if (perf_swevent_match(event, type, event_id, data, regs))
3846 perf_swevent_add(event, nr, nmi, data, regs); 3908 perf_swevent_add(event, nr, nmi, data, regs);
3847 } 3909 }
3848 rcu_read_unlock();
3849} 3910}
3850 3911
3851static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 3912int perf_swevent_get_recursion_context(void)
3852{ 3913{
3914 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3915 int rctx;
3916
3853 if (in_nmi()) 3917 if (in_nmi())
3854 return &cpuctx->recursion[3]; 3918 rctx = 3;
3919 else if (in_irq())
3920 rctx = 2;
3921 else if (in_softirq())
3922 rctx = 1;
3923 else
3924 rctx = 0;
3855 3925
3856 if (in_irq()) 3926 if (cpuctx->recursion[rctx]) {
3857 return &cpuctx->recursion[2]; 3927 put_cpu_var(perf_cpu_context);
3928 return -1;
3929 }
3930
3931 cpuctx->recursion[rctx]++;
3932 barrier();
3858 3933
3859 if (in_softirq()) 3934 return rctx;
3860 return &cpuctx->recursion[1]; 3935}
3936EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3861 3937
3862 return &cpuctx->recursion[0]; 3938void perf_swevent_put_recursion_context(int rctx)
3939{
3940 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3941 barrier();
3942 cpuctx->recursion[rctx]--;
3943 put_cpu_var(perf_cpu_context);
3863} 3944}
3945EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3864 3946
3865static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 3947static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3866 u64 nr, int nmi, 3948 u64 nr, int nmi,
3867 struct perf_sample_data *data, 3949 struct perf_sample_data *data,
3868 struct pt_regs *regs) 3950 struct pt_regs *regs)
3869{ 3951{
3870 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3952 struct perf_cpu_context *cpuctx;
3871 int *recursion = perf_swevent_recursion_context(cpuctx);
3872 struct perf_event_context *ctx; 3953 struct perf_event_context *ctx;
3873 3954
3874 if (*recursion) 3955 cpuctx = &__get_cpu_var(perf_cpu_context);
3875 goto out; 3956 rcu_read_lock();
3876
3877 (*recursion)++;
3878 barrier();
3879
3880 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 3957 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3881 nr, nmi, data, regs); 3958 nr, nmi, data, regs);
3882 rcu_read_lock();
3883 /* 3959 /*
3884 * doesn't really matter which of the child contexts the 3960 * doesn't really matter which of the child contexts the
3885 * events ends up in. 3961 * events ends up in.
@@ -3888,23 +3964,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3888 if (ctx) 3964 if (ctx)
3889 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 3965 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3890 rcu_read_unlock(); 3966 rcu_read_unlock();
3891
3892 barrier();
3893 (*recursion)--;
3894
3895out:
3896 put_cpu_var(perf_cpu_context);
3897} 3967}
3898 3968
3899void __perf_sw_event(u32 event_id, u64 nr, int nmi, 3969void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3900 struct pt_regs *regs, u64 addr) 3970 struct pt_regs *regs, u64 addr)
3901{ 3971{
3902 struct perf_sample_data data = { 3972 struct perf_sample_data data;
3903 .addr = addr, 3973 int rctx;
3904 };
3905 3974
3906 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 3975 rctx = perf_swevent_get_recursion_context();
3907 &data, regs); 3976 if (rctx < 0)
3977 return;
3978
3979 data.addr = addr;
3980 data.raw = NULL;
3981
3982 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3983
3984 perf_swevent_put_recursion_context(rctx);
3908} 3985}
3909 3986
3910static void perf_swevent_read(struct perf_event *event) 3987static void perf_swevent_read(struct perf_event *event)
@@ -3949,6 +4026,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3949 event->pmu->read(event); 4026 event->pmu->read(event);
3950 4027
3951 data.addr = 0; 4028 data.addr = 0;
4029 data.raw = NULL;
4030 data.period = event->hw.last_period;
3952 regs = get_irq_regs(); 4031 regs = get_irq_regs();
3953 /* 4032 /*
3954 * In case we exclude kernel IPs or are somehow not in interrupt 4033 * In case we exclude kernel IPs or are somehow not in interrupt
@@ -3959,8 +4038,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3959 regs = task_pt_regs(current); 4038 regs = task_pt_regs(current);
3960 4039
3961 if (regs) { 4040 if (regs) {
3962 if (perf_event_overflow(event, 0, &data, regs)) 4041 if (!(event->attr.exclude_idle && current->pid == 0))
3963 ret = HRTIMER_NORESTART; 4042 if (perf_event_overflow(event, 0, &data, regs))
4043 ret = HRTIMER_NORESTART;
3964 } 4044 }
3965 4045
3966 period = max_t(u64, 10000, event->hw.sample_period); 4046 period = max_t(u64, 10000, event->hw.sample_period);
@@ -3969,6 +4049,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3969 return ret; 4049 return ret;
3970} 4050}
3971 4051
4052static void perf_swevent_start_hrtimer(struct perf_event *event)
4053{
4054 struct hw_perf_event *hwc = &event->hw;
4055
4056 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4057 hwc->hrtimer.function = perf_swevent_hrtimer;
4058 if (hwc->sample_period) {
4059 u64 period;
4060
4061 if (hwc->remaining) {
4062 if (hwc->remaining < 0)
4063 period = 10000;
4064 else
4065 period = hwc->remaining;
4066 hwc->remaining = 0;
4067 } else {
4068 period = max_t(u64, 10000, hwc->sample_period);
4069 }
4070 __hrtimer_start_range_ns(&hwc->hrtimer,
4071 ns_to_ktime(period), 0,
4072 HRTIMER_MODE_REL, 0);
4073 }
4074}
4075
4076static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4077{
4078 struct hw_perf_event *hwc = &event->hw;
4079
4080 if (hwc->sample_period) {
4081 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4082 hwc->remaining = ktime_to_ns(remaining);
4083
4084 hrtimer_cancel(&hwc->hrtimer);
4085 }
4086}
4087
3972/* 4088/*
3973 * Software event: cpu wall time clock 4089 * Software event: cpu wall time clock
3974 */ 4090 */
@@ -3980,8 +4096,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
3980 u64 now; 4096 u64 now;
3981 4097
3982 now = cpu_clock(cpu); 4098 now = cpu_clock(cpu);
3983 prev = atomic64_read(&event->hw.prev_count); 4099 prev = atomic64_xchg(&event->hw.prev_count, now);
3984 atomic64_set(&event->hw.prev_count, now);
3985 atomic64_add(now - prev, &event->count); 4100 atomic64_add(now - prev, &event->count);
3986} 4101}
3987 4102
@@ -3991,22 +4106,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
3991 int cpu = raw_smp_processor_id(); 4106 int cpu = raw_smp_processor_id();
3992 4107
3993 atomic64_set(&hwc->prev_count, cpu_clock(cpu)); 4108 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3994 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4109 perf_swevent_start_hrtimer(event);
3995 hwc->hrtimer.function = perf_swevent_hrtimer;
3996 if (hwc->sample_period) {
3997 u64 period = max_t(u64, 10000, hwc->sample_period);
3998 __hrtimer_start_range_ns(&hwc->hrtimer,
3999 ns_to_ktime(period), 0,
4000 HRTIMER_MODE_REL, 0);
4001 }
4002 4110
4003 return 0; 4111 return 0;
4004} 4112}
4005 4113
4006static void cpu_clock_perf_event_disable(struct perf_event *event) 4114static void cpu_clock_perf_event_disable(struct perf_event *event)
4007{ 4115{
4008 if (event->hw.sample_period) 4116 perf_swevent_cancel_hrtimer(event);
4009 hrtimer_cancel(&event->hw.hrtimer);
4010 cpu_clock_perf_event_update(event); 4117 cpu_clock_perf_event_update(event);
4011} 4118}
4012 4119
@@ -4043,22 +4150,15 @@ static int task_clock_perf_event_enable(struct perf_event *event)
4043 now = event->ctx->time; 4150 now = event->ctx->time;
4044 4151
4045 atomic64_set(&hwc->prev_count, now); 4152 atomic64_set(&hwc->prev_count, now);
4046 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4153
4047 hwc->hrtimer.function = perf_swevent_hrtimer; 4154 perf_swevent_start_hrtimer(event);
4048 if (hwc->sample_period) {
4049 u64 period = max_t(u64, 10000, hwc->sample_period);
4050 __hrtimer_start_range_ns(&hwc->hrtimer,
4051 ns_to_ktime(period), 0,
4052 HRTIMER_MODE_REL, 0);
4053 }
4054 4155
4055 return 0; 4156 return 0;
4056} 4157}
4057 4158
4058static void task_clock_perf_event_disable(struct perf_event *event) 4159static void task_clock_perf_event_disable(struct perf_event *event)
4059{ 4160{
4060 if (event->hw.sample_period) 4161 perf_swevent_cancel_hrtimer(event);
4061 hrtimer_cancel(&event->hw.hrtimer);
4062 task_clock_perf_event_update(event, event->ctx->time); 4162 task_clock_perf_event_update(event, event->ctx->time);
4063 4163
4064} 4164}
@@ -4086,6 +4186,7 @@ static const struct pmu perf_ops_task_clock = {
4086}; 4186};
4087 4187
4088#ifdef CONFIG_EVENT_PROFILE 4188#ifdef CONFIG_EVENT_PROFILE
4189
4089void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4190void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4090 int entry_size) 4191 int entry_size)
4091{ 4192{
@@ -4104,13 +4205,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4104 if (!regs) 4205 if (!regs)
4105 regs = task_pt_regs(current); 4206 regs = task_pt_regs(current);
4106 4207
4208 /* Trace events already protected against recursion */
4107 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4209 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4108 &data, regs); 4210 &data, regs);
4109} 4211}
4110EXPORT_SYMBOL_GPL(perf_tp_event); 4212EXPORT_SYMBOL_GPL(perf_tp_event);
4111 4213
4112extern int ftrace_profile_enable(int); 4214static int perf_tp_event_match(struct perf_event *event,
4113extern void ftrace_profile_disable(int); 4215 struct perf_sample_data *data)
4216{
4217 void *record = data->raw->data;
4218
4219 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4220 return 1;
4221 return 0;
4222}
4114 4223
4115static void tp_perf_event_destroy(struct perf_event *event) 4224static void tp_perf_event_destroy(struct perf_event *event)
4116{ 4225{
@@ -4135,11 +4244,93 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4135 4244
4136 return &perf_ops_generic; 4245 return &perf_ops_generic;
4137} 4246}
4247
4248static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4249{
4250 char *filter_str;
4251 int ret;
4252
4253 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4254 return -EINVAL;
4255
4256 filter_str = strndup_user(arg, PAGE_SIZE);
4257 if (IS_ERR(filter_str))
4258 return PTR_ERR(filter_str);
4259
4260 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4261
4262 kfree(filter_str);
4263 return ret;
4264}
4265
4266static void perf_event_free_filter(struct perf_event *event)
4267{
4268 ftrace_profile_free_filter(event);
4269}
4270
4138#else 4271#else
4272
4273static int perf_tp_event_match(struct perf_event *event,
4274 struct perf_sample_data *data)
4275{
4276 return 1;
4277}
4278
4139static const struct pmu *tp_perf_event_init(struct perf_event *event) 4279static const struct pmu *tp_perf_event_init(struct perf_event *event)
4140{ 4280{
4141 return NULL; 4281 return NULL;
4142} 4282}
4283
4284static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4285{
4286 return -ENOENT;
4287}
4288
4289static void perf_event_free_filter(struct perf_event *event)
4290{
4291}
4292
4293#endif /* CONFIG_EVENT_PROFILE */
4294
4295#ifdef CONFIG_HAVE_HW_BREAKPOINT
4296static void bp_perf_event_destroy(struct perf_event *event)
4297{
4298 release_bp_slot(event);
4299}
4300
4301static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4302{
4303 int err;
4304
4305 err = register_perf_hw_breakpoint(bp);
4306 if (err)
4307 return ERR_PTR(err);
4308
4309 bp->destroy = bp_perf_event_destroy;
4310
4311 return &perf_ops_bp;
4312}
4313
4314void perf_bp_event(struct perf_event *bp, void *data)
4315{
4316 struct perf_sample_data sample;
4317 struct pt_regs *regs = data;
4318
4319 sample.raw = NULL;
4320 sample.addr = bp->attr.bp_addr;
4321
4322 if (!perf_exclude_event(bp, regs))
4323 perf_swevent_add(bp, 1, 1, &sample, regs);
4324}
4325#else
4326static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4327{
4328 return NULL;
4329}
4330
4331void perf_bp_event(struct perf_event *bp, void *regs)
4332{
4333}
4143#endif 4334#endif
4144 4335
4145atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4336atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4186,6 +4377,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4186 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4377 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4187 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4378 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4188 case PERF_COUNT_SW_CPU_MIGRATIONS: 4379 case PERF_COUNT_SW_CPU_MIGRATIONS:
4380 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4381 case PERF_COUNT_SW_EMULATION_FAULTS:
4189 if (!event->parent) { 4382 if (!event->parent) {
4190 atomic_inc(&perf_swevent_enabled[event_id]); 4383 atomic_inc(&perf_swevent_enabled[event_id]);
4191 event->destroy = sw_perf_event_destroy; 4384 event->destroy = sw_perf_event_destroy;
@@ -4206,6 +4399,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4206 struct perf_event_context *ctx, 4399 struct perf_event_context *ctx,
4207 struct perf_event *group_leader, 4400 struct perf_event *group_leader,
4208 struct perf_event *parent_event, 4401 struct perf_event *parent_event,
4402 perf_overflow_handler_t overflow_handler,
4209 gfp_t gfpflags) 4403 gfp_t gfpflags)
4210{ 4404{
4211 const struct pmu *pmu; 4405 const struct pmu *pmu;
@@ -4248,6 +4442,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4248 4442
4249 event->state = PERF_EVENT_STATE_INACTIVE; 4443 event->state = PERF_EVENT_STATE_INACTIVE;
4250 4444
4445 if (!overflow_handler && parent_event)
4446 overflow_handler = parent_event->overflow_handler;
4447
4448 event->overflow_handler = overflow_handler;
4449
4251 if (attr->disabled) 4450 if (attr->disabled)
4252 event->state = PERF_EVENT_STATE_OFF; 4451 event->state = PERF_EVENT_STATE_OFF;
4253 4452
@@ -4282,6 +4481,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4282 pmu = tp_perf_event_init(event); 4481 pmu = tp_perf_event_init(event);
4283 break; 4482 break;
4284 4483
4484 case PERF_TYPE_BREAKPOINT:
4485 pmu = bp_perf_event_init(event);
4486 break;
4487
4488
4285 default: 4489 default:
4286 break; 4490 break;
4287 } 4491 }
@@ -4376,7 +4580,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4376 if (attr->type >= PERF_TYPE_MAX) 4580 if (attr->type >= PERF_TYPE_MAX)
4377 return -EINVAL; 4581 return -EINVAL;
4378 4582
4379 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) 4583 if (attr->__reserved_1)
4380 return -EINVAL; 4584 return -EINVAL;
4381 4585
4382 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4586 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4394,7 +4598,7 @@ err_size:
4394 goto out; 4598 goto out;
4395} 4599}
4396 4600
4397int perf_event_set_output(struct perf_event *event, int output_fd) 4601static int perf_event_set_output(struct perf_event *event, int output_fd)
4398{ 4602{
4399 struct perf_event *output_event = NULL; 4603 struct perf_event *output_event = NULL;
4400 struct file *output_file = NULL; 4604 struct file *output_file = NULL;
@@ -4524,12 +4728,12 @@ SYSCALL_DEFINE5(perf_event_open,
4524 } 4728 }
4525 4729
4526 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4730 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4527 NULL, GFP_KERNEL); 4731 NULL, NULL, GFP_KERNEL);
4528 err = PTR_ERR(event); 4732 err = PTR_ERR(event);
4529 if (IS_ERR(event)) 4733 if (IS_ERR(event))
4530 goto err_put_context; 4734 goto err_put_context;
4531 4735
4532 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); 4736 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4533 if (err < 0) 4737 if (err < 0)
4534 goto err_free_put_context; 4738 goto err_free_put_context;
4535 4739
@@ -4572,6 +4776,61 @@ err_put_context:
4572 return err; 4776 return err;
4573} 4777}
4574 4778
4779/**
4780 * perf_event_create_kernel_counter
4781 *
4782 * @attr: attributes of the counter to create
4783 * @cpu: cpu in which the counter is bound
4784 * @pid: task to profile
4785 */
4786struct perf_event *
4787perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4788 pid_t pid,
4789 perf_overflow_handler_t overflow_handler)
4790{
4791 struct perf_event *event;
4792 struct perf_event_context *ctx;
4793 int err;
4794
4795 /*
4796 * Get the target context (task or percpu):
4797 */
4798
4799 ctx = find_get_context(pid, cpu);
4800 if (IS_ERR(ctx)) {
4801 err = PTR_ERR(ctx);
4802 goto err_exit;
4803 }
4804
4805 event = perf_event_alloc(attr, cpu, ctx, NULL,
4806 NULL, overflow_handler, GFP_KERNEL);
4807 if (IS_ERR(event)) {
4808 err = PTR_ERR(event);
4809 goto err_put_context;
4810 }
4811
4812 event->filp = NULL;
4813 WARN_ON_ONCE(ctx->parent_ctx);
4814 mutex_lock(&ctx->mutex);
4815 perf_install_in_context(ctx, event, cpu);
4816 ++ctx->generation;
4817 mutex_unlock(&ctx->mutex);
4818
4819 event->owner = current;
4820 get_task_struct(current);
4821 mutex_lock(&current->perf_event_mutex);
4822 list_add_tail(&event->owner_entry, &current->perf_event_list);
4823 mutex_unlock(&current->perf_event_mutex);
4824
4825 return event;
4826
4827 err_put_context:
4828 put_ctx(ctx);
4829 err_exit:
4830 return ERR_PTR(err);
4831}
4832EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4833
4575/* 4834/*
4576 * inherit a event from parent task to child task: 4835 * inherit a event from parent task to child task:
4577 */ 4836 */
@@ -4597,7 +4856,7 @@ inherit_event(struct perf_event *parent_event,
4597 child_event = perf_event_alloc(&parent_event->attr, 4856 child_event = perf_event_alloc(&parent_event->attr,
4598 parent_event->cpu, child_ctx, 4857 parent_event->cpu, child_ctx,
4599 group_leader, parent_event, 4858 group_leader, parent_event,
4600 GFP_KERNEL); 4859 NULL, GFP_KERNEL);
4601 if (IS_ERR(child_event)) 4860 if (IS_ERR(child_event))
4602 return child_event; 4861 return child_event;
4603 get_ctx(child_ctx); 4862 get_ctx(child_ctx);
@@ -4615,6 +4874,8 @@ inherit_event(struct perf_event *parent_event,
4615 if (parent_event->attr.freq) 4874 if (parent_event->attr.freq)
4616 child_event->hw.sample_period = parent_event->hw.sample_period; 4875 child_event->hw.sample_period = parent_event->hw.sample_period;
4617 4876
4877 child_event->overflow_handler = parent_event->overflow_handler;
4878
4618 /* 4879 /*
4619 * Link it up in the child's context: 4880 * Link it up in the child's context:
4620 */ 4881 */
@@ -4704,7 +4965,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4704{ 4965{
4705 struct perf_event *parent_event; 4966 struct perf_event *parent_event;
4706 4967
4707 update_event_times(child_event);
4708 perf_event_remove_from_context(child_event); 4968 perf_event_remove_from_context(child_event);
4709 4969
4710 parent_event = child_event->parent; 4970 parent_event = child_event->parent;
@@ -4748,7 +5008,7 @@ void perf_event_exit_task(struct task_struct *child)
4748 * reading child->perf_event_ctxp, we wait until it has 5008 * reading child->perf_event_ctxp, we wait until it has
4749 * incremented the context's refcount before we do put_ctx below. 5009 * incremented the context's refcount before we do put_ctx below.
4750 */ 5010 */
4751 spin_lock(&child_ctx->lock); 5011 raw_spin_lock(&child_ctx->lock);
4752 child->perf_event_ctxp = NULL; 5012 child->perf_event_ctxp = NULL;
4753 /* 5013 /*
4754 * If this context is a clone; unclone it so it can't get 5014 * If this context is a clone; unclone it so it can't get
@@ -4756,7 +5016,8 @@ void perf_event_exit_task(struct task_struct *child)
4756 * the events from it. 5016 * the events from it.
4757 */ 5017 */
4758 unclone_ctx(child_ctx); 5018 unclone_ctx(child_ctx);
4759 spin_unlock_irqrestore(&child_ctx->lock, flags); 5019 update_context_time(child_ctx);
5020 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
4760 5021
4761 /* 5022 /*
4762 * Report the task dead after unscheduling the events so that we 5023 * Report the task dead after unscheduling the events so that we
@@ -4839,7 +5100,7 @@ again:
4839 */ 5100 */
4840int perf_event_init_task(struct task_struct *child) 5101int perf_event_init_task(struct task_struct *child)
4841{ 5102{
4842 struct perf_event_context *child_ctx, *parent_ctx; 5103 struct perf_event_context *child_ctx = NULL, *parent_ctx;
4843 struct perf_event_context *cloned_ctx; 5104 struct perf_event_context *cloned_ctx;
4844 struct perf_event *event; 5105 struct perf_event *event;
4845 struct task_struct *parent = current; 5106 struct task_struct *parent = current;
@@ -4855,20 +5116,6 @@ int perf_event_init_task(struct task_struct *child)
4855 return 0; 5116 return 0;
4856 5117
4857 /* 5118 /*
4858 * This is executed from the parent task context, so inherit
4859 * events that have been marked for cloning.
4860 * First allocate and initialize a context for the child.
4861 */
4862
4863 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4864 if (!child_ctx)
4865 return -ENOMEM;
4866
4867 __perf_event_init_context(child_ctx, child);
4868 child->perf_event_ctxp = child_ctx;
4869 get_task_struct(child);
4870
4871 /*
4872 * If the parent's context is a clone, pin it so it won't get 5119 * If the parent's context is a clone, pin it so it won't get
4873 * swapped under us. 5120 * swapped under us.
4874 */ 5121 */
@@ -4898,6 +5145,26 @@ int perf_event_init_task(struct task_struct *child)
4898 continue; 5145 continue;
4899 } 5146 }
4900 5147
5148 if (!child->perf_event_ctxp) {
5149 /*
5150 * This is executed from the parent task context, so
5151 * inherit events that have been marked for cloning.
5152 * First allocate and initialize a context for the
5153 * child.
5154 */
5155
5156 child_ctx = kzalloc(sizeof(struct perf_event_context),
5157 GFP_KERNEL);
5158 if (!child_ctx) {
5159 ret = -ENOMEM;
5160 break;
5161 }
5162
5163 __perf_event_init_context(child_ctx, child);
5164 child->perf_event_ctxp = child_ctx;
5165 get_task_struct(child);
5166 }
5167
4901 ret = inherit_group(event, parent, parent_ctx, 5168 ret = inherit_group(event, parent, parent_ctx,
4902 child, child_ctx); 5169 child, child_ctx);
4903 if (ret) { 5170 if (ret) {
@@ -4906,7 +5173,7 @@ int perf_event_init_task(struct task_struct *child)
4906 } 5173 }
4907 } 5174 }
4908 5175
4909 if (inherited_all) { 5176 if (child_ctx && inherited_all) {
4910 /* 5177 /*
4911 * Mark the child context as a clone of the parent 5178 * Mark the child context as a clone of the parent
4912 * context, or of whatever the parent is a clone of. 5179 * context, or of whatever the parent is a clone of.
@@ -5040,11 +5307,11 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5040 perf_reserved_percpu = val; 5307 perf_reserved_percpu = val;
5041 for_each_online_cpu(cpu) { 5308 for_each_online_cpu(cpu) {
5042 cpuctx = &per_cpu(perf_cpu_context, cpu); 5309 cpuctx = &per_cpu(perf_cpu_context, cpu);
5043 spin_lock_irq(&cpuctx->ctx.lock); 5310 raw_spin_lock_irq(&cpuctx->ctx.lock);
5044 mpt = min(perf_max_events - cpuctx->ctx.nr_events, 5311 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5045 perf_max_events - perf_reserved_percpu); 5312 perf_max_events - perf_reserved_percpu);
5046 cpuctx->max_pertask = mpt; 5313 cpuctx->max_pertask = mpt;
5047 spin_unlock_irq(&cpuctx->ctx.lock); 5314 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5048 } 5315 }
5049 spin_unlock(&perf_resource_lock); 5316 spin_unlock(&perf_resource_lock);
5050 5317
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d20f9..2e17c9c92cb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
141 * installing it: 141 * installing it:
142 */ 142 */
143 spin_lock_irq(&pidmap_lock); 143 spin_lock_irq(&pidmap_lock);
144 if (map->page) 144 if (!map->page) {
145 kfree(page);
146 else
147 map->page = page; 145 map->page = page;
146 page = NULL;
147 }
148 spin_unlock_irq(&pidmap_lock); 148 spin_unlock_irq(&pidmap_lock);
149 kfree(page);
149 if (unlikely(!map->page)) 150 if (unlikely(!map->page))
150 break; 151 break;
151 } 152 }
@@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
268 for (type = 0; type < PIDTYPE_MAX; ++type) 269 for (type = 0; type < PIDTYPE_MAX; ++type)
269 INIT_HLIST_HEAD(&pid->tasks[type]); 270 INIT_HLIST_HEAD(&pid->tasks[type]);
270 271
272 upid = pid->numbers + ns->level;
271 spin_lock_irq(&pidmap_lock); 273 spin_lock_irq(&pidmap_lock);
272 for (i = ns->level; i >= 0; i--) { 274 for ( ; upid >= pid->numbers; --upid)
273 upid = &pid->numbers[i];
274 hlist_add_head_rcu(&upid->pid_chain, 275 hlist_add_head_rcu(&upid->pid_chain,
275 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 276 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
276 }
277 spin_unlock_irq(&pidmap_lock); 277 spin_unlock_irq(&pidmap_lock);
278 278
279out: 279out:
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b..3db49b9ca37 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/time.h> 34#include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344} 343}
345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
346 345
347#define PID_NAME_LEN sizeof("process_1234567890") 346#define PID_NAME_LEN 32
348static char name[PID_NAME_LEN];
349 347
350static int pm_qos_power_open(struct inode *inode, struct file *filp) 348static int pm_qos_power_open(struct inode *inode, struct file *filp)
351{ 349{
352 int ret; 350 int ret;
353 long pm_qos_class; 351 long pm_qos_class;
352 char name[PID_NAME_LEN];
354 353
355 lock_kernel();
356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
357 if (pm_qos_class >= 0) { 355 if (pm_qos_class >= 0) {
358 filp->private_data = (void *)pm_qos_class; 356 filp->private_data = (void *)pm_qos_class;
359 sprintf(name, "process_%d", current->pid); 357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
360 ret = pm_qos_add_requirement(pm_qos_class, name, 358 ret = pm_qos_add_requirement(pm_qos_class, name,
361 PM_QOS_DEFAULT_VALUE); 359 PM_QOS_DEFAULT_VALUE);
362 if (ret >= 0) { 360 if (ret >= 0)
363 unlock_kernel();
364 return 0; 361 return 0;
365 }
366 } 362 }
367 unlock_kernel();
368
369 return -EPERM; 363 return -EPERM;
370} 364}
371 365
372static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
373{ 367{
374 int pm_qos_class; 368 int pm_qos_class;
369 char name[PID_NAME_LEN];
375 370
376 pm_qos_class = (long)filp->private_data; 371 pm_qos_class = (long)filp->private_data;
377 sprintf(name, "process_%d", current->pid); 372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
378 pm_qos_remove_requirement(pm_qos_class, name); 373 pm_qos_remove_requirement(pm_qos_class, name);
379 374
380 return 0; 375 return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
385{ 380{
386 s32 value; 381 s32 value;
387 int pm_qos_class; 382 int pm_qos_class;
383 char name[PID_NAME_LEN];
388 384
389 pm_qos_class = (long)filp->private_data; 385 pm_qos_class = (long)filp->private_data;
390 if (count != sizeof(s32)) 386 if (count != sizeof(s32))
391 return -EINVAL; 387 return -EINVAL;
392 if (copy_from_user(&value, buf, sizeof(s32))) 388 if (copy_from_user(&value, buf, sizeof(s32)))
393 return -EFAULT; 389 return -EFAULT;
394 sprintf(name, "process_%d", current->pid); 390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
395 pm_qos_update_requirement(pm_qos_class, name, value); 391 pm_qos_update_requirement(pm_qos_class, name, value);
396 392
397 return sizeof(s32); 393 return sizeof(s32);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747..438ff452351 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
384 384
385/* 385/*
386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. 386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
387 * This is called from sys_timer_create with the new timer already locked. 387 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
388 * new timer already all-zeros initialized.
388 */ 389 */
389int posix_cpu_timer_create(struct k_itimer *new_timer) 390int posix_cpu_timer_create(struct k_itimer *new_timer)
390{ 391{
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
396 return -EINVAL; 397 return -EINVAL;
397 398
398 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 399 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
399 new_timer->it.cpu.incr.sched = 0;
400 new_timer->it.cpu.expires.sched = 0;
401 400
402 read_lock(&tasklist_lock); 401 read_lock(&tasklist_lock);
403 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 402 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c3b81c30e5d..43191815f87 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 13
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 5187136fe1d..218e5af9015 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/vt.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
@@ -21,8 +21,7 @@ int pm_prepare_console(void)
21 if (orig_fgconsole < 0) 21 if (orig_fgconsole < 0)
22 return 1; 22 return 1;
23 23
24 orig_kmsg = kmsg_redirect; 24 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
25 kmsg_redirect = SUSPEND_CONSOLE;
26 return 0; 25 return 0;
27} 26}
28 27
@@ -30,7 +29,7 @@ void pm_restore_console(void)
30{ 29{
31 if (orig_fgconsole >= 0) { 30 if (orig_fgconsole >= 0) {
32 vt_move_to_console(orig_fgconsole, 0); 31 vt_move_to_console(orig_fgconsole, 0);
33 kmsg_redirect = orig_kmsg; 32 vt_kmsg_redirect(orig_kmsg);
34 } 33 }
35} 34}
36#endif 35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04b3a83d686..bbfe472d752 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -32,6 +32,7 @@ static int noresume = 0;
32static char resume_file[256] = CONFIG_PM_STD_PARTITION; 32static char resume_file[256] = CONFIG_PM_STD_PARTITION;
33dev_t swsusp_resume_device; 33dev_t swsusp_resume_device;
34sector_t swsusp_resume_block; 34sector_t swsusp_resume_block;
35int in_suspend __nosavedata = 0;
35 36
36enum { 37enum {
37 HIBERNATION_INVALID, 38 HIBERNATION_INVALID,
@@ -202,6 +203,35 @@ static void platform_recover(int platform_mode)
202} 203}
203 204
204/** 205/**
206 * swsusp_show_speed - print the time elapsed between two events.
207 * @start: Starting event.
208 * @stop: Final event.
209 * @nr_pages: number of pages processed between @start and @stop
210 * @msg: introductory message to print
211 */
212
213void swsusp_show_speed(struct timeval *start, struct timeval *stop,
214 unsigned nr_pages, char *msg)
215{
216 s64 elapsed_centisecs64;
217 int centisecs;
218 int k;
219 int kps;
220
221 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
222 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
223 centisecs = elapsed_centisecs64;
224 if (centisecs == 0)
225 centisecs = 1; /* avoid div-by-zero */
226 k = nr_pages * (PAGE_SIZE / 1024);
227 kps = (k * 100) / centisecs;
228 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
229 msg, k,
230 centisecs / 100, centisecs % 100,
231 kps / 1000, (kps % 1000) / 10);
232}
233
234/**
205 * create_image - freeze devices that need to be frozen with interrupts 235 * create_image - freeze devices that need to be frozen with interrupts
206 * off, create the hibernation image and thaw those devices. Control 236 * off, create the hibernation image and thaw those devices. Control
207 * reappears in this routine after a restore. 237 * reappears in this routine after a restore.
@@ -693,21 +723,22 @@ static int software_resume(void)
693 /* The snapshot device should not be opened while we're running */ 723 /* The snapshot device should not be opened while we're running */
694 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 724 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
695 error = -EBUSY; 725 error = -EBUSY;
726 swsusp_close(FMODE_READ);
696 goto Unlock; 727 goto Unlock;
697 } 728 }
698 729
699 pm_prepare_console(); 730 pm_prepare_console();
700 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 731 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
701 if (error) 732 if (error)
702 goto Finish; 733 goto close_finish;
703 734
704 error = usermodehelper_disable(); 735 error = usermodehelper_disable();
705 if (error) 736 if (error)
706 goto Finish; 737 goto close_finish;
707 738
708 error = create_basic_memory_bitmaps(); 739 error = create_basic_memory_bitmaps();
709 if (error) 740 if (error)
710 goto Finish; 741 goto close_finish;
711 742
712 pr_debug("PM: Preparing processes for restore.\n"); 743 pr_debug("PM: Preparing processes for restore.\n");
713 error = prepare_processes(); 744 error = prepare_processes();
@@ -719,6 +750,7 @@ static int software_resume(void)
719 pr_debug("PM: Reading hibernation image.\n"); 750 pr_debug("PM: Reading hibernation image.\n");
720 751
721 error = swsusp_read(&flags); 752 error = swsusp_read(&flags);
753 swsusp_close(FMODE_READ);
722 if (!error) 754 if (!error)
723 hibernation_restore(flags & SF_PLATFORM_MODE); 755 hibernation_restore(flags & SF_PLATFORM_MODE);
724 756
@@ -737,6 +769,9 @@ static int software_resume(void)
737 mutex_unlock(&pm_mutex); 769 mutex_unlock(&pm_mutex);
738 pr_debug("PM: Resume from disk failed.\n"); 770 pr_debug("PM: Resume from disk failed.\n");
739 return error; 771 return error;
772close_finish:
773 swsusp_close(FMODE_READ);
774 goto Finish;
740} 775}
741 776
742late_initcall(software_resume); 777late_initcall(software_resume);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 347d2cc88cd..0998c713905 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -220,6 +220,7 @@ static struct attribute_group attr_group = {
220 220
221#ifdef CONFIG_PM_RUNTIME 221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq; 222struct workqueue_struct *pm_wq;
223EXPORT_SYMBOL_GPL(pm_wq);
223 224
224static int __init pm_start_workqueue(void) 225static int __init pm_start_workqueue(void)
225{ 226{
diff --git a/kernel/power/process.c b/kernel/power/process.c
index cc2e55373b6..5ade1bdcf36 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h>
17 18
18/* 19/*
19 * Timeout for stopping processes 20 * Timeout for stopping processes
@@ -41,7 +42,7 @@ static int try_to_freeze_tasks(bool sig_only)
41 do_gettimeofday(&start); 42 do_gettimeofday(&start);
42 43
43 end_time = jiffies + TIMEOUT; 44 end_time = jiffies + TIMEOUT;
44 do { 45 while (true) {
45 todo = 0; 46 todo = 0;
46 read_lock(&tasklist_lock); 47 read_lock(&tasklist_lock);
47 do_each_thread(g, p) { 48 do_each_thread(g, p) {
@@ -62,10 +63,15 @@ static int try_to_freeze_tasks(bool sig_only)
62 todo++; 63 todo++;
63 } while_each_thread(g, p); 64 } while_each_thread(g, p);
64 read_unlock(&tasklist_lock); 65 read_unlock(&tasklist_lock);
65 yield(); /* Yield is okay here */ 66 if (!todo || time_after(jiffies, end_time))
66 if (time_after(jiffies, end_time))
67 break; 67 break;
68 } while (todo); 68
69 /*
70 * We need to retry, but first give the freezing tasks some
71 * time to enter the refrigerator.
72 */
73 msleep(10);
74 }
69 75
70 do_gettimeofday(&end); 76 do_gettimeofday(&end);
71 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); 77 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9..25596e450ac 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
19 * The time it takes is system-specific though, so when we test this 19 * The time it takes is system-specific though, so when we test this
20 * during system bootup we allow a LOT of time. 20 * during system bootup we allow a LOT of time.
21 */ 21 */
22#define TEST_SUSPEND_SECONDS 5 22#define TEST_SUSPEND_SECONDS 10
23 23
24static unsigned long suspend_test_start_time; 24static unsigned long suspend_test_start_time;
25 25
@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
49 * has some performance issues. The stack dump of a WARN_ON 49 * has some performance issues. The stack dump of a WARN_ON
50 * is more likely to get the right attention than a printk... 50 * is more likely to get the right attention than a printk...
51 */ 51 */
52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); 52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
53 "Component: %s, time: %u\n", label, msec);
53} 54}
54 55
55/* 56/*
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b101cdc4df3..09b2b0ae9e9 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -38,6 +38,107 @@ struct swsusp_header {
38 38
39static struct swsusp_header *swsusp_header; 39static struct swsusp_header *swsusp_header;
40 40
41/**
42 * The following functions are used for tracing the allocated
43 * swap pages, so that they can be freed in case of an error.
44 */
45
46struct swsusp_extent {
47 struct rb_node node;
48 unsigned long start;
49 unsigned long end;
50};
51
52static struct rb_root swsusp_extents = RB_ROOT;
53
54static int swsusp_extents_insert(unsigned long swap_offset)
55{
56 struct rb_node **new = &(swsusp_extents.rb_node);
57 struct rb_node *parent = NULL;
58 struct swsusp_extent *ext;
59
60 /* Figure out where to put the new node */
61 while (*new) {
62 ext = container_of(*new, struct swsusp_extent, node);
63 parent = *new;
64 if (swap_offset < ext->start) {
65 /* Try to merge */
66 if (swap_offset == ext->start - 1) {
67 ext->start--;
68 return 0;
69 }
70 new = &((*new)->rb_left);
71 } else if (swap_offset > ext->end) {
72 /* Try to merge */
73 if (swap_offset == ext->end + 1) {
74 ext->end++;
75 return 0;
76 }
77 new = &((*new)->rb_right);
78 } else {
79 /* It already is in the tree */
80 return -EINVAL;
81 }
82 }
83 /* Add the new node and rebalance the tree. */
84 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
85 if (!ext)
86 return -ENOMEM;
87
88 ext->start = swap_offset;
89 ext->end = swap_offset;
90 rb_link_node(&ext->node, parent, new);
91 rb_insert_color(&ext->node, &swsusp_extents);
92 return 0;
93}
94
95/**
96 * alloc_swapdev_block - allocate a swap page and register that it has
97 * been allocated, so that it can be freed in case of an error.
98 */
99
100sector_t alloc_swapdev_block(int swap)
101{
102 unsigned long offset;
103
104 offset = swp_offset(get_swap_page_of_type(swap));
105 if (offset) {
106 if (swsusp_extents_insert(offset))
107 swap_free(swp_entry(swap, offset));
108 else
109 return swapdev_block(swap, offset);
110 }
111 return 0;
112}
113
114/**
115 * free_all_swap_pages - free swap pages allocated for saving image data.
116 * It also frees the extents used to register which swap entries had been
117 * allocated.
118 */
119
120void free_all_swap_pages(int swap)
121{
122 struct rb_node *node;
123
124 while ((node = swsusp_extents.rb_node)) {
125 struct swsusp_extent *ext;
126 unsigned long offset;
127
128 ext = container_of(node, struct swsusp_extent, node);
129 rb_erase(node, &swsusp_extents);
130 for (offset = ext->start; offset <= ext->end; offset++)
131 swap_free(swp_entry(swap, offset));
132
133 kfree(ext);
134 }
135}
136
137int swsusp_swap_in_use(void)
138{
139 return (swsusp_extents.rb_node != NULL);
140}
141
41/* 142/*
42 * General things 143 * General things
43 */ 144 */
@@ -314,7 +415,6 @@ static int save_image(struct swap_map_handle *handle,
314{ 415{
315 unsigned int m; 416 unsigned int m;
316 int ret; 417 int ret;
317 int error = 0;
318 int nr_pages; 418 int nr_pages;
319 int err2; 419 int err2;
320 struct bio *bio; 420 struct bio *bio;
@@ -329,26 +429,27 @@ static int save_image(struct swap_map_handle *handle,
329 nr_pages = 0; 429 nr_pages = 0;
330 bio = NULL; 430 bio = NULL;
331 do_gettimeofday(&start); 431 do_gettimeofday(&start);
332 do { 432 while (1) {
333 ret = snapshot_read_next(snapshot, PAGE_SIZE); 433 ret = snapshot_read_next(snapshot, PAGE_SIZE);
334 if (ret > 0) { 434 if (ret <= 0)
335 error = swap_write_page(handle, data_of(*snapshot), 435 break;
336 &bio); 436 ret = swap_write_page(handle, data_of(*snapshot), &bio);
337 if (error) 437 if (ret)
338 break; 438 break;
339 if (!(nr_pages % m)) 439 if (!(nr_pages % m))
340 printk("\b\b\b\b%3d%%", nr_pages / m); 440 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
341 nr_pages++; 441 nr_pages++;
342 } 442 }
343 } while (ret > 0);
344 err2 = wait_on_bio_chain(&bio); 443 err2 = wait_on_bio_chain(&bio);
345 do_gettimeofday(&stop); 444 do_gettimeofday(&stop);
346 if (!error) 445 if (!ret)
347 error = err2; 446 ret = err2;
348 if (!error) 447 if (!ret)
349 printk("\b\b\b\bdone\n"); 448 printk(KERN_CONT "\b\b\b\bdone\n");
449 else
450 printk(KERN_CONT "\n");
350 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 451 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
351 return error; 452 return ret;
352} 453}
353 454
354/** 455/**
@@ -536,7 +637,8 @@ static int load_image(struct swap_map_handle *handle,
536 snapshot_write_finalize(snapshot); 637 snapshot_write_finalize(snapshot);
537 if (!snapshot_image_loaded(snapshot)) 638 if (!snapshot_image_loaded(snapshot))
538 error = -ENODATA; 639 error = -ENODATA;
539 } 640 } else
641 printk("\n");
540 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 642 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
541 return error; 643 return error;
542} 644}
@@ -572,8 +674,6 @@ int swsusp_read(unsigned int *flags_p)
572 error = load_image(&handle, &snapshot, header->pages - 1); 674 error = load_image(&handle, &snapshot, header->pages - 1);
573 release_swap_reader(&handle); 675 release_swap_reader(&handle);
574 676
575 blkdev_put(resume_bdev, FMODE_READ);
576
577 if (!error) 677 if (!error)
578 pr_debug("PM: Image successfully loaded\n"); 678 pr_debug("PM: Image successfully loaded\n");
579 else 679 else
@@ -596,7 +696,7 @@ int swsusp_check(void)
596 error = bio_read_page(swsusp_resume_block, 696 error = bio_read_page(swsusp_resume_block,
597 swsusp_header, NULL); 697 swsusp_header, NULL);
598 if (error) 698 if (error)
599 return error; 699 goto put;
600 700
601 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 701 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
602 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 702 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
@@ -604,8 +704,10 @@ int swsusp_check(void)
604 error = bio_write_page(swsusp_resume_block, 704 error = bio_write_page(swsusp_resume_block,
605 swsusp_header, NULL); 705 swsusp_header, NULL);
606 } else { 706 } else {
607 return -EINVAL; 707 error = -EINVAL;
608 } 708 }
709
710put:
609 if (error) 711 if (error)
610 blkdev_put(resume_bdev, FMODE_READ); 712 blkdev_put(resume_bdev, FMODE_READ);
611 else 713 else
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 6a07f4dbf2f..5b3601bd189 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -56,133 +56,3 @@
56#include "power.h" 56#include "power.h"
57 57
58int in_suspend __nosavedata = 0; 58int in_suspend __nosavedata = 0;
59
60/**
61 * The following functions are used for tracing the allocated
62 * swap pages, so that they can be freed in case of an error.
63 */
64
65struct swsusp_extent {
66 struct rb_node node;
67 unsigned long start;
68 unsigned long end;
69};
70
71static struct rb_root swsusp_extents = RB_ROOT;
72
73static int swsusp_extents_insert(unsigned long swap_offset)
74{
75 struct rb_node **new = &(swsusp_extents.rb_node);
76 struct rb_node *parent = NULL;
77 struct swsusp_extent *ext;
78
79 /* Figure out where to put the new node */
80 while (*new) {
81 ext = container_of(*new, struct swsusp_extent, node);
82 parent = *new;
83 if (swap_offset < ext->start) {
84 /* Try to merge */
85 if (swap_offset == ext->start - 1) {
86 ext->start--;
87 return 0;
88 }
89 new = &((*new)->rb_left);
90 } else if (swap_offset > ext->end) {
91 /* Try to merge */
92 if (swap_offset == ext->end + 1) {
93 ext->end++;
94 return 0;
95 }
96 new = &((*new)->rb_right);
97 } else {
98 /* It already is in the tree */
99 return -EINVAL;
100 }
101 }
102 /* Add the new node and rebalance the tree. */
103 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
104 if (!ext)
105 return -ENOMEM;
106
107 ext->start = swap_offset;
108 ext->end = swap_offset;
109 rb_link_node(&ext->node, parent, new);
110 rb_insert_color(&ext->node, &swsusp_extents);
111 return 0;
112}
113
114/**
115 * alloc_swapdev_block - allocate a swap page and register that it has
116 * been allocated, so that it can be freed in case of an error.
117 */
118
119sector_t alloc_swapdev_block(int swap)
120{
121 unsigned long offset;
122
123 offset = swp_offset(get_swap_page_of_type(swap));
124 if (offset) {
125 if (swsusp_extents_insert(offset))
126 swap_free(swp_entry(swap, offset));
127 else
128 return swapdev_block(swap, offset);
129 }
130 return 0;
131}
132
133/**
134 * free_all_swap_pages - free swap pages allocated for saving image data.
135 * It also frees the extents used to register which swap entries had been
136 * allocated.
137 */
138
139void free_all_swap_pages(int swap)
140{
141 struct rb_node *node;
142
143 while ((node = swsusp_extents.rb_node)) {
144 struct swsusp_extent *ext;
145 unsigned long offset;
146
147 ext = container_of(node, struct swsusp_extent, node);
148 rb_erase(node, &swsusp_extents);
149 for (offset = ext->start; offset <= ext->end; offset++)
150 swap_free(swp_entry(swap, offset));
151
152 kfree(ext);
153 }
154}
155
156int swsusp_swap_in_use(void)
157{
158 return (swsusp_extents.rb_node != NULL);
159}
160
161/**
162 * swsusp_show_speed - print the time elapsed between two events represented by
163 * @start and @stop
164 *
165 * @nr_pages - number of pages processed between @start and @stop
166 * @msg - introductory message to print
167 */
168
169void swsusp_show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
185 msg, k,
186 centisecs / 100, centisecs % 100,
187 kps / 1000, (kps % 1000) / 10);
188}
diff --git a/kernel/printk.c b/kernel/printk.c
index f38b07f78a4..1751c456b71 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,8 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h>
36 38
37#include <asm/uaccess.h> 39#include <asm/uaccess.h>
38 40
@@ -1376,11 +1378,11 @@ late_initcall(disable_boot_consoles);
1376 */ 1378 */
1377DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); 1379DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1378 1380
1379int printk_ratelimit(void) 1381int __printk_ratelimit(const char *func)
1380{ 1382{
1381 return __ratelimit(&printk_ratelimit_state); 1383 return ___ratelimit(&printk_ratelimit_state, func);
1382} 1384}
1383EXPORT_SYMBOL(printk_ratelimit); 1385EXPORT_SYMBOL(__printk_ratelimit);
1384 1386
1385/** 1387/**
1386 * printk_timed_ratelimit - caller-controlled printk ratelimiting 1388 * printk_timed_ratelimit - caller-controlled printk ratelimiting
@@ -1404,4 +1406,123 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1404 return false; 1406 return false;
1405} 1407}
1406EXPORT_SYMBOL(printk_timed_ratelimit); 1408EXPORT_SYMBOL(printk_timed_ratelimit);
1409
1410static DEFINE_SPINLOCK(dump_list_lock);
1411static LIST_HEAD(dump_list);
1412
1413/**
1414 * kmsg_dump_register - register a kernel log dumper.
1415 * @dumper: pointer to the kmsg_dumper structure
1416 *
1417 * Adds a kernel log dumper to the system. The dump callback in the
1418 * structure will be called when the kernel oopses or panics and must be
1419 * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
1420 */
1421int kmsg_dump_register(struct kmsg_dumper *dumper)
1422{
1423 unsigned long flags;
1424 int err = -EBUSY;
1425
1426 /* The dump callback needs to be set */
1427 if (!dumper->dump)
1428 return -EINVAL;
1429
1430 spin_lock_irqsave(&dump_list_lock, flags);
1431 /* Don't allow registering multiple times */
1432 if (!dumper->registered) {
1433 dumper->registered = 1;
1434 list_add_tail(&dumper->list, &dump_list);
1435 err = 0;
1436 }
1437 spin_unlock_irqrestore(&dump_list_lock, flags);
1438
1439 return err;
1440}
1441EXPORT_SYMBOL_GPL(kmsg_dump_register);
1442
1443/**
1444 * kmsg_dump_unregister - unregister a kmsg dumper.
1445 * @dumper: pointer to the kmsg_dumper structure
1446 *
1447 * Removes a dump device from the system. Returns zero on success and
1448 * %-EINVAL otherwise.
1449 */
1450int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1451{
1452 unsigned long flags;
1453 int err = -EINVAL;
1454
1455 spin_lock_irqsave(&dump_list_lock, flags);
1456 if (dumper->registered) {
1457 dumper->registered = 0;
1458 list_del(&dumper->list);
1459 err = 0;
1460 }
1461 spin_unlock_irqrestore(&dump_list_lock, flags);
1462
1463 return err;
1464}
1465EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1466
1467static const char const *kmsg_reasons[] = {
1468 [KMSG_DUMP_OOPS] = "oops",
1469 [KMSG_DUMP_PANIC] = "panic",
1470 [KMSG_DUMP_KEXEC] = "kexec",
1471};
1472
1473static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1474{
1475 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1476 return "unknown";
1477
1478 return kmsg_reasons[reason];
1479}
1480
1481/**
1482 * kmsg_dump - dump kernel log to kernel message dumpers.
1483 * @reason: the reason (oops, panic etc) for dumping
1484 *
1485 * Iterate through each of the dump devices and call the oops/panic
1486 * callbacks with the log buffer.
1487 */
1488void kmsg_dump(enum kmsg_dump_reason reason)
1489{
1490 unsigned long end;
1491 unsigned chars;
1492 struct kmsg_dumper *dumper;
1493 const char *s1, *s2;
1494 unsigned long l1, l2;
1495 unsigned long flags;
1496
1497 /* Theoretically, the log could move on after we do this, but
1498 there's not a lot we can do about that. The new messages
1499 will overwrite the start of what we dump. */
1500 spin_lock_irqsave(&logbuf_lock, flags);
1501 end = log_end & LOG_BUF_MASK;
1502 chars = logged_chars;
1503 spin_unlock_irqrestore(&logbuf_lock, flags);
1504
1505 if (logged_chars > end) {
1506 s1 = log_buf + log_buf_len - logged_chars + end;
1507 l1 = logged_chars - end;
1508
1509 s2 = log_buf;
1510 l2 = end;
1511 } else {
1512 s1 = "";
1513 l1 = 0;
1514
1515 s2 = log_buf + end - logged_chars;
1516 l2 = logged_chars;
1517 }
1518
1519 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
1520 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
1521 kmsg_to_str(reason));
1522 return;
1523 }
1524 list_for_each_entry(dumper, &dump_list, list)
1525 dumper->dump(dumper, reason, s1, l1, s2, l2);
1526 spin_unlock_irqrestore(&dump_list_lock, flags);
1527}
1407#endif 1528#endif
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 400183346ad..9b7fd472387 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48 47
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 48#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 49static struct lock_class_key rcu_lock_key;
@@ -53,8 +52,6 @@ struct lockdep_map rcu_lock_map =
53EXPORT_SYMBOL_GPL(rcu_lock_map); 52EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif 53#endif
55 54
56int rcu_scheduler_active __read_mostly;
57
58/* 55/*
59 * Awaken the corresponding synchronize_rcu() instance now that a 56 * Awaken the corresponding synchronize_rcu() instance now that a
60 * grace period has elapsed. 57 * grace period has elapsed.
@@ -66,122 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head)
66 rcu = container_of(head, struct rcu_synchronize, head); 63 rcu = container_of(head, struct rcu_synchronize, head);
67 complete(&rcu->completion); 64 complete(&rcu->completion);
68} 65}
69
70#ifdef CONFIG_TREE_PREEMPT_RCU
71
72/**
73 * synchronize_rcu - wait until a grace period has elapsed.
74 *
75 * Control will return to the caller some time after a full grace
76 * period has elapsed, in other words after all currently executing RCU
77 * read-side critical sections have completed. RCU read-side critical
78 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
79 * and may be nested.
80 */
81void synchronize_rcu(void)
82{
83 struct rcu_synchronize rcu;
84
85 if (!rcu_scheduler_active)
86 return;
87
88 init_completion(&rcu.completion);
89 /* Will wake me after RCU finished. */
90 call_rcu(&rcu.head, wakeme_after_rcu);
91 /* Wait for it. */
92 wait_for_completion(&rcu.completion);
93}
94EXPORT_SYMBOL_GPL(synchronize_rcu);
95
96#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
97
98/**
99 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
100 *
101 * Control will return to the caller some time after a full rcu-sched
102 * grace period has elapsed, in other words after all currently executing
103 * rcu-sched read-side critical sections have completed. These read-side
104 * critical sections are delimited by rcu_read_lock_sched() and
105 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
106 * local_irq_disable(), and so on may be used in place of
107 * rcu_read_lock_sched().
108 *
109 * This means that all preempt_disable code sequences, including NMI and
110 * hardware-interrupt handlers, in progress on entry will have completed
111 * before this primitive returns. However, this does not guarantee that
112 * softirq handlers will have completed, since in some kernels, these
113 * handlers can run in process context, and can block.
114 *
115 * This primitive provides the guarantees made by the (now removed)
116 * synchronize_kernel() API. In contrast, synchronize_rcu() only
117 * guarantees that rcu_read_lock() sections will have completed.
118 * In "classic RCU", these two guarantees happen to be one and
119 * the same, but can differ in realtime RCU implementations.
120 */
121void synchronize_sched(void)
122{
123 struct rcu_synchronize rcu;
124
125 if (rcu_blocking_is_gp())
126 return;
127
128 init_completion(&rcu.completion);
129 /* Will wake me after RCU finished. */
130 call_rcu_sched(&rcu.head, wakeme_after_rcu);
131 /* Wait for it. */
132 wait_for_completion(&rcu.completion);
133}
134EXPORT_SYMBOL_GPL(synchronize_sched);
135
136/**
137 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
138 *
139 * Control will return to the caller some time after a full rcu_bh grace
140 * period has elapsed, in other words after all currently executing rcu_bh
141 * read-side critical sections have completed. RCU read-side critical
142 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
143 * and may be nested.
144 */
145void synchronize_rcu_bh(void)
146{
147 struct rcu_synchronize rcu;
148
149 if (rcu_blocking_is_gp())
150 return;
151
152 init_completion(&rcu.completion);
153 /* Will wake me after RCU finished. */
154 call_rcu_bh(&rcu.head, wakeme_after_rcu);
155 /* Wait for it. */
156 wait_for_completion(&rcu.completion);
157}
158EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
159
160static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
161 unsigned long action, void *hcpu)
162{
163 return rcu_cpu_notify(self, action, hcpu);
164}
165
166void __init rcu_init(void)
167{
168 int i;
169
170 __rcu_init();
171 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
172
173 /*
174 * We don't need protection against CPU-hotplug here because
175 * this is called early in boot, before either interrupts
176 * or the scheduler are operational.
177 */
178 for_each_online_cpu(i)
179 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
180}
181
182void rcu_scheduler_starting(void)
183{
184 WARN_ON(num_online_cpus() != 1);
185 WARN_ON(nr_context_switches() > 0);
186 rcu_scheduler_active = 1;
187}
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 00000000000..9f6d9ff2572
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,282 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h>
27#include <linux/interrupt.h>
28#include <linux/notifier.h>
29#include <linux/rcupdate.h>
30#include <linux/kernel.h>
31#include <linux/module.h>
32#include <linux/mutex.h>
33#include <linux/sched.h>
34#include <linux/types.h>
35#include <linux/init.h>
36#include <linux/time.h>
37#include <linux/cpu.h>
38
/*
 * Control block for one flavor of RCU callbacks.  A single linked list
 * holds every pending callback; the two tail pointers split it into a
 * leading "done" segment and the remainder.
 */
struct rcu_ctrlblk {
	/* Head of the list of pending callbacks (CBs). */
	struct rcu_head *rcucblist;
	/* Address of the ->next pointer of the last "done" CB. */
	struct rcu_head **donetail;
	/* Address of the ->next pointer of the last CB on the list. */
	struct rcu_head **curtail;
};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_NO_HZ
58
59static long rcu_dynticks_nesting = 1;
60
61/*
62 * Enter dynticks-idle mode, which is an extended quiescent state
63 * if we have fully entered that mode (i.e., if the new value of
64 * dynticks_nesting is zero).
65 */
66void rcu_enter_nohz(void)
67{
68 if (--rcu_dynticks_nesting == 0)
69 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
70}
71
72/*
73 * Exit dynticks-idle mode, so that we are no longer in an extended
74 * quiescent state.
75 */
76void rcu_exit_nohz(void)
77{
78 rcu_dynticks_nesting++;
79}
80
81#endif /* #ifdef CONFIG_NO_HZ */
82
83/*
84 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
85 * Also disable irqs to avoid confusion due to interrupt handlers
86 * invoking call_rcu().
87 */
88static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
89{
90 unsigned long flags;
91
92 local_irq_save(flags);
93 if (rcp->rcucblist != NULL &&
94 rcp->donetail != rcp->curtail) {
95 rcp->donetail = rcp->curtail;
96 local_irq_restore(flags);
97 return 1;
98 }
99 local_irq_restore(flags);
100
101 return 0;
102}
103
104/*
105 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
106 * are at it, given that any rcu quiescent state is also an rcu_bh
107 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
108 */
109void rcu_sched_qs(int cpu)
110{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ);
113}
114
115/*
116 * Record an rcu_bh quiescent state.
117 */
118void rcu_bh_qs(int cpu)
119{
120 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
121 raise_softirq(RCU_SOFTIRQ);
122}
123
124/*
125 * Check to see if the scheduling-clock interrupt came from an extended
126 * quiescent state, and, if so, tell RCU about it.
127 */
128void rcu_check_callbacks(int cpu, int user)
129{
130 if (user ||
131 (idle_cpu(cpu) &&
132 !in_softirq() &&
133 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
134 rcu_sched_qs(cpu);
135 else if (!in_softirq())
136 rcu_bh_qs(cpu);
137}
138
139/*
140 * Helper function for rcu_process_callbacks() that operates on the
141 * specified rcu_ctrlkblk structure.
142 */
143static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
144{
145 struct rcu_head *next, *list;
146 unsigned long flags;
147
148 /* If no RCU callbacks ready to invoke, just return. */
149 if (&rcp->rcucblist == rcp->donetail)
150 return;
151
152 /* Move the ready-to-invoke callbacks to a local list. */
153 local_irq_save(flags);
154 list = rcp->rcucblist;
155 rcp->rcucblist = *rcp->donetail;
156 *rcp->donetail = NULL;
157 if (rcp->curtail == rcp->donetail)
158 rcp->curtail = &rcp->rcucblist;
159 rcp->donetail = &rcp->rcucblist;
160 local_irq_restore(flags);
161
162 /* Invoke the callbacks on the local list. */
163 while (list) {
164 next = list->next;
165 prefetch(next);
166 list->func(list);
167 list = next;
168 }
169}
170
171/*
172 * Invoke any callbacks whose grace period has completed.
173 */
174static void rcu_process_callbacks(struct softirq_action *unused)
175{
176 __rcu_process_callbacks(&rcu_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178}
179
/*
 * Wait for a grace period to elapse.  Calling synchronize_sched() from
 * within an RCU read-side critical section is illegal, so every legal
 * call is itself a quiescent state; on a UP system there is therefore
 * nothing to wait for.  The same reasoning applies to
 * synchronize_rcu_bh().  (Lai Jiangshan points out the benefits of
 * doing might_sleep() to reduce latency.)
 *
 * Cool, huh?  (Due to Josh Triplett.)
 *
 * The intent is to make this a static inline later.
 */
void synchronize_sched(void)
{
	cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
197
/* Wait for an rcu_bh grace period; simply defers to synchronize_sched(). */
void synchronize_rcu_bh(void)
{
	synchronize_sched();
}
EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/*
205 * Helper function for call_rcu() and call_rcu_bh().
206 */
207static void __call_rcu(struct rcu_head *head,
208 void (*func)(struct rcu_head *rcu),
209 struct rcu_ctrlblk *rcp)
210{
211 unsigned long flags;
212
213 head->func = func;
214 head->next = NULL;
215
216 local_irq_save(flags);
217 *rcp->curtail = head;
218 rcp->curtail = &head->next;
219 local_irq_restore(flags);
220}
221
222/*
223 * Post an RCU callback to be invoked after the end of an RCU grace
224 * period. But since we have but one CPU, that would be after any
225 * quiescent state.
226 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{
229 __call_rcu(head, func, &rcu_ctrlblk);
230}
231EXPORT_SYMBOL_GPL(call_rcu);
232
233/*
234 * Post an RCU bottom-half callback to be invoked after any subsequent
235 * quiescent state.
236 */
237void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
238{
239 __call_rcu(head, func, &rcu_bh_ctrlblk);
240}
241EXPORT_SYMBOL_GPL(call_rcu_bh);
242
243void rcu_barrier(void)
244{
245 struct rcu_synchronize rcu;
246
247 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */
251 wait_for_completion(&rcu.completion);
252}
253EXPORT_SYMBOL_GPL(rcu_barrier);
254
255void rcu_barrier_bh(void)
256{
257 struct rcu_synchronize rcu;
258
259 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */
263 wait_for_completion(&rcu.completion);
264}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266
267void rcu_barrier_sched(void)
268{
269 struct rcu_synchronize rcu;
270
271 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */
275 wait_for_completion(&rcu.completion);
276}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278
279void __init rcu_init(void)
280{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282}
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 697c0a0229d..9bb52177af0 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p)
327 cur_ops->deferred_free(rp); 327 cur_ops->deferred_free(rp);
328} 328}
329 329
/* Stub ->completed handler: always reports zero. */
static int rcu_no_completed(void)
{
	return 0;
}
334
330static void rcu_torture_deferred_free(struct rcu_torture *p) 335static void rcu_torture_deferred_free(struct rcu_torture *p)
331{ 336{
332 call_rcu(&p->rtort_rcu, rcu_torture_cb); 337 call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .name = "rcu_sync" 393 .name = "rcu_sync"
389}; 394};
390 395
396static struct rcu_torture_ops rcu_expedited_ops = {
397 .init = rcu_sync_torture_init,
398 .cleanup = NULL,
399 .readlock = rcu_torture_read_lock,
400 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
401 .readunlock = rcu_torture_read_unlock,
402 .completed = rcu_no_completed,
403 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL,
406 .stats = NULL,
407 .irq_capable = 1,
408 .name = "rcu_expedited"
409};
410
391/* 411/*
392 * Definitions for rcu_bh torture testing. 412 * Definitions for rcu_bh torture testing.
393 */ 413 */
@@ -547,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = {
547 .name = "srcu" 567 .name = "srcu"
548}; 568};
549 569
570static void srcu_torture_synchronize_expedited(void)
571{
572 synchronize_srcu_expedited(&srcu_ctl);
573}
574
575static struct rcu_torture_ops srcu_expedited_ops = {
576 .init = srcu_torture_init,
577 .cleanup = srcu_torture_cleanup,
578 .readlock = srcu_torture_read_lock,
579 .read_delay = srcu_read_delay,
580 .readunlock = srcu_torture_read_unlock,
581 .completed = srcu_torture_completed,
582 .deferred_free = rcu_sync_torture_deferred_free,
583 .sync = srcu_torture_synchronize_expedited,
584 .cb_barrier = NULL,
585 .stats = srcu_torture_stats,
586 .name = "srcu_expedited"
587};
588
550/* 589/*
551 * Definitions for sched torture testing. 590 * Definitions for sched torture testing.
552 */ 591 */
@@ -562,11 +601,6 @@ static void sched_torture_read_unlock(int idx)
562 preempt_enable(); 601 preempt_enable();
563} 602}
564 603
565static int sched_torture_completed(void)
566{
567 return 0;
568}
569
570static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 604static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
571{ 605{
572 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 606 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -583,7 +617,7 @@ static struct rcu_torture_ops sched_ops = {
583 .readlock = sched_torture_read_lock, 617 .readlock = sched_torture_read_lock,
584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 618 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
585 .readunlock = sched_torture_read_unlock, 619 .readunlock = sched_torture_read_unlock,
586 .completed = sched_torture_completed, 620 .completed = rcu_no_completed,
587 .deferred_free = rcu_sched_torture_deferred_free, 621 .deferred_free = rcu_sched_torture_deferred_free,
588 .sync = sched_torture_synchronize, 622 .sync = sched_torture_synchronize,
589 .cb_barrier = rcu_barrier_sched, 623 .cb_barrier = rcu_barrier_sched,
@@ -592,13 +626,13 @@ static struct rcu_torture_ops sched_ops = {
592 .name = "sched" 626 .name = "sched"
593}; 627};
594 628
595static struct rcu_torture_ops sched_ops_sync = { 629static struct rcu_torture_ops sched_sync_ops = {
596 .init = rcu_sync_torture_init, 630 .init = rcu_sync_torture_init,
597 .cleanup = NULL, 631 .cleanup = NULL,
598 .readlock = sched_torture_read_lock, 632 .readlock = sched_torture_read_lock,
599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 633 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
600 .readunlock = sched_torture_read_unlock, 634 .readunlock = sched_torture_read_unlock,
601 .completed = sched_torture_completed, 635 .completed = rcu_no_completed,
602 .deferred_free = rcu_sync_torture_deferred_free, 636 .deferred_free = rcu_sync_torture_deferred_free,
603 .sync = sched_torture_synchronize, 637 .sync = sched_torture_synchronize,
604 .cb_barrier = NULL, 638 .cb_barrier = NULL,
@@ -612,7 +646,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
612 .readlock = sched_torture_read_lock, 646 .readlock = sched_torture_read_lock,
613 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 647 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
614 .readunlock = sched_torture_read_unlock, 648 .readunlock = sched_torture_read_unlock,
615 .completed = sched_torture_completed, 649 .completed = rcu_no_completed,
616 .deferred_free = rcu_sync_torture_deferred_free, 650 .deferred_free = rcu_sync_torture_deferred_free,
617 .sync = synchronize_sched_expedited, 651 .sync = synchronize_sched_expedited,
618 .cb_barrier = NULL, 652 .cb_barrier = NULL,
@@ -729,13 +763,13 @@ static void rcu_torture_timer(unsigned long unused)
729 /* Should not happen, but... */ 763 /* Should not happen, but... */
730 pipe_count = RCU_TORTURE_PIPE_LEN; 764 pipe_count = RCU_TORTURE_PIPE_LEN;
731 } 765 }
732 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
733 completed = cur_ops->completed() - completed; 767 completed = cur_ops->completed() - completed;
734 if (completed > RCU_TORTURE_PIPE_LEN) { 768 if (completed > RCU_TORTURE_PIPE_LEN) {
735 /* Should not happen, but... */ 769 /* Should not happen, but... */
736 completed = RCU_TORTURE_PIPE_LEN; 770 completed = RCU_TORTURE_PIPE_LEN;
737 } 771 }
738 ++__get_cpu_var(rcu_torture_batch)[completed]; 772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
739 preempt_enable(); 773 preempt_enable();
740 cur_ops->readunlock(idx); 774 cur_ops->readunlock(idx);
741} 775}
@@ -784,13 +818,13 @@ rcu_torture_reader(void *arg)
784 /* Should not happen, but... */ 818 /* Should not happen, but... */
785 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
786 } 820 }
787 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
788 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
789 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
790 /* Should not happen, but... */ 824 /* Should not happen, but... */
791 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
792 } 826 }
793 ++__get_cpu_var(rcu_torture_batch)[completed]; 827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
794 preempt_enable(); 828 preempt_enable();
795 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
796 schedule(); 830 schedule();
@@ -1097,9 +1131,10 @@ rcu_torture_init(void)
1097 int cpu; 1131 int cpu;
1098 int firsterr = 0; 1132 int firsterr = 0;
1099 static struct rcu_torture_ops *torture_ops[] = 1133 static struct rcu_torture_ops *torture_ops[] =
1100 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1134 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1101 &sched_expedited_ops, 1135 &rcu_bh_ops, &rcu_bh_sync_ops,
1102 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1136 &srcu_ops, &srcu_expedited_ops,
1137 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1103 1138
1104 mutex_lock(&fullstop_mutex); 1139 mutex_lock(&fullstop_mutex);
1105 1140
@@ -1110,8 +1145,12 @@ rcu_torture_init(void)
1110 break; 1145 break;
1111 } 1146 }
1112 if (i == ARRAY_SIZE(torture_ops)) { 1147 if (i == ARRAY_SIZE(torture_ops)) {
1113 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1148 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1114 torture_type); 1149 torture_type);
1150 printk(KERN_ALERT "rcu-torture types:");
1151 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1152 printk(KERN_ALERT " %s", torture_ops[i]->name);
1153 printk(KERN_ALERT "\n");
1115 mutex_unlock(&fullstop_mutex); 1154 mutex_unlock(&fullstop_mutex);
1116 return -EINVAL; 1155 return -EINVAL;
1117 } 1156 }
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 705f02ac743..53ae9598f79 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,20 +46,24 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
52/* Data structures. */ 53/* Data structures. */
53 54
55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
56
54#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(name) { \
55 .level = { &name.node[0] }, \ 58 .level = { &name.node[0] }, \
56 .levelcnt = { \ 59 .levelcnt = { \
57 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
58 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
59 NUM_RCU_LVL_2, \ 62 NUM_RCU_LVL_2, \
60 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ 63 NUM_RCU_LVL_3, \
64 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
61 }, \ 65 }, \
62 .signaled = RCU_SIGNAL_INIT, \ 66 .signaled = RCU_GP_IDLE, \
63 .gpnum = -300, \ 67 .gpnum = -300, \
64 .completed = -300, \ 68 .completed = -300, \
65 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
@@ -77,6 +81,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
78DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79 83
84static int rcu_scheduler_active __read_mostly;
85
80 86
81/* 87/*
82 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
@@ -98,7 +104,7 @@ void rcu_sched_qs(int cpu)
98 struct rcu_data *rdp; 104 struct rcu_data *rdp;
99 105
100 rdp = &per_cpu(rcu_sched_data, cpu); 106 rdp = &per_cpu(rcu_sched_data, cpu);
101 rdp->passed_quiesc_completed = rdp->completed; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
102 barrier(); 108 barrier();
103 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
104 rcu_preempt_note_context_switch(cpu); 110 rcu_preempt_note_context_switch(cpu);
@@ -109,7 +115,7 @@ void rcu_bh_qs(int cpu)
109 struct rcu_data *rdp; 115 struct rcu_data *rdp;
110 116
111 rdp = &per_cpu(rcu_bh_data, cpu); 117 rdp = &per_cpu(rcu_bh_data, cpu);
112 rdp->passed_quiesc_completed = rdp->completed; 118 rdp->passed_quiesc_completed = rdp->gpnum - 1;
113 barrier(); 119 barrier();
114 rdp->passed_quiesc = 1; 120 rdp->passed_quiesc = 1;
115} 121}
@@ -335,28 +341,9 @@ void rcu_irq_exit(void)
335 set_need_resched(); 341 set_need_resched();
336} 342}
337 343
338/*
339 * Record the specified "completed" value, which is later used to validate
340 * dynticks counter manipulations. Specify "rsp->completed - 1" to
341 * unconditionally invalidate any future dynticks manipulations (which is
342 * useful at the beginning of a grace period).
343 */
344static void dyntick_record_completed(struct rcu_state *rsp, long comp)
345{
346 rsp->dynticks_completed = comp;
347}
348
349#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
350 345
351/* 346/*
352 * Recall the previously recorded value of the completion for dynticks.
353 */
354static long dyntick_recall_completed(struct rcu_state *rsp)
355{
356 return rsp->dynticks_completed;
357}
358
359/*
360 * Snapshot the specified CPU's dynticks counter so that we can later 347 * Snapshot the specified CPU's dynticks counter so that we can later
361 * credit them with an implicit quiescent state. Return 1 if this CPU 348 * credit them with an implicit quiescent state. Return 1 if this CPU
362 * is in dynticks idle mode, which is an extended quiescent state. 349 * is in dynticks idle mode, which is an extended quiescent state.
@@ -419,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
419 406
420#else /* #ifdef CONFIG_NO_HZ */ 407#else /* #ifdef CONFIG_NO_HZ */
421 408
422static void dyntick_record_completed(struct rcu_state *rsp, long comp)
423{
424}
425
426#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
427 410
428/*
429 * If there are no dynticks, then the only way that a CPU can passively
430 * be in a quiescent state is to be offline. Unlike dynticks idle, which
431 * is a point in time during the prior (already finished) grace period,
432 * an offline CPU is always in a quiescent state, and thus can be
433 * unconditionally applied. So just return the current value of completed.
434 */
435static long dyntick_recall_completed(struct rcu_state *rsp)
436{
437 return rsp->completed;
438}
439
440static int dyntick_save_progress_counter(struct rcu_data *rdp) 411static int dyntick_save_progress_counter(struct rcu_data *rdp)
441{ 412{
442 return 0; 413 return 0;
@@ -553,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
553/* 524/*
554 * Update CPU-local rcu_data state to record the newly noticed grace period. 525 * Update CPU-local rcu_data state to record the newly noticed grace period.
555 * This is used both when we started the grace period and when we notice 526 * This is used both when we started the grace period and when we notice
556 * that someone else started the grace period. 527 * that someone else started the grace period. The caller must hold the
528 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
529 * and must have irqs disabled.
557 */ 530 */
531static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
532{
533 if (rdp->gpnum != rnp->gpnum) {
534 rdp->qs_pending = 1;
535 rdp->passed_quiesc = 0;
536 rdp->gpnum = rnp->gpnum;
537 }
538}
539
558static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) 540static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
559{ 541{
560 rdp->qs_pending = 1; 542 unsigned long flags;
561 rdp->passed_quiesc = 0; 543 struct rcu_node *rnp;
562 rdp->gpnum = rsp->gpnum; 544
545 local_irq_save(flags);
546 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
549 local_irq_restore(flags);
550 return;
551 }
552 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags);
563} 554}
564 555
565/* 556/*
@@ -583,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
583} 574}
584 575
585/* 576/*
577 * Advance this CPU's callbacks, but only if the current grace period
578 * has ended. This may be called only from the CPU to whom the rdp
579 * belongs. In addition, the corresponding leaf rcu_node structure's
580 * ->lock must be held by the caller, with irqs disabled.
581 */
582static void
583__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
584{
585 /* Did another grace period end? */
586 if (rdp->completed != rnp->completed) {
587
588 /* Advance callbacks. No harm if list empty. */
589 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
590 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
591 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
592
593 /* Remember that we saw this grace-period completion. */
594 rdp->completed = rnp->completed;
595 }
596}
597
598/*
599 * Advance this CPU's callbacks, but only if the current grace period
600 * has ended. This may be called only from the CPU to whom the rdp
601 * belongs.
602 */
603static void
604rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
605{
606 unsigned long flags;
607 struct rcu_node *rnp;
608
609 local_irq_save(flags);
610 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
613 local_irq_restore(flags);
614 return;
615 }
616 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags);
618}
619
620/*
621 * Do per-CPU grace-period initialization for running CPU. The caller
622 * must hold the lock of the leaf rcu_node structure corresponding to
623 * this CPU.
624 */
625static void
626rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
627{
628 /* Prior grace period ended, so advance callbacks for current CPU. */
629 __rcu_process_gp_end(rsp, rnp, rdp);
630
631 /*
632 * Because this CPU just now started the new grace period, we know
633 * that all of its callbacks will be covered by this upcoming grace
634 * period, even the ones that were registered arbitrarily recently.
635 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
636 *
637 * Other CPUs cannot be sure exactly when the grace period started.
638 * Therefore, their recently registered callbacks must pass through
639 * an additional RCU_NEXT_READY stage, so that they will be handled
640 * by the next RCU grace period.
641 */
642 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
643 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
644
645 /* Set state so that this CPU will detect the next quiescent state. */
646 __note_new_gpnum(rsp, rnp, rdp);
647}
648
649/*
586 * Start a new RCU grace period if warranted, re-initializing the hierarchy 650 * Start a new RCU grace period if warranted, re-initializing the hierarchy
587 * in preparation for detecting the next grace period. The caller must hold 651 * in preparation for detecting the next grace period. The caller must hold
588 * the root node's ->lock, which is released before return. Hard irqs must 652 * the root node's ->lock, which is released before return. Hard irqs must
@@ -596,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
596 struct rcu_node *rnp = rcu_get_root(rsp); 660 struct rcu_node *rnp = rcu_get_root(rsp);
597 661
598 if (!cpu_needs_another_gp(rsp, rdp)) { 662 if (!cpu_needs_another_gp(rsp, rdp)) {
599 spin_unlock_irqrestore(&rnp->lock, flags); 663 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags);
665 return;
666 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */
668
669 /*
670 * Propagate new ->completed value to rcu_node structures
671 * so that other CPUs don't have to wait until the start
672 * of the next grace period to process their callbacks.
673 */
674 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 }
679 local_irq_restore(flags);
600 return; 680 return;
601 } 681 }
602 682
@@ -606,29 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
606 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 686 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
607 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 687 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
608 record_gp_stall_check_time(rsp); 688 record_gp_stall_check_time(rsp);
609 dyntick_record_completed(rsp, rsp->completed - 1);
610 note_new_gpnum(rsp, rdp);
611
612 /*
613 * Because this CPU just now started the new grace period, we know
614 * that all of its callbacks will be covered by this upcoming grace
615 * period, even the ones that were registered arbitrarily recently.
616 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
617 *
618 * Other CPUs cannot be sure exactly when the grace period started.
619 * Therefore, their recently registered callbacks must pass through
620 * an additional RCU_NEXT_READY stage, so that they will be handled
621 * by the next RCU grace period.
622 */
623 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
624 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
625 689
626 /* Special-case the common single-level case. */ 690 /* Special-case the common single-level case. */
627 if (NUM_RCU_NODES == 1) { 691 if (NUM_RCU_NODES == 1) {
628 rcu_preempt_check_blocked_tasks(rnp); 692 rcu_preempt_check_blocked_tasks(rnp);
629 rnp->qsmask = rnp->qsmaskinit; 693 rnp->qsmask = rnp->qsmaskinit;
630 rnp->gpnum = rsp->gpnum; 694 rnp->gpnum = rsp->gpnum;
695 rnp->completed = rsp->completed;
631 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp);
632 spin_unlock_irqrestore(&rnp->lock, flags); 698 spin_unlock_irqrestore(&rnp->lock, flags);
633 return; 699 return;
634 } 700 }
@@ -657,69 +723,50 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
657 * irqs disabled. 723 * irqs disabled.
658 */ 724 */
659 rcu_for_each_node_breadth_first(rsp, rnp) { 725 rcu_for_each_node_breadth_first(rsp, rnp) {
660 spin_lock(&rnp->lock); /* irqs already disabled. */ 726 spin_lock(&rnp->lock); /* irqs already disabled. */
661 rcu_preempt_check_blocked_tasks(rnp); 727 rcu_preempt_check_blocked_tasks(rnp);
662 rnp->qsmask = rnp->qsmaskinit; 728 rnp->qsmask = rnp->qsmaskinit;
663 rnp->gpnum = rsp->gpnum; 729 rnp->gpnum = rsp->gpnum;
664 spin_unlock(&rnp->lock); /* irqs already disabled. */ 730 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */
665 } 734 }
666 735
736 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */
667 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 spin_unlock_irqrestore(&rsp->onofflock, flags); 740 spin_unlock_irqrestore(&rsp->onofflock, flags);
669} 741}
670 742
671/* 743/*
672 * Advance this CPU's callbacks, but only if the current grace period 744 * Report a full set of quiescent states to the specified rcu_state
673 * has ended. This may be called only from the CPU to whom the rdp 745 * data structure. This involves cleaning up after the prior grace
674 * belongs. 746 * period and letting rcu_start_gp() start up the next grace period
675 */ 747 * if one is needed. Note that the caller must hold rnp->lock, as
676static void 748 * required by rcu_start_gp(), which will release it.
677rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
678{
679 long completed_snap;
680 unsigned long flags;
681
682 local_irq_save(flags);
683 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
684
685 /* Did another grace period end? */
686 if (rdp->completed != completed_snap) {
687
688 /* Advance callbacks. No harm if list empty. */
689 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
690 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
691 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
692
693 /* Remember that we saw this grace-period completion. */
694 rdp->completed = completed_snap;
695 }
696 local_irq_restore(flags);
697}
698
699/*
700 * Clean up after the prior grace period and let rcu_start_gp() start up
701 * the next grace period if one is needed. Note that the caller must
702 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
703 */ 749 */
704static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) 750static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
705 __releases(rcu_get_root(rsp)->lock) 751 __releases(rcu_get_root(rsp)->lock)
706{ 752{
707 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 753 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
708 rsp->completed = rsp->gpnum; 754 rsp->completed = rsp->gpnum;
709 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); 755 rsp->signaled = RCU_GP_IDLE;
710 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 756 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
711} 757}
712 758
713/* 759/*
714 * Similar to cpu_quiet(), for which it is a helper function. Allows 760 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
715 * a group of CPUs to be quieted at one go, though all the CPUs in the 761 * Allows quiescent states for a group of CPUs to be reported at one go
716 * group must be represented by the same leaf rcu_node structure. 762 * to the specified rcu_node structure, though all the CPUs in the group
717 * That structure's lock must be held upon entry, and it is released 763 * must be represented by the same rcu_node structure (which need not be
718 * before return. 764 * a leaf rcu_node structure, though it often will be). That structure's
765 * lock must be held upon entry, and it is released before return.
719 */ 766 */
720static void 767static void
721cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, 768rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
722 unsigned long flags) 769 struct rcu_node *rnp, unsigned long flags)
723 __releases(rnp->lock) 770 __releases(rnp->lock)
724{ 771{
725 struct rcu_node *rnp_c; 772 struct rcu_node *rnp_c;
@@ -755,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
755 802
756 /* 803 /*
757 * Get here if we are the last CPU to pass through a quiescent 804 * Get here if we are the last CPU to pass through a quiescent
758 * state for this grace period. Invoke cpu_quiet_msk_finish() 805 * state for this grace period. Invoke rcu_report_qs_rsp()
759 * to clean up and start the next grace period if one is needed. 806 * to clean up and start the next grace period if one is needed.
760 */ 807 */
761 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ 808 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
762} 809}
763 810
764/* 811/*
765 * Record a quiescent state for the specified CPU, which must either be 812 * Record a quiescent state for the specified CPU to that CPU's rcu_data
766 * the current CPU. The lastcomp argument is used to make sure we are 813 * structure. This must be either called from the specified CPU, or
767 * still in the grace period of interest. We don't want to end the current 814 * called when the specified CPU is known to be offline (and when it is
768 * grace period based on quiescent states detected in an earlier grace 815 * also known that no other CPU is concurrently trying to help the offline
769 * period! 816 * CPU). The lastcomp argument is used to make sure we are still in the
817 * grace period of interest. We don't want to end the current grace period
818 * based on quiescent states detected in an earlier grace period!
770 */ 819 */
771static void 820static void
772cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 821rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
773{ 822{
774 unsigned long flags; 823 unsigned long flags;
775 unsigned long mask; 824 unsigned long mask;
@@ -777,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
777 826
778 rnp = rdp->mynode; 827 rnp = rdp->mynode;
779 spin_lock_irqsave(&rnp->lock, flags); 828 spin_lock_irqsave(&rnp->lock, flags);
780 if (lastcomp != ACCESS_ONCE(rsp->completed)) { 829 if (lastcomp != rnp->completed) {
781 830
782 /* 831 /*
783 * Someone beat us to it for this grace period, so leave. 832 * Someone beat us to it for this grace period, so leave.
784 * The race with GP start is resolved by the fact that we 833 * The race with GP start is resolved by the fact that we
785 * hold the leaf rcu_node lock, so that the per-CPU bits 834 * hold the leaf rcu_node lock, so that the per-CPU bits
786 * cannot yet be initialized -- so we would simply find our 835 * cannot yet be initialized -- so we would simply find our
787 * CPU's bit already cleared in cpu_quiet_msk() if this race 836 * CPU's bit already cleared in rcu_report_qs_rnp() if this
788 * occurred. 837 * race occurred.
789 */ 838 */
790 rdp->passed_quiesc = 0; /* try again later! */ 839 rdp->passed_quiesc = 0; /* try again later! */
791 spin_unlock_irqrestore(&rnp->lock, flags); 840 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -803,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
803 */ 852 */
804 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 853 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
805 854
806 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 855 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
807 } 856 }
808} 857}
809 858
@@ -834,8 +883,11 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
834 if (!rdp->passed_quiesc) 883 if (!rdp->passed_quiesc)
835 return; 884 return;
836 885
837 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ 886 /*
838 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 887 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
888 * judge of that).
889 */
890 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
839} 891}
840 892
841#ifdef CONFIG_HOTPLUG_CPU 893#ifdef CONFIG_HOTPLUG_CPU
@@ -895,8 +947,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
895static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 947static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
896{ 948{
897 unsigned long flags; 949 unsigned long flags;
898 long lastcomp;
899 unsigned long mask; 950 unsigned long mask;
951 int need_report = 0;
900 struct rcu_data *rdp = rsp->rda[cpu]; 952 struct rcu_data *rdp = rsp->rda[cpu];
901 struct rcu_node *rnp; 953 struct rcu_node *rnp;
902 954
@@ -910,17 +962,32 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
910 spin_lock(&rnp->lock); /* irqs already disabled. */ 962 spin_lock(&rnp->lock); /* irqs already disabled. */
911 rnp->qsmaskinit &= ~mask; 963 rnp->qsmaskinit &= ~mask;
912 if (rnp->qsmaskinit != 0) { 964 if (rnp->qsmaskinit != 0) {
913 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 965 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */
914 break; 967 break;
915 } 968 }
916 rcu_preempt_offline_tasks(rsp, rnp, rdp); 969 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */
917 mask = rnp->grpmask; 973 mask = rnp->grpmask;
918 spin_unlock(&rnp->lock); /* irqs remain disabled. */
919 rnp = rnp->parent; 974 rnp = rnp->parent;
920 } while (rnp != NULL); 975 } while (rnp != NULL);
921 lastcomp = rsp->completed;
922 976
923 spin_unlock_irqrestore(&rsp->onofflock, flags); 977 /*
978 * We still hold the leaf rcu_node structure lock here, and
979 * irqs are still disabled. The reason for this subterfuge is
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock.
982 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags);
987 else
988 spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp);
924 991
925 rcu_adopt_orphan_cbs(rsp); 992 rcu_adopt_orphan_cbs(rsp);
926} 993}
@@ -958,7 +1025,7 @@ static void rcu_offline_cpu(int cpu)
958 * Invoke any RCU callbacks that have made it to the end of their grace 1025 * Invoke any RCU callbacks that have made it to the end of their grace
959 * period. Throttle as specified by rdp->blimit. 1026 * period. Throttle as specified by rdp->blimit.
960 */ 1027 */
961static void rcu_do_batch(struct rcu_data *rdp) 1028static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
962{ 1029{
963 unsigned long flags; 1030 unsigned long flags;
964 struct rcu_head *next, *list, **tail; 1031 struct rcu_head *next, *list, **tail;
@@ -1011,6 +1078,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
1011 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1078 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
1012 rdp->blimit = blimit; 1079 rdp->blimit = blimit;
1013 1080
1081 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
1082 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
1083 rdp->qlen_last_fqs_check = 0;
1084 rdp->n_force_qs_snap = rsp->n_force_qs;
1085 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1086 rdp->qlen_last_fqs_check = rdp->qlen;
1087
1014 local_irq_restore(flags); 1088 local_irq_restore(flags);
1015 1089
1016 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1090 /* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1085,7 +1159,7 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1085 rcu_for_each_leaf_node(rsp, rnp) { 1159 rcu_for_each_leaf_node(rsp, rnp) {
1086 mask = 0; 1160 mask = 0;
1087 spin_lock_irqsave(&rnp->lock, flags); 1161 spin_lock_irqsave(&rnp->lock, flags);
1088 if (rsp->completed != lastcomp) { 1162 if (rnp->completed != lastcomp) {
1089 spin_unlock_irqrestore(&rnp->lock, flags); 1163 spin_unlock_irqrestore(&rnp->lock, flags);
1090 return 1; 1164 return 1;
1091 } 1165 }
@@ -1099,10 +1173,10 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1099 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1100 mask |= bit; 1174 mask |= bit;
1101 } 1175 }
1102 if (mask != 0 && rsp->completed == lastcomp) { 1176 if (mask != 0 && rnp->completed == lastcomp) {
1103 1177
1104 /* cpu_quiet_msk() releases rnp->lock. */ 1178 /* rcu_report_qs_rnp() releases rnp->lock. */
1105 cpu_quiet_msk(mask, rsp, rnp, flags); 1179 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1106 continue; 1180 continue;
1107 } 1181 }
1108 spin_unlock_irqrestore(&rnp->lock, flags); 1182 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1120,6 +1194,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1120 long lastcomp; 1194 long lastcomp;
1121 struct rcu_node *rnp = rcu_get_root(rsp); 1195 struct rcu_node *rnp = rcu_get_root(rsp);
1122 u8 signaled; 1196 u8 signaled;
1197 u8 forcenow;
1123 1198
1124 if (!rcu_gp_in_progress(rsp)) 1199 if (!rcu_gp_in_progress(rsp))
1125 return; /* No grace period in progress, nothing to force. */ 1200 return; /* No grace period in progress, nothing to force. */
@@ -1132,19 +1207,20 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1132 goto unlock_ret; /* no emergency and done recently. */ 1207 goto unlock_ret; /* no emergency and done recently. */
1133 rsp->n_force_qs++; 1208 rsp->n_force_qs++;
1134 spin_lock(&rnp->lock); 1209 spin_lock(&rnp->lock);
1135 lastcomp = rsp->completed; 1210 lastcomp = rsp->gpnum - 1;
1136 signaled = rsp->signaled; 1211 signaled = rsp->signaled;
1137 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1138 if (lastcomp == rsp->gpnum) { 1213 if(!rcu_gp_in_progress(rsp)) {
1139 rsp->n_force_qs_ngp++; 1214 rsp->n_force_qs_ngp++;
1140 spin_unlock(&rnp->lock); 1215 spin_unlock(&rnp->lock);
1141 goto unlock_ret; /* no GP in progress, time updated. */ 1216 goto unlock_ret; /* no GP in progress, time updated. */
1142 } 1217 }
1143 spin_unlock(&rnp->lock); 1218 spin_unlock(&rnp->lock);
1144 switch (signaled) { 1219 switch (signaled) {
1220 case RCU_GP_IDLE:
1145 case RCU_GP_INIT: 1221 case RCU_GP_INIT:
1146 1222
1147 break; /* grace period still initializing, ignore. */ 1223 break; /* grace period idle or initializing, ignore. */
1148 1224
1149 case RCU_SAVE_DYNTICK: 1225 case RCU_SAVE_DYNTICK:
1150 1226
@@ -1155,20 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1155 if (rcu_process_dyntick(rsp, lastcomp, 1231 if (rcu_process_dyntick(rsp, lastcomp,
1156 dyntick_save_progress_counter)) 1232 dyntick_save_progress_counter))
1157 goto unlock_ret; 1233 goto unlock_ret;
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1158 1237
1159 /* Update state, record completion counter. */ 1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1160 spin_lock(&rnp->lock); 1240 spin_lock(&rnp->lock);
1161 if (lastcomp == rsp->completed) { 1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1162 rsp->signaled = RCU_FORCE_QS; 1244 rsp->signaled = RCU_FORCE_QS;
1163 dyntick_record_completed(rsp, lastcomp); 1245 rsp->completed_fqs = lastcomp;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1164 } 1247 }
1165 spin_unlock(&rnp->lock); 1248 spin_unlock(&rnp->lock);
1166 break; 1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1167 1252
1168 case RCU_FORCE_QS: 1253 case RCU_FORCE_QS:
1169 1254
1170 /* Check dyntick-idle state, send IPI to laggards. */ 1255 /* Check dyntick-idle state, send IPI to laggards. */
1171 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), 1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs,
1172 rcu_implicit_dynticks_qs)) 1257 rcu_implicit_dynticks_qs))
1173 goto unlock_ret; 1258 goto unlock_ret;
1174 1259
@@ -1224,7 +1309,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1224 } 1309 }
1225 1310
1226 /* If there are callbacks ready, invoke them. */ 1311 /* If there are callbacks ready, invoke them. */
1227 rcu_do_batch(rdp); 1312 rcu_do_batch(rsp, rdp);
1228} 1313}
1229 1314
1230/* 1315/*
@@ -1288,10 +1373,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1288 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1289 } 1374 }
1290 1375
1291 /* Force the grace period if too many callbacks or too long waiting. */ 1376 /*
1292 if (unlikely(++rdp->qlen > qhimark)) { 1377 * Force the grace period if too many callbacks or too long waiting.
1378 * Enforce hysteresis, and don't invoke force_quiescent_state()
1379 * if some other CPU has recently done so. Also, don't bother
1380 * invoking force_quiescent_state() if the newly enqueued callback
1381 * is the only one waiting for a grace period to complete.
1382 */
1383 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1293 rdp->blimit = LONG_MAX; 1384 rdp->blimit = LONG_MAX;
1294 force_quiescent_state(rsp, 0); 1385 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1386 *rdp->nxttail[RCU_DONE_TAIL] != head)
1387 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen;
1295 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1296 force_quiescent_state(rsp, 1); 1391 force_quiescent_state(rsp, 1);
1297 local_irq_restore(flags); 1392 local_irq_restore(flags);
@@ -1315,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1315} 1410}
1316EXPORT_SYMBOL_GPL(call_rcu_bh); 1411EXPORT_SYMBOL_GPL(call_rcu_bh);
1317 1412
1413/**
1414 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1415 *
1416 * Control will return to the caller some time after a full rcu-sched
1417 * grace period has elapsed, in other words after all currently executing
1418 * rcu-sched read-side critical sections have completed. These read-side
1419 * critical sections are delimited by rcu_read_lock_sched() and
1420 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
1421 * local_irq_disable(), and so on may be used in place of
1422 * rcu_read_lock_sched().
1423 *
1424 * This means that all preempt_disable code sequences, including NMI and
1425 * hardware-interrupt handlers, in progress on entry will have completed
1426 * before this primitive returns. However, this does not guarantee that
1427 * softirq handlers will have completed, since in some kernels, these
1428 * handlers can run in process context, and can block.
1429 *
1430 * This primitive provides the guarantees made by the (now removed)
1431 * synchronize_kernel() API. In contrast, synchronize_rcu() only
1432 * guarantees that rcu_read_lock() sections will have completed.
1433 * In "classic RCU", these two guarantees happen to be one and
1434 * the same, but can differ in realtime RCU implementations.
1435 */
1436void synchronize_sched(void)
1437{
1438 struct rcu_synchronize rcu;
1439
1440 if (rcu_blocking_is_gp())
1441 return;
1442
1443 init_completion(&rcu.completion);
1444 /* Will wake me after RCU finished. */
1445 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1446 /* Wait for it. */
1447 wait_for_completion(&rcu.completion);
1448}
1449EXPORT_SYMBOL_GPL(synchronize_sched);
1450
1451/**
1452 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
1453 *
1454 * Control will return to the caller some time after a full rcu_bh grace
1455 * period has elapsed, in other words after all currently executing rcu_bh
1456 * read-side critical sections have completed. RCU read-side critical
1457 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
1458 * and may be nested.
1459 */
1460void synchronize_rcu_bh(void)
1461{
1462 struct rcu_synchronize rcu;
1463
1464 if (rcu_blocking_is_gp())
1465 return;
1466
1467 init_completion(&rcu.completion);
1468 /* Will wake me after RCU finished. */
1469 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1470 /* Wait for it. */
1471 wait_for_completion(&rcu.completion);
1472}
1473EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1474
1318/* 1475/*
1319 * Check to see if there is any immediate RCU-related work to be done 1476 * Check to see if there is any immediate RCU-related work to be done
1320 * by the current CPU, for the specified type of RCU, returning 1 if so. 1477 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1324,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1324 */ 1481 */
1325static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 1482static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1326{ 1483{
1484 struct rcu_node *rnp = rdp->mynode;
1485
1327 rdp->n_rcu_pending++; 1486 rdp->n_rcu_pending++;
1328 1487
1329 /* Check for CPU stalls, if enabled. */ 1488 /* Check for CPU stalls, if enabled. */
@@ -1348,13 +1507,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1348 } 1507 }
1349 1508
1350 /* Has another RCU grace period completed? */ 1509 /* Has another RCU grace period completed? */
1351 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ 1510 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
1352 rdp->n_rp_gp_completed++; 1511 rdp->n_rp_gp_completed++;
1353 return 1; 1512 return 1;
1354 } 1513 }
1355 1514
1356 /* Has a new RCU grace period started? */ 1515 /* Has a new RCU grace period started? */
1357 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ 1516 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
1358 rdp->n_rp_gp_started++; 1517 rdp->n_rp_gp_started++;
1359 return 1; 1518 return 1;
1360 } 1519 }
@@ -1397,6 +1556,21 @@ int rcu_needs_cpu(int cpu)
1397 rcu_preempt_needs_cpu(cpu); 1556 rcu_preempt_needs_cpu(cpu);
1398} 1557}
1399 1558
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1400static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1401static atomic_t rcu_barrier_cpu_count; 1575static atomic_t rcu_barrier_cpu_count;
1402static DEFINE_MUTEX(rcu_barrier_mutex); 1576static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1508,21 +1682,18 @@ static void __cpuinit
1508rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1682rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1509{ 1683{
1510 unsigned long flags; 1684 unsigned long flags;
1511 long lastcomp;
1512 unsigned long mask; 1685 unsigned long mask;
1513 struct rcu_data *rdp = rsp->rda[cpu]; 1686 struct rcu_data *rdp = rsp->rda[cpu];
1514 struct rcu_node *rnp = rcu_get_root(rsp); 1687 struct rcu_node *rnp = rcu_get_root(rsp);
1515 1688
1516 /* Set up local state, ensuring consistent view of global state. */ 1689 /* Set up local state, ensuring consistent view of global state. */
1517 spin_lock_irqsave(&rnp->lock, flags); 1690 spin_lock_irqsave(&rnp->lock, flags);
1518 lastcomp = rsp->completed;
1519 rdp->completed = lastcomp;
1520 rdp->gpnum = lastcomp;
1521 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1522 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1523 rdp->beenonline = 1; /* We have now been online. */ 1693 rdp->beenonline = 1; /* We have now been online. */
1524 rdp->preemptable = preemptable; 1694 rdp->preemptable = preemptable;
1525 rdp->passed_quiesc_completed = lastcomp - 1; 1695 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs;
1526 rdp->blimit = blimit; 1697 rdp->blimit = blimit;
1527 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1528 1699
@@ -1542,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1542 spin_lock(&rnp->lock); /* irqs already disabled. */ 1713 spin_lock(&rnp->lock); /* irqs already disabled. */
1543 rnp->qsmaskinit |= mask; 1714 rnp->qsmaskinit |= mask;
1544 mask = rnp->grpmask; 1715 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) {
1717 rdp->gpnum = rnp->completed; /* if GP in progress... */
1718 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 }
1545 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1721 spin_unlock(&rnp->lock); /* irqs already disabled. */
1546 rnp = rnp->parent; 1722 rnp = rnp->parent;
1547 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
@@ -1559,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
1559/* 1735/*
1560 * Handle CPU online/offline notification events. 1736 * Handle CPU online/offline notification events.
1561 */ 1737 */
1562int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1738static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1563 unsigned long action, void *hcpu) 1739 unsigned long action, void *hcpu)
1564{ 1740{
1565 long cpu = (long)hcpu; 1741 long cpu = (long)hcpu;
1566 1742
@@ -1647,8 +1823,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1647 cpustride *= rsp->levelspread[i]; 1823 cpustride *= rsp->levelspread[i];
1648 rnp = rsp->level[i]; 1824 rnp = rsp->level[i];
1649 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1650 if (rnp != rcu_get_root(rsp)) 1826 spin_lock_init(&rnp->lock);
1651 spin_lock_init(&rnp->lock); 1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1652 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1653 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1654 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1669,9 +1845,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1669 rnp->level = i; 1845 rnp->level = i;
1670 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 1846 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1671 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1847 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1848 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1672 } 1850 }
1673 } 1851 }
1674 spin_lock_init(&rcu_get_root(rsp)->lock);
1675} 1852}
1676 1853
1677/* 1854/*
@@ -1697,16 +1874,30 @@ do { \
1697 } \ 1874 } \
1698} while (0) 1875} while (0)
1699 1876
1700void __init __rcu_init(void) 1877void __init rcu_init(void)
1701{ 1878{
1879 int i;
1880
1702 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1703#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1704 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1705#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1706 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1707 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1708 __rcu_init_preempt(); 1890 __rcu_init_preempt();
1709 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1891 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1892
1893 /*
1894 * We don't need protection against CPU-hotplug here because
1895 * this is called early in boot, before either interrupts
1896 * or the scheduler are operational.
1897 */
1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);
1710} 1901}
1711 1902
1712#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b40ac570604..d2a0046f63b 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere. 35 * bug somewhere.
36 */ 36 */
37#define MAX_RCU_LVLS 3 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
41 42
42#if NR_CPUS <= RCU_FANOUT 43#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1 44# define NUM_RCU_LVLS 1
@@ -45,23 +46,33 @@
45# define NUM_RCU_LVL_1 (NR_CPUS) 46# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0 47# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0 48# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0
48#elif NR_CPUS <= RCU_FANOUT_SQ 50#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 51# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 52# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 54# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 55# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 57#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 58# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 59# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 62# define NUM_RCU_LVL_3 NR_CPUS
63# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH
65# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70# define NUM_RCU_LVL_4 NR_CPUS
60#else 71#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 73#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63 74
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) 75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66 77
67/* 78/*
@@ -84,14 +95,21 @@ struct rcu_node {
84 long gpnum; /* Current grace period for this node. */ 95 long gpnum; /* Current grace period for this node. */
85 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
86 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */
99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */
87 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
88 /* order for current grace period to proceed.*/ 102 /* order for current grace period to proceed.*/
89 /* In leaf rcu_node, each bit corresponds to */ 103 /* In leaf rcu_node, each bit corresponds to */
90 /* an rcu_data structure, otherwise, each */ 104 /* an rcu_data structure, otherwise, each */
91 /* bit corresponds to a child rcu_node */ 105 /* bit corresponds to a child rcu_node */
92 /* structure. */ 106 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */
108 /* elements that need to drain to allow the */
109 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */
93 unsigned long qsmaskinit; 111 unsigned long qsmaskinit;
94 /* Per-GP initialization for qsmask. */ 112 /* Per-GP initial value for qsmask & expmask. */
95 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 113 unsigned long grpmask; /* Mask to apply to parent qsmask. */
96 /* Only one bit will be set in this mask. */ 114 /* Only one bit will be set in this mask. */
97 int grplo; /* lowest-numbered CPU or group here. */ 115 int grplo; /* lowest-numbered CPU or group here. */
@@ -99,7 +117,7 @@ struct rcu_node {
99 u8 grpnum; /* CPU/group number for next level up. */ 117 u8 grpnum; /* CPU/group number for next level up. */
100 u8 level; /* root is at level 0. */ 118 u8 level; /* root is at level 0. */
101 struct rcu_node *parent; 119 struct rcu_node *parent;
102 struct list_head blocked_tasks[2]; 120 struct list_head blocked_tasks[4];
103 /* Tasks blocked in RCU read-side critsect. */ 121 /* Tasks blocked in RCU read-side critsect. */
104 /* Grace period number (->gpnum) x blocked */ 122 /* Grace period number (->gpnum) x blocked */
105 /* by tasks on the (x & 0x1) element of the */ 123 /* by tasks on the (x & 0x1) element of the */
@@ -114,6 +132,21 @@ struct rcu_node {
114 for ((rnp) = &(rsp)->node[0]; \ 132 for ((rnp) = &(rsp)->node[0]; \
115 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 133 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
116 134
135/*
136 * Do a breadth-first scan of the non-leaf rcu_node structures for the
137 * specified rcu_state structure. Note that if there is a singleton
138 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
139 */
140#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
141 for ((rnp) = &(rsp)->node[0]; \
142 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
143
144/*
145 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
146 * structure. Note that if there is a singleton rcu_node tree with but
147 * one rcu_node structure, this loop -will- visit the rcu_node structure.
148 * It is still a leaf node, even if it is also the root node.
149 */
117#define rcu_for_each_leaf_node(rsp, rnp) \ 150#define rcu_for_each_leaf_node(rsp, rnp) \
118 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 151 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
119 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 152 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
@@ -167,6 +200,10 @@ struct rcu_data {
167 struct rcu_head *nxtlist; 200 struct rcu_head *nxtlist;
168 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 201 struct rcu_head **nxttail[RCU_NEXT_SIZE];
169 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */
205 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */
170 long blimit; /* Upper limit on a processed batch */ 207 long blimit; /* Upper limit on a processed batch */
171 208
172#ifdef CONFIG_NO_HZ 209#ifdef CONFIG_NO_HZ
@@ -197,13 +234,15 @@ struct rcu_data {
197}; 234};
198 235
199/* Values for signaled field in struct rcu_state. */ 236/* Values for signaled field in struct rcu_state. */
200#define RCU_GP_INIT 0 /* Grace period being initialized. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
201#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
202#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
203#ifdef CONFIG_NO_HZ 242#ifdef CONFIG_NO_HZ
204#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
205#else /* #ifdef CONFIG_NO_HZ */ 244#else /* #ifdef CONFIG_NO_HZ */
206#define RCU_SIGNAL_INIT RCU_FORCE_QS 245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED
207#endif /* #else #ifdef CONFIG_NO_HZ */ 246#endif /* #else #ifdef CONFIG_NO_HZ */
208 247
209#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -241,7 +280,7 @@ struct rcu_state {
241 long gpnum; /* Current gp number. */ 280 long gpnum; /* Current gp number. */
242 long completed; /* # of last completed gp. */ 281 long completed; /* # of last completed gp. */
243 282
244 /* End of fields guarded by root rcu_node's lock. */ 283 /* End of fields guarded by root rcu_node's lock. */
245 284
246 spinlock_t onofflock; /* exclude on/offline and */ 285 spinlock_t onofflock; /* exclude on/offline and */
247 /* starting new GP. Also */ 286 /* starting new GP. Also */
@@ -255,6 +294,8 @@ struct rcu_state {
255 long orphan_qlen; /* Number of orphaned cbs. */ 294 long orphan_qlen; /* Number of orphaned cbs. */
256 spinlock_t fqslock; /* Only one task forcing */ 295 spinlock_t fqslock; /* Only one task forcing */
257 /* quiescent states. */ 296 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
258 unsigned long jiffies_force_qs; /* Time at which to invoke */ 299 unsigned long jiffies_force_qs; /* Time at which to invoke */
259 /* force_quiescent_state(). */ 300 /* force_quiescent_state(). */
260 unsigned long n_force_qs; /* Number of calls to */ 301 unsigned long n_force_qs; /* Number of calls to */
@@ -269,11 +310,15 @@ struct rcu_state {
269 unsigned long jiffies_stall; /* Time at which to check */ 310 unsigned long jiffies_stall; /* Time at which to check */
270 /* for CPU stalls. */ 311 /* for CPU stalls. */
271#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 312#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
272#ifdef CONFIG_NO_HZ
273 long dynticks_completed; /* Value of completed @ snap. */
274#endif /* #ifdef CONFIG_NO_HZ */
275}; 313};
276 314
315/* Return values for rcu_preempt_offline_tasks(). */
316
317#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
318 /* GP were moved to root. */
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */
321
277#ifdef RCU_TREE_NONCORE 322#ifdef RCU_TREE_NONCORE
278 323
279/* 324/*
@@ -293,23 +338,30 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
293#else /* #ifdef RCU_TREE_NONCORE */ 338#else /* #ifdef RCU_TREE_NONCORE */
294 339
295/* Forward declarations for rcutree_plugin.h */ 340/* Forward declarations for rcutree_plugin.h */
296static inline void rcu_bootup_announce(void); 341static void rcu_bootup_announce(void);
297long rcu_batches_completed(void); 342long rcu_batches_completed(void);
298static void rcu_preempt_note_context_switch(int cpu); 343static void rcu_preempt_note_context_switch(int cpu);
299static int rcu_preempted_readers(struct rcu_node *rnp); 344static int rcu_preempted_readers(struct rcu_node *rnp);
345#ifdef CONFIG_HOTPLUG_CPU
346static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */
300#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
301static void rcu_print_task_stall(struct rcu_node *rnp); 350static void rcu_print_task_stall(struct rcu_node *rnp);
302#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
303static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
304#ifdef CONFIG_HOTPLUG_CPU 353#ifdef CONFIG_HOTPLUG_CPU
305static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 354static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
306 struct rcu_node *rnp, 355 struct rcu_node *rnp,
307 struct rcu_data *rdp); 356 struct rcu_data *rdp);
308static void rcu_preempt_offline_cpu(int cpu); 357static void rcu_preempt_offline_cpu(int cpu);
309#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 358#endif /* #ifdef CONFIG_HOTPLUG_CPU */
310static void rcu_preempt_check_callbacks(int cpu); 359static void rcu_preempt_check_callbacks(int cpu);
311static void rcu_preempt_process_callbacks(void); 360static void rcu_preempt_process_callbacks(void);
312void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 361void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
362#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
363static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
364#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
313static int rcu_preempt_pending(int cpu); 365static int rcu_preempt_pending(int cpu);
314static int rcu_preempt_needs_cpu(int cpu); 366static int rcu_preempt_needs_cpu(int cpu);
315static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 367static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c0cb783aa16..37fbccdf41d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,16 +24,19 @@
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */ 25 */
26 26
27#include <linux/delay.h>
27 28
28#ifdef CONFIG_TREE_PREEMPT_RCU 29#ifdef CONFIG_TREE_PREEMPT_RCU
29 30
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 32DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32 33
34static int rcu_preempted_readers_exp(struct rcu_node *rnp);
35
33/* 36/*
34 * Tell them what RCU they are running. 37 * Tell them what RCU they are running.
35 */ 38 */
36static inline void rcu_bootup_announce(void) 39static void __init rcu_bootup_announce(void)
37{ 40{
38 printk(KERN_INFO 41 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n"); 42 "Experimental preemptable hierarchical RCU implementation.\n");
@@ -67,7 +70,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
67static void rcu_preempt_qs(int cpu) 70static void rcu_preempt_qs(int cpu)
68{ 71{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 72 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed; 73 rdp->passed_quiesc_completed = rdp->gpnum - 1;
71 barrier(); 74 barrier();
72 rdp->passed_quiesc = 1; 75 rdp->passed_quiesc = 1;
73} 76}
@@ -157,14 +160,58 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
157 */ 160 */
158static int rcu_preempted_readers(struct rcu_node *rnp) 161static int rcu_preempted_readers(struct rcu_node *rnp)
159{ 162{
160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 163 int phase = rnp->gpnum & 0x1;
164
165 return !list_empty(&rnp->blocked_tasks[phase]) ||
166 !list_empty(&rnp->blocked_tasks[phase + 2]);
167}
168
169/*
170 * Record a quiescent state for all tasks that were previously queued
171 * on the specified rcu_node structure and that were blocking the current
172 * RCU grace period. The caller must hold the specified rnp->lock with
173 * irqs disabled, and this lock is released upon return, but irqs remain
174 * disabled.
175 */
176static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
177 __releases(rnp->lock)
178{
179 unsigned long mask;
180 struct rcu_node *rnp_p;
181
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */
185 }
186
187 rnp_p = rnp->parent;
188 if (rnp_p == NULL) {
189 /*
190 * Either there is only one rcu_node in the tree,
191 * or tasks were kicked up to root rcu_node due to
192 * CPUs going offline.
193 */
194 rcu_report_qs_rsp(&rcu_preempt_state, flags);
195 return;
196 }
197
198 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
161} 203}
162 204
205/*
206 * Handle special cases during rcu_read_unlock(), such as needing to
207 * notify RCU core processing or task having blocked during the RCU
208 * read-side critical section.
209 */
163static void rcu_read_unlock_special(struct task_struct *t) 210static void rcu_read_unlock_special(struct task_struct *t)
164{ 211{
165 int empty; 212 int empty;
213 int empty_exp;
166 unsigned long flags; 214 unsigned long flags;
167 unsigned long mask;
168 struct rcu_node *rnp; 215 struct rcu_node *rnp;
169 int special; 216 int special;
170 217
@@ -207,36 +254,30 @@ static void rcu_read_unlock_special(struct task_struct *t)
207 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 254 spin_unlock(&rnp->lock); /* irqs remain disabled. */
208 } 255 }
209 empty = !rcu_preempted_readers(rnp); 256 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp);
258 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
210 list_del_init(&t->rcu_node_entry); 259 list_del_init(&t->rcu_node_entry);
211 t->rcu_blocked_node = NULL; 260 t->rcu_blocked_node = NULL;
212 261
213 /* 262 /*
214 * If this was the last task on the current list, and if 263 * If this was the last task on the current list, and if
215 * we aren't waiting on any CPUs, report the quiescent state. 264 * we aren't waiting on any CPUs, report the quiescent state.
216 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() 265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
217 * drop rnp->lock and restore irq.
218 */ 266 */
219 if (!empty && rnp->qsmask == 0 && 267 if (empty)
220 !rcu_preempted_readers(rnp)) {
221 struct rcu_node *rnp_p;
222
223 if (rnp->parent == NULL) {
224 /* Only one rcu_node in the tree. */
225 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
226 return;
227 }
228 /* Report up the rest of the hierarchy. */
229 mask = rnp->grpmask;
230 spin_unlock_irqrestore(&rnp->lock, flags); 268 spin_unlock_irqrestore(&rnp->lock, flags);
231 rnp_p = rnp->parent; 269 else
232 spin_lock_irqsave(&rnp_p->lock, flags); 270 rcu_report_unblock_qs_rnp(rnp, flags);
233 WARN_ON_ONCE(rnp->qsmask); 271
234 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); 272 /*
235 return; 273 * If this was the last task on the expedited lists,
236 } 274 * then we need to report up the rcu_node hierarchy.
237 spin_unlock(&rnp->lock); 275 */
276 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
277 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
278 } else {
279 local_irq_restore(flags);
238 } 280 }
239 local_irq_restore(flags);
240} 281}
241 282
242/* 283/*
@@ -303,26 +344,34 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
303 * rcu_node. The reason for not just moving them to the immediate 344 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to 345 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock. 346 * make more than two attempts to acquire the target rcu_node's lock.
347 * Returns true if there were tasks blocking the current RCU grace
348 * period.
349 *
350 * Returns 1 if there was previously a task blocking the current grace
351 * period on the specified rcu_node structure.
306 * 352 *
307 * The caller must hold rnp->lock with irqs disabled. 353 * The caller must hold rnp->lock with irqs disabled.
308 */ 354 */
309static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 355static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
310 struct rcu_node *rnp, 356 struct rcu_node *rnp,
311 struct rcu_data *rdp) 357 struct rcu_data *rdp)
312{ 358{
313 int i; 359 int i;
314 struct list_head *lp; 360 struct list_head *lp;
315 struct list_head *lp_root; 361 struct list_head *lp_root;
362 int retval = 0;
316 struct rcu_node *rnp_root = rcu_get_root(rsp); 363 struct rcu_node *rnp_root = rcu_get_root(rsp);
317 struct task_struct *tp; 364 struct task_struct *tp;
318 365
319 if (rnp == rnp_root) { 366 if (rnp == rnp_root) {
320 WARN_ONCE(1, "Last CPU thought to be offlined?"); 367 WARN_ONCE(1, "Last CPU thought to be offlined?");
321 return; /* Shouldn't happen: at least one CPU online. */ 368 return 0; /* Shouldn't happen: at least one CPU online. */
322 } 369 }
323 WARN_ON_ONCE(rnp != rdp->mynode && 370 WARN_ON_ONCE(rnp != rdp->mynode &&
324 (!list_empty(&rnp->blocked_tasks[0]) || 371 (!list_empty(&rnp->blocked_tasks[0]) ||
325 !list_empty(&rnp->blocked_tasks[1]))); 372 !list_empty(&rnp->blocked_tasks[1]) ||
373 !list_empty(&rnp->blocked_tasks[2]) ||
374 !list_empty(&rnp->blocked_tasks[3])));
326 375
327 /* 376 /*
328 * Move tasks up to root rcu_node. Rely on the fact that the 377 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -330,7 +379,11 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
330 * rcu_nodes in terms of gp_num value. This fact allows us to 379 * rcu_nodes in terms of gp_num value. This fact allows us to
331 * move the blocked_tasks[] array directly, element by element. 380 * move the blocked_tasks[] array directly, element by element.
332 */ 381 */
333 for (i = 0; i < 2; i++) { 382 if (rcu_preempted_readers(rnp))
383 retval |= RCU_OFL_TASKS_NORM_GP;
384 if (rcu_preempted_readers_exp(rnp))
385 retval |= RCU_OFL_TASKS_EXP_GP;
386 for (i = 0; i < 4; i++) {
334 lp = &rnp->blocked_tasks[i]; 387 lp = &rnp->blocked_tasks[i];
335 lp_root = &rnp_root->blocked_tasks[i]; 388 lp_root = &rnp_root->blocked_tasks[i];
336 while (!list_empty(lp)) { 389 while (!list_empty(lp)) {
@@ -342,6 +395,7 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
342 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
343 } 396 }
344 } 397 }
398 return retval;
345} 399}
346 400
347/* 401/*
@@ -392,6 +446,186 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
392} 446}
393EXPORT_SYMBOL_GPL(call_rcu); 447EXPORT_SYMBOL_GPL(call_rcu);
394 448
449/**
450 * synchronize_rcu - wait until a grace period has elapsed.
451 *
452 * Control will return to the caller some time after a full grace
453 * period has elapsed, in other words after all currently executing RCU
454 * read-side critical sections have completed. RCU read-side critical
455 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
456 * and may be nested.
457 */
458void synchronize_rcu(void)
459{
460 struct rcu_synchronize rcu;
461
462 if (!rcu_scheduler_active)
463 return;
464
465 init_completion(&rcu.completion);
466 /* Will wake me after RCU finished. */
467 call_rcu(&rcu.head, wakeme_after_rcu);
468 /* Wait for it. */
469 wait_for_completion(&rcu.completion);
470}
471EXPORT_SYMBOL_GPL(synchronize_rcu);
472
473static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
474static long sync_rcu_preempt_exp_count;
475static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
476
477/*
478 * Return non-zero if there are any tasks in RCU read-side critical
479 * sections blocking the current preemptible-RCU expedited grace period.
480 * If there is no preemptible-RCU expedited grace period currently in
481 * progress, returns zero unconditionally.
482 */
483static int rcu_preempted_readers_exp(struct rcu_node *rnp)
484{
485 return !list_empty(&rnp->blocked_tasks[2]) ||
486 !list_empty(&rnp->blocked_tasks[3]);
487}
488
489/*
490 * return non-zero if there is no RCU expedited grace period in progress
491 * for the specified rcu_node structure, in other words, if all CPUs and
492 * tasks covered by the specified rcu_node structure have done their bit
493 * for the current expedited grace period. Works only for preemptible
494 * RCU -- other RCU implementation use other means.
495 *
496 * Caller must hold sync_rcu_preempt_exp_mutex.
497 */
498static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
499{
500 return !rcu_preempted_readers_exp(rnp) &&
501 ACCESS_ONCE(rnp->expmask) == 0;
502}
503
504/*
505 * Report the exit from RCU read-side critical section for the last task
506 * that queued itself during or before the current expedited preemptible-RCU
507 * grace period. This event is reported either to the rcu_node structure on
508 * which the task was queued or to one of that rcu_node structure's ancestors,
509 * recursively up the tree. (Calm down, calm down, we do the recursion
510 * iteratively!)
511 *
512 * Caller must hold sync_rcu_preempt_exp_mutex.
513 */
514static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
515{
516 unsigned long flags;
517 unsigned long mask;
518
519 spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp))
522 break;
523 if (rnp->parent == NULL) {
524 wake_up(&sync_rcu_preempt_exp_wq);
525 break;
526 }
527 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask;
532 }
533 spin_unlock_irqrestore(&rnp->lock, flags);
534}
535
536/*
537 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
538 * grace period for the specified rcu_node structure. If there are no such
539 * tasks, report it up the rcu_node hierarchy.
540 *
541 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
542 */
543static void
544sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{
546 int must_wait;
547
548 spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp);
555}
556
557/*
558 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
559 * is to invoke synchronize_sched_expedited() to push all the tasks to
560 * the ->blocked_tasks[] lists, move all entries from the first set of
561 * ->blocked_tasks[] lists to the second set, and finally wait for this
562 * second set to drain.
563 */
564void synchronize_rcu_expedited(void)
565{
566 unsigned long flags;
567 struct rcu_node *rnp;
568 struct rcu_state *rsp = &rcu_preempt_state;
569 long snap;
570 int trycount = 0;
571
572 smp_mb(); /* Caller's modifications seen first by other CPUs. */
573 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
574 smp_mb(); /* Above access cannot bleed into critical section. */
575
576 /*
577 * Acquire lock, falling back to synchronize_rcu() if too many
578 * lock-acquisition failures. Of course, if someone does the
579 * expedited grace period for us, just leave.
580 */
581 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
582 if (trycount++ < 10)
583 udelay(trycount * num_online_cpus());
584 else {
585 synchronize_rcu();
586 return;
587 }
588 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
589 goto mb_ret; /* Others did our work for us. */
590 }
591 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
592 goto unlock_mb_ret; /* Others did our work for us. */
593
594 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited();
596
597 spin_lock_irqsave(&rsp->onofflock, flags);
598
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 }
605
606 /* Snapshot current state of ->blocked_tasks[] lists. */
607 rcu_for_each_leaf_node(rsp, rnp)
608 sync_rcu_preempt_exp_init(rsp, rnp);
609 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611
612 spin_unlock_irqrestore(&rsp->onofflock, flags);
613
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp);
616 wait_event(sync_rcu_preempt_exp_wq,
617 sync_rcu_preempt_exp_done(rnp));
618
619 /* Clean up and exit. */
620 smp_mb(); /* ensure expedited GP seen before counter increment. */
621 ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
622unlock_mb_ret:
623 mutex_unlock(&sync_rcu_preempt_exp_mutex);
624mb_ret:
625 smp_mb(); /* ensure subsequent action seen after grace period. */
626}
627EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
628
395/* 629/*
396 * Check to see if there is any immediate preemptable-RCU-related work 630 * Check to see if there is any immediate preemptable-RCU-related work
397 * to be done. 631 * to be done.
@@ -464,7 +698,7 @@ void exit_rcu(void)
464/* 698/*
465 * Tell them what RCU they are running. 699 * Tell them what RCU they are running.
466 */ 700 */
467static inline void rcu_bootup_announce(void) 701static void __init rcu_bootup_announce(void)
468{ 702{
469 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 703 printk(KERN_INFO "Hierarchical RCU implementation.\n");
470} 704}
@@ -495,6 +729,16 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
495 return 0; 729 return 0;
496} 730}
497 731
732#ifdef CONFIG_HOTPLUG_CPU
733
734/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{
737 spin_unlock_irqrestore(&rnp->lock, flags);
738}
739
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */
741
498#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 742#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
499 743
500/* 744/*
@@ -521,12 +765,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
521 765
522/* 766/*
523 * Because preemptable RCU does not exist, it never needs to migrate 767 * Because preemptable RCU does not exist, it never needs to migrate
524 * tasks that were blocked within RCU read-side critical sections. 768 * tasks that were blocked within RCU read-side critical sections, and
769 * such non-existent tasks cannot possibly have been blocking the current
770 * grace period.
525 */ 771 */
526static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 772static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
527 struct rcu_node *rnp, 773 struct rcu_node *rnp,
528 struct rcu_data *rdp) 774 struct rcu_data *rdp)
529{ 775{
776 return 0;
530} 777}
531 778
532/* 779/*
@@ -565,6 +812,30 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
565EXPORT_SYMBOL_GPL(call_rcu); 812EXPORT_SYMBOL_GPL(call_rcu);
566 813
567/* 814/*
815 * Wait for an rcu-preempt grace period, but make it happen quickly.
816 * But because preemptable RCU does not exist, map to rcu-sched.
817 */
818void synchronize_rcu_expedited(void)
819{
820 synchronize_sched_expedited();
821}
822EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
823
824#ifdef CONFIG_HOTPLUG_CPU
825
826/*
827 * Because preemptable RCU does not exist, there is never any need to
828 * report on tasks preempted in RCU read-side critical sections during
829 * expedited RCU grace periods.
830 */
831static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
832{
833 return;
834}
835
836#endif /* #ifdef CONFIG_HOTPLUG_CPU */
837
838/*
568 * Because preemptable RCU does not exist, it never has any work to do. 839 * Because preemptable RCU does not exist, it never has any work to do.
569 */ 840 */
570static int rcu_preempt_pending(int cpu) 841static int rcu_preempt_pending(int cpu)
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b31c779e62..9d2c88423b3 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -155,12 +155,15 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum;
158 int level = 0; 159 int level = 0;
160 int phase;
159 struct rcu_node *rnp; 161 struct rcu_node *rnp;
160 162
163 gpnum = rsp->gpnum;
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 168 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 169 rsp->n_force_qs, rsp->n_force_qs_ngp,
@@ -171,8 +174,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 seq_puts(m, "\n"); 174 seq_puts(m, "\n");
172 level = rnp->level; 175 level = rnp->level;
173 } 176 }
174 seq_printf(m, "%lx/%lx %d:%d ^%d ", 177 phase = gpnum & 0x1;
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
175 rnp->qsmask, rnp->qsmaskinit, 179 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
176 rnp->grplo, rnp->grphi, rnp->grpnum); 184 rnp->grplo, rnp->grphi, rnp->grpnum);
177 } 185 }
178 seq_puts(m, "\n"); 186 seq_puts(m, "\n");
diff --git a/kernel/relay.c b/kernel/relay.c
index 760c26209a3..c705a41b4ba 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1198 relay_consume_bytes(rbuf, buf->private); 1198 relay_consume_bytes(rbuf, buf->private);
1199} 1199}
1200 1200
1201static struct pipe_buf_operations relay_pipe_buf_ops = { 1201static const struct pipe_buf_operations relay_pipe_buf_ops = {
1202 .can_merge = 0, 1202 .can_merge = 0,
1203 .map = generic_pipe_buf_map, 1203 .map = generic_pipe_buf_map,
1204 .unmap = generic_pipe_buf_unmap, 1204 .unmap = generic_pipe_buf_unmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index e68cd7477c4..03c897f7935 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -321,35 +321,37 @@ static int find_resource(struct resource *root, struct resource *new,
321 void *alignf_data) 321 void *alignf_data)
322{ 322{
323 struct resource *this = root->child; 323 struct resource *this = root->child;
324 struct resource tmp = *new;
324 325
325 new->start = root->start; 326 tmp.start = root->start;
326 /* 327 /*
327 * Skip past an allocated resource that starts at 0, since the assignment 328 * Skip past an allocated resource that starts at 0, since the assignment
328 * of this->start - 1 to new->end below would cause an underflow. 329 * of this->start - 1 to tmp->end below would cause an underflow.
329 */ 330 */
330 if (this && this->start == 0) { 331 if (this && this->start == 0) {
331 new->start = this->end + 1; 332 tmp.start = this->end + 1;
332 this = this->sibling; 333 this = this->sibling;
333 } 334 }
334 for(;;) { 335 for(;;) {
335 if (this) 336 if (this)
336 new->end = this->start - 1; 337 tmp.end = this->start - 1;
337 else 338 else
338 new->end = root->end; 339 tmp.end = root->end;
339 if (new->start < min) 340 if (tmp.start < min)
340 new->start = min; 341 tmp.start = min;
341 if (new->end > max) 342 if (tmp.end > max)
342 new->end = max; 343 tmp.end = max;
343 new->start = ALIGN(new->start, align); 344 tmp.start = ALIGN(tmp.start, align);
344 if (alignf) 345 if (alignf)
345 alignf(alignf_data, new, size, align); 346 alignf(alignf_data, &tmp, size, align);
346 if (new->start < new->end && new->end - new->start >= size - 1) { 347 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
347 new->end = new->start + size - 1; 348 new->start = tmp.start;
349 new->end = tmp.start + size - 1;
348 return 0; 350 return 0;
349 } 351 }
350 if (!this) 352 if (!this)
351 break; 353 break;
352 new->start = this->end + 1; 354 tmp.start = this->end + 1;
353 this = this->sibling; 355 this = this->sibling;
354 } 356 }
355 return -EBUSY; 357 return -EBUSY;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 5fcb4fe645e..ddabb54bb5c 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -37,8 +37,8 @@ do { \
37 if (rt_trace_on) { \ 37 if (rt_trace_on) { \
38 rt_trace_on = 0; \ 38 rt_trace_on = 0; \
39 console_verbose(); \ 39 console_verbose(); \
40 if (spin_is_locked(&current->pi_lock)) \ 40 if (raw_spin_is_locked(&current->pi_lock)) \
41 spin_unlock(&current->pi_lock); \ 41 raw_spin_unlock(&current->pi_lock); \
42 } \ 42 } \
43} while (0) 43} while (0)
44 44
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 29bd4baf9e7..a9604815786 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
138{ 138{
139 unsigned long flags; 139 unsigned long flags;
140 140
141 spin_lock_irqsave(&task->pi_lock, flags); 141 raw_spin_lock_irqsave(&task->pi_lock, flags);
142 __rt_mutex_adjust_prio(task); 142 __rt_mutex_adjust_prio(task);
143 spin_unlock_irqrestore(&task->pi_lock, flags); 143 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
144} 144}
145 145
146/* 146/*
@@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
195 /* 195 /*
196 * Task can not go away as we did a get_task() before ! 196 * Task can not go away as we did a get_task() before !
197 */ 197 */
198 spin_lock_irqsave(&task->pi_lock, flags); 198 raw_spin_lock_irqsave(&task->pi_lock, flags);
199 199
200 waiter = task->pi_blocked_on; 200 waiter = task->pi_blocked_on;
201 /* 201 /*
@@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
231 goto out_unlock_pi; 231 goto out_unlock_pi;
232 232
233 lock = waiter->lock; 233 lock = waiter->lock;
234 if (!spin_trylock(&lock->wait_lock)) { 234 if (!raw_spin_trylock(&lock->wait_lock)) {
235 spin_unlock_irqrestore(&task->pi_lock, flags); 235 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
236 cpu_relax(); 236 cpu_relax();
237 goto retry; 237 goto retry;
238 } 238 }
@@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
240 /* Deadlock detection */ 240 /* Deadlock detection */
241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
243 spin_unlock(&lock->wait_lock); 243 raw_spin_unlock(&lock->wait_lock);
244 ret = deadlock_detect ? -EDEADLK : 0; 244 ret = deadlock_detect ? -EDEADLK : 0;
245 goto out_unlock_pi; 245 goto out_unlock_pi;
246 } 246 }
@@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
253 plist_add(&waiter->list_entry, &lock->wait_list); 253 plist_add(&waiter->list_entry, &lock->wait_list);
254 254
255 /* Release the task */ 255 /* Release the task */
256 spin_unlock_irqrestore(&task->pi_lock, flags); 256 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
257 put_task_struct(task); 257 put_task_struct(task);
258 258
259 /* Grab the next task */ 259 /* Grab the next task */
260 task = rt_mutex_owner(lock); 260 task = rt_mutex_owner(lock);
261 get_task_struct(task); 261 get_task_struct(task);
262 spin_lock_irqsave(&task->pi_lock, flags); 262 raw_spin_lock_irqsave(&task->pi_lock, flags);
263 263
264 if (waiter == rt_mutex_top_waiter(lock)) { 264 if (waiter == rt_mutex_top_waiter(lock)) {
265 /* Boost the owner */ 265 /* Boost the owner */
@@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
277 __rt_mutex_adjust_prio(task); 277 __rt_mutex_adjust_prio(task);
278 } 278 }
279 279
280 spin_unlock_irqrestore(&task->pi_lock, flags); 280 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
281 281
282 top_waiter = rt_mutex_top_waiter(lock); 282 top_waiter = rt_mutex_top_waiter(lock);
283 spin_unlock(&lock->wait_lock); 283 raw_spin_unlock(&lock->wait_lock);
284 284
285 if (!detect_deadlock && waiter != top_waiter) 285 if (!detect_deadlock && waiter != top_waiter)
286 goto out_put_task; 286 goto out_put_task;
@@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
288 goto again; 288 goto again;
289 289
290 out_unlock_pi: 290 out_unlock_pi:
291 spin_unlock_irqrestore(&task->pi_lock, flags); 291 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
292 out_put_task: 292 out_put_task:
293 put_task_struct(task); 293 put_task_struct(task);
294 294
@@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
313 if (pendowner == task) 313 if (pendowner == task)
314 return 1; 314 return 1;
315 315
316 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0; 319 return 0;
320 } 320 }
321 321
@@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
325 * priority. 325 * priority.
326 */ 326 */
327 if (likely(!rt_mutex_has_waiters(lock))) { 327 if (likely(!rt_mutex_has_waiters(lock))) {
328 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1; 329 return 1;
330 } 330 }
331 331
@@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
333 next = rt_mutex_top_waiter(lock); 333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters); 334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner); 335 __rt_mutex_adjust_prio(pendowner);
336 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337 337
338 /* 338 /*
339 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
@@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
350 * might be task: 350 * might be task:
351 */ 351 */
352 if (likely(next->task != task)) { 352 if (likely(next->task != task)) {
353 spin_lock_irqsave(&task->pi_lock, flags); 353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task); 355 __rt_mutex_adjust_prio(task);
356 spin_unlock_irqrestore(&task->pi_lock, flags); 356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 } 357 }
358 return 1; 358 return 1;
359} 359}
@@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
420 unsigned long flags; 420 unsigned long flags;
421 int chain_walk = 0, res; 421 int chain_walk = 0, res;
422 422
423 spin_lock_irqsave(&task->pi_lock, flags); 423 raw_spin_lock_irqsave(&task->pi_lock, flags);
424 __rt_mutex_adjust_prio(task); 424 __rt_mutex_adjust_prio(task);
425 waiter->task = task; 425 waiter->task = task;
426 waiter->lock = lock; 426 waiter->lock = lock;
@@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
434 434
435 task->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
436 436
437 spin_unlock_irqrestore(&task->pi_lock, flags); 437 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 438
439 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
440 spin_lock_irqsave(&owner->pi_lock, flags); 440 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
443 443
444 __rt_mutex_adjust_prio(owner); 444 __rt_mutex_adjust_prio(owner);
445 if (owner->pi_blocked_on) 445 if (owner->pi_blocked_on)
446 chain_walk = 1; 446 chain_walk = 1;
447 spin_unlock_irqrestore(&owner->pi_lock, flags); 447 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
448 } 448 }
449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) 449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
450 chain_walk = 1; 450 chain_walk = 1;
@@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
459 */ 459 */
460 get_task_struct(owner); 460 get_task_struct(owner);
461 461
462 spin_unlock(&lock->wait_lock); 462 raw_spin_unlock(&lock->wait_lock);
463 463
464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
465 task); 465 task);
466 466
467 spin_lock(&lock->wait_lock); 467 raw_spin_lock(&lock->wait_lock);
468 468
469 return res; 469 return res;
470} 470}
@@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
483 struct task_struct *pendowner; 483 struct task_struct *pendowner;
484 unsigned long flags; 484 unsigned long flags;
485 485
486 spin_lock_irqsave(&current->pi_lock, flags); 486 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 487
488 waiter = rt_mutex_top_waiter(lock); 488 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list); 489 plist_del(&waiter->list_entry, &lock->wait_list);
@@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
500 500
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
502 502
503 spin_unlock_irqrestore(&current->pi_lock, flags); 503 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 504
505 /* 505 /*
506 * Clear the pi_blocked_on variable and enqueue a possible 506 * Clear the pi_blocked_on variable and enqueue a possible
@@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
509 * waiter with higher priority than pending-owner->normal_prio 509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner. 510 * is blocked on the unboosted (pending) owner.
511 */ 511 */
512 spin_lock_irqsave(&pendowner->pi_lock, flags); 512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513 513
514 WARN_ON(!pendowner->pi_blocked_on); 514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter); 515 WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
523 next = rt_mutex_top_waiter(lock); 523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters); 524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 } 525 }
526 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527 527
528 wake_up_process(pendowner); 528 wake_up_process(pendowner);
529} 529}
@@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock,
541 unsigned long flags; 541 unsigned long flags;
542 int chain_walk = 0; 542 int chain_walk = 0;
543 543
544 spin_lock_irqsave(&current->pi_lock, flags); 544 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 545 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL; 546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 547 current->pi_blocked_on = NULL;
548 spin_unlock_irqrestore(&current->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 549
550 if (first && owner != current) { 550 if (first && owner != current) {
551 551
552 spin_lock_irqsave(&owner->pi_lock, flags); 552 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 553
554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
555 555
@@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock,
564 if (owner->pi_blocked_on) 564 if (owner->pi_blocked_on)
565 chain_walk = 1; 565 chain_walk = 1;
566 566
567 spin_unlock_irqrestore(&owner->pi_lock, flags); 567 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
568 } 568 }
569 569
570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock,
575 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 575 /* gets dropped in rt_mutex_adjust_prio_chain()! */
576 get_task_struct(owner); 576 get_task_struct(owner);
577 577
578 spin_unlock(&lock->wait_lock); 578 raw_spin_unlock(&lock->wait_lock);
579 579
580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
581 581
582 spin_lock(&lock->wait_lock); 582 raw_spin_lock(&lock->wait_lock);
583} 583}
584 584
585/* 585/*
@@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
592 struct rt_mutex_waiter *waiter; 592 struct rt_mutex_waiter *waiter;
593 unsigned long flags; 593 unsigned long flags;
594 594
595 spin_lock_irqsave(&task->pi_lock, flags); 595 raw_spin_lock_irqsave(&task->pi_lock, flags);
596 596
597 waiter = task->pi_blocked_on; 597 waiter = task->pi_blocked_on;
598 if (!waiter || waiter->list_entry.prio == task->prio) { 598 if (!waiter || waiter->list_entry.prio == task->prio) {
599 spin_unlock_irqrestore(&task->pi_lock, flags); 599 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
600 return; 600 return;
601 } 601 }
602 602
603 spin_unlock_irqrestore(&task->pi_lock, flags); 603 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
604 604
605 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 605 /* gets dropped in rt_mutex_adjust_prio_chain()! */
606 get_task_struct(task); 606 get_task_struct(task);
@@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
672 break; 672 break;
673 } 673 }
674 674
675 spin_unlock(&lock->wait_lock); 675 raw_spin_unlock(&lock->wait_lock);
676 676
677 debug_rt_mutex_print_deadlock(waiter); 677 debug_rt_mutex_print_deadlock(waiter);
678 678
679 if (waiter->task) 679 if (waiter->task)
680 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
681 681
682 spin_lock(&lock->wait_lock); 682 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 683 set_current_state(state);
684 } 684 }
685 685
@@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
700 debug_rt_mutex_init_waiter(&waiter); 700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL; 701 waiter.task = NULL;
702 702
703 spin_lock(&lock->wait_lock); 703 raw_spin_lock(&lock->wait_lock);
704 704
705 /* Try to acquire the lock again: */ 705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock); 707 raw_spin_unlock(&lock->wait_lock);
708 return 0; 708 return 0;
709 } 709 }
710 710
@@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
731 */ 731 */
732 fixup_rt_mutex_waiters(lock); 732 fixup_rt_mutex_waiters(lock);
733 733
734 spin_unlock(&lock->wait_lock); 734 raw_spin_unlock(&lock->wait_lock);
735 735
736 /* Remove pending timer: */ 736 /* Remove pending timer: */
737 if (unlikely(timeout)) 737 if (unlikely(timeout))
@@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
758{ 758{
759 int ret = 0; 759 int ret = 0;
760 760
761 spin_lock(&lock->wait_lock); 761 raw_spin_lock(&lock->wait_lock);
762 762
763 if (likely(rt_mutex_owner(lock) != current)) { 763 if (likely(rt_mutex_owner(lock) != current)) {
764 764
@@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
770 fixup_rt_mutex_waiters(lock); 770 fixup_rt_mutex_waiters(lock);
771 } 771 }
772 772
773 spin_unlock(&lock->wait_lock); 773 raw_spin_unlock(&lock->wait_lock);
774 774
775 return ret; 775 return ret;
776} 776}
@@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
781static void __sched 781static void __sched
782rt_mutex_slowunlock(struct rt_mutex *lock) 782rt_mutex_slowunlock(struct rt_mutex *lock)
783{ 783{
784 spin_lock(&lock->wait_lock); 784 raw_spin_lock(&lock->wait_lock);
785 785
786 debug_rt_mutex_unlock(lock); 786 debug_rt_mutex_unlock(lock);
787 787
@@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
789 789
790 if (!rt_mutex_has_waiters(lock)) { 790 if (!rt_mutex_has_waiters(lock)) {
791 lock->owner = NULL; 791 lock->owner = NULL;
792 spin_unlock(&lock->wait_lock); 792 raw_spin_unlock(&lock->wait_lock);
793 return; 793 return;
794 } 794 }
795 795
796 wakeup_next_waiter(lock); 796 wakeup_next_waiter(lock);
797 797
798 spin_unlock(&lock->wait_lock); 798 raw_spin_unlock(&lock->wait_lock);
799 799
800 /* Undo pi boosting if necessary: */ 800 /* Undo pi boosting if necessary: */
801 rt_mutex_adjust_prio(current); 801 rt_mutex_adjust_prio(current);
@@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
970void __rt_mutex_init(struct rt_mutex *lock, const char *name) 970void __rt_mutex_init(struct rt_mutex *lock, const char *name)
971{ 971{
972 lock->owner = NULL; 972 lock->owner = NULL;
973 spin_lock_init(&lock->wait_lock); 973 raw_spin_lock_init(&lock->wait_lock);
974 plist_head_init(&lock->wait_list, &lock->wait_lock); 974 plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
975 975
976 debug_rt_mutex_init(lock, name); 976 debug_rt_mutex_init(lock, name);
977} 977}
@@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1032{ 1032{
1033 int ret; 1033 int ret;
1034 1034
1035 spin_lock(&lock->wait_lock); 1035 raw_spin_lock(&lock->wait_lock);
1036 1036
1037 mark_rt_mutex_waiters(lock); 1037 mark_rt_mutex_waiters(lock);
1038 1038
@@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1043 spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 1045 return 1;
1046 } 1046 }
@@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 1056 */
1057 ret = 0; 1057 ret = 0;
1058 } 1058 }
1059 spin_unlock(&lock->wait_lock); 1059 raw_spin_unlock(&lock->wait_lock);
1060 1060
1061 debug_rt_mutex_print_deadlock(waiter); 1061 debug_rt_mutex_print_deadlock(waiter);
1062 1062
@@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1106{ 1106{
1107 int ret; 1107 int ret;
1108 1108
1109 spin_lock(&lock->wait_lock); 1109 raw_spin_lock(&lock->wait_lock);
1110 1110
1111 set_current_state(TASK_INTERRUPTIBLE); 1111 set_current_state(TASK_INTERRUPTIBLE);
1112 1112
@@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1124 */ 1124 */
1125 fixup_rt_mutex_waiters(lock); 1125 fixup_rt_mutex_waiters(lock);
1126 1126
1127 spin_unlock(&lock->wait_lock); 1127 raw_spin_unlock(&lock->wait_lock);
1128 1128
1129 /* 1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been 1130 * Readjust priority, when we did not get the lock. We might have been
diff --git a/kernel/sched.c b/kernel/sched.c
index 76c0e9691fc..3a8fb30a91b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -141,7 +141,7 @@ struct rt_prio_array {
141 141
142struct rt_bandwidth { 142struct rt_bandwidth {
143 /* nests inside the rq lock: */ 143 /* nests inside the rq lock: */
144 spinlock_t rt_runtime_lock; 144 raw_spinlock_t rt_runtime_lock;
145 ktime_t rt_period; 145 ktime_t rt_period;
146 u64 rt_runtime; 146 u64 rt_runtime;
147 struct hrtimer rt_period_timer; 147 struct hrtimer rt_period_timer;
@@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
178 rt_b->rt_period = ns_to_ktime(period); 178 rt_b->rt_period = ns_to_ktime(period);
179 rt_b->rt_runtime = runtime; 179 rt_b->rt_runtime = runtime;
180 180
181 spin_lock_init(&rt_b->rt_runtime_lock); 181 raw_spin_lock_init(&rt_b->rt_runtime_lock);
182 182
183 hrtimer_init(&rt_b->rt_period_timer, 183 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 184 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
200 if (hrtimer_active(&rt_b->rt_period_timer)) 200 if (hrtimer_active(&rt_b->rt_period_timer))
201 return; 201 return;
202 202
203 spin_lock(&rt_b->rt_runtime_lock); 203 raw_spin_lock(&rt_b->rt_runtime_lock);
204 for (;;) { 204 for (;;) {
205 unsigned long delta; 205 unsigned long delta;
206 ktime_t soft, hard; 206 ktime_t soft, hard;
@@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
218 HRTIMER_MODE_ABS_PINNED, 0); 218 HRTIMER_MODE_ABS_PINNED, 0);
219 } 219 }
220 spin_unlock(&rt_b->rt_runtime_lock); 220 raw_spin_unlock(&rt_b->rt_runtime_lock);
221} 221}
222 222
223#ifdef CONFIG_RT_GROUP_SCHED 223#ifdef CONFIG_RT_GROUP_SCHED
@@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
298 298
299#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
309 */ 309 */
310static DEFINE_SPINLOCK(task_group_lock); 310static DEFINE_SPINLOCK(task_group_lock);
311 311
312#ifdef CONFIG_FAIR_GROUP_SCHED
313
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
313static int root_task_group_empty(void) 315static int root_task_group_empty(void)
314{ 316{
@@ -316,7 +318,6 @@ static int root_task_group_empty(void)
316} 318}
317#endif 319#endif
318 320
319#ifdef CONFIG_FAIR_GROUP_SCHED
320#ifdef CONFIG_USER_SCHED 321#ifdef CONFIG_USER_SCHED
321# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
322#else /* !CONFIG_USER_SCHED */ 323#else /* !CONFIG_USER_SCHED */
@@ -469,7 +470,7 @@ struct rt_rq {
469 u64 rt_time; 470 u64 rt_time;
470 u64 rt_runtime; 471 u64 rt_runtime;
471 /* Nests inside the rq lock: */ 472 /* Nests inside the rq lock: */
472 spinlock_t rt_runtime_lock; 473 raw_spinlock_t rt_runtime_lock;
473 474
474#ifdef CONFIG_RT_GROUP_SCHED 475#ifdef CONFIG_RT_GROUP_SCHED
475 unsigned long rt_nr_boosted; 476 unsigned long rt_nr_boosted;
@@ -524,7 +525,7 @@ static struct root_domain def_root_domain;
524 */ 525 */
525struct rq { 526struct rq {
526 /* runqueue lock: */ 527 /* runqueue lock: */
527 spinlock_t lock; 528 raw_spinlock_t lock;
528 529
529 /* 530 /*
530 * nr_running and cpu_load should be in the same cacheline because 531 * nr_running and cpu_load should be in the same cacheline because
@@ -534,14 +535,12 @@ struct rq {
534 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
535 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
536#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
537 unsigned long last_tick_seen;
538 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
539#endif 539#endif
540 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
541 struct load_weight load; 541 struct load_weight load;
542 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
543 u64 nr_switches; 543 u64 nr_switches;
544 u64 nr_migrations_in;
545 544
546 struct cfs_rq cfs; 545 struct cfs_rq cfs;
547 struct rt_rq rt; 546 struct rt_rq rt;
@@ -590,6 +589,8 @@ struct rq {
590 589
591 u64 rt_avg; 590 u64 rt_avg;
592 u64 age_stamp; 591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
593#endif 594#endif
594 595
595 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -676,6 +677,7 @@ inline void update_rq_clock(struct rq *rq)
676 677
677/** 678/**
678 * runqueue_is_locked 679 * runqueue_is_locked
680 * @cpu: the processor in question.
679 * 681 *
680 * Returns true if the current cpu runqueue is locked. 682 * Returns true if the current cpu runqueue is locked.
681 * This interface allows printk to be called with the runqueue lock 683 * This interface allows printk to be called with the runqueue lock
@@ -683,7 +685,7 @@ inline void update_rq_clock(struct rq *rq)
683 */ 685 */
684int runqueue_is_locked(int cpu) 686int runqueue_is_locked(int cpu)
685{ 687{
686 return spin_is_locked(&cpu_rq(cpu)->lock); 688 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
687} 689}
688 690
689/* 691/*
@@ -770,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
770 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
771 return -EINVAL; 773 return -EINVAL;
772 774
773 filp->f_pos += cnt; 775 *ppos += cnt;
774 776
775 return cnt; 777 return cnt;
776} 778}
@@ -812,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
812 * default: 0.25ms 814 * default: 0.25ms
813 */ 815 */
814unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815 818
816/* 819/*
817 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -890,7 +893,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
890 */ 893 */
891 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 894 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
892 895
893 spin_unlock_irq(&rq->lock); 896 raw_spin_unlock_irq(&rq->lock);
894} 897}
895 898
896#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 899#else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -914,9 +917,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
914 next->oncpu = 1; 917 next->oncpu = 1;
915#endif 918#endif
916#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 919#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
917 spin_unlock_irq(&rq->lock); 920 raw_spin_unlock_irq(&rq->lock);
918#else 921#else
919 spin_unlock(&rq->lock); 922 raw_spin_unlock(&rq->lock);
920#endif 923#endif
921} 924}
922 925
@@ -946,10 +949,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
946{ 949{
947 for (;;) { 950 for (;;) {
948 struct rq *rq = task_rq(p); 951 struct rq *rq = task_rq(p);
949 spin_lock(&rq->lock); 952 raw_spin_lock(&rq->lock);
950 if (likely(rq == task_rq(p))) 953 if (likely(rq == task_rq(p)))
951 return rq; 954 return rq;
952 spin_unlock(&rq->lock); 955 raw_spin_unlock(&rq->lock);
953 } 956 }
954} 957}
955 958
@@ -966,10 +969,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
966 for (;;) { 969 for (;;) {
967 local_irq_save(*flags); 970 local_irq_save(*flags);
968 rq = task_rq(p); 971 rq = task_rq(p);
969 spin_lock(&rq->lock); 972 raw_spin_lock(&rq->lock);
970 if (likely(rq == task_rq(p))) 973 if (likely(rq == task_rq(p)))
971 return rq; 974 return rq;
972 spin_unlock_irqrestore(&rq->lock, *flags); 975 raw_spin_unlock_irqrestore(&rq->lock, *flags);
973 } 976 }
974} 977}
975 978
@@ -978,19 +981,19 @@ void task_rq_unlock_wait(struct task_struct *p)
978 struct rq *rq = task_rq(p); 981 struct rq *rq = task_rq(p);
979 982
980 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 983 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
981 spin_unlock_wait(&rq->lock); 984 raw_spin_unlock_wait(&rq->lock);
982} 985}
983 986
984static void __task_rq_unlock(struct rq *rq) 987static void __task_rq_unlock(struct rq *rq)
985 __releases(rq->lock) 988 __releases(rq->lock)
986{ 989{
987 spin_unlock(&rq->lock); 990 raw_spin_unlock(&rq->lock);
988} 991}
989 992
990static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 993static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
991 __releases(rq->lock) 994 __releases(rq->lock)
992{ 995{
993 spin_unlock_irqrestore(&rq->lock, *flags); 996 raw_spin_unlock_irqrestore(&rq->lock, *flags);
994} 997}
995 998
996/* 999/*
@@ -1003,7 +1006,7 @@ static struct rq *this_rq_lock(void)
1003 1006
1004 local_irq_disable(); 1007 local_irq_disable();
1005 rq = this_rq(); 1008 rq = this_rq();
1006 spin_lock(&rq->lock); 1009 raw_spin_lock(&rq->lock);
1007 1010
1008 return rq; 1011 return rq;
1009} 1012}
@@ -1050,10 +1053,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1050 1053
1051 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1054 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1052 1055
1053 spin_lock(&rq->lock); 1056 raw_spin_lock(&rq->lock);
1054 update_rq_clock(rq); 1057 update_rq_clock(rq);
1055 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1058 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1056 spin_unlock(&rq->lock); 1059 raw_spin_unlock(&rq->lock);
1057 1060
1058 return HRTIMER_NORESTART; 1061 return HRTIMER_NORESTART;
1059} 1062}
@@ -1066,10 +1069,10 @@ static void __hrtick_start(void *arg)
1066{ 1069{
1067 struct rq *rq = arg; 1070 struct rq *rq = arg;
1068 1071
1069 spin_lock(&rq->lock); 1072 raw_spin_lock(&rq->lock);
1070 hrtimer_restart(&rq->hrtick_timer); 1073 hrtimer_restart(&rq->hrtick_timer);
1071 rq->hrtick_csd_pending = 0; 1074 rq->hrtick_csd_pending = 0;
1072 spin_unlock(&rq->lock); 1075 raw_spin_unlock(&rq->lock);
1073} 1076}
1074 1077
1075/* 1078/*
@@ -1176,7 +1179,7 @@ static void resched_task(struct task_struct *p)
1176{ 1179{
1177 int cpu; 1180 int cpu;
1178 1181
1179 assert_spin_locked(&task_rq(p)->lock); 1182 assert_raw_spin_locked(&task_rq(p)->lock);
1180 1183
1181 if (test_tsk_need_resched(p)) 1184 if (test_tsk_need_resched(p))
1182 return; 1185 return;
@@ -1198,10 +1201,10 @@ static void resched_cpu(int cpu)
1198 struct rq *rq = cpu_rq(cpu); 1201 struct rq *rq = cpu_rq(cpu);
1199 unsigned long flags; 1202 unsigned long flags;
1200 1203
1201 if (!spin_trylock_irqsave(&rq->lock, flags)) 1204 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1202 return; 1205 return;
1203 resched_task(cpu_curr(cpu)); 1206 resched_task(cpu_curr(cpu));
1204 spin_unlock_irqrestore(&rq->lock, flags); 1207 raw_spin_unlock_irqrestore(&rq->lock, flags);
1205} 1208}
1206 1209
1207#ifdef CONFIG_NO_HZ 1210#ifdef CONFIG_NO_HZ
@@ -1270,7 +1273,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1270#else /* !CONFIG_SMP */ 1273#else /* !CONFIG_SMP */
1271static void resched_task(struct task_struct *p) 1274static void resched_task(struct task_struct *p)
1272{ 1275{
1273 assert_spin_locked(&task_rq(p)->lock); 1276 assert_raw_spin_locked(&task_rq(p)->lock);
1274 set_tsk_need_resched(p); 1277 set_tsk_need_resched(p);
1275} 1278}
1276 1279
@@ -1563,11 +1566,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1563 1566
1564#ifdef CONFIG_FAIR_GROUP_SCHED 1567#ifdef CONFIG_FAIR_GROUP_SCHED
1565 1568
1566struct update_shares_data { 1569static __read_mostly unsigned long *update_shares_data;
1567 unsigned long rq_weight[NR_CPUS];
1568};
1569
1570static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1571 1570
1572static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1571static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1573 1572
@@ -1577,12 +1576,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1577static void update_group_shares_cpu(struct task_group *tg, int cpu, 1576static void update_group_shares_cpu(struct task_group *tg, int cpu,
1578 unsigned long sd_shares, 1577 unsigned long sd_shares,
1579 unsigned long sd_rq_weight, 1578 unsigned long sd_rq_weight,
1580 struct update_shares_data *usd) 1579 unsigned long *usd_rq_weight)
1581{ 1580{
1582 unsigned long shares, rq_weight; 1581 unsigned long shares, rq_weight;
1583 int boost = 0; 1582 int boost = 0;
1584 1583
1585 rq_weight = usd->rq_weight[cpu]; 1584 rq_weight = usd_rq_weight[cpu];
1586 if (!rq_weight) { 1585 if (!rq_weight) {
1587 boost = 1; 1586 boost = 1;
1588 rq_weight = NICE_0_LOAD; 1587 rq_weight = NICE_0_LOAD;
@@ -1601,11 +1600,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1601 struct rq *rq = cpu_rq(cpu); 1600 struct rq *rq = cpu_rq(cpu);
1602 unsigned long flags; 1601 unsigned long flags;
1603 1602
1604 spin_lock_irqsave(&rq->lock, flags); 1603 raw_spin_lock_irqsave(&rq->lock, flags);
1605 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1604 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1606 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1605 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1607 __set_se_shares(tg->se[cpu], shares); 1606 __set_se_shares(tg->se[cpu], shares);
1608 spin_unlock_irqrestore(&rq->lock, flags); 1607 raw_spin_unlock_irqrestore(&rq->lock, flags);
1609 } 1608 }
1610} 1609}
1611 1610
@@ -1616,8 +1615,8 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1616 */ 1615 */
1617static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1618{ 1617{
1619 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1620 struct update_shares_data *usd; 1619 unsigned long *usd_rq_weight;
1621 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1622 unsigned long flags; 1621 unsigned long flags;
1623 int i; 1622 int i;
@@ -1626,12 +1625,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1626 return 0; 1625 return 0;
1627 1626
1628 local_irq_save(flags); 1627 local_irq_save(flags);
1629 usd = &__get_cpu_var(update_shares_data); 1628 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1630 1629
1631 for_each_cpu(i, sched_domain_span(sd)) { 1630 for_each_cpu(i, sched_domain_span(sd)) {
1632 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1633 usd->rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1634 1633
1634 rq_weight += weight;
1635 /* 1635 /*
1636 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1637 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1640,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1640 if (!weight) 1640 if (!weight)
1641 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1642 1642
1643 rq_weight += weight; 1643 sum_weight += weight;
1644 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1645 } 1645 }
1646 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1647 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1648 shares = tg->shares; 1651 shares = tg->shares;
1649 1652
@@ -1651,7 +1654,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1651 shares = tg->shares; 1654 shares = tg->shares;
1652 1655
1653 for_each_cpu(i, sched_domain_span(sd)) 1656 for_each_cpu(i, sched_domain_span(sd))
1654 update_group_shares_cpu(tg, i, shares, rq_weight, usd); 1657 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1655 1658
1656 local_irq_restore(flags); 1659 local_irq_restore(flags);
1657 1660
@@ -1703,9 +1706,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1703 if (root_task_group_empty()) 1706 if (root_task_group_empty())
1704 return; 1707 return;
1705 1708
1706 spin_unlock(&rq->lock); 1709 raw_spin_unlock(&rq->lock);
1707 update_shares(sd); 1710 update_shares(sd);
1708 spin_lock(&rq->lock); 1711 raw_spin_lock(&rq->lock);
1709} 1712}
1710 1713
1711static void update_h_load(long cpu) 1714static void update_h_load(long cpu)
@@ -1745,7 +1748,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1745 __acquires(busiest->lock) 1748 __acquires(busiest->lock)
1746 __acquires(this_rq->lock) 1749 __acquires(this_rq->lock)
1747{ 1750{
1748 spin_unlock(&this_rq->lock); 1751 raw_spin_unlock(&this_rq->lock);
1749 double_rq_lock(this_rq, busiest); 1752 double_rq_lock(this_rq, busiest);
1750 1753
1751 return 1; 1754 return 1;
@@ -1766,14 +1769,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1766{ 1769{
1767 int ret = 0; 1770 int ret = 0;
1768 1771
1769 if (unlikely(!spin_trylock(&busiest->lock))) { 1772 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1770 if (busiest < this_rq) { 1773 if (busiest < this_rq) {
1771 spin_unlock(&this_rq->lock); 1774 raw_spin_unlock(&this_rq->lock);
1772 spin_lock(&busiest->lock); 1775 raw_spin_lock(&busiest->lock);
1773 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1776 raw_spin_lock_nested(&this_rq->lock,
1777 SINGLE_DEPTH_NESTING);
1774 ret = 1; 1778 ret = 1;
1775 } else 1779 } else
1776 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1780 raw_spin_lock_nested(&busiest->lock,
1781 SINGLE_DEPTH_NESTING);
1777 } 1782 }
1778 return ret; 1783 return ret;
1779} 1784}
@@ -1787,7 +1792,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1787{ 1792{
1788 if (unlikely(!irqs_disabled())) { 1793 if (unlikely(!irqs_disabled())) {
1789 /* printk() doesn't work good under rq->lock */ 1794 /* printk() doesn't work good under rq->lock */
1790 spin_unlock(&this_rq->lock); 1795 raw_spin_unlock(&this_rq->lock);
1791 BUG_ON(1); 1796 BUG_ON(1);
1792 } 1797 }
1793 1798
@@ -1797,7 +1802,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1797static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1802static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1798 __releases(busiest->lock) 1803 __releases(busiest->lock)
1799{ 1804{
1800 spin_unlock(&busiest->lock); 1805 raw_spin_unlock(&busiest->lock);
1801 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1802} 1807}
1803#endif 1808#endif
@@ -1812,6 +1817,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1812#endif 1817#endif
1813 1818
1814static void calc_load_account_active(struct rq *this_rq); 1819static void calc_load_account_active(struct rq *this_rq);
1820static void update_sysctl(void);
1821static int get_update_sysctl_factor(void);
1822
1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1824{
1825 set_task_rq(p, cpu);
1826#ifdef CONFIG_SMP
1827 /*
1828 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1829 * successfuly executed on another CPU. We must ensure that updates of
1830 * per-task data have been completed by this moment.
1831 */
1832 smp_wmb();
1833 task_thread_info(p)->cpu = cpu;
1834#endif
1835}
1815 1836
1816#include "sched_stats.h" 1837#include "sched_stats.h"
1817#include "sched_idletask.c" 1838#include "sched_idletask.c"
@@ -1969,20 +1990,6 @@ inline int task_curr(const struct task_struct *p)
1969 return cpu_curr(task_cpu(p)) == p; 1990 return cpu_curr(task_cpu(p)) == p;
1970} 1991}
1971 1992
1972static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1973{
1974 set_task_rq(p, cpu);
1975#ifdef CONFIG_SMP
1976 /*
1977 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1978 * successfuly executed on another CPU. We must ensure that updates of
1979 * per-task data have been completed by this moment.
1980 */
1981 smp_wmb();
1982 task_thread_info(p)->cpu = cpu;
1983#endif
1984}
1985
1986static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1993static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1987 const struct sched_class *prev_class, 1994 const struct sched_class *prev_class,
1988 int oldprio, int running) 1995 int oldprio, int running)
@@ -2004,17 +2011,17 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2004{ 2011{
2005 s64 delta; 2012 s64 delta;
2006 2013
2014 if (p->sched_class != &fair_sched_class)
2015 return 0;
2016
2007 /* 2017 /*
2008 * Buddy candidates are cache hot: 2018 * Buddy candidates are cache hot:
2009 */ 2019 */
2010 if (sched_feat(CACHE_HOT_BUDDY) && 2020 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2011 (&p->se == cfs_rq_of(&p->se)->next || 2021 (&p->se == cfs_rq_of(&p->se)->next ||
2012 &p->se == cfs_rq_of(&p->se)->last)) 2022 &p->se == cfs_rq_of(&p->se)->last))
2013 return 1; 2023 return 1;
2014 2024
2015 if (p->sched_class != &fair_sched_class)
2016 return 0;
2017
2018 if (sysctl_sched_migration_cost == -1) 2025 if (sysctl_sched_migration_cost == -1)
2019 return 1; 2026 return 1;
2020 if (sysctl_sched_migration_cost == 0) 2027 if (sysctl_sched_migration_cost == 0)
@@ -2025,39 +2032,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2025 return delta < (s64)sysctl_sched_migration_cost; 2032 return delta < (s64)sysctl_sched_migration_cost;
2026} 2033}
2027 2034
2028
2029void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2035void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2030{ 2036{
2031 int old_cpu = task_cpu(p); 2037#ifdef CONFIG_SCHED_DEBUG
2032 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 2038 /*
2033 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2039 * We should never call set_task_cpu() on a blocked task,
2034 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2040 * ttwu() will sort out the placement.
2035 u64 clock_offset; 2041 */
2036 2042 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2037 clock_offset = old_rq->clock - new_rq->clock; 2043 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2044#endif
2038 2045
2039 trace_sched_migrate_task(p, new_cpu); 2046 trace_sched_migrate_task(p, new_cpu);
2040 2047
2041#ifdef CONFIG_SCHEDSTATS 2048 if (task_cpu(p) != new_cpu) {
2042 if (p->se.wait_start)
2043 p->se.wait_start -= clock_offset;
2044 if (p->se.sleep_start)
2045 p->se.sleep_start -= clock_offset;
2046 if (p->se.block_start)
2047 p->se.block_start -= clock_offset;
2048#endif
2049 if (old_cpu != new_cpu) {
2050 p->se.nr_migrations++; 2049 p->se.nr_migrations++;
2051 new_rq->nr_migrations_in++; 2050 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2052#ifdef CONFIG_SCHEDSTATS
2053 if (task_hot(p, old_rq->clock, NULL))
2054 schedstat_inc(p, se.nr_forced2_migrations);
2055#endif
2056 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2057 1, 1, NULL, 0);
2058 } 2051 }
2059 p->se.vruntime -= old_cfsrq->min_vruntime -
2060 new_cfsrq->min_vruntime;
2061 2052
2062 __set_task_cpu(p, new_cpu); 2053 __set_task_cpu(p, new_cpu);
2063} 2054}
@@ -2082,12 +2073,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2082 2073
2083 /* 2074 /*
2084 * If the task is not on a runqueue (and not running), then 2075 * If the task is not on a runqueue (and not running), then
2085 * it is sufficient to simply update the task's cpu field. 2076 * the next wake-up will properly place the task.
2086 */ 2077 */
2087 if (!p->se.on_rq && !task_running(rq, p)) { 2078 if (!p->se.on_rq && !task_running(rq, p))
2088 set_task_cpu(p, dest_cpu);
2089 return 0; 2079 return 0;
2090 }
2091 2080
2092 init_completion(&req->done); 2081 init_completion(&req->done);
2093 req->task = p; 2082 req->task = p;
@@ -2292,6 +2281,75 @@ void task_oncpu_function_call(struct task_struct *p,
2292 preempt_enable(); 2281 preempt_enable();
2293} 2282}
2294 2283
2284#ifdef CONFIG_SMP
2285static int select_fallback_rq(int cpu, struct task_struct *p)
2286{
2287 int dest_cpu;
2288 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2289
2290 /* Look for allowed, online CPU in same node. */
2291 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2292 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2293 return dest_cpu;
2294
2295 /* Any allowed, online CPU? */
2296 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2297 if (dest_cpu < nr_cpu_ids)
2298 return dest_cpu;
2299
2300 /* No more Mr. Nice Guy. */
2301 if (dest_cpu >= nr_cpu_ids) {
2302 rcu_read_lock();
2303 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2304 rcu_read_unlock();
2305 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2306
2307 /*
2308 * Don't tell them about moving exiting tasks or
2309 * kernel threads (both mm NULL), since they never
2310 * leave kernel.
2311 */
2312 if (p->mm && printk_ratelimit()) {
2313 printk(KERN_INFO "process %d (%s) no "
2314 "longer affine to cpu%d\n",
2315 task_pid_nr(p), p->comm, cpu);
2316 }
2317 }
2318
2319 return dest_cpu;
2320}
2321
2322/*
2323 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2324 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2325 * by:
2326 *
2327 * exec: is unstable, retry loop
2328 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2329 */
2330static inline
2331int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2332{
2333 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2334
2335 /*
2336 * In order not to call set_task_cpu() on a blocking task we need
2337 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2338 * cpu.
2339 *
2340 * Since this is common to all placement strategies, this lives here.
2341 *
2342 * [ this allows ->select_task() to simply return task_cpu(p) and
2343 * not worry about this generic constraint ]
2344 */
2345 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2346 !cpu_online(cpu)))
2347 cpu = select_fallback_rq(task_cpu(p), p);
2348
2349 return cpu;
2350}
2351#endif
2352
2295/*** 2353/***
2296 * try_to_wake_up - wake up a thread 2354 * try_to_wake_up - wake up a thread
2297 * @p: the to-be-woken-up thread 2355 * @p: the to-be-woken-up thread
@@ -2311,7 +2369,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2311{ 2369{
2312 int cpu, orig_cpu, this_cpu, success = 0; 2370 int cpu, orig_cpu, this_cpu, success = 0;
2313 unsigned long flags; 2371 unsigned long flags;
2314 struct rq *rq; 2372 struct rq *rq, *orig_rq;
2315 2373
2316 if (!sched_feat(SYNC_WAKEUPS)) 2374 if (!sched_feat(SYNC_WAKEUPS))
2317 wake_flags &= ~WF_SYNC; 2375 wake_flags &= ~WF_SYNC;
@@ -2319,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2319 this_cpu = get_cpu(); 2377 this_cpu = get_cpu();
2320 2378
2321 smp_wmb(); 2379 smp_wmb();
2322 rq = task_rq_lock(p, &flags); 2380 rq = orig_rq = task_rq_lock(p, &flags);
2323 update_rq_clock(rq); 2381 update_rq_clock(rq);
2324 if (!(p->state & state)) 2382 if (!(p->state & state))
2325 goto out; 2383 goto out;
@@ -2343,13 +2401,19 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2343 if (task_contributes_to_load(p)) 2401 if (task_contributes_to_load(p))
2344 rq->nr_uninterruptible--; 2402 rq->nr_uninterruptible--;
2345 p->state = TASK_WAKING; 2403 p->state = TASK_WAKING;
2346 task_rq_unlock(rq, &flags);
2347 2404
2348 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2405 if (p->sched_class->task_waking)
2406 p->sched_class->task_waking(rq, p);
2407
2408 __task_rq_unlock(rq);
2409
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2349 if (cpu != orig_cpu) 2411 if (cpu != orig_cpu)
2350 set_task_cpu(p, cpu); 2412 set_task_cpu(p, cpu);
2351 2413
2352 rq = task_rq_lock(p, &flags); 2414 rq = __task_rq_lock(p);
2415 update_rq_clock(rq);
2416
2353 WARN_ON(p->state != TASK_WAKING); 2417 WARN_ON(p->state != TASK_WAKING);
2354 cpu = task_cpu(p); 2418 cpu = task_cpu(p);
2355 2419
@@ -2404,8 +2468,19 @@ out_running:
2404 2468
2405 p->state = TASK_RUNNING; 2469 p->state = TASK_RUNNING;
2406#ifdef CONFIG_SMP 2470#ifdef CONFIG_SMP
2407 if (p->sched_class->task_wake_up) 2471 if (p->sched_class->task_woken)
2408 p->sched_class->task_wake_up(rq, p); 2472 p->sched_class->task_woken(rq, p);
2473
2474 if (unlikely(rq->idle_stamp)) {
2475 u64 delta = rq->clock - rq->idle_stamp;
2476 u64 max = 2*sysctl_sched_migration_cost;
2477
2478 if (delta > max)
2479 rq->avg_idle = max;
2480 else
2481 update_avg(&rq->avg_idle, delta);
2482 rq->idle_stamp = 0;
2483 }
2409#endif 2484#endif
2410out: 2485out:
2411 task_rq_unlock(rq, &flags); 2486 task_rq_unlock(rq, &flags);
@@ -2452,7 +2527,6 @@ static void __sched_fork(struct task_struct *p)
2452 p->se.avg_overlap = 0; 2527 p->se.avg_overlap = 0;
2453 p->se.start_runtime = 0; 2528 p->se.start_runtime = 0;
2454 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2529 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2455 p->se.avg_running = 0;
2456 2530
2457#ifdef CONFIG_SCHEDSTATS 2531#ifdef CONFIG_SCHEDSTATS
2458 p->se.wait_start = 0; 2532 p->se.wait_start = 0;
@@ -2474,7 +2548,6 @@ static void __sched_fork(struct task_struct *p)
2474 p->se.nr_failed_migrations_running = 0; 2548 p->se.nr_failed_migrations_running = 0;
2475 p->se.nr_failed_migrations_hot = 0; 2549 p->se.nr_failed_migrations_hot = 0;
2476 p->se.nr_forced_migrations = 0; 2550 p->se.nr_forced_migrations = 0;
2477 p->se.nr_forced2_migrations = 0;
2478 2551
2479 p->se.nr_wakeups = 0; 2552 p->se.nr_wakeups = 0;
2480 p->se.nr_wakeups_sync = 0; 2553 p->se.nr_wakeups_sync = 0;
@@ -2495,14 +2568,6 @@ static void __sched_fork(struct task_struct *p)
2495#ifdef CONFIG_PREEMPT_NOTIFIERS 2568#ifdef CONFIG_PREEMPT_NOTIFIERS
2496 INIT_HLIST_HEAD(&p->preempt_notifiers); 2569 INIT_HLIST_HEAD(&p->preempt_notifiers);
2497#endif 2570#endif
2498
2499 /*
2500 * We mark the process as running here, but have not actually
2501 * inserted it onto the runqueue yet. This guarantees that
2502 * nobody will actually run it, and a signal or other external
2503 * event cannot wake it up and insert it on the runqueue either.
2504 */
2505 p->state = TASK_RUNNING;
2506} 2571}
2507 2572
2508/* 2573/*
@@ -2513,6 +2578,12 @@ void sched_fork(struct task_struct *p, int clone_flags)
2513 int cpu = get_cpu(); 2578 int cpu = get_cpu();
2514 2579
2515 __sched_fork(p); 2580 __sched_fork(p);
2581 /*
2582 * We mark the process as waking here. This guarantees that
2583 * nobody will actually run it, and a signal or other external
2584 * event cannot wake it up and insert it on the runqueue either.
2585 */
2586 p->state = TASK_WAKING;
2516 2587
2517 /* 2588 /*
2518 * Revert to default priority/policy on fork if requested. 2589 * Revert to default priority/policy on fork if requested.
@@ -2544,9 +2615,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2544 if (!rt_prio(p->prio)) 2615 if (!rt_prio(p->prio))
2545 p->sched_class = &fair_sched_class; 2616 p->sched_class = &fair_sched_class;
2546 2617
2547#ifdef CONFIG_SMP 2618 if (p->sched_class->task_fork)
2548 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2619 p->sched_class->task_fork(p);
2549#endif 2620
2550 set_task_cpu(p, cpu); 2621 set_task_cpu(p, cpu);
2551 2622
2552#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2623#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2576,28 +2647,35 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2576{ 2647{
2577 unsigned long flags; 2648 unsigned long flags;
2578 struct rq *rq; 2649 struct rq *rq;
2650 int cpu = get_cpu();
2651
2652#ifdef CONFIG_SMP
2653 /*
2654 * Fork balancing, do it here and not earlier because:
2655 * - cpus_allowed can change in the fork path
2656 * - any previously selected cpu might disappear through hotplug
2657 *
2658 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2659 * ->cpus_allowed is stable, we have preemption disabled, meaning
2660 * cpu_online_mask is stable.
2661 */
2662 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2663 set_task_cpu(p, cpu);
2664#endif
2579 2665
2580 rq = task_rq_lock(p, &flags); 2666 rq = task_rq_lock(p, &flags);
2581 BUG_ON(p->state != TASK_RUNNING); 2667 BUG_ON(p->state != TASK_WAKING);
2668 p->state = TASK_RUNNING;
2582 update_rq_clock(rq); 2669 update_rq_clock(rq);
2583 2670 activate_task(rq, p, 0);
2584 if (!p->sched_class->task_new || !current->se.on_rq) {
2585 activate_task(rq, p, 0);
2586 } else {
2587 /*
2588 * Let the scheduling class do new task startup
2589 * management (if any):
2590 */
2591 p->sched_class->task_new(rq, p);
2592 inc_nr_running(rq);
2593 }
2594 trace_sched_wakeup_new(rq, p, 1); 2671 trace_sched_wakeup_new(rq, p, 1);
2595 check_preempt_curr(rq, p, WF_FORK); 2672 check_preempt_curr(rq, p, WF_FORK);
2596#ifdef CONFIG_SMP 2673#ifdef CONFIG_SMP
2597 if (p->sched_class->task_wake_up) 2674 if (p->sched_class->task_woken)
2598 p->sched_class->task_wake_up(rq, p); 2675 p->sched_class->task_woken(rq, p);
2599#endif 2676#endif
2600 task_rq_unlock(rq, &flags); 2677 task_rq_unlock(rq, &flags);
2678 put_cpu();
2601} 2679}
2602 2680
2603#ifdef CONFIG_PREEMPT_NOTIFIERS 2681#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2747,10 +2825,10 @@ static inline void post_schedule(struct rq *rq)
2747 if (rq->post_schedule) { 2825 if (rq->post_schedule) {
2748 unsigned long flags; 2826 unsigned long flags;
2749 2827
2750 spin_lock_irqsave(&rq->lock, flags); 2828 raw_spin_lock_irqsave(&rq->lock, flags);
2751 if (rq->curr->sched_class->post_schedule) 2829 if (rq->curr->sched_class->post_schedule)
2752 rq->curr->sched_class->post_schedule(rq); 2830 rq->curr->sched_class->post_schedule(rq);
2753 spin_unlock_irqrestore(&rq->lock, flags); 2831 raw_spin_unlock_irqrestore(&rq->lock, flags);
2754 2832
2755 rq->post_schedule = 0; 2833 rq->post_schedule = 0;
2756 } 2834 }
@@ -2814,14 +2892,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2814 */ 2892 */
2815 arch_start_context_switch(prev); 2893 arch_start_context_switch(prev);
2816 2894
2817 if (unlikely(!mm)) { 2895 if (likely(!mm)) {
2818 next->active_mm = oldmm; 2896 next->active_mm = oldmm;
2819 atomic_inc(&oldmm->mm_count); 2897 atomic_inc(&oldmm->mm_count);
2820 enter_lazy_tlb(oldmm, next); 2898 enter_lazy_tlb(oldmm, next);
2821 } else 2899 } else
2822 switch_mm(oldmm, mm, next); 2900 switch_mm(oldmm, mm, next);
2823 2901
2824 if (unlikely(!prev->mm)) { 2902 if (likely(!prev->mm)) {
2825 prev->active_mm = NULL; 2903 prev->active_mm = NULL;
2826 rq->prev_mm = oldmm; 2904 rq->prev_mm = oldmm;
2827 } 2905 }
@@ -2984,15 +3062,6 @@ static void calc_load_account_active(struct rq *this_rq)
2984} 3062}
2985 3063
2986/* 3064/*
2987 * Externally visible per-cpu scheduler statistics:
2988 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2989 */
2990u64 cpu_nr_migrations(int cpu)
2991{
2992 return cpu_rq(cpu)->nr_migrations_in;
2993}
2994
2995/*
2996 * Update rq->cpu_load[] statistics. This function is usually called every 3065 * Update rq->cpu_load[] statistics. This function is usually called every
2997 * scheduler tick (TICK_NSEC). 3066 * scheduler tick (TICK_NSEC).
2998 */ 3067 */
@@ -3041,15 +3110,15 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3041{ 3110{
3042 BUG_ON(!irqs_disabled()); 3111 BUG_ON(!irqs_disabled());
3043 if (rq1 == rq2) { 3112 if (rq1 == rq2) {
3044 spin_lock(&rq1->lock); 3113 raw_spin_lock(&rq1->lock);
3045 __acquire(rq2->lock); /* Fake it out ;) */ 3114 __acquire(rq2->lock); /* Fake it out ;) */
3046 } else { 3115 } else {
3047 if (rq1 < rq2) { 3116 if (rq1 < rq2) {
3048 spin_lock(&rq1->lock); 3117 raw_spin_lock(&rq1->lock);
3049 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 3118 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3050 } else { 3119 } else {
3051 spin_lock(&rq2->lock); 3120 raw_spin_lock(&rq2->lock);
3052 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 3121 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3053 } 3122 }
3054 } 3123 }
3055 update_rq_clock(rq1); 3124 update_rq_clock(rq1);
@@ -3066,29 +3135,44 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3066 __releases(rq1->lock) 3135 __releases(rq1->lock)
3067 __releases(rq2->lock) 3136 __releases(rq2->lock)
3068{ 3137{
3069 spin_unlock(&rq1->lock); 3138 raw_spin_unlock(&rq1->lock);
3070 if (rq1 != rq2) 3139 if (rq1 != rq2)
3071 spin_unlock(&rq2->lock); 3140 raw_spin_unlock(&rq2->lock);
3072 else 3141 else
3073 __release(rq2->lock); 3142 __release(rq2->lock);
3074} 3143}
3075 3144
3076/* 3145/*
3077 * If dest_cpu is allowed for this process, migrate the task to it. 3146 * sched_exec - execve() is a valuable balancing opportunity, because at
3078 * This is accomplished by forcing the cpu_allowed mask to only 3147 * this point the task has the smallest effective memory and cache footprint.
3079 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3080 * the cpu_allowed mask is restored.
3081 */ 3148 */
3082static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3149void sched_exec(void)
3083{ 3150{
3151 struct task_struct *p = current;
3084 struct migration_req req; 3152 struct migration_req req;
3153 int dest_cpu, this_cpu;
3085 unsigned long flags; 3154 unsigned long flags;
3086 struct rq *rq; 3155 struct rq *rq;
3087 3156
3157again:
3158 this_cpu = get_cpu();
3159 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3160 if (dest_cpu == this_cpu) {
3161 put_cpu();
3162 return;
3163 }
3164
3088 rq = task_rq_lock(p, &flags); 3165 rq = task_rq_lock(p, &flags);
3166 put_cpu();
3167
3168 /*
3169 * select_task_rq() can race against ->cpus_allowed
3170 */
3089 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3171 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3090 || unlikely(!cpu_active(dest_cpu))) 3172 || unlikely(!cpu_active(dest_cpu))) {
3091 goto out; 3173 task_rq_unlock(rq, &flags);
3174 goto again;
3175 }
3092 3176
3093 /* force the process onto the specified CPU */ 3177 /* force the process onto the specified CPU */
3094 if (migrate_task(p, dest_cpu, &req)) { 3178 if (migrate_task(p, dest_cpu, &req)) {
@@ -3103,24 +3187,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3103 3187
3104 return; 3188 return;
3105 } 3189 }
3106out:
3107 task_rq_unlock(rq, &flags); 3190 task_rq_unlock(rq, &flags);
3108} 3191}
3109 3192
3110/* 3193/*
3111 * sched_exec - execve() is a valuable balancing opportunity, because at
3112 * this point the task has the smallest effective memory and cache footprint.
3113 */
3114void sched_exec(void)
3115{
3116 int new_cpu, this_cpu = get_cpu();
3117 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3118 put_cpu();
3119 if (new_cpu != this_cpu)
3120 sched_migrate_task(current, new_cpu);
3121}
3122
3123/*
3124 * pull_task - move a task from a remote runqueue to the local runqueue. 3194 * pull_task - move a task from a remote runqueue to the local runqueue.
3125 * Both runqueues must be locked. 3195 * Both runqueues must be locked.
3126 */ 3196 */
@@ -3130,10 +3200,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3130 deactivate_task(src_rq, p, 0); 3200 deactivate_task(src_rq, p, 0);
3131 set_task_cpu(p, this_cpu); 3201 set_task_cpu(p, this_cpu);
3132 activate_task(this_rq, p, 0); 3202 activate_task(this_rq, p, 0);
3133 /*
3134 * Note that idle threads have a prio of MAX_PRIO, for this test
3135 * to be always true for them.
3136 */
3137 check_preempt_curr(this_rq, p, 0); 3203 check_preempt_curr(this_rq, p, 0);
3138} 3204}
3139 3205
@@ -3656,6 +3722,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
3656 3722
3657/** 3723/**
3658 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3724 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3725 * @sd: The sched_domain whose statistics are to be updated.
3659 * @group: sched_group whose statistics are to be updated. 3726 * @group: sched_group whose statistics are to be updated.
3660 * @this_cpu: Cpu for which load balance is currently performed. 3727 * @this_cpu: Cpu for which load balance is currently performed.
3661 * @idle: Idle status of this_cpu 3728 * @idle: Idle status of this_cpu
@@ -4091,7 +4158,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4091 unsigned long flags; 4158 unsigned long flags;
4092 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4159 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4093 4160
4094 cpumask_setall(cpus); 4161 cpumask_copy(cpus, cpu_active_mask);
4095 4162
4096 /* 4163 /*
4097 * When power savings policy is enabled for the parent domain, idle 4164 * When power savings policy is enabled for the parent domain, idle
@@ -4164,14 +4231,15 @@ redo:
4164 4231
4165 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 4232 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4166 4233
4167 spin_lock_irqsave(&busiest->lock, flags); 4234 raw_spin_lock_irqsave(&busiest->lock, flags);
4168 4235
4169 /* don't kick the migration_thread, if the curr 4236 /* don't kick the migration_thread, if the curr
4170 * task on busiest cpu can't be moved to this_cpu 4237 * task on busiest cpu can't be moved to this_cpu
4171 */ 4238 */
4172 if (!cpumask_test_cpu(this_cpu, 4239 if (!cpumask_test_cpu(this_cpu,
4173 &busiest->curr->cpus_allowed)) { 4240 &busiest->curr->cpus_allowed)) {
4174 spin_unlock_irqrestore(&busiest->lock, flags); 4241 raw_spin_unlock_irqrestore(&busiest->lock,
4242 flags);
4175 all_pinned = 1; 4243 all_pinned = 1;
4176 goto out_one_pinned; 4244 goto out_one_pinned;
4177 } 4245 }
@@ -4181,7 +4249,7 @@ redo:
4181 busiest->push_cpu = this_cpu; 4249 busiest->push_cpu = this_cpu;
4182 active_balance = 1; 4250 active_balance = 1;
4183 } 4251 }
4184 spin_unlock_irqrestore(&busiest->lock, flags); 4252 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4185 if (active_balance) 4253 if (active_balance)
4186 wake_up_process(busiest->migration_thread); 4254 wake_up_process(busiest->migration_thread);
4187 4255
@@ -4254,7 +4322,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4254 int all_pinned = 0; 4322 int all_pinned = 0;
4255 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4323 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4256 4324
4257 cpumask_setall(cpus); 4325 cpumask_copy(cpus, cpu_active_mask);
4258 4326
4259 /* 4327 /*
4260 * When power savings policy is enabled for the parent domain, idle 4328 * When power savings policy is enabled for the parent domain, idle
@@ -4363,10 +4431,10 @@ redo:
4363 /* 4431 /*
4364 * Should not call ttwu while holding a rq->lock 4432 * Should not call ttwu while holding a rq->lock
4365 */ 4433 */
4366 spin_unlock(&this_rq->lock); 4434 raw_spin_unlock(&this_rq->lock);
4367 if (active_balance) 4435 if (active_balance)
4368 wake_up_process(busiest->migration_thread); 4436 wake_up_process(busiest->migration_thread);
4369 spin_lock(&this_rq->lock); 4437 raw_spin_lock(&this_rq->lock);
4370 4438
4371 } else 4439 } else
4372 sd->nr_balance_failed = 0; 4440 sd->nr_balance_failed = 0;
@@ -4394,6 +4462,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4394 int pulled_task = 0; 4462 int pulled_task = 0;
4395 unsigned long next_balance = jiffies + HZ; 4463 unsigned long next_balance = jiffies + HZ;
4396 4464
4465 this_rq->idle_stamp = this_rq->clock;
4466
4467 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4468 return;
4469
4397 for_each_domain(this_cpu, sd) { 4470 for_each_domain(this_cpu, sd) {
4398 unsigned long interval; 4471 unsigned long interval;
4399 4472
@@ -4408,8 +4481,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4408 interval = msecs_to_jiffies(sd->balance_interval); 4481 interval = msecs_to_jiffies(sd->balance_interval);
4409 if (time_after(next_balance, sd->last_balance + interval)) 4482 if (time_after(next_balance, sd->last_balance + interval))
4410 next_balance = sd->last_balance + interval; 4483 next_balance = sd->last_balance + interval;
4411 if (pulled_task) 4484 if (pulled_task) {
4485 this_rq->idle_stamp = 0;
4412 break; 4486 break;
4487 }
4413 } 4488 }
4414 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4489 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4415 /* 4490 /*
@@ -4644,7 +4719,7 @@ int select_nohz_load_balancer(int stop_tick)
4644 cpumask_set_cpu(cpu, nohz.cpu_mask); 4719 cpumask_set_cpu(cpu, nohz.cpu_mask);
4645 4720
4646 /* time for ilb owner also to sleep */ 4721 /* time for ilb owner also to sleep */
4647 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4722 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4648 if (atomic_read(&nohz.load_balancer) == cpu) 4723 if (atomic_read(&nohz.load_balancer) == cpu)
4649 atomic_set(&nohz.load_balancer, -1); 4724 atomic_set(&nohz.load_balancer, -1);
4650 return 0; 4725 return 0;
@@ -5011,8 +5086,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5011 p->gtime = cputime_add(p->gtime, cputime); 5086 p->gtime = cputime_add(p->gtime, cputime);
5012 5087
5013 /* Add guest time to cpustat. */ 5088 /* Add guest time to cpustat. */
5014 cpustat->user = cputime64_add(cpustat->user, tmp); 5089 if (TASK_NICE(p) > 0) {
5015 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5090 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5091 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5092 } else {
5093 cpustat->user = cputime64_add(cpustat->user, tmp);
5094 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5095 }
5016} 5096}
5017 5097
5018/* 5098/*
@@ -5127,60 +5207,86 @@ void account_idle_ticks(unsigned long ticks)
5127 * Use precise platform statistics if available: 5207 * Use precise platform statistics if available:
5128 */ 5208 */
5129#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5209#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5130cputime_t task_utime(struct task_struct *p) 5210void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5131{ 5211{
5132 return p->utime; 5212 *ut = p->utime;
5213 *st = p->stime;
5133} 5214}
5134 5215
5135cputime_t task_stime(struct task_struct *p) 5216void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5136{ 5217{
5137 return p->stime; 5218 struct task_cputime cputime;
5219
5220 thread_group_cputime(p, &cputime);
5221
5222 *ut = cputime.utime;
5223 *st = cputime.stime;
5138} 5224}
5139#else 5225#else
5140cputime_t task_utime(struct task_struct *p) 5226
5227#ifndef nsecs_to_cputime
5228# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5229#endif
5230
5231void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5141{ 5232{
5142 clock_t utime = cputime_to_clock_t(p->utime), 5233 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5143 total = utime + cputime_to_clock_t(p->stime);
5144 u64 temp;
5145 5234
5146 /* 5235 /*
5147 * Use CFS's precise accounting: 5236 * Use CFS's precise accounting:
5148 */ 5237 */
5149 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5238 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5150 5239
5151 if (total) { 5240 if (total) {
5152 temp *= utime; 5241 u64 temp;
5242
5243 temp = (u64)(rtime * utime);
5153 do_div(temp, total); 5244 do_div(temp, total);
5154 } 5245 utime = (cputime_t)temp;
5155 utime = (clock_t)temp; 5246 } else
5247 utime = rtime;
5248
5249 /*
5250 * Compare with previous values, to keep monotonicity:
5251 */
5252 p->prev_utime = max(p->prev_utime, utime);
5253 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5156 5254
5157 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5255 *ut = p->prev_utime;
5158 return p->prev_utime; 5256 *st = p->prev_stime;
5159} 5257}
5160 5258
5161cputime_t task_stime(struct task_struct *p) 5259/*
5260 * Must be called with siglock held.
5261 */
5262void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5162{ 5263{
5163 clock_t stime; 5264 struct signal_struct *sig = p->signal;
5265 struct task_cputime cputime;
5266 cputime_t rtime, utime, total;
5164 5267
5165 /* 5268 thread_group_cputime(p, &cputime);
5166 * Use CFS's precise accounting. (we subtract utime from
5167 * the total, to make sure the total observed by userspace
5168 * grows monotonically - apps rely on that):
5169 */
5170 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5171 cputime_to_clock_t(task_utime(p));
5172 5269
5173 if (stime >= 0) 5270 total = cputime_add(cputime.utime, cputime.stime);
5174 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5271 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5175 5272
5176 return p->prev_stime; 5273 if (total) {
5177} 5274 u64 temp;
5178#endif
5179 5275
5180inline cputime_t task_gtime(struct task_struct *p) 5276 temp = (u64)(rtime * cputime.utime);
5181{ 5277 do_div(temp, total);
5182 return p->gtime; 5278 utime = (cputime_t)temp;
5279 } else
5280 utime = rtime;
5281
5282 sig->prev_utime = max(sig->prev_utime, utime);
5283 sig->prev_stime = max(sig->prev_stime,
5284 cputime_sub(rtime, sig->prev_utime));
5285
5286 *ut = sig->prev_utime;
5287 *st = sig->prev_stime;
5183} 5288}
5289#endif
5184 5290
5185/* 5291/*
5186 * This function gets called by the timer code, with HZ frequency. 5292 * This function gets called by the timer code, with HZ frequency.
@@ -5197,11 +5303,11 @@ void scheduler_tick(void)
5197 5303
5198 sched_clock_tick(); 5304 sched_clock_tick();
5199 5305
5200 spin_lock(&rq->lock); 5306 raw_spin_lock(&rq->lock);
5201 update_rq_clock(rq); 5307 update_rq_clock(rq);
5202 update_cpu_load(rq); 5308 update_cpu_load(rq);
5203 curr->sched_class->task_tick(rq, curr, 0); 5309 curr->sched_class->task_tick(rq, curr, 0);
5204 spin_unlock(&rq->lock); 5310 raw_spin_unlock(&rq->lock);
5205 5311
5206 perf_event_task_tick(curr, cpu); 5312 perf_event_task_tick(curr, cpu);
5207 5313
@@ -5315,13 +5421,14 @@ static inline void schedule_debug(struct task_struct *prev)
5315#endif 5421#endif
5316} 5422}
5317 5423
5318static void put_prev_task(struct rq *rq, struct task_struct *p) 5424static void put_prev_task(struct rq *rq, struct task_struct *prev)
5319{ 5425{
5320 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5426 if (prev->state == TASK_RUNNING) {
5427 u64 runtime = prev->se.sum_exec_runtime;
5321 5428
5322 update_avg(&p->se.avg_running, runtime); 5429 runtime -= prev->se.prev_sum_exec_runtime;
5430 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5323 5431
5324 if (p->state == TASK_RUNNING) {
5325 /* 5432 /*
5326 * In order to avoid avg_overlap growing stale when we are 5433 * In order to avoid avg_overlap growing stale when we are
5327 * indeed overlapping and hence not getting put to sleep, grow 5434 * indeed overlapping and hence not getting put to sleep, grow
@@ -5331,12 +5438,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5331 * correlates to the amount of cache footprint a task can 5438 * correlates to the amount of cache footprint a task can
5332 * build up. 5439 * build up.
5333 */ 5440 */
5334 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5441 update_avg(&prev->se.avg_overlap, runtime);
5335 update_avg(&p->se.avg_overlap, runtime);
5336 } else {
5337 update_avg(&p->se.avg_running, 0);
5338 } 5442 }
5339 p->sched_class->put_prev_task(rq, p); 5443 prev->sched_class->put_prev_task(rq, prev);
5340} 5444}
5341 5445
5342/* 5446/*
@@ -5397,7 +5501,7 @@ need_resched_nonpreemptible:
5397 if (sched_feat(HRTICK)) 5501 if (sched_feat(HRTICK))
5398 hrtick_clear(rq); 5502 hrtick_clear(rq);
5399 5503
5400 spin_lock_irq(&rq->lock); 5504 raw_spin_lock_irq(&rq->lock);
5401 update_rq_clock(rq); 5505 update_rq_clock(rq);
5402 clear_tsk_need_resched(prev); 5506 clear_tsk_need_resched(prev);
5403 5507
@@ -5433,12 +5537,15 @@ need_resched_nonpreemptible:
5433 cpu = smp_processor_id(); 5537 cpu = smp_processor_id();
5434 rq = cpu_rq(cpu); 5538 rq = cpu_rq(cpu);
5435 } else 5539 } else
5436 spin_unlock_irq(&rq->lock); 5540 raw_spin_unlock_irq(&rq->lock);
5437 5541
5438 post_schedule(rq); 5542 post_schedule(rq);
5439 5543
5440 if (unlikely(reacquire_kernel_lock(current) < 0)) 5544 if (unlikely(reacquire_kernel_lock(current) < 0)) {
5545 prev = rq->curr;
5546 switch_count = &prev->nivcsw;
5441 goto need_resched_nonpreemptible; 5547 goto need_resched_nonpreemptible;
5548 }
5442 5549
5443 preempt_enable_no_resched(); 5550 preempt_enable_no_resched();
5444 if (need_resched()) 5551 if (need_resched())
@@ -5446,7 +5553,7 @@ need_resched_nonpreemptible:
5446} 5553}
5447EXPORT_SYMBOL(schedule); 5554EXPORT_SYMBOL(schedule);
5448 5555
5449#ifdef CONFIG_SMP 5556#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5450/* 5557/*
5451 * Look out! "owner" is an entirely speculative pointer 5558 * Look out! "owner" is an entirely speculative pointer
5452 * access and not reliable. 5559 * access and not reliable.
@@ -5850,14 +5957,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
5850 */ 5957 */
5851bool try_wait_for_completion(struct completion *x) 5958bool try_wait_for_completion(struct completion *x)
5852{ 5959{
5960 unsigned long flags;
5853 int ret = 1; 5961 int ret = 1;
5854 5962
5855 spin_lock_irq(&x->wait.lock); 5963 spin_lock_irqsave(&x->wait.lock, flags);
5856 if (!x->done) 5964 if (!x->done)
5857 ret = 0; 5965 ret = 0;
5858 else 5966 else
5859 x->done--; 5967 x->done--;
5860 spin_unlock_irq(&x->wait.lock); 5968 spin_unlock_irqrestore(&x->wait.lock, flags);
5861 return ret; 5969 return ret;
5862} 5970}
5863EXPORT_SYMBOL(try_wait_for_completion); 5971EXPORT_SYMBOL(try_wait_for_completion);
@@ -5872,12 +5980,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
5872 */ 5980 */
5873bool completion_done(struct completion *x) 5981bool completion_done(struct completion *x)
5874{ 5982{
5983 unsigned long flags;
5875 int ret = 1; 5984 int ret = 1;
5876 5985
5877 spin_lock_irq(&x->wait.lock); 5986 spin_lock_irqsave(&x->wait.lock, flags);
5878 if (!x->done) 5987 if (!x->done)
5879 ret = 0; 5988 ret = 0;
5880 spin_unlock_irq(&x->wait.lock); 5989 spin_unlock_irqrestore(&x->wait.lock, flags);
5881 return ret; 5990 return ret;
5882} 5991}
5883EXPORT_SYMBOL(completion_done); 5992EXPORT_SYMBOL(completion_done);
@@ -6140,22 +6249,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6140 BUG_ON(p->se.on_rq); 6249 BUG_ON(p->se.on_rq);
6141 6250
6142 p->policy = policy; 6251 p->policy = policy;
6143 switch (p->policy) {
6144 case SCHED_NORMAL:
6145 case SCHED_BATCH:
6146 case SCHED_IDLE:
6147 p->sched_class = &fair_sched_class;
6148 break;
6149 case SCHED_FIFO:
6150 case SCHED_RR:
6151 p->sched_class = &rt_sched_class;
6152 break;
6153 }
6154
6155 p->rt_priority = prio; 6252 p->rt_priority = prio;
6156 p->normal_prio = normal_prio(p); 6253 p->normal_prio = normal_prio(p);
6157 /* we are holding p->pi_lock already */ 6254 /* we are holding p->pi_lock already */
6158 p->prio = rt_mutex_getprio(p); 6255 p->prio = rt_mutex_getprio(p);
6256 if (rt_prio(p->prio))
6257 p->sched_class = &rt_sched_class;
6258 else
6259 p->sched_class = &fair_sched_class;
6159 set_load_weight(p); 6260 set_load_weight(p);
6160} 6261}
6161 6262
@@ -6270,7 +6371,7 @@ recheck:
6270 * make sure no PI-waiters arrive (or leave) while we are 6371 * make sure no PI-waiters arrive (or leave) while we are
6271 * changing the priority of the task: 6372 * changing the priority of the task:
6272 */ 6373 */
6273 spin_lock_irqsave(&p->pi_lock, flags); 6374 raw_spin_lock_irqsave(&p->pi_lock, flags);
6274 /* 6375 /*
6275 * To be able to change p->policy safely, the apropriate 6376 * To be able to change p->policy safely, the apropriate
6276 * runqueue lock must be held. 6377 * runqueue lock must be held.
@@ -6280,7 +6381,7 @@ recheck:
6280 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 6381 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6281 policy = oldpolicy = -1; 6382 policy = oldpolicy = -1;
6282 __task_rq_unlock(rq); 6383 __task_rq_unlock(rq);
6283 spin_unlock_irqrestore(&p->pi_lock, flags); 6384 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6284 goto recheck; 6385 goto recheck;
6285 } 6386 }
6286 update_rq_clock(rq); 6387 update_rq_clock(rq);
@@ -6304,7 +6405,7 @@ recheck:
6304 check_class_changed(rq, p, prev_class, oldprio, running); 6405 check_class_changed(rq, p, prev_class, oldprio, running);
6305 } 6406 }
6306 __task_rq_unlock(rq); 6407 __task_rq_unlock(rq);
6307 spin_unlock_irqrestore(&p->pi_lock, flags); 6408 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6308 6409
6309 rt_mutex_adjust_pi(p); 6410 rt_mutex_adjust_pi(p);
6310 6411
@@ -6404,7 +6505,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6404 return -EINVAL; 6505 return -EINVAL;
6405 6506
6406 retval = -ESRCH; 6507 retval = -ESRCH;
6407 read_lock(&tasklist_lock); 6508 rcu_read_lock();
6408 p = find_process_by_pid(pid); 6509 p = find_process_by_pid(pid);
6409 if (p) { 6510 if (p) {
6410 retval = security_task_getscheduler(p); 6511 retval = security_task_getscheduler(p);
@@ -6412,7 +6513,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6412 retval = p->policy 6513 retval = p->policy
6413 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 6514 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6414 } 6515 }
6415 read_unlock(&tasklist_lock); 6516 rcu_read_unlock();
6416 return retval; 6517 return retval;
6417} 6518}
6418 6519
@@ -6430,7 +6531,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6430 if (!param || pid < 0) 6531 if (!param || pid < 0)
6431 return -EINVAL; 6532 return -EINVAL;
6432 6533
6433 read_lock(&tasklist_lock); 6534 rcu_read_lock();
6434 p = find_process_by_pid(pid); 6535 p = find_process_by_pid(pid);
6435 retval = -ESRCH; 6536 retval = -ESRCH;
6436 if (!p) 6537 if (!p)
@@ -6441,7 +6542,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6441 goto out_unlock; 6542 goto out_unlock;
6442 6543
6443 lp.sched_priority = p->rt_priority; 6544 lp.sched_priority = p->rt_priority;
6444 read_unlock(&tasklist_lock); 6545 rcu_read_unlock();
6445 6546
6446 /* 6547 /*
6447 * This one might sleep, we cannot do it with a spinlock held ... 6548 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6451,7 +6552,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6451 return retval; 6552 return retval;
6452 6553
6453out_unlock: 6554out_unlock:
6454 read_unlock(&tasklist_lock); 6555 rcu_read_unlock();
6455 return retval; 6556 return retval;
6456} 6557}
6457 6558
@@ -6462,22 +6563,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6462 int retval; 6563 int retval;
6463 6564
6464 get_online_cpus(); 6565 get_online_cpus();
6465 read_lock(&tasklist_lock); 6566 rcu_read_lock();
6466 6567
6467 p = find_process_by_pid(pid); 6568 p = find_process_by_pid(pid);
6468 if (!p) { 6569 if (!p) {
6469 read_unlock(&tasklist_lock); 6570 rcu_read_unlock();
6470 put_online_cpus(); 6571 put_online_cpus();
6471 return -ESRCH; 6572 return -ESRCH;
6472 } 6573 }
6473 6574
6474 /* 6575 /* Prevent p going away */
6475 * It is not safe to call set_cpus_allowed with the
6476 * tasklist_lock held. We will bump the task_struct's
6477 * usage count and then drop tasklist_lock.
6478 */
6479 get_task_struct(p); 6576 get_task_struct(p);
6480 read_unlock(&tasklist_lock); 6577 rcu_read_unlock();
6481 6578
6482 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 6579 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6483 retval = -ENOMEM; 6580 retval = -ENOMEM;
@@ -6558,10 +6655,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6558long sched_getaffinity(pid_t pid, struct cpumask *mask) 6655long sched_getaffinity(pid_t pid, struct cpumask *mask)
6559{ 6656{
6560 struct task_struct *p; 6657 struct task_struct *p;
6658 unsigned long flags;
6659 struct rq *rq;
6561 int retval; 6660 int retval;
6562 6661
6563 get_online_cpus(); 6662 get_online_cpus();
6564 read_lock(&tasklist_lock); 6663 rcu_read_lock();
6565 6664
6566 retval = -ESRCH; 6665 retval = -ESRCH;
6567 p = find_process_by_pid(pid); 6666 p = find_process_by_pid(pid);
@@ -6572,10 +6671,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6572 if (retval) 6671 if (retval)
6573 goto out_unlock; 6672 goto out_unlock;
6574 6673
6674 rq = task_rq_lock(p, &flags);
6575 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6675 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6676 task_rq_unlock(rq, &flags);
6576 6677
6577out_unlock: 6678out_unlock:
6578 read_unlock(&tasklist_lock); 6679 rcu_read_unlock();
6579 put_online_cpus(); 6680 put_online_cpus();
6580 6681
6581 return retval; 6682 return retval;
@@ -6630,7 +6731,7 @@ SYSCALL_DEFINE0(sched_yield)
6630 */ 6731 */
6631 __release(rq->lock); 6732 __release(rq->lock);
6632 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 6733 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6633 _raw_spin_unlock(&rq->lock); 6734 do_raw_spin_unlock(&rq->lock);
6634 preempt_enable_no_resched(); 6735 preempt_enable_no_resched();
6635 6736
6636 schedule(); 6737 schedule();
@@ -6718,9 +6819,6 @@ EXPORT_SYMBOL(yield);
6718/* 6819/*
6719 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6820 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6720 * that process accounting knows that this is a task in IO wait state. 6821 * that process accounting knows that this is a task in IO wait state.
6721 *
6722 * But don't do that if it is a deliberate, throttling IO wait (this task
6723 * has set its backing_dev_info: the queue against which it should throttle)
6724 */ 6822 */
6725void __sched io_schedule(void) 6823void __sched io_schedule(void)
6726{ 6824{
@@ -6813,6 +6911,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6813{ 6911{
6814 struct task_struct *p; 6912 struct task_struct *p;
6815 unsigned int time_slice; 6913 unsigned int time_slice;
6914 unsigned long flags;
6915 struct rq *rq;
6816 int retval; 6916 int retval;
6817 struct timespec t; 6917 struct timespec t;
6818 6918
@@ -6820,7 +6920,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6820 return -EINVAL; 6920 return -EINVAL;
6821 6921
6822 retval = -ESRCH; 6922 retval = -ESRCH;
6823 read_lock(&tasklist_lock); 6923 rcu_read_lock();
6824 p = find_process_by_pid(pid); 6924 p = find_process_by_pid(pid);
6825 if (!p) 6925 if (!p)
6826 goto out_unlock; 6926 goto out_unlock;
@@ -6829,15 +6929,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6829 if (retval) 6929 if (retval)
6830 goto out_unlock; 6930 goto out_unlock;
6831 6931
6832 time_slice = p->sched_class->get_rr_interval(p); 6932 rq = task_rq_lock(p, &flags);
6933 time_slice = p->sched_class->get_rr_interval(rq, p);
6934 task_rq_unlock(rq, &flags);
6833 6935
6834 read_unlock(&tasklist_lock); 6936 rcu_read_unlock();
6835 jiffies_to_timespec(time_slice, &t); 6937 jiffies_to_timespec(time_slice, &t);
6836 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6938 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6837 return retval; 6939 return retval;
6838 6940
6839out_unlock: 6941out_unlock:
6840 read_unlock(&tasklist_lock); 6942 rcu_read_unlock();
6841 return retval; 6943 return retval;
6842} 6944}
6843 6945
@@ -6903,7 +7005,7 @@ void show_state_filter(unsigned long state_filter)
6903 /* 7005 /*
6904 * Only show locks if all tasks are dumped: 7006 * Only show locks if all tasks are dumped:
6905 */ 7007 */
6906 if (state_filter == -1) 7008 if (!state_filter)
6907 debug_show_all_locks(); 7009 debug_show_all_locks();
6908} 7010}
6909 7011
@@ -6925,12 +7027,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6925 struct rq *rq = cpu_rq(cpu); 7027 struct rq *rq = cpu_rq(cpu);
6926 unsigned long flags; 7028 unsigned long flags;
6927 7029
6928 spin_lock_irqsave(&rq->lock, flags); 7030 raw_spin_lock_irqsave(&rq->lock, flags);
6929 7031
6930 __sched_fork(idle); 7032 __sched_fork(idle);
7033 idle->state = TASK_RUNNING;
6931 idle->se.exec_start = sched_clock(); 7034 idle->se.exec_start = sched_clock();
6932 7035
6933 idle->prio = idle->normal_prio = MAX_PRIO;
6934 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 7036 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6935 __set_task_cpu(idle, cpu); 7037 __set_task_cpu(idle, cpu);
6936 7038
@@ -6938,7 +7040,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6938#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 7040#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6939 idle->oncpu = 1; 7041 idle->oncpu = 1;
6940#endif 7042#endif
6941 spin_unlock_irqrestore(&rq->lock, flags); 7043 raw_spin_unlock_irqrestore(&rq->lock, flags);
6942 7044
6943 /* Set the preempt count _outside_ the spinlocks! */ 7045 /* Set the preempt count _outside_ the spinlocks! */
6944#if defined(CONFIG_PREEMPT) 7046#if defined(CONFIG_PREEMPT)
@@ -6971,22 +7073,43 @@ cpumask_var_t nohz_cpu_mask;
6971 * 7073 *
6972 * This idea comes from the SD scheduler of Con Kolivas: 7074 * This idea comes from the SD scheduler of Con Kolivas:
6973 */ 7075 */
6974static inline void sched_init_granularity(void) 7076static int get_update_sysctl_factor(void)
6975{ 7077{
6976 unsigned int factor = 1 + ilog2(num_online_cpus()); 7078 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6977 const unsigned long limit = 200000000; 7079 unsigned int factor;
6978 7080
6979 sysctl_sched_min_granularity *= factor; 7081 switch (sysctl_sched_tunable_scaling) {
6980 if (sysctl_sched_min_granularity > limit) 7082 case SCHED_TUNABLESCALING_NONE:
6981 sysctl_sched_min_granularity = limit; 7083 factor = 1;
7084 break;
7085 case SCHED_TUNABLESCALING_LINEAR:
7086 factor = cpus;
7087 break;
7088 case SCHED_TUNABLESCALING_LOG:
7089 default:
7090 factor = 1 + ilog2(cpus);
7091 break;
7092 }
6982 7093
6983 sysctl_sched_latency *= factor; 7094 return factor;
6984 if (sysctl_sched_latency > limit) 7095}
6985 sysctl_sched_latency = limit;
6986 7096
6987 sysctl_sched_wakeup_granularity *= factor; 7097static void update_sysctl(void)
7098{
7099 unsigned int factor = get_update_sysctl_factor();
7100
7101#define SET_SYSCTL(name) \
7102 (sysctl_##name = (factor) * normalized_sysctl_##name)
7103 SET_SYSCTL(sched_min_granularity);
7104 SET_SYSCTL(sched_latency);
7105 SET_SYSCTL(sched_wakeup_granularity);
7106 SET_SYSCTL(sched_shares_ratelimit);
7107#undef SET_SYSCTL
7108}
6988 7109
6989 sysctl_sched_shares_ratelimit *= factor; 7110static inline void sched_init_granularity(void)
7111{
7112 update_sysctl();
6990} 7113}
6991 7114
6992#ifdef CONFIG_SMP 7115#ifdef CONFIG_SMP
@@ -7022,8 +7145,28 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7022 struct rq *rq; 7145 struct rq *rq;
7023 int ret = 0; 7146 int ret = 0;
7024 7147
7148 /*
7149 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7150 * the ->cpus_allowed mask from under waking tasks, which would be
7151 * possible when we change rq->lock in ttwu(), so synchronize against
7152 * TASK_WAKING to avoid that.
7153 *
7154 * Make an exception for freshly cloned tasks, since cpuset namespaces
7155 * might move the task about, we have to validate the target in
7156 * wake_up_new_task() anyway since the cpu might have gone away.
7157 */
7158again:
7159 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7160 cpu_relax();
7161
7025 rq = task_rq_lock(p, &flags); 7162 rq = task_rq_lock(p, &flags);
7026 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7163
7164 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7165 task_rq_unlock(rq, &flags);
7166 goto again;
7167 }
7168
7169 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7027 ret = -EINVAL; 7170 ret = -EINVAL;
7028 goto out; 7171 goto out;
7029 } 7172 }
@@ -7045,7 +7188,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7045 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7188 if (cpumask_test_cpu(task_cpu(p), new_mask))
7046 goto out; 7189 goto out;
7047 7190
7048 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7191 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7049 /* Need help from migration thread: drop lock and wait. */ 7192 /* Need help from migration thread: drop lock and wait. */
7050 struct task_struct *mt = rq->migration_thread; 7193 struct task_struct *mt = rq->migration_thread;
7051 7194
@@ -7078,7 +7221,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7078static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 7221static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7079{ 7222{
7080 struct rq *rq_dest, *rq_src; 7223 struct rq *rq_dest, *rq_src;
7081 int ret = 0, on_rq; 7224 int ret = 0;
7082 7225
7083 if (unlikely(!cpu_active(dest_cpu))) 7226 if (unlikely(!cpu_active(dest_cpu)))
7084 return ret; 7227 return ret;
@@ -7094,12 +7237,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7094 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7237 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7095 goto fail; 7238 goto fail;
7096 7239
7097 on_rq = p->se.on_rq; 7240 /*
7098 if (on_rq) 7241 * If we're not on a rq, the next wake-up will ensure we're
7242 * placed properly.
7243 */
7244 if (p->se.on_rq) {
7099 deactivate_task(rq_src, p, 0); 7245 deactivate_task(rq_src, p, 0);
7100 7246 set_task_cpu(p, dest_cpu);
7101 set_task_cpu(p, dest_cpu);
7102 if (on_rq) {
7103 activate_task(rq_dest, p, 0); 7247 activate_task(rq_dest, p, 0);
7104 check_preempt_curr(rq_dest, p, 0); 7248 check_preempt_curr(rq_dest, p, 0);
7105 } 7249 }
@@ -7134,10 +7278,10 @@ static int migration_thread(void *data)
7134 struct migration_req *req; 7278 struct migration_req *req;
7135 struct list_head *head; 7279 struct list_head *head;
7136 7280
7137 spin_lock_irq(&rq->lock); 7281 raw_spin_lock_irq(&rq->lock);
7138 7282
7139 if (cpu_is_offline(cpu)) { 7283 if (cpu_is_offline(cpu)) {
7140 spin_unlock_irq(&rq->lock); 7284 raw_spin_unlock_irq(&rq->lock);
7141 break; 7285 break;
7142 } 7286 }
7143 7287
@@ -7149,7 +7293,7 @@ static int migration_thread(void *data)
7149 head = &rq->migration_queue; 7293 head = &rq->migration_queue;
7150 7294
7151 if (list_empty(head)) { 7295 if (list_empty(head)) {
7152 spin_unlock_irq(&rq->lock); 7296 raw_spin_unlock_irq(&rq->lock);
7153 schedule(); 7297 schedule();
7154 set_current_state(TASK_INTERRUPTIBLE); 7298 set_current_state(TASK_INTERRUPTIBLE);
7155 continue; 7299 continue;
@@ -7158,14 +7302,14 @@ static int migration_thread(void *data)
7158 list_del_init(head->next); 7302 list_del_init(head->next);
7159 7303
7160 if (req->task != NULL) { 7304 if (req->task != NULL) {
7161 spin_unlock(&rq->lock); 7305 raw_spin_unlock(&rq->lock);
7162 __migrate_task(req->task, cpu, req->dest_cpu); 7306 __migrate_task(req->task, cpu, req->dest_cpu);
7163 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7307 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7164 req->dest_cpu = RCU_MIGRATION_GOT_QS; 7308 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7165 spin_unlock(&rq->lock); 7309 raw_spin_unlock(&rq->lock);
7166 } else { 7310 } else {
7167 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7311 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7168 spin_unlock(&rq->lock); 7312 raw_spin_unlock(&rq->lock);
7169 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7313 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7170 } 7314 }
7171 local_irq_enable(); 7315 local_irq_enable();
@@ -7195,37 +7339,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7195static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 7339static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7196{ 7340{
7197 int dest_cpu; 7341 int dest_cpu;
7198 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7199 7342
7200again: 7343again:
7201 /* Look for allowed, online CPU in same node. */ 7344 dest_cpu = select_fallback_rq(dead_cpu, p);
7202 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
7203 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7204 goto move;
7205
7206 /* Any allowed, online CPU? */
7207 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
7208 if (dest_cpu < nr_cpu_ids)
7209 goto move;
7210
7211 /* No more Mr. Nice Guy. */
7212 if (dest_cpu >= nr_cpu_ids) {
7213 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7214 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
7215 7345
7216 /*
7217 * Don't tell them about moving exiting tasks or
7218 * kernel threads (both mm NULL), since they never
7219 * leave kernel.
7220 */
7221 if (p->mm && printk_ratelimit()) {
7222 printk(KERN_INFO "process %d (%s) no "
7223 "longer affine to cpu%d\n",
7224 task_pid_nr(p), p->comm, dead_cpu);
7225 }
7226 }
7227
7228move:
7229 /* It can have affinity changed while we were choosing. */ 7346 /* It can have affinity changed while we were choosing. */
7230 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 7347 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7231 goto again; 7348 goto again;
@@ -7240,7 +7357,7 @@ move:
7240 */ 7357 */
7241static void migrate_nr_uninterruptible(struct rq *rq_src) 7358static void migrate_nr_uninterruptible(struct rq *rq_src)
7242{ 7359{
7243 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7360 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7244 unsigned long flags; 7361 unsigned long flags;
7245 7362
7246 local_irq_save(flags); 7363 local_irq_save(flags);
@@ -7288,14 +7405,14 @@ void sched_idle_next(void)
7288 * Strictly not necessary since rest of the CPUs are stopped by now 7405 * Strictly not necessary since rest of the CPUs are stopped by now
7289 * and interrupts disabled on the current cpu. 7406 * and interrupts disabled on the current cpu.
7290 */ 7407 */
7291 spin_lock_irqsave(&rq->lock, flags); 7408 raw_spin_lock_irqsave(&rq->lock, flags);
7292 7409
7293 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7410 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7294 7411
7295 update_rq_clock(rq); 7412 update_rq_clock(rq);
7296 activate_task(rq, p, 0); 7413 activate_task(rq, p, 0);
7297 7414
7298 spin_unlock_irqrestore(&rq->lock, flags); 7415 raw_spin_unlock_irqrestore(&rq->lock, flags);
7299} 7416}
7300 7417
7301/* 7418/*
@@ -7331,9 +7448,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7331 * that's OK. No task can be added to this CPU, so iteration is 7448 * that's OK. No task can be added to this CPU, so iteration is
7332 * fine. 7449 * fine.
7333 */ 7450 */
7334 spin_unlock_irq(&rq->lock); 7451 raw_spin_unlock_irq(&rq->lock);
7335 move_task_off_dead_cpu(dead_cpu, p); 7452 move_task_off_dead_cpu(dead_cpu, p);
7336 spin_lock_irq(&rq->lock); 7453 raw_spin_lock_irq(&rq->lock);
7337 7454
7338 put_task_struct(p); 7455 put_task_struct(p);
7339} 7456}
@@ -7374,17 +7491,16 @@ static struct ctl_table sd_ctl_dir[] = {
7374 .procname = "sched_domain", 7491 .procname = "sched_domain",
7375 .mode = 0555, 7492 .mode = 0555,
7376 }, 7493 },
7377 {0, }, 7494 {}
7378}; 7495};
7379 7496
7380static struct ctl_table sd_ctl_root[] = { 7497static struct ctl_table sd_ctl_root[] = {
7381 { 7498 {
7382 .ctl_name = CTL_KERN,
7383 .procname = "kernel", 7499 .procname = "kernel",
7384 .mode = 0555, 7500 .mode = 0555,
7385 .child = sd_ctl_dir, 7501 .child = sd_ctl_dir,
7386 }, 7502 },
7387 {0, }, 7503 {}
7388}; 7504};
7389 7505
7390static struct ctl_table *sd_alloc_ctl_entry(int n) 7506static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7494,7 +7610,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7494static struct ctl_table_header *sd_sysctl_header; 7610static struct ctl_table_header *sd_sysctl_header;
7495static void register_sched_domain_sysctl(void) 7611static void register_sched_domain_sysctl(void)
7496{ 7612{
7497 int i, cpu_num = num_online_cpus(); 7613 int i, cpu_num = num_possible_cpus();
7498 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7614 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7499 char buf[32]; 7615 char buf[32];
7500 7616
@@ -7504,7 +7620,7 @@ static void register_sched_domain_sysctl(void)
7504 if (entry == NULL) 7620 if (entry == NULL)
7505 return; 7621 return;
7506 7622
7507 for_each_online_cpu(i) { 7623 for_each_possible_cpu(i) {
7508 snprintf(buf, 32, "cpu%d", i); 7624 snprintf(buf, 32, "cpu%d", i);
7509 entry->procname = kstrdup(buf, GFP_KERNEL); 7625 entry->procname = kstrdup(buf, GFP_KERNEL);
7510 entry->mode = 0555; 7626 entry->mode = 0555;
@@ -7600,13 +7716,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7600 7716
7601 /* Update our root-domain */ 7717 /* Update our root-domain */
7602 rq = cpu_rq(cpu); 7718 rq = cpu_rq(cpu);
7603 spin_lock_irqsave(&rq->lock, flags); 7719 raw_spin_lock_irqsave(&rq->lock, flags);
7604 if (rq->rd) { 7720 if (rq->rd) {
7605 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7721 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7606 7722
7607 set_rq_online(rq); 7723 set_rq_online(rq);
7608 } 7724 }
7609 spin_unlock_irqrestore(&rq->lock, flags); 7725 raw_spin_unlock_irqrestore(&rq->lock, flags);
7610 break; 7726 break;
7611 7727
7612#ifdef CONFIG_HOTPLUG_CPU 7728#ifdef CONFIG_HOTPLUG_CPU
@@ -7631,14 +7747,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7631 put_task_struct(rq->migration_thread); 7747 put_task_struct(rq->migration_thread);
7632 rq->migration_thread = NULL; 7748 rq->migration_thread = NULL;
7633 /* Idle task back to normal (off runqueue, low prio) */ 7749 /* Idle task back to normal (off runqueue, low prio) */
7634 spin_lock_irq(&rq->lock); 7750 raw_spin_lock_irq(&rq->lock);
7635 update_rq_clock(rq); 7751 update_rq_clock(rq);
7636 deactivate_task(rq, rq->idle, 0); 7752 deactivate_task(rq, rq->idle, 0);
7637 rq->idle->static_prio = MAX_PRIO;
7638 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7753 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7639 rq->idle->sched_class = &idle_sched_class; 7754 rq->idle->sched_class = &idle_sched_class;
7640 migrate_dead_tasks(cpu); 7755 migrate_dead_tasks(cpu);
7641 spin_unlock_irq(&rq->lock); 7756 raw_spin_unlock_irq(&rq->lock);
7642 cpuset_unlock(); 7757 cpuset_unlock();
7643 migrate_nr_uninterruptible(rq); 7758 migrate_nr_uninterruptible(rq);
7644 BUG_ON(rq->nr_running != 0); 7759 BUG_ON(rq->nr_running != 0);
@@ -7648,30 +7763,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7648 * they didn't take sched_hotcpu_mutex. Just wake up 7763 * they didn't take sched_hotcpu_mutex. Just wake up
7649 * the requestors. 7764 * the requestors.
7650 */ 7765 */
7651 spin_lock_irq(&rq->lock); 7766 raw_spin_lock_irq(&rq->lock);
7652 while (!list_empty(&rq->migration_queue)) { 7767 while (!list_empty(&rq->migration_queue)) {
7653 struct migration_req *req; 7768 struct migration_req *req;
7654 7769
7655 req = list_entry(rq->migration_queue.next, 7770 req = list_entry(rq->migration_queue.next,
7656 struct migration_req, list); 7771 struct migration_req, list);
7657 list_del_init(&req->list); 7772 list_del_init(&req->list);
7658 spin_unlock_irq(&rq->lock); 7773 raw_spin_unlock_irq(&rq->lock);
7659 complete(&req->done); 7774 complete(&req->done);
7660 spin_lock_irq(&rq->lock); 7775 raw_spin_lock_irq(&rq->lock);
7661 } 7776 }
7662 spin_unlock_irq(&rq->lock); 7777 raw_spin_unlock_irq(&rq->lock);
7663 break; 7778 break;
7664 7779
7665 case CPU_DYING: 7780 case CPU_DYING:
7666 case CPU_DYING_FROZEN: 7781 case CPU_DYING_FROZEN:
7667 /* Update our root-domain */ 7782 /* Update our root-domain */
7668 rq = cpu_rq(cpu); 7783 rq = cpu_rq(cpu);
7669 spin_lock_irqsave(&rq->lock, flags); 7784 raw_spin_lock_irqsave(&rq->lock, flags);
7670 if (rq->rd) { 7785 if (rq->rd) {
7671 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7786 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7672 set_rq_offline(rq); 7787 set_rq_offline(rq);
7673 } 7788 }
7674 spin_unlock_irqrestore(&rq->lock, flags); 7789 raw_spin_unlock_irqrestore(&rq->lock, flags);
7675 break; 7790 break;
7676#endif 7791#endif
7677 } 7792 }
@@ -7708,6 +7823,16 @@ early_initcall(migration_init);
7708 7823
7709#ifdef CONFIG_SCHED_DEBUG 7824#ifdef CONFIG_SCHED_DEBUG
7710 7825
7826static __read_mostly int sched_domain_debug_enabled;
7827
7828static int __init sched_domain_debug_setup(char *str)
7829{
7830 sched_domain_debug_enabled = 1;
7831
7832 return 0;
7833}
7834early_param("sched_debug", sched_domain_debug_setup);
7835
7711static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7836static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7712 struct cpumask *groupmask) 7837 struct cpumask *groupmask)
7713{ 7838{
@@ -7794,6 +7919,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7794 cpumask_var_t groupmask; 7919 cpumask_var_t groupmask;
7795 int level = 0; 7920 int level = 0;
7796 7921
7922 if (!sched_domain_debug_enabled)
7923 return;
7924
7797 if (!sd) { 7925 if (!sd) {
7798 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7926 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7799 return; 7927 return;
@@ -7873,6 +8001,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7873 8001
7874static void free_rootdomain(struct root_domain *rd) 8002static void free_rootdomain(struct root_domain *rd)
7875{ 8003{
8004 synchronize_sched();
8005
7876 cpupri_cleanup(&rd->cpupri); 8006 cpupri_cleanup(&rd->cpupri);
7877 8007
7878 free_cpumask_var(rd->rto_mask); 8008 free_cpumask_var(rd->rto_mask);
@@ -7886,7 +8016,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7886 struct root_domain *old_rd = NULL; 8016 struct root_domain *old_rd = NULL;
7887 unsigned long flags; 8017 unsigned long flags;
7888 8018
7889 spin_lock_irqsave(&rq->lock, flags); 8019 raw_spin_lock_irqsave(&rq->lock, flags);
7890 8020
7891 if (rq->rd) { 8021 if (rq->rd) {
7892 old_rd = rq->rd; 8022 old_rd = rq->rd;
@@ -7912,7 +8042,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7912 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 8042 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7913 set_rq_online(rq); 8043 set_rq_online(rq);
7914 8044
7915 spin_unlock_irqrestore(&rq->lock, flags); 8045 raw_spin_unlock_irqrestore(&rq->lock, flags);
7916 8046
7917 if (old_rd) 8047 if (old_rd)
7918 free_rootdomain(old_rd); 8048 free_rootdomain(old_rd);
@@ -8013,6 +8143,7 @@ static cpumask_var_t cpu_isolated_map;
8013/* Setup the mask of cpus configured for isolated domains */ 8143/* Setup the mask of cpus configured for isolated domains */
8014static int __init isolated_cpu_setup(char *str) 8144static int __init isolated_cpu_setup(char *str)
8015{ 8145{
8146 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8016 cpulist_parse(str, cpu_isolated_map); 8147 cpulist_parse(str, cpu_isolated_map);
8017 return 1; 8148 return 1;
8018} 8149}
@@ -8197,14 +8328,14 @@ enum s_alloc {
8197 */ 8328 */
8198#ifdef CONFIG_SCHED_SMT 8329#ifdef CONFIG_SCHED_SMT
8199static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 8330static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8200static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 8331static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
8201 8332
8202static int 8333static int
8203cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 8334cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8204 struct sched_group **sg, struct cpumask *unused) 8335 struct sched_group **sg, struct cpumask *unused)
8205{ 8336{
8206 if (sg) 8337 if (sg)
8207 *sg = &per_cpu(sched_group_cpus, cpu).sg; 8338 *sg = &per_cpu(sched_groups, cpu).sg;
8208 return cpu; 8339 return cpu;
8209} 8340}
8210#endif /* CONFIG_SCHED_SMT */ 8341#endif /* CONFIG_SCHED_SMT */
@@ -8849,7 +8980,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8849 return __build_sched_domains(cpu_map, NULL); 8980 return __build_sched_domains(cpu_map, NULL);
8850} 8981}
8851 8982
8852static struct cpumask *doms_cur; /* current sched domains */ 8983static cpumask_var_t *doms_cur; /* current sched domains */
8853static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8984static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8854static struct sched_domain_attr *dattr_cur; 8985static struct sched_domain_attr *dattr_cur;
8855 /* attributes of custom domains in 'doms_cur' */ 8986 /* attributes of custom domains in 'doms_cur' */
@@ -8871,6 +9002,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8871 return 0; 9002 return 0;
8872} 9003}
8873 9004
9005cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
9006{
9007 int i;
9008 cpumask_var_t *doms;
9009
9010 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
9011 if (!doms)
9012 return NULL;
9013 for (i = 0; i < ndoms; i++) {
9014 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
9015 free_sched_domains(doms, i);
9016 return NULL;
9017 }
9018 }
9019 return doms;
9020}
9021
9022void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
9023{
9024 unsigned int i;
9025 for (i = 0; i < ndoms; i++)
9026 free_cpumask_var(doms[i]);
9027 kfree(doms);
9028}
9029
8874/* 9030/*
8875 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 9031 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8876 * For now this just excludes isolated cpus, but could be used to 9032 * For now this just excludes isolated cpus, but could be used to
@@ -8882,12 +9038,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
8882 9038
8883 arch_update_cpu_topology(); 9039 arch_update_cpu_topology();
8884 ndoms_cur = 1; 9040 ndoms_cur = 1;
8885 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 9041 doms_cur = alloc_sched_domains(ndoms_cur);
8886 if (!doms_cur) 9042 if (!doms_cur)
8887 doms_cur = fallback_doms; 9043 doms_cur = &fallback_doms;
8888 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 9044 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8889 dattr_cur = NULL; 9045 dattr_cur = NULL;
8890 err = build_sched_domains(doms_cur); 9046 err = build_sched_domains(doms_cur[0]);
8891 register_sched_domain_sysctl(); 9047 register_sched_domain_sysctl();
8892 9048
8893 return err; 9049 return err;
@@ -8937,19 +9093,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8937 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9093 * doms_new[] to the current sched domain partitioning, doms_cur[].
8938 * It destroys each deleted domain and builds each new domain. 9094 * It destroys each deleted domain and builds each new domain.
8939 * 9095 *
8940 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9096 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
8941 * The masks don't intersect (don't overlap.) We should setup one 9097 * The masks don't intersect (don't overlap.) We should setup one
8942 * sched domain for each mask. CPUs not in any of the cpumasks will 9098 * sched domain for each mask. CPUs not in any of the cpumasks will
8943 * not be load balanced. If the same cpumask appears both in the 9099 * not be load balanced. If the same cpumask appears both in the
8944 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9100 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8945 * it as it is. 9101 * it as it is.
8946 * 9102 *
8947 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9103 * The passed in 'doms_new' should be allocated using
8948 * ownership of it and will kfree it when done with it. If the caller 9104 * alloc_sched_domains. This routine takes ownership of it and will
8949 * failed the kmalloc call, then it can pass in doms_new == NULL && 9105 * free_sched_domains it when done with it. If the caller failed the
8950 * ndoms_new == 1, and partition_sched_domains() will fallback to 9106 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
8951 * the single partition 'fallback_doms', it also forces the domains 9107 * and partition_sched_domains() will fallback to the single partition
8952 * to be rebuilt. 9108 * 'fallback_doms', it also forces the domains to be rebuilt.
8953 * 9109 *
8954 * If doms_new == NULL it will be replaced with cpu_online_mask. 9110 * If doms_new == NULL it will be replaced with cpu_online_mask.
8955 * ndoms_new == 0 is a special case for destroying existing domains, 9111 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8957,8 +9113,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8957 * 9113 *
8958 * Call with hotplug lock held 9114 * Call with hotplug lock held
8959 */ 9115 */
8960/* FIXME: Change to struct cpumask *doms_new[] */ 9116void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8961void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8962 struct sched_domain_attr *dattr_new) 9117 struct sched_domain_attr *dattr_new)
8963{ 9118{
8964 int i, j, n; 9119 int i, j, n;
@@ -8977,40 +9132,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8977 /* Destroy deleted domains */ 9132 /* Destroy deleted domains */
8978 for (i = 0; i < ndoms_cur; i++) { 9133 for (i = 0; i < ndoms_cur; i++) {
8979 for (j = 0; j < n && !new_topology; j++) { 9134 for (j = 0; j < n && !new_topology; j++) {
8980 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9135 if (cpumask_equal(doms_cur[i], doms_new[j])
8981 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9136 && dattrs_equal(dattr_cur, i, dattr_new, j))
8982 goto match1; 9137 goto match1;
8983 } 9138 }
8984 /* no match - a current sched domain not in new doms_new[] */ 9139 /* no match - a current sched domain not in new doms_new[] */
8985 detach_destroy_domains(doms_cur + i); 9140 detach_destroy_domains(doms_cur[i]);
8986match1: 9141match1:
8987 ; 9142 ;
8988 } 9143 }
8989 9144
8990 if (doms_new == NULL) { 9145 if (doms_new == NULL) {
8991 ndoms_cur = 0; 9146 ndoms_cur = 0;
8992 doms_new = fallback_doms; 9147 doms_new = &fallback_doms;
8993 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9148 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
8994 WARN_ON_ONCE(dattr_new); 9149 WARN_ON_ONCE(dattr_new);
8995 } 9150 }
8996 9151
8997 /* Build new domains */ 9152 /* Build new domains */
8998 for (i = 0; i < ndoms_new; i++) { 9153 for (i = 0; i < ndoms_new; i++) {
8999 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9154 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9000 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9155 if (cpumask_equal(doms_new[i], doms_cur[j])
9001 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9156 && dattrs_equal(dattr_new, i, dattr_cur, j))
9002 goto match2; 9157 goto match2;
9003 } 9158 }
9004 /* no match - add a new doms_new */ 9159 /* no match - add a new doms_new */
9005 __build_sched_domains(doms_new + i, 9160 __build_sched_domains(doms_new[i],
9006 dattr_new ? dattr_new + i : NULL); 9161 dattr_new ? dattr_new + i : NULL);
9007match2: 9162match2:
9008 ; 9163 ;
9009 } 9164 }
9010 9165
9011 /* Remember the new sched domains */ 9166 /* Remember the new sched domains */
9012 if (doms_cur != fallback_doms) 9167 if (doms_cur != &fallback_doms)
9013 kfree(doms_cur); 9168 free_sched_domains(doms_cur, ndoms_cur);
9014 kfree(dattr_cur); /* kfree(NULL) is safe */ 9169 kfree(dattr_cur); /* kfree(NULL) is safe */
9015 doms_cur = doms_new; 9170 doms_cur = doms_new;
9016 dattr_cur = dattr_new; 9171 dattr_cur = dattr_new;
@@ -9121,8 +9276,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9121 switch (action) { 9276 switch (action) {
9122 case CPU_ONLINE: 9277 case CPU_ONLINE:
9123 case CPU_ONLINE_FROZEN: 9278 case CPU_ONLINE_FROZEN:
9124 case CPU_DEAD: 9279 case CPU_DOWN_PREPARE:
9125 case CPU_DEAD_FROZEN: 9280 case CPU_DOWN_PREPARE_FROZEN:
9281 case CPU_DOWN_FAILED:
9282 case CPU_DOWN_FAILED_FROZEN:
9126 partition_sched_domains(1, NULL, NULL); 9283 partition_sched_domains(1, NULL, NULL);
9127 return NOTIFY_OK; 9284 return NOTIFY_OK;
9128 9285
@@ -9169,7 +9326,7 @@ void __init sched_init_smp(void)
9169#endif 9326#endif
9170 get_online_cpus(); 9327 get_online_cpus();
9171 mutex_lock(&sched_domains_mutex); 9328 mutex_lock(&sched_domains_mutex);
9172 arch_init_sched_domains(cpu_online_mask); 9329 arch_init_sched_domains(cpu_active_mask);
9173 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9330 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9174 if (cpumask_empty(non_isolated_cpus)) 9331 if (cpumask_empty(non_isolated_cpus))
9175 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9332 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9242,13 +9399,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9242#ifdef CONFIG_SMP 9399#ifdef CONFIG_SMP
9243 rt_rq->rt_nr_migratory = 0; 9400 rt_rq->rt_nr_migratory = 0;
9244 rt_rq->overloaded = 0; 9401 rt_rq->overloaded = 0;
9245 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 9402 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
9246#endif 9403#endif
9247 9404
9248 rt_rq->rt_time = 0; 9405 rt_rq->rt_time = 0;
9249 rt_rq->rt_throttled = 0; 9406 rt_rq->rt_throttled = 0;
9250 rt_rq->rt_runtime = 0; 9407 rt_rq->rt_runtime = 0;
9251 spin_lock_init(&rt_rq->rt_runtime_lock); 9408 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
9252 9409
9253#ifdef CONFIG_RT_GROUP_SCHED 9410#ifdef CONFIG_RT_GROUP_SCHED
9254 rt_rq->rt_nr_boosted = 0; 9411 rt_rq->rt_nr_boosted = 0;
@@ -9332,10 +9489,6 @@ void __init sched_init(void)
9332#ifdef CONFIG_CPUMASK_OFFSTACK 9489#ifdef CONFIG_CPUMASK_OFFSTACK
9333 alloc_size += num_possible_cpus() * cpumask_size(); 9490 alloc_size += num_possible_cpus() * cpumask_size();
9334#endif 9491#endif
9335 /*
9336 * As sched_init() is called before page_alloc is setup,
9337 * we use alloc_bootmem().
9338 */
9339 if (alloc_size) { 9492 if (alloc_size) {
9340 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9493 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9341 9494
@@ -9404,11 +9557,15 @@ void __init sched_init(void)
9404#endif /* CONFIG_USER_SCHED */ 9557#endif /* CONFIG_USER_SCHED */
9405#endif /* CONFIG_GROUP_SCHED */ 9558#endif /* CONFIG_GROUP_SCHED */
9406 9559
9560#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9561 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
9562 __alignof__(unsigned long));
9563#endif
9407 for_each_possible_cpu(i) { 9564 for_each_possible_cpu(i) {
9408 struct rq *rq; 9565 struct rq *rq;
9409 9566
9410 rq = cpu_rq(i); 9567 rq = cpu_rq(i);
9411 spin_lock_init(&rq->lock); 9568 raw_spin_lock_init(&rq->lock);
9412 rq->nr_running = 0; 9569 rq->nr_running = 0;
9413 rq->calc_load_active = 0; 9570 rq->calc_load_active = 0;
9414 rq->calc_load_update = jiffies + LOAD_FREQ; 9571 rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9468,7 +9625,7 @@ void __init sched_init(void)
9468#elif defined CONFIG_USER_SCHED 9625#elif defined CONFIG_USER_SCHED
9469 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); 9626 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9470 init_tg_rt_entry(&init_task_group, 9627 init_tg_rt_entry(&init_task_group,
9471 &per_cpu(init_rt_rq, i), 9628 &per_cpu(init_rt_rq_var, i),
9472 &per_cpu(init_sched_rt_entity, i), i, 1, 9629 &per_cpu(init_sched_rt_entity, i), i, 1,
9473 root_task_group.rt_se[i]); 9630 root_task_group.rt_se[i]);
9474#endif 9631#endif
@@ -9486,6 +9643,8 @@ void __init sched_init(void)
9486 rq->cpu = i; 9643 rq->cpu = i;
9487 rq->online = 0; 9644 rq->online = 0;
9488 rq->migration_thread = NULL; 9645 rq->migration_thread = NULL;
9646 rq->idle_stamp = 0;
9647 rq->avg_idle = 2*sysctl_sched_migration_cost;
9489 INIT_LIST_HEAD(&rq->migration_queue); 9648 INIT_LIST_HEAD(&rq->migration_queue);
9490 rq_attach_root(rq, &def_root_domain); 9649 rq_attach_root(rq, &def_root_domain);
9491#endif 9650#endif
@@ -9504,7 +9663,7 @@ void __init sched_init(void)
9504#endif 9663#endif
9505 9664
9506#ifdef CONFIG_RT_MUTEXES 9665#ifdef CONFIG_RT_MUTEXES
9507 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 9666 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
9508#endif 9667#endif
9509 9668
9510 /* 9669 /*
@@ -9529,13 +9688,15 @@ void __init sched_init(void)
9529 current->sched_class = &fair_sched_class; 9688 current->sched_class = &fair_sched_class;
9530 9689
9531 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9690 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9532 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 9691 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9533#ifdef CONFIG_SMP 9692#ifdef CONFIG_SMP
9534#ifdef CONFIG_NO_HZ 9693#ifdef CONFIG_NO_HZ
9535 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9694 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9536 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9695 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9537#endif 9696#endif
9538 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9697 /* May be allocated at isolcpus cmdline parse time */
9698 if (cpu_isolated_map == NULL)
9699 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9539#endif /* SMP */ 9700#endif /* SMP */
9540 9701
9541 perf_event_init(); 9702 perf_event_init();
@@ -9546,7 +9707,7 @@ void __init sched_init(void)
9546#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9707#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9547static inline int preempt_count_equals(int preempt_offset) 9708static inline int preempt_count_equals(int preempt_offset)
9548{ 9709{
9549 int nested = preempt_count() & ~PREEMPT_ACTIVE; 9710 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
9550 9711
9551 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 9712 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9552} 9713}
@@ -9627,13 +9788,13 @@ void normalize_rt_tasks(void)
9627 continue; 9788 continue;
9628 } 9789 }
9629 9790
9630 spin_lock(&p->pi_lock); 9791 raw_spin_lock(&p->pi_lock);
9631 rq = __task_rq_lock(p); 9792 rq = __task_rq_lock(p);
9632 9793
9633 normalize_task(rq, p); 9794 normalize_task(rq, p);
9634 9795
9635 __task_rq_unlock(rq); 9796 __task_rq_unlock(rq);
9636 spin_unlock(&p->pi_lock); 9797 raw_spin_unlock(&p->pi_lock);
9637 } while_each_thread(g, p); 9798 } while_each_thread(g, p);
9638 9799
9639 read_unlock_irqrestore(&tasklist_lock, flags); 9800 read_unlock_irqrestore(&tasklist_lock, flags);
@@ -9729,13 +9890,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9729 se = kzalloc_node(sizeof(struct sched_entity), 9890 se = kzalloc_node(sizeof(struct sched_entity),
9730 GFP_KERNEL, cpu_to_node(i)); 9891 GFP_KERNEL, cpu_to_node(i));
9731 if (!se) 9892 if (!se)
9732 goto err; 9893 goto err_free_rq;
9733 9894
9734 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9895 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9735 } 9896 }
9736 9897
9737 return 1; 9898 return 1;
9738 9899
9900 err_free_rq:
9901 kfree(cfs_rq);
9739 err: 9902 err:
9740 return 0; 9903 return 0;
9741} 9904}
@@ -9817,13 +9980,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9817 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9980 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9818 GFP_KERNEL, cpu_to_node(i)); 9981 GFP_KERNEL, cpu_to_node(i));
9819 if (!rt_se) 9982 if (!rt_se)
9820 goto err; 9983 goto err_free_rq;
9821 9984
9822 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9985 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9823 } 9986 }
9824 9987
9825 return 1; 9988 return 1;
9826 9989
9990 err_free_rq:
9991 kfree(rt_rq);
9827 err: 9992 err:
9828 return 0; 9993 return 0;
9829} 9994}
@@ -9957,7 +10122,7 @@ void sched_move_task(struct task_struct *tsk)
9957 10122
9958#ifdef CONFIG_FAIR_GROUP_SCHED 10123#ifdef CONFIG_FAIR_GROUP_SCHED
9959 if (tsk->sched_class->moved_group) 10124 if (tsk->sched_class->moved_group)
9960 tsk->sched_class->moved_group(tsk); 10125 tsk->sched_class->moved_group(tsk, on_rq);
9961#endif 10126#endif
9962 10127
9963 if (unlikely(running)) 10128 if (unlikely(running))
@@ -9992,9 +10157,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
9992 struct rq *rq = cfs_rq->rq; 10157 struct rq *rq = cfs_rq->rq;
9993 unsigned long flags; 10158 unsigned long flags;
9994 10159
9995 spin_lock_irqsave(&rq->lock, flags); 10160 raw_spin_lock_irqsave(&rq->lock, flags);
9996 __set_se_shares(se, shares); 10161 __set_se_shares(se, shares);
9997 spin_unlock_irqrestore(&rq->lock, flags); 10162 raw_spin_unlock_irqrestore(&rq->lock, flags);
9998} 10163}
9999 10164
10000static DEFINE_MUTEX(shares_mutex); 10165static DEFINE_MUTEX(shares_mutex);
@@ -10179,18 +10344,18 @@ static int tg_set_bandwidth(struct task_group *tg,
10179 if (err) 10344 if (err)
10180 goto unlock; 10345 goto unlock;
10181 10346
10182 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10347 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10183 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 10348 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10184 tg->rt_bandwidth.rt_runtime = rt_runtime; 10349 tg->rt_bandwidth.rt_runtime = rt_runtime;
10185 10350
10186 for_each_possible_cpu(i) { 10351 for_each_possible_cpu(i) {
10187 struct rt_rq *rt_rq = tg->rt_rq[i]; 10352 struct rt_rq *rt_rq = tg->rt_rq[i];
10188 10353
10189 spin_lock(&rt_rq->rt_runtime_lock); 10354 raw_spin_lock(&rt_rq->rt_runtime_lock);
10190 rt_rq->rt_runtime = rt_runtime; 10355 rt_rq->rt_runtime = rt_runtime;
10191 spin_unlock(&rt_rq->rt_runtime_lock); 10356 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10192 } 10357 }
10193 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10358 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10194 unlock: 10359 unlock:
10195 read_unlock(&tasklist_lock); 10360 read_unlock(&tasklist_lock);
10196 mutex_unlock(&rt_constraints_mutex); 10361 mutex_unlock(&rt_constraints_mutex);
@@ -10295,15 +10460,15 @@ static int sched_rt_global_constraints(void)
10295 if (sysctl_sched_rt_runtime == 0) 10460 if (sysctl_sched_rt_runtime == 0)
10296 return -EBUSY; 10461 return -EBUSY;
10297 10462
10298 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10463 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10299 for_each_possible_cpu(i) { 10464 for_each_possible_cpu(i) {
10300 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10465 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10301 10466
10302 spin_lock(&rt_rq->rt_runtime_lock); 10467 raw_spin_lock(&rt_rq->rt_runtime_lock);
10303 rt_rq->rt_runtime = global_rt_runtime(); 10468 rt_rq->rt_runtime = global_rt_runtime();
10304 spin_unlock(&rt_rq->rt_runtime_lock); 10469 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10305 } 10470 }
10306 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 10471 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10307 10472
10308 return 0; 10473 return 0;
10309} 10474}
@@ -10594,9 +10759,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10594 /* 10759 /*
10595 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 10760 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10596 */ 10761 */
10597 spin_lock_irq(&cpu_rq(cpu)->lock); 10762 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10598 data = *cpuusage; 10763 data = *cpuusage;
10599 spin_unlock_irq(&cpu_rq(cpu)->lock); 10764 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10600#else 10765#else
10601 data = *cpuusage; 10766 data = *cpuusage;
10602#endif 10767#endif
@@ -10612,9 +10777,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10612 /* 10777 /*
10613 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 10778 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10614 */ 10779 */
10615 spin_lock_irq(&cpu_rq(cpu)->lock); 10780 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10616 *cpuusage = val; 10781 *cpuusage = val;
10617 spin_unlock_irq(&cpu_rq(cpu)->lock); 10782 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10618#else 10783#else
10619 *cpuusage = val; 10784 *cpuusage = val;
10620#endif 10785#endif
@@ -10848,9 +11013,9 @@ void synchronize_sched_expedited(void)
10848 init_completion(&req->done); 11013 init_completion(&req->done);
10849 req->task = NULL; 11014 req->task = NULL;
10850 req->dest_cpu = RCU_MIGRATION_NEED_QS; 11015 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10851 spin_lock_irqsave(&rq->lock, flags); 11016 raw_spin_lock_irqsave(&rq->lock, flags);
10852 list_add(&req->list, &rq->migration_queue); 11017 list_add(&req->list, &rq->migration_queue);
10853 spin_unlock_irqrestore(&rq->lock, flags); 11018 raw_spin_unlock_irqrestore(&rq->lock, flags);
10854 wake_up_process(rq->migration_thread); 11019 wake_up_process(rq->migration_thread);
10855 } 11020 }
10856 for_each_online_cpu(cpu) { 11021 for_each_online_cpu(cpu) {
@@ -10858,13 +11023,14 @@ void synchronize_sched_expedited(void)
10858 req = &per_cpu(rcu_migration_req, cpu); 11023 req = &per_cpu(rcu_migration_req, cpu);
10859 rq = cpu_rq(cpu); 11024 rq = cpu_rq(cpu);
10860 wait_for_completion(&req->done); 11025 wait_for_completion(&req->done);
10861 spin_lock_irqsave(&rq->lock, flags); 11026 raw_spin_lock_irqsave(&rq->lock, flags);
10862 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 11027 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10863 need_full_sync = 1; 11028 need_full_sync = 1;
10864 req->dest_cpu = RCU_MIGRATION_IDLE; 11029 req->dest_cpu = RCU_MIGRATION_IDLE;
10865 spin_unlock_irqrestore(&rq->lock, flags); 11030 raw_spin_unlock_irqrestore(&rq->lock, flags);
10866 } 11031 }
10867 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 11032 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
11033 synchronize_sched_expedited_count++;
10868 mutex_unlock(&rcu_sched_expedited_mutex); 11034 mutex_unlock(&rcu_sched_expedited_mutex);
10869 put_online_cpus(); 11035 put_online_cpus();
10870 if (need_full_sync) 11036 if (need_full_sync)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 479ce5682d7..5b496132c28 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
236} 236}
237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
238 238
239unsigned long long cpu_clock(int cpu)
240{
241 unsigned long long clock;
242 unsigned long flags;
243
244 local_irq_save(flags);
245 clock = sched_clock_cpu(cpu);
246 local_irq_restore(flags);
247
248 return clock;
249}
250
239#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 251#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
240 252
241void sched_clock_init(void) 253void sched_clock_init(void)
@@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu)
251 return sched_clock(); 263 return sched_clock();
252} 264}
253 265
254#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
255 266
256unsigned long long cpu_clock(int cpu) 267unsigned long long cpu_clock(int cpu)
257{ 268{
258 unsigned long long clock; 269 return sched_clock_cpu(cpu);
259 unsigned long flags; 270}
260 271
261 local_irq_save(flags); 272#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
262 clock = sched_clock_cpu(cpu);
263 local_irq_restore(flags);
264 273
265 return clock;
266}
267EXPORT_SYMBOL_GPL(cpu_clock); 274EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 0f052fc674d..597b33099df 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -135,26 +135,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
135 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
137 137
138 spin_lock_irqsave(&vec->lock, flags); 138 raw_spin_lock_irqsave(&vec->lock, flags);
139 139
140 cpumask_set_cpu(cpu, vec->mask); 140 cpumask_set_cpu(cpu, vec->mask);
141 vec->count++; 141 vec->count++;
142 if (vec->count == 1) 142 if (vec->count == 1)
143 set_bit(newpri, cp->pri_active); 143 set_bit(newpri, cp->pri_active);
144 144
145 spin_unlock_irqrestore(&vec->lock, flags); 145 raw_spin_unlock_irqrestore(&vec->lock, flags);
146 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) { 147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149 149
150 spin_lock_irqsave(&vec->lock, flags); 150 raw_spin_lock_irqsave(&vec->lock, flags);
151 151
152 vec->count--; 152 vec->count--;
153 if (!vec->count) 153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active); 154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask); 155 cpumask_clear_cpu(cpu, vec->mask);
156 156
157 spin_unlock_irqrestore(&vec->lock, flags); 157 raw_spin_unlock_irqrestore(&vec->lock, flags);
158 } 158 }
159 159
160 *currpri = newpri; 160 *currpri = newpri;
@@ -180,7 +180,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
181 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 181 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
182 182
183 spin_lock_init(&vec->lock); 183 raw_spin_lock_init(&vec->lock);
184 vec->count = 0; 184 vec->count = 0;
185 if (!zalloc_cpumask_var(&vec->mask, gfp)) 185 if (!zalloc_cpumask_var(&vec->mask, gfp))
186 goto cleanup; 186 goto cleanup;
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9a7e859b8fb..7cb5bb6b95b 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -12,7 +12,7 @@
12/* values 2-101 are RT priorities 0-99 */ 12/* values 2-101 are RT priorities 0-99 */
13 13
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 raw_spinlock_t lock;
16 int count; 16 int count;
17 cpumask_var_t mask; 17 cpumask_var_t mask;
18}; 18};
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc4..67f95aada4b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
185 SPLIT_NS(cfs_rq->exec_clock)); 185 SPLIT_NS(cfs_rq->exec_clock));
186 186
187 spin_lock_irqsave(&rq->lock, flags); 187 raw_spin_lock_irqsave(&rq->lock, flags);
188 if (cfs_rq->rb_leftmost) 188 if (cfs_rq->rb_leftmost)
189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
190 last = __pick_last_entity(cfs_rq); 190 last = __pick_last_entity(cfs_rq);
@@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 raw_spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", 198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
285 285
286#ifdef CONFIG_SCHEDSTATS 286#ifdef CONFIG_SCHEDSTATS
287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
288#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
288 289
289 P(yld_count); 290 P(yld_count);
290 291
291 P(sched_switch); 292 P(sched_switch);
292 P(sched_count); 293 P(sched_count);
293 P(sched_goidle); 294 P(sched_goidle);
295#ifdef CONFIG_SMP
296 P64(avg_idle);
297#endif
294 298
295 P(ttwu_count); 299 P(ttwu_count);
296 P(ttwu_local); 300 P(ttwu_local);
@@ -305,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
305 print_rq(m, rq, cpu); 309 print_rq(m, rq, cpu);
306} 310}
307 311
312static const char *sched_tunable_scaling_names[] = {
313 "none",
314 "logaritmic",
315 "linear"
316};
317
308static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
309{ 319{
310 u64 now = ktime_to_ns(ktime_get()); 320 u64 now = ktime_to_ns(ktime_get());
@@ -330,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
330#undef PN 340#undef PN
331#undef P 341#undef P
332 342
343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
344 sysctl_sched_tunable_scaling,
345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
346
333 for_each_online_cpu(cpu) 347 for_each_online_cpu(cpu)
334 print_cpu(m, cpu); 348 print_cpu(m, cpu);
335 349
@@ -395,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 409 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 410 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 411 PN(se.avg_wakeup);
398 PN(se.avg_running);
399 412
400 nr_switches = p->nvcsw + p->nivcsw; 413 nr_switches = p->nvcsw + p->nivcsw;
401 414
@@ -419,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
419 P(se.nr_failed_migrations_running); 432 P(se.nr_failed_migrations_running);
420 P(se.nr_failed_migrations_hot); 433 P(se.nr_failed_migrations_hot);
421 P(se.nr_forced_migrations); 434 P(se.nr_forced_migrations);
422 P(se.nr_forced2_migrations);
423 P(se.nr_wakeups); 435 P(se.nr_wakeups);
424 P(se.nr_wakeups_sync); 436 P(se.nr_wakeups_sync);
425 P(se.nr_wakeups_migrate); 437 P(se.nr_wakeups_migrate);
@@ -495,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
495 p->se.nr_failed_migrations_running = 0; 507 p->se.nr_failed_migrations_running = 0;
496 p->se.nr_failed_migrations_hot = 0; 508 p->se.nr_failed_migrations_hot = 0;
497 p->se.nr_forced_migrations = 0; 509 p->se.nr_forced_migrations = 0;
498 p->se.nr_forced2_migrations = 0;
499 p->se.nr_wakeups = 0; 510 p->se.nr_wakeups = 0;
500 p->se.nr_wakeups_sync = 0; 511 p->se.nr_wakeups_sync = 0;
501 p->se.nr_wakeups_migrate = 0; 512 p->se.nr_wakeups_migrate = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b47eed..8fe7ee81c55 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h>
24 25
25/* 26/*
26 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
36 */ 37 */
37unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 5000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL;
40
41/*
42 * The initial- and re-scaling of tunables is configurable
43 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
44 *
45 * Options are:
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
48 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
49 */
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
38 52
39/* 53/*
40 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 56 */
43unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 1000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44 59
45/* 60/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70 * have immediate wakeup/sleep latencies. 85 * have immediate wakeup/sleep latencies.
71 */ 86 */
72unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73 89
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 91
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
383 */ 399 */
384 400
385#ifdef CONFIG_SCHED_DEBUG 401#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 402int sched_proc_update_handler(struct ctl_table *table, int write,
387 void __user *buffer, size_t *lenp, 403 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 404 loff_t *ppos)
389{ 405{
390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 406 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
407 int factor = get_update_sysctl_factor();
391 408
392 if (ret || !write) 409 if (ret || !write)
393 return ret; 410 return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 412 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396 sysctl_sched_min_granularity); 413 sysctl_sched_min_granularity);
397 414
415#define WRT_SYSCTL(name) \
416 (normalized_sysctl_##name = sysctl_##name / (factor))
417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL
422
398 return 0; 423 return 0;
399} 424}
400#endif 425#endif
@@ -485,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
485 curr->sum_exec_runtime += delta_exec; 510 curr->sum_exec_runtime += delta_exec;
486 schedstat_add(cfs_rq, exec_clock, delta_exec); 511 schedstat_add(cfs_rq, exec_clock, delta_exec);
487 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 512 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
513
488 curr->vruntime += delta_exec_weighted; 514 curr->vruntime += delta_exec_weighted;
489 update_min_vruntime(cfs_rq); 515 update_min_vruntime(cfs_rq);
490} 516}
@@ -740,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
740 se->vruntime = vruntime; 766 se->vruntime = vruntime;
741} 767}
742 768
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
743static void 772static void
744enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
745{ 774{
746 /* 775 /*
776 * Update the normalized vruntime before updating min_vruntime
777 * through calling update_curr().
778 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
780 se->vruntime += cfs_rq->min_vruntime;
781
782 /*
747 * Update run-time statistics of the 'current'. 783 * Update run-time statistics of the 'current'.
748 */ 784 */
749 update_curr(cfs_rq); 785 update_curr(cfs_rq);
750 account_entity_enqueue(cfs_rq, se); 786 account_entity_enqueue(cfs_rq, se);
751 787
752 if (wakeup) { 788 if (flags & ENQUEUE_WAKEUP) {
753 place_entity(cfs_rq, se, 0); 789 place_entity(cfs_rq, se, 0);
754 enqueue_sleeper(cfs_rq, se); 790 enqueue_sleeper(cfs_rq, se);
755 } 791 }
@@ -803,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
803 __dequeue_entity(cfs_rq, se); 839 __dequeue_entity(cfs_rq, se);
804 account_entity_dequeue(cfs_rq, se); 840 account_entity_dequeue(cfs_rq, se);
805 update_min_vruntime(cfs_rq); 841 update_min_vruntime(cfs_rq);
842
843 /*
844 * Normalize the entity after updating the min_vruntime because the
845 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position.
847 */
848 if (!sleep)
849 se->vruntime -= cfs_rq->min_vruntime;
806} 850}
807 851
808/* 852/*
@@ -822,6 +866,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
822 * re-elected due to buddy favours. 866 * re-elected due to buddy favours.
823 */ 867 */
824 clear_buddies(cfs_rq, curr); 868 clear_buddies(cfs_rq, curr);
869 return;
870 }
871
872 /*
873 * Ensure that a task that missed wakeup preemption by a
874 * narrow margin doesn't have to wait for a full slice.
875 * This also mitigates buddy induced latencies under load.
876 */
877 if (!sched_feat(WAKEUP_PREEMPT))
878 return;
879
880 if (delta_exec < sysctl_sched_min_granularity)
881 return;
882
883 if (cfs_rq->nr_running > 1) {
884 struct sched_entity *se = __pick_next_entity(cfs_rq);
885 s64 delta = curr->vruntime - se->vruntime;
886
887 if (delta > ideal_runtime)
888 resched_task(rq_of(cfs_rq)->curr);
825 } 889 }
826} 890}
827 891
@@ -861,12 +925,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
861static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 925static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
862{ 926{
863 struct sched_entity *se = __pick_next_entity(cfs_rq); 927 struct sched_entity *se = __pick_next_entity(cfs_rq);
928 struct sched_entity *left = se;
864 929
865 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) 930 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
866 return cfs_rq->next; 931 se = cfs_rq->next;
867 932
868 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) 933 /*
869 return cfs_rq->last; 934 * Prefer last buddy, try to return the CPU to a preempted task.
935 */
936 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
937 se = cfs_rq->last;
938
939 clear_buddies(cfs_rq, se);
870 940
871 return se; 941 return se;
872} 942}
@@ -987,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
987{ 1057{
988 struct cfs_rq *cfs_rq; 1058 struct cfs_rq *cfs_rq;
989 struct sched_entity *se = &p->se; 1059 struct sched_entity *se = &p->se;
1060 int flags = 0;
1061
1062 if (wakeup)
1063 flags |= ENQUEUE_WAKEUP;
1064 if (p->state == TASK_WAKING)
1065 flags |= ENQUEUE_MIGRATE;
990 1066
991 for_each_sched_entity(se) { 1067 for_each_sched_entity(se) {
992 if (se->on_rq) 1068 if (se->on_rq)
993 break; 1069 break;
994 cfs_rq = cfs_rq_of(se); 1070 cfs_rq = cfs_rq_of(se);
995 enqueue_entity(cfs_rq, se, wakeup); 1071 enqueue_entity(cfs_rq, se, flags);
996 wakeup = 1; 1072 flags = ENQUEUE_WAKEUP;
997 } 1073 }
998 1074
999 hrtick_update(rq); 1075 hrtick_update(rq);
@@ -1069,6 +1145,14 @@ static void yield_task_fair(struct rq *rq)
1069 1145
1070#ifdef CONFIG_SMP 1146#ifdef CONFIG_SMP
1071 1147
1148static void task_waking_fair(struct rq *rq, struct task_struct *p)
1149{
1150 struct sched_entity *se = &p->se;
1151 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1152
1153 se->vruntime -= cfs_rq->min_vruntime;
1154}
1155
1072#ifdef CONFIG_FAIR_GROUP_SCHED 1156#ifdef CONFIG_FAIR_GROUP_SCHED
1073/* 1157/*
1074 * effective_load() calculates the load change as seen from the root_task_group 1158 * effective_load() calculates the load change as seen from the root_task_group
@@ -1319,6 +1403,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1319} 1403}
1320 1404
1321/* 1405/*
1406 * Try and locate an idle CPU in the sched_domain.
1407 */
1408static int
1409select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1410{
1411 int cpu = smp_processor_id();
1412 int prev_cpu = task_cpu(p);
1413 int i;
1414
1415 /*
1416 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
1417 * test in select_task_rq_fair) and the prev_cpu is idle then that's
1418 * always a better target than the current cpu.
1419 */
1420 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
1421 return prev_cpu;
1422
1423 /*
1424 * Otherwise, iterate the domain and find an eligible idle cpu.
1425 */
1426 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1427 if (!cpu_rq(i)->cfs.nr_running) {
1428 target = i;
1429 break;
1430 }
1431 }
1432
1433 return target;
1434}
1435
1436/*
1322 * sched_balance_self: balance the current task (running on cpu) in domains 1437 * sched_balance_self: balance the current task (running on cpu) in domains
1323 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1438 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1324 * SD_BALANCE_EXEC. 1439 * SD_BALANCE_EXEC.
@@ -1346,8 +1461,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1346 new_cpu = prev_cpu; 1461 new_cpu = prev_cpu;
1347 } 1462 }
1348 1463
1349 rcu_read_lock();
1350 for_each_domain(cpu, tmp) { 1464 for_each_domain(cpu, tmp) {
1465 if (!(tmp->flags & SD_LOAD_BALANCE))
1466 continue;
1467
1351 /* 1468 /*
1352 * If power savings logic is enabled for a domain, see if we 1469 * If power savings logic is enabled for a domain, see if we
1353 * are not overloaded, if so, don't balance wider. 1470 * are not overloaded, if so, don't balance wider.
@@ -1372,11 +1489,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1372 want_sd = 0; 1489 want_sd = 0;
1373 } 1490 }
1374 1491
1375 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 1492 /*
1376 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 1493 * While iterating the domains looking for a spanning
1494 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
1495 * in cache sharing domains along the way.
1496 */
1497 if (want_affine) {
1498 int target = -1;
1377 1499
1378 affine_sd = tmp; 1500 /*
1379 want_affine = 0; 1501 * If both cpu and prev_cpu are part of this domain,
1502 * cpu is a valid SD_WAKE_AFFINE target.
1503 */
1504 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1505 target = cpu;
1506
1507 /*
1508 * If there's an idle sibling in this domain, make that
1509 * the wake_affine target instead of the current cpu.
1510 */
1511 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1512 target = select_idle_sibling(p, tmp, target);
1513
1514 if (target >= 0) {
1515 if (tmp->flags & SD_WAKE_AFFINE) {
1516 affine_sd = tmp;
1517 want_affine = 0;
1518 }
1519 cpu = target;
1520 }
1380 } 1521 }
1381 1522
1382 if (!want_sd && !want_affine) 1523 if (!want_sd && !want_affine)
@@ -1403,10 +1544,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1403 update_shares(tmp); 1544 update_shares(tmp);
1404 } 1545 }
1405 1546
1406 if (affine_sd && wake_affine(affine_sd, p, sync)) { 1547 if (affine_sd && wake_affine(affine_sd, p, sync))
1407 new_cpu = cpu; 1548 return cpu;
1408 goto out;
1409 }
1410 1549
1411 while (sd) { 1550 while (sd) {
1412 int load_idx = sd->forkexec_idx; 1551 int load_idx = sd->forkexec_idx;
@@ -1447,8 +1586,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1447 /* while loop will break here if sd == NULL */ 1586 /* while loop will break here if sd == NULL */
1448 } 1587 }
1449 1588
1450out:
1451 rcu_read_unlock();
1452 return new_cpu; 1589 return new_cpu;
1453} 1590}
1454#endif /* CONFIG_SMP */ 1591#endif /* CONFIG_SMP */
@@ -1568,13 +1705,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1568 struct sched_entity *se = &curr->se, *pse = &p->se; 1705 struct sched_entity *se = &curr->se, *pse = &p->se;
1569 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1706 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1570 int sync = wake_flags & WF_SYNC; 1707 int sync = wake_flags & WF_SYNC;
1708 int scale = cfs_rq->nr_running >= sched_nr_latency;
1571 1709
1572 update_curr(cfs_rq); 1710 if (unlikely(rt_prio(p->prio)))
1573 1711 goto preempt;
1574 if (unlikely(rt_prio(p->prio))) {
1575 resched_task(curr);
1576 return;
1577 }
1578 1712
1579 if (unlikely(p->sched_class != &fair_sched_class)) 1713 if (unlikely(p->sched_class != &fair_sched_class))
1580 return; 1714 return;
@@ -1582,18 +1716,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1582 if (unlikely(se == pse)) 1716 if (unlikely(se == pse))
1583 return; 1717 return;
1584 1718
1585 /* 1719 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
1586 * Only set the backward buddy when the current task is still on the
1587 * rq. This can happen when a wakeup gets interleaved with schedule on
1588 * the ->pre_schedule() or idle_balance() point, either of which can
1589 * drop the rq lock.
1590 *
1591 * Also, during early boot the idle thread is in the fair class, for
1592 * obvious reasons its a bad idea to schedule back to the idle thread.
1593 */
1594 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1595 set_last_buddy(se);
1596 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1597 set_next_buddy(pse); 1720 set_next_buddy(pse);
1598 1721
1599 /* 1722 /*
@@ -1611,36 +1734,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1611 return; 1734 return;
1612 1735
1613 /* Idle tasks are by definition preempted by everybody. */ 1736 /* Idle tasks are by definition preempted by everybody. */
1614 if (unlikely(curr->policy == SCHED_IDLE)) { 1737 if (unlikely(curr->policy == SCHED_IDLE))
1615 resched_task(curr); 1738 goto preempt;
1616 return;
1617 }
1618 1739
1619 if ((sched_feat(WAKEUP_SYNC) && sync) || 1740 if (sched_feat(WAKEUP_SYNC) && sync)
1620 (sched_feat(WAKEUP_OVERLAP) && 1741 goto preempt;
1621 (se->avg_overlap < sysctl_sched_migration_cost &&
1622 pse->avg_overlap < sysctl_sched_migration_cost))) {
1623 resched_task(curr);
1624 return;
1625 }
1626 1742
1627 if (sched_feat(WAKEUP_RUNNING)) { 1743 if (sched_feat(WAKEUP_OVERLAP) &&
1628 if (pse->avg_running < se->avg_running) { 1744 se->avg_overlap < sysctl_sched_migration_cost &&
1629 set_next_buddy(pse); 1745 pse->avg_overlap < sysctl_sched_migration_cost)
1630 resched_task(curr); 1746 goto preempt;
1631 return;
1632 }
1633 }
1634 1747
1635 if (!sched_feat(WAKEUP_PREEMPT)) 1748 if (!sched_feat(WAKEUP_PREEMPT))
1636 return; 1749 return;
1637 1750
1751 update_curr(cfs_rq);
1638 find_matching_se(&se, &pse); 1752 find_matching_se(&se, &pse);
1639
1640 BUG_ON(!pse); 1753 BUG_ON(!pse);
1641
1642 if (wakeup_preempt_entity(se, pse) == 1) 1754 if (wakeup_preempt_entity(se, pse) == 1)
1643 resched_task(curr); 1755 goto preempt;
1756
1757 return;
1758
1759preempt:
1760 resched_task(curr);
1761 /*
1762 * Only set the backward buddy when the current task is still
1763 * on the rq. This can happen when a wakeup gets interleaved
1764 * with schedule on the ->pre_schedule() or idle_balance()
1765 * point, either of which can drop the rq lock.
1766 *
1767 * Also, during early boot the idle thread is in the fair class,
1768 * for obvious reasons it's a bad idea to schedule back to it.
1769 */
1770 if (unlikely(!se->on_rq || curr == rq->idle))
1771 return;
1772
1773 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1774 set_last_buddy(se);
1644} 1775}
1645 1776
1646static struct task_struct *pick_next_task_fair(struct rq *rq) 1777static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1649,21 +1780,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1649 struct cfs_rq *cfs_rq = &rq->cfs; 1780 struct cfs_rq *cfs_rq = &rq->cfs;
1650 struct sched_entity *se; 1781 struct sched_entity *se;
1651 1782
1652 if (unlikely(!cfs_rq->nr_running)) 1783 if (!cfs_rq->nr_running)
1653 return NULL; 1784 return NULL;
1654 1785
1655 do { 1786 do {
1656 se = pick_next_entity(cfs_rq); 1787 se = pick_next_entity(cfs_rq);
1657 /*
1658 * If se was a buddy, clear it so that it will have to earn
1659 * the favour again.
1660 *
1661 * If se was not a buddy, clear the buddies because neither
1662 * was elegible to run, let them earn it again.
1663 *
1664 * IOW. unconditionally clear buddies.
1665 */
1666 __clear_buddies(cfs_rq, NULL);
1667 set_next_entity(cfs_rq, se); 1788 set_next_entity(cfs_rq, se);
1668 cfs_rq = group_cfs_rq(se); 1789 cfs_rq = group_cfs_rq(se);
1669 } while (cfs_rq); 1790 } while (cfs_rq);
@@ -1830,6 +1951,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1830 1951
1831 return 0; 1952 return 0;
1832} 1953}
1954
1955static void rq_online_fair(struct rq *rq)
1956{
1957 update_sysctl();
1958}
1959
1960static void rq_offline_fair(struct rq *rq)
1961{
1962 update_sysctl();
1963}
1964
1833#endif /* CONFIG_SMP */ 1965#endif /* CONFIG_SMP */
1834 1966
1835/* 1967/*
@@ -1847,28 +1979,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1847} 1979}
1848 1980
1849/* 1981/*
1850 * Share the fairness runtime between parent and child, thus the 1982 * called on fork with the child task as argument from the parent's context
1851 * total amount of pressure for CPU stays equal - new tasks 1983 * - child not yet on the tasklist
1852 * get a chance to run but frequent forkers are not allowed to 1984 * - preemption disabled
1853 * monopolize the CPU. Note: the parent runqueue is locked,
1854 * the child is not running yet.
1855 */ 1985 */
1856static void task_new_fair(struct rq *rq, struct task_struct *p) 1986static void task_fork_fair(struct task_struct *p)
1857{ 1987{
1858 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1988 struct cfs_rq *cfs_rq = task_cfs_rq(current);
1859 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 1989 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1860 int this_cpu = smp_processor_id(); 1990 int this_cpu = smp_processor_id();
1991 struct rq *rq = this_rq();
1992 unsigned long flags;
1993
1994 raw_spin_lock_irqsave(&rq->lock, flags);
1861 1995
1862 sched_info_queued(p); 1996 if (unlikely(task_cpu(p) != this_cpu))
1997 __set_task_cpu(p, this_cpu);
1863 1998
1864 update_curr(cfs_rq); 1999 update_curr(cfs_rq);
2000
1865 if (curr) 2001 if (curr)
1866 se->vruntime = curr->vruntime; 2002 se->vruntime = curr->vruntime;
1867 place_entity(cfs_rq, se, 1); 2003 place_entity(cfs_rq, se, 1);
1868 2004
1869 /* 'curr' will be NULL if the child belongs to a different group */ 2005 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1870 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1871 curr && entity_before(curr, se)) {
1872 /* 2006 /*
1873 * Upon rescheduling, sched_class::put_prev_task() will place 2007 * Upon rescheduling, sched_class::put_prev_task() will place
1874 * 'current' within the tree based on its new key value. 2008 * 'current' within the tree based on its new key value.
@@ -1877,7 +2011,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1877 resched_task(rq->curr); 2011 resched_task(rq->curr);
1878 } 2012 }
1879 2013
1880 enqueue_task_fair(rq, p, 0); 2014 se->vruntime -= cfs_rq->min_vruntime;
2015
2016 raw_spin_unlock_irqrestore(&rq->lock, flags);
1881} 2017}
1882 2018
1883/* 2019/*
@@ -1930,30 +2066,27 @@ static void set_curr_task_fair(struct rq *rq)
1930} 2066}
1931 2067
1932#ifdef CONFIG_FAIR_GROUP_SCHED 2068#ifdef CONFIG_FAIR_GROUP_SCHED
1933static void moved_group_fair(struct task_struct *p) 2069static void moved_group_fair(struct task_struct *p, int on_rq)
1934{ 2070{
1935 struct cfs_rq *cfs_rq = task_cfs_rq(p); 2071 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1936 2072
1937 update_curr(cfs_rq); 2073 update_curr(cfs_rq);
1938 place_entity(cfs_rq, &p->se, 1); 2074 if (!on_rq)
2075 place_entity(cfs_rq, &p->se, 1);
1939} 2076}
1940#endif 2077#endif
1941 2078
1942unsigned int get_rr_interval_fair(struct task_struct *task) 2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
1943{ 2080{
1944 struct sched_entity *se = &task->se; 2081 struct sched_entity *se = &task->se;
1945 unsigned long flags;
1946 struct rq *rq;
1947 unsigned int rr_interval = 0; 2082 unsigned int rr_interval = 0;
1948 2083
1949 /* 2084 /*
1950 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 2085 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1951 * idle runqueue: 2086 * idle runqueue:
1952 */ 2087 */
1953 rq = task_rq_lock(task, &flags);
1954 if (rq->cfs.load.weight) 2088 if (rq->cfs.load.weight)
1955 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 2089 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1956 task_rq_unlock(rq, &flags);
1957 2090
1958 return rr_interval; 2091 return rr_interval;
1959} 2092}
@@ -1977,11 +2110,15 @@ static const struct sched_class fair_sched_class = {
1977 2110
1978 .load_balance = load_balance_fair, 2111 .load_balance = load_balance_fair,
1979 .move_one_task = move_one_task_fair, 2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair,
2115
2116 .task_waking = task_waking_fair,
1980#endif 2117#endif
1981 2118
1982 .set_curr_task = set_curr_task_fair, 2119 .set_curr_task = set_curr_task_fair,
1983 .task_tick = task_tick_fair, 2120 .task_tick = task_tick_fair,
1984 .task_new = task_new_fair, 2121 .task_fork = task_fork_fair,
1985 2122
1986 .prio_changed = prio_changed_fair, 2123 .prio_changed = prio_changed_fair,
1987 .switched_to = switched_to_fair, 2124 .switched_to = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c..d5059fd761d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
54SCHED_FEAT(WAKEUP_OVERLAP, 0) 54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55 55
56/* 56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate 57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and 58 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see 59 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde..5f93b570d38 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -34,10 +34,10 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
36{ 36{
37 spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
39 dump_stack(); 39 dump_stack();
40 spin_lock_irq(&rq->lock); 40 raw_spin_lock_irq(&rq->lock);
41} 41}
42 42
43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task) 100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 101{
102 return 0; 102 return 0;
103} 103}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790cddb1..f48328ac216 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -327,7 +327,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
327 327
328 weight = cpumask_weight(rd->span); 328 weight = cpumask_weight(rd->span);
329 329
330 spin_lock(&rt_b->rt_runtime_lock); 330 raw_spin_lock(&rt_b->rt_runtime_lock);
331 rt_period = ktime_to_ns(rt_b->rt_period); 331 rt_period = ktime_to_ns(rt_b->rt_period);
332 for_each_cpu(i, rd->span) { 332 for_each_cpu(i, rd->span) {
333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
@@ -336,7 +336,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
336 if (iter == rt_rq) 336 if (iter == rt_rq)
337 continue; 337 continue;
338 338
339 spin_lock(&iter->rt_runtime_lock); 339 raw_spin_lock(&iter->rt_runtime_lock);
340 /* 340 /*
341 * Either all rqs have inf runtime and there's nothing to steal 341 * Either all rqs have inf runtime and there's nothing to steal
342 * or __disable_runtime() below sets a specific rq to inf to 342 * or __disable_runtime() below sets a specific rq to inf to
@@ -358,14 +358,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
358 rt_rq->rt_runtime += diff; 358 rt_rq->rt_runtime += diff;
359 more = 1; 359 more = 1;
360 if (rt_rq->rt_runtime == rt_period) { 360 if (rt_rq->rt_runtime == rt_period) {
361 spin_unlock(&iter->rt_runtime_lock); 361 raw_spin_unlock(&iter->rt_runtime_lock);
362 break; 362 break;
363 } 363 }
364 } 364 }
365next: 365next:
366 spin_unlock(&iter->rt_runtime_lock); 366 raw_spin_unlock(&iter->rt_runtime_lock);
367 } 367 }
368 spin_unlock(&rt_b->rt_runtime_lock); 368 raw_spin_unlock(&rt_b->rt_runtime_lock);
369 369
370 return more; 370 return more;
371} 371}
@@ -386,8 +386,8 @@ static void __disable_runtime(struct rq *rq)
386 s64 want; 386 s64 want;
387 int i; 387 int i;
388 388
389 spin_lock(&rt_b->rt_runtime_lock); 389 raw_spin_lock(&rt_b->rt_runtime_lock);
390 spin_lock(&rt_rq->rt_runtime_lock); 390 raw_spin_lock(&rt_rq->rt_runtime_lock);
391 /* 391 /*
392 * Either we're all inf and nobody needs to borrow, or we're 392 * Either we're all inf and nobody needs to borrow, or we're
393 * already disabled and thus have nothing to do, or we have 393 * already disabled and thus have nothing to do, or we have
@@ -396,7 +396,7 @@ static void __disable_runtime(struct rq *rq)
396 if (rt_rq->rt_runtime == RUNTIME_INF || 396 if (rt_rq->rt_runtime == RUNTIME_INF ||
397 rt_rq->rt_runtime == rt_b->rt_runtime) 397 rt_rq->rt_runtime == rt_b->rt_runtime)
398 goto balanced; 398 goto balanced;
399 spin_unlock(&rt_rq->rt_runtime_lock); 399 raw_spin_unlock(&rt_rq->rt_runtime_lock);
400 400
401 /* 401 /*
402 * Calculate the difference between what we started out with 402 * Calculate the difference between what we started out with
@@ -418,7 +418,7 @@ static void __disable_runtime(struct rq *rq)
418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
419 continue; 419 continue;
420 420
421 spin_lock(&iter->rt_runtime_lock); 421 raw_spin_lock(&iter->rt_runtime_lock);
422 if (want > 0) { 422 if (want > 0) {
423 diff = min_t(s64, iter->rt_runtime, want); 423 diff = min_t(s64, iter->rt_runtime, want);
424 iter->rt_runtime -= diff; 424 iter->rt_runtime -= diff;
@@ -427,13 +427,13 @@ static void __disable_runtime(struct rq *rq)
427 iter->rt_runtime -= want; 427 iter->rt_runtime -= want;
428 want -= want; 428 want -= want;
429 } 429 }
430 spin_unlock(&iter->rt_runtime_lock); 430 raw_spin_unlock(&iter->rt_runtime_lock);
431 431
432 if (!want) 432 if (!want)
433 break; 433 break;
434 } 434 }
435 435
436 spin_lock(&rt_rq->rt_runtime_lock); 436 raw_spin_lock(&rt_rq->rt_runtime_lock);
437 /* 437 /*
438 * We cannot be left wanting - that would mean some runtime 438 * We cannot be left wanting - that would mean some runtime
439 * leaked out of the system. 439 * leaked out of the system.
@@ -445,8 +445,8 @@ balanced:
445 * runtime - in which case borrowing doesn't make sense. 445 * runtime - in which case borrowing doesn't make sense.
446 */ 446 */
447 rt_rq->rt_runtime = RUNTIME_INF; 447 rt_rq->rt_runtime = RUNTIME_INF;
448 spin_unlock(&rt_rq->rt_runtime_lock); 448 raw_spin_unlock(&rt_rq->rt_runtime_lock);
449 spin_unlock(&rt_b->rt_runtime_lock); 449 raw_spin_unlock(&rt_b->rt_runtime_lock);
450 } 450 }
451} 451}
452 452
@@ -454,9 +454,9 @@ static void disable_runtime(struct rq *rq)
454{ 454{
455 unsigned long flags; 455 unsigned long flags;
456 456
457 spin_lock_irqsave(&rq->lock, flags); 457 raw_spin_lock_irqsave(&rq->lock, flags);
458 __disable_runtime(rq); 458 __disable_runtime(rq);
459 spin_unlock_irqrestore(&rq->lock, flags); 459 raw_spin_unlock_irqrestore(&rq->lock, flags);
460} 460}
461 461
462static void __enable_runtime(struct rq *rq) 462static void __enable_runtime(struct rq *rq)
@@ -472,13 +472,13 @@ static void __enable_runtime(struct rq *rq)
472 for_each_leaf_rt_rq(rt_rq, rq) { 472 for_each_leaf_rt_rq(rt_rq, rq) {
473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
474 474
475 spin_lock(&rt_b->rt_runtime_lock); 475 raw_spin_lock(&rt_b->rt_runtime_lock);
476 spin_lock(&rt_rq->rt_runtime_lock); 476 raw_spin_lock(&rt_rq->rt_runtime_lock);
477 rt_rq->rt_runtime = rt_b->rt_runtime; 477 rt_rq->rt_runtime = rt_b->rt_runtime;
478 rt_rq->rt_time = 0; 478 rt_rq->rt_time = 0;
479 rt_rq->rt_throttled = 0; 479 rt_rq->rt_throttled = 0;
480 spin_unlock(&rt_rq->rt_runtime_lock); 480 raw_spin_unlock(&rt_rq->rt_runtime_lock);
481 spin_unlock(&rt_b->rt_runtime_lock); 481 raw_spin_unlock(&rt_b->rt_runtime_lock);
482 } 482 }
483} 483}
484 484
@@ -486,9 +486,9 @@ static void enable_runtime(struct rq *rq)
486{ 486{
487 unsigned long flags; 487 unsigned long flags;
488 488
489 spin_lock_irqsave(&rq->lock, flags); 489 raw_spin_lock_irqsave(&rq->lock, flags);
490 __enable_runtime(rq); 490 __enable_runtime(rq);
491 spin_unlock_irqrestore(&rq->lock, flags); 491 raw_spin_unlock_irqrestore(&rq->lock, flags);
492} 492}
493 493
494static int balance_runtime(struct rt_rq *rt_rq) 494static int balance_runtime(struct rt_rq *rt_rq)
@@ -496,9 +496,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
496 int more = 0; 496 int more = 0;
497 497
498 if (rt_rq->rt_time > rt_rq->rt_runtime) { 498 if (rt_rq->rt_time > rt_rq->rt_runtime) {
499 spin_unlock(&rt_rq->rt_runtime_lock); 499 raw_spin_unlock(&rt_rq->rt_runtime_lock);
500 more = do_balance_runtime(rt_rq); 500 more = do_balance_runtime(rt_rq);
501 spin_lock(&rt_rq->rt_runtime_lock); 501 raw_spin_lock(&rt_rq->rt_runtime_lock);
502 } 502 }
503 503
504 return more; 504 return more;
@@ -524,11 +524,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
525 struct rq *rq = rq_of_rt_rq(rt_rq); 525 struct rq *rq = rq_of_rt_rq(rt_rq);
526 526
527 spin_lock(&rq->lock); 527 raw_spin_lock(&rq->lock);
528 if (rt_rq->rt_time) { 528 if (rt_rq->rt_time) {
529 u64 runtime; 529 u64 runtime;
530 530
531 spin_lock(&rt_rq->rt_runtime_lock); 531 raw_spin_lock(&rt_rq->rt_runtime_lock);
532 if (rt_rq->rt_throttled) 532 if (rt_rq->rt_throttled)
533 balance_runtime(rt_rq); 533 balance_runtime(rt_rq);
534 runtime = rt_rq->rt_runtime; 534 runtime = rt_rq->rt_runtime;
@@ -539,13 +539,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
539 } 539 }
540 if (rt_rq->rt_time || rt_rq->rt_nr_running) 540 if (rt_rq->rt_time || rt_rq->rt_nr_running)
541 idle = 0; 541 idle = 0;
542 spin_unlock(&rt_rq->rt_runtime_lock); 542 raw_spin_unlock(&rt_rq->rt_runtime_lock);
543 } else if (rt_rq->rt_nr_running) 543 } else if (rt_rq->rt_nr_running)
544 idle = 0; 544 idle = 0;
545 545
546 if (enqueue) 546 if (enqueue)
547 sched_rt_rq_enqueue(rt_rq); 547 sched_rt_rq_enqueue(rt_rq);
548 spin_unlock(&rq->lock); 548 raw_spin_unlock(&rq->lock);
549 } 549 }
550 550
551 return idle; 551 return idle;
@@ -624,11 +624,11 @@ static void update_curr_rt(struct rq *rq)
624 rt_rq = rt_rq_of_se(rt_se); 624 rt_rq = rt_rq_of_se(rt_se);
625 625
626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
627 spin_lock(&rt_rq->rt_runtime_lock); 627 raw_spin_lock(&rt_rq->rt_runtime_lock);
628 rt_rq->rt_time += delta_exec; 628 rt_rq->rt_time += delta_exec;
629 if (sched_rt_runtime_exceeded(rt_rq)) 629 if (sched_rt_runtime_exceeded(rt_rq))
630 resched_task(curr); 630 resched_task(curr);
631 spin_unlock(&rt_rq->rt_runtime_lock); 631 raw_spin_unlock(&rt_rq->rt_runtime_lock);
632 } 632 }
633 } 633 }
634} 634}
@@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1153 1153
1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1155 1155
1156static inline int pick_optimal_cpu(int this_cpu,
1157 const struct cpumask *mask)
1158{
1159 int first;
1160
1161 /* "this_cpu" is cheaper to preempt than a remote processor */
1162 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1163 return this_cpu;
1164
1165 first = cpumask_first(mask);
1166 if (first < nr_cpu_ids)
1167 return first;
1168
1169 return -1;
1170}
1171
1172static int find_lowest_rq(struct task_struct *task) 1156static int find_lowest_rq(struct task_struct *task)
1173{ 1157{
1174 struct sched_domain *sd; 1158 struct sched_domain *sd;
1175 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1159 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1176 int this_cpu = smp_processor_id(); 1160 int this_cpu = smp_processor_id();
1177 int cpu = task_cpu(task); 1161 int cpu = task_cpu(task);
1178 cpumask_var_t domain_mask;
1179 1162
1180 if (task->rt.nr_cpus_allowed == 1) 1163 if (task->rt.nr_cpus_allowed == 1)
1181 return -1; /* No other targets possible */ 1164 return -1; /* No other targets possible */
@@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
1198 * Otherwise, we consult the sched_domains span maps to figure 1181 * Otherwise, we consult the sched_domains span maps to figure
1199 * out which cpu is logically closest to our hot cache data. 1182 * out which cpu is logically closest to our hot cache data.
1200 */ 1183 */
1201 if (this_cpu == cpu) 1184 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1202 this_cpu = -1; /* Skip this_cpu opt if the same */ 1185 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1203
1204 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1205 for_each_domain(cpu, sd) {
1206 if (sd->flags & SD_WAKE_AFFINE) {
1207 int best_cpu;
1208 1186
1209 cpumask_and(domain_mask, 1187 for_each_domain(cpu, sd) {
1210 sched_domain_span(sd), 1188 if (sd->flags & SD_WAKE_AFFINE) {
1211 lowest_mask); 1189 int best_cpu;
1212 1190
1213 best_cpu = pick_optimal_cpu(this_cpu, 1191 /*
1214 domain_mask); 1192 * "this_cpu" is cheaper to preempt than a
1215 1193 * remote processor.
1216 if (best_cpu != -1) { 1194 */
1217 free_cpumask_var(domain_mask); 1195 if (this_cpu != -1 &&
1218 return best_cpu; 1196 cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
1219 } 1197 return this_cpu;
1220 } 1198
1199 best_cpu = cpumask_first_and(lowest_mask,
1200 sched_domain_span(sd));
1201 if (best_cpu < nr_cpu_ids)
1202 return best_cpu;
1221 } 1203 }
1222 free_cpumask_var(domain_mask);
1223 } 1204 }
1224 1205
1225 /* 1206 /*
@@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
1227 * just give the caller *something* to work with from the compatible 1208 * just give the caller *something* to work with from the compatible
1228 * locations. 1209 * locations.
1229 */ 1210 */
1230 return pick_optimal_cpu(this_cpu, lowest_mask); 1211 if (this_cpu != -1)
1212 return this_cpu;
1213
1214 cpu = cpumask_any(lowest_mask);
1215 if (cpu < nr_cpu_ids)
1216 return cpu;
1217 return -1;
1231} 1218}
1232 1219
1233/* Will lock the rq it finds */ 1220/* Will lock the rq it finds */
@@ -1259,7 +1246,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1259 task_running(rq, task) || 1246 task_running(rq, task) ||
1260 !task->se.on_rq)) { 1247 !task->se.on_rq)) {
1261 1248
1262 spin_unlock(&lowest_rq->lock); 1249 raw_spin_unlock(&lowest_rq->lock);
1263 lowest_rq = NULL; 1250 lowest_rq = NULL;
1264 break; 1251 break;
1265 } 1252 }
@@ -1485,7 +1472,7 @@ static void post_schedule_rt(struct rq *rq)
1485 * If we are not running and we are not going to reschedule soon, we should 1472 * If we are not running and we are not going to reschedule soon, we should
1486 * try to push tasks away now 1473 * try to push tasks away now
1487 */ 1474 */
1488static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1475static void task_woken_rt(struct rq *rq, struct task_struct *p)
1489{ 1476{
1490 if (!task_running(rq, p) && 1477 if (!task_running(rq, p) &&
1491 !test_tsk_need_resched(rq->curr) && 1478 !test_tsk_need_resched(rq->curr) &&
@@ -1734,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
1734 dequeue_pushable_task(rq, p); 1721 dequeue_pushable_task(rq, p);
1735} 1722}
1736 1723
1737unsigned int get_rr_interval_rt(struct task_struct *task) 1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1738{ 1725{
1739 /* 1726 /*
1740 * Time slice is 0 for SCHED_FIFO tasks 1727 * Time slice is 0 for SCHED_FIFO tasks
@@ -1766,7 +1753,7 @@ static const struct sched_class rt_sched_class = {
1766 .rq_offline = rq_offline_rt, 1753 .rq_offline = rq_offline_rt,
1767 .pre_schedule = pre_schedule_rt, 1754 .pre_schedule = pre_schedule_rt,
1768 .post_schedule = post_schedule_rt, 1755 .post_schedule = post_schedule_rt,
1769 .task_wake_up = task_wake_up_rt, 1756 .task_woken = task_woken_rt,
1770 .switched_from = switched_from_rt, 1757 .switched_from = switched_from_rt,
1771#endif 1758#endif
1772 1759
diff --git a/kernel/signal.c b/kernel/signal.c
index 6705320784f..934ae5e687b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,12 +22,14 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/ratelimit.h>
25#include <linux/tracehook.h> 26#include <linux/tracehook.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
27#include <linux/freezer.h> 28#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
30#include <trace/events/sched.h> 31#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h>
31 33
32#include <asm/param.h> 34#include <asm/param.h>
33#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -41,6 +43,8 @@
41 43
42static struct kmem_cache *sigqueue_cachep; 44static struct kmem_cache *sigqueue_cachep;
43 45
46int print_fatal_signals __read_mostly;
47
44static void __user *sig_handler(struct task_struct *t, int sig) 48static void __user *sig_handler(struct task_struct *t, int sig)
45{ 49{
46 return t->sighand->action[sig - 1].sa.sa_handler; 50 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -159,7 +163,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
159{ 163{
160 unsigned long i, *s, *m, x; 164 unsigned long i, *s, *m, x;
161 int sig = 0; 165 int sig = 0;
162 166
163 s = pending->signal.sig; 167 s = pending->signal.sig;
164 m = mask->sig; 168 m = mask->sig;
165 switch (_NSIG_WORDS) { 169 switch (_NSIG_WORDS) {
@@ -184,33 +188,52 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
184 sig = ffz(~x) + 1; 188 sig = ffz(~x) + 1;
185 break; 189 break;
186 } 190 }
187 191
188 return sig; 192 return sig;
189} 193}
190 194
195static inline void print_dropped_signal(int sig)
196{
197 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
198
199 if (!print_fatal_signals)
200 return;
201
202 if (!__ratelimit(&ratelimit_state))
203 return;
204
205 printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
206 current->comm, current->pid, sig);
207}
208
191/* 209/*
192 * allocate a new signal queue record 210 * allocate a new signal queue record
193 * - this may be called without locks if and only if t == current, otherwise an 211 * - this may be called without locks if and only if t == current, otherwise an
194 * appopriate lock must be held to stop the target task from exiting 212 * appopriate lock must be held to stop the target task from exiting
195 */ 213 */
196static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 214static struct sigqueue *
197 int override_rlimit) 215__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
198{ 216{
199 struct sigqueue *q = NULL; 217 struct sigqueue *q = NULL;
200 struct user_struct *user; 218 struct user_struct *user;
201 219
202 /* 220 /*
203 * We won't get problems with the target's UID changing under us 221 * Protect access to @t credentials. This can go away when all
204 * because changing it requires RCU be used, and if t != current, the 222 * callers hold rcu read lock.
205 * caller must be holding the RCU readlock (by way of a spinlock) and
206 * we use RCU protection here
207 */ 223 */
224 rcu_read_lock();
208 user = get_uid(__task_cred(t)->user); 225 user = get_uid(__task_cred(t)->user);
209 atomic_inc(&user->sigpending); 226 atomic_inc(&user->sigpending);
227 rcu_read_unlock();
228
210 if (override_rlimit || 229 if (override_rlimit ||
211 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
212 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
213 q = kmem_cache_alloc(sigqueue_cachep, flags); 232 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else {
234 print_dropped_signal(sig);
235 }
236
214 if (unlikely(q == NULL)) { 237 if (unlikely(q == NULL)) {
215 atomic_dec(&user->sigpending); 238 atomic_dec(&user->sigpending);
216 free_uid(user); 239 free_uid(user);
@@ -400,7 +423,7 @@ still_pending:
400 */ 423 */
401 info->si_signo = sig; 424 info->si_signo = sig;
402 info->si_errno = 0; 425 info->si_errno = 0;
403 info->si_code = 0; 426 info->si_code = SI_USER;
404 info->si_pid = 0; 427 info->si_pid = 0;
405 info->si_uid = 0; 428 info->si_uid = 0;
406 } 429 }
@@ -584,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
584 return 1; 607 return 1;
585} 608}
586 609
610static inline int is_si_special(const struct siginfo *info)
611{
612 return info <= SEND_SIG_FORCED;
613}
614
615static inline bool si_fromuser(const struct siginfo *info)
616{
617 return info == SEND_SIG_NOINFO ||
618 (!is_si_special(info) && SI_FROMUSER(info));
619}
620
587/* 621/*
588 * Bad permissions for sending the signal 622 * Bad permissions for sending the signal
589 * - the caller must hold at least the RCU read lock 623 * - the caller must hold at least the RCU read lock
@@ -598,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
598 if (!valid_signal(sig)) 632 if (!valid_signal(sig))
599 return -EINVAL; 633 return -EINVAL;
600 634
601 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) 635 if (!si_fromuser(info))
602 return 0; 636 return 0;
603 637
604 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 638 error = audit_signal_info(sig, t); /* Let audit system see the signal */
@@ -834,7 +868,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
834 struct sigqueue *q; 868 struct sigqueue *q;
835 int override_rlimit; 869 int override_rlimit;
836 870
837 trace_sched_signal_send(sig, t); 871 trace_signal_generate(sig, info, t);
838 872
839 assert_spin_locked(&t->sighand->siglock); 873 assert_spin_locked(&t->sighand->siglock);
840 874
@@ -869,7 +903,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
869 else 903 else
870 override_rlimit = 0; 904 override_rlimit = 0;
871 905
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, 906 q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit); 907 override_rlimit);
874 if (q) { 908 if (q) {
875 list_add_tail(&q->list, &pending->list); 909 list_add_tail(&q->list, &pending->list);
@@ -896,12 +930,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
896 break; 930 break;
897 } 931 }
898 } else if (!is_si_special(info)) { 932 } else if (!is_si_special(info)) {
899 if (sig >= SIGRTMIN && info->si_code != SI_USER) 933 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
900 /* 934 /*
901 * Queue overflow, abort. We may abort if the signal was rt 935 * Queue overflow, abort. We may abort if the
902 * and sent by user using something other than kill(). 936 * signal was rt and sent by user using something
903 */ 937 * other than kill().
938 */
939 trace_signal_overflow_fail(sig, group, info);
904 return -EAGAIN; 940 return -EAGAIN;
941 } else {
942 /*
943 * This is a silent loss of information. We still
944 * send the signal, but the *info bits are lost.
945 */
946 trace_signal_lose_info(sig, group, info);
947 }
905 } 948 }
906 949
907out_set: 950out_set:
@@ -917,16 +960,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
917 int from_ancestor_ns = 0; 960 int from_ancestor_ns = 0;
918 961
919#ifdef CONFIG_PID_NS 962#ifdef CONFIG_PID_NS
920 if (!is_si_special(info) && SI_FROMUSER(info) && 963 from_ancestor_ns = si_fromuser(info) &&
921 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) 964 !task_pid_nr_ns(current, task_active_pid_ns(t));
922 from_ancestor_ns = 1;
923#endif 965#endif
924 966
925 return __send_signal(sig, info, t, group, from_ancestor_ns); 967 return __send_signal(sig, info, t, group, from_ancestor_ns);
926} 968}
927 969
928int print_fatal_signals;
929
930static void print_fatal_signal(struct pt_regs *regs, int signr) 970static void print_fatal_signal(struct pt_regs *regs, int signr)
931{ 971{
932 printk("%s/%d: potentially unexpected fatal signal %d.\n", 972 printk("%s/%d: potentially unexpected fatal signal %d.\n",
@@ -939,7 +979,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
939 for (i = 0; i < 16; i++) { 979 for (i = 0; i < 16; i++) {
940 unsigned char insn; 980 unsigned char insn;
941 981
942 __get_user(insn, (unsigned char *)(regs->ip + i)); 982 if (get_user(insn, (unsigned char *)(regs->ip + i)))
983 break;
943 printk("%02x ", insn); 984 printk("%02x ", insn);
944 } 985 }
945 } 986 }
@@ -1022,12 +1063,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1022 return ret; 1063 return ret;
1023} 1064}
1024 1065
1025void
1026force_sig_specific(int sig, struct task_struct *t)
1027{
1028 force_sig_info(sig, SEND_SIG_FORCED, t);
1029}
1030
1031/* 1066/*
1032 * Nuke all other threads in the group. 1067 * Nuke all other threads in the group.
1033 */ 1068 */
@@ -1145,19 +1180,19 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1145 int ret = -EINVAL; 1180 int ret = -EINVAL;
1146 struct task_struct *p; 1181 struct task_struct *p;
1147 const struct cred *pcred; 1182 const struct cred *pcred;
1183 unsigned long flags;
1148 1184
1149 if (!valid_signal(sig)) 1185 if (!valid_signal(sig))
1150 return ret; 1186 return ret;
1151 1187
1152 read_lock(&tasklist_lock); 1188 rcu_read_lock();
1153 p = pid_task(pid, PIDTYPE_PID); 1189 p = pid_task(pid, PIDTYPE_PID);
1154 if (!p) { 1190 if (!p) {
1155 ret = -ESRCH; 1191 ret = -ESRCH;
1156 goto out_unlock; 1192 goto out_unlock;
1157 } 1193 }
1158 pcred = __task_cred(p); 1194 pcred = __task_cred(p);
1159 if ((info == SEND_SIG_NOINFO || 1195 if (si_fromuser(info) &&
1160 (!is_si_special(info) && SI_FROMUSER(info))) &&
1161 euid != pcred->suid && euid != pcred->uid && 1196 euid != pcred->suid && euid != pcred->uid &&
1162 uid != pcred->suid && uid != pcred->uid) { 1197 uid != pcred->suid && uid != pcred->uid) {
1163 ret = -EPERM; 1198 ret = -EPERM;
@@ -1166,14 +1201,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1166 ret = security_task_kill(p, info, sig, secid); 1201 ret = security_task_kill(p, info, sig, secid);
1167 if (ret) 1202 if (ret)
1168 goto out_unlock; 1203 goto out_unlock;
1169 if (sig && p->sighand) { 1204
1170 unsigned long flags; 1205 if (sig) {
1171 spin_lock_irqsave(&p->sighand->siglock, flags); 1206 if (lock_task_sighand(p, &flags)) {
1172 ret = __send_signal(sig, info, p, 1, 0); 1207 ret = __send_signal(sig, info, p, 1, 0);
1173 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1208 unlock_task_sighand(p, &flags);
1209 } else
1210 ret = -ESRCH;
1174 } 1211 }
1175out_unlock: 1212out_unlock:
1176 read_unlock(&tasklist_lock); 1213 rcu_read_unlock();
1177 return ret; 1214 return ret;
1178} 1215}
1179EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1216EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
@@ -1293,19 +1330,19 @@ EXPORT_SYMBOL(kill_pid);
1293 * These functions support sending signals using preallocated sigqueue 1330 * These functions support sending signals using preallocated sigqueue
1294 * structures. This is needed "because realtime applications cannot 1331 * structures. This is needed "because realtime applications cannot
1295 * afford to lose notifications of asynchronous events, like timer 1332 * afford to lose notifications of asynchronous events, like timer
1296 * expirations or I/O completions". In the case of Posix Timers 1333 * expirations or I/O completions". In the case of Posix Timers
1297 * we allocate the sigqueue structure from the timer_create. If this 1334 * we allocate the sigqueue structure from the timer_create. If this
1298 * allocation fails we are able to report the failure to the application 1335 * allocation fails we are able to report the failure to the application
1299 * with an EAGAIN error. 1336 * with an EAGAIN error.
1300 */ 1337 */
1301
1302struct sigqueue *sigqueue_alloc(void) 1338struct sigqueue *sigqueue_alloc(void)
1303{ 1339{
1304 struct sigqueue *q; 1340 struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
1305 1341
1306 if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) 1342 if (q)
1307 q->flags |= SIGQUEUE_PREALLOC; 1343 q->flags |= SIGQUEUE_PREALLOC;
1308 return(q); 1344
1345 return q;
1309} 1346}
1310 1347
1311void sigqueue_free(struct sigqueue *q) 1348void sigqueue_free(struct sigqueue *q)
@@ -1807,11 +1844,6 @@ relock:
1807 1844
1808 for (;;) { 1845 for (;;) {
1809 struct k_sigaction *ka; 1846 struct k_sigaction *ka;
1810
1811 if (unlikely(signal->group_stop_count > 0) &&
1812 do_signal_stop(0))
1813 goto relock;
1814
1815 /* 1847 /*
1816 * Tracing can induce an artifical signal and choose sigaction. 1848 * Tracing can induce an artifical signal and choose sigaction.
1817 * The return value in @signr determines the default action, 1849 * The return value in @signr determines the default action,
@@ -1823,6 +1855,10 @@ relock:
1823 if (unlikely(signr != 0)) 1855 if (unlikely(signr != 0))
1824 ka = return_ka; 1856 ka = return_ka;
1825 else { 1857 else {
1858 if (unlikely(signal->group_stop_count > 0) &&
1859 do_signal_stop(0))
1860 goto relock;
1861
1826 signr = dequeue_signal(current, &current->blocked, 1862 signr = dequeue_signal(current, &current->blocked,
1827 info); 1863 info);
1828 1864
@@ -1839,6 +1875,9 @@ relock:
1839 ka = &sighand->action[signr-1]; 1875 ka = &sighand->action[signr-1];
1840 } 1876 }
1841 1877
1878 /* Trace actually delivered signals. */
1879 trace_signal_deliver(signr, info, ka);
1880
1842 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1881 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1843 continue; 1882 continue;
1844 if (ka->sa.sa_handler != SIG_DFL) { 1883 if (ka->sa.sa_handler != SIG_DFL) {
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
new file mode 100644
index 00000000000..e45c4364529
--- /dev/null
+++ b/kernel/slow-work-debugfs.c
@@ -0,0 +1,227 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 0d31135efbf..7494bbf5a27 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,11 +16,8 @@
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/wait.h> 18#include <linux/wait.h>
19 19#include <linux/debugfs.h>
20#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of 20#include "slow-work.h"
21 * things to do */
22#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */
24 21
25static void slow_work_cull_timeout(unsigned long); 22static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long); 23static void slow_work_oom_timeout(unsigned long);
@@ -46,13 +43,12 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
46 43
47#ifdef CONFIG_SYSCTL 44#ifdef CONFIG_SYSCTL
48static const int slow_work_min_min_threads = 2; 45static const int slow_work_min_min_threads = 2;
49static int slow_work_max_max_threads = 255; 46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
50static const int slow_work_min_vslow = 1; 47static const int slow_work_min_vslow = 1;
51static const int slow_work_max_vslow = 99; 48static const int slow_work_max_vslow = 99;
52 49
53ctl_table slow_work_sysctls[] = { 50ctl_table slow_work_sysctls[] = {
54 { 51 {
55 .ctl_name = CTL_UNNUMBERED,
56 .procname = "min-threads", 52 .procname = "min-threads",
57 .data = &slow_work_min_threads, 53 .data = &slow_work_min_threads,
58 .maxlen = sizeof(unsigned), 54 .maxlen = sizeof(unsigned),
@@ -62,7 +58,6 @@ ctl_table slow_work_sysctls[] = {
62 .extra2 = &slow_work_max_threads, 58 .extra2 = &slow_work_max_threads,
63 }, 59 },
64 { 60 {
65 .ctl_name = CTL_UNNUMBERED,
66 .procname = "max-threads", 61 .procname = "max-threads",
67 .data = &slow_work_max_threads, 62 .data = &slow_work_max_threads,
68 .maxlen = sizeof(unsigned), 63 .maxlen = sizeof(unsigned),
@@ -72,16 +67,15 @@ ctl_table slow_work_sysctls[] = {
72 .extra2 = (void *) &slow_work_max_max_threads, 67 .extra2 = (void *) &slow_work_max_max_threads,
73 }, 68 },
74 { 69 {
75 .ctl_name = CTL_UNNUMBERED,
76 .procname = "vslow-percentage", 70 .procname = "vslow-percentage",
77 .data = &vslow_work_proportion, 71 .data = &vslow_work_proportion,
78 .maxlen = sizeof(unsigned), 72 .maxlen = sizeof(unsigned),
79 .mode = 0644, 73 .mode = 0644,
80 .proc_handler = &proc_dointvec_minmax, 74 .proc_handler = proc_dointvec_minmax,
81 .extra1 = (void *) &slow_work_min_vslow, 75 .extra1 = (void *) &slow_work_min_vslow,
82 .extra2 = (void *) &slow_work_max_vslow, 76 .extra2 = (void *) &slow_work_max_vslow,
83 }, 77 },
84 { .ctl_name = 0 } 78 {}
85}; 79};
86#endif 80#endif
87 81
@@ -98,6 +92,56 @@ static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98static struct slow_work slow_work_new_thread; /* new thread starter */ 92static struct slow_work slow_work_new_thread; /* new thread starter */
99 93
100/* 94/*
95 * slow work ID allocation (use slow_work_queue_lock)
96 */
97static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
98
99/*
100 * Unregistration tracking to prevent put_ref() from disappearing during module
101 * unload
102 */
103#ifdef CONFIG_MODULES
104static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
105static struct module *slow_work_unreg_module;
106static struct slow_work *slow_work_unreg_work_item;
107static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
108static DEFINE_MUTEX(slow_work_unreg_sync_lock);
109
110static void slow_work_set_thread_processing(int id, struct slow_work *work)
111{
112 if (work)
113 slow_work_thread_processing[id] = work->owner;
114}
115static void slow_work_done_thread_processing(int id, struct slow_work *work)
116{
117 struct module *module = slow_work_thread_processing[id];
118
119 slow_work_thread_processing[id] = NULL;
120 smp_mb();
121 if (slow_work_unreg_work_item == work ||
122 slow_work_unreg_module == module)
123 wake_up_all(&slow_work_unreg_wq);
124}
125static void slow_work_clear_thread_processing(int id)
126{
127 slow_work_thread_processing[id] = NULL;
128}
129#else
130static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
131static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
132static void slow_work_clear_thread_processing(int id) {}
133#endif
134
135/*
136 * Data for tracking currently executing items for indication through /proc
137 */
138#ifdef CONFIG_SLOW_WORK_DEBUG
139struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
140pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
141DEFINE_RWLOCK(slow_work_execs_lock);
142#endif
143
144/*
101 * The queues of work items and the lock governing access to them. These are 145 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues 146 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs. 147 * as the number of threads bears no relation to the number of CPUs.
@@ -105,9 +149,18 @@ static struct slow_work slow_work_new_thread; /* new thread starter */
105 * There are two queues of work items: one for slow work items, and one for 149 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items. 150 * very slow work items.
107 */ 151 */
108static LIST_HEAD(slow_work_queue); 152LIST_HEAD(slow_work_queue);
109static LIST_HEAD(vslow_work_queue); 153LIST_HEAD(vslow_work_queue);
110static DEFINE_SPINLOCK(slow_work_queue_lock); 154DEFINE_SPINLOCK(slow_work_queue_lock);
155
156/*
157 * The following are two wait queues that get pinged when a work item is placed
158 * on an empty queue. These allow work items that are hogging a thread by
159 * sleeping in a way that could be deferred to yield their thread and enqueue
160 * themselves.
161 */
162static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
163static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
111 164
112/* 165/*
113 * The thread controls. A variable used to signal to the threads that they 166 * The thread controls. A variable used to signal to the threads that they
@@ -126,6 +179,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited);
126static int slow_work_user_count; 179static int slow_work_user_count;
127static DEFINE_MUTEX(slow_work_user_lock); 180static DEFINE_MUTEX(slow_work_user_lock);
128 181
182static inline int slow_work_get_ref(struct slow_work *work)
183{
184 if (work->ops->get_ref)
185 return work->ops->get_ref(work);
186
187 return 0;
188}
189
190static inline void slow_work_put_ref(struct slow_work *work)
191{
192 if (work->ops->put_ref)
193 work->ops->put_ref(work);
194}
195
129/* 196/*
130 * Calculate the maximum number of active threads in the pool that are 197 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items. 198 * permitted to process very slow work items.
@@ -149,7 +216,7 @@ static unsigned slow_work_calc_vsmax(void)
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed 216 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do. 217 * it, false if there was nothing to do.
151 */ 218 */
152static bool slow_work_execute(void) 219static noinline bool slow_work_execute(int id)
153{ 220{
154 struct slow_work *work = NULL; 221 struct slow_work *work = NULL;
155 unsigned vsmax; 222 unsigned vsmax;
@@ -186,6 +253,13 @@ static bool slow_work_execute(void)
186 } else { 253 } else {
187 very_slow = false; /* avoid the compiler warning */ 254 very_slow = false; /* avoid the compiler warning */
188 } 255 }
256
257 slow_work_set_thread_processing(id, work);
258 if (work) {
259 slow_work_mark_time(work);
260 slow_work_begin_exec(id, work);
261 }
262
189 spin_unlock_irq(&slow_work_queue_lock); 263 spin_unlock_irq(&slow_work_queue_lock);
190 264
191 if (!work) 265 if (!work)
@@ -194,12 +268,19 @@ static bool slow_work_execute(void)
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) 268 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG(); 269 BUG();
196 270
197 work->ops->execute(work); 271 /* don't execute if the work is in the process of being cancelled */
272 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
273 work->ops->execute(work);
198 274
199 if (very_slow) 275 if (very_slow)
200 atomic_dec(&vslow_work_executing_count); 276 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); 277 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202 278
279 /* wake up anyone waiting for this work to be complete */
280 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
281
282 slow_work_end_exec(id, work);
283
203 /* if someone tried to enqueue the item whilst we were executing it, 284 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to 285 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously 286 * execute it simultaneously
@@ -219,7 +300,10 @@ static bool slow_work_execute(void)
219 spin_unlock_irq(&slow_work_queue_lock); 300 spin_unlock_irq(&slow_work_queue_lock);
220 } 301 }
221 302
222 work->ops->put_ref(work); 303 /* sort out the race between module unloading and put_ref() */
304 slow_work_put_ref(work);
305 slow_work_done_thread_processing(id, work);
306
223 return true; 307 return true;
224 308
225auto_requeue: 309auto_requeue:
@@ -227,15 +311,61 @@ auto_requeue:
227 * - we transfer our ref on the item back to the appropriate queue 311 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already 312 * - don't wake another thread up as we're awake already
229 */ 313 */
314 slow_work_mark_time(work);
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 315 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue); 316 list_add_tail(&work->link, &vslow_work_queue);
232 else 317 else
233 list_add_tail(&work->link, &slow_work_queue); 318 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock); 319 spin_unlock_irq(&slow_work_queue_lock);
320 slow_work_clear_thread_processing(id);
235 return true; 321 return true;
236} 322}
237 323
238/** 324/**
325 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
326 * @work: The work item under execution that wants to sleep
327 * @_timeout: Scheduler sleep timeout
328 *
329 * Allow a requeueable work item to sleep on a slow-work processor thread until
330 * that thread is needed to do some other work or the sleep is interrupted by
331 * some other event.
332 *
333 * The caller must set up a wake up event before calling this and must have set
334 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
335 * condition before calling this function as no test is made here.
336 *
337 * False is returned if there is nothing on the queue; true is returned if the
338 * work item should be requeued
339 */
340bool slow_work_sleep_till_thread_needed(struct slow_work *work,
341 signed long *_timeout)
342{
343 wait_queue_head_t *wfo_wq;
344 struct list_head *queue;
345
346 DEFINE_WAIT(wait);
347
348 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
349 wfo_wq = &vslow_work_queue_waits_for_occupation;
350 queue = &vslow_work_queue;
351 } else {
352 wfo_wq = &slow_work_queue_waits_for_occupation;
353 queue = &slow_work_queue;
354 }
355
356 if (!list_empty(queue))
357 return true;
358
359 add_wait_queue_exclusive(wfo_wq, &wait);
360 if (list_empty(queue))
361 *_timeout = schedule_timeout(*_timeout);
362 finish_wait(wfo_wq, &wait);
363
364 return !list_empty(queue);
365}
366EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
367
368/**
239 * slow_work_enqueue - Schedule a slow work item for processing 369 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue 370 * @work: The work item to queue
241 * 371 *
@@ -260,16 +390,22 @@ auto_requeue:
260 * allowed to pick items to execute. This ensures that very slow items won't 390 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow. 391 * overly block ones that are just ordinarily slow.
262 * 392 *
263 * Returns 0 if successful, -EAGAIN if not. 393 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
394 * attempted queued)
264 */ 395 */
265int slow_work_enqueue(struct slow_work *work) 396int slow_work_enqueue(struct slow_work *work)
266{ 397{
398 wait_queue_head_t *wfo_wq;
399 struct list_head *queue;
267 unsigned long flags; 400 unsigned long flags;
401 int ret;
402
403 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
404 return -ECANCELED;
268 405
269 BUG_ON(slow_work_user_count <= 0); 406 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work); 407 BUG_ON(!work);
271 BUG_ON(!work->ops); 408 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref);
273 409
274 /* when honouring an enqueue request, we only promise that we will run 410 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once 411 * the work function in the future; we do not promise to run it once
@@ -280,8 +416,19 @@ int slow_work_enqueue(struct slow_work *work)
280 * maintaining our promise 416 * maintaining our promise
281 */ 417 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { 418 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
419 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
420 wfo_wq = &vslow_work_queue_waits_for_occupation;
421 queue = &vslow_work_queue;
422 } else {
423 wfo_wq = &slow_work_queue_waits_for_occupation;
424 queue = &slow_work_queue;
425 }
426
283 spin_lock_irqsave(&slow_work_queue_lock, flags); 427 spin_lock_irqsave(&slow_work_queue_lock, flags);
284 428
429 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
430 goto cancelled;
431
285 /* we promise that we will not attempt to execute the work 432 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously 433 * function in more than one thread simultaneously
287 * 434 *
@@ -299,25 +446,221 @@ int slow_work_enqueue(struct slow_work *work)
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { 446 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); 447 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else { 448 } else {
302 if (work->ops->get_ref(work) < 0) 449 ret = slow_work_get_ref(work);
303 goto cant_get_ref; 450 if (ret < 0)
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 451 goto failed;
305 list_add_tail(&work->link, &vslow_work_queue); 452 slow_work_mark_time(work);
306 else 453 list_add_tail(&work->link, queue);
307 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq); 454 wake_up(&slow_work_thread_wq);
455
456 /* if someone who could be requeued is sleeping on a
457 * thread, then ask them to yield their thread */
458 if (work->link.prev == queue)
459 wake_up(wfo_wq);
309 } 460 }
310 461
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 462 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 } 463 }
313 return 0; 464 return 0;
314 465
315cant_get_ref: 466cancelled:
467 ret = -ECANCELED;
468failed:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 469 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN; 470 return ret;
318} 471}
319EXPORT_SYMBOL(slow_work_enqueue); 472EXPORT_SYMBOL(slow_work_enqueue);
320 473
/*
 * Action routine handed to wait_on_bit() by slow_work_cancel(): give up the
 * CPU and report success (0).  The @word argument is not used.
 */
static int slow_work_wait(void *word)
{
	schedule();
	return 0;
}
479
480/**
481 * slow_work_cancel - Cancel a slow work item
482 * @work: The work item to cancel
483 *
484 * This function will cancel a previously enqueued work item. If we cannot
485 * cancel the work item, it is guaranteed to have run when this function
486 * returns.
487 */
488void slow_work_cancel(struct slow_work *work)
489{
490 bool wait = true, put = false;
491
492 set_bit(SLOW_WORK_CANCELLING, &work->flags);
493 smp_mb();
494
495 /* if the work item is a delayed work item with an active timer, we
496 * need to wait for the timer to finish _before_ getting the spinlock,
497 * lest we deadlock against the timer routine
498 *
499 * the timer routine will leave DELAYED set if it notices the
500 * CANCELLING flag in time
501 */
502 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
503 struct delayed_slow_work *dwork =
504 container_of(work, struct delayed_slow_work, work);
505 del_timer_sync(&dwork->timer);
506 }
507
508 spin_lock_irq(&slow_work_queue_lock);
509
510 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
511 /* the timer routine aborted or never happened, so we are left
512 * holding the timer's reference on the item and should just
513 * drop the pending flag and wait for any ongoing execution to
514 * finish */
515 struct delayed_slow_work *dwork =
516 container_of(work, struct delayed_slow_work, work);
517
518 BUG_ON(timer_pending(&dwork->timer));
519 BUG_ON(!list_empty(&work->link));
520
521 clear_bit(SLOW_WORK_DELAYED, &work->flags);
522 put = true;
523 clear_bit(SLOW_WORK_PENDING, &work->flags);
524
525 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
526 !list_empty(&work->link)) {
527 /* the link in the pending queue holds a reference on the item
528 * that we will need to release */
529 list_del_init(&work->link);
530 wait = false;
531 put = true;
532 clear_bit(SLOW_WORK_PENDING, &work->flags);
533
534 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
535 /* the executor is holding our only reference on the item, so
536 * we merely need to wait for it to finish executing */
537 clear_bit(SLOW_WORK_PENDING, &work->flags);
538 }
539
540 spin_unlock_irq(&slow_work_queue_lock);
541
542 /* the EXECUTING flag is set by the executor whilst the spinlock is set
543 * and before the item is dequeued - so assuming the above doesn't
544 * actually dequeue it, simply waiting for the EXECUTING flag to be
545 * released here should be sufficient */
546 if (wait)
547 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
548 TASK_UNINTERRUPTIBLE);
549
550 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
551 if (put)
552 slow_work_put_ref(work);
553}
554EXPORT_SYMBOL(slow_work_cancel);
555
556/*
557 * Handle expiry of the delay timer, indicating that a delayed slow work item
558 * should now be queued if not cancelled
559 */
560static void delayed_slow_work_timer(unsigned long data)
561{
562 wait_queue_head_t *wfo_wq;
563 struct list_head *queue;
564 struct slow_work *work = (struct slow_work *) data;
565 unsigned long flags;
566 bool queued = false, put = false, first = false;
567
568 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
569 wfo_wq = &vslow_work_queue_waits_for_occupation;
570 queue = &vslow_work_queue;
571 } else {
572 wfo_wq = &slow_work_queue_waits_for_occupation;
573 queue = &slow_work_queue;
574 }
575
576 spin_lock_irqsave(&slow_work_queue_lock, flags);
577 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
578 clear_bit(SLOW_WORK_DELAYED, &work->flags);
579
580 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
581 /* we discard the reference the timer was holding in
582 * favour of the one the executor holds */
583 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
584 put = true;
585 } else {
586 slow_work_mark_time(work);
587 list_add_tail(&work->link, queue);
588 queued = true;
589 if (work->link.prev == queue)
590 first = true;
591 }
592 }
593
594 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
595 if (put)
596 slow_work_put_ref(work);
597 if (first)
598 wake_up(wfo_wq);
599 if (queued)
600 wake_up(&slow_work_thread_wq);
601}
602
603/**
604 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
605 * @dwork: The delayed work item to queue
606 * @delay: When to start executing the work, in jiffies from now
607 *
608 * This is similar to slow_work_enqueue(), but it adds a delay before the work
609 * is actually queued for processing.
610 *
611 * The item can have delayed processing requested on it whilst it is being
612 * executed. The delay will begin immediately, and if it expires before the
613 * item finishes executing, the item will be placed back on the queue when it
614 * has done executing.
615 */
616int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
617 unsigned long delay)
618{
619 struct slow_work *work = &dwork->work;
620 unsigned long flags;
621 int ret;
622
623 if (delay == 0)
624 return slow_work_enqueue(&dwork->work);
625
626 BUG_ON(slow_work_user_count <= 0);
627 BUG_ON(!work);
628 BUG_ON(!work->ops);
629
630 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
631 return -ECANCELED;
632
633 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
634 spin_lock_irqsave(&slow_work_queue_lock, flags);
635
636 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
637 goto cancelled;
638
639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work);
641 if (ret < 0)
642 goto cant_get_ref;
643
644 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
645 BUG();
646 dwork->timer.expires = jiffies + delay;
647 dwork->timer.data = (unsigned long) work;
648 dwork->timer.function = delayed_slow_work_timer;
649 add_timer(&dwork->timer);
650
651 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
652 }
653
654 return 0;
655
656cancelled:
657 ret = -ECANCELED;
658cant_get_ref:
659 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
660 return ret;
661}
662EXPORT_SYMBOL(delayed_slow_work_enqueue);
663
321/* 664/*
322 * Schedule a cull of the thread pool at some time in the near future 665 * Schedule a cull of the thread pool at some time in the near future
323 */ 666 */
@@ -368,13 +711,23 @@ static inline bool slow_work_available(int vsmax)
368 */ 711 */
369static int slow_work_thread(void *_data) 712static int slow_work_thread(void *_data)
370{ 713{
371 int vsmax; 714 int vsmax, id;
372 715
373 DEFINE_WAIT(wait); 716 DEFINE_WAIT(wait);
374 717
375 set_freezable(); 718 set_freezable();
376 set_user_nice(current, -5); 719 set_user_nice(current, -5);
377 720
721 /* allocate ourselves an ID */
722 spin_lock_irq(&slow_work_queue_lock);
723 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
724 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
725 __set_bit(id, slow_work_ids);
726 slow_work_set_thread_pid(id, current->pid);
727 spin_unlock_irq(&slow_work_queue_lock);
728
729 sprintf(current->comm, "kslowd%03u", id);
730
378 for (;;) { 731 for (;;) {
379 vsmax = vslow_work_proportion; 732 vsmax = vslow_work_proportion;
380 vsmax *= atomic_read(&slow_work_thread_count); 733 vsmax *= atomic_read(&slow_work_thread_count);
@@ -395,7 +748,7 @@ static int slow_work_thread(void *_data)
395 vsmax *= atomic_read(&slow_work_thread_count); 748 vsmax *= atomic_read(&slow_work_thread_count);
396 vsmax /= 100; 749 vsmax /= 100;
397 750
398 if (slow_work_available(vsmax) && slow_work_execute()) { 751 if (slow_work_available(vsmax) && slow_work_execute(id)) {
399 cond_resched(); 752 cond_resched();
400 if (list_empty(&slow_work_queue) && 753 if (list_empty(&slow_work_queue) &&
401 list_empty(&vslow_work_queue) && 754 list_empty(&vslow_work_queue) &&
@@ -412,6 +765,11 @@ static int slow_work_thread(void *_data)
412 break; 765 break;
413 } 766 }
414 767
768 spin_lock_irq(&slow_work_queue_lock);
769 slow_work_set_thread_pid(id, 0);
770 __clear_bit(id, slow_work_ids);
771 spin_unlock_irq(&slow_work_queue_lock);
772
415 if (atomic_dec_and_test(&slow_work_thread_count)) 773 if (atomic_dec_and_test(&slow_work_thread_count))
416 complete_and_exit(&slow_work_last_thread_exited, 0); 774 complete_and_exit(&slow_work_last_thread_exited, 0);
417 return 0; 775 return 0;
@@ -427,21 +785,6 @@ static void slow_work_cull_timeout(unsigned long data)
427} 785}
428 786
429/* 787/*
430 * Get a reference on slow work thread starter
431 */
432static int slow_work_new_thread_get_ref(struct slow_work *work)
433{
434 return 0;
435}
436
437/*
438 * Drop a reference on slow work thread starter
439 */
440static void slow_work_new_thread_put_ref(struct slow_work *work)
441{
442}
443
444/*
445 * Start a new slow work thread 788 * Start a new slow work thread
446 */ 789 */
447static void slow_work_new_thread_execute(struct slow_work *work) 790static void slow_work_new_thread_execute(struct slow_work *work)
@@ -475,9 +818,11 @@ static void slow_work_new_thread_execute(struct slow_work *work)
475} 818}
476 819
477static const struct slow_work_ops slow_work_new_thread_ops = { 820static const struct slow_work_ops slow_work_new_thread_ops = {
478 .get_ref = slow_work_new_thread_get_ref, 821 .owner = THIS_MODULE,
479 .put_ref = slow_work_new_thread_put_ref,
480 .execute = slow_work_new_thread_execute, 822 .execute = slow_work_new_thread_execute,
823#ifdef CONFIG_SLOW_WORK_DEBUG
824 .desc = slow_work_new_thread_desc,
825#endif
481}; 826};
482 827
483/* 828/*
@@ -546,12 +891,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
546 891
547/** 892/**
548 * slow_work_register_user - Register a user of the facility 893 * slow_work_register_user - Register a user of the facility
894 * @module: The module about to make use of the facility
549 * 895 *
550 * Register a user of the facility, starting up the initial threads if there 896 * Register a user of the facility, starting up the initial threads if there
551 * aren't any other users at this point. This will return 0 if successful, or 897 * aren't any other users at this point. This will return 0 if successful, or
552 * an error if not. 898 * an error if not.
553 */ 899 */
554int slow_work_register_user(void) 900int slow_work_register_user(struct module *module)
555{ 901{
556 struct task_struct *p; 902 struct task_struct *p;
557 int loop; 903 int loop;
@@ -598,14 +944,81 @@ error:
598} 944}
599EXPORT_SYMBOL(slow_work_register_user); 945EXPORT_SYMBOL(slow_work_register_user);
600 946
947/*
948 * wait for all outstanding items from the calling module to complete
949 * - note that more items may be queued whilst we're waiting
950 */
static void slow_work_wait_for_items(struct module *module)
{
#ifdef CONFIG_MODULES
	DECLARE_WAITQUEUE(waiter, current);
	struct slow_work *item;
	int id;

	/* only one unregistration may use the slow_work_unreg_* state at a
	 * time */
	mutex_lock(&slow_work_unreg_sync_lock);
	add_wait_queue(&slow_work_unreg_wq, &waiter);

	for (;;) {
		spin_lock_irq(&slow_work_queue_lock);

		/* stage one: if the module still has items queued, wait on
		 * the one nearest the tail of each list (scanned in reverse,
		 * very slow queue first) */
		list_for_each_entry_reverse(item, &vslow_work_queue, link) {
			if (item->owner != module)
				continue;
			set_current_state(TASK_UNINTERRUPTIBLE);
			slow_work_unreg_work_item = item;
			goto sleep;
		}
		list_for_each_entry_reverse(item, &slow_work_queue, link) {
			if (item->owner != module)
				continue;
			set_current_state(TASK_UNINTERRUPTIBLE);
			slow_work_unreg_work_item = item;
			goto sleep;
		}

		/* stage two: nothing is queued, so wait for any thread that
		 * is still processing one of the module's items
		 *
		 * NOTE(review): the task state is not changed on this path,
		 * so the schedule() below looks like it acts as a yield and
		 * retry rather than a sleep - confirm that this is intended
		 */
		slow_work_unreg_module = module;
		smp_mb();
		for (id = 0; id < SLOW_WORK_THREAD_LIMIT; id++) {
			if (slow_work_thread_processing[id] == module)
				goto sleep;
		}

		/* nothing queued and nothing executing: we're done */
		spin_unlock_irq(&slow_work_queue_lock);
		break;

	sleep:
		spin_unlock_irq(&slow_work_queue_lock);
		schedule();
		/* forget what we were waiting for and rescan from the top */
		slow_work_unreg_work_item = NULL;
		slow_work_unreg_module = NULL;
	}

	remove_wait_queue(&slow_work_unreg_wq, &waiter);
	mutex_unlock(&slow_work_unreg_sync_lock);
#endif /* CONFIG_MODULES */
}
1002
601/** 1003/**
602 * slow_work_unregister_user - Unregister a user of the facility 1004 * slow_work_unregister_user - Unregister a user of the facility
1005 * @module: The module whose items should be cleared
603 * 1006 *
604 * Unregister a user of the facility, killing all the threads if this was the 1007 * Unregister a user of the facility, killing all the threads if this was the
605 * last one. 1008 * last one.
1009 *
1010 * This waits for all the work items belonging to the nominated module to go
1011 * away before proceeding.
606 */ 1012 */
607void slow_work_unregister_user(void) 1013void slow_work_unregister_user(struct module *module)
608{ 1014{
1015 /* first of all, wait for all outstanding items from the calling module
1016 * to complete */
1017 if (module)
1018 slow_work_wait_for_items(module);
1019
1020 /* then we can actually go about shutting down the facility if need
1021 * be */
609 mutex_lock(&slow_work_user_lock); 1022 mutex_lock(&slow_work_user_lock);
610 1023
611 BUG_ON(slow_work_user_count <= 0); 1024 BUG_ON(slow_work_user_count <= 0);
@@ -639,6 +1052,16 @@ static int __init init_slow_work(void)
639 if (slow_work_max_max_threads < nr_cpus * 2) 1052 if (slow_work_max_max_threads < nr_cpus * 2)
640 slow_work_max_max_threads = nr_cpus * 2; 1053 slow_work_max_max_threads = nr_cpus * 2;
641#endif 1054#endif
1055#ifdef CONFIG_SLOW_WORK_DEBUG
1056 {
1057 struct dentry *dbdir;
1058
1059 dbdir = debugfs_create_dir("slow_work", NULL);
1060 if (dbdir && !IS_ERR(dbdir))
1061 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1062 NULL, &slow_work_runqueue_fops);
1063 }
1064#endif
642 return 0; 1065 return 0;
643} 1066}
644 1067
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
new file mode 100644
index 00000000000..321f3c59d73
--- /dev/null
+++ b/kernel/slow-work.h
@@ -0,0 +1,72 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_PROC
47 slow_work_pids[id] = pid;
48#endif
49}
50
/*
 * Stamp a work item with the current time for the debugging display.
 *
 * This is conditional on CONFIG_SLOW_WORK_DEBUG, the same option that guards
 * the rest of the debugging data; otherwise it is a no-op.
 */
static inline void slow_work_mark_time(struct slow_work *work)
{
#ifdef CONFIG_SLOW_WORK_DEBUG
	work->mark = CURRENT_TIME;
#endif
}
57
/*
 * Note that the thread in slot @id has begun executing @work, for the
 * debugging display.
 *
 * slow_work_execs[] only exists when CONFIG_SLOW_WORK_DEBUG is set, so that is
 * the option this helper must be conditional upon; otherwise it is a no-op.
 */
static inline void slow_work_begin_exec(int id, struct slow_work *work)
{
#ifdef CONFIG_SLOW_WORK_DEBUG
	slow_work_execs[id] = work;
#endif
}
64
/*
 * Note that the thread in slot @id has finished executing @work, for the
 * debugging display.  The slot is cleared under slow_work_execs_lock.
 *
 * slow_work_execs[] and slow_work_execs_lock only exist when
 * CONFIG_SLOW_WORK_DEBUG is set, so that is the option this helper must be
 * conditional upon; otherwise it is a no-op.
 */
static inline void slow_work_end_exec(int id, struct slow_work *work)
{
#ifdef CONFIG_SLOW_WORK_DEBUG
	write_lock(&slow_work_execs_lock);
	slow_work_execs[id] = NULL;
	write_unlock(&slow_work_execs_lock);
#endif
}
diff --git a/kernel/smp.c b/kernel/smp.c
index c9d1c7835c2..f1040842244 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,11 +16,11 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16 16
17static struct { 17static struct {
18 struct list_head queue; 18 struct list_head queue;
19 spinlock_t lock; 19 raw_spinlock_t lock;
20} call_function __cacheline_aligned_in_smp = 20} call_function __cacheline_aligned_in_smp =
21 { 21 {
22 .queue = LIST_HEAD_INIT(call_function.queue), 22 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), 23 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
24 }; 24 };
25 25
26enum { 26enum {
@@ -35,7 +35,7 @@ struct call_function_data {
35 35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
@@ -80,7 +80,7 @@ static int __cpuinit init_call_single_data(void)
80 for_each_possible_cpu(i) { 80 for_each_possible_cpu(i) {
81 struct call_single_queue *q = &per_cpu(call_single_queue, i); 81 struct call_single_queue *q = &per_cpu(call_single_queue, i);
82 82
83 spin_lock_init(&q->lock); 83 raw_spin_lock_init(&q->lock);
84 INIT_LIST_HEAD(&q->list); 84 INIT_LIST_HEAD(&q->list);
85 } 85 }
86 86
@@ -141,10 +141,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
141 unsigned long flags; 141 unsigned long flags;
142 int ipi; 142 int ipi;
143 143
144 spin_lock_irqsave(&dst->lock, flags); 144 raw_spin_lock_irqsave(&dst->lock, flags);
145 ipi = list_empty(&dst->list); 145 ipi = list_empty(&dst->list);
146 list_add_tail(&data->list, &dst->list); 146 list_add_tail(&data->list, &dst->list);
147 spin_unlock_irqrestore(&dst->lock, flags); 147 raw_spin_unlock_irqrestore(&dst->lock, flags);
148 148
149 /* 149 /*
150 * The list addition should be visible before sending the IPI 150 * The list addition should be visible before sending the IPI
@@ -171,7 +171,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171void generic_smp_call_function_interrupt(void) 171void generic_smp_call_function_interrupt(void)
172{ 172{
173 struct call_function_data *data; 173 struct call_function_data *data;
174 int cpu = get_cpu(); 174 int cpu = smp_processor_id();
175 175
176 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online. 177 * Shouldn't receive this interrupt on a cpu that is not yet online.
@@ -201,9 +201,9 @@ void generic_smp_call_function_interrupt(void)
201 refs = atomic_dec_return(&data->refs); 201 refs = atomic_dec_return(&data->refs);
202 WARN_ON(refs < 0); 202 WARN_ON(refs < 0);
203 if (!refs) { 203 if (!refs) {
204 spin_lock(&call_function.lock); 204 raw_spin_lock(&call_function.lock);
205 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
206 spin_unlock(&call_function.lock); 206 raw_spin_unlock(&call_function.lock);
207 } 207 }
208 208
209 if (refs) 209 if (refs)
@@ -212,7 +212,6 @@ void generic_smp_call_function_interrupt(void)
212 csd_unlock(&data->csd); 212 csd_unlock(&data->csd);
213 } 213 }
214 214
215 put_cpu();
216} 215}
217 216
218/* 217/*
@@ -230,9 +229,9 @@ void generic_smp_call_function_single_interrupt(void)
230 */ 229 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 230 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232 231
233 spin_lock(&q->lock); 232 raw_spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 233 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 234 raw_spin_unlock(&q->lock);
236 235
237 while (!list_empty(&list)) { 236 while (!list_empty(&list)) {
238 struct call_single_data *data; 237 struct call_single_data *data;
@@ -265,9 +264,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
265 * @info: An arbitrary pointer to pass to the function. 264 * @info: An arbitrary pointer to pass to the function.
266 * @wait: If true, wait until function has completed on other CPUs. 265 * @wait: If true, wait until function has completed on other CPUs.
267 * 266 *
268 * Returns 0 on success, else a negative status code. Note that @wait 267 * Returns 0 on success, else a negative status code.
269 * will be implicitly turned on in case of allocation failures, since
270 * we fall back to on-stack allocation.
271 */ 268 */
272int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 269int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
273 int wait) 270 int wait)
@@ -321,6 +318,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
321} 318}
322EXPORT_SYMBOL(smp_call_function_single); 319EXPORT_SYMBOL(smp_call_function_single);
323 320
321/*
322 * smp_call_function_any - Run a function on any of the given cpus
323 * @mask: The mask of cpus it can run on.
324 * @func: The function to run. This must be fast and non-blocking.
325 * @info: An arbitrary pointer to pass to the function.
326 * @wait: If true, wait until function has completed.
327 *
328 * Returns 0 on success, else a negative status code (if no cpus were online).
329 * Note that @wait will be implicitly turned on in case of allocation failures,
330 * since we fall back to on-stack allocation.
331 *
332 * Selection preference:
333 * 1) current cpu if in @mask
334 * 2) any cpu of current node if in @mask
335 * 3) any other online cpu in @mask
336 */
337int smp_call_function_any(const struct cpumask *mask,
338 void (*func)(void *info), void *info, int wait)
339{
340 unsigned int cpu;
341 const struct cpumask *nodemask;
342 int ret;
343
344 /* Try for same CPU (cheapest) */
345 cpu = get_cpu();
346 if (cpumask_test_cpu(cpu, mask))
347 goto call;
348
349 /* Try for same node. */
350 nodemask = cpumask_of_node(cpu_to_node(cpu));
351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
352 cpu = cpumask_next_and(cpu, nodemask, mask)) {
353 if (cpu_online(cpu))
354 goto call;
355 }
356
357 /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
358 cpu = cpumask_any_and(mask, cpu_online_mask);
359call:
360 ret = smp_call_function_single(cpu, func, info, wait);
361 put_cpu();
362 return ret;
363}
364EXPORT_SYMBOL_GPL(smp_call_function_any);
365
324/** 366/**
325 * __smp_call_function_single(): Run a function on another CPU 367 * __smp_call_function_single(): Run a function on another CPU
326 * @cpu: The CPU to run on. 368 * @cpu: The CPU to run on.
@@ -355,9 +397,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
355 * @wait: If true, wait (atomically) until function has completed 397 * @wait: If true, wait (atomically) until function has completed
356 * on other CPUs. 398 * on other CPUs.
357 * 399 *
358 * If @wait is true, then returns once @func has returned. Note that @wait 400 * If @wait is true, then returns once @func has returned.
359 * will be implicitly turned on in case of allocation failures, since
360 * we fall back to on-stack allocation.
361 * 401 *
362 * You must not call this function with disabled interrupts or from a 402 * You must not call this function with disabled interrupts or from a
363 * hardware interrupt handler or from a bottom half handler. Preemption 403 * hardware interrupt handler or from a bottom half handler. Preemption
@@ -408,14 +448,14 @@ void smp_call_function_many(const struct cpumask *mask,
408 cpumask_clear_cpu(this_cpu, data->cpumask); 448 cpumask_clear_cpu(this_cpu, data->cpumask);
409 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 449 atomic_set(&data->refs, cpumask_weight(data->cpumask));
410 450
411 spin_lock_irqsave(&call_function.lock, flags); 451 raw_spin_lock_irqsave(&call_function.lock, flags);
412 /* 452 /*
413 * Place entry at the _HEAD_ of the list, so that any cpu still 453 * Place entry at the _HEAD_ of the list, so that any cpu still
414 * observing the entry in generic_smp_call_function_interrupt() 454 * observing the entry in generic_smp_call_function_interrupt()
415 * will not miss any other list entries: 455 * will not miss any other list entries:
416 */ 456 */
417 list_add_rcu(&data->csd.list, &call_function.queue); 457 list_add_rcu(&data->csd.list, &call_function.queue);
418 spin_unlock_irqrestore(&call_function.lock, flags); 458 raw_spin_unlock_irqrestore(&call_function.lock, flags);
419 459
420 /* 460 /*
421 * Make the list addition visible before sending the ipi. 461 * Make the list addition visible before sending the ipi.
@@ -443,8 +483,7 @@ EXPORT_SYMBOL(smp_call_function_many);
443 * Returns 0. 483 * Returns 0.
444 * 484 *
445 * If @wait is true, then returns once @func has returned; otherwise 485 * If @wait is true, then returns once @func has returned; otherwise
446 * it returns just before the target cpu calls @func. In case of allocation 486 * it returns just before the target cpu calls @func.
447 * failure, @wait will be implicitly turned on.
448 * 487 *
449 * You must not call this function with disabled interrupts or from a 488 * You must not call this function with disabled interrupts or from a
450 * hardware interrupt handler or from a bottom half handler. 489 * hardware interrupt handler or from a bottom half handler.
@@ -461,20 +500,20 @@ EXPORT_SYMBOL(smp_call_function);
461 500
462void ipi_call_lock(void) 501void ipi_call_lock(void)
463{ 502{
464 spin_lock(&call_function.lock); 503 raw_spin_lock(&call_function.lock);
465} 504}
466 505
467void ipi_call_unlock(void) 506void ipi_call_unlock(void)
468{ 507{
469 spin_unlock(&call_function.lock); 508 raw_spin_unlock(&call_function.lock);
470} 509}
471 510
472void ipi_call_lock_irq(void) 511void ipi_call_lock_irq(void)
473{ 512{
474 spin_lock_irq(&call_function.lock); 513 raw_spin_lock_irq(&call_function.lock);
475} 514}
476 515
477void ipi_call_unlock_irq(void) 516void ipi_call_unlock_irq(void)
478{ 517{
479 spin_unlock_irq(&call_function.lock); 518 raw_spin_unlock_irq(&call_function.lock);
480} 519}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5216e..7c1a67ef027 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -302,9 +302,9 @@ void irq_exit(void)
302 if (!in_interrupt() && local_softirq_pending()) 302 if (!in_interrupt() && local_softirq_pending())
303 invoke_softirq(); 303 invoke_softirq();
304 304
305 rcu_irq_exit();
305#ifdef CONFIG_NO_HZ 306#ifdef CONFIG_NO_HZ
306 /* Make sure that timer wheel updates are propagated */ 307 /* Make sure that timer wheel updates are propagated */
307 rcu_irq_exit();
308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
309 tick_nohz_stop_sched_tick(0); 309 tick_nohz_stop_sched_tick(0);
310#endif 310#endif
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
@@ -697,7 +692,7 @@ void __init softirq_init(void)
697 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 692 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
698} 693}
699 694
700static int ksoftirqd(void * __bind_cpu) 695static int run_ksoftirqd(void * __bind_cpu)
701{ 696{
702 set_current_state(TASK_INTERRUPTIBLE); 697 set_current_state(TASK_INTERRUPTIBLE);
703 698
@@ -810,7 +805,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
810 switch (action) { 805 switch (action) {
811 case CPU_UP_PREPARE: 806 case CPU_UP_PREPARE:
812 case CPU_UP_PREPARE_FROZEN: 807 case CPU_UP_PREPARE_FROZEN:
813 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
814 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
815 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
816 return NOTIFY_BAD; 811 return NOTIFY_BAD;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 81324d12eb3..0d4c7898ab8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -22,9 +22,10 @@
22 22
23static DEFINE_SPINLOCK(print_lock); 23static DEFINE_SPINLOCK(print_lock);
24 24
25static DEFINE_PER_CPU(unsigned long, touch_timestamp); 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, print_timestamp); 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -70,22 +71,28 @@ static void __touch_softlockup_watchdog(void)
70{ 71{
71 int this_cpu = raw_smp_processor_id(); 72 int this_cpu = raw_smp_processor_id();
72 73
73 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); 74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
74} 75}
75 76
76void touch_softlockup_watchdog(void) 77void touch_softlockup_watchdog(void)
77{ 78{
78 __raw_get_cpu_var(touch_timestamp) = 0; 79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
85 92
86 /* Cause each CPU to re-update its timestamp rather than complain */ 93 /* Cause each CPU to re-update its timestamp rather than complain */
87 for_each_online_cpu(cpu) 94 for_each_online_cpu(cpu)
88 per_cpu(touch_timestamp, cpu) = 0; 95 per_cpu(softlockup_touch_ts, cpu) = 0;
89} 96}
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 98
@@ -104,28 +111,36 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
104void softlockup_tick(void) 111void softlockup_tick(void)
105{ 112{
106 int this_cpu = smp_processor_id(); 113 int this_cpu = smp_processor_id();
107 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
108 unsigned long print_timestamp; 115 unsigned long print_ts;
109 struct pt_regs *regs = get_irq_regs(); 116 struct pt_regs *regs = get_irq_regs();
110 unsigned long now; 117 unsigned long now;
111 118
112 /* Is detection switched off? */ 119 /* Is detection switched off? */
113 if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { 120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
114 /* Be sure we don't false trigger if switched back on */ 121 /* Be sure we don't false trigger if switched back on */
115 if (touch_timestamp) 122 if (touch_ts)
116 per_cpu(touch_timestamp, this_cpu) = 0; 123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
117 return; 124 return;
118 } 125 }
119 126
120 if (touch_timestamp == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
124 139
125 print_timestamp = per_cpu(print_timestamp, this_cpu); 140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
126 141
127 /* report at most once a second */ 142 /* report at most once a second */
128 if (print_timestamp == touch_timestamp || did_panic) 143 if (print_ts == touch_ts || did_panic)
129 return; 144 return;
130 145
131 /* do not print during early bootup: */ 146 /* do not print during early bootup: */
@@ -140,18 +155,18 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 156 * threshold timespan.
142 */ 157 */
143 if (now > touch_timestamp + softlockup_thresh/2) 158 if (now > touch_ts + softlockup_thresh/2)
144 wake_up_process(per_cpu(watchdog_task, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 160
146 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
147 if (now <= (touch_timestamp + softlockup_thresh)) 162 if (now <= (touch_ts + softlockup_thresh))
148 return; 163 return;
149 164
150 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
151 166
152 spin_lock(&print_lock); 167 spin_lock(&print_lock);
153 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
154 this_cpu, now - touch_timestamp, 169 this_cpu, now - touch_ts,
155 current->comm, task_pid_nr(current)); 170 current->comm, task_pid_nr(current));
156 print_modules(); 171 print_modules();
157 print_irqtrace_events(current); 172 print_irqtrace_events(current);
@@ -209,32 +224,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
209 switch (action) { 224 switch (action) {
210 case CPU_UP_PREPARE: 225 case CPU_UP_PREPARE:
211 case CPU_UP_PREPARE_FROZEN: 226 case CPU_UP_PREPARE_FROZEN:
212 BUG_ON(per_cpu(watchdog_task, hotcpu)); 227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
213 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
214 if (IS_ERR(p)) { 229 if (IS_ERR(p)) {
215 printk(KERN_ERR "watchdog for %i failed\n", hotcpu); 230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
216 return NOTIFY_BAD; 231 return NOTIFY_BAD;
217 } 232 }
218 per_cpu(touch_timestamp, hotcpu) = 0; 233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
219 per_cpu(watchdog_task, hotcpu) = p; 234 per_cpu(softlockup_watchdog, hotcpu) = p;
220 kthread_bind(p, hotcpu); 235 kthread_bind(p, hotcpu);
221 break; 236 break;
222 case CPU_ONLINE: 237 case CPU_ONLINE:
223 case CPU_ONLINE_FROZEN: 238 case CPU_ONLINE_FROZEN:
224 wake_up_process(per_cpu(watchdog_task, hotcpu)); 239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
225 break; 240 break;
226#ifdef CONFIG_HOTPLUG_CPU 241#ifdef CONFIG_HOTPLUG_CPU
227 case CPU_UP_CANCELED: 242 case CPU_UP_CANCELED:
228 case CPU_UP_CANCELED_FROZEN: 243 case CPU_UP_CANCELED_FROZEN:
229 if (!per_cpu(watchdog_task, hotcpu)) 244 if (!per_cpu(softlockup_watchdog, hotcpu))
230 break; 245 break;
231 /* Unbind so it can run. Fall thru. */ 246 /* Unbind so it can run. Fall thru. */
232 kthread_bind(per_cpu(watchdog_task, hotcpu), 247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
233 cpumask_any(cpu_online_mask)); 248 cpumask_any(cpu_online_mask));
234 case CPU_DEAD: 249 case CPU_DEAD:
235 case CPU_DEAD_FROZEN: 250 case CPU_DEAD_FROZEN:
236 p = per_cpu(watchdog_task, hotcpu); 251 p = per_cpu(softlockup_watchdog, hotcpu);
237 per_cpu(watchdog_task, hotcpu) = NULL; 252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
238 kthread_stop(p); 253 kthread_stop(p);
239 break; 254 break;
240#endif /* CONFIG_HOTPLUG_CPU */ 255#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2..be6517fb9c1 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,193 +21,72 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
25int __lockfunc _spin_trylock(spinlock_t *lock)
26{
27 return __spin_trylock(lock);
28}
29EXPORT_SYMBOL(_spin_trylock);
30#endif
31
32#ifndef _read_trylock
33int __lockfunc _read_trylock(rwlock_t *lock)
34{
35 return __read_trylock(lock);
36}
37EXPORT_SYMBOL(_read_trylock);
38#endif
39
40#ifndef _write_trylock
41int __lockfunc _write_trylock(rwlock_t *lock)
42{
43 return __write_trylock(lock);
44}
45EXPORT_SYMBOL(_write_trylock);
46#endif
47
48/* 24/*
49 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
50 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 26 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
51 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 27 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
52 */ 28 */
53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 29#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
54
55#ifndef _read_lock
56void __lockfunc _read_lock(rwlock_t *lock)
57{
58 __read_lock(lock);
59}
60EXPORT_SYMBOL(_read_lock);
61#endif
62
63#ifndef _spin_lock_irqsave
64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
65{
66 return __spin_lock_irqsave(lock);
67}
68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
70
71#ifndef _spin_lock_irq
72void __lockfunc _spin_lock_irq(spinlock_t *lock)
73{
74 __spin_lock_irq(lock);
75}
76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
78
79#ifndef _spin_lock_bh
80void __lockfunc _spin_lock_bh(spinlock_t *lock)
81{
82 __spin_lock_bh(lock);
83}
84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
86
87#ifndef _read_lock_irqsave
88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
89{
90 return __read_lock_irqsave(lock);
91}
92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
94
95#ifndef _read_lock_irq
96void __lockfunc _read_lock_irq(rwlock_t *lock)
97{
98 __read_lock_irq(lock);
99}
100EXPORT_SYMBOL(_read_lock_irq);
101#endif
102
103#ifndef _read_lock_bh
104void __lockfunc _read_lock_bh(rwlock_t *lock)
105{
106 __read_lock_bh(lock);
107}
108EXPORT_SYMBOL(_read_lock_bh);
109#endif
110
111#ifndef _write_lock_irqsave
112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
113{
114 return __write_lock_irqsave(lock);
115}
116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
118
119#ifndef _write_lock_irq
120void __lockfunc _write_lock_irq(rwlock_t *lock)
121{
122 __write_lock_irq(lock);
123}
124EXPORT_SYMBOL(_write_lock_irq);
125#endif
126
127#ifndef _write_lock_bh
128void __lockfunc _write_lock_bh(rwlock_t *lock)
129{
130 __write_lock_bh(lock);
131}
132EXPORT_SYMBOL(_write_lock_bh);
133#endif
134
135#ifndef _spin_lock
136void __lockfunc _spin_lock(spinlock_t *lock)
137{
138 __spin_lock(lock);
139}
140EXPORT_SYMBOL(_spin_lock);
141#endif
142
143#ifndef _write_lock
144void __lockfunc _write_lock(rwlock_t *lock)
145{
146 __write_lock(lock);
147}
148EXPORT_SYMBOL(_write_lock);
149#endif
150
151#else /* CONFIG_PREEMPT: */
152
153/* 30/*
31 * The __lock_function inlines are taken from
32 * include/linux/spinlock_api_smp.h
33 */
34#else
35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l)
37/*
38 * We build the __lock_function inlines here. They are too large for
39 * inlining all over the place, but here is only one user per function
40 * which embeds them into the calling _lock_function below.
41 *
154 * This could be a long-held lock. We both prepare to spin for a long 42 * This could be a long-held lock. We both prepare to spin for a long
155 * time (making _this_ CPU preemptable if possible), and we also signal 43 * time (making _this_ CPU preemptable if possible), and we also signal
156 * towards that other CPU that it should break the lock ASAP. 44 * towards that other CPU that it should break the lock ASAP.
157 *
158 * (We do this in a function because inlining it would be excessive.)
159 */ 45 */
160
161#define BUILD_LOCK_OPS(op, locktype) \ 46#define BUILD_LOCK_OPS(op, locktype) \
162void __lockfunc _##op##_lock(locktype##_t *lock) \ 47void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
163{ \ 48{ \
164 for (;;) { \ 49 for (;;) { \
165 preempt_disable(); \ 50 preempt_disable(); \
166 if (likely(_raw_##op##_trylock(lock))) \ 51 if (likely(do_raw_##op##_trylock(lock))) \
167 break; \ 52 break; \
168 preempt_enable(); \ 53 preempt_enable(); \
169 \ 54 \
170 if (!(lock)->break_lock) \ 55 if (!(lock)->break_lock) \
171 (lock)->break_lock = 1; \ 56 (lock)->break_lock = 1; \
172 while (!op##_can_lock(lock) && (lock)->break_lock) \ 57 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
173 _raw_##op##_relax(&lock->raw_lock); \ 58 arch_##op##_relax(&lock->raw_lock); \
174 } \ 59 } \
175 (lock)->break_lock = 0; \ 60 (lock)->break_lock = 0; \
176} \ 61} \
177 \ 62 \
178EXPORT_SYMBOL(_##op##_lock); \ 63unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
179 \
180unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
181{ \ 64{ \
182 unsigned long flags; \ 65 unsigned long flags; \
183 \ 66 \
184 for (;;) { \ 67 for (;;) { \
185 preempt_disable(); \ 68 preempt_disable(); \
186 local_irq_save(flags); \ 69 local_irq_save(flags); \
187 if (likely(_raw_##op##_trylock(lock))) \ 70 if (likely(do_raw_##op##_trylock(lock))) \
188 break; \ 71 break; \
189 local_irq_restore(flags); \ 72 local_irq_restore(flags); \
190 preempt_enable(); \ 73 preempt_enable(); \
191 \ 74 \
192 if (!(lock)->break_lock) \ 75 if (!(lock)->break_lock) \
193 (lock)->break_lock = 1; \ 76 (lock)->break_lock = 1; \
194 while (!op##_can_lock(lock) && (lock)->break_lock) \ 77 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
195 _raw_##op##_relax(&lock->raw_lock); \ 78 arch_##op##_relax(&lock->raw_lock); \
196 } \ 79 } \
197 (lock)->break_lock = 0; \ 80 (lock)->break_lock = 0; \
198 return flags; \ 81 return flags; \
199} \ 82} \
200 \ 83 \
201EXPORT_SYMBOL(_##op##_lock_irqsave); \ 84void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
202 \
203void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
204{ \ 85{ \
205 _##op##_lock_irqsave(lock); \ 86 _raw_##op##_lock_irqsave(lock); \
206} \ 87} \
207 \ 88 \
208EXPORT_SYMBOL(_##op##_lock_irq); \ 89void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
209 \
210void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
211{ \ 90{ \
212 unsigned long flags; \ 91 unsigned long flags; \
213 \ 92 \
@@ -216,164 +95,283 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
216 /* irq-disabling. We use the generic preemption-aware */ \ 95 /* irq-disabling. We use the generic preemption-aware */ \
217 /* function: */ \ 96 /* function: */ \
218 /**/ \ 97 /**/ \
219 flags = _##op##_lock_irqsave(lock); \ 98 flags = _raw_##op##_lock_irqsave(lock); \
220 local_bh_disable(); \ 99 local_bh_disable(); \
221 local_irq_restore(flags); \ 100 local_irq_restore(flags); \
222} \ 101} \
223 \
224EXPORT_SYMBOL(_##op##_lock_bh)
225 102
226/* 103/*
227 * Build preemption-friendly versions of the following 104 * Build preemption-friendly versions of the following
228 * lock-spinning functions: 105 * lock-spinning functions:
229 * 106 *
230 * _[spin|read|write]_lock() 107 * __[spin|read|write]_lock()
231 * _[spin|read|write]_lock_irq() 108 * __[spin|read|write]_lock_irq()
232 * _[spin|read|write]_lock_irqsave() 109 * __[spin|read|write]_lock_irqsave()
233 * _[spin|read|write]_lock_bh() 110 * __[spin|read|write]_lock_bh()
234 */ 111 */
235BUILD_LOCK_OPS(spin, spinlock); 112BUILD_LOCK_OPS(spin, raw_spinlock);
236BUILD_LOCK_OPS(read, rwlock); 113BUILD_LOCK_OPS(read, rwlock);
237BUILD_LOCK_OPS(write, rwlock); 114BUILD_LOCK_OPS(write, rwlock);
238 115
239#endif /* CONFIG_PREEMPT */ 116#endif
240 117
241#ifdef CONFIG_DEBUG_LOCK_ALLOC 118#ifndef CONFIG_INLINE_SPIN_TRYLOCK
119int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
120{
121 return __raw_spin_trylock(lock);
122}
123EXPORT_SYMBOL(_raw_spin_trylock);
124#endif
242 125
243void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) 126#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
127int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
244{ 128{
245 preempt_disable(); 129 return __raw_spin_trylock_bh(lock);
246 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
247 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
248} 130}
249EXPORT_SYMBOL(_spin_lock_nested); 131EXPORT_SYMBOL(_raw_spin_trylock_bh);
132#endif
250 133
251unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 134#ifndef CONFIG_INLINE_SPIN_LOCK
135void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
252{ 136{
253 unsigned long flags; 137 __raw_spin_lock(lock);
138}
139EXPORT_SYMBOL(_raw_spin_lock);
140#endif
254 141
255 local_irq_save(flags); 142#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
256 preempt_disable(); 143unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
257 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 144{
258 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, 145 return __raw_spin_lock_irqsave(lock);
259 _raw_spin_lock_flags, &flags);
260 return flags;
261} 146}
262EXPORT_SYMBOL(_spin_lock_irqsave_nested); 147EXPORT_SYMBOL(_raw_spin_lock_irqsave);
148#endif
263 149
264void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, 150#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
265 struct lockdep_map *nest_lock) 151void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
266{ 152{
267 preempt_disable(); 153 __raw_spin_lock_irq(lock);
268 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
269 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
270} 154}
271EXPORT_SYMBOL(_spin_lock_nest_lock); 155EXPORT_SYMBOL(_raw_spin_lock_irq);
156#endif
272 157
158#ifndef CONFIG_INLINE_SPIN_LOCK_BH
159void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
160{
161 __raw_spin_lock_bh(lock);
162}
163EXPORT_SYMBOL(_raw_spin_lock_bh);
273#endif 164#endif
274 165
275#ifndef _spin_unlock 166#ifndef CONFIG_INLINE_SPIN_UNLOCK
276void __lockfunc _spin_unlock(spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
277{ 168{
278 __spin_unlock(lock); 169 __raw_spin_unlock(lock);
279} 170}
280EXPORT_SYMBOL(_spin_unlock); 171EXPORT_SYMBOL(_raw_spin_unlock);
281#endif 172#endif
282 173
283#ifndef _write_unlock 174#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
284void __lockfunc _write_unlock(rwlock_t *lock) 175void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
285{ 176{
286 __write_unlock(lock); 177 __raw_spin_unlock_irqrestore(lock, flags);
287} 178}
288EXPORT_SYMBOL(_write_unlock); 179EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
289#endif 180#endif
290 181
291#ifndef _read_unlock 182#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
292void __lockfunc _read_unlock(rwlock_t *lock) 183void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
293{ 184{
294 __read_unlock(lock); 185 __raw_spin_unlock_irq(lock);
295} 186}
296EXPORT_SYMBOL(_read_unlock); 187EXPORT_SYMBOL(_raw_spin_unlock_irq);
297#endif 188#endif
298 189
299#ifndef _spin_unlock_irqrestore 190#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 191void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
301{ 192{
302 __spin_unlock_irqrestore(lock, flags); 193 __raw_spin_unlock_bh(lock);
303} 194}
304EXPORT_SYMBOL(_spin_unlock_irqrestore); 195EXPORT_SYMBOL(_raw_spin_unlock_bh);
305#endif 196#endif
306 197
307#ifndef _spin_unlock_irq 198#ifndef CONFIG_INLINE_READ_TRYLOCK
308void __lockfunc _spin_unlock_irq(spinlock_t *lock) 199int __lockfunc _raw_read_trylock(rwlock_t *lock)
309{ 200{
310 __spin_unlock_irq(lock); 201 return __raw_read_trylock(lock);
311} 202}
312EXPORT_SYMBOL(_spin_unlock_irq); 203EXPORT_SYMBOL(_raw_read_trylock);
313#endif 204#endif
314 205
315#ifndef _spin_unlock_bh 206#ifndef CONFIG_INLINE_READ_LOCK
316void __lockfunc _spin_unlock_bh(spinlock_t *lock) 207void __lockfunc _raw_read_lock(rwlock_t *lock)
317{ 208{
318 __spin_unlock_bh(lock); 209 __raw_read_lock(lock);
319} 210}
320EXPORT_SYMBOL(_spin_unlock_bh); 211EXPORT_SYMBOL(_raw_read_lock);
321#endif 212#endif
322 213
323#ifndef _read_unlock_irqrestore 214#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 215unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
325{ 216{
326 __read_unlock_irqrestore(lock, flags); 217 return __raw_read_lock_irqsave(lock);
327} 218}
328EXPORT_SYMBOL(_read_unlock_irqrestore); 219EXPORT_SYMBOL(_raw_read_lock_irqsave);
329#endif 220#endif
330 221
331#ifndef _read_unlock_irq 222#ifndef CONFIG_INLINE_READ_LOCK_IRQ
332void __lockfunc _read_unlock_irq(rwlock_t *lock) 223void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
333{ 224{
334 __read_unlock_irq(lock); 225 __raw_read_lock_irq(lock);
335} 226}
336EXPORT_SYMBOL(_read_unlock_irq); 227EXPORT_SYMBOL(_raw_read_lock_irq);
337#endif 228#endif
338 229
339#ifndef _read_unlock_bh 230#ifndef CONFIG_INLINE_READ_LOCK_BH
340void __lockfunc _read_unlock_bh(rwlock_t *lock) 231void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
341{ 232{
342 __read_unlock_bh(lock); 233 __raw_read_lock_bh(lock);
343} 234}
344EXPORT_SYMBOL(_read_unlock_bh); 235EXPORT_SYMBOL(_raw_read_lock_bh);
345#endif 236#endif
346 237
347#ifndef _write_unlock_irqrestore 238#ifndef CONFIG_INLINE_READ_UNLOCK
348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 239void __lockfunc _raw_read_unlock(rwlock_t *lock)
349{ 240{
350 __write_unlock_irqrestore(lock, flags); 241 __raw_read_unlock(lock);
351} 242}
352EXPORT_SYMBOL(_write_unlock_irqrestore); 243EXPORT_SYMBOL(_raw_read_unlock);
353#endif 244#endif
354 245
355#ifndef _write_unlock_irq 246#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
356void __lockfunc _write_unlock_irq(rwlock_t *lock) 247void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
357{ 248{
358 __write_unlock_irq(lock); 249 __raw_read_unlock_irqrestore(lock, flags);
359} 250}
360EXPORT_SYMBOL(_write_unlock_irq); 251EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
361#endif 252#endif
362 253
363#ifndef _write_unlock_bh 254#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
364void __lockfunc _write_unlock_bh(rwlock_t *lock) 255void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
365{ 256{
366 __write_unlock_bh(lock); 257 __raw_read_unlock_irq(lock);
367} 258}
368EXPORT_SYMBOL(_write_unlock_bh); 259EXPORT_SYMBOL(_raw_read_unlock_irq);
369#endif 260#endif
370 261
371#ifndef _spin_trylock_bh 262#ifndef CONFIG_INLINE_READ_UNLOCK_BH
372int __lockfunc _spin_trylock_bh(spinlock_t *lock) 263void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
373{ 264{
374 return __spin_trylock_bh(lock); 265 __raw_read_unlock_bh(lock);
375} 266}
376EXPORT_SYMBOL(_spin_trylock_bh); 267EXPORT_SYMBOL(_raw_read_unlock_bh);
268#endif
269
270#ifndef CONFIG_INLINE_WRITE_TRYLOCK
271int __lockfunc _raw_write_trylock(rwlock_t *lock)
272{
273 return __raw_write_trylock(lock);
274}
275EXPORT_SYMBOL(_raw_write_trylock);
276#endif
277
278#ifndef CONFIG_INLINE_WRITE_LOCK
279void __lockfunc _raw_write_lock(rwlock_t *lock)
280{
281 __raw_write_lock(lock);
282}
283EXPORT_SYMBOL(_raw_write_lock);
284#endif
285
286#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
287unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
288{
289 return __raw_write_lock_irqsave(lock);
290}
291EXPORT_SYMBOL(_raw_write_lock_irqsave);
292#endif
293
294#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
295void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
296{
297 __raw_write_lock_irq(lock);
298}
299EXPORT_SYMBOL(_raw_write_lock_irq);
300#endif
301
302#ifndef CONFIG_INLINE_WRITE_LOCK_BH
303void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
304{
305 __raw_write_lock_bh(lock);
306}
307EXPORT_SYMBOL(_raw_write_lock_bh);
308#endif
309
310#ifndef CONFIG_INLINE_WRITE_UNLOCK
311void __lockfunc _raw_write_unlock(rwlock_t *lock)
312{
313 __raw_write_unlock(lock);
314}
315EXPORT_SYMBOL(_raw_write_unlock);
316#endif
317
318#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
319void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
320{
321 __raw_write_unlock_irqrestore(lock, flags);
322}
323EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
324#endif
325
326#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
327void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
328{
329 __raw_write_unlock_irq(lock);
330}
331EXPORT_SYMBOL(_raw_write_unlock_irq);
332#endif
333
334#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
335void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
336{
337 __raw_write_unlock_bh(lock);
338}
339EXPORT_SYMBOL(_raw_write_unlock_bh);
340#endif
341
342#ifdef CONFIG_DEBUG_LOCK_ALLOC
343
344void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
345{
346 preempt_disable();
347 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
348 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
349}
350EXPORT_SYMBOL(_raw_spin_lock_nested);
351
352unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
353 int subclass)
354{
355 unsigned long flags;
356
357 local_irq_save(flags);
358 preempt_disable();
359 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
360 LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
361 do_raw_spin_lock_flags, &flags);
362 return flags;
363}
364EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
365
366void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
367 struct lockdep_map *nest_lock)
368{
369 preempt_disable();
370 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
371 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
372}
373EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
374
377#endif 375#endif
378 376
379notrace int in_lock_functions(unsigned long addr) 377notrace int in_lock_functions(unsigned long addr)
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce..818d7d9aa03 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp)
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM); 50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 51}
52EXPORT_SYMBOL_GPL(init_srcu_struct);
52 53
53/* 54/*
54 * srcu_readers_active_idx -- returns approximate number of readers 55 * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
97 free_percpu(sp->per_cpu_ref); 98 free_percpu(sp->per_cpu_ref);
98 sp->per_cpu_ref = NULL; 99 sp->per_cpu_ref = NULL;
99} 100}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
100 102
101/** 103/**
102 * srcu_read_lock - register a new reader for an SRCU-protected structure. 104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
@@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp)
118 preempt_enable(); 120 preempt_enable();
119 return idx; 121 return idx;
120} 122}
123EXPORT_SYMBOL_GPL(srcu_read_lock);
121 124
122/** 125/**
123 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. 126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
@@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx)
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
137 preempt_enable(); 140 preempt_enable();
138} 141}
142EXPORT_SYMBOL_GPL(srcu_read_unlock);
139 143
140/** 144/*
141 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
142 * @sp: srcu_struct with which to synchronize.
143 *
144 * Flip the completed counter, and wait for the old count to drain to zero.
145 * As with classic RCU, the updater must use some separate means of
146 * synchronizing concurrent updates. Can block; must be called from
147 * process context.
148 *
149 * Note that it is illegal to call synchornize_srcu() from the corresponding
150 * SRCU read-side critical section; doing so will result in deadlock.
151 * However, it is perfectly legal to call synchronize_srcu() on one
152 * srcu_struct from some other srcu_struct's read-side critical section.
153 */ 146 */
154void synchronize_srcu(struct srcu_struct *sp) 147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
155{ 148{
156 int idx; 149 int idx;
157 150
@@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp)
173 return; 166 return;
174 } 167 }
175 168
176 synchronize_sched(); /* Force memory barrier on all CPUs. */ 169 sync_func(); /* Force memory barrier on all CPUs. */
177 170
178 /* 171 /*
179 * The preceding synchronize_sched() ensures that any CPU that 172 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp)
190 idx = sp->completed & 0x1; 183 idx = sp->completed & 0x1;
191 sp->completed++; 184 sp->completed++;
192 185
193 synchronize_sched(); /* Force memory barrier on all CPUs. */ 186 sync_func(); /* Force memory barrier on all CPUs. */
194 187
195 /* 188 /*
196 * At this point, because of the preceding synchronize_sched(), 189 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp)
203 while (srcu_readers_active_idx(sp, idx)) 196 while (srcu_readers_active_idx(sp, idx))
204 schedule_timeout_interruptible(1); 197 schedule_timeout_interruptible(1);
205 198
206 synchronize_sched(); /* Force memory barrier on all CPUs. */ 199 sync_func(); /* Force memory barrier on all CPUs. */
207 200
208 /* 201 /*
209 * The preceding synchronize_sched() forces all srcu_read_unlock() 202 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp)
237} 230}
238 231
239/** 232/**
233 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
234 * @sp: srcu_struct with which to synchronize.
235 *
236 * Flip the completed counter, and wait for the old count to drain to zero.
237 * As with classic RCU, the updater must use some separate means of
238 * synchronizing concurrent updates. Can block; must be called from
239 * process context.
240 *
241 * Note that it is illegal to call synchronize_srcu() from the corresponding
242 * SRCU read-side critical section; doing so will result in deadlock.
243 * However, it is perfectly legal to call synchronize_srcu() on one
244 * srcu_struct from some other srcu_struct's read-side critical section.
245 */
246void synchronize_srcu(struct srcu_struct *sp)
247{
248 __synchronize_srcu(sp, synchronize_sched);
249}
250EXPORT_SYMBOL_GPL(synchronize_srcu);
251
252/**
253 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
254 * @sp: srcu_struct with which to synchronize.
255 *
256 * Flip the completed counter, and wait for the old count to drain to zero.
257 * As with classic RCU, the updater must use some separate means of
258 * synchronizing concurrent updates. Can block; must be called from
259 * process context.
260 *
261 * Note that it is illegal to call synchronize_srcu_expedited()
262 * from the corresponding SRCU read-side critical section; doing so
263 * will result in deadlock. However, it is perfectly legal to call
264 * synchronize_srcu_expedited() on one srcu_struct from some other
265 * srcu_struct's read-side critical section.
266 */
267void synchronize_srcu_expedited(struct srcu_struct *sp)
268{
269 __synchronize_srcu(sp, synchronize_sched_expedited);
270}
271EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
272
273/**
240 * srcu_batches_completed - return batches completed. 274 * srcu_batches_completed - return batches completed.
241 * @sp: srcu_struct on which to report batch completion. 275 * @sp: srcu_struct on which to report batch completion.
242 * 276 *
@@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
248{ 282{
249 return sp->completed; 283 return sp->completed;
250} 284}
251
252EXPORT_SYMBOL_GPL(init_srcu_struct);
253EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
254EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 285EXPORT_SYMBOL_GPL(srcu_batches_completed);
diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d163e..26a6b73a6b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/smp_lock.h>
12#include <linux/notifier.h> 11#include <linux/notifier.h>
13#include <linux/reboot.h> 12#include <linux/reboot.h>
14#include <linux/prctl.h> 13#include <linux/prctl.h>
@@ -163,6 +162,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
163 if (niceval > 19) 162 if (niceval > 19)
164 niceval = 19; 163 niceval = 19;
165 164
165 rcu_read_lock();
166 read_lock(&tasklist_lock); 166 read_lock(&tasklist_lock);
167 switch (which) { 167 switch (which) {
168 case PRIO_PROCESS: 168 case PRIO_PROCESS:
@@ -190,16 +190,17 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
190 !(user = find_user(who))) 190 !(user = find_user(who)))
191 goto out_unlock; /* No processes for this user */ 191 goto out_unlock; /* No processes for this user */
192 192
193 do_each_thread(g, p) 193 do_each_thread(g, p) {
194 if (__task_cred(p)->uid == who) 194 if (__task_cred(p)->uid == who)
195 error = set_one_prio(p, niceval, error); 195 error = set_one_prio(p, niceval, error);
196 while_each_thread(g, p); 196 } while_each_thread(g, p);
197 if (who != cred->uid) 197 if (who != cred->uid)
198 free_uid(user); /* For find_user() */ 198 free_uid(user); /* For find_user() */
199 break; 199 break;
200 } 200 }
201out_unlock: 201out_unlock:
202 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
203 rcu_read_unlock();
203out: 204out:
204 return error; 205 return error;
205} 206}
@@ -253,13 +254,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
253 !(user = find_user(who))) 254 !(user = find_user(who)))
254 goto out_unlock; /* No processes for this user */ 255 goto out_unlock; /* No processes for this user */
255 256
256 do_each_thread(g, p) 257 do_each_thread(g, p) {
257 if (__task_cred(p)->uid == who) { 258 if (__task_cred(p)->uid == who) {
258 niceval = 20 - task_nice(p); 259 niceval = 20 - task_nice(p);
259 if (niceval > retval) 260 if (niceval > retval)
260 retval = niceval; 261 retval = niceval;
261 } 262 }
262 while_each_thread(g, p); 263 } while_each_thread(g, p);
263 if (who != cred->uid) 264 if (who != cred->uid)
264 free_uid(user); /* for find_user() */ 265 free_uid(user); /* for find_user() */
265 break; 266 break;
@@ -349,6 +350,9 @@ void kernel_power_off(void)
349 machine_power_off(); 350 machine_power_off();
350} 351}
351EXPORT_SYMBOL_GPL(kernel_power_off); 352EXPORT_SYMBOL_GPL(kernel_power_off);
353
354static DEFINE_MUTEX(reboot_mutex);
355
352/* 356/*
353 * Reboot system call: for obvious reasons only root may call it, 357 * Reboot system call: for obvious reasons only root may call it,
354 * and even root needs to set up some magic numbers in the registers 358 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +385,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
381 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 385 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
382 cmd = LINUX_REBOOT_CMD_HALT; 386 cmd = LINUX_REBOOT_CMD_HALT;
383 387
384 lock_kernel(); 388 mutex_lock(&reboot_mutex);
385 switch (cmd) { 389 switch (cmd) {
386 case LINUX_REBOOT_CMD_RESTART: 390 case LINUX_REBOOT_CMD_RESTART:
387 kernel_restart(NULL); 391 kernel_restart(NULL);
@@ -397,20 +401,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 401
398 case LINUX_REBOOT_CMD_HALT: 402 case LINUX_REBOOT_CMD_HALT:
399 kernel_halt(); 403 kernel_halt();
400 unlock_kernel();
401 do_exit(0); 404 do_exit(0);
402 panic("cannot halt"); 405 panic("cannot halt");
403 406
404 case LINUX_REBOOT_CMD_POWER_OFF: 407 case LINUX_REBOOT_CMD_POWER_OFF:
405 kernel_power_off(); 408 kernel_power_off();
406 unlock_kernel();
407 do_exit(0); 409 do_exit(0);
408 break; 410 break;
409 411
410 case LINUX_REBOOT_CMD_RESTART2: 412 case LINUX_REBOOT_CMD_RESTART2:
411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 413 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
412 unlock_kernel(); 414 ret = -EFAULT;
413 return -EFAULT; 415 break;
414 } 416 }
415 buffer[sizeof(buffer) - 1] = '\0'; 417 buffer[sizeof(buffer) - 1] = '\0';
416 418
@@ -433,7 +435,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
433 ret = -EINVAL; 435 ret = -EINVAL;
434 break; 436 break;
435 } 437 }
436 unlock_kernel(); 438 mutex_unlock(&reboot_mutex);
437 return ret; 439 return ret;
438} 440}
439 441
@@ -911,16 +913,15 @@ change_okay:
911 913
912void do_sys_times(struct tms *tms) 914void do_sys_times(struct tms *tms)
913{ 915{
914 struct task_cputime cputime; 916 cputime_t tgutime, tgstime, cutime, cstime;
915 cputime_t cutime, cstime;
916 917
917 thread_group_cputime(current, &cputime);
918 spin_lock_irq(&current->sighand->siglock); 918 spin_lock_irq(&current->sighand->siglock);
919 thread_group_times(current, &tgutime, &tgstime);
919 cutime = current->signal->cutime; 920 cutime = current->signal->cutime;
920 cstime = current->signal->cstime; 921 cstime = current->signal->cstime;
921 spin_unlock_irq(&current->sighand->siglock); 922 spin_unlock_irq(&current->sighand->siglock);
922 tms->tms_utime = cputime_to_clock_t(cputime.utime); 923 tms->tms_utime = cputime_to_clock_t(tgutime);
923 tms->tms_stime = cputime_to_clock_t(cputime.stime); 924 tms->tms_stime = cputime_to_clock_t(tgstime);
924 tms->tms_cutime = cputime_to_clock_t(cutime); 925 tms->tms_cutime = cputime_to_clock_t(cutime);
925 tms->tms_cstime = cputime_to_clock_t(cstime); 926 tms->tms_cstime = cputime_to_clock_t(cstime);
926} 927}
@@ -1110,6 +1111,8 @@ SYSCALL_DEFINE0(setsid)
1110 err = session; 1111 err = session;
1111out: 1112out:
1112 write_unlock_irq(&tasklist_lock); 1113 write_unlock_irq(&tasklist_lock);
1114 if (err > 0)
1115 proc_sid_connector(group_leader);
1113 return err; 1116 return err;
1114} 1117}
1115 1118
@@ -1336,16 +1339,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1336{ 1339{
1337 struct task_struct *t; 1340 struct task_struct *t;
1338 unsigned long flags; 1341 unsigned long flags;
1339 cputime_t utime, stime; 1342 cputime_t tgutime, tgstime, utime, stime;
1340 struct task_cputime cputime;
1341 unsigned long maxrss = 0; 1343 unsigned long maxrss = 0;
1342 1344
1343 memset((char *) r, 0, sizeof *r); 1345 memset((char *) r, 0, sizeof *r);
1344 utime = stime = cputime_zero; 1346 utime = stime = cputime_zero;
1345 1347
1346 if (who == RUSAGE_THREAD) { 1348 if (who == RUSAGE_THREAD) {
1347 utime = task_utime(current); 1349 task_times(current, &utime, &stime);
1348 stime = task_stime(current);
1349 accumulate_thread_rusage(p, r); 1350 accumulate_thread_rusage(p, r);
1350 maxrss = p->signal->maxrss; 1351 maxrss = p->signal->maxrss;
1351 goto out; 1352 goto out;
@@ -1371,9 +1372,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1371 break; 1372 break;
1372 1373
1373 case RUSAGE_SELF: 1374 case RUSAGE_SELF:
1374 thread_group_cputime(p, &cputime); 1375 thread_group_times(p, &tgutime, &tgstime);
1375 utime = cputime_add(utime, cputime.utime); 1376 utime = cputime_add(utime, tgutime);
1376 stime = cputime_add(stime, cputime.stime); 1377 stime = cputime_add(stime, tgstime);
1377 r->ru_nvcsw += p->signal->nvcsw; 1378 r->ru_nvcsw += p->signal->nvcsw;
1378 r->ru_nivcsw += p->signal->nivcsw; 1379 r->ru_nivcsw += p->signal->nivcsw;
1379 r->ru_minflt += p->signal->min_flt; 1380 r->ru_minflt += p->signal->min_flt;
@@ -1546,24 +1547,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1546 if (arg4 | arg5) 1547 if (arg4 | arg5)
1547 return -EINVAL; 1548 return -EINVAL;
1548 switch (arg2) { 1549 switch (arg2) {
1549 case 0: 1550 case PR_MCE_KILL_CLEAR:
1550 if (arg3 != 0) 1551 if (arg3 != 0)
1551 return -EINVAL; 1552 return -EINVAL;
1552 current->flags &= ~PF_MCE_PROCESS; 1553 current->flags &= ~PF_MCE_PROCESS;
1553 break; 1554 break;
1554 case 1: 1555 case PR_MCE_KILL_SET:
1555 current->flags |= PF_MCE_PROCESS; 1556 current->flags |= PF_MCE_PROCESS;
1556 if (arg3 != 0) 1557 if (arg3 == PR_MCE_KILL_EARLY)
1557 current->flags |= PF_MCE_EARLY; 1558 current->flags |= PF_MCE_EARLY;
1558 else 1559 else if (arg3 == PR_MCE_KILL_LATE)
1559 current->flags &= ~PF_MCE_EARLY; 1560 current->flags &= ~PF_MCE_EARLY;
1561 else if (arg3 == PR_MCE_KILL_DEFAULT)
1562 current->flags &=
1563 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
1564 else
1565 return -EINVAL;
1560 break; 1566 break;
1561 default: 1567 default:
1562 return -EINVAL; 1568 return -EINVAL;
1563 } 1569 }
1564 error = 0; 1570 error = 0;
1565 break; 1571 break;
1566 1572 case PR_MCE_KILL_GET:
1573 if (arg2 | arg3 | arg4 | arg5)
1574 return -EINVAL;
1575 if (current->flags & PF_MCE_PROCESS)
1576 error = (current->flags & PF_MCE_EARLY) ?
1577 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
1578 else
1579 error = PR_MCE_KILL_DEFAULT;
1580 break;
1567 default: 1581 default:
1568 error = -EINVAL; 1582 error = -EINVAL;
1569 break; 1583 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e06d0b8d195..695384f12a7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,8 +48,10 @@ cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(compat_sys_sendmsg); 49cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg);
51cond_syscall(compat_sys_recvmsg); 52cond_syscall(compat_sys_recvmsg);
52cond_syscall(compat_sys_recvfrom); 53cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg);
53cond_syscall(sys_socketcall); 55cond_syscall(sys_socketcall);
54cond_syscall(sys_futex); 56cond_syscall(sys_futex);
55cond_syscall(compat_sys_futex); 57cond_syscall(compat_sys_futex);
@@ -139,7 +141,6 @@ cond_syscall(sys_pciconfig_read);
139cond_syscall(sys_pciconfig_write); 141cond_syscall(sys_pciconfig_write);
140cond_syscall(sys_pciconfig_iobase); 142cond_syscall(sys_pciconfig_iobase);
141cond_syscall(sys32_ipc); 143cond_syscall(sys32_ipc);
142cond_syscall(sys32_sysctl);
143cond_syscall(ppc_rtas); 144cond_syscall(ppc_rtas);
144cond_syscall(sys_spu_run); 145cond_syscall(sys_spu_run);
145cond_syscall(sys_spu_create); 146cond_syscall(sys_spu_create);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c51741..8a68b244846 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,7 +27,6 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/kmemcheck.h> 29#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h>
31#include <linux/fs.h> 30#include <linux/fs.h>
32#include <linux/init.h> 31#include <linux/init.h>
33#include <linux/kernel.h> 32#include <linux/kernel.h>
@@ -36,6 +35,7 @@
36#include <linux/sysrq.h> 35#include <linux/sysrq.h>
37#include <linux/highuid.h> 36#include <linux/highuid.h>
38#include <linux/writeback.h> 37#include <linux/writeback.h>
38#include <linux/ratelimit.h>
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/initrd.h> 40#include <linux/initrd.h>
41#include <linux/key.h> 41#include <linux/key.h>
@@ -60,7 +60,6 @@
60#include <asm/io.h> 60#include <asm/io.h>
61#endif 61#endif
62 62
63static int deprecated_sysctl_warning(struct __sysctl_args *args);
64 63
65#if defined(CONFIG_SYSCTL) 64#if defined(CONFIG_SYSCTL)
66 65
@@ -158,6 +157,8 @@ extern int no_unaligned_warning;
158extern int unaligned_dump_stack; 157extern int unaligned_dump_stack;
159#endif 158#endif
160 159
160extern struct ratelimit_state printk_ratelimit_state;
161
161#ifdef CONFIG_RT_MUTEXES 162#ifdef CONFIG_RT_MUTEXES
162extern int max_lock_depth; 163extern int max_lock_depth;
163#endif 164#endif
@@ -207,31 +208,26 @@ extern int lock_stat;
207 208
208static struct ctl_table root_table[] = { 209static struct ctl_table root_table[] = {
209 { 210 {
210 .ctl_name = CTL_KERN,
211 .procname = "kernel", 211 .procname = "kernel",
212 .mode = 0555, 212 .mode = 0555,
213 .child = kern_table, 213 .child = kern_table,
214 }, 214 },
215 { 215 {
216 .ctl_name = CTL_VM,
217 .procname = "vm", 216 .procname = "vm",
218 .mode = 0555, 217 .mode = 0555,
219 .child = vm_table, 218 .child = vm_table,
220 }, 219 },
221 { 220 {
222 .ctl_name = CTL_FS,
223 .procname = "fs", 221 .procname = "fs",
224 .mode = 0555, 222 .mode = 0555,
225 .child = fs_table, 223 .child = fs_table,
226 }, 224 },
227 { 225 {
228 .ctl_name = CTL_DEBUG,
229 .procname = "debug", 226 .procname = "debug",
230 .mode = 0555, 227 .mode = 0555,
231 .child = debug_table, 228 .child = debug_table,
232 }, 229 },
233 { 230 {
234 .ctl_name = CTL_DEV,
235 .procname = "dev", 231 .procname = "dev",
236 .mode = 0555, 232 .mode = 0555,
237 .child = dev_table, 233 .child = dev_table,
@@ -240,7 +236,7 @@ static struct ctl_table root_table[] = {
240 * NOTE: do not add new entries to this table unless you have read 236 * NOTE: do not add new entries to this table unless you have read
241 * Documentation/sysctl/ctl_unnumbered.txt 237 * Documentation/sysctl/ctl_unnumbered.txt
242 */ 238 */
243 { .ctl_name = 0 } 239 { }
244}; 240};
245 241
246#ifdef CONFIG_SCHED_DEBUG 242#ifdef CONFIG_SCHED_DEBUG
@@ -248,196 +244,178 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
248static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
249static int min_wakeup_granularity_ns; /* 0 usecs */ 245static int min_wakeup_granularity_ns; /* 0 usecs */
250static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
247static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
248static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
249static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
251#endif 251#endif
252 252
253static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
254 { 254 {
255 .ctl_name = CTL_UNNUMBERED,
256 .procname = "sched_child_runs_first", 255 .procname = "sched_child_runs_first",
257 .data = &sysctl_sched_child_runs_first, 256 .data = &sysctl_sched_child_runs_first,
258 .maxlen = sizeof(unsigned int), 257 .maxlen = sizeof(unsigned int),
259 .mode = 0644, 258 .mode = 0644,
260 .proc_handler = &proc_dointvec, 259 .proc_handler = proc_dointvec,
261 }, 260 },
262#ifdef CONFIG_SCHED_DEBUG 261#ifdef CONFIG_SCHED_DEBUG
263 { 262 {
264 .ctl_name = CTL_UNNUMBERED,
265 .procname = "sched_min_granularity_ns", 263 .procname = "sched_min_granularity_ns",
266 .data = &sysctl_sched_min_granularity, 264 .data = &sysctl_sched_min_granularity,
267 .maxlen = sizeof(unsigned int), 265 .maxlen = sizeof(unsigned int),
268 .mode = 0644, 266 .mode = 0644,
269 .proc_handler = &sched_nr_latency_handler, 267 .proc_handler = sched_proc_update_handler,
270 .strategy = &sysctl_intvec,
271 .extra1 = &min_sched_granularity_ns, 268 .extra1 = &min_sched_granularity_ns,
272 .extra2 = &max_sched_granularity_ns, 269 .extra2 = &max_sched_granularity_ns,
273 }, 270 },
274 { 271 {
275 .ctl_name = CTL_UNNUMBERED,
276 .procname = "sched_latency_ns", 272 .procname = "sched_latency_ns",
277 .data = &sysctl_sched_latency, 273 .data = &sysctl_sched_latency,
278 .maxlen = sizeof(unsigned int), 274 .maxlen = sizeof(unsigned int),
279 .mode = 0644, 275 .mode = 0644,
280 .proc_handler = &sched_nr_latency_handler, 276 .proc_handler = sched_proc_update_handler,
281 .strategy = &sysctl_intvec,
282 .extra1 = &min_sched_granularity_ns, 277 .extra1 = &min_sched_granularity_ns,
283 .extra2 = &max_sched_granularity_ns, 278 .extra2 = &max_sched_granularity_ns,
284 }, 279 },
285 { 280 {
286 .ctl_name = CTL_UNNUMBERED,
287 .procname = "sched_wakeup_granularity_ns", 281 .procname = "sched_wakeup_granularity_ns",
288 .data = &sysctl_sched_wakeup_granularity, 282 .data = &sysctl_sched_wakeup_granularity,
289 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
290 .mode = 0644, 284 .mode = 0644,
291 .proc_handler = &proc_dointvec_minmax, 285 .proc_handler = sched_proc_update_handler,
292 .strategy = &sysctl_intvec,
293 .extra1 = &min_wakeup_granularity_ns, 286 .extra1 = &min_wakeup_granularity_ns,
294 .extra2 = &max_wakeup_granularity_ns, 287 .extra2 = &max_wakeup_granularity_ns,
295 }, 288 },
296 { 289 {
297 .ctl_name = CTL_UNNUMBERED,
298 .procname = "sched_shares_ratelimit", 290 .procname = "sched_shares_ratelimit",
299 .data = &sysctl_sched_shares_ratelimit, 291 .data = &sysctl_sched_shares_ratelimit,
300 .maxlen = sizeof(unsigned int), 292 .maxlen = sizeof(unsigned int),
301 .mode = 0644, 293 .mode = 0644,
302 .proc_handler = &proc_dointvec, 294 .proc_handler = sched_proc_update_handler,
295 .extra1 = &min_sched_shares_ratelimit,
296 .extra2 = &max_sched_shares_ratelimit,
303 }, 297 },
304 { 298 {
305 .ctl_name = CTL_UNNUMBERED, 299 .procname = "sched_tunable_scaling",
306 .procname = "sched_shares_thresh", 300 .data = &sysctl_sched_tunable_scaling,
307 .data = &sysctl_sched_shares_thresh, 301 .maxlen = sizeof(enum sched_tunable_scaling),
308 .maxlen = sizeof(unsigned int),
309 .mode = 0644, 302 .mode = 0644,
310 .proc_handler = &proc_dointvec_minmax, 303 .proc_handler = sched_proc_update_handler,
311 .strategy = &sysctl_intvec, 304 .extra1 = &min_sched_tunable_scaling,
312 .extra1 = &zero, 305 .extra2 = &max_sched_tunable_scaling,
313 }, 306 },
314 { 307 {
315 .ctl_name = CTL_UNNUMBERED, 308 .procname = "sched_shares_thresh",
316 .procname = "sched_features", 309 .data = &sysctl_sched_shares_thresh,
317 .data = &sysctl_sched_features,
318 .maxlen = sizeof(unsigned int), 310 .maxlen = sizeof(unsigned int),
319 .mode = 0644, 311 .mode = 0644,
320 .proc_handler = &proc_dointvec, 312 .proc_handler = proc_dointvec_minmax,
313 .extra1 = &zero,
321 }, 314 },
322 { 315 {
323 .ctl_name = CTL_UNNUMBERED,
324 .procname = "sched_migration_cost", 316 .procname = "sched_migration_cost",
325 .data = &sysctl_sched_migration_cost, 317 .data = &sysctl_sched_migration_cost,
326 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
327 .mode = 0644, 319 .mode = 0644,
328 .proc_handler = &proc_dointvec, 320 .proc_handler = proc_dointvec,
329 }, 321 },
330 { 322 {
331 .ctl_name = CTL_UNNUMBERED,
332 .procname = "sched_nr_migrate", 323 .procname = "sched_nr_migrate",
333 .data = &sysctl_sched_nr_migrate, 324 .data = &sysctl_sched_nr_migrate,
334 .maxlen = sizeof(unsigned int), 325 .maxlen = sizeof(unsigned int),
335 .mode = 0644, 326 .mode = 0644,
336 .proc_handler = &proc_dointvec, 327 .proc_handler = proc_dointvec,
337 }, 328 },
338 { 329 {
339 .ctl_name = CTL_UNNUMBERED,
340 .procname = "sched_time_avg", 330 .procname = "sched_time_avg",
341 .data = &sysctl_sched_time_avg, 331 .data = &sysctl_sched_time_avg,
342 .maxlen = sizeof(unsigned int), 332 .maxlen = sizeof(unsigned int),
343 .mode = 0644, 333 .mode = 0644,
344 .proc_handler = &proc_dointvec, 334 .proc_handler = proc_dointvec,
345 }, 335 },
346 { 336 {
347 .ctl_name = CTL_UNNUMBERED,
348 .procname = "timer_migration", 337 .procname = "timer_migration",
349 .data = &sysctl_timer_migration, 338 .data = &sysctl_timer_migration,
350 .maxlen = sizeof(unsigned int), 339 .maxlen = sizeof(unsigned int),
351 .mode = 0644, 340 .mode = 0644,
352 .proc_handler = &proc_dointvec_minmax, 341 .proc_handler = proc_dointvec_minmax,
353 .strategy = &sysctl_intvec,
354 .extra1 = &zero, 342 .extra1 = &zero,
355 .extra2 = &one, 343 .extra2 = &one,
356 }, 344 },
357#endif 345#endif
358 { 346 {
359 .ctl_name = CTL_UNNUMBERED,
360 .procname = "sched_rt_period_us", 347 .procname = "sched_rt_period_us",
361 .data = &sysctl_sched_rt_period, 348 .data = &sysctl_sched_rt_period,
362 .maxlen = sizeof(unsigned int), 349 .maxlen = sizeof(unsigned int),
363 .mode = 0644, 350 .mode = 0644,
364 .proc_handler = &sched_rt_handler, 351 .proc_handler = sched_rt_handler,
365 }, 352 },
366 { 353 {
367 .ctl_name = CTL_UNNUMBERED,
368 .procname = "sched_rt_runtime_us", 354 .procname = "sched_rt_runtime_us",
369 .data = &sysctl_sched_rt_runtime, 355 .data = &sysctl_sched_rt_runtime,
370 .maxlen = sizeof(int), 356 .maxlen = sizeof(int),
371 .mode = 0644, 357 .mode = 0644,
372 .proc_handler = &sched_rt_handler, 358 .proc_handler = sched_rt_handler,
373 }, 359 },
374 { 360 {
375 .ctl_name = CTL_UNNUMBERED,
376 .procname = "sched_compat_yield", 361 .procname = "sched_compat_yield",
377 .data = &sysctl_sched_compat_yield, 362 .data = &sysctl_sched_compat_yield,
378 .maxlen = sizeof(unsigned int), 363 .maxlen = sizeof(unsigned int),
379 .mode = 0644, 364 .mode = 0644,
380 .proc_handler = &proc_dointvec, 365 .proc_handler = proc_dointvec,
381 }, 366 },
382#ifdef CONFIG_PROVE_LOCKING 367#ifdef CONFIG_PROVE_LOCKING
383 { 368 {
384 .ctl_name = CTL_UNNUMBERED,
385 .procname = "prove_locking", 369 .procname = "prove_locking",
386 .data = &prove_locking, 370 .data = &prove_locking,
387 .maxlen = sizeof(int), 371 .maxlen = sizeof(int),
388 .mode = 0644, 372 .mode = 0644,
389 .proc_handler = &proc_dointvec, 373 .proc_handler = proc_dointvec,
390 }, 374 },
391#endif 375#endif
392#ifdef CONFIG_LOCK_STAT 376#ifdef CONFIG_LOCK_STAT
393 { 377 {
394 .ctl_name = CTL_UNNUMBERED,
395 .procname = "lock_stat", 378 .procname = "lock_stat",
396 .data = &lock_stat, 379 .data = &lock_stat,
397 .maxlen = sizeof(int), 380 .maxlen = sizeof(int),
398 .mode = 0644, 381 .mode = 0644,
399 .proc_handler = &proc_dointvec, 382 .proc_handler = proc_dointvec,
400 }, 383 },
401#endif 384#endif
402 { 385 {
403 .ctl_name = KERN_PANIC,
404 .procname = "panic", 386 .procname = "panic",
405 .data = &panic_timeout, 387 .data = &panic_timeout,
406 .maxlen = sizeof(int), 388 .maxlen = sizeof(int),
407 .mode = 0644, 389 .mode = 0644,
408 .proc_handler = &proc_dointvec, 390 .proc_handler = proc_dointvec,
409 }, 391 },
410 { 392 {
411 .ctl_name = KERN_CORE_USES_PID,
412 .procname = "core_uses_pid", 393 .procname = "core_uses_pid",
413 .data = &core_uses_pid, 394 .data = &core_uses_pid,
414 .maxlen = sizeof(int), 395 .maxlen = sizeof(int),
415 .mode = 0644, 396 .mode = 0644,
416 .proc_handler = &proc_dointvec, 397 .proc_handler = proc_dointvec,
417 }, 398 },
418 { 399 {
419 .ctl_name = KERN_CORE_PATTERN,
420 .procname = "core_pattern", 400 .procname = "core_pattern",
421 .data = core_pattern, 401 .data = core_pattern,
422 .maxlen = CORENAME_MAX_SIZE, 402 .maxlen = CORENAME_MAX_SIZE,
423 .mode = 0644, 403 .mode = 0644,
424 .proc_handler = &proc_dostring, 404 .proc_handler = proc_dostring,
425 .strategy = &sysctl_string,
426 }, 405 },
427 { 406 {
428 .ctl_name = CTL_UNNUMBERED,
429 .procname = "core_pipe_limit", 407 .procname = "core_pipe_limit",
430 .data = &core_pipe_limit, 408 .data = &core_pipe_limit,
431 .maxlen = sizeof(unsigned int), 409 .maxlen = sizeof(unsigned int),
432 .mode = 0644, 410 .mode = 0644,
433 .proc_handler = &proc_dointvec, 411 .proc_handler = proc_dointvec,
434 }, 412 },
435#ifdef CONFIG_PROC_SYSCTL 413#ifdef CONFIG_PROC_SYSCTL
436 { 414 {
437 .procname = "tainted", 415 .procname = "tainted",
438 .maxlen = sizeof(long), 416 .maxlen = sizeof(long),
439 .mode = 0644, 417 .mode = 0644,
440 .proc_handler = &proc_taint, 418 .proc_handler = proc_taint,
441 }, 419 },
442#endif 420#endif
443#ifdef CONFIG_LATENCYTOP 421#ifdef CONFIG_LATENCYTOP
@@ -446,181 +424,160 @@ static struct ctl_table kern_table[] = {
446 .data = &latencytop_enabled, 424 .data = &latencytop_enabled,
447 .maxlen = sizeof(int), 425 .maxlen = sizeof(int),
448 .mode = 0644, 426 .mode = 0644,
449 .proc_handler = &proc_dointvec, 427 .proc_handler = proc_dointvec,
450 }, 428 },
451#endif 429#endif
452#ifdef CONFIG_BLK_DEV_INITRD 430#ifdef CONFIG_BLK_DEV_INITRD
453 { 431 {
454 .ctl_name = KERN_REALROOTDEV,
455 .procname = "real-root-dev", 432 .procname = "real-root-dev",
456 .data = &real_root_dev, 433 .data = &real_root_dev,
457 .maxlen = sizeof(int), 434 .maxlen = sizeof(int),
458 .mode = 0644, 435 .mode = 0644,
459 .proc_handler = &proc_dointvec, 436 .proc_handler = proc_dointvec,
460 }, 437 },
461#endif 438#endif
462 { 439 {
463 .ctl_name = CTL_UNNUMBERED,
464 .procname = "print-fatal-signals", 440 .procname = "print-fatal-signals",
465 .data = &print_fatal_signals, 441 .data = &print_fatal_signals,
466 .maxlen = sizeof(int), 442 .maxlen = sizeof(int),
467 .mode = 0644, 443 .mode = 0644,
468 .proc_handler = &proc_dointvec, 444 .proc_handler = proc_dointvec,
469 }, 445 },
470#ifdef CONFIG_SPARC 446#ifdef CONFIG_SPARC
471 { 447 {
472 .ctl_name = KERN_SPARC_REBOOT,
473 .procname = "reboot-cmd", 448 .procname = "reboot-cmd",
474 .data = reboot_command, 449 .data = reboot_command,
475 .maxlen = 256, 450 .maxlen = 256,
476 .mode = 0644, 451 .mode = 0644,
477 .proc_handler = &proc_dostring, 452 .proc_handler = proc_dostring,
478 .strategy = &sysctl_string,
479 }, 453 },
480 { 454 {
481 .ctl_name = KERN_SPARC_STOP_A,
482 .procname = "stop-a", 455 .procname = "stop-a",
483 .data = &stop_a_enabled, 456 .data = &stop_a_enabled,
484 .maxlen = sizeof (int), 457 .maxlen = sizeof (int),
485 .mode = 0644, 458 .mode = 0644,
486 .proc_handler = &proc_dointvec, 459 .proc_handler = proc_dointvec,
487 }, 460 },
488 { 461 {
489 .ctl_name = KERN_SPARC_SCONS_PWROFF,
490 .procname = "scons-poweroff", 462 .procname = "scons-poweroff",
491 .data = &scons_pwroff, 463 .data = &scons_pwroff,
492 .maxlen = sizeof (int), 464 .maxlen = sizeof (int),
493 .mode = 0644, 465 .mode = 0644,
494 .proc_handler = &proc_dointvec, 466 .proc_handler = proc_dointvec,
495 }, 467 },
496#endif 468#endif
497#ifdef CONFIG_SPARC64 469#ifdef CONFIG_SPARC64
498 { 470 {
499 .ctl_name = CTL_UNNUMBERED,
500 .procname = "tsb-ratio", 471 .procname = "tsb-ratio",
501 .data = &sysctl_tsb_ratio, 472 .data = &sysctl_tsb_ratio,
502 .maxlen = sizeof (int), 473 .maxlen = sizeof (int),
503 .mode = 0644, 474 .mode = 0644,
504 .proc_handler = &proc_dointvec, 475 .proc_handler = proc_dointvec,
505 }, 476 },
506#endif 477#endif
507#ifdef __hppa__ 478#ifdef __hppa__
508 { 479 {
509 .ctl_name = KERN_HPPA_PWRSW,
510 .procname = "soft-power", 480 .procname = "soft-power",
511 .data = &pwrsw_enabled, 481 .data = &pwrsw_enabled,
512 .maxlen = sizeof (int), 482 .maxlen = sizeof (int),
513 .mode = 0644, 483 .mode = 0644,
514 .proc_handler = &proc_dointvec, 484 .proc_handler = proc_dointvec,
515 }, 485 },
516 { 486 {
517 .ctl_name = KERN_HPPA_UNALIGNED,
518 .procname = "unaligned-trap", 487 .procname = "unaligned-trap",
519 .data = &unaligned_enabled, 488 .data = &unaligned_enabled,
520 .maxlen = sizeof (int), 489 .maxlen = sizeof (int),
521 .mode = 0644, 490 .mode = 0644,
522 .proc_handler = &proc_dointvec, 491 .proc_handler = proc_dointvec,
523 }, 492 },
524#endif 493#endif
525 { 494 {
526 .ctl_name = KERN_CTLALTDEL,
527 .procname = "ctrl-alt-del", 495 .procname = "ctrl-alt-del",
528 .data = &C_A_D, 496 .data = &C_A_D,
529 .maxlen = sizeof(int), 497 .maxlen = sizeof(int),
530 .mode = 0644, 498 .mode = 0644,
531 .proc_handler = &proc_dointvec, 499 .proc_handler = proc_dointvec,
532 }, 500 },
533#ifdef CONFIG_FUNCTION_TRACER 501#ifdef CONFIG_FUNCTION_TRACER
534 { 502 {
535 .ctl_name = CTL_UNNUMBERED,
536 .procname = "ftrace_enabled", 503 .procname = "ftrace_enabled",
537 .data = &ftrace_enabled, 504 .data = &ftrace_enabled,
538 .maxlen = sizeof(int), 505 .maxlen = sizeof(int),
539 .mode = 0644, 506 .mode = 0644,
540 .proc_handler = &ftrace_enable_sysctl, 507 .proc_handler = ftrace_enable_sysctl,
541 }, 508 },
542#endif 509#endif
543#ifdef CONFIG_STACK_TRACER 510#ifdef CONFIG_STACK_TRACER
544 { 511 {
545 .ctl_name = CTL_UNNUMBERED,
546 .procname = "stack_tracer_enabled", 512 .procname = "stack_tracer_enabled",
547 .data = &stack_tracer_enabled, 513 .data = &stack_tracer_enabled,
548 .maxlen = sizeof(int), 514 .maxlen = sizeof(int),
549 .mode = 0644, 515 .mode = 0644,
550 .proc_handler = &stack_trace_sysctl, 516 .proc_handler = stack_trace_sysctl,
551 }, 517 },
552#endif 518#endif
553#ifdef CONFIG_TRACING 519#ifdef CONFIG_TRACING
554 { 520 {
555 .ctl_name = CTL_UNNUMBERED,
556 .procname = "ftrace_dump_on_oops", 521 .procname = "ftrace_dump_on_oops",
557 .data = &ftrace_dump_on_oops, 522 .data = &ftrace_dump_on_oops,
558 .maxlen = sizeof(int), 523 .maxlen = sizeof(int),
559 .mode = 0644, 524 .mode = 0644,
560 .proc_handler = &proc_dointvec, 525 .proc_handler = proc_dointvec,
561 }, 526 },
562#endif 527#endif
563#ifdef CONFIG_MODULES 528#ifdef CONFIG_MODULES
564 { 529 {
565 .ctl_name = KERN_MODPROBE,
566 .procname = "modprobe", 530 .procname = "modprobe",
567 .data = &modprobe_path, 531 .data = &modprobe_path,
568 .maxlen = KMOD_PATH_LEN, 532 .maxlen = KMOD_PATH_LEN,
569 .mode = 0644, 533 .mode = 0644,
570 .proc_handler = &proc_dostring, 534 .proc_handler = proc_dostring,
571 .strategy = &sysctl_string,
572 }, 535 },
573 { 536 {
574 .ctl_name = CTL_UNNUMBERED,
575 .procname = "modules_disabled", 537 .procname = "modules_disabled",
576 .data = &modules_disabled, 538 .data = &modules_disabled,
577 .maxlen = sizeof(int), 539 .maxlen = sizeof(int),
578 .mode = 0644, 540 .mode = 0644,
579 /* only handle a transition from default "0" to "1" */ 541 /* only handle a transition from default "0" to "1" */
580 .proc_handler = &proc_dointvec_minmax, 542 .proc_handler = proc_dointvec_minmax,
581 .extra1 = &one, 543 .extra1 = &one,
582 .extra2 = &one, 544 .extra2 = &one,
583 }, 545 },
584#endif 546#endif
585#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 547#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
586 { 548 {
587 .ctl_name = KERN_HOTPLUG,
588 .procname = "hotplug", 549 .procname = "hotplug",
589 .data = &uevent_helper, 550 .data = &uevent_helper,
590 .maxlen = UEVENT_HELPER_PATH_LEN, 551 .maxlen = UEVENT_HELPER_PATH_LEN,
591 .mode = 0644, 552 .mode = 0644,
592 .proc_handler = &proc_dostring, 553 .proc_handler = proc_dostring,
593 .strategy = &sysctl_string,
594 }, 554 },
595#endif 555#endif
596#ifdef CONFIG_CHR_DEV_SG 556#ifdef CONFIG_CHR_DEV_SG
597 { 557 {
598 .ctl_name = KERN_SG_BIG_BUFF,
599 .procname = "sg-big-buff", 558 .procname = "sg-big-buff",
600 .data = &sg_big_buff, 559 .data = &sg_big_buff,
601 .maxlen = sizeof (int), 560 .maxlen = sizeof (int),
602 .mode = 0444, 561 .mode = 0444,
603 .proc_handler = &proc_dointvec, 562 .proc_handler = proc_dointvec,
604 }, 563 },
605#endif 564#endif
606#ifdef CONFIG_BSD_PROCESS_ACCT 565#ifdef CONFIG_BSD_PROCESS_ACCT
607 { 566 {
608 .ctl_name = KERN_ACCT,
609 .procname = "acct", 567 .procname = "acct",
610 .data = &acct_parm, 568 .data = &acct_parm,
611 .maxlen = 3*sizeof(int), 569 .maxlen = 3*sizeof(int),
612 .mode = 0644, 570 .mode = 0644,
613 .proc_handler = &proc_dointvec, 571 .proc_handler = proc_dointvec,
614 }, 572 },
615#endif 573#endif
616#ifdef CONFIG_MAGIC_SYSRQ 574#ifdef CONFIG_MAGIC_SYSRQ
617 { 575 {
618 .ctl_name = KERN_SYSRQ,
619 .procname = "sysrq", 576 .procname = "sysrq",
620 .data = &__sysrq_enabled, 577 .data = &__sysrq_enabled,
621 .maxlen = sizeof (int), 578 .maxlen = sizeof (int),
622 .mode = 0644, 579 .mode = 0644,
623 .proc_handler = &proc_dointvec, 580 .proc_handler = proc_dointvec,
624 }, 581 },
625#endif 582#endif
626#ifdef CONFIG_PROC_SYSCTL 583#ifdef CONFIG_PROC_SYSCTL
@@ -629,215 +586,188 @@ static struct ctl_table kern_table[] = {
629 .data = NULL, 586 .data = NULL,
630 .maxlen = sizeof (int), 587 .maxlen = sizeof (int),
631 .mode = 0600, 588 .mode = 0600,
632 .proc_handler = &proc_do_cad_pid, 589 .proc_handler = proc_do_cad_pid,
633 }, 590 },
634#endif 591#endif
635 { 592 {
636 .ctl_name = KERN_MAX_THREADS,
637 .procname = "threads-max", 593 .procname = "threads-max",
638 .data = &max_threads, 594 .data = &max_threads,
639 .maxlen = sizeof(int), 595 .maxlen = sizeof(int),
640 .mode = 0644, 596 .mode = 0644,
641 .proc_handler = &proc_dointvec, 597 .proc_handler = proc_dointvec,
642 }, 598 },
643 { 599 {
644 .ctl_name = KERN_RANDOM,
645 .procname = "random", 600 .procname = "random",
646 .mode = 0555, 601 .mode = 0555,
647 .child = random_table, 602 .child = random_table,
648 }, 603 },
649 { 604 {
650 .ctl_name = KERN_OVERFLOWUID,
651 .procname = "overflowuid", 605 .procname = "overflowuid",
652 .data = &overflowuid, 606 .data = &overflowuid,
653 .maxlen = sizeof(int), 607 .maxlen = sizeof(int),
654 .mode = 0644, 608 .mode = 0644,
655 .proc_handler = &proc_dointvec_minmax, 609 .proc_handler = proc_dointvec_minmax,
656 .strategy = &sysctl_intvec,
657 .extra1 = &minolduid, 610 .extra1 = &minolduid,
658 .extra2 = &maxolduid, 611 .extra2 = &maxolduid,
659 }, 612 },
660 { 613 {
661 .ctl_name = KERN_OVERFLOWGID,
662 .procname = "overflowgid", 614 .procname = "overflowgid",
663 .data = &overflowgid, 615 .data = &overflowgid,
664 .maxlen = sizeof(int), 616 .maxlen = sizeof(int),
665 .mode = 0644, 617 .mode = 0644,
666 .proc_handler = &proc_dointvec_minmax, 618 .proc_handler = proc_dointvec_minmax,
667 .strategy = &sysctl_intvec,
668 .extra1 = &minolduid, 619 .extra1 = &minolduid,
669 .extra2 = &maxolduid, 620 .extra2 = &maxolduid,
670 }, 621 },
671#ifdef CONFIG_S390 622#ifdef CONFIG_S390
672#ifdef CONFIG_MATHEMU 623#ifdef CONFIG_MATHEMU
673 { 624 {
674 .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
675 .procname = "ieee_emulation_warnings", 625 .procname = "ieee_emulation_warnings",
676 .data = &sysctl_ieee_emulation_warnings, 626 .data = &sysctl_ieee_emulation_warnings,
677 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
678 .mode = 0644, 628 .mode = 0644,
679 .proc_handler = &proc_dointvec, 629 .proc_handler = proc_dointvec,
680 }, 630 },
681#endif 631#endif
682 { 632 {
683 .ctl_name = KERN_S390_USER_DEBUG_LOGGING,
684 .procname = "userprocess_debug", 633 .procname = "userprocess_debug",
685 .data = &sysctl_userprocess_debug, 634 .data = &sysctl_userprocess_debug,
686 .maxlen = sizeof(int), 635 .maxlen = sizeof(int),
687 .mode = 0644, 636 .mode = 0644,
688 .proc_handler = &proc_dointvec, 637 .proc_handler = proc_dointvec,
689 }, 638 },
690#endif 639#endif
691 { 640 {
692 .ctl_name = KERN_PIDMAX,
693 .procname = "pid_max", 641 .procname = "pid_max",
694 .data = &pid_max, 642 .data = &pid_max,
695 .maxlen = sizeof (int), 643 .maxlen = sizeof (int),
696 .mode = 0644, 644 .mode = 0644,
697 .proc_handler = &proc_dointvec_minmax, 645 .proc_handler = proc_dointvec_minmax,
698 .strategy = sysctl_intvec,
699 .extra1 = &pid_max_min, 646 .extra1 = &pid_max_min,
700 .extra2 = &pid_max_max, 647 .extra2 = &pid_max_max,
701 }, 648 },
702 { 649 {
703 .ctl_name = KERN_PANIC_ON_OOPS,
704 .procname = "panic_on_oops", 650 .procname = "panic_on_oops",
705 .data = &panic_on_oops, 651 .data = &panic_on_oops,
706 .maxlen = sizeof(int), 652 .maxlen = sizeof(int),
707 .mode = 0644, 653 .mode = 0644,
708 .proc_handler = &proc_dointvec, 654 .proc_handler = proc_dointvec,
709 }, 655 },
710#if defined CONFIG_PRINTK 656#if defined CONFIG_PRINTK
711 { 657 {
712 .ctl_name = KERN_PRINTK,
713 .procname = "printk", 658 .procname = "printk",
714 .data = &console_loglevel, 659 .data = &console_loglevel,
715 .maxlen = 4*sizeof(int), 660 .maxlen = 4*sizeof(int),
716 .mode = 0644, 661 .mode = 0644,
717 .proc_handler = &proc_dointvec, 662 .proc_handler = proc_dointvec,
718 }, 663 },
719 { 664 {
720 .ctl_name = KERN_PRINTK_RATELIMIT,
721 .procname = "printk_ratelimit", 665 .procname = "printk_ratelimit",
722 .data = &printk_ratelimit_state.interval, 666 .data = &printk_ratelimit_state.interval,
723 .maxlen = sizeof(int), 667 .maxlen = sizeof(int),
724 .mode = 0644, 668 .mode = 0644,
725 .proc_handler = &proc_dointvec_jiffies, 669 .proc_handler = proc_dointvec_jiffies,
726 .strategy = &sysctl_jiffies,
727 }, 670 },
728 { 671 {
729 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
730 .procname = "printk_ratelimit_burst", 672 .procname = "printk_ratelimit_burst",
731 .data = &printk_ratelimit_state.burst, 673 .data = &printk_ratelimit_state.burst,
732 .maxlen = sizeof(int), 674 .maxlen = sizeof(int),
733 .mode = 0644, 675 .mode = 0644,
734 .proc_handler = &proc_dointvec, 676 .proc_handler = proc_dointvec,
735 }, 677 },
736 { 678 {
737 .ctl_name = CTL_UNNUMBERED,
738 .procname = "printk_delay", 679 .procname = "printk_delay",
739 .data = &printk_delay_msec, 680 .data = &printk_delay_msec,
740 .maxlen = sizeof(int), 681 .maxlen = sizeof(int),
741 .mode = 0644, 682 .mode = 0644,
742 .proc_handler = &proc_dointvec_minmax, 683 .proc_handler = proc_dointvec_minmax,
743 .strategy = &sysctl_intvec,
744 .extra1 = &zero, 684 .extra1 = &zero,
745 .extra2 = &ten_thousand, 685 .extra2 = &ten_thousand,
746 }, 686 },
747#endif 687#endif
748 { 688 {
749 .ctl_name = KERN_NGROUPS_MAX,
750 .procname = "ngroups_max", 689 .procname = "ngroups_max",
751 .data = &ngroups_max, 690 .data = &ngroups_max,
752 .maxlen = sizeof (int), 691 .maxlen = sizeof (int),
753 .mode = 0444, 692 .mode = 0444,
754 .proc_handler = &proc_dointvec, 693 .proc_handler = proc_dointvec,
755 }, 694 },
756#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 695#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
757 { 696 {
758 .ctl_name = KERN_UNKNOWN_NMI_PANIC,
759 .procname = "unknown_nmi_panic", 697 .procname = "unknown_nmi_panic",
760 .data = &unknown_nmi_panic, 698 .data = &unknown_nmi_panic,
761 .maxlen = sizeof (int), 699 .maxlen = sizeof (int),
762 .mode = 0644, 700 .mode = 0644,
763 .proc_handler = &proc_dointvec, 701 .proc_handler = proc_dointvec,
764 }, 702 },
765 { 703 {
766 .procname = "nmi_watchdog", 704 .procname = "nmi_watchdog",
767 .data = &nmi_watchdog_enabled, 705 .data = &nmi_watchdog_enabled,
768 .maxlen = sizeof (int), 706 .maxlen = sizeof (int),
769 .mode = 0644, 707 .mode = 0644,
770 .proc_handler = &proc_nmi_enabled, 708 .proc_handler = proc_nmi_enabled,
771 }, 709 },
772#endif 710#endif
773#if defined(CONFIG_X86) 711#if defined(CONFIG_X86)
774 { 712 {
775 .ctl_name = KERN_PANIC_ON_NMI,
776 .procname = "panic_on_unrecovered_nmi", 713 .procname = "panic_on_unrecovered_nmi",
777 .data = &panic_on_unrecovered_nmi, 714 .data = &panic_on_unrecovered_nmi,
778 .maxlen = sizeof(int), 715 .maxlen = sizeof(int),
779 .mode = 0644, 716 .mode = 0644,
780 .proc_handler = &proc_dointvec, 717 .proc_handler = proc_dointvec,
781 }, 718 },
782 { 719 {
783 .ctl_name = CTL_UNNUMBERED,
784 .procname = "panic_on_io_nmi", 720 .procname = "panic_on_io_nmi",
785 .data = &panic_on_io_nmi, 721 .data = &panic_on_io_nmi,
786 .maxlen = sizeof(int), 722 .maxlen = sizeof(int),
787 .mode = 0644, 723 .mode = 0644,
788 .proc_handler = &proc_dointvec, 724 .proc_handler = proc_dointvec,
789 }, 725 },
790 { 726 {
791 .ctl_name = KERN_BOOTLOADER_TYPE,
792 .procname = "bootloader_type", 727 .procname = "bootloader_type",
793 .data = &bootloader_type, 728 .data = &bootloader_type,
794 .maxlen = sizeof (int), 729 .maxlen = sizeof (int),
795 .mode = 0444, 730 .mode = 0444,
796 .proc_handler = &proc_dointvec, 731 .proc_handler = proc_dointvec,
797 }, 732 },
798 { 733 {
799 .ctl_name = CTL_UNNUMBERED,
800 .procname = "bootloader_version", 734 .procname = "bootloader_version",
801 .data = &bootloader_version, 735 .data = &bootloader_version,
802 .maxlen = sizeof (int), 736 .maxlen = sizeof (int),
803 .mode = 0444, 737 .mode = 0444,
804 .proc_handler = &proc_dointvec, 738 .proc_handler = proc_dointvec,
805 }, 739 },
806 { 740 {
807 .ctl_name = CTL_UNNUMBERED,
808 .procname = "kstack_depth_to_print", 741 .procname = "kstack_depth_to_print",
809 .data = &kstack_depth_to_print, 742 .data = &kstack_depth_to_print,
810 .maxlen = sizeof(int), 743 .maxlen = sizeof(int),
811 .mode = 0644, 744 .mode = 0644,
812 .proc_handler = &proc_dointvec, 745 .proc_handler = proc_dointvec,
813 }, 746 },
814 { 747 {
815 .ctl_name = CTL_UNNUMBERED,
816 .procname = "io_delay_type", 748 .procname = "io_delay_type",
817 .data = &io_delay_type, 749 .data = &io_delay_type,
818 .maxlen = sizeof(int), 750 .maxlen = sizeof(int),
819 .mode = 0644, 751 .mode = 0644,
820 .proc_handler = &proc_dointvec, 752 .proc_handler = proc_dointvec,
821 }, 753 },
822#endif 754#endif
823#if defined(CONFIG_MMU) 755#if defined(CONFIG_MMU)
824 { 756 {
825 .ctl_name = KERN_RANDOMIZE,
826 .procname = "randomize_va_space", 757 .procname = "randomize_va_space",
827 .data = &randomize_va_space, 758 .data = &randomize_va_space,
828 .maxlen = sizeof(int), 759 .maxlen = sizeof(int),
829 .mode = 0644, 760 .mode = 0644,
830 .proc_handler = &proc_dointvec, 761 .proc_handler = proc_dointvec,
831 }, 762 },
832#endif 763#endif
833#if defined(CONFIG_S390) && defined(CONFIG_SMP) 764#if defined(CONFIG_S390) && defined(CONFIG_SMP)
834 { 765 {
835 .ctl_name = KERN_SPIN_RETRY,
836 .procname = "spin_retry", 766 .procname = "spin_retry",
837 .data = &spin_retry, 767 .data = &spin_retry,
838 .maxlen = sizeof (int), 768 .maxlen = sizeof (int),
839 .mode = 0644, 769 .mode = 0644,
840 .proc_handler = &proc_dointvec, 770 .proc_handler = proc_dointvec,
841 }, 771 },
842#endif 772#endif
843#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) 773#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
@@ -846,123 +776,104 @@ static struct ctl_table kern_table[] = {
846 .data = &acpi_realmode_flags, 776 .data = &acpi_realmode_flags,
847 .maxlen = sizeof (unsigned long), 777 .maxlen = sizeof (unsigned long),
848 .mode = 0644, 778 .mode = 0644,
849 .proc_handler = &proc_doulongvec_minmax, 779 .proc_handler = proc_doulongvec_minmax,
850 }, 780 },
851#endif 781#endif
852#ifdef CONFIG_IA64 782#ifdef CONFIG_IA64
853 { 783 {
854 .ctl_name = KERN_IA64_UNALIGNED,
855 .procname = "ignore-unaligned-usertrap", 784 .procname = "ignore-unaligned-usertrap",
856 .data = &no_unaligned_warning, 785 .data = &no_unaligned_warning,
857 .maxlen = sizeof (int), 786 .maxlen = sizeof (int),
858 .mode = 0644, 787 .mode = 0644,
859 .proc_handler = &proc_dointvec, 788 .proc_handler = proc_dointvec,
860 }, 789 },
861 { 790 {
862 .ctl_name = CTL_UNNUMBERED,
863 .procname = "unaligned-dump-stack", 791 .procname = "unaligned-dump-stack",
864 .data = &unaligned_dump_stack, 792 .data = &unaligned_dump_stack,
865 .maxlen = sizeof (int), 793 .maxlen = sizeof (int),
866 .mode = 0644, 794 .mode = 0644,
867 .proc_handler = &proc_dointvec, 795 .proc_handler = proc_dointvec,
868 }, 796 },
869#endif 797#endif
870#ifdef CONFIG_DETECT_SOFTLOCKUP 798#ifdef CONFIG_DETECT_SOFTLOCKUP
871 { 799 {
872 .ctl_name = CTL_UNNUMBERED,
873 .procname = "softlockup_panic", 800 .procname = "softlockup_panic",
874 .data = &softlockup_panic, 801 .data = &softlockup_panic,
875 .maxlen = sizeof(int), 802 .maxlen = sizeof(int),
876 .mode = 0644, 803 .mode = 0644,
877 .proc_handler = &proc_dointvec_minmax, 804 .proc_handler = proc_dointvec_minmax,
878 .strategy = &sysctl_intvec,
879 .extra1 = &zero, 805 .extra1 = &zero,
880 .extra2 = &one, 806 .extra2 = &one,
881 }, 807 },
882 { 808 {
883 .ctl_name = CTL_UNNUMBERED,
884 .procname = "softlockup_thresh", 809 .procname = "softlockup_thresh",
885 .data = &softlockup_thresh, 810 .data = &softlockup_thresh,
886 .maxlen = sizeof(int), 811 .maxlen = sizeof(int),
887 .mode = 0644, 812 .mode = 0644,
888 .proc_handler = &proc_dosoftlockup_thresh, 813 .proc_handler = proc_dosoftlockup_thresh,
889 .strategy = &sysctl_intvec,
890 .extra1 = &neg_one, 814 .extra1 = &neg_one,
891 .extra2 = &sixty, 815 .extra2 = &sixty,
892 }, 816 },
893#endif 817#endif
894#ifdef CONFIG_DETECT_HUNG_TASK 818#ifdef CONFIG_DETECT_HUNG_TASK
895 { 819 {
896 .ctl_name = CTL_UNNUMBERED,
897 .procname = "hung_task_panic", 820 .procname = "hung_task_panic",
898 .data = &sysctl_hung_task_panic, 821 .data = &sysctl_hung_task_panic,
899 .maxlen = sizeof(int), 822 .maxlen = sizeof(int),
900 .mode = 0644, 823 .mode = 0644,
901 .proc_handler = &proc_dointvec_minmax, 824 .proc_handler = proc_dointvec_minmax,
902 .strategy = &sysctl_intvec,
903 .extra1 = &zero, 825 .extra1 = &zero,
904 .extra2 = &one, 826 .extra2 = &one,
905 }, 827 },
906 { 828 {
907 .ctl_name = CTL_UNNUMBERED,
908 .procname = "hung_task_check_count", 829 .procname = "hung_task_check_count",
909 .data = &sysctl_hung_task_check_count, 830 .data = &sysctl_hung_task_check_count,
910 .maxlen = sizeof(unsigned long), 831 .maxlen = sizeof(unsigned long),
911 .mode = 0644, 832 .mode = 0644,
912 .proc_handler = &proc_doulongvec_minmax, 833 .proc_handler = proc_doulongvec_minmax,
913 .strategy = &sysctl_intvec,
914 }, 834 },
915 { 835 {
916 .ctl_name = CTL_UNNUMBERED,
917 .procname = "hung_task_timeout_secs", 836 .procname = "hung_task_timeout_secs",
918 .data = &sysctl_hung_task_timeout_secs, 837 .data = &sysctl_hung_task_timeout_secs,
919 .maxlen = sizeof(unsigned long), 838 .maxlen = sizeof(unsigned long),
920 .mode = 0644, 839 .mode = 0644,
921 .proc_handler = &proc_dohung_task_timeout_secs, 840 .proc_handler = proc_dohung_task_timeout_secs,
922 .strategy = &sysctl_intvec,
923 }, 841 },
924 { 842 {
925 .ctl_name = CTL_UNNUMBERED,
926 .procname = "hung_task_warnings", 843 .procname = "hung_task_warnings",
927 .data = &sysctl_hung_task_warnings, 844 .data = &sysctl_hung_task_warnings,
928 .maxlen = sizeof(unsigned long), 845 .maxlen = sizeof(unsigned long),
929 .mode = 0644, 846 .mode = 0644,
930 .proc_handler = &proc_doulongvec_minmax, 847 .proc_handler = proc_doulongvec_minmax,
931 .strategy = &sysctl_intvec,
932 }, 848 },
933#endif 849#endif
934#ifdef CONFIG_COMPAT 850#ifdef CONFIG_COMPAT
935 { 851 {
936 .ctl_name = KERN_COMPAT_LOG,
937 .procname = "compat-log", 852 .procname = "compat-log",
938 .data = &compat_log, 853 .data = &compat_log,
939 .maxlen = sizeof (int), 854 .maxlen = sizeof (int),
940 .mode = 0644, 855 .mode = 0644,
941 .proc_handler = &proc_dointvec, 856 .proc_handler = proc_dointvec,
942 }, 857 },
943#endif 858#endif
944#ifdef CONFIG_RT_MUTEXES 859#ifdef CONFIG_RT_MUTEXES
945 { 860 {
946 .ctl_name = KERN_MAX_LOCK_DEPTH,
947 .procname = "max_lock_depth", 861 .procname = "max_lock_depth",
948 .data = &max_lock_depth, 862 .data = &max_lock_depth,
949 .maxlen = sizeof(int), 863 .maxlen = sizeof(int),
950 .mode = 0644, 864 .mode = 0644,
951 .proc_handler = &proc_dointvec, 865 .proc_handler = proc_dointvec,
952 }, 866 },
953#endif 867#endif
954 { 868 {
955 .ctl_name = CTL_UNNUMBERED,
956 .procname = "poweroff_cmd", 869 .procname = "poweroff_cmd",
957 .data = &poweroff_cmd, 870 .data = &poweroff_cmd,
958 .maxlen = POWEROFF_CMD_PATH_LEN, 871 .maxlen = POWEROFF_CMD_PATH_LEN,
959 .mode = 0644, 872 .mode = 0644,
960 .proc_handler = &proc_dostring, 873 .proc_handler = proc_dostring,
961 .strategy = &sysctl_string,
962 }, 874 },
963#ifdef CONFIG_KEYS 875#ifdef CONFIG_KEYS
964 { 876 {
965 .ctl_name = CTL_UNNUMBERED,
966 .procname = "keys", 877 .procname = "keys",
967 .mode = 0555, 878 .mode = 0555,
968 .child = key_sysctls, 879 .child = key_sysctls,
@@ -970,17 +881,15 @@ static struct ctl_table kern_table[] = {
970#endif 881#endif
971#ifdef CONFIG_RCU_TORTURE_TEST 882#ifdef CONFIG_RCU_TORTURE_TEST
972 { 883 {
973 .ctl_name = CTL_UNNUMBERED,
974 .procname = "rcutorture_runnable", 884 .procname = "rcutorture_runnable",
975 .data = &rcutorture_runnable, 885 .data = &rcutorture_runnable,
976 .maxlen = sizeof(int), 886 .maxlen = sizeof(int),
977 .mode = 0644, 887 .mode = 0644,
978 .proc_handler = &proc_dointvec, 888 .proc_handler = proc_dointvec,
979 }, 889 },
980#endif 890#endif
981#ifdef CONFIG_SLOW_WORK 891#ifdef CONFIG_SLOW_WORK
982 { 892 {
983 .ctl_name = CTL_UNNUMBERED,
984 .procname = "slow-work", 893 .procname = "slow-work",
985 .mode = 0555, 894 .mode = 0555,
986 .child = slow_work_sysctls, 895 .child = slow_work_sysctls,
@@ -988,146 +897,127 @@ static struct ctl_table kern_table[] = {
988#endif 897#endif
989#ifdef CONFIG_PERF_EVENTS 898#ifdef CONFIG_PERF_EVENTS
990 { 899 {
991 .ctl_name = CTL_UNNUMBERED,
992 .procname = "perf_event_paranoid", 900 .procname = "perf_event_paranoid",
993 .data = &sysctl_perf_event_paranoid, 901 .data = &sysctl_perf_event_paranoid,
994 .maxlen = sizeof(sysctl_perf_event_paranoid), 902 .maxlen = sizeof(sysctl_perf_event_paranoid),
995 .mode = 0644, 903 .mode = 0644,
996 .proc_handler = &proc_dointvec, 904 .proc_handler = proc_dointvec,
997 }, 905 },
998 { 906 {
999 .ctl_name = CTL_UNNUMBERED,
1000 .procname = "perf_event_mlock_kb", 907 .procname = "perf_event_mlock_kb",
1001 .data = &sysctl_perf_event_mlock, 908 .data = &sysctl_perf_event_mlock,
1002 .maxlen = sizeof(sysctl_perf_event_mlock), 909 .maxlen = sizeof(sysctl_perf_event_mlock),
1003 .mode = 0644, 910 .mode = 0644,
1004 .proc_handler = &proc_dointvec, 911 .proc_handler = proc_dointvec,
1005 }, 912 },
1006 { 913 {
1007 .ctl_name = CTL_UNNUMBERED,
1008 .procname = "perf_event_max_sample_rate", 914 .procname = "perf_event_max_sample_rate",
1009 .data = &sysctl_perf_event_sample_rate, 915 .data = &sysctl_perf_event_sample_rate,
1010 .maxlen = sizeof(sysctl_perf_event_sample_rate), 916 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1011 .mode = 0644, 917 .mode = 0644,
1012 .proc_handler = &proc_dointvec, 918 .proc_handler = proc_dointvec,
1013 }, 919 },
1014#endif 920#endif
1015#ifdef CONFIG_KMEMCHECK 921#ifdef CONFIG_KMEMCHECK
1016 { 922 {
1017 .ctl_name = CTL_UNNUMBERED,
1018 .procname = "kmemcheck", 923 .procname = "kmemcheck",
1019 .data = &kmemcheck_enabled, 924 .data = &kmemcheck_enabled,
1020 .maxlen = sizeof(int), 925 .maxlen = sizeof(int),
1021 .mode = 0644, 926 .mode = 0644,
1022 .proc_handler = &proc_dointvec, 927 .proc_handler = proc_dointvec,
1023 }, 928 },
1024#endif 929#endif
1025#ifdef CONFIG_BLOCK 930#ifdef CONFIG_BLOCK
1026 { 931 {
1027 .ctl_name = CTL_UNNUMBERED,
1028 .procname = "blk_iopoll", 932 .procname = "blk_iopoll",
1029 .data = &blk_iopoll_enabled, 933 .data = &blk_iopoll_enabled,
1030 .maxlen = sizeof(int), 934 .maxlen = sizeof(int),
1031 .mode = 0644, 935 .mode = 0644,
1032 .proc_handler = &proc_dointvec, 936 .proc_handler = proc_dointvec,
1033 }, 937 },
1034#endif 938#endif
1035/* 939/*
1036 * NOTE: do not add new entries to this table unless you have read 940 * NOTE: do not add new entries to this table unless you have read
1037 * Documentation/sysctl/ctl_unnumbered.txt 941 * Documentation/sysctl/ctl_unnumbered.txt
1038 */ 942 */
1039 { .ctl_name = 0 } 943 { }
1040}; 944};
1041 945
1042static struct ctl_table vm_table[] = { 946static struct ctl_table vm_table[] = {
1043 { 947 {
1044 .ctl_name = VM_OVERCOMMIT_MEMORY,
1045 .procname = "overcommit_memory", 948 .procname = "overcommit_memory",
1046 .data = &sysctl_overcommit_memory, 949 .data = &sysctl_overcommit_memory,
1047 .maxlen = sizeof(sysctl_overcommit_memory), 950 .maxlen = sizeof(sysctl_overcommit_memory),
1048 .mode = 0644, 951 .mode = 0644,
1049 .proc_handler = &proc_dointvec, 952 .proc_handler = proc_dointvec,
1050 }, 953 },
1051 { 954 {
1052 .ctl_name = VM_PANIC_ON_OOM,
1053 .procname = "panic_on_oom", 955 .procname = "panic_on_oom",
1054 .data = &sysctl_panic_on_oom, 956 .data = &sysctl_panic_on_oom,
1055 .maxlen = sizeof(sysctl_panic_on_oom), 957 .maxlen = sizeof(sysctl_panic_on_oom),
1056 .mode = 0644, 958 .mode = 0644,
1057 .proc_handler = &proc_dointvec, 959 .proc_handler = proc_dointvec,
1058 }, 960 },
1059 { 961 {
1060 .ctl_name = CTL_UNNUMBERED,
1061 .procname = "oom_kill_allocating_task", 962 .procname = "oom_kill_allocating_task",
1062 .data = &sysctl_oom_kill_allocating_task, 963 .data = &sysctl_oom_kill_allocating_task,
1063 .maxlen = sizeof(sysctl_oom_kill_allocating_task), 964 .maxlen = sizeof(sysctl_oom_kill_allocating_task),
1064 .mode = 0644, 965 .mode = 0644,
1065 .proc_handler = &proc_dointvec, 966 .proc_handler = proc_dointvec,
1066 }, 967 },
1067 { 968 {
1068 .ctl_name = CTL_UNNUMBERED,
1069 .procname = "oom_dump_tasks", 969 .procname = "oom_dump_tasks",
1070 .data = &sysctl_oom_dump_tasks, 970 .data = &sysctl_oom_dump_tasks,
1071 .maxlen = sizeof(sysctl_oom_dump_tasks), 971 .maxlen = sizeof(sysctl_oom_dump_tasks),
1072 .mode = 0644, 972 .mode = 0644,
1073 .proc_handler = &proc_dointvec, 973 .proc_handler = proc_dointvec,
1074 }, 974 },
1075 { 975 {
1076 .ctl_name = VM_OVERCOMMIT_RATIO,
1077 .procname = "overcommit_ratio", 976 .procname = "overcommit_ratio",
1078 .data = &sysctl_overcommit_ratio, 977 .data = &sysctl_overcommit_ratio,
1079 .maxlen = sizeof(sysctl_overcommit_ratio), 978 .maxlen = sizeof(sysctl_overcommit_ratio),
1080 .mode = 0644, 979 .mode = 0644,
1081 .proc_handler = &proc_dointvec, 980 .proc_handler = proc_dointvec,
1082 }, 981 },
1083 { 982 {
1084 .ctl_name = VM_PAGE_CLUSTER,
1085 .procname = "page-cluster", 983 .procname = "page-cluster",
1086 .data = &page_cluster, 984 .data = &page_cluster,
1087 .maxlen = sizeof(int), 985 .maxlen = sizeof(int),
1088 .mode = 0644, 986 .mode = 0644,
1089 .proc_handler = &proc_dointvec, 987 .proc_handler = proc_dointvec,
1090 }, 988 },
1091 { 989 {
1092 .ctl_name = VM_DIRTY_BACKGROUND,
1093 .procname = "dirty_background_ratio", 990 .procname = "dirty_background_ratio",
1094 .data = &dirty_background_ratio, 991 .data = &dirty_background_ratio,
1095 .maxlen = sizeof(dirty_background_ratio), 992 .maxlen = sizeof(dirty_background_ratio),
1096 .mode = 0644, 993 .mode = 0644,
1097 .proc_handler = &dirty_background_ratio_handler, 994 .proc_handler = dirty_background_ratio_handler,
1098 .strategy = &sysctl_intvec,
1099 .extra1 = &zero, 995 .extra1 = &zero,
1100 .extra2 = &one_hundred, 996 .extra2 = &one_hundred,
1101 }, 997 },
1102 { 998 {
1103 .ctl_name = CTL_UNNUMBERED,
1104 .procname = "dirty_background_bytes", 999 .procname = "dirty_background_bytes",
1105 .data = &dirty_background_bytes, 1000 .data = &dirty_background_bytes,
1106 .maxlen = sizeof(dirty_background_bytes), 1001 .maxlen = sizeof(dirty_background_bytes),
1107 .mode = 0644, 1002 .mode = 0644,
1108 .proc_handler = &dirty_background_bytes_handler, 1003 .proc_handler = dirty_background_bytes_handler,
1109 .strategy = &sysctl_intvec,
1110 .extra1 = &one_ul, 1004 .extra1 = &one_ul,
1111 }, 1005 },
1112 { 1006 {
1113 .ctl_name = VM_DIRTY_RATIO,
1114 .procname = "dirty_ratio", 1007 .procname = "dirty_ratio",
1115 .data = &vm_dirty_ratio, 1008 .data = &vm_dirty_ratio,
1116 .maxlen = sizeof(vm_dirty_ratio), 1009 .maxlen = sizeof(vm_dirty_ratio),
1117 .mode = 0644, 1010 .mode = 0644,
1118 .proc_handler = &dirty_ratio_handler, 1011 .proc_handler = dirty_ratio_handler,
1119 .strategy = &sysctl_intvec,
1120 .extra1 = &zero, 1012 .extra1 = &zero,
1121 .extra2 = &one_hundred, 1013 .extra2 = &one_hundred,
1122 }, 1014 },
1123 { 1015 {
1124 .ctl_name = CTL_UNNUMBERED,
1125 .procname = "dirty_bytes", 1016 .procname = "dirty_bytes",
1126 .data = &vm_dirty_bytes, 1017 .data = &vm_dirty_bytes,
1127 .maxlen = sizeof(vm_dirty_bytes), 1018 .maxlen = sizeof(vm_dirty_bytes),
1128 .mode = 0644, 1019 .mode = 0644,
1129 .proc_handler = &dirty_bytes_handler, 1020 .proc_handler = dirty_bytes_handler,
1130 .strategy = &sysctl_intvec,
1131 .extra1 = &dirty_bytes_min, 1021 .extra1 = &dirty_bytes_min,
1132 }, 1022 },
1133 { 1023 {
@@ -1135,289 +1025,258 @@ static struct ctl_table vm_table[] = {
1135 .data = &dirty_writeback_interval, 1025 .data = &dirty_writeback_interval,
1136 .maxlen = sizeof(dirty_writeback_interval), 1026 .maxlen = sizeof(dirty_writeback_interval),
1137 .mode = 0644, 1027 .mode = 0644,
1138 .proc_handler = &dirty_writeback_centisecs_handler, 1028 .proc_handler = dirty_writeback_centisecs_handler,
1139 }, 1029 },
1140 { 1030 {
1141 .procname = "dirty_expire_centisecs", 1031 .procname = "dirty_expire_centisecs",
1142 .data = &dirty_expire_interval, 1032 .data = &dirty_expire_interval,
1143 .maxlen = sizeof(dirty_expire_interval), 1033 .maxlen = sizeof(dirty_expire_interval),
1144 .mode = 0644, 1034 .mode = 0644,
1145 .proc_handler = &proc_dointvec, 1035 .proc_handler = proc_dointvec,
1146 }, 1036 },
1147 { 1037 {
1148 .ctl_name = VM_NR_PDFLUSH_THREADS,
1149 .procname = "nr_pdflush_threads", 1038 .procname = "nr_pdflush_threads",
1150 .data = &nr_pdflush_threads, 1039 .data = &nr_pdflush_threads,
1151 .maxlen = sizeof nr_pdflush_threads, 1040 .maxlen = sizeof nr_pdflush_threads,
1152 .mode = 0444 /* read-only*/, 1041 .mode = 0444 /* read-only*/,
1153 .proc_handler = &proc_dointvec, 1042 .proc_handler = proc_dointvec,
1154 }, 1043 },
1155 { 1044 {
1156 .ctl_name = VM_SWAPPINESS,
1157 .procname = "swappiness", 1045 .procname = "swappiness",
1158 .data = &vm_swappiness, 1046 .data = &vm_swappiness,
1159 .maxlen = sizeof(vm_swappiness), 1047 .maxlen = sizeof(vm_swappiness),
1160 .mode = 0644, 1048 .mode = 0644,
1161 .proc_handler = &proc_dointvec_minmax, 1049 .proc_handler = proc_dointvec_minmax,
1162 .strategy = &sysctl_intvec,
1163 .extra1 = &zero, 1050 .extra1 = &zero,
1164 .extra2 = &one_hundred, 1051 .extra2 = &one_hundred,
1165 }, 1052 },
1166#ifdef CONFIG_HUGETLB_PAGE 1053#ifdef CONFIG_HUGETLB_PAGE
1167 { 1054 {
1168 .procname = "nr_hugepages", 1055 .procname = "nr_hugepages",
1169 .data = NULL, 1056 .data = NULL,
1170 .maxlen = sizeof(unsigned long), 1057 .maxlen = sizeof(unsigned long),
1171 .mode = 0644, 1058 .mode = 0644,
1172 .proc_handler = &hugetlb_sysctl_handler, 1059 .proc_handler = hugetlb_sysctl_handler,
1173 .extra1 = (void *)&hugetlb_zero, 1060 .extra1 = (void *)&hugetlb_zero,
1174 .extra2 = (void *)&hugetlb_infinity, 1061 .extra2 = (void *)&hugetlb_infinity,
1175 }, 1062 },
1063#ifdef CONFIG_NUMA
1064 {
1065 .procname = "nr_hugepages_mempolicy",
1066 .data = NULL,
1067 .maxlen = sizeof(unsigned long),
1068 .mode = 0644,
1069 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1070 .extra1 = (void *)&hugetlb_zero,
1071 .extra2 = (void *)&hugetlb_infinity,
1072 },
1073#endif
1176 { 1074 {
1177 .ctl_name = VM_HUGETLB_GROUP,
1178 .procname = "hugetlb_shm_group", 1075 .procname = "hugetlb_shm_group",
1179 .data = &sysctl_hugetlb_shm_group, 1076 .data = &sysctl_hugetlb_shm_group,
1180 .maxlen = sizeof(gid_t), 1077 .maxlen = sizeof(gid_t),
1181 .mode = 0644, 1078 .mode = 0644,
1182 .proc_handler = &proc_dointvec, 1079 .proc_handler = proc_dointvec,
1183 }, 1080 },
1184 { 1081 {
1185 .ctl_name = CTL_UNNUMBERED,
1186 .procname = "hugepages_treat_as_movable", 1082 .procname = "hugepages_treat_as_movable",
1187 .data = &hugepages_treat_as_movable, 1083 .data = &hugepages_treat_as_movable,
1188 .maxlen = sizeof(int), 1084 .maxlen = sizeof(int),
1189 .mode = 0644, 1085 .mode = 0644,
1190 .proc_handler = &hugetlb_treat_movable_handler, 1086 .proc_handler = hugetlb_treat_movable_handler,
1191 }, 1087 },
1192 { 1088 {
1193 .ctl_name = CTL_UNNUMBERED,
1194 .procname = "nr_overcommit_hugepages", 1089 .procname = "nr_overcommit_hugepages",
1195 .data = NULL, 1090 .data = NULL,
1196 .maxlen = sizeof(unsigned long), 1091 .maxlen = sizeof(unsigned long),
1197 .mode = 0644, 1092 .mode = 0644,
1198 .proc_handler = &hugetlb_overcommit_handler, 1093 .proc_handler = hugetlb_overcommit_handler,
1199 .extra1 = (void *)&hugetlb_zero, 1094 .extra1 = (void *)&hugetlb_zero,
1200 .extra2 = (void *)&hugetlb_infinity, 1095 .extra2 = (void *)&hugetlb_infinity,
1201 }, 1096 },
1202#endif 1097#endif
1203 { 1098 {
1204 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
1205 .procname = "lowmem_reserve_ratio", 1099 .procname = "lowmem_reserve_ratio",
1206 .data = &sysctl_lowmem_reserve_ratio, 1100 .data = &sysctl_lowmem_reserve_ratio,
1207 .maxlen = sizeof(sysctl_lowmem_reserve_ratio), 1101 .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
1208 .mode = 0644, 1102 .mode = 0644,
1209 .proc_handler = &lowmem_reserve_ratio_sysctl_handler, 1103 .proc_handler = lowmem_reserve_ratio_sysctl_handler,
1210 .strategy = &sysctl_intvec,
1211 }, 1104 },
1212 { 1105 {
1213 .ctl_name = VM_DROP_PAGECACHE,
1214 .procname = "drop_caches", 1106 .procname = "drop_caches",
1215 .data = &sysctl_drop_caches, 1107 .data = &sysctl_drop_caches,
1216 .maxlen = sizeof(int), 1108 .maxlen = sizeof(int),
1217 .mode = 0644, 1109 .mode = 0644,
1218 .proc_handler = drop_caches_sysctl_handler, 1110 .proc_handler = drop_caches_sysctl_handler,
1219 .strategy = &sysctl_intvec,
1220 }, 1111 },
1221 { 1112 {
1222 .ctl_name = VM_MIN_FREE_KBYTES,
1223 .procname = "min_free_kbytes", 1113 .procname = "min_free_kbytes",
1224 .data = &min_free_kbytes, 1114 .data = &min_free_kbytes,
1225 .maxlen = sizeof(min_free_kbytes), 1115 .maxlen = sizeof(min_free_kbytes),
1226 .mode = 0644, 1116 .mode = 0644,
1227 .proc_handler = &min_free_kbytes_sysctl_handler, 1117 .proc_handler = min_free_kbytes_sysctl_handler,
1228 .strategy = &sysctl_intvec,
1229 .extra1 = &zero, 1118 .extra1 = &zero,
1230 }, 1119 },
1231 { 1120 {
1232 .ctl_name = VM_PERCPU_PAGELIST_FRACTION,
1233 .procname = "percpu_pagelist_fraction", 1121 .procname = "percpu_pagelist_fraction",
1234 .data = &percpu_pagelist_fraction, 1122 .data = &percpu_pagelist_fraction,
1235 .maxlen = sizeof(percpu_pagelist_fraction), 1123 .maxlen = sizeof(percpu_pagelist_fraction),
1236 .mode = 0644, 1124 .mode = 0644,
1237 .proc_handler = &percpu_pagelist_fraction_sysctl_handler, 1125 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
1238 .strategy = &sysctl_intvec,
1239 .extra1 = &min_percpu_pagelist_fract, 1126 .extra1 = &min_percpu_pagelist_fract,
1240 }, 1127 },
1241#ifdef CONFIG_MMU 1128#ifdef CONFIG_MMU
1242 { 1129 {
1243 .ctl_name = VM_MAX_MAP_COUNT,
1244 .procname = "max_map_count", 1130 .procname = "max_map_count",
1245 .data = &sysctl_max_map_count, 1131 .data = &sysctl_max_map_count,
1246 .maxlen = sizeof(sysctl_max_map_count), 1132 .maxlen = sizeof(sysctl_max_map_count),
1247 .mode = 0644, 1133 .mode = 0644,
1248 .proc_handler = &proc_dointvec 1134 .proc_handler = proc_dointvec_minmax,
1135 .extra1 = &zero,
1249 }, 1136 },
1250#else 1137#else
1251 { 1138 {
1252 .ctl_name = CTL_UNNUMBERED,
1253 .procname = "nr_trim_pages", 1139 .procname = "nr_trim_pages",
1254 .data = &sysctl_nr_trim_pages, 1140 .data = &sysctl_nr_trim_pages,
1255 .maxlen = sizeof(sysctl_nr_trim_pages), 1141 .maxlen = sizeof(sysctl_nr_trim_pages),
1256 .mode = 0644, 1142 .mode = 0644,
1257 .proc_handler = &proc_dointvec_minmax, 1143 .proc_handler = proc_dointvec_minmax,
1258 .strategy = &sysctl_intvec,
1259 .extra1 = &zero, 1144 .extra1 = &zero,
1260 }, 1145 },
1261#endif 1146#endif
1262 { 1147 {
1263 .ctl_name = VM_LAPTOP_MODE,
1264 .procname = "laptop_mode", 1148 .procname = "laptop_mode",
1265 .data = &laptop_mode, 1149 .data = &laptop_mode,
1266 .maxlen = sizeof(laptop_mode), 1150 .maxlen = sizeof(laptop_mode),
1267 .mode = 0644, 1151 .mode = 0644,
1268 .proc_handler = &proc_dointvec_jiffies, 1152 .proc_handler = proc_dointvec_jiffies,
1269 .strategy = &sysctl_jiffies,
1270 }, 1153 },
1271 { 1154 {
1272 .ctl_name = VM_BLOCK_DUMP,
1273 .procname = "block_dump", 1155 .procname = "block_dump",
1274 .data = &block_dump, 1156 .data = &block_dump,
1275 .maxlen = sizeof(block_dump), 1157 .maxlen = sizeof(block_dump),
1276 .mode = 0644, 1158 .mode = 0644,
1277 .proc_handler = &proc_dointvec, 1159 .proc_handler = proc_dointvec,
1278 .strategy = &sysctl_intvec,
1279 .extra1 = &zero, 1160 .extra1 = &zero,
1280 }, 1161 },
1281 { 1162 {
1282 .ctl_name = VM_VFS_CACHE_PRESSURE,
1283 .procname = "vfs_cache_pressure", 1163 .procname = "vfs_cache_pressure",
1284 .data = &sysctl_vfs_cache_pressure, 1164 .data = &sysctl_vfs_cache_pressure,
1285 .maxlen = sizeof(sysctl_vfs_cache_pressure), 1165 .maxlen = sizeof(sysctl_vfs_cache_pressure),
1286 .mode = 0644, 1166 .mode = 0644,
1287 .proc_handler = &proc_dointvec, 1167 .proc_handler = proc_dointvec,
1288 .strategy = &sysctl_intvec,
1289 .extra1 = &zero, 1168 .extra1 = &zero,
1290 }, 1169 },
1291#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1170#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
1292 { 1171 {
1293 .ctl_name = VM_LEGACY_VA_LAYOUT,
1294 .procname = "legacy_va_layout", 1172 .procname = "legacy_va_layout",
1295 .data = &sysctl_legacy_va_layout, 1173 .data = &sysctl_legacy_va_layout,
1296 .maxlen = sizeof(sysctl_legacy_va_layout), 1174 .maxlen = sizeof(sysctl_legacy_va_layout),
1297 .mode = 0644, 1175 .mode = 0644,
1298 .proc_handler = &proc_dointvec, 1176 .proc_handler = proc_dointvec,
1299 .strategy = &sysctl_intvec,
1300 .extra1 = &zero, 1177 .extra1 = &zero,
1301 }, 1178 },
1302#endif 1179#endif
1303#ifdef CONFIG_NUMA 1180#ifdef CONFIG_NUMA
1304 { 1181 {
1305 .ctl_name = VM_ZONE_RECLAIM_MODE,
1306 .procname = "zone_reclaim_mode", 1182 .procname = "zone_reclaim_mode",
1307 .data = &zone_reclaim_mode, 1183 .data = &zone_reclaim_mode,
1308 .maxlen = sizeof(zone_reclaim_mode), 1184 .maxlen = sizeof(zone_reclaim_mode),
1309 .mode = 0644, 1185 .mode = 0644,
1310 .proc_handler = &proc_dointvec, 1186 .proc_handler = proc_dointvec,
1311 .strategy = &sysctl_intvec,
1312 .extra1 = &zero, 1187 .extra1 = &zero,
1313 }, 1188 },
1314 { 1189 {
1315 .ctl_name = VM_MIN_UNMAPPED,
1316 .procname = "min_unmapped_ratio", 1190 .procname = "min_unmapped_ratio",
1317 .data = &sysctl_min_unmapped_ratio, 1191 .data = &sysctl_min_unmapped_ratio,
1318 .maxlen = sizeof(sysctl_min_unmapped_ratio), 1192 .maxlen = sizeof(sysctl_min_unmapped_ratio),
1319 .mode = 0644, 1193 .mode = 0644,
1320 .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, 1194 .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
1321 .strategy = &sysctl_intvec,
1322 .extra1 = &zero, 1195 .extra1 = &zero,
1323 .extra2 = &one_hundred, 1196 .extra2 = &one_hundred,
1324 }, 1197 },
1325 { 1198 {
1326 .ctl_name = VM_MIN_SLAB,
1327 .procname = "min_slab_ratio", 1199 .procname = "min_slab_ratio",
1328 .data = &sysctl_min_slab_ratio, 1200 .data = &sysctl_min_slab_ratio,
1329 .maxlen = sizeof(sysctl_min_slab_ratio), 1201 .maxlen = sizeof(sysctl_min_slab_ratio),
1330 .mode = 0644, 1202 .mode = 0644,
1331 .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, 1203 .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
1332 .strategy = &sysctl_intvec,
1333 .extra1 = &zero, 1204 .extra1 = &zero,
1334 .extra2 = &one_hundred, 1205 .extra2 = &one_hundred,
1335 }, 1206 },
1336#endif 1207#endif
1337#ifdef CONFIG_SMP 1208#ifdef CONFIG_SMP
1338 { 1209 {
1339 .ctl_name = CTL_UNNUMBERED,
1340 .procname = "stat_interval", 1210 .procname = "stat_interval",
1341 .data = &sysctl_stat_interval, 1211 .data = &sysctl_stat_interval,
1342 .maxlen = sizeof(sysctl_stat_interval), 1212 .maxlen = sizeof(sysctl_stat_interval),
1343 .mode = 0644, 1213 .mode = 0644,
1344 .proc_handler = &proc_dointvec_jiffies, 1214 .proc_handler = proc_dointvec_jiffies,
1345 .strategy = &sysctl_jiffies,
1346 }, 1215 },
1347#endif 1216#endif
1217#ifdef CONFIG_MMU
1348 { 1218 {
1349 .ctl_name = CTL_UNNUMBERED,
1350 .procname = "mmap_min_addr", 1219 .procname = "mmap_min_addr",
1351 .data = &dac_mmap_min_addr, 1220 .data = &dac_mmap_min_addr,
1352 .maxlen = sizeof(unsigned long), 1221 .maxlen = sizeof(unsigned long),
1353 .mode = 0644, 1222 .mode = 0644,
1354 .proc_handler = &mmap_min_addr_handler, 1223 .proc_handler = mmap_min_addr_handler,
1355 }, 1224 },
1225#endif
1356#ifdef CONFIG_NUMA 1226#ifdef CONFIG_NUMA
1357 { 1227 {
1358 .ctl_name = CTL_UNNUMBERED,
1359 .procname = "numa_zonelist_order", 1228 .procname = "numa_zonelist_order",
1360 .data = &numa_zonelist_order, 1229 .data = &numa_zonelist_order,
1361 .maxlen = NUMA_ZONELIST_ORDER_LEN, 1230 .maxlen = NUMA_ZONELIST_ORDER_LEN,
1362 .mode = 0644, 1231 .mode = 0644,
1363 .proc_handler = &numa_zonelist_order_handler, 1232 .proc_handler = numa_zonelist_order_handler,
1364 .strategy = &sysctl_string,
1365 }, 1233 },
1366#endif 1234#endif
1367#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ 1235#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
1368 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1236 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1369 { 1237 {
1370 .ctl_name = VM_VDSO_ENABLED,
1371 .procname = "vdso_enabled", 1238 .procname = "vdso_enabled",
1372 .data = &vdso_enabled, 1239 .data = &vdso_enabled,
1373 .maxlen = sizeof(vdso_enabled), 1240 .maxlen = sizeof(vdso_enabled),
1374 .mode = 0644, 1241 .mode = 0644,
1375 .proc_handler = &proc_dointvec, 1242 .proc_handler = proc_dointvec,
1376 .strategy = &sysctl_intvec,
1377 .extra1 = &zero, 1243 .extra1 = &zero,
1378 }, 1244 },
1379#endif 1245#endif
1380#ifdef CONFIG_HIGHMEM 1246#ifdef CONFIG_HIGHMEM
1381 { 1247 {
1382 .ctl_name = CTL_UNNUMBERED,
1383 .procname = "highmem_is_dirtyable", 1248 .procname = "highmem_is_dirtyable",
1384 .data = &vm_highmem_is_dirtyable, 1249 .data = &vm_highmem_is_dirtyable,
1385 .maxlen = sizeof(vm_highmem_is_dirtyable), 1250 .maxlen = sizeof(vm_highmem_is_dirtyable),
1386 .mode = 0644, 1251 .mode = 0644,
1387 .proc_handler = &proc_dointvec_minmax, 1252 .proc_handler = proc_dointvec_minmax,
1388 .strategy = &sysctl_intvec,
1389 .extra1 = &zero, 1253 .extra1 = &zero,
1390 .extra2 = &one, 1254 .extra2 = &one,
1391 }, 1255 },
1392#endif 1256#endif
1393 { 1257 {
1394 .ctl_name = CTL_UNNUMBERED,
1395 .procname = "scan_unevictable_pages", 1258 .procname = "scan_unevictable_pages",
1396 .data = &scan_unevictable_pages, 1259 .data = &scan_unevictable_pages,
1397 .maxlen = sizeof(scan_unevictable_pages), 1260 .maxlen = sizeof(scan_unevictable_pages),
1398 .mode = 0644, 1261 .mode = 0644,
1399 .proc_handler = &scan_unevictable_handler, 1262 .proc_handler = scan_unevictable_handler,
1400 }, 1263 },
1401#ifdef CONFIG_MEMORY_FAILURE 1264#ifdef CONFIG_MEMORY_FAILURE
1402 { 1265 {
1403 .ctl_name = CTL_UNNUMBERED,
1404 .procname = "memory_failure_early_kill", 1266 .procname = "memory_failure_early_kill",
1405 .data = &sysctl_memory_failure_early_kill, 1267 .data = &sysctl_memory_failure_early_kill,
1406 .maxlen = sizeof(sysctl_memory_failure_early_kill), 1268 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1407 .mode = 0644, 1269 .mode = 0644,
1408 .proc_handler = &proc_dointvec_minmax, 1270 .proc_handler = proc_dointvec_minmax,
1409 .strategy = &sysctl_intvec,
1410 .extra1 = &zero, 1271 .extra1 = &zero,
1411 .extra2 = &one, 1272 .extra2 = &one,
1412 }, 1273 },
1413 { 1274 {
1414 .ctl_name = CTL_UNNUMBERED,
1415 .procname = "memory_failure_recovery", 1275 .procname = "memory_failure_recovery",
1416 .data = &sysctl_memory_failure_recovery, 1276 .data = &sysctl_memory_failure_recovery,
1417 .maxlen = sizeof(sysctl_memory_failure_recovery), 1277 .maxlen = sizeof(sysctl_memory_failure_recovery),
1418 .mode = 0644, 1278 .mode = 0644,
1419 .proc_handler = &proc_dointvec_minmax, 1279 .proc_handler = proc_dointvec_minmax,
1420 .strategy = &sysctl_intvec,
1421 .extra1 = &zero, 1280 .extra1 = &zero,
1422 .extra2 = &one, 1281 .extra2 = &one,
1423 }, 1282 },
@@ -1427,116 +1286,104 @@ static struct ctl_table vm_table[] = {
1427 * NOTE: do not add new entries to this table unless you have read 1286 * NOTE: do not add new entries to this table unless you have read
1428 * Documentation/sysctl/ctl_unnumbered.txt 1287 * Documentation/sysctl/ctl_unnumbered.txt
1429 */ 1288 */
1430 { .ctl_name = 0 } 1289 { }
1431}; 1290};
1432 1291
1433#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1292#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1434static struct ctl_table binfmt_misc_table[] = { 1293static struct ctl_table binfmt_misc_table[] = {
1435 { .ctl_name = 0 } 1294 { }
1436}; 1295};
1437#endif 1296#endif
1438 1297
1439static struct ctl_table fs_table[] = { 1298static struct ctl_table fs_table[] = {
1440 { 1299 {
1441 .ctl_name = FS_NRINODE,
1442 .procname = "inode-nr", 1300 .procname = "inode-nr",
1443 .data = &inodes_stat, 1301 .data = &inodes_stat,
1444 .maxlen = 2*sizeof(int), 1302 .maxlen = 2*sizeof(int),
1445 .mode = 0444, 1303 .mode = 0444,
1446 .proc_handler = &proc_dointvec, 1304 .proc_handler = proc_dointvec,
1447 }, 1305 },
1448 { 1306 {
1449 .ctl_name = FS_STATINODE,
1450 .procname = "inode-state", 1307 .procname = "inode-state",
1451 .data = &inodes_stat, 1308 .data = &inodes_stat,
1452 .maxlen = 7*sizeof(int), 1309 .maxlen = 7*sizeof(int),
1453 .mode = 0444, 1310 .mode = 0444,
1454 .proc_handler = &proc_dointvec, 1311 .proc_handler = proc_dointvec,
1455 }, 1312 },
1456 { 1313 {
1457 .procname = "file-nr", 1314 .procname = "file-nr",
1458 .data = &files_stat, 1315 .data = &files_stat,
1459 .maxlen = 3*sizeof(int), 1316 .maxlen = 3*sizeof(int),
1460 .mode = 0444, 1317 .mode = 0444,
1461 .proc_handler = &proc_nr_files, 1318 .proc_handler = proc_nr_files,
1462 }, 1319 },
1463 { 1320 {
1464 .ctl_name = FS_MAXFILE,
1465 .procname = "file-max", 1321 .procname = "file-max",
1466 .data = &files_stat.max_files, 1322 .data = &files_stat.max_files,
1467 .maxlen = sizeof(int), 1323 .maxlen = sizeof(int),
1468 .mode = 0644, 1324 .mode = 0644,
1469 .proc_handler = &proc_dointvec, 1325 .proc_handler = proc_dointvec,
1470 }, 1326 },
1471 { 1327 {
1472 .ctl_name = CTL_UNNUMBERED,
1473 .procname = "nr_open", 1328 .procname = "nr_open",
1474 .data = &sysctl_nr_open, 1329 .data = &sysctl_nr_open,
1475 .maxlen = sizeof(int), 1330 .maxlen = sizeof(int),
1476 .mode = 0644, 1331 .mode = 0644,
1477 .proc_handler = &proc_dointvec_minmax, 1332 .proc_handler = proc_dointvec_minmax,
1478 .extra1 = &sysctl_nr_open_min, 1333 .extra1 = &sysctl_nr_open_min,
1479 .extra2 = &sysctl_nr_open_max, 1334 .extra2 = &sysctl_nr_open_max,
1480 }, 1335 },
1481 { 1336 {
1482 .ctl_name = FS_DENTRY,
1483 .procname = "dentry-state", 1337 .procname = "dentry-state",
1484 .data = &dentry_stat, 1338 .data = &dentry_stat,
1485 .maxlen = 6*sizeof(int), 1339 .maxlen = 6*sizeof(int),
1486 .mode = 0444, 1340 .mode = 0444,
1487 .proc_handler = &proc_dointvec, 1341 .proc_handler = proc_dointvec,
1488 }, 1342 },
1489 { 1343 {
1490 .ctl_name = FS_OVERFLOWUID,
1491 .procname = "overflowuid", 1344 .procname = "overflowuid",
1492 .data = &fs_overflowuid, 1345 .data = &fs_overflowuid,
1493 .maxlen = sizeof(int), 1346 .maxlen = sizeof(int),
1494 .mode = 0644, 1347 .mode = 0644,
1495 .proc_handler = &proc_dointvec_minmax, 1348 .proc_handler = proc_dointvec_minmax,
1496 .strategy = &sysctl_intvec,
1497 .extra1 = &minolduid, 1349 .extra1 = &minolduid,
1498 .extra2 = &maxolduid, 1350 .extra2 = &maxolduid,
1499 }, 1351 },
1500 { 1352 {
1501 .ctl_name = FS_OVERFLOWGID,
1502 .procname = "overflowgid", 1353 .procname = "overflowgid",
1503 .data = &fs_overflowgid, 1354 .data = &fs_overflowgid,
1504 .maxlen = sizeof(int), 1355 .maxlen = sizeof(int),
1505 .mode = 0644, 1356 .mode = 0644,
1506 .proc_handler = &proc_dointvec_minmax, 1357 .proc_handler = proc_dointvec_minmax,
1507 .strategy = &sysctl_intvec,
1508 .extra1 = &minolduid, 1358 .extra1 = &minolduid,
1509 .extra2 = &maxolduid, 1359 .extra2 = &maxolduid,
1510 }, 1360 },
1511#ifdef CONFIG_FILE_LOCKING 1361#ifdef CONFIG_FILE_LOCKING
1512 { 1362 {
1513 .ctl_name = FS_LEASES,
1514 .procname = "leases-enable", 1363 .procname = "leases-enable",
1515 .data = &leases_enable, 1364 .data = &leases_enable,
1516 .maxlen = sizeof(int), 1365 .maxlen = sizeof(int),
1517 .mode = 0644, 1366 .mode = 0644,
1518 .proc_handler = &proc_dointvec, 1367 .proc_handler = proc_dointvec,
1519 }, 1368 },
1520#endif 1369#endif
1521#ifdef CONFIG_DNOTIFY 1370#ifdef CONFIG_DNOTIFY
1522 { 1371 {
1523 .ctl_name = FS_DIR_NOTIFY,
1524 .procname = "dir-notify-enable", 1372 .procname = "dir-notify-enable",
1525 .data = &dir_notify_enable, 1373 .data = &dir_notify_enable,
1526 .maxlen = sizeof(int), 1374 .maxlen = sizeof(int),
1527 .mode = 0644, 1375 .mode = 0644,
1528 .proc_handler = &proc_dointvec, 1376 .proc_handler = proc_dointvec,
1529 }, 1377 },
1530#endif 1378#endif
1531#ifdef CONFIG_MMU 1379#ifdef CONFIG_MMU
1532#ifdef CONFIG_FILE_LOCKING 1380#ifdef CONFIG_FILE_LOCKING
1533 { 1381 {
1534 .ctl_name = FS_LEASE_TIME,
1535 .procname = "lease-break-time", 1382 .procname = "lease-break-time",
1536 .data = &lease_break_time, 1383 .data = &lease_break_time,
1537 .maxlen = sizeof(int), 1384 .maxlen = sizeof(int),
1538 .mode = 0644, 1385 .mode = 0644,
1539 .proc_handler = &proc_dointvec, 1386 .proc_handler = proc_dointvec,
1540 }, 1387 },
1541#endif 1388#endif
1542#ifdef CONFIG_AIO 1389#ifdef CONFIG_AIO
@@ -1545,19 +1392,18 @@ static struct ctl_table fs_table[] = {
1545 .data = &aio_nr, 1392 .data = &aio_nr,
1546 .maxlen = sizeof(aio_nr), 1393 .maxlen = sizeof(aio_nr),
1547 .mode = 0444, 1394 .mode = 0444,
1548 .proc_handler = &proc_doulongvec_minmax, 1395 .proc_handler = proc_doulongvec_minmax,
1549 }, 1396 },
1550 { 1397 {
1551 .procname = "aio-max-nr", 1398 .procname = "aio-max-nr",
1552 .data = &aio_max_nr, 1399 .data = &aio_max_nr,
1553 .maxlen = sizeof(aio_max_nr), 1400 .maxlen = sizeof(aio_max_nr),
1554 .mode = 0644, 1401 .mode = 0644,
1555 .proc_handler = &proc_doulongvec_minmax, 1402 .proc_handler = proc_doulongvec_minmax,
1556 }, 1403 },
1557#endif /* CONFIG_AIO */ 1404#endif /* CONFIG_AIO */
1558#ifdef CONFIG_INOTIFY_USER 1405#ifdef CONFIG_INOTIFY_USER
1559 { 1406 {
1560 .ctl_name = FS_INOTIFY,
1561 .procname = "inotify", 1407 .procname = "inotify",
1562 .mode = 0555, 1408 .mode = 0555,
1563 .child = inotify_table, 1409 .child = inotify_table,
@@ -1572,19 +1418,16 @@ static struct ctl_table fs_table[] = {
1572#endif 1418#endif
1573#endif 1419#endif
1574 { 1420 {
1575 .ctl_name = KERN_SETUID_DUMPABLE,
1576 .procname = "suid_dumpable", 1421 .procname = "suid_dumpable",
1577 .data = &suid_dumpable, 1422 .data = &suid_dumpable,
1578 .maxlen = sizeof(int), 1423 .maxlen = sizeof(int),
1579 .mode = 0644, 1424 .mode = 0644,
1580 .proc_handler = &proc_dointvec_minmax, 1425 .proc_handler = proc_dointvec_minmax,
1581 .strategy = &sysctl_intvec,
1582 .extra1 = &zero, 1426 .extra1 = &zero,
1583 .extra2 = &two, 1427 .extra2 = &two,
1584 }, 1428 },
1585#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1429#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1586 { 1430 {
1587 .ctl_name = CTL_UNNUMBERED,
1588 .procname = "binfmt_misc", 1431 .procname = "binfmt_misc",
1589 .mode = 0555, 1432 .mode = 0555,
1590 .child = binfmt_misc_table, 1433 .child = binfmt_misc_table,
@@ -1594,13 +1437,12 @@ static struct ctl_table fs_table[] = {
1594 * NOTE: do not add new entries to this table unless you have read 1437 * NOTE: do not add new entries to this table unless you have read
1595 * Documentation/sysctl/ctl_unnumbered.txt 1438 * Documentation/sysctl/ctl_unnumbered.txt
1596 */ 1439 */
1597 { .ctl_name = 0 } 1440 { }
1598}; 1441};
1599 1442
1600static struct ctl_table debug_table[] = { 1443static struct ctl_table debug_table[] = {
1601#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1444#if defined(CONFIG_X86) || defined(CONFIG_PPC)
1602 { 1445 {
1603 .ctl_name = CTL_UNNUMBERED,
1604 .procname = "exception-trace", 1446 .procname = "exception-trace",
1605 .data = &show_unhandled_signals, 1447 .data = &show_unhandled_signals,
1606 .maxlen = sizeof(int), 1448 .maxlen = sizeof(int),
@@ -1608,11 +1450,11 @@ static struct ctl_table debug_table[] = {
1608 .proc_handler = proc_dointvec 1450 .proc_handler = proc_dointvec
1609 }, 1451 },
1610#endif 1452#endif
1611 { .ctl_name = 0 } 1453 { }
1612}; 1454};
1613 1455
1614static struct ctl_table dev_table[] = { 1456static struct ctl_table dev_table[] = {
1615 { .ctl_name = 0 } 1457 { }
1616}; 1458};
1617 1459
1618static DEFINE_SPINLOCK(sysctl_lock); 1460static DEFINE_SPINLOCK(sysctl_lock);
@@ -1766,122 +1608,6 @@ void register_sysctl_root(struct ctl_table_root *root)
1766 spin_unlock(&sysctl_lock); 1608 spin_unlock(&sysctl_lock);
1767} 1609}
1768 1610
1769#ifdef CONFIG_SYSCTL_SYSCALL
1770/* Perform the actual read/write of a sysctl table entry. */
1771static int do_sysctl_strategy(struct ctl_table_root *root,
1772 struct ctl_table *table,
1773 void __user *oldval, size_t __user *oldlenp,
1774 void __user *newval, size_t newlen)
1775{
1776 int op = 0, rc;
1777
1778 if (oldval)
1779 op |= MAY_READ;
1780 if (newval)
1781 op |= MAY_WRITE;
1782 if (sysctl_perm(root, table, op))
1783 return -EPERM;
1784
1785 if (table->strategy) {
1786 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1787 if (rc < 0)
1788 return rc;
1789 if (rc > 0)
1790 return 0;
1791 }
1792
1793 /* If there is no strategy routine, or if the strategy returns
1794 * zero, proceed with automatic r/w */
1795 if (table->data && table->maxlen) {
1796 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1797 if (rc < 0)
1798 return rc;
1799 }
1800 return 0;
1801}
1802
1803static int parse_table(int __user *name, int nlen,
1804 void __user *oldval, size_t __user *oldlenp,
1805 void __user *newval, size_t newlen,
1806 struct ctl_table_root *root,
1807 struct ctl_table *table)
1808{
1809 int n;
1810repeat:
1811 if (!nlen)
1812 return -ENOTDIR;
1813 if (get_user(n, name))
1814 return -EFAULT;
1815 for ( ; table->ctl_name || table->procname; table++) {
1816 if (!table->ctl_name)
1817 continue;
1818 if (n == table->ctl_name) {
1819 int error;
1820 if (table->child) {
1821 if (sysctl_perm(root, table, MAY_EXEC))
1822 return -EPERM;
1823 name++;
1824 nlen--;
1825 table = table->child;
1826 goto repeat;
1827 }
1828 error = do_sysctl_strategy(root, table,
1829 oldval, oldlenp,
1830 newval, newlen);
1831 return error;
1832 }
1833 }
1834 return -ENOTDIR;
1835}
1836
1837int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1838 void __user *newval, size_t newlen)
1839{
1840 struct ctl_table_header *head;
1841 int error = -ENOTDIR;
1842
1843 if (nlen <= 0 || nlen >= CTL_MAXNAME)
1844 return -ENOTDIR;
1845 if (oldval) {
1846 int old_len;
1847 if (!oldlenp || get_user(old_len, oldlenp))
1848 return -EFAULT;
1849 }
1850
1851 for (head = sysctl_head_next(NULL); head;
1852 head = sysctl_head_next(head)) {
1853 error = parse_table(name, nlen, oldval, oldlenp,
1854 newval, newlen,
1855 head->root, head->ctl_table);
1856 if (error != -ENOTDIR) {
1857 sysctl_head_finish(head);
1858 break;
1859 }
1860 }
1861 return error;
1862}
1863
1864SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1865{
1866 struct __sysctl_args tmp;
1867 int error;
1868
1869 if (copy_from_user(&tmp, args, sizeof(tmp)))
1870 return -EFAULT;
1871
1872 error = deprecated_sysctl_warning(&tmp);
1873 if (error)
1874 goto out;
1875
1876 lock_kernel();
1877 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
1878 tmp.newval, tmp.newlen);
1879 unlock_kernel();
1880out:
1881 return error;
1882}
1883#endif /* CONFIG_SYSCTL_SYSCALL */
1884
1885/* 1611/*
1886 * sysctl_perm does NOT grant the superuser all rights automatically, because 1612 * sysctl_perm does NOT grant the superuser all rights automatically, because
1887 * some sysctl variables are readonly even to root. 1613 * some sysctl variables are readonly even to root.
@@ -1917,7 +1643,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1917 1643
1918static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) 1644static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1919{ 1645{
1920 for (; table->ctl_name || table->procname; table++) { 1646 for (; table->procname; table++) {
1921 table->parent = parent; 1647 table->parent = parent;
1922 if (table->child) 1648 if (table->child)
1923 sysctl_set_parent(table, table->child); 1649 sysctl_set_parent(table, table->child);
@@ -1949,11 +1675,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch,
1949 return NULL; 1675 return NULL;
1950 1676
1951 /* ... and nothing else */ 1677 /* ... and nothing else */
1952 if (branch[1].procname || branch[1].ctl_name) 1678 if (branch[1].procname)
1953 return NULL; 1679 return NULL;
1954 1680
1955 /* table should contain subdirectory with the same name */ 1681 /* table should contain subdirectory with the same name */
1956 for (p = table; p->procname || p->ctl_name; p++) { 1682 for (p = table; p->procname; p++) {
1957 if (!p->child) 1683 if (!p->child)
1958 continue; 1684 continue;
1959 if (p->procname && strcmp(p->procname, s) == 0) 1685 if (p->procname && strcmp(p->procname, s) == 0)
@@ -1998,9 +1724,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1998 * 1724 *
1999 * The members of the &struct ctl_table structure are used as follows: 1725 * The members of the &struct ctl_table structure are used as follows:
2000 * 1726 *
2001 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
2002 * must be unique within that level of sysctl
2003 *
2004 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not 1727 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
2005 * enter a sysctl file 1728 * enter a sysctl file
2006 * 1729 *
@@ -2015,8 +1738,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
2015 * 1738 *
2016 * proc_handler - the text handler routine (described below) 1739 * proc_handler - the text handler routine (described below)
2017 * 1740 *
2018 * strategy - the strategy routine (described below)
2019 *
2020 * de - for internal use by the sysctl routines 1741 * de - for internal use by the sysctl routines
2021 * 1742 *
2022 * extra1, extra2 - extra pointers usable by the proc handler routines 1743 * extra1, extra2 - extra pointers usable by the proc handler routines
@@ -2029,19 +1750,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
2029 * struct enable minimal validation of the values being written to be 1750 * struct enable minimal validation of the values being written to be
2030 * performed, and the mode field allows minimal authentication. 1751 * performed, and the mode field allows minimal authentication.
2031 * 1752 *
2032 * More sophisticated management can be enabled by the provision of a
2033 * strategy routine with the table entry. This will be called before
2034 * any automatic read or write of the data is performed.
2035 *
2036 * The strategy routine may return
2037 *
2038 * < 0 - Error occurred (error is passed to user process)
2039 *
2040 * 0 - OK - proceed with automatic read or write.
2041 *
2042 * > 0 - OK - read or write has been done by the strategy routine, so
2043 * return immediately.
2044 *
2045 * There must be a proc_handler routine for any terminal nodes 1753 * There must be a proc_handler routine for any terminal nodes
2046 * mirrored under /proc/sys (non-terminals are handled by a built-in 1754 * mirrored under /proc/sys (non-terminals are handled by a built-in
2047 * directory handler). Several default handlers are available to 1755 * directory handler). Several default handlers are available to
@@ -2068,13 +1776,13 @@ struct ctl_table_header *__register_sysctl_paths(
2068 struct ctl_table_set *set; 1776 struct ctl_table_set *set;
2069 1777
2070 /* Count the path components */ 1778 /* Count the path components */
2071 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) 1779 for (npath = 0; path[npath].procname; ++npath)
2072 ; 1780 ;
2073 1781
2074 /* 1782 /*
2075 * For each path component, allocate a 2-element ctl_table array. 1783 * For each path component, allocate a 2-element ctl_table array.
2076 * The first array element will be filled with the sysctl entry 1784 * The first array element will be filled with the sysctl entry
2077 * for this, the second will be the sentinel (ctl_name == 0). 1785 * for this, the second will be the sentinel (procname == 0).
2078 * 1786 *
2079 * We allocate everything in one go so that we don't have to 1787 * We allocate everything in one go so that we don't have to
2080 * worry about freeing additional memory in unregister_sysctl_table. 1788 * worry about freeing additional memory in unregister_sysctl_table.
@@ -2091,7 +1799,6 @@ struct ctl_table_header *__register_sysctl_paths(
2091 for (n = 0; n < npath; ++n, ++path) { 1799 for (n = 0; n < npath; ++n, ++path) {
2092 /* Copy the procname */ 1800 /* Copy the procname */
2093 new->procname = path->procname; 1801 new->procname = path->procname;
2094 new->ctl_name = path->ctl_name;
2095 new->mode = 0555; 1802 new->mode = 0555;
2096 1803
2097 *prevp = new; 1804 *prevp = new;
@@ -2953,286 +2660,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2953 2660
2954#endif /* CONFIG_PROC_FS */ 2661#endif /* CONFIG_PROC_FS */
2955 2662
2956
2957#ifdef CONFIG_SYSCTL_SYSCALL
2958/*
2959 * General sysctl support routines
2960 */
2961
2962/* The generic sysctl data routine (used if no strategy routine supplied) */
2963int sysctl_data(struct ctl_table *table,
2964 void __user *oldval, size_t __user *oldlenp,
2965 void __user *newval, size_t newlen)
2966{
2967 size_t len;
2968
2969 /* Get out if I don't have a variable */
2970 if (!table->data || !table->maxlen)
2971 return -ENOTDIR;
2972
2973 if (oldval && oldlenp) {
2974 if (get_user(len, oldlenp))
2975 return -EFAULT;
2976 if (len) {
2977 if (len > table->maxlen)
2978 len = table->maxlen;
2979 if (copy_to_user(oldval, table->data, len))
2980 return -EFAULT;
2981 if (put_user(len, oldlenp))
2982 return -EFAULT;
2983 }
2984 }
2985
2986 if (newval && newlen) {
2987 if (newlen > table->maxlen)
2988 newlen = table->maxlen;
2989
2990 if (copy_from_user(table->data, newval, newlen))
2991 return -EFAULT;
2992 }
2993 return 1;
2994}
2995
2996/* The generic string strategy routine: */
2997int sysctl_string(struct ctl_table *table,
2998 void __user *oldval, size_t __user *oldlenp,
2999 void __user *newval, size_t newlen)
3000{
3001 if (!table->data || !table->maxlen)
3002 return -ENOTDIR;
3003
3004 if (oldval && oldlenp) {
3005 size_t bufsize;
3006 if (get_user(bufsize, oldlenp))
3007 return -EFAULT;
3008 if (bufsize) {
3009 size_t len = strlen(table->data), copied;
3010
3011 /* This shouldn't trigger for a well-formed sysctl */
3012 if (len > table->maxlen)
3013 len = table->maxlen;
3014
3015 /* Copy up to a max of bufsize-1 bytes of the string */
3016 copied = (len >= bufsize) ? bufsize - 1 : len;
3017
3018 if (copy_to_user(oldval, table->data, copied) ||
3019 put_user(0, (char __user *)(oldval + copied)))
3020 return -EFAULT;
3021 if (put_user(len, oldlenp))
3022 return -EFAULT;
3023 }
3024 }
3025 if (newval && newlen) {
3026 size_t len = newlen;
3027 if (len > table->maxlen)
3028 len = table->maxlen;
3029 if(copy_from_user(table->data, newval, len))
3030 return -EFAULT;
3031 if (len == table->maxlen)
3032 len--;
3033 ((char *) table->data)[len] = 0;
3034 }
3035 return 1;
3036}
3037
3038/*
3039 * This function makes sure that all of the integers in the vector
3040 * are between the minimum and maximum values given in the arrays
3041 * table->extra1 and table->extra2, respectively.
3042 */
3043int sysctl_intvec(struct ctl_table *table,
3044 void __user *oldval, size_t __user *oldlenp,
3045 void __user *newval, size_t newlen)
3046{
3047
3048 if (newval && newlen) {
3049 int __user *vec = (int __user *) newval;
3050 int *min = (int *) table->extra1;
3051 int *max = (int *) table->extra2;
3052 size_t length;
3053 int i;
3054
3055 if (newlen % sizeof(int) != 0)
3056 return -EINVAL;
3057
3058 if (!table->extra1 && !table->extra2)
3059 return 0;
3060
3061 if (newlen > table->maxlen)
3062 newlen = table->maxlen;
3063 length = newlen / sizeof(int);
3064
3065 for (i = 0; i < length; i++) {
3066 int value;
3067 if (get_user(value, vec + i))
3068 return -EFAULT;
3069 if (min && value < min[i])
3070 return -EINVAL;
3071 if (max && value > max[i])
3072 return -EINVAL;
3073 }
3074 }
3075 return 0;
3076}
3077
3078/* Strategy function to convert jiffies to seconds */
3079int sysctl_jiffies(struct ctl_table *table,
3080 void __user *oldval, size_t __user *oldlenp,
3081 void __user *newval, size_t newlen)
3082{
3083 if (oldval && oldlenp) {
3084 size_t olen;
3085
3086 if (get_user(olen, oldlenp))
3087 return -EFAULT;
3088 if (olen) {
3089 int val;
3090
3091 if (olen < sizeof(int))
3092 return -EINVAL;
3093
3094 val = *(int *)(table->data) / HZ;
3095 if (put_user(val, (int __user *)oldval))
3096 return -EFAULT;
3097 if (put_user(sizeof(int), oldlenp))
3098 return -EFAULT;
3099 }
3100 }
3101 if (newval && newlen) {
3102 int new;
3103 if (newlen != sizeof(int))
3104 return -EINVAL;
3105 if (get_user(new, (int __user *)newval))
3106 return -EFAULT;
3107 *(int *)(table->data) = new*HZ;
3108 }
3109 return 1;
3110}
3111
3112/* Strategy function to convert jiffies to milliseconds */
3113int sysctl_ms_jiffies(struct ctl_table *table,
3114 void __user *oldval, size_t __user *oldlenp,
3115 void __user *newval, size_t newlen)
3116{
3117 if (oldval && oldlenp) {
3118 size_t olen;
3119
3120 if (get_user(olen, oldlenp))
3121 return -EFAULT;
3122 if (olen) {
3123 int val;
3124
3125 if (olen < sizeof(int))
3126 return -EINVAL;
3127
3128 val = jiffies_to_msecs(*(int *)(table->data));
3129 if (put_user(val, (int __user *)oldval))
3130 return -EFAULT;
3131 if (put_user(sizeof(int), oldlenp))
3132 return -EFAULT;
3133 }
3134 }
3135 if (newval && newlen) {
3136 int new;
3137 if (newlen != sizeof(int))
3138 return -EINVAL;
3139 if (get_user(new, (int __user *)newval))
3140 return -EFAULT;
3141 *(int *)(table->data) = msecs_to_jiffies(new);
3142 }
3143 return 1;
3144}
3145
3146
3147
3148#else /* CONFIG_SYSCTL_SYSCALL */
3149
3150
3151SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
3152{
3153 struct __sysctl_args tmp;
3154 int error;
3155
3156 if (copy_from_user(&tmp, args, sizeof(tmp)))
3157 return -EFAULT;
3158
3159 error = deprecated_sysctl_warning(&tmp);
3160
3161 /* If no error reading the parameters then just -ENOSYS ... */
3162 if (!error)
3163 error = -ENOSYS;
3164
3165 return error;
3166}
3167
3168int sysctl_data(struct ctl_table *table,
3169 void __user *oldval, size_t __user *oldlenp,
3170 void __user *newval, size_t newlen)
3171{
3172 return -ENOSYS;
3173}
3174
3175int sysctl_string(struct ctl_table *table,
3176 void __user *oldval, size_t __user *oldlenp,
3177 void __user *newval, size_t newlen)
3178{
3179 return -ENOSYS;
3180}
3181
3182int sysctl_intvec(struct ctl_table *table,
3183 void __user *oldval, size_t __user *oldlenp,
3184 void __user *newval, size_t newlen)
3185{
3186 return -ENOSYS;
3187}
3188
3189int sysctl_jiffies(struct ctl_table *table,
3190 void __user *oldval, size_t __user *oldlenp,
3191 void __user *newval, size_t newlen)
3192{
3193 return -ENOSYS;
3194}
3195
3196int sysctl_ms_jiffies(struct ctl_table *table,
3197 void __user *oldval, size_t __user *oldlenp,
3198 void __user *newval, size_t newlen)
3199{
3200 return -ENOSYS;
3201}
3202
3203#endif /* CONFIG_SYSCTL_SYSCALL */
3204
3205static int deprecated_sysctl_warning(struct __sysctl_args *args)
3206{
3207 static int msg_count;
3208 int name[CTL_MAXNAME];
3209 int i;
3210
3211 /* Check args->nlen. */
3212 if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
3213 return -ENOTDIR;
3214
3215 /* Read in the sysctl name for better debug message logging */
3216 for (i = 0; i < args->nlen; i++)
3217 if (get_user(name[i], args->name + i))
3218 return -EFAULT;
3219
3220 /* Ignore accesses to kernel.version */
3221 if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
3222 return 0;
3223
3224 if (msg_count < 5) {
3225 msg_count++;
3226 printk(KERN_INFO
3227 "warning: process `%s' used the deprecated sysctl "
3228 "system call with ", current->comm);
3229 for (i = 0; i < args->nlen; i++)
3230 printk("%d.", name[i]);
3231 printk("\n");
3232 }
3233 return 0;
3234}
3235
3236/* 2663/*
3237 * No sense putting this after each symbol definition, twice, 2664 * No sense putting this after each symbol definition, twice,
3238 * exception granted :-) 2665 * exception granted :-)
@@ -3247,9 +2674,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax);
3247EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2674EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3248EXPORT_SYMBOL(register_sysctl_table); 2675EXPORT_SYMBOL(register_sysctl_table);
3249EXPORT_SYMBOL(register_sysctl_paths); 2676EXPORT_SYMBOL(register_sysctl_paths);
3250EXPORT_SYMBOL(sysctl_intvec);
3251EXPORT_SYMBOL(sysctl_jiffies);
3252EXPORT_SYMBOL(sysctl_ms_jiffies);
3253EXPORT_SYMBOL(sysctl_string);
3254EXPORT_SYMBOL(sysctl_data);
3255EXPORT_SYMBOL(unregister_sysctl_table); 2677EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
new file mode 100644
index 00000000000..8f5d16e0707
--- /dev/null
+++ b/kernel/sysctl_binary.c
@@ -0,0 +1,1543 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7#include <linux/syscalls.h>
8#include <linux/namei.h>
9#include <linux/mount.h>
10#include <linux/fs.h>
11#include <linux/nsproxy.h>
12#include <linux/pid_namespace.h>
13#include <linux/file.h>
14#include <linux/ctype.h>
15#include <linux/netdevice.h>
16
17#ifdef CONFIG_SYSCTL_SYSCALL
18
19struct bin_table;
20typedef ssize_t bin_convert_t(struct file *file,
21 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
22
23static bin_convert_t bin_dir;
24static bin_convert_t bin_string;
25static bin_convert_t bin_intvec;
26static bin_convert_t bin_ulongvec;
27static bin_convert_t bin_uuid;
28static bin_convert_t bin_dn_node_address;
29
30#define CTL_DIR bin_dir
31#define CTL_STR bin_string
32#define CTL_INT bin_intvec
33#define CTL_ULONG bin_ulongvec
34#define CTL_UUID bin_uuid
35#define CTL_DNADR bin_dn_node_address
36
37#define BUFSZ 256
38
39struct bin_table {
40 bin_convert_t *convert;
41 int ctl_name;
42 const char *procname;
43 const struct bin_table *child;
44};
45
46static const struct bin_table bin_random_table[] = {
47 { CTL_INT, RANDOM_POOLSIZE, "poolsize" },
48 { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" },
49 { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" },
50 { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
51 { CTL_UUID, RANDOM_BOOT_ID, "boot_id" },
52 { CTL_UUID, RANDOM_UUID, "uuid" },
53 {}
54};
55
56static const struct bin_table bin_pty_table[] = {
57 { CTL_INT, PTY_MAX, "max" },
58 { CTL_INT, PTY_NR, "nr" },
59 {}
60};
61
62static const struct bin_table bin_kern_table[] = {
63 { CTL_STR, KERN_OSTYPE, "ostype" },
64 { CTL_STR, KERN_OSRELEASE, "osrelease" },
65 /* KERN_OSREV not used */
66 { CTL_STR, KERN_VERSION, "version" },
67 /* KERN_SECUREMASK not used */
68 /* KERN_PROF not used */
69 { CTL_STR, KERN_NODENAME, "hostname" },
70 { CTL_STR, KERN_DOMAINNAME, "domainname" },
71
72 { CTL_INT, KERN_PANIC, "panic" },
73 { CTL_INT, KERN_REALROOTDEV, "real-root-dev" },
74
75 { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" },
76 { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" },
77 { CTL_INT, KERN_PRINTK, "printk" },
78
79 /* KERN_NAMETRANS not used */
80 /* KERN_PPC_HTABRECLAIM not used */
81 /* KERN_PPC_ZEROPAGED not used */
82 { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
83
84 { CTL_STR, KERN_MODPROBE, "modprobe" },
85 { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" },
86 { CTL_INT, KERN_ACCT, "acct" },
87 /* KERN_PPC_L2CR "l2cr" no longer used */
88
89 /* KERN_RTSIGNR not used */
90 /* KERN_RTSIGMAX not used */
91
92 { CTL_ULONG, KERN_SHMMAX, "shmmax" },
93 { CTL_INT, KERN_MSGMAX, "msgmax" },
94 { CTL_INT, KERN_MSGMNB, "msgmnb" },
95 /* KERN_MSGPOOL not used*/
96 { CTL_INT, KERN_SYSRQ, "sysrq" },
97 { CTL_INT, KERN_MAX_THREADS, "threads-max" },
98 { CTL_DIR, KERN_RANDOM, "random", bin_random_table },
99 { CTL_ULONG, KERN_SHMALL, "shmall" },
100 { CTL_INT, KERN_MSGMNI, "msgmni" },
101 { CTL_INT, KERN_SEM, "sem" },
102 { CTL_INT, KERN_SPARC_STOP_A, "stop-a" },
103 { CTL_INT, KERN_SHMMNI, "shmmni" },
104
105 { CTL_INT, KERN_OVERFLOWUID, "overflowuid" },
106 { CTL_INT, KERN_OVERFLOWGID, "overflowgid" },
107
108 { CTL_STR, KERN_HOTPLUG, "hotplug", },
109 { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
110
111 { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
112 { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" },
113 /* KERN_TAINTED "tainted" no longer used */
114 { CTL_INT, KERN_CADPID, "cad_pid" },
115 { CTL_INT, KERN_PIDMAX, "pid_max" },
116 { CTL_STR, KERN_CORE_PATTERN, "core_pattern" },
117 { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" },
118 { CTL_INT, KERN_HPPA_PWRSW, "soft-power" },
119 { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" },
120
121 { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
122 { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
123
124 { CTL_DIR, KERN_PTY, "pty", bin_pty_table },
125 { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" },
126 { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
127 /* KERN_HZ_TIMER "hz_timer" no longer used */
128 { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
129 { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" },
130 { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" },
131
132 { CTL_INT, KERN_SPIN_RETRY, "spin_retry" },
133 /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
134 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
135 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
136 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
137 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
138 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
139 {}
140};
141
142static const struct bin_table bin_vm_table[] = {
143 { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
144 { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" },
145 { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
146 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
147 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
148 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
149 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
150 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
151 /* VM_PAGEBUF unused */
152 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
153 { CTL_INT, VM_SWAPPINESS, "swappiness" },
154 { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
155 { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" },
156 { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" },
157 { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" },
158 { CTL_INT, VM_BLOCK_DUMP, "block_dump" },
159 { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" },
160 { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
161 { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
162 /* VM_SWAP_TOKEN_TIMEOUT unused */
163 { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" },
164 { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
165 { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
166 { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" },
167 { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" },
168 { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" },
169 { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" },
170
171 {}
172};
173
174static const struct bin_table bin_net_core_table[] = {
175 { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" },
176 { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" },
177 { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" },
178 { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" },
179 /* NET_CORE_DESTROY_DELAY unused */
180 { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
181 /* NET_CORE_FASTROUTE unused */
182 { CTL_INT, NET_CORE_MSG_COST, "message_cost" },
183 { CTL_INT, NET_CORE_MSG_BURST, "message_burst" },
184 { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" },
185 /* NET_CORE_HOT_LIST_LENGTH unused */
186 /* NET_CORE_DIVERT_VERSION unused */
187 /* NET_CORE_NO_CONG_THRESH unused */
188 /* NET_CORE_NO_CONG unused */
189 /* NET_CORE_LO_CONG unused */
190 /* NET_CORE_MOD_CONG unused */
191 { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" },
192 { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" },
193 { CTL_INT, NET_CORE_BUDGET, "netdev_budget" },
194 { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
195 { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
196 { CTL_INT, NET_CORE_WARNINGS, "warnings" },
197 {},
198};
199
200static const struct bin_table bin_net_unix_table[] = {
201 /* NET_UNIX_DESTROY_DELAY unused */
202 /* NET_UNIX_DELETE_DELAY unused */
203 { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
204 {}
205};
206
207static const struct bin_table bin_net_ipv4_route_table[] = {
208 { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" },
209 /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
210 /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
211 { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
212 { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
213 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
217 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
220 { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
221 { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
222 { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
223 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
224 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
226 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
227 {}
228};
229
230static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
231 { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" },
232 { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
233
234 { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
235 { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
236 { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
237 { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
238 { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" },
239 { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
240 { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
241 { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
242 { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
243 { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
244 { CTL_INT, NET_IPV4_CONF_TAG, "tag" },
245 { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" },
246 { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
247 { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
248 { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
249 { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
250
251 { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
252 { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
253 { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
254 { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
255 {}
256};
257
258static const struct bin_table bin_net_ipv4_conf_table[] = {
259 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table },
260 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table },
261 { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table },
262 {}
263};
264
265static const struct bin_table bin_net_neigh_vars_table[] = {
266 { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
267 { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
268 { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" },
269 /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
270 { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
271 { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
272 { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
273 { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" },
274 { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
275 /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
276 /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
277 /* NET_NEIGH_LOCKTIME "locktime" no longer used */
278 { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" },
279 { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" },
280 { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" },
281 { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" },
282 { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
283 { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
284 {}
285};
286
287static const struct bin_table bin_net_neigh_table[] = {
288 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
289 { CTL_DIR, 0, NULL, bin_net_neigh_vars_table },
290 {}
291};
292
293static const struct bin_table bin_net_ipv4_netfilter_table[] = {
294 { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
295
296 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
297 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
298 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
299 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
300 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */
301 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
302 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
303 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
304
305 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
306 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
307 /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
308 /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
309
310 { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
311 { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
312 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
313 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
314 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
315 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
316
317 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
318 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
319 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
320 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
321 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
322 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
323 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
324
325 { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
326 { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
327 {}
328};
329
330static const struct bin_table bin_net_ipv4_table[] = {
331 {CTL_INT, NET_IPV4_FORWARD, "ip_forward" },
332
333 { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table },
334 { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table },
335 { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table },
336 /* NET_IPV4_FIB_HASH unused */
337 { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table },
338
339 { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
340 { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
341 { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" },
342 { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
343 { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
344 /* NET_IPV4_AUTOCONFIG unused */
345 { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
346 { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
347 { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
348 { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
349 { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
350 { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
351 { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" },
352 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
353 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
354 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
355 { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
356 { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
357 { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
358 { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" },
359 { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
360 { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
361 { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" },
362 { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" },
363 { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
364 { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
365 { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
366 { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
367 { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
368 { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
369 { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
370 { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
371 { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
372 { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
373 { CTL_INT, NET_TCP_FACK, "tcp_fack" },
374 { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
375 { CTL_INT, NET_TCP_ECN, "tcp_ecn" },
376 { CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
377 { CTL_INT, NET_TCP_MEM, "tcp_mem" },
378 { CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
379 { CTL_INT, NET_TCP_RMEM, "tcp_rmem" },
380 { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" },
381 { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
382 { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" },
383 { CTL_INT, NET_TCP_FRTO, "tcp_frto" },
384 { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
385 { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" },
386 { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
394 { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
395 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
396 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
397 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
398 { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
399 { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
400 /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
401 { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
402 { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
403
404 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
405 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
406 { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
407 { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
408 { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
409 { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
410
411 { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
412 { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
413 { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
414
415 { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
416 /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
417
418 { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
419
420 /* NET_TCP_DEFAULT_WIN_SCALE unused */
421 /* NET_TCP_BIC_BETA unused */
422 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
423 /* NET_IPV4_IP_MASQ_DEBUG unused */
424 /* NET_TCP_SYN_TAILDROP unused */
425 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
426 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
427 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
428 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
429 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
430 /* NET_IPV4_ALWAYS_DEFRAG unused */
431 {}
432};
433
434static const struct bin_table bin_net_ipx_table[] = {
435 { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
436 /* NET_IPX_FORWARDING unused */
437 {}
438};
439
440static const struct bin_table bin_net_atalk_table[] = {
441 { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
442 { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
443 { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
444 { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
445 {},
446};
447
448static const struct bin_table bin_net_netrom_table[] = {
449 { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
450 { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
451 { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
452 { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
453 { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
454 { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
455 { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
456 { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
457 { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
458 { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" },
459 { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
460 { CTL_INT, NET_NETROM_RESET, "reset" },
461 {}
462};
463
464static const struct bin_table bin_net_ax25_param_table[] = {
465 { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
466 { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
467 { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" },
468 { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" },
469 { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" },
470 { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
471 { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" },
472 { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" },
473 { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" },
474 { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
475 { CTL_INT, NET_AX25_N2, "maximum_retry_count" },
476 { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" },
477 { CTL_INT, NET_AX25_PROTOCOL, "protocol" },
478 { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
479 {}
480};
481
482static const struct bin_table bin_net_ax25_table[] = {
483 { CTL_DIR, 0, NULL, bin_net_ax25_param_table },
484 {}
485};
486
/* Entries under net/rose (referenced from bin_net_table). */
487static const struct bin_table bin_net_rose_table[] = {
488 { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
489 { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
490 { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
491 { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
492 { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
493 { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" },
494 { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
495 { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
496 { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" },
497 { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
498 {}
499};
500
/*
 * Per-interface IPv6 settings.  Shared by the "all", "default" and wildcard
 * (per-device) entries of bin_net_ipv6_conf_table.
 */
501static const struct bin_table bin_net_ipv6_conf_var_table[] = {
502 { CTL_INT, NET_IPV6_FORWARDING, "forwarding" },
503 { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" },
504 { CTL_INT, NET_IPV6_MTU, "mtu" },
505 { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" },
506 { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
507 { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" },
508 { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
509 { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" },
510 { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
511 { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
512 { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
513 { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
514 { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
515 { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
516 { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
517 { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" },
518 { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
519 { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
520 { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
521 { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
522 { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
523 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
524 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
525 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
526 {}
527};
528
/*
 * net/ipv6/conf: "all", "default", or a wildcard entry (ctl_name 0) that
 * get_sysctl() resolves from an interface index to a device name.
 */
529static const struct bin_table bin_net_ipv6_conf_table[] = {
530 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table },
531 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table },
532 { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table },
533 {}
534};
535
/* Entries under net/ipv6/route (referenced from bin_net_ipv6_table). */
536static const struct bin_table bin_net_ipv6_route_table[] = {
537 /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */
538 { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
539 { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
540 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
541 { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
542 { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
543 { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
544 { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
545 { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
546 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
547 {}
548};
549
/* Entries under net/ipv6/icmp (referenced from bin_net_ipv6_table). */
550static const struct bin_table bin_net_ipv6_icmp_table[] = {
551 { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
552 {}
553};
554
/*
 * Entries under net/ipv6: four sub-directories followed by plain integer
 * values.  The last row uses a literal number, annotated as IPQ_QMAX.
 */
555static const struct bin_table bin_net_ipv6_table[] = {
556 { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
557 { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
558 { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
559 { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
560 { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
561 { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
562 { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
563 { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
564 { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
565 { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
566 { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
567 {}
568};
569
/* Entries under net/x25 (referenced from bin_net_table). */
570static const struct bin_table bin_net_x25_table[] = {
571 { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
572 { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
573 { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
574 { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
575 { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
576 { CTL_INT, NET_X25_FORWARD, "x25_forward" },
577 {}
578};
579
/* Entries under net/token-ring (referenced from bin_net_table). */
580static const struct bin_table bin_net_tr_table[] = {
581 { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
582 {}
583};
584
585
/* Per-device DECnet settings; shared by every entry of bin_net_decnet_conf. */
586static const struct bin_table bin_net_decnet_conf_vars[] = {
587 { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
588 { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
589 { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
590 { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
591 {}
592};
593
/*
 * net/decnet/conf: a set of named entries plus a trailing wildcard
 * (ctl_name 0) that get_sysctl() resolves from an interface index.
 */
594static const struct bin_table bin_net_decnet_conf[] = {
595 { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
596 { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
597 { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
598 { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
599 { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
600 { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
601 { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
602 {}
603};
604
/*
 * Entries under net/decnet.  "node_address" is the only row in this excerpt
 * that uses the CTL_DNADR conversion.
 */
605static const struct bin_table bin_net_decnet_table[] = {
606 { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
607 { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
608 { CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
609 { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
610 { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
611 { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
612 { CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
613 { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
614 { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
615 { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
616 { CTL_INT, NET_DECNET_MEM, "decnet_mem" },
617 { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
618 { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
619 { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
620 {}
621};
622
/* Entries under net/sctp (referenced from bin_net_table). */
623static const struct bin_table bin_net_sctp_table[] = {
624 { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
625 { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
626 { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
627 { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
628 { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
629 { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
630 { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
631 { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
632 { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
633 { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
634 { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
635 { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
636 { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
637 { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
638 { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
639 { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
640 { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
641 {}
642};
643
/* Entries under net/llc/llc2/timeout (referenced from bin_net_llc_llc2_table). */
644static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
645 { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
646 { CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
647 { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
648 { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
649 {}
650};
651
/* Entries under net/llc/station (referenced from bin_net_llc_table). */
652static const struct bin_table bin_net_llc_station_table[] = {
653 { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
654 {}
655};
656
/*
 * Entries under net/llc/llc2: only the "timeout" directory.
 * NOTE(review): the directory id is spelled NET_LLC2, the same constant the
 * parent table uses for "llc2" itself - confirm this is the intended id.
 */
657static const struct bin_table bin_net_llc_llc2_table[] = {
658 { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
659 {}
660};
661
/* Entries under net/llc (referenced from bin_net_table). */
662static const struct bin_table bin_net_llc_table[] = {
663 { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
664 { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
665 {}
666};
667
/*
 * Entries under net/netfilter.  Numbers that no longer have a table row are
 * kept as comments in their original position.
 */
668static const struct bin_table bin_net_netfilter_table[] = {
669 { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
670 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
671 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
672 /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
673 /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
674 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
675 /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
676 /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
677 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
678 /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
679 /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
680 /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
681 /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
682 { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
683 { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
684 /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
685 { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
686 { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
687 { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
688 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
689 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
690 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
691 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
692 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
693 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
694 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
695 { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
696 /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
697 /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
698 { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
699 { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
700 { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
701
702 {}
703};
704
/* Entries under net/irda (referenced from bin_net_table). */
705static const struct bin_table bin_net_irda_table[] = {
706 { CTL_INT, NET_IRDA_DISCOVERY, "discovery" },
707 { CTL_STR, NET_IRDA_DEVNAME, "devname" },
708 { CTL_INT, NET_IRDA_DEBUG, "debug" },
709 { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" },
710 { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
711 { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
712 { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
713 { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
714 { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
715 { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
716 { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
717 { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
718 { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
719 { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
720 {}
721};
722
/*
 * Entries under net (referenced from bin_root_table): one directory per
 * protocol family, plus a trailing integer row that uses the literal
 * number 2089 for "nf_conntrack_max".
 */
723static const struct bin_table bin_net_table[] = {
724 { CTL_DIR, NET_CORE, "core", bin_net_core_table },
725 /* NET_ETHER not used */
726 /* NET_802 not used */
727 { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
728 { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
729 { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
730 { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
731 { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
732 { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
733 /* NET_BRIDGE "bridge" no longer used */
734 { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
735 { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
736 { CTL_DIR, NET_X25, "x25", bin_net_x25_table },
737 { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
738 { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
739 /* NET_ECONET not used */
740 { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
741 { CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
742 { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
743 /* NET_DCCP "dccp" no longer used */
744 { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table },
745 { CTL_INT, 2089, "nf_conntrack_max" },
746 {}
747};
748
/* Entries under fs/quota (referenced from bin_fs_table via FS_DQSTATS). */
749static const struct bin_table bin_fs_quota_table[] = {
750 { CTL_INT, FS_DQ_LOOKUPS, "lookups" },
751 { CTL_INT, FS_DQ_DROPS, "drops" },
752 { CTL_INT, FS_DQ_READS, "reads" },
753 { CTL_INT, FS_DQ_WRITES, "writes" },
754 { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
755 { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
756 { CTL_INT, FS_DQ_FREE, "free_dquots" },
757 { CTL_INT, FS_DQ_SYNCS, "syncs" },
758 { CTL_INT, FS_DQ_WARNINGS, "warnings" },
759 {}
760};
761
/* Entries under fs/xfs (referenced from bin_fs_table). */
762static const struct bin_table bin_fs_xfs_table[] = {
763 { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
764 { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
765 { CTL_INT, XFS_PANIC_MASK, "panic_mask" },
766
767 { CTL_INT, XFS_ERRLEVEL, "error_level" },
768 { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
769 { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
770 { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
771 { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
772 { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
773 { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
774 { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
775 { CTL_INT, XFS_ROTORSTEP, "rotorstep" },
776 { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
777 { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
778 { CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
779 {}
780};
781
/* Entries under fs/ocfs2/nm; the id is given as a literal number. */
782static const struct bin_table bin_fs_ocfs2_nm_table[] = {
783 { CTL_STR, 1, "hb_ctl_path" },
784 {}
785};
786
/* Entries under fs/ocfs2 (referenced from bin_fs_table); id is a literal number. */
787static const struct bin_table bin_fs_ocfs2_table[] = {
788 { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
789 {}
790};
791
/* Entries under fs/inotify (referenced from bin_fs_table). */
792static const struct bin_table bin_inotify_table[] = {
793 { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
794 { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
795 { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
796 {}
797};
798
/*
 * Entries under fs (referenced from bin_root_table).  Note that
 * "suid_dumpable" is keyed by a KERN_* constant even though it lives here.
 */
799static const struct bin_table bin_fs_table[] = {
800 { CTL_INT, FS_NRINODE, "inode-nr" },
801 { CTL_INT, FS_STATINODE, "inode-state" },
802 /* FS_MAXINODE unused */
803 /* FS_NRDQUOT unused */
804 /* FS_MAXDQUOT unused */
805 /* FS_NRFILE "file-nr" no longer used */
806 { CTL_INT, FS_MAXFILE, "file-max" },
807 { CTL_INT, FS_DENTRY, "dentry-state" },
808 /* FS_NRSUPER unused */
809 /* FS_MAXUPSER unused */
810 { CTL_INT, FS_OVERFLOWUID, "overflowuid" },
811 { CTL_INT, FS_OVERFLOWGID, "overflowgid" },
812 { CTL_INT, FS_LEASES, "leases-enable" },
813 { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
814 { CTL_INT, FS_LEASE_TIME, "lease-break-time" },
815 { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
816 { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
817 { CTL_ULONG, FS_AIO_NR, "aio-nr" },
818 { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
819 { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
820 { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
821 { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
822 {}
823};
824
/* Entries under dev/ipmi (referenced from bin_dev_table). */
825static const struct bin_table bin_ipmi_table[] = {
826 { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
827 {}
828};
829
/* Entries under dev/mac_hid (referenced from bin_dev_table). */
830static const struct bin_table bin_mac_hid_files[] = {
831 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
832 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
833 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
834 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
835 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
836 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
837 {}
838};
839
/* Entries under dev/raid (referenced from bin_dev_table). */
840static const struct bin_table bin_raid_table[] = {
841 { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
842 { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
843 {}
844};
845
/* Entries under dev/scsi (referenced from bin_dev_table). */
846static const struct bin_table bin_scsi_table[] = {
847 { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
848 {}
849};
850
/* Entries under dev (referenced from bin_root_table). */
851static const struct bin_table bin_dev_table[] = {
852 /* DEV_CDROM "cdrom" no longer used */
853 /* DEV_HWMON unused */
854 /* DEV_PARPORT "parport" no longer used */
855 { CTL_DIR, DEV_RAID, "raid", bin_raid_table },
856 { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
857 { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
858 { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
859 {}
860};
861
/* Entries under bus/isa (referenced from bin_bus_table). */
862static const struct bin_table bin_bus_isa_table[] = {
863 { CTL_INT, BUS_ISA_MEM_BASE, "membase" },
864 { CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
865 { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
866 {}
867};
868
/* Entries under bus (referenced from bin_root_table). */
869static const struct bin_table bin_bus_table[] = {
870 { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
871 {}
872};
873
874
/*
 * Entries under s390dbf (referenced from bin_root_table).  The ids are
 * literal numbers, with the symbolic names kept alongside as comments.
 */
875static const struct bin_table bin_s390dbf_table[] = {
876 { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
877 { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
878 {}
879};
880
/* Entries under sunrpc (referenced from bin_root_table). */
881static const struct bin_table bin_sunrpc_table[] = {
882 /* CTL_RPCDEBUG "rpc_debug" no longer used */
883 /* CTL_NFSDEBUG "nfs_debug" no longer used */
884 /* CTL_NFSDDEBUG "nfsd_debug" no longer used */
885 /* CTL_NLMDEBUG "nlm_debug" no longer used */
886
887 { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
888 { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
889 { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
890 { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
891 {}
892};
893
/*
 * Entries under pm (referenced from bin_root_table).  The ids are literal
 * numbers, with the symbolic names kept alongside as comments.
 */
894static const struct bin_table bin_pm_table[] = {
895 /* frv specific */
896 /* 1 == CTL_PM_SUSPEND "suspend" no longer used */
897 { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
898 { CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
899 { CTL_INT, 4 /* CTL_PM_CM */, "cm" },
900 {}
901};
902
/*
 * Top-level table: get_sysctl() starts every lookup here, matching the first
 * number of a binary sysctl name against these rows.  "abi" is a directory
 * without a child table.
 */
903static const struct bin_table bin_root_table[] = {
904 { CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
905 { CTL_DIR, CTL_VM, "vm", bin_vm_table },
906 { CTL_DIR, CTL_NET, "net", bin_net_table },
907 /* CTL_PROC not used */
908 { CTL_DIR, CTL_FS, "fs", bin_fs_table },
909 /* CTL_DEBUG "debug" no longer used */
910 { CTL_DIR, CTL_DEV, "dev", bin_dev_table },
911 { CTL_DIR, CTL_BUS, "bus", bin_bus_table },
912 { CTL_DIR, CTL_ABI, "abi" },
913 /* CTL_CPU not used */
914 /* CTL_ARLAN "arlan" no longer used */
915 { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
916 { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
917 { CTL_DIR, CTL_PM, "pm", bin_pm_table },
918 {}
919};
920
/*
 * Handler with the same signature that binary_sysctl() invokes through
 * table->convert.  It ignores all arguments and always fails with -ENOTDIR.
 * Presumably this is what CTL_DIR rows resolve to (the CTL_* definitions are
 * outside this excerpt - confirm).
 */
921static ssize_t bin_dir(struct file *file,
922 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
923{
924 return -ENOTDIR;
925}
926
927
928static ssize_t bin_string(struct file *file,
929 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
930{
931 ssize_t result, copied = 0;
932
933 if (oldval && oldlen) {
934 char __user *lastp;
935 loff_t pos = 0;
936 int ch;
937
938 result = vfs_read(file, oldval, oldlen, &pos);
939 if (result < 0)
940 goto out;
941
942 copied = result;
943 lastp = oldval + copied - 1;
944
945 result = -EFAULT;
946 if (get_user(ch, lastp))
947 goto out;
948
949 /* Trim off the trailing newline */
950 if (ch == '\n') {
951 result = -EFAULT;
952 if (put_user('\0', lastp))
953 goto out;
954 copied -= 1;
955 }
956 }
957
958 if (newval && newlen) {
959 loff_t pos = 0;
960
961 result = vfs_write(file, newval, newlen, &pos);
962 if (result < 0)
963 goto out;
964 }
965
966 result = copied;
967out:
968 return result;
969}
970
971static ssize_t bin_intvec(struct file *file,
972 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
973{
974 mm_segment_t old_fs = get_fs();
975 ssize_t copied = 0;
976 char *buffer;
977 ssize_t result;
978
979 result = -ENOMEM;
980 buffer = kmalloc(BUFSZ, GFP_KERNEL);
981 if (!buffer)
982 goto out;
983
984 if (oldval && oldlen) {
985 unsigned __user *vec = oldval;
986 size_t length = oldlen / sizeof(*vec);
987 loff_t pos = 0;
988 char *str, *end;
989 int i;
990
991 set_fs(KERNEL_DS);
992 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
993 set_fs(old_fs);
994 if (result < 0)
995 goto out_kfree;
996
997 str = buffer;
998 end = str + result;
999 *end++ = '\0';
1000 for (i = 0; i < length; i++) {
1001 unsigned long value;
1002
1003 value = simple_strtoul(str, &str, 10);
1004 while (isspace(*str))
1005 str++;
1006
1007 result = -EFAULT;
1008 if (put_user(value, vec + i))
1009 goto out_kfree;
1010
1011 copied += sizeof(*vec);
1012 if (!isdigit(*str))
1013 break;
1014 }
1015 }
1016
1017 if (newval && newlen) {
1018 unsigned __user *vec = newval;
1019 size_t length = newlen / sizeof(*vec);
1020 loff_t pos = 0;
1021 char *str, *end;
1022 int i;
1023
1024 str = buffer;
1025 end = str + BUFSZ;
1026 for (i = 0; i < length; i++) {
1027 unsigned long value;
1028
1029 result = -EFAULT;
1030 if (get_user(value, vec + i))
1031 goto out_kfree;
1032
1033 str += snprintf(str, end - str, "%lu\t", value);
1034 }
1035
1036 set_fs(KERNEL_DS);
1037 result = vfs_write(file, buffer, str - buffer, &pos);
1038 set_fs(old_fs);
1039 if (result < 0)
1040 goto out_kfree;
1041 }
1042 result = copied;
1043out_kfree:
1044 kfree(buffer);
1045out:
1046 return result;
1047}
1048
1049static ssize_t bin_ulongvec(struct file *file,
1050 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1051{
1052 mm_segment_t old_fs = get_fs();
1053 ssize_t copied = 0;
1054 char *buffer;
1055 ssize_t result;
1056
1057 result = -ENOMEM;
1058 buffer = kmalloc(BUFSZ, GFP_KERNEL);
1059 if (!buffer)
1060 goto out;
1061
1062 if (oldval && oldlen) {
1063 unsigned long __user *vec = oldval;
1064 size_t length = oldlen / sizeof(*vec);
1065 loff_t pos = 0;
1066 char *str, *end;
1067 int i;
1068
1069 set_fs(KERNEL_DS);
1070 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1071 set_fs(old_fs);
1072 if (result < 0)
1073 goto out_kfree;
1074
1075 str = buffer;
1076 end = str + result;
1077 *end++ = '\0';
1078 for (i = 0; i < length; i++) {
1079 unsigned long value;
1080
1081 value = simple_strtoul(str, &str, 10);
1082 while (isspace(*str))
1083 str++;
1084
1085 result = -EFAULT;
1086 if (put_user(value, vec + i))
1087 goto out_kfree;
1088
1089 copied += sizeof(*vec);
1090 if (!isdigit(*str))
1091 break;
1092 }
1093 }
1094
1095 if (newval && newlen) {
1096 unsigned long __user *vec = newval;
1097 size_t length = newlen / sizeof(*vec);
1098 loff_t pos = 0;
1099 char *str, *end;
1100 int i;
1101
1102 str = buffer;
1103 end = str + BUFSZ;
1104 for (i = 0; i < length; i++) {
1105 unsigned long value;
1106
1107 result = -EFAULT;
1108 if (get_user(value, vec + i))
1109 goto out_kfree;
1110
1111 str += snprintf(str, end - str, "%lu\t", value);
1112 }
1113
1114 set_fs(KERNEL_DS);
1115 result = vfs_write(file, buffer, str - buffer, &pos);
1116 set_fs(old_fs);
1117 if (result < 0)
1118 goto out_kfree;
1119 }
1120 result = copied;
1121out_kfree:
1122 kfree(buffer);
1123out:
1124 return result;
1125}
1126
/*
 * Numeric value of one hexadecimal digit.  The caller must already have
 * checked that @ch is a hex digit; other characters give a meaningless
 * result.
 */
static unsigned hex_value(int ch)
{
	if (isdigit(ch))
		return ch - '0';

	/* Setting bit 0x20 folds 'A'-'F' onto 'a'-'f'. */
	return ((ch | 0x20) - 'a') + 10;
}
1131
1132static ssize_t bin_uuid(struct file *file,
1133 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1134{
1135 mm_segment_t old_fs = get_fs();
1136 ssize_t result, copied = 0;
1137
1138 /* Only supports reads */
1139 if (oldval && oldlen) {
1140 loff_t pos = 0;
1141 char buf[40], *str = buf;
1142 unsigned char uuid[16];
1143 int i;
1144
1145 set_fs(KERNEL_DS);
1146 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1147 set_fs(old_fs);
1148 if (result < 0)
1149 goto out;
1150
1151 buf[result] = '\0';
1152
1153 /* Convert the uuid to from a string to binary */
1154 for (i = 0; i < 16; i++) {
1155 result = -EIO;
1156 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1157 goto out;
1158
1159 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]);
1160 str += 2;
1161 if (*str == '-')
1162 str++;
1163 }
1164
1165 if (oldlen > 16)
1166 oldlen = 16;
1167
1168 result = -EFAULT;
1169 if (copy_to_user(oldval, uuid, oldlen))
1170 goto out;
1171
1172 copied = oldlen;
1173 }
1174 result = copied;
1175out:
1176 return result;
1177}
1178
1179static ssize_t bin_dn_node_address(struct file *file,
1180 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1181{
1182 mm_segment_t old_fs = get_fs();
1183 ssize_t result, copied = 0;
1184
1185 if (oldval && oldlen) {
1186 loff_t pos = 0;
1187 char buf[15], *nodep;
1188 unsigned long area, node;
1189 __le16 dnaddr;
1190
1191 set_fs(KERNEL_DS);
1192 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1193 set_fs(old_fs);
1194 if (result < 0)
1195 goto out;
1196
1197 buf[result] = '\0';
1198
1199 /* Convert the decnet addresss to binary */
1200 result = -EIO;
1201 nodep = strchr(buf, '.') + 1;
1202 if (!nodep)
1203 goto out;
1204
1205 area = simple_strtoul(buf, NULL, 10);
1206 node = simple_strtoul(nodep, NULL, 10);
1207
1208 result = -EIO;
1209 if ((area > 63)||(node > 1023))
1210 goto out;
1211
1212 dnaddr = cpu_to_le16((area << 10) | node);
1213
1214 result = -EFAULT;
1215 if (put_user(dnaddr, (__le16 __user *)oldval))
1216 goto out;
1217
1218 copied = sizeof(dnaddr);
1219 }
1220
1221 if (newval && newlen) {
1222 loff_t pos = 0;
1223 __le16 dnaddr;
1224 char buf[15];
1225 int len;
1226
1227 result = -EINVAL;
1228 if (newlen != sizeof(dnaddr))
1229 goto out;
1230
1231 result = -EFAULT;
1232 if (get_user(dnaddr, (__le16 __user *)newval))
1233 goto out;
1234
1235 len = snprintf(buf, sizeof(buf), "%hu.%hu",
1236 le16_to_cpu(dnaddr) >> 10,
1237 le16_to_cpu(dnaddr) & 0x3ff);
1238
1239 set_fs(KERNEL_DS);
1240 result = vfs_write(file, buf, len, &pos);
1241 set_fs(old_fs);
1242 if (result < 0)
1243 goto out;
1244 }
1245
1246 result = copied;
1247out:
1248 return result;
1249}
1250
1251static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
1252{
1253 const struct bin_table *table = &bin_root_table[0];
1254 int ctl_name;
1255
1256 /* The binary sysctl tables have a small maximum depth so
1257 * there is no danger of overflowing our path as it PATH_MAX
1258 * bytes long.
1259 */
1260 memcpy(path, "sys/", 4);
1261 path += 4;
1262
1263repeat:
1264 if (!nlen)
1265 return ERR_PTR(-ENOTDIR);
1266 ctl_name = *name;
1267 name++;
1268 nlen--;
1269 for ( ; table->convert; table++) {
1270 int len = 0;
1271
1272 /*
1273 * For a wild card entry map from ifindex to network
1274 * device name.
1275 */
1276 if (!table->ctl_name) {
1277#ifdef CONFIG_NET
1278 struct net *net = current->nsproxy->net_ns;
1279 struct net_device *dev;
1280 dev = dev_get_by_index(net, ctl_name);
1281 if (dev) {
1282 len = strlen(dev->name);
1283 memcpy(path, dev->name, len);
1284 dev_put(dev);
1285 }
1286#endif
1287 /* Use the well known sysctl number to proc name mapping */
1288 } else if (ctl_name == table->ctl_name) {
1289 len = strlen(table->procname);
1290 memcpy(path, table->procname, len);
1291 }
1292 if (len) {
1293 path += len;
1294 if (table->child) {
1295 *path++ = '/';
1296 table = table->child;
1297 goto repeat;
1298 }
1299 *path = '\0';
1300 return table;
1301 }
1302 }
1303 return ERR_PTR(-ENOTDIR);
1304}
1305
1306static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
1307{
1308 char *tmp, *result;
1309
1310 result = ERR_PTR(-ENOMEM);
1311 tmp = __getname();
1312 if (tmp) {
1313 const struct bin_table *table = get_sysctl(name, nlen, tmp);
1314 result = tmp;
1315 *tablep = table;
1316 if (IS_ERR(table)) {
1317 __putname(tmp);
1318 result = ERR_CAST(table);
1319 }
1320 }
1321 return result;
1322}
1323
1324static ssize_t binary_sysctl(const int *name, int nlen,
1325 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1326{
1327 const struct bin_table *table = NULL;
1328 struct nameidata nd;
1329 struct vfsmount *mnt;
1330 struct file *file;
1331 ssize_t result;
1332 char *pathname;
1333 int flags;
1334 int acc_mode, fmode;
1335
1336 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname);
1338 if (IS_ERR(pathname))
1339 goto out;
1340
1341 /* How should the sysctl be accessed? */
1342 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) {
1347 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) {
1351 flags = O_RDONLY;
1352 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else {
1355 result = 0;
1356 goto out_putname;
1357 }
1358
1359 mnt = current->nsproxy->pid_ns->proc_mnt;
1360 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
1361 if (result)
1362 goto out_putname;
1363
1364 result = may_open(&nd.path, acc_mode, fmode);
1365 if (result)
1366 goto out_putpath;
1367
1368 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1369 result = PTR_ERR(file);
1370 if (IS_ERR(file))
1371 goto out_putname;
1372
1373 result = table->convert(file, oldval, oldlen, newval, newlen);
1374
1375 fput(file);
1376out_putname:
1377 putname(pathname);
1378out:
1379 return result;
1380
1381out_putpath:
1382 path_put(&nd.path);
1383 goto out_putname;
1384}
1385
1386
1387#else /* CONFIG_SYSCTL_SYSCALL */
1388
/*
 * Stub used when CONFIG_SYSCTL_SYSCALL is not set: every binary sysctl
 * request fails with -ENOSYS.
 */
1389static ssize_t binary_sysctl(const int *name, int nlen,
1390 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1391{
1392 return -ENOSYS;
1393}
1394
1395#endif /* CONFIG_SYSCTL_SYSCALL */
1396
1397
1398static void deprecated_sysctl_warning(const int *name, int nlen)
1399{
1400 int i;
1401
1402 /*
1403 * CTL_KERN/KERN_VERSION is used by older glibc and cannot
1404 * ever go away.
1405 */
1406 if (name[0] == CTL_KERN && name[1] == KERN_VERSION)
1407 return;
1408
1409 if (printk_ratelimit()) {
1410 printk(KERN_INFO
1411 "warning: process `%s' used the deprecated sysctl "
1412 "system call with ", current->comm);
1413 for (i = 0; i < nlen; i++)
1414 printk("%d.", name[i]);
1415 printk("\n");
1416 }
1417 return;
1418}
1419
/* The warn-once bitmap has 1 << WARN_ONCE_HASH_BITS (256) slots. */
1420#define WARN_ONCE_HASH_BITS 8
1421#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
1422
/* One bit per hash slot; a set bit means a warning was already issued. */
1423static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
1424
/* Start value and multiplier of the 32-bit FNV hash used by warn_on_bintable(). */
1425#define FNV32_OFFSET 2166136261U
1426#define FNV32_PRIME 0x01000193
1427
1428/*
1429 * Print each legacy sysctl (approximately) only once.
1430 * To avoid making the tables non-const use a external
1431 * hash-table instead.
1432 * Worst case hash collision: 6, but very rarely.
1433 * NOTE! We don't use the SMP-safe bit tests. We simply
1434 * don't care enough.
1435 */
1436static void warn_on_bintable(const int *name, int nlen)
1437{
1438 int i;
1439 u32 hash = FNV32_OFFSET;
1440
1441 for (i = 0; i < nlen; i++)
1442 hash = (hash ^ name[i]) * FNV32_PRIME;
1443 hash %= WARN_ONCE_HASH_SIZE;
1444 if (__test_and_set_bit(hash, warn_once_bitmap))
1445 return;
1446 deprecated_sysctl_warning(name, nlen);
1447}
1448
1449static ssize_t do_sysctl(int __user *args_name, int nlen,
1450 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1451{
1452 int name[CTL_MAXNAME];
1453 int i;
1454
1455 /* Check args->nlen. */
1456 if (nlen < 0 || nlen > CTL_MAXNAME)
1457 return -ENOTDIR;
1458 /* Read in the sysctl name for simplicity */
1459 for (i = 0; i < nlen; i++)
1460 if (get_user(name[i], args_name + i))
1461 return -EFAULT;
1462
1463 warn_on_bintable(name, nlen);
1464
1465 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1466}
1467
1468SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1469{
1470 struct __sysctl_args tmp;
1471 size_t oldlen = 0;
1472 ssize_t result;
1473
1474 if (copy_from_user(&tmp, args, sizeof(tmp)))
1475 return -EFAULT;
1476
1477 if (tmp.oldval && !tmp.oldlenp)
1478 return -EFAULT;
1479
1480 if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
1481 return -EFAULT;
1482
1483 result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
1484 tmp.newval, tmp.newlen);
1485
1486 if (result >= 0) {
1487 oldlen = result;
1488 result = 0;
1489 }
1490
1491 if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
1492 return -EFAULT;
1493
1494 return result;
1495}
1496
1497
1498#ifdef CONFIG_COMPAT
1499#include <asm/compat.h>
1500
/*
 * Argument block of the sysctl syscall as laid out by a compat task.
 * Pointers arrive as compat_uptr_t and are widened with compat_ptr() in
 * compat_sys_sysctl().
 */
1501struct compat_sysctl_args {
1502 compat_uptr_t name;
1503 int nlen;
1504 compat_uptr_t oldval;
1505 compat_uptr_t oldlenp;
1506 compat_uptr_t newval;
1507 compat_size_t newlen;
1508 compat_ulong_t __unused[4];
1509};
1510
1511asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
1512{
1513 struct compat_sysctl_args tmp;
1514 compat_size_t __user *compat_oldlenp;
1515 size_t oldlen = 0;
1516 ssize_t result;
1517
1518 if (copy_from_user(&tmp, args, sizeof(tmp)))
1519 return -EFAULT;
1520
1521 if (tmp.oldval && !tmp.oldlenp)
1522 return -EFAULT;
1523
1524 compat_oldlenp = compat_ptr(tmp.oldlenp);
1525 if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
1526 return -EFAULT;
1527
1528 result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
1529 compat_ptr(tmp.oldval), oldlen,
1530 compat_ptr(tmp.newval), tmp.newlen);
1531
1532 if (result >= 0) {
1533 oldlen = result;
1534 result = 0;
1535 }
1536
1537 if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
1538 return -EFAULT;
1539
1540 return result;
1541}
1542
1543#endif /* CONFIG_COMPAT */
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423ca711..04cdcf72c82 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -5,1239 +5,6 @@
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
7 7
8struct trans_ctl_table {
9 int ctl_name;
10 const char *procname;
11 const struct trans_ctl_table *child;
12};
13
14static const struct trans_ctl_table trans_random_table[] = {
15 { RANDOM_POOLSIZE, "poolsize" },
16 { RANDOM_ENTROPY_COUNT, "entropy_avail" },
17 { RANDOM_READ_THRESH, "read_wakeup_threshold" },
18 { RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
19 { RANDOM_BOOT_ID, "boot_id" },
20 { RANDOM_UUID, "uuid" },
21 {}
22};
23
24static const struct trans_ctl_table trans_pty_table[] = {
25 { PTY_MAX, "max" },
26 { PTY_NR, "nr" },
27 {}
28};
29
30static const struct trans_ctl_table trans_kern_table[] = {
31 { KERN_OSTYPE, "ostype" },
32 { KERN_OSRELEASE, "osrelease" },
33 /* KERN_OSREV not used */
34 { KERN_VERSION, "version" },
35 /* KERN_SECUREMASK not used */
36 /* KERN_PROF not used */
37 { KERN_NODENAME, "hostname" },
38 { KERN_DOMAINNAME, "domainname" },
39
40 { KERN_PANIC, "panic" },
41 { KERN_REALROOTDEV, "real-root-dev" },
42
43 { KERN_SPARC_REBOOT, "reboot-cmd" },
44 { KERN_CTLALTDEL, "ctrl-alt-del" },
45 { KERN_PRINTK, "printk" },
46
47 /* KERN_NAMETRANS not used */
48 /* KERN_PPC_HTABRECLAIM not used */
49 /* KERN_PPC_ZEROPAGED not used */
50 { KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
51
52 { KERN_MODPROBE, "modprobe" },
53 { KERN_SG_BIG_BUFF, "sg-big-buff" },
54 { KERN_ACCT, "acct" },
55 { KERN_PPC_L2CR, "l2cr" },
56
57 /* KERN_RTSIGNR not used */
58 /* KERN_RTSIGMAX not used */
59
60 { KERN_SHMMAX, "shmmax" },
61 { KERN_MSGMAX, "msgmax" },
62 { KERN_MSGMNB, "msgmnb" },
63 /* KERN_MSGPOOL not used*/
64 { KERN_SYSRQ, "sysrq" },
65 { KERN_MAX_THREADS, "threads-max" },
66 { KERN_RANDOM, "random", trans_random_table },
67 { KERN_SHMALL, "shmall" },
68 { KERN_MSGMNI, "msgmni" },
69 { KERN_SEM, "sem" },
70 { KERN_SPARC_STOP_A, "stop-a" },
71 { KERN_SHMMNI, "shmmni" },
72
73 { KERN_OVERFLOWUID, "overflowuid" },
74 { KERN_OVERFLOWGID, "overflowgid" },
75
76 { KERN_HOTPLUG, "hotplug", },
77 { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
78
79 { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
80 { KERN_CORE_USES_PID, "core_uses_pid" },
81 { KERN_TAINTED, "tainted" },
82 { KERN_CADPID, "cad_pid" },
83 { KERN_PIDMAX, "pid_max" },
84 { KERN_CORE_PATTERN, "core_pattern" },
85 { KERN_PANIC_ON_OOPS, "panic_on_oops" },
86 { KERN_HPPA_PWRSW, "soft-power" },
87 { KERN_HPPA_UNALIGNED, "unaligned-trap" },
88
89 { KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
90 { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
91
92 { KERN_PTY, "pty", trans_pty_table },
93 { KERN_NGROUPS_MAX, "ngroups_max" },
94 { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
95 { KERN_HZ_TIMER, "hz_timer" },
96 { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
97 { KERN_BOOTLOADER_TYPE, "bootloader_type" },
98 { KERN_RANDOMIZE, "randomize_va_space" },
99
100 { KERN_SPIN_RETRY, "spin_retry" },
101 { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" },
102 { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
103 { KERN_COMPAT_LOG, "compat-log" },
104 { KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
105 { KERN_NMI_WATCHDOG, "nmi_watchdog" },
106 { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
107 {}
108};
109
110static const struct trans_ctl_table trans_vm_table[] = {
111 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
112 { VM_PAGE_CLUSTER, "page-cluster" },
113 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
114 { VM_DIRTY_RATIO, "dirty_ratio" },
115 { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
116 { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" },
117 { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
118 { VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
119 /* VM_PAGEBUF unused */
120 { VM_HUGETLB_PAGES, "nr_hugepages" },
121 { VM_SWAPPINESS, "swappiness" },
122 { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
123 { VM_MIN_FREE_KBYTES, "min_free_kbytes" },
124 { VM_MAX_MAP_COUNT, "max_map_count" },
125 { VM_LAPTOP_MODE, "laptop_mode" },
126 { VM_BLOCK_DUMP, "block_dump" },
127 { VM_HUGETLB_GROUP, "hugetlb_shm_group" },
128 { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
129 { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
130 /* VM_SWAP_TOKEN_TIMEOUT unused */
131 { VM_DROP_PAGECACHE, "drop_caches" },
132 { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
133 { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
134 { VM_MIN_UNMAPPED, "min_unmapped_ratio" },
135 { VM_PANIC_ON_OOM, "panic_on_oom" },
136 { VM_VDSO_ENABLED, "vdso_enabled" },
137 { VM_MIN_SLAB, "min_slab_ratio" },
138
139 {}
140};
141
142static const struct trans_ctl_table trans_net_core_table[] = {
143 { NET_CORE_WMEM_MAX, "wmem_max" },
144 { NET_CORE_RMEM_MAX, "rmem_max" },
145 { NET_CORE_WMEM_DEFAULT, "wmem_default" },
146 { NET_CORE_RMEM_DEFAULT, "rmem_default" },
147 /* NET_CORE_DESTROY_DELAY unused */
148 { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
149 /* NET_CORE_FASTROUTE unused */
150 { NET_CORE_MSG_COST, "message_cost" },
151 { NET_CORE_MSG_BURST, "message_burst" },
152 { NET_CORE_OPTMEM_MAX, "optmem_max" },
153 /* NET_CORE_HOT_LIST_LENGTH unused */
154 /* NET_CORE_DIVERT_VERSION unused */
155 /* NET_CORE_NO_CONG_THRESH unused */
156 /* NET_CORE_NO_CONG unused */
157 /* NET_CORE_LO_CONG unused */
158 /* NET_CORE_MOD_CONG unused */
159 { NET_CORE_DEV_WEIGHT, "dev_weight" },
160 { NET_CORE_SOMAXCONN, "somaxconn" },
161 { NET_CORE_BUDGET, "netdev_budget" },
162 { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
163 { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
164 { NET_CORE_WARNINGS, "warnings" },
165 {},
166};
167
168static const struct trans_ctl_table trans_net_unix_table[] = {
169 /* NET_UNIX_DESTROY_DELAY unused */
170 /* NET_UNIX_DELETE_DELAY unused */
171 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
172 {}
173};
174
175static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
176 { NET_IPV4_ROUTE_FLUSH, "flush" },
177 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
178 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
179 { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
180 { NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
181 { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
182 { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
183 { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
184 { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
185 { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
186 { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
187 { NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
188 { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
189 { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
190 { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
191 { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
192 { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
193 { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
194 { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
195 {}
196};
197
198static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
199 { NET_IPV4_CONF_FORWARDING, "forwarding" },
200 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
201
202 { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
203 { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
204 { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
205 { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
206 { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
207 { NET_IPV4_CONF_RP_FILTER, "rp_filter" },
208 { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
209 { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
210 { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
211 { NET_IPV4_CONF_TAG, "tag" },
212 { NET_IPV4_CONF_ARPFILTER, "arp_filter" },
213 { NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
214 { NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
215 { NET_IPV4_CONF_NOPOLICY, "disable_policy" },
216 { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
217
218 { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
222 { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
223 {}
224};
225
226static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
227 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
228 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
229 { 0, NULL, trans_net_ipv4_conf_vars_table },
230 {}
231};
232
233static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
234 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
235 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
236 { NET_NEIGH_APP_SOLICIT, "app_solicit" },
237 { NET_NEIGH_RETRANS_TIME, "retrans_time" },
238 { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
239 { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
240 { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
241 { NET_NEIGH_UNRES_QLEN, "unres_qlen" },
242 { NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
243 { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" },
244 { NET_NEIGH_PROXY_DELAY, "proxy_delay" },
245 { NET_NEIGH_LOCKTIME, "locktime" },
246 { NET_NEIGH_GC_INTERVAL, "gc_interval" },
247 { NET_NEIGH_GC_THRESH1, "gc_thresh1" },
248 { NET_NEIGH_GC_THRESH2, "gc_thresh2" },
249 { NET_NEIGH_GC_THRESH3, "gc_thresh3" },
250 { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
251 { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
252 {}
253};
254
255static const struct trans_ctl_table trans_net_neigh_table[] = {
256 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
257 { 0, NULL, trans_net_neigh_vars_table },
258 {}
259};
260
261static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
262 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
263
264 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
265 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" },
266 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" },
267 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" },
268 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" },
269 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" },
270 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" },
271 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" },
272
273 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" },
274 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" },
275 { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" },
276 { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" },
277
278 { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
279 { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
280 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" },
281 { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
282 { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
283 { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
284
285 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" },
286 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" },
287 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" },
288 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" },
289 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" },
290 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" },
291 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" },
292
293 { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
294 { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
295 {}
296};
297
298static const struct trans_ctl_table trans_net_ipv4_table[] = {
299 { NET_IPV4_FORWARD, "ip_forward" },
300 { NET_IPV4_DYNADDR, "ip_dynaddr" },
301
302 { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table },
303 { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table },
304 { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table },
305 /* NET_IPV4_FIB_HASH unused */
306 { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table },
307
308 { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
309 { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
310 { NET_IPV4_TCP_SACK, "tcp_sack" },
311 { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
312 { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
313 /* NET_IPV4_AUTOCONFIG unused */
314 { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
315 { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
316 { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
317 { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
318 { NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
319 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
320 { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
321 { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
322 { NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
323 { NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
324 { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
325 /* NET_IPV4_IP_MASQ_DEBUG unused */
326 { NET_TCP_SYNCOOKIES, "tcp_syncookies" },
327 { NET_TCP_STDURG, "tcp_stdurg" },
328 { NET_TCP_RFC1337, "tcp_rfc1337" },
329 /* NET_TCP_SYN_TAILDROP unused */
330 { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
331 { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
332 { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
333 { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
334 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
335 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
336 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
337 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
338 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
339 { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
340 { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
341 { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
342 /* NET_IPV4_ALWAYS_DEFRAG unused */
343 { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
344 { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
345 { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
346 { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
347 { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
348 { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
349 { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
350 { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
351 { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
352 { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
353 { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
354 { NET_TCP_FACK, "tcp_fack" },
355 { NET_TCP_REORDERING, "tcp_reordering" },
356 { NET_TCP_ECN, "tcp_ecn" },
357 { NET_TCP_DSACK, "tcp_dsack" },
358 { NET_TCP_MEM, "tcp_mem" },
359 { NET_TCP_WMEM, "tcp_wmem" },
360 { NET_TCP_RMEM, "tcp_rmem" },
361 { NET_TCP_APP_WIN, "tcp_app_win" },
362 { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
363 { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
364 { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
365 { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
366 { NET_TCP_TW_REUSE, "tcp_tw_reuse" },
367 { NET_TCP_FRTO, "tcp_frto" },
368 { NET_TCP_LOW_LATENCY, "tcp_low_latency" },
369 { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
370 { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
371 { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
372 /* NET_TCP_DEFAULT_WIN_SCALE unused */
373 { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
374 { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
375 /* NET_TCP_BIC_BETA unused */
376 { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
377 { NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
378 { NET_TCP_ABC, "tcp_abc" },
379 { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" },
380 { NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
381 { NET_TCP_BASE_MSS, "tcp_base_mss" },
382 { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
383 { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
384 { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
385 { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
386 { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
387 { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
388 { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
389 { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" },
390 { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
391 { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
392 { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
393 { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
394 {}
395};
396
397static const struct trans_ctl_table trans_net_ipx_table[] = {
398 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
399 /* NET_IPX_FORWARDING unused */
400 {}
401};
402
403static const struct trans_ctl_table trans_net_atalk_table[] = {
404 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
405 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
406 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
407 { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
408 {},
409};
410
411static const struct trans_ctl_table trans_net_netrom_table[] = {
412 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
413 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
414 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
415 { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
416 { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
417 { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
418 { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
419 { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
420 { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
421 { NET_NETROM_ROUTING_CONTROL, "routing_control" },
422 { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
423 { NET_NETROM_RESET, "reset" },
424 {}
425};
426
427static const struct trans_ctl_table trans_net_ax25_param_table[] = {
428 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
429 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
430 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
431 { NET_AX25_CONNECT_MODE, "connect_mode" },
432 { NET_AX25_STANDARD_WINDOW, "standard_window_size" },
433 { NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
434 { NET_AX25_T1_TIMEOUT, "t1_timeout" },
435 { NET_AX25_T2_TIMEOUT, "t2_timeout" },
436 { NET_AX25_T3_TIMEOUT, "t3_timeout" },
437 { NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
438 { NET_AX25_N2, "maximum_retry_count" },
439 { NET_AX25_PACLEN, "maximum_packet_length" },
440 { NET_AX25_PROTOCOL, "protocol" },
441 { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
442 {}
443};
444
445static const struct trans_ctl_table trans_net_ax25_table[] = {
446 { 0, NULL, trans_net_ax25_param_table },
447 {}
448};
449
450static const struct trans_ctl_table trans_net_bridge_table[] = {
451 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
452 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
453 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
454 { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" },
455 { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" },
456 {}
457};
458
459static const struct trans_ctl_table trans_net_rose_table[] = {
460 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
461 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
462 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
463 { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
464 { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
465 { NET_ROSE_ROUTING_CONTROL, "routing_control" },
466 { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
467 { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
468 { NET_ROSE_WINDOW_SIZE, "window_size" },
469 { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
470 {}
471};
472
473static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
474 { NET_IPV6_FORWARDING, "forwarding" },
475 { NET_IPV6_HOP_LIMIT, "hop_limit" },
476 { NET_IPV6_MTU, "mtu" },
477 { NET_IPV6_ACCEPT_RA, "accept_ra" },
478 { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
479 { NET_IPV6_AUTOCONF, "autoconf" },
480 { NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
481 { NET_IPV6_RTR_SOLICITS, "router_solicitations" },
482 { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
483 { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
484 { NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
485 { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
486 { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
487 { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
488 { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
489 { NET_IPV6_MAX_ADDRESSES, "max_addresses" },
490 { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
491 { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
492 { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
493 { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
494 { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
495 { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
496 { NET_IPV6_PROXY_NDP, "proxy_ndp" },
497 { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
498 {}
499};
500
501static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
502 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
503 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
504 { 0, NULL, trans_net_ipv6_conf_var_table },
505 {}
506};
507
508static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
509 { NET_IPV6_ROUTE_FLUSH, "flush" },
510 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
511 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
512 { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
513 { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
514 { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
515 { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
516 { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
517 { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
518 { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
519 {}
520};
521
522static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
523 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
524 {}
525};
526
527static const struct trans_ctl_table trans_net_ipv6_table[] = {
528 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
529 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
530 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
531 { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table },
532 { NET_IPV6_BINDV6ONLY, "bindv6only" },
533 { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
534 { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
535 { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
536 { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
537 { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
538 { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
539 {}
540};
541
542static const struct trans_ctl_table trans_net_x25_table[] = {
543 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
544 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
545 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
546 { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
547 { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
548 { NET_X25_FORWARD, "x25_forward" },
549 {}
550};
551
552static const struct trans_ctl_table trans_net_tr_table[] = {
553 { NET_TR_RIF_TIMEOUT, "rif_timeout" },
554 {}
555};
556
557
558static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
559 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
560 { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
561 { NET_DECNET_CONF_DEV_T2, "t2" },
562 { NET_DECNET_CONF_DEV_T3, "t3" },
563 {}
564};
565
566static const struct trans_ctl_table trans_net_decnet_conf[] = {
567 { 0, NULL, trans_net_decnet_conf_vars },
568 {}
569};
570
571static const struct trans_ctl_table trans_net_decnet_table[] = {
572 { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
573 { NET_DECNET_NODE_ADDRESS, "node_address" },
574 { NET_DECNET_NODE_NAME, "node_name" },
575 { NET_DECNET_DEFAULT_DEVICE, "default_device" },
576 { NET_DECNET_TIME_WAIT, "time_wait" },
577 { NET_DECNET_DN_COUNT, "dn_count" },
578 { NET_DECNET_DI_COUNT, "di_count" },
579 { NET_DECNET_DR_COUNT, "dr_count" },
580 { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
581 { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
582 { NET_DECNET_MEM, "decnet_mem" },
583 { NET_DECNET_RMEM, "decnet_rmem" },
584 { NET_DECNET_WMEM, "decnet_wmem" },
585 { NET_DECNET_DEBUG_LEVEL, "debug" },
586 {}
587};
588
589static const struct trans_ctl_table trans_net_sctp_table[] = {
590 { NET_SCTP_RTO_INITIAL, "rto_initial" },
591 { NET_SCTP_RTO_MIN, "rto_min" },
592 { NET_SCTP_RTO_MAX, "rto_max" },
593 { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
594 { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
595 { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
596 { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
597 { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
598 { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
599 { NET_SCTP_HB_INTERVAL, "hb_interval" },
600 { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
601 { NET_SCTP_MAX_BURST, "max_burst" },
602 { NET_SCTP_ADDIP_ENABLE, "addip_enable" },
603 { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
604 { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
605 { NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
606 { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
607 {}
608};
609
610static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
611 { NET_LLC2_ACK_TIMEOUT, "ack" },
612 { NET_LLC2_P_TIMEOUT, "p" },
613 { NET_LLC2_REJ_TIMEOUT, "rej" },
614 { NET_LLC2_BUSY_TIMEOUT, "busy" },
615 {}
616};
617
618static const struct trans_ctl_table trans_net_llc_station_table[] = {
619 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
620 {}
621};
622
623static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
624 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
625 {}
626};
627
628static const struct trans_ctl_table trans_net_llc_table[] = {
629 { NET_LLC2, "llc2", trans_net_llc_llc2_table },
630 { NET_LLC_STATION, "station", trans_net_llc_station_table },
631 {}
632};
633
634static const struct trans_ctl_table trans_net_netfilter_table[] = {
635 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
636 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
637 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
638 { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" },
639 { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" },
640 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" },
641 { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" },
642 { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" },
643 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" },
644 { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" },
645 { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" },
646 { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" },
647 { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" },
648 { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
649 { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
650 { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" },
651 { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
652 { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
653 { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
654 { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" },
655 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" },
656 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" },
657 { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" },
658 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" },
659 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" },
660 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" },
661 { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
662 { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" },
663 { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" },
664 { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
665 { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
666 { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
667
668 {}
669};
670
671static const struct trans_ctl_table trans_net_dccp_table[] = {
672 { NET_DCCP_DEFAULT, "default" },
673 {}
674};
675
676static const struct trans_ctl_table trans_net_irda_table[] = {
677 { NET_IRDA_DISCOVERY, "discovery" },
678 { NET_IRDA_DEVNAME, "devname" },
679 { NET_IRDA_DEBUG, "debug" },
680 { NET_IRDA_FAST_POLL, "fast_poll_increase" },
681 { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
682 { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
683 { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
684 { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
685 { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
686 { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
687 { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
688 { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
689 { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
690 { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
691 {}
692};
693
694static const struct trans_ctl_table trans_net_table[] = {
695 { NET_CORE, "core", trans_net_core_table },
696 /* NET_ETHER not used */
697 /* NET_802 not used */
698 { NET_UNIX, "unix", trans_net_unix_table },
699 { NET_IPV4, "ipv4", trans_net_ipv4_table },
700 { NET_IPX, "ipx", trans_net_ipx_table },
701 { NET_ATALK, "appletalk", trans_net_atalk_table },
702 { NET_NETROM, "netrom", trans_net_netrom_table },
703 { NET_AX25, "ax25", trans_net_ax25_table },
704 { NET_BRIDGE, "bridge", trans_net_bridge_table },
705 { NET_ROSE, "rose", trans_net_rose_table },
706 { NET_IPV6, "ipv6", trans_net_ipv6_table },
707 { NET_X25, "x25", trans_net_x25_table },
708 { NET_TR, "token-ring", trans_net_tr_table },
709 { NET_DECNET, "decnet", trans_net_decnet_table },
710 /* NET_ECONET not used */
711 { NET_SCTP, "sctp", trans_net_sctp_table },
712 { NET_LLC, "llc", trans_net_llc_table },
713 { NET_NETFILTER, "netfilter", trans_net_netfilter_table },
714 { NET_DCCP, "dccp", trans_net_dccp_table },
715 { NET_IRDA, "irda", trans_net_irda_table },
716 { 2089, "nf_conntrack_max" },
717 {}
718};
719
720static const struct trans_ctl_table trans_fs_quota_table[] = {
721 { FS_DQ_LOOKUPS, "lookups" },
722 { FS_DQ_DROPS, "drops" },
723 { FS_DQ_READS, "reads" },
724 { FS_DQ_WRITES, "writes" },
725 { FS_DQ_CACHE_HITS, "cache_hits" },
726 { FS_DQ_ALLOCATED, "allocated_dquots" },
727 { FS_DQ_FREE, "free_dquots" },
728 { FS_DQ_SYNCS, "syncs" },
729 { FS_DQ_WARNINGS, "warnings" },
730 {}
731};
732
733static const struct trans_ctl_table trans_fs_xfs_table[] = {
734 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
735 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
736 { XFS_PANIC_MASK, "panic_mask" },
737
738 { XFS_ERRLEVEL, "error_level" },
739 { XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
740 { XFS_INHERIT_SYNC, "inherit_sync" },
741 { XFS_INHERIT_NODUMP, "inherit_nodump" },
742 { XFS_INHERIT_NOATIME, "inherit_noatime" },
743 { XFS_BUF_TIMER, "xfsbufd_centisecs" },
744 { XFS_BUF_AGE, "age_buffer_centisecs" },
745 { XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
746 { XFS_ROTORSTEP, "rotorstep" },
747 { XFS_INHERIT_NODFRG, "inherit_nodefrag" },
748 { XFS_FILESTREAM_TIMER, "filestream_centisecs" },
749 { XFS_STATS_CLEAR, "stats_clear" },
750 {}
751};
752
753static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
754 { 1, "hb_ctl_path" },
755 {}
756};
757
758static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
759 { 1, "nm", trans_fs_ocfs2_nm_table },
760 {}
761};
762
763static const struct trans_ctl_table trans_inotify_table[] = {
764 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
765 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
766 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
767 {}
768};
769
770static const struct trans_ctl_table trans_fs_table[] = {
771 { FS_NRINODE, "inode-nr" },
772 { FS_STATINODE, "inode-state" },
773 /* FS_MAXINODE unused */
774 /* FS_NRDQUOT unused */
775 /* FS_MAXDQUOT unused */
776 { FS_NRFILE, "file-nr" },
777 { FS_MAXFILE, "file-max" },
778 { FS_DENTRY, "dentry-state" },
779 /* FS_NRSUPER unused */
780 /* FS_MAXUPSER unused */
781 { FS_OVERFLOWUID, "overflowuid" },
782 { FS_OVERFLOWGID, "overflowgid" },
783 { FS_LEASES, "leases-enable" },
784 { FS_DIR_NOTIFY, "dir-notify-enable" },
785 { FS_LEASE_TIME, "lease-break-time" },
786 { FS_DQSTATS, "quota", trans_fs_quota_table },
787 { FS_XFS, "xfs", trans_fs_xfs_table },
788 { FS_AIO_NR, "aio-nr" },
789 { FS_AIO_MAX_NR, "aio-max-nr" },
790 { FS_INOTIFY, "inotify", trans_inotify_table },
791 { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table },
792 { KERN_SETUID_DUMPABLE, "suid_dumpable" },
793 {}
794};
795
796static const struct trans_ctl_table trans_debug_table[] = {
797 {}
798};
799
800static const struct trans_ctl_table trans_cdrom_table[] = {
801 { DEV_CDROM_INFO, "info" },
802 { DEV_CDROM_AUTOCLOSE, "autoclose" },
803 { DEV_CDROM_AUTOEJECT, "autoeject" },
804 { DEV_CDROM_DEBUG, "debug" },
805 { DEV_CDROM_LOCK, "lock" },
806 { DEV_CDROM_CHECK_MEDIA, "check_media" },
807 {}
808};
809
810static const struct trans_ctl_table trans_ipmi_table[] = {
811 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
812 {}
813};
814
815static const struct trans_ctl_table trans_mac_hid_files[] = {
816 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
817 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
818 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
819 { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
820 { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
821 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
822 {}
823};
824
825static const struct trans_ctl_table trans_raid_table[] = {
826 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
827 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
828 {}
829};
830
831static const struct trans_ctl_table trans_scsi_table[] = {
832 { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
833 {}
834};
835
836static const struct trans_ctl_table trans_parport_default_table[] = {
837 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
838 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
839 {}
840};
841
842static const struct trans_ctl_table trans_parport_device_table[] = {
843 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
844 {}
845};
846
847static const struct trans_ctl_table trans_parport_devices_table[] = {
848 { DEV_PARPORT_DEVICES_ACTIVE, "active" },
849 { 0, NULL, trans_parport_device_table },
850 {}
851};
852
853static const struct trans_ctl_table trans_parport_parport_table[] = {
854 { DEV_PARPORT_SPINTIME, "spintime" },
855 { DEV_PARPORT_BASE_ADDR, "base-addr" },
856 { DEV_PARPORT_IRQ, "irq" },
857 { DEV_PARPORT_DMA, "dma" },
858 { DEV_PARPORT_MODES, "modes" },
859 { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table },
860 { DEV_PARPORT_AUTOPROBE, "autoprobe" },
861 { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" },
862 { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" },
863 { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" },
864 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
865 {}
866};
867static const struct trans_ctl_table trans_parport_table[] = {
868 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
869 { 0, NULL, trans_parport_parport_table },
870 {}
871};
872
873static const struct trans_ctl_table trans_dev_table[] = {
874 { DEV_CDROM, "cdrom", trans_cdrom_table },
875 /* DEV_HWMON unused */
876 { DEV_PARPORT, "parport", trans_parport_table },
877 { DEV_RAID, "raid", trans_raid_table },
878 { DEV_MAC_HID, "mac_hid", trans_mac_hid_files },
879 { DEV_SCSI, "scsi", trans_scsi_table },
880 { DEV_IPMI, "ipmi", trans_ipmi_table },
881 {}
882};
883
884static const struct trans_ctl_table trans_bus_isa_table[] = {
885 { BUS_ISA_MEM_BASE, "membase" },
886 { BUS_ISA_PORT_BASE, "portbase" },
887 { BUS_ISA_PORT_SHIFT, "portshift" },
888 {}
889};
890
891static const struct trans_ctl_table trans_bus_table[] = {
892 { CTL_BUS_ISA, "isa", trans_bus_isa_table },
893 {}
894};
895
896static const struct trans_ctl_table trans_arlan_conf_table0[] = {
897 { 1, "spreadingCode" },
898 { 2, "channelNumber" },
899 { 3, "scramblingDisable" },
900 { 4, "txAttenuation" },
901 { 5, "systemId" },
902 { 6, "maxDatagramSize" },
903 { 7, "maxFrameSize" },
904 { 8, "maxRetries" },
905 { 9, "receiveMode" },
906 { 10, "priority" },
907 { 11, "rootOrRepeater" },
908 { 12, "SID" },
909 { 13, "registrationMode" },
910 { 14, "registrationFill" },
911 { 15, "localTalkAddress" },
912 { 16, "codeFormat" },
913 { 17, "numChannels" },
914 { 18, "channel1" },
915 { 19, "channel2" },
916 { 20, "channel3" },
917 { 21, "channel4" },
918 { 22, "txClear" },
919 { 23, "txRetries" },
920 { 24, "txRouting" },
921 { 25, "txScrambled" },
922 { 26, "rxParameter" },
923 { 27, "txTimeoutMs" },
924 { 28, "waitCardTimeout" },
925 { 29, "channelSet" },
926 { 30, "name" },
927 { 31, "waitTime" },
928 { 32, "lParameter" },
929 { 33, "_15" },
930 { 34, "headerSize" },
931 { 36, "tx_delay_ms" },
932 { 37, "retries" },
933 { 38, "ReTransmitPacketMaxSize" },
934 { 39, "waitReTransmitPacketMaxSize" },
935 { 40, "fastReTransCount" },
936 { 41, "driverRetransmissions" },
937 { 42, "txAckTimeoutMs" },
938 { 43, "registrationInterrupts" },
939 { 44, "hardwareType" },
940 { 45, "radioType" },
941 { 46, "writeEEPROM" },
942 { 47, "writeRadioType" },
943 { 48, "entry_exit_debug" },
944 { 49, "debug" },
945 { 50, "in_speed" },
946 { 51, "out_speed" },
947 { 52, "in_speed10" },
948 { 53, "out_speed10" },
949 { 54, "in_speed_max" },
950 { 55, "out_speed_max" },
951 { 56, "measure_rate" },
952 { 57, "pre_Command_Wait" },
953 { 58, "rx_tweak1" },
954 { 59, "rx_tweak2" },
955 { 60, "tx_queue_len" },
956
957 { 150, "arlan0-txRing" },
958 { 151, "arlan0-rxRing" },
959 { 152, "arlan0-18" },
960 { 153, "arlan0-ring" },
961 { 154, "arlan0-shm-cpy" },
962 { 155, "config0" },
963 { 156, "reset0" },
964 {}
965};
966
967static const struct trans_ctl_table trans_arlan_conf_table1[] = {
968 { 1, "spreadingCode" },
969 { 2, "channelNumber" },
970 { 3, "scramblingDisable" },
971 { 4, "txAttenuation" },
972 { 5, "systemId" },
973 { 6, "maxDatagramSize" },
974 { 7, "maxFrameSize" },
975 { 8, "maxRetries" },
976 { 9, "receiveMode" },
977 { 10, "priority" },
978 { 11, "rootOrRepeater" },
979 { 12, "SID" },
980 { 13, "registrationMode" },
981 { 14, "registrationFill" },
982 { 15, "localTalkAddress" },
983 { 16, "codeFormat" },
984 { 17, "numChannels" },
985 { 18, "channel1" },
986 { 19, "channel2" },
987 { 20, "channel3" },
988 { 21, "channel4" },
989 { 22, "txClear" },
990 { 23, "txRetries" },
991 { 24, "txRouting" },
992 { 25, "txScrambled" },
993 { 26, "rxParameter" },
994 { 27, "txTimeoutMs" },
995 { 28, "waitCardTimeout" },
996 { 29, "channelSet" },
997 { 30, "name" },
998 { 31, "waitTime" },
999 { 32, "lParameter" },
1000 { 33, "_15" },
1001 { 34, "headerSize" },
1002 { 36, "tx_delay_ms" },
1003 { 37, "retries" },
1004 { 38, "ReTransmitPacketMaxSize" },
1005 { 39, "waitReTransmitPacketMaxSize" },
1006 { 40, "fastReTransCount" },
1007 { 41, "driverRetransmissions" },
1008 { 42, "txAckTimeoutMs" },
1009 { 43, "registrationInterrupts" },
1010 { 44, "hardwareType" },
1011 { 45, "radioType" },
1012 { 46, "writeEEPROM" },
1013 { 47, "writeRadioType" },
1014 { 48, "entry_exit_debug" },
1015 { 49, "debug" },
1016 { 50, "in_speed" },
1017 { 51, "out_speed" },
1018 { 52, "in_speed10" },
1019 { 53, "out_speed10" },
1020 { 54, "in_speed_max" },
1021 { 55, "out_speed_max" },
1022 { 56, "measure_rate" },
1023 { 57, "pre_Command_Wait" },
1024 { 58, "rx_tweak1" },
1025 { 59, "rx_tweak2" },
1026 { 60, "tx_queue_len" },
1027
1028 { 150, "arlan1-txRing" },
1029 { 151, "arlan1-rxRing" },
1030 { 152, "arlan1-18" },
1031 { 153, "arlan1-ring" },
1032 { 154, "arlan1-shm-cpy" },
1033 { 155, "config1" },
1034 { 156, "reset1" },
1035 {}
1036};
1037
1038static const struct trans_ctl_table trans_arlan_conf_table2[] = {
1039 { 1, "spreadingCode" },
1040 { 2, "channelNumber" },
1041 { 3, "scramblingDisable" },
1042 { 4, "txAttenuation" },
1043 { 5, "systemId" },
1044 { 6, "maxDatagramSize" },
1045 { 7, "maxFrameSize" },
1046 { 8, "maxRetries" },
1047 { 9, "receiveMode" },
1048 { 10, "priority" },
1049 { 11, "rootOrRepeater" },
1050 { 12, "SID" },
1051 { 13, "registrationMode" },
1052 { 14, "registrationFill" },
1053 { 15, "localTalkAddress" },
1054 { 16, "codeFormat" },
1055 { 17, "numChannels" },
1056 { 18, "channel1" },
1057 { 19, "channel2" },
1058 { 20, "channel3" },
1059 { 21, "channel4" },
1060 { 22, "txClear" },
1061 { 23, "txRetries" },
1062 { 24, "txRouting" },
1063 { 25, "txScrambled" },
1064 { 26, "rxParameter" },
1065 { 27, "txTimeoutMs" },
1066 { 28, "waitCardTimeout" },
1067 { 29, "channelSet" },
1068 { 30, "name" },
1069 { 31, "waitTime" },
1070 { 32, "lParameter" },
1071 { 33, "_15" },
1072 { 34, "headerSize" },
1073 { 36, "tx_delay_ms" },
1074 { 37, "retries" },
1075 { 38, "ReTransmitPacketMaxSize" },
1076 { 39, "waitReTransmitPacketMaxSize" },
1077 { 40, "fastReTransCount" },
1078 { 41, "driverRetransmissions" },
1079 { 42, "txAckTimeoutMs" },
1080 { 43, "registrationInterrupts" },
1081 { 44, "hardwareType" },
1082 { 45, "radioType" },
1083 { 46, "writeEEPROM" },
1084 { 47, "writeRadioType" },
1085 { 48, "entry_exit_debug" },
1086 { 49, "debug" },
1087 { 50, "in_speed" },
1088 { 51, "out_speed" },
1089 { 52, "in_speed10" },
1090 { 53, "out_speed10" },
1091 { 54, "in_speed_max" },
1092 { 55, "out_speed_max" },
1093 { 56, "measure_rate" },
1094 { 57, "pre_Command_Wait" },
1095 { 58, "rx_tweak1" },
1096 { 59, "rx_tweak2" },
1097 { 60, "tx_queue_len" },
1098
1099 { 150, "arlan2-txRing" },
1100 { 151, "arlan2-rxRing" },
1101 { 152, "arlan2-18" },
1102 { 153, "arlan2-ring" },
1103 { 154, "arlan2-shm-cpy" },
1104 { 155, "config2" },
1105 { 156, "reset2" },
1106 {}
1107};
1108
1109static const struct trans_ctl_table trans_arlan_conf_table3[] = {
1110 { 1, "spreadingCode" },
1111 { 2, "channelNumber" },
1112 { 3, "scramblingDisable" },
1113 { 4, "txAttenuation" },
1114 { 5, "systemId" },
1115 { 6, "maxDatagramSize" },
1116 { 7, "maxFrameSize" },
1117 { 8, "maxRetries" },
1118 { 9, "receiveMode" },
1119 { 10, "priority" },
1120 { 11, "rootOrRepeater" },
1121 { 12, "SID" },
1122 { 13, "registrationMode" },
1123 { 14, "registrationFill" },
1124 { 15, "localTalkAddress" },
1125 { 16, "codeFormat" },
1126 { 17, "numChannels" },
1127 { 18, "channel1" },
1128 { 19, "channel2" },
1129 { 20, "channel3" },
1130 { 21, "channel4" },
1131 { 22, "txClear" },
1132 { 23, "txRetries" },
1133 { 24, "txRouting" },
1134 { 25, "txScrambled" },
1135 { 26, "rxParameter" },
1136 { 27, "txTimeoutMs" },
1137 { 28, "waitCardTimeout" },
1138 { 29, "channelSet" },
1139 { 30, "name" },
1140 { 31, "waitTime" },
1141 { 32, "lParameter" },
1142 { 33, "_15" },
1143 { 34, "headerSize" },
1144 { 36, "tx_delay_ms" },
1145 { 37, "retries" },
1146 { 38, "ReTransmitPacketMaxSize" },
1147 { 39, "waitReTransmitPacketMaxSize" },
1148 { 40, "fastReTransCount" },
1149 { 41, "driverRetransmissions" },
1150 { 42, "txAckTimeoutMs" },
1151 { 43, "registrationInterrupts" },
1152 { 44, "hardwareType" },
1153 { 45, "radioType" },
1154 { 46, "writeEEPROM" },
1155 { 47, "writeRadioType" },
1156 { 48, "entry_exit_debug" },
1157 { 49, "debug" },
1158 { 50, "in_speed" },
1159 { 51, "out_speed" },
1160 { 52, "in_speed10" },
1161 { 53, "out_speed10" },
1162 { 54, "in_speed_max" },
1163 { 55, "out_speed_max" },
1164 { 56, "measure_rate" },
1165 { 57, "pre_Command_Wait" },
1166 { 58, "rx_tweak1" },
1167 { 59, "rx_tweak2" },
1168 { 60, "tx_queue_len" },
1169
1170 { 150, "arlan3-txRing" },
1171 { 151, "arlan3-rxRing" },
1172 { 152, "arlan3-18" },
1173 { 153, "arlan3-ring" },
1174 { 154, "arlan3-shm-cpy" },
1175 { 155, "config3" },
1176 { 156, "reset3" },
1177 {}
1178};
1179
1180static const struct trans_ctl_table trans_arlan_table[] = {
1181 { 1, "arlan0", trans_arlan_conf_table0 },
1182 { 2, "arlan1", trans_arlan_conf_table1 },
1183 { 3, "arlan2", trans_arlan_conf_table2 },
1184 { 4, "arlan3", trans_arlan_conf_table3 },
1185 {}
1186};
1187
1188static const struct trans_ctl_table trans_s390dbf_table[] = {
1189 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1190 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
1191 {}
1192};
1193
1194static const struct trans_ctl_table trans_sunrpc_table[] = {
1195 { CTL_RPCDEBUG, "rpc_debug" },
1196 { CTL_NFSDEBUG, "nfs_debug" },
1197 { CTL_NFSDDEBUG, "nfsd_debug" },
1198 { CTL_NLMDEBUG, "nlm_debug" },
1199 { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
1200 { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
1201 { CTL_MIN_RESVPORT, "min_resvport" },
1202 { CTL_MAX_RESVPORT, "max_resvport" },
1203 {}
1204};
1205
1206static const struct trans_ctl_table trans_pm_table[] = {
1207 { 1 /* CTL_PM_SUSPEND */, "suspend" },
1208 { 2 /* CTL_PM_CMODE */, "cmode" },
1209 { 3 /* CTL_PM_P0 */, "p0" },
1210 { 4 /* CTL_PM_CM */, "cm" },
1211 {}
1212};
1213
1214static const struct trans_ctl_table trans_frv_table[] = {
1215 { 1, "cache-mode" },
1216 { 2, "pin-cxnr" },
1217 {}
1218};
1219
1220static const struct trans_ctl_table trans_root_table[] = {
1221 { CTL_KERN, "kernel", trans_kern_table },
1222 { CTL_VM, "vm", trans_vm_table },
1223 { CTL_NET, "net", trans_net_table },
1224 /* CTL_PROC not used */
1225 { CTL_FS, "fs", trans_fs_table },
1226 { CTL_DEBUG, "debug", trans_debug_table },
1227 { CTL_DEV, "dev", trans_dev_table },
1228 { CTL_BUS, "bus", trans_bus_table },
1229 { CTL_ABI, "abi" },
1230 /* CTL_CPU not used */
1231 { CTL_ARLAN, "arlan", trans_arlan_table },
1232 { CTL_S390DBF, "s390dbf", trans_s390dbf_table },
1233 { CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
1234 { CTL_PM, "pm", trans_pm_table },
1235 { CTL_FRV, "frv", trans_frv_table },
1236 {}
1237};
1238
1239
1240
1241 8
1242static int sysctl_depth(struct ctl_table *table) 9static int sysctl_depth(struct ctl_table *table)
1243{ 10{
@@ -1261,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
1261 return table; 28 return table;
1262} 29}
1263 30
1264static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
1265{
1266 struct ctl_table *test;
1267 const struct trans_ctl_table *ref;
1268 int cur_depth;
1269
1270 cur_depth = sysctl_depth(table);
1271
1272 ref = trans_root_table;
1273repeat:
1274 test = sysctl_parent(table, cur_depth);
1275 for (; ref->ctl_name || ref->procname || ref->child; ref++) {
1276 int match = 0;
1277
1278 if (cur_depth && !ref->child)
1279 continue;
1280
1281 if (test->procname && ref->procname &&
1282 (strcmp(test->procname, ref->procname) == 0))
1283 match++;
1284
1285 if (test->ctl_name && ref->ctl_name &&
1286 (test->ctl_name == ref->ctl_name))
1287 match++;
1288
1289 if (!ref->ctl_name && !ref->procname)
1290 match++;
1291
1292 if (match) {
1293 if (cur_depth != 0) {
1294 cur_depth--;
1295 ref = ref->child;
1296 goto repeat;
1297 }
1298 goto out;
1299 }
1300 }
1301 ref = NULL;
1302out:
1303 return ref;
1304}
1305 31
1306static void sysctl_print_path(struct ctl_table *table) 32static void sysctl_print_path(struct ctl_table *table)
1307{ 33{
@@ -1315,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table)
1315 } 41 }
1316 } 42 }
1317 printk(" "); 43 printk(" ");
1318 if (table->ctl_name) {
1319 for (i = depth; i >= 0; i--) {
1320 tmp = sysctl_parent(table, i);
1321 printk(".%d", tmp->ctl_name);
1322 }
1323 }
1324}
1325
1326static void sysctl_repair_table(struct ctl_table *table)
1327{
1328 /* Don't complain about the classic default
1329 * sysctl strategy routine. Maybe later we
1330 * can get the tables fixed and complain about
1331 * this.
1332 */
1333 if (table->ctl_name && table->procname &&
1334 (table->proc_handler == proc_dointvec) &&
1335 (!table->strategy)) {
1336 table->strategy = sysctl_data;
1337 }
1338} 44}
1339 45
1340static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, 46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
@@ -1352,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
1352 ref = head->ctl_table; 58 ref = head->ctl_table;
1353repeat: 59repeat:
1354 test = sysctl_parent(table, cur_depth); 60 test = sysctl_parent(table, cur_depth);
1355 for (; ref->ctl_name || ref->procname; ref++) { 61 for (; ref->procname; ref++) {
1356 int match = 0; 62 int match = 0;
1357 if (cur_depth && !ref->child) 63 if (cur_depth && !ref->child)
1358 continue; 64 continue;
@@ -1361,10 +67,6 @@ repeat:
1361 (strcmp(test->procname, ref->procname) == 0)) 67 (strcmp(test->procname, ref->procname) == 0))
1362 match++; 68 match++;
1363 69
1364 if (test->ctl_name && ref->ctl_name &&
1365 (test->ctl_name == ref->ctl_name))
1366 match++;
1367
1368 if (match) { 70 if (match) {
1369 if (cur_depth != 0) { 71 if (cur_depth != 0) {
1370 cur_depth--; 72 cur_depth--;
@@ -1392,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
1392 *fail = str; 94 *fail = str;
1393} 95}
1394 96
1395static int sysctl_check_dir(struct nsproxy *namespaces,
1396 struct ctl_table *table)
1397{
1398 struct ctl_table *ref;
1399 int error;
1400
1401 error = 0;
1402 ref = sysctl_check_lookup(namespaces, table);
1403 if (ref) {
1404 int match = 0;
1405 if ((!table->procname && !ref->procname) ||
1406 (table->procname && ref->procname &&
1407 (strcmp(table->procname, ref->procname) == 0)))
1408 match++;
1409
1410 if ((!table->ctl_name && !ref->ctl_name) ||
1411 (table->ctl_name && ref->ctl_name &&
1412 (table->ctl_name == ref->ctl_name)))
1413 match++;
1414
1415 if (match != 2) {
1416 printk(KERN_ERR "%s: failed: ", __func__);
1417 sysctl_print_path(table);
1418 printk(" ref: ");
1419 sysctl_print_path(ref);
1420 printk("\n");
1421 error = -EINVAL;
1422 }
1423 }
1424 return error;
1425}
1426
1427static void sysctl_check_leaf(struct nsproxy *namespaces, 97static void sysctl_check_leaf(struct nsproxy *namespaces,
1428 struct ctl_table *table, const char **fail) 98 struct ctl_table *table, const char **fail)
1429{ 99{
@@ -1434,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
1434 set_fail(fail, table, "Sysctl already exists"); 104 set_fail(fail, table, "Sysctl already exists");
1435} 105}
1436 106
1437static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1438{
1439 const struct trans_ctl_table *ref;
1440
1441 ref = sysctl_binary_lookup(table);
1442 if (table->ctl_name && !ref)
1443 set_fail(fail, table, "Unknown sysctl binary path");
1444 if (ref) {
1445 if (ref->procname &&
1446 (!table->procname ||
1447 (strcmp(table->procname, ref->procname) != 0)))
1448 set_fail(fail, table, "procname does not match binary path procname");
1449
1450 if (ref->ctl_name && table->ctl_name &&
1451 (table->ctl_name != ref->ctl_name))
1452 set_fail(fail, table, "ctl_name does not match binary path ctl_name");
1453 }
1454}
1455
1456int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) 107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1457{ 108{
1458 int error = 0; 109 int error = 0;
1459 for (; table->ctl_name || table->procname; table++) { 110 for (; table->procname; table++) {
1460 const char *fail = NULL; 111 const char *fail = NULL;
1461 112
1462 sysctl_repair_table(table);
1463 if (table->parent) { 113 if (table->parent) {
1464 if (table->procname && !table->parent->procname) 114 if (table->procname && !table->parent->procname)
1465 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
1466 if (table->ctl_name && !table->parent->ctl_name)
1467 set_fail(&fail, table, "Parent without ctl_name");
1468 } 116 }
1469 if (!table->procname) 117 if (!table->procname)
1470 set_fail(&fail, table, "No procname"); 118 set_fail(&fail, table, "No procname");
@@ -1477,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1477 set_fail(&fail, table, "Writable sysctl directory"); 125 set_fail(&fail, table, "Writable sysctl directory");
1478 if (table->proc_handler) 126 if (table->proc_handler)
1479 set_fail(&fail, table, "Directory with proc_handler"); 127 set_fail(&fail, table, "Directory with proc_handler");
1480 if (table->strategy)
1481 set_fail(&fail, table, "Directory with strategy");
1482 if (table->extra1) 128 if (table->extra1)
1483 set_fail(&fail, table, "Directory with extra1"); 129 set_fail(&fail, table, "Directory with extra1");
1484 if (table->extra2) 130 if (table->extra2)
1485 set_fail(&fail, table, "Directory with extra2"); 131 set_fail(&fail, table, "Directory with extra2");
1486 if (sysctl_check_dir(namespaces, table))
1487 set_fail(&fail, table, "Inconsistent directory names");
1488 } else { 132 } else {
1489 if ((table->strategy == sysctl_data) || 133 if ((table->proc_handler == proc_dostring) ||
1490 (table->strategy == sysctl_string) ||
1491 (table->strategy == sysctl_intvec) ||
1492 (table->strategy == sysctl_jiffies) ||
1493 (table->strategy == sysctl_ms_jiffies) ||
1494 (table->proc_handler == proc_dostring) ||
1495 (table->proc_handler == proc_dointvec) || 134 (table->proc_handler == proc_dointvec) ||
1496 (table->proc_handler == proc_dointvec_minmax) || 135 (table->proc_handler == proc_dointvec_minmax) ||
1497 (table->proc_handler == proc_dointvec_jiffies) || 136 (table->proc_handler == proc_dointvec_jiffies) ||
@@ -1513,15 +152,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1513 set_fail(&fail, table, "No max"); 152 set_fail(&fail, table, "No max");
1514 } 153 }
1515 } 154 }
1516#ifdef CONFIG_SYSCTL_SYSCALL 155#ifdef CONFIG_PROC_SYSCTL
1517 if (table->ctl_name && !table->strategy)
1518 set_fail(&fail, table, "Missing strategy");
1519#endif
1520#if 0
1521 if (!table->ctl_name && table->strategy)
1522 set_fail(&fail, table, "Strategy without ctl_name");
1523#endif
1524#ifdef CONFIG_PROC_FS
1525 if (table->procname && !table->proc_handler) 156 if (table->procname && !table->proc_handler)
1526 set_fail(&fail, table, "No proc_handler"); 157 set_fail(&fail, table, "No proc_handler");
1527#endif 158#endif
@@ -1531,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1531#endif 162#endif
1532 sysctl_check_leaf(namespaces, table, &fail); 163 sysctl_check_leaf(namespaces, table, &fail);
1533 } 164 }
1534 sysctl_check_bin_path(table, &fail);
1535 if (table->mode > 0777) 165 if (table->mode > 0777)
1536 set_fail(&fail, table, "bogus .mode"); 166 set_fail(&fail, table, "bogus .mode");
1537 if (fail) { 167 if (fail) {
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fe..804798005d1 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -662,6 +662,36 @@ u64 nsec_to_clock_t(u64 x)
662#endif 662#endif
663} 663}
664 664
665/**
666 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
667 *
668 * @n: nsecs in u64
669 *
670 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
671 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
672 * for scheduler, not for use in device drivers to calculate timeout value.
673 *
674 * note:
675 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
676 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
677 */
678unsigned long nsecs_to_jiffies(u64 n)
679{
680#if (NSEC_PER_SEC % HZ) == 0
681 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
682 return div_u64(n, NSEC_PER_SEC / HZ);
683#elif (HZ % 512) == 0
684 /* overflow after 292 years if HZ = 1024 */
685 return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
686#else
687 /*
688 * Generic case - optimized for cases where HZ is a multiple of 3.
689 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
690 */
691 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
692#endif
693}
694
665#if (BITS_PER_LONG < 64) 695#if (BITS_PER_LONG < 64)
666u64 get_jiffies_64(void) 696u64 get_jiffies_64(void)
667{ 697{
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc3..d7395fdfb9f 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h> 21#include <linux/tick.h>
22 22
23#include "tick-internal.h"
24
23/* The registered clock event devices */ 25/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 26static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 27static LIST_HEAD(clockevents_released);
@@ -28,7 +30,7 @@ static LIST_HEAD(clockevents_released);
28static RAW_NOTIFIER_HEAD(clockevents_chain); 30static RAW_NOTIFIER_HEAD(clockevents_chain);
29 31
30/* Protection for the above */ 32/* Protection for the above */
31static DEFINE_SPINLOCK(clockevents_lock); 33static DEFINE_RAW_SPINLOCK(clockevents_lock);
32 34
33/** 35/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 36 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
37 * 39 *
38 * Math helper, returns latch value converted to nanoseconds (bound checked) 40 * Math helper, returns latch value converted to nanoseconds (bound checked)
39 */ 41 */
40unsigned long clockevent_delta2ns(unsigned long latch, 42u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
41 struct clock_event_device *evt)
42{ 43{
43 u64 clc = ((u64) latch << evt->shift); 44 u64 clc = (u64) latch << evt->shift;
44 45
45 if (unlikely(!evt->mult)) { 46 if (unlikely(!evt->mult)) {
46 evt->mult = 1; 47 evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
50 do_div(clc, evt->mult); 51 do_div(clc, evt->mult);
51 if (clc < 1000) 52 if (clc < 1000)
52 clc = 1000; 53 clc = 1000;
53 if (clc > LONG_MAX) 54 if (clc > KTIME_MAX)
54 clc = LONG_MAX; 55 clc = KTIME_MAX;
55 56
56 return (unsigned long) clc; 57 return clc;
57} 58}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns); 59EXPORT_SYMBOL_GPL(clockevent_delta2ns);
59 60
@@ -140,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb)
140 unsigned long flags; 141 unsigned long flags;
141 int ret; 142 int ret;
142 143
143 spin_lock_irqsave(&clockevents_lock, flags); 144 raw_spin_lock_irqsave(&clockevents_lock, flags);
144 ret = raw_notifier_chain_register(&clockevents_chain, nb); 145 ret = raw_notifier_chain_register(&clockevents_chain, nb);
145 spin_unlock_irqrestore(&clockevents_lock, flags); 146 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
146 147
147 return ret; 148 return ret;
148} 149}
@@ -184,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev)
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 186 BUG_ON(!dev->cpumask);
186 187
187 spin_lock_irqsave(&clockevents_lock, flags); 188 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 189
189 list_add(&dev->list, &clockevent_devices); 190 list_add(&dev->list, &clockevent_devices);
190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 191 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
191 clockevents_notify_released(); 192 clockevents_notify_released();
192 193
193 spin_unlock_irqrestore(&clockevents_lock, flags); 194 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
194} 195}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 196EXPORT_SYMBOL_GPL(clockevents_register_device);
196 197
@@ -237,10 +238,11 @@ void clockevents_exchange_device(struct clock_event_device *old,
237 */ 238 */
238void clockevents_notify(unsigned long reason, void *arg) 239void clockevents_notify(unsigned long reason, void *arg)
239{ 240{
240 struct list_head *node, *tmp; 241 struct clock_event_device *dev, *tmp;
241 unsigned long flags; 242 unsigned long flags;
243 int cpu;
242 244
243 spin_lock_irqsave(&clockevents_lock, flags); 245 raw_spin_lock_irqsave(&clockevents_lock, flags);
244 clockevents_do_notify(reason, arg); 246 clockevents_do_notify(reason, arg);
245 247
246 switch (reason) { 248 switch (reason) {
@@ -249,13 +251,25 @@ void clockevents_notify(unsigned long reason, void *arg)
249 * Unregister the clock event devices which were 251 * Unregister the clock event devices which were
250 * released from the users in the notify chain. 252 * released from the users in the notify chain.
251 */ 253 */
252 list_for_each_safe(node, tmp, &clockevents_released) 254 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
253 list_del(node); 255 list_del(&dev->list);
256 /*
257 * Now check whether the CPU has left unused per cpu devices
258 */
259 cpu = *((int *)arg);
260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
261 if (cpumask_test_cpu(cpu, dev->cpumask) &&
262 cpumask_weight(dev->cpumask) == 1 &&
263 !tick_is_broadcast_device(dev)) {
264 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
265 list_del(&dev->list);
266 }
267 }
254 break; 268 break;
255 default: 269 default:
256 break; 270 break;
257 } 271 }
258 spin_unlock_irqrestore(&clockevents_lock, flags); 272 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
259} 273}
260EXPORT_SYMBOL_GPL(clockevents_notify); 274EXPORT_SYMBOL_GPL(clockevents_notify);
261#endif 275#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5e18c6ab2c6..13700833c18 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -39,7 +39,7 @@ void timecounter_init(struct timecounter *tc,
39 tc->cycle_last = cc->read(cc); 39 tc->cycle_last = cc->read(cc);
40 tc->nsec = start_tstamp; 40 tc->nsec = start_tstamp;
41} 41}
42EXPORT_SYMBOL(timecounter_init); 42EXPORT_SYMBOL_GPL(timecounter_init);
43 43
44/** 44/**
45 * timecounter_read_delta - get nanoseconds since last call of this function 45 * timecounter_read_delta - get nanoseconds since last call of this function
@@ -83,7 +83,7 @@ u64 timecounter_read(struct timecounter *tc)
83 83
84 return nsec; 84 return nsec;
85} 85}
86EXPORT_SYMBOL(timecounter_read); 86EXPORT_SYMBOL_GPL(timecounter_read);
87 87
88u64 timecounter_cyc2time(struct timecounter *tc, 88u64 timecounter_cyc2time(struct timecounter *tc,
89 cycle_t cycle_tstamp) 89 cycle_t cycle_tstamp)
@@ -105,7 +105,60 @@ u64 timecounter_cyc2time(struct timecounter *tc,
105 105
106 return nsec; 106 return nsec;
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL_GPL(timecounter_cyc2time);
109
110/**
111 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
112 * @mult: pointer to mult variable
113 * @shift: pointer to shift variable
114 * @from: frequency to convert from
115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds
117 *
118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents.
120 *
121 * @to and @from are frequency values in HZ. For clock sources @to is
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * events @to is the counter frequency and @from is NSEC_PER_SEC.
124 *
125 * The @minsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is
129 * multiplied with the calculated mult factor. Larger ranges may
130 * reduce the conversion accuracy by choosing smaller mult and shift
131 * factors.
132 */
133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
135{
136 u64 tmp;
137 u32 sft, sftacc= 32;
138
139 /*
140 * Calculate the shift factor which is limiting the conversion
141 * range:
142 */
143 tmp = ((u64)minsec * from) >> 32;
144 while (tmp) {
145 tmp >>=1;
146 sftacc--;
147 }
148
149 /*
150 * Find the conversion shift/mult pair which has the best
151 * accuracy and fits the minsec conversion range:
152 */
153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft;
155 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0)
157 break;
158 }
159 *mult = tmp;
160 *shift = sft;
161}
109 162
110/*[Clocksource internal variables]--------- 163/*[Clocksource internal variables]---------
111 * curr_clocksource: 164 * curr_clocksource:
@@ -290,7 +343,19 @@ static void clocksource_resume_watchdog(void)
290{ 343{
291 unsigned long flags; 344 unsigned long flags;
292 345
293 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential deadlock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept, that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
294 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
296} 361}
@@ -405,14 +470,55 @@ void clocksource_resume(void)
405 * clocksource_touch_watchdog - Update watchdog 470 * clocksource_touch_watchdog - Update watchdog
406 * 471 *
407 * Update the watchdog after exception contexts such as kgdb so as not 472 * Update the watchdog after exception contexts such as kgdb so as not
408 * to incorrectly trip the watchdog. 473 * to incorrectly trip the watchdog. This might fail when the kernel
409 * 474 * was stopped in code which holds watchdog_lock.
410 */ 475 */
411void clocksource_touch_watchdog(void) 476void clocksource_touch_watchdog(void)
412{ 477{
413 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
414} 479}
415 480
481/**
482 * clocksource_max_deferment - Returns max time the clocksource can be deferred
483 * @cs: Pointer to clocksource
484 *
485 */
486static u64 clocksource_max_deferment(struct clocksource *cs)
487{
488 u64 max_nsecs, max_cycles;
489
490 /*
491 * Calculate the maximum number of cycles that we can pass to the
492 * cyc2ns function without overflowing a 64-bit signed result. The
493 * maximum number of cycles is equal to LLONG_MAX/cs->mult which
494 * is equivalent to the below.
495 * max_cycles < (2^63)/cs->mult
496 * max_cycles < 2^(log2((2^63)/cs->mult))
497 * max_cycles < 2^(log2(2^63) - log2(cs->mult))
498 * max_cycles < 2^(63 - log2(cs->mult))
499 * max_cycles < 1 << (63 - log2(cs->mult))
500 * Please note that we add 1 to the result of the log2 to account for
501 * any rounding errors, ensure the above inequality is satisfied and
502 * no overflow will occur.
503 */
504 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
505
506 /*
507 * The actual maximum number of cycles we can defer the clocksource is
508 * determined by the minimum of max_cycles and cs->mask.
509 */
510 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
511 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
512
513 /*
514 * To ensure that the clocksource does not wrap whilst we are idle,
515 * limit the time the clocksource can be deferred by 3.125% (1/32 of
516 * max_nsecs). This margin is used because it can be computed with
517 * a shift, versus say 10% which would require division.
518 */
519 return max_nsecs - (max_nsecs >> 5);
520}
521
416#ifdef CONFIG_GENERIC_TIME 522#ifdef CONFIG_GENERIC_TIME
417 523
418/** 524/**
@@ -511,6 +617,9 @@ static void clocksource_enqueue(struct clocksource *cs)
511 */ 617 */
512int clocksource_register(struct clocksource *cs) 618int clocksource_register(struct clocksource *cs)
513{ 619{
620 /* calculate max idle time permitted for this clocksource */
621 cs->max_idle_ns = clocksource_max_deferment(cs);
622
514 mutex_lock(&clocksource_mutex); 623 mutex_lock(&clocksource_mutex);
515 clocksource_enqueue(cs); 624 clocksource_enqueue(cs);
516 clocksource_select(); 625 clocksource_select();
@@ -580,7 +689,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
580 * @count: length of buffer 689 * @count: length of buffer
581 * 690 *
582 * Takes input from sysfs interface for manually overriding the default 691 * Takes input from sysfs interface for manually overriding the default
583 * clocksource selction. 692 * clocksource selection.
584 */ 693 */
585static ssize_t sysfs_override_clocksource(struct sys_device *dev, 694static ssize_t sysfs_override_clocksource(struct sys_device *dev,
586 struct sysdev_attribute *attr, 695 struct sysdev_attribute *attr,
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c2ec25087a3..b3bafd5fc66 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 35static int tick_broadcast_force;
36 36
37#ifdef CONFIG_TICK_ONESHOT 37#ifdef CONFIG_TICK_ONESHOT
@@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
96 unsigned long flags; 96 unsigned long flags;
97 int ret = 0; 97 int ret = 0;
98 98
99 spin_lock_irqsave(&tick_broadcast_lock, flags); 99 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
100 100
101 /* 101 /*
102 * Devices might be registered with both periodic and oneshot 102 * Devices might be registered with both periodic and oneshot
@@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
122 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
123 } 123 }
124 } 124 }
125 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 125 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
126 return ret; 126 return ret;
127} 127}
128 128
@@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask)
161 */ 161 */
162static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
163{ 163{
164 spin_lock(&tick_broadcast_lock); 164 raw_spin_lock(&tick_broadcast_lock);
165 165
166 cpumask_and(to_cpumask(tmpmask), 166 cpumask_and(to_cpumask(tmpmask),
167 cpu_online_mask, tick_get_broadcast_mask()); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask)); 168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 raw_spin_unlock(&tick_broadcast_lock);
171} 171}
172 172
173/* 173/*
@@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
212 unsigned long flags; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
216 216
217 cpu = smp_processor_id(); 217 cpu = smp_processor_id();
218 td = &per_cpu(tick_cpu_device, cpu); 218 td = &per_cpu(tick_cpu_device, cpu);
@@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 tick_broadcast_setup_oneshot(bc); 263 tick_broadcast_setup_oneshot(bc);
264 } 264 }
265out: 265out:
266 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 266 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
267} 267}
268 268
269/* 269/*
@@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
299 unsigned long flags; 299 unsigned long flags;
300 unsigned int cpu = *cpup; 300 unsigned int cpu = *cpup;
301 301
302 spin_lock_irqsave(&tick_broadcast_lock, flags); 302 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
303 303
304 bc = tick_broadcast_device.evtdev; 304 bc = tick_broadcast_device.evtdev;
305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
@@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
309 clockevents_shutdown(bc); 309 clockevents_shutdown(bc);
310 } 310 }
311 311
312 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 312 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
313} 313}
314 314
315void tick_suspend_broadcast(void) 315void tick_suspend_broadcast(void)
@@ -317,13 +317,13 @@ void tick_suspend_broadcast(void)
317 struct clock_event_device *bc; 317 struct clock_event_device *bc;
318 unsigned long flags; 318 unsigned long flags;
319 319
320 spin_lock_irqsave(&tick_broadcast_lock, flags); 320 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
321 321
322 bc = tick_broadcast_device.evtdev; 322 bc = tick_broadcast_device.evtdev;
323 if (bc) 323 if (bc)
324 clockevents_shutdown(bc); 324 clockevents_shutdown(bc);
325 325
326 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 326 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
327} 327}
328 328
329int tick_resume_broadcast(void) 329int tick_resume_broadcast(void)
@@ -332,7 +332,7 @@ int tick_resume_broadcast(void)
332 unsigned long flags; 332 unsigned long flags;
333 int broadcast = 0; 333 int broadcast = 0;
334 334
335 spin_lock_irqsave(&tick_broadcast_lock, flags); 335 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
336 336
337 bc = tick_broadcast_device.evtdev; 337 bc = tick_broadcast_device.evtdev;
338 338
@@ -351,7 +351,7 @@ int tick_resume_broadcast(void)
351 break; 351 break;
352 } 352 }
353 } 353 }
354 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 354 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
355 355
356 return broadcast; 356 return broadcast;
357} 357}
@@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
405 ktime_t now, next_event; 405 ktime_t now, next_event;
406 int cpu; 406 int cpu;
407 407
408 spin_lock(&tick_broadcast_lock); 408 raw_spin_lock(&tick_broadcast_lock);
409again: 409again:
410 dev->next_event.tv64 = KTIME_MAX; 410 dev->next_event.tv64 = KTIME_MAX;
411 next_event.tv64 = KTIME_MAX; 411 next_event.tv64 = KTIME_MAX;
@@ -443,7 +443,7 @@ again:
443 if (tick_broadcast_set_event(next_event, 0)) 443 if (tick_broadcast_set_event(next_event, 0))
444 goto again; 444 goto again;
445 } 445 }
446 spin_unlock(&tick_broadcast_lock); 446 raw_spin_unlock(&tick_broadcast_lock);
447} 447}
448 448
449/* 449/*
@@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 457 unsigned long flags;
458 int cpu; 458 int cpu;
459 459
460 spin_lock_irqsave(&tick_broadcast_lock, flags); 460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461 461
462 /* 462 /*
463 * Periodic mode does not care about the enter/exit of power 463 * Periodic mode does not care about the enter/exit of power
@@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
492 } 492 }
493 493
494out: 494out:
495 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 496}
497 497
498/* 498/*
@@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void)
563 struct clock_event_device *bc; 563 struct clock_event_device *bc;
564 unsigned long flags; 564 unsigned long flags;
565 565
566 spin_lock_irqsave(&tick_broadcast_lock, flags); 566 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
567 567
568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
569 bc = tick_broadcast_device.evtdev; 569 bc = tick_broadcast_device.evtdev;
570 if (bc) 570 if (bc)
571 tick_broadcast_setup_oneshot(bc); 571 tick_broadcast_setup_oneshot(bc);
572 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 572 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
573} 573}
574 574
575 575
@@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
581 unsigned long flags; 581 unsigned long flags;
582 unsigned int cpu = *cpup; 582 unsigned int cpu = *cpup;
583 583
584 spin_lock_irqsave(&tick_broadcast_lock, flags); 584 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
585 585
586 /* 586 /*
587 * Clear the broadcast mask flag for the dead cpu, but do not 587 * Clear the broadcast mask flag for the dead cpu, but do not
@@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
589 */ 589 */
590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
591 591
592 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 592 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
593} 593}
594 594
595/* 595/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 83c4417b6a3..b6b898d2eee 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37static DEFINE_RAW_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
40 * Debugging: see timer_list.c 40 * Debugging: see timer_list.c
@@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
209 int cpu, ret = NOTIFY_OK; 209 int cpu, ret = NOTIFY_OK;
210 unsigned long flags; 210 unsigned long flags;
211 211
212 spin_lock_irqsave(&tick_device_lock, flags); 212 raw_spin_lock_irqsave(&tick_device_lock, flags);
213 213
214 cpu = smp_processor_id(); 214 cpu = smp_processor_id();
215 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 215 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
269 tick_oneshot_notify(); 269 tick_oneshot_notify();
270 270
271 spin_unlock_irqrestore(&tick_device_lock, flags); 271 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
272 return NOTIFY_STOP; 272 return NOTIFY_STOP;
273 273
274out_bc: 274out_bc:
@@ -278,7 +278,7 @@ out_bc:
278 if (tick_check_broadcast_device(newdev)) 278 if (tick_check_broadcast_device(newdev))
279 ret = NOTIFY_STOP; 279 ret = NOTIFY_STOP;
280 280
281 spin_unlock_irqrestore(&tick_device_lock, flags); 281 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
282 282
283 return ret; 283 return ret;
284} 284}
@@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup)
311 struct clock_event_device *dev = td->evtdev; 311 struct clock_event_device *dev = td->evtdev;
312 unsigned long flags; 312 unsigned long flags;
313 313
314 spin_lock_irqsave(&tick_device_lock, flags); 314 raw_spin_lock_irqsave(&tick_device_lock, flags);
315 td->mode = TICKDEV_MODE_PERIODIC; 315 td->mode = TICKDEV_MODE_PERIODIC;
316 if (dev) { 316 if (dev) {
317 /* 317 /*
@@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup)
322 clockevents_exchange_device(dev, NULL); 322 clockevents_exchange_device(dev, NULL);
323 td->evtdev = NULL; 323 td->evtdev = NULL;
324 } 324 }
325 spin_unlock_irqrestore(&tick_device_lock, flags); 325 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
326} 326}
327 327
328static void tick_suspend(void) 328static void tick_suspend(void)
@@ -330,9 +330,9 @@ static void tick_suspend(void)
330 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 330 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
331 unsigned long flags; 331 unsigned long flags;
332 332
333 spin_lock_irqsave(&tick_device_lock, flags); 333 raw_spin_lock_irqsave(&tick_device_lock, flags);
334 clockevents_shutdown(td->evtdev); 334 clockevents_shutdown(td->evtdev);
335 spin_unlock_irqrestore(&tick_device_lock, flags); 335 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
336} 336}
337 337
338static void tick_resume(void) 338static void tick_resume(void)
@@ -341,7 +341,7 @@ static void tick_resume(void)
341 unsigned long flags; 341 unsigned long flags;
342 int broadcast = tick_resume_broadcast(); 342 int broadcast = tick_resume_broadcast();
343 343
344 spin_lock_irqsave(&tick_device_lock, flags); 344 raw_spin_lock_irqsave(&tick_device_lock, flags);
345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
346 346
347 if (!broadcast) { 347 if (!broadcast) {
@@ -350,7 +350,7 @@ static void tick_resume(void)
350 else 350 else
351 tick_resume_oneshot(); 351 tick_resume_oneshot();
352 } 352 }
353 spin_unlock_irqrestore(&tick_device_lock, flags); 353 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
354} 354}
355 355
356/* 356/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b1c05bf75ee..290eefbc1f6 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,7 +6,6 @@
6#define TICK_DO_TIMER_BOOT -2 6#define TICK_DO_TIMER_BOOT -2
7 7
8DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
9extern spinlock_t tick_device_lock;
10extern ktime_t tick_next_period; 9extern ktime_t tick_next_period;
11extern ktime_t tick_period; 10extern ktime_t tick_period;
12extern int tick_do_timer_cpu __read_mostly; 11extern int tick_do_timer_cpu __read_mostly;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89c..0a8a213016f 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51 51
52 printk(KERN_WARNING 52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n", 53 "CE: %s increasing min_delta_ns to %llu nsec\n",
54 dev->name ? dev->name : "?", 54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1); 55 (unsigned long long) dev->min_delta_ns << 1);
56 56
57 i = 0; 57 i = 0;
58 } 58 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 89aed5933ed..f992762d7f5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137static void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(ktime_t now)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
141 unsigned long flags; 141 unsigned long flags;
142 ktime_t now;
143
144 if (!ts->tick_stopped)
145 return;
146 142
147 cpumask_clear_cpu(cpu, nohz_cpu_mask); 143 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get();
149 ts->idle_waketime = now; 144 ts->idle_waketime = now;
150 145
151 local_irq_save(flags); 146 local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
155 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
156} 151}
157 152
158static void tick_nohz_stop_idle(int cpu) 153static void tick_nohz_stop_idle(int cpu, ktime_t now)
159{ 154{
160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
161 157
162 if (ts->idle_active) { 158 delta = ktime_sub(now, ts->idle_entrytime);
163 ktime_t now, delta; 159 ts->idle_lastupdate = now;
164 now = ktime_get(); 160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 delta = ktime_sub(now, ts->idle_entrytime); 161 ts->idle_active = 0;
166 ts->idle_lastupdate = now;
167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
168 ts->idle_active = 0;
169 162
170 sched_clock_idle_wakeup_event(0); 163 sched_clock_idle_wakeup_event(0);
171 }
172} 164}
173 165
174static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 166static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
216 struct tick_sched *ts; 208 struct tick_sched *ts;
217 ktime_t last_update, expires, now; 209 ktime_t last_update, expires, now;
218 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 210 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
211 u64 time_delta;
219 int cpu; 212 int cpu;
220 213
221 local_irq_save(flags); 214 local_irq_save(flags);
@@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
263 256
264 if (ratelimit < 10) { 257 if (ratelimit < 10) {
265 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 258 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
266 local_softirq_pending()); 259 (unsigned int) local_softirq_pending());
267 ratelimit++; 260 ratelimit++;
268 } 261 }
269 goto end; 262 goto end;
@@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
275 seq = read_seqbegin(&xtime_lock); 268 seq = read_seqbegin(&xtime_lock);
276 last_update = last_jiffies_update; 269 last_update = last_jiffies_update;
277 last_jiffies = jiffies; 270 last_jiffies = jiffies;
271 time_delta = timekeeping_max_deferment();
278 } while (read_seqretry(&xtime_lock, seq)); 272 } while (read_seqretry(&xtime_lock, seq));
279 273
280 /* Get the next timer wheel timer */ 274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
281 next_jiffies = get_next_timer_interrupt(last_jiffies); 275 arch_needs_cpu(cpu)) {
282 delta_jiffies = next_jiffies - last_jiffies; 276 next_jiffies = last_jiffies + 1;
283
284 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
285 delta_jiffies = 1; 277 delta_jiffies = 1;
278 } else {
279 /* Get the next timer wheel timer */
280 next_jiffies = get_next_timer_interrupt(last_jiffies);
281 delta_jiffies = next_jiffies - last_jiffies;
282 }
286 /* 283 /*
287 * Do not stop the tick, if we are only one off 284 * Do not stop the tick, if we are only one off
288 * or if the cpu is required for rcu 285 * or if the cpu is required for rcu
@@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
294 if ((long)delta_jiffies >= 1) { 291 if ((long)delta_jiffies >= 1) {
295 292
296 /* 293 /*
297 * calculate the expiry time for the next timer wheel
298 * timer
299 */
300 expires = ktime_add_ns(last_update, tick_period.tv64 *
301 delta_jiffies);
302
303 /*
304 * If this cpu is the one which updates jiffies, then 294 * If this cpu is the one which updates jiffies, then
305 * give up the assignment and let it be taken by the 295 * give up the assignment and let it be taken by the
306 * cpu which runs the tick timer next, which might be 296 * cpu which runs the tick timer next, which might be
307 * this cpu as well. If we don't drop this here the 297 * this cpu as well. If we don't drop this here the
308 * jiffies might be stale and do_timer() never 298 * jiffies might be stale and do_timer() never
309 * invoked. 299 * invoked. Keep track of the fact that it was the one
300 * which had the do_timer() duty last. If this cpu is
301 * the one which had the do_timer() duty last, we
302 * limit the sleep time to the timekeeping
303 * max_deferment value which we retrieved
304 * above. Otherwise we can sleep as long as we want.
310 */ 305 */
311 if (cpu == tick_do_timer_cpu) 306 if (cpu == tick_do_timer_cpu) {
312 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 307 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
308 ts->do_timer_last = 1;
309 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
310 time_delta = KTIME_MAX;
311 ts->do_timer_last = 0;
312 } else if (!ts->do_timer_last) {
313 time_delta = KTIME_MAX;
314 }
315
316 /*
317 * calculate the expiry time for the next timer wheel
318 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
319 * that there is no timer pending or at least extremely
320 * far into the future (12 days for HZ=1000). In this
321 * case we set the expiry to the end of time.
322 */
323 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
324 /*
325 * Calculate the time delta for the next timer event.
326 * If the time delta exceeds the maximum time delta
327 * permitted by the current clocksource then adjust
328 * the time delta accordingly to ensure the
329 * clocksource does not wrap.
330 */
331 time_delta = min_t(u64, time_delta,
332 tick_period.tv64 * delta_jiffies);
333 }
334
335 if (time_delta < KTIME_MAX)
336 expires = ktime_add_ns(last_update, time_delta);
337 else
338 expires.tv64 = KTIME_MAX;
313 339
314 if (delta_jiffies > 1) 340 if (delta_jiffies > 1)
315 cpumask_set_cpu(cpu, nohz_cpu_mask); 341 cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
342 368
343 ts->idle_sleeps++; 369 ts->idle_sleeps++;
344 370
371 /* Mark expires */
372 ts->idle_expires = expires;
373
345 /* 374 /*
346 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 375 * If the expiration time == KTIME_MAX, then
347 * there is no timer pending or at least extremly far 376 * in this case we simply stop the tick timer.
348 * into the future (12 days for HZ=1000). In this case
349 * we simply stop the tick timer:
350 */ 377 */
351 if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { 378 if (unlikely(expires.tv64 == KTIME_MAX)) {
352 ts->idle_expires.tv64 = KTIME_MAX;
353 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 379 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
354 hrtimer_cancel(&ts->sched_timer); 380 hrtimer_cancel(&ts->sched_timer);
355 goto out; 381 goto out;
356 } 382 }
357 383
358 /* Mark expiries */
359 ts->idle_expires = expires;
360
361 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 384 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
362 hrtimer_start(&ts->sched_timer, expires, 385 hrtimer_start(&ts->sched_timer, expires,
363 HRTIMER_MODE_ABS_PINNED); 386 HRTIMER_MODE_ABS_PINNED);
@@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
436 ktime_t now; 459 ktime_t now;
437 460
438 local_irq_disable(); 461 local_irq_disable();
439 tick_nohz_stop_idle(cpu); 462 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
463 now = ktime_get();
464
465 if (ts->idle_active)
466 tick_nohz_stop_idle(cpu, now);
440 467
441 if (!ts->inidle || !ts->tick_stopped) { 468 if (!ts->inidle || !ts->tick_stopped) {
442 ts->inidle = 0; 469 ts->inidle = 0;
@@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
450 477
451 /* Update jiffies first */ 478 /* Update jiffies first */
452 select_nohz_load_balancer(0); 479 select_nohz_load_balancer(0);
453 now = ktime_get();
454 tick_do_update_jiffies64(now); 480 tick_do_update_jiffies64(now);
455 cpumask_clear_cpu(cpu, nohz_cpu_mask); 481 cpumask_clear_cpu(cpu, nohz_cpu_mask);
456 482
@@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
584 * timer and do not touch the other magic bits which need to be done 610 * timer and do not touch the other magic bits which need to be done
585 * when idle is left. 611 * when idle is left.
586 */ 612 */
587static void tick_nohz_kick_tick(int cpu) 613static void tick_nohz_kick_tick(int cpu, ktime_t now)
588{ 614{
589#if 0 615#if 0
590 /* Switch back to 2.6.27 behaviour */ 616 /* Switch back to 2.6.27 behaviour */
591 617
592 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 618 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
593 ktime_t delta, now; 619 ktime_t delta;
594
595 if (!ts->tick_stopped)
596 return;
597 620
598 /* 621 /*
599 * Do not touch the tick device, when the next expiry is either 622 * Do not touch the tick device, when the next expiry is either
600 * already reached or less/equal than the tick period. 623 * already reached or less/equal than the tick period.
601 */ 624 */
602 now = ktime_get();
603 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 625 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
604 if (delta.tv64 <= tick_period.tv64) 626 if (delta.tv64 <= tick_period.tv64)
605 return; 627 return;
@@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
608#endif 630#endif
609} 631}
610 632
633static inline void tick_check_nohz(int cpu)
634{
635 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
636 ktime_t now;
637
638 if (!ts->idle_active && !ts->tick_stopped)
639 return;
640 now = ktime_get();
641 if (ts->idle_active)
642 tick_nohz_stop_idle(cpu, now);
643 if (ts->tick_stopped) {
644 tick_nohz_update_jiffies(now);
645 tick_nohz_kick_tick(cpu, now);
646 }
647}
648
611#else 649#else
612 650
613static inline void tick_nohz_switch_to_nohz(void) { } 651static inline void tick_nohz_switch_to_nohz(void) { }
652static inline void tick_check_nohz(int cpu) { }
614 653
615#endif /* NO_HZ */ 654#endif /* NO_HZ */
616 655
@@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
620void tick_check_idle(int cpu) 659void tick_check_idle(int cpu)
621{ 660{
622 tick_check_oneshot_broadcast(cpu); 661 tick_check_oneshot_broadcast(cpu);
623#ifdef CONFIG_NO_HZ 662 tick_check_nohz(cpu);
624 tick_nohz_stop_idle(cpu);
625 tick_nohz_update_jiffies();
626 tick_nohz_kick_tick(cpu);
627#endif
628} 663}
629 664
630/* 665/*
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 71e7f1a1915..12f5c55090b 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -40,7 +40,7 @@ ktime_t timecompare_transform(struct timecompare *sync,
40 40
41 return ns_to_ktime(nsec); 41 return ns_to_ktime(nsec);
42} 42}
43EXPORT_SYMBOL(timecompare_transform); 43EXPORT_SYMBOL_GPL(timecompare_transform);
44 44
45int timecompare_offset(struct timecompare *sync, 45int timecompare_offset(struct timecompare *sync,
46 s64 *offset, 46 s64 *offset,
@@ -89,7 +89,7 @@ int timecompare_offset(struct timecompare *sync,
89 * source time 89 * source time
90 */ 90 */
91 sample.offset = 91 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 - 92 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
93 ts; 93 ts;
94 94
95 /* simple insertion sort based on duration */ 95 /* simple insertion sort based on duration */
@@ -131,7 +131,7 @@ int timecompare_offset(struct timecompare *sync,
131 131
132 return used; 132 return used;
133} 133}
134EXPORT_SYMBOL(timecompare_offset); 134EXPORT_SYMBOL_GPL(timecompare_offset);
135 135
136void __timecompare_update(struct timecompare *sync, 136void __timecompare_update(struct timecompare *sync,
137 u64 source_tstamp) 137 u64 source_tstamp)
@@ -188,4 +188,4 @@ void __timecompare_update(struct timecompare *sync,
188 } 188 }
189 } 189 }
190} 190}
191EXPORT_SYMBOL(__timecompare_update); 191EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907ea..e2ab064c6d4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -177,7 +177,7 @@ void timekeeping_leap_insert(int leapsecond)
177{ 177{
178 xtime.tv_sec += leapsecond; 178 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond; 179 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock); 180 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
181} 181}
182 182
183#ifdef CONFIG_GENERIC_TIME 183#ifdef CONFIG_GENERIC_TIME
@@ -337,7 +337,7 @@ int do_settimeofday(struct timespec *tv)
337 timekeeper.ntp_error = 0; 337 timekeeper.ntp_error = 0;
338 ntp_clear(); 338 ntp_clear();
339 339
340 update_vsyscall(&xtime, timekeeper.clock); 340 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
341 341
342 write_sequnlock_irqrestore(&xtime_lock, flags); 342 write_sequnlock_irqrestore(&xtime_lock, flags);
343 343
@@ -488,6 +488,17 @@ int timekeeping_valid_for_hres(void)
488} 488}
489 489
490/** 490/**
491 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
492 *
493 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
494 * ensure that the clocksource does not change!
495 */
496u64 timekeeping_max_deferment(void)
497{
498 return timekeeper.clock->max_idle_ns;
499}
500
501/**
491 * read_persistent_clock - Return time from the persistent clock. 502 * read_persistent_clock - Return time from the persistent clock.
492 * 503 *
493 * Weak dummy function for arches that do not yet support it. 504 * Weak dummy function for arches that do not yet support it.
@@ -722,6 +733,51 @@ static void timekeeping_adjust(s64 offset)
722 timekeeper.ntp_error_shift; 733 timekeeper.ntp_error_shift;
723} 734}
724 735
736
737/**
738 * logarithmic_accumulation - shifted accumulation of cycles
739 *
740 * This functions accumulates a shifted interval of cycles into
741 * into a shifted interval nanoseconds. Allows for O(log) accumulation
742 * loop.
743 *
744 * Returns the unconsumed cycles.
745 */
746static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
747{
748 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
749
750 /* If the offset is smaller than a shifted interval, do nothing */
751 if (offset < timekeeper.cycle_interval<<shift)
752 return offset;
753
754 /* Accumulate one shifted interval */
755 offset -= timekeeper.cycle_interval << shift;
756 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
757
758 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
759 while (timekeeper.xtime_nsec >= nsecps) {
760 timekeeper.xtime_nsec -= nsecps;
761 xtime.tv_sec++;
762 second_overflow();
763 }
764
765 /* Accumulate into raw time */
766 raw_time.tv_nsec += timekeeper.raw_interval << shift;;
767 while (raw_time.tv_nsec >= NSEC_PER_SEC) {
768 raw_time.tv_nsec -= NSEC_PER_SEC;
769 raw_time.tv_sec++;
770 }
771
772 /* Accumulate error between NTP and clock interval */
773 timekeeper.ntp_error += tick_length << shift;
774 timekeeper.ntp_error -= timekeeper.xtime_interval <<
775 (timekeeper.ntp_error_shift + shift);
776
777 return offset;
778}
779
780
725/** 781/**
726 * update_wall_time - Uses the current clocksource to increment the wall time 782 * update_wall_time - Uses the current clocksource to increment the wall time
727 * 783 *
@@ -732,6 +788,7 @@ void update_wall_time(void)
732 struct clocksource *clock; 788 struct clocksource *clock;
733 cycle_t offset; 789 cycle_t offset;
734 u64 nsecs; 790 u64 nsecs;
791 int shift = 0, maxshift;
735 792
736 /* Make sure we're fully resumed: */ 793 /* Make sure we're fully resumed: */
737 if (unlikely(timekeeping_suspended)) 794 if (unlikely(timekeeping_suspended))
@@ -745,33 +802,22 @@ void update_wall_time(void)
745#endif 802#endif
746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 803 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
747 804
748 /* normally this loop will run just once, however in the 805 /*
749 * case of lost or late ticks, it will accumulate correctly. 806 * With NO_HZ we may have to accumulate many cycle_intervals
807 * (think "ticks") worth of time at once. To do this efficiently,
808 * we calculate the largest doubling multiple of cycle_intervals
809 * that is smaller than the offset. We then accumulate that
810 * chunk in one go, and then try to consume the next smaller
811 * doubled multiple.
750 */ 812 */
813 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
814 shift = max(0, shift);
815 /* Bound shift to one less than what overflows tick_length */
816 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
817 shift = min(shift, maxshift);
751 while (offset >= timekeeper.cycle_interval) { 818 while (offset >= timekeeper.cycle_interval) {
752 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 819 offset = logarithmic_accumulation(offset, shift);
753 820 shift--;
754 /* accumulate one interval */
755 offset -= timekeeper.cycle_interval;
756 clock->cycle_last += timekeeper.cycle_interval;
757
758 timekeeper.xtime_nsec += timekeeper.xtime_interval;
759 if (timekeeper.xtime_nsec >= nsecps) {
760 timekeeper.xtime_nsec -= nsecps;
761 xtime.tv_sec++;
762 second_overflow();
763 }
764
765 raw_time.tv_nsec += timekeeper.raw_interval;
766 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
767 raw_time.tv_nsec -= NSEC_PER_SEC;
768 raw_time.tv_sec++;
769 }
770
771 /* accumulate error between NTP and clock interval */
772 timekeeper.ntp_error += tick_length;
773 timekeeper.ntp_error -= timekeeper.xtime_interval <<
774 timekeeper.ntp_error_shift;
775 } 821 }
776 822
777 /* correct the clock when NTP error is too big */ 823 /* correct the clock when NTP error is too big */
@@ -811,7 +857,7 @@ void update_wall_time(void)
811 update_xtime_cache(nsecs); 857 update_xtime_cache(nsecs);
812 858
813 /* check to see if there is a new clocksource to use */ 859 /* check to see if there is a new clocksource to use */
814 update_vsyscall(&xtime, timekeeper.clock); 860 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
815} 861}
816 862
817/** 863/**
@@ -834,6 +880,7 @@ void getboottime(struct timespec *ts)
834 880
835 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
836} 882}
883EXPORT_SYMBOL_GPL(getboottime);
837 884
838/** 885/**
839 * monotonic_to_bootbased - Convert the monotonic time to boot based. 886 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -843,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts)
843{ 890{
844 *ts = timespec_add_safe(*ts, total_sleep_time); 891 *ts = timespec_add_safe(*ts, total_sleep_time);
845} 892}
893EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
846 894
847unsigned long get_seconds(void) 895unsigned long get_seconds(void)
848{ 896{
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdf..bdfb8dd1050 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = base->first;
90 /* 90 /*
@@ -100,13 +100,13 @@ next_one:
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = rb_entry(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
105 print_timer(m, timer, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
106 next++; 106 next++;
107 goto next_one; 107 goto next_one;
108 } 108 }
109 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 109 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
110} 110}
111 111
112static void 112static void
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
204 return; 207 return;
205 } 208 }
206 SEQ_printf(m, "%s\n", dev->name); 209 SEQ_printf(m, "%s\n", dev->name);
207 SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); 210 SEQ_printf(m, " max_delta_ns: %llu\n",
208 SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); 211 (unsigned long long) dev->max_delta_ns);
209 SEQ_printf(m, " mult: %lu\n", dev->mult); 212 SEQ_printf(m, " min_delta_ns: %llu\n",
210 SEQ_printf(m, " shift: %d\n", dev->shift); 213 (unsigned long long) dev->min_delta_ns);
214 SEQ_printf(m, " mult: %u\n", dev->mult);
215 SEQ_printf(m, " shift: %u\n", dev->shift);
211 SEQ_printf(m, " mode: %d\n", dev->mode); 216 SEQ_printf(m, " mode: %d\n", dev->mode);
212 SEQ_printf(m, " next_event: %Ld nsecs\n", 217 SEQ_printf(m, " next_event: %Ld nsecs\n",
213 (unsigned long long) ktime_to_ns(dev->next_event)); 218 (unsigned long long) ktime_to_ns(dev->next_event));
@@ -232,10 +237,10 @@ static void timer_list_show_tickdevices(struct seq_file *m)
232#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 237#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
233 print_tickdevice(m, tick_get_broadcast_device(), -1); 238 print_tickdevice(m, tick_get_broadcast_device(), -1);
234 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 239 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
235 tick_get_broadcast_mask()->bits[0]); 240 cpumask_bits(tick_get_broadcast_mask())[0]);
236#ifdef CONFIG_TICK_ONESHOT 241#ifdef CONFIG_TICK_ONESHOT
237 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", 242 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
238 tick_get_broadcast_oneshot_mask()->bits[0]); 243 cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
239#endif 244#endif
240 SEQ_printf(m, "\n"); 245 SEQ_printf(m, "\n");
241#endif 246#endif
@@ -252,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
252 u64 now = ktime_to_ns(ktime_get()); 257 u64 now = ktime_to_ns(ktime_get());
253 int cpu; 258 int cpu;
254 259
255 SEQ_printf(m, "Timer List Version: v0.4\n"); 260 SEQ_printf(m, "Timer List Version: v0.5\n");
256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
258 263
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index ee5681f8d7e..2f3b585b8d7 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock);
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
88 */ 88 */
89static DEFINE_PER_CPU(spinlock_t, lookup_lock); 89static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
90 90
91/* 91/*
92 * Mutex to serialize state changes with show-stats activities: 92 * Mutex to serialize state changes with show-stats activities:
@@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesnt matter which lock we take:
240 */ 240 */
241 spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!timer_stats_active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
249 249
250 input.timer = timer; 250 input.timer = timer;
251 input.start_func = startf; 251 input.start_func = startf;
@@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
253 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 raw_spin_lock_irqsave(lock, flags);
257 if (!timer_stats_active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
@@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
264 atomic_inc(&overflow_count); 264 atomic_inc(&overflow_count);
265 265
266 out_unlock: 266 out_unlock:
267 spin_unlock_irqrestore(lock, flags); 267 raw_spin_unlock_irqrestore(lock, flags);
268} 268}
269 269
270static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
@@ -348,9 +348,11 @@ static void sync_access(void)
348 int cpu; 348 int cpu;
349 349
350 for_each_online_cpu(cpu) { 350 for_each_online_cpu(cpu) {
351 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); 351 raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
352
353 raw_spin_lock_irqsave(lock, flags);
352 /* nothing */ 354 /* nothing */
353 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); 355 raw_spin_unlock_irqrestore(lock, flags);
354 } 356 }
355} 357}
356 358
@@ -408,7 +410,7 @@ void __init init_timer_stats(void)
408 int cpu; 410 int cpu;
409 411
410 for_each_possible_cpu(cpu) 412 for_each_possible_cpu(cpu)
411 spin_lock_init(&per_cpu(lookup_lock, cpu)); 413 raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
412} 414}
413 415
414static int __init init_tstats_procfs(void) 416static int __init init_tstats_procfs(void)
diff --git a/kernel/timer.c b/kernel/timer.c
index 5db5a8d2681..c61a7949387 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -656,8 +656,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
656 656
657 debug_activate(timer, expires); 657 debug_activate(timer, expires);
658 658
659 new_base = __get_cpu_var(tvec_bases);
660
661 cpu = smp_processor_id(); 659 cpu = smp_processor_id();
662 660
663#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 661#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
@@ -1200,6 +1198,7 @@ void update_process_times(int user_tick)
1200 run_local_timers(); 1198 run_local_timers();
1201 rcu_check_callbacks(cpu, user_tick); 1199 rcu_check_callbacks(cpu, user_tick);
1202 printk_tick(); 1200 printk_tick();
1201 perf_event_do_pending();
1203 scheduler_tick(); 1202 scheduler_tick();
1204 run_posix_cpu_timers(p); 1203 run_posix_cpu_timers(p);
1205} 1204}
@@ -1211,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h)
1211{ 1210{
1212 struct tvec_base *base = __get_cpu_var(tvec_bases); 1211 struct tvec_base *base = __get_cpu_var(tvec_bases);
1213 1212
1214 perf_event_do_pending();
1215
1216 hrtimer_run_pending(); 1213 hrtimer_run_pending();
1217 1214
1218 if (time_after_eq(jiffies, base->timer_jiffies)) 1215 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b416512ad17..60e2ce0181e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,39 +12,37 @@ config NOP_TRACER
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help 14 help
15 See Documentation/trace/ftrace-implementation.txt 15 See Documentation/trace/ftrace-design.txt
16 16
17config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
18 bool 18 bool
19 help 19 help
20 See Documentation/trace/ftrace-implementation.txt 20 See Documentation/trace/ftrace-design.txt
21 21
22config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
23 bool 23 bool
24 help 24 help
25 See Documentation/trace/ftrace-implementation.txt 25 See Documentation/trace/ftrace-design.txt
26 26
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
36 help 34 help
37 See Documentation/trace/ftrace-implementation.txt 35 See Documentation/trace/ftrace-design.txt
38 36
39config HAVE_DYNAMIC_FTRACE 37config HAVE_DYNAMIC_FTRACE
40 bool 38 bool
41 help 39 help
42 See Documentation/trace/ftrace-implementation.txt 40 See Documentation/trace/ftrace-design.txt
43 41
44config HAVE_FTRACE_MCOUNT_RECORD 42config HAVE_FTRACE_MCOUNT_RECORD
45 bool 43 bool
46 help 44 help
47 See Documentation/trace/ftrace-implementation.txt 45 See Documentation/trace/ftrace-design.txt
48 46
49config HAVE_HW_BRANCH_TRACER 47config HAVE_HW_BRANCH_TRACER
50 bool 48 bool
@@ -52,7 +50,7 @@ config HAVE_HW_BRANCH_TRACER
52config HAVE_SYSCALL_TRACEPOINTS 50config HAVE_SYSCALL_TRACEPOINTS
53 bool 51 bool
54 help 52 help
55 See Documentation/trace/ftrace-implementation.txt 53 See Documentation/trace/ftrace-design.txt
56 54
57config TRACER_MAX_TRACE 55config TRACER_MAX_TRACE
58 bool 56 bool
@@ -83,7 +81,7 @@ config RING_BUFFER_ALLOW_SWAP
83# This allows those options to appear when no other tracer is selected. But the 81# This allows those options to appear when no other tracer is selected. But the
84# options do not appear when something else selects it. We need the two options 82# options do not appear when something else selects it. We need the two options
85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 83# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
86# hidding of the automatic options. 84# hiding of the automatic options.
87 85
88config TRACING 86config TRACING
89 bool 87 bool
@@ -119,7 +117,7 @@ menuconfig FTRACE
119 bool "Tracers" 117 bool "Tracers"
120 default y if DEBUG_KERNEL 118 default y if DEBUG_KERNEL
121 help 119 help
122 Enable the kernel tracing infrastructure. 120 Enable the kernel tracing infrastructure.
123 121
124if FTRACE 122if FTRACE
125 123
@@ -133,7 +131,7 @@ config FUNCTION_TRACER
133 help 131 help
134 Enable the kernel to trace every kernel function. This is done 132 Enable the kernel to trace every kernel function. This is done
135 by using a compiler feature to insert a small, 5-byte No-Operation 133 by using a compiler feature to insert a small, 5-byte No-Operation
136 instruction to the beginning of every kernel function, which NOP 134 instruction at the beginning of every kernel function, which NOP
137 sequence is then dynamically patched into a tracer call when 135 sequence is then dynamically patched into a tracer call when
138 tracing is enabled by the administrator. If it's runtime disabled 136 tracing is enabled by the administrator. If it's runtime disabled
139 (the bootup default), then the overhead of the instructions is very 137 (the bootup default), then the overhead of the instructions is very
@@ -150,7 +148,7 @@ config FUNCTION_GRAPH_TRACER
150 and its entry. 148 and its entry.
151 Its first purpose is to trace the duration of functions and 149 Its first purpose is to trace the duration of functions and
152 draw a call graph for each thread with some information like 150 draw a call graph for each thread with some information like
153 the return value. This is done by setting the current return 151 the return value. This is done by setting the current return
154 address on the current task structure into a stack of calls. 152 address on the current task structure into a stack of calls.
155 153
156 154
@@ -173,7 +171,7 @@ config IRQSOFF_TRACER
173 171
174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 172 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
175 173
176 (Note that kernel size and overhead increases with this option 174 (Note that kernel size and overhead increase with this option
177 enabled. This option and the preempt-off timing option can be 175 enabled. This option and the preempt-off timing option can be
178 used together or separately.) 176 used together or separately.)
179 177
@@ -186,7 +184,7 @@ config PREEMPT_TRACER
186 select TRACER_MAX_TRACE 184 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP 185 select RING_BUFFER_ALLOW_SWAP
188 help 186 help
189 This option measures the time spent in preemption off critical 187 This option measures the time spent in preemption-off critical
190 sections, with microsecond accuracy. 188 sections, with microsecond accuracy.
191 189
192 The default measurement method is a maximum search, which is 190 The default measurement method is a maximum search, which is
@@ -195,7 +193,7 @@ config PREEMPT_TRACER
195 193
196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 194 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
197 195
198 (Note that kernel size and overhead increases with this option 196 (Note that kernel size and overhead increase with this option
199 enabled. This option and the irqs-off timing option can be 197 enabled. This option and the irqs-off timing option can be
200 used together or separately.) 198 used together or separately.)
201 199
@@ -222,7 +220,7 @@ config ENABLE_DEFAULT_TRACERS
222 depends on !GENERIC_TRACER 220 depends on !GENERIC_TRACER
223 select TRACING 221 select TRACING
224 help 222 help
225 This tracer hooks to various trace points in the kernel 223 This tracer hooks to various trace points in the kernel,
226 allowing the user to pick and choose which trace point they 224 allowing the user to pick and choose which trace point they
227 want to trace. It also includes the sched_switch tracer plugin. 225 want to trace. It also includes the sched_switch tracer plugin.
228 226
@@ -265,19 +263,19 @@ choice
265 The likely/unlikely profiler only looks at the conditions that 263 The likely/unlikely profiler only looks at the conditions that
266 are annotated with a likely or unlikely macro. 264 are annotated with a likely or unlikely macro.
267 265
268 The "all branch" profiler will profile every if statement in the 266 The "all branch" profiler will profile every if-statement in the
269 kernel. This profiler will also enable the likely/unlikely 267 kernel. This profiler will also enable the likely/unlikely
270 profiler as well. 268 profiler.
271 269
272 Either of the above profilers add a bit of overhead to the system. 270 Either of the above profilers adds a bit of overhead to the system.
273 If unsure choose "No branch profiling". 271 If unsure, choose "No branch profiling".
274 272
275config BRANCH_PROFILE_NONE 273config BRANCH_PROFILE_NONE
276 bool "No branch profiling" 274 bool "No branch profiling"
277 help 275 help
278 No branch profiling. Branch profiling adds a bit of overhead. 276 No branch profiling. Branch profiling adds a bit of overhead.
279 Only enable it if you want to analyse the branching behavior. 277 Only enable it if you want to analyse the branching behavior.
280 Otherwise keep it disabled. 278 Otherwise keep it disabled.
281 279
282config PROFILE_ANNOTATED_BRANCHES 280config PROFILE_ANNOTATED_BRANCHES
283 bool "Trace likely/unlikely profiler" 281 bool "Trace likely/unlikely profiler"
@@ -288,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES
288 286
289 /sys/kernel/debug/tracing/profile_annotated_branch 287 /sys/kernel/debug/tracing/profile_annotated_branch
290 288
291 Note: this will add a significant overhead, only turn this 289 Note: this will add a significant overhead; only turn this
292 on if you need to profile the system's use of these macros. 290 on if you need to profile the system's use of these macros.
293 291
294config PROFILE_ALL_BRANCHES 292config PROFILE_ALL_BRANCHES
@@ -305,7 +303,7 @@ config PROFILE_ALL_BRANCHES
305 303
306 This configuration, when enabled, will impose a great overhead 304 This configuration, when enabled, will impose a great overhead
307 on the system. This should only be enabled when the system 305 on the system. This should only be enabled when the system
308 is to be analyzed 306 is to be analyzed in much detail.
309endchoice 307endchoice
310 308
311config TRACING_BRANCHES 309config TRACING_BRANCHES
@@ -335,10 +333,31 @@ config POWER_TRACER
335 depends on X86 333 depends on X86
336 select GENERIC_TRACER 334 select GENERIC_TRACER
337 help 335 help
338 This tracer helps developers to analyze and optimize the kernels 336 This tracer helps developers to analyze and optimize the kernel's
339 power management decisions, specifically the C-state and P-state 337 power management decisions, specifically the C-state and P-state
340 behavior. 338 behavior.
341 339
340config KSYM_TRACER
341 bool "Trace read and write access on kernel memory locations"
342 depends on HAVE_HW_BREAKPOINT
343 select TRACING
344 help
345 This tracer helps find read and write operations on any given kernel
346 symbol, i.e. any symbol listed in /proc/kallsyms.
347
348config PROFILE_KSYM_TRACER
349 bool "Profile all kernel memory accesses on 'watched' variables"
350 depends on KSYM_TRACER
351 help
352 This tracer profiles kernel accesses on variables watched through the
353 ksym tracer ftrace plugin. Depending upon the hardware, all read
354 and write operations on kernel variables can be monitored for
355 accesses.
356
357 The results will be displayed in:
358 /sys/kernel/debug/tracing/profile_ksym
359
360 Say N if unsure.
342 361
343config STACK_TRACER 362config STACK_TRACER
344 bool "Trace max stack" 363 bool "Trace max stack"
@@ -370,14 +389,14 @@ config HW_BRANCH_TRACER
370 select GENERIC_TRACER 389 select GENERIC_TRACER
371 help 390 help
372 This tracer records all branches on the system in a circular 391 This tracer records all branches on the system in a circular
373 buffer giving access to the last N branches for each cpu. 392 buffer, giving access to the last N branches for each cpu.
374 393
375config KMEMTRACE 394config KMEMTRACE
376 bool "Trace SLAB allocations" 395 bool "Trace SLAB allocations"
377 select GENERIC_TRACER 396 select GENERIC_TRACER
378 help 397 help
379 kmemtrace provides tracing for slab allocator functions, such as 398 kmemtrace provides tracing for slab allocator functions, such as
380 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 399 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
381 data is then fed to the userspace application in order to analyse 400 data is then fed to the userspace application in order to analyse
382 allocation hotspots, internal fragmentation and so on, making it 401 allocation hotspots, internal fragmentation and so on, making it
383 possible to see how well an allocator performs, as well as debug 402 possible to see how well an allocator performs, as well as debug
@@ -396,15 +415,15 @@ config WORKQUEUE_TRACER
396 bool "Trace workqueues" 415 bool "Trace workqueues"
397 select GENERIC_TRACER 416 select GENERIC_TRACER
398 help 417 help
399 The workqueue tracer provides some statistical informations 418 The workqueue tracer provides some statistical information
400 about each cpu workqueue thread such as the number of the 419 about each cpu workqueue thread such as the number of the
401 works inserted and executed since their creation. It can help 420 works inserted and executed since their creation. It can help
402 to evaluate the amount of work each of them have to perform. 421 to evaluate the amount of work each of them has to perform.
403 For example it can help a developer to decide whether he should 422 For example it can help a developer to decide whether he should
404 choose a per cpu workqueue instead of a singlethreaded one. 423 choose a per-cpu workqueue instead of a singlethreaded one.
405 424
406config BLK_DEV_IO_TRACE 425config BLK_DEV_IO_TRACE
407 bool "Support for tracing block io actions" 426 bool "Support for tracing block IO actions"
408 depends on SYSFS 427 depends on SYSFS
409 depends on BLOCK 428 depends on BLOCK
410 select RELAY 429 select RELAY
@@ -428,38 +447,55 @@ config BLK_DEV_IO_TRACE
428 447
429 If unsure, say N. 448 If unsure, say N.
430 449
450config KPROBE_EVENT
451 depends on KPROBES
452 depends on X86
453 bool "Enable kprobes-based dynamic events"
454 select TRACING
455 default y
456 help
457 This allows the user to add tracing events (similar to tracepoints)
458 on the fly via the ftrace interface. See
459 Documentation/trace/kprobetrace.txt for more details.
460
461 Those events can be inserted wherever kprobes can probe, and record
462 various register and memory values.
463
464 This option is also required by the perf-probe subcommand of perf tools.
465 If you want to use perf tools, this option is strongly recommended.
466
431config DYNAMIC_FTRACE 467config DYNAMIC_FTRACE
432 bool "enable/disable ftrace tracepoints dynamically" 468 bool "enable/disable ftrace tracepoints dynamically"
433 depends on FUNCTION_TRACER 469 depends on FUNCTION_TRACER
434 depends on HAVE_DYNAMIC_FTRACE 470 depends on HAVE_DYNAMIC_FTRACE
435 default y 471 default y
436 help 472 help
437 This option will modify all the calls to ftrace dynamically 473 This option will modify all the calls to ftrace dynamically
438 (will patch them out of the binary image and replaces them 474 (will patch them out of the binary image and replace them
439 with a No-Op instruction) as they are called. A table is 475 with a No-Op instruction) as they are called. A table is
440 created to dynamically enable them again. 476 created to dynamically enable them again.
441 477
442 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise 478 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
443 has native performance as long as no tracing is active. 479 otherwise has native performance as long as no tracing is active.
444 480
445 The changes to the code are done by a kernel thread that 481 The changes to the code are done by a kernel thread that
446 wakes up once a second and checks to see if any ftrace calls 482 wakes up once a second and checks to see if any ftrace calls
447 were made. If so, it runs stop_machine (stops all CPUS) 483 were made. If so, it runs stop_machine (stops all CPUS)
448 and modifies the code to jump over the call to ftrace. 484 and modifies the code to jump over the call to ftrace.
449 485
450config FUNCTION_PROFILER 486config FUNCTION_PROFILER
451 bool "Kernel function profiler" 487 bool "Kernel function profiler"
452 depends on FUNCTION_TRACER 488 depends on FUNCTION_TRACER
453 default n 489 default n
454 help 490 help
455 This option enables the kernel function profiler. A file is created 491 This option enables the kernel function profiler. A file is created
456 in debugfs called function_profile_enabled which defaults to zero. 492 in debugfs called function_profile_enabled which defaults to zero.
457 When a 1 is echoed into this file profiling begins, and when a 493 When a 1 is echoed into this file profiling begins, and when a
458 zero is entered, profiling stops. A file in the trace_stats 494 zero is entered, profiling stops. A "functions" file is created in
459 directory called functions, that show the list of functions that 495 the trace_stats directory; this file shows the list of functions that
460 have been hit and their counters. 496 have been hit and their counters.
461 497
462 If in doubt, say N 498 If in doubt, say N.
463 499
464config FTRACE_MCOUNT_RECORD 500config FTRACE_MCOUNT_RECORD
465 def_bool y 501 def_bool y
@@ -518,8 +554,8 @@ config RING_BUFFER_BENCHMARK
518 tristate "Ring buffer benchmark stress tester" 554 tristate "Ring buffer benchmark stress tester"
519 depends on RING_BUFFER 555 depends on RING_BUFFER
520 help 556 help
521 This option creates a test to stress the ring buffer and bench mark it. 557 This option creates a test to stress the ring buffer and benchmark it.
522 It creates its own ring buffer such that it will not interfer with 558 It creates its own ring buffer such that it will not interfere with
523 any other users of the ring buffer (such as ftrace). It then creates 559 any other users of the ring buffer (such as ftrace). It then creates
524 a producer and consumer that will run for 10 seconds and sleep for 560 a producer and consumer that will run for 10 seconds and sleep for
525 10 seconds. Each interval it will print out the number of events 561 10 seconds. Each interval it will print out the number of events
@@ -528,7 +564,7 @@ config RING_BUFFER_BENCHMARK
528 It does not disable interrupts or raise its priority, so it may be 564 It does not disable interrupts or raise its priority, so it may be
529 affected by processes that are running. 565 affected by processes that are running.
530 566
531 If unsure, say N 567 If unsure, say N.
532 568
533endif # FTRACE 569endif # FTRACE
534 570
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 26f03ac07c2..cd9ecd89ec7 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o 58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 59
58libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 37ba67e3326..1e6640f8045 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -60,6 +60,13 @@ static int last_ftrace_enabled;
60/* Quick disabling of function tracer. */ 60/* Quick disabling of function tracer. */
61int function_trace_stop; 61int function_trace_stop;
62 62
63/* List for set_ftrace_pid's pids. */
64LIST_HEAD(ftrace_pids);
65struct ftrace_pid {
66 struct list_head list;
67 struct pid *pid;
68};
69
63/* 70/*
64 * ftrace_disabled is set when an anomaly is discovered. 71 * ftrace_disabled is set when an anomaly is discovered.
65 * ftrace_disabled is much stronger than ftrace_enabled. 72 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
78ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
79ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
80 87
88#ifdef CONFIG_FUNCTION_GRAPH_TRACER
89static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
90#endif
91
81static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 92static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
82{ 93{
83 struct ftrace_ops *op = ftrace_list; 94 struct ftrace_ops *op = ftrace_list;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 else 166 else
156 func = ftrace_list_func; 167 func = ftrace_list_func;
157 168
158 if (ftrace_pid_trace) { 169 if (!list_empty(&ftrace_pids)) {
159 set_ftrace_pid_function(func); 170 set_ftrace_pid_function(func);
160 func = ftrace_pid_func; 171 func = ftrace_pid_func;
161 } 172 }
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
203 if (ftrace_list->next == &ftrace_list_end) { 214 if (ftrace_list->next == &ftrace_list_end) {
204 ftrace_func_t func = ftrace_list->func; 215 ftrace_func_t func = ftrace_list->func;
205 216
206 if (ftrace_pid_trace) { 217 if (!list_empty(&ftrace_pids)) {
207 set_ftrace_pid_function(func); 218 set_ftrace_pid_function(func);
208 func = ftrace_pid_func; 219 func = ftrace_pid_func;
209 } 220 }
@@ -231,7 +242,7 @@ static void ftrace_update_pid_func(void)
231 func = __ftrace_trace_function; 242 func = __ftrace_trace_function;
232#endif 243#endif
233 244
234 if (ftrace_pid_trace) { 245 if (!list_empty(&ftrace_pids)) {
235 set_ftrace_pid_function(func); 246 set_ftrace_pid_function(func);
236 func = ftrace_pid_func; 247 func = ftrace_pid_func;
237 } else { 248 } else {
@@ -740,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
740 out: 751 out:
741 mutex_unlock(&ftrace_profile_lock); 752 mutex_unlock(&ftrace_profile_lock);
742 753
743 filp->f_pos += cnt; 754 *ppos += cnt;
744 755
745 return cnt; 756 return cnt;
746} 757}
@@ -821,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
821} 832}
822#endif /* CONFIG_FUNCTION_PROFILER */ 833#endif /* CONFIG_FUNCTION_PROFILER */
823 834
824/* set when tracing only a pid */
825struct pid *ftrace_pid_trace;
826static struct pid * const ftrace_swapper_pid = &init_struct_pid; 835static struct pid * const ftrace_swapper_pid = &init_struct_pid;
827 836
828#ifdef CONFIG_DYNAMIC_FTRACE 837#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1261,12 +1270,34 @@ static int ftrace_update_code(struct module *mod)
1261 ftrace_new_addrs = p->newlist; 1270 ftrace_new_addrs = p->newlist;
1262 p->flags = 0L; 1271 p->flags = 0L;
1263 1272
1264 /* convert record (i.e, patch mcount-call with NOP) */ 1273 /*
1265 if (ftrace_code_disable(mod, p)) { 1274 * Do the initial record conversion from mcount jump
1266 p->flags |= FTRACE_FL_CONVERTED; 1275 * to the NOP instructions.
1267 ftrace_update_cnt++; 1276 */
1268 } else 1277 if (!ftrace_code_disable(mod, p)) {
1269 ftrace_free_rec(p); 1278 ftrace_free_rec(p);
1279 continue;
1280 }
1281
1282 p->flags |= FTRACE_FL_CONVERTED;
1283 ftrace_update_cnt++;
1284
1285 /*
1286 * If the tracing is enabled, go ahead and enable the record.
1287 *
1288 * The reason not to enable the record immediately is the
1289 * inherent check of ftrace_make_nop/ftrace_make_call for
1290 * correct previous instructions. Making first the NOP
1291 * conversion puts the module to the correct state, thus
1292 * passing the ftrace_make_call check.
1293 */
1294 if (ftrace_start_up) {
1295 int failed = __ftrace_replace_code(p, 1);
1296 if (failed) {
1297 ftrace_bug(failed, p->ip);
1298 ftrace_free_rec(p);
1299 }
1300 }
1270 } 1301 }
1271 1302
1272 stop = ftrace_now(raw_smp_processor_id()); 1303 stop = ftrace_now(raw_smp_processor_id());
@@ -1656,64 +1687,10 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1656 return ret; 1687 return ret;
1657} 1688}
1658 1689
1659enum {
1660 MATCH_FULL,
1661 MATCH_FRONT_ONLY,
1662 MATCH_MIDDLE_ONLY,
1663 MATCH_END_ONLY,
1664};
1665
1666/*
1667 * (static function - no need for kernel doc)
1668 *
1669 * Pass in a buffer containing a glob and this function will
1670 * set search to point to the search part of the buffer and
1671 * return the type of search it is (see enum above).
1672 * This does modify buff.
1673 *
1674 * Returns enum type.
1675 * search returns the pointer to use for comparison.
1676 * not returns 1 if buff started with a '!'
1677 * 0 otherwise.
1678 */
1679static int
1680ftrace_setup_glob(char *buff, int len, char **search, int *not)
1681{
1682 int type = MATCH_FULL;
1683 int i;
1684
1685 if (buff[0] == '!') {
1686 *not = 1;
1687 buff++;
1688 len--;
1689 } else
1690 *not = 0;
1691
1692 *search = buff;
1693
1694 for (i = 0; i < len; i++) {
1695 if (buff[i] == '*') {
1696 if (!i) {
1697 *search = buff + 1;
1698 type = MATCH_END_ONLY;
1699 } else {
1700 if (type == MATCH_END_ONLY)
1701 type = MATCH_MIDDLE_ONLY;
1702 else
1703 type = MATCH_FRONT_ONLY;
1704 buff[i] = 0;
1705 break;
1706 }
1707 }
1708 }
1709
1710 return type;
1711}
1712
1713static int ftrace_match(char *str, char *regex, int len, int type) 1690static int ftrace_match(char *str, char *regex, int len, int type)
1714{ 1691{
1715 int matched = 0; 1692 int matched = 0;
1716 char *ptr; 1693 int slen;
1717 1694
1718 switch (type) { 1695 switch (type) {
1719 case MATCH_FULL: 1696 case MATCH_FULL:
@@ -1729,8 +1706,8 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1729 matched = 1; 1706 matched = 1;
1730 break; 1707 break;
1731 case MATCH_END_ONLY: 1708 case MATCH_END_ONLY:
1732 ptr = strstr(str, regex); 1709 slen = strlen(str);
1733 if (ptr && (ptr[len] == 0)) 1710 if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
1734 matched = 1; 1711 matched = 1;
1735 break; 1712 break;
1736 } 1713 }
@@ -1747,7 +1724,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
1747 return ftrace_match(str, regex, len, type); 1724 return ftrace_match(str, regex, len, type);
1748} 1725}
1749 1726
1750static void ftrace_match_records(char *buff, int len, int enable) 1727static int ftrace_match_records(char *buff, int len, int enable)
1751{ 1728{
1752 unsigned int search_len; 1729 unsigned int search_len;
1753 struct ftrace_page *pg; 1730 struct ftrace_page *pg;
@@ -1756,9 +1733,10 @@ static void ftrace_match_records(char *buff, int len, int enable)
1756 char *search; 1733 char *search;
1757 int type; 1734 int type;
1758 int not; 1735 int not;
1736 int found = 0;
1759 1737
1760 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1738 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1761 type = ftrace_setup_glob(buff, len, &search, &not); 1739 type = filter_parse_regex(buff, len, &search, &not);
1762 1740
1763 search_len = strlen(search); 1741 search_len = strlen(search);
1764 1742
@@ -1773,6 +1751,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1773 rec->flags &= ~flag; 1751 rec->flags &= ~flag;
1774 else 1752 else
1775 rec->flags |= flag; 1753 rec->flags |= flag;
1754 found = 1;
1776 } 1755 }
1777 /* 1756 /*
1778 * Only enable filtering if we have a function that 1757 * Only enable filtering if we have a function that
@@ -1782,6 +1761,8 @@ static void ftrace_match_records(char *buff, int len, int enable)
1782 ftrace_filtered = 1; 1761 ftrace_filtered = 1;
1783 } while_for_each_ftrace_rec(); 1762 } while_for_each_ftrace_rec();
1784 mutex_unlock(&ftrace_lock); 1763 mutex_unlock(&ftrace_lock);
1764
1765 return found;
1785} 1766}
1786 1767
1787static int 1768static int
@@ -1803,7 +1784,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
1803 return 1; 1784 return 1;
1804} 1785}
1805 1786
1806static void ftrace_match_module_records(char *buff, char *mod, int enable) 1787static int ftrace_match_module_records(char *buff, char *mod, int enable)
1807{ 1788{
1808 unsigned search_len = 0; 1789 unsigned search_len = 0;
1809 struct ftrace_page *pg; 1790 struct ftrace_page *pg;
@@ -1812,6 +1793,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1812 char *search = buff; 1793 char *search = buff;
1813 unsigned long flag; 1794 unsigned long flag;
1814 int not = 0; 1795 int not = 0;
1796 int found = 0;
1815 1797
1816 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1798 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1817 1799
@@ -1826,7 +1808,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1826 } 1808 }
1827 1809
1828 if (strlen(buff)) { 1810 if (strlen(buff)) {
1829 type = ftrace_setup_glob(buff, strlen(buff), &search, &not); 1811 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1830 search_len = strlen(search); 1812 search_len = strlen(search);
1831 } 1813 }
1832 1814
@@ -1842,12 +1824,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1842 rec->flags &= ~flag; 1824 rec->flags &= ~flag;
1843 else 1825 else
1844 rec->flags |= flag; 1826 rec->flags |= flag;
1827 found = 1;
1845 } 1828 }
1846 if (enable && (rec->flags & FTRACE_FL_FILTER)) 1829 if (enable && (rec->flags & FTRACE_FL_FILTER))
1847 ftrace_filtered = 1; 1830 ftrace_filtered = 1;
1848 1831
1849 } while_for_each_ftrace_rec(); 1832 } while_for_each_ftrace_rec();
1850 mutex_unlock(&ftrace_lock); 1833 mutex_unlock(&ftrace_lock);
1834
1835 return found;
1851} 1836}
1852 1837
1853/* 1838/*
@@ -1876,8 +1861,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1876 if (!strlen(mod)) 1861 if (!strlen(mod))
1877 return -EINVAL; 1862 return -EINVAL;
1878 1863
1879 ftrace_match_module_records(func, mod, enable); 1864 if (ftrace_match_module_records(func, mod, enable))
1880 return 0; 1865 return 0;
1866 return -EINVAL;
1881} 1867}
1882 1868
1883static struct ftrace_func_command ftrace_mod_cmd = { 1869static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1991,7 +1977,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1991 int count = 0; 1977 int count = 0;
1992 char *search; 1978 char *search;
1993 1979
1994 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 1980 type = filter_parse_regex(glob, strlen(glob), &search, &not);
1995 len = strlen(search); 1981 len = strlen(search);
1996 1982
1997 /* we do not support '!' for function probes */ 1983 /* we do not support '!' for function probes */
@@ -2068,7 +2054,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2068 else if (glob) { 2054 else if (glob) {
2069 int not; 2055 int not;
2070 2056
2071 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2057 type = filter_parse_regex(glob, strlen(glob), &search, &not);
2072 len = strlen(search); 2058 len = strlen(search);
2073 2059
2074 /* we do not support '!' for function probes */ 2060 /* we do not support '!' for function probes */
@@ -2174,8 +2160,9 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2174 func = strsep(&next, ":"); 2160 func = strsep(&next, ":");
2175 2161
2176 if (!next) { 2162 if (!next) {
2177 ftrace_match_records(func, len, enable); 2163 if (ftrace_match_records(func, len, enable))
2178 return 0; 2164 return 0;
2165 return ret;
2179 } 2166 }
2180 2167
2181 /* command found */ 2168 /* command found */
@@ -2221,16 +2208,15 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2221 !trace_parser_cont(parser)) { 2208 !trace_parser_cont(parser)) {
2222 ret = ftrace_process_regex(parser->buffer, 2209 ret = ftrace_process_regex(parser->buffer,
2223 parser->idx, enable); 2210 parser->idx, enable);
2224 if (ret)
2225 goto out;
2226
2227 trace_parser_clear(parser); 2211 trace_parser_clear(parser);
2212 if (ret)
2213 goto out_unlock;
2228 } 2214 }
2229 2215
2230 ret = read; 2216 ret = read;
2231 2217out_unlock:
2232 mutex_unlock(&ftrace_regex_lock); 2218 mutex_unlock(&ftrace_regex_lock);
2233out: 2219
2234 return ret; 2220 return ret;
2235} 2221}
2236 2222
@@ -2312,6 +2298,32 @@ static int __init set_ftrace_filter(char *str)
2312} 2298}
2313__setup("ftrace_filter=", set_ftrace_filter); 2299__setup("ftrace_filter=", set_ftrace_filter);
2314 2300
2301#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2302static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2303static int __init set_graph_function(char *str)
2304{
2305 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
2306 return 1;
2307}
2308__setup("ftrace_graph_filter=", set_graph_function);
2309
2310static void __init set_ftrace_early_graph(char *buf)
2311{
2312 int ret;
2313 char *func;
2314
2315 while (buf) {
2316 func = strsep(&buf, ",");
2317 /* we allow only one expression at a time */
2318 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2319 func);
2320 if (ret)
2321 printk(KERN_DEBUG "ftrace: function %s not "
2322 "traceable\n", func);
2323 }
2324}
2325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2326
2315static void __init set_ftrace_early_filter(char *buf, int enable) 2327static void __init set_ftrace_early_filter(char *buf, int enable)
2316{ 2328{
2317 char *func; 2329 char *func;
@@ -2328,6 +2340,10 @@ static void __init set_ftrace_early_filters(void)
2328 set_ftrace_early_filter(ftrace_filter_buf, 1); 2340 set_ftrace_early_filter(ftrace_filter_buf, 1);
2329 if (ftrace_notrace_buf[0]) 2341 if (ftrace_notrace_buf[0])
2330 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2342 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2343#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2344 if (ftrace_graph_buf[0])
2345 set_ftrace_early_graph(ftrace_graph_buf);
2346#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2331} 2347}
2332 2348
2333static int 2349static int
@@ -2513,7 +2529,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2513 return -ENODEV; 2529 return -ENODEV;
2514 2530
2515 /* decode regex */ 2531 /* decode regex */
2516 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not); 2532 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2517 if (not) 2533 if (not)
2518 return -EINVAL; 2534 return -EINVAL;
2519 2535
@@ -2536,10 +2552,9 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2536 exists = true; 2552 exists = true;
2537 break; 2553 break;
2538 } 2554 }
2539 if (!exists) { 2555 if (!exists)
2540 array[(*idx)++] = rec->ip; 2556 array[(*idx)++] = rec->ip;
2541 found = 1; 2557 found = 1;
2542 }
2543 } 2558 }
2544 } while_for_each_ftrace_rec(); 2559 } while_for_each_ftrace_rec();
2545 2560
@@ -2624,7 +2639,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2624 return 0; 2639 return 0;
2625} 2640}
2626 2641
2627static int ftrace_convert_nops(struct module *mod, 2642static int ftrace_process_locs(struct module *mod,
2628 unsigned long *start, 2643 unsigned long *start,
2629 unsigned long *end) 2644 unsigned long *end)
2630{ 2645{
@@ -2684,7 +2699,7 @@ static void ftrace_init_module(struct module *mod,
2684{ 2699{
2685 if (ftrace_disabled || start == end) 2700 if (ftrace_disabled || start == end)
2686 return; 2701 return;
2687 ftrace_convert_nops(mod, start, end); 2702 ftrace_process_locs(mod, start, end);
2688} 2703}
2689 2704
2690static int ftrace_module_notify(struct notifier_block *self, 2705static int ftrace_module_notify(struct notifier_block *self,
@@ -2745,7 +2760,7 @@ void __init ftrace_init(void)
2745 2760
2746 last_ftrace_enabled = ftrace_enabled = 1; 2761 last_ftrace_enabled = ftrace_enabled = 1;
2747 2762
2748 ret = ftrace_convert_nops(NULL, 2763 ret = ftrace_process_locs(NULL,
2749 __start_mcount_loc, 2764 __start_mcount_loc,
2750 __stop_mcount_loc); 2765 __stop_mcount_loc);
2751 2766
@@ -2778,23 +2793,6 @@ static inline void ftrace_startup_enable(int command) { }
2778# define ftrace_shutdown_sysctl() do { } while (0) 2793# define ftrace_shutdown_sysctl() do { } while (0)
2779#endif /* CONFIG_DYNAMIC_FTRACE */ 2794#endif /* CONFIG_DYNAMIC_FTRACE */
2780 2795
2781static ssize_t
2782ftrace_pid_read(struct file *file, char __user *ubuf,
2783 size_t cnt, loff_t *ppos)
2784{
2785 char buf[64];
2786 int r;
2787
2788 if (ftrace_pid_trace == ftrace_swapper_pid)
2789 r = sprintf(buf, "swapper tasks\n");
2790 else if (ftrace_pid_trace)
2791 r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
2792 else
2793 r = sprintf(buf, "no pid\n");
2794
2795 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2796}
2797
2798static void clear_ftrace_swapper(void) 2796static void clear_ftrace_swapper(void)
2799{ 2797{
2800 struct task_struct *p; 2798 struct task_struct *p;
@@ -2845,14 +2843,12 @@ static void set_ftrace_pid(struct pid *pid)
2845 rcu_read_unlock(); 2843 rcu_read_unlock();
2846} 2844}
2847 2845
2848static void clear_ftrace_pid_task(struct pid **pid) 2846static void clear_ftrace_pid_task(struct pid *pid)
2849{ 2847{
2850 if (*pid == ftrace_swapper_pid) 2848 if (pid == ftrace_swapper_pid)
2851 clear_ftrace_swapper(); 2849 clear_ftrace_swapper();
2852 else 2850 else
2853 clear_ftrace_pid(*pid); 2851 clear_ftrace_pid(pid);
2854
2855 *pid = NULL;
2856} 2852}
2857 2853
2858static void set_ftrace_pid_task(struct pid *pid) 2854static void set_ftrace_pid_task(struct pid *pid)
@@ -2863,74 +2859,184 @@ static void set_ftrace_pid_task(struct pid *pid)
2863 set_ftrace_pid(pid); 2859 set_ftrace_pid(pid);
2864} 2860}
2865 2861
2866static ssize_t 2862static int ftrace_pid_add(int p)
2867ftrace_pid_write(struct file *filp, const char __user *ubuf,
2868 size_t cnt, loff_t *ppos)
2869{ 2863{
2870 struct pid *pid; 2864 struct pid *pid;
2871 char buf[64]; 2865 struct ftrace_pid *fpid;
2872 long val; 2866 int ret = -EINVAL;
2873 int ret;
2874 2867
2875 if (cnt >= sizeof(buf)) 2868 mutex_lock(&ftrace_lock);
2876 return -EINVAL;
2877 2869
2878 if (copy_from_user(&buf, ubuf, cnt)) 2870 if (!p)
2879 return -EFAULT; 2871 pid = ftrace_swapper_pid;
2872 else
2873 pid = find_get_pid(p);
2880 2874
2881 buf[cnt] = 0; 2875 if (!pid)
2876 goto out;
2882 2877
2883 ret = strict_strtol(buf, 10, &val); 2878 ret = 0;
2884 if (ret < 0)
2885 return ret;
2886 2879
2887 mutex_lock(&ftrace_lock); 2880 list_for_each_entry(fpid, &ftrace_pids, list)
2888 if (val < 0) { 2881 if (fpid->pid == pid)
2889 /* disable pid tracing */ 2882 goto out_put;
2890 if (!ftrace_pid_trace)
2891 goto out;
2892 2883
2893 clear_ftrace_pid_task(&ftrace_pid_trace); 2884 ret = -ENOMEM;
2894 2885
2895 } else { 2886 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
2896 /* swapper task is special */ 2887 if (!fpid)
2897 if (!val) { 2888 goto out_put;
2898 pid = ftrace_swapper_pid;
2899 if (pid == ftrace_pid_trace)
2900 goto out;
2901 } else {
2902 pid = find_get_pid(val);
2903 2889
2904 if (pid == ftrace_pid_trace) { 2890 list_add(&fpid->list, &ftrace_pids);
2905 put_pid(pid); 2891 fpid->pid = pid;
2906 goto out;
2907 }
2908 }
2909 2892
2910 if (ftrace_pid_trace) 2893 set_ftrace_pid_task(pid);
2911 clear_ftrace_pid_task(&ftrace_pid_trace);
2912 2894
2913 if (!pid) 2895 ftrace_update_pid_func();
2914 goto out; 2896 ftrace_startup_enable(0);
2915 2897
2916 ftrace_pid_trace = pid; 2898 mutex_unlock(&ftrace_lock);
2899 return 0;
2917 2900
2918 set_ftrace_pid_task(ftrace_pid_trace); 2901out_put:
2902 if (pid != ftrace_swapper_pid)
2903 put_pid(pid);
2904
2905out:
2906 mutex_unlock(&ftrace_lock);
2907 return ret;
2908}
2909
2910static void ftrace_pid_reset(void)
2911{
2912 struct ftrace_pid *fpid, *safe;
2913
2914 mutex_lock(&ftrace_lock);
2915 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
2916 struct pid *pid = fpid->pid;
2917
2918 clear_ftrace_pid_task(pid);
2919
2920 list_del(&fpid->list);
2921 kfree(fpid);
2919 } 2922 }
2920 2923
2921 /* update the function call */
2922 ftrace_update_pid_func(); 2924 ftrace_update_pid_func();
2923 ftrace_startup_enable(0); 2925 ftrace_startup_enable(0);
2924 2926
2925 out:
2926 mutex_unlock(&ftrace_lock); 2927 mutex_unlock(&ftrace_lock);
2928}
2927 2929
2928 return cnt; 2930static void *fpid_start(struct seq_file *m, loff_t *pos)
2931{
2932 mutex_lock(&ftrace_lock);
2933
2934 if (list_empty(&ftrace_pids) && (!*pos))
2935 return (void *) 1;
2936
2937 return seq_list_start(&ftrace_pids, *pos);
2938}
2939
2940static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
2941{
2942 if (v == (void *)1)
2943 return NULL;
2944
2945 return seq_list_next(v, &ftrace_pids, pos);
2946}
2947
2948static void fpid_stop(struct seq_file *m, void *p)
2949{
2950 mutex_unlock(&ftrace_lock);
2951}
2952
2953static int fpid_show(struct seq_file *m, void *v)
2954{
2955 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
2956
2957 if (v == (void *)1) {
2958 seq_printf(m, "no pid\n");
2959 return 0;
2960 }
2961
2962 if (fpid->pid == ftrace_swapper_pid)
2963 seq_printf(m, "swapper tasks\n");
2964 else
2965 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
2966
2967 return 0;
2968}
2969
2970static const struct seq_operations ftrace_pid_sops = {
2971 .start = fpid_start,
2972 .next = fpid_next,
2973 .stop = fpid_stop,
2974 .show = fpid_show,
2975};
2976
2977static int
2978ftrace_pid_open(struct inode *inode, struct file *file)
2979{
2980 int ret = 0;
2981
2982 if ((file->f_mode & FMODE_WRITE) &&
2983 (file->f_flags & O_TRUNC))
2984 ftrace_pid_reset();
2985
2986 if (file->f_mode & FMODE_READ)
2987 ret = seq_open(file, &ftrace_pid_sops);
2988
2989 return ret;
2990}
2991
2992static ssize_t
2993ftrace_pid_write(struct file *filp, const char __user *ubuf,
2994 size_t cnt, loff_t *ppos)
2995{
2996 char buf[64], *tmp;
2997 long val;
2998 int ret;
2999
3000 if (cnt >= sizeof(buf))
3001 return -EINVAL;
3002
3003 if (copy_from_user(&buf, ubuf, cnt))
3004 return -EFAULT;
3005
3006 buf[cnt] = 0;
3007
3008 /*
3009 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
3010 * to clean the filter quietly.
3011 */
3012 tmp = strstrip(buf);
3013 if (strlen(tmp) == 0)
3014 return 1;
3015
3016 ret = strict_strtol(tmp, 10, &val);
3017 if (ret < 0)
3018 return ret;
3019
3020 ret = ftrace_pid_add(val);
3021
3022 return ret ? ret : cnt;
3023}
3024
3025static int
3026ftrace_pid_release(struct inode *inode, struct file *file)
3027{
3028 if (file->f_mode & FMODE_READ)
3029 seq_release(inode, file);
3030
3031 return 0;
2929} 3032}
2930 3033
2931static const struct file_operations ftrace_pid_fops = { 3034static const struct file_operations ftrace_pid_fops = {
2932 .read = ftrace_pid_read, 3035 .open = ftrace_pid_open,
2933 .write = ftrace_pid_write, 3036 .write = ftrace_pid_write,
3037 .read = seq_read,
3038 .llseek = seq_lseek,
3039 .release = ftrace_pid_release,
2934}; 3040};
2935 3041
2936static __init int ftrace_init_debugfs(void) 3042static __init int ftrace_init_debugfs(void)
@@ -3293,4 +3399,3 @@ void ftrace_graph_stop(void)
3293 ftrace_stop(); 3399 ftrace_stop();
3294} 3400}
3295#endif 3401#endif
3296
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index e06c6e3d56a..9f4f565b01e 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -14,7 +14,5 @@
14#define CREATE_TRACE_POINTS 14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16 16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 17EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20 18
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d4ff0197054..8c1b2d29071 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
397 int ret; 397 int ret;
398 398
399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
400 "offset:0;\tsize:%u;\n", 400 "offset:0;\tsize:%u;\tsigned:%u;\n",
401 (unsigned int)sizeof(field.time_stamp)); 401 (unsigned int)sizeof(field.time_stamp),
402 (unsigned int)is_signed_type(u64));
402 403
403 ret = trace_seq_printf(s, "\tfield: local_t commit;\t" 404 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
404 "offset:%u;\tsize:%u;\n", 405 "offset:%u;\tsize:%u;\tsigned:%u;\n",
405 (unsigned int)offsetof(typeof(field), commit), 406 (unsigned int)offsetof(typeof(field), commit),
406 (unsigned int)sizeof(field.commit)); 407 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long));
407 409
408 ret = trace_seq_printf(s, "\tfield: char data;\t" 410 ret = trace_seq_printf(s, "\tfield: char data;\t"
409 "offset:%u;\tsize:%u;\n", 411 "offset:%u;\tsize:%u;\tsigned:%u;\n",
410 (unsigned int)offsetof(typeof(field), data), 412 (unsigned int)offsetof(typeof(field), data),
411 (unsigned int)BUF_PAGE_SIZE); 413 (unsigned int)BUF_PAGE_SIZE,
414 (unsigned int)is_signed_type(char));
412 415
413 return ret; 416 return ret;
414} 417}
@@ -420,7 +423,7 @@ struct ring_buffer_per_cpu {
420 int cpu; 423 int cpu;
421 struct ring_buffer *buffer; 424 struct ring_buffer *buffer;
422 spinlock_t reader_lock; /* serialize readers */ 425 spinlock_t reader_lock; /* serialize readers */
423 raw_spinlock_t lock; 426 arch_spinlock_t lock;
424 struct lock_class_key lock_key; 427 struct lock_class_key lock_key;
425 struct list_head *pages; 428 struct list_head *pages;
426 struct buffer_page *head_page; /* read from head */ 429 struct buffer_page *head_page; /* read from head */
@@ -461,6 +464,8 @@ struct ring_buffer_iter {
461 struct ring_buffer_per_cpu *cpu_buffer; 464 struct ring_buffer_per_cpu *cpu_buffer;
462 unsigned long head; 465 unsigned long head;
463 struct buffer_page *head_page; 466 struct buffer_page *head_page;
467 struct buffer_page *cache_reader_page;
468 unsigned long cache_read;
464 u64 read_stamp; 469 u64 read_stamp;
465}; 470};
466 471
@@ -483,7 +488,7 @@ struct ring_buffer_iter {
483/* Up this if you want to test the TIME_EXTENTS and normalization */ 488/* Up this if you want to test the TIME_EXTENTS and normalization */
484#define DEBUG_SHIFT 0 489#define DEBUG_SHIFT 0
485 490
486static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) 491static inline u64 rb_time_stamp(struct ring_buffer *buffer)
487{ 492{
488 /* shift to debug/test normalization and TIME_EXTENTS */ 493 /* shift to debug/test normalization and TIME_EXTENTS */
489 return buffer->clock() << DEBUG_SHIFT; 494 return buffer->clock() << DEBUG_SHIFT;
@@ -494,7 +499,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
494 u64 time; 499 u64 time;
495 500
496 preempt_disable_notrace(); 501 preempt_disable_notrace();
497 time = rb_time_stamp(buffer, cpu); 502 time = rb_time_stamp(buffer);
498 preempt_enable_no_resched_notrace(); 503 preempt_enable_no_resched_notrace();
499 504
500 return time; 505 return time;
@@ -599,7 +604,7 @@ static struct list_head *rb_list_head(struct list_head *list)
599} 604}
600 605
601/* 606/*
602 * rb_is_head_page - test if the give page is the head page 607 * rb_is_head_page - test if the given page is the head page
603 * 608 *
604 * Because the reader may move the head_page pointer, we can 609 * Because the reader may move the head_page pointer, we can
605 * not trust what the head page is (it may be pointing to 610 * not trust what the head page is (it may be pointing to
@@ -995,7 +1000,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
995 cpu_buffer->buffer = buffer; 1000 cpu_buffer->buffer = buffer;
996 spin_lock_init(&cpu_buffer->reader_lock); 1001 spin_lock_init(&cpu_buffer->reader_lock);
997 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1002 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
998 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1003 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
999 1004
1000 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1005 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1001 GFP_KERNEL, cpu_to_node(cpu)); 1006 GFP_KERNEL, cpu_to_node(cpu));
@@ -1190,9 +1195,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1190 struct list_head *p; 1195 struct list_head *p;
1191 unsigned i; 1196 unsigned i;
1192 1197
1193 atomic_inc(&cpu_buffer->record_disabled); 1198 spin_lock_irq(&cpu_buffer->reader_lock);
1194 synchronize_sched();
1195
1196 rb_head_page_deactivate(cpu_buffer); 1199 rb_head_page_deactivate(cpu_buffer);
1197 1200
1198 for (i = 0; i < nr_pages; i++) { 1201 for (i = 0; i < nr_pages; i++) {
@@ -1207,11 +1210,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1207 return; 1210 return;
1208 1211
1209 rb_reset_cpu(cpu_buffer); 1212 rb_reset_cpu(cpu_buffer);
1210
1211 rb_check_pages(cpu_buffer); 1213 rb_check_pages(cpu_buffer);
1212 1214
1213 atomic_dec(&cpu_buffer->record_disabled); 1215 spin_unlock_irq(&cpu_buffer->reader_lock);
1214
1215} 1216}
1216 1217
1217static void 1218static void
@@ -1222,9 +1223,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1222 struct list_head *p; 1223 struct list_head *p;
1223 unsigned i; 1224 unsigned i;
1224 1225
1225 atomic_inc(&cpu_buffer->record_disabled);
1226 synchronize_sched();
1227
1228 spin_lock_irq(&cpu_buffer->reader_lock); 1226 spin_lock_irq(&cpu_buffer->reader_lock);
1229 rb_head_page_deactivate(cpu_buffer); 1227 rb_head_page_deactivate(cpu_buffer);
1230 1228
@@ -1237,11 +1235,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1237 list_add_tail(&bpage->list, cpu_buffer->pages); 1235 list_add_tail(&bpage->list, cpu_buffer->pages);
1238 } 1236 }
1239 rb_reset_cpu(cpu_buffer); 1237 rb_reset_cpu(cpu_buffer);
1240 spin_unlock_irq(&cpu_buffer->reader_lock);
1241
1242 rb_check_pages(cpu_buffer); 1238 rb_check_pages(cpu_buffer);
1243 1239
1244 atomic_dec(&cpu_buffer->record_disabled); 1240 spin_unlock_irq(&cpu_buffer->reader_lock);
1245} 1241}
1246 1242
1247/** 1243/**
@@ -1249,11 +1245,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1249 * @buffer: the buffer to resize. 1245 * @buffer: the buffer to resize.
1250 * @size: the new size. 1246 * @size: the new size.
1251 * 1247 *
1252 * The tracer is responsible for making sure that the buffer is
1253 * not being used while changing the size.
1254 * Note: We may be able to change the above requirement by using
1255 * RCU synchronizations.
1256 *
1257 * Minimum size is 2 * BUF_PAGE_SIZE. 1248 * Minimum size is 2 * BUF_PAGE_SIZE.
1258 * 1249 *
1259 * Returns -1 on failure. 1250 * Returns -1 on failure.
@@ -1285,6 +1276,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1285 if (size == buffer_size) 1276 if (size == buffer_size)
1286 return size; 1277 return size;
1287 1278
1279 atomic_inc(&buffer->record_disabled);
1280
1281 /* Make sure all writers are done with this buffer. */
1282 synchronize_sched();
1283
1288 mutex_lock(&buffer->mutex); 1284 mutex_lock(&buffer->mutex);
1289 get_online_cpus(); 1285 get_online_cpus();
1290 1286
@@ -1347,6 +1343,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1347 put_online_cpus(); 1343 put_online_cpus();
1348 mutex_unlock(&buffer->mutex); 1344 mutex_unlock(&buffer->mutex);
1349 1345
1346 atomic_dec(&buffer->record_disabled);
1347
1350 return size; 1348 return size;
1351 1349
1352 free_pages: 1350 free_pages:
@@ -1356,6 +1354,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1356 } 1354 }
1357 put_online_cpus(); 1355 put_online_cpus();
1358 mutex_unlock(&buffer->mutex); 1356 mutex_unlock(&buffer->mutex);
1357 atomic_dec(&buffer->record_disabled);
1359 return -ENOMEM; 1358 return -ENOMEM;
1360 1359
1361 /* 1360 /*
@@ -1365,6 +1364,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1365 out_fail: 1364 out_fail:
1366 put_online_cpus(); 1365 put_online_cpus();
1367 mutex_unlock(&buffer->mutex); 1366 mutex_unlock(&buffer->mutex);
1367 atomic_dec(&buffer->record_disabled);
1368 return -1; 1368 return -1;
1369} 1369}
1370EXPORT_SYMBOL_GPL(ring_buffer_resize); 1370EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -1785,9 +1785,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1785static struct ring_buffer_event * 1785static struct ring_buffer_event *
1786rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1786rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1787 unsigned long length, unsigned long tail, 1787 unsigned long length, unsigned long tail,
1788 struct buffer_page *commit_page,
1789 struct buffer_page *tail_page, u64 *ts) 1788 struct buffer_page *tail_page, u64 *ts)
1790{ 1789{
1790 struct buffer_page *commit_page = cpu_buffer->commit_page;
1791 struct ring_buffer *buffer = cpu_buffer->buffer; 1791 struct ring_buffer *buffer = cpu_buffer->buffer;
1792 struct buffer_page *next_page; 1792 struct buffer_page *next_page;
1793 int ret; 1793 int ret;
@@ -1868,7 +1868,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1868 * Nested commits always have zero deltas, so 1868 * Nested commits always have zero deltas, so
1869 * just reread the time stamp 1869 * just reread the time stamp
1870 */ 1870 */
1871 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1871 *ts = rb_time_stamp(buffer);
1872 next_page->page->time_stamp = *ts; 1872 next_page->page->time_stamp = *ts;
1873 } 1873 }
1874 1874
@@ -1890,13 +1890,10 @@ static struct ring_buffer_event *
1890__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1890__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1891 unsigned type, unsigned long length, u64 *ts) 1891 unsigned type, unsigned long length, u64 *ts)
1892{ 1892{
1893 struct buffer_page *tail_page, *commit_page; 1893 struct buffer_page *tail_page;
1894 struct ring_buffer_event *event; 1894 struct ring_buffer_event *event;
1895 unsigned long tail, write; 1895 unsigned long tail, write;
1896 1896
1897 commit_page = cpu_buffer->commit_page;
1898 /* we just need to protect against interrupts */
1899 barrier();
1900 tail_page = cpu_buffer->tail_page; 1897 tail_page = cpu_buffer->tail_page;
1901 write = local_add_return(length, &tail_page->write); 1898 write = local_add_return(length, &tail_page->write);
1902 1899
@@ -1907,7 +1904,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1907 /* See if we shot pass the end of this buffer page */ 1904 /* See if we shot pass the end of this buffer page */
1908 if (write > BUF_PAGE_SIZE) 1905 if (write > BUF_PAGE_SIZE)
1909 return rb_move_tail(cpu_buffer, length, tail, 1906 return rb_move_tail(cpu_buffer, length, tail,
1910 commit_page, tail_page, ts); 1907 tail_page, ts);
1911 1908
1912 /* We reserved something on the buffer */ 1909 /* We reserved something on the buffer */
1913 1910
@@ -2111,7 +2108,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2111 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2108 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
2112 goto out_fail; 2109 goto out_fail;
2113 2110
2114 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 2111 ts = rb_time_stamp(cpu_buffer->buffer);
2115 2112
2116 /* 2113 /*
2117 * Only the first commit can update the timestamp. 2114 * Only the first commit can update the timestamp.
@@ -2681,7 +2678,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2681EXPORT_SYMBOL_GPL(ring_buffer_entries); 2678EXPORT_SYMBOL_GPL(ring_buffer_entries);
2682 2679
2683/** 2680/**
2684 * ring_buffer_overrun_cpu - get the number of overruns in buffer 2681 * ring_buffer_overruns - get the number of overruns in buffer
2685 * @buffer: The ring buffer 2682 * @buffer: The ring buffer
2686 * 2683 *
2687 * Returns the total number of overruns in the ring buffer 2684 * Returns the total number of overruns in the ring buffer
@@ -2721,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2721 iter->read_stamp = cpu_buffer->read_stamp; 2718 iter->read_stamp = cpu_buffer->read_stamp;
2722 else 2719 else
2723 iter->read_stamp = iter->head_page->page->time_stamp; 2720 iter->read_stamp = iter->head_page->page->time_stamp;
2721 iter->cache_reader_page = cpu_buffer->reader_page;
2722 iter->cache_read = cpu_buffer->read;
2724} 2723}
2725 2724
2726/** 2725/**
@@ -2832,7 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2832 int ret; 2831 int ret;
2833 2832
2834 local_irq_save(flags); 2833 local_irq_save(flags);
2835 __raw_spin_lock(&cpu_buffer->lock); 2834 arch_spin_lock(&cpu_buffer->lock);
2836 2835
2837 again: 2836 again:
2838 /* 2837 /*
@@ -2874,7 +2873,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2874 * Splice the empty reader page into the list around the head. 2873 * Splice the empty reader page into the list around the head.
2875 */ 2874 */
2876 reader = rb_set_head_page(cpu_buffer); 2875 reader = rb_set_head_page(cpu_buffer);
2877 cpu_buffer->reader_page->list.next = reader->list.next; 2876 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
2878 cpu_buffer->reader_page->list.prev = reader->list.prev; 2877 cpu_buffer->reader_page->list.prev = reader->list.prev;
2879 2878
2880 /* 2879 /*
@@ -2911,7 +2910,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2911 * 2910 *
2912 * Now make the new head point back to the reader page. 2911 * Now make the new head point back to the reader page.
2913 */ 2912 */
2914 reader->list.next->prev = &cpu_buffer->reader_page->list; 2913 rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
2915 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2914 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2916 2915
2917 /* Finally update the reader page to the new head */ 2916 /* Finally update the reader page to the new head */
@@ -2921,7 +2920,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2921 goto again; 2920 goto again;
2922 2921
2923 out: 2922 out:
2924 __raw_spin_unlock(&cpu_buffer->lock); 2923 arch_spin_unlock(&cpu_buffer->lock);
2925 local_irq_restore(flags); 2924 local_irq_restore(flags);
2926 2925
2927 return reader; 2926 return reader;
@@ -3065,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3065 struct ring_buffer_event *event; 3064 struct ring_buffer_event *event;
3066 int nr_loops = 0; 3065 int nr_loops = 0;
3067 3066
3068 if (ring_buffer_iter_empty(iter))
3069 return NULL;
3070
3071 cpu_buffer = iter->cpu_buffer; 3067 cpu_buffer = iter->cpu_buffer;
3072 buffer = cpu_buffer->buffer; 3068 buffer = cpu_buffer->buffer;
3073 3069
3070 /*
3071 * Check if someone performed a consuming read to
3072 * the buffer. A consuming read invalidates the iterator
3073 * and we need to reset the iterator in this case.
3074 */
3075 if (unlikely(iter->cache_read != cpu_buffer->read ||
3076 iter->cache_reader_page != cpu_buffer->reader_page))
3077 rb_iter_reset(iter);
3078
3074 again: 3079 again:
3080 if (ring_buffer_iter_empty(iter))
3081 return NULL;
3082
3075 /* 3083 /*
3076 * We repeat when a timestamp is encountered. 3084 * We repeat when a timestamp is encountered.
3077 * We can get multiple timestamps by nested interrupts or also 3085 * We can get multiple timestamps by nested interrupts or also
@@ -3086,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3086 if (rb_per_cpu_empty(cpu_buffer)) 3094 if (rb_per_cpu_empty(cpu_buffer))
3087 return NULL; 3095 return NULL;
3088 3096
3097 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3098 rb_inc_iter(iter);
3099 goto again;
3100 }
3101
3089 event = rb_iter_head_event(iter); 3102 event = rb_iter_head_event(iter);
3090 3103
3091 switch (event->type_len) { 3104 switch (event->type_len) {
@@ -3284,9 +3297,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3284 synchronize_sched(); 3297 synchronize_sched();
3285 3298
3286 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3299 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3287 __raw_spin_lock(&cpu_buffer->lock); 3300 arch_spin_lock(&cpu_buffer->lock);
3288 rb_iter_reset(iter); 3301 rb_iter_reset(iter);
3289 __raw_spin_unlock(&cpu_buffer->lock); 3302 arch_spin_unlock(&cpu_buffer->lock);
3290 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3303 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3291 3304
3292 return iter; 3305 return iter;
@@ -3406,11 +3419,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3406 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3419 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3407 goto out; 3420 goto out;
3408 3421
3409 __raw_spin_lock(&cpu_buffer->lock); 3422 arch_spin_lock(&cpu_buffer->lock);
3410 3423
3411 rb_reset_cpu(cpu_buffer); 3424 rb_reset_cpu(cpu_buffer);
3412 3425
3413 __raw_spin_unlock(&cpu_buffer->lock); 3426 arch_spin_unlock(&cpu_buffer->lock);
3414 3427
3415 out: 3428 out:
3416 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3429 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c..b2477caf09c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -35,6 +35,28 @@ static int disable_reader;
35module_param(disable_reader, uint, 0644); 35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer"); 36MODULE_PARM_DESC(disable_reader, "only run producer");
37 37
38static int write_iteration = 50;
39module_param(write_iteration, uint, 0644);
40MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
41
42static int producer_nice = 19;
43static int consumer_nice = 19;
44
45static int producer_fifo = -1;
46static int consumer_fifo = -1;
47
48module_param(producer_nice, uint, 0644);
49MODULE_PARM_DESC(producer_nice, "nice prio for producer");
50
51module_param(consumer_nice, uint, 0644);
52MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
53
54module_param(producer_fifo, uint, 0644);
55MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
56
57module_param(consumer_fifo, uint, 0644);
58MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
59
38static int read_events; 60static int read_events;
39 61
40static int kill_test; 62static int kill_test;
@@ -208,15 +230,18 @@ static void ring_buffer_producer(void)
208 do { 230 do {
209 struct ring_buffer_event *event; 231 struct ring_buffer_event *event;
210 int *entry; 232 int *entry;
211 233 int i;
212 event = ring_buffer_lock_reserve(buffer, 10); 234
213 if (!event) { 235 for (i = 0; i < write_iteration; i++) {
214 missed++; 236 event = ring_buffer_lock_reserve(buffer, 10);
215 } else { 237 if (!event) {
216 hit++; 238 missed++;
217 entry = ring_buffer_event_data(event); 239 } else {
218 *entry = smp_processor_id(); 240 hit++;
219 ring_buffer_unlock_commit(buffer, event); 241 entry = ring_buffer_event_data(event);
242 *entry = smp_processor_id();
243 ring_buffer_unlock_commit(buffer, event);
244 }
220 } 245 }
221 do_gettimeofday(&end_tv); 246 do_gettimeofday(&end_tv);
222 247
@@ -263,6 +288,27 @@ static void ring_buffer_producer(void)
263 288
264 if (kill_test) 289 if (kill_test)
265 trace_printk("ERROR!\n"); 290 trace_printk("ERROR!\n");
291
292 if (!disable_reader) {
293 if (consumer_fifo < 0)
294 trace_printk("Running Consumer at nice: %d\n",
295 consumer_nice);
296 else
297 trace_printk("Running Consumer at SCHED_FIFO %d\n",
298 consumer_fifo);
299 }
300 if (producer_fifo < 0)
301 trace_printk("Running Producer at nice: %d\n",
302 producer_nice);
303 else
304 trace_printk("Running Producer at SCHED_FIFO %d\n",
305 producer_fifo);
306
307 /* Let the user know that the test is running at low priority */
308 if (producer_fifo < 0 && consumer_fifo < 0 &&
309 producer_nice == 19 && consumer_nice == 19)
310 trace_printk("WARNING!!! This test is running at lowest priority.\n");
311
266 trace_printk("Time: %lld (usecs)\n", time); 312 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns); 313 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader) 314 if (disable_reader)
@@ -392,6 +438,27 @@ static int __init ring_buffer_benchmark_init(void)
392 if (IS_ERR(producer)) 438 if (IS_ERR(producer))
393 goto out_kill; 439 goto out_kill;
394 440
441 /*
442 * Run them as low-prio background tasks by default:
443 */
444 if (!disable_reader) {
445 if (consumer_fifo >= 0) {
446 struct sched_param param = {
447 .sched_priority = consumer_fifo
448 };
449 sched_setscheduler(consumer, SCHED_FIFO, &param);
450 } else
451 set_user_nice(consumer, consumer_nice);
452 }
453
454 if (producer_fifo >= 0) {
455 struct sched_param param = {
456 .sched_priority = consumer_fifo
457 };
458 sched_setscheduler(producer, SCHED_FIFO, &param);
459 } else
460 set_user_nice(producer, producer_nice);
461
395 return 0; 462 return 0;
396 463
397 out_kill: 464 out_kill:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45068269ebb..eac6875cb99 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -12,7 +12,7 @@
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <linux/utsrelease.h> 15#include <generated/utsrelease.h>
16#include <linux/stacktrace.h> 16#include <linux/stacktrace.h>
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
@@ -86,17 +86,17 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
86 */ 86 */
87static int tracing_disabled = 1; 87static int tracing_disabled = 1;
88 88
89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(int, ftrace_cpu_disabled);
90 90
91static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
92{ 92{
93 preempt_disable(); 93 preempt_disable();
94 local_inc(&__get_cpu_var(ftrace_cpu_disabled)); 94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled));
95} 95}
96 96
97static inline void ftrace_enable_cpu(void) 97static inline void ftrace_enable_cpu(void)
98{ 98{
99 local_dec(&__get_cpu_var(ftrace_cpu_disabled)); 99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled));
100 preempt_enable(); 100 preempt_enable();
101} 101}
102 102
@@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf);
129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 130static char *default_bootup_tracer;
131 131
132static int __init set_ftrace(char *str) 132static int __init set_cmdline_ftrace(char *str)
133{ 133{
134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
@@ -137,7 +137,7 @@ static int __init set_ftrace(char *str)
137 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
138 return 1; 138 return 1;
139} 139}
140__setup("ftrace=", set_ftrace); 140__setup("ftrace=", set_cmdline_ftrace);
141 141
142static int __init set_ftrace_dump_on_oops(char *str) 142static int __init set_ftrace_dump_on_oops(char *str)
143{ 143{
@@ -203,7 +203,7 @@ cycle_t ftrace_now(int cpu)
203 */ 203 */
204static struct trace_array max_tr; 204static struct trace_array max_tr;
205 205
206static DEFINE_PER_CPU(struct trace_array_cpu, max_data); 206static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
207 207
208/* tracer_enabled is used to toggle activation of a tracer */ 208/* tracer_enabled is used to toggle activation of a tracer */
209static int tracer_enabled = 1; 209static int tracer_enabled = 1;
@@ -313,7 +313,6 @@ static const char *trace_options[] = {
313 "bin", 313 "bin",
314 "block", 314 "block",
315 "stacktrace", 315 "stacktrace",
316 "sched-tree",
317 "trace_printk", 316 "trace_printk",
318 "ftrace_preempt", 317 "ftrace_preempt",
319 "branch", 318 "branch",
@@ -493,15 +492,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
493 * protected by per_cpu spinlocks. But the action of the swap 492 * protected by per_cpu spinlocks. But the action of the swap
494 * needs its own lock. 493 * needs its own lock.
495 * 494 *
496 * This is defined as a raw_spinlock_t in order to help 495 * This is defined as a arch_spinlock_t in order to help
497 * with performance when lockdep debugging is enabled. 496 * with performance when lockdep debugging is enabled.
498 * 497 *
499 * It is also used in other places outside the update_max_tr 498 * It is also used in other places outside the update_max_tr
500 * so it needs to be defined outside of the 499 * so it needs to be defined outside of the
501 * CONFIG_TRACER_MAX_TRACE. 500 * CONFIG_TRACER_MAX_TRACE.
502 */ 501 */
503static raw_spinlock_t ftrace_max_lock = 502static arch_spinlock_t ftrace_max_lock =
504 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 503 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
505 504
506#ifdef CONFIG_TRACER_MAX_TRACE 505#ifdef CONFIG_TRACER_MAX_TRACE
507unsigned long __read_mostly tracing_max_latency; 506unsigned long __read_mostly tracing_max_latency;
@@ -555,13 +554,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
555 return; 554 return;
556 555
557 WARN_ON_ONCE(!irqs_disabled()); 556 WARN_ON_ONCE(!irqs_disabled());
558 __raw_spin_lock(&ftrace_max_lock); 557 arch_spin_lock(&ftrace_max_lock);
559 558
560 tr->buffer = max_tr.buffer; 559 tr->buffer = max_tr.buffer;
561 max_tr.buffer = buf; 560 max_tr.buffer = buf;
562 561
563 __update_max_tr(tr, tsk, cpu); 562 __update_max_tr(tr, tsk, cpu);
564 __raw_spin_unlock(&ftrace_max_lock); 563 arch_spin_unlock(&ftrace_max_lock);
565} 564}
566 565
567/** 566/**
@@ -581,7 +580,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
581 return; 580 return;
582 581
583 WARN_ON_ONCE(!irqs_disabled()); 582 WARN_ON_ONCE(!irqs_disabled());
584 __raw_spin_lock(&ftrace_max_lock); 583 arch_spin_lock(&ftrace_max_lock);
585 584
586 ftrace_disable_cpu(); 585 ftrace_disable_cpu();
587 586
@@ -603,7 +602,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 602 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
604 603
605 __update_max_tr(tr, tsk, cpu); 604 __update_max_tr(tr, tsk, cpu);
606 __raw_spin_unlock(&ftrace_max_lock); 605 arch_spin_unlock(&ftrace_max_lock);
607} 606}
608#endif /* CONFIG_TRACER_MAX_TRACE */ 607#endif /* CONFIG_TRACER_MAX_TRACE */
609 608
@@ -802,7 +801,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
802static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 801static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
803static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 802static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
804static int cmdline_idx; 803static int cmdline_idx;
805static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; 804static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
806 805
807/* temporary disable recording */ 806/* temporary disable recording */
808static atomic_t trace_record_cmdline_disabled __read_mostly; 807static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -915,7 +914,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
915 * nor do we want to disable interrupts, 914 * nor do we want to disable interrupts,
916 * so if we miss here, then better luck next time. 915 * so if we miss here, then better luck next time.
917 */ 916 */
918 if (!__raw_spin_trylock(&trace_cmdline_lock)) 917 if (!arch_spin_trylock(&trace_cmdline_lock))
919 return; 918 return;
920 919
921 idx = map_pid_to_cmdline[tsk->pid]; 920 idx = map_pid_to_cmdline[tsk->pid];
@@ -940,7 +939,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
940 939
941 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 940 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
942 941
943 __raw_spin_unlock(&trace_cmdline_lock); 942 arch_spin_unlock(&trace_cmdline_lock);
944} 943}
945 944
946void trace_find_cmdline(int pid, char comm[]) 945void trace_find_cmdline(int pid, char comm[])
@@ -952,20 +951,25 @@ void trace_find_cmdline(int pid, char comm[])
952 return; 951 return;
953 } 952 }
954 953
954 if (WARN_ON_ONCE(pid < 0)) {
955 strcpy(comm, "<XXX>");
956 return;
957 }
958
955 if (pid > PID_MAX_DEFAULT) { 959 if (pid > PID_MAX_DEFAULT) {
956 strcpy(comm, "<...>"); 960 strcpy(comm, "<...>");
957 return; 961 return;
958 } 962 }
959 963
960 preempt_disable(); 964 preempt_disable();
961 __raw_spin_lock(&trace_cmdline_lock); 965 arch_spin_lock(&trace_cmdline_lock);
962 map = map_pid_to_cmdline[pid]; 966 map = map_pid_to_cmdline[pid];
963 if (map != NO_CMDLINE_MAP) 967 if (map != NO_CMDLINE_MAP)
964 strcpy(comm, saved_cmdlines[map]); 968 strcpy(comm, saved_cmdlines[map]);
965 else 969 else
966 strcpy(comm, "<...>"); 970 strcpy(comm, "<...>");
967 971
968 __raw_spin_unlock(&trace_cmdline_lock); 972 arch_spin_unlock(&trace_cmdline_lock);
969 preempt_enable(); 973 preempt_enable();
970} 974}
971 975
@@ -1085,7 +1089,7 @@ trace_function(struct trace_array *tr,
1085 struct ftrace_entry *entry; 1089 struct ftrace_entry *entry;
1086 1090
1087 /* If we are reading the ring buffer, don't trace */ 1091 /* If we are reading the ring buffer, don't trace */
1088 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1092 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
1089 return; 1093 return;
1090 1094
1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1095 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1151,6 +1155,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1151 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1155 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1152} 1156}
1153 1157
1158/**
1159 * trace_dump_stack - record a stack back trace in the trace buffer
1160 */
1161void trace_dump_stack(void)
1162{
1163 unsigned long flags;
1164
1165 if (tracing_disabled || tracing_selftest_running)
1166 return;
1167
1168 local_save_flags(flags);
1169
1170 /* skipping 3 traces, seems to get us at the caller of this function */
1171 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1172}
1173
1154void 1174void
1155ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1175ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1156{ 1176{
@@ -1251,8 +1271,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1251 */ 1271 */
1252int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1272int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1253{ 1273{
1254 static raw_spinlock_t trace_buf_lock = 1274 static arch_spinlock_t trace_buf_lock =
1255 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1275 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1256 static u32 trace_buf[TRACE_BUF_SIZE]; 1276 static u32 trace_buf[TRACE_BUF_SIZE];
1257 1277
1258 struct ftrace_event_call *call = &event_bprint; 1278 struct ftrace_event_call *call = &event_bprint;
@@ -1283,7 +1303,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1283 1303
1284 /* Lockdep uses trace_printk for lock tracing */ 1304 /* Lockdep uses trace_printk for lock tracing */
1285 local_irq_save(flags); 1305 local_irq_save(flags);
1286 __raw_spin_lock(&trace_buf_lock); 1306 arch_spin_lock(&trace_buf_lock);
1287 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1307 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1288 1308
1289 if (len > TRACE_BUF_SIZE || len < 0) 1309 if (len > TRACE_BUF_SIZE || len < 0)
@@ -1304,7 +1324,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1304 ring_buffer_unlock_commit(buffer, event); 1324 ring_buffer_unlock_commit(buffer, event);
1305 1325
1306out_unlock: 1326out_unlock:
1307 __raw_spin_unlock(&trace_buf_lock); 1327 arch_spin_unlock(&trace_buf_lock);
1308 local_irq_restore(flags); 1328 local_irq_restore(flags);
1309 1329
1310out: 1330out:
@@ -1334,7 +1354,7 @@ int trace_array_printk(struct trace_array *tr,
1334int trace_array_vprintk(struct trace_array *tr, 1354int trace_array_vprintk(struct trace_array *tr,
1335 unsigned long ip, const char *fmt, va_list args) 1355 unsigned long ip, const char *fmt, va_list args)
1336{ 1356{
1337 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1357 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1338 static char trace_buf[TRACE_BUF_SIZE]; 1358 static char trace_buf[TRACE_BUF_SIZE];
1339 1359
1340 struct ftrace_event_call *call = &event_print; 1360 struct ftrace_event_call *call = &event_print;
@@ -1360,12 +1380,9 @@ int trace_array_vprintk(struct trace_array *tr,
1360 1380
1361 pause_graph_tracing(); 1381 pause_graph_tracing();
1362 raw_local_irq_save(irq_flags); 1382 raw_local_irq_save(irq_flags);
1363 __raw_spin_lock(&trace_buf_lock); 1383 arch_spin_lock(&trace_buf_lock);
1364 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1384 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1365 1385
1366 len = min(len, TRACE_BUF_SIZE-1);
1367 trace_buf[len] = 0;
1368
1369 size = sizeof(*entry) + len + 1; 1386 size = sizeof(*entry) + len + 1;
1370 buffer = tr->buffer; 1387 buffer = tr->buffer;
1371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1388 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
@@ -1373,15 +1390,15 @@ int trace_array_vprintk(struct trace_array *tr,
1373 if (!event) 1390 if (!event)
1374 goto out_unlock; 1391 goto out_unlock;
1375 entry = ring_buffer_event_data(event); 1392 entry = ring_buffer_event_data(event);
1376 entry->ip = ip; 1393 entry->ip = ip;
1377 1394
1378 memcpy(&entry->buf, trace_buf, len); 1395 memcpy(&entry->buf, trace_buf, len);
1379 entry->buf[len] = 0; 1396 entry->buf[len] = '\0';
1380 if (!filter_check_discard(call, entry, buffer, event)) 1397 if (!filter_check_discard(call, entry, buffer, event))
1381 ring_buffer_unlock_commit(buffer, event); 1398 ring_buffer_unlock_commit(buffer, event);
1382 1399
1383 out_unlock: 1400 out_unlock:
1384 __raw_spin_unlock(&trace_buf_lock); 1401 arch_spin_unlock(&trace_buf_lock);
1385 raw_local_irq_restore(irq_flags); 1402 raw_local_irq_restore(irq_flags);
1386 unpause_graph_tracing(); 1403 unpause_graph_tracing();
1387 out: 1404 out:
@@ -1393,7 +1410,7 @@ int trace_array_vprintk(struct trace_array *tr,
1393 1410
1394int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1411int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1395{ 1412{
1396 return trace_array_printk(&global_trace, ip, fmt, args); 1413 return trace_array_vprintk(&global_trace, ip, fmt, args);
1397} 1414}
1398EXPORT_SYMBOL_GPL(trace_vprintk); 1415EXPORT_SYMBOL_GPL(trace_vprintk);
1399 1416
@@ -1515,6 +1532,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1515 int i = (int)*pos; 1532 int i = (int)*pos;
1516 void *ent; 1533 void *ent;
1517 1534
1535 WARN_ON_ONCE(iter->leftover);
1536
1518 (*pos)++; 1537 (*pos)++;
1519 1538
1520 /* can't go backwards */ 1539 /* can't go backwards */
@@ -1613,8 +1632,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1613 ; 1632 ;
1614 1633
1615 } else { 1634 } else {
1616 l = *pos - 1; 1635 /*
1617 p = s_next(m, p, &l); 1636 * If we overflowed the seq_file before, then we want
1637 * to just reuse the trace_seq buffer again.
1638 */
1639 if (iter->leftover)
1640 p = iter;
1641 else {
1642 l = *pos - 1;
1643 p = s_next(m, p, &l);
1644 }
1618 } 1645 }
1619 1646
1620 trace_event_read_lock(); 1647 trace_event_read_lock();
@@ -1922,6 +1949,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1922static int s_show(struct seq_file *m, void *v) 1949static int s_show(struct seq_file *m, void *v)
1923{ 1950{
1924 struct trace_iterator *iter = v; 1951 struct trace_iterator *iter = v;
1952 int ret;
1925 1953
1926 if (iter->ent == NULL) { 1954 if (iter->ent == NULL) {
1927 if (iter->tr) { 1955 if (iter->tr) {
@@ -1941,9 +1969,27 @@ static int s_show(struct seq_file *m, void *v)
1941 if (!(trace_flags & TRACE_ITER_VERBOSE)) 1969 if (!(trace_flags & TRACE_ITER_VERBOSE))
1942 print_func_help_header(m); 1970 print_func_help_header(m);
1943 } 1971 }
1972 } else if (iter->leftover) {
1973 /*
1974 * If we filled the seq_file buffer earlier, we
1975 * want to just show it now.
1976 */
1977 ret = trace_print_seq(m, &iter->seq);
1978
1979 /* ret should this time be zero, but you never know */
1980 iter->leftover = ret;
1981
1944 } else { 1982 } else {
1945 print_trace_line(iter); 1983 print_trace_line(iter);
1946 trace_print_seq(m, &iter->seq); 1984 ret = trace_print_seq(m, &iter->seq);
1985 /*
1986 * If we overflow the seq_file buffer, then it will
1987 * ask us for this data again at start up.
1988 * Use that instead.
1989 * ret is 0 if seq_file write succeeded.
1990 * -1 otherwise.
1991 */
1992 iter->leftover = ret;
1947 } 1993 }
1948 1994
1949 return 0; 1995 return 0;
@@ -2253,7 +2299,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2253 mutex_lock(&tracing_cpumask_update_lock); 2299 mutex_lock(&tracing_cpumask_update_lock);
2254 2300
2255 local_irq_disable(); 2301 local_irq_disable();
2256 __raw_spin_lock(&ftrace_max_lock); 2302 arch_spin_lock(&ftrace_max_lock);
2257 for_each_tracing_cpu(cpu) { 2303 for_each_tracing_cpu(cpu) {
2258 /* 2304 /*
2259 * Increase/decrease the disabled counter if we are 2305 * Increase/decrease the disabled counter if we are
@@ -2268,7 +2314,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2268 atomic_dec(&global_trace.data[cpu]->disabled); 2314 atomic_dec(&global_trace.data[cpu]->disabled);
2269 } 2315 }
2270 } 2316 }
2271 __raw_spin_unlock(&ftrace_max_lock); 2317 arch_spin_unlock(&ftrace_max_lock);
2272 local_irq_enable(); 2318 local_irq_enable();
2273 2319
2274 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 2320 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@ -2290,67 +2336,49 @@ static const struct file_operations tracing_cpumask_fops = {
2290 .write = tracing_cpumask_write, 2336 .write = tracing_cpumask_write,
2291}; 2337};
2292 2338
2293static ssize_t 2339static int tracing_trace_options_show(struct seq_file *m, void *v)
2294tracing_trace_options_read(struct file *filp, char __user *ubuf,
2295 size_t cnt, loff_t *ppos)
2296{ 2340{
2297 struct tracer_opt *trace_opts; 2341 struct tracer_opt *trace_opts;
2298 u32 tracer_flags; 2342 u32 tracer_flags;
2299 int len = 0;
2300 char *buf;
2301 int r = 0;
2302 int i; 2343 int i;
2303 2344
2304
2305 /* calculate max size */
2306 for (i = 0; trace_options[i]; i++) {
2307 len += strlen(trace_options[i]);
2308 len += 3; /* "no" and newline */
2309 }
2310
2311 mutex_lock(&trace_types_lock); 2345 mutex_lock(&trace_types_lock);
2312 tracer_flags = current_trace->flags->val; 2346 tracer_flags = current_trace->flags->val;
2313 trace_opts = current_trace->flags->opts; 2347 trace_opts = current_trace->flags->opts;
2314 2348
2315 /*
2316 * Increase the size with names of options specific
2317 * of the current tracer.
2318 */
2319 for (i = 0; trace_opts[i].name; i++) {
2320 len += strlen(trace_opts[i].name);
2321 len += 3; /* "no" and newline */
2322 }
2323
2324 /* +1 for \0 */
2325 buf = kmalloc(len + 1, GFP_KERNEL);
2326 if (!buf) {
2327 mutex_unlock(&trace_types_lock);
2328 return -ENOMEM;
2329 }
2330
2331 for (i = 0; trace_options[i]; i++) { 2349 for (i = 0; trace_options[i]; i++) {
2332 if (trace_flags & (1 << i)) 2350 if (trace_flags & (1 << i))
2333 r += sprintf(buf + r, "%s\n", trace_options[i]); 2351 seq_printf(m, "%s\n", trace_options[i]);
2334 else 2352 else
2335 r += sprintf(buf + r, "no%s\n", trace_options[i]); 2353 seq_printf(m, "no%s\n", trace_options[i]);
2336 } 2354 }
2337 2355
2338 for (i = 0; trace_opts[i].name; i++) { 2356 for (i = 0; trace_opts[i].name; i++) {
2339 if (tracer_flags & trace_opts[i].bit) 2357 if (tracer_flags & trace_opts[i].bit)
2340 r += sprintf(buf + r, "%s\n", 2358 seq_printf(m, "%s\n", trace_opts[i].name);
2341 trace_opts[i].name);
2342 else 2359 else
2343 r += sprintf(buf + r, "no%s\n", 2360 seq_printf(m, "no%s\n", trace_opts[i].name);
2344 trace_opts[i].name);
2345 } 2361 }
2346 mutex_unlock(&trace_types_lock); 2362 mutex_unlock(&trace_types_lock);
2347 2363
2348 WARN_ON(r >= len + 1); 2364 return 0;
2365}
2349 2366
2350 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2367static int __set_tracer_option(struct tracer *trace,
2368 struct tracer_flags *tracer_flags,
2369 struct tracer_opt *opts, int neg)
2370{
2371 int ret;
2351 2372
2352 kfree(buf); 2373 ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
2353 return r; 2374 if (ret)
2375 return ret;
2376
2377 if (neg)
2378 tracer_flags->val &= ~opts->bit;
2379 else
2380 tracer_flags->val |= opts->bit;
2381 return 0;
2354} 2382}
2355 2383
2356/* Try to assign a tracer specific option */ 2384/* Try to assign a tracer specific option */
@@ -2358,33 +2386,17 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2358{ 2386{
2359 struct tracer_flags *tracer_flags = trace->flags; 2387 struct tracer_flags *tracer_flags = trace->flags;
2360 struct tracer_opt *opts = NULL; 2388 struct tracer_opt *opts = NULL;
2361 int ret = 0, i = 0; 2389 int i;
2362 int len;
2363 2390
2364 for (i = 0; tracer_flags->opts[i].name; i++) { 2391 for (i = 0; tracer_flags->opts[i].name; i++) {
2365 opts = &tracer_flags->opts[i]; 2392 opts = &tracer_flags->opts[i];
2366 len = strlen(opts->name);
2367 2393
2368 if (strncmp(cmp, opts->name, len) == 0) { 2394 if (strcmp(cmp, opts->name) == 0)
2369 ret = trace->set_flag(tracer_flags->val, 2395 return __set_tracer_option(trace, trace->flags,
2370 opts->bit, !neg); 2396 opts, neg);
2371 break;
2372 }
2373 } 2397 }
2374 /* Not found */
2375 if (!tracer_flags->opts[i].name)
2376 return -EINVAL;
2377 2398
2378 /* Refused to handle */ 2399 return -EINVAL;
2379 if (ret)
2380 return ret;
2381
2382 if (neg)
2383 tracer_flags->val &= ~opts->bit;
2384 else
2385 tracer_flags->val |= opts->bit;
2386
2387 return 0;
2388} 2400}
2389 2401
2390static void set_tracer_flags(unsigned int mask, int enabled) 2402static void set_tracer_flags(unsigned int mask, int enabled)
@@ -2404,7 +2416,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2404 size_t cnt, loff_t *ppos) 2416 size_t cnt, loff_t *ppos)
2405{ 2417{
2406 char buf[64]; 2418 char buf[64];
2407 char *cmp = buf; 2419 char *cmp;
2408 int neg = 0; 2420 int neg = 0;
2409 int ret; 2421 int ret;
2410 int i; 2422 int i;
@@ -2416,16 +2428,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2416 return -EFAULT; 2428 return -EFAULT;
2417 2429
2418 buf[cnt] = 0; 2430 buf[cnt] = 0;
2431 cmp = strstrip(buf);
2419 2432
2420 if (strncmp(buf, "no", 2) == 0) { 2433 if (strncmp(cmp, "no", 2) == 0) {
2421 neg = 1; 2434 neg = 1;
2422 cmp += 2; 2435 cmp += 2;
2423 } 2436 }
2424 2437
2425 for (i = 0; trace_options[i]; i++) { 2438 for (i = 0; trace_options[i]; i++) {
2426 int len = strlen(trace_options[i]); 2439 if (strcmp(cmp, trace_options[i]) == 0) {
2427
2428 if (strncmp(cmp, trace_options[i], len) == 0) {
2429 set_tracer_flags(1 << i, !neg); 2440 set_tracer_flags(1 << i, !neg);
2430 break; 2441 break;
2431 } 2442 }
@@ -2440,14 +2451,23 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2440 return ret; 2451 return ret;
2441 } 2452 }
2442 2453
2443 filp->f_pos += cnt; 2454 *ppos += cnt;
2444 2455
2445 return cnt; 2456 return cnt;
2446} 2457}
2447 2458
2459static int tracing_trace_options_open(struct inode *inode, struct file *file)
2460{
2461 if (tracing_disabled)
2462 return -ENODEV;
2463 return single_open(file, tracing_trace_options_show, NULL);
2464}
2465
2448static const struct file_operations tracing_iter_fops = { 2466static const struct file_operations tracing_iter_fops = {
2449 .open = tracing_open_generic, 2467 .open = tracing_trace_options_open,
2450 .read = tracing_trace_options_read, 2468 .read = seq_read,
2469 .llseek = seq_lseek,
2470 .release = single_release,
2451 .write = tracing_trace_options_write, 2471 .write = tracing_trace_options_write,
2452}; 2472};
2453 2473
@@ -2582,7 +2602,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2582 } 2602 }
2583 mutex_unlock(&trace_types_lock); 2603 mutex_unlock(&trace_types_lock);
2584 2604
2585 filp->f_pos += cnt; 2605 *ppos += cnt;
2586 2606
2587 return cnt; 2607 return cnt;
2588} 2608}
@@ -2764,7 +2784,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2764 if (err) 2784 if (err)
2765 return err; 2785 return err;
2766 2786
2767 filp->f_pos += ret; 2787 *ppos += ret;
2768 2788
2769 return ret; 2789 return ret;
2770} 2790}
@@ -2897,6 +2917,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2897 else 2917 else
2898 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); 2918 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2899 2919
2920
2921 if (iter->trace->pipe_close)
2922 iter->trace->pipe_close(iter);
2923
2900 mutex_unlock(&trace_types_lock); 2924 mutex_unlock(&trace_types_lock);
2901 2925
2902 free_cpumask_var(iter->started); 2926 free_cpumask_var(iter->started);
@@ -3103,7 +3127,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
3103 __free_page(spd->pages[idx]); 3127 __free_page(spd->pages[idx]);
3104} 3128}
3105 3129
3106static struct pipe_buf_operations tracing_pipe_buf_ops = { 3130static const struct pipe_buf_operations tracing_pipe_buf_ops = {
3107 .can_merge = 0, 3131 .can_merge = 0,
3108 .map = generic_pipe_buf_map, 3132 .map = generic_pipe_buf_map,
3109 .unmap = generic_pipe_buf_unmap, 3133 .unmap = generic_pipe_buf_unmap,
@@ -3299,7 +3323,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3299 } 3323 }
3300 } 3324 }
3301 3325
3302 filp->f_pos += cnt; 3326 *ppos += cnt;
3303 3327
3304 /* If check pages failed, return ENOMEM */ 3328 /* If check pages failed, return ENOMEM */
3305 if (tracing_disabled) 3329 if (tracing_disabled)
@@ -3334,7 +3358,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3334 size_t cnt, loff_t *fpos) 3358 size_t cnt, loff_t *fpos)
3335{ 3359{
3336 char *buf; 3360 char *buf;
3337 char *end;
3338 3361
3339 if (tracing_disabled) 3362 if (tracing_disabled)
3340 return -EINVAL; 3363 return -EINVAL;
@@ -3342,7 +3365,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3342 if (cnt > TRACE_BUF_SIZE) 3365 if (cnt > TRACE_BUF_SIZE)
3343 cnt = TRACE_BUF_SIZE; 3366 cnt = TRACE_BUF_SIZE;
3344 3367
3345 buf = kmalloc(cnt + 1, GFP_KERNEL); 3368 buf = kmalloc(cnt + 2, GFP_KERNEL);
3346 if (buf == NULL) 3369 if (buf == NULL)
3347 return -ENOMEM; 3370 return -ENOMEM;
3348 3371
@@ -3350,35 +3373,31 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3350 kfree(buf); 3373 kfree(buf);
3351 return -EFAULT; 3374 return -EFAULT;
3352 } 3375 }
3376 if (buf[cnt-1] != '\n') {
3377 buf[cnt] = '\n';
3378 buf[cnt+1] = '\0';
3379 } else
3380 buf[cnt] = '\0';
3353 3381
3354 /* Cut from the first nil or newline. */ 3382 cnt = mark_printk("%s", buf);
3355 buf[cnt] = '\0';
3356 end = strchr(buf, '\n');
3357 if (end)
3358 *end = '\0';
3359
3360 cnt = mark_printk("%s\n", buf);
3361 kfree(buf); 3383 kfree(buf);
3362 *fpos += cnt; 3384 *fpos += cnt;
3363 3385
3364 return cnt; 3386 return cnt;
3365} 3387}
3366 3388
3367static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, 3389static int tracing_clock_show(struct seq_file *m, void *v)
3368 size_t cnt, loff_t *ppos)
3369{ 3390{
3370 char buf[64];
3371 int bufiter = 0;
3372 int i; 3391 int i;
3373 3392
3374 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 3393 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3375 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, 3394 seq_printf(m,
3376 "%s%s%s%s", i ? " " : "", 3395 "%s%s%s%s", i ? " " : "",
3377 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 3396 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3378 i == trace_clock_id ? "]" : ""); 3397 i == trace_clock_id ? "]" : "");
3379 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); 3398 seq_putc(m, '\n');
3380 3399
3381 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); 3400 return 0;
3382} 3401}
3383 3402
3384static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 3403static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
@@ -3420,6 +3439,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3420 return cnt; 3439 return cnt;
3421} 3440}
3422 3441
3442static int tracing_clock_open(struct inode *inode, struct file *file)
3443{
3444 if (tracing_disabled)
3445 return -ENODEV;
3446 return single_open(file, tracing_clock_show, NULL);
3447}
3448
3423static const struct file_operations tracing_max_lat_fops = { 3449static const struct file_operations tracing_max_lat_fops = {
3424 .open = tracing_open_generic, 3450 .open = tracing_open_generic,
3425 .read = tracing_max_lat_read, 3451 .read = tracing_max_lat_read,
@@ -3458,8 +3484,10 @@ static const struct file_operations tracing_mark_fops = {
3458}; 3484};
3459 3485
3460static const struct file_operations trace_clock_fops = { 3486static const struct file_operations trace_clock_fops = {
3461 .open = tracing_open_generic, 3487 .open = tracing_clock_open,
3462 .read = tracing_clock_read, 3488 .read = seq_read,
3489 .llseek = seq_lseek,
3490 .release = single_release,
3463 .write = tracing_clock_write, 3491 .write = tracing_clock_write,
3464}; 3492};
3465 3493
@@ -3589,7 +3617,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3589} 3617}
3590 3618
3591/* Pipe buffer operations for a buffer. */ 3619/* Pipe buffer operations for a buffer. */
3592static struct pipe_buf_operations buffer_pipe_buf_ops = { 3620static const struct pipe_buf_operations buffer_pipe_buf_ops = {
3593 .can_merge = 0, 3621 .can_merge = 0,
3594 .map = generic_pipe_buf_map, 3622 .map = generic_pipe_buf_map,
3595 .unmap = generic_pipe_buf_unmap, 3623 .unmap = generic_pipe_buf_unmap,
@@ -3730,7 +3758,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3730 3758
3731 s = kmalloc(sizeof(*s), GFP_KERNEL); 3759 s = kmalloc(sizeof(*s), GFP_KERNEL);
3732 if (!s) 3760 if (!s)
3733 return ENOMEM; 3761 return -ENOMEM;
3734 3762
3735 trace_seq_init(s); 3763 trace_seq_init(s);
3736 3764
@@ -3920,39 +3948,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3920 if (ret < 0) 3948 if (ret < 0)
3921 return ret; 3949 return ret;
3922 3950
3923 ret = 0; 3951 if (val != 0 && val != 1)
3924 switch (val) { 3952 return -EINVAL;
3925 case 0:
3926 /* do nothing if already cleared */
3927 if (!(topt->flags->val & topt->opt->bit))
3928 break;
3929
3930 mutex_lock(&trace_types_lock);
3931 if (current_trace->set_flag)
3932 ret = current_trace->set_flag(topt->flags->val,
3933 topt->opt->bit, 0);
3934 mutex_unlock(&trace_types_lock);
3935 if (ret)
3936 return ret;
3937 topt->flags->val &= ~topt->opt->bit;
3938 break;
3939 case 1:
3940 /* do nothing if already set */
3941 if (topt->flags->val & topt->opt->bit)
3942 break;
3943 3953
3954 if (!!(topt->flags->val & topt->opt->bit) != val) {
3944 mutex_lock(&trace_types_lock); 3955 mutex_lock(&trace_types_lock);
3945 if (current_trace->set_flag) 3956 ret = __set_tracer_option(current_trace, topt->flags,
3946 ret = current_trace->set_flag(topt->flags->val, 3957 topt->opt, !val);
3947 topt->opt->bit, 1);
3948 mutex_unlock(&trace_types_lock); 3958 mutex_unlock(&trace_types_lock);
3949 if (ret) 3959 if (ret)
3950 return ret; 3960 return ret;
3951 topt->flags->val |= topt->opt->bit;
3952 break;
3953
3954 default:
3955 return -EINVAL;
3956 } 3961 }
3957 3962
3958 *ppos += cnt; 3963 *ppos += cnt;
@@ -4279,8 +4284,8 @@ trace_printk_seq(struct trace_seq *s)
4279 4284
4280static void __ftrace_dump(bool disable_tracing) 4285static void __ftrace_dump(bool disable_tracing)
4281{ 4286{
4282 static raw_spinlock_t ftrace_dump_lock = 4287 static arch_spinlock_t ftrace_dump_lock =
4283 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 4288 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
4284 /* use static because iter can be a bit big for the stack */ 4289 /* use static because iter can be a bit big for the stack */
4285 static struct trace_iterator iter; 4290 static struct trace_iterator iter;
4286 unsigned int old_userobj; 4291 unsigned int old_userobj;
@@ -4290,7 +4295,7 @@ static void __ftrace_dump(bool disable_tracing)
4290 4295
4291 /* only one dump */ 4296 /* only one dump */
4292 local_irq_save(flags); 4297 local_irq_save(flags);
4293 __raw_spin_lock(&ftrace_dump_lock); 4298 arch_spin_lock(&ftrace_dump_lock);
4294 if (dump_ran) 4299 if (dump_ran)
4295 goto out; 4300 goto out;
4296 4301
@@ -4365,7 +4370,7 @@ static void __ftrace_dump(bool disable_tracing)
4365 } 4370 }
4366 4371
4367 out: 4372 out:
4368 __raw_spin_unlock(&ftrace_dump_lock); 4373 arch_spin_unlock(&ftrace_dump_lock);
4369 local_irq_restore(flags); 4374 local_irq_restore(flags);
4370} 4375}
4371 4376
@@ -4426,7 +4431,7 @@ __init static int tracer_alloc_buffers(void)
4426 /* Allocate the first page for all buffers */ 4431 /* Allocate the first page for all buffers */
4427 for_each_tracing_cpu(i) { 4432 for_each_tracing_cpu(i) {
4428 global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4433 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4429 max_tr.data[i] = &per_cpu(max_data, i); 4434 max_tr.data[i] = &per_cpu(max_tr_data, i);
4430 } 4435 }
4431 4436
4432 trace_init_cmdlines(); 4437 trace_init_cmdlines();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 405cb850b75..4df6a77eb19 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
40 42
41 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
42}; 44};
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
98struct syscall_trace_exit { 100struct syscall_trace_exit {
99 struct trace_entry ent; 101 struct trace_entry ent;
100 int nr; 102 int nr;
101 unsigned long ret; 103 long ret;
102}; 104};
103 105
106struct kprobe_trace_entry {
107 struct trace_entry ent;
108 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111};
112
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent;
119 unsigned long func;
120 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123};
124
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
104/* 129/*
105 * trace_flag_type is an enumeration that holds different 130 * trace_flag_type is an enumeration that holds different
106 * states when a trace occurs. These are: 131 * states when a trace occurs. These are:
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
209 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
211 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
212 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
213 } while (0) 239 } while (0)
214 240
@@ -246,6 +272,7 @@ struct tracer_flags {
246 * @pipe_open: called when the trace_pipe file is opened 272 * @pipe_open: called when the trace_pipe file is opened
247 * @wait_pipe: override how the user waits for traces on trace_pipe 273 * @wait_pipe: override how the user waits for traces on trace_pipe
248 * @close: called when the trace file is released 274 * @close: called when the trace file is released
275 * @pipe_close: called when the trace_pipe file is released
249 * @read: override the default read callback on trace_pipe 276 * @read: override the default read callback on trace_pipe
250 * @splice_read: override the default splice_read callback on trace_pipe 277 * @splice_read: override the default splice_read callback on trace_pipe
251 * @selftest: selftest to run on boot (see trace_selftest.c) 278 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -264,6 +291,7 @@ struct tracer {
264 void (*pipe_open)(struct trace_iterator *iter); 291 void (*pipe_open)(struct trace_iterator *iter);
265 void (*wait_pipe)(struct trace_iterator *iter); 292 void (*wait_pipe)(struct trace_iterator *iter);
266 void (*close)(struct trace_iterator *iter); 293 void (*close)(struct trace_iterator *iter);
294 void (*pipe_close)(struct trace_iterator *iter);
267 ssize_t (*read)(struct trace_iterator *iter, 295 ssize_t (*read)(struct trace_iterator *iter,
268 struct file *filp, char __user *ubuf, 296 struct file *filp, char __user *ubuf,
269 size_t cnt, loff_t *ppos); 297 size_t cnt, loff_t *ppos);
@@ -364,6 +392,8 @@ int register_tracer(struct tracer *type);
364void unregister_tracer(struct tracer *type); 392void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void); 393int is_tracing_stopped(void);
366 394
395extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396
367extern unsigned long nsecs_to_usecs(unsigned long nsecs); 397extern unsigned long nsecs_to_usecs(unsigned long nsecs);
368 398
369#ifdef CONFIG_TRACER_MAX_TRACE 399#ifdef CONFIG_TRACER_MAX_TRACE
@@ -413,7 +443,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
413 443
414extern int ring_buffer_expanded; 444extern int ring_buffer_expanded;
415extern bool tracing_selftest_disabled; 445extern bool tracing_selftest_disabled;
416DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); 446DECLARE_PER_CPU(int, ftrace_cpu_disabled);
417 447
418#ifdef CONFIG_FTRACE_STARTUP_TEST 448#ifdef CONFIG_FTRACE_STARTUP_TEST
419extern int trace_selftest_startup_function(struct tracer *trace, 449extern int trace_selftest_startup_function(struct tracer *trace,
@@ -438,6 +468,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
438 struct trace_array *tr); 468 struct trace_array *tr);
439extern int trace_selftest_startup_hw_branches(struct tracer *trace, 469extern int trace_selftest_startup_hw_branches(struct tracer *trace,
440 struct trace_array *tr); 470 struct trace_array *tr);
471extern int trace_selftest_startup_ksym(struct tracer *trace,
472 struct trace_array *tr);
441#endif /* CONFIG_FTRACE_STARTUP_TEST */ 473#endif /* CONFIG_FTRACE_STARTUP_TEST */
442 474
443extern void *head_page(struct trace_array_cpu *data); 475extern void *head_page(struct trace_array_cpu *data);
@@ -483,10 +515,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
483 return 0; 515 return 0;
484} 516}
485#else 517#else
486static inline int ftrace_trace_addr(unsigned long addr)
487{
488 return 1;
489}
490static inline int ftrace_graph_addr(unsigned long addr) 518static inline int ftrace_graph_addr(unsigned long addr)
491{ 519{
492 return 1; 520 return 1;
@@ -500,12 +528,12 @@ print_graph_function(struct trace_iterator *iter)
500} 528}
501#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 529#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
502 530
503extern struct pid *ftrace_pid_trace; 531extern struct list_head ftrace_pids;
504 532
505#ifdef CONFIG_FUNCTION_TRACER 533#ifdef CONFIG_FUNCTION_TRACER
506static inline int ftrace_trace_task(struct task_struct *task) 534static inline int ftrace_trace_task(struct task_struct *task)
507{ 535{
508 if (!ftrace_pid_trace) 536 if (list_empty(&ftrace_pids))
509 return 1; 537 return 1;
510 538
511 return test_tsk_trace_trace(task); 539 return test_tsk_trace_trace(task);
@@ -569,18 +597,17 @@ enum trace_iterator_flags {
569 TRACE_ITER_BIN = 0x40, 597 TRACE_ITER_BIN = 0x40,
570 TRACE_ITER_BLOCK = 0x80, 598 TRACE_ITER_BLOCK = 0x80,
571 TRACE_ITER_STACKTRACE = 0x100, 599 TRACE_ITER_STACKTRACE = 0x100,
572 TRACE_ITER_SCHED_TREE = 0x200, 600 TRACE_ITER_PRINTK = 0x200,
573 TRACE_ITER_PRINTK = 0x400, 601 TRACE_ITER_PREEMPTONLY = 0x400,
574 TRACE_ITER_PREEMPTONLY = 0x800, 602 TRACE_ITER_BRANCH = 0x800,
575 TRACE_ITER_BRANCH = 0x1000, 603 TRACE_ITER_ANNOTATE = 0x1000,
576 TRACE_ITER_ANNOTATE = 0x2000, 604 TRACE_ITER_USERSTACKTRACE = 0x2000,
577 TRACE_ITER_USERSTACKTRACE = 0x4000, 605 TRACE_ITER_SYM_USEROBJ = 0x4000,
578 TRACE_ITER_SYM_USEROBJ = 0x8000, 606 TRACE_ITER_PRINTK_MSGONLY = 0x8000,
579 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 607 TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
580 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 608 TRACE_ITER_LATENCY_FMT = 0x20000,
581 TRACE_ITER_LATENCY_FMT = 0x40000, 609 TRACE_ITER_SLEEP_TIME = 0x40000,
582 TRACE_ITER_SLEEP_TIME = 0x80000, 610 TRACE_ITER_GRAPH_TIME = 0x80000,
583 TRACE_ITER_GRAPH_TIME = 0x100000,
584}; 611};
585 612
586/* 613/*
@@ -687,7 +714,6 @@ struct event_filter {
687 int n_preds; 714 int n_preds;
688 struct filter_pred **preds; 715 struct filter_pred **preds;
689 char *filter_string; 716 char *filter_string;
690 bool no_reset;
691}; 717};
692 718
693struct event_subsystem { 719struct event_subsystem {
@@ -699,22 +725,40 @@ struct event_subsystem {
699}; 725};
700 726
701struct filter_pred; 727struct filter_pred;
728struct regex;
702 729
703typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 730typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
704 int val1, int val2); 731 int val1, int val2);
705 732
733typedef int (*regex_match_func)(char *str, struct regex *r, int len);
734
735enum regex_type {
736 MATCH_FULL = 0,
737 MATCH_FRONT_ONLY,
738 MATCH_MIDDLE_ONLY,
739 MATCH_END_ONLY,
740};
741
742struct regex {
743 char pattern[MAX_FILTER_STR_VAL];
744 int len;
745 int field_len;
746 regex_match_func match;
747};
748
706struct filter_pred { 749struct filter_pred {
707 filter_pred_fn_t fn; 750 filter_pred_fn_t fn;
708 u64 val; 751 u64 val;
709 char str_val[MAX_FILTER_STR_VAL]; 752 struct regex regex;
710 int str_len; 753 char *field_name;
711 char *field_name; 754 int offset;
712 int offset; 755 int not;
713 int not; 756 int op;
714 int op; 757 int pop_n;
715 int pop_n;
716}; 758};
717 759
760extern enum regex_type
761filter_parse_regex(char *buff, int len, char **search, int *not);
718extern void print_event_filter(struct ftrace_event_call *call, 762extern void print_event_filter(struct ftrace_event_call *call,
719 struct trace_seq *s); 763 struct trace_seq *s);
720extern int apply_event_filter(struct ftrace_event_call *call, 764extern int apply_event_filter(struct ftrace_event_call *call,
@@ -730,7 +774,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
730 struct ring_buffer *buffer, 774 struct ring_buffer *buffer,
731 struct ring_buffer_event *event) 775 struct ring_buffer_event *event)
732{ 776{
733 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { 777 if (unlikely(call->filter_active) &&
778 !filter_match_preds(call->filter, rec)) {
734 ring_buffer_discard_commit(buffer, event); 779 ring_buffer_discard_commit(buffer, event);
735 return 1; 780 return 1;
736 } 781 }
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 20c5f92e28a..84a3a7ba072 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -20,6 +20,8 @@
20#include <linux/ktime.h> 20#include <linux/ktime.h>
21#include <linux/trace_clock.h> 21#include <linux/trace_clock.h>
22 22
23#include "trace.h"
24
23/* 25/*
24 * trace_clock_local(): the simplest and least coherent tracing clock. 26 * trace_clock_local(): the simplest and least coherent tracing clock.
25 * 27 *
@@ -28,17 +30,17 @@
28 */ 30 */
29u64 notrace trace_clock_local(void) 31u64 notrace trace_clock_local(void)
30{ 32{
31 unsigned long flags;
32 u64 clock; 33 u64 clock;
34 int resched;
33 35
34 /* 36 /*
35 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
36 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
37 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
38 */ 40 */
39 raw_local_irq_save(flags); 41 resched = ftrace_preempt_disable();
40 clock = sched_clock(); 42 clock = sched_clock();
41 raw_local_irq_restore(flags); 43 ftrace_preempt_enable(resched);
42 44
43 return clock; 45 return clock;
44} 46}
@@ -69,10 +71,10 @@ u64 notrace trace_clock(void)
69/* keep prev_time and lock in the same cacheline. */ 71/* keep prev_time and lock in the same cacheline. */
70static struct { 72static struct {
71 u64 prev_time; 73 u64 prev_time;
72 raw_spinlock_t lock; 74 arch_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp = 75} trace_clock_struct ____cacheline_aligned_in_smp =
74 { 76 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, 77 .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
76 }; 78 };
77 79
78u64 notrace trace_clock_global(void) 80u64 notrace trace_clock_global(void)
@@ -92,7 +94,7 @@ u64 notrace trace_clock_global(void)
92 if (unlikely(in_nmi())) 94 if (unlikely(in_nmi()))
93 goto out; 95 goto out;
94 96
95 __raw_spin_lock(&trace_clock_struct.lock); 97 arch_spin_lock(&trace_clock_struct.lock);
96 98
97 /* 99 /*
98 * TODO: if this happens often then maybe we should reset 100 * TODO: if this happens often then maybe we should reset
@@ -104,7 +106,7 @@ u64 notrace trace_clock_global(void)
104 106
105 trace_clock_struct.prev_time = now; 107 trace_clock_struct.prev_time = now;
106 108
107 __raw_spin_unlock(&trace_clock_struct.lock); 109 arch_spin_unlock(&trace_clock_struct.lock);
108 110
109 out: 111 out:
110 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599..c16a08f399d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
364 F_printk("type:%u call_site:%lx ptr:%p", 364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr) 365 __entry->type_id, __entry->call_site, __entry->ptr)
366); 366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 8d5c171cc99..9e25573242c 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,17 +8,14 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "trace.h" 9#include "trace.h"
10 10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16 11
17char *trace_profile_buf; 12char *perf_trace_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf); 13EXPORT_SYMBOL_GPL(perf_trace_buf);
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
19 17
20char *trace_profile_buf_nmi; 18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22 19
23/* Count the events in use (per event id, not per instance) */ 20/* Count the events in use (per event id, not per instance) */
24static int total_profile_count; 21static int total_profile_count;
@@ -28,24 +25,24 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
28 char *buf; 25 char *buf;
29 int ret = -ENOMEM; 26 int ret = -ENOMEM;
30 27
31 if (atomic_inc_return(&event->profile_count)) 28 if (event->profile_count++ > 0)
32 return 0; 29 return 0;
33 30
34 if (!total_profile_count) { 31 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t); 32 buf = (char *)alloc_percpu(perf_trace_t);
36 if (!buf) 33 if (!buf)
37 goto fail_buf; 34 goto fail_buf;
38 35
39 rcu_assign_pointer(trace_profile_buf, buf); 36 rcu_assign_pointer(perf_trace_buf, buf);
40 37
41 buf = (char *)alloc_percpu(profile_buf_t); 38 buf = (char *)alloc_percpu(perf_trace_t);
42 if (!buf) 39 if (!buf)
43 goto fail_buf_nmi; 40 goto fail_buf_nmi;
44 41
45 rcu_assign_pointer(trace_profile_buf_nmi, buf); 42 rcu_assign_pointer(perf_trace_buf_nmi, buf);
46 } 43 }
47 44
48 ret = event->profile_enable(); 45 ret = event->profile_enable(event);
49 if (!ret) { 46 if (!ret) {
50 total_profile_count++; 47 total_profile_count++;
51 return 0; 48 return 0;
@@ -53,13 +50,13 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
53 50
54fail_buf_nmi: 51fail_buf_nmi:
55 if (!total_profile_count) { 52 if (!total_profile_count) {
56 free_percpu(trace_profile_buf_nmi); 53 free_percpu(perf_trace_buf_nmi);
57 free_percpu(trace_profile_buf); 54 free_percpu(perf_trace_buf);
58 trace_profile_buf_nmi = NULL; 55 perf_trace_buf_nmi = NULL;
59 trace_profile_buf = NULL; 56 perf_trace_buf = NULL;
60 } 57 }
61fail_buf: 58fail_buf:
62 atomic_dec(&event->profile_count); 59 event->profile_count--;
63 60
64 return ret; 61 return ret;
65} 62}
@@ -86,17 +83,17 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
86{ 83{
87 char *buf, *nmi_buf; 84 char *buf, *nmi_buf;
88 85
89 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (--event->profile_count > 0)
90 return; 87 return;
91 88
92 event->profile_disable(); 89 event->profile_disable(event);
93 90
94 if (!--total_profile_count) { 91 if (!--total_profile_count) {
95 buf = trace_profile_buf; 92 buf = perf_trace_buf;
96 rcu_assign_pointer(trace_profile_buf, NULL); 93 rcu_assign_pointer(perf_trace_buf, NULL);
97 94
98 nmi_buf = trace_profile_buf_nmi; 95 nmi_buf = perf_trace_buf_nmi;
99 rcu_assign_pointer(trace_profile_buf_nmi, NULL); 96 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
100 97
101 /* 98 /*
102 * Ensure every events in profiling have finished before 99 * Ensure every events in profiling have finished before
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d128f65778e..189b09baf4f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
78 if (ret) \ 78 if (ret) \
79 return ret; 79 return ret;
80 80
81int trace_define_common_fields(struct ftrace_event_call *call) 81static int trace_define_common_fields(struct ftrace_event_call *call)
82{ 82{
83 int ret; 83 int ret;
84 struct trace_entry ent; 84 struct trace_entry ent;
@@ -91,11 +91,8 @@ int trace_define_common_fields(struct ftrace_event_call *call)
91 91
92 return ret; 92 return ret;
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 94
96#ifdef CONFIG_MODULES 95void trace_destroy_fields(struct ftrace_event_call *call)
97
98static void trace_destroy_fields(struct ftrace_event_call *call)
99{ 96{
100 struct ftrace_event_field *field, *next; 97 struct ftrace_event_field *field, *next;
101 98
@@ -107,27 +104,49 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
107 } 104 }
108} 105}
109 106
110#endif /* CONFIG_MODULES */ 107int trace_event_raw_init(struct ftrace_event_call *call)
108{
109 int id;
110
111 id = register_ftrace_event(call->event);
112 if (!id)
113 return -ENODEV;
114 call->id = id;
115 INIT_LIST_HEAD(&call->fields);
116
117 return 0;
118}
119EXPORT_SYMBOL_GPL(trace_event_raw_init);
111 120
112static void ftrace_event_enable_disable(struct ftrace_event_call *call, 121static int ftrace_event_enable_disable(struct ftrace_event_call *call,
113 int enable) 122 int enable)
114{ 123{
124 int ret = 0;
125
115 switch (enable) { 126 switch (enable) {
116 case 0: 127 case 0:
117 if (call->enabled) { 128 if (call->enabled) {
118 call->enabled = 0; 129 call->enabled = 0;
119 tracing_stop_cmdline_record(); 130 tracing_stop_cmdline_record();
120 call->unregfunc(call->data); 131 call->unregfunc(call);
121 } 132 }
122 break; 133 break;
123 case 1: 134 case 1:
124 if (!call->enabled) { 135 if (!call->enabled) {
125 call->enabled = 1;
126 tracing_start_cmdline_record(); 136 tracing_start_cmdline_record();
127 call->regfunc(call->data); 137 ret = call->regfunc(call);
138 if (ret) {
139 tracing_stop_cmdline_record();
140 pr_info("event trace: Could not enable event "
141 "%s\n", call->name);
142 break;
143 }
144 call->enabled = 1;
128 } 145 }
129 break; 146 break;
130 } 147 }
148
149 return ret;
131} 150}
132 151
133static void ftrace_clear_events(void) 152static void ftrace_clear_events(void)
@@ -406,7 +425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
406 case 0: 425 case 0:
407 case 1: 426 case 1:
408 mutex_lock(&event_mutex); 427 mutex_lock(&event_mutex);
409 ftrace_event_enable_disable(call, val); 428 ret = ftrace_event_enable_disable(call, val);
410 mutex_unlock(&event_mutex); 429 mutex_unlock(&event_mutex);
411 break; 430 break;
412 431
@@ -416,7 +435,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
416 435
417 *ppos += cnt; 436 *ppos += cnt;
418 437
419 return cnt; 438 return ret ? ret : cnt;
420} 439}
421 440
422static ssize_t 441static ssize_t
@@ -507,7 +526,7 @@ extern char *__bad_type_size(void);
507#define FIELD(type, name) \ 526#define FIELD(type, name) \
508 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ 527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
509 #type, "common_" #name, offsetof(typeof(field), name), \ 528 #type, "common_" #name, offsetof(typeof(field), name), \
510 sizeof(field.name) 529 sizeof(field.name), is_signed_type(type)
511 530
512static int trace_write_header(struct trace_seq *s) 531static int trace_write_header(struct trace_seq *s)
513{ 532{
@@ -515,17 +534,17 @@ static int trace_write_header(struct trace_seq *s)
515 534
516 /* struct trace_entry */ 535 /* struct trace_entry */
517 return trace_seq_printf(s, 536 return trace_seq_printf(s,
518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
519 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
520 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
521 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
522 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
523 "\n", 542 "\n",
524 FIELD(unsigned short, type), 543 FIELD(unsigned short, type),
525 FIELD(unsigned char, flags), 544 FIELD(unsigned char, flags),
526 FIELD(unsigned char, preempt_count), 545 FIELD(unsigned char, preempt_count),
527 FIELD(int, pid), 546 FIELD(int, pid),
528 FIELD(int, lock_depth)); 547 FIELD(int, lock_depth));
529} 548}
530 549
531static ssize_t 550static ssize_t
@@ -878,9 +897,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
878 "'%s/filter' entry\n", name); 897 "'%s/filter' entry\n", name);
879 } 898 }
880 899
881 entry = trace_create_file("enable", 0644, system->entry, 900 trace_create_file("enable", 0644, system->entry,
882 (void *)system->name, 901 (void *)system->name,
883 &ftrace_system_enable_fops); 902 &ftrace_system_enable_fops);
884 903
885 return system->entry; 904 return system->entry;
886} 905}
@@ -892,7 +911,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
892 const struct file_operations *filter, 911 const struct file_operations *filter,
893 const struct file_operations *format) 912 const struct file_operations *format)
894{ 913{
895 struct dentry *entry;
896 int ret; 914 int ret;
897 915
898 /* 916 /*
@@ -910,55 +928,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
910 } 928 }
911 929
912 if (call->regfunc) 930 if (call->regfunc)
913 entry = trace_create_file("enable", 0644, call->dir, call, 931 trace_create_file("enable", 0644, call->dir, call,
914 enable); 932 enable);
915 933
916 if (call->id && call->profile_enable) 934 if (call->id && call->profile_enable)
917 entry = trace_create_file("id", 0444, call->dir, call, 935 trace_create_file("id", 0444, call->dir, call,
918 id); 936 id);
919 937
920 if (call->define_fields) { 938 if (call->define_fields) {
921 ret = call->define_fields(call); 939 ret = trace_define_common_fields(call);
940 if (!ret)
941 ret = call->define_fields(call);
922 if (ret < 0) { 942 if (ret < 0) {
923 pr_warning("Could not initialize trace point" 943 pr_warning("Could not initialize trace point"
924 " events/%s\n", call->name); 944 " events/%s\n", call->name);
925 return ret; 945 return ret;
926 } 946 }
927 entry = trace_create_file("filter", 0644, call->dir, call, 947 trace_create_file("filter", 0644, call->dir, call,
928 filter); 948 filter);
929 } 949 }
930 950
931 /* A trace may not want to export its format */ 951 /* A trace may not want to export its format */
932 if (!call->show_format) 952 if (!call->show_format)
933 return 0; 953 return 0;
934 954
935 entry = trace_create_file("format", 0444, call->dir, call, 955 trace_create_file("format", 0444, call->dir, call,
936 format); 956 format);
937 957
938 return 0; 958 return 0;
939} 959}
940 960
941#define for_each_event(event, start, end) \ 961static int __trace_add_event_call(struct ftrace_event_call *call)
942 for (event = start; \ 962{
943 (unsigned long)event < (unsigned long)end; \ 963 struct dentry *d_events;
944 event++) 964 int ret;
945 965
946#ifdef CONFIG_MODULES 966 if (!call->name)
967 return -EINVAL;
947 968
948static LIST_HEAD(ftrace_module_file_list); 969 if (call->raw_init) {
970 ret = call->raw_init(call);
971 if (ret < 0) {
972 if (ret != -ENOSYS)
973 pr_warning("Could not initialize trace "
974 "events/%s\n", call->name);
975 return ret;
976 }
977 }
949 978
950/* 979 d_events = event_trace_events_dir();
951 * Modules must own their file_operations to keep up with 980 if (!d_events)
952 * reference counting. 981 return -ENOENT;
953 */ 982
954struct ftrace_module_file_ops { 983 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
955 struct list_head list; 984 &ftrace_enable_fops, &ftrace_event_filter_fops,
956 struct module *mod; 985 &ftrace_event_format_fops);
957 struct file_operations id; 986 if (!ret)
958 struct file_operations enable; 987 list_add(&call->list, &ftrace_events);
959 struct file_operations format; 988
960 struct file_operations filter; 989 return ret;
961}; 990}
991
992/* Add an additional event_call dynamically */
993int trace_add_event_call(struct ftrace_event_call *call)
994{
995 int ret;
996 mutex_lock(&event_mutex);
997 ret = __trace_add_event_call(call);
998 mutex_unlock(&event_mutex);
999 return ret;
1000}
962 1001
963static void remove_subsystem_dir(const char *name) 1002static void remove_subsystem_dir(const char *name)
964{ 1003{
@@ -986,6 +1025,53 @@ static void remove_subsystem_dir(const char *name)
986 } 1025 }
987} 1026}
988 1027
1028/*
1029 * Must be called under locking both of event_mutex and trace_event_mutex.
1030 */
1031static void __trace_remove_event_call(struct ftrace_event_call *call)
1032{
1033 ftrace_event_enable_disable(call, 0);
1034 if (call->event)
1035 __unregister_ftrace_event(call->event);
1036 debugfs_remove_recursive(call->dir);
1037 list_del(&call->list);
1038 trace_destroy_fields(call);
1039 destroy_preds(call);
1040 remove_subsystem_dir(call->system);
1041}
1042
1043/* Remove an event_call */
1044void trace_remove_event_call(struct ftrace_event_call *call)
1045{
1046 mutex_lock(&event_mutex);
1047 down_write(&trace_event_mutex);
1048 __trace_remove_event_call(call);
1049 up_write(&trace_event_mutex);
1050 mutex_unlock(&event_mutex);
1051}
1052
1053#define for_each_event(event, start, end) \
1054 for (event = start; \
1055 (unsigned long)event < (unsigned long)end; \
1056 event++)
1057
1058#ifdef CONFIG_MODULES
1059
1060static LIST_HEAD(ftrace_module_file_list);
1061
1062/*
1063 * Modules must own their file_operations to keep up with
1064 * reference counting.
1065 */
1066struct ftrace_module_file_ops {
1067 struct list_head list;
1068 struct module *mod;
1069 struct file_operations id;
1070 struct file_operations enable;
1071 struct file_operations format;
1072 struct file_operations filter;
1073};
1074
989static struct ftrace_module_file_ops * 1075static struct ftrace_module_file_ops *
990trace_create_file_ops(struct module *mod) 1076trace_create_file_ops(struct module *mod)
991{ 1077{
@@ -1043,7 +1129,7 @@ static void trace_module_add_events(struct module *mod)
1043 if (!call->name) 1129 if (!call->name)
1044 continue; 1130 continue;
1045 if (call->raw_init) { 1131 if (call->raw_init) {
1046 ret = call->raw_init(); 1132 ret = call->raw_init(call);
1047 if (ret < 0) { 1133 if (ret < 0) {
1048 if (ret != -ENOSYS) 1134 if (ret != -ENOSYS)
1049 pr_warning("Could not initialize trace " 1135 pr_warning("Could not initialize trace "
@@ -1061,10 +1147,11 @@ static void trace_module_add_events(struct module *mod)
1061 return; 1147 return;
1062 } 1148 }
1063 call->mod = mod; 1149 call->mod = mod;
1064 list_add(&call->list, &ftrace_events); 1150 ret = event_create_dir(call, d_events,
1065 event_create_dir(call, d_events, 1151 &file_ops->id, &file_ops->enable,
1066 &file_ops->id, &file_ops->enable, 1152 &file_ops->filter, &file_ops->format);
1067 &file_ops->filter, &file_ops->format); 1153 if (!ret)
1154 list_add(&call->list, &ftrace_events);
1068 } 1155 }
1069} 1156}
1070 1157
@@ -1078,14 +1165,7 @@ static void trace_module_remove_events(struct module *mod)
1078 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1165 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1079 if (call->mod == mod) { 1166 if (call->mod == mod) {
1080 found = true; 1167 found = true;
1081 ftrace_event_enable_disable(call, 0); 1168 __trace_remove_event_call(call);
1082 if (call->event)
1083 __unregister_ftrace_event(call->event);
1084 debugfs_remove_recursive(call->dir);
1085 list_del(&call->list);
1086 trace_destroy_fields(call);
1087 destroy_preds(call);
1088 remove_subsystem_dir(call->system);
1089 } 1169 }
1090 } 1170 }
1091 1171
@@ -1203,7 +1283,7 @@ static __init int event_trace_init(void)
1203 if (!call->name) 1283 if (!call->name)
1204 continue; 1284 continue;
1205 if (call->raw_init) { 1285 if (call->raw_init) {
1206 ret = call->raw_init(); 1286 ret = call->raw_init(call);
1207 if (ret < 0) { 1287 if (ret < 0) {
1208 if (ret != -ENOSYS) 1288 if (ret != -ENOSYS)
1209 pr_warning("Could not initialize trace " 1289 pr_warning("Could not initialize trace "
@@ -1211,10 +1291,12 @@ static __init int event_trace_init(void)
1211 continue; 1291 continue;
1212 } 1292 }
1213 } 1293 }
1214 list_add(&call->list, &ftrace_events); 1294 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1215 event_create_dir(call, d_events, &ftrace_event_id_fops, 1295 &ftrace_enable_fops,
1216 &ftrace_enable_fops, &ftrace_event_filter_fops, 1296 &ftrace_event_filter_fops,
1217 &ftrace_event_format_fops); 1297 &ftrace_event_format_fops);
1298 if (!ret)
1299 list_add(&call->list, &ftrace_events);
1218 } 1300 }
1219 1301
1220 while (true) { 1302 while (true) {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 23245785927..e42af9aad69 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,10 @@
18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> 18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
19 */ 19 */
20 20
21#include <linux/debugfs.h>
22#include <linux/uaccess.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/ctype.h> 22#include <linux/ctype.h>
25#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h>
26 25
27#include "trace.h" 26#include "trace.h"
28#include "trace_output.h" 27#include "trace_output.h"
@@ -31,6 +30,7 @@ enum filter_op_ids
31{ 30{
32 OP_OR, 31 OP_OR,
33 OP_AND, 32 OP_AND,
33 OP_GLOB,
34 OP_NE, 34 OP_NE,
35 OP_EQ, 35 OP_EQ,
36 OP_LT, 36 OP_LT,
@@ -48,16 +48,17 @@ struct filter_op {
48}; 48};
49 49
50static struct filter_op filter_ops[] = { 50static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 }, 51 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 }, 52 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 }, 53 { OP_GLOB, "~", 4 },
54 { OP_EQ, "==", 4 }, 54 { OP_NE, "!=", 4 },
55 { OP_LT, "<", 5 }, 55 { OP_EQ, "==", 4 },
56 { OP_LE, "<=", 5 }, 56 { OP_LT, "<", 5 },
57 { OP_GT, ">", 5 }, 57 { OP_LE, "<=", 5 },
58 { OP_GE, ">=", 5 }, 58 { OP_GT, ">", 5 },
59 { OP_NONE, "OP_NONE", 0 }, 59 { OP_GE, ">=", 5 },
60 { OP_OPEN_PAREN, "(", 0 }, 60 { OP_NONE, "OP_NONE", 0 },
61 { OP_OPEN_PAREN, "(", 0 },
61}; 62};
62 63
63enum { 64enum {
@@ -197,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
197 char *addr = (char *)(event + pred->offset); 198 char *addr = (char *)(event + pred->offset);
198 int cmp, match; 199 int cmp, match;
199 200
200 cmp = strncmp(addr, pred->str_val, pred->str_len); 201 cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
201 202
202 match = (!cmp) ^ pred->not; 203 match = cmp ^ pred->not;
203 204
204 return match; 205 return match;
205} 206}
@@ -210,10 +211,11 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
210{ 211{
211 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
212 int cmp, match; 213 int cmp, match;
214 int len = strlen(*addr) + 1; /* including tailing '\0' */
213 215
214 cmp = strncmp(*addr, pred->str_val, pred->str_len); 216 cmp = pred->regex.match(*addr, &pred->regex, len);
215 217
216 match = (!cmp) ^ pred->not; 218 match = cmp ^ pred->not;
217 219
218 return match; 220 return match;
219} 221}
@@ -237,9 +239,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
237 char *addr = (char *)(event + str_loc); 239 char *addr = (char *)(event + str_loc);
238 int cmp, match; 240 int cmp, match;
239 241
240 cmp = strncmp(addr, pred->str_val, str_len); 242 cmp = pred->regex.match(addr, &pred->regex, str_len);
241 243
242 match = (!cmp) ^ pred->not; 244 match = cmp ^ pred->not;
243 245
244 return match; 246 return match;
245} 247}
@@ -250,10 +252,133 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
250 return 0; 252 return 0;
251} 253}
252 254
255/*
256 * regex_match_foo - Basic regex callbacks
257 *
258 * @str: the string to be searched
259 * @r: the regex structure containing the pattern string
260 * @len: the length of the string to be searched (including '\0')
261 *
262 * Note:
263 * - @str might not be NULL-terminated if it's of type DYN_STRING
264 * or STATIC_STRING
265 */
266
267static int regex_match_full(char *str, struct regex *r, int len)
268{
269 if (strncmp(str, r->pattern, len) == 0)
270 return 1;
271 return 0;
272}
273
274static int regex_match_front(char *str, struct regex *r, int len)
275{
276 if (strncmp(str, r->pattern, r->len) == 0)
277 return 1;
278 return 0;
279}
280
281static int regex_match_middle(char *str, struct regex *r, int len)
282{
283 if (strnstr(str, r->pattern, len))
284 return 1;
285 return 0;
286}
287
288static int regex_match_end(char *str, struct regex *r, int len)
289{
290 int strlen = len - 1;
291
292 if (strlen >= r->len &&
293 memcmp(str + strlen - r->len, r->pattern, r->len) == 0)
294 return 1;
295 return 0;
296}
297
298/**
299 * filter_parse_regex - parse a basic regex
300 * @buff: the raw regex
301 * @len: length of the regex
302 * @search: will point to the beginning of the string to compare
303 * @not: tell whether the match will have to be inverted
304 *
305 * This passes in a buffer containing a regex and this function will
306 * set search to point to the search part of the buffer and
307 * return the type of search it is (see enum above).
308 * This does modify buff.
309 *
310 * Returns enum type.
311 * search returns the pointer to use for comparison.
312 * not returns 1 if buff started with a '!'
313 * 0 otherwise.
314 */
315enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
316{
317 int type = MATCH_FULL;
318 int i;
319
320 if (buff[0] == '!') {
321 *not = 1;
322 buff++;
323 len--;
324 } else
325 *not = 0;
326
327 *search = buff;
328
329 for (i = 0; i < len; i++) {
330 if (buff[i] == '*') {
331 if (!i) {
332 *search = buff + 1;
333 type = MATCH_END_ONLY;
334 } else {
335 if (type == MATCH_END_ONLY)
336 type = MATCH_MIDDLE_ONLY;
337 else
338 type = MATCH_FRONT_ONLY;
339 buff[i] = 0;
340 break;
341 }
342 }
343 }
344
345 return type;
346}
347
348static void filter_build_regex(struct filter_pred *pred)
349{
350 struct regex *r = &pred->regex;
351 char *search;
352 enum regex_type type = MATCH_FULL;
353 int not = 0;
354
355 if (pred->op == OP_GLOB) {
356 type = filter_parse_regex(r->pattern, r->len, &search, &not);
357 r->len = strlen(search);
358 memmove(r->pattern, search, r->len+1);
359 }
360
361 switch (type) {
362 case MATCH_FULL:
363 r->match = regex_match_full;
364 break;
365 case MATCH_FRONT_ONLY:
366 r->match = regex_match_front;
367 break;
368 case MATCH_MIDDLE_ONLY:
369 r->match = regex_match_middle;
370 break;
371 case MATCH_END_ONLY:
372 r->match = regex_match_end;
373 break;
374 }
375
376 pred->not ^= not;
377}
378
253/* return 1 if event matches, 0 otherwise (discard) */ 379/* return 1 if event matches, 0 otherwise (discard) */
254int filter_match_preds(struct ftrace_event_call *call, void *rec) 380int filter_match_preds(struct event_filter *filter, void *rec)
255{ 381{
256 struct event_filter *filter = call->filter;
257 int match, top = 0, val1 = 0, val2 = 0; 382 int match, top = 0, val1 = 0, val2 = 0;
258 int stack[MAX_FILTER_PRED]; 383 int stack[MAX_FILTER_PRED];
259 struct filter_pred *pred; 384 struct filter_pred *pred;
@@ -396,7 +521,7 @@ static void filter_clear_pred(struct filter_pred *pred)
396{ 521{
397 kfree(pred->field_name); 522 kfree(pred->field_name);
398 pred->field_name = NULL; 523 pred->field_name = NULL;
399 pred->str_len = 0; 524 pred->regex.len = 0;
400} 525}
401 526
402static int filter_set_pred(struct filter_pred *dest, 527static int filter_set_pred(struct filter_pred *dest,
@@ -426,9 +551,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
426 filter->preds[i]->fn = filter_pred_none; 551 filter->preds[i]->fn = filter_pred_none;
427} 552}
428 553
429void destroy_preds(struct ftrace_event_call *call) 554static void __free_preds(struct event_filter *filter)
430{ 555{
431 struct event_filter *filter = call->filter;
432 int i; 556 int i;
433 557
434 if (!filter) 558 if (!filter)
@@ -441,21 +565,24 @@ void destroy_preds(struct ftrace_event_call *call)
441 kfree(filter->preds); 565 kfree(filter->preds);
442 kfree(filter->filter_string); 566 kfree(filter->filter_string);
443 kfree(filter); 567 kfree(filter);
568}
569
570void destroy_preds(struct ftrace_event_call *call)
571{
572 __free_preds(call->filter);
444 call->filter = NULL; 573 call->filter = NULL;
574 call->filter_active = 0;
445} 575}
446 576
447static int init_preds(struct ftrace_event_call *call) 577static struct event_filter *__alloc_preds(void)
448{ 578{
449 struct event_filter *filter; 579 struct event_filter *filter;
450 struct filter_pred *pred; 580 struct filter_pred *pred;
451 int i; 581 int i;
452 582
453 if (call->filter) 583 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
454 return 0; 584 if (!filter)
455 585 return ERR_PTR(-ENOMEM);
456 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
457 if (!call->filter)
458 return -ENOMEM;
459 586
460 filter->n_preds = 0; 587 filter->n_preds = 0;
461 588
@@ -471,12 +598,24 @@ static int init_preds(struct ftrace_event_call *call)
471 filter->preds[i] = pred; 598 filter->preds[i] = pred;
472 } 599 }
473 600
474 return 0; 601 return filter;
475 602
476oom: 603oom:
477 destroy_preds(call); 604 __free_preds(filter);
605 return ERR_PTR(-ENOMEM);
606}
478 607
479 return -ENOMEM; 608static int init_preds(struct ftrace_event_call *call)
609{
610 if (call->filter)
611 return 0;
612
613 call->filter_active = 0;
614 call->filter = __alloc_preds();
615 if (IS_ERR(call->filter))
616 return PTR_ERR(call->filter);
617
618 return 0;
480} 619}
481 620
482static int init_subsystem_preds(struct event_subsystem *system) 621static int init_subsystem_preds(struct event_subsystem *system)
@@ -499,14 +638,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
499 return 0; 638 return 0;
500} 639}
501 640
502enum { 641static void filter_free_subsystem_preds(struct event_subsystem *system)
503 FILTER_DISABLE_ALL,
504 FILTER_INIT_NO_RESET,
505 FILTER_SKIP_NO_RESET,
506};
507
508static void filter_free_subsystem_preds(struct event_subsystem *system,
509 int flag)
510{ 642{
511 struct ftrace_event_call *call; 643 struct ftrace_event_call *call;
512 644
@@ -517,14 +649,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
517 if (strcmp(call->system, system->name) != 0) 649 if (strcmp(call->system, system->name) != 0)
518 continue; 650 continue;
519 651
520 if (flag == FILTER_INIT_NO_RESET) {
521 call->filter->no_reset = false;
522 continue;
523 }
524
525 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
526 continue;
527
528 filter_disable_preds(call); 652 filter_disable_preds(call);
529 remove_filter_string(call->filter); 653 remove_filter_string(call->filter);
530 } 654 }
@@ -532,10 +656,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
532 656
533static int filter_add_pred_fn(struct filter_parse_state *ps, 657static int filter_add_pred_fn(struct filter_parse_state *ps,
534 struct ftrace_event_call *call, 658 struct ftrace_event_call *call,
659 struct event_filter *filter,
535 struct filter_pred *pred, 660 struct filter_pred *pred,
536 filter_pred_fn_t fn) 661 filter_pred_fn_t fn)
537{ 662{
538 struct event_filter *filter = call->filter;
539 int idx, err; 663 int idx, err;
540 664
541 if (filter->n_preds == MAX_FILTER_PRED) { 665 if (filter->n_preds == MAX_FILTER_PRED) {
@@ -550,7 +674,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
550 return err; 674 return err;
551 675
552 filter->n_preds++; 676 filter->n_preds++;
553 call->filter_active = 1;
554 677
555 return 0; 678 return 0;
556} 679}
@@ -575,7 +698,10 @@ static bool is_string_field(struct ftrace_event_field *field)
575 698
576static int is_legal_op(struct ftrace_event_field *field, int op) 699static int is_legal_op(struct ftrace_event_field *field, int op)
577{ 700{
578 if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) 701 if (is_string_field(field) &&
702 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
703 return 0;
704 if (!is_string_field(field) && op == OP_GLOB)
579 return 0; 705 return 0;
580 706
581 return 1; 707 return 1;
@@ -626,6 +752,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
626 752
627static int filter_add_pred(struct filter_parse_state *ps, 753static int filter_add_pred(struct filter_parse_state *ps,
628 struct ftrace_event_call *call, 754 struct ftrace_event_call *call,
755 struct event_filter *filter,
629 struct filter_pred *pred, 756 struct filter_pred *pred,
630 bool dry_run) 757 bool dry_run)
631{ 758{
@@ -660,21 +787,20 @@ static int filter_add_pred(struct filter_parse_state *ps,
660 } 787 }
661 788
662 if (is_string_field(field)) { 789 if (is_string_field(field)) {
663 pred->str_len = field->size; 790 filter_build_regex(pred);
664 791
665 if (field->filter_type == FILTER_STATIC_STRING) 792 if (field->filter_type == FILTER_STATIC_STRING) {
666 fn = filter_pred_string; 793 fn = filter_pred_string;
667 else if (field->filter_type == FILTER_DYN_STRING) 794 pred->regex.field_len = field->size;
795 } else if (field->filter_type == FILTER_DYN_STRING)
668 fn = filter_pred_strloc; 796 fn = filter_pred_strloc;
669 else { 797 else
670 fn = filter_pred_pchar; 798 fn = filter_pred_pchar;
671 pred->str_len = strlen(pred->str_val);
672 }
673 } else { 799 } else {
674 if (field->is_signed) 800 if (field->is_signed)
675 ret = strict_strtoll(pred->str_val, 0, &val); 801 ret = strict_strtoll(pred->regex.pattern, 0, &val);
676 else 802 else
677 ret = strict_strtoull(pred->str_val, 0, &val); 803 ret = strict_strtoull(pred->regex.pattern, 0, &val);
678 if (ret) { 804 if (ret) {
679 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 805 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
680 return -EINVAL; 806 return -EINVAL;
@@ -694,45 +820,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
694 820
695add_pred_fn: 821add_pred_fn:
696 if (!dry_run) 822 if (!dry_run)
697 return filter_add_pred_fn(ps, call, pred, fn); 823 return filter_add_pred_fn(ps, call, filter, pred, fn);
698 return 0;
699}
700
701static int filter_add_subsystem_pred(struct filter_parse_state *ps,
702 struct event_subsystem *system,
703 struct filter_pred *pred,
704 char *filter_string,
705 bool dry_run)
706{
707 struct ftrace_event_call *call;
708 int err = 0;
709 bool fail = true;
710
711 list_for_each_entry(call, &ftrace_events, list) {
712
713 if (!call->define_fields)
714 continue;
715
716 if (strcmp(call->system, system->name))
717 continue;
718
719 if (call->filter->no_reset)
720 continue;
721
722 err = filter_add_pred(ps, call, pred, dry_run);
723 if (err)
724 call->filter->no_reset = true;
725 else
726 fail = false;
727
728 if (!dry_run)
729 replace_filter_string(call->filter, filter_string);
730 }
731
732 if (fail) {
733 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
734 return err;
735 }
736 return 0; 824 return 0;
737} 825}
738 826
@@ -933,8 +1021,9 @@ static void postfix_clear(struct filter_parse_state *ps)
933 1021
934 while (!list_empty(&ps->postfix)) { 1022 while (!list_empty(&ps->postfix)) {
935 elt = list_first_entry(&ps->postfix, struct postfix_elt, list); 1023 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
936 kfree(elt->operand);
937 list_del(&elt->list); 1024 list_del(&elt->list);
1025 kfree(elt->operand);
1026 kfree(elt);
938 } 1027 }
939} 1028}
940 1029
@@ -1044,8 +1133,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
1044 return NULL; 1133 return NULL;
1045 } 1134 }
1046 1135
1047 strcpy(pred->str_val, operand2); 1136 strcpy(pred->regex.pattern, operand2);
1048 pred->str_len = strlen(operand2); 1137 pred->regex.len = strlen(pred->regex.pattern);
1049 1138
1050 pred->op = op; 1139 pred->op = op;
1051 1140
@@ -1089,8 +1178,8 @@ static int check_preds(struct filter_parse_state *ps)
1089 return 0; 1178 return 0;
1090} 1179}
1091 1180
1092static int replace_preds(struct event_subsystem *system, 1181static int replace_preds(struct ftrace_event_call *call,
1093 struct ftrace_event_call *call, 1182 struct event_filter *filter,
1094 struct filter_parse_state *ps, 1183 struct filter_parse_state *ps,
1095 char *filter_string, 1184 char *filter_string,
1096 bool dry_run) 1185 bool dry_run)
@@ -1137,11 +1226,7 @@ static int replace_preds(struct event_subsystem *system,
1137add_pred: 1226add_pred:
1138 if (!pred) 1227 if (!pred)
1139 return -ENOMEM; 1228 return -ENOMEM;
1140 if (call) 1229 err = filter_add_pred(ps, call, filter, pred, dry_run);
1141 err = filter_add_pred(ps, call, pred, false);
1142 else
1143 err = filter_add_subsystem_pred(ps, system, pred,
1144 filter_string, dry_run);
1145 filter_free_pred(pred); 1230 filter_free_pred(pred);
1146 if (err) 1231 if (err)
1147 return err; 1232 return err;
@@ -1152,10 +1237,50 @@ add_pred:
1152 return 0; 1237 return 0;
1153} 1238}
1154 1239
1155int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1240static int replace_system_preds(struct event_subsystem *system,
1241 struct filter_parse_state *ps,
1242 char *filter_string)
1156{ 1243{
1244 struct ftrace_event_call *call;
1245 bool fail = true;
1157 int err; 1246 int err;
1158 1247
1248 list_for_each_entry(call, &ftrace_events, list) {
1249 struct event_filter *filter = call->filter;
1250
1251 if (!call->define_fields)
1252 continue;
1253
1254 if (strcmp(call->system, system->name) != 0)
1255 continue;
1256
1257 /* try to see if the filter can be applied */
1258 err = replace_preds(call, filter, ps, filter_string, true);
1259 if (err)
1260 continue;
1261
1262 /* really apply the filter */
1263 filter_disable_preds(call);
1264 err = replace_preds(call, filter, ps, filter_string, false);
1265 if (err)
1266 filter_disable_preds(call);
1267 else {
1268 call->filter_active = 1;
1269 replace_filter_string(filter, filter_string);
1270 }
1271 fail = false;
1272 }
1273
1274 if (fail) {
1275 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1276 return -EINVAL;
1277 }
1278 return 0;
1279}
1280
1281int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1282{
1283 int err;
1159 struct filter_parse_state *ps; 1284 struct filter_parse_state *ps;
1160 1285
1161 mutex_lock(&event_mutex); 1286 mutex_lock(&event_mutex);
@@ -1167,8 +1292,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1167 if (!strcmp(strstrip(filter_string), "0")) { 1292 if (!strcmp(strstrip(filter_string), "0")) {
1168 filter_disable_preds(call); 1293 filter_disable_preds(call);
1169 remove_filter_string(call->filter); 1294 remove_filter_string(call->filter);
1170 mutex_unlock(&event_mutex); 1295 goto out_unlock;
1171 return 0;
1172 } 1296 }
1173 1297
1174 err = -ENOMEM; 1298 err = -ENOMEM;
@@ -1186,10 +1310,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1186 goto out; 1310 goto out;
1187 } 1311 }
1188 1312
1189 err = replace_preds(NULL, call, ps, filter_string, false); 1313 err = replace_preds(call, call->filter, ps, filter_string, false);
1190 if (err) 1314 if (err)
1191 append_filter_err(ps, call->filter); 1315 append_filter_err(ps, call->filter);
1192 1316 else
1317 call->filter_active = 1;
1193out: 1318out:
1194 filter_opstack_clear(ps); 1319 filter_opstack_clear(ps);
1195 postfix_clear(ps); 1320 postfix_clear(ps);
@@ -1204,7 +1329,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1204 char *filter_string) 1329 char *filter_string)
1205{ 1330{
1206 int err; 1331 int err;
1207
1208 struct filter_parse_state *ps; 1332 struct filter_parse_state *ps;
1209 1333
1210 mutex_lock(&event_mutex); 1334 mutex_lock(&event_mutex);
@@ -1214,10 +1338,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1214 goto out_unlock; 1338 goto out_unlock;
1215 1339
1216 if (!strcmp(strstrip(filter_string), "0")) { 1340 if (!strcmp(strstrip(filter_string), "0")) {
1217 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); 1341 filter_free_subsystem_preds(system);
1218 remove_filter_string(system->filter); 1342 remove_filter_string(system->filter);
1219 mutex_unlock(&event_mutex); 1343 goto out_unlock;
1220 return 0;
1221 } 1344 }
1222 1345
1223 err = -ENOMEM; 1346 err = -ENOMEM;
@@ -1234,31 +1357,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1234 goto out; 1357 goto out;
1235 } 1358 }
1236 1359
1237 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); 1360 err = replace_system_preds(system, ps, filter_string);
1238 1361 if (err)
1239 /* try to see the filter can be applied to which events */
1240 err = replace_preds(system, NULL, ps, filter_string, true);
1241 if (err) {
1242 append_filter_err(ps, system->filter); 1362 append_filter_err(ps, system->filter);
1243 goto out; 1363
1364out:
1365 filter_opstack_clear(ps);
1366 postfix_clear(ps);
1367 kfree(ps);
1368out_unlock:
1369 mutex_unlock(&event_mutex);
1370
1371 return err;
1372}
1373
1374#ifdef CONFIG_EVENT_PROFILE
1375
1376void ftrace_profile_free_filter(struct perf_event *event)
1377{
1378 struct event_filter *filter = event->filter;
1379
1380 event->filter = NULL;
1381 __free_preds(filter);
1382}
1383
1384int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1385 char *filter_str)
1386{
1387 int err;
1388 struct event_filter *filter;
1389 struct filter_parse_state *ps;
1390 struct ftrace_event_call *call = NULL;
1391
1392 mutex_lock(&event_mutex);
1393
1394 list_for_each_entry(call, &ftrace_events, list) {
1395 if (call->id == event_id)
1396 break;
1244 } 1397 }
1245 1398
1246 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); 1399 err = -EINVAL;
1400 if (!call)
1401 goto out_unlock;
1402
1403 err = -EEXIST;
1404 if (event->filter)
1405 goto out_unlock;
1247 1406
1248 /* really apply the filter to the events */ 1407 filter = __alloc_preds();
1249 err = replace_preds(system, NULL, ps, filter_string, false); 1408 if (IS_ERR(filter)) {
1250 if (err) { 1409 err = PTR_ERR(filter);
1251 append_filter_err(ps, system->filter); 1410 goto out_unlock;
1252 filter_free_subsystem_preds(system, 2);
1253 } 1411 }
1254 1412
1255out: 1413 err = -ENOMEM;
1414 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1415 if (!ps)
1416 goto free_preds;
1417
1418 parse_init(ps, filter_ops, filter_str);
1419 err = filter_parse(ps);
1420 if (err)
1421 goto free_ps;
1422
1423 err = replace_preds(call, filter, ps, filter_str, false);
1424 if (!err)
1425 event->filter = filter;
1426
1427free_ps:
1256 filter_opstack_clear(ps); 1428 filter_opstack_clear(ps);
1257 postfix_clear(ps); 1429 postfix_clear(ps);
1258 kfree(ps); 1430 kfree(ps);
1431
1432free_preds:
1433 if (err)
1434 __free_preds(filter);
1435
1259out_unlock: 1436out_unlock:
1260 mutex_unlock(&event_mutex); 1437 mutex_unlock(&event_mutex);
1261 1438
1262 return err; 1439 return err;
1263} 1440}
1264 1441
1442#endif /* CONFIG_EVENT_PROFILE */
1443
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 9753fcc61bc..d4fa5dc1ee4 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -48,11 +48,11 @@
48struct ____ftrace_##name { \ 48struct ____ftrace_##name { \
49 tstruct \ 49 tstruct \
50}; \ 50}; \
51static void __used ____ftrace_check_##name(void) \ 51static void __always_unused ____ftrace_check_##name(void) \
52{ \ 52{ \
53 struct ____ftrace_##name *__entry = NULL; \ 53 struct ____ftrace_##name *__entry = NULL; \
54 \ 54 \
55 /* force cmpile-time check on F_printk() */ \ 55 /* force compile-time check on F_printk() */ \
56 printk(print); \ 56 printk(print); \
57} 57}
58 58
@@ -66,44 +66,47 @@ static void __used ____ftrace_check_##name(void) \
66#undef __field 66#undef __field
67#define __field(type, item) \ 67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\n", \ 69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
71 sizeof(field.item)); \ 71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \ 72 if (!ret) \
73 return 0; 73 return 0;
74 74
75#undef __field_desc 75#undef __field_desc
76#define __field_desc(type, container, item) \ 76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \ 78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \ 79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \ 80 sizeof(field.container.item), \
81 is_signed_type(type)); \
81 if (!ret) \ 82 if (!ret) \
82 return 0; 83 return 0;
83 84
84#undef __array 85#undef __array
85#define __array(type, item, len) \ 86#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \ 88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
88 offsetof(typeof(field), item), \ 89 offsetof(typeof(field), item), \
89 sizeof(field.item)); \ 90 sizeof(field.item), is_signed_type(type)); \
90 if (!ret) \ 91 if (!ret) \
91 return 0; 92 return 0;
92 93
93#undef __array_desc 94#undef __array_desc
94#define __array_desc(type, container, item, len) \ 95#define __array_desc(type, container, item, len) \
95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
96 "offset:%zu;\tsize:%zu;\n", \ 97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
97 offsetof(typeof(field), container.item), \ 98 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \ 99 sizeof(field.container.item), \
100 is_signed_type(type)); \
99 if (!ret) \ 101 if (!ret) \
100 return 0; 102 return 0;
101 103
102#undef __dynamic_array 104#undef __dynamic_array
103#define __dynamic_array(type, item) \ 105#define __dynamic_array(type, item) \
104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \ 107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
106 offsetof(typeof(field), item)); \ 108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
107 if (!ret) \ 110 if (!ret) \
108 return 0; 111 return 0;
109 112
@@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
131 134
132#include "trace_entries.h" 135#include "trace_entries.h"
133 136
134
135#undef __field 137#undef __field
136#define __field(type, item) \ 138#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \ 139 ret = trace_define_field(event_call, #type, #item, \
@@ -156,7 +158,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
156 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
157 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
158 offsetof(typeof(field), item), \ 160 offsetof(typeof(field), item), \
159 sizeof(field.item), 0, FILTER_OTHER); \ 161 sizeof(field.item), \
162 is_signed_type(type), FILTER_OTHER); \
160 if (ret) \ 163 if (ret) \
161 return ret; 164 return ret;
162 165
@@ -166,8 +169,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
166 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 169 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
167 offsetof(typeof(field), \ 170 offsetof(typeof(field), \
168 container.item), \ 171 container.item), \
169 sizeof(field.container.item), 0, \ 172 sizeof(field.container.item), \
170 FILTER_OTHER); \ 173 is_signed_type(type), FILTER_OTHER); \
171 if (ret) \ 174 if (ret) \
172 return ret; 175 return ret;
173 176
@@ -182,10 +185,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
182 struct struct_name field; \ 185 struct struct_name field; \
183 int ret; \ 186 int ret; \
184 \ 187 \
185 ret = trace_define_common_fields(event_call); \
186 if (ret) \
187 return ret; \
188 \
189 tstruct; \ 188 tstruct; \
190 \ 189 \
191 return ret; \ 190 return ret; \
@@ -193,6 +192,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
193 192
194#include "trace_entries.h" 193#include "trace_entries.h"
195 194
195static int ftrace_raw_init_event(struct ftrace_event_call *call)
196{
197 INIT_LIST_HEAD(&call->fields);
198 return 0;
199}
196 200
197#undef __field 201#undef __field
198#define __field(type, item) 202#define __field(type, item)
@@ -211,7 +215,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
211 215
212#undef FTRACE_ENTRY 216#undef FTRACE_ENTRY
213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 217#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
214static int ftrace_raw_init_event_##call(void); \
215 \ 218 \
216struct ftrace_event_call __used \ 219struct ftrace_event_call __used \
217__attribute__((__aligned__(4))) \ 220__attribute__((__aligned__(4))) \
@@ -219,14 +222,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
219 .name = #call, \ 222 .name = #call, \
220 .id = type, \ 223 .id = type, \
221 .system = __stringify(TRACE_SYSTEM), \ 224 .system = __stringify(TRACE_SYSTEM), \
222 .raw_init = ftrace_raw_init_event_##call, \ 225 .raw_init = ftrace_raw_init_event, \
223 .show_format = ftrace_format_##call, \ 226 .show_format = ftrace_format_##call, \
224 .define_fields = ftrace_define_fields_##call, \ 227 .define_fields = ftrace_define_fields_##call, \
225}; \ 228}; \
226static int ftrace_raw_init_event_##call(void) \
227{ \
228 INIT_LIST_HEAD(&event_##call.fields); \
229 return 0; \
230} \
231 229
232#include "trace_entries.h" 230#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4..b1342c5d37c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,9 +14,20 @@
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h" 15#include "trace_output.h"
16 16
17struct fgraph_data { 17struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore;
21};
22
23struct fgraph_data {
24 struct fgraph_cpu_data *cpu_data;
25
26 /* Place to preserve last processed entry. */
27 struct ftrace_graph_ent_entry ent;
28 struct ftrace_graph_ret_entry ret;
29 int failed;
30 int cpu;
20}; 31};
21 32
22#define TRACE_GRAPH_INDENT 2 33#define TRACE_GRAPH_INDENT 2
@@ -176,7 +187,7 @@ static int __trace_graph_entry(struct trace_array *tr,
176 struct ring_buffer *buffer = tr->buffer; 187 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry; 188 struct ftrace_graph_ent_entry *entry;
178 189
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
180 return 0; 191 return 0;
181 192
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -240,7 +251,7 @@ static void __trace_graph_return(struct trace_array *tr,
240 struct ring_buffer *buffer = tr->buffer; 251 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry; 252 struct ftrace_graph_ret_entry *entry;
242 253
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
244 return; 255 return;
245 256
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
384 if (!data) 395 if (!data)
385 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
386 397
387 last_pid = &(per_cpu_ptr(data, cpu)->last_pid); 398 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
388 399
389 if (*last_pid == pid) 400 if (*last_pid == pid)
390 return TRACE_TYPE_HANDLED; 401 return TRACE_TYPE_HANDLED;
@@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry *
435get_return_for_leaf(struct trace_iterator *iter, 446get_return_for_leaf(struct trace_iterator *iter,
436 struct ftrace_graph_ent_entry *curr) 447 struct ftrace_graph_ent_entry *curr)
437{ 448{
438 struct ring_buffer_iter *ring_iter; 449 struct fgraph_data *data = iter->private;
450 struct ring_buffer_iter *ring_iter = NULL;
439 struct ring_buffer_event *event; 451 struct ring_buffer_event *event;
440 struct ftrace_graph_ret_entry *next; 452 struct ftrace_graph_ret_entry *next;
441 453
442 ring_iter = iter->buffer_iter[iter->cpu]; 454 /*
455 * If the previous output failed to write to the seq buffer,
456 * then we just reuse the data from before.
457 */
458 if (data && data->failed) {
459 curr = &data->ent;
460 next = &data->ret;
461 } else {
443 462
444 /* First peek to compare current entry and the next one */ 463 ring_iter = iter->buffer_iter[iter->cpu];
445 if (ring_iter) 464
446 event = ring_buffer_iter_peek(ring_iter, NULL); 465 /* First peek to compare current entry and the next one */
447 else { 466 if (ring_iter)
448 /* We need to consume the current entry to see the next one */ 467 event = ring_buffer_iter_peek(ring_iter, NULL);
449 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 468 else {
450 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 469 /*
451 NULL); 470 * We need to consume the current entry to see
452 } 471 * the next one.
472 */
473 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
474 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
475 NULL);
476 }
453 477
454 if (!event) 478 if (!event)
455 return NULL; 479 return NULL;
480
481 next = ring_buffer_event_data(event);
456 482
457 next = ring_buffer_event_data(event); 483 if (data) {
484 /*
485 * Save current and next entries for later reference
486 * if the output fails.
487 */
488 data->ent = *curr;
489 data->ret = *next;
490 }
491 }
458 492
459 if (next->ent.type != TRACE_GRAPH_RET) 493 if (next->ent.type != TRACE_GRAPH_RET)
460 return NULL; 494 return NULL;
@@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
640 674
641 if (data) { 675 if (data) {
642 int cpu = iter->cpu; 676 int cpu = iter->cpu;
643 int *depth = &(per_cpu_ptr(data, cpu)->depth); 677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
644 678
645 /* 679 /*
646 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
@@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
688 722
689 if (data) { 723 if (data) {
690 int cpu = iter->cpu; 724 int cpu = iter->cpu;
691 int *depth = &(per_cpu_ptr(data, cpu)->depth); 725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
692 726
693 *depth = call->depth; 727 *depth = call->depth;
694 } 728 }
@@ -782,19 +816,34 @@ static enum print_line_t
782print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 816print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
783 struct trace_iterator *iter) 817 struct trace_iterator *iter)
784{ 818{
785 int cpu = iter->cpu; 819 struct fgraph_data *data = iter->private;
786 struct ftrace_graph_ent *call = &field->graph_ent; 820 struct ftrace_graph_ent *call = &field->graph_ent;
787 struct ftrace_graph_ret_entry *leaf_ret; 821 struct ftrace_graph_ret_entry *leaf_ret;
822 static enum print_line_t ret;
823 int cpu = iter->cpu;
788 824
789 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 825 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
790 return TRACE_TYPE_PARTIAL_LINE; 826 return TRACE_TYPE_PARTIAL_LINE;
791 827
792 leaf_ret = get_return_for_leaf(iter, field); 828 leaf_ret = get_return_for_leaf(iter, field);
793 if (leaf_ret) 829 if (leaf_ret)
794 return print_graph_entry_leaf(iter, field, leaf_ret, s); 830 ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
795 else 831 else
796 return print_graph_entry_nested(iter, field, s, cpu); 832 ret = print_graph_entry_nested(iter, field, s, cpu);
797 833
834 if (data) {
835 /*
836 * If we failed to write our output, then we need to make
837 * note of it. Because we already consumed our entry.
838 */
839 if (s->full) {
840 data->failed = 1;
841 data->cpu = cpu;
842 } else
843 data->failed = 0;
844 }
845
846 return ret;
798} 847}
799 848
800static enum print_line_t 849static enum print_line_t
@@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
810 859
811 if (data) { 860 if (data) {
812 int cpu = iter->cpu; 861 int cpu = iter->cpu;
813 int *depth = &(per_cpu_ptr(data, cpu)->depth); 862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
814 863
815 /* 864 /*
816 * Comments display at + 1 to depth. This is the 865 * Comments display at + 1 to depth. This is the
@@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
873 int i; 922 int i;
874 923
875 if (data) 924 if (data)
876 depth = per_cpu_ptr(data, iter->cpu)->depth; 925 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
877 926
878 if (print_graph_prologue(iter, s, 0, 0)) 927 if (print_graph_prologue(iter, s, 0, 0))
879 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
941enum print_line_t 990enum print_line_t
942print_graph_function(struct trace_iterator *iter) 991print_graph_function(struct trace_iterator *iter)
943{ 992{
993 struct ftrace_graph_ent_entry *field;
994 struct fgraph_data *data = iter->private;
944 struct trace_entry *entry = iter->ent; 995 struct trace_entry *entry = iter->ent;
945 struct trace_seq *s = &iter->seq; 996 struct trace_seq *s = &iter->seq;
997 int cpu = iter->cpu;
998 int ret;
999
1000 if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
1001 per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
1002 return TRACE_TYPE_HANDLED;
1003 }
1004
1005 /*
1006 * If the last output failed, there's a possibility we need
1007 * to print out the missing entry which would never go out.
1008 */
1009 if (data && data->failed) {
1010 field = &data->ent;
1011 iter->cpu = data->cpu;
1012 ret = print_graph_entry(field, s, iter);
1013 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1014 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1015 ret = TRACE_TYPE_NO_CONSUME;
1016 }
1017 iter->cpu = cpu;
1018 return ret;
1019 }
946 1020
947 switch (entry->type) { 1021 switch (entry->type) {
948 case TRACE_GRAPH_ENT: { 1022 case TRACE_GRAPH_ENT: {
@@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter)
952 * sizeof(struct ftrace_graph_ent_entry) is very small, 1026 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * it can be safely saved at the stack. 1027 * it can be safely saved at the stack.
954 */ 1028 */
955 struct ftrace_graph_ent_entry *field, saved; 1029 struct ftrace_graph_ent_entry saved;
956 trace_assign_type(field, entry); 1030 trace_assign_type(field, entry);
957 saved = *field; 1031 saved = *field;
958 return print_graph_entry(&saved, s, iter); 1032 return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s)
1030static void graph_trace_open(struct trace_iterator *iter) 1104static void graph_trace_open(struct trace_iterator *iter)
1031{ 1105{
1032 /* pid and depth on the last trace processed */ 1106 /* pid and depth on the last trace processed */
1033 struct fgraph_data *data = alloc_percpu(struct fgraph_data); 1107 struct fgraph_data *data;
1034 int cpu; 1108 int cpu;
1035 1109
1110 iter->private = NULL;
1111
1112 data = kzalloc(sizeof(*data), GFP_KERNEL);
1036 if (!data) 1113 if (!data)
1037 pr_warning("function graph tracer: not enough memory\n"); 1114 goto out_err;
1038 else 1115
1039 for_each_possible_cpu(cpu) { 1116 data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
1040 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); 1117 if (!data->cpu_data)
1041 int *depth = &(per_cpu_ptr(data, cpu)->depth); 1118 goto out_err_free;
1042 *pid = -1; 1119
1043 *depth = 0; 1120 for_each_possible_cpu(cpu) {
1044 } 1121 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1122 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1123 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1124 *pid = -1;
1125 *depth = 0;
1126 *ignore = 0;
1127 }
1045 1128
1046 iter->private = data; 1129 iter->private = data;
1130
1131 return;
1132
1133 out_err_free:
1134 kfree(data);
1135 out_err:
1136 pr_warning("function graph tracer: not enough memory\n");
1047} 1137}
1048 1138
1049static void graph_trace_close(struct trace_iterator *iter) 1139static void graph_trace_close(struct trace_iterator *iter)
1050{ 1140{
1051 free_percpu(iter->private); 1141 struct fgraph_data *data = iter->private;
1142
1143 if (data) {
1144 free_percpu(data->cpu_data);
1145 kfree(data);
1146 }
1052} 1147}
1053 1148
1054static struct tracer graph_trace __read_mostly = { 1149static struct tracer graph_trace __read_mostly = {
1055 .name = "function_graph", 1150 .name = "function_graph",
1056 .open = graph_trace_open, 1151 .open = graph_trace_open,
1152 .pipe_open = graph_trace_open,
1057 .close = graph_trace_close, 1153 .close = graph_trace_close,
1154 .pipe_close = graph_trace_close,
1058 .wait_pipe = poll_wait_pipe, 1155 .wait_pipe = poll_wait_pipe,
1059 .init = graph_trace_init, 1156 .init = graph_trace_init,
1060 .reset = graph_trace_reset, 1157 .reset = graph_trace_reset,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 69543a905cd..7b97000745f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -20,10 +20,10 @@
20 20
21#define BTS_BUFFER_SIZE (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
22 22
23static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25 25
26#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27 27
28static int trace_hw_branches_enabled __read_mostly; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly; 29static int trace_hw_branches_suspended __read_mostly;
@@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly;
32 32
33static void bts_trace_init_cpu(int cpu) 33static void bts_trace_init_cpu(int cpu)
34{ 34{
35 per_cpu(tracer, cpu) = 35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, 36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 NULL, (size_t)-1, BTS_KERNEL); 37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
38 39
39 if (IS_ERR(per_cpu(tracer, cpu))) 40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
40 per_cpu(tracer, cpu) = NULL; 41 per_cpu(hwb_tracer, cpu) = NULL;
41} 42}
42 43
43static int bts_trace_init(struct trace_array *tr) 44static int bts_trace_init(struct trace_array *tr)
@@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr)
51 for_each_online_cpu(cpu) { 52 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu); 53 bts_trace_init_cpu(cpu);
53 54
54 if (likely(per_cpu(tracer, cpu))) 55 if (likely(per_cpu(hwb_tracer, cpu)))
55 trace_hw_branches_enabled = 1; 56 trace_hw_branches_enabled = 1;
56 } 57 }
57 trace_hw_branches_suspended = 0; 58 trace_hw_branches_suspended = 0;
@@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr)
67 68
68 get_online_cpus(); 69 get_online_cpus();
69 for_each_online_cpu(cpu) { 70 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) { 71 if (likely(per_cpu(hwb_tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu)); 72 ds_release_bts(per_cpu(hwb_tracer, cpu));
72 per_cpu(tracer, cpu) = NULL; 73 per_cpu(hwb_tracer, cpu) = NULL;
73 } 74 }
74 } 75 }
75 trace_hw_branches_enabled = 0; 76 trace_hw_branches_enabled = 0;
@@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr)
83 84
84 get_online_cpus(); 85 get_online_cpus();
85 for_each_online_cpu(cpu) 86 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu))) 87 if (likely(per_cpu(hwb_tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu)); 88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
88 trace_hw_branches_suspended = 0; 89 trace_hw_branches_suspended = 0;
89 put_online_cpus(); 90 put_online_cpus();
90} 91}
@@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr)
95 96
96 get_online_cpus(); 97 get_online_cpus();
97 for_each_online_cpu(cpu) 98 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu))) 99 if (likely(per_cpu(hwb_tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu)); 100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
100 trace_hw_branches_suspended = 1; 101 trace_hw_branches_suspended = 1;
101 put_online_cpus(); 102 put_online_cpus();
102} 103}
@@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
114 bts_trace_init_cpu(cpu); 115 bts_trace_init_cpu(cpu);
115 116
116 if (trace_hw_branches_suspended && 117 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu))) 118 likely(per_cpu(hwb_tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu)); 119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
119 } 120 }
120 break; 121 break;
121 122
122 case CPU_DOWN_PREPARE: 123 case CPU_DOWN_PREPARE:
123 /* The notification is sent with interrupts enabled. */ 124 /* The notification is sent with interrupts enabled. */
124 if (likely(per_cpu(tracer, cpu))) { 125 if (likely(per_cpu(hwb_tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu)); 126 ds_release_bts(per_cpu(hwb_tracer, cpu));
126 per_cpu(tracer, cpu) = NULL; 127 per_cpu(hwb_tracer, cpu) = NULL;
127 } 128 }
128 } 129 }
129 130
@@ -258,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
258 259
259 get_online_cpus(); 260 get_online_cpus();
260 for_each_online_cpu(cpu) 261 for_each_online_cpu(cpu)
261 if (likely(per_cpu(tracer, cpu))) 262 if (likely(per_cpu(hwb_tracer, cpu)))
262 ds_suspend_bts(per_cpu(tracer, cpu)); 263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
263 /* 264 /*
264 * We need to collect the trace on the respective cpu since ftrace 265 * We need to collect the trace on the respective cpu since ftrace
265 * implicitly adds the record for the current cpu. 266 * implicitly adds the record for the current cpu.
@@ -268,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
268 on_each_cpu(trace_bts_cpu, iter->tr, 1); 269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
269 270
270 for_each_online_cpu(cpu) 271 for_each_online_cpu(cpu)
271 if (likely(per_cpu(tracer, cpu))) 272 if (likely(per_cpu(hwb_tracer, cpu)))
272 ds_resume_bts(per_cpu(tracer, cpu)); 273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
273 put_online_cpus(); 274 put_online_cpus();
274} 275}
275 276
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 3aa7eaa2114..2974bc7538c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr,
151 goto out_unlock; 151 goto out_unlock;
152 152
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc);
154 156
155 if (data->critical_sequence != max_sequence) 157 if (data->critical_sequence != max_sequence)
156 goto out_unlock; 158 goto out_unlock;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 00000000000..50b1b823980
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1553 @@
1/*
2 * Kprobes-based tracing events
3 *
4 * Created by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32
33#include "trace.h"
34#include "trace_output.h"
35
36#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64
39#define KPROBE_EVENT_SYSTEM "kprobes"
40
41/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func"
46
47const char *reserved_field_names[] = {
48 "common_type",
49 "common_flags",
50 "common_preempt_count",
51 "common_pid",
52 "common_tgid",
53 "common_lock_depth",
54 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC,
58};
59
/* One value fetcher: a handler plus the opaque cookie handed back to it. */
struct fetch_func {
	/* handler; receives the probed registers and @data */
	unsigned long (*func)(struct pt_regs *, void *);
	/* handler-specific payload (index, address, or allocated state) */
	void *data;
};
64
65static __kprobes unsigned long call_fetch(struct fetch_func *f,
66 struct pt_regs *regs)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{
81 return regs_get_kernel_stack_nth(regs,
82 (unsigned int)((unsigned long)num));
83}
84
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
86{
87 unsigned long retval;
88
89 if (probe_kernel_address(addr, retval))
90 return 0;
91 return retval;
92}
93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy)
101{
102 return regs_return_value(regs);
103}
104
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
106 void *dummy)
107{
108 return kernel_stack_pointer(regs);
109}
110
111/* Memory fetching by symbol */
/* Cached resolution of "symbol + offset" to an address. */
struct symbol_cache {
	char *symbol;		/* kstrdup()ed symbol name */
	long offset;		/* added to the resolved symbol address */
	unsigned long addr;	/* resolved address, 0 if lookup failed */
};
117
118static unsigned long update_symbol_cache(struct symbol_cache *sc)
119{
120 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
121 if (sc->addr)
122 sc->addr += sc->offset;
123 return sc->addr;
124}
125
126static void free_symbol_cache(struct symbol_cache *sc)
127{
128 kfree(sc->symbol);
129 kfree(sc);
130}
131
132static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
133{
134 struct symbol_cache *sc;
135
136 if (!sym || strlen(sym) == 0)
137 return NULL;
138 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
139 if (!sc)
140 return NULL;
141
142 sc->symbol = kstrdup(sym, GFP_KERNEL);
143 if (!sc->symbol) {
144 kfree(sc);
145 return NULL;
146 }
147 sc->offset = offset;
148
149 update_symbol_cache(sc);
150 return sc;
151}
152
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
154{
155 struct symbol_cache *sc = data;
156
157 if (sc->addr)
158 return fetch_memory(regs, (void *)sc->addr);
159 else
160 return 0;
161}
162
163/* Special indirect memory access interface */
164struct indirect_fetch_data {
165 struct fetch_func orig;
166 long offset;
167};
168
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
170{
171 struct indirect_fetch_data *ind = data;
172 unsigned long addr;
173
174 addr = call_fetch(&ind->orig, regs);
175 if (addr) {
176 addr += ind->offset;
177 return fetch_memory(regs, (void *)addr);
178 } else
179 return 0;
180}
181
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
183{
184 if (data->orig.func == fetch_indirect)
185 free_indirect_fetch_data(data->orig.data);
186 else if (data->orig.func == fetch_symbol)
187 free_symbol_cache(data->orig.data);
188 kfree(data);
189}
190
191/**
192 * Kprobe event core functions
193 */
194
195struct probe_arg {
196 struct fetch_func fetch;
197 const char *name;
198};
199
200/* Flags for trace_probe */
201#define TP_FLAG_TRACE 1
202#define TP_FLAG_PROFILE 2
203
204struct trace_probe {
205 struct list_head list;
206 struct kretprobe rp; /* Use rp.kp for kprobe use */
207 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */
210 struct ftrace_event_call call;
211 struct trace_event event;
212 unsigned int nr_args;
213 struct probe_arg args[];
214};
215
216#define SIZEOF_TRACE_PROBE(n) \
217 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n)))
219
220static __kprobes int probe_is_return(struct trace_probe *tp)
221{
222 return tp->rp.handler != NULL;
223}
224
225static __kprobes const char *probe_symbol(struct trace_probe *tp)
226{
227 return tp->symbol ? tp->symbol : "unknown";
228}
229
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
251 } else if (ff->func == fetch_retvalue)
252 ret = snprintf(buf, n, "$retval");
253 else if (ff->func == fetch_stack_address)
254 ret = snprintf(buf, n, "$stack");
255 else if (ff->func == fetch_indirect) {
256 struct indirect_fetch_data *id = ff->data;
257 size_t l = 0;
258 ret = snprintf(buf, n, "%+ld(", id->offset);
259 if (ret >= n)
260 goto end;
261 l += ret;
262 ret = probe_arg_string(buf + l, n - l, &id->orig);
263 if (ret < 0)
264 goto end;
265 l += ret;
266 ret = snprintf(buf + l, n - l, ")");
267 ret += l;
268 }
269end:
270 if (ret >= n)
271 return -ENOSPC;
272 return ret;
273}
274
275static int register_probe_event(struct trace_probe *tp);
276static void unregister_probe_event(struct trace_probe *tp);
277
278static DEFINE_MUTEX(probe_lock);
279static LIST_HEAD(probe_list);
280
281static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs);
284
/*
 * Validate an event or group name: it must start with a letter or '_'
 * and continue with letters, digits or '_' only.
 * Returns 1 if the name is acceptable, 0 otherwise (including "").
 */
static int check_event_name(const char *name)
{
	const char *p = name;

	/* leading character: no digits allowed */
	if (*p != '_' && !isalpha(*p))
		return 0;

	for (p++; *p != '\0'; p++) {
		if (*p == '_' || isalpha(*p) || isdigit(*p))
			continue;
		return 0;
	}
	return 1;
}
296
297/*
298 * Allocate new trace_probe and initialize it (including kprobes).
299 */
300static struct trace_probe *alloc_trace_probe(const char *group,
301 const char *event,
302 void *addr,
303 const char *symbol,
304 unsigned long offs,
305 int nargs, int is_return)
306{
307 struct trace_probe *tp;
308 int ret = -ENOMEM;
309
310 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
311 if (!tp)
312 return ERR_PTR(ret);
313
314 if (symbol) {
315 tp->symbol = kstrdup(symbol, GFP_KERNEL);
316 if (!tp->symbol)
317 goto error;
318 tp->rp.kp.symbol_name = tp->symbol;
319 tp->rp.kp.offset = offs;
320 } else
321 tp->rp.kp.addr = addr;
322
323 if (is_return)
324 tp->rp.handler = kretprobe_dispatcher;
325 else
326 tp->rp.kp.pre_handler = kprobe_dispatcher;
327
328 if (!event || !check_event_name(event)) {
329 ret = -EINVAL;
330 goto error;
331 }
332
333 tp->call.name = kstrdup(event, GFP_KERNEL);
334 if (!tp->call.name)
335 goto error;
336
337 if (!group || !check_event_name(group)) {
338 ret = -EINVAL;
339 goto error;
340 }
341
342 tp->call.system = kstrdup(group, GFP_KERNEL);
343 if (!tp->call.system)
344 goto error;
345
346 INIT_LIST_HEAD(&tp->list);
347 return tp;
348error:
349 kfree(tp->call.name);
350 kfree(tp->symbol);
351 kfree(tp);
352 return ERR_PTR(ret);
353}
354
355static void free_probe_arg(struct probe_arg *arg)
356{
357 if (arg->fetch.func == fetch_symbol)
358 free_symbol_cache(arg->fetch.data);
359 else if (arg->fetch.func == fetch_indirect)
360 free_indirect_fetch_data(arg->fetch.data);
361 kfree(arg->name);
362}
363
364static void free_trace_probe(struct trace_probe *tp)
365{
366 int i;
367
368 for (i = 0; i < tp->nr_args; i++)
369 free_probe_arg(&tp->args[i]);
370
371 kfree(tp->call.system);
372 kfree(tp->call.name);
373 kfree(tp->symbol);
374 kfree(tp);
375}
376
377static struct trace_probe *find_probe_event(const char *event,
378 const char *group)
379{
380 struct trace_probe *tp;
381
382 list_for_each_entry(tp, &probe_list, list)
383 if (strcmp(tp->call.name, event) == 0 &&
384 strcmp(tp->call.system, group) == 0)
385 return tp;
386 return NULL;
387}
388
389/* Unregister a trace_probe and probe_event: call with locking probe_lock */
390static void unregister_trace_probe(struct trace_probe *tp)
391{
392 if (probe_is_return(tp))
393 unregister_kretprobe(&tp->rp);
394 else
395 unregister_kprobe(&tp->rp.kp);
396 list_del(&tp->list);
397 unregister_probe_event(tp);
398}
399
400/* Register a trace_probe and probe_event */
401static int register_trace_probe(struct trace_probe *tp)
402{
403 struct trace_probe *old_tp;
404 int ret;
405
406 mutex_lock(&probe_lock);
407
408 /* register as an event */
409 old_tp = find_probe_event(tp->call.name, tp->call.system);
410 if (old_tp) {
411 /* delete old event */
412 unregister_trace_probe(old_tp);
413 free_trace_probe(old_tp);
414 }
415 ret = register_probe_event(tp);
416 if (ret) {
417 pr_warning("Faild to register probe event(%d)\n", ret);
418 goto end;
419 }
420
421 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
422 if (probe_is_return(tp))
423 ret = register_kretprobe(&tp->rp);
424 else
425 ret = register_kprobe(&tp->rp.kp);
426
427 if (ret) {
428 pr_warning("Could not insert probe(%d)\n", ret);
429 if (ret == -EILSEQ) {
430 pr_warning("Probing address(0x%p) is not an "
431 "instruction boundary.\n",
432 tp->rp.kp.addr);
433 ret = -EINVAL;
434 }
435 unregister_probe_event(tp);
436 } else
437 list_add_tail(&tp->list, &probe_list);
438end:
439 mutex_unlock(&probe_lock);
440 return ret;
441}
442
443/* Split symbol and offset. */
444static int split_symbol_offset(char *symbol, unsigned long *offset)
445{
446 char *tmp;
447 int ret;
448
449 if (!offset)
450 return -EINVAL;
451
452 tmp = strchr(symbol, '+');
453 if (tmp) {
454 /* skip sign because strict_strtol doesn't accept '+' */
455 ret = strict_strtoul(tmp + 1, 0, offset);
456 if (ret)
457 return ret;
458 *tmp = '\0';
459 } else
460 *offset = 0;
461 return 0;
462}
463
464#define PARAM_MAX_ARGS 16
465#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
466
467static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
468{
469 int ret = 0;
470 unsigned long param;
471
472 if (strcmp(arg, "retval") == 0) {
473 if (is_return) {
474 ff->func = fetch_retvalue;
475 ff->data = NULL;
476 } else
477 ret = -EINVAL;
478 } else if (strncmp(arg, "stack", 5) == 0) {
479 if (arg[5] == '\0') {
480 ff->func = fetch_stack_address;
481 ff->data = NULL;
482 } else if (isdigit(arg[5])) {
483 ret = strict_strtoul(arg + 5, 10, &param);
484 if (ret || param > PARAM_MAX_STACK)
485 ret = -EINVAL;
486 else {
487 ff->func = fetch_stack;
488 ff->data = (void *)param;
489 }
490 } else
491 ret = -EINVAL;
492 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
493 ret = strict_strtoul(arg + 3, 10, &param);
494 if (ret || param > PARAM_MAX_ARGS)
495 ret = -EINVAL;
496 else {
497 ff->func = fetch_argument;
498 ff->data = (void *)param;
499 }
500 } else
501 ret = -EINVAL;
502 return ret;
503}
504
505/* Recursive argument parser */
506static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
507{
508 int ret = 0;
509 unsigned long param;
510 long offset;
511 char *tmp;
512
513 switch (arg[0]) {
514 case '$':
515 ret = parse_probe_vars(arg + 1, ff, is_return);
516 break;
517 case '%': /* named register */
518 ret = regs_query_register_offset(arg + 1);
519 if (ret >= 0) {
520 ff->func = fetch_register;
521 ff->data = (void *)(unsigned long)ret;
522 ret = 0;
523 }
524 break;
525 case '@': /* memory or symbol */
526 if (isdigit(arg[1])) {
527 ret = strict_strtoul(arg + 1, 0, &param);
528 if (ret)
529 break;
530 ff->func = fetch_memory;
531 ff->data = (void *)param;
532 } else {
533 ret = split_symbol_offset(arg + 1, &offset);
534 if (ret)
535 break;
536 ff->data = alloc_symbol_cache(arg + 1, offset);
537 if (ff->data)
538 ff->func = fetch_symbol;
539 else
540 ret = -EINVAL;
541 }
542 break;
543 case '+': /* indirect memory */
544 case '-':
545 tmp = strchr(arg, '(');
546 if (!tmp) {
547 ret = -EINVAL;
548 break;
549 }
550 *tmp = '\0';
551 ret = strict_strtol(arg + 1, 0, &offset);
552 if (ret)
553 break;
554 if (arg[0] == '-')
555 offset = -offset;
556 arg = tmp + 1;
557 tmp = strrchr(arg, ')');
558 if (tmp) {
559 struct indirect_fetch_data *id;
560 *tmp = '\0';
561 id = kzalloc(sizeof(struct indirect_fetch_data),
562 GFP_KERNEL);
563 if (!id)
564 return -ENOMEM;
565 id->offset = offset;
566 ret = __parse_probe_arg(arg, &id->orig, is_return);
567 if (ret)
568 kfree(id);
569 else {
570 ff->func = fetch_indirect;
571 ff->data = (void *)id;
572 }
573 } else
574 ret = -EINVAL;
575 break;
576 default:
577 /* TODO: support custom handler */
578 ret = -EINVAL;
579 }
580 return ret;
581}
582
583/* String length checking wrapper */
584static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
585{
586 if (strlen(arg) > MAX_ARGSTR_LEN) {
587 pr_info("Argument is too long.: %s\n", arg);
588 return -ENOSPC;
589 }
590 return __parse_probe_arg(arg, ff, is_return);
591}
592
593/* Return 1 if name is reserved or already used by another argument */
594static int conflict_field_name(const char *name,
595 struct probe_arg *args, int narg)
596{
597 int i;
598 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
599 if (strcmp(reserved_field_names[i], name) == 0)
600 return 1;
601 for (i = 0; i < narg; i++)
602 if (strcmp(args[i].name, name) == 0)
603 return 1;
604 return 0;
605}
606
607static int create_trace_probe(int argc, char **argv)
608{
609 /*
610 * Argument syntax:
611 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
612 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
613 * Fetch args:
614 * $argN : fetch Nth of function argument. (N:0-)
615 * $retval : fetch return value
616 * $stack : fetch stack address
617 * $stackN : fetch Nth of stack (N:0-)
618 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
619 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
620 * %REG : fetch register REG
621 * Indirect memory fetch:
622 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
623 * Alias name of args:
624 * NAME=FETCHARG : set NAME as alias of FETCHARG.
625 */
626 struct trace_probe *tp;
627 int i, ret = 0;
628 int is_return = 0, is_delete = 0;
629 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
630 unsigned long offset = 0;
631 void *addr = NULL;
632 char buf[MAX_EVENT_NAME_LEN];
633
634 /* argc must be >= 1 */
635 if (argv[0][0] == 'p')
636 is_return = 0;
637 else if (argv[0][0] == 'r')
638 is_return = 1;
639 else if (argv[0][0] == '-')
640 is_delete = 1;
641 else {
642 pr_info("Probe definition must be started with 'p', 'r' or"
643 " '-'.\n");
644 return -EINVAL;
645 }
646
647 if (argv[0][1] == ':') {
648 event = &argv[0][2];
649 if (strchr(event, '/')) {
650 group = event;
651 event = strchr(group, '/') + 1;
652 event[-1] = '\0';
653 if (strlen(group) == 0) {
654 pr_info("Group name is not specifiled\n");
655 return -EINVAL;
656 }
657 }
658 if (strlen(event) == 0) {
659 pr_info("Event name is not specifiled\n");
660 return -EINVAL;
661 }
662 }
663 if (!group)
664 group = KPROBE_EVENT_SYSTEM;
665
666 if (is_delete) {
667 if (!event) {
668 pr_info("Delete command needs an event name.\n");
669 return -EINVAL;
670 }
671 tp = find_probe_event(event, group);
672 if (!tp) {
673 pr_info("Event %s/%s doesn't exist.\n", group, event);
674 return -ENOENT;
675 }
676 /* delete an event */
677 unregister_trace_probe(tp);
678 free_trace_probe(tp);
679 return 0;
680 }
681
682 if (argc < 2) {
683 pr_info("Probe point is not specified.\n");
684 return -EINVAL;
685 }
686 if (isdigit(argv[1][0])) {
687 if (is_return) {
688 pr_info("Return probe point must be a symbol.\n");
689 return -EINVAL;
690 }
691 /* an address specified */
692 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
693 if (ret) {
694 pr_info("Failed to parse address.\n");
695 return ret;
696 }
697 } else {
698 /* a symbol specified */
699 symbol = argv[1];
700 /* TODO: support .init module functions */
701 ret = split_symbol_offset(symbol, &offset);
702 if (ret) {
703 pr_info("Failed to parse symbol.\n");
704 return ret;
705 }
706 if (offset && is_return) {
707 pr_info("Return probe must be used without offset.\n");
708 return -EINVAL;
709 }
710 }
711 argc -= 2; argv += 2;
712
713 /* setup a probe */
714 if (!event) {
715 /* Make a new event name */
716 if (symbol)
717 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
718 is_return ? 'r' : 'p', symbol, offset);
719 else
720 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
721 is_return ? 'r' : 'p', addr);
722 event = buf;
723 }
724 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
725 is_return);
726 if (IS_ERR(tp)) {
727 pr_info("Failed to allocate trace_probe.(%d)\n",
728 (int)PTR_ERR(tp));
729 return PTR_ERR(tp);
730 }
731
732 /* parse arguments */
733 ret = 0;
734 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
735 /* Parse argument name */
736 arg = strchr(argv[i], '=');
737 if (arg)
738 *arg++ = '\0';
739 else
740 arg = argv[i];
741
742 if (conflict_field_name(argv[i], tp->args, i)) {
743 pr_info("Argument%d name '%s' conflicts with "
744 "another field.\n", i, argv[i]);
745 ret = -EINVAL;
746 goto error;
747 }
748
749 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
750 if (!tp->args[i].name) {
751 pr_info("Failed to allocate argument%d name '%s'.\n",
752 i, argv[i]);
753 ret = -ENOMEM;
754 goto error;
755 }
756
757 /* Parse fetch argument */
758 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
759 if (ret) {
760 pr_info("Parse error at argument%d. (%d)\n", i, ret);
761 kfree(tp->args[i].name);
762 goto error;
763 }
764
765 tp->nr_args++;
766 }
767
768 ret = register_trace_probe(tp);
769 if (ret)
770 goto error;
771 return 0;
772
773error:
774 free_trace_probe(tp);
775 return ret;
776}
777
778static void cleanup_all_probes(void)
779{
780 struct trace_probe *tp;
781
782 mutex_lock(&probe_lock);
783 /* TODO: Use batch unregistration */
784 while (!list_empty(&probe_list)) {
785 tp = list_entry(probe_list.next, struct trace_probe, list);
786 unregister_trace_probe(tp);
787 free_trace_probe(tp);
788 }
789 mutex_unlock(&probe_lock);
790}
791
792
793/* Probes listing interfaces */
794static void *probes_seq_start(struct seq_file *m, loff_t *pos)
795{
796 mutex_lock(&probe_lock);
797 return seq_list_start(&probe_list, *pos);
798}
799
800static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
801{
802 return seq_list_next(v, &probe_list, pos);
803}
804
805static void probes_seq_stop(struct seq_file *m, void *v)
806{
807 mutex_unlock(&probe_lock);
808}
809
810static int probes_seq_show(struct seq_file *m, void *v)
811{
812 struct trace_probe *tp = v;
813 int i, ret;
814 char buf[MAX_ARGSTR_LEN + 1];
815
816 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
817 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
818
819 if (!tp->symbol)
820 seq_printf(m, " 0x%p", tp->rp.kp.addr);
821 else if (tp->rp.kp.offset)
822 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
823 else
824 seq_printf(m, " %s", probe_symbol(tp));
825
826 for (i = 0; i < tp->nr_args; i++) {
827 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
828 if (ret < 0) {
829 pr_warning("Argument%d decoding error(%d).\n", i, ret);
830 return ret;
831 }
832 seq_printf(m, " %s=%s", tp->args[i].name, buf);
833 }
834 seq_printf(m, "\n");
835 return 0;
836}
837
838static const struct seq_operations probes_seq_op = {
839 .start = probes_seq_start,
840 .next = probes_seq_next,
841 .stop = probes_seq_stop,
842 .show = probes_seq_show
843};
844
845static int probes_open(struct inode *inode, struct file *file)
846{
847 if ((file->f_mode & FMODE_WRITE) &&
848 (file->f_flags & O_TRUNC))
849 cleanup_all_probes();
850
851 return seq_open(file, &probes_seq_op);
852}
853
854static int command_trace_probe(const char *buf)
855{
856 char **argv;
857 int argc = 0, ret = 0;
858
859 argv = argv_split(GFP_KERNEL, buf, &argc);
860 if (!argv)
861 return -ENOMEM;
862
863 if (argc)
864 ret = create_trace_probe(argc, argv);
865
866 argv_free(argv);
867 return ret;
868}
869
870#define WRITE_BUFSIZE 128
871
872static ssize_t probes_write(struct file *file, const char __user *buffer,
873 size_t count, loff_t *ppos)
874{
875 char *kbuf, *tmp;
876 int ret;
877 size_t done;
878 size_t size;
879
880 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
881 if (!kbuf)
882 return -ENOMEM;
883
884 ret = done = 0;
885 while (done < count) {
886 size = count - done;
887 if (size >= WRITE_BUFSIZE)
888 size = WRITE_BUFSIZE - 1;
889 if (copy_from_user(kbuf, buffer + done, size)) {
890 ret = -EFAULT;
891 goto out;
892 }
893 kbuf[size] = '\0';
894 tmp = strchr(kbuf, '\n');
895 if (tmp) {
896 *tmp = '\0';
897 size = tmp - kbuf + 1;
898 } else if (done + size < count) {
899 pr_warning("Line length is too long: "
900 "Should be less than %d.", WRITE_BUFSIZE);
901 ret = -EINVAL;
902 goto out;
903 }
904 done += size;
905 /* Remove comments */
906 tmp = strchr(kbuf, '#');
907 if (tmp)
908 *tmp = '\0';
909
910 ret = command_trace_probe(kbuf);
911 if (ret)
912 goto out;
913 }
914 ret = done;
915out:
916 kfree(kbuf);
917 return ret;
918}
919
920static const struct file_operations kprobe_events_ops = {
921 .owner = THIS_MODULE,
922 .open = probes_open,
923 .read = seq_read,
924 .llseek = seq_lseek,
925 .release = seq_release,
926 .write = probes_write,
927};
928
929/* Probes profiling interfaces */
930static int probes_profile_seq_show(struct seq_file *m, void *v)
931{
932 struct trace_probe *tp = v;
933
934 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
935 tp->rp.kp.nmissed);
936
937 return 0;
938}
939
940static const struct seq_operations profile_seq_op = {
941 .start = probes_seq_start,
942 .next = probes_seq_next,
943 .stop = probes_seq_stop,
944 .show = probes_profile_seq_show
945};
946
947static int profile_open(struct inode *inode, struct file *file)
948{
949 return seq_open(file, &profile_seq_op);
950}
951
952static const struct file_operations kprobe_profile_ops = {
953 .owner = THIS_MODULE,
954 .open = profile_open,
955 .read = seq_read,
956 .llseek = seq_lseek,
957 .release = seq_release,
958};
959
960/* Kprobe handler */
961static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
962{
963 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
964 struct kprobe_trace_entry *entry;
965 struct ring_buffer_event *event;
966 struct ring_buffer *buffer;
967 int size, i, pc;
968 unsigned long irq_flags;
969 struct ftrace_event_call *call = &tp->call;
970
971 tp->nhit++;
972
973 local_save_flags(irq_flags);
974 pc = preempt_count();
975
976 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
977
978 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
979 irq_flags, pc);
980 if (!event)
981 return 0;
982
983 entry = ring_buffer_event_data(event);
984 entry->nargs = tp->nr_args;
985 entry->ip = (unsigned long)kp->addr;
986 for (i = 0; i < tp->nr_args; i++)
987 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
988
989 if (!filter_current_check_discard(buffer, call, entry, event))
990 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
991 return 0;
992}
993
994/* Kretprobe handler */
995static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
996 struct pt_regs *regs)
997{
998 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
999 struct kretprobe_trace_entry *entry;
1000 struct ring_buffer_event *event;
1001 struct ring_buffer *buffer;
1002 int size, i, pc;
1003 unsigned long irq_flags;
1004 struct ftrace_event_call *call = &tp->call;
1005
1006 local_save_flags(irq_flags);
1007 pc = preempt_count();
1008
1009 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1010
1011 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
1012 irq_flags, pc);
1013 if (!event)
1014 return 0;
1015
1016 entry = ring_buffer_event_data(event);
1017 entry->nargs = tp->nr_args;
1018 entry->func = (unsigned long)tp->rp.kp.addr;
1019 entry->ret_ip = (unsigned long)ri->ret_addr;
1020 for (i = 0; i < tp->nr_args; i++)
1021 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1022
1023 if (!filter_current_check_discard(buffer, call, entry, event))
1024 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1025
1026 return 0;
1027}
1028
1029/* Event entry printers */
1030enum print_line_t
1031print_kprobe_event(struct trace_iterator *iter, int flags)
1032{
1033 struct kprobe_trace_entry *field;
1034 struct trace_seq *s = &iter->seq;
1035 struct trace_event *event;
1036 struct trace_probe *tp;
1037 int i;
1038
1039 field = (struct kprobe_trace_entry *)iter->ent;
1040 event = ftrace_find_event(field->ent.type);
1041 tp = container_of(event, struct trace_probe, event);
1042
1043 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1044 goto partial;
1045
1046 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1047 goto partial;
1048
1049 if (!trace_seq_puts(s, ")"))
1050 goto partial;
1051
1052 for (i = 0; i < field->nargs; i++)
1053 if (!trace_seq_printf(s, " %s=%lx",
1054 tp->args[i].name, field->args[i]))
1055 goto partial;
1056
1057 if (!trace_seq_puts(s, "\n"))
1058 goto partial;
1059
1060 return TRACE_TYPE_HANDLED;
1061partial:
1062 return TRACE_TYPE_PARTIAL_LINE;
1063}
1064
1065enum print_line_t
1066print_kretprobe_event(struct trace_iterator *iter, int flags)
1067{
1068 struct kretprobe_trace_entry *field;
1069 struct trace_seq *s = &iter->seq;
1070 struct trace_event *event;
1071 struct trace_probe *tp;
1072 int i;
1073
1074 field = (struct kretprobe_trace_entry *)iter->ent;
1075 event = ftrace_find_event(field->ent.type);
1076 tp = container_of(event, struct trace_probe, event);
1077
1078 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1079 goto partial;
1080
1081 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1082 goto partial;
1083
1084 if (!trace_seq_puts(s, " <- "))
1085 goto partial;
1086
1087 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1088 goto partial;
1089
1090 if (!trace_seq_puts(s, ")"))
1091 goto partial;
1092
1093 for (i = 0; i < field->nargs; i++)
1094 if (!trace_seq_printf(s, " %s=%lx",
1095 tp->args[i].name, field->args[i]))
1096 goto partial;
1097
1098 if (!trace_seq_puts(s, "\n"))
1099 goto partial;
1100
1101 return TRACE_TYPE_HANDLED;
1102partial:
1103 return TRACE_TYPE_PARTIAL_LINE;
1104}
1105
1106static int probe_event_enable(struct ftrace_event_call *call)
1107{
1108 struct trace_probe *tp = (struct trace_probe *)call->data;
1109
1110 tp->flags |= TP_FLAG_TRACE;
1111 if (probe_is_return(tp))
1112 return enable_kretprobe(&tp->rp);
1113 else
1114 return enable_kprobe(&tp->rp.kp);
1115}
1116
1117static void probe_event_disable(struct ftrace_event_call *call)
1118{
1119 struct trace_probe *tp = (struct trace_probe *)call->data;
1120
1121 tp->flags &= ~TP_FLAG_TRACE;
1122 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1123 if (probe_is_return(tp))
1124 disable_kretprobe(&tp->rp);
1125 else
1126 disable_kprobe(&tp->rp.kp);
1127 }
1128}
1129
1130static int probe_event_raw_init(struct ftrace_event_call *event_call)
1131{
1132 INIT_LIST_HEAD(&event_call->fields);
1133
1134 return 0;
1135}
1136
1137#undef DEFINE_FIELD
1138#define DEFINE_FIELD(type, item, name, is_signed) \
1139 do { \
1140 ret = trace_define_field(event_call, #type, name, \
1141 offsetof(typeof(field), item), \
1142 sizeof(field.item), is_signed, \
1143 FILTER_OTHER); \
1144 if (ret) \
1145 return ret; \
1146 } while (0)
1147
1148static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1149{
1150 int ret, i;
1151 struct kprobe_trace_entry field;
1152 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1153
1154 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1155 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1156 /* Set argument names as fields */
1157 for (i = 0; i < tp->nr_args; i++)
1158 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1159 return 0;
1160}
1161
1162static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1163{
1164 int ret, i;
1165 struct kretprobe_trace_entry field;
1166 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1167
1168 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1169 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1170 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1171 /* Set argument names as fields */
1172 for (i = 0; i < tp->nr_args; i++)
1173 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1174 return 0;
1175}
1176
1177static int __probe_event_show_format(struct trace_seq *s,
1178 struct trace_probe *tp, const char *fmt,
1179 const char *arg)
1180{
1181 int i;
1182
1183 /* Show format */
1184 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1185 return 0;
1186
1187 for (i = 0; i < tp->nr_args; i++)
1188 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
1189 return 0;
1190
1191 if (!trace_seq_printf(s, "\", %s", arg))
1192 return 0;
1193
1194 for (i = 0; i < tp->nr_args; i++)
1195 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1196 return 0;
1197
1198 return trace_seq_puts(s, "\n");
1199}
1200
1201#undef SHOW_FIELD
1202#define SHOW_FIELD(type, item, name) \
1203 do { \
1204 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \
1205 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1206 (unsigned int)offsetof(typeof(field), item),\
1207 (unsigned int)sizeof(type), \
1208 is_signed_type(type)); \
1209 if (!ret) \
1210 return 0; \
1211 } while (0)
1212
1213static int kprobe_event_show_format(struct ftrace_event_call *call,
1214 struct trace_seq *s)
1215{
1216 struct kprobe_trace_entry field __attribute__((unused));
1217 int ret, i;
1218 struct trace_probe *tp = (struct trace_probe *)call->data;
1219
1220 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1221 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1222
1223 /* Show fields */
1224 for (i = 0; i < tp->nr_args; i++)
1225 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1226 trace_seq_puts(s, "\n");
1227
1228 return __probe_event_show_format(s, tp, "(%lx)",
1229 "REC->" FIELD_STRING_IP);
1230}
1231
1232static int kretprobe_event_show_format(struct ftrace_event_call *call,
1233 struct trace_seq *s)
1234{
1235 struct kretprobe_trace_entry field __attribute__((unused));
1236 int ret, i;
1237 struct trace_probe *tp = (struct trace_probe *)call->data;
1238
1239 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
1240 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
1241 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1242
1243 /* Show fields */
1244 for (i = 0; i < tp->nr_args; i++)
1245 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1246 trace_seq_puts(s, "\n");
1247
1248 return __probe_event_show_format(s, tp, "(%lx <- %lx)",
1249 "REC->" FIELD_STRING_FUNC
1250 ", REC->" FIELD_STRING_RETIP);
1251}
1252
1253#ifdef CONFIG_EVENT_PROFILE
1254
1255/* Kprobe profile handler */
1256static __kprobes int kprobe_profile_func(struct kprobe *kp,
1257 struct pt_regs *regs)
1258{
1259 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1260 struct ftrace_event_call *call = &tp->call;
1261 struct kprobe_trace_entry *entry;
1262 struct trace_entry *ent;
1263 int size, __size, i, pc, __cpu;
1264 unsigned long irq_flags;
1265 char *trace_buf;
1266 char *raw_data;
1267 int rctx;
1268
1269 pc = preempt_count();
1270 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1271 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1272 size -= sizeof(u32);
1273 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1274 "profile buffer not large enough"))
1275 return 0;
1276
1277 /*
1278 * Protect the non nmi buffer
1279 * This also protects the rcu read side
1280 */
1281 local_irq_save(irq_flags);
1282
1283 rctx = perf_swevent_get_recursion_context();
1284 if (rctx < 0)
1285 goto end_recursion;
1286
1287 __cpu = smp_processor_id();
1288
1289 if (in_nmi())
1290 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1291 else
1292 trace_buf = rcu_dereference(perf_trace_buf);
1293
1294 if (!trace_buf)
1295 goto end;
1296
1297 raw_data = per_cpu_ptr(trace_buf, __cpu);
1298
1299 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1300 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1301 entry = (struct kprobe_trace_entry *)raw_data;
1302 ent = &entry->ent;
1303
1304 tracing_generic_entry_update(ent, irq_flags, pc);
1305 ent->type = call->id;
1306 entry->nargs = tp->nr_args;
1307 entry->ip = (unsigned long)kp->addr;
1308 for (i = 0; i < tp->nr_args; i++)
1309 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1310 perf_tp_event(call->id, entry->ip, 1, entry, size);
1311
1312end:
1313 perf_swevent_put_recursion_context(rctx);
1314end_recursion:
1315 local_irq_restore(irq_flags);
1316
1317 return 0;
1318}
1319
1320/* Kretprobe profile handler */
1321static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1322 struct pt_regs *regs)
1323{
1324 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1325 struct ftrace_event_call *call = &tp->call;
1326 struct kretprobe_trace_entry *entry;
1327 struct trace_entry *ent;
1328 int size, __size, i, pc, __cpu;
1329 unsigned long irq_flags;
1330 char *trace_buf;
1331 char *raw_data;
1332 int rctx;
1333
1334 pc = preempt_count();
1335 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1336 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1337 size -= sizeof(u32);
1338 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1339 "profile buffer not large enough"))
1340 return 0;
1341
1342 /*
1343 * Protect the non nmi buffer
1344 * This also protects the rcu read side
1345 */
1346 local_irq_save(irq_flags);
1347
1348 rctx = perf_swevent_get_recursion_context();
1349 if (rctx < 0)
1350 goto end_recursion;
1351
1352 __cpu = smp_processor_id();
1353
1354 if (in_nmi())
1355 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1356 else
1357 trace_buf = rcu_dereference(perf_trace_buf);
1358
1359 if (!trace_buf)
1360 goto end;
1361
1362 raw_data = per_cpu_ptr(trace_buf, __cpu);
1363
1364 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1365 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1366 entry = (struct kretprobe_trace_entry *)raw_data;
1367 ent = &entry->ent;
1368
1369 tracing_generic_entry_update(ent, irq_flags, pc);
1370 ent->type = call->id;
1371 entry->nargs = tp->nr_args;
1372 entry->func = (unsigned long)tp->rp.kp.addr;
1373 entry->ret_ip = (unsigned long)ri->ret_addr;
1374 for (i = 0; i < tp->nr_args; i++)
1375 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1376 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1377
1378end:
1379 perf_swevent_put_recursion_context(rctx);
1380end_recursion:
1381 local_irq_restore(irq_flags);
1382
1383 return 0;
1384}
1385
1386static int probe_profile_enable(struct ftrace_event_call *call)
1387{
1388 struct trace_probe *tp = (struct trace_probe *)call->data;
1389
1390 tp->flags |= TP_FLAG_PROFILE;
1391
1392 if (probe_is_return(tp))
1393 return enable_kretprobe(&tp->rp);
1394 else
1395 return enable_kprobe(&tp->rp.kp);
1396}
1397
1398static void probe_profile_disable(struct ftrace_event_call *call)
1399{
1400 struct trace_probe *tp = (struct trace_probe *)call->data;
1401
1402 tp->flags &= ~TP_FLAG_PROFILE;
1403
1404 if (!(tp->flags & TP_FLAG_TRACE)) {
1405 if (probe_is_return(tp))
1406 disable_kretprobe(&tp->rp);
1407 else
1408 disable_kprobe(&tp->rp.kp);
1409 }
1410}
1411#endif /* CONFIG_EVENT_PROFILE */
1412
1413
1414static __kprobes
1415int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1416{
1417 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1418
1419 if (tp->flags & TP_FLAG_TRACE)
1420 kprobe_trace_func(kp, regs);
1421#ifdef CONFIG_EVENT_PROFILE
1422 if (tp->flags & TP_FLAG_PROFILE)
1423 kprobe_profile_func(kp, regs);
1424#endif /* CONFIG_EVENT_PROFILE */
1425 return 0; /* We don't tweek kernel, so just return 0 */
1426}
1427
1428static __kprobes
1429int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1430{
1431 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1432
1433 if (tp->flags & TP_FLAG_TRACE)
1434 kretprobe_trace_func(ri, regs);
1435#ifdef CONFIG_EVENT_PROFILE
1436 if (tp->flags & TP_FLAG_PROFILE)
1437 kretprobe_profile_func(ri, regs);
1438#endif /* CONFIG_EVENT_PROFILE */
1439 return 0; /* We don't tweek kernel, so just return 0 */
1440}
1441
1442static int register_probe_event(struct trace_probe *tp)
1443{
1444 struct ftrace_event_call *call = &tp->call;
1445 int ret;
1446
1447 /* Initialize ftrace_event_call */
1448 if (probe_is_return(tp)) {
1449 tp->event.trace = print_kretprobe_event;
1450 call->raw_init = probe_event_raw_init;
1451 call->show_format = kretprobe_event_show_format;
1452 call->define_fields = kretprobe_event_define_fields;
1453 } else {
1454 tp->event.trace = print_kprobe_event;
1455 call->raw_init = probe_event_raw_init;
1456 call->show_format = kprobe_event_show_format;
1457 call->define_fields = kprobe_event_define_fields;
1458 }
1459 call->event = &tp->event;
1460 call->id = register_ftrace_event(&tp->event);
1461 if (!call->id)
1462 return -ENODEV;
1463 call->enabled = 0;
1464 call->regfunc = probe_event_enable;
1465 call->unregfunc = probe_event_disable;
1466
1467#ifdef CONFIG_EVENT_PROFILE
1468 call->profile_enable = probe_profile_enable;
1469 call->profile_disable = probe_profile_disable;
1470#endif
1471 call->data = tp;
1472 ret = trace_add_event_call(call);
1473 if (ret) {
1474 pr_info("Failed to register kprobe event: %s\n", call->name);
1475 unregister_ftrace_event(&tp->event);
1476 }
1477 return ret;
1478}
1479
1480static void unregister_probe_event(struct trace_probe *tp)
1481{
1482 /* tp->event is unregistered in trace_remove_event_call() */
1483 trace_remove_event_call(&tp->call);
1484}
1485
1486/* Make a debugfs interface for controling probe points */
1487static __init int init_kprobe_trace(void)
1488{
1489 struct dentry *d_tracer;
1490 struct dentry *entry;
1491
1492 d_tracer = tracing_init_dentry();
1493 if (!d_tracer)
1494 return 0;
1495
1496 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
1497 NULL, &kprobe_events_ops);
1498
1499 /* Event list interface */
1500 if (!entry)
1501 pr_warning("Could not create debugfs "
1502 "'kprobe_events' entry\n");
1503
1504 /* Profile interface */
1505 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
1506 NULL, &kprobe_profile_ops);
1507
1508 if (!entry)
1509 pr_warning("Could not create debugfs "
1510 "'kprobe_profile' entry\n");
1511 return 0;
1512}
1513fs_initcall(init_kprobe_trace);
1514
1515
1516#ifdef CONFIG_FTRACE_STARTUP_TEST
1517
/*
 * Probe target for the self test: returns the sum of its six arguments,
 * added left to right.
 */
static int kprobe_trace_selftest_target(int a1, int a2, int a3,
					int a4, int a5, int a6)
{
	int sum = a1;

	sum += a2;
	sum += a3;
	sum += a4;
	sum += a5;
	sum += a6;

	return sum;
}
1523
1524static __init int kprobe_trace_self_tests_init(void)
1525{
1526 int ret;
1527 int (*target)(int, int, int, int, int, int);
1528
1529 target = kprobe_trace_selftest_target;
1530
1531 pr_info("Testing kprobe tracing: ");
1532
1533 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1534 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
1535 if (WARN_ON_ONCE(ret))
1536 pr_warning("error enabling function entry\n");
1537
1538 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1539 "$retval");
1540 if (WARN_ON_ONCE(ret))
1541 pr_warning("error enabling function return\n");
1542
1543 ret = target(1, 2, 3, 4, 5, 6);
1544
1545 cleanup_all_probes();
1546
1547 pr_cont("OK\n");
1548 return 0;
1549}
1550
1551late_initcall(kprobe_trace_self_tests_init);
1552
1553#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 00000000000..94103cdcf9d
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,519 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27
28#include "trace_output.h"
29#include "trace.h"
30
31#include <linux/hw_breakpoint.h>
32#include <asm/hw_breakpoint.h>
33
34#include <asm/atomic.h>
35
36/*
37 * For now, let us restrict the no. of symbols traced simultaneously to number
38 * of available hardware breakpoint registers.
39 */
40#define KSYM_TRACER_MAX HBP_NUM
41
42#define KSYM_TRACER_OP_LEN 3 /* rw- */
43
/*
 * One entry of the ksym tracer filter list: a kernel address being watched
 * by a hardware breakpoint.
 */
44struct trace_ksym {
/* Result of register_wide_hw_breakpoint() for this entry. */
45 struct perf_event **ksym_hbp;
/* Breakpoint attributes; bp_type, bp_addr and bp_len are set by this file. */
46 struct perf_event_attr attr;
47#ifdef CONFIG_PROFILE_KSYM_TRACER
/* Incremented by ksym_collect_stats() for hits on attr.bp_addr. */
48 atomic64_t counter;
49#endif
/* Links the entry into ksym_filter_head. */
50 struct hlist_node ksym_hlist;
51};
52
/* Trace array written to by ksym_hbp_handler(); set in ksym_trace_init(). */
53static struct trace_array *ksym_trace_array;
54
/* Number of entries currently on ksym_filter_head. */
55static unsigned int ksym_filter_entry_count;
/* Set to 1 by ksym_trace_init() and to 0 by ksym_trace_reset(). */
56static unsigned int ksym_tracing_enabled;
57
/* List of struct trace_ksym entries, linked through ksym_hlist. */
58static HLIST_HEAD(ksym_filter_head);
59
/* Taken by the filter read/write handlers and by __ksym_trace_reset(). */
60static DEFINE_MUTEX(ksym_tracer_mutex);
61
62#ifdef CONFIG_PROFILE_KSYM_TRACER
63
64#define MAX_UL_INT 0xffffffff
65
66void ksym_collect_stats(unsigned long hbp_hit_addr)
67{
68 struct hlist_node *node;
69 struct trace_ksym *entry;
70
71 rcu_read_lock();
72 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
73 if (entry->attr.bp_addr == hbp_hit_addr) {
74 atomic64_inc(&entry->counter);
75 break;
76 }
77 }
78 rcu_read_unlock();
79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81
82void ksym_hbp_handler(struct perf_event *hbp, int nmi,
83 struct perf_sample_data *data,
84 struct pt_regs *regs)
85{
86 struct ring_buffer_event *event;
87 struct ksym_trace_entry *entry;
88 struct ring_buffer *buffer;
89 int pc;
90
91 if (!ksym_tracing_enabled)
92 return;
93
94 buffer = ksym_trace_array->buffer;
95
96 pc = preempt_count();
97
98 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
99 sizeof(*entry), 0, pc);
100 if (!event)
101 return;
102
103 entry = ring_buffer_event_data(event);
104 entry->ip = instruction_pointer(regs);
105 entry->type = hw_breakpoint_type(hbp);
106 entry->addr = hw_breakpoint_addr(hbp);
107 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
108
109#ifdef CONFIG_PROFILE_KSYM_TRACER
110 ksym_collect_stats(hw_breakpoint_addr(hbp));
111#endif /* CONFIG_PROFILE_KSYM_TRACER */
112
113 trace_buffer_unlock_commit(buffer, event, 0, pc);
114}
115
116/* Valid access types are represented as
117 *
118 * rw- : Set Read/Write Access Breakpoint
119 * -w- : Set Write Access Breakpoint
120 * --- : Clear Breakpoints
121 * --x : Set Execution Break points (Not available yet)
122 *
123 */
124static int ksym_trace_get_access_type(char *str)
125{
126 int access = 0;
127
128 if (str[0] == 'r')
129 access |= HW_BREAKPOINT_R;
130
131 if (str[1] == 'w')
132 access |= HW_BREAKPOINT_W;
133
134 if (str[2] == 'x')
135 access |= HW_BREAKPOINT_X;
136
137 switch (access) {
138 case HW_BREAKPOINT_R:
139 case HW_BREAKPOINT_W:
140 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
141 return access;
142 default:
143 return -EINVAL;
144 }
145}
146
147/*
148 * There can be several possible malformed requests and we attempt to capture
149 * all of them. We enumerate some of the rules
150 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
151 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
152 * <module>:<ksym_name>:<op>.
153 * 2. No delimiter symbol ':' in the input string
154 * 3. Spurious operator symbols or symbols not in their respective positions
155 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
156 * 5. Kernel symbol not a part of /proc/kallsyms
157 * 6. Duplicate requests
158 */
159static int parse_ksym_trace_str(char *input_string, char **ksymname,
160 unsigned long *addr)
161{
162 int ret;
163
164 *ksymname = strsep(&input_string, ":");
165 *addr = kallsyms_lookup_name(*ksymname);
166
167 /* Check for malformed request: (2), (1) and (5) */
168 if ((!input_string) ||
169 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
170 (*addr == 0))
171 return -EINVAL;;
172
173 ret = ksym_trace_get_access_type(input_string);
174
175 return ret;
176}
177
178int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
179{
180 struct trace_ksym *entry;
181 int ret = -ENOMEM;
182
183 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
184 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
185 " new requests for tracing can be accepted now.\n",
186 KSYM_TRACER_MAX);
187 return -ENOSPC;
188 }
189
190 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
191 if (!entry)
192 return -ENOMEM;
193
194 hw_breakpoint_init(&entry->attr);
195
196 entry->attr.bp_type = op;
197 entry->attr.bp_addr = addr;
198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
199
200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
201 ksym_hbp_handler);
202
203 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again"
206 " later!!\n");
207 goto err;
208 }
209
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212
213 return 0;
214
215err:
216 kfree(entry);
217
218 return ret;
219}
220
221static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
222 size_t count, loff_t *ppos)
223{
224 struct trace_ksym *entry;
225 struct hlist_node *node;
226 struct trace_seq *s;
227 ssize_t cnt = 0;
228 int ret;
229
230 s = kmalloc(sizeof(*s), GFP_KERNEL);
231 if (!s)
232 return -ENOMEM;
233 trace_seq_init(s);
234
235 mutex_lock(&ksym_tracer_mutex);
236
237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
238 ret = trace_seq_printf(s, "%pS:",
239 (void *)(unsigned long)entry->attr.bp_addr);
240 if (entry->attr.bp_type == HW_BREAKPOINT_R)
241 ret = trace_seq_puts(s, "r--\n");
242 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
243 ret = trace_seq_puts(s, "-w-\n");
244 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
245 ret = trace_seq_puts(s, "rw-\n");
246 WARN_ON_ONCE(!ret);
247 }
248
249 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
250
251 mutex_unlock(&ksym_tracer_mutex);
252
253 kfree(s);
254
255 return cnt;
256}
257
258static void __ksym_trace_reset(void)
259{
260 struct trace_ksym *entry;
261 struct hlist_node *node, *node1;
262
263 mutex_lock(&ksym_tracer_mutex);
264 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
265 ksym_hlist) {
266 unregister_wide_hw_breakpoint(entry->ksym_hbp);
267 ksym_filter_entry_count--;
268 hlist_del_rcu(&(entry->ksym_hlist));
269 synchronize_rcu();
270 kfree(entry);
271 }
272 mutex_unlock(&ksym_tracer_mutex);
273}
274
275static ssize_t ksym_trace_filter_write(struct file *file,
276 const char __user *buffer,
277 size_t count, loff_t *ppos)
278{
279 struct trace_ksym *entry;
280 struct hlist_node *node;
281 char *buf, *input_string, *ksymname = NULL;
282 unsigned long ksym_addr = 0;
283 int ret, op, changed = 0;
284
285 buf = kzalloc(count + 1, GFP_KERNEL);
286 if (!buf)
287 return -ENOMEM;
288
289 ret = -EFAULT;
290 if (copy_from_user(buf, buffer, count))
291 goto out;
292
293 buf[count] = '\0';
294 input_string = strstrip(buf);
295
296 /*
297 * Clear all breakpoints if:
298 * 1: echo > ksym_trace_filter
299 * 2: echo 0 > ksym_trace_filter
300 * 3: echo "*:---" > ksym_trace_filter
301 */
302 if (!input_string[0] || !strcmp(input_string, "0") ||
303 !strcmp(input_string, "*:---")) {
304 __ksym_trace_reset();
305 ret = 0;
306 goto out;
307 }
308
309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
310 if (ret < 0)
311 goto out;
312
313 mutex_lock(&ksym_tracer_mutex);
314
315 ret = -EINVAL;
316 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
317 if (entry->attr.bp_addr == ksym_addr) {
318 /* Check for malformed request: (6) */
319 if (entry->attr.bp_type != op)
320 changed = 1;
321 else
322 goto out_unlock;
323 break;
324 }
325 }
326 if (changed) {
327 unregister_wide_hw_breakpoint(entry->ksym_hbp);
328 entry->attr.bp_type = op;
329 ret = 0;
330 if (op > 0) {
331 entry->ksym_hbp =
332 register_wide_hw_breakpoint(&entry->attr,
333 ksym_hbp_handler);
334 if (IS_ERR(entry->ksym_hbp))
335 ret = PTR_ERR(entry->ksym_hbp);
336 else
337 goto out_unlock;
338 }
339 /* Error or "symbol:---" case: drop it */
340 ksym_filter_entry_count--;
341 hlist_del_rcu(&(entry->ksym_hlist));
342 synchronize_rcu();
343 kfree(entry);
344 goto out_unlock;
345 } else {
346 /* Check for malformed request: (4) */
347 if (op)
348 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
349 }
350out_unlock:
351 mutex_unlock(&ksym_tracer_mutex);
352out:
353 kfree(buf);
354 return !ret ? count : ret;
355}
356
357static const struct file_operations ksym_tracing_fops = {
358 .open = tracing_open_generic,
359 .read = ksym_trace_filter_read,
360 .write = ksym_trace_filter_write,
361};
362
363static void ksym_trace_reset(struct trace_array *tr)
364{
365 ksym_tracing_enabled = 0;
366 __ksym_trace_reset();
367}
368
369static int ksym_trace_init(struct trace_array *tr)
370{
371 int cpu, ret = 0;
372
373 for_each_online_cpu(cpu)
374 tracing_reset(tr, cpu);
375 ksym_tracing_enabled = 1;
376 ksym_trace_array = tr;
377
378 return ret;
379}
380
/* Tracer ->print_header callback: emit the two column-header lines. */
static void ksym_trace_print_header(struct seq_file *m)
{
	/* Column titles. */
	seq_puts(m, "# TASK-PID CPU# Symbol "
		    "Type Function\n");

	/* Column markers. */
	seq_puts(m, "# | | | "
		    " | |\n");
}
390
391static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
392{
393 struct trace_entry *entry = iter->ent;
394 struct trace_seq *s = &iter->seq;
395 struct ksym_trace_entry *field;
396 char str[KSYM_SYMBOL_LEN];
397 int ret;
398
399 if (entry->type != TRACE_KSYM)
400 return TRACE_TYPE_UNHANDLED;
401
402 trace_assign_type(field, entry);
403
404 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
405 entry->pid, iter->cpu, (char *)field->addr);
406 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE;
408
409 switch (field->type) {
410 case HW_BREAKPOINT_R:
411 ret = trace_seq_printf(s, " R ");
412 break;
413 case HW_BREAKPOINT_W:
414 ret = trace_seq_printf(s, " W ");
415 break;
416 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
417 ret = trace_seq_printf(s, " RW ");
418 break;
419 default:
420 return TRACE_TYPE_PARTIAL_LINE;
421 }
422
423 if (!ret)
424 return TRACE_TYPE_PARTIAL_LINE;
425
426 sprint_symbol(str, field->ip);
427 ret = trace_seq_printf(s, "%s\n", str);
428 if (!ret)
429 return TRACE_TYPE_PARTIAL_LINE;
430
431 return TRACE_TYPE_HANDLED;
432}
433
434struct tracer ksym_tracer __read_mostly =
435{
436 .name = "ksym_tracer",
437 .init = ksym_trace_init,
438 .reset = ksym_trace_reset,
439#ifdef CONFIG_FTRACE_SELFTEST
440 .selftest = trace_selftest_startup_ksym,
441#endif
442 .print_header = ksym_trace_print_header,
443 .print_line = ksym_trace_output
444};
445
446#ifdef CONFIG_PROFILE_KSYM_TRACER
447static int ksym_profile_show(struct seq_file *m, void *v)
448{
449 struct hlist_node *node;
450 struct trace_ksym *entry;
451 int access_type = 0;
452 char fn_name[KSYM_NAME_LEN];
453
454 seq_puts(m, " Access Type ");
455 seq_puts(m, " Symbol Counter\n");
456 seq_puts(m, " ----------- ");
457 seq_puts(m, " ------ -------\n");
458
459 rcu_read_lock();
460 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
461
462 access_type = entry->attr.bp_type;
463
464 switch (access_type) {
465 case HW_BREAKPOINT_R:
466 seq_puts(m, " R ");
467 break;
468 case HW_BREAKPOINT_W:
469 seq_puts(m, " W ");
470 break;
471 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
472 seq_puts(m, " RW ");
473 break;
474 default:
475 seq_puts(m, " NA ");
476 }
477
478 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
479 seq_printf(m, " %-36s", fn_name);
480 else
481 seq_printf(m, " %-36s", "<NA>");
482 seq_printf(m, " %15llu\n",
483 (unsigned long long)atomic64_read(&entry->counter));
484 }
485 rcu_read_unlock();
486
487 return 0;
488}
489
490static int ksym_profile_open(struct inode *node, struct file *file)
491{
492 return single_open(file, ksym_profile_show, NULL);
493}
494
495static const struct file_operations ksym_profile_fops = {
496 .open = ksym_profile_open,
497 .read = seq_read,
498 .llseek = seq_lseek,
499 .release = single_release,
500};
501#endif /* CONFIG_PROFILE_KSYM_TRACER */
502
503__init static int init_ksym_trace(void)
504{
505 struct dentry *d_tracer;
506
507 d_tracer = tracing_init_dentry();
508
509 trace_create_file("ksym_trace_filter", 0644, d_tracer,
510 NULL, &ksym_tracing_fops);
511
512#ifdef CONFIG_PROFILE_KSYM_TRACER
513 trace_create_file("ksym_profile", 0444, d_tracer,
514 NULL, &ksym_profile_fops);
515#endif
516
517 return register_tracer(&ksym_tracer);
518}
519device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed17565826b..8e46b3323cd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 23
24static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
25 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s) 26int trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 int ret;
30
31 ret = seq_write(m, s->buffer, len);
29 32
30 seq_write(m, s->buffer, len); 33 /*
34 * Only reset this buffer if we successfully wrote to the
35 * seq_file buffer.
36 */
37 if (!ret)
38 trace_seq_init(s);
31 39
32 trace_seq_init(s); 40 return ret;
33} 41}
34 42
35enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 43enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -69,6 +77,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
69 * @s: trace sequence descriptor 77 * @s: trace sequence descriptor
70 * @fmt: printf format string 78 * @fmt: printf format string
71 * 79 *
80 * It returns 0 if the trace oversizes the buffer's free
81 * space, 1 otherwise.
82 *
72 * The tracer may use either sequence operations or its own 83 * The tracer may use either sequence operations or its own
73 * copy to user routines. To simplify formating of a trace 84 * copy to user routines. To simplify formating of a trace
74 * trace_seq_printf is used to store strings into a special 85 * trace_seq_printf is used to store strings into a special
@@ -82,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
82 va_list ap; 93 va_list ap;
83 int ret; 94 int ret;
84 95
85 if (!len) 96 if (s->full || !len)
86 return 0; 97 return 0;
87 98
88 va_start(ap, fmt); 99 va_start(ap, fmt);
@@ -90,12 +101,14 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
90 va_end(ap); 101 va_end(ap);
91 102
92 /* If we can't write it all, don't bother writing anything */ 103 /* If we can't write it all, don't bother writing anything */
93 if (ret >= len) 104 if (ret >= len) {
105 s->full = 1;
94 return 0; 106 return 0;
107 }
95 108
96 s->len += ret; 109 s->len += ret;
97 110
98 return len; 111 return 1;
99} 112}
100EXPORT_SYMBOL_GPL(trace_seq_printf); 113EXPORT_SYMBOL_GPL(trace_seq_printf);
101 114
@@ -116,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
116 int len = (PAGE_SIZE - 1) - s->len; 129 int len = (PAGE_SIZE - 1) - s->len;
117 int ret; 130 int ret;
118 131
119 if (!len) 132 if (s->full || !len)
120 return 0; 133 return 0;
121 134
122 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 135 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
123 136
124 /* If we can't write it all, don't bother writing anything */ 137 /* If we can't write it all, don't bother writing anything */
125 if (ret >= len) 138 if (ret >= len) {
139 s->full = 1;
126 return 0; 140 return 0;
141 }
127 142
128 s->len += ret; 143 s->len += ret;
129 144
@@ -136,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
136 int len = (PAGE_SIZE - 1) - s->len; 151 int len = (PAGE_SIZE - 1) - s->len;
137 int ret; 152 int ret;
138 153
139 if (!len) 154 if (s->full || !len)
140 return 0; 155 return 0;
141 156
142 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 157 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
143 158
144 /* If we can't write it all, don't bother writing anything */ 159 /* If we can't write it all, don't bother writing anything */
145 if (ret >= len) 160 if (ret >= len) {
161 s->full = 1;
146 return 0; 162 return 0;
163 }
147 164
148 s->len += ret; 165 s->len += ret;
149 166
@@ -164,9 +181,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
164{ 181{
165 int len = strlen(str); 182 int len = strlen(str);
166 183
167 if (len > ((PAGE_SIZE - 1) - s->len)) 184 if (s->full)
168 return 0; 185 return 0;
169 186
187 if (len > ((PAGE_SIZE - 1) - s->len)) {
188 s->full = 1;
189 return 0;
190 }
191
170 memcpy(s->buffer + s->len, str, len); 192 memcpy(s->buffer + s->len, str, len);
171 s->len += len; 193 s->len += len;
172 194
@@ -175,8 +197,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
175 197
176int trace_seq_putc(struct trace_seq *s, unsigned char c) 198int trace_seq_putc(struct trace_seq *s, unsigned char c)
177{ 199{
178 if (s->len >= (PAGE_SIZE - 1)) 200 if (s->full)
201 return 0;
202
203 if (s->len >= (PAGE_SIZE - 1)) {
204 s->full = 1;
179 return 0; 205 return 0;
206 }
180 207
181 s->buffer[s->len++] = c; 208 s->buffer[s->len++] = c;
182 209
@@ -185,8 +212,13 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
185 212
186int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
187{ 214{
188 if (len > ((PAGE_SIZE - 1) - s->len)) 215 if (s->full)
216 return 0;
217
218 if (len > ((PAGE_SIZE - 1) - s->len)) {
219 s->full = 1;
189 return 0; 220 return 0;
221 }
190 222
191 memcpy(s->buffer + s->len, mem, len); 223 memcpy(s->buffer + s->len, mem, len);
192 s->len += len; 224 s->len += len;
@@ -200,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
200 const unsigned char *data = mem; 232 const unsigned char *data = mem;
201 int i, j; 233 int i, j;
202 234
235 if (s->full)
236 return 0;
237
203#ifdef __BIG_ENDIAN 238#ifdef __BIG_ENDIAN
204 for (i = 0, j = 0; i < len; i++) { 239 for (i = 0, j = 0; i < len; i++) {
205#else 240#else
@@ -217,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
217{ 252{
218 void *ret; 253 void *ret;
219 254
220 if (len > ((PAGE_SIZE - 1) - s->len)) 255 if (s->full)
256 return 0;
257
258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1;
221 return NULL; 260 return NULL;
261 }
222 262
223 ret = s->buffer + s->len; 263 ret = s->buffer + s->len;
224 s->len += len; 264 s->len += len;
@@ -230,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
230{ 270{
231 unsigned char *p; 271 unsigned char *p;
232 272
233 if (s->len >= (PAGE_SIZE - 1)) 273 if (s->full)
274 return 0;
275
276 if (s->len >= (PAGE_SIZE - 1)) {
277 s->full = 1;
234 return 0; 278 return 0;
279 }
280
235 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 281 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
236 if (!IS_ERR(p)) { 282 if (!IS_ERR(p)) {
237 p = mangle_path(s->buffer + s->len, p, "\n"); 283 p = mangle_path(s->buffer + s->len, p, "\n");
@@ -244,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
244 return 1; 290 return 1;
245 } 291 }
246 292
293 s->full = 1;
247 return 0; 294 return 0;
248} 295}
249 296
@@ -370,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
370 unsigned long vmstart = 0; 417 unsigned long vmstart = 0;
371 int ret = 1; 418 int ret = 1;
372 419
420 if (s->full)
421 return 0;
422
373 if (mm) { 423 if (mm) {
374 const struct vm_area_struct *vma; 424 const struct vm_area_struct *vma;
375 425
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 26185d72767..0271742abb8 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -28,8 +28,8 @@ static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
29static int wakeup_rt; 29static int wakeup_rt;
30 30
31static raw_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void __wakeup_reset(struct trace_array *tr); 34static void __wakeup_reset(struct trace_array *tr);
35 35
@@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
143 goto out; 143 goto out;
144 144
145 local_irq_save(flags); 145 local_irq_save(flags);
146 __raw_spin_lock(&wakeup_lock); 146 arch_spin_lock(&wakeup_lock);
147 147
148 /* We could race with grabbing wakeup_lock */ 148 /* We could race with grabbing wakeup_lock */
149 if (unlikely(!tracer_enabled || next != wakeup_task)) 149 if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
169 169
170out_unlock: 170out_unlock:
171 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
172 __raw_spin_unlock(&wakeup_lock); 172 arch_spin_unlock(&wakeup_lock);
173 local_irq_restore(flags); 173 local_irq_restore(flags);
174out: 174out:
175 atomic_dec(&wakeup_trace->data[cpu]->disabled); 175 atomic_dec(&wakeup_trace->data[cpu]->disabled);
@@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr)
193 tracing_reset_online_cpus(tr); 193 tracing_reset_online_cpus(tr);
194 194
195 local_irq_save(flags); 195 local_irq_save(flags);
196 __raw_spin_lock(&wakeup_lock); 196 arch_spin_lock(&wakeup_lock);
197 __wakeup_reset(tr); 197 __wakeup_reset(tr);
198 __raw_spin_unlock(&wakeup_lock); 198 arch_spin_unlock(&wakeup_lock);
199 local_irq_restore(flags); 199 local_irq_restore(flags);
200} 200}
201 201
@@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
225 goto out; 225 goto out;
226 226
227 /* interrupts should be off from try_to_wake_up */ 227 /* interrupts should be off from try_to_wake_up */
228 __raw_spin_lock(&wakeup_lock); 228 arch_spin_lock(&wakeup_lock);
229 229
230 /* check for races. */ 230 /* check for races. */
231 if (!tracer_enabled || p->prio >= wakeup_prio) 231 if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 256
257out_locked: 257out_locked:
258 __raw_spin_unlock(&wakeup_lock); 258 arch_spin_unlock(&wakeup_lock);
259out: 259out:
260 atomic_dec(&wakeup_trace->data[cpu]->disabled); 260 atomic_dec(&wakeup_trace->data[cpu]->disabled);
261} 261}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ea..280fea470d6 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM:
20 return 1; 21 return 1;
21 } 22 }
22 return 0; 23 return 0;
@@ -66,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
66 67
67 /* Don't allow flipping of max traces now */ 68 /* Don't allow flipping of max traces now */
68 local_irq_save(flags); 69 local_irq_save(flags);
69 __raw_spin_lock(&ftrace_max_lock); 70 arch_spin_lock(&ftrace_max_lock);
70 71
71 cnt = ring_buffer_entries(tr->buffer); 72 cnt = ring_buffer_entries(tr->buffer);
72 73
@@ -84,7 +85,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
84 break; 85 break;
85 } 86 }
86 tracing_on(); 87 tracing_on();
87 __raw_spin_unlock(&ftrace_max_lock); 88 arch_spin_unlock(&ftrace_max_lock);
88 local_irq_restore(flags); 89 local_irq_restore(flags);
89 90
90 if (count) 91 if (count)
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 809 return ret;
809} 810}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy;
815
816int
817trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
818{
819 unsigned long count;
820 int ret;
821
822 /* start the tracing */
823 ret = tracer_init(trace, tr);
824 if (ret) {
825 warn_failed_init_tracer(trace, ret);
826 return ret;
827 }
828
829 ksym_selftest_dummy = 0;
830 /* Register the read-write tracing request */
831
832 ret = process_new_ksym_entry("ksym_selftest_dummy",
833 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
834 (unsigned long)(&ksym_selftest_dummy));
835
836 if (ret < 0) {
837 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
838 goto ret_path;
839 }
840 /* Perform a read and a write operation over the dummy variable to
841 * trigger the tracer
842 */
843 if (ksym_selftest_dummy == 0)
844 ksym_selftest_dummy++;
845
846 /* stop the tracing. */
847 tracing_stop();
848 /* check the trace buffer */
849 ret = trace_test_buffer(tr, &count);
850 trace->reset(tr);
851 tracing_start();
852
853 /* read & write operations - one each is performed on the dummy variable
854 * triggering two entries in the trace buffer
855 */
856 if (!ret && count != 2) {
857 printk(KERN_CONT "Ksym tracer startup test failed");
858 ret = -1;
859 }
860
861ret_path:
862 return ret;
863}
864#endif /* CONFIG_KSYM_TRACER */
865
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8504ac71e4e..f4bc9b27de5 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = {
27}; 27};
28 28
29static unsigned long max_stack_size; 29static unsigned long max_stack_size;
30static raw_spinlock_t max_stack_lock = 30static arch_spinlock_t max_stack_lock =
31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 31 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
32 32
33static int stack_trace_disabled __read_mostly; 33static int stack_trace_disabled __read_mostly;
34static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
@@ -54,7 +54,7 @@ static inline void check_stack(void)
54 return; 54 return;
55 55
56 local_irq_save(flags); 56 local_irq_save(flags);
57 __raw_spin_lock(&max_stack_lock); 57 arch_spin_lock(&max_stack_lock);
58 58
59 /* a race could have already updated it */ 59 /* a race could have already updated it */
60 if (this_size <= max_stack_size) 60 if (this_size <= max_stack_size)
@@ -103,7 +103,7 @@ static inline void check_stack(void)
103 } 103 }
104 104
105 out: 105 out:
106 __raw_spin_unlock(&max_stack_lock); 106 arch_spin_unlock(&max_stack_lock);
107 local_irq_restore(flags); 107 local_irq_restore(flags);
108} 108}
109 109
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
174 __raw_spin_lock(&max_stack_lock); 175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 __raw_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,8 +218,14 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
210 __raw_spin_lock(&max_stack_lock); 224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
213 return SEQ_START_TOKEN; 231 return SEQ_START_TOKEN;
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
220 __raw_spin_unlock(&max_stack_lock); 238 int cpu;
239
240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 527e17eae57..75289f372dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -14,6 +14,43 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 16
17extern unsigned long __start_syscalls_metadata[];
18extern unsigned long __stop_syscalls_metadata[];
19
20static struct syscall_metadata **syscalls_metadata;
21
22static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
23{
24 struct syscall_metadata *start;
25 struct syscall_metadata *stop;
26 char str[KSYM_SYMBOL_LEN];
27
28
29 start = (struct syscall_metadata *)__start_syscalls_metadata;
30 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
31 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
32
33 for ( ; start < stop; start++) {
34 /*
35 * Only compare after the "sys" prefix. Archs that use
36 * syscall wrappers may have syscalls symbols aliases prefixed
37 * with "SyS" instead of "sys", leading to an unwanted
38 * mismatch.
39 */
40 if (start->name && !strcmp(start->name + 3, str + 3))
41 return start;
42 }
43 return NULL;
44}
45
46static struct syscall_metadata *syscall_nr_to_meta(int nr)
47{
48 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
49 return NULL;
50
51 return syscalls_metadata[nr];
52}
53
17enum print_line_t 54enum print_line_t
18print_syscall_enter(struct trace_iterator *iter, int flags) 55print_syscall_enter(struct trace_iterator *iter, int flags)
19{ 56{
@@ -30,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
30 if (!entry) 67 if (!entry)
31 goto end; 68 goto end;
32 69
33 if (entry->enter_id != ent->type) { 70 if (entry->enter_event->id != ent->type) {
34 WARN_ON_ONCE(1); 71 WARN_ON_ONCE(1);
35 goto end; 72 goto end;
36 } 73 }
@@ -85,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
85 return TRACE_TYPE_HANDLED; 122 return TRACE_TYPE_HANDLED;
86 } 123 }
87 124
88 if (entry->exit_id != ent->type) { 125 if (entry->exit_event->id != ent->type) {
89 WARN_ON_ONCE(1); 126 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED; 127 return TRACE_TYPE_UNHANDLED;
91 } 128 }
@@ -103,24 +140,19 @@ extern char *__bad_type_size(void);
103#define SYSCALL_FIELD(type, name) \ 140#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \ 141 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \ 142 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type)
107 145
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
109{ 147{
110 int i; 148 int i;
111 int nr;
112 int ret; 149 int ret;
113 struct syscall_metadata *entry; 150 struct syscall_metadata *entry = call->data;
114 struct syscall_trace_enter trace; 151 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args); 152 int offset = offsetof(struct syscall_trace_enter, args);
116 153
117 nr = syscall_name_to_nr(call->data); 154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
118 entry = syscall_nr_to_meta(nr); 155 "\tsigned:%u;\n",
119
120 if (!entry)
121 return 0;
122
123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
124 SYSCALL_FIELD(int, nr)); 156 SYSCALL_FIELD(int, nr));
125 if (!ret) 157 if (!ret)
126 return 0; 158 return 0;
@@ -130,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
130 entry->args[i]); 162 entry->args[i]);
131 if (!ret) 163 if (!ret)
132 return 0; 164 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, 165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
134 sizeof(unsigned long)); 166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
135 if (!ret) 169 if (!ret)
136 return 0; 170 return 0;
137 offset += sizeof(unsigned long); 171 offset += sizeof(unsigned long);
@@ -163,8 +197,10 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
163 struct syscall_trace_exit trace; 197 struct syscall_trace_exit trace;
164 198
165 ret = trace_seq_printf(s, 199 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
168 SYSCALL_FIELD(int, nr), 204 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(long, ret)); 205 SYSCALL_FIELD(long, ret));
170 if (!ret) 206 if (!ret)
@@ -176,19 +212,12 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
176int syscall_enter_define_fields(struct ftrace_event_call *call) 212int syscall_enter_define_fields(struct ftrace_event_call *call)
177{ 213{
178 struct syscall_trace_enter trace; 214 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta; 215 struct syscall_metadata *meta = call->data;
180 int ret; 216 int ret;
181 int nr;
182 int i; 217 int i;
183 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
184 219
185 nr = syscall_name_to_nr(call->data); 220 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call);
192 if (ret) 221 if (ret)
193 return ret; 222 return ret;
194 223
@@ -208,11 +237,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
208 struct syscall_trace_exit trace; 237 struct syscall_trace_exit trace;
209 int ret; 238 int ret;
210 239
211 ret = trace_define_common_fields(call); 240 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
212 if (ret) 241 if (ret)
213 return ret; 242 return ret;
214 243
215 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, 244 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
216 FILTER_OTHER); 245 FILTER_OTHER);
217 246
218 return ret; 247 return ret;
@@ -239,8 +268,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
239 268
240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 269 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
241 270
242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, 271 event = trace_current_buffer_lock_reserve(&buffer,
243 size, 0, 0); 272 sys_data->enter_event->id, size, 0, 0);
244 if (!event) 273 if (!event)
245 return; 274 return;
246 275
@@ -271,8 +300,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
271 if (!sys_data) 300 if (!sys_data)
272 return; 301 return;
273 302
274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, 303 event = trace_current_buffer_lock_reserve(&buffer,
275 sizeof(*entry), 0, 0); 304 sys_data->exit_event->id, sizeof(*entry), 0, 0);
276 if (!event) 305 if (!event)
277 return; 306 return;
278 307
@@ -285,23 +314,18 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 314 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
286} 315}
287 316
288int reg_event_syscall_enter(void *ptr) 317int reg_event_syscall_enter(struct ftrace_event_call *call)
289{ 318{
290 int ret = 0; 319 int ret = 0;
291 int num; 320 int num;
292 char *name;
293 321
294 name = (char *)ptr; 322 num = ((struct syscall_metadata *)call->data)->syscall_nr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls) 323 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS; 324 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock); 325 mutex_lock(&syscall_trace_lock);
299 if (!sys_refcount_enter) 326 if (!sys_refcount_enter)
300 ret = register_trace_sys_enter(ftrace_syscall_enter); 327 ret = register_trace_sys_enter(ftrace_syscall_enter);
301 if (ret) { 328 if (!ret) {
302 pr_info("event trace: Could not activate"
303 "syscall entry trace point");
304 } else {
305 set_bit(num, enabled_enter_syscalls); 329 set_bit(num, enabled_enter_syscalls);
306 sys_refcount_enter++; 330 sys_refcount_enter++;
307 } 331 }
@@ -309,13 +333,11 @@ int reg_event_syscall_enter(void *ptr)
309 return ret; 333 return ret;
310} 334}
311 335
312void unreg_event_syscall_enter(void *ptr) 336void unreg_event_syscall_enter(struct ftrace_event_call *call)
313{ 337{
314 int num; 338 int num;
315 char *name;
316 339
317 name = (char *)ptr; 340 num = ((struct syscall_metadata *)call->data)->syscall_nr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls) 341 if (num < 0 || num >= NR_syscalls)
320 return; 342 return;
321 mutex_lock(&syscall_trace_lock); 343 mutex_lock(&syscall_trace_lock);
@@ -326,23 +348,18 @@ void unreg_event_syscall_enter(void *ptr)
326 mutex_unlock(&syscall_trace_lock); 348 mutex_unlock(&syscall_trace_lock);
327} 349}
328 350
329int reg_event_syscall_exit(void *ptr) 351int reg_event_syscall_exit(struct ftrace_event_call *call)
330{ 352{
331 int ret = 0; 353 int ret = 0;
332 int num; 354 int num;
333 char *name;
334 355
335 name = (char *)ptr; 356 num = ((struct syscall_metadata *)call->data)->syscall_nr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls) 357 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS; 358 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock); 359 mutex_lock(&syscall_trace_lock);
340 if (!sys_refcount_exit) 360 if (!sys_refcount_exit)
341 ret = register_trace_sys_exit(ftrace_syscall_exit); 361 ret = register_trace_sys_exit(ftrace_syscall_exit);
342 if (ret) { 362 if (!ret) {
343 pr_info("event trace: Could not activate"
344 "syscall exit trace point");
345 } else {
346 set_bit(num, enabled_exit_syscalls); 363 set_bit(num, enabled_exit_syscalls);
347 sys_refcount_exit++; 364 sys_refcount_exit++;
348 } 365 }
@@ -350,13 +367,11 @@ int reg_event_syscall_exit(void *ptr)
350 return ret; 367 return ret;
351} 368}
352 369
353void unreg_event_syscall_exit(void *ptr) 370void unreg_event_syscall_exit(struct ftrace_event_call *call)
354{ 371{
355 int num; 372 int num;
356 char *name;
357 373
358 name = (char *)ptr; 374 num = ((struct syscall_metadata *)call->data)->syscall_nr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls) 375 if (num < 0 || num >= NR_syscalls)
361 return; 376 return;
362 mutex_lock(&syscall_trace_lock); 377 mutex_lock(&syscall_trace_lock);
@@ -367,13 +382,44 @@ void unreg_event_syscall_exit(void *ptr)
367 mutex_unlock(&syscall_trace_lock); 382 mutex_unlock(&syscall_trace_lock);
368} 383}
369 384
370struct trace_event event_syscall_enter = { 385int init_syscall_trace(struct ftrace_event_call *call)
371 .trace = print_syscall_enter, 386{
372}; 387 int id;
388
389 id = register_ftrace_event(call->event);
390 if (!id)
391 return -ENODEV;
392 call->id = id;
393 INIT_LIST_HEAD(&call->fields);
394 return 0;
395}
396
397int __init init_ftrace_syscalls(void)
398{
399 struct syscall_metadata *meta;
400 unsigned long addr;
401 int i;
402
403 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
404 NR_syscalls, GFP_KERNEL);
405 if (!syscalls_metadata) {
406 WARN_ON(1);
407 return -ENOMEM;
408 }
409
410 for (i = 0; i < NR_syscalls; i++) {
411 addr = arch_syscall_addr(i);
412 meta = find_syscall_meta(addr);
413 if (!meta)
414 continue;
415
416 meta->syscall_nr = i;
417 syscalls_metadata[i] = meta;
418 }
373 419
374struct trace_event event_syscall_exit = { 420 return 0;
375 .trace = print_syscall_exit, 421}
376}; 422core_initcall(init_ftrace_syscalls);
377 423
378#ifdef CONFIG_EVENT_PROFILE 424#ifdef CONFIG_EVENT_PROFILE
379 425
@@ -387,8 +433,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
387 struct syscall_metadata *sys_data; 433 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec; 434 struct syscall_trace_enter *rec;
389 unsigned long flags; 435 unsigned long flags;
436 char *trace_buf;
390 char *raw_data; 437 char *raw_data;
391 int syscall_nr; 438 int syscall_nr;
439 int rctx;
392 int size; 440 int size;
393 int cpu; 441 int cpu;
394 442
@@ -412,41 +460,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
412 /* Protect the per cpu buffer, begin the rcu read side */ 460 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags); 461 local_irq_save(flags);
414 462
463 rctx = perf_swevent_get_recursion_context();
464 if (rctx < 0)
465 goto end_recursion;
466
415 cpu = smp_processor_id(); 467 cpu = smp_processor_id();
416 468
417 if (in_nmi()) 469 trace_buf = rcu_dereference(perf_trace_buf);
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421 470
422 if (!raw_data) 471 if (!trace_buf)
423 goto end; 472 goto end;
424 473
425 raw_data = per_cpu_ptr(raw_data, cpu); 474 raw_data = per_cpu_ptr(trace_buf, cpu);
426 475
427 /* zero the dead bytes from align to not leak stack to user */ 476 /* zero the dead bytes from align to not leak stack to user */
428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 477 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
429 478
430 rec = (struct syscall_trace_enter *) raw_data; 479 rec = (struct syscall_trace_enter *) raw_data;
431 tracing_generic_entry_update(&rec->ent, 0, 0); 480 tracing_generic_entry_update(&rec->ent, 0, 0);
432 rec->ent.type = sys_data->enter_id; 481 rec->ent.type = sys_data->enter_event->id;
433 rec->nr = syscall_nr; 482 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 483 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args); 484 (unsigned long *)&rec->args);
436 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 485 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
437 486
438end: 487end:
488 perf_swevent_put_recursion_context(rctx);
489end_recursion:
439 local_irq_restore(flags); 490 local_irq_restore(flags);
440} 491}
441 492
442int reg_prof_syscall_enter(char *name) 493int prof_sysenter_enable(struct ftrace_event_call *call)
443{ 494{
444 int ret = 0; 495 int ret = 0;
445 int num; 496 int num;
446 497
447 num = syscall_name_to_nr(name); 498 num = ((struct syscall_metadata *)call->data)->syscall_nr;
448 if (num < 0 || num >= NR_syscalls)
449 return -ENOSYS;
450 499
451 mutex_lock(&syscall_trace_lock); 500 mutex_lock(&syscall_trace_lock);
452 if (!sys_prof_refcount_enter) 501 if (!sys_prof_refcount_enter)
@@ -462,13 +511,11 @@ int reg_prof_syscall_enter(char *name)
462 return ret; 511 return ret;
463} 512}
464 513
465void unreg_prof_syscall_enter(char *name) 514void prof_sysenter_disable(struct ftrace_event_call *call)
466{ 515{
467 int num; 516 int num;
468 517
469 num = syscall_name_to_nr(name); 518 num = ((struct syscall_metadata *)call->data)->syscall_nr;
470 if (num < 0 || num >= NR_syscalls)
471 return;
472 519
473 mutex_lock(&syscall_trace_lock); 520 mutex_lock(&syscall_trace_lock);
474 sys_prof_refcount_enter--; 521 sys_prof_refcount_enter--;
@@ -484,7 +531,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
484 struct syscall_trace_exit *rec; 531 struct syscall_trace_exit *rec;
485 unsigned long flags; 532 unsigned long flags;
486 int syscall_nr; 533 int syscall_nr;
534 char *trace_buf;
487 char *raw_data; 535 char *raw_data;
536 int rctx;
488 int size; 537 int size;
489 int cpu; 538 int cpu;
490 539
@@ -510,17 +559,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
510 559
511 /* Protect the per cpu buffer, begin the rcu read side */ 560 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags); 561 local_irq_save(flags);
562
563 rctx = perf_swevent_get_recursion_context();
564 if (rctx < 0)
565 goto end_recursion;
566
513 cpu = smp_processor_id(); 567 cpu = smp_processor_id();
514 568
515 if (in_nmi()) 569 trace_buf = rcu_dereference(perf_trace_buf);
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519 570
520 if (!raw_data) 571 if (!trace_buf)
521 goto end; 572 goto end;
522 573
523 raw_data = per_cpu_ptr(raw_data, cpu); 574 raw_data = per_cpu_ptr(trace_buf, cpu);
524 575
525 /* zero the dead bytes from align to not leak stack to user */ 576 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 577 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -528,24 +579,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
528 rec = (struct syscall_trace_exit *)raw_data; 579 rec = (struct syscall_trace_exit *)raw_data;
529 580
530 tracing_generic_entry_update(&rec->ent, 0, 0); 581 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id; 582 rec->ent.type = sys_data->exit_event->id;
532 rec->nr = syscall_nr; 583 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs); 584 rec->ret = syscall_get_return_value(current, regs);
534 585
535 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 586 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
536 587
537end: 588end:
589 perf_swevent_put_recursion_context(rctx);
590end_recursion:
538 local_irq_restore(flags); 591 local_irq_restore(flags);
539} 592}
540 593
541int reg_prof_syscall_exit(char *name) 594int prof_sysexit_enable(struct ftrace_event_call *call)
542{ 595{
543 int ret = 0; 596 int ret = 0;
544 int num; 597 int num;
545 598
546 num = syscall_name_to_nr(name); 599 num = ((struct syscall_metadata *)call->data)->syscall_nr;
547 if (num < 0 || num >= NR_syscalls)
548 return -ENOSYS;
549 600
550 mutex_lock(&syscall_trace_lock); 601 mutex_lock(&syscall_trace_lock);
551 if (!sys_prof_refcount_exit) 602 if (!sys_prof_refcount_exit)
@@ -561,13 +612,11 @@ int reg_prof_syscall_exit(char *name)
561 return ret; 612 return ret;
562} 613}
563 614
564void unreg_prof_syscall_exit(char *name) 615void prof_sysexit_disable(struct ftrace_event_call *call)
565{ 616{
566 int num; 617 int num;
567 618
568 num = syscall_name_to_nr(name); 619 num = ((struct syscall_metadata *)call->data)->syscall_nr;
569 if (num < 0 || num >= NR_syscalls)
570 return;
571 620
572 mutex_lock(&syscall_trace_lock); 621 mutex_lock(&syscall_trace_lock);
573 sys_prof_refcount_exit--; 622 sys_prof_refcount_exit--;
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f6693969287..a7974a552ca 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -93,6 +93,7 @@ static const struct stacktrace_ops backtrace_ops = {
93 .warning_symbol = backtrace_warning_symbol, 93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack, 94 .stack = backtrace_stack,
95 .address = backtrace_address, 95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
96}; 97};
97 98
98static int 99static int
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 00000000000..eb27fd3430a
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,44 @@
1
2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h>
4#include <linux/sched.h>
5#include <linux/module.h>
6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8
9/*
10 * Request a notification when the current cpu returns to userspace. Must be
11 * called in atomic context. The notifier will also be called in atomic
12 * context.
13 */
14void user_return_notifier_register(struct user_return_notifier *urn)
15{
16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
17 hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
18}
19EXPORT_SYMBOL_GPL(user_return_notifier_register);
20
21/*
22 * Removes a registered user return notifier. Must be called from atomic
23 * context, and from the same cpu registration occured in.
24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{
27 hlist_del(&urn->link);
28 if (hlist_empty(&__get_cpu_var(return_notifier_list)))
29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
30}
31EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
32
33/* Calls registered user return notifiers */
34void fire_user_return_notifiers(void)
35{
36 struct user_return_notifier *urn;
37 struct hlist_node *tmp1, *tmp2;
38 struct hlist_head *head;
39
40 head = &get_cpu_var(return_notifier_list);
41 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
42 urn->on_user_return(urn);
43 put_cpu_var(return_notifier_list);
44}
diff --git a/kernel/user.c b/kernel/user.c
index 2c000e7132a..46d0165ca70 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -330,9 +330,9 @@ done:
330 */ 330 */
331static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
332{ 332{
333 spin_unlock_irqrestore(&uidhash_lock, flags);
334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct); 333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); 334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336} 336}
337 337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 69eae358a72..a2cd77e70d4 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write,
57#define proc_do_uts_string NULL 57#define proc_do_uts_string NULL
58#endif 58#endif
59 59
60
61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen)
66{
67 struct ctl_table uts_table;
68 int r, write;
69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 put_uts(table, write, uts_table.data);
74 return r;
75}
76#else
77#define sysctl_uts_string NULL
78#endif
79
80static struct ctl_table uts_kern_table[] = { 60static struct ctl_table uts_kern_table[] = {
81 { 61 {
82 .ctl_name = KERN_OSTYPE,
83 .procname = "ostype", 62 .procname = "ostype",
84 .data = init_uts_ns.name.sysname, 63 .data = init_uts_ns.name.sysname,
85 .maxlen = sizeof(init_uts_ns.name.sysname), 64 .maxlen = sizeof(init_uts_ns.name.sysname),
86 .mode = 0444, 65 .mode = 0444,
87 .proc_handler = proc_do_uts_string, 66 .proc_handler = proc_do_uts_string,
88 .strategy = sysctl_uts_string,
89 }, 67 },
90 { 68 {
91 .ctl_name = KERN_OSRELEASE,
92 .procname = "osrelease", 69 .procname = "osrelease",
93 .data = init_uts_ns.name.release, 70 .data = init_uts_ns.name.release,
94 .maxlen = sizeof(init_uts_ns.name.release), 71 .maxlen = sizeof(init_uts_ns.name.release),
95 .mode = 0444, 72 .mode = 0444,
96 .proc_handler = proc_do_uts_string, 73 .proc_handler = proc_do_uts_string,
97 .strategy = sysctl_uts_string,
98 }, 74 },
99 { 75 {
100 .ctl_name = KERN_VERSION,
101 .procname = "version", 76 .procname = "version",
102 .data = init_uts_ns.name.version, 77 .data = init_uts_ns.name.version,
103 .maxlen = sizeof(init_uts_ns.name.version), 78 .maxlen = sizeof(init_uts_ns.name.version),
104 .mode = 0444, 79 .mode = 0444,
105 .proc_handler = proc_do_uts_string, 80 .proc_handler = proc_do_uts_string,
106 .strategy = sysctl_uts_string,
107 }, 81 },
108 { 82 {
109 .ctl_name = KERN_NODENAME,
110 .procname = "hostname", 83 .procname = "hostname",
111 .data = init_uts_ns.name.nodename, 84 .data = init_uts_ns.name.nodename,
112 .maxlen = sizeof(init_uts_ns.name.nodename), 85 .maxlen = sizeof(init_uts_ns.name.nodename),
113 .mode = 0644, 86 .mode = 0644,
114 .proc_handler = proc_do_uts_string, 87 .proc_handler = proc_do_uts_string,
115 .strategy = sysctl_uts_string,
116 }, 88 },
117 { 89 {
118 .ctl_name = KERN_DOMAINNAME,
119 .procname = "domainname", 90 .procname = "domainname",
120 .data = init_uts_ns.name.domainname, 91 .data = init_uts_ns.name.domainname,
121 .maxlen = sizeof(init_uts_ns.name.domainname), 92 .maxlen = sizeof(init_uts_ns.name.domainname),
122 .mode = 0644, 93 .mode = 0644,
123 .proc_handler = proc_do_uts_string, 94 .proc_handler = proc_do_uts_string,
124 .strategy = sysctl_uts_string,
125 }, 95 },
126 {} 96 {}
127}; 97};
128 98
129static struct ctl_table uts_root_table[] = { 99static struct ctl_table uts_root_table[] = {
130 { 100 {
131 .ctl_name = CTL_KERN,
132 .procname = "kernel", 101 .procname = "kernel",
133 .mode = 0555, 102 .mode = 0555,
134 .child = uts_kern_table, 103 .child = uts_kern_table,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b..dee48658805 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
68#endif 68#endif
69}; 69};
70 70
71#ifdef CONFIG_DEBUG_OBJECTS_WORK
72
73static struct debug_obj_descr work_debug_descr;
74
75/*
76 * fixup_init is called when:
77 * - an active object is initialized
78 */
79static int work_fixup_init(void *addr, enum debug_obj_state state)
80{
81 struct work_struct *work = addr;
82
83 switch (state) {
84 case ODEBUG_STATE_ACTIVE:
85 cancel_work_sync(work);
86 debug_object_init(work, &work_debug_descr);
87 return 1;
88 default:
89 return 0;
90 }
91}
92
93/*
94 * fixup_activate is called when:
95 * - an active object is activated
96 * - an unknown object is activated (might be a statically initialized object)
97 */
98static int work_fixup_activate(void *addr, enum debug_obj_state state)
99{
100 struct work_struct *work = addr;
101
102 switch (state) {
103
104 case ODEBUG_STATE_NOTAVAILABLE:
105 /*
106 * This is not really a fixup. The work struct was
107 * statically initialized. We just make sure that it
108 * is tracked in the object tracker.
109 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr);
113 return 0;
114 }
115 WARN_ON_ONCE(1);
116 return 0;
117
118 case ODEBUG_STATE_ACTIVE:
119 WARN_ON(1);
120
121 default:
122 return 0;
123 }
124}
125
126/*
127 * fixup_free is called when:
128 * - an active object is freed
129 */
130static int work_fixup_free(void *addr, enum debug_obj_state state)
131{
132 struct work_struct *work = addr;
133
134 switch (state) {
135 case ODEBUG_STATE_ACTIVE:
136 cancel_work_sync(work);
137 debug_object_free(work, &work_debug_descr);
138 return 1;
139 default:
140 return 0;
141 }
142}
143
144static struct debug_obj_descr work_debug_descr = {
145 .name = "work_struct",
146 .fixup_init = work_fixup_init,
147 .fixup_activate = work_fixup_activate,
148 .fixup_free = work_fixup_free,
149};
150
151static inline void debug_work_activate(struct work_struct *work)
152{
153 debug_object_activate(work, &work_debug_descr);
154}
155
156static inline void debug_work_deactivate(struct work_struct *work)
157{
158 debug_object_deactivate(work, &work_debug_descr);
159}
160
161void __init_work(struct work_struct *work, int onstack)
162{
163 if (onstack)
164 debug_object_init_on_stack(work, &work_debug_descr);
165 else
166 debug_object_init(work, &work_debug_descr);
167}
168EXPORT_SYMBOL_GPL(__init_work);
169
170void destroy_work_on_stack(struct work_struct *work)
171{
172 debug_object_free(work, &work_debug_descr);
173}
174EXPORT_SYMBOL_GPL(destroy_work_on_stack);
175
176#else
/* Without CONFIG_DEBUG_OBJECTS_WORK the activate hook does nothing. */
static inline void debug_work_activate(struct work_struct *work)
{
}
/* Without CONFIG_DEBUG_OBJECTS_WORK the deactivate hook does nothing. */
static inline void debug_work_deactivate(struct work_struct *work)
{
}
179#endif
180
71/* Serializes the accesses to the list of workqueues. */ 181/* Serializes the accesses to the list of workqueues. */
72static DEFINE_SPINLOCK(workqueue_lock); 182static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 183static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
145{ 255{
146 unsigned long flags; 256 unsigned long flags;
147 257
258 debug_work_activate(work);
148 spin_lock_irqsave(&cwq->lock, flags); 259 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, &cwq->worklist); 260 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 261 spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
280 struct lockdep_map lockdep_map = work->lockdep_map; 391 struct lockdep_map lockdep_map = work->lockdep_map;
281#endif 392#endif
282 trace_workqueue_execution(cwq->thread, work); 393 trace_workqueue_execution(cwq->thread, work);
394 debug_work_deactivate(work);
283 cwq->current_work = work; 395 cwq->current_work = work;
284 list_del_init(cwq->worklist.next); 396 list_del_init(cwq->worklist.next);
285 spin_unlock_irq(&cwq->lock); 397 spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
350static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 462static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
351 struct wq_barrier *barr, struct list_head *head) 463 struct wq_barrier *barr, struct list_head *head)
352{ 464{
353 INIT_WORK(&barr->work, wq_barrier_func); 465 /*
466 * debugobject calls are safe here even with cwq->lock locked
467 * as we know for sure that this will not trigger any of the
468 * checks and call back into the fixup functions where we
469 * might deadlock.
470 */
471 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
354 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 472 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
355 473
356 init_completion(&barr->done); 474 init_completion(&barr->done);
357 475
476 debug_work_activate(&barr->work);
358 insert_work(cwq, &barr->work, head); 477 insert_work(cwq, &barr->work, head);
359} 478}
360 479
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
372 } 491 }
373 spin_unlock_irq(&cwq->lock); 492 spin_unlock_irq(&cwq->lock);
374 493
375 if (active) 494 if (active) {
376 wait_for_completion(&barr.done); 495 wait_for_completion(&barr.done);
496 destroy_work_on_stack(&barr.work);
497 }
377 498
378 return active; 499 return active;
379} 500}
@@ -451,6 +572,7 @@ out:
451 return 0; 572 return 0;
452 573
453 wait_for_completion(&barr.done); 574 wait_for_completion(&barr.done);
575 destroy_work_on_stack(&barr.work);
454 return 1; 576 return 1;
455} 577}
456EXPORT_SYMBOL_GPL(flush_work); 578EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
485 */ 607 */
486 smp_rmb(); 608 smp_rmb();
487 if (cwq == get_wq_data(work)) { 609 if (cwq == get_wq_data(work)) {
610 debug_work_deactivate(work);
488 list_del_init(&work->entry); 611 list_del_init(&work->entry);
489 ret = 1; 612 ret = 1;
490 } 613 }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
507 } 630 }
508 spin_unlock_irq(&cwq->lock); 631 spin_unlock_irq(&cwq->lock);
509 632
510 if (unlikely(running)) 633 if (unlikely(running)) {
511 wait_for_completion(&barr.done); 634 wait_for_completion(&barr.done);
635 destroy_work_on_stack(&barr.work);
636 }
512} 637}
513 638
514static void wait_on_work(struct work_struct *work) 639static void wait_on_work(struct work_struct *work)
@@ -640,6 +765,24 @@ int schedule_delayed_work(struct delayed_work *dwork,
640EXPORT_SYMBOL(schedule_delayed_work); 765EXPORT_SYMBOL(schedule_delayed_work);
641 766
642/** 767/**
768 * flush_delayed_work - block until a dwork_struct's callback has terminated
769 * @dwork: the delayed work which is to be flushed
770 *
771 * Any timeout is cancelled, and any pending work is run immediately.
772 */
773void flush_delayed_work(struct delayed_work *dwork)
774{
775 if (del_timer_sync(&dwork->timer)) {
776 struct cpu_workqueue_struct *cwq;
777 cwq = wq_per_cpu(keventd_wq, get_cpu());
778 __queue_work(cwq, &dwork->work);
779 put_cpu();
780 }
781 flush_work(&dwork->work);
782}
783EXPORT_SYMBOL(flush_delayed_work);
784
785/**
643 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 786 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
644 * @cpu: cpu to use 787 * @cpu: cpu to use
645 * @dwork: job to be done 788 * @dwork: job to be done
@@ -667,6 +810,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
667int schedule_on_each_cpu(work_func_t func) 810int schedule_on_each_cpu(work_func_t func)
668{ 811{
669 int cpu; 812 int cpu;
813 int orig = -1;
670 struct work_struct *works; 814 struct work_struct *works;
671 815
672 works = alloc_percpu(struct work_struct); 816 works = alloc_percpu(struct work_struct);
@@ -674,14 +818,28 @@ int schedule_on_each_cpu(work_func_t func)
674 return -ENOMEM; 818 return -ENOMEM;
675 819
676 get_online_cpus(); 820 get_online_cpus();
821
822 /*
823 * When running in keventd don't schedule a work item on
824 * itself. Can just call directly because the work queue is
825 * already bound. This also is faster.
826 */
827 if (current_is_keventd())
828 orig = raw_smp_processor_id();
829
677 for_each_online_cpu(cpu) { 830 for_each_online_cpu(cpu) {
678 struct work_struct *work = per_cpu_ptr(works, cpu); 831 struct work_struct *work = per_cpu_ptr(works, cpu);
679 832
680 INIT_WORK(work, func); 833 INIT_WORK(work, func);
681 schedule_work_on(cpu, work); 834 if (cpu != orig)
835 schedule_work_on(cpu, work);
682 } 836 }
837 if (orig >= 0)
838 func(per_cpu_ptr(works, orig));
839
683 for_each_online_cpu(cpu) 840 for_each_online_cpu(cpu)
684 flush_work(per_cpu_ptr(works, cpu)); 841 flush_work(per_cpu_ptr(works, cpu));
842
685 put_online_cpus(); 843 put_online_cpus();
686 free_percpu(works); 844 free_percpu(works);
687 return 0; 845 return 0;