aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks202
-rw-r--r--kernel/Makefile13
-rw-r--r--kernel/acct.c30
-rw-r--r--kernel/async.c1
-rw-r--r--kernel/audit.c3
-rw-r--r--kernel/audit_tree.c114
-rw-r--r--kernel/audit_watch.c1
-rw-r--r--kernel/auditfilter.c1
-rw-r--r--kernel/auditsc.c11
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/capability.c19
-rw-r--r--kernel/cgroup.c777
-rw-r--r--kernel/cgroup_freezer.c15
-rw-r--r--kernel/compat.c1
-rw-r--r--kernel/cpu.c44
-rw-r--r--kernel/cpuset.c151
-rw-r--r--kernel/cred.c13
-rw-r--r--kernel/early_res.c584
-rw-r--r--kernel/elfcore.c28
-rw-r--r--kernel/exit.c89
-rw-r--r--kernel/fork.c104
-rw-r--r--kernel/futex.c117
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/hrtimer.c178
-rw-r--r--kernel/hung_task.c2
-rw-r--r--kernel/hw_breakpoint.c492
-rw-r--r--kernel/irq/autoprobe.c20
-rw-r--r--kernel/irq/chip.c181
-rw-r--r--kernel/irq/devres.c4
-rw-r--r--kernel/irq/handle.c80
-rw-r--r--kernel/irq/internals.h8
-rw-r--r--kernel/irq/manage.c84
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/numa_migrate.c13
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/proc.c45
-rw-r--r--kernel/irq/spurious.c30
-rw-r--r--kernel/itimer.c7
-rw-r--r--kernel/kallsyms.c2
-rw-r--r--kernel/kexec.c65
-rw-r--r--kernel/kfifo.c410
-rw-r--r--kernel/kgdb.c264
-rw-r--r--kernel/kmod.c20
-rw-r--r--kernel/kprobes.c754
-rw-r--r--kernel/ksysfs.c31
-rw-r--r--kernel/kthread.c27
-rw-r--r--kernel/latencytop.c1
-rw-r--r--kernel/lockdep.c101
-rw-r--r--kernel/module.c361
-rw-r--r--kernel/mutex-debug.h12
-rw-r--r--kernel/mutex.c4
-rw-r--r--kernel/notifier.c8
-rw-r--r--kernel/nsproxy.c14
-rw-r--r--kernel/padata.c697
-rw-r--r--kernel/panic.c50
-rw-r--r--kernel/params.c18
-rw-r--r--kernel/perf_event.c1537
-rw-r--r--kernel/pid.c18
-rw-r--r--kernel/pid_namespace.c8
-rw-r--r--kernel/pm_qos_params.c20
-rw-r--r--kernel/posix-cpu-timers.c51
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/Kconfig19
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/console.c7
-rw-r--r--kernel/power/hibernate.c40
-rw-r--r--kernel/power/hibernate_nvs.c1
-rw-r--r--kernel/power/main.c32
-rw-r--r--kernel/power/process.c19
-rw-r--r--kernel/power/snapshot.c5
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/power/swap.c112
-rw-r--r--kernel/power/swsusp.c188
-rw-r--r--kernel/power/user.c25
-rw-r--r--kernel/printk.c182
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/ptrace.c88
-rw-r--r--kernel/range.c163
-rw-r--r--kernel/rcupdate.c166
-rw-r--r--kernel/rcutiny.c282
-rw-r--r--kernel/rcutorture.c167
-rw-r--r--kernel/rcutree.c627
-rw-r--r--kernel/rcutree.h133
-rw-r--r--kernel/rcutree_plugin.h512
-rw-r--r--kernel/rcutree_trace.c24
-rw-r--r--kernel/relay.c7
-rw-r--r--kernel/res_counter.c1
-rw-r--r--kernel/resource.c138
-rw-r--r--kernel/rtmutex-debug.c4
-rw-r--r--kernel/rtmutex.c106
-rw-r--r--kernel/sched.c3170
-rw-r--r--kernel/sched_clock.c23
-rw-r--r--kernel/sched_cpupri.c17
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c27
-rw-r--r--kernel/sched_fair.c1970
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_idletask.c27
-rw-r--r--kernel/sched_rt.c191
-rw-r--r--kernel/signal.c182
-rw-r--r--kernel/slow-work.c9
-rw-r--r--kernel/slow-work.h8
-rw-r--r--kernel/smp.c100
-rw-r--r--kernel/softirq.c21
-rw-r--r--kernel/softlockup.c69
-rw-r--r--kernel/spinlock.c448
-rw-r--r--kernel/srcu.c121
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c123
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c983
-rw-r--r--kernel/sysctl_binary.c1541
-rw-r--r--kernel/sysctl_check.c1376
-rw-r--r--kernel/taskstats.c7
-rw-r--r--kernel/time.c31
-rw-r--r--kernel/time/clockevents.c46
-rw-r--r--kernel/time/clocksource.c141
-rw-r--r--kernel/time/ntp.c10
-rw-r--r--kernel/time/tick-broadcast.c42
-rw-r--r--kernel/time/tick-common.c20
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-oneshot.c52
-rw-r--r--kernel/time/tick-sched.c141
-rw-r--r--kernel/time/timecompare.c9
-rw-r--r--kernel/time/timekeeping.c106
-rw-r--r--kernel/time/timer_list.c26
-rw-r--r--kernel/time/timer_stats.c18
-rw-r--r--kernel/timer.c7
-rw-r--r--kernel/trace/Kconfig147
-rw-r--r--kernel/trace/Makefile6
-rw-r--r--kernel/trace/blktrace.c6
-rw-r--r--kernel/trace/ftrace.c533
-rw-r--r--kernel/trace/power-traces.c3
-rw-r--r--kernel/trace/ring_buffer.c137
-rw-r--r--kernel/trace/ring_buffer_benchmark.c86
-rw-r--r--kernel/trace/trace.c514
-rw-r--r--kernel/trace/trace.h118
-rw-r--r--kernel/trace/trace_branch.c19
-rw-r--r--kernel/trace/trace_clock.c21
-rw-r--r--kernel/trace/trace_entries.h16
-rw-r--r--kernel/trace/trace_event_perf.c175
-rw-r--r--kernel/trace/trace_event_profile.c125
-rw-r--r--kernel/trace/trace_events.c288
-rw-r--r--kernel/trace/trace_events_filter.c439
-rw-r--r--kernel/trace/trace_export.c113
-rw-r--r--kernel/trace/trace_functions_graph.c267
-rw-r--r--kernel/trace/trace_hw_branches.c51
-rw-r--r--kernel/trace/trace_irqsoff.c2
-rw-r--r--kernel/trace/trace_kprobe.c1488
-rw-r--r--kernel/trace/trace_ksym.c520
-rw-r--r--kernel/trace/trace_mmiotrace.c1
-rw-r--r--kernel/trace/trace_output.c75
-rw-r--r--kernel/trace/trace_sched_wakeup.c16
-rw-r--r--kernel/trace/trace_selftest.c60
-rw-r--r--kernel/trace/trace_stack.c40
-rw-r--r--kernel/trace/trace_stat.c1
-rw-r--r--kernel/trace/trace_syscalls.c397
-rw-r--r--kernel/trace/trace_sysprof.c1
-rw-r--r--kernel/trace/trace_workqueue.c1
-rw-r--r--kernel/tsacct.c1
-rw-r--r--kernel/user-return-notifier.c44
-rw-r--r--kernel/user.c305
-rw-r--r--kernel/utsname_sysctl.c31
-rw-r--r--kernel/workqueue.c133
164 files changed, 18314 insertions, 9955 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000000..88c92fb44618
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,202 @@
1#
2# The ARCH_INLINE foo is necessary because select ignores "depends on"
3#
4config ARCH_INLINE_SPIN_TRYLOCK
5 bool
6
7config ARCH_INLINE_SPIN_TRYLOCK_BH
8 bool
9
10config ARCH_INLINE_SPIN_LOCK
11 bool
12
13config ARCH_INLINE_SPIN_LOCK_BH
14 bool
15
16config ARCH_INLINE_SPIN_LOCK_IRQ
17 bool
18
19config ARCH_INLINE_SPIN_LOCK_IRQSAVE
20 bool
21
22config ARCH_INLINE_SPIN_UNLOCK
23 bool
24
25config ARCH_INLINE_SPIN_UNLOCK_BH
26 bool
27
28config ARCH_INLINE_SPIN_UNLOCK_IRQ
29 bool
30
31config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
32 bool
33
34
35config ARCH_INLINE_READ_TRYLOCK
36 bool
37
38config ARCH_INLINE_READ_LOCK
39 bool
40
41config ARCH_INLINE_READ_LOCK_BH
42 bool
43
44config ARCH_INLINE_READ_LOCK_IRQ
45 bool
46
47config ARCH_INLINE_READ_LOCK_IRQSAVE
48 bool
49
50config ARCH_INLINE_READ_UNLOCK
51 bool
52
53config ARCH_INLINE_READ_UNLOCK_BH
54 bool
55
56config ARCH_INLINE_READ_UNLOCK_IRQ
57 bool
58
59config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
60 bool
61
62
63config ARCH_INLINE_WRITE_TRYLOCK
64 bool
65
66config ARCH_INLINE_WRITE_LOCK
67 bool
68
69config ARCH_INLINE_WRITE_LOCK_BH
70 bool
71
72config ARCH_INLINE_WRITE_LOCK_IRQ
73 bool
74
75config ARCH_INLINE_WRITE_LOCK_IRQSAVE
76 bool
77
78config ARCH_INLINE_WRITE_UNLOCK
79 bool
80
81config ARCH_INLINE_WRITE_UNLOCK_BH
82 bool
83
84config ARCH_INLINE_WRITE_UNLOCK_IRQ
85 bool
86
87config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
88 bool
89
90#
91# lock_* functions are inlined when:
92# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
93#
94# trylock_* functions are inlined when:
95# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
96#
97# unlock and unlock_irq functions are inlined when:
98# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
99# or
100# - DEBUG_SPINLOCK=n and PREEMPT=n
101#
102# unlock_bh and unlock_irqrestore functions are inlined when:
103# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
104#
105
106config INLINE_SPIN_TRYLOCK
107 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
108
109config INLINE_SPIN_TRYLOCK_BH
110 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
111
112config INLINE_SPIN_LOCK
113 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
114
115config INLINE_SPIN_LOCK_BH
116 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
117 ARCH_INLINE_SPIN_LOCK_BH
118
119config INLINE_SPIN_LOCK_IRQ
120 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
121 ARCH_INLINE_SPIN_LOCK_IRQ
122
123config INLINE_SPIN_LOCK_IRQSAVE
124 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
125 ARCH_INLINE_SPIN_LOCK_IRQSAVE
126
127config INLINE_SPIN_UNLOCK
128 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
129
130config INLINE_SPIN_UNLOCK_BH
131 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
132
133config INLINE_SPIN_UNLOCK_IRQ
134 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ)
135
136config INLINE_SPIN_UNLOCK_IRQRESTORE
137 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
138
139
140config INLINE_READ_TRYLOCK
141 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
142
143config INLINE_READ_LOCK
144 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
145
146config INLINE_READ_LOCK_BH
147 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
148 ARCH_INLINE_READ_LOCK_BH
149
150config INLINE_READ_LOCK_IRQ
151 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
152 ARCH_INLINE_READ_LOCK_IRQ
153
154config INLINE_READ_LOCK_IRQSAVE
155 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
156 ARCH_INLINE_READ_LOCK_IRQSAVE
157
158config INLINE_READ_UNLOCK
159 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
160
161config INLINE_READ_UNLOCK_BH
162 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
163
164config INLINE_READ_UNLOCK_IRQ
165 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ)
166
167config INLINE_READ_UNLOCK_IRQRESTORE
168 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
169
170
171config INLINE_WRITE_TRYLOCK
172 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
173
174config INLINE_WRITE_LOCK
175 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
176
177config INLINE_WRITE_LOCK_BH
178 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
179 ARCH_INLINE_WRITE_LOCK_BH
180
181config INLINE_WRITE_LOCK_IRQ
182 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
183 ARCH_INLINE_WRITE_LOCK_IRQ
184
185config INLINE_WRITE_LOCK_IRQSAVE
186 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
187 ARCH_INLINE_WRITE_LOCK_IRQSAVE
188
189config INLINE_WRITE_UNLOCK
190 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
191
192config INLINE_WRITE_UNLOCK_BH
193 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
194
195config INLINE_WRITE_UNLOCK_IRQ
196 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ)
197
198config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200
201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index d7c13d249b2d..a987aa1676b5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,13 +4,14 @@
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -21,6 +22,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 22CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 23CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 24CFLAGS_REMOVE_sched_clock.o = -pg
25CFLAGS_REMOVE_perf_event.o = -pg
24endif 26endif
25 27
26obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
@@ -82,12 +84,16 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
82obj-$(CONFIG_TREE_RCU) += rcutree.o 84obj-$(CONFIG_TREE_RCU) += rcutree.o
83obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 85obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
84obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 86obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
87obj-$(CONFIG_TINY_RCU) += rcutiny.o
85obj-$(CONFIG_RELAY) += relay.o 88obj-$(CONFIG_RELAY) += relay.o
86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
88obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
89obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
90obj-$(CONFIG_LATENCYTOP) += latencytop.o 93obj-$(CONFIG_LATENCYTOP) += latencytop.o
94obj-$(CONFIG_BINFMT_ELF) += elfcore.o
95obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
91obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
92obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
93obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
@@ -96,6 +102,9 @@ obj-$(CONFIG_SMP) += sched_cpupri.o
96obj-$(CONFIG_SLOW_WORK) += slow-work.o 102obj-$(CONFIG_SLOW_WORK) += slow-work.o
97obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o 103obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
98obj-$(CONFIG_PERF_EVENTS) += perf_event.o 104obj-$(CONFIG_PERF_EVENTS) += perf_event.o
105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
107obj-$(CONFIG_PADATA) += padata.o
99 108
100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 109ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 110# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 9a4715a2f6bf..e4c0e1fee9b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -353,17 +353,18 @@ restart:
353 353
354void acct_exit_ns(struct pid_namespace *ns) 354void acct_exit_ns(struct pid_namespace *ns)
355{ 355{
356 struct bsd_acct_struct *acct; 356 struct bsd_acct_struct *acct = ns->bacct;
357 357
358 spin_lock(&acct_lock); 358 if (acct == NULL)
359 acct = ns->bacct; 359 return;
360 if (acct != NULL) {
361 if (acct->file != NULL)
362 acct_file_reopen(acct, NULL, NULL);
363 360
364 kfree(acct); 361 del_timer_sync(&acct->timer);
365 } 362 spin_lock(&acct_lock);
363 if (acct->file != NULL)
364 acct_file_reopen(acct, NULL, NULL);
366 spin_unlock(&acct_lock); 365 spin_unlock(&acct_lock);
366
367 kfree(acct);
367} 368}
368 369
369/* 370/*
@@ -536,7 +537,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
536 do_div(elapsed, AHZ); 537 do_div(elapsed, AHZ);
537 ac.ac_btime = get_seconds() - elapsed; 538 ac.ac_btime = get_seconds() - elapsed;
538 /* we really need to bite the bullet and change layout */ 539 /* we really need to bite the bullet and change layout */
539 current_uid_gid(&ac.ac_uid, &ac.ac_gid); 540 ac.ac_uid = orig_cred->uid;
541 ac.ac_gid = orig_cred->gid;
540#if ACCT_VERSION==2 542#if ACCT_VERSION==2
541 ac.ac_ahz = AHZ; 543 ac.ac_ahz = AHZ;
542#endif 544#endif
@@ -587,16 +589,6 @@ out:
587} 589}
588 590
589/** 591/**
590 * acct_init_pacct - initialize a new pacct_struct
591 * @pacct: per-process accounting info struct to initialize
592 */
593void acct_init_pacct(struct pacct_struct *pacct)
594{
595 memset(pacct, 0, sizeof(struct pacct_struct));
596 pacct->ac_utime = pacct->ac_stime = cputime_zero;
597}
598
599/**
600 * acct_collect - collect accounting information into pacct_struct 592 * acct_collect - collect accounting information into pacct_struct
601 * @exitcode: task exit code 593 * @exitcode: task exit code
602 * @group_dead: not 0, if this thread is the last one in the process. 594 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
56#include <linux/init.h> 56#include <linux/init.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/delay.h> 58#include <linux/delay.h>
59#include <linux/slab.h>
59#include <asm/atomic.h> 60#include <asm/atomic.h>
60 61
61static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..c71bd26631a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h>
49#include <linux/err.h> 50#include <linux/err.h>
50#include <linux/kthread.h> 51#include <linux/kthread.h>
51 52
@@ -398,7 +399,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 skb_get(skb); 399 skb_get(skb);
399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 400 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
400 if (err < 0) { 401 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 402 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 403 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 404 audit_log_lost("auditd dissapeared\n");
404 audit_pid = 0; 405 audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2451dc6f3282..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
6#include <linux/slab.h>
6 7
7struct audit_tree; 8struct audit_tree;
8struct audit_chunk; 9struct audit_chunk;
@@ -277,7 +278,7 @@ static void untag_chunk(struct node *p)
277 owner->root = NULL; 278 owner->root = NULL;
278 } 279 }
279 280
280 for (i = j = 0; i < size; i++, j++) { 281 for (i = j = 0; j <= size; i++, j++) {
281 struct audit_tree *s; 282 struct audit_tree *s;
282 if (&chunk->owners[j] == p) { 283 if (&chunk->owners[j] == p) {
283 list_del_init(&p->list); 284 list_del_init(&p->list);
@@ -290,7 +291,7 @@ static void untag_chunk(struct node *p)
290 if (!s) /* result of earlier fallback */ 291 if (!s) /* result of earlier fallback */
291 continue; 292 continue;
292 get_tree(s); 293 get_tree(s);
293 list_replace_init(&chunk->owners[i].list, &new->owners[j].list); 294 list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
294 } 295 }
295 296
296 list_replace_rcu(&chunk->hash, &new->hash); 297 list_replace_rcu(&chunk->hash, &new->hash);
@@ -373,15 +374,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
373 for (n = 0; n < old->count; n++) { 374 for (n = 0; n < old->count; n++) {
374 if (old->owners[n].owner == tree) { 375 if (old->owners[n].owner == tree) {
375 spin_unlock(&hash_lock); 376 spin_unlock(&hash_lock);
376 put_inotify_watch(watch); 377 put_inotify_watch(&old->watch);
377 return 0; 378 return 0;
378 } 379 }
379 } 380 }
380 spin_unlock(&hash_lock); 381 spin_unlock(&hash_lock);
381 382
382 chunk = alloc_chunk(old->count + 1); 383 chunk = alloc_chunk(old->count + 1);
383 if (!chunk) 384 if (!chunk) {
385 put_inotify_watch(&old->watch);
384 return -ENOMEM; 386 return -ENOMEM;
387 }
385 388
386 mutex_lock(&inode->inotify_mutex); 389 mutex_lock(&inode->inotify_mutex);
387 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 390 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
@@ -425,7 +428,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
425 spin_unlock(&hash_lock); 428 spin_unlock(&hash_lock);
426 inotify_evict_watch(&old->watch); 429 inotify_evict_watch(&old->watch);
427 mutex_unlock(&inode->inotify_mutex); 430 mutex_unlock(&inode->inotify_mutex);
428 put_inotify_watch(&old->watch); 431 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
432 put_inotify_watch(&old->watch); /* and kill it */
429 return 0; 433 return 0;
430} 434}
431 435
@@ -545,6 +549,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
545 return 0; 549 return 0;
546} 550}
547 551
552static int compare_root(struct vfsmount *mnt, void *arg)
553{
554 return mnt->mnt_root->d_inode == arg;
555}
556
548void audit_trim_trees(void) 557void audit_trim_trees(void)
549{ 558{
550 struct list_head cursor; 559 struct list_head cursor;
@@ -556,7 +565,6 @@ void audit_trim_trees(void)
556 struct path path; 565 struct path path;
557 struct vfsmount *root_mnt; 566 struct vfsmount *root_mnt;
558 struct node *node; 567 struct node *node;
559 struct list_head list;
560 int err; 568 int err;
561 569
562 tree = container_of(cursor.next, struct audit_tree, list); 570 tree = container_of(cursor.next, struct audit_tree, list);
@@ -574,24 +582,16 @@ void audit_trim_trees(void)
574 if (!root_mnt) 582 if (!root_mnt)
575 goto skip_it; 583 goto skip_it;
576 584
577 list_add_tail(&list, &root_mnt->mnt_list);
578 spin_lock(&hash_lock); 585 spin_lock(&hash_lock);
579 list_for_each_entry(node, &tree->chunks, list) { 586 list_for_each_entry(node, &tree->chunks, list) {
580 struct audit_chunk *chunk = find_chunk(node); 587 struct inode *inode = find_chunk(node)->watch.inode;
581 struct inode *inode = chunk->watch.inode;
582 struct vfsmount *mnt;
583 node->index |= 1U<<31; 588 node->index |= 1U<<31;
584 list_for_each_entry(mnt, &list, mnt_list) { 589 if (iterate_mounts(compare_root, inode, root_mnt))
585 if (mnt->mnt_root->d_inode == inode) { 590 node->index &= ~(1U<<31);
586 node->index &= ~(1U<<31);
587 break;
588 }
589 }
590 } 591 }
591 spin_unlock(&hash_lock); 592 spin_unlock(&hash_lock);
592 trim_marked(tree); 593 trim_marked(tree);
593 put_tree(tree); 594 put_tree(tree);
594 list_del_init(&list);
595 drop_collected_mounts(root_mnt); 595 drop_collected_mounts(root_mnt);
596skip_it: 596skip_it:
597 mutex_lock(&audit_filter_mutex); 597 mutex_lock(&audit_filter_mutex);
@@ -600,22 +600,6 @@ skip_it:
600 mutex_unlock(&audit_filter_mutex); 600 mutex_unlock(&audit_filter_mutex);
601} 601}
602 602
603static int is_under(struct vfsmount *mnt, struct dentry *dentry,
604 struct path *path)
605{
606 if (mnt != path->mnt) {
607 for (;;) {
608 if (mnt->mnt_parent == mnt)
609 return 0;
610 if (mnt->mnt_parent == path->mnt)
611 break;
612 mnt = mnt->mnt_parent;
613 }
614 dentry = mnt->mnt_mountpoint;
615 }
616 return is_subdir(dentry, path->dentry);
617}
618
619int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 603int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
620{ 604{
621 605
@@ -635,13 +619,17 @@ void audit_put_tree(struct audit_tree *tree)
635 put_tree(tree); 619 put_tree(tree);
636} 620}
637 621
622static int tag_mount(struct vfsmount *mnt, void *arg)
623{
624 return tag_chunk(mnt->mnt_root->d_inode, arg);
625}
626
638/* called with audit_filter_mutex */ 627/* called with audit_filter_mutex */
639int audit_add_tree_rule(struct audit_krule *rule) 628int audit_add_tree_rule(struct audit_krule *rule)
640{ 629{
641 struct audit_tree *seed = rule->tree, *tree; 630 struct audit_tree *seed = rule->tree, *tree;
642 struct path path; 631 struct path path;
643 struct vfsmount *mnt, *p; 632 struct vfsmount *mnt;
644 struct list_head list;
645 int err; 633 int err;
646 634
647 list_for_each_entry(tree, &tree_list, list) { 635 list_for_each_entry(tree, &tree_list, list) {
@@ -667,16 +655,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
667 err = -ENOMEM; 655 err = -ENOMEM;
668 goto Err; 656 goto Err;
669 } 657 }
670 list_add_tail(&list, &mnt->mnt_list);
671 658
672 get_tree(tree); 659 get_tree(tree);
673 list_for_each_entry(p, &list, mnt_list) { 660 err = iterate_mounts(tag_mount, tree, mnt);
674 err = tag_chunk(p->mnt_root->d_inode, tree);
675 if (err)
676 break;
677 }
678
679 list_del(&list);
680 drop_collected_mounts(mnt); 661 drop_collected_mounts(mnt);
681 662
682 if (!err) { 663 if (!err) {
@@ -711,31 +692,23 @@ int audit_tag_tree(char *old, char *new)
711{ 692{
712 struct list_head cursor, barrier; 693 struct list_head cursor, barrier;
713 int failed = 0; 694 int failed = 0;
714 struct path path; 695 struct path path1, path2;
715 struct vfsmount *tagged; 696 struct vfsmount *tagged;
716 struct list_head list;
717 struct vfsmount *mnt;
718 struct dentry *dentry;
719 int err; 697 int err;
720 698
721 err = kern_path(new, 0, &path); 699 err = kern_path(new, 0, &path2);
722 if (err) 700 if (err)
723 return err; 701 return err;
724 tagged = collect_mounts(&path); 702 tagged = collect_mounts(&path2);
725 path_put(&path); 703 path_put(&path2);
726 if (!tagged) 704 if (!tagged)
727 return -ENOMEM; 705 return -ENOMEM;
728 706
729 err = kern_path(old, 0, &path); 707 err = kern_path(old, 0, &path1);
730 if (err) { 708 if (err) {
731 drop_collected_mounts(tagged); 709 drop_collected_mounts(tagged);
732 return err; 710 return err;
733 } 711 }
734 mnt = mntget(path.mnt);
735 dentry = dget(path.dentry);
736 path_put(&path);
737
738 list_add_tail(&list, &tagged->mnt_list);
739 712
740 mutex_lock(&audit_filter_mutex); 713 mutex_lock(&audit_filter_mutex);
741 list_add(&barrier, &tree_list); 714 list_add(&barrier, &tree_list);
@@ -743,7 +716,7 @@ int audit_tag_tree(char *old, char *new)
743 716
744 while (cursor.next != &tree_list) { 717 while (cursor.next != &tree_list) {
745 struct audit_tree *tree; 718 struct audit_tree *tree;
746 struct vfsmount *p; 719 int good_one = 0;
747 720
748 tree = container_of(cursor.next, struct audit_tree, list); 721 tree = container_of(cursor.next, struct audit_tree, list);
749 get_tree(tree); 722 get_tree(tree);
@@ -751,30 +724,19 @@ int audit_tag_tree(char *old, char *new)
751 list_add(&cursor, &tree->list); 724 list_add(&cursor, &tree->list);
752 mutex_unlock(&audit_filter_mutex); 725 mutex_unlock(&audit_filter_mutex);
753 726
754 err = kern_path(tree->pathname, 0, &path); 727 err = kern_path(tree->pathname, 0, &path2);
755 if (err) { 728 if (!err) {
756 put_tree(tree); 729 good_one = path_is_under(&path1, &path2);
757 mutex_lock(&audit_filter_mutex); 730 path_put(&path2);
758 continue;
759 } 731 }
760 732
761 spin_lock(&vfsmount_lock); 733 if (!good_one) {
762 if (!is_under(mnt, dentry, &path)) {
763 spin_unlock(&vfsmount_lock);
764 path_put(&path);
765 put_tree(tree); 734 put_tree(tree);
766 mutex_lock(&audit_filter_mutex); 735 mutex_lock(&audit_filter_mutex);
767 continue; 736 continue;
768 } 737 }
769 spin_unlock(&vfsmount_lock);
770 path_put(&path);
771
772 list_for_each_entry(p, &list, mnt_list) {
773 failed = tag_chunk(p->mnt_root->d_inode, tree);
774 if (failed)
775 break;
776 }
777 738
739 failed = iterate_mounts(tag_mount, tree, tagged);
778 if (failed) { 740 if (failed) {
779 put_tree(tree); 741 put_tree(tree);
780 mutex_lock(&audit_filter_mutex); 742 mutex_lock(&audit_filter_mutex);
@@ -815,10 +777,8 @@ int audit_tag_tree(char *old, char *new)
815 } 777 }
816 list_del(&barrier); 778 list_del(&barrier);
817 list_del(&cursor); 779 list_del(&cursor);
818 list_del(&list);
819 mutex_unlock(&audit_filter_mutex); 780 mutex_unlock(&audit_filter_mutex);
820 dput(dentry); 781 path_put(&path1);
821 mntput(mnt);
822 drop_collected_mounts(tagged); 782 drop_collected_mounts(tagged);
823 return failed; 783 return failed;
824} 784}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/inotify.h> 31#include <linux/inotify.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include "audit.h" 33#include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/security.h> 31#include <linux/security.h>
31#include "audit.h" 32#include "audit.h"
32 33
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 267e484f0198..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/module.h>
52#include <linux/slab.h>
52#include <linux/mount.h> 53#include <linux/mount.h>
53#include <linux/socket.h> 54#include <linux/socket.h>
54#include <linux/mqueue.h> 55#include <linux/mqueue.h>
@@ -250,7 +251,6 @@ struct audit_context {
250#endif 251#endif
251}; 252};
252 253
253#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
254static inline int open_arg(int flags, int mask) 254static inline int open_arg(int flags, int mask)
255{ 255{
256 int n = ACC_MODE(flags); 256 int n = ACC_MODE(flags);
@@ -1894,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
1894{ 1894{
1895 if (context->name_count >= AUDIT_NAMES) { 1895 if (context->name_count >= AUDIT_NAMES) {
1896 if (inode) 1896 if (inode)
1897 printk(KERN_DEBUG "name_count maxed, losing inode data: " 1897 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1898 "dev=%02x:%02x, inode=%lu\n", 1898 "dev=%02x:%02x, inode=%lu\n",
1899 MAJOR(inode->i_sb->s_dev), 1899 MAJOR(inode->i_sb->s_dev),
1900 MINOR(inode->i_sb->s_dev), 1900 MINOR(inode->i_sb->s_dev),
@@ -1989,7 +1989,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1989 1989
1990/** 1990/**
1991 * audit_inode_child - collect inode info for created/removed objects 1991 * audit_inode_child - collect inode info for created/removed objects
1992 * @dname: inode's dentry name
1993 * @dentry: dentry being audited 1992 * @dentry: dentry being audited
1994 * @parent: inode of dentry parent 1993 * @parent: inode of dentry parent
1995 * 1994 *
@@ -2001,13 +2000,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2001 * must be hooked prior, in order to capture the target inode during 2000 * must be hooked prior, in order to capture the target inode during
2002 * unsuccessful attempts. 2001 * unsuccessful attempts.
2003 */ 2002 */
2004void __audit_inode_child(const char *dname, const struct dentry *dentry, 2003void __audit_inode_child(const struct dentry *dentry,
2005 const struct inode *parent) 2004 const struct inode *parent)
2006{ 2005{
2007 int idx; 2006 int idx;
2008 struct audit_context *context = current->audit_context; 2007 struct audit_context *context = current->audit_context;
2009 const char *found_parent = NULL, *found_child = NULL; 2008 const char *found_parent = NULL, *found_child = NULL;
2010 const struct inode *inode = dentry->d_inode; 2009 const struct inode *inode = dentry->d_inode;
2010 const char *dname = dentry->d_name.name;
2011 int dirlen = 0; 2011 int dirlen = 0;
2012 2012
2013 if (!context->in_syscall) 2013 if (!context->in_syscall)
@@ -2015,9 +2015,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2015 2015
2016 if (inode) 2016 if (inode)
2017 handle_one(inode); 2017 handle_one(inode);
2018 /* determine matching parent */
2019 if (!dname)
2020 goto add_names;
2021 2018
2022 /* parent is more likely, look for it first */ 2019 /* parent is more likely, look for it first */
2023 for (idx = 0; idx < context->name_count; idx++) { 2020 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 3c5301381837..98a51f26c136 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -12,7 +12,7 @@
12 12
13void foo(void) 13void foo(void)
14{ 14{
15 /* The enum constants to put into include/linux/bounds.h */ 15 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
18 /* End of constants */ 18 /* End of constants */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f5..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
29EXPORT_SYMBOL(__cap_full_set); 29EXPORT_SYMBOL(__cap_full_set);
30EXPORT_SYMBOL(__cap_init_eff_set); 30EXPORT_SYMBOL(__cap_init_eff_set);
31 31
32#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
33int file_caps_enabled = 1; 32int file_caps_enabled = 1;
34 33
35static int __init file_caps_disable(char *str) 34static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
38 return 1; 37 return 1;
39} 38}
40__setup("no_file_caps", file_caps_disable); 39__setup("no_file_caps", file_caps_disable);
41#endif
42 40
43/* 41/*
44 * More recent versions of libcap are available from: 42 * More recent versions of libcap are available from:
@@ -137,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
137 if (pid && (pid != task_pid_vnr(current))) { 135 if (pid && (pid != task_pid_vnr(current))) {
138 struct task_struct *target; 136 struct task_struct *target;
139 137
140 read_lock(&tasklist_lock); 138 rcu_read_lock();
141 139
142 target = find_task_by_vpid(pid); 140 target = find_task_by_vpid(pid);
143 if (!target) 141 if (!target)
@@ -145,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
145 else 143 else
146 ret = security_capget(target, pEp, pIp, pPp); 144 ret = security_capget(target, pEp, pIp, pPp);
147 145
148 read_unlock(&tasklist_lock); 146 rcu_read_unlock();
149 } else 147 } else
150 ret = security_capget(current, pEp, pIp, pPp); 148 ret = security_capget(current, pEp, pIp, pPp);
151 149
@@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
169 kernel_cap_t pE, pI, pP; 167 kernel_cap_t pE, pI, pP;
170 168
171 ret = cap_validate_magic(header, &tocopy); 169 ret = cap_validate_magic(header, &tocopy);
172 if (ret != 0) 170 if ((dataptr == NULL) || (ret != 0))
173 return ret; 171 return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
174 172
175 if (get_user(pid, &header->pid)) 173 if (get_user(pid, &header->pid))
176 return -EFAULT; 174 return -EFAULT;
@@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
238SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) 236SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
239{ 237{
240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 238 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
241 unsigned i, tocopy; 239 unsigned i, tocopy, copybytes;
242 kernel_cap_t inheritable, permitted, effective; 240 kernel_cap_t inheritable, permitted, effective;
243 struct cred *new; 241 struct cred *new;
244 int ret; 242 int ret;
@@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
255 if (pid != 0 && pid != task_pid_vnr(current)) 253 if (pid != 0 && pid != task_pid_vnr(current))
256 return -EPERM; 254 return -EPERM;
257 255
258 if (copy_from_user(&kdata, data, 256 copybytes = tocopy * sizeof(struct __user_cap_data_struct);
259 tocopy * sizeof(struct __user_cap_data_struct))) 257 if (copybytes > sizeof(kdata))
258 return -EFAULT;
259
260 if (copy_from_user(&kdata, data, copybytes))
260 return -EFAULT; 261 return -EFAULT;
261 262
262 for (i = 0; i < tocopy; i++) { 263 for (i = 0; i < tocopy; i++) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0249f4be9b5c..6d870f2d1228 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -43,6 +47,7 @@
43#include <linux/string.h> 47#include <linux/string.h>
44#include <linux/sort.h> 48#include <linux/sort.h>
45#include <linux/kmod.h> 49#include <linux/kmod.h>
50#include <linux/module.h>
46#include <linux/delayacct.h> 51#include <linux/delayacct.h>
47#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
48#include <linux/hash.h> 53#include <linux/hash.h>
@@ -51,15 +56,21 @@
51#include <linux/pid_namespace.h> 56#include <linux/pid_namespace.h>
52#include <linux/idr.h> 57#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
54 61
55#include <asm/atomic.h> 62#include <asm/atomic.h>
56 63
57static DEFINE_MUTEX(cgroup_mutex); 64static DEFINE_MUTEX(cgroup_mutex);
58 65
59/* Generate an array of cgroup subsystem pointers */ 66/*
67 * Generate an array of cgroup subsystem pointers. At boot time, this is
68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
69 * registered after that. The mutable section of this array is protected by
70 * cgroup_mutex.
71 */
60#define SUBSYS(_x) &_x ## _subsys, 72#define SUBSYS(_x) &_x ## _subsys,
61 73static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
62static struct cgroup_subsys *subsys[] = {
63#include <linux/cgroup_subsys.h> 74#include <linux/cgroup_subsys.h>
64}; 75};
65 76
@@ -146,6 +157,35 @@ struct css_id {
146 unsigned short stack[0]; /* Array of Length (depth+1) */ 157 unsigned short stack[0]; /* Array of Length (depth+1) */
147}; 158};
148 159
160/*
161 * cgroup_event represents events which userspace wants to receive.
162 */
163struct cgroup_event {
164 /*
165 * Cgroup which the event belongs to.
166 */
167 struct cgroup *cgrp;
168 /*
169 * Control file with which the event is associated.
170 */
171 struct cftype *cft;
172 /*
173 * eventfd to signal userspace about the event.
174 */
175 struct eventfd_ctx *eventfd;
176 /*
177 * Each of these is stored in a list by the cgroup.
178 */
179 struct list_head list;
180 /*
181 * All fields below are needed to unregister the event when
182 * userspace closes eventfd.
183 */
184 poll_table pt;
185 wait_queue_head_t *wqh;
186 wait_queue_t wait;
187 struct work_struct remove;
188};
149 189
150/* The list of hierarchy roots */ 190/* The list of hierarchy roots */
151 191
@@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 206 */
167static int need_forkexit_callback __read_mostly; 207static int need_forkexit_callback __read_mostly;
168 208
209#ifdef CONFIG_PROVE_LOCKING
210int cgroup_lock_is_held(void)
211{
212 return lockdep_is_held(&cgroup_mutex);
213}
214#else /* #ifdef CONFIG_PROVE_LOCKING */
215int cgroup_lock_is_held(void)
216{
217 return mutex_is_locked(&cgroup_mutex);
218}
219#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
220
221EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
222
169/* convenient tests for these bits */ 223/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 224inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 225{
@@ -235,7 +289,8 @@ struct cg_cgroup_link {
235static struct css_set init_css_set; 289static struct css_set init_css_set;
236static struct cg_cgroup_link init_css_set_link; 290static struct cg_cgroup_link init_css_set_link;
237 291
238static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 292static int cgroup_init_idr(struct cgroup_subsys *ss,
293 struct cgroup_subsys_state *css);
239 294
240/* css_set_lock protects the list of css_set objects, and the 295/* css_set_lock protects the list of css_set objects, and the
241 * chain of tasks off each css_set. Nests outside task->alloc_lock 296 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set(
433 struct hlist_node *node; 488 struct hlist_node *node;
434 struct css_set *cg; 489 struct css_set *cg;
435 490
436 /* Built the set of subsystem state objects that we want to 491 /*
437 * see in the new css_set */ 492 * Build the set of subsystem state objects that we want to see in the
493 * new css_set. While subsystems can change globally, the entries here
494 * won't change, so no need for locking.
495 */
438 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 496 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
439 if (root->subsys_bits & (1UL << i)) { 497 if (root->subsys_bits & (1UL << i)) {
440 /* Subsystem is in this hierarchy. So we want 498 /* Subsystem is in this hierarchy. So we want
@@ -681,6 +739,7 @@ void cgroup_lock(void)
681{ 739{
682 mutex_lock(&cgroup_mutex); 740 mutex_lock(&cgroup_mutex);
683} 741}
742EXPORT_SYMBOL_GPL(cgroup_lock);
684 743
685/** 744/**
686 * cgroup_unlock - release lock on cgroup changes 745 * cgroup_unlock - release lock on cgroup changes
@@ -691,6 +750,7 @@ void cgroup_unlock(void)
691{ 750{
692 mutex_unlock(&cgroup_mutex); 751 mutex_unlock(&cgroup_mutex);
693} 752}
753EXPORT_SYMBOL_GPL(cgroup_unlock);
694 754
695/* 755/*
696 * A couple of forward declarations required, due to cyclic reference loop: 756 * A couple of forward declarations required, due to cyclic reference loop:
@@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
742 if (ret) 802 if (ret)
743 break; 803 break;
744 } 804 }
805
745 return ret; 806 return ret;
746} 807}
747 808
@@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
869 css_put(css); 930 css_put(css);
870} 931}
871 932
872 933/*
934 * Call with cgroup_mutex held. Drops reference counts on modules, including
935 * any duplicate ones that parse_cgroupfs_options took. If this function
936 * returns an error, no reference counts are touched.
937 */
873static int rebind_subsystems(struct cgroupfs_root *root, 938static int rebind_subsystems(struct cgroupfs_root *root,
874 unsigned long final_bits) 939 unsigned long final_bits)
875{ 940{
@@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
877 struct cgroup *cgrp = &root->top_cgroup; 942 struct cgroup *cgrp = &root->top_cgroup;
878 int i; 943 int i;
879 944
945 BUG_ON(!mutex_is_locked(&cgroup_mutex));
946
880 removed_bits = root->actual_subsys_bits & ~final_bits; 947 removed_bits = root->actual_subsys_bits & ~final_bits;
881 added_bits = final_bits & ~root->actual_subsys_bits; 948 added_bits = final_bits & ~root->actual_subsys_bits;
882 /* Check that any added subsystems are currently free */ 949 /* Check that any added subsystems are currently free */
@@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
885 struct cgroup_subsys *ss = subsys[i]; 952 struct cgroup_subsys *ss = subsys[i];
886 if (!(bit & added_bits)) 953 if (!(bit & added_bits))
887 continue; 954 continue;
955 /*
956 * Nobody should tell us to do a subsys that doesn't exist:
957 * parse_cgroupfs_options should catch that case and refcounts
958 * ensure that subsystems won't disappear once selected.
959 */
960 BUG_ON(ss == NULL);
888 if (ss->root != &rootnode) { 961 if (ss->root != &rootnode) {
889 /* Subsystem isn't free */ 962 /* Subsystem isn't free */
890 return -EBUSY; 963 return -EBUSY;
@@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
904 unsigned long bit = 1UL << i; 977 unsigned long bit = 1UL << i;
905 if (bit & added_bits) { 978 if (bit & added_bits) {
906 /* We're binding this subsystem to this hierarchy */ 979 /* We're binding this subsystem to this hierarchy */
980 BUG_ON(ss == NULL);
907 BUG_ON(cgrp->subsys[i]); 981 BUG_ON(cgrp->subsys[i]);
908 BUG_ON(!dummytop->subsys[i]); 982 BUG_ON(!dummytop->subsys[i]);
909 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 983 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
915 if (ss->bind) 989 if (ss->bind)
916 ss->bind(ss, cgrp); 990 ss->bind(ss, cgrp);
917 mutex_unlock(&ss->hierarchy_mutex); 991 mutex_unlock(&ss->hierarchy_mutex);
992 /* refcount was already taken, and we're keeping it */
918 } else if (bit & removed_bits) { 993 } else if (bit & removed_bits) {
919 /* We're removing this subsystem */ 994 /* We're removing this subsystem */
995 BUG_ON(ss == NULL);
920 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 996 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
921 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 997 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
922 mutex_lock(&ss->hierarchy_mutex); 998 mutex_lock(&ss->hierarchy_mutex);
@@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
927 subsys[i]->root = &rootnode; 1003 subsys[i]->root = &rootnode;
928 list_move(&ss->sibling, &rootnode.subsys_list); 1004 list_move(&ss->sibling, &rootnode.subsys_list);
929 mutex_unlock(&ss->hierarchy_mutex); 1005 mutex_unlock(&ss->hierarchy_mutex);
1006 /* subsystem is now free - drop reference on module */
1007 module_put(ss->module);
930 } else if (bit & final_bits) { 1008 } else if (bit & final_bits) {
931 /* Subsystem state should already exist */ 1009 /* Subsystem state should already exist */
1010 BUG_ON(ss == NULL);
932 BUG_ON(!cgrp->subsys[i]); 1011 BUG_ON(!cgrp->subsys[i]);
1012 /*
1013 * a refcount was taken, but we already had one, so
1014 * drop the extra reference.
1015 */
1016 module_put(ss->module);
1017#ifdef CONFIG_MODULE_UNLOAD
1018 BUG_ON(ss->module && !module_refcount(ss->module));
1019#endif
933 } else { 1020 } else {
934 /* Subsystem state shouldn't exist */ 1021 /* Subsystem state shouldn't exist */
935 BUG_ON(cgrp->subsys[i]); 1022 BUG_ON(cgrp->subsys[i]);
@@ -971,13 +1058,20 @@ struct cgroup_sb_opts {
971 1058
972}; 1059};
973 1060
974/* Convert a hierarchy specifier into a bitmask of subsystems and 1061/*
975 * flags. */ 1062 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
976static int parse_cgroupfs_options(char *data, 1063 * with cgroup_mutex held to protect the subsys[] array. This function takes
977 struct cgroup_sb_opts *opts) 1064 * refcounts on subsystems to be used, unless it returns error, in which case
1065 * no refcounts are taken.
1066 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
978{ 1068{
979 char *token, *o = data ?: "all"; 1069 char *token, *o = data ?: "all";
980 unsigned long mask = (unsigned long)-1; 1070 unsigned long mask = (unsigned long)-1;
1071 int i;
1072 bool module_pin_failed = false;
1073
1074 BUG_ON(!mutex_is_locked(&cgroup_mutex));
981 1075
982#ifdef CONFIG_CPUSETS 1076#ifdef CONFIG_CPUSETS
983 mask = ~(1UL << cpuset_subsys_id); 1077 mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
990 return -EINVAL; 1084 return -EINVAL;
991 if (!strcmp(token, "all")) { 1085 if (!strcmp(token, "all")) {
992 /* Add all non-disabled subsystems */ 1086 /* Add all non-disabled subsystems */
993 int i;
994 opts->subsys_bits = 0; 1087 opts->subsys_bits = 0;
995 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
996 struct cgroup_subsys *ss = subsys[i]; 1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
997 if (!ss->disabled) 1092 if (!ss->disabled)
998 opts->subsys_bits |= 1ul << i; 1093 opts->subsys_bits |= 1ul << i;
999 } 1094 }
@@ -1011,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
1011 if (!opts->release_agent) 1106 if (!opts->release_agent)
1012 return -ENOMEM; 1107 return -ENOMEM;
1013 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
1014 int i;
1015 const char *name = token + 5; 1109 const char *name = token + 5;
1016 /* Can't specify an empty name */ 1110 /* Can't specify an empty name */
1017 if (!strlen(name)) 1111 if (!strlen(name))
@@ -1035,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
1035 return -ENOMEM; 1129 return -ENOMEM;
1036 } else { 1130 } else {
1037 struct cgroup_subsys *ss; 1131 struct cgroup_subsys *ss;
1038 int i;
1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1040 ss = subsys[i]; 1133 ss = subsys[i];
1134 if (ss == NULL)
1135 continue;
1041 if (!strcmp(token, ss->name)) { 1136 if (!strcmp(token, ss->name)) {
1042 if (!ss->disabled) 1137 if (!ss->disabled)
1043 set_bit(i, &opts->subsys_bits); 1138 set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
1072 if (!opts->subsys_bits && !opts->name) 1167 if (!opts->subsys_bits && !opts->name)
1073 return -EINVAL; 1168 return -EINVAL;
1074 1169
1170 /*
1171 * Grab references on all the modules we'll need, so the subsystems
1172 * don't dance around before rebind_subsystems attaches them. This may
1173 * take duplicate reference counts on a subsystem that's already used,
1174 * but rebind_subsystems handles this case.
1175 */
1176 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1177 unsigned long bit = 1UL << i;
1178
1179 if (!(bit & opts->subsys_bits))
1180 continue;
1181 if (!try_module_get(subsys[i]->module)) {
1182 module_pin_failed = true;
1183 break;
1184 }
1185 }
1186 if (module_pin_failed) {
1187 /*
1188 * Oops, one of the modules was going away. This means that we
1189 * raced with a delete_module call, and to the user this is
1190 * essentially a "subsystem doesn't exist" case.
1191 */
1192 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1193 /* drop refcounts only on the ones we took */
1194 unsigned long bit = 1UL << i;
1195
1196 if (!(bit & opts->subsys_bits))
1197 continue;
1198 module_put(subsys[i]->module);
1199 }
1200 return -ENOENT;
1201 }
1202
1075 return 0; 1203 return 0;
1076} 1204}
1077 1205
1206static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1207{
1208 int i;
1209 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1210 unsigned long bit = 1UL << i;
1211
1212 if (!(bit & subsys_bits))
1213 continue;
1214 module_put(subsys[i]->module);
1215 }
1216}
1217
1078static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1218static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1079{ 1219{
1080 int ret = 0; 1220 int ret = 0;
@@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1091 if (ret) 1231 if (ret)
1092 goto out_unlock; 1232 goto out_unlock;
1093 1233
1094 /* Don't allow flags to change at remount */ 1234 /* Don't allow flags or name to change at remount */
1095 if (opts.flags != root->flags) { 1235 if (opts.flags != root->flags ||
1096 ret = -EINVAL; 1236 (opts.name && strcmp(opts.name, root->name))) {
1097 goto out_unlock;
1098 }
1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL; 1237 ret = -EINVAL;
1238 drop_parsed_module_refcounts(opts.subsys_bits);
1103 goto out_unlock; 1239 goto out_unlock;
1104 } 1240 }
1105 1241
1106 ret = rebind_subsystems(root, opts.subsys_bits); 1242 ret = rebind_subsystems(root, opts.subsys_bits);
1107 if (ret) 1243 if (ret) {
1244 drop_parsed_module_refcounts(opts.subsys_bits);
1108 goto out_unlock; 1245 goto out_unlock;
1246 }
1109 1247
1110 /* (re)populate subsystem files */ 1248 /* (re)populate subsystem files */
1111 cgroup_populate_dir(cgrp); 1249 cgroup_populate_dir(cgrp);
@@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1136 INIT_LIST_HEAD(&cgrp->release_list); 1274 INIT_LIST_HEAD(&cgrp->release_list);
1137 INIT_LIST_HEAD(&cgrp->pidlists); 1275 INIT_LIST_HEAD(&cgrp->pidlists);
1138 mutex_init(&cgrp->pidlist_mutex); 1276 mutex_init(&cgrp->pidlist_mutex);
1277 INIT_LIST_HEAD(&cgrp->event_list);
1278 spin_lock_init(&cgrp->event_list_lock);
1139} 1279}
1140 1280
1141static void init_cgroup_root(struct cgroupfs_root *root) 1281static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1291 struct cgroupfs_root *new_root; 1431 struct cgroupfs_root *new_root;
1292 1432
1293 /* First find the desired set of subsystems */ 1433 /* First find the desired set of subsystems */
1434 mutex_lock(&cgroup_mutex);
1294 ret = parse_cgroupfs_options(data, &opts); 1435 ret = parse_cgroupfs_options(data, &opts);
1436 mutex_unlock(&cgroup_mutex);
1295 if (ret) 1437 if (ret)
1296 goto out_err; 1438 goto out_err;
1297 1439
@@ -1302,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1302 new_root = cgroup_root_from_opts(&opts); 1444 new_root = cgroup_root_from_opts(&opts);
1303 if (IS_ERR(new_root)) { 1445 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root); 1446 ret = PTR_ERR(new_root);
1305 goto out_err; 1447 goto drop_modules;
1306 } 1448 }
1307 opts.new_root = new_root; 1449 opts.new_root = new_root;
1308 1450
@@ -1311,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1311 if (IS_ERR(sb)) { 1453 if (IS_ERR(sb)) {
1312 ret = PTR_ERR(sb); 1454 ret = PTR_ERR(sb);
1313 cgroup_drop_root(opts.new_root); 1455 cgroup_drop_root(opts.new_root);
1314 goto out_err; 1456 goto drop_modules;
1315 } 1457 }
1316 1458
1317 root = sb->s_fs_info; 1459 root = sb->s_fs_info;
@@ -1367,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1367 free_cg_links(&tmp_cg_links); 1509 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super; 1510 goto drop_new_super;
1369 } 1511 }
1512 /*
1513 * There must be no failure case after here, since rebinding
1514 * takes care of subsystems' refcounts, which are explicitly
1515 * dropped in the failure exit path.
1516 */
1370 1517
1371 /* EBUSY should be the only error here */ 1518 /* EBUSY should be the only error here */
1372 BUG_ON(ret); 1519 BUG_ON(ret);
@@ -1405,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1405 * any) is not needed 1552 * any) is not needed
1406 */ 1553 */
1407 cgroup_drop_root(opts.new_root); 1554 cgroup_drop_root(opts.new_root);
1555 /* no subsys rebinding, so refcounts don't change */
1556 drop_parsed_module_refcounts(opts.subsys_bits);
1408 } 1557 }
1409 1558
1410 simple_set_mnt(mnt, sb); 1559 simple_set_mnt(mnt, sb);
@@ -1414,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1414 1563
1415 drop_new_super: 1564 drop_new_super:
1416 deactivate_locked_super(sb); 1565 deactivate_locked_super(sb);
1566 drop_modules:
1567 drop_parsed_module_refcounts(opts.subsys_bits);
1417 out_err: 1568 out_err:
1418 kfree(opts.release_agent); 1569 kfree(opts.release_agent);
1419 kfree(opts.name); 1570 kfree(opts.name);
@@ -1495,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1495int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1496{ 1647{
1497 char *start; 1648 char *start;
1498 struct dentry *dentry = rcu_dereference(cgrp->dentry); 1649 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1650 rcu_read_lock_held() ||
1651 cgroup_lock_is_held());
1499 1652
1500 if (!dentry || cgrp == dummytop) { 1653 if (!dentry || cgrp == dummytop) {
1501 /* 1654 /*
@@ -1511,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1511 *--start = '\0'; 1664 *--start = '\0';
1512 for (;;) { 1665 for (;;) {
1513 int len = dentry->d_name.len; 1666 int len = dentry->d_name.len;
1667
1514 if ((start -= len) < buf) 1668 if ((start -= len) < buf)
1515 return -ENAMETOOLONG; 1669 return -ENAMETOOLONG;
1516 memcpy(start, cgrp->dentry->d_name.name, len); 1670 memcpy(start, dentry->d_name.name, len);
1517 cgrp = cgrp->parent; 1671 cgrp = cgrp->parent;
1518 if (!cgrp) 1672 if (!cgrp)
1519 break; 1673 break;
1520 dentry = rcu_dereference(cgrp->dentry); 1674
1675 dentry = rcu_dereference_check(cgrp->dentry,
1676 rcu_read_lock_held() ||
1677 cgroup_lock_is_held());
1521 if (!cgrp->parent) 1678 if (!cgrp->parent)
1522 continue; 1679 continue;
1523 if (--start < buf) 1680 if (--start < buf)
@@ -1527,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1527 memmove(buf, start, buf + buflen - start); 1684 memmove(buf, start, buf + buflen - start);
1528 return 0; 1685 return 0;
1529} 1686}
1687EXPORT_SYMBOL_GPL(cgroup_path);
1530 1688
1531/** 1689/**
1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1690 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1539int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1697int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1540{ 1698{
1541 int retval = 0; 1699 int retval = 0;
1542 struct cgroup_subsys *ss; 1700 struct cgroup_subsys *ss, *failed_ss = NULL;
1543 struct cgroup *oldcgrp; 1701 struct cgroup *oldcgrp;
1544 struct css_set *cg; 1702 struct css_set *cg;
1545 struct css_set *newcg; 1703 struct css_set *newcg;
@@ -1553,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1553 for_each_subsys(root, ss) { 1711 for_each_subsys(root, ss) {
1554 if (ss->can_attach) { 1712 if (ss->can_attach) {
1555 retval = ss->can_attach(ss, cgrp, tsk, false); 1713 retval = ss->can_attach(ss, cgrp, tsk, false);
1556 if (retval) 1714 if (retval) {
1557 return retval; 1715 /*
1716 * Remember on which subsystem the can_attach()
1717 * failed, so that we only call cancel_attach()
1718 * against the subsystems whose can_attach()
1719 * succeeded. (See below)
1720 */
1721 failed_ss = ss;
1722 goto out;
1723 }
1558 } 1724 }
1559 } 1725 }
1560 1726
@@ -1568,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 */ 1734 */
1569 newcg = find_css_set(cg, cgrp); 1735 newcg = find_css_set(cg, cgrp);
1570 put_css_set(cg); 1736 put_css_set(cg);
1571 if (!newcg) 1737 if (!newcg) {
1572 return -ENOMEM; 1738 retval = -ENOMEM;
1739 goto out;
1740 }
1573 1741
1574 task_lock(tsk); 1742 task_lock(tsk);
1575 if (tsk->flags & PF_EXITING) { 1743 if (tsk->flags & PF_EXITING) {
1576 task_unlock(tsk); 1744 task_unlock(tsk);
1577 put_css_set(newcg); 1745 put_css_set(newcg);
1578 return -ESRCH; 1746 retval = -ESRCH;
1747 goto out;
1579 } 1748 }
1580 rcu_assign_pointer(tsk->cgroups, newcg); 1749 rcu_assign_pointer(tsk->cgroups, newcg);
1581 task_unlock(tsk); 1750 task_unlock(tsk);
@@ -1601,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1601 * is no longer empty. 1770 * is no longer empty.
1602 */ 1771 */
1603 cgroup_wakeup_rmdir_waiter(cgrp); 1772 cgroup_wakeup_rmdir_waiter(cgrp);
1604 return 0; 1773out:
1774 if (retval) {
1775 for_each_subsys(root, ss) {
1776 if (ss == failed_ss)
1777 /*
1778 * This subsystem was the one that failed the
1779 * can_attach() check earlier, so we don't need
1780 * to call cancel_attach() against it or any
1781 * remaining subsystems.
1782 */
1783 break;
1784 if (ss->cancel_attach)
1785 ss->cancel_attach(ss, cgrp, tsk, false);
1786 }
1787 }
1788 return retval;
1605} 1789}
1606 1790
1607/* 1791/*
@@ -1667,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1667 } 1851 }
1668 return true; 1852 return true;
1669} 1853}
1854EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1670 1855
1671static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1856static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1672 const char *buffer) 1857 const char *buffer)
@@ -1935,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1935 .rename = cgroup_rename, 2120 .rename = cgroup_rename,
1936}; 2121};
1937 2122
2123/*
2124 * Check if a file is a control file
2125 */
2126static inline struct cftype *__file_cft(struct file *file)
2127{
2128 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2129 return ERR_PTR(-EINVAL);
2130 return __d_cft(file->f_dentry);
2131}
2132
1938static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2133static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1939 struct super_block *sb) 2134 struct super_block *sb)
1940{ 2135{
@@ -2054,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2054 error = PTR_ERR(dentry); 2249 error = PTR_ERR(dentry);
2055 return error; 2250 return error;
2056} 2251}
2252EXPORT_SYMBOL_GPL(cgroup_add_file);
2057 2253
2058int cgroup_add_files(struct cgroup *cgrp, 2254int cgroup_add_files(struct cgroup *cgrp,
2059 struct cgroup_subsys *subsys, 2255 struct cgroup_subsys *subsys,
@@ -2068,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2068 } 2264 }
2069 return 0; 2265 return 0;
2070} 2266}
2267EXPORT_SYMBOL_GPL(cgroup_add_files);
2071 2268
2072/** 2269/**
2073 * cgroup_task_count - count the number of tasks in a cgroup. 2270 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2453{ 2650{
2454 struct cgroup_pidlist *l; 2651 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */ 2652 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2653 struct pid_namespace *ns = current->nsproxy->pid_ns;
2654
2457 /* 2655 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case 2656 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same 2657 * the last ref-holder is trying to remove l from the list at the same
@@ -2463,12 +2661,9 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2463 mutex_lock(&cgrp->pidlist_mutex); 2661 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) { 2662 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) { 2663 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */ 2664 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2665 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2666 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l; 2667 return l;
2473 } 2668 }
2474 } 2669 }
@@ -2476,13 +2671,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2476 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2671 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2477 if (!l) { 2672 if (!l) {
2478 mutex_unlock(&cgrp->pidlist_mutex); 2673 mutex_unlock(&cgrp->pidlist_mutex);
2479 put_pid_ns(ns);
2480 return l; 2674 return l;
2481 } 2675 }
2482 init_rwsem(&l->mutex); 2676 init_rwsem(&l->mutex);
2483 down_write(&l->mutex); 2677 down_write(&l->mutex);
2484 l->key.type = type; 2678 l->key.type = type;
2485 l->key.ns = ns; 2679 l->key.ns = get_pid_ns(ns);
2486 l->use_count = 0; /* don't increment here */ 2680 l->use_count = 0; /* don't increment here */
2487 l->list = NULL; 2681 l->list = NULL;
2488 l->owner = cgrp; 2682 l->owner = cgrp;
@@ -2790,6 +2984,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2790} 2984}
2791 2985
2792/* 2986/*
2987 * Unregister event and free resources.
2988 *
2989 * Gets called from workqueue.
2990 */
2991static void cgroup_event_remove(struct work_struct *work)
2992{
2993 struct cgroup_event *event = container_of(work, struct cgroup_event,
2994 remove);
2995 struct cgroup *cgrp = event->cgrp;
2996
2997 /* TODO: check return code */
2998 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2999
3000 eventfd_ctx_put(event->eventfd);
3001 kfree(event);
3002 dput(cgrp->dentry);
3003}
3004
3005/*
3006 * Gets called on POLLHUP on eventfd when user closes it.
3007 *
3008 * Called with wqh->lock held and interrupts disabled.
3009 */
3010static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3011 int sync, void *key)
3012{
3013 struct cgroup_event *event = container_of(wait,
3014 struct cgroup_event, wait);
3015 struct cgroup *cgrp = event->cgrp;
3016 unsigned long flags = (unsigned long)key;
3017
3018 if (flags & POLLHUP) {
3019 remove_wait_queue_locked(event->wqh, &event->wait);
3020 spin_lock(&cgrp->event_list_lock);
3021 list_del(&event->list);
3022 spin_unlock(&cgrp->event_list_lock);
3023 /*
3024 * We are in atomic context, but cgroup_event_remove() may
3025 * sleep, so we have to call it in workqueue.
3026 */
3027 schedule_work(&event->remove);
3028 }
3029
3030 return 0;
3031}
3032
3033static void cgroup_event_ptable_queue_proc(struct file *file,
3034 wait_queue_head_t *wqh, poll_table *pt)
3035{
3036 struct cgroup_event *event = container_of(pt,
3037 struct cgroup_event, pt);
3038
3039 event->wqh = wqh;
3040 add_wait_queue(wqh, &event->wait);
3041}
3042
3043/*
3044 * Parse input and register new cgroup event handler.
3045 *
3046 * Input must be in format '<event_fd> <control_fd> <args>'.
3047 * Interpretation of args is defined by control file implementation.
3048 */
3049static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3050 const char *buffer)
3051{
3052 struct cgroup_event *event = NULL;
3053 unsigned int efd, cfd;
3054 struct file *efile = NULL;
3055 struct file *cfile = NULL;
3056 char *endp;
3057 int ret;
3058
3059 efd = simple_strtoul(buffer, &endp, 10);
3060 if (*endp != ' ')
3061 return -EINVAL;
3062 buffer = endp + 1;
3063
3064 cfd = simple_strtoul(buffer, &endp, 10);
3065 if ((*endp != ' ') && (*endp != '\0'))
3066 return -EINVAL;
3067 buffer = endp + 1;
3068
3069 event = kzalloc(sizeof(*event), GFP_KERNEL);
3070 if (!event)
3071 return -ENOMEM;
3072 event->cgrp = cgrp;
3073 INIT_LIST_HEAD(&event->list);
3074 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3075 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3076 INIT_WORK(&event->remove, cgroup_event_remove);
3077
3078 efile = eventfd_fget(efd);
3079 if (IS_ERR(efile)) {
3080 ret = PTR_ERR(efile);
3081 goto fail;
3082 }
3083
3084 event->eventfd = eventfd_ctx_fileget(efile);
3085 if (IS_ERR(event->eventfd)) {
3086 ret = PTR_ERR(event->eventfd);
3087 goto fail;
3088 }
3089
3090 cfile = fget(cfd);
3091 if (!cfile) {
3092 ret = -EBADF;
3093 goto fail;
3094 }
3095
3096 /* the process needs read permission on control file */
3097 ret = file_permission(cfile, MAY_READ);
3098 if (ret < 0)
3099 goto fail;
3100
3101 event->cft = __file_cft(cfile);
3102 if (IS_ERR(event->cft)) {
3103 ret = PTR_ERR(event->cft);
3104 goto fail;
3105 }
3106
3107 if (!event->cft->register_event || !event->cft->unregister_event) {
3108 ret = -EINVAL;
3109 goto fail;
3110 }
3111
3112 ret = event->cft->register_event(cgrp, event->cft,
3113 event->eventfd, buffer);
3114 if (ret)
3115 goto fail;
3116
3117 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3118 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3119 ret = 0;
3120 goto fail;
3121 }
3122
3123 /*
3124 * Events should be removed after rmdir of cgroup directory, but before
3125 * destroying subsystem state objects. Let's take reference to cgroup
3126 * directory dentry to do that.
3127 */
3128 dget(cgrp->dentry);
3129
3130 spin_lock(&cgrp->event_list_lock);
3131 list_add(&event->list, &cgrp->event_list);
3132 spin_unlock(&cgrp->event_list_lock);
3133
3134 fput(cfile);
3135 fput(efile);
3136
3137 return 0;
3138
3139fail:
3140 if (cfile)
3141 fput(cfile);
3142
3143 if (event && event->eventfd && !IS_ERR(event->eventfd))
3144 eventfd_ctx_put(event->eventfd);
3145
3146 if (!IS_ERR_OR_NULL(efile))
3147 fput(efile);
3148
3149 kfree(event);
3150
3151 return ret;
3152}
3153
3154/*
2793 * for the common functions, 'private' gives the type of file 3155 * for the common functions, 'private' gives the type of file
2794 */ 3156 */
2795/* for hysterical raisins, we can't put this on the older files */ 3157/* for hysterical raisins, we can't put this on the older files */
@@ -2814,6 +3176,11 @@ static struct cftype files[] = {
2814 .read_u64 = cgroup_read_notify_on_release, 3176 .read_u64 = cgroup_read_notify_on_release,
2815 .write_u64 = cgroup_write_notify_on_release, 3177 .write_u64 = cgroup_write_notify_on_release,
2816 }, 3178 },
3179 {
3180 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3181 .write_string = cgroup_write_event_control,
3182 .mode = S_IWUGO,
3183 },
2817}; 3184};
2818 3185
2819static struct cftype cft_release_agent = { 3186static struct cftype cft_release_agent = {
@@ -2878,8 +3245,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2878 /* We need to take each hierarchy_mutex in a consistent order */ 3245 /* We need to take each hierarchy_mutex in a consistent order */
2879 int i; 3246 int i;
2880 3247
3248 /*
3249 * No worry about a race with rebind_subsystems that might mess up the
3250 * locking order, since both parties are under cgroup_mutex.
3251 */
2881 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3252 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2882 struct cgroup_subsys *ss = subsys[i]; 3253 struct cgroup_subsys *ss = subsys[i];
3254 if (ss == NULL)
3255 continue;
2883 if (ss->root == root) 3256 if (ss->root == root)
2884 mutex_lock(&ss->hierarchy_mutex); 3257 mutex_lock(&ss->hierarchy_mutex);
2885 } 3258 }
@@ -2891,6 +3264,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2891 3264
2892 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3265 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2893 struct cgroup_subsys *ss = subsys[i]; 3266 struct cgroup_subsys *ss = subsys[i];
3267 if (ss == NULL)
3268 continue;
2894 if (ss->root == root) 3269 if (ss->root == root)
2895 mutex_unlock(&ss->hierarchy_mutex); 3270 mutex_unlock(&ss->hierarchy_mutex);
2896 } 3271 }
@@ -2937,14 +3312,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2937 3312
2938 for_each_subsys(root, ss) { 3313 for_each_subsys(root, ss) {
2939 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3314 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3315
2940 if (IS_ERR(css)) { 3316 if (IS_ERR(css)) {
2941 err = PTR_ERR(css); 3317 err = PTR_ERR(css);
2942 goto err_destroy; 3318 goto err_destroy;
2943 } 3319 }
2944 init_cgroup_css(css, ss, cgrp); 3320 init_cgroup_css(css, ss, cgrp);
2945 if (ss->use_id) 3321 if (ss->use_id) {
2946 if (alloc_css_id(ss, parent, cgrp)) 3322 err = alloc_css_id(ss, parent, cgrp);
3323 if (err)
2947 goto err_destroy; 3324 goto err_destroy;
3325 }
2948 /* At error, ->destroy() callback has to free assigned ID. */ 3326 /* At error, ->destroy() callback has to free assigned ID. */
2949 } 3327 }
2950 3328
@@ -3011,11 +3389,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3011 * synchronization other than RCU, and the subsystem linked 3389 * synchronization other than RCU, and the subsystem linked
3012 * list isn't RCU-safe */ 3390 * list isn't RCU-safe */
3013 int i; 3391 int i;
3392 /*
3393 * We won't need to lock the subsys array, because the subsystems
3394 * we're concerned about aren't going anywhere since our cgroup root
3395 * has a reference on them.
3396 */
3014 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3397 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3015 struct cgroup_subsys *ss = subsys[i]; 3398 struct cgroup_subsys *ss = subsys[i];
3016 struct cgroup_subsys_state *css; 3399 struct cgroup_subsys_state *css;
3017 /* Skip subsystems not in this hierarchy */ 3400 /* Skip subsystems not present or not in this hierarchy */
3018 if (ss->root != cgrp->root) 3401 if (ss == NULL || ss->root != cgrp->root)
3019 continue; 3402 continue;
3020 css = cgrp->subsys[ss->subsys_id]; 3403 css = cgrp->subsys[ss->subsys_id];
3021 /* When called from check_for_release() it's possible 3404 /* When called from check_for_release() it's possible
@@ -3089,6 +3472,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3089 struct dentry *d; 3472 struct dentry *d;
3090 struct cgroup *parent; 3473 struct cgroup *parent;
3091 DEFINE_WAIT(wait); 3474 DEFINE_WAIT(wait);
3475 struct cgroup_event *event, *tmp;
3092 int ret; 3476 int ret;
3093 3477
3094 /* the vfs holds both inode->i_mutex already */ 3478 /* the vfs holds both inode->i_mutex already */
@@ -3172,6 +3556,20 @@ again:
3172 set_bit(CGRP_RELEASABLE, &parent->flags); 3556 set_bit(CGRP_RELEASABLE, &parent->flags);
3173 check_for_release(parent); 3557 check_for_release(parent);
3174 3558
3559 /*
3560 * Unregister events and notify userspace.
3561 * Notify userspace about cgroup removing only after rmdir of cgroup
3562 * directory to avoid race between userspace and kernelspace
3563 */
3564 spin_lock(&cgrp->event_list_lock);
3565 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3566 list_del(&event->list);
3567 remove_wait_queue(event->wqh, &event->wait);
3568 eventfd_signal(event->eventfd, 1);
3569 schedule_work(&event->remove);
3570 }
3571 spin_unlock(&cgrp->event_list_lock);
3572
3175 mutex_unlock(&cgroup_mutex); 3573 mutex_unlock(&cgroup_mutex);
3176 return 0; 3574 return 0;
3177} 3575}
@@ -3206,9 +3604,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3206 mutex_init(&ss->hierarchy_mutex); 3604 mutex_init(&ss->hierarchy_mutex);
3207 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3605 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3208 ss->active = 1; 3606 ss->active = 1;
3607
3608 /* this function shouldn't be used with modular subsystems, since they
3609 * need to register a subsys_id, among other things */
3610 BUG_ON(ss->module);
3209} 3611}
3210 3612
3211/** 3613/**
3614 * cgroup_load_subsys: load and register a modular subsystem at runtime
3615 * @ss: the subsystem to load
3616 *
3617 * This function should be called in a modular subsystem's initcall. If the
3618 * subsystem is built as a module, it will be assigned a new subsys_id and set
3619 * up for use. If the subsystem is built-in anyway, work is delegated to the
3620 * simpler cgroup_init_subsys.
3621 */
3622int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3623{
3624 int i;
3625 struct cgroup_subsys_state *css;
3626
3627 /* check name and function validity */
3628 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3629 ss->create == NULL || ss->destroy == NULL)
3630 return -EINVAL;
3631
3632 /*
3633 * we don't support callbacks in modular subsystems. this check is
3634 * before the ss->module check for consistency; a subsystem that could
3635 * be a module should still have no callbacks even if the user isn't
3636 * compiling it as one.
3637 */
3638 if (ss->fork || ss->exit)
3639 return -EINVAL;
3640
3641 /*
3642 * an optionally modular subsystem is built-in: we want to do nothing,
3643 * since cgroup_init_subsys will have already taken care of it.
3644 */
3645 if (ss->module == NULL) {
3646 /* a few sanity checks */
3647 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3648 BUG_ON(subsys[ss->subsys_id] != ss);
3649 return 0;
3650 }
3651
3652 /*
3653 * need to register a subsys id before anything else - for example,
3654 * init_cgroup_css needs it.
3655 */
3656 mutex_lock(&cgroup_mutex);
3657 /* find the first empty slot in the array */
3658 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3659 if (subsys[i] == NULL)
3660 break;
3661 }
3662 if (i == CGROUP_SUBSYS_COUNT) {
3663 /* maximum number of subsystems already registered! */
3664 mutex_unlock(&cgroup_mutex);
3665 return -EBUSY;
3666 }
3667 /* assign ourselves the subsys_id */
3668 ss->subsys_id = i;
3669 subsys[i] = ss;
3670
3671 /*
3672 * no ss->create seems to need anything important in the ss struct, so
3673 * this can happen first (i.e. before the rootnode attachment).
3674 */
3675 css = ss->create(ss, dummytop);
3676 if (IS_ERR(css)) {
3677 /* failure case - need to deassign the subsys[] slot. */
3678 subsys[i] = NULL;
3679 mutex_unlock(&cgroup_mutex);
3680 return PTR_ERR(css);
3681 }
3682
3683 list_add(&ss->sibling, &rootnode.subsys_list);
3684 ss->root = &rootnode;
3685
3686 /* our new subsystem will be attached to the dummy hierarchy. */
3687 init_cgroup_css(css, ss, dummytop);
3688 /* init_idr must be after init_cgroup_css because it sets css->id. */
3689 if (ss->use_id) {
3690 int ret = cgroup_init_idr(ss, css);
3691 if (ret) {
3692 dummytop->subsys[ss->subsys_id] = NULL;
3693 ss->destroy(ss, dummytop);
3694 subsys[i] = NULL;
3695 mutex_unlock(&cgroup_mutex);
3696 return ret;
3697 }
3698 }
3699
3700 /*
3701 * Now we need to entangle the css into the existing css_sets. unlike
3702 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3703 * will need a new pointer to it; done by iterating the css_set_table.
3704 * furthermore, modifying the existing css_sets will corrupt the hash
3705 * table state, so each changed css_set will need its hash recomputed.
3706 * this is all done under the css_set_lock.
3707 */
3708 write_lock(&css_set_lock);
3709 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3710 struct css_set *cg;
3711 struct hlist_node *node, *tmp;
3712 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3713
3714 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3715 /* skip entries that we already rehashed */
3716 if (cg->subsys[ss->subsys_id])
3717 continue;
3718 /* remove existing entry */
3719 hlist_del(&cg->hlist);
3720 /* set new value */
3721 cg->subsys[ss->subsys_id] = css;
3722 /* recompute hash and restore entry */
3723 new_bucket = css_set_hash(cg->subsys);
3724 hlist_add_head(&cg->hlist, new_bucket);
3725 }
3726 }
3727 write_unlock(&css_set_lock);
3728
3729 mutex_init(&ss->hierarchy_mutex);
3730 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3731 ss->active = 1;
3732
3733 /* success! */
3734 mutex_unlock(&cgroup_mutex);
3735 return 0;
3736}
3737EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3738
3739/**
3740 * cgroup_unload_subsys: unload a modular subsystem
3741 * @ss: the subsystem to unload
3742 *
3743 * This function should be called in a modular subsystem's exitcall. When this
3744 * function is invoked, the refcount on the subsystem's module will be 0, so
3745 * the subsystem will not be attached to any hierarchy.
3746 */
3747void cgroup_unload_subsys(struct cgroup_subsys *ss)
3748{
3749 struct cg_cgroup_link *link;
3750 struct hlist_head *hhead;
3751
3752 BUG_ON(ss->module == NULL);
3753
3754 /*
3755 * we shouldn't be called if the subsystem is in use, and the use of
3756 * try_module_get in parse_cgroupfs_options should ensure that it
3757 * doesn't start being used while we're killing it off.
3758 */
3759 BUG_ON(ss->root != &rootnode);
3760
3761 mutex_lock(&cgroup_mutex);
3762 /* deassign the subsys_id */
3763 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3764 subsys[ss->subsys_id] = NULL;
3765
3766 /* remove subsystem from rootnode's list of subsystems */
3767 list_del(&ss->sibling);
3768
3769 /*
3770 * disentangle the css from all css_sets attached to the dummytop. as
3771 * in loading, we need to pay our respects to the hashtable gods.
3772 */
3773 write_lock(&css_set_lock);
3774 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3775 struct css_set *cg = link->cg;
3776
3777 hlist_del(&cg->hlist);
3778 BUG_ON(!cg->subsys[ss->subsys_id]);
3779 cg->subsys[ss->subsys_id] = NULL;
3780 hhead = css_set_hash(cg->subsys);
3781 hlist_add_head(&cg->hlist, hhead);
3782 }
3783 write_unlock(&css_set_lock);
3784
3785 /*
3786 * remove subsystem's css from the dummytop and free it - need to free
3787 * before marking as null because ss->destroy needs the cgrp->subsys
3788 * pointer to find their state. note that this also takes care of
3789 * freeing the css_id.
3790 */
3791 ss->destroy(ss, dummytop);
3792 dummytop->subsys[ss->subsys_id] = NULL;
3793
3794 mutex_unlock(&cgroup_mutex);
3795}
3796EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3797
3798/**
3212 * cgroup_init_early - cgroup initialization at system boot 3799 * cgroup_init_early - cgroup initialization at system boot
3213 * 3800 *
3214 * Initialize cgroups at system boot, and initialize any 3801 * Initialize cgroups at system boot, and initialize any
@@ -3236,7 +3823,8 @@ int __init cgroup_init_early(void)
3236 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3823 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3237 INIT_HLIST_HEAD(&css_set_table[i]); 3824 INIT_HLIST_HEAD(&css_set_table[i]);
3238 3825
3239 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3826 /* at bootup time, we don't worry about modular subsystems */
3827 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3240 struct cgroup_subsys *ss = subsys[i]; 3828 struct cgroup_subsys *ss = subsys[i];
3241 3829
3242 BUG_ON(!ss->name); 3830 BUG_ON(!ss->name);
@@ -3271,12 +3859,13 @@ int __init cgroup_init(void)
3271 if (err) 3859 if (err)
3272 return err; 3860 return err;
3273 3861
3274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3862 /* at bootup time, we don't worry about modular subsystems */
3863 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3275 struct cgroup_subsys *ss = subsys[i]; 3864 struct cgroup_subsys *ss = subsys[i];
3276 if (!ss->early_init) 3865 if (!ss->early_init)
3277 cgroup_init_subsys(ss); 3866 cgroup_init_subsys(ss);
3278 if (ss->use_id) 3867 if (ss->use_id)
3279 cgroup_subsys_init_idr(ss); 3868 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3280 } 3869 }
3281 3870
3282 /* Add init_css_set to the hash table */ 3871 /* Add init_css_set to the hash table */
@@ -3380,9 +3969,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3380 int i; 3969 int i;
3381 3970
3382 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3971 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3972 /*
3973 * ideally we don't want subsystems moving around while we do this.
3974 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3975 * subsys/hierarchy state.
3976 */
3383 mutex_lock(&cgroup_mutex); 3977 mutex_lock(&cgroup_mutex);
3384 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3978 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3385 struct cgroup_subsys *ss = subsys[i]; 3979 struct cgroup_subsys *ss = subsys[i];
3980 if (ss == NULL)
3981 continue;
3386 seq_printf(m, "%s\t%d\t%d\t%d\n", 3982 seq_printf(m, "%s\t%d\t%d\t%d\n",
3387 ss->name, ss->root->hierarchy_id, 3983 ss->name, ss->root->hierarchy_id,
3388 ss->root->number_of_cgroups, !ss->disabled); 3984 ss->root->number_of_cgroups, !ss->disabled);
@@ -3440,7 +4036,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3440{ 4036{
3441 if (need_forkexit_callback) { 4037 if (need_forkexit_callback) {
3442 int i; 4038 int i;
3443 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4039 /*
4040 * forkexit callbacks are only supported for builtin
4041 * subsystems, and the builtin section of the subsys array is
4042 * immutable, so we don't need to lock the subsys array here.
4043 */
4044 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3444 struct cgroup_subsys *ss = subsys[i]; 4045 struct cgroup_subsys *ss = subsys[i];
3445 if (ss->fork) 4046 if (ss->fork)
3446 ss->fork(ss, child); 4047 ss->fork(ss, child);
@@ -3509,7 +4110,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3509 struct css_set *cg; 4110 struct css_set *cg;
3510 4111
3511 if (run_callbacks && need_forkexit_callback) { 4112 if (run_callbacks && need_forkexit_callback) {
3512 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4113 /*
4114 * modular subsystems can't use callbacks, so no need to lock
4115 * the subsys array
4116 */
4117 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3513 struct cgroup_subsys *ss = subsys[i]; 4118 struct cgroup_subsys *ss = subsys[i];
3514 if (ss->exit) 4119 if (ss->exit)
3515 ss->exit(ss, tsk); 4120 ss->exit(ss, tsk);
@@ -3703,12 +4308,13 @@ static void check_for_release(struct cgroup *cgrp)
3703 } 4308 }
3704} 4309}
3705 4310
3706void __css_put(struct cgroup_subsys_state *css) 4311/* Caller must verify that the css is not for root cgroup */
4312void __css_put(struct cgroup_subsys_state *css, int count)
3707{ 4313{
3708 struct cgroup *cgrp = css->cgroup; 4314 struct cgroup *cgrp = css->cgroup;
3709 int val; 4315 int val;
3710 rcu_read_lock(); 4316 rcu_read_lock();
3711 val = atomic_dec_return(&css->refcnt); 4317 val = atomic_sub_return(count, &css->refcnt);
3712 if (val == 1) { 4318 if (val == 1) {
3713 if (notify_on_release(cgrp)) { 4319 if (notify_on_release(cgrp)) {
3714 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4320 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3719,6 +4325,7 @@ void __css_put(struct cgroup_subsys_state *css)
3719 rcu_read_unlock(); 4325 rcu_read_unlock();
3720 WARN_ON_ONCE(val < 1); 4326 WARN_ON_ONCE(val < 1);
3721} 4327}
4328EXPORT_SYMBOL_GPL(__css_put);
3722 4329
3723/* 4330/*
3724 * Notify userspace when a cgroup is released, by running the 4331 * Notify userspace when a cgroup is released, by running the
@@ -3800,8 +4407,11 @@ static int __init cgroup_disable(char *str)
3800 while ((token = strsep(&str, ",")) != NULL) { 4407 while ((token = strsep(&str, ",")) != NULL) {
3801 if (!*token) 4408 if (!*token)
3802 continue; 4409 continue;
3803 4410 /*
3804 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4411 * cgroup_disable, being at boot time, can't know about module
4412 * subsystems, so we don't worry about them.
4413 */
4414 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3805 struct cgroup_subsys *ss = subsys[i]; 4415 struct cgroup_subsys *ss = subsys[i];
3806 4416
3807 if (!strcmp(token, ss->name)) { 4417 if (!strcmp(token, ss->name)) {
@@ -3825,31 +4435,65 @@ __setup("cgroup_disable=", cgroup_disable);
3825 */ 4435 */
3826unsigned short css_id(struct cgroup_subsys_state *css) 4436unsigned short css_id(struct cgroup_subsys_state *css)
3827{ 4437{
3828 struct css_id *cssid = rcu_dereference(css->id); 4438 struct css_id *cssid;
4439
4440 /*
4441 * This css_id() can return correct value when someone has refcnt
4442 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4443 * it's unchanged until freed.
4444 */
4445 cssid = rcu_dereference_check(css->id,
4446 rcu_read_lock_held() || atomic_read(&css->refcnt));
3829 4447
3830 if (cssid) 4448 if (cssid)
3831 return cssid->id; 4449 return cssid->id;
3832 return 0; 4450 return 0;
3833} 4451}
4452EXPORT_SYMBOL_GPL(css_id);
3834 4453
3835unsigned short css_depth(struct cgroup_subsys_state *css) 4454unsigned short css_depth(struct cgroup_subsys_state *css)
3836{ 4455{
3837 struct css_id *cssid = rcu_dereference(css->id); 4456 struct css_id *cssid;
4457
4458 cssid = rcu_dereference_check(css->id,
4459 rcu_read_lock_held() || atomic_read(&css->refcnt));
3838 4460
3839 if (cssid) 4461 if (cssid)
3840 return cssid->depth; 4462 return cssid->depth;
3841 return 0; 4463 return 0;
3842} 4464}
4465EXPORT_SYMBOL_GPL(css_depth);
4466
4467/**
4468 * css_is_ancestor - test "root" css is an ancestor of "child"
4469 * @child: the css to be tested.
4470 * @root: the css supposed to be an ancestor of the child.
4471 *
4472 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4473 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
4474 * But, considering usual usage, the csses should be valid objects after test.
4475 * Assuming that the caller will do some action to the child if this returns
4476 * true, the caller must take "child"'s reference count.
4477 * If "child" is valid object and this returns true, "root" is valid, too.
4478 */
3843 4479
3844bool css_is_ancestor(struct cgroup_subsys_state *child, 4480bool css_is_ancestor(struct cgroup_subsys_state *child,
3845 const struct cgroup_subsys_state *root) 4481 const struct cgroup_subsys_state *root)
3846{ 4482{
3847 struct css_id *child_id = rcu_dereference(child->id); 4483 struct css_id *child_id;
3848 struct css_id *root_id = rcu_dereference(root->id); 4484 struct css_id *root_id;
4485 bool ret = true;
3849 4486
3850 if (!child_id || !root_id || (child_id->depth < root_id->depth)) 4487 rcu_read_lock();
3851 return false; 4488 child_id = rcu_dereference(child->id);
3852 return child_id->stack[root_id->depth] == root_id->id; 4489 root_id = rcu_dereference(root->id);
4490 if (!child_id
4491 || !root_id
4492 || (child_id->depth < root_id->depth)
4493 || (child_id->stack[root_id->depth] != root_id->id))
4494 ret = false;
4495 rcu_read_unlock();
4496 return ret;
3853} 4497}
3854 4498
3855static void __free_css_id_cb(struct rcu_head *head) 4499static void __free_css_id_cb(struct rcu_head *head)
@@ -3876,6 +4520,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3876 spin_unlock(&ss->id_lock); 4520 spin_unlock(&ss->id_lock);
3877 call_rcu(&id->rcu_head, __free_css_id_cb); 4521 call_rcu(&id->rcu_head, __free_css_id_cb);
3878} 4522}
4523EXPORT_SYMBOL_GPL(free_css_id);
3879 4524
3880/* 4525/*
3881 * This is called by init or create(). Then, calls to this function are 4526 * This is called by init or create(). Then, calls to this function are
@@ -3925,15 +4570,14 @@ err_out:
3925 4570
3926} 4571}
3927 4572
3928static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4573static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4574 struct cgroup_subsys_state *rootcss)
3929{ 4575{
3930 struct css_id *newid; 4576 struct css_id *newid;
3931 struct cgroup_subsys_state *rootcss;
3932 4577
3933 spin_lock_init(&ss->id_lock); 4578 spin_lock_init(&ss->id_lock);
3934 idr_init(&ss->idr); 4579 idr_init(&ss->idr);
3935 4580
3936 rootcss = init_css_set.subsys[ss->subsys_id];
3937 newid = get_new_cssid(ss, 0); 4581 newid = get_new_cssid(ss, 0);
3938 if (IS_ERR(newid)) 4582 if (IS_ERR(newid))
3939 return PTR_ERR(newid); 4583 return PTR_ERR(newid);
@@ -3949,13 +4593,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3949{ 4593{
3950 int subsys_id, i, depth = 0; 4594 int subsys_id, i, depth = 0;
3951 struct cgroup_subsys_state *parent_css, *child_css; 4595 struct cgroup_subsys_state *parent_css, *child_css;
3952 struct css_id *child_id, *parent_id = NULL; 4596 struct css_id *child_id, *parent_id;
3953 4597
3954 subsys_id = ss->subsys_id; 4598 subsys_id = ss->subsys_id;
3955 parent_css = parent->subsys[subsys_id]; 4599 parent_css = parent->subsys[subsys_id];
3956 child_css = child->subsys[subsys_id]; 4600 child_css = child->subsys[subsys_id];
3957 depth = css_depth(parent_css) + 1;
3958 parent_id = parent_css->id; 4601 parent_id = parent_css->id;
4602 depth = parent_id->depth + 1;
3959 4603
3960 child_id = get_new_cssid(ss, depth); 4604 child_id = get_new_cssid(ss, depth);
3961 if (IS_ERR(child_id)) 4605 if (IS_ERR(child_id))
@@ -3993,6 +4637,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3993 4637
3994 return rcu_dereference(cssid->css); 4638 return rcu_dereference(cssid->css);
3995} 4639}
4640EXPORT_SYMBOL_GPL(css_lookup);
3996 4641
3997/** 4642/**
3998 * css_get_next - lookup next cgroup under specified hierarchy. 4643 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab40..e5c0244962b0 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/slab.h>
18#include <linux/cgroup.h> 19#include <linux/cgroup.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)
47 struct freezer, css); 48 struct freezer, css);
48} 49}
49 50
50int cgroup_frozen(struct task_struct *task) 51int cgroup_freezing_or_frozen(struct task_struct *task)
51{ 52{
52 struct freezer *freezer; 53 struct freezer *freezer;
53 enum freezer_state state; 54 enum freezer_state state;
54 55
55 task_lock(task); 56 task_lock(task);
56 freezer = task_freezer(task); 57 freezer = task_freezer(task);
57 state = freezer->state; 58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
58 task_unlock(task); 62 task_unlock(task);
59 63
60 return state == CGROUP_FROZEN; 64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
61} 65}
62 66
63/* 67/*
@@ -201,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
201 * No lock is needed, since the task isn't on tasklist yet, 205 * No lock is needed, since the task isn't on tasklist yet,
202 * so it can't be moved to another cgroup, which means the 206 * so it can't be moved to another cgroup, which means the
203 * freezer won't be removed and will be valid during this 207 * freezer won't be removed and will be valid during this
204 * function call. 208 * function call. Nevertheless, apply RCU read-side critical
209 * section to suppress RCU lockdep false positives.
205 */ 210 */
211 rcu_read_lock();
206 freezer = task_freezer(task); 212 freezer = task_freezer(task);
213 rcu_read_unlock();
207 214
208 /* 215 /*
209 * The root cgroup is non-freezable, so we can skip the 216 * The root cgroup is non-freezable, so we can skip the
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea6..7f40e9275fd9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -25,6 +25,7 @@
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h> 27#include <linux/ptrace.h>
28#include <linux/gfp.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30 31
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb212..25bba73b1be3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h>
17 18
18#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
19/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 20/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -151,13 +152,13 @@ static inline void check_for_tasks(int cpu)
151 152
152 write_lock_irq(&tasklist_lock); 153 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 154 for_each_process(p) {
154 if (task_cpu(p) == cpu && 155 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 156 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 157 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 158 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 159 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 160 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 161 p->state, p->flags);
161 } 162 }
162 write_unlock_irq(&tasklist_lock); 163 write_unlock_irq(&tasklist_lock);
163} 164}
@@ -209,9 +210,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
209 return -ENOMEM; 210 return -ENOMEM;
210 211
211 cpu_hotplug_begin(); 212 cpu_hotplug_begin();
213 set_cpu_active(cpu, false);
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 214 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 215 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 216 if (err == NOTIFY_BAD) {
217 set_cpu_active(cpu, true);
218
215 nr_calls--; 219 nr_calls--;
216 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 220 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
217 hcpu, nr_calls, NULL); 221 hcpu, nr_calls, NULL);
@@ -223,11 +227,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 227
224 /* Ensure that we are not runnable on dying cpu */ 228 /* Ensure that we are not runnable on dying cpu */
225 cpumask_copy(old_allowed, &current->cpus_allowed); 229 cpumask_copy(old_allowed, &current->cpus_allowed);
226 set_cpus_allowed_ptr(current, 230 set_cpus_allowed_ptr(current, cpu_active_mask);
227 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
228 231
229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
230 if (err) { 233 if (err) {
234 set_cpu_active(cpu, true);
231 /* CPU didn't die: tell everyone. Can't complain. */ 235 /* CPU didn't die: tell everyone. Can't complain. */
232 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 236 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
233 hcpu) == NOTIFY_BAD) 237 hcpu) == NOTIFY_BAD)
@@ -278,23 +282,8 @@ int __ref cpu_down(unsigned int cpu)
278 goto out; 282 goto out;
279 } 283 }
280 284
281 set_cpu_active(cpu, false);
282
283 /*
284 * Make sure the all cpus did the reschedule and are not
285 * using stale version of the cpu_active_mask.
286 * This is not strictly necessary becuase stop_machine()
287 * that we run down the line already provides the required
288 * synchronization. But it's really a side effect and we do not
289 * want to depend on the innards of the stop_machine here.
290 */
291 synchronize_sched();
292
293 err = _cpu_down(cpu, 0); 285 err = _cpu_down(cpu, 0);
294 286
295 if (cpu_online(cpu))
296 set_cpu_active(cpu, true);
297
298out: 287out:
299 cpu_maps_update_done(); 288 cpu_maps_update_done();
300 stop_machine_destroy(); 289 stop_machine_destroy();
@@ -350,7 +339,7 @@ int __cpuinit cpu_up(unsigned int cpu)
350 if (!cpu_possible(cpu)) { 339 if (!cpu_possible(cpu)) {
351 printk(KERN_ERR "can't online cpu %d because it is not " 340 printk(KERN_ERR "can't online cpu %d because it is not "
352 "configured as may-hotadd at boot time\n", cpu); 341 "configured as may-hotadd at boot time\n", cpu);
353#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 342#if defined(CONFIG_IA64)
354 printk(KERN_ERR "please check additional_cpus= boot " 343 printk(KERN_ERR "please check additional_cpus= boot "
355 "parameter\n"); 344 "parameter\n");
356#endif 345#endif
@@ -383,19 +372,20 @@ int disable_nonboot_cpus(void)
383 return error; 372 return error;
384 cpu_maps_update_begin(); 373 cpu_maps_update_begin();
385 first_cpu = cpumask_first(cpu_online_mask); 374 first_cpu = cpumask_first(cpu_online_mask);
386 /* We take down all of the non-boot CPUs in one shot to avoid races 375 /*
376 * We take down all of the non-boot CPUs in one shot to avoid races
387 * with the userspace trying to use the CPU hotplug at the same time 377 * with the userspace trying to use the CPU hotplug at the same time
388 */ 378 */
389 cpumask_clear(frozen_cpus); 379 cpumask_clear(frozen_cpus);
380
390 printk("Disabling non-boot CPUs ...\n"); 381 printk("Disabling non-boot CPUs ...\n");
391 for_each_online_cpu(cpu) { 382 for_each_online_cpu(cpu) {
392 if (cpu == first_cpu) 383 if (cpu == first_cpu)
393 continue; 384 continue;
394 error = _cpu_down(cpu, 1); 385 error = _cpu_down(cpu, 1);
395 if (!error) { 386 if (!error)
396 cpumask_set_cpu(cpu, frozen_cpus); 387 cpumask_set_cpu(cpu, frozen_cpus);
397 printk("CPU%d is down\n", cpu); 388 else {
398 } else {
399 printk(KERN_ERR "Error taking CPU%d down: %d\n", 389 printk(KERN_ERR "Error taking CPU%d down: %d\n",
400 cpu, error); 390 cpu, error);
401 break; 391 break;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b5cb469d2545..d10946748ec2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
537 * element of the partition (one sched domain) to be passed to 537 * element of the partition (one sched domain) to be passed to
538 * partition_sched_domains(). 538 * partition_sched_domains().
539 */ 539 */
540/* FIXME: see the FIXME in partition_sched_domains() */ 540static int generate_sched_domains(cpumask_var_t **domains,
541static int generate_sched_domains(struct cpumask **domains,
542 struct sched_domain_attr **attributes) 541 struct sched_domain_attr **attributes)
543{ 542{
544 LIST_HEAD(q); /* queue of cpusets to be scanned */ 543 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
546 struct cpuset **csa; /* array of all cpuset ptrs */ 545 struct cpuset **csa; /* array of all cpuset ptrs */
547 int csn; /* how many cpuset ptrs in csa so far */ 546 int csn; /* how many cpuset ptrs in csa so far */
548 int i, j, k; /* indices for partition finding loops */ 547 int i, j, k; /* indices for partition finding loops */
549 struct cpumask *doms; /* resulting partition; i.e. sched domains */ 548 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
550 struct sched_domain_attr *dattr; /* attributes for custom domains */ 549 struct sched_domain_attr *dattr; /* attributes for custom domains */
551 int ndoms = 0; /* number of sched domains in result */ 550 int ndoms = 0; /* number of sched domains in result */
552 int nslot; /* next empty doms[] struct cpumask slot */ 551 int nslot; /* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
557 556
558 /* Special case for the 99% of systems with one, full, sched domain */ 557 /* Special case for the 99% of systems with one, full, sched domain */
559 if (is_sched_load_balance(&top_cpuset)) { 558 if (is_sched_load_balance(&top_cpuset)) {
560 doms = kmalloc(cpumask_size(), GFP_KERNEL); 559 ndoms = 1;
560 doms = alloc_sched_domains(ndoms);
561 if (!doms) 561 if (!doms)
562 goto done; 562 goto done;
563 563
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
566 *dattr = SD_ATTR_INIT; 566 *dattr = SD_ATTR_INIT;
567 update_domain_attr_tree(dattr, &top_cpuset); 567 update_domain_attr_tree(dattr, &top_cpuset);
568 } 568 }
569 cpumask_copy(doms, top_cpuset.cpus_allowed); 569 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
570 570
571 ndoms = 1;
572 goto done; 571 goto done;
573 } 572 }
574 573
@@ -636,7 +635,7 @@ restart:
636 * Now we know how many domains to create. 635 * Now we know how many domains to create.
637 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 636 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
638 */ 637 */
639 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); 638 doms = alloc_sched_domains(ndoms);
640 if (!doms) 639 if (!doms)
641 goto done; 640 goto done;
642 641
@@ -656,7 +655,7 @@ restart:
656 continue; 655 continue;
657 } 656 }
658 657
659 dp = doms + nslot; 658 dp = doms[nslot];
660 659
661 if (nslot == ndoms) { 660 if (nslot == ndoms) {
662 static int warnings = 10; 661 static int warnings = 10;
@@ -718,7 +717,7 @@ done:
718static void do_rebuild_sched_domains(struct work_struct *unused) 717static void do_rebuild_sched_domains(struct work_struct *unused)
719{ 718{
720 struct sched_domain_attr *attr; 719 struct sched_domain_attr *attr;
721 struct cpumask *doms; 720 cpumask_var_t *doms;
722 int ndoms; 721 int ndoms;
723 722
724 get_online_cpus(); 723 get_online_cpus();
@@ -738,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
738{ 737{
739} 738}
740 739
741static int generate_sched_domains(struct cpumask **domains, 740static int generate_sched_domains(cpumask_var_t **domains,
742 struct sched_domain_attr **attributes) 741 struct sched_domain_attr **attributes)
743{ 742{
744 *domains = NULL; 743 *domains = NULL;
@@ -873,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
873 if (retval < 0) 872 if (retval < 0)
874 return retval; 873 return retval;
875 874
876 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) 875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
877 return -EINVAL; 876 return -EINVAL;
878 } 877 }
879 retval = validate_change(cs, trialcs); 878 retval = validate_change(cs, trialcs);
@@ -921,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
921 * call to guarantee_online_mems(), as we know no one is changing 920 * call to guarantee_online_mems(), as we know no one is changing
922 * our task's cpuset. 921 * our task's cpuset.
923 * 922 *
924 * Hold callback_mutex around the two modifications of our tasks
925 * mems_allowed to synchronize with cpuset_mems_allowed().
926 *
927 * While the mm_struct we are migrating is typically from some 923 * While the mm_struct we are migrating is typically from some
928 * other task, the task_struct mems_allowed that we are hacking 924 * other task, the task_struct mems_allowed that we are hacking
929 * is for our current task, which must allocate new pages for that 925 * is for our current task, which must allocate new pages for that
@@ -974,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,
974 struct cpuset *cs; 970 struct cpuset *cs;
975 int migrate; 971 int migrate;
976 const nodemask_t *oldmem = scan->data; 972 const nodemask_t *oldmem = scan->data;
977 nodemask_t newmems; 973 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
974
975 if (!newmems)
976 return;
978 977
979 cs = cgroup_cs(scan->cg); 978 cs = cgroup_cs(scan->cg);
980 guarantee_online_mems(cs, &newmems); 979 guarantee_online_mems(cs, newmems);
981 980
982 task_lock(p); 981 task_lock(p);
983 cpuset_change_task_nodemask(p, &newmems); 982 cpuset_change_task_nodemask(p, newmems);
984 task_unlock(p); 983 task_unlock(p);
985 984
985 NODEMASK_FREE(newmems);
986
986 mm = get_task_mm(p); 987 mm = get_task_mm(p);
987 if (!mm) 988 if (!mm)
988 return; 989 return;
@@ -1052,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1052static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1053static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1053 const char *buf) 1054 const char *buf)
1054{ 1055{
1055 nodemask_t oldmem; 1056 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1056 int retval; 1057 int retval;
1057 struct ptr_heap heap; 1058 struct ptr_heap heap;
1058 1059
1060 if (!oldmem)
1061 return -ENOMEM;
1062
1059 /* 1063 /*
1060 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1064 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1061 * it's read-only 1065 * it's read-only
1062 */ 1066 */
1063 if (cs == &top_cpuset) 1067 if (cs == &top_cpuset) {
1064 return -EACCES; 1068 retval = -EACCES;
1069 goto done;
1070 }
1065 1071
1066 /* 1072 /*
1067 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1073 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1077,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1077 goto done; 1083 goto done;
1078 1084
1079 if (!nodes_subset(trialcs->mems_allowed, 1085 if (!nodes_subset(trialcs->mems_allowed,
1080 node_states[N_HIGH_MEMORY])) 1086 node_states[N_HIGH_MEMORY])) {
1081 return -EINVAL; 1087 retval = -EINVAL;
1088 goto done;
1089 }
1082 } 1090 }
1083 oldmem = cs->mems_allowed; 1091 *oldmem = cs->mems_allowed;
1084 if (nodes_equal(oldmem, trialcs->mems_allowed)) { 1092 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1085 retval = 0; /* Too easy - nothing to do */ 1093 retval = 0; /* Too easy - nothing to do */
1086 goto done; 1094 goto done;
1087 } 1095 }
@@ -1097,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1097 cs->mems_allowed = trialcs->mems_allowed; 1105 cs->mems_allowed = trialcs->mems_allowed;
1098 mutex_unlock(&callback_mutex); 1106 mutex_unlock(&callback_mutex);
1099 1107
1100 update_tasks_nodemask(cs, &oldmem, &heap); 1108 update_tasks_nodemask(cs, oldmem, &heap);
1101 1109
1102 heap_free(&heap); 1110 heap_free(&heap);
1103done: 1111done:
1112 NODEMASK_FREE(oldmem);
1104 return retval; 1113 return retval;
1105} 1114}
1106 1115
@@ -1385,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1385 struct cgroup *oldcont, struct task_struct *tsk, 1394 struct cgroup *oldcont, struct task_struct *tsk,
1386 bool threadgroup) 1395 bool threadgroup)
1387{ 1396{
1388 nodemask_t from, to;
1389 struct mm_struct *mm; 1397 struct mm_struct *mm;
1390 struct cpuset *cs = cgroup_cs(cont); 1398 struct cpuset *cs = cgroup_cs(cont);
1391 struct cpuset *oldcs = cgroup_cs(oldcont); 1399 struct cpuset *oldcs = cgroup_cs(oldcont);
1400 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1401 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1402
1403 if (from == NULL || to == NULL)
1404 goto alloc_fail;
1392 1405
1393 if (cs == &top_cpuset) { 1406 if (cs == &top_cpuset) {
1394 cpumask_copy(cpus_attach, cpu_possible_mask); 1407 cpumask_copy(cpus_attach, cpu_possible_mask);
1395 to = node_possible_map;
1396 } else { 1408 } else {
1397 guarantee_online_cpus(cs, cpus_attach); 1409 guarantee_online_cpus(cs, cpus_attach);
1398 guarantee_online_mems(cs, &to);
1399 } 1410 }
1411 guarantee_online_mems(cs, to);
1400 1412
1401 /* do per-task migration stuff possibly for each in the threadgroup */ 1413 /* do per-task migration stuff possibly for each in the threadgroup */
1402 cpuset_attach_task(tsk, &to, cs); 1414 cpuset_attach_task(tsk, to, cs);
1403 if (threadgroup) { 1415 if (threadgroup) {
1404 struct task_struct *c; 1416 struct task_struct *c;
1405 rcu_read_lock(); 1417 rcu_read_lock();
1406 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1418 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1407 cpuset_attach_task(c, &to, cs); 1419 cpuset_attach_task(c, to, cs);
1408 } 1420 }
1409 rcu_read_unlock(); 1421 rcu_read_unlock();
1410 } 1422 }
1411 1423
1412 /* change mm; only needs to be done once even if threadgroup */ 1424 /* change mm; only needs to be done once even if threadgroup */
1413 from = oldcs->mems_allowed; 1425 *from = oldcs->mems_allowed;
1414 to = cs->mems_allowed; 1426 *to = cs->mems_allowed;
1415 mm = get_task_mm(tsk); 1427 mm = get_task_mm(tsk);
1416 if (mm) { 1428 if (mm) {
1417 mpol_rebind_mm(mm, &to); 1429 mpol_rebind_mm(mm, to);
1418 if (is_memory_migrate(cs)) 1430 if (is_memory_migrate(cs))
1419 cpuset_migrate_mm(mm, &from, &to); 1431 cpuset_migrate_mm(mm, from, to);
1420 mmput(mm); 1432 mmput(mm);
1421 } 1433 }
1434
1435alloc_fail:
1436 NODEMASK_FREE(from);
1437 NODEMASK_FREE(to);
1422} 1438}
1423 1439
1424/* The various types of files and directories in a cpuset file system */ 1440/* The various types of files and directories in a cpuset file system */
@@ -1563,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1563 1579
1564static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1580static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1565{ 1581{
1566 nodemask_t mask; 1582 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1583 int retval;
1584
1585 if (mask == NULL)
1586 return -ENOMEM;
1567 1587
1568 mutex_lock(&callback_mutex); 1588 mutex_lock(&callback_mutex);
1569 mask = cs->mems_allowed; 1589 *mask = cs->mems_allowed;
1570 mutex_unlock(&callback_mutex); 1590 mutex_unlock(&callback_mutex);
1571 1591
1572 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1592 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1593
1594 NODEMASK_FREE(mask);
1595
1596 return retval;
1573} 1597}
1574 1598
1575static ssize_t cpuset_common_file_read(struct cgroup *cont, 1599static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1998,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1998 struct cpuset *cp; /* scans cpusets being updated */ 2022 struct cpuset *cp; /* scans cpusets being updated */
1999 struct cpuset *child; /* scans child cpusets of cp */ 2023 struct cpuset *child; /* scans child cpusets of cp */
2000 struct cgroup *cont; 2024 struct cgroup *cont;
2001 nodemask_t oldmems; 2025 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2026
2027 if (oldmems == NULL)
2028 return;
2002 2029
2003 list_add_tail((struct list_head *)&root->stack_list, &queue); 2030 list_add_tail((struct list_head *)&root->stack_list, &queue);
2004 2031
@@ -2011,16 +2038,16 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2011 } 2038 }
2012 2039
2013 /* Continue past cpusets with all cpus, mems online */ 2040 /* Continue past cpusets with all cpus, mems online */
2014 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && 2041 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2015 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2042 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2016 continue; 2043 continue;
2017 2044
2018 oldmems = cp->mems_allowed; 2045 *oldmems = cp->mems_allowed;
2019 2046
2020 /* Remove offline cpus and mems from this cpuset. */ 2047 /* Remove offline cpus and mems from this cpuset. */
2021 mutex_lock(&callback_mutex); 2048 mutex_lock(&callback_mutex);
2022 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2049 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2023 cpu_online_mask); 2050 cpu_active_mask);
2024 nodes_and(cp->mems_allowed, cp->mems_allowed, 2051 nodes_and(cp->mems_allowed, cp->mems_allowed,
2025 node_states[N_HIGH_MEMORY]); 2052 node_states[N_HIGH_MEMORY]);
2026 mutex_unlock(&callback_mutex); 2053 mutex_unlock(&callback_mutex);
@@ -2031,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2031 remove_tasks_in_empty_cpuset(cp); 2058 remove_tasks_in_empty_cpuset(cp);
2032 else { 2059 else {
2033 update_tasks_cpumask(cp, NULL); 2060 update_tasks_cpumask(cp, NULL);
2034 update_tasks_nodemask(cp, &oldmems, NULL); 2061 update_tasks_nodemask(cp, oldmems, NULL);
2035 } 2062 }
2036 } 2063 }
2064 NODEMASK_FREE(oldmems);
2037} 2065}
2038 2066
2039/* 2067/*
@@ -2052,14 +2080,16 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2052 unsigned long phase, void *unused_cpu) 2080 unsigned long phase, void *unused_cpu)
2053{ 2081{
2054 struct sched_domain_attr *attr; 2082 struct sched_domain_attr *attr;
2055 struct cpumask *doms; 2083 cpumask_var_t *doms;
2056 int ndoms; 2084 int ndoms;
2057 2085
2058 switch (phase) { 2086 switch (phase) {
2059 case CPU_ONLINE: 2087 case CPU_ONLINE:
2060 case CPU_ONLINE_FROZEN: 2088 case CPU_ONLINE_FROZEN:
2061 case CPU_DEAD: 2089 case CPU_DOWN_PREPARE:
2062 case CPU_DEAD_FROZEN: 2090 case CPU_DOWN_PREPARE_FROZEN:
2091 case CPU_DOWN_FAILED:
2092 case CPU_DOWN_FAILED_FROZEN:
2063 break; 2093 break;
2064 2094
2065 default: 2095 default:
@@ -2068,7 +2098,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2068 2098
2069 cgroup_lock(); 2099 cgroup_lock();
2070 mutex_lock(&callback_mutex); 2100 mutex_lock(&callback_mutex);
2071 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2101 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2072 mutex_unlock(&callback_mutex); 2102 mutex_unlock(&callback_mutex);
2073 scan_for_empty_cpusets(&top_cpuset); 2103 scan_for_empty_cpusets(&top_cpuset);
2074 ndoms = generate_sched_domains(&doms, &attr); 2104 ndoms = generate_sched_domains(&doms, &attr);
@@ -2089,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2089static int cpuset_track_online_nodes(struct notifier_block *self, 2119static int cpuset_track_online_nodes(struct notifier_block *self,
2090 unsigned long action, void *arg) 2120 unsigned long action, void *arg)
2091{ 2121{
2122 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2123
2124 if (oldmems == NULL)
2125 return NOTIFY_DONE;
2126
2092 cgroup_lock(); 2127 cgroup_lock();
2093 switch (action) { 2128 switch (action) {
2094 case MEM_ONLINE: 2129 case MEM_ONLINE:
2095 case MEM_OFFLINE: 2130 *oldmems = top_cpuset.mems_allowed;
2096 mutex_lock(&callback_mutex); 2131 mutex_lock(&callback_mutex);
2097 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2132 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2098 mutex_unlock(&callback_mutex); 2133 mutex_unlock(&callback_mutex);
2099 if (action == MEM_OFFLINE) 2134 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2100 scan_for_empty_cpusets(&top_cpuset); 2135 break;
2136 case MEM_OFFLINE:
2137 /*
2138 * needn't update top_cpuset.mems_allowed explicitly because
2139 * scan_for_empty_cpusets() will update it.
2140 */
2141 scan_for_empty_cpusets(&top_cpuset);
2101 break; 2142 break;
2102 default: 2143 default:
2103 break; 2144 break;
2104 } 2145 }
2105 cgroup_unlock(); 2146 cgroup_unlock();
2147
2148 NODEMASK_FREE(oldmems);
2106 return NOTIFY_OK; 2149 return NOTIFY_OK;
2107} 2150}
2108#endif 2151#endif
@@ -2115,7 +2158,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2115 2158
2116void __init cpuset_init_smp(void) 2159void __init cpuset_init_smp(void)
2117{ 2160{
2118 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2161 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2119 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2162 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2120 2163
2121 hotcpu_notifier(cpuset_track_online_cpus, 0); 2164 hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2537,15 +2580,9 @@ const struct file_operations proc_cpuset_operations = {
2537}; 2580};
2538#endif /* CONFIG_PROC_PID_CPUSET */ 2581#endif /* CONFIG_PROC_PID_CPUSET */
2539 2582
2540/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2583/* Display task mems_allowed in /proc/<pid>/status file. */
2541void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2584void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2542{ 2585{
2543 seq_printf(m, "Cpus_allowed:\t");
2544 seq_cpumask(m, &task->cpus_allowed);
2545 seq_printf(m, "\n");
2546 seq_printf(m, "Cpus_allowed_list:\t");
2547 seq_cpumask_list(m, &task->cpus_allowed);
2548 seq_printf(m, "\n");
2549 seq_printf(m, "Mems_allowed:\t"); 2586 seq_printf(m, "Mems_allowed:\t");
2550 seq_nodemask(m, &task->mems_allowed); 2587 seq_nodemask(m, &task->mems_allowed);
2551 seq_printf(m, "\n"); 2588 seq_printf(m, "\n");
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..62af1816c235 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/key.h> 15#include <linux/key.h>
15#include <linux/keyctl.h> 16#include <linux/keyctl.h>
@@ -224,7 +225,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 225#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 226 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 227 if (!new->tgcred) {
227 kfree(new); 228 kmem_cache_free(cred_jar, new);
228 return NULL; 229 return NULL;
229 } 230 }
230 atomic_set(&new->tgcred->usage, 1); 231 atomic_set(&new->tgcred->usage, 1);
@@ -364,7 +365,7 @@ struct cred *prepare_usermodehelper_creds(void)
364 365
365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); 366 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
366 if (!new) 367 if (!new)
367 return NULL; 368 goto free_tgcred;
368 369
369 kdebug("prepare_usermodehelper_creds() alloc %p", new); 370 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370 371
@@ -398,6 +399,12 @@ struct cred *prepare_usermodehelper_creds(void)
398error: 399error:
399 put_cred(new); 400 put_cred(new);
400 return NULL; 401 return NULL;
402
403free_tgcred:
404#ifdef CONFIG_KEYS
405 kfree(tgcred);
406#endif
407 return NULL;
401} 408}
402 409
403/* 410/*
@@ -786,8 +793,6 @@ bool creds_are_invalid(const struct cred *cred)
786{ 793{
787 if (cred->magic != CRED_MAGIC) 794 if (cred->magic != CRED_MAGIC)
788 return true; 795 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX 796#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) { 797 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE) 798 if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..31aa9332ef3f
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,584 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10
11/*
12 * Early reserved memory areas.
13 */
14/*
15 * need to make sure this one is bigger enough before
16 * find_fw_memmap_area could be used
17 */
18#define MAX_EARLY_RES_X 32
19
20struct early_res {
21 u64 start, end;
22 char name[15];
23 char overlap_ok;
24};
25static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
26
27static int max_early_res __initdata = MAX_EARLY_RES_X;
28static struct early_res *early_res __initdata = &early_res_x[0];
29static int early_res_count __initdata;
30
31static int __init find_overlapped_early(u64 start, u64 end)
32{
33 int i;
34 struct early_res *r;
35
36 for (i = 0; i < max_early_res && early_res[i].end; i++) {
37 r = &early_res[i];
38 if (end > r->start && start < r->end)
39 break;
40 }
41
42 return i;
43}
44
45/*
46 * Drop the i-th range from the early reservation map,
47 * by copying any higher ranges down one over it, and
48 * clearing what had been the last slot.
49 */
50static void __init drop_range(int i)
51{
52 int j;
53
54 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
55 ;
56
57 memmove(&early_res[i], &early_res[i + 1],
58 (j - 1 - i) * sizeof(struct early_res));
59
60 early_res[j - 1].end = 0;
61 early_res_count--;
62}
63
64static void __init drop_range_partial(int i, u64 start, u64 end)
65{
66 u64 common_start, common_end;
67 u64 old_start, old_end;
68
69 old_start = early_res[i].start;
70 old_end = early_res[i].end;
71 common_start = max(old_start, start);
72 common_end = min(old_end, end);
73
74 /* no overlap ? */
75 if (common_start >= common_end)
76 return;
77
78 if (old_start < common_start) {
79 /* make head segment */
80 early_res[i].end = common_start;
81 if (old_end > common_end) {
82 char name[15];
83
84 /*
85 * Save a local copy of the name, since the
86 * early_res array could get resized inside
87 * reserve_early_without_check() ->
88 * __check_and_double_early_res(), which would
89 * make the current name pointer invalid.
90 */
91 strncpy(name, early_res[i].name,
92 sizeof(early_res[i].name) - 1);
93 /* add another for left over on tail */
94 reserve_early_without_check(common_end, old_end, name);
95 }
96 return;
97 } else {
98 if (old_end > common_end) {
99 /* reuse the entry for tail left */
100 early_res[i].start = common_end;
101 return;
102 }
103 /* all covered */
104 drop_range(i);
105 }
106}
107
108/*
109 * Split any existing ranges that:
110 * 1) are marked 'overlap_ok', and
111 * 2) overlap with the stated range [start, end)
112 * into whatever portion (if any) of the existing range is entirely
113 * below or entirely above the stated range. Drop the portion
114 * of the existing range that overlaps with the stated range,
115 * which will allow the caller of this routine to then add that
116 * stated range without conflicting with any existing range.
117 */
118static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
119{
120 int i;
121 struct early_res *r;
122 u64 lower_start, lower_end;
123 u64 upper_start, upper_end;
124 char name[15];
125
126 for (i = 0; i < max_early_res && early_res[i].end; i++) {
127 r = &early_res[i];
128
129 /* Continue past non-overlapping ranges */
130 if (end <= r->start || start >= r->end)
131 continue;
132
133 /*
134 * Leave non-ok overlaps as is; let caller
135 * panic "Overlapping early reservations"
136 * when it hits this overlap.
137 */
138 if (!r->overlap_ok)
139 return;
140
141 /*
142 * We have an ok overlap. We will drop it from the early
143 * reservation map, and add back in any non-overlapping
144 * portions (lower or upper) as separate, overlap_ok,
145 * non-overlapping ranges.
146 */
147
148 /* 1. Note any non-overlapping (lower or upper) ranges. */
149 strncpy(name, r->name, sizeof(name) - 1);
150
151 lower_start = lower_end = 0;
152 upper_start = upper_end = 0;
153 if (r->start < start) {
154 lower_start = r->start;
155 lower_end = start;
156 }
157 if (r->end > end) {
158 upper_start = end;
159 upper_end = r->end;
160 }
161
162 /* 2. Drop the original ok overlapping range */
163 drop_range(i);
164
165 i--; /* resume for-loop on copied down entry */
166
167 /* 3. Add back in any non-overlapping ranges. */
168 if (lower_end)
169 reserve_early_overlap_ok(lower_start, lower_end, name);
170 if (upper_end)
171 reserve_early_overlap_ok(upper_start, upper_end, name);
172 }
173}
174
175static void __init __reserve_early(u64 start, u64 end, char *name,
176 int overlap_ok)
177{
178 int i;
179 struct early_res *r;
180
181 i = find_overlapped_early(start, end);
182 if (i >= max_early_res)
183 panic("Too many early reservations");
184 r = &early_res[i];
185 if (r->end)
186 panic("Overlapping early reservations "
187 "%llx-%llx %s to %llx-%llx %s\n",
188 start, end - 1, name ? name : "", r->start,
189 r->end - 1, r->name);
190 r->start = start;
191 r->end = end;
192 r->overlap_ok = overlap_ok;
193 if (name)
194 strncpy(r->name, name, sizeof(r->name) - 1);
195 early_res_count++;
196}
197
198/*
199 * A few early reservtations come here.
200 *
201 * The 'overlap_ok' in the name of this routine does -not- mean it
202 * is ok for these reservations to overlap an earlier reservation.
203 * Rather it means that it is ok for subsequent reservations to
204 * overlap this one.
205 *
206 * Use this entry point to reserve early ranges when you are doing
207 * so out of "Paranoia", reserving perhaps more memory than you need,
208 * just in case, and don't mind a subsequent overlapping reservation
209 * that is known to be needed.
210 *
211 * The drop_overlaps_that_are_ok() call here isn't really needed.
212 * It would be needed if we had two colliding 'overlap_ok'
213 * reservations, so that the second such would not panic on the
214 * overlap with the first. We don't have any such as of this
215 * writing, but might as well tolerate such if it happens in
216 * the future.
217 */
218void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
219{
220 drop_overlaps_that_are_ok(start, end);
221 __reserve_early(start, end, name, 1);
222}
223
224static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
225{
226 u64 start, end, size, mem;
227 struct early_res *new;
228
229 /* do we have enough slots left ? */
230 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
231 return;
232
233 /* double it */
234 mem = -1ULL;
235 size = sizeof(struct early_res) * max_early_res * 2;
236 if (early_res == early_res_x)
237 start = 0;
238 else
239 start = early_res[0].end;
240 end = ex_start;
241 if (start + size < end)
242 mem = find_fw_memmap_area(start, end, size,
243 sizeof(struct early_res));
244 if (mem == -1ULL) {
245 start = ex_end;
246 end = get_max_mapped();
247 if (start + size < end)
248 mem = find_fw_memmap_area(start, end, size,
249 sizeof(struct early_res));
250 }
251 if (mem == -1ULL)
252 panic("can not find more space for early_res array");
253
254 new = __va(mem);
255 /* save the first one for own */
256 new[0].start = mem;
257 new[0].end = mem + size;
258 new[0].overlap_ok = 0;
259 /* copy old to new */
260 if (early_res == early_res_x) {
261 memcpy(&new[1], &early_res[0],
262 sizeof(struct early_res) * max_early_res);
263 memset(&new[max_early_res+1], 0,
264 sizeof(struct early_res) * (max_early_res - 1));
265 early_res_count++;
266 } else {
267 memcpy(&new[1], &early_res[1],
268 sizeof(struct early_res) * (max_early_res - 1));
269 memset(&new[max_early_res], 0,
270 sizeof(struct early_res) * max_early_res);
271 }
272 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
273 early_res = new;
274 max_early_res *= 2;
275 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
276 max_early_res, mem, mem + size - 1);
277}
278
279/*
280 * Most early reservations come here.
281 *
282 * We first have drop_overlaps_that_are_ok() drop any pre-existing
283 * 'overlap_ok' ranges, so that we can then reserve this memory
284 * range without risk of panic'ing on an overlapping overlap_ok
285 * early reservation.
286 */
287void __init reserve_early(u64 start, u64 end, char *name)
288{
289 if (start >= end)
290 return;
291
292 __check_and_double_early_res(start, end);
293
294 drop_overlaps_that_are_ok(start, end);
295 __reserve_early(start, end, name, 0);
296}
297
298void __init reserve_early_without_check(u64 start, u64 end, char *name)
299{
300 struct early_res *r;
301
302 if (start >= end)
303 return;
304
305 __check_and_double_early_res(start, end);
306
307 r = &early_res[early_res_count];
308
309 r->start = start;
310 r->end = end;
311 r->overlap_ok = 0;
312 if (name)
313 strncpy(r->name, name, sizeof(r->name) - 1);
314 early_res_count++;
315}
316
317void __init free_early(u64 start, u64 end)
318{
319 struct early_res *r;
320 int i;
321
322 i = find_overlapped_early(start, end);
323 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start)
325 panic("free_early on not reserved area: %llx-%llx!",
326 start, end - 1);
327
328 drop_range(i);
329}
330
331void __init free_early_partial(u64 start, u64 end)
332{
333 struct early_res *r;
334 int i;
335
336 if (start == end)
337 return;
338
339 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
340 return;
341
342try_next:
343 i = find_overlapped_early(start, end);
344 if (i >= max_early_res)
345 return;
346
347 r = &early_res[i];
348 /* hole ? */
349 if (r->end >= end && r->start <= start) {
350 drop_range_partial(i, start, end);
351 return;
352 }
353
354 drop_range_partial(i, start, end);
355 goto try_next;
356}
357
358#ifdef CONFIG_NO_BOOTMEM
359static void __init subtract_early_res(struct range *range, int az)
360{
361 int i, count;
362 u64 final_start, final_end;
363 int idx = 0;
364
365 count = 0;
366 for (i = 0; i < max_early_res && early_res[i].end; i++)
367 count++;
368
369 /* need to skip first one ?*/
370 if (early_res != early_res_x)
371 idx = 1;
372
373#define DEBUG_PRINT_EARLY_RES 1
374
375#if DEBUG_PRINT_EARLY_RES
376 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
377#endif
378 for (i = idx; i < count; i++) {
379 struct early_res *r = &early_res[i];
380#if DEBUG_PRINT_EARLY_RES
381 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
382 r->start, r->end, r->name);
383#endif
384 final_start = PFN_DOWN(r->start);
385 final_end = PFN_UP(r->end);
386 if (final_start >= final_end)
387 continue;
388 subtract_range(range, az, final_start, final_end);
389 }
390
391}
392
393int __init get_free_all_memory_range(struct range **rangep, int nodeid)
394{
395 int i, count;
396 u64 start = 0, end;
397 u64 size;
398 u64 mem;
399 struct range *range;
400 int nr_range;
401
402 count = 0;
403 for (i = 0; i < max_early_res && early_res[i].end; i++)
404 count++;
405
406 count *= 2;
407
408 size = sizeof(struct range) * count;
409 end = get_max_mapped();
410#ifdef MAX_DMA32_PFN
411 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
412 start = MAX_DMA32_PFN << PAGE_SHIFT;
413#endif
414 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
415 if (mem == -1ULL)
416 panic("can not find more space for range free");
417
418 range = __va(mem);
419 /* use early_node_map[] and early_res to get range array at first */
420 memset(range, 0, size);
421 nr_range = 0;
422
423 /* need to go over early_node_map to find out good range for node */
424 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
425#ifdef CONFIG_X86_32
426 subtract_range(range, count, max_low_pfn, -1ULL);
427#endif
428 subtract_early_res(range, count);
429 nr_range = clean_sort_range(range, count);
430
431 /* need to clear it ? */
432 if (nodeid == MAX_NUMNODES) {
433 memset(&early_res[0], 0,
434 sizeof(struct early_res) * max_early_res);
435 early_res = NULL;
436 max_early_res = 0;
437 }
438
439 *rangep = range;
440 return nr_range;
441}
442#else
443void __init early_res_to_bootmem(u64 start, u64 end)
444{
445 int i, count;
446 u64 final_start, final_end;
447 int idx = 0;
448
449 count = 0;
450 for (i = 0; i < max_early_res && early_res[i].end; i++)
451 count++;
452
453 /* need to skip first one ?*/
454 if (early_res != early_res_x)
455 idx = 1;
456
457 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
458 count - idx, max_early_res, start, end);
459 for (i = idx; i < count; i++) {
460 struct early_res *r = &early_res[i];
461 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
462 r->start, r->end, r->name);
463 final_start = max(start, r->start);
464 final_end = min(end, r->end);
465 if (final_start >= final_end) {
466 printk(KERN_CONT "\n");
467 continue;
468 }
469 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
470 final_start, final_end);
471 reserve_bootmem_generic(final_start, final_end - final_start,
472 BOOTMEM_DEFAULT);
473 }
474 /* clear them */
475 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
476 early_res = NULL;
477 max_early_res = 0;
478 early_res_count = 0;
479}
480#endif
481
482/* Check for already reserved areas */
483static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
484{
485 int i;
486 u64 addr = *addrp;
487 int changed = 0;
488 struct early_res *r;
489again:
490 i = find_overlapped_early(addr, addr + size);
491 r = &early_res[i];
492 if (i < max_early_res && r->end) {
493 *addrp = addr = round_up(r->end, align);
494 changed = 1;
495 goto again;
496 }
497 return changed;
498}
499
500/* Check for already reserved areas */
501static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
502{
503 int i;
504 u64 addr = *addrp, last;
505 u64 size = *sizep;
506 int changed = 0;
507again:
508 last = addr + size;
509 for (i = 0; i < max_early_res && early_res[i].end; i++) {
510 struct early_res *r = &early_res[i];
511 if (last > r->start && addr < r->start) {
512 size = r->start - addr;
513 changed = 1;
514 goto again;
515 }
516 if (last > r->end && addr < r->end) {
517 addr = round_up(r->end, align);
518 size = last - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last <= r->end && addr >= r->start) {
523 (*sizep)++;
524 return 0;
525 }
526 }
527 if (changed) {
528 *addrp = addr;
529 *sizep = size;
530 }
531 return changed;
532}
533
534/*
535 * Find a free area with specified alignment in a specific range.
536 * only with the area.between start to end is active range from early_node_map
537 * so they are good as RAM
538 */
539u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
540 u64 size, u64 align)
541{
542 u64 addr, last;
543
544 addr = round_up(ei_start, align);
545 if (addr < start)
546 addr = round_up(start, align);
547 if (addr >= ei_last)
548 goto out;
549 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
550 ;
551 last = addr + size;
552 if (last > ei_last)
553 goto out;
554 if (last > end)
555 goto out;
556
557 return addr;
558
559out:
560 return -1ULL;
561}
562
563u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
564 u64 *sizep, u64 align)
565{
566 u64 addr, last;
567
568 addr = round_up(ei_start, align);
569 if (addr < start)
570 addr = round_up(start, align);
571 if (addr >= ei_last)
572 goto out;
573 *sizep = ei_last - addr;
574 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
575 ;
576 last = addr + *sizep;
577 if (last > ei_last)
578 goto out;
579
580 return addr;
581
582out:
583 return -1ULL;
584}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
diff --git a/kernel/exit.c b/kernel/exit.c
index 3da04257aeaf..256ce8c2ebc8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -69,10 +70,10 @@ static void __unhash_process(struct task_struct *p)
69 detach_pid(p, PIDTYPE_SID); 70 detach_pid(p, PIDTYPE_SID);
70 71
71 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling);
72 __get_cpu_var(process_counts)--; 74 __get_cpu_var(process_counts)--;
73 } 75 }
74 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
75 list_del_init(&p->sibling);
76} 77}
77 78
78/* 79/*
@@ -86,7 +87,9 @@ static void __exit_signal(struct task_struct *tsk)
86 BUG_ON(!sig); 87 BUG_ON(!sig);
87 BUG_ON(!atomic_read(&sig->count)); 88 BUG_ON(!atomic_read(&sig->count));
88 89
89 sighand = rcu_dereference(tsk->sighand); 90 sighand = rcu_dereference_check(tsk->sighand,
91 rcu_read_lock_held() ||
92 lockdep_tasklist_lock_is_held());
90 spin_lock(&sighand->siglock); 93 spin_lock(&sighand->siglock);
91 94
92 posix_cpu_timers_exit(tsk); 95 posix_cpu_timers_exit(tsk);
@@ -112,9 +115,9 @@ static void __exit_signal(struct task_struct *tsk)
112 * We won't ever get here for the group leader, since it 115 * We won't ever get here for the group leader, since it
113 * will have been the last reference on the signal_struct. 116 * will have been the last reference on the signal_struct.
114 */ 117 */
115 sig->utime = cputime_add(sig->utime, task_utime(tsk)); 118 sig->utime = cputime_add(sig->utime, tsk->utime);
116 sig->stime = cputime_add(sig->stime, task_stime(tsk)); 119 sig->stime = cputime_add(sig->stime, tsk->stime);
117 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 120 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
118 sig->min_flt += tsk->min_flt; 121 sig->min_flt += tsk->min_flt;
119 sig->maj_flt += tsk->maj_flt; 122 sig->maj_flt += tsk->maj_flt;
120 sig->nvcsw += tsk->nvcsw; 123 sig->nvcsw += tsk->nvcsw;
@@ -171,8 +174,10 @@ void release_task(struct task_struct * p)
171repeat: 174repeat:
172 tracehook_prepare_release_task(p); 175 tracehook_prepare_release_task(p);
173 /* don't need to get the RCU readlock here - the process is dead and 176 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials */ 177 * can't be modifying its own credentials. But shut RCU-lockdep up */
178 rcu_read_lock();
175 atomic_dec(&__task_cred(p)->user->processes); 179 atomic_dec(&__task_cred(p)->user->processes);
180 rcu_read_unlock();
176 181
177 proc_flush_task(p); 182 proc_flush_task(p);
178 183
@@ -474,9 +479,11 @@ static void close_files(struct files_struct * files)
474 /* 479 /*
475 * It is safe to dereference the fd table without RCU or 480 * It is safe to dereference the fd table without RCU or
476 * ->file_lock because this is the last reference to the 481 * ->file_lock because this is the last reference to the
477 * files structure. 482 * files structure. But use RCU to shut RCU-lockdep up.
478 */ 483 */
484 rcu_read_lock();
479 fdt = files_fdtable(files); 485 fdt = files_fdtable(files);
486 rcu_read_unlock();
480 for (;;) { 487 for (;;) {
481 unsigned long set; 488 unsigned long set;
482 i = j * __NFDBITS; 489 i = j * __NFDBITS;
@@ -522,10 +529,12 @@ void put_files_struct(struct files_struct *files)
522 * at the end of the RCU grace period. Otherwise, 529 * at the end of the RCU grace period. Otherwise,
523 * you can free files immediately. 530 * you can free files immediately.
524 */ 531 */
532 rcu_read_lock();
525 fdt = files_fdtable(files); 533 fdt = files_fdtable(files);
526 if (fdt != &files->fdtab) 534 if (fdt != &files->fdtab)
527 kmem_cache_free(files_cachep, files); 535 kmem_cache_free(files_cachep, files);
528 free_fdtable(fdt); 536 free_fdtable(fdt);
537 rcu_read_unlock();
529 } 538 }
530} 539}
531 540
@@ -737,12 +746,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
737/* 746/*
738* Any that need to be release_task'd are put on the @dead list. 747* Any that need to be release_task'd are put on the @dead list.
739 */ 748 */
740static void reparent_thread(struct task_struct *father, struct task_struct *p, 749static void reparent_leader(struct task_struct *father, struct task_struct *p,
741 struct list_head *dead) 750 struct list_head *dead)
742{ 751{
743 if (p->pdeath_signal)
744 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
745
746 list_move_tail(&p->sibling, &p->real_parent->children); 752 list_move_tail(&p->sibling, &p->real_parent->children);
747 753
748 if (task_detached(p)) 754 if (task_detached(p))
@@ -781,12 +787,18 @@ static void forget_original_parent(struct task_struct *father)
781 reaper = find_new_reaper(father); 787 reaper = find_new_reaper(father);
782 788
783 list_for_each_entry_safe(p, n, &father->children, sibling) { 789 list_for_each_entry_safe(p, n, &father->children, sibling) {
784 p->real_parent = reaper; 790 struct task_struct *t = p;
785 if (p->parent == father) { 791 do {
786 BUG_ON(task_ptrace(p)); 792 t->real_parent = reaper;
787 p->parent = p->real_parent; 793 if (t->parent == father) {
788 } 794 BUG_ON(task_ptrace(t));
789 reparent_thread(father, p, &dead_children); 795 t->parent = t->real_parent;
796 }
797 if (t->pdeath_signal)
798 group_send_sig_info(t->pdeath_signal,
799 SEND_SIG_NOINFO, t);
800 } while_each_thread(p, t);
801 reparent_leader(father, p, &dead_children);
790 } 802 }
791 write_unlock_irq(&tasklist_lock); 803 write_unlock_irq(&tasklist_lock);
792 804
@@ -934,7 +946,7 @@ NORET_TYPE void do_exit(long code)
934 * an exiting task cleaning up the robust pi futexes. 946 * an exiting task cleaning up the robust pi futexes.
935 */ 947 */
936 smp_mb(); 948 smp_mb();
937 spin_unlock_wait(&tsk->pi_lock); 949 raw_spin_unlock_wait(&tsk->pi_lock);
938 950
939 if (unlikely(in_atomic())) 951 if (unlikely(in_atomic()))
940 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 952 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -942,7 +954,9 @@ NORET_TYPE void do_exit(long code)
942 preempt_count()); 954 preempt_count());
943 955
944 acct_update_integrals(tsk); 956 acct_update_integrals(tsk);
945 957 /* sync mm's RSS info before statistics gathering */
958 if (tsk->mm)
959 sync_mm_rss(tsk, tsk->mm);
946 group_dead = atomic_dec_and_test(&tsk->signal->live); 960 group_dead = atomic_dec_and_test(&tsk->signal->live);
947 if (group_dead) { 961 if (group_dead) {
948 hrtimer_cancel(&tsk->signal->real_timer); 962 hrtimer_cancel(&tsk->signal->real_timer);
@@ -974,7 +988,7 @@ NORET_TYPE void do_exit(long code)
974 exit_thread(); 988 exit_thread();
975 cgroup_exit(tsk, 1); 989 cgroup_exit(tsk, 1);
976 990
977 if (group_dead && tsk->signal->leader) 991 if (group_dead)
978 disassociate_ctty(1); 992 disassociate_ctty(1);
979 993
980 module_put(task_thread_info(tsk)->exec_domain->module); 994 module_put(task_thread_info(tsk)->exec_domain->module);
@@ -982,6 +996,10 @@ NORET_TYPE void do_exit(long code)
982 proc_exit_connector(tsk); 996 proc_exit_connector(tsk);
983 997
984 /* 998 /*
999 * FIXME: do that only when needed, using sched_exit tracepoint
1000 */
1001 flush_ptrace_hw_breakpoint(tsk);
1002 /*
985 * Flush inherited counters to the parent - before the parent 1003 * Flush inherited counters to the parent - before the parent
986 * gets woken up by child-exit notifications. 1004 * gets woken up by child-exit notifications.
987 */ 1005 */
@@ -1008,7 +1026,7 @@ NORET_TYPE void do_exit(long code)
1008 tsk->flags |= PF_EXITPIDONE; 1026 tsk->flags |= PF_EXITPIDONE;
1009 1027
1010 if (tsk->io_context) 1028 if (tsk->io_context)
1011 exit_io_context(); 1029 exit_io_context(tsk);
1012 1030
1013 if (tsk->splice_pipe) 1031 if (tsk->splice_pipe)
1014 __free_pipe_info(tsk->splice_pipe); 1032 __free_pipe_info(tsk->splice_pipe);
@@ -1176,7 +1194,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1176 1194
1177 if (unlikely(wo->wo_flags & WNOWAIT)) { 1195 if (unlikely(wo->wo_flags & WNOWAIT)) {
1178 int exit_code = p->exit_code; 1196 int exit_code = p->exit_code;
1179 int why, status; 1197 int why;
1180 1198
1181 get_task_struct(p); 1199 get_task_struct(p);
1182 read_unlock(&tasklist_lock); 1200 read_unlock(&tasklist_lock);
@@ -1209,6 +1227,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1209 struct signal_struct *psig; 1227 struct signal_struct *psig;
1210 struct signal_struct *sig; 1228 struct signal_struct *sig;
1211 unsigned long maxrss; 1229 unsigned long maxrss;
1230 cputime_t tgutime, tgstime;
1212 1231
1213 /* 1232 /*
1214 * The resource counters for the group leader are in its 1233 * The resource counters for the group leader are in its
@@ -1224,20 +1243,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1224 * need to protect the access to parent->signal fields, 1243 * need to protect the access to parent->signal fields,
1225 * as other threads in the parent group can be right 1244 * as other threads in the parent group can be right
1226 * here reaping other children at the same time. 1245 * here reaping other children at the same time.
1246 *
1247 * We use thread_group_times() to get times for the thread
1248 * group, which consolidates times for all threads in the
1249 * group including the group leader.
1227 */ 1250 */
1251 thread_group_times(p, &tgutime, &tgstime);
1228 spin_lock_irq(&p->real_parent->sighand->siglock); 1252 spin_lock_irq(&p->real_parent->sighand->siglock);
1229 psig = p->real_parent->signal; 1253 psig = p->real_parent->signal;
1230 sig = p->signal; 1254 sig = p->signal;
1231 psig->cutime = 1255 psig->cutime =
1232 cputime_add(psig->cutime, 1256 cputime_add(psig->cutime,
1233 cputime_add(p->utime, 1257 cputime_add(tgutime,
1234 cputime_add(sig->utime, 1258 sig->cutime));
1235 sig->cutime)));
1236 psig->cstime = 1259 psig->cstime =
1237 cputime_add(psig->cstime, 1260 cputime_add(psig->cstime,
1238 cputime_add(p->stime, 1261 cputime_add(tgstime,
1239 cputime_add(sig->stime, 1262 sig->cstime));
1240 sig->cstime)));
1241 psig->cgtime = 1263 psig->cgtime =
1242 cputime_add(psig->cgtime, 1264 cputime_add(psig->cgtime,
1243 cputime_add(p->gtime, 1265 cputime_add(p->gtime,
@@ -1546,14 +1568,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1546 struct task_struct *p; 1568 struct task_struct *p;
1547 1569
1548 list_for_each_entry(p, &tsk->children, sibling) { 1570 list_for_each_entry(p, &tsk->children, sibling) {
1549 /* 1571 int ret = wait_consider_task(wo, 0, p);
1550 * Do not consider detached threads. 1572 if (ret)
1551 */ 1573 return ret;
1552 if (!task_detached(p)) {
1553 int ret = wait_consider_task(wo, 0, p);
1554 if (ret)
1555 return ret;
1556 }
1557 } 1574 }
1558 1575
1559 return 0; 1576 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9fad346d7029..166eb780dd7d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,7 @@
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h>
67 68
68#include <asm/pgtable.h> 69#include <asm/pgtable.h>
69#include <asm/pgalloc.h> 70#include <asm/pgalloc.h>
@@ -89,6 +90,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
89 90
90__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 91__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
91 92
93#ifdef CONFIG_PROVE_RCU
94int lockdep_tasklist_lock_is_held(void)
95{
96 return lockdep_is_held(&tasklist_lock);
97}
98EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
99#endif /* #ifdef CONFIG_PROVE_RCU */
100
92int nr_processes(void) 101int nr_processes(void)
93{ 102{
94 int cpu; 103 int cpu;
@@ -256,6 +265,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
256 goto out; 265 goto out;
257 266
258 setup_thread_stack(tsk, orig); 267 setup_thread_stack(tsk, orig);
268 clear_user_return_notifier(tsk);
259 stackend = end_of_stack(tsk); 269 stackend = end_of_stack(tsk);
260 *stackend = STACK_END_MAGIC; /* for overflow detection */ 270 *stackend = STACK_END_MAGIC; /* for overflow detection */
261 271
@@ -333,15 +343,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
333 if (!tmp) 343 if (!tmp)
334 goto fail_nomem; 344 goto fail_nomem;
335 *tmp = *mpnt; 345 *tmp = *mpnt;
346 INIT_LIST_HEAD(&tmp->anon_vma_chain);
336 pol = mpol_dup(vma_policy(mpnt)); 347 pol = mpol_dup(vma_policy(mpnt));
337 retval = PTR_ERR(pol); 348 retval = PTR_ERR(pol);
338 if (IS_ERR(pol)) 349 if (IS_ERR(pol))
339 goto fail_nomem_policy; 350 goto fail_nomem_policy;
340 vma_set_policy(tmp, pol); 351 vma_set_policy(tmp, pol);
352 if (anon_vma_fork(tmp, mpnt))
353 goto fail_nomem_anon_vma_fork;
341 tmp->vm_flags &= ~VM_LOCKED; 354 tmp->vm_flags &= ~VM_LOCKED;
342 tmp->vm_mm = mm; 355 tmp->vm_mm = mm;
343 tmp->vm_next = NULL; 356 tmp->vm_next = NULL;
344 anon_vma_link(tmp);
345 file = tmp->vm_file; 357 file = tmp->vm_file;
346 if (file) { 358 if (file) {
347 struct inode *inode = file->f_path.dentry->d_inode; 359 struct inode *inode = file->f_path.dentry->d_inode;
@@ -396,6 +408,8 @@ out:
396 flush_tlb_mm(oldmm); 408 flush_tlb_mm(oldmm);
397 up_write(&oldmm->mmap_sem); 409 up_write(&oldmm->mmap_sem);
398 return retval; 410 return retval;
411fail_nomem_anon_vma_fork:
412 mpol_put(pol);
399fail_nomem_policy: 413fail_nomem_policy:
400 kmem_cache_free(vm_area_cachep, tmp); 414 kmem_cache_free(vm_area_cachep, tmp);
401fail_nomem: 415fail_nomem:
@@ -459,8 +473,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
459 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 473 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
460 mm->core_state = NULL; 474 mm->core_state = NULL;
461 mm->nr_ptes = 0; 475 mm->nr_ptes = 0;
462 set_mm_counter(mm, file_rss, 0); 476 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
463 set_mm_counter(mm, anon_rss, 0);
464 spin_lock_init(&mm->page_table_lock); 477 spin_lock_init(&mm->page_table_lock);
465 mm->free_area_cache = TASK_UNMAPPED_BASE; 478 mm->free_area_cache = TASK_UNMAPPED_BASE;
466 mm->cached_hole_size = ~0UL; 479 mm->cached_hole_size = ~0UL;
@@ -829,23 +842,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
829 */ 842 */
830static void posix_cpu_timers_init_group(struct signal_struct *sig) 843static void posix_cpu_timers_init_group(struct signal_struct *sig)
831{ 844{
845 unsigned long cpu_limit;
846
832 /* Thread group counters. */ 847 /* Thread group counters. */
833 thread_group_cputime_init(sig); 848 thread_group_cputime_init(sig);
834 849
835 /* Expiration times and increments. */ 850 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
836 sig->it[CPUCLOCK_PROF].expires = cputime_zero; 851 if (cpu_limit != RLIM_INFINITY) {
837 sig->it[CPUCLOCK_PROF].incr = cputime_zero; 852 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
838 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
839 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
840
841 /* Cached expiration times. */
842 sig->cputime_expires.prof_exp = cputime_zero;
843 sig->cputime_expires.virt_exp = cputime_zero;
844 sig->cputime_expires.sched_exp = 0;
845
846 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
847 sig->cputime_expires.prof_exp =
848 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
849 sig->cputimer.running = 1; 853 sig->cputimer.running = 1;
850 } 854 }
851 855
@@ -862,7 +866,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
862 if (clone_flags & CLONE_THREAD) 866 if (clone_flags & CLONE_THREAD)
863 return 0; 867 return 0;
864 868
865 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 869 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
866 tsk->signal = sig; 870 tsk->signal = sig;
867 if (!sig) 871 if (!sig)
868 return -ENOMEM; 872 return -ENOMEM;
@@ -870,43 +874,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
870 atomic_set(&sig->count, 1); 874 atomic_set(&sig->count, 1);
871 atomic_set(&sig->live, 1); 875 atomic_set(&sig->live, 1);
872 init_waitqueue_head(&sig->wait_chldexit); 876 init_waitqueue_head(&sig->wait_chldexit);
873 sig->flags = 0;
874 if (clone_flags & CLONE_NEWPID) 877 if (clone_flags & CLONE_NEWPID)
875 sig->flags |= SIGNAL_UNKILLABLE; 878 sig->flags |= SIGNAL_UNKILLABLE;
876 sig->group_exit_code = 0;
877 sig->group_exit_task = NULL;
878 sig->group_stop_count = 0;
879 sig->curr_target = tsk; 879 sig->curr_target = tsk;
880 init_sigpending(&sig->shared_pending); 880 init_sigpending(&sig->shared_pending);
881 INIT_LIST_HEAD(&sig->posix_timers); 881 INIT_LIST_HEAD(&sig->posix_timers);
882 882
883 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 883 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
884 sig->it_real_incr.tv64 = 0;
885 sig->real_timer.function = it_real_fn; 884 sig->real_timer.function = it_real_fn;
886 885
887 sig->leader = 0; /* session leadership doesn't inherit */
888 sig->tty_old_pgrp = NULL;
889 sig->tty = NULL;
890
891 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
892 sig->gtime = cputime_zero;
893 sig->cgtime = cputime_zero;
894 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
895 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
896 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
897 sig->maxrss = sig->cmaxrss = 0;
898 task_io_accounting_init(&sig->ioac);
899 sig->sum_sched_runtime = 0;
900 taskstats_tgid_init(sig);
901
902 task_lock(current->group_leader); 886 task_lock(current->group_leader);
903 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 887 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
904 task_unlock(current->group_leader); 888 task_unlock(current->group_leader);
905 889
906 posix_cpu_timers_init_group(sig); 890 posix_cpu_timers_init_group(sig);
907 891
908 acct_init_pacct(&sig->pacct);
909
910 tty_audit_fork(sig); 892 tty_audit_fork(sig);
911 893
912 sig->oom_adj = current->signal->oom_adj; 894 sig->oom_adj = current->signal->oom_adj;
@@ -941,9 +923,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
941 923
942static void rt_mutex_init_task(struct task_struct *p) 924static void rt_mutex_init_task(struct task_struct *p)
943{ 925{
944 spin_lock_init(&p->pi_lock); 926 raw_spin_lock_init(&p->pi_lock);
945#ifdef CONFIG_RT_MUTEXES 927#ifdef CONFIG_RT_MUTEXES
946 plist_head_init(&p->pi_waiters, &p->pi_lock); 928 plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
947 p->pi_blocked_on = NULL; 929 p->pi_blocked_on = NULL;
948#endif 930#endif
949} 931}
@@ -1035,7 +1017,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1035#endif 1017#endif
1036 retval = -EAGAIN; 1018 retval = -EAGAIN;
1037 if (atomic_read(&p->real_cred->user->processes) >= 1019 if (atomic_read(&p->real_cred->user->processes) >=
1038 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1020 task_rlimit(p, RLIMIT_NPROC)) {
1039 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1021 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1040 p->real_cred->user != INIT_USER) 1022 p->real_cred->user != INIT_USER)
1041 goto bad_fork_free; 1023 goto bad_fork_free;
@@ -1073,8 +1055,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1073 p->gtime = cputime_zero; 1055 p->gtime = cputime_zero;
1074 p->utimescaled = cputime_zero; 1056 p->utimescaled = cputime_zero;
1075 p->stimescaled = cputime_zero; 1057 p->stimescaled = cputime_zero;
1058#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1076 p->prev_utime = cputime_zero; 1059 p->prev_utime = cputime_zero;
1077 p->prev_stime = cputime_zero; 1060 p->prev_stime = cputime_zero;
1061#endif
1062#if defined(SPLIT_RSS_COUNTING)
1063 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1064#endif
1078 1065
1079 p->default_timer_slack_ns = current->timer_slack_ns; 1066 p->default_timer_slack_ns = current->timer_slack_ns;
1080 1067
@@ -1127,11 +1114,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1127#ifdef CONFIG_DEBUG_MUTEXES 1114#ifdef CONFIG_DEBUG_MUTEXES
1128 p->blocked_on = NULL; /* not blocked yet */ 1115 p->blocked_on = NULL; /* not blocked yet */
1129#endif 1116#endif
1117#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1118 p->memcg_batch.do_batch = 0;
1119 p->memcg_batch.memcg = NULL;
1120#endif
1130 1121
1131 p->bts = NULL; 1122 p->bts = NULL;
1132 1123
1133 p->stack_start = stack_start;
1134
1135 /* Perform scheduler related setup. Assign this task to a CPU. */ 1124 /* Perform scheduler related setup. Assign this task to a CPU. */
1136 sched_fork(p, clone_flags); 1125 sched_fork(p, clone_flags);
1137 1126
@@ -1206,9 +1195,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1206 p->sas_ss_sp = p->sas_ss_size = 0; 1195 p->sas_ss_sp = p->sas_ss_size = 0;
1207 1196
1208 /* 1197 /*
1209 * Syscall tracing should be turned off in the child regardless 1198 * Syscall tracing and stepping should be turned off in the
1210 * of CLONE_PTRACE. 1199 * child regardless of CLONE_PTRACE.
1211 */ 1200 */
1201 user_disable_single_step(p);
1212 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1202 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1213#ifdef TIF_SYSCALL_EMU 1203#ifdef TIF_SYSCALL_EMU
1214 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1204 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
@@ -1236,21 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1236 /* Need tasklist lock for parent etc handling! */ 1226 /* Need tasklist lock for parent etc handling! */
1237 write_lock_irq(&tasklist_lock); 1227 write_lock_irq(&tasklist_lock);
1238 1228
1239 /*
1240 * The task hasn't been attached yet, so its cpus_allowed mask will
1241 * not be changed, nor will its assigned CPU.
1242 *
1243 * The cpus_allowed mask of the parent may have changed after it was
1244 * copied first time - so re-copy it here, then check the child's CPU
1245 * to ensure it is on a valid CPU (and if not, just force it back to
1246 * parent's CPU). This avoids alot of nasty races.
1247 */
1248 p->cpus_allowed = current->cpus_allowed;
1249 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1250 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1251 !cpu_online(task_cpu(p))))
1252 set_task_cpu(p, smp_processor_id());
1253
1254 /* CLONE_PARENT re-uses the old parent */ 1229 /* CLONE_PARENT re-uses the old parent */
1255 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1230 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1256 p->real_parent = current->real_parent; 1231 p->real_parent = current->real_parent;
@@ -1286,7 +1261,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1286 } 1261 }
1287 1262
1288 if (likely(p->pid)) { 1263 if (likely(p->pid)) {
1289 list_add_tail(&p->sibling, &p->real_parent->children);
1290 tracehook_finish_clone(p, clone_flags, trace); 1264 tracehook_finish_clone(p, clone_flags, trace);
1291 1265
1292 if (thread_group_leader(p)) { 1266 if (thread_group_leader(p)) {
@@ -1298,6 +1272,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1298 p->signal->tty = tty_kref_get(current->signal->tty); 1272 p->signal->tty = tty_kref_get(current->signal->tty);
1299 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1273 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1300 attach_pid(p, PIDTYPE_SID, task_session(current)); 1274 attach_pid(p, PIDTYPE_SID, task_session(current));
1275 list_add_tail(&p->sibling, &p->real_parent->children);
1301 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1276 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1302 __get_cpu_var(process_counts)++; 1277 __get_cpu_var(process_counts)++;
1303 } 1278 }
@@ -1317,7 +1292,8 @@ bad_fork_free_pid:
1317 if (pid != &init_struct_pid) 1292 if (pid != &init_struct_pid)
1318 free_pid(pid); 1293 free_pid(pid);
1319bad_fork_cleanup_io: 1294bad_fork_cleanup_io:
1320 put_io_context(p->io_context); 1295 if (p->io_context)
1296 exit_io_context(p);
1321bad_fork_cleanup_namespaces: 1297bad_fork_cleanup_namespaces:
1322 exit_task_namespaces(p); 1298 exit_task_namespaces(p);
1323bad_fork_cleanup_mm: 1299bad_fork_cleanup_mm:
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc41..e7a35f1039e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
203 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
205 * @key: address where result is stored. 205 * @key: address where result is stored.
206 * @rw: mapping needs to be read/write (values: VERIFY_READ,
207 * VERIFY_WRITE)
208 * 206 *
209 * Returns a negative error code or 0 207 * Returns a negative error code or 0
210 * The key words are stored in *key on success. 208 * The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
216 * lock_page() might sleep, the caller should not hold a spinlock. 214 * lock_page() might sleep, the caller should not hold a spinlock.
217 */ 215 */
218static int 216static int
219get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 217get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
220{ 218{
221 unsigned long address = (unsigned long)uaddr; 219 unsigned long address = (unsigned long)uaddr;
222 struct mm_struct *mm = current->mm; 220 struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
239 * but access_ok() should be faster than find_vma() 237 * but access_ok() should be faster than find_vma()
240 */ 238 */
241 if (!fshared) { 239 if (!fshared) {
242 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 240 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
243 return -EFAULT; 241 return -EFAULT;
244 key->private.mm = mm; 242 key->private.mm = mm;
245 key->private.address = address; 243 key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
248 } 246 }
249 247
250again: 248again:
251 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); 249 err = get_user_pages_fast(address, 1, 1, &page);
252 if (err < 0) 250 if (err < 0)
253 return err; 251 return err;
254 252
@@ -304,8 +302,14 @@ void put_futex_key(int fshared, union futex_key *key)
304 */ 302 */
305static int fault_in_user_writeable(u32 __user *uaddr) 303static int fault_in_user_writeable(u32 __user *uaddr)
306{ 304{
307 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, 305 struct mm_struct *mm = current->mm;
308 1, 1, 0, NULL, NULL); 306 int ret;
307
308 down_read(&mm->mmap_sem);
309 ret = get_user_pages(current, mm, (unsigned long)uaddr,
310 1, 1, 0, NULL, NULL);
311 up_read(&mm->mmap_sem);
312
309 return ret < 0 ? ret : 0; 313 return ret < 0 ? ret : 0;
310} 314}
311 315
@@ -397,9 +401,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
397 * and has cleaned up the pi_state already 401 * and has cleaned up the pi_state already
398 */ 402 */
399 if (pi_state->owner) { 403 if (pi_state->owner) {
400 spin_lock_irq(&pi_state->owner->pi_lock); 404 raw_spin_lock_irq(&pi_state->owner->pi_lock);
401 list_del_init(&pi_state->list); 405 list_del_init(&pi_state->list);
402 spin_unlock_irq(&pi_state->owner->pi_lock); 406 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
403 407
404 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 408 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
405 } 409 }
@@ -464,18 +468,18 @@ void exit_pi_state_list(struct task_struct *curr)
464 * pi_state_list anymore, but we have to be careful 468 * pi_state_list anymore, but we have to be careful
465 * versus waiters unqueueing themselves: 469 * versus waiters unqueueing themselves:
466 */ 470 */
467 spin_lock_irq(&curr->pi_lock); 471 raw_spin_lock_irq(&curr->pi_lock);
468 while (!list_empty(head)) { 472 while (!list_empty(head)) {
469 473
470 next = head->next; 474 next = head->next;
471 pi_state = list_entry(next, struct futex_pi_state, list); 475 pi_state = list_entry(next, struct futex_pi_state, list);
472 key = pi_state->key; 476 key = pi_state->key;
473 hb = hash_futex(&key); 477 hb = hash_futex(&key);
474 spin_unlock_irq(&curr->pi_lock); 478 raw_spin_unlock_irq(&curr->pi_lock);
475 479
476 spin_lock(&hb->lock); 480 spin_lock(&hb->lock);
477 481
478 spin_lock_irq(&curr->pi_lock); 482 raw_spin_lock_irq(&curr->pi_lock);
479 /* 483 /*
480 * We dropped the pi-lock, so re-check whether this 484 * We dropped the pi-lock, so re-check whether this
481 * task still owns the PI-state: 485 * task still owns the PI-state:
@@ -489,15 +493,15 @@ void exit_pi_state_list(struct task_struct *curr)
489 WARN_ON(list_empty(&pi_state->list)); 493 WARN_ON(list_empty(&pi_state->list));
490 list_del_init(&pi_state->list); 494 list_del_init(&pi_state->list);
491 pi_state->owner = NULL; 495 pi_state->owner = NULL;
492 spin_unlock_irq(&curr->pi_lock); 496 raw_spin_unlock_irq(&curr->pi_lock);
493 497
494 rt_mutex_unlock(&pi_state->pi_mutex); 498 rt_mutex_unlock(&pi_state->pi_mutex);
495 499
496 spin_unlock(&hb->lock); 500 spin_unlock(&hb->lock);
497 501
498 spin_lock_irq(&curr->pi_lock); 502 raw_spin_lock_irq(&curr->pi_lock);
499 } 503 }
500 spin_unlock_irq(&curr->pi_lock); 504 raw_spin_unlock_irq(&curr->pi_lock);
501} 505}
502 506
503static int 507static int
@@ -526,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
526 return -EINVAL; 530 return -EINVAL;
527 531
528 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
529 WARN_ON(pid && pi_state->owner && 533
530 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
531 552
532 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
533 *ps = pi_state; 554 *ps = pi_state;
@@ -552,7 +573,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
552 * change of the task flags, we do this protected by 573 * change of the task flags, we do this protected by
553 * p->pi_lock: 574 * p->pi_lock:
554 */ 575 */
555 spin_lock_irq(&p->pi_lock); 576 raw_spin_lock_irq(&p->pi_lock);
556 if (unlikely(p->flags & PF_EXITING)) { 577 if (unlikely(p->flags & PF_EXITING)) {
557 /* 578 /*
558 * The task is on the way out. When PF_EXITPIDONE is 579 * The task is on the way out. When PF_EXITPIDONE is
@@ -561,7 +582,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
561 */ 582 */
562 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 583 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
563 584
564 spin_unlock_irq(&p->pi_lock); 585 raw_spin_unlock_irq(&p->pi_lock);
565 put_task_struct(p); 586 put_task_struct(p);
566 return ret; 587 return ret;
567 } 588 }
@@ -580,7 +601,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
580 WARN_ON(!list_empty(&pi_state->list)); 601 WARN_ON(!list_empty(&pi_state->list));
581 list_add(&pi_state->list, &p->pi_state_list); 602 list_add(&pi_state->list, &p->pi_state_list);
582 pi_state->owner = p; 603 pi_state->owner = p;
583 spin_unlock_irq(&p->pi_lock); 604 raw_spin_unlock_irq(&p->pi_lock);
584 605
585 put_task_struct(p); 606 put_task_struct(p);
586 607
@@ -754,7 +775,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
754 if (!pi_state) 775 if (!pi_state)
755 return -EINVAL; 776 return -EINVAL;
756 777
757 spin_lock(&pi_state->pi_mutex.wait_lock); 778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
758 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
759 787
760 /* 788 /*
@@ -783,23 +811,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
783 else if (curval != uval) 811 else if (curval != uval)
784 ret = -EINVAL; 812 ret = -EINVAL;
785 if (ret) { 813 if (ret) {
786 spin_unlock(&pi_state->pi_mutex.wait_lock); 814 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
787 return ret; 815 return ret;
788 } 816 }
789 } 817 }
790 818
791 spin_lock_irq(&pi_state->owner->pi_lock); 819 raw_spin_lock_irq(&pi_state->owner->pi_lock);
792 WARN_ON(list_empty(&pi_state->list)); 820 WARN_ON(list_empty(&pi_state->list));
793 list_del_init(&pi_state->list); 821 list_del_init(&pi_state->list);
794 spin_unlock_irq(&pi_state->owner->pi_lock); 822 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
795 823
796 spin_lock_irq(&new_owner->pi_lock); 824 raw_spin_lock_irq(&new_owner->pi_lock);
797 WARN_ON(!list_empty(&pi_state->list)); 825 WARN_ON(!list_empty(&pi_state->list));
798 list_add(&pi_state->list, &new_owner->pi_state_list); 826 list_add(&pi_state->list, &new_owner->pi_state_list);
799 pi_state->owner = new_owner; 827 pi_state->owner = new_owner;
800 spin_unlock_irq(&new_owner->pi_lock); 828 raw_spin_unlock_irq(&new_owner->pi_lock);
801 829
802 spin_unlock(&pi_state->pi_mutex.wait_lock); 830 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
803 rt_mutex_unlock(&pi_state->pi_mutex); 831 rt_mutex_unlock(&pi_state->pi_mutex);
804 832
805 return 0; 833 return 0;
@@ -861,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
861 if (!bitset) 889 if (!bitset)
862 return -EINVAL; 890 return -EINVAL;
863 891
864 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); 892 ret = get_futex_key(uaddr, fshared, &key);
865 if (unlikely(ret != 0)) 893 if (unlikely(ret != 0))
866 goto out; 894 goto out;
867 895
@@ -907,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
907 int ret, op_ret; 935 int ret, op_ret;
908 936
909retry: 937retry:
910 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 938 ret = get_futex_key(uaddr1, fshared, &key1);
911 if (unlikely(ret != 0)) 939 if (unlikely(ret != 0))
912 goto out; 940 goto out;
913 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 941 ret = get_futex_key(uaddr2, fshared, &key2);
914 if (unlikely(ret != 0)) 942 if (unlikely(ret != 0))
915 goto out_put_key1; 943 goto out_put_key1;
916 944
@@ -1004,7 +1032,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1004 plist_add(&q->list, &hb2->chain); 1032 plist_add(&q->list, &hb2->chain);
1005 q->lock_ptr = &hb2->lock; 1033 q->lock_ptr = &hb2->lock;
1006#ifdef CONFIG_DEBUG_PI_LIST 1034#ifdef CONFIG_DEBUG_PI_LIST
1007 q->list.plist.lock = &hb2->lock; 1035 q->list.plist.spinlock = &hb2->lock;
1008#endif 1036#endif
1009 } 1037 }
1010 get_futex_key_refs(key2); 1038 get_futex_key_refs(key2);
@@ -1040,7 +1068,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1040 1068
1041 q->lock_ptr = &hb->lock; 1069 q->lock_ptr = &hb->lock;
1042#ifdef CONFIG_DEBUG_PI_LIST 1070#ifdef CONFIG_DEBUG_PI_LIST
1043 q->list.plist.lock = &hb->lock; 1071 q->list.plist.spinlock = &hb->lock;
1044#endif 1072#endif
1045 1073
1046 wake_up_state(q->task, TASK_NORMAL); 1074 wake_up_state(q->task, TASK_NORMAL);
@@ -1169,11 +1197,10 @@ retry:
1169 pi_state = NULL; 1197 pi_state = NULL;
1170 } 1198 }
1171 1199
1172 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1200 ret = get_futex_key(uaddr1, fshared, &key1);
1173 if (unlikely(ret != 0)) 1201 if (unlikely(ret != 0))
1174 goto out; 1202 goto out;
1175 ret = get_futex_key(uaddr2, fshared, &key2, 1203 ret = get_futex_key(uaddr2, fshared, &key2);
1176 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1177 if (unlikely(ret != 0)) 1204 if (unlikely(ret != 0))
1178 goto out_put_key1; 1205 goto out_put_key1;
1179 1206
@@ -1388,7 +1415,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1388 1415
1389 plist_node_init(&q->list, prio); 1416 plist_node_init(&q->list, prio);
1390#ifdef CONFIG_DEBUG_PI_LIST 1417#ifdef CONFIG_DEBUG_PI_LIST
1391 q->list.plist.lock = &hb->lock; 1418 q->list.plist.spinlock = &hb->lock;
1392#endif 1419#endif
1393 plist_add(&q->list, &hb->chain); 1420 plist_add(&q->list, &hb->chain);
1394 q->task = current; 1421 q->task = current;
@@ -1523,18 +1550,18 @@ retry:
1523 * itself. 1550 * itself.
1524 */ 1551 */
1525 if (pi_state->owner != NULL) { 1552 if (pi_state->owner != NULL) {
1526 spin_lock_irq(&pi_state->owner->pi_lock); 1553 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1527 WARN_ON(list_empty(&pi_state->list)); 1554 WARN_ON(list_empty(&pi_state->list));
1528 list_del_init(&pi_state->list); 1555 list_del_init(&pi_state->list);
1529 spin_unlock_irq(&pi_state->owner->pi_lock); 1556 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1530 } 1557 }
1531 1558
1532 pi_state->owner = newowner; 1559 pi_state->owner = newowner;
1533 1560
1534 spin_lock_irq(&newowner->pi_lock); 1561 raw_spin_lock_irq(&newowner->pi_lock);
1535 WARN_ON(!list_empty(&pi_state->list)); 1562 WARN_ON(!list_empty(&pi_state->list));
1536 list_add(&pi_state->list, &newowner->pi_state_list); 1563 list_add(&pi_state->list, &newowner->pi_state_list);
1537 spin_unlock_irq(&newowner->pi_lock); 1564 raw_spin_unlock_irq(&newowner->pi_lock);
1538 return 0; 1565 return 0;
1539 1566
1540 /* 1567 /*
@@ -1732,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1732 */ 1759 */
1733retry: 1760retry:
1734 q->key = FUTEX_KEY_INIT; 1761 q->key = FUTEX_KEY_INIT;
1735 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); 1762 ret = get_futex_key(uaddr, fshared, &q->key);
1736 if (unlikely(ret != 0)) 1763 if (unlikely(ret != 0))
1737 return ret; 1764 return ret;
1738 1765
@@ -1898,7 +1925,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1898 q.requeue_pi_key = NULL; 1925 q.requeue_pi_key = NULL;
1899retry: 1926retry:
1900 q.key = FUTEX_KEY_INIT; 1927 q.key = FUTEX_KEY_INIT;
1901 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1928 ret = get_futex_key(uaddr, fshared, &q.key);
1902 if (unlikely(ret != 0)) 1929 if (unlikely(ret != 0))
1903 goto out; 1930 goto out;
1904 1931
@@ -1968,7 +1995,7 @@ retry_private:
1968 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1969 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1970 1997
1971 goto out; 1998 goto out_put_key;
1972 1999
1973out_unlock_put_key: 2000out_unlock_put_key:
1974 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
@@ -2017,7 +2044,7 @@ retry:
2017 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2044 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2018 return -EPERM; 2045 return -EPERM;
2019 2046
2020 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); 2047 ret = get_futex_key(uaddr, fshared, &key);
2021 if (unlikely(ret != 0)) 2048 if (unlikely(ret != 0))
2022 goto out; 2049 goto out;
2023 2050
@@ -2209,7 +2236,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2209 rt_waiter.task = NULL; 2236 rt_waiter.task = NULL;
2210 2237
2211 key2 = FUTEX_KEY_INIT; 2238 key2 = FUTEX_KEY_INIT;
2212 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2239 ret = get_futex_key(uaddr2, fshared, &key2);
2213 if (unlikely(ret != 0)) 2240 if (unlikely(ret != 0))
2214 goto out; 2241 goto out;
2215 2242
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
146 struct task_struct *p; 146 struct task_struct *p;
147 147
148 ret = -ESRCH; 148 ret = -ESRCH;
149 read_lock(&tasklist_lock); 149 rcu_read_lock();
150 p = find_task_by_vpid(pid); 150 p = find_task_by_vpid(pid);
151 if (!p) 151 if (!p)
152 goto err_unlock; 152 goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
157 !capable(CAP_SYS_PTRACE)) 157 !capable(CAP_SYS_PTRACE))
158 goto err_unlock; 158 goto err_unlock;
159 head = p->compat_robust_list; 159 head = p->compat_robust_list;
160 read_unlock(&tasklist_lock); 160 rcu_read_unlock();
161 } 161 }
162 162
163 if (put_user(sizeof(*head), len_ptr)) 163 if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
165 return put_user(ptr_to_compat(head), head_ptr); 165 return put_user(ptr_to_compat(head), head_ptr);
166 166
167err_unlock: 167err_unlock:
168 read_unlock(&tasklist_lock); 168 rcu_read_unlock();
169 169
170 return ret; 170 return ret;
171} 171}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 7b19403900ad..02e5097bf319 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -129,11 +129,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
129 for (;;) { 129 for (;;) {
130 base = timer->base; 130 base = timer->base;
131 if (likely(base != NULL)) { 131 if (likely(base != NULL)) {
132 spin_lock_irqsave(&base->cpu_base->lock, *flags); 132 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
133 if (likely(base == timer->base)) 133 if (likely(base == timer->base))
134 return base; 134 return base;
135 /* The timer has migrated to another CPU: */ 135 /* The timer has migrated to another CPU: */
136 spin_unlock_irqrestore(&base->cpu_base->lock, *flags); 136 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
137 } 137 }
138 cpu_relax(); 138 cpu_relax();
139 } 139 }
@@ -210,13 +210,13 @@ again:
210 210
211 /* See the comment in lock_timer_base() */ 211 /* See the comment in lock_timer_base() */
212 timer->base = NULL; 212 timer->base = NULL;
213 spin_unlock(&base->cpu_base->lock); 213 raw_spin_unlock(&base->cpu_base->lock);
214 spin_lock(&new_base->cpu_base->lock); 214 raw_spin_lock(&new_base->cpu_base->lock);
215 215
216 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { 216 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
217 cpu = this_cpu; 217 cpu = this_cpu;
218 spin_unlock(&new_base->cpu_base->lock); 218 raw_spin_unlock(&new_base->cpu_base->lock);
219 spin_lock(&base->cpu_base->lock); 219 raw_spin_lock(&base->cpu_base->lock);
220 timer->base = base; 220 timer->base = base;
221 goto again; 221 goto again;
222 } 222 }
@@ -232,7 +232,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
232{ 232{
233 struct hrtimer_clock_base *base = timer->base; 233 struct hrtimer_clock_base *base = timer->base;
234 234
235 spin_lock_irqsave(&base->cpu_base->lock, *flags); 235 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
236 236
237 return base; 237 return base;
238} 238}
@@ -559,7 +559,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
559static int hrtimer_reprogram(struct hrtimer *timer, 559static int hrtimer_reprogram(struct hrtimer *timer,
560 struct hrtimer_clock_base *base) 560 struct hrtimer_clock_base *base)
561{ 561{
562 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 562 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
563 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 563 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
564 int res; 564 int res;
565 565
@@ -584,7 +584,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
584 if (expires.tv64 < 0) 584 if (expires.tv64 < 0)
585 return -ETIME; 585 return -ETIME;
586 586
587 if (expires.tv64 >= expires_next->tv64) 587 if (expires.tv64 >= cpu_base->expires_next.tv64)
588 return 0;
589
590 /*
591 * If a hang was detected in the last timer interrupt then we
592 * do not schedule a timer which is earlier than the expiry
593 * which we enforced in the hang detection. We want the system
594 * to make progress.
595 */
596 if (cpu_base->hang_detected)
588 return 0; 597 return 0;
589 598
590 /* 599 /*
@@ -592,7 +601,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
592 */ 601 */
593 res = tick_program_event(expires, 0); 602 res = tick_program_event(expires, 0);
594 if (!IS_ERR_VALUE(res)) 603 if (!IS_ERR_VALUE(res))
595 *expires_next = expires; 604 cpu_base->expires_next = expires;
596 return res; 605 return res;
597} 606}
598 607
@@ -621,12 +630,12 @@ static void retrigger_next_event(void *arg)
621 base = &__get_cpu_var(hrtimer_bases); 630 base = &__get_cpu_var(hrtimer_bases);
622 631
623 /* Adjust CLOCK_REALTIME offset */ 632 /* Adjust CLOCK_REALTIME offset */
624 spin_lock(&base->lock); 633 raw_spin_lock(&base->lock);
625 base->clock_base[CLOCK_REALTIME].offset = 634 base->clock_base[CLOCK_REALTIME].offset =
626 timespec_to_ktime(realtime_offset); 635 timespec_to_ktime(realtime_offset);
627 636
628 hrtimer_force_reprogram(base, 0); 637 hrtimer_force_reprogram(base, 0);
629 spin_unlock(&base->lock); 638 raw_spin_unlock(&base->lock);
630} 639}
631 640
632/* 641/*
@@ -687,9 +696,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
687{ 696{
688 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 697 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
689 if (wakeup) { 698 if (wakeup) {
690 spin_unlock(&base->cpu_base->lock); 699 raw_spin_unlock(&base->cpu_base->lock);
691 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 700 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
692 spin_lock(&base->cpu_base->lock); 701 raw_spin_lock(&base->cpu_base->lock);
693 } else 702 } else
694 __raise_softirq_irqoff(HRTIMER_SOFTIRQ); 703 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
695 704
@@ -749,17 +758,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
749 758
750#endif /* CONFIG_HIGH_RES_TIMERS */ 759#endif /* CONFIG_HIGH_RES_TIMERS */
751 760
752#ifdef CONFIG_TIMER_STATS 761static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
753void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
754{ 762{
763#ifdef CONFIG_TIMER_STATS
755 if (timer->start_site) 764 if (timer->start_site)
756 return; 765 return;
757 766 timer->start_site = __builtin_return_address(0);
758 timer->start_site = addr;
759 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 767 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
760 timer->start_pid = current->pid; 768 timer->start_pid = current->pid;
769#endif
761} 770}
771
772static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
773{
774#ifdef CONFIG_TIMER_STATS
775 timer->start_site = NULL;
776#endif
777}
778
779static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
780{
781#ifdef CONFIG_TIMER_STATS
782 if (likely(!timer_stats_active))
783 return;
784 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
785 timer->function, timer->start_comm, 0);
762#endif 786#endif
787}
763 788
764/* 789/*
765 * Counterpart to lock_hrtimer_base above: 790 * Counterpart to lock_hrtimer_base above:
@@ -767,7 +792,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
767static inline 792static inline
768void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 793void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
769{ 794{
770 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); 795 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
771} 796}
772 797
773/** 798/**
@@ -1027,9 +1052,9 @@ void hrtimer_pull(void)
1027 struct hrtimer_start_on_info *info; 1052 struct hrtimer_start_on_info *info;
1028 struct list_head *pos, *safe, list; 1053 struct list_head *pos, *safe, list;
1029 1054
1030 spin_lock(&base->lock); 1055 raw_spin_lock(&base->lock);
1031 list_replace_init(&base->to_pull, &list); 1056 list_replace_init(&base->to_pull, &list);
1032 spin_unlock(&base->lock); 1057 raw_spin_unlock(&base->lock);
1033 1058
1034 list_for_each_safe(pos, safe, &list) { 1059 list_for_each_safe(pos, safe, &list) {
1035 info = list_entry(pos, struct hrtimer_start_on_info, list); 1060 info = list_entry(pos, struct hrtimer_start_on_info, list);
@@ -1083,10 +1108,10 @@ int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info,
1083 } else { 1108 } else {
1084 TRACE("hrtimer_start_on: pulling to remote CPU\n"); 1109 TRACE("hrtimer_start_on: pulling to remote CPU\n");
1085 base = &per_cpu(hrtimer_bases, cpu); 1110 base = &per_cpu(hrtimer_bases, cpu);
1086 spin_lock_irqsave(&base->lock, flags); 1111 raw_spin_lock_irqsave(&base->lock, flags);
1087 was_empty = list_empty(&base->to_pull); 1112 was_empty = list_empty(&base->to_pull);
1088 list_add(&info->list, &base->to_pull); 1113 list_add(&info->list, &base->to_pull);
1089 spin_unlock_irqrestore(&base->lock, flags); 1114 raw_spin_unlock_irqrestore(&base->lock, flags);
1090 if (was_empty) 1115 if (was_empty)
1091 /* only send IPI if other no else 1116 /* only send IPI if other no else
1092 * has done so already 1117 * has done so already
@@ -1179,7 +1204,7 @@ ktime_t hrtimer_get_next_event(void)
1179 unsigned long flags; 1204 unsigned long flags;
1180 int i; 1205 int i;
1181 1206
1182 spin_lock_irqsave(&cpu_base->lock, flags); 1207 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1183 1208
1184 if (!hrtimer_hres_active()) { 1209 if (!hrtimer_hres_active()) {
1185 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1210 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
@@ -1196,7 +1221,7 @@ ktime_t hrtimer_get_next_event(void)
1196 } 1221 }
1197 } 1222 }
1198 1223
1199 spin_unlock_irqrestore(&cpu_base->lock, flags); 1224 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1200 1225
1201 if (mindelta.tv64 < 0) 1226 if (mindelta.tv64 < 0)
1202 mindelta.tv64 = 0; 1227 mindelta.tv64 = 0;
@@ -1278,11 +1303,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1278 * they get migrated to another cpu, therefore its safe to unlock 1303 * they get migrated to another cpu, therefore its safe to unlock
1279 * the timer base. 1304 * the timer base.
1280 */ 1305 */
1281 spin_unlock(&cpu_base->lock); 1306 raw_spin_unlock(&cpu_base->lock);
1282 trace_hrtimer_expire_entry(timer, now); 1307 trace_hrtimer_expire_entry(timer, now);
1283 restart = fn(timer); 1308 restart = fn(timer);
1284 trace_hrtimer_expire_exit(timer); 1309 trace_hrtimer_expire_exit(timer);
1285 spin_lock(&cpu_base->lock); 1310 raw_spin_lock(&cpu_base->lock);
1286 1311
1287 /* 1312 /*
1288 * Note: We clear the CALLBACK bit after enqueue_hrtimer and 1313 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
@@ -1298,29 +1323,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1298 1323
1299#ifdef CONFIG_HIGH_RES_TIMERS 1324#ifdef CONFIG_HIGH_RES_TIMERS
1300 1325
1301static int force_clock_reprogram;
1302
1303/*
1304 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1305 * is hanging, which could happen with something that slows the interrupt
1306 * such as the tracing. Then we force the clock reprogramming for each future
1307 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1308 * threshold that we will overwrite.
1309 * The next tick event will be scheduled to 3 times we currently spend on
1310 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1311 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1312 * let it running without serious starvation.
1313 */
1314
1315static inline void
1316hrtimer_interrupt_hanging(struct clock_event_device *dev,
1317 ktime_t try_time)
1318{
1319 force_clock_reprogram = 1;
1320 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1321 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1322 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1323}
1324/* 1326/*
1325 * High resolution timer interrupt 1327 * High resolution timer interrupt
1326 * Called with interrupts disabled 1328 * Called with interrupts disabled
@@ -1329,24 +1331,18 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1329{ 1331{
1330 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1332 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1331 struct hrtimer_clock_base *base; 1333 struct hrtimer_clock_base *base;
1332 ktime_t expires_next, now; 1334 ktime_t expires_next, now, entry_time, delta;
1333 int nr_retries = 0; 1335 int i, retries = 0;
1334 int i;
1335 1336
1336 BUG_ON(!cpu_base->hres_active); 1337 BUG_ON(!cpu_base->hres_active);
1337 cpu_base->nr_events++; 1338 cpu_base->nr_events++;
1338 dev->next_event.tv64 = KTIME_MAX; 1339 dev->next_event.tv64 = KTIME_MAX;
1339 1340
1340 retry: 1341 entry_time = now = ktime_get();
1341 /* 5 retries is enough to notice a hang */ 1342retry:
1342 if (!(++nr_retries % 5))
1343 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1344
1345 now = ktime_get();
1346
1347 expires_next.tv64 = KTIME_MAX; 1343 expires_next.tv64 = KTIME_MAX;
1348 1344
1349 spin_lock(&cpu_base->lock); 1345 raw_spin_lock(&cpu_base->lock);
1350 /* 1346 /*
1351 * We set expires_next to KTIME_MAX here with cpu_base->lock 1347 * We set expires_next to KTIME_MAX here with cpu_base->lock
1352 * held to prevent that a timer is enqueued in our queue via 1348 * held to prevent that a timer is enqueued in our queue via
@@ -1402,13 +1398,51 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1402 * against it. 1398 * against it.
1403 */ 1399 */
1404 cpu_base->expires_next = expires_next; 1400 cpu_base->expires_next = expires_next;
1405 spin_unlock(&cpu_base->lock); 1401 raw_spin_unlock(&cpu_base->lock);
1406 1402
1407 /* Reprogramming necessary ? */ 1403 /* Reprogramming necessary ? */
1408 if (expires_next.tv64 != KTIME_MAX) { 1404 if (expires_next.tv64 == KTIME_MAX ||
1409 if (tick_program_event(expires_next, force_clock_reprogram)) 1405 !tick_program_event(expires_next, 0)) {
1410 goto retry; 1406 cpu_base->hang_detected = 0;
1407 return;
1411 } 1408 }
1409
1410 /*
1411 * The next timer was already expired due to:
1412 * - tracing
1413 * - long lasting callbacks
1414 * - being scheduled away when running in a VM
1415 *
1416 * We need to prevent that we loop forever in the hrtimer
1417 * interrupt routine. We give it 3 attempts to avoid
1418 * overreacting on some spurious event.
1419 */
1420 now = ktime_get();
1421 cpu_base->nr_retries++;
1422 if (++retries < 3)
1423 goto retry;
1424 /*
1425 * Give the system a chance to do something else than looping
1426 * here. We stored the entry time, so we know exactly how long
1427 * we spent here. We schedule the next event this amount of
1428 * time away.
1429 */
1430 cpu_base->nr_hangs++;
1431 cpu_base->hang_detected = 1;
1432 delta = ktime_sub(now, entry_time);
1433 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1434 cpu_base->max_hang_time = delta;
1435 /*
1436 * Limit it to a sensible value as we enforce a longer
1437 * delay. Give the CPU at least 100ms to catch up.
1438 */
1439 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1440 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1441 else
1442 expires_next = ktime_add(now, delta);
1443 tick_program_event(expires_next, 1);
1444 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1445 ktime_to_ns(delta));
1412} 1446}
1413 1447
1414/* 1448/*
@@ -1504,7 +1538,7 @@ void hrtimer_run_queues(void)
1504 gettime = 0; 1538 gettime = 0;
1505 } 1539 }
1506 1540
1507 spin_lock(&cpu_base->lock); 1541 raw_spin_lock(&cpu_base->lock);
1508 1542
1509 while ((node = base->first)) { 1543 while ((node = base->first)) {
1510 struct hrtimer *timer; 1544 struct hrtimer *timer;
@@ -1516,7 +1550,7 @@ void hrtimer_run_queues(void)
1516 1550
1517 __run_hrtimer(timer, &base->softirq_time); 1551 __run_hrtimer(timer, &base->softirq_time);
1518 } 1552 }
1519 spin_unlock(&cpu_base->lock); 1553 raw_spin_unlock(&cpu_base->lock);
1520 } 1554 }
1521} 1555}
1522 1556
@@ -1672,7 +1706,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1672 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1706 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1673 int i; 1707 int i;
1674 1708
1675 spin_lock_init(&cpu_base->lock); 1709 raw_spin_lock_init(&cpu_base->lock);
1676 1710
1677 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1711 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1678 cpu_base->clock_base[i].cpu_base = cpu_base; 1712 cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1731,16 +1765,16 @@ static void migrate_hrtimers(int scpu)
1731 * The caller is globally serialized and nobody else 1765 * The caller is globally serialized and nobody else
1732 * takes two locks at once, deadlock is not possible. 1766 * takes two locks at once, deadlock is not possible.
1733 */ 1767 */
1734 spin_lock(&new_base->lock); 1768 raw_spin_lock(&new_base->lock);
1735 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1769 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1736 1770
1737 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1771 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1738 migrate_hrtimer_list(&old_base->clock_base[i], 1772 migrate_hrtimer_list(&old_base->clock_base[i],
1739 &new_base->clock_base[i]); 1773 &new_base->clock_base[i]);
1740 } 1774 }
1741 1775
1742 spin_unlock(&old_base->lock); 1776 raw_spin_unlock(&old_base->lock);
1743 spin_unlock(&new_base->lock); 1777 raw_spin_unlock(&new_base->lock);
1744 1778
1745 /* Check, if we got expired work to do */ 1779 /* Check, if we got expired work to do */
1746 __hrtimer_peek_ahead_timers(); 1780 __hrtimer_peek_ahead_timers();
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d4e841747400..0c642d51aac2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
144 144
145 rcu_read_lock(); 145 rcu_read_lock();
146 do_each_thread(g, t) { 146 do_each_thread(g, t) {
147 if (!--max_count) 147 if (!max_count--)
148 goto unlock; 148 goto unlock;
149 if (!--batch_count) { 149 if (!--batch_count) {
150 batch_count = HUNG_TASK_BATCHING; 150 batch_count = HUNG_TASK_BATCHING;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..03808ed342a6
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,492 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) IBM Corporation, 2009
18 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Thanks to Ingo Molnar for his many suggestions.
21 *
22 * Authors: Alan Stern <stern@rowland.harvard.edu>
23 * K.Prasad <prasad@linux.vnet.ibm.com>
24 * Frederic Weisbecker <fweisbec@gmail.com>
25 */
26
27/*
28 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
29 * using the CPU's debug registers.
30 * This file contains the arch-independent routines.
31 */
32
33#include <linux/irqflags.h>
34#include <linux/kallsyms.h>
35#include <linux/notifier.h>
36#include <linux/kprobes.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/percpu.h>
41#include <linux/sched.h>
42#include <linux/init.h>
43#include <linux/cpu.h>
44#include <linux/smp.h>
45
46#include <linux/hw_breakpoint.h>
47
48/*
49 * Constraints data
50 */
51
52/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
54
55/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
57
58/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
60
61/* Pinned and non-pinned bp totals, as filled in by fetch_bp_busy_slots() */
62struct bp_busy_slots {
63 unsigned int pinned; /* pinned cpu breakpoints plus pinned task breakpoints */
64 unsigned int flexible; /* non-pinned breakpoints, read from nr_bp_flexible */
65};
66
67/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex);
69
70/*
71 * Report the maximum number of pinned breakpoints that a single task
72 * has on this cpu (0 if every nr_task_bp_pinned entry is zero)
73 */
74static unsigned int max_task_bp_pinned(int cpu)
75{
76 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
78
 /* Scan downwards: the highest non-zero index i means the answer is i + 1 */
79 for (i = HBP_NUM -1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0)
81 return i + 1;
82 }
83
84 return 0;
85}
86
/*
 * Count the events of type PERF_TYPE_BREAKPOINT found in the event list
 * of the task's perf context. The list is walked with ctx->lock held and
 * interrupts saved/disabled. Returns 0 (after a one-time warning) if the
 * task has no perf context.
 */
87static int task_bp_pinned(struct task_struct *tsk)
88{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list;
91 struct perf_event *bp;
92 unsigned long flags;
93 int count = 0;
94
95 if (WARN_ONCE(!ctx, "No perf context for this task"))
96 return 0;
97
98 list = &ctx->event_list;
99
100 raw_spin_lock_irqsave(&ctx->lock, flags);
101
102 /*
103 * The current breakpoint counter is not included in the list
104 * at the open() callback time
105 */
106 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++;
109 }
110
111 raw_spin_unlock_irqrestore(&ctx->lock, flags);
112
113 return count;
114}
115
116/*
117 * Report the number of pinned/un-pinned breakpoints we have in
118 * a given cpu (bp->cpu > -1) or in all of them (bp->cpu = -1).
119 */
120static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
122{
123 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task;
125
 /* Single cpu: overwrite both fields with that cpu's numbers */
126 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
 /* No task: use the worst per-task count on this cpu, else this task's own count */
128 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu);
130 else
131 slots->pinned += task_bp_pinned(tsk);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu);
133
134 return;
135 }
136
 /*
  * Every cpu: keep the maximum seen over the online cpus. The fields are
  * only ever raised here, so this path relies on the caller having
  * initialized *slots (the caller in this file passes a zeroed struct).
  */
137 for_each_online_cpu(cpu) {
138 unsigned int nr;
139
140 nr = per_cpu(nr_cpu_bp_pinned, cpu);
141 if (!tsk)
142 nr += max_task_bp_pinned(cpu);
143 else
144 nr += task_bp_pinned(tsk);
145
146 if (nr > slots->pinned)
147 slots->pinned = nr;
148
149 nr = per_cpu(nr_bp_flexible, cpu);
150
151 if (nr > slots->flexible)
152 slots->flexible = nr;
153 }
154}
155
156/*
157 * Add (enable) or remove (!enable) a pinned breakpoint for the given task in our constraint table
158 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
160{
161 unsigned int *tsk_pinned;
162 int count = 0;
163
 /* Number of breakpoint events currently in the task's perf context list */
164 count = task_bp_pinned(tsk);
165
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
 /*
  * Move the task between adjacent entries of the per-cpu table: entry
  * [count] is adjusted one way and, when count > 0, entry [count-1] the
  * other way.
  * NOTE(review): count is used as an index without a bounds check here;
  * presumably callers guarantee count < HBP_NUM - confirm.
  */
167 if (enable) {
168 tsk_pinned[count]++;
169 if (count > 0)
170 tsk_pinned[count-1]--;
171 } else {
172 tsk_pinned[count]--;
173 if (count > 0)
174 tsk_pinned[count-1]++;
175 }
176}
177
178/*
179 * Add/remove the given breakpoint in our constraint table
180 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable)
182{
183 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task;
185
186 /* Pinned counter task profiling */
187 if (tsk) {
 /* Bound to one cpu: only that cpu's task table changes */
188 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable);
190 return;
191 }
192
 /* Not bound to a cpu: account the task on every online cpu */
193 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable);
195 return;
196 }
197
198 /* Pinned counter cpu profiling */
199 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
201 else
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
203}
204
205/*
206 * Constraints to check before allowing this new breakpoint counter:
207 *
208 * == Non-pinned counter == (Considered as pinned for now)
209 *
210 * - If attached to a single cpu, check:
211 *
212 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
213 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
214 *
215 * -> If there are already non-pinned counters in this cpu, it means
216 * there is already a free slot for them.
217 * Otherwise, we check that the maximum number of per task
218 * breakpoints (for this cpu) plus the number of per cpu breakpoints
219 * (for this cpu) doesn't cover every register.
220 *
221 * - If attached to every cpu, check:
222 *
223 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
224 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
225 *
226 * -> This is roughly the same, except we check the number of per cpu
227 * bp for every cpu and we keep the max one. Same for the per task
228 * breakpoints.
229 *
230 *
231 * == Pinned counter ==
232 *
233 * - If attached to a single cpu, check:
234 *
235 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
236 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
237 *
238 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
239 * one register at least (or they will never be fed).
240 *
241 * - If attached to every cpu, check:
242 *
243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
 *
 * Returns 0 after accounting the breakpoint, or -ENOSPC if no slot is left.
 * This function takes no lock itself.
 *
 * NOTE(review): the formulas above write "nr_bp_flexible > 1" while the
 * code below reserves a slot as soon as the flexible count is non-zero
 * (!!slots.flexible) - confirm which one is intended.
245 */
246static int __reserve_bp_slot(struct perf_event *bp)
247{
248 struct bp_busy_slots slots = {0};
249
250 fetch_bp_busy_slots(&slots, bp);
251
252 /* Flexible counters need to keep at least one slot */
 /*
  * NOTE(review): this is an equality test, so a total already above
  * HBP_NUM would not be rejected; presumably the accounting never lets
  * that happen - confirm.
  */
253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
254 return -ENOSPC;
255
256 toggle_bp_slot(bp, true);
257
258 return 0;
259}
260
/*
 * Reserve a slot for the given breakpoint with nr_bp_mutex held.
 * Returns the result of __reserve_bp_slot(): 0 on success or -ENOSPC.
 */
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
269 mutex_unlock(&nr_bp_mutex);
270
271 return ret;
272}
273
/* Remove the breakpoint from the constraint table; takes no lock itself */
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
/* Release the slot held by the given breakpoint, with nr_bp_mutex held */
279void release_bp_slot(struct perf_event *bp)
280{
281 mutex_lock(&nr_bp_mutex);
282
283 __release_bp_slot(bp);
284
285 mutex_unlock(&nr_bp_mutex);
286}
287
288/*
289 * Allow the kernel debugger to reserve and release breakpoint slots
290 * without taking a lock, using the dbg_* variants of the reserve and
291 * release functions. They return -1 if nr_bp_mutex is currently locked.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
 /* Someone holds the mutex: give up instead of waiting */
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
/*
 * Lockless release of a breakpoint slot. Returns -1 without doing
 * anything if nr_bp_mutex is currently locked, 0 once the slot has
 * been released.
 */
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
307
308 return 0;
309}
310
/*
 * Reserve a slot for the breakpoint and, unless it is a disabled event
 * with an overflow handler, validate its settings with
 * arch_validate_hwbkpt_settings(). The slot is released again if the
 * validation fails. Returns 0 on success, otherwise the error from the
 * reservation or the validation.
 */
311int register_perf_hw_breakpoint(struct perf_event *bp)
312{
313 int ret;
314
315 ret = reserve_bp_slot(bp);
316 if (ret)
317 return ret;
318
319 /*
320 * Ptrace breakpoints can be temporary perf events only
321 * meant to reserve a slot. In this case, it is created disabled and
322 * we don't want to check the params right now (as we put a null addr)
323 * But perf tools create events as disabled and we want to check
324 * the params for them.
325 * This is a quick hack that will be removed soon, once we remove
326 * the tmp breakpoints from ptrace
327 */
328 if (!bp->attr.disabled || !bp->overflow_handler)
329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
330
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret)
333 release_bp_slot(bp);
334
335 return ret;
336}
337
338/**
339 * register_user_hw_breakpoint - register a hardware breakpoint for user space
340 * @attr: breakpoint attributes
341 * @triggered: callback to trigger when we hit the breakpoint
342 * @tsk: pointer to 'task_struct' of the process to which the address belongs
 *
 * The event is created through perf_event_create_kernel_counter() with
 * cpu -1 and the pid of @tsk; that function's return value is passed
 * back unchanged.
343 */
344struct perf_event *
345register_user_hw_breakpoint(struct perf_event_attr *attr,
346 perf_overflow_handler_t triggered,
347 struct task_struct *tsk)
348{
349 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
350}
351EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
352
353/**
354 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
355 * @bp: the breakpoint structure to modify
356 * @attr: new breakpoint attributes
357 *
358 * Returns 0 on success, or the error from arch_validate_hwbkpt_settings()
 * after the previous address, type and length have been restored.
359 */
360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
361{
 /* Remember the current settings so they can be restored on failure */
362 u64 old_addr = bp->attr.bp_addr;
363 u64 old_len = bp->attr.bp_len;
364 int old_type = bp->attr.bp_type;
365 int err = 0;
366
 /* The event stays disabled while its attributes are rewritten */
367 perf_event_disable(bp);
368
369 bp->attr.bp_addr = attr->bp_addr;
370 bp->attr.bp_type = attr->bp_type;
371 bp->attr.bp_len = attr->bp_len;
372
 /* New settings ask for a disabled event: skip validation and leave it disabled */
373 if (attr->disabled)
374 goto end;
375
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
377 if (!err)
378 perf_event_enable(bp);
379
380 if (err) {
 /* Roll back, and re-enable only if the event was enabled before the call */
381 bp->attr.bp_addr = old_addr;
382 bp->attr.bp_type = old_type;
383 bp->attr.bp_len = old_len;
384 if (!bp->attr.disabled)
385 perf_event_enable(bp);
386
387 return err;
388 }
389
390end:
391 bp->attr.disabled = attr->disabled;
392
393 return 0;
394}
395EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
396
397/**
398 * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
399 * @bp: the breakpoint structure to unregister; a NULL pointer is ignored
400 */
401void unregister_hw_breakpoint(struct perf_event *bp)
402{
403 if (!bp)
404 return;
405 perf_event_release_kernel(bp);
406}
407EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
408
409/**
410 * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
411 * @attr: breakpoint attributes
412 * @triggered: callback to trigger when we hit the breakpoint
413 *
414 * @return a set of per_cpu pointers to perf events, or an ERR_PTR() value on failure
415 */
416struct perf_event * __percpu *
417register_wide_hw_breakpoint(struct perf_event_attr *attr,
418 perf_overflow_handler_t triggered)
419{
420 struct perf_event * __percpu *cpu_events, **pevent, *bp;
421 long err;
422 int cpu;
423
424 cpu_events = alloc_percpu(typeof(*cpu_events));
425 if (!cpu_events)
426 return (void __percpu __force *)ERR_PTR(-ENOMEM);
427
 /* Create one event per online cpu, between get_online_cpus() and put_online_cpus() */
428 get_online_cpus();
429 for_each_online_cpu(cpu) {
430 pevent = per_cpu_ptr(cpu_events, cpu);
431 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
432
 /* Stored even when it is an error pointer: the fail path uses it as its stop marker */
433 *pevent = bp;
434
435 if (IS_ERR(bp)) {
436 err = PTR_ERR(bp);
437 goto fail;
438 }
439 }
440 put_online_cpus();
441
442 return cpu_events;
443
444fail:
 /* Undo the events created so far, stopping at the slot holding the error pointer */
445 for_each_online_cpu(cpu) {
446 pevent = per_cpu_ptr(cpu_events, cpu);
447 if (IS_ERR(*pevent))
448 break;
449 unregister_hw_breakpoint(*pevent);
450 }
451 put_online_cpus();
452
453 free_percpu(cpu_events);
454 return (void __percpu __force *)ERR_PTR(err);
455}
456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
457
458/**
459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
460 * @cpu_events: the per cpu set of events to unregister
461 */
462void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
463{
464 int cpu;
465 struct perf_event **pevent;
466
 /*
  * Walks every possible cpu, not only the online ones; slots holding
  * NULL are skipped by unregister_hw_breakpoint().
  * NOTE(review): presumably slots never filled at registration time
  * are NULL (zeroed per-cpu allocation) - confirm.
  */
467 for_each_possible_cpu(cpu) {
468 pevent = per_cpu_ptr(cpu_events, cpu);
469 unregister_hw_breakpoint(*pevent);
470 }
471 free_percpu(cpu_events);
472}
473EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
474
/* Notifier block registered on the die notifier chain by init_hw_breakpoint() */
475static struct notifier_block hw_breakpoint_exceptions_nb = {
476 .notifier_call = hw_breakpoint_exceptions_notify,
477 /* we need to be notified first */
478 .priority = 0x7fffffff
479};
480
/*
 * Core initcall: register hw_breakpoint_exceptions_nb as a die notifier
 * and return the result of register_die_notifier().
 */
481static int __init init_hw_breakpoint(void)
482{
483 return register_die_notifier(&hw_breakpoint_exceptions_nb);
484}
485core_initcall(init_hw_breakpoint);
486
487
/* pmu callbacks for breakpoint events: enable, disable and read hooks */
488struct pmu perf_ops_bp = {
489 .enable = arch_install_hw_breakpoint,
490 .disable = arch_uninstall_hw_breakpoint,
491 .read = hw_breakpoint_pmu_read,
492};
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 1de9700f416e..2295a31ef110 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -45,7 +45,7 @@ unsigned long probe_irq_on(void)
45 * flush such a longstanding irq before considering it as spurious. 45 * flush such a longstanding irq before considering it as spurious.
46 */ 46 */
47 for_each_irq_desc_reverse(i, desc) { 47 for_each_irq_desc_reverse(i, desc) {
48 spin_lock_irq(&desc->lock); 48 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
50 /* 50 /*
51 * An old-style architecture might still have 51 * An old-style architecture might still have
@@ -61,7 +61,7 @@ unsigned long probe_irq_on(void)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->chip->set_type(i, IRQ_TYPE_PROBE);
62 desc->chip->startup(i); 62 desc->chip->startup(i);
63 } 63 }
64 spin_unlock_irq(&desc->lock); 64 raw_spin_unlock_irq(&desc->lock);
65 } 65 }
66 66
67 /* Wait for longstanding interrupts to trigger. */ 67 /* Wait for longstanding interrupts to trigger. */
@@ -73,13 +73,13 @@ unsigned long probe_irq_on(void)
73 * happened in the previous stage, it may have masked itself) 73 * happened in the previous stage, it may have masked itself)
74 */ 74 */
75 for_each_irq_desc_reverse(i, desc) { 75 for_each_irq_desc_reverse(i, desc) {
76 spin_lock_irq(&desc->lock); 76 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 79 if (desc->chip->startup(i))
80 desc->status |= IRQ_PENDING; 80 desc->status |= IRQ_PENDING;
81 } 81 }
82 spin_unlock_irq(&desc->lock); 82 raw_spin_unlock_irq(&desc->lock);
83 } 83 }
84 84
85 /* 85 /*
@@ -91,7 +91,7 @@ unsigned long probe_irq_on(void)
91 * Now filter out any obviously spurious interrupts 91 * Now filter out any obviously spurious interrupts
92 */ 92 */
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 spin_lock_irq(&desc->lock); 94 raw_spin_lock_irq(&desc->lock);
95 status = desc->status; 95 status = desc->status;
96 96
97 if (status & IRQ_AUTODETECT) { 97 if (status & IRQ_AUTODETECT) {
@@ -103,7 +103,7 @@ unsigned long probe_irq_on(void)
103 if (i < 32) 103 if (i < 32)
104 mask |= 1 << i; 104 mask |= 1 << i;
105 } 105 }
106 spin_unlock_irq(&desc->lock); 106 raw_spin_unlock_irq(&desc->lock);
107 } 107 }
108 108
109 return mask; 109 return mask;
@@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val)
129 int i; 129 int i;
130 130
131 for_each_irq_desc(i, desc) { 131 for_each_irq_desc(i, desc) {
132 spin_lock_irq(&desc->lock); 132 raw_spin_lock_irq(&desc->lock);
133 status = desc->status; 133 status = desc->status;
134 134
135 if (status & IRQ_AUTODETECT) { 135 if (status & IRQ_AUTODETECT) {
@@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val)
139 desc->status = status & ~IRQ_AUTODETECT; 139 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 140 desc->chip->shutdown(i);
141 } 141 }
142 spin_unlock_irq(&desc->lock); 142 raw_spin_unlock_irq(&desc->lock);
143 } 143 }
144 mutex_unlock(&probing_active); 144 mutex_unlock(&probing_active);
145 145
@@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val)
171 unsigned int status; 171 unsigned int status;
172 172
173 for_each_irq_desc(i, desc) { 173 for_each_irq_desc(i, desc) {
174 spin_lock_irq(&desc->lock); 174 raw_spin_lock_irq(&desc->lock);
175 status = desc->status; 175 status = desc->status;
176 176
177 if (status & IRQ_AUTODETECT) { 177 if (status & IRQ_AUTODETECT) {
@@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val)
183 desc->status = status & ~IRQ_AUTODETECT; 183 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 184 desc->chip->shutdown(i);
185 } 185 }
186 spin_unlock_irq(&desc->lock); 186 raw_spin_unlock_irq(&desc->lock);
187 } 187 }
188 mutex_unlock(&probing_active); 188 mutex_unlock(&probing_active);
189 189
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d115..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -34,14 +30,15 @@ void dynamic_irq_init(unsigned int irq)
34 } 30 }
35 31
36 /* Ensure we don't have left over values from a previous use of this irq */ 32 /* Ensure we don't have left over values from a previous use of this irq */
37 spin_lock_irqsave(&desc->lock, flags); 33 raw_spin_lock_irqsave(&desc->lock, flags);
38 desc->status = IRQ_DISABLED; 34 desc->status = IRQ_DISABLED;
39 desc->chip = &no_irq_chip; 35 desc->chip = &no_irq_chip;
40 desc->handle_irq = handle_bad_irq; 36 desc->handle_irq = handle_bad_irq;
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -51,14 +48,30 @@ void dynamic_irq_init(unsigned int irq)
51 cpumask_clear(desc->pending_mask); 48 cpumask_clear(desc->pending_mask);
52#endif 49#endif
53#endif 50#endif
54 spin_unlock_irqrestore(&desc->lock, flags); 51 raw_spin_unlock_irqrestore(&desc->lock, flags);
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -68,21 +81,42 @@ void dynamic_irq_cleanup(unsigned int irq)
68 return; 81 return;
69 } 82 }
70 83
71 spin_lock_irqsave(&desc->lock, flags); 84 raw_spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 85 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 86 raw_spin_unlock_irqrestore(&desc->lock, flags);
74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", 87 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 88 irq);
76 return; 89 return;
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
84 clear_kstat_irqs(desc); 98 clear_kstat_irqs(desc);
85 spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
100}
101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to clean up
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to clean up
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
86} 120}
87 121
88 122
@@ -104,10 +138,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
104 if (!chip) 138 if (!chip)
105 chip = &no_irq_chip; 139 chip = &no_irq_chip;
106 140
107 spin_lock_irqsave(&desc->lock, flags); 141 raw_spin_lock_irqsave(&desc->lock, flags);
108 irq_chip_set_defaults(chip); 142 irq_chip_set_defaults(chip);
109 desc->chip = chip; 143 desc->chip = chip;
110 spin_unlock_irqrestore(&desc->lock, flags); 144 raw_spin_unlock_irqrestore(&desc->lock, flags);
111 145
112 return 0; 146 return 0;
113} 147}
@@ -133,9 +167,9 @@ int set_irq_type(unsigned int irq, unsigned int type)
133 if (type == IRQ_TYPE_NONE) 167 if (type == IRQ_TYPE_NONE)
134 return 0; 168 return 0;
135 169
136 spin_lock_irqsave(&desc->lock, flags); 170 raw_spin_lock_irqsave(&desc->lock, flags);
137 ret = __irq_set_trigger(desc, irq, type); 171 ret = __irq_set_trigger(desc, irq, type);
138 spin_unlock_irqrestore(&desc->lock, flags); 172 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return ret; 173 return ret;
140} 174}
141EXPORT_SYMBOL(set_irq_type); 175EXPORT_SYMBOL(set_irq_type);
@@ -158,19 +192,19 @@ int set_irq_data(unsigned int irq, void *data)
158 return -EINVAL; 192 return -EINVAL;
159 } 193 }
160 194
161 spin_lock_irqsave(&desc->lock, flags); 195 raw_spin_lock_irqsave(&desc->lock, flags);
162 desc->handler_data = data; 196 desc->handler_data = data;
163 spin_unlock_irqrestore(&desc->lock, flags); 197 raw_spin_unlock_irqrestore(&desc->lock, flags);
164 return 0; 198 return 0;
165} 199}
166EXPORT_SYMBOL(set_irq_data); 200EXPORT_SYMBOL(set_irq_data);
167 201
168/** 202/**
169 * set_irq_data - set irq type data for an irq 203 * set_irq_msi - set MSI descriptor data for an irq
170 * @irq: Interrupt number 204 * @irq: Interrupt number
171 * @entry: Pointer to MSI descriptor data 205 * @entry: Pointer to MSI descriptor data
172 * 206 *
173 * Set the hardware irq controller data for an irq 207 * Set the MSI descriptor entry for an irq
174 */ 208 */
175int set_irq_msi(unsigned int irq, struct msi_desc *entry) 209int set_irq_msi(unsigned int irq, struct msi_desc *entry)
176{ 210{
@@ -183,11 +217,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
183 return -EINVAL; 217 return -EINVAL;
184 } 218 }
185 219
186 spin_lock_irqsave(&desc->lock, flags); 220 raw_spin_lock_irqsave(&desc->lock, flags);
187 desc->msi_desc = entry; 221 desc->msi_desc = entry;
188 if (entry) 222 if (entry)
189 entry->irq = irq; 223 entry->irq = irq;
190 spin_unlock_irqrestore(&desc->lock, flags); 224 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return 0; 225 return 0;
192} 226}
193 227
@@ -214,9 +248,9 @@ int set_irq_chip_data(unsigned int irq, void *data)
214 return -EINVAL; 248 return -EINVAL;
215 } 249 }
216 250
217 spin_lock_irqsave(&desc->lock, flags); 251 raw_spin_lock_irqsave(&desc->lock, flags);
218 desc->chip_data = data; 252 desc->chip_data = data;
219 spin_unlock_irqrestore(&desc->lock, flags); 253 raw_spin_unlock_irqrestore(&desc->lock, flags);
220 254
221 return 0; 255 return 0;
222} 256}
@@ -241,12 +275,12 @@ void set_irq_nested_thread(unsigned int irq, int nest)
241 if (!desc) 275 if (!desc)
242 return; 276 return;
243 277
244 spin_lock_irqsave(&desc->lock, flags); 278 raw_spin_lock_irqsave(&desc->lock, flags);
245 if (nest) 279 if (nest)
246 desc->status |= IRQ_NESTED_THREAD; 280 desc->status |= IRQ_NESTED_THREAD;
247 else 281 else
248 desc->status &= ~IRQ_NESTED_THREAD; 282 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags); 283 raw_spin_unlock_irqrestore(&desc->lock, flags);
250} 284}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread); 285EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252 286
@@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
325 if (desc->chip->ack) 359 if (desc->chip->ack)
326 desc->chip->ack(irq); 360 desc->chip->ack(irq);
327 } 361 }
362 desc->status |= IRQ_MASKED;
363}
364
/*
 * Mask the irq through the chip's mask callback and record IRQ_MASKED in
 * desc->status. Nothing is done, and the flag is left untouched, when the
 * chip provides no mask callback.
 */
365static inline void mask_irq(struct irq_desc *desc, int irq)
366{
367 if (desc->chip->mask) {
368 desc->chip->mask(irq);
369 desc->status |= IRQ_MASKED;
370 }
371}
372
/*
 * Unmask the irq through the chip's unmask callback and clear IRQ_MASKED
 * in desc->status. Nothing is done, and the flag is left untouched, when
 * the chip provides no unmask callback.
 */
373static inline void unmask_irq(struct irq_desc *desc, int irq)
374{
375 if (desc->chip->unmask) {
376 desc->chip->unmask(irq);
377 desc->status &= ~IRQ_MASKED;
378 }
328} 379}
329 380
330/* 381/*
@@ -343,7 +394,7 @@ void handle_nested_irq(unsigned int irq)
343 394
344 might_sleep(); 395 might_sleep();
345 396
346 spin_lock_irq(&desc->lock); 397 raw_spin_lock_irq(&desc->lock);
347 398
348 kstat_incr_irqs_this_cpu(irq, desc); 399 kstat_incr_irqs_this_cpu(irq, desc);
349 400
@@ -352,17 +403,17 @@ void handle_nested_irq(unsigned int irq)
352 goto out_unlock; 403 goto out_unlock;
353 404
354 desc->status |= IRQ_INPROGRESS; 405 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock); 406 raw_spin_unlock_irq(&desc->lock);
356 407
357 action_ret = action->thread_fn(action->irq, action->dev_id); 408 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug) 409 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret); 410 note_interrupt(irq, desc, action_ret);
360 411
361 spin_lock_irq(&desc->lock); 412 raw_spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS; 413 desc->status &= ~IRQ_INPROGRESS;
363 414
364out_unlock: 415out_unlock:
365 spin_unlock_irq(&desc->lock); 416 raw_spin_unlock_irq(&desc->lock);
366} 417}
367EXPORT_SYMBOL_GPL(handle_nested_irq); 418EXPORT_SYMBOL_GPL(handle_nested_irq);
368 419
@@ -384,7 +435,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
384 struct irqaction *action; 435 struct irqaction *action;
385 irqreturn_t action_ret; 436 irqreturn_t action_ret;
386 437
387 spin_lock(&desc->lock); 438 raw_spin_lock(&desc->lock);
388 439
389 if (unlikely(desc->status & IRQ_INPROGRESS)) 440 if (unlikely(desc->status & IRQ_INPROGRESS))
390 goto out_unlock; 441 goto out_unlock;
@@ -396,16 +447,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
396 goto out_unlock; 447 goto out_unlock;
397 448
398 desc->status |= IRQ_INPROGRESS; 449 desc->status |= IRQ_INPROGRESS;
399 spin_unlock(&desc->lock); 450 raw_spin_unlock(&desc->lock);
400 451
401 action_ret = handle_IRQ_event(irq, action); 452 action_ret = handle_IRQ_event(irq, action);
402 if (!noirqdebug) 453 if (!noirqdebug)
403 note_interrupt(irq, desc, action_ret); 454 note_interrupt(irq, desc, action_ret);
404 455
405 spin_lock(&desc->lock); 456 raw_spin_lock(&desc->lock);
406 desc->status &= ~IRQ_INPROGRESS; 457 desc->status &= ~IRQ_INPROGRESS;
407out_unlock: 458out_unlock:
408 spin_unlock(&desc->lock); 459 raw_spin_unlock(&desc->lock);
409} 460}
410 461
411/** 462/**
@@ -424,7 +475,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
424 struct irqaction *action; 475 struct irqaction *action;
425 irqreturn_t action_ret; 476 irqreturn_t action_ret;
426 477
427 spin_lock(&desc->lock); 478 raw_spin_lock(&desc->lock);
428 mask_ack_irq(desc, irq); 479 mask_ack_irq(desc, irq);
429 480
430 if (unlikely(desc->status & IRQ_INPROGRESS)) 481 if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -441,21 +492,19 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
441 goto out_unlock; 492 goto out_unlock;
442 493
443 desc->status |= IRQ_INPROGRESS; 494 desc->status |= IRQ_INPROGRESS;
444 spin_unlock(&desc->lock); 495 raw_spin_unlock(&desc->lock);
445 496
446 action_ret = handle_IRQ_event(irq, action); 497 action_ret = handle_IRQ_event(irq, action);
447 if (!noirqdebug) 498 if (!noirqdebug)
448 note_interrupt(irq, desc, action_ret); 499 note_interrupt(irq, desc, action_ret);
449 500
450 spin_lock(&desc->lock); 501 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 502 desc->status &= ~IRQ_INPROGRESS;
452 503
453 if (unlikely(desc->status & IRQ_ONESHOT)) 504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
454 desc->status |= IRQ_MASKED; 505 unmask_irq(desc, irq);
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq);
457out_unlock: 506out_unlock:
458 spin_unlock(&desc->lock); 507 raw_spin_unlock(&desc->lock);
459} 508}
460EXPORT_SYMBOL_GPL(handle_level_irq); 509EXPORT_SYMBOL_GPL(handle_level_irq);
461 510
@@ -475,7 +524,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
475 struct irqaction *action; 524 struct irqaction *action;
476 irqreturn_t action_ret; 525 irqreturn_t action_ret;
477 526
478 spin_lock(&desc->lock); 527 raw_spin_lock(&desc->lock);
479 528
480 if (unlikely(desc->status & IRQ_INPROGRESS)) 529 if (unlikely(desc->status & IRQ_INPROGRESS))
481 goto out; 530 goto out;
@@ -490,25 +539,24 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
490 action = desc->action; 539 action = desc->action;
491 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
492 desc->status |= IRQ_PENDING; 541 desc->status |= IRQ_PENDING;
493 if (desc->chip->mask) 542 mask_irq(desc, irq);
494 desc->chip->mask(irq);
495 goto out; 543 goto out;
496 } 544 }
497 545
498 desc->status |= IRQ_INPROGRESS; 546 desc->status |= IRQ_INPROGRESS;
499 desc->status &= ~IRQ_PENDING; 547 desc->status &= ~IRQ_PENDING;
500 spin_unlock(&desc->lock); 548 raw_spin_unlock(&desc->lock);
501 549
502 action_ret = handle_IRQ_event(irq, action); 550 action_ret = handle_IRQ_event(irq, action);
503 if (!noirqdebug) 551 if (!noirqdebug)
504 note_interrupt(irq, desc, action_ret); 552 note_interrupt(irq, desc, action_ret);
505 553
506 spin_lock(&desc->lock); 554 raw_spin_lock(&desc->lock);
507 desc->status &= ~IRQ_INPROGRESS; 555 desc->status &= ~IRQ_INPROGRESS;
508out: 556out:
509 desc->chip->eoi(irq); 557 desc->chip->eoi(irq);
510 558
511 spin_unlock(&desc->lock); 559 raw_spin_unlock(&desc->lock);
512} 560}
513 561
514/** 562/**
@@ -520,7 +568,7 @@ out:
520 * signal. The occurence is latched into the irq controller hardware 568 * signal. The occurence is latched into the irq controller hardware
521 * and must be acked in order to be reenabled. After the ack another 569 * and must be acked in order to be reenabled. After the ack another
522 * interrupt can happen on the same source even before the first one 570 * interrupt can happen on the same source even before the first one
523 * is handled by the assosiacted event handler. If this happens it 571 * is handled by the associated event handler. If this happens it
524 * might be necessary to disable (mask) the interrupt depending on the 572 * might be necessary to disable (mask) the interrupt depending on the
525 * controller hardware. This requires to reenable the interrupt inside 573 * controller hardware. This requires to reenable the interrupt inside
526 * of the loop which handles the interrupts which have arrived while 574 * of the loop which handles the interrupts which have arrived while
@@ -530,7 +578,7 @@ out:
530void 578void
531handle_edge_irq(unsigned int irq, struct irq_desc *desc) 579handle_edge_irq(unsigned int irq, struct irq_desc *desc)
532{ 580{
533 spin_lock(&desc->lock); 581 raw_spin_lock(&desc->lock);
534 582
535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 583 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
536 584
@@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
559 irqreturn_t action_ret; 607 irqreturn_t action_ret;
560 608
561 if (unlikely(!action)) { 609 if (unlikely(!action)) {
562 desc->chip->mask(irq); 610 mask_irq(desc, irq);
563 goto out_unlock; 611 goto out_unlock;
564 } 612 }
565 613
@@ -571,26 +619,25 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
571 if (unlikely((desc->status & 619 if (unlikely((desc->status &
572 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
573 (IRQ_PENDING | IRQ_MASKED))) { 621 (IRQ_PENDING | IRQ_MASKED))) {
574 desc->chip->unmask(irq); 622 unmask_irq(desc, irq);
575 desc->status &= ~IRQ_MASKED;
576 } 623 }
577 624
578 desc->status &= ~IRQ_PENDING; 625 desc->status &= ~IRQ_PENDING;
579 spin_unlock(&desc->lock); 626 raw_spin_unlock(&desc->lock);
580 action_ret = handle_IRQ_event(irq, action); 627 action_ret = handle_IRQ_event(irq, action);
581 if (!noirqdebug) 628 if (!noirqdebug)
582 note_interrupt(irq, desc, action_ret); 629 note_interrupt(irq, desc, action_ret);
583 spin_lock(&desc->lock); 630 raw_spin_lock(&desc->lock);
584 631
585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 632 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
586 633
587 desc->status &= ~IRQ_INPROGRESS; 634 desc->status &= ~IRQ_INPROGRESS;
588out_unlock: 635out_unlock:
589 spin_unlock(&desc->lock); 636 raw_spin_unlock(&desc->lock);
590} 637}
591 638
592/** 639/**
593 * handle_percpu_IRQ - Per CPU local irq handler 640 * handle_percpu_irq - Per CPU local irq handler
594 * @irq: the interrupt number 641 * @irq: the interrupt number
595 * @desc: the interrupt description structure for this irq 642 * @desc: the interrupt description structure for this irq
596 * 643 *
@@ -643,7 +690,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
643 } 690 }
644 691
645 chip_bus_lock(irq, desc); 692 chip_bus_lock(irq, desc);
646 spin_lock_irqsave(&desc->lock, flags); 693 raw_spin_lock_irqsave(&desc->lock, flags);
647 694
648 /* Uninstall? */ 695 /* Uninstall? */
649 if (handle == handle_bad_irq) { 696 if (handle == handle_bad_irq) {
@@ -661,7 +708,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
661 desc->depth = 0; 708 desc->depth = 0;
662 desc->chip->startup(irq); 709 desc->chip->startup(irq);
663 } 710 }
664 spin_unlock_irqrestore(&desc->lock, flags); 711 raw_spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc); 712 chip_bus_sync_unlock(irq, desc);
666} 713}
667EXPORT_SYMBOL_GPL(__set_irq_handler); 714EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
682 __set_irq_handler(irq, handle, 0, name); 729 __set_irq_handler(irq, handle, 0, name);
683} 730}
684 731
685void __init set_irq_noprobe(unsigned int irq) 732void set_irq_noprobe(unsigned int irq)
686{ 733{
687 struct irq_desc *desc = irq_to_desc(irq); 734 struct irq_desc *desc = irq_to_desc(irq);
688 unsigned long flags; 735 unsigned long flags;
@@ -692,12 +739,12 @@ void __init set_irq_noprobe(unsigned int irq)
692 return; 739 return;
693 } 740 }
694 741
695 spin_lock_irqsave(&desc->lock, flags); 742 raw_spin_lock_irqsave(&desc->lock, flags);
696 desc->status |= IRQ_NOPROBE; 743 desc->status |= IRQ_NOPROBE;
697 spin_unlock_irqrestore(&desc->lock, flags); 744 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 745}
699 746
700void __init set_irq_probe(unsigned int irq) 747void set_irq_probe(unsigned int irq)
701{ 748{
702 struct irq_desc *desc = irq_to_desc(irq); 749 struct irq_desc *desc = irq_to_desc(irq);
703 unsigned long flags; 750 unsigned long flags;
@@ -707,7 +754,7 @@ void __init set_irq_probe(unsigned int irq)
707 return; 754 return;
708 } 755 }
709 756
710 spin_lock_irqsave(&desc->lock, flags); 757 raw_spin_lock_irqsave(&desc->lock, flags);
711 desc->status &= ~IRQ_NOPROBE; 758 desc->status &= ~IRQ_NOPROBE;
712 spin_unlock_irqrestore(&desc->lock, flags); 759 raw_spin_unlock_irqrestore(&desc->lock, flags);
713} 760}
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
45 * separately, dev_free_irq() must be used. 45 * separately, devm_free_irq() must be used.
46 */ 46 */
47int devm_request_threaded_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
48 irq_handler_t handler, irq_handler_t thread_fn, 48 irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
81 * Except for the extra @dev argument, this function takes the 81 * Except for the extra @dev argument, this function takes the
82 * same arguments and performs the same function as free_irq(). 82 * same arguments and performs the same function as free_irq().
83 * This function instead of free_irq() should be used to manually 83 * This function instead of free_irq() should be used to manually
84 * free IRQs allocated with dev_request_irq(). 84 * free IRQs allocated with devm_request_irq().
85 */ 85 */
86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) 86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 17c71bb565c6..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -80,19 +80,15 @@ static struct irq_desc irq_desc_init = {
80 .chip = &no_irq_chip, 80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq, 81 .handle_irq = handle_bad_irq,
82 .depth = 1, 82 .depth = 1,
83 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84}; 84};
85 85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) 86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -108,7 +104,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
108{ 104{
109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
110 106
111 spin_lock_init(&desc->lock); 107 raw_spin_lock_init(&desc->lock);
112 desc->irq = irq; 108 desc->irq = irq;
113#ifdef CONFIG_SMP 109#ifdef CONFIG_SMP
114 desc->node = node; 110 desc->node = node;
@@ -130,9 +126,28 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
130/* 126/*
131 * Protect the sparse_irqs: 127 * Protect the sparse_irqs:
132 */ 128 */
133DEFINE_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -141,7 +156,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
141 .chip = &no_irq_chip, 156 .chip = &no_irq_chip,
142 .handle_irq = handle_bad_irq, 157 .handle_irq = handle_bad_irq,
143 .depth = 1, 158 .depth = 1,
144 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 159 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
145 } 160 }
146}; 161};
147 162
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,10 +229,10 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
238 236
239 return desc; 237 return desc;
240} 238}
@@ -247,7 +245,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
247 .chip = &no_irq_chip, 245 .chip = &no_irq_chip,
248 .handle_irq = handle_bad_irq, 246 .handle_irq = handle_bad_irq,
249 .depth = 1, 247 .depth = 1,
250 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 248 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
251 } 249 }
252}; 250};
253 251
@@ -473,7 +471,7 @@ unsigned int __do_IRQ(unsigned int irq)
473 return 1; 471 return 1;
474 } 472 }
475 473
476 spin_lock(&desc->lock); 474 raw_spin_lock(&desc->lock);
477 if (desc->chip->ack) 475 if (desc->chip->ack)
478 desc->chip->ack(irq); 476 desc->chip->ack(irq);
479 /* 477 /*
@@ -517,13 +515,13 @@ unsigned int __do_IRQ(unsigned int irq)
517 for (;;) { 515 for (;;) {
518 irqreturn_t action_ret; 516 irqreturn_t action_ret;
519 517
520 spin_unlock(&desc->lock); 518 raw_spin_unlock(&desc->lock);
521 519
522 action_ret = handle_IRQ_event(irq, action); 520 action_ret = handle_IRQ_event(irq, action);
523 if (!noirqdebug) 521 if (!noirqdebug)
524 note_interrupt(irq, desc, action_ret); 522 note_interrupt(irq, desc, action_ret);
525 523
526 spin_lock(&desc->lock); 524 raw_spin_lock(&desc->lock);
527 if (likely(!(desc->status & IRQ_PENDING))) 525 if (likely(!(desc->status & IRQ_PENDING)))
528 break; 526 break;
529 desc->status &= ~IRQ_PENDING; 527 desc->status &= ~IRQ_PENDING;
@@ -536,7 +534,7 @@ out:
536 * disabled while the handler was running. 534 * disabled while the handler was running.
537 */ 535 */
538 desc->chip->end(irq); 536 desc->chip->end(irq);
539 spin_unlock(&desc->lock); 537 raw_spin_unlock(&desc->lock);
540 538
541 return 1; 539 return 1;
542} 540}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 1b5d742c6a77..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,14 +18,10 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24d..704e488730a5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq)
46 cpu_relax(); 46 cpu_relax();
47 47
48 /* Ok, that indicated we're done: double-check carefully. */ 48 /* Ok, that indicated we're done: double-check carefully. */
49 spin_lock_irqsave(&desc->lock, flags); 49 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 50 status = desc->status;
51 spin_unlock_irqrestore(&desc->lock, flags); 51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 52
53 /* Oops, that failed? */ 53 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 54 } while (status & IRQ_INPROGRESS);
@@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
114 if (!desc->chip->set_affinity) 114 if (!desc->chip->set_affinity)
115 return -EINVAL; 115 return -EINVAL;
116 116
117 spin_lock_irqsave(&desc->lock, flags); 117 raw_spin_lock_irqsave(&desc->lock, flags);
118 118
119#ifdef CONFIG_GENERIC_PENDING_IRQ 119#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
@@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
134 } 134 }
135#endif 135#endif
136 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
137 spin_unlock_irqrestore(&desc->lock, flags); 137 raw_spin_unlock_irqrestore(&desc->lock, flags);
138 return 0; 138 return 0;
139} 139}
140 140
@@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq)
181 unsigned long flags; 181 unsigned long flags;
182 int ret; 182 int ret;
183 183
184 spin_lock_irqsave(&desc->lock, flags); 184 raw_spin_lock_irqsave(&desc->lock, flags);
185 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
186 if (!ret) 186 if (!ret)
187 irq_set_thread_affinity(desc); 187 irq_set_thread_affinity(desc);
188 spin_unlock_irqrestore(&desc->lock, flags); 188 raw_spin_unlock_irqrestore(&desc->lock, flags);
189 189
190 return ret; 190 return ret;
191} 191}
@@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc); 233 chip_bus_lock(irq, desc);
234 spin_lock_irqsave(&desc->lock, flags); 234 raw_spin_lock_irqsave(&desc->lock, flags);
235 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
236 spin_unlock_irqrestore(&desc->lock, flags); 236 raw_spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc); 237 chip_bus_sync_unlock(irq, desc);
238} 238}
239EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
@@ -308,9 +308,9 @@ void enable_irq(unsigned int irq)
308 return; 308 return;
309 309
310 chip_bus_lock(irq, desc); 310 chip_bus_lock(irq, desc);
311 spin_lock_irqsave(&desc->lock, flags); 311 raw_spin_lock_irqsave(&desc->lock, flags);
312 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
313 spin_unlock_irqrestore(&desc->lock, flags); 313 raw_spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc); 314 chip_bus_sync_unlock(irq, desc);
315} 315}
316EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
@@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
347 /* wakeup-capable irqs can be shared between drivers that 347 /* wakeup-capable irqs can be shared between drivers that
348 * don't need to have the same sleep mode behaviors. 348 * don't need to have the same sleep mode behaviors.
349 */ 349 */
350 spin_lock_irqsave(&desc->lock, flags); 350 raw_spin_lock_irqsave(&desc->lock, flags);
351 if (on) { 351 if (on) {
352 if (desc->wake_depth++ == 0) { 352 if (desc->wake_depth++ == 0) {
353 ret = set_irq_wake_real(irq, on); 353 ret = set_irq_wake_real(irq, on);
@@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
368 } 368 }
369 } 369 }
370 370
371 spin_unlock_irqrestore(&desc->lock, flags); 371 raw_spin_unlock_irqrestore(&desc->lock, flags);
372 return ret; 372 return ret;
373} 373}
374EXPORT_SYMBOL(set_irq_wake); 374EXPORT_SYMBOL(set_irq_wake);
@@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
382{ 382{
383 struct irq_desc *desc = irq_to_desc(irq); 383 struct irq_desc *desc = irq_to_desc(irq);
384 struct irqaction *action; 384 struct irqaction *action;
385 unsigned long flags;
385 386
386 if (!desc) 387 if (!desc)
387 return 0; 388 return 0;
@@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
389 if (desc->status & IRQ_NOREQUEST) 390 if (desc->status & IRQ_NOREQUEST)
390 return 0; 391 return 0;
391 392
393 raw_spin_lock_irqsave(&desc->lock, flags);
392 action = desc->action; 394 action = desc->action;
393 if (action) 395 if (action)
394 if (irqflags & action->flags & IRQF_SHARED) 396 if (irqflags & action->flags & IRQF_SHARED)
395 action = NULL; 397 action = NULL;
396 398
399 raw_spin_unlock_irqrestore(&desc->lock, flags);
400
397 return !action; 401 return !action;
398} 402}
399 403
@@ -483,13 +487,31 @@ static int irq_wait_for_interrupt(struct irqaction *action)
483 */ 487 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 488static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 489{
490again:
486 chip_bus_lock(irq, desc); 491 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock); 492 raw_spin_lock_irq(&desc->lock);
493
494 /*
495 * Implausible though it may be we need to protect us against
496 * the following scenario:
497 *
498 * The thread is faster done than the hard interrupt handler
499 * on the other CPU. If we unmask the irq line then the
500 * interrupt can come in again and masks the line, leaves due
501 * to IRQ_INPROGRESS and the irq line is masked forever.
502 */
503 if (unlikely(desc->status & IRQ_INPROGRESS)) {
504 raw_spin_unlock_irq(&desc->lock);
505 chip_bus_sync_unlock(irq, desc);
506 cpu_relax();
507 goto again;
508 }
509
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 510 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 511 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 512 desc->chip->unmask(irq);
491 } 513 }
492 spin_unlock_irq(&desc->lock); 514 raw_spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc); 515 chip_bus_sync_unlock(irq, desc);
494} 516}
495 517
@@ -514,9 +536,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
514 return; 536 return;
515 } 537 }
516 538
517 spin_lock_irq(&desc->lock); 539 raw_spin_lock_irq(&desc->lock);
518 cpumask_copy(mask, desc->affinity); 540 cpumask_copy(mask, desc->affinity);
519 spin_unlock_irq(&desc->lock); 541 raw_spin_unlock_irq(&desc->lock);
520 542
521 set_cpus_allowed_ptr(current, mask); 543 set_cpus_allowed_ptr(current, mask);
522 free_cpumask_var(mask); 544 free_cpumask_var(mask);
@@ -545,7 +567,7 @@ static int irq_thread(void *data)
545 567
546 atomic_inc(&desc->threads_active); 568 atomic_inc(&desc->threads_active);
547 569
548 spin_lock_irq(&desc->lock); 570 raw_spin_lock_irq(&desc->lock);
549 if (unlikely(desc->status & IRQ_DISABLED)) { 571 if (unlikely(desc->status & IRQ_DISABLED)) {
550 /* 572 /*
551 * CHECKME: We might need a dedicated 573 * CHECKME: We might need a dedicated
@@ -555,9 +577,9 @@ static int irq_thread(void *data)
555 * retriggers the interrupt itself --- tglx 577 * retriggers the interrupt itself --- tglx
556 */ 578 */
557 desc->status |= IRQ_PENDING; 579 desc->status |= IRQ_PENDING;
558 spin_unlock_irq(&desc->lock); 580 raw_spin_unlock_irq(&desc->lock);
559 } else { 581 } else {
560 spin_unlock_irq(&desc->lock); 582 raw_spin_unlock_irq(&desc->lock);
561 583
562 action->thread_fn(action->irq, action->dev_id); 584 action->thread_fn(action->irq, action->dev_id);
563 585
@@ -679,7 +701,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
679 /* 701 /*
680 * The following block of code has to be executed atomically 702 * The following block of code has to be executed atomically
681 */ 703 */
682 spin_lock_irqsave(&desc->lock, flags); 704 raw_spin_lock_irqsave(&desc->lock, flags);
683 old_ptr = &desc->action; 705 old_ptr = &desc->action;
684 old = *old_ptr; 706 old = *old_ptr;
685 if (old) { 707 if (old) {
@@ -735,6 +757,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 if (new->flags & IRQF_ONESHOT) 757 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT; 758 desc->status |= IRQ_ONESHOT;
737 759
760 /*
761 * Force MSI interrupts to run with interrupts
762 * disabled. The multi vector cards can cause stack
763 * overflows due to nested interrupts when enough of
764 * them are directed to a core and fire at the same
765 * time.
766 */
767 if (desc->msi_desc)
768 new->flags |= IRQF_DISABLED;
769
738 if (!(desc->status & IRQ_NOAUTOEN)) { 770 if (!(desc->status & IRQ_NOAUTOEN)) {
739 desc->depth = 0; 771 desc->depth = 0;
740 desc->status &= ~IRQ_DISABLED; 772 desc->status &= ~IRQ_DISABLED;
@@ -775,7 +807,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
775 __enable_irq(desc, irq, false); 807 __enable_irq(desc, irq, false);
776 } 808 }
777 809
778 spin_unlock_irqrestore(&desc->lock, flags); 810 raw_spin_unlock_irqrestore(&desc->lock, flags);
779 811
780 /* 812 /*
781 * Strictly no need to wake it up, but hung_task complains 813 * Strictly no need to wake it up, but hung_task complains
@@ -802,7 +834,7 @@ mismatch:
802 ret = -EBUSY; 834 ret = -EBUSY;
803 835
804out_thread: 836out_thread:
805 spin_unlock_irqrestore(&desc->lock, flags); 837 raw_spin_unlock_irqrestore(&desc->lock, flags);
806 if (new->thread) { 838 if (new->thread) {
807 struct task_struct *t = new->thread; 839 struct task_struct *t = new->thread;
808 840
@@ -844,7 +876,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
844 if (!desc) 876 if (!desc)
845 return NULL; 877 return NULL;
846 878
847 spin_lock_irqsave(&desc->lock, flags); 879 raw_spin_lock_irqsave(&desc->lock, flags);
848 880
849 /* 881 /*
850 * There can be multiple actions per IRQ descriptor, find the right 882 * There can be multiple actions per IRQ descriptor, find the right
@@ -856,7 +888,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
856 888
857 if (!action) { 889 if (!action) {
858 WARN(1, "Trying to free already-free IRQ %d\n", irq); 890 WARN(1, "Trying to free already-free IRQ %d\n", irq);
859 spin_unlock_irqrestore(&desc->lock, flags); 891 raw_spin_unlock_irqrestore(&desc->lock, flags);
860 892
861 return NULL; 893 return NULL;
862 } 894 }
@@ -884,7 +916,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 916 desc->chip->disable(irq);
885 } 917 }
886 918
887 spin_unlock_irqrestore(&desc->lock, flags); 919 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 920
889 unregister_handler_proc(irq, action); 921 unregister_handler_proc(irq, action);
890 922
@@ -1067,7 +1099,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1067 kfree(action); 1099 kfree(action);
1068 1100
1069#ifdef CONFIG_DEBUG_SHIRQ 1101#ifdef CONFIG_DEBUG_SHIRQ
1070 if (irqflags & IRQF_SHARED) { 1102 if (!retval && (irqflags & IRQF_SHARED)) {
1071 /* 1103 /*
1072 * It's a shared IRQ -- the driver ought to be prepared for it 1104 * It's a shared IRQ -- the driver ought to be prepared for it
1073 * to happen immediately, so let's make sure.... 1105 * to happen immediately, so let's make sure....
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index fcb6c96f2627..241962280836 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -27,7 +27,7 @@ void move_masked_irq(int irq)
27 if (!desc->chip->set_affinity) 27 if (!desc->chip->set_affinity)
28 return; 28 return;
29 29
30 assert_spin_locked(&desc->lock); 30 assert_raw_spin_locked(&desc->lock);
31 31
32 /* 32 /*
33 * If there was a valid mask to work with, please 33 * If there was a valid mask to work with, please
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 3fd30197da2e..65d3845665ac 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -6,6 +6,7 @@
6 */ 6 */
7 7
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/slab.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/random.h> 11#include <linux/random.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
@@ -42,7 +43,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 "for migration.\n", irq); 43 "for migration.\n", irq);
43 return false; 44 return false;
44 } 45 }
45 spin_lock_init(&desc->lock); 46 raw_spin_lock_init(&desc->lock);
46 desc->node = node; 47 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 48 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); 49 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
@@ -67,10 +68,10 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
67 68
68 irq = old_desc->irq; 69 irq = old_desc->irq;
69 70
70 spin_lock_irqsave(&sparse_irq_lock, flags); 71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 72
72 /* We have to check it to avoid races with another CPU */ 73 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 74 desc = irq_to_desc(irq);
74 75
75 if (desc && old_desc != desc) 76 if (desc && old_desc != desc)
76 goto out_unlock; 77 goto out_unlock;
@@ -90,8 +91,8 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 91 goto out_unlock;
91 } 92 }
92 93
93 irq_desc_ptrs[irq] = desc; 94 replace_irq_desc(irq, desc);
94 spin_unlock_irqrestore(&sparse_irq_lock, flags); 95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 96
96 /* free the old one */ 97 /* free the old one */
97 free_one_irq_desc(old_desc, desc); 98 free_one_irq_desc(old_desc, desc);
@@ -100,7 +101,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
100 return desc; 101 return desc;
101 102
102out_unlock: 103out_unlock:
103 spin_unlock_irqrestore(&sparse_irq_lock, flags); 104 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
104 105
105 return desc; 106 return desc;
106} 107}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index a0bb09e79867..0d4005d85b03 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -28,9 +28,9 @@ void suspend_device_irqs(void)
28 for_each_irq_desc(irq, desc) { 28 for_each_irq_desc(irq, desc) {
29 unsigned long flags; 29 unsigned long flags;
30 30
31 spin_lock_irqsave(&desc->lock, flags); 31 raw_spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true); 32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags); 33 raw_spin_unlock_irqrestore(&desc->lock, flags);
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
@@ -56,9 +56,9 @@ void resume_device_irqs(void)
56 if (!(desc->status & IRQ_SUSPENDED)) 56 if (!(desc->status & IRQ_SUSPENDED))
57 continue; 57 continue;
58 58
59 spin_lock_irqsave(&desc->lock, flags); 59 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags); 61 raw_spin_unlock_irqrestore(&desc->lock, flags);
62 } 62 }
63} 63}
64EXPORT_SYMBOL_GPL(resume_device_irqs); 64EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 692363dd591f..7a6eb04ef6b5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/gfp.h>
10#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
11#include <linux/seq_file.h> 12#include <linux/seq_file.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
@@ -136,7 +137,7 @@ out:
136 137
137static int default_affinity_open(struct inode *inode, struct file *file) 138static int default_affinity_open(struct inode *inode, struct file *file)
138{ 139{
139 return single_open(file, default_affinity_show, NULL); 140 return single_open(file, default_affinity_show, PDE(inode)->data);
140} 141}
141 142
142static const struct file_operations default_affinity_proc_fops = { 143static const struct file_operations default_affinity_proc_fops = {
@@ -148,18 +149,28 @@ static const struct file_operations default_affinity_proc_fops = {
148}; 149};
149#endif 150#endif
150 151
151static int irq_spurious_read(char *page, char **start, off_t off, 152static int irq_spurious_proc_show(struct seq_file *m, void *v)
152 int count, int *eof, void *data)
153{ 153{
154 struct irq_desc *desc = irq_to_desc((long) data); 154 struct irq_desc *desc = irq_to_desc((long) m->private);
155 return sprintf(page, "count %u\n" 155
156 "unhandled %u\n" 156 seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
157 "last_unhandled %u ms\n", 157 desc->irq_count, desc->irqs_unhandled,
158 desc->irq_count, 158 jiffies_to_msecs(desc->last_unhandled));
159 desc->irqs_unhandled, 159 return 0;
160 jiffies_to_msecs(desc->last_unhandled)); 160}
161
162static int irq_spurious_proc_open(struct inode *inode, struct file *file)
163{
164 return single_open(file, irq_spurious_proc_show, NULL);
161} 165}
162 166
167static const struct file_operations irq_spurious_proc_fops = {
168 .open = irq_spurious_proc_open,
169 .read = seq_read,
170 .llseek = seq_lseek,
171 .release = single_release,
172};
173
163#define MAX_NAMELEN 128 174#define MAX_NAMELEN 128
164 175
165static int name_unique(unsigned int irq, struct irqaction *new_action) 176static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -169,7 +180,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
169 unsigned long flags; 180 unsigned long flags;
170 int ret = 1; 181 int ret = 1;
171 182
172 spin_lock_irqsave(&desc->lock, flags); 183 raw_spin_lock_irqsave(&desc->lock, flags);
173 for (action = desc->action ; action; action = action->next) { 184 for (action = desc->action ; action; action = action->next) {
174 if ((action != new_action) && action->name && 185 if ((action != new_action) && action->name &&
175 !strcmp(new_action->name, action->name)) { 186 !strcmp(new_action->name, action->name)) {
@@ -177,7 +188,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
177 break; 188 break;
178 } 189 }
179 } 190 }
180 spin_unlock_irqrestore(&desc->lock, flags); 191 raw_spin_unlock_irqrestore(&desc->lock, flags);
181 return ret; 192 return ret;
182} 193}
183 194
@@ -204,7 +215,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
204void register_irq_proc(unsigned int irq, struct irq_desc *desc) 215void register_irq_proc(unsigned int irq, struct irq_desc *desc)
205{ 216{
206 char name [MAX_NAMELEN]; 217 char name [MAX_NAMELEN];
207 struct proc_dir_entry *entry;
208 218
209 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 219 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
210 return; 220 return;
@@ -214,6 +224,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
214 224
215 /* create /proc/irq/1234 */ 225 /* create /proc/irq/1234 */
216 desc->dir = proc_mkdir(name, root_irq_dir); 226 desc->dir = proc_mkdir(name, root_irq_dir);
227 if (!desc->dir)
228 return;
217 229
218#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
219 /* create /proc/irq/<irq>/smp_affinity */ 231 /* create /proc/irq/<irq>/smp_affinity */
@@ -221,11 +233,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
221 &irq_affinity_proc_fops, (void *)(long)irq); 233 &irq_affinity_proc_fops, (void *)(long)irq);
222#endif 234#endif
223 235
224 entry = create_proc_entry("spurious", 0444, desc->dir); 236 proc_create_data("spurious", 0444, desc->dir,
225 if (entry) { 237 &irq_spurious_proc_fops, (void *)(long)irq);
226 entry->data = (void *)(long)irq;
227 entry->read_proc = irq_spurious_read;
228 }
229} 238}
230 239
231#undef MAX_NAMELEN 240#undef MAX_NAMELEN
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd7273e6282e..89fb90ae534f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
28 struct irqaction *action; 28 struct irqaction *action;
29 int ok = 0, work = 0; 29 int ok = 0, work = 0;
30 30
31 spin_lock(&desc->lock); 31 raw_spin_lock(&desc->lock);
32 /* Already running on another processor */ 32 /* Already running on another processor */
33 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
34 /* 34 /*
@@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc)
37 */ 37 */
38 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
39 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
40 spin_unlock(&desc->lock); 40 raw_spin_unlock(&desc->lock);
41 return ok; 41 return ok;
42 } 42 }
43 /* Honour the normal IRQ locking */ 43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS; 44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action; 45 action = desc->action;
46 spin_unlock(&desc->lock); 46 raw_spin_unlock(&desc->lock);
47 47
48 while (action) { 48 while (action) {
49 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
@@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
56 } 56 }
57 local_irq_disable(); 57 local_irq_disable();
58 /* Now clean up the flags */ 58 /* Now clean up the flags */
59 spin_lock(&desc->lock); 59 raw_spin_lock(&desc->lock);
60 action = desc->action; 60 action = desc->action;
61 61
62 /* 62 /*
@@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc)
68 * Perform real IRQ processing for the IRQ we deferred 68 * Perform real IRQ processing for the IRQ we deferred
69 */ 69 */
70 work = 1; 70 work = 1;
71 spin_unlock(&desc->lock); 71 raw_spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action); 72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock); 73 raw_spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING; 74 desc->status &= ~IRQ_PENDING;
75 } 75 }
76 desc->status &= ~IRQ_INPROGRESS; 76 desc->status &= ~IRQ_INPROGRESS;
@@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
80 */ 80 */
81 if (work && desc->chip && desc->chip->end) 81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq); 82 desc->chip->end(irq);
83 spin_unlock(&desc->lock); 83 raw_spin_unlock(&desc->lock);
84 84
85 return ok; 85 return ok;
86} 86}
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_all_shared_irqs(void) 107static void poll_spurious_irqs(unsigned long dummy)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -125,23 +125,11 @@ static void poll_all_shared_irqs(void)
125 try_one_irq(i, desc); 125 try_one_irq(i, desc);
126 local_irq_enable(); 126 local_irq_enable();
127 } 127 }
128}
129
130static void poll_spurious_irqs(unsigned long dummy)
131{
132 poll_all_shared_irqs();
133 128
134 mod_timer(&poll_spurious_irq_timer, 129 mod_timer(&poll_spurious_irq_timer,
135 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 130 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
136} 131}
137 132
138#ifdef CONFIG_DEBUG_SHIRQ
139void debug_poll_all_shared_irqs(void)
140{
141 poll_all_shared_irqs();
142}
143#endif
144
145/* 133/*
146 * If 99,900 of the previous 100,000 interrupts have not been handled 134 * If 99,900 of the previous 100,000 interrupts have not been handled
147 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 135 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -232,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
232 /* 220 /*
233 * If we are seeing only the odd spurious IRQ caused by 221 * If we are seeing only the odd spurious IRQ caused by
234 * bus asynchronicity then don't eventually trigger an error, 222 * bus asynchronicity then don't eventually trigger an error,
235 * otherwise the couter becomes a doomsday timer for otherwise 223 * otherwise the counter becomes a doomsday timer for otherwise
236 * working systems 224 * working systems
237 */ 225 */
238 if (time_after(jiffies, desc->last_unhandled + HZ/10)) 226 if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede528..d802883153da 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
146{ 146{
147 cputime_t cval, nval, cinterval, ninterval; 147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval; 148 s64 ns_ninterval, ns_nval;
149 u32 error, incr_error;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id]; 150 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150 151
151 nval = timeval_to_cputime(&value->it_value); 152 nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
153 ninterval = timeval_to_cputime(&value->it_interval); 154 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval); 155 ns_ninterval = timeval_to_ns(&value->it_interval);
155 156
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); 157 error = cputime_sub_ns(nval, ns_nval);
157 it->error = cputime_sub_ns(nval, ns_nval); 158 incr_error = cputime_sub_ns(ninterval, ns_ninterval);
158 159
159 spin_lock_irq(&tsk->sighand->siglock); 160 spin_lock_irq(&tsk->sighand->siglock);
160 161
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
168 } 169 }
169 it->expires = nval; 170 it->expires = nval;
170 it->incr = ninterval; 171 it->incr = ninterval;
172 it->error = error;
173 it->incr_error = incr_error;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ? 174 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval); 175 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173 176
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..13aff293f4de 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -21,6 +21,7 @@
21#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/ctype.h> 23#include <linux/ctype.h>
24#include <linux/slab.h>
24 25
25#include <asm/sections.h> 26#include <asm/sections.h>
26 27
@@ -181,6 +182,7 @@ unsigned long kallsyms_lookup_name(const char *name)
181 } 182 }
182 return module_kallsyms_lookup_name(name); 183 return module_kallsyms_lookup_name(name);
183} 184}
185EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
184 186
185int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, 187int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
186 unsigned long), 188 unsigned long),
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f336e2107f98..474a84715eac 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,7 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <linux/utsrelease.h> 24#include <generated/utsrelease.h>
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/numa.h> 26#include <linux/numa.h>
27#include <linux/suspend.h> 27#include <linux/suspend.h>
@@ -31,6 +31,8 @@
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
34 36
35#include <asm/page.h> 37#include <asm/page.h>
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
@@ -39,7 +41,7 @@
39#include <asm/sections.h> 41#include <asm/sections.h>
40 42
41/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
42note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
43 45
44/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
45static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
@@ -1073,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
1073 if (mutex_trylock(&kexec_mutex)) { 1075 if (mutex_trylock(&kexec_mutex)) {
1074 if (kexec_crash_image) { 1076 if (kexec_crash_image) {
1075 struct pt_regs fixed_regs; 1077 struct pt_regs fixed_regs;
1078
1079 kmsg_dump(KMSG_DUMP_KEXEC);
1080
1076 crash_setup_regs(&fixed_regs, regs); 1081 crash_setup_regs(&fixed_regs, regs);
1077 crash_save_vmcoreinfo(); 1082 crash_save_vmcoreinfo();
1078 machine_crash_shutdown(&fixed_regs); 1083 machine_crash_shutdown(&fixed_regs);
@@ -1082,6 +1087,62 @@ void crash_kexec(struct pt_regs *regs)
1082 } 1087 }
1083} 1088}
1084 1089
1090size_t crash_get_memory_size(void)
1091{
1092 size_t size;
1093 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex);
1096 return size;
1097}
1098
1099static void free_reserved_phys_range(unsigned long begin, unsigned long end)
1100{
1101 unsigned long addr;
1102
1103 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1104 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1105 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1106 free_page((unsigned long)__va(addr));
1107 totalram_pages++;
1108 }
1109}
1110
1111int crash_shrink_memory(unsigned long new_size)
1112{
1113 int ret = 0;
1114 unsigned long start, end;
1115
1116 mutex_lock(&kexec_mutex);
1117
1118 if (kexec_crash_image) {
1119 ret = -ENOENT;
1120 goto unlock;
1121 }
1122 start = crashk_res.start;
1123 end = crashk_res.end;
1124
1125 if (new_size >= end - start + 1) {
1126 ret = -EINVAL;
1127 if (new_size == end - start + 1)
1128 ret = 0;
1129 goto unlock;
1130 }
1131
1132 start = roundup(start, PAGE_SIZE);
1133 end = roundup(start + new_size, PAGE_SIZE);
1134
1135 free_reserved_phys_range(end, crashk_res.end);
1136
1137 if (start == end)
1138 release_resource(&crashk_res);
1139 crashk_res.end = end - 1;
1140
1141unlock:
1142 mutex_unlock(&kexec_mutex);
1143 return ret;
1144}
1145
1085static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1146static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1086 size_t data_len) 1147 size_t data_len)
1087{ 1148{
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 3765ff3c1bbe..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * A simple kernel FIFO implementation. 2 * A generic kernel FIFO implementation.
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
4 * Copyright (C) 2004 Stelian Pop <stelian@popies.net> 5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
5 * 6 *
6 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -25,50 +26,48 @@
25#include <linux/err.h> 26#include <linux/err.h>
26#include <linux/kfifo.h> 27#include <linux/kfifo.h>
27#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/uaccess.h>
30
31static void _kfifo_init(struct kfifo *fifo, void *buffer,
32 unsigned int size)
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
28 39
29/** 40/**
30 * kfifo_init - allocates a new FIFO using a preallocated buffer 41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
31 * @buffer: the preallocated buffer to be used. 43 * @buffer: the preallocated buffer to be used.
32 * @size: the size of the internal buffer, this have to be a power of 2. 44 * @size: the size of the internal buffer, this has to be a power of 2.
33 * @gfp_mask: get_free_pages mask, passed to kmalloc()
34 * @lock: the lock to be used to protect the fifo buffer
35 * 45 *
36 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
37 * &struct kfifo with kfree().
38 */ 46 */
39struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
40 gfp_t gfp_mask, spinlock_t *lock)
41{ 48{
42 struct kfifo *fifo;
43
44 /* size must be a power of 2 */ 49 /* size must be a power of 2 */
45 BUG_ON(!is_power_of_2(size)); 50 BUG_ON(!is_power_of_2(size));
46 51
47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 52 _kfifo_init(fifo, buffer, size);
48 if (!fifo)
49 return ERR_PTR(-ENOMEM);
50
51 fifo->buffer = buffer;
52 fifo->size = size;
53 fifo->in = fifo->out = 0;
54 fifo->lock = lock;
55
56 return fifo;
57} 53}
58EXPORT_SYMBOL(kfifo_init); 54EXPORT_SYMBOL(kfifo_init);
59 55
60/** 56/**
61 * kfifo_alloc - allocates a new FIFO and its internal buffer 57 * kfifo_alloc - allocates a new FIFO internal buffer
62 * @size: the size of the internal buffer to be allocated. 58 * @fifo: the fifo to assign then new buffer
59 * @size: the size of the buffer to be allocated, this have to be a power of 2.
63 * @gfp_mask: get_free_pages mask, passed to kmalloc() 60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
64 * @lock: the lock to be used to protect the fifo buffer 61 *
62 * This function dynamically allocates a new fifo internal buffer
65 * 63 *
66 * The size will be rounded-up to a power of 2. 64 * The size will be rounded-up to a power of 2.
65 * The buffer will be release with kfifo_free().
66 * Return 0 if no error, otherwise the an error code
67 */ 67 */
68struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) 68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 69{
70 unsigned char *buffer; 70 unsigned char *buffer;
71 struct kfifo *ret;
72 71
73 /* 72 /*
74 * round up to the next power of 2, since our 'let the indices 73 * round up to the next power of 2, since our 'let the indices
@@ -80,48 +79,93 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
80 } 79 }
81 80
82 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
83 if (!buffer) 82 if (!buffer) {
84 return ERR_PTR(-ENOMEM); 83 _kfifo_init(fifo, NULL, 0);
85 84 return -ENOMEM;
86 ret = kfifo_init(buffer, size, gfp_mask, lock); 85 }
87 86
88 if (IS_ERR(ret)) 87 _kfifo_init(fifo, buffer, size);
89 kfree(buffer);
90 88
91 return ret; 89 return 0;
92} 90}
93EXPORT_SYMBOL(kfifo_alloc); 91EXPORT_SYMBOL(kfifo_alloc);
94 92
95/** 93/**
96 * kfifo_free - frees the FIFO 94 * kfifo_free - frees the FIFO internal buffer
97 * @fifo: the fifo to be freed. 95 * @fifo: the fifo to be freed.
98 */ 96 */
99void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
100{ 98{
101 kfree(fifo->buffer); 99 kfree(fifo->buffer);
102 kfree(fifo); 100 _kfifo_init(fifo, NULL, 0);
103} 101}
104EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
105 103
106/** 104/**
107 * __kfifo_put - puts some data into the FIFO, no locking version 105 * kfifo_skip - skip output data
108 * @fifo: the fifo to be used. 106 * @fifo: the fifo to be used.
109 * @buffer: the data to be added. 107 * @len: number of bytes to skip
110 * @len: the length of the data to be added.
111 *
112 * This function copies at most @len bytes from the @buffer into
113 * the FIFO depending on the free space, and returns the number of
114 * bytes copied.
115 *
116 * Note that with only one concurrent reader and one concurrent
117 * writer, you don't need extra locking to use these functions.
118 */ 108 */
119unsigned int __kfifo_put(struct kfifo *fifo, 109void kfifo_skip(struct kfifo *fifo, unsigned int len)
120 const unsigned char *buffer, unsigned int len) 110{
111 if (len < kfifo_len(fifo)) {
112 __kfifo_add_out(fifo, len);
113 return;
114 }
115 kfifo_reset_out(fifo);
116}
117EXPORT_SYMBOL(kfifo_skip);
118
119static inline void __kfifo_in_data(struct kfifo *fifo,
120 const void *from, unsigned int len, unsigned int off)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
124 len = min(len, fifo->size - fifo->in + fifo->out); 124 /*
125 * Ensure that we sample the fifo->out index -before- we
126 * start putting bytes into the kfifo.
127 */
128
129 smp_mb();
130
131 off = __kfifo_off(fifo, fifo->in + off);
132
133 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + off, from, l);
136
137 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, from + l, len - l);
139}
140
141static inline void __kfifo_out_data(struct kfifo *fifo,
142 void *to, unsigned int len, unsigned int off)
143{
144 unsigned int l;
145
146 /*
147 * Ensure that we sample the fifo->in index -before- we
148 * start removing bytes from the kfifo.
149 */
150
151 smp_rmb();
152
153 off = __kfifo_off(fifo, fifo->out + off);
154
155 /* first get the data from fifo->out until the end of the buffer */
156 l = min(len, fifo->size - off);
157 memcpy(to, fifo->buffer + off, l);
158
159 /* then get the rest (if any) from the beginning of the buffer */
160 memcpy(to + l, fifo->buffer, len - l);
161}
162
163static inline int __kfifo_from_user_data(struct kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout)
166{
167 unsigned int l;
168 int ret;
125 169
126 /* 170 /*
127 * Ensure that we sample the fifo->out index -before- we 171 * Ensure that we sample the fifo->out index -before- we
@@ -130,68 +174,272 @@ unsigned int __kfifo_put(struct kfifo *fifo,
130 174
131 smp_mb(); 175 smp_mb();
132 176
177 off = __kfifo_off(fifo, fifo->in + off);
178
133 /* first put the data starting from fifo->in to buffer end */ 179 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); 180 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); 181 ret = copy_from_user(fifo->buffer + off, from, l);
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
136 187
137 /* then put the rest (if any) at the beginning of the buffer */ 188 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, buffer + l, len - l); 189 ret = copy_from_user(fifo->buffer, from + l, len - l);
190 *lenout += ret ? ret : len - l;
191 return ret ? -EFAULT : 0;
192}
193
194static inline int __kfifo_to_user_data(struct kfifo *fifo,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
196{
197 unsigned int l;
198 int ret;
139 199
140 /* 200 /*
141 * Ensure that we add the bytes to the kfifo -before- 201 * Ensure that we sample the fifo->in index -before- we
142 * we update the fifo->in index. 202 * start removing bytes from the kfifo.
143 */ 203 */
144 204
145 smp_wmb(); 205 smp_rmb();
206
207 off = __kfifo_off(fifo, fifo->out + off);
208
209 /* first get the data from fifo->out until the end of the buffer */
210 l = min(len, fifo->size - off);
211 ret = copy_to_user(to, fifo->buffer + off, l);
212 *lenout = l;
213 if (unlikely(ret)) {
214 *lenout -= ret;
215 return -EFAULT;
216 }
217
218 /* then get the rest (if any) from the beginning of the buffer */
219 len -= l;
220 ret = copy_to_user(to + l, fifo->buffer, len);
221 if (unlikely(ret)) {
222 *lenout += len - ret;
223 return -EFAULT;
224 }
225 *lenout += len;
226 return 0;
227}
228
229unsigned int __kfifo_in_n(struct kfifo *fifo,
230 const void *from, unsigned int len, unsigned int recsize)
231{
232 if (kfifo_avail(fifo) < len + recsize)
233 return len + 1;
234
235 __kfifo_in_data(fifo, from, len, recsize);
236 return 0;
237}
238EXPORT_SYMBOL(__kfifo_in_n);
146 239
147 fifo->in += len; 240/**
241 * kfifo_in - puts some data into the FIFO
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{
256 len = min(kfifo_avail(fifo), len);
148 257
258 __kfifo_in_data(fifo, from, len, 0);
259 __kfifo_add_in(fifo, len);
149 return len; 260 return len;
150} 261}
151EXPORT_SYMBOL(__kfifo_put); 262EXPORT_SYMBOL(kfifo_in);
263
264unsigned int __kfifo_in_generic(struct kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize)
266{
267 return __kfifo_in_rec(fifo, from, len, recsize);
268}
269EXPORT_SYMBOL(__kfifo_in_generic);
270
271unsigned int __kfifo_out_n(struct kfifo *fifo,
272 void *to, unsigned int len, unsigned int recsize)
273{
274 if (kfifo_len(fifo) < len + recsize)
275 return len;
276
277 __kfifo_out_data(fifo, to, len, recsize);
278 __kfifo_add_out(fifo, len + recsize);
279 return 0;
280}
281EXPORT_SYMBOL(__kfifo_out_n);
152 282
153/** 283/**
154 * __kfifo_get - gets some data from the FIFO, no locking version 284 * kfifo_out - gets some data from the FIFO
155 * @fifo: the fifo to be used. 285 * @fifo: the fifo to be used.
156 * @buffer: where the data must be copied. 286 * @to: where the data must be copied.
157 * @len: the size of the destination buffer. 287 * @len: the size of the destination buffer.
158 * 288 *
159 * This function copies at most @len bytes from the FIFO into the 289 * This function copies at most @len bytes from the FIFO into the
160 * @buffer and returns the number of copied bytes. 290 * @to buffer and returns the number of copied bytes.
161 * 291 *
162 * Note that with only one concurrent reader and one concurrent 292 * Note that with only one concurrent reader and one concurrent
163 * writer, you don't need extra locking to use these functions. 293 * writer, you don't need extra locking to use these functions.
164 */ 294 */
165unsigned int __kfifo_get(struct kfifo *fifo, 295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
166 unsigned char *buffer, unsigned int len)
167{ 296{
168 unsigned int l; 297 len = min(kfifo_len(fifo), len);
169 298
170 len = min(len, fifo->in - fifo->out); 299 __kfifo_out_data(fifo, to, len, 0);
300 __kfifo_add_out(fifo, len);
171 301
172 /* 302 return len;
173 * Ensure that we sample the fifo->in index -before- we 303}
174 * start removing bytes from the kfifo. 304EXPORT_SYMBOL(kfifo_out);
175 */
176 305
177 smp_rmb(); 306/**
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it
308 * @fifo: the fifo to be used.
309 * @to: where the data must be copied.
310 * @len: the size of the destination buffer.
311 * @offset: offset into the fifo
312 *
313 * This function copies at most @len bytes at @offset from the FIFO
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
318 unsigned offset)
319{
320 len = min(kfifo_len(fifo), len + offset);
178 321
179 /* first get the data from fifo->out until the end of the buffer */ 322 __kfifo_out_data(fifo, to, len, offset);
180 l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); 323 return len;
181 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); 324}
325EXPORT_SYMBOL(kfifo_out_peek);
182 326
183 /* then get the rest (if any) from the beginning of the buffer */ 327unsigned int __kfifo_out_generic(struct kfifo *fifo,
184 memcpy(buffer + l, fifo->buffer, len - l); 328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{
331 return __kfifo_out_rec(fifo, to, len, recsize, total);
332}
333EXPORT_SYMBOL(__kfifo_out_generic);
185 334
186 /* 335unsigned int __kfifo_from_user_n(struct kfifo *fifo,
187 * Ensure that we remove the bytes from the kfifo -before- 336 const void __user *from, unsigned int len, unsigned int recsize)
188 * we update the fifo->out index. 337{
189 */ 338 unsigned total;
190 339
191 smp_mb(); 340 if (kfifo_avail(fifo) < len + recsize)
341 return len + 1;
192 342
193 fifo->out += len; 343 __kfifo_from_user_data(fifo, from, len, recsize, &total);
344 return total;
345}
346EXPORT_SYMBOL(__kfifo_from_user_n);
194 347
195 return len; 348/**
349 * kfifo_from_user - puts some data from user space into the FIFO
350 * @fifo: the fifo to be used.
351 * @from: pointer to the data to be added.
352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
354 *
355 * This function copies at most @len bytes from the @from into the
356 * FIFO depending and returns -EFAULT/0.
357 *
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
196} 371}
197EXPORT_SYMBOL(__kfifo_get); 372EXPORT_SYMBOL(kfifo_from_user);
373
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
375 const void __user *from, unsigned int len, unsigned int recsize)
376{
377 return __kfifo_from_user_rec(fifo, from, len, recsize);
378}
379EXPORT_SYMBOL(__kfifo_from_user_generic);
380
381unsigned int __kfifo_to_user_n(struct kfifo *fifo,
382 void __user *to, unsigned int len, unsigned int reclen,
383 unsigned int recsize)
384{
385 unsigned int ret, total;
386
387 if (kfifo_len(fifo) < reclen + recsize)
388 return len;
389
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total);
391
392 if (likely(ret == 0))
393 __kfifo_add_out(fifo, reclen + recsize);
394
395 return total;
396}
397EXPORT_SYMBOL(__kfifo_to_user_n);
398
399/**
400 * kfifo_to_user - gets data from the FIFO and write it to user space
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{
415 int ret;
416 len = min(kfifo_len(fifo), len);
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout);
418 __kfifo_add_out(fifo, *lenout);
419 return ret;
420}
421EXPORT_SYMBOL(kfifo_to_user);
422
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
424 void __user *to, unsigned int len, unsigned int recsize,
425 unsigned int *total)
426{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total);
428}
429EXPORT_SYMBOL(__kfifo_to_user_generic);
430
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
432{
433 if (recsize == 0)
434 return kfifo_avail(fifo);
435
436 return __kfifo_peek_n(fifo, recsize);
437}
438EXPORT_SYMBOL(__kfifo_peek_generic);
439
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
441{
442 __kfifo_skip_rec(fifo, recsize);
443}
444EXPORT_SYMBOL(__kfifo_skip_generic);
445
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..11f3515ca83f 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -69,9 +69,16 @@ struct kgdb_state {
69 struct pt_regs *linux_regs; 69 struct pt_regs *linux_regs;
70}; 70};
71 71
72/* Exception state values */
73#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
74#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
75#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
76#define DCPU_SSTEP 0x8 /* CPU is single stepping */
77
72static struct debuggerinfo_struct { 78static struct debuggerinfo_struct {
73 void *debuggerinfo; 79 void *debuggerinfo;
74 struct task_struct *task; 80 struct task_struct *task;
81 int exception_state;
75} kgdb_info[NR_CPUS]; 82} kgdb_info[NR_CPUS];
76 83
77/** 84/**
@@ -129,6 +136,7 @@ struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread; 136struct task_struct *kgdb_contthread;
130 137
131int kgdb_single_step; 138int kgdb_single_step;
139pid_t kgdb_sstep_pid;
132 140
133/* Our I/O buffers. */ 141/* Our I/O buffers. */
134static char remcom_in_buffer[BUFMAX]; 142static char remcom_in_buffer[BUFMAX];
@@ -390,27 +398,22 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
390 398
391/* 399/*
392 * Copy the binary array pointed to by buf into mem. Fix $, #, and 400 * Copy the binary array pointed to by buf into mem. Fix $, #, and
393 * 0x7d escaped with 0x7d. Return a pointer to the character after 401 * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
394 * the last byte written. 402 * The input buf is overwitten with the result to write to mem.
395 */ 403 */
396static int kgdb_ebin2mem(char *buf, char *mem, int count) 404static int kgdb_ebin2mem(char *buf, char *mem, int count)
397{ 405{
398 int err = 0; 406 int size = 0;
399 char c; 407 char *c = buf;
400 408
401 while (count-- > 0) { 409 while (count-- > 0) {
402 c = *buf++; 410 c[size] = *buf++;
403 if (c == 0x7d) 411 if (c[size] == 0x7d)
404 c = *buf++ ^ 0x20; 412 c[size] = *buf++ ^ 0x20;
405 413 size++;
406 err = probe_kernel_write(mem, &c, 1);
407 if (err)
408 break;
409
410 mem++;
411 } 414 }
412 415
413 return err; 416 return probe_kernel_write(mem, c, size);
414} 417}
415 418
416/* 419/*
@@ -541,12 +544,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
541 */ 544 */
542 if (tid == 0 || tid == -1) 545 if (tid == 0 || tid == -1)
543 tid = -atomic_read(&kgdb_active) - 2; 546 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) { 547 if (tid < -1 && tid > -NR_CPUS - 2) {
545 if (kgdb_info[-tid - 2].task) 548 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task; 549 return kgdb_info[-tid - 2].task;
547 else 550 else
548 return idle_task(-tid - 2); 551 return idle_task(-tid - 2);
549 } 552 }
553 if (tid <= 0) {
554 printk(KERN_ERR "KGDB: Internal thread select error\n");
555 dump_stack();
556 return NULL;
557 }
550 558
551 /* 559 /*
552 * find_task_by_pid_ns() does not take the tasklist lock anymore 560 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -557,46 +565,6 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
557} 565}
558 566
559/* 567/*
560 * CPU debug state control:
561 */
562
563#ifdef CONFIG_SMP
564static void kgdb_wait(struct pt_regs *regs)
565{
566 unsigned long flags;
567 int cpu;
568
569 local_irq_save(flags);
570 cpu = raw_smp_processor_id();
571 kgdb_info[cpu].debuggerinfo = regs;
572 kgdb_info[cpu].task = current;
573 /*
574 * Make sure the above info reaches the primary CPU before
575 * our cpu_in_kgdb[] flag setting does:
576 */
577 smp_wmb();
578 atomic_set(&cpu_in_kgdb[cpu], 1);
579
580 /* Wait till primary CPU is done with debugging */
581 while (atomic_read(&passive_cpu_wait[cpu]))
582 cpu_relax();
583
584 kgdb_info[cpu].debuggerinfo = NULL;
585 kgdb_info[cpu].task = NULL;
586
587 /* fix up hardware debug registers on local cpu */
588 if (arch_kgdb_ops.correct_hw_break)
589 arch_kgdb_ops.correct_hw_break();
590
591 /* Signal the primary CPU that we are done: */
592 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog();
594 clocksource_touch_watchdog();
595 local_irq_restore(flags);
596}
597#endif
598
599/*
600 * Some architectures need cache flushes when we set/clear a 568 * Some architectures need cache flushes when we set/clear a
601 * breakpoint: 569 * breakpoint:
602 */ 570 */
@@ -619,7 +587,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
619static int kgdb_activate_sw_breakpoints(void) 587static int kgdb_activate_sw_breakpoints(void)
620{ 588{
621 unsigned long addr; 589 unsigned long addr;
622 int error = 0; 590 int error;
591 int ret = 0;
623 int i; 592 int i;
624 593
625 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 594 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +598,16 @@ static int kgdb_activate_sw_breakpoints(void)
629 addr = kgdb_break[i].bpt_addr; 598 addr = kgdb_break[i].bpt_addr;
630 error = kgdb_arch_set_breakpoint(addr, 599 error = kgdb_arch_set_breakpoint(addr,
631 kgdb_break[i].saved_instr); 600 kgdb_break[i].saved_instr);
632 if (error) 601 if (error) {
633 return error; 602 ret = error;
603 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
604 continue;
605 }
634 606
635 kgdb_flush_swbreak_addr(addr); 607 kgdb_flush_swbreak_addr(addr);
636 kgdb_break[i].state = BP_ACTIVE; 608 kgdb_break[i].state = BP_ACTIVE;
637 } 609 }
638 return 0; 610 return ret;
639} 611}
640 612
641static int kgdb_set_sw_break(unsigned long addr) 613static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +654,8 @@ static int kgdb_set_sw_break(unsigned long addr)
682static int kgdb_deactivate_sw_breakpoints(void) 654static int kgdb_deactivate_sw_breakpoints(void)
683{ 655{
684 unsigned long addr; 656 unsigned long addr;
685 int error = 0; 657 int error;
658 int ret = 0;
686 int i; 659 int i;
687 660
688 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 661 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +664,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
691 addr = kgdb_break[i].bpt_addr; 664 addr = kgdb_break[i].bpt_addr;
692 error = kgdb_arch_remove_breakpoint(addr, 665 error = kgdb_arch_remove_breakpoint(addr,
693 kgdb_break[i].saved_instr); 666 kgdb_break[i].saved_instr);
694 if (error) 667 if (error) {
695 return error; 668 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
669 ret = error;
670 }
696 671
697 kgdb_flush_swbreak_addr(addr); 672 kgdb_flush_swbreak_addr(addr);
698 kgdb_break[i].state = BP_SET; 673 kgdb_break[i].state = BP_SET;
699 } 674 }
700 return 0; 675 return ret;
701} 676}
702 677
703static int kgdb_remove_sw_break(unsigned long addr) 678static int kgdb_remove_sw_break(unsigned long addr)
@@ -870,7 +845,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
870 845
871 /* 846 /*
872 * All threads that don't have debuggerinfo should be 847 * All threads that don't have debuggerinfo should be
873 * in __schedule() sleeping, since all other CPUs 848 * in schedule() sleeping, since all other CPUs
874 * are in kgdb_wait, and thus have debuggerinfo. 849 * are in kgdb_wait, and thus have debuggerinfo.
875 */ 850 */
876 if (local_debuggerinfo) { 851 if (local_debuggerinfo) {
@@ -1204,8 +1179,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1204 return 1; 1179 return 1;
1205 1180
1206 } else { 1181 } else {
1207 error_packet(remcom_out_buffer, -EINVAL); 1182 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1208 return 0; 1183 " and 15 (pass and disconnect)\n"
1184 "Executing a continue without signal passing\n", 0);
1185 remcom_in_buffer[0] = 'c';
1209 } 1186 }
1210 1187
1211 /* Indicate fall through */ 1188 /* Indicate fall through */
@@ -1382,33 +1359,13 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
1382 return 1; 1359 return 1;
1383} 1360}
1384 1361
1385/* 1362static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
1386 * kgdb_handle_exception() - main entry point from a kernel exception
1387 *
1388 * Locking hierarchy:
1389 * interface locks, if any (begin_session)
1390 * kgdb lock (kgdb_active)
1391 */
1392int
1393kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1394{ 1363{
1395 struct kgdb_state kgdb_var;
1396 struct kgdb_state *ks = &kgdb_var;
1397 unsigned long flags; 1364 unsigned long flags;
1365 int sstep_tries = 100;
1398 int error = 0; 1366 int error = 0;
1399 int i, cpu; 1367 int i, cpu;
1400 1368 int trace_on = 0;
1401 ks->cpu = raw_smp_processor_id();
1402 ks->ex_vector = evector;
1403 ks->signo = signo;
1404 ks->ex_vector = evector;
1405 ks->err_code = ecode;
1406 ks->kgdb_usethreadid = 0;
1407 ks->linux_regs = regs;
1408
1409 if (kgdb_reenter_check(ks))
1410 return 0; /* Ouch, double exception ! */
1411
1412acquirelock: 1369acquirelock:
1413 /* 1370 /*
1414 * Interrupts will be restored by the 'trap return' code, except when 1371 * Interrupts will be restored by the 'trap return' code, except when
@@ -1416,24 +1373,55 @@ acquirelock:
1416 */ 1373 */
1417 local_irq_save(flags); 1374 local_irq_save(flags);
1418 1375
1419 cpu = raw_smp_processor_id(); 1376 cpu = ks->cpu;
1377 kgdb_info[cpu].debuggerinfo = regs;
1378 kgdb_info[cpu].task = current;
1379 /*
1380 * Make sure the above info reaches the primary CPU before
1381 * our cpu_in_kgdb[] flag setting does:
1382 */
1383 atomic_inc(&cpu_in_kgdb[cpu]);
1420 1384
1421 /* 1385 /*
1422 * Acquire the kgdb_active lock: 1386 * CPU will loop if it is a slave or request to become a kgdb
1387 * master cpu and acquire the kgdb_active lock:
1423 */ 1388 */
1424 while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) 1389 while (1) {
1390 if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
1391 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
1392 break;
1393 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
1394 if (!atomic_read(&passive_cpu_wait[cpu]))
1395 goto return_normal;
1396 } else {
1397return_normal:
1398 /* Return to normal operation by executing any
1399 * hw breakpoint fixup.
1400 */
1401 if (arch_kgdb_ops.correct_hw_break)
1402 arch_kgdb_ops.correct_hw_break();
1403 if (trace_on)
1404 tracing_on();
1405 atomic_dec(&cpu_in_kgdb[cpu]);
1406 touch_softlockup_watchdog_sync();
1407 clocksource_touch_watchdog();
1408 local_irq_restore(flags);
1409 return 0;
1410 }
1425 cpu_relax(); 1411 cpu_relax();
1412 }
1426 1413
1427 /* 1414 /*
1428 * Do not start the debugger connection on this CPU if the last 1415 * For single stepping, try to only enter on the processor
1429 * instance of the exception handler wanted to come into the 1416 * that was single stepping. To gaurd against a deadlock, the
1430 * debugger on a different CPU via a single step 1417 * kernel will only try for the value of sstep_tries before
1418 * giving up and continuing on.
1431 */ 1419 */
1432 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && 1420 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1421 (kgdb_info[cpu].task &&
1434 1422 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1435 atomic_set(&kgdb_active, -1); 1423 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog(); 1424 touch_softlockup_watchdog_sync();
1437 clocksource_touch_watchdog(); 1425 clocksource_touch_watchdog();
1438 local_irq_restore(flags); 1426 local_irq_restore(flags);
1439 1427
@@ -1455,9 +1443,6 @@ acquirelock:
1455 if (kgdb_io_ops->pre_exception) 1443 if (kgdb_io_ops->pre_exception)
1456 kgdb_io_ops->pre_exception(); 1444 kgdb_io_ops->pre_exception();
1457 1445
1458 kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
1459 kgdb_info[ks->cpu].task = current;
1460
1461 kgdb_disable_hw_debug(ks->linux_regs); 1446 kgdb_disable_hw_debug(ks->linux_regs);
1462 1447
1463 /* 1448 /*
@@ -1466,15 +1451,9 @@ acquirelock:
1466 */ 1451 */
1467 if (!kgdb_single_step) { 1452 if (!kgdb_single_step) {
1468 for (i = 0; i < NR_CPUS; i++) 1453 for (i = 0; i < NR_CPUS; i++)
1469 atomic_set(&passive_cpu_wait[i], 1); 1454 atomic_inc(&passive_cpu_wait[i]);
1470 } 1455 }
1471 1456
1472 /*
1473 * spin_lock code is good enough as a barrier so we don't
1474 * need one here:
1475 */
1476 atomic_set(&cpu_in_kgdb[ks->cpu], 1);
1477
1478#ifdef CONFIG_SMP 1457#ifdef CONFIG_SMP
1479 /* Signal the other CPUs to enter kgdb_wait() */ 1458 /* Signal the other CPUs to enter kgdb_wait() */
1480 if ((!kgdb_single_step) && kgdb_do_roundup) 1459 if ((!kgdb_single_step) && kgdb_do_roundup)
@@ -1498,6 +1477,9 @@ acquirelock:
1498 kgdb_single_step = 0; 1477 kgdb_single_step = 0;
1499 kgdb_contthread = current; 1478 kgdb_contthread = current;
1500 exception_level = 0; 1479 exception_level = 0;
1480 trace_on = tracing_is_on();
1481 if (trace_on)
1482 tracing_off();
1501 1483
1502 /* Talk to debugger with gdbserial protocol */ 1484 /* Talk to debugger with gdbserial protocol */
1503 error = gdb_serial_stub(ks); 1485 error = gdb_serial_stub(ks);
@@ -1506,13 +1488,11 @@ acquirelock:
1506 if (kgdb_io_ops->post_exception) 1488 if (kgdb_io_ops->post_exception)
1507 kgdb_io_ops->post_exception(); 1489 kgdb_io_ops->post_exception();
1508 1490
1509 kgdb_info[ks->cpu].debuggerinfo = NULL; 1491 atomic_dec(&cpu_in_kgdb[ks->cpu]);
1510 kgdb_info[ks->cpu].task = NULL;
1511 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1512 1492
1513 if (!kgdb_single_step) { 1493 if (!kgdb_single_step) {
1514 for (i = NR_CPUS-1; i >= 0; i--) 1494 for (i = NR_CPUS-1; i >= 0; i--)
1515 atomic_set(&passive_cpu_wait[i], 0); 1495 atomic_dec(&passive_cpu_wait[i]);
1516 /* 1496 /*
1517 * Wait till all the CPUs have quit 1497 * Wait till all the CPUs have quit
1518 * from the debugger. 1498 * from the debugger.
@@ -1524,22 +1504,70 @@ acquirelock:
1524 } 1504 }
1525 1505
1526kgdb_restore: 1506kgdb_restore:
1507 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1508 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1509 if (kgdb_info[sstep_cpu].task)
1510 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1511 else
1512 kgdb_sstep_pid = 0;
1513 }
1514 if (trace_on)
1515 tracing_on();
1527 /* Free kgdb_active */ 1516 /* Free kgdb_active */
1528 atomic_set(&kgdb_active, -1); 1517 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog(); 1518 touch_softlockup_watchdog_sync();
1530 clocksource_touch_watchdog(); 1519 clocksource_touch_watchdog();
1531 local_irq_restore(flags); 1520 local_irq_restore(flags);
1532 1521
1533 return error; 1522 return error;
1534} 1523}
1535 1524
1525/*
1526 * kgdb_handle_exception() - main entry point from a kernel exception
1527 *
1528 * Locking hierarchy:
1529 * interface locks, if any (begin_session)
1530 * kgdb lock (kgdb_active)
1531 */
1532int
1533kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1534{
1535 struct kgdb_state kgdb_var;
1536 struct kgdb_state *ks = &kgdb_var;
1537 int ret;
1538
1539 ks->cpu = raw_smp_processor_id();
1540 ks->ex_vector = evector;
1541 ks->signo = signo;
1542 ks->ex_vector = evector;
1543 ks->err_code = ecode;
1544 ks->kgdb_usethreadid = 0;
1545 ks->linux_regs = regs;
1546
1547 if (kgdb_reenter_check(ks))
1548 return 0; /* Ouch, double exception ! */
1549 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
1550 ret = kgdb_cpu_enter(ks, regs);
1551 kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
1552 return ret;
1553}
1554
1536int kgdb_nmicallback(int cpu, void *regs) 1555int kgdb_nmicallback(int cpu, void *regs)
1537{ 1556{
1538#ifdef CONFIG_SMP 1557#ifdef CONFIG_SMP
1558 struct kgdb_state kgdb_var;
1559 struct kgdb_state *ks = &kgdb_var;
1560
1561 memset(ks, 0, sizeof(struct kgdb_state));
1562 ks->cpu = cpu;
1563 ks->linux_regs = regs;
1564
1539 if (!atomic_read(&cpu_in_kgdb[cpu]) && 1565 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1540 atomic_read(&kgdb_active) != cpu && 1566 atomic_read(&kgdb_active) != -1 &&
1541 atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { 1567 atomic_read(&kgdb_active) != cpu) {
1542 kgdb_wait((struct pt_regs *)regs); 1568 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
1569 kgdb_cpu_enter(ks, regs);
1570 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
1543 return 0; 1571 return 0;
1544 } 1572 }
1545#endif 1573#endif
@@ -1715,11 +1743,11 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1715 */ 1743 */
1716void kgdb_breakpoint(void) 1744void kgdb_breakpoint(void)
1717{ 1745{
1718 atomic_set(&kgdb_setting_breakpoint, 1); 1746 atomic_inc(&kgdb_setting_breakpoint);
1719 wmb(); /* Sync point before breakpoint */ 1747 wmb(); /* Sync point before breakpoint */
1720 arch_kgdb_breakpoint(); 1748 arch_kgdb_breakpoint();
1721 wmb(); /* Sync point after breakpoint */ 1749 wmb(); /* Sync point after breakpoint */
1722 atomic_set(&kgdb_setting_breakpoint, 0); 1750 atomic_dec(&kgdb_setting_breakpoint);
1723} 1751}
1724EXPORT_SYMBOL_GPL(kgdb_breakpoint); 1752EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1725 1753
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9fcb53a11f87..bf0e231d9702 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
81 static int kmod_loop_msg; 81 static int kmod_loop_msg;
82 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
87 va_start(args, fmt); 83 va_start(args, fmt);
88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 84 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
89 va_end(args); 85 va_end(args);
90 if (ret >= MODULE_NAME_LEN) 86 if (ret >= MODULE_NAME_LEN)
91 return -ENAMETOOLONG; 87 return -ENAMETOOLONG;
92 88
89 ret = security_kernel_module_request(module_name);
90 if (ret)
91 return ret;
92
93 /* If modprobe needs a service that is in a module, we get a recursive 93 /* If modprobe needs a service that is in a module, we get a recursive
94 * loop. Limit the number of running kmod threads to max_threads/2 or 94 * loop. Limit the number of running kmod threads to max_threads/2 or
95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
@@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
520 return -ENOMEM; 520 return -ENOMEM;
521 521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp); 522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) 523 if (ret < 0) {
524 goto out; 524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
525 527
526 return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
527 531
528 out:
529 call_usermodehelper_freeinfo(sub_info);
530 return ret; 532 return ret;
531} 533}
532EXPORT_SYMBOL(call_usermodehelper_pipe); 534EXPORT_SYMBOL(call_usermodehelper_pipe);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5240d75f4c60..0ed46f3e51e9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,8 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h>
49#include <linux/cpu.h>
47 50
48#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -90,6 +93,10 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
90 */ 93 */
91static struct kprobe_blackpoint kprobe_blacklist[] = { 94static struct kprobe_blackpoint kprobe_blacklist[] = {
92 {"preempt_schedule",}, 95 {"preempt_schedule",},
96 {"native_get_debugreg",},
97 {"irq_entries_start",},
98 {"common_interrupt",},
99 {"mcount",}, /* mcount can be called from everywhere */
93 {NULL} /* Terminator */ 100 {NULL} /* Terminator */
94}; 101};
95 102
@@ -100,81 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
100 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
101 * is a recipe for disaster 108 * is a recipe for disaster
102 */ 109 */
103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
104
105struct kprobe_insn_page { 110struct kprobe_insn_page {
106 struct list_head list; 111 struct list_head list;
107 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
108 char slot_used[INSNS_PER_PAGE];
109 int nused; 113 int nused;
110 int ngarbage; 114 int ngarbage;
115 char slot_used[];
116};
117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
111}; 126};
112 127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
113enum kprobe_slot_state { 133enum kprobe_slot_state {
114 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
115 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
116 SLOT_USED = 2, 136 SLOT_USED = 2,
117}; 137};
118 138
119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
120static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
121static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
122static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
123 143 .nr_garbage = 0,
124static int __kprobes check_safety(void) 144};
125{ 145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
126 int ret = 0;
127#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
128 ret = freeze_processes();
129 if (ret == 0) {
130 struct task_struct *p, *q;
131 do_each_thread(p, q) {
132 if (p != current && p->state == TASK_RUNNING &&
133 p->pid != 0) {
134 printk("Check failed: %s is running\n",p->comm);
135 ret = -1;
136 goto loop_end;
137 }
138 } while_each_thread(p, q);
139 }
140loop_end:
141 thaw_processes();
142#else
143 synchronize_sched();
144#endif
145 return ret;
146}
147 146
148/** 147/**
149 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
150 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
151 */ 150 */
152static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
153{ 152{
154 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
155 154
156 retry: 155 retry:
157 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
158 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
159 int i; 158 int i;
160 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
161 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
162 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
163 kip->nused++; 162 kip->nused++;
164 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
165 } 164 }
166 } 165 }
167 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
168 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
169 } 169 }
170 } 170 }
171 171
172 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
173 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
174 goto retry; 174 goto retry;
175 } 175
176 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
177 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
178 if (!kip) 178 if (!kip)
179 return NULL; 179 return NULL;
180 180
@@ -189,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
189 return NULL; 189 return NULL;
190 } 190 }
191 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
192 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
193 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
194 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
195 kip->nused = 1; 194 kip->nused = 1;
196 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
197 return kip->insns; 197 return kip->insns;
198} 198}
199 199
200
200kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
201{ 202{
202 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
203 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
204 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
205 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
206 return ret; 209 return ret;
207} 210}
208 211
@@ -218,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
218 * so as not to have to set it up again the 221 * so as not to have to set it up again the
219 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
220 */ 223 */
221 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
222 list_del(&kip->list); 225 list_del(&kip->list);
223 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
224 kfree(kip); 227 kfree(kip);
@@ -228,52 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
228 return 0; 231 return 0;
229} 232}
230 233
231static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
232{ 235{
233 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
234 237
235 /* Ensure no-one is preepmted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
236 if (check_safety()) 239 synchronize_sched();
237 return -EAGAIN;
238 240
239 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
240 int i; 242 int i;
241 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
242 continue; 244 continue;
243 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
244 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
245 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
246 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
247 break; 249 break;
248 } 250 }
249 } 251 }
250 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
251 return 0; 253 return 0;
252} 254}
253 255
254void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
255{ 258{
256 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
257 260
258 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
259 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) /
260 if (kip->insns <= slot && 263 (c->insn_size * sizeof(kprobe_opcode_t));
261 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 if (idx >= 0 && idx < slots_per_page(c)) {
262 int i = (slot - kip->insns) / MAX_INSN_SIZE; 265 WARN_ON(kip->slot_used[idx] != SLOT_USED);
263 if (dirty) { 266 if (dirty) {
264 kip->slot_used[i] = SLOT_DIRTY; 267 kip->slot_used[idx] = SLOT_DIRTY;
265 kip->ngarbage++; 268 kip->ngarbage++;
269 if (++c->nr_garbage > slots_per_page(c))
270 collect_garbage_slots(c);
266 } else 271 } else
267 collect_one_slot(kip, i); 272 collect_one_slot(kip, idx);
268 break; 273 return;
269 } 274 }
270 } 275 }
276 /* Could not free this slot. */
277 WARN_ON(1);
278}
271 279
272 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 280void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
273 collect_garbage_slots(); 281{
274 282 mutex_lock(&kprobe_insn_mutex);
283 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
275 mutex_unlock(&kprobe_insn_mutex); 284 mutex_unlock(&kprobe_insn_mutex);
276} 285}
286#ifdef CONFIG_OPTPROBES
287/* For optimized_kprobe buffer */
288static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
289static struct kprobe_insn_cache kprobe_optinsn_slots = {
290 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
291 /* .insn_size is initialized later */
292 .nr_garbage = 0,
293};
294/* Get a slot for optimized_kprobe buffer */
295kprobe_opcode_t __kprobes *get_optinsn_slot(void)
296{
297 kprobe_opcode_t *ret = NULL;
298
299 mutex_lock(&kprobe_optinsn_mutex);
300 ret = __get_insn_slot(&kprobe_optinsn_slots);
301 mutex_unlock(&kprobe_optinsn_mutex);
302
303 return ret;
304}
305
306void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
307{
308 mutex_lock(&kprobe_optinsn_mutex);
309 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
310 mutex_unlock(&kprobe_optinsn_mutex);
311}
312#endif
277#endif 313#endif
278 314
279/* We have preemption disabled.. so it is safe to use __ versions */ 315/* We have preemption disabled.. so it is safe to use __ versions */
@@ -304,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
304 if (p->addr == addr) 340 if (p->addr == addr)
305 return p; 341 return p;
306 } 342 }
343
307 return NULL; 344 return NULL;
308} 345}
309 346
347static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
348
349/* Return true if the kprobe is an aggregator */
350static inline int kprobe_aggrprobe(struct kprobe *p)
351{
352 return p->pre_handler == aggr_pre_handler;
353}
354
355/*
356 * Keep all fields in the kprobe consistent
357 */
358static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
359{
360 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
361 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
362}
363
364#ifdef CONFIG_OPTPROBES
365/* NOTE: change this value only with kprobe_mutex held */
366static bool kprobes_allow_optimization;
367
368/*
369 * Call every pre_handler on the list, but ignore their return values.
370 * This must be called from arch-dep optimized caller.
371 */
372void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
373{
374 struct kprobe *kp;
375
376 list_for_each_entry_rcu(kp, &p->list, list) {
377 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
378 set_kprobe_instance(kp);
379 kp->pre_handler(kp, regs);
380 }
381 reset_kprobe_instance();
382 }
383}
384
385/* Return true(!0) if the kprobe is ready for optimization. */
386static inline int kprobe_optready(struct kprobe *p)
387{
388 struct optimized_kprobe *op;
389
390 if (kprobe_aggrprobe(p)) {
391 op = container_of(p, struct optimized_kprobe, kp);
392 return arch_prepared_optinsn(&op->optinsn);
393 }
394
395 return 0;
396}
397
398/*
399 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint).
401 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{
404 int i;
405 struct kprobe *p = NULL;
406 struct optimized_kprobe *op;
407
408 /* Don't check i == 0, since that is a breakpoint case. */
409 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
410 p = get_kprobe((void *)(addr - i));
411
412 if (p && kprobe_optready(p)) {
413 op = container_of(p, struct optimized_kprobe, kp);
414 if (arch_within_optimized_kprobe(op, addr))
415 return p;
416 }
417
418 return NULL;
419}
420
421/* Optimization staging list, protected by kprobe_mutex */
422static LIST_HEAD(optimizing_list);
423
424static void kprobe_optimizer(struct work_struct *work);
425static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
426#define OPTIMIZE_DELAY 5
427
428/* Kprobe jump optimizer */
429static __kprobes void kprobe_optimizer(struct work_struct *work)
430{
431 struct optimized_kprobe *op, *tmp;
432
433 /* Lock modules while optimizing kprobes */
434 mutex_lock(&module_mutex);
435 mutex_lock(&kprobe_mutex);
436 if (kprobes_all_disarmed || !kprobes_allow_optimization)
437 goto end;
438
439 /*
440 * Wait for quiescence period to ensure all running interrupts
441 * are done. Because optprobe may modify multiple instructions
442 * there is a chance that Nth instruction is interrupted. In that
443 * case, running interrupt can return to 2nd-Nth byte of jump
444 * instruction. This wait is for avoiding it.
445 */
446 synchronize_sched();
447
448 /*
449 * The optimization/unoptimization refers to online_cpus via
450 * stop_machine(), and cpu-hotplug modifies online_cpus.
451 * At the same time, text_mutex is held both in cpu-hotplug and here.
452 * This combination can cause a deadlock (cpu-hotplug tries to lock
453 * text_mutex, but stop_machine() cannot complete because online_cpus
454 * has been changed).
455 * To avoid this deadlock, we need to call get_online_cpus()
456 * to prevent cpu-hotplug outside of text_mutex locking.
457 */
458 get_online_cpus();
459 mutex_lock(&text_mutex);
460 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
461 WARN_ON(kprobe_disabled(&op->kp));
462 if (arch_optimize_kprobe(op) < 0)
463 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
464 list_del_init(&op->list);
465 }
466 mutex_unlock(&text_mutex);
467 put_online_cpus();
468end:
469 mutex_unlock(&kprobe_mutex);
470 mutex_unlock(&module_mutex);
471}
472
473/* Optimize kprobe if p is ready to be optimized */
474static __kprobes void optimize_kprobe(struct kprobe *p)
475{
476 struct optimized_kprobe *op;
477
478 /* Check if the kprobe is disabled or not ready for optimization. */
479 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
480 (kprobe_disabled(p) || kprobes_all_disarmed))
481 return;
482
483 /* Neither break_handler nor post_handler is supported. */
484 if (p->break_handler || p->post_handler)
485 return;
486
487 op = container_of(p, struct optimized_kprobe, kp);
488
489 /* Check that there are no other kprobes at the optimized instructions */
490 if (arch_check_optimized_kprobe(op) < 0)
491 return;
492
493 /* Check if it is already optimized. */
494 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
495 return;
496
497 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
498 list_add(&op->list, &optimizing_list);
499 if (!delayed_work_pending(&optimizing_work))
500 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
501}
502
503/* Unoptimize a kprobe if p is optimized */
504static __kprobes void unoptimize_kprobe(struct kprobe *p)
505{
506 struct optimized_kprobe *op;
507
508 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
509 op = container_of(p, struct optimized_kprobe, kp);
510 if (!list_empty(&op->list))
511 /* Dequeue from the optimization queue */
512 list_del_init(&op->list);
513 else
514 /* Replace jump with break */
515 arch_unoptimize_kprobe(op);
516 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
517 }
518}
519
520/* Remove optimized instructions */
521static void __kprobes kill_optimized_kprobe(struct kprobe *p)
522{
523 struct optimized_kprobe *op;
524
525 op = container_of(p, struct optimized_kprobe, kp);
526 if (!list_empty(&op->list)) {
527 /* Dequeue from the optimization queue */
528 list_del_init(&op->list);
529 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
530 }
531 /* Don't unoptimize, because the target code will be freed. */
532 arch_remove_optimized_kprobe(op);
533}
534
535/* Try to prepare optimized instructions */
536static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
537{
538 struct optimized_kprobe *op;
539
540 op = container_of(p, struct optimized_kprobe, kp);
541 arch_prepare_optimized_kprobe(op);
542}
543
544/* Free optimized instructions and optimized_kprobe */
545static __kprobes void free_aggr_kprobe(struct kprobe *p)
546{
547 struct optimized_kprobe *op;
548
549 op = container_of(p, struct optimized_kprobe, kp);
550 arch_remove_optimized_kprobe(op);
551 kfree(op);
552}
553
554/* Allocate new optimized_kprobe and try to prepare optimized instructions */
555static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
556{
557 struct optimized_kprobe *op;
558
559 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
560 if (!op)
561 return NULL;
562
563 INIT_LIST_HEAD(&op->list);
564 op->kp.addr = p->addr;
565 arch_prepare_optimized_kprobe(op);
566
567 return &op->kp;
568}
569
570static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
571
572/*
573 * Prepare an optimized_kprobe and optimize it
574 * NOTE: p must be a normal registered kprobe
575 */
576static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
577{
578 struct kprobe *ap;
579 struct optimized_kprobe *op;
580
581 ap = alloc_aggr_kprobe(p);
582 if (!ap)
583 return;
584
585 op = container_of(ap, struct optimized_kprobe, kp);
586 if (!arch_prepared_optinsn(&op->optinsn)) {
587 /* If failed to setup optimizing, fallback to kprobe */
588 free_aggr_kprobe(ap);
589 return;
590 }
591
592 init_aggr_kprobe(ap, p);
593 optimize_kprobe(ap);
594}
595
596#ifdef CONFIG_SYSCTL
597static void __kprobes optimize_all_kprobes(void)
598{
599 struct hlist_head *head;
600 struct hlist_node *node;
601 struct kprobe *p;
602 unsigned int i;
603
604 /* If optimization is already allowed, just return */
605 if (kprobes_allow_optimization)
606 return;
607
608 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p))
614 optimize_kprobe(p);
615 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n");
618}
619
620static void __kprobes unoptimize_all_kprobes(void)
621{
622 struct hlist_head *head;
623 struct hlist_node *node;
624 struct kprobe *p;
625 unsigned int i;
626
627 /* If optimization is already prohibited, just return */
628 if (!kprobes_allow_optimization)
629 return;
630
631 kprobes_allow_optimization = false;
632 printk(KERN_INFO "Kprobes globally unoptimized\n");
633 get_online_cpus(); /* For avoiding text_mutex deadlock */
634 mutex_lock(&text_mutex);
635 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
636 head = &kprobe_table[i];
637 hlist_for_each_entry_rcu(p, node, head, hlist) {
638 if (!kprobe_disabled(p))
639 unoptimize_kprobe(p);
640 }
641 }
642
643 mutex_unlock(&text_mutex);
644 put_online_cpus();
645 /* Allow all currently running kprobes to complete */
646 synchronize_sched();
647}
648
649int sysctl_kprobes_optimization;
650int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
651 void __user *buffer, size_t *length,
652 loff_t *ppos)
653{
654 int ret;
655
656 mutex_lock(&kprobe_mutex);
657 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
658 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
659
660 if (sysctl_kprobes_optimization)
661 optimize_all_kprobes();
662 else
663 unoptimize_all_kprobes();
664 mutex_unlock(&kprobe_mutex);
665
666 return ret;
667}
668#endif /* CONFIG_SYSCTL */
669
670static void __kprobes __arm_kprobe(struct kprobe *p)
671{
672 struct kprobe *old_p;
673
674 /* Check collision with other optimized kprobes */
675 old_p = get_optimized_kprobe((unsigned long)p->addr);
676 if (unlikely(old_p))
677 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
678
679 arch_arm_kprobe(p);
680 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
681}
682
683static void __kprobes __disarm_kprobe(struct kprobe *p)
684{
685 struct kprobe *old_p;
686
687 unoptimize_kprobe(p); /* Try to unoptimize */
688 arch_disarm_kprobe(p);
689
690 /* If another kprobe was blocked, optimize it. */
691 old_p = get_optimized_kprobe((unsigned long)p->addr);
692 if (unlikely(old_p))
693 optimize_kprobe(old_p);
694}
695
696#else /* !CONFIG_OPTPROBES */
697
698#define optimize_kprobe(p) do {} while (0)
699#define unoptimize_kprobe(p) do {} while (0)
700#define kill_optimized_kprobe(p) do {} while (0)
701#define prepare_optimized_kprobe(p) do {} while (0)
702#define try_to_optimize_kprobe(p) do {} while (0)
703#define __arm_kprobe(p) arch_arm_kprobe(p)
704#define __disarm_kprobe(p) arch_disarm_kprobe(p)
705
706static __kprobes void free_aggr_kprobe(struct kprobe *p)
707{
708 kfree(p);
709}
710
711static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
712{
713 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
714}
715#endif /* CONFIG_OPTPROBES */
716
310/* Arm a kprobe with text_mutex */ 717/* Arm a kprobe with text_mutex */
311static void __kprobes arm_kprobe(struct kprobe *kp) 718static void __kprobes arm_kprobe(struct kprobe *kp)
312{ 719{
720 /*
721 * Here, since __arm_kprobe() doesn't use stop_machine(),
722 * this doesn't cause deadlock on text_mutex. So, we don't
723 * need get_online_cpus().
724 */
313 mutex_lock(&text_mutex); 725 mutex_lock(&text_mutex);
314 arch_arm_kprobe(kp); 726 __arm_kprobe(kp);
315 mutex_unlock(&text_mutex); 727 mutex_unlock(&text_mutex);
316} 728}
317 729
318/* Disarm a kprobe with text_mutex */ 730/* Disarm a kprobe with text_mutex */
319static void __kprobes disarm_kprobe(struct kprobe *kp) 731static void __kprobes disarm_kprobe(struct kprobe *kp)
320{ 732{
733 get_online_cpus(); /* For avoiding text_mutex deadlock */
321 mutex_lock(&text_mutex); 734 mutex_lock(&text_mutex);
322 arch_disarm_kprobe(kp); 735 __disarm_kprobe(kp);
323 mutex_unlock(&text_mutex); 736 mutex_unlock(&text_mutex);
737 put_online_cpus();
324} 738}
325 739
326/* 740/*
@@ -389,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
389void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 803void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
390{ 804{
391 struct kprobe *kp; 805 struct kprobe *kp;
392 if (p->pre_handler != aggr_pre_handler) { 806 if (!kprobe_aggrprobe(p)) {
393 p->nmissed++; 807 p->nmissed++;
394 } else { 808 } else {
395 list_for_each_entry_rcu(kp, &p->list, list) 809 list_for_each_entry_rcu(kp, &p->list, list)
@@ -513,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
513} 927}
514 928
515/* 929/*
516 * Keep all fields in the kprobe consistent
517 */
518static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
519{
520 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
521 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
522}
523
524/*
525* Add the new probe to ap->list. Fail if this is the 930* Add the new probe to ap->list. Fail if this is the
526* second jprobe at the address - two jprobes can't coexist 931* second jprobe at the address - two jprobes can't coexist
527*/ 932*/
528static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 933static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
529{ 934{
530 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 935 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
936
937 if (p->break_handler || p->post_handler)
938 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
939
531 if (p->break_handler) { 940 if (p->break_handler) {
532 if (ap->break_handler) 941 if (ap->break_handler)
533 return -EEXIST; 942 return -EEXIST;
@@ -542,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
542 ap->flags &= ~KPROBE_FLAG_DISABLED; 951 ap->flags &= ~KPROBE_FLAG_DISABLED;
543 if (!kprobes_all_disarmed) 952 if (!kprobes_all_disarmed)
544 /* Arm the breakpoint again. */ 953 /* Arm the breakpoint again. */
545 arm_kprobe(ap); 954 __arm_kprobe(ap);
546 } 955 }
547 return 0; 956 return 0;
548} 957}
@@ -551,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
551 * Fill in the required fields of the "manager kprobe". Replace the 960 * Fill in the required fields of the "manager kprobe". Replace the
552 * earlier kprobe in the hlist with the manager kprobe 961 * earlier kprobe in the hlist with the manager kprobe
553 */ 962 */
554static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 963static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
555{ 964{
965 /* Copy p's insn slot to ap */
556 copy_kprobe(p, ap); 966 copy_kprobe(p, ap);
557 flush_insn_slot(ap); 967 flush_insn_slot(ap);
558 ap->addr = p->addr; 968 ap->addr = p->addr;
559 ap->flags = p->flags; 969 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
560 ap->pre_handler = aggr_pre_handler; 970 ap->pre_handler = aggr_pre_handler;
561 ap->fault_handler = aggr_fault_handler; 971 ap->fault_handler = aggr_fault_handler;
562 /* We don't care the kprobe which has gone. */ 972 /* We don't care the kprobe which has gone. */
@@ -566,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
566 ap->break_handler = aggr_break_handler; 976 ap->break_handler = aggr_break_handler;
567 977
568 INIT_LIST_HEAD(&ap->list); 978 INIT_LIST_HEAD(&ap->list);
569 list_add_rcu(&p->list, &ap->list); 979 INIT_HLIST_NODE(&ap->hlist);
570 980
981 list_add_rcu(&p->list, &ap->list);
571 hlist_replace_rcu(&p->hlist, &ap->hlist); 982 hlist_replace_rcu(&p->hlist, &ap->hlist);
572} 983}
573 984
@@ -581,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
581 int ret = 0; 992 int ret = 0;
582 struct kprobe *ap = old_p; 993 struct kprobe *ap = old_p;
583 994
584 if (old_p->pre_handler != aggr_pre_handler) { 995 if (!kprobe_aggrprobe(old_p)) {
585 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 996 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
586 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 997 ap = alloc_aggr_kprobe(old_p);
587 if (!ap) 998 if (!ap)
588 return -ENOMEM; 999 return -ENOMEM;
589 add_aggr_kprobe(ap, old_p); 1000 init_aggr_kprobe(ap, old_p);
590 } 1001 }
591 1002
592 if (kprobe_gone(ap)) { 1003 if (kprobe_gone(ap)) {
@@ -605,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
605 */ 1016 */
606 return ret; 1017 return ret;
607 1018
1019 /* Prepare optimized instructions if possible. */
1020 prepare_optimized_kprobe(ap);
1021
608 /* 1022 /*
609 * Clear gone flag to prevent allocating new slot again, and 1023 * Clear gone flag to prevent allocating new slot again, and
610 * set disabled flag because it is not armed yet. 1024 * set disabled flag because it is not armed yet.
@@ -613,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
613 | KPROBE_FLAG_DISABLED; 1027 | KPROBE_FLAG_DISABLED;
614 } 1028 }
615 1029
1030 /* Copy ap's insn slot to p */
616 copy_kprobe(ap, p); 1031 copy_kprobe(ap, p);
617 return add_new_kprobe(ap, p); 1032 return add_new_kprobe(ap, p);
618} 1033}
@@ -673,6 +1088,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
673 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 1088 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
674} 1089}
675 1090
1091/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1092static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1093{
1094 struct kprobe *old_p, *list_p;
1095
1096 old_p = get_kprobe(p->addr);
1097 if (unlikely(!old_p))
1098 return NULL;
1099
1100 if (p != old_p) {
1101 list_for_each_entry_rcu(list_p, &old_p->list, list)
1102 if (list_p == p)
1103 /* kprobe p is a valid probe */
1104 goto valid;
1105 return NULL;
1106 }
1107valid:
1108 return old_p;
1109}
1110
1111/* Return error if the kprobe is being re-registered */
1112static inline int check_kprobe_rereg(struct kprobe *p)
1113{
1114 int ret = 0;
1115 struct kprobe *old_p;
1116
1117 mutex_lock(&kprobe_mutex);
1118 old_p = __get_valid_kprobe(p);
1119 if (old_p)
1120 ret = -EINVAL;
1121 mutex_unlock(&kprobe_mutex);
1122 return ret;
1123}
1124
676int __kprobes register_kprobe(struct kprobe *p) 1125int __kprobes register_kprobe(struct kprobe *p)
677{ 1126{
678 int ret = 0; 1127 int ret = 0;
@@ -685,9 +1134,14 @@ int __kprobes register_kprobe(struct kprobe *p)
685 return -EINVAL; 1134 return -EINVAL;
686 p->addr = addr; 1135 p->addr = addr;
687 1136
1137 ret = check_kprobe_rereg(p);
1138 if (ret)
1139 return ret;
1140
688 preempt_disable(); 1141 preempt_disable();
689 if (!kernel_text_address((unsigned long) p->addr) || 1142 if (!kernel_text_address((unsigned long) p->addr) ||
690 in_kprobes_functions((unsigned long) p->addr)) { 1143 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) {
691 preempt_enable(); 1145 preempt_enable();
692 return -EINVAL; 1146 return -EINVAL;
693 } 1147 }
@@ -724,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p)
724 p->nmissed = 0; 1178 p->nmissed = 0;
725 INIT_LIST_HEAD(&p->list); 1179 INIT_LIST_HEAD(&p->list);
726 mutex_lock(&kprobe_mutex); 1180 mutex_lock(&kprobe_mutex);
1181
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex);
1184
727 old_p = get_kprobe(p->addr); 1185 old_p = get_kprobe(p->addr);
728 if (old_p) { 1186 if (old_p) {
1187 /* Since this may unoptimize old_p, locking text_mutex. */
729 ret = register_aggr_kprobe(old_p, p); 1188 ret = register_aggr_kprobe(old_p, p);
730 goto out; 1189 goto out;
731 } 1190 }
732 1191
733 mutex_lock(&text_mutex);
734 ret = arch_prepare_kprobe(p); 1192 ret = arch_prepare_kprobe(p);
735 if (ret) 1193 if (ret)
736 goto out_unlock_text; 1194 goto out;
737 1195
738 INIT_HLIST_NODE(&p->hlist); 1196 INIT_HLIST_NODE(&p->hlist);
739 hlist_add_head_rcu(&p->hlist, 1197 hlist_add_head_rcu(&p->hlist,
740 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1198 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
741 1199
742 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1200 if (!kprobes_all_disarmed && !kprobe_disabled(p))
743 arch_arm_kprobe(p); 1201 __arm_kprobe(p);
1202
1203 /* Try to optimize kprobe */
1204 try_to_optimize_kprobe(p);
744 1205
745out_unlock_text:
746 mutex_unlock(&text_mutex);
747out: 1206out:
1207 mutex_unlock(&text_mutex);
1208 put_online_cpus();
748 mutex_unlock(&kprobe_mutex); 1209 mutex_unlock(&kprobe_mutex);
749 1210
750 if (probed_mod) 1211 if (probed_mod)
@@ -754,26 +1215,6 @@ out:
754} 1215}
755EXPORT_SYMBOL_GPL(register_kprobe); 1216EXPORT_SYMBOL_GPL(register_kprobe);
756 1217
757/* Check passed kprobe is valid and return kprobe in kprobe_table. */
758static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
759{
760 struct kprobe *old_p, *list_p;
761
762 old_p = get_kprobe(p->addr);
763 if (unlikely(!old_p))
764 return NULL;
765
766 if (p != old_p) {
767 list_for_each_entry_rcu(list_p, &old_p->list, list)
768 if (list_p == p)
769 /* kprobe p is a valid probe */
770 goto valid;
771 return NULL;
772 }
773valid:
774 return old_p;
775}
776
777/* 1218/*
778 * Unregister a kprobe without a scheduler synchronization. 1219 * Unregister a kprobe without a scheduler synchronization.
779 */ 1220 */
@@ -786,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
786 return -EINVAL; 1227 return -EINVAL;
787 1228
788 if (old_p == p || 1229 if (old_p == p ||
789 (old_p->pre_handler == aggr_pre_handler && 1230 (kprobe_aggrprobe(old_p) &&
790 list_is_singular(&old_p->list))) { 1231 list_is_singular(&old_p->list))) {
791 /* 1232 /*
792 * Only probe on the hash list. Disarm only if kprobes are 1233 * Only probe on the hash list. Disarm only if kprobes are
@@ -794,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
794 * already have been removed. We save on flushing icache. 1235 * already have been removed. We save on flushing icache.
795 */ 1236 */
796 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1237 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
797 disarm_kprobe(p); 1238 disarm_kprobe(old_p);
798 hlist_del_rcu(&old_p->hlist); 1239 hlist_del_rcu(&old_p->hlist);
799 } else { 1240 } else {
800 if (p->break_handler && !kprobe_gone(p)) 1241 if (p->break_handler && !kprobe_gone(p))
@@ -810,8 +1251,13 @@ noclean:
810 list_del_rcu(&p->list); 1251 list_del_rcu(&p->list);
811 if (!kprobe_disabled(old_p)) { 1252 if (!kprobe_disabled(old_p)) {
812 try_to_disable_aggr_kprobe(old_p); 1253 try_to_disable_aggr_kprobe(old_p);
813 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1254 if (!kprobes_all_disarmed) {
814 disarm_kprobe(old_p); 1255 if (kprobe_disabled(old_p))
1256 disarm_kprobe(old_p);
1257 else
1258 /* Try to optimize this probe again */
1259 optimize_kprobe(old_p);
1260 }
815 } 1261 }
816 } 1262 }
817 return 0; 1263 return 0;
@@ -828,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
828 old_p = list_entry(p->list.next, struct kprobe, list); 1274 old_p = list_entry(p->list.next, struct kprobe, list);
829 list_del(&p->list); 1275 list_del(&p->list);
830 arch_remove_kprobe(old_p); 1276 arch_remove_kprobe(old_p);
831 kfree(old_p); 1277 free_aggr_kprobe(old_p);
832 } 1278 }
833} 1279}
834 1280
@@ -1014,9 +1460,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1014 /* Pre-allocate memory for max kretprobe instances */ 1460 /* Pre-allocate memory for max kretprobe instances */
1015 if (rp->maxactive <= 0) { 1461 if (rp->maxactive <= 0) {
1016#ifdef CONFIG_PREEMPT 1462#ifdef CONFIG_PREEMPT
1017 rp->maxactive = max(10, 2 * NR_CPUS); 1463 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
1018#else 1464#else
1019 rp->maxactive = NR_CPUS; 1465 rp->maxactive = num_possible_cpus();
1020#endif 1466#endif
1021 } 1467 }
1022 spin_lock_init(&rp->lock); 1468 spin_lock_init(&rp->lock);
@@ -1124,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1124 struct kprobe *kp; 1570 struct kprobe *kp;
1125 1571
1126 p->flags |= KPROBE_FLAG_GONE; 1572 p->flags |= KPROBE_FLAG_GONE;
1127 if (p->pre_handler == aggr_pre_handler) { 1573 if (kprobe_aggrprobe(p)) {
1128 /* 1574 /*
1129 * If this is an aggr_kprobe, we have to list all the 1575 * If this is an aggr_kprobe, we have to list all the
1130 * chained probes and mark them GONE. 1576 * chained probes and mark them GONE.
@@ -1133,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1133 kp->flags |= KPROBE_FLAG_GONE; 1579 kp->flags |= KPROBE_FLAG_GONE;
1134 p->post_handler = NULL; 1580 p->post_handler = NULL;
1135 p->break_handler = NULL; 1581 p->break_handler = NULL;
1582 kill_optimized_kprobe(p);
1136 } 1583 }
1137 /* 1584 /*
1138 * Here, we can remove insn_slot safely, because no thread calls 1585 * Here, we can remove insn_slot safely, because no thread calls
@@ -1141,6 +1588,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1141 arch_remove_kprobe(p); 1588 arch_remove_kprobe(p);
1142} 1589}
1143 1590
1591void __kprobes dump_kprobe(struct kprobe *kp)
1592{
1593 printk(KERN_WARNING "Dumping kprobe:\n");
1594 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
1595 kp->symbol_name, kp->addr, kp->offset);
1596}
1597
1144/* Module notifier call back, checking kprobes on the module */ 1598/* Module notifier call back, checking kprobes on the module */
1145static int __kprobes kprobes_module_callback(struct notifier_block *nb, 1599static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1146 unsigned long val, void *data) 1600 unsigned long val, void *data)
@@ -1235,6 +1689,15 @@ static int __init init_kprobes(void)
1235 } 1689 }
1236 } 1690 }
1237 1691
1692#if defined(CONFIG_OPTPROBES)
1693#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1694 /* Init kprobe_optinsn_slots */
1695 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1696#endif
1697 /* By default, kprobes can be optimized */
1698 kprobes_allow_optimization = true;
1699#endif
1700
1238 /* By default, kprobes are armed */ 1701 /* By default, kprobes are armed */
1239 kprobes_all_disarmed = false; 1702 kprobes_all_disarmed = false;
1240 1703
@@ -1253,7 +1716,7 @@ static int __init init_kprobes(void)
1253 1716
1254#ifdef CONFIG_DEBUG_FS 1717#ifdef CONFIG_DEBUG_FS
1255static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1718static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1256 const char *sym, int offset,char *modname) 1719 const char *sym, int offset, char *modname, struct kprobe *pp)
1257{ 1720{
1258 char *kprobe_type; 1721 char *kprobe_type;
1259 1722
@@ -1263,19 +1726,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1263 kprobe_type = "j"; 1726 kprobe_type = "j";
1264 else 1727 else
1265 kprobe_type = "k"; 1728 kprobe_type = "k";
1729
1266 if (sym) 1730 if (sym)
1267 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1731 seq_printf(pi, "%p %s %s+0x%x %s ",
1268 p->addr, kprobe_type, sym, offset, 1732 p->addr, kprobe_type, sym, offset,
1269 (modname ? modname : " "), 1733 (modname ? modname : " "));
1270 (kprobe_gone(p) ? "[GONE]" : ""),
1271 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1272 "[DISABLED]" : ""));
1273 else 1734 else
1274 seq_printf(pi, "%p %s %p %s%s\n", 1735 seq_printf(pi, "%p %s %p ",
1275 p->addr, kprobe_type, p->addr, 1736 p->addr, kprobe_type, p->addr);
1276 (kprobe_gone(p) ? "[GONE]" : ""), 1737
1277 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1738 if (!pp)
1278 "[DISABLED]" : "")); 1739 pp = p;
1740 seq_printf(pi, "%s%s%s\n",
1741 (kprobe_gone(p) ? "[GONE]" : ""),
1742 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1743 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1279} 1744}
1280 1745
1281static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1746static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1311,11 +1776,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1311 hlist_for_each_entry_rcu(p, node, head, hlist) { 1776 hlist_for_each_entry_rcu(p, node, head, hlist) {
1312 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1777 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1313 &offset, &modname, namebuf); 1778 &offset, &modname, namebuf);
1314 if (p->pre_handler == aggr_pre_handler) { 1779 if (kprobe_aggrprobe(p)) {
1315 list_for_each_entry_rcu(kp, &p->list, list) 1780 list_for_each_entry_rcu(kp, &p->list, list)
1316 report_probe(pi, kp, sym, offset, modname); 1781 report_probe(pi, kp, sym, offset, modname, p);
1317 } else 1782 } else
1318 report_probe(pi, p, sym, offset, modname); 1783 report_probe(pi, p, sym, offset, modname, NULL);
1319 } 1784 }
1320 preempt_enable(); 1785 preempt_enable();
1321 return 0; 1786 return 0;
@@ -1393,12 +1858,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
1393 goto out; 1858 goto out;
1394 } 1859 }
1395 1860
1396 if (!kprobes_all_disarmed && kprobe_disabled(p))
1397 arm_kprobe(p);
1398
1399 p->flags &= ~KPROBE_FLAG_DISABLED;
1400 if (p != kp) 1861 if (p != kp)
1401 kp->flags &= ~KPROBE_FLAG_DISABLED; 1862 kp->flags &= ~KPROBE_FLAG_DISABLED;
1863
1864 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1865 p->flags &= ~KPROBE_FLAG_DISABLED;
1866 arm_kprobe(p);
1867 }
1402out: 1868out:
1403 mutex_unlock(&kprobe_mutex); 1869 mutex_unlock(&kprobe_mutex);
1404 return ret; 1870 return ret;
@@ -1418,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void)
1418 if (!kprobes_all_disarmed) 1884 if (!kprobes_all_disarmed)
1419 goto already_enabled; 1885 goto already_enabled;
1420 1886
1887 /* Arming kprobes doesn't optimize kprobe itself */
1421 mutex_lock(&text_mutex); 1888 mutex_lock(&text_mutex);
1422 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1889 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1423 head = &kprobe_table[i]; 1890 head = &kprobe_table[i];
1424 hlist_for_each_entry_rcu(p, node, head, hlist) 1891 hlist_for_each_entry_rcu(p, node, head, hlist)
1425 if (!kprobe_disabled(p)) 1892 if (!kprobe_disabled(p))
1426 arch_arm_kprobe(p); 1893 __arm_kprobe(p);
1427 } 1894 }
1428 mutex_unlock(&text_mutex); 1895 mutex_unlock(&text_mutex);
1429 1896
@@ -1450,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void)
1450 1917
1451 kprobes_all_disarmed = true; 1918 kprobes_all_disarmed = true;
1452 printk(KERN_INFO "Kprobes globally disabled\n"); 1919 printk(KERN_INFO "Kprobes globally disabled\n");
1920
1921 /*
1922 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1923 * because disarming may also unoptimize kprobes.
1924 */
1925 get_online_cpus();
1453 mutex_lock(&text_mutex); 1926 mutex_lock(&text_mutex);
1454 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1927 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1455 head = &kprobe_table[i]; 1928 head = &kprobe_table[i];
1456 hlist_for_each_entry_rcu(p, node, head, hlist) { 1929 hlist_for_each_entry_rcu(p, node, head, hlist) {
1457 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1930 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1458 arch_disarm_kprobe(p); 1931 __disarm_kprobe(p);
1459 } 1932 }
1460 } 1933 }
1461 1934
1462 mutex_unlock(&text_mutex); 1935 mutex_unlock(&text_mutex);
1936 put_online_cpus();
1463 mutex_unlock(&kprobe_mutex); 1937 mutex_unlock(&kprobe_mutex);
1464 /* Allow all currently running kprobes to complete */ 1938 /* Allow all currently running kprobes to complete */
1465 synchronize_sched(); 1939 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 528dd78e7e7e..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
33} 33}
34KERNEL_ATTR_RO(uevent_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
35 35
36/* uevent helper program, used during early boo */ 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct kobject *kobj, 37static ssize_t uevent_helper_show(struct kobject *kobj,
38 struct kobj_attribute *attr, char *buf) 38 struct kobj_attribute *attr, char *buf)
39{ 39{
@@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
100} 100}
101KERNEL_ATTR_RO(kexec_crash_loaded); 101KERNEL_ATTR_RO(kexec_crash_loaded);
102 102
103static ssize_t kexec_crash_size_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf)
105{
106 return sprintf(buf, "%zu\n", crash_get_memory_size());
107}
108static ssize_t kexec_crash_size_store(struct kobject *kobj,
109 struct kobj_attribute *attr,
110 const char *buf, size_t count)
111{
112 unsigned long cnt;
113 int ret;
114
115 if (strict_strtoul(buf, 0, &cnt))
116 return -EINVAL;
117
118 ret = crash_shrink_memory(cnt);
119 return ret < 0 ? ret : count;
120}
121KERNEL_ATTR_RW(kexec_crash_size);
122
103static ssize_t vmcoreinfo_show(struct kobject *kobj, 123static ssize_t vmcoreinfo_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf) 124 struct kobj_attribute *attr, char *buf)
105{ 125{
@@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = {
147#ifdef CONFIG_KEXEC 167#ifdef CONFIG_KEXEC
148 &kexec_loaded_attr.attr, 168 &kexec_loaded_attr.attr,
149 &kexec_crash_loaded_attr.attr, 169 &kexec_crash_loaded_attr.attr,
170 &kexec_crash_size_attr.attr,
150 &vmcoreinfo_attr.attr, 171 &vmcoreinfo_attr.attr,
151#endif 172#endif
152 NULL 173 NULL
@@ -176,16 +197,8 @@ static int __init ksysfs_init(void)
176 goto group_exit; 197 goto group_exit;
177 } 198 }
178 199
179 /* create the /sys/kernel/uids/ directory */
180 error = uids_sysfs_init();
181 if (error)
182 goto notes_exit;
183
184 return 0; 200 return 0;
185 201
186notes_exit:
187 if (notes_size > 0)
188 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
189group_exit: 202group_exit:
190 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
191kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ab7ae57773e1..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
@@ -150,6 +150,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
150EXPORT_SYMBOL(kthread_create); 150EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @p: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @k to run on.
156 *
157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()).
160 */
161void kthread_bind(struct task_struct *p, unsigned int cpu)
162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1);
166 return;
167 }
168
169 p->cpus_allowed = cpumask_of_cpu(cpu);
170 p->rt.nr_cpus_allowed = 1;
171 p->flags |= PF_THREAD_BOUND;
172}
173EXPORT_SYMBOL(kthread_bind);
174
175/**
153 * kthread_stop - stop a thread created by kthread_create(). 176 * kthread_stop - stop a thread created by kthread_create().
154 * @k: thread created by kthread_create(). 177 * @k: thread created by kthread_create().
155 * 178 *
@@ -196,7 +219,7 @@ int kthreadd(void *unused)
196 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
197 ignore_signals(tsk); 220 ignore_signals(tsk);
198 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
199 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_states[N_HIGH_MEMORY]);
200 223
201 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
202 225
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ca07c5c0c914..877fb306d415 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -56,7 +56,6 @@
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/slab.h>
60#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
61 60
62static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_SPINLOCK(latency_lock);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af56723c096..2594e1ce41cb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,13 +43,14 @@
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h>
46 47
47#include <asm/sections.h> 48#include <asm/sections.h>
48 49
49#include "lockdep_internals.h" 50#include "lockdep_internals.h"
50 51
51#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
52#include <trace/events/lockdep.h> 53#include <trace/events/lock.h>
53 54
54#ifdef CONFIG_PROVE_LOCKING 55#ifdef CONFIG_PROVE_LOCKING
55int prove_locking = 1; 56int prove_locking = 1;
@@ -73,11 +74,11 @@ module_param(lock_stat, int, 0644);
73 * to use a raw spinlock - we really dont want the spinlock 74 * to use a raw spinlock - we really dont want the spinlock
74 * code to recurse back into the lockdep code... 75 * code to recurse back into the lockdep code...
75 */ 76 */
76static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 77static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
77 78
78static int graph_lock(void) 79static int graph_lock(void)
79{ 80{
80 __raw_spin_lock(&lockdep_lock); 81 arch_spin_lock(&lockdep_lock);
81 /* 82 /*
82 * Make sure that if another CPU detected a bug while 83 * Make sure that if another CPU detected a bug while
83 * walking the graph we dont change it (while the other 84 * walking the graph we dont change it (while the other
@@ -85,7 +86,7 @@ static int graph_lock(void)
85 * dropped already) 86 * dropped already)
86 */ 87 */
87 if (!debug_locks) { 88 if (!debug_locks) {
88 __raw_spin_unlock(&lockdep_lock); 89 arch_spin_unlock(&lockdep_lock);
89 return 0; 90 return 0;
90 } 91 }
91 /* prevent any recursions within lockdep from causing deadlocks */ 92 /* prevent any recursions within lockdep from causing deadlocks */
@@ -95,11 +96,11 @@ static int graph_lock(void)
95 96
96static inline int graph_unlock(void) 97static inline int graph_unlock(void)
97{ 98{
98 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 99 if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
99 return DEBUG_LOCKS_WARN_ON(1); 100 return DEBUG_LOCKS_WARN_ON(1);
100 101
101 current->lockdep_recursion--; 102 current->lockdep_recursion--;
102 __raw_spin_unlock(&lockdep_lock); 103 arch_spin_unlock(&lockdep_lock);
103 return 0; 104 return 0;
104} 105}
105 106
@@ -111,7 +112,7 @@ static inline int debug_locks_off_graph_unlock(void)
111{ 112{
112 int ret = debug_locks_off(); 113 int ret = debug_locks_off();
113 114
114 __raw_spin_unlock(&lockdep_lock); 115 arch_spin_unlock(&lockdep_lock);
115 116
116 return ret; 117 return ret;
117} 118}
@@ -140,7 +141,8 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
140} 141}
141 142
142#ifdef CONFIG_LOCK_STAT 143#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 144static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
145 cpu_lock_stats);
144 146
145static inline u64 lockstat_clock(void) 147static inline u64 lockstat_clock(void)
146{ 148{
@@ -168,7 +170,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
168 if (time > lt->max) 170 if (time > lt->max)
169 lt->max = time; 171 lt->max = time;
170 172
171 if (time < lt->min || !lt->min) 173 if (time < lt->min || !lt->nr)
172 lt->min = time; 174 lt->min = time;
173 175
174 lt->total += time; 176 lt->total += time;
@@ -177,8 +179,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
177 179
178static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) 180static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
179{ 181{
180 dst->min += src->min; 182 if (!src->nr)
181 dst->max += src->max; 183 return;
184
185 if (src->max > dst->max)
186 dst->max = src->max;
187
188 if (src->min < dst->min || !dst->nr)
189 dst->min = src->min;
190
182 dst->total += src->total; 191 dst->total += src->total;
183 dst->nr += src->nr; 192 dst->nr += src->nr;
184} 193}
@@ -191,7 +200,7 @@ struct lock_class_stats lock_stats(struct lock_class *class)
191 memset(&stats, 0, sizeof(struct lock_class_stats)); 200 memset(&stats, 0, sizeof(struct lock_class_stats));
192 for_each_possible_cpu(cpu) { 201 for_each_possible_cpu(cpu) {
193 struct lock_class_stats *pcs = 202 struct lock_class_stats *pcs =
194 &per_cpu(lock_stats, cpu)[class - lock_classes]; 203 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
195 204
196 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 205 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
197 stats.contention_point[i] += pcs->contention_point[i]; 206 stats.contention_point[i] += pcs->contention_point[i];
@@ -218,7 +227,7 @@ void clear_lock_stats(struct lock_class *class)
218 227
219 for_each_possible_cpu(cpu) { 228 for_each_possible_cpu(cpu) {
220 struct lock_class_stats *cpu_stats = 229 struct lock_class_stats *cpu_stats =
221 &per_cpu(lock_stats, cpu)[class - lock_classes]; 230 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
222 231
223 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 232 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
224 } 233 }
@@ -228,12 +237,12 @@ void clear_lock_stats(struct lock_class *class)
228 237
229static struct lock_class_stats *get_lock_stats(struct lock_class *class) 238static struct lock_class_stats *get_lock_stats(struct lock_class *class)
230{ 239{
231 return &get_cpu_var(lock_stats)[class - lock_classes]; 240 return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
232} 241}
233 242
234static void put_lock_stats(struct lock_class_stats *stats) 243static void put_lock_stats(struct lock_class_stats *stats)
235{ 244{
236 put_cpu_var(lock_stats); 245 put_cpu_var(cpu_lock_stats);
237} 246}
238 247
239static void lock_release_holdtime(struct held_lock *hlock) 248static void lock_release_holdtime(struct held_lock *hlock)
@@ -379,7 +388,8 @@ static int save_trace(struct stack_trace *trace)
379 * complete trace that maxes out the entries provided will be reported 388 * complete trace that maxes out the entries provided will be reported
380 * as incomplete, friggin useless </rant> 389 * as incomplete, friggin useless </rant>
381 */ 390 */
382 if (trace->entries[trace->nr_entries-1] == ULONG_MAX) 391 if (trace->nr_entries != 0 &&
392 trace->entries[trace->nr_entries-1] == ULONG_MAX)
383 trace->nr_entries--; 393 trace->nr_entries--;
384 394
385 trace->max_entries = trace->nr_entries; 395 trace->max_entries = trace->nr_entries;
@@ -573,9 +583,6 @@ static int static_obj(void *obj)
573 unsigned long start = (unsigned long) &_stext, 583 unsigned long start = (unsigned long) &_stext,
574 end = (unsigned long) &_end, 584 end = (unsigned long) &_end,
575 addr = (unsigned long) obj; 585 addr = (unsigned long) obj;
576#ifdef CONFIG_SMP
577 int i;
578#endif
579 586
580 /* 587 /*
581 * static variable? 588 * static variable?
@@ -586,24 +593,16 @@ static int static_obj(void *obj)
586 if (arch_is_kernel_data(addr)) 593 if (arch_is_kernel_data(addr))
587 return 1; 594 return 1;
588 595
589#ifdef CONFIG_SMP
590 /* 596 /*
591 * percpu var? 597 * in-kernel percpu var?
592 */ 598 */
593 for_each_possible_cpu(i) { 599 if (is_kernel_percpu_address(addr))
594 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 600 return 1;
595 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
596 + per_cpu_offset(i);
597
598 if ((addr >= start) && (addr < end))
599 return 1;
600 }
601#endif
602 601
603 /* 602 /*
604 * module var? 603 * module static or percpu var?
605 */ 604 */
606 return is_module_address(addr); 605 return is_module_address(addr) || is_module_percpu_address(addr);
607} 606}
608 607
609/* 608/*
@@ -1161,9 +1160,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1161 this.class = class; 1160 this.class = class;
1162 1161
1163 local_irq_save(flags); 1162 local_irq_save(flags);
1164 __raw_spin_lock(&lockdep_lock); 1163 arch_spin_lock(&lockdep_lock);
1165 ret = __lockdep_count_forward_deps(&this); 1164 ret = __lockdep_count_forward_deps(&this);
1166 __raw_spin_unlock(&lockdep_lock); 1165 arch_spin_unlock(&lockdep_lock);
1167 local_irq_restore(flags); 1166 local_irq_restore(flags);
1168 1167
1169 return ret; 1168 return ret;
@@ -1188,9 +1187,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1188 this.class = class; 1187 this.class = class;
1189 1188
1190 local_irq_save(flags); 1189 local_irq_save(flags);
1191 __raw_spin_lock(&lockdep_lock); 1190 arch_spin_lock(&lockdep_lock);
1192 ret = __lockdep_count_backward_deps(&this); 1191 ret = __lockdep_count_backward_deps(&this);
1193 __raw_spin_unlock(&lockdep_lock); 1192 arch_spin_unlock(&lockdep_lock);
1194 local_irq_restore(flags); 1193 local_irq_restore(flags);
1195 1194
1196 return ret; 1195 return ret;
@@ -2138,7 +2137,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2138 return ret; 2137 return ret;
2139 2138
2140 return print_irq_inversion_bug(curr, &root, target_entry, 2139 return print_irq_inversion_bug(curr, &root, target_entry,
2141 this, 1, irqclass); 2140 this, 0, irqclass);
2142} 2141}
2143 2142
2144void print_irqtrace_events(struct task_struct *curr) 2143void print_irqtrace_events(struct task_struct *curr)
@@ -3202,8 +3201,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3202{ 3201{
3203 unsigned long flags; 3202 unsigned long flags;
3204 3203
3205 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3206
3207 if (unlikely(current->lockdep_recursion)) 3204 if (unlikely(current->lockdep_recursion))
3208 return; 3205 return;
3209 3206
@@ -3211,6 +3208,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3211 check_flags(flags); 3208 check_flags(flags);
3212 3209
3213 current->lockdep_recursion = 1; 3210 current->lockdep_recursion = 1;
3211 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3214 __lock_acquire(lock, subclass, trylock, read, check, 3212 __lock_acquire(lock, subclass, trylock, read, check,
3215 irqs_disabled_flags(flags), nest_lock, ip, 0); 3213 irqs_disabled_flags(flags), nest_lock, ip, 0);
3216 current->lockdep_recursion = 0; 3214 current->lockdep_recursion = 0;
@@ -3223,14 +3221,13 @@ void lock_release(struct lockdep_map *lock, int nested,
3223{ 3221{
3224 unsigned long flags; 3222 unsigned long flags;
3225 3223
3226 trace_lock_release(lock, nested, ip);
3227
3228 if (unlikely(current->lockdep_recursion)) 3224 if (unlikely(current->lockdep_recursion))
3229 return; 3225 return;
3230 3226
3231 raw_local_irq_save(flags); 3227 raw_local_irq_save(flags);
3232 check_flags(flags); 3228 check_flags(flags);
3233 current->lockdep_recursion = 1; 3229 current->lockdep_recursion = 1;
3230 trace_lock_release(lock, nested, ip);
3234 __lock_release(lock, nested, ip); 3231 __lock_release(lock, nested, ip);
3235 current->lockdep_recursion = 0; 3232 current->lockdep_recursion = 0;
3236 raw_local_irq_restore(flags); 3233 raw_local_irq_restore(flags);
@@ -3404,8 +3401,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3404{ 3401{
3405 unsigned long flags; 3402 unsigned long flags;
3406 3403
3407 trace_lock_contended(lock, ip);
3408
3409 if (unlikely(!lock_stat)) 3404 if (unlikely(!lock_stat))
3410 return; 3405 return;
3411 3406
@@ -3415,6 +3410,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3415 raw_local_irq_save(flags); 3410 raw_local_irq_save(flags);
3416 check_flags(flags); 3411 check_flags(flags);
3417 current->lockdep_recursion = 1; 3412 current->lockdep_recursion = 1;
3413 trace_lock_contended(lock, ip);
3418 __lock_contended(lock, ip); 3414 __lock_contended(lock, ip);
3419 current->lockdep_recursion = 0; 3415 current->lockdep_recursion = 0;
3420 raw_local_irq_restore(flags); 3416 raw_local_irq_restore(flags);
@@ -3800,3 +3796,22 @@ void lockdep_sys_exit(void)
3800 lockdep_print_held_locks(curr); 3796 lockdep_print_held_locks(curr);
3801 } 3797 }
3802} 3798}
3799
3800void lockdep_rcu_dereference(const char *file, const int line)
3801{
3802 struct task_struct *curr = current;
3803
3804 if (!debug_locks_off())
3805 return;
3806 printk("\n===================================================\n");
3807 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3808 printk( "---------------------------------------------------\n");
3809 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3810 file, line);
3811 printk("\nother info that might help us debug this:\n\n");
3812 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
3813 lockdep_print_held_locks(curr);
3814 printk("\nstack backtrace:\n");
3815 dump_stack();
3816}
3817EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/module.c b/kernel/module.c
index 5842a71cf052..1016b75b026a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,204 +370,98 @@ EXPORT_SYMBOL_GPL(find_module);
370 370
371#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
372 372
373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA 373static inline void __percpu *mod_percpu(struct module *mod)
374
375static void *percpu_modalloc(unsigned long size, unsigned long align,
376 const char *name)
377{ 374{
378 void *ptr; 375 return mod->percpu;
376}
379 377
378static int percpu_modalloc(struct module *mod,
379 unsigned long size, unsigned long align)
380{
380 if (align > PAGE_SIZE) { 381 if (align > PAGE_SIZE) {
381 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 382 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
382 name, align, PAGE_SIZE); 383 mod->name, align, PAGE_SIZE);
383 align = PAGE_SIZE; 384 align = PAGE_SIZE;
384 } 385 }
385 386
386 ptr = __alloc_reserved_percpu(size, align); 387 mod->percpu = __alloc_reserved_percpu(size, align);
387 if (!ptr) 388 if (!mod->percpu) {
388 printk(KERN_WARNING 389 printk(KERN_WARNING
389 "Could not allocate %lu bytes percpu data\n", size); 390 "Could not allocate %lu bytes percpu data\n", size);
390 return ptr; 391 return -ENOMEM;
391}
392
393static void percpu_modfree(void *freeme)
394{
395 free_percpu(freeme);
396}
397
398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
399
400/* Number of blocks used and allocated. */
401static unsigned int pcpu_num_used, pcpu_num_allocated;
402/* Size of each block. -ve means used. */
403static int *pcpu_size;
404
405static int split_block(unsigned int i, unsigned short size)
406{
407 /* Reallocation required? */
408 if (pcpu_num_used + 1 > pcpu_num_allocated) {
409 int *new;
410
411 new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
412 GFP_KERNEL);
413 if (!new)
414 return 0;
415
416 pcpu_num_allocated *= 2;
417 pcpu_size = new;
418 } 392 }
419 393 mod->percpu_size = size;
420 /* Insert a new subblock */ 394 return 0;
421 memmove(&pcpu_size[i+1], &pcpu_size[i],
422 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
423 pcpu_num_used++;
424
425 pcpu_size[i+1] -= size;
426 pcpu_size[i] = size;
427 return 1;
428} 395}
429 396
430static inline unsigned int block_size(int val) 397static void percpu_modfree(struct module *mod)
431{ 398{
432 if (val < 0) 399 free_percpu(mod->percpu);
433 return -val;
434 return val;
435} 400}
436 401
437static void *percpu_modalloc(unsigned long size, unsigned long align, 402static unsigned int find_pcpusec(Elf_Ehdr *hdr,
438 const char *name) 403 Elf_Shdr *sechdrs,
404 const char *secstrings)
439{ 405{
440 unsigned long extra; 406 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
441 unsigned int i;
442 void *ptr;
443 int cpu;
444
445 if (align > PAGE_SIZE) {
446 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
447 name, align, PAGE_SIZE);
448 align = PAGE_SIZE;
449 }
450
451 ptr = __per_cpu_start;
452 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
453 /* Extra for alignment requirement. */
454 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
455 BUG_ON(i == 0 && extra != 0);
456
457 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
458 continue;
459
460 /* Transfer extra to previous block. */
461 if (pcpu_size[i-1] < 0)
462 pcpu_size[i-1] -= extra;
463 else
464 pcpu_size[i-1] += extra;
465 pcpu_size[i] -= extra;
466 ptr += extra;
467
468 /* Split block if warranted */
469 if (pcpu_size[i] - size > sizeof(unsigned long))
470 if (!split_block(i, size))
471 return NULL;
472
473 /* add the per-cpu scanning areas */
474 for_each_possible_cpu(cpu)
475 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
476 GFP_KERNEL);
477
478 /* Mark allocated */
479 pcpu_size[i] = -pcpu_size[i];
480 return ptr;
481 }
482
483 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
484 size);
485 return NULL;
486} 407}
487 408
488static void percpu_modfree(void *freeme) 409static void percpu_modcopy(struct module *mod,
410 const void *from, unsigned long size)
489{ 411{
490 unsigned int i;
491 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
492 int cpu; 412 int cpu;
493 413
494 /* First entry is core kernel percpu data. */
495 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
496 if (ptr == freeme) {
497 pcpu_size[i] = -pcpu_size[i];
498 goto free;
499 }
500 }
501 BUG();
502
503 free:
504 /* remove the per-cpu scanning areas */
505 for_each_possible_cpu(cpu) 414 for_each_possible_cpu(cpu)
506 kmemleak_free(freeme + per_cpu_offset(cpu)); 415 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
507
508 /* Merge with previous? */
509 if (pcpu_size[i-1] >= 0) {
510 pcpu_size[i-1] += pcpu_size[i];
511 pcpu_num_used--;
512 memmove(&pcpu_size[i], &pcpu_size[i+1],
513 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
514 i--;
515 }
516 /* Merge with next? */
517 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
518 pcpu_size[i] += pcpu_size[i+1];
519 pcpu_num_used--;
520 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
521 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
522 }
523} 416}
524 417
525static int percpu_modinit(void) 418/**
419 * is_module_percpu_address - test whether address is from module static percpu
420 * @addr: address to test
421 *
422 * Test whether @addr belongs to module static percpu area.
423 *
424 * RETURNS:
425 * %true if @addr is from module static percpu area
426 */
427bool is_module_percpu_address(unsigned long addr)
526{ 428{
527 pcpu_num_used = 2; 429 struct module *mod;
528 pcpu_num_allocated = 2; 430 unsigned int cpu;
529 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
530 GFP_KERNEL);
531 /* Static in-kernel percpu data (used). */
532 pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
533 /* Free room. */
534 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
535 if (pcpu_size[1] < 0) {
536 printk(KERN_ERR "No per-cpu room for modules.\n");
537 pcpu_num_used = 1;
538 }
539
540 return 0;
541}
542__initcall(percpu_modinit);
543 431
544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ 432 preempt_disable();
545 433
546static unsigned int find_pcpusec(Elf_Ehdr *hdr, 434 list_for_each_entry_rcu(mod, &modules, list) {
547 Elf_Shdr *sechdrs, 435 if (!mod->percpu_size)
548 const char *secstrings) 436 continue;
549{ 437 for_each_possible_cpu(cpu) {
550 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 438 void *start = per_cpu_ptr(mod->percpu, cpu);
551}
552 439
553static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) 440 if ((void *)addr >= start &&
554{ 441 (void *)addr < start + mod->percpu_size) {
555 int cpu; 442 preempt_enable();
443 return true;
444 }
445 }
446 }
556 447
557 for_each_possible_cpu(cpu) 448 preempt_enable();
558 memcpy(pcpudest + per_cpu_offset(cpu), from, size); 449 return false;
559} 450}
560 451
561#else /* ... !CONFIG_SMP */ 452#else /* ... !CONFIG_SMP */
562 453
563static inline void *percpu_modalloc(unsigned long size, unsigned long align, 454static inline void __percpu *mod_percpu(struct module *mod)
564 const char *name)
565{ 455{
566 return NULL; 456 return NULL;
567} 457}
568static inline void percpu_modfree(void *pcpuptr) 458static inline int percpu_modalloc(struct module *mod,
459 unsigned long size, unsigned long align)
460{
461 return -ENOMEM;
462}
463static inline void percpu_modfree(struct module *mod)
569{ 464{
570 BUG();
571} 465}
572static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 466static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
573 Elf_Shdr *sechdrs, 467 Elf_Shdr *sechdrs,
@@ -575,12 +469,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
575{ 469{
576 return 0; 470 return 0;
577} 471}
578static inline void percpu_modcopy(void *pcpudst, const void *src, 472static inline void percpu_modcopy(struct module *mod,
579 unsigned long size) 473 const void *from, unsigned long size)
580{ 474{
581 /* pcpusec should be 0, and size of that section should be 0. */ 475 /* pcpusec should be 0, and size of that section should be 0. */
582 BUG_ON(size != 0); 476 BUG_ON(size != 0);
583} 477}
478bool is_module_percpu_address(unsigned long addr)
479{
480 return false;
481}
584 482
585#endif /* CONFIG_SMP */ 483#endif /* CONFIG_SMP */
586 484
@@ -623,10 +521,13 @@ static void module_unload_init(struct module *mod)
623 int cpu; 521 int cpu;
624 522
625 INIT_LIST_HEAD(&mod->modules_which_use_me); 523 INIT_LIST_HEAD(&mod->modules_which_use_me);
626 for_each_possible_cpu(cpu) 524 for_each_possible_cpu(cpu) {
627 local_set(__module_ref_addr(mod, cpu), 0); 525 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
526 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
527 }
528
628 /* Hold reference count during initialization. */ 529 /* Hold reference count during initialization. */
629 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 530 __this_cpu_write(mod->refptr->incs, 1);
630 /* Backwards compatibility macros put refcount during init. */ 531 /* Backwards compatibility macros put refcount during init. */
631 mod->waiter = current; 532 mod->waiter = current;
632} 533}
@@ -765,12 +666,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
765 666
766unsigned int module_refcount(struct module *mod) 667unsigned int module_refcount(struct module *mod)
767{ 668{
768 unsigned int total = 0; 669 unsigned int incs = 0, decs = 0;
769 int cpu; 670 int cpu;
770 671
771 for_each_possible_cpu(cpu) 672 for_each_possible_cpu(cpu)
772 total += local_read(__module_ref_addr(mod, cpu)); 673 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
773 return total; 674 /*
675 * ensure the incs are added up after the decs.
676 * module_put ensures incs are visible before decs with smp_wmb.
677 *
678 * This 2-count scheme avoids the situation where the refcount
679 * for CPU0 is read, then CPU0 increments the module refcount,
680 * then CPU1 drops that refcount, then the refcount for CPU1 is
681 * read. We would record a decrement but not its corresponding
682 * increment so we would see a low count (disaster).
683 *
684 * Rare situation? But module_refcount can be preempted, and we
685 * might be tallying up 4096+ CPUs. So it is not impossible.
686 */
687 smp_rmb();
688 for_each_possible_cpu(cpu)
689 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
690 return incs - decs;
774} 691}
775EXPORT_SYMBOL(module_refcount); 692EXPORT_SYMBOL(module_refcount);
776 693
@@ -946,14 +863,16 @@ static struct module_attribute refcnt = {
946void module_put(struct module *module) 863void module_put(struct module *module)
947{ 864{
948 if (module) { 865 if (module) {
949 unsigned int cpu = get_cpu(); 866 preempt_disable();
950 local_dec(__module_ref_addr(module, cpu)); 867 smp_wmb(); /* see comment in module_refcount */
868 __this_cpu_inc(module->refptr->decs);
869
951 trace_module_put(module, _RET_IP_, 870 trace_module_put(module, _RET_IP_,
952 local_read(__module_ref_addr(module, cpu))); 871 __this_cpu_read(module->refptr->decs));
953 /* Maybe they're waiting for us to drop reference? */ 872 /* Maybe they're waiting for us to drop reference? */
954 if (unlikely(!module_is_live(module))) 873 if (unlikely(!module_is_live(module)))
955 wake_up_process(module->waiter); 874 wake_up_process(module->waiter);
956 put_cpu(); 875 preempt_enable();
957 } 876 }
958} 877}
959EXPORT_SYMBOL(module_put); 878EXPORT_SYMBOL(module_put);
@@ -1030,11 +949,23 @@ static int try_to_force_load(struct module *mod, const char *reason)
1030} 949}
1031 950
1032#ifdef CONFIG_MODVERSIONS 951#ifdef CONFIG_MODVERSIONS
/*
 * Undo the relocation that some architectures apply to kernel kcrctab
 * entries.  The adjustment is made only when ARCH_RELOCATES_KCRCTAB is
 * defined and the CRC has no owning module (crc_owner == NULL); in
 * every other case the CRC is returned unchanged.
 */
static unsigned long maybe_relocated(unsigned long crc,
				     const struct module *crc_owner)
{
#ifdef ARCH_RELOCATES_KCRCTAB
	if (!crc_owner)
		crc -= (unsigned long)reloc_start;
#endif
	return crc;
}
962
1033static int check_version(Elf_Shdr *sechdrs, 963static int check_version(Elf_Shdr *sechdrs,
1034 unsigned int versindex, 964 unsigned int versindex,
1035 const char *symname, 965 const char *symname,
1036 struct module *mod, 966 struct module *mod,
1037 const unsigned long *crc) 967 const unsigned long *crc,
968 const struct module *crc_owner)
1038{ 969{
1039 unsigned int i, num_versions; 970 unsigned int i, num_versions;
1040 struct modversion_info *versions; 971 struct modversion_info *versions;
@@ -1055,10 +986,10 @@ static int check_version(Elf_Shdr *sechdrs,
1055 if (strcmp(versions[i].name, symname) != 0) 986 if (strcmp(versions[i].name, symname) != 0)
1056 continue; 987 continue;
1057 988
1058 if (versions[i].crc == *crc) 989 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 990 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 991 DEBUGP("Found checksum %lX vs module %lX\n",
1061 *crc, versions[i].crc); 992 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 993 goto bad_version;
1063 } 994 }
1064 995
@@ -1081,7 +1012,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1081 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 1012 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1082 &crc, true, false)) 1013 &crc, true, false))
1083 BUG(); 1014 BUG();
1084 return check_version(sechdrs, versindex, "module_layout", mod, crc); 1015 return check_version(sechdrs, versindex, "module_layout", mod, crc,
1016 NULL);
1085} 1017}
1086 1018
1087/* First part is kernel version, which we ignore if module has crcs. */ 1019/* First part is kernel version, which we ignore if module has crcs. */
@@ -1099,7 +1031,8 @@ static inline int check_version(Elf_Shdr *sechdrs,
1099 unsigned int versindex, 1031 unsigned int versindex,
1100 const char *symname, 1032 const char *symname,
1101 struct module *mod, 1033 struct module *mod,
1102 const unsigned long *crc) 1034 const unsigned long *crc,
1035 const struct module *crc_owner)
1103{ 1036{
1104 return 1; 1037 return 1;
1105} 1038}
@@ -1134,8 +1067,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1134 /* use_module can fail due to OOM, 1067 /* use_module can fail due to OOM,
1135 or module initialization or unloading */ 1068 or module initialization or unloading */
1136 if (sym) { 1069 if (sym) {
1137 if (!check_version(sechdrs, versindex, name, mod, crc) || 1070 if (!check_version(sechdrs, versindex, name, mod, crc, owner)
1138 !use_module(mod, owner)) 1071 || !use_module(mod, owner))
1139 sym = NULL; 1072 sym = NULL;
1140 } 1073 }
1141 return sym; 1074 return sym;
@@ -1146,6 +1079,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1146 * J. Corbet <corbet@lwn.net> 1079 * J. Corbet <corbet@lwn.net>
1147 */ 1080 */
1148#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1081#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1082
1083static inline bool sect_empty(const Elf_Shdr *sect)
1084{
1085 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1086}
1087
1149struct module_sect_attr 1088struct module_sect_attr
1150{ 1089{
1151 struct module_attribute mattr; 1090 struct module_attribute mattr;
@@ -1187,8 +1126,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1187 1126
1188 /* Count loaded sections and allocate structures */ 1127 /* Count loaded sections and allocate structures */
1189 for (i = 0; i < nsect; i++) 1128 for (i = 0; i < nsect; i++)
1190 if (sechdrs[i].sh_flags & SHF_ALLOC 1129 if (!sect_empty(&sechdrs[i]))
1191 && sechdrs[i].sh_size)
1192 nloaded++; 1130 nloaded++;
1193 size[0] = ALIGN(sizeof(*sect_attrs) 1131 size[0] = ALIGN(sizeof(*sect_attrs)
1194 + nloaded * sizeof(sect_attrs->attrs[0]), 1132 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1206,9 +1144,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1206 sattr = &sect_attrs->attrs[0]; 1144 sattr = &sect_attrs->attrs[0];
1207 gattr = &sect_attrs->grp.attrs[0]; 1145 gattr = &sect_attrs->grp.attrs[0];
1208 for (i = 0; i < nsect; i++) { 1146 for (i = 0; i < nsect; i++) {
1209 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1147 if (sect_empty(&sechdrs[i]))
1210 continue;
1211 if (!sechdrs[i].sh_size)
1212 continue; 1148 continue;
1213 sattr->address = sechdrs[i].sh_addr; 1149 sattr->address = sechdrs[i].sh_addr;
1214 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1150 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
@@ -1216,6 +1152,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1216 if (sattr->name == NULL) 1152 if (sattr->name == NULL)
1217 goto out; 1153 goto out;
1218 sect_attrs->nsections++; 1154 sect_attrs->nsections++;
1155 sysfs_attr_init(&sattr->mattr.attr);
1219 sattr->mattr.show = module_sect_show; 1156 sattr->mattr.show = module_sect_show;
1220 sattr->mattr.store = NULL; 1157 sattr->mattr.store = NULL;
1221 sattr->mattr.attr.name = sattr->name; 1158 sattr->mattr.attr.name = sattr->name;
@@ -1292,7 +1229,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1292 /* Count notes sections and allocate structures. */ 1229 /* Count notes sections and allocate structures. */
1293 notes = 0; 1230 notes = 0;
1294 for (i = 0; i < nsect; i++) 1231 for (i = 0; i < nsect; i++)
1295 if ((sechdrs[i].sh_flags & SHF_ALLOC) && 1232 if (!sect_empty(&sechdrs[i]) &&
1296 (sechdrs[i].sh_type == SHT_NOTE)) 1233 (sechdrs[i].sh_type == SHT_NOTE))
1297 ++notes; 1234 ++notes;
1298 1235
@@ -1308,9 +1245,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1308 notes_attrs->notes = notes; 1245 notes_attrs->notes = notes;
1309 nattr = &notes_attrs->attrs[0]; 1246 nattr = &notes_attrs->attrs[0];
1310 for (loaded = i = 0; i < nsect; ++i) { 1247 for (loaded = i = 0; i < nsect; ++i) {
1311 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1248 if (sect_empty(&sechdrs[i]))
1312 continue; 1249 continue;
1313 if (sechdrs[i].sh_type == SHT_NOTE) { 1250 if (sechdrs[i].sh_type == SHT_NOTE) {
1251 sysfs_bin_attr_init(nattr);
1314 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1252 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1315 nattr->attr.mode = S_IRUGO; 1253 nattr->attr.mode = S_IRUGO;
1316 nattr->size = sechdrs[i].sh_size; 1254 nattr->size = sechdrs[i].sh_size;
@@ -1383,6 +1321,7 @@ int module_add_modinfo_attrs(struct module *mod)
1383 if (!attr->test || 1321 if (!attr->test ||
1384 (attr->test && attr->test(mod))) { 1322 (attr->test && attr->test(mod))) {
1385 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1323 memcpy(temp_attr, attr, sizeof(*temp_attr));
1324 sysfs_attr_init(&temp_attr->attr);
1386 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1325 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1387 ++temp_attr; 1326 ++temp_attr;
1388 } 1327 }
@@ -1528,11 +1467,10 @@ static void free_module(struct module *mod)
1528 /* This may be NULL, but that's OK */ 1467 /* This may be NULL, but that's OK */
1529 module_free(mod, mod->module_init); 1468 module_free(mod, mod->module_init);
1530 kfree(mod->args); 1469 kfree(mod->args);
1531 if (mod->percpu) 1470 percpu_modfree(mod);
1532 percpu_modfree(mod->percpu); 1471#if defined(CONFIG_MODULE_UNLOAD)
1533#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
1534 if (mod->refptr) 1472 if (mod->refptr)
1535 percpu_modfree(mod->refptr); 1473 free_percpu(mod->refptr);
1536#endif 1474#endif
1537 /* Free lock-classes: */ 1475 /* Free lock-classes: */
1538 lockdep_free_key_range(mod->module_core, mod->core_size); 1476 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1648,7 +1586,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1648 default: 1586 default:
1649 /* Divert to percpu allocation if a percpu var. */ 1587 /* Divert to percpu allocation if a percpu var. */
1650 if (sym[i].st_shndx == pcpuindex) 1588 if (sym[i].st_shndx == pcpuindex)
1651 secbase = (unsigned long)mod->percpu; 1589 secbase = (unsigned long)mod_percpu(mod);
1652 else 1590 else
1653 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1591 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1654 sym[i].st_value += secbase; 1592 sym[i].st_value += secbase;
@@ -2046,9 +1984,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2046 unsigned int i; 1984 unsigned int i;
2047 1985
2048 /* only scan the sections containing data */ 1986 /* only scan the sections containing data */
2049 kmemleak_scan_area(mod->module_core, (unsigned long)mod - 1987 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2050 (unsigned long)mod->module_core,
2051 sizeof(struct module), GFP_KERNEL);
2052 1988
2053 for (i = 1; i < hdr->e_shnum; i++) { 1989 for (i = 1; i < hdr->e_shnum; i++) {
2054 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1990 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -2057,8 +1993,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2057 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) 1993 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2058 continue; 1994 continue;
2059 1995
2060 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - 1996 kmemleak_scan_area((void *)sechdrs[i].sh_addr,
2061 (unsigned long)mod->module_core,
2062 sechdrs[i].sh_size, GFP_KERNEL); 1997 sechdrs[i].sh_size, GFP_KERNEL);
2063 } 1998 }
2064} 1999}
@@ -2085,7 +2020,7 @@ static noinline struct module *load_module(void __user *umod,
2085 unsigned int modindex, versindex, infoindex, pcpuindex; 2020 unsigned int modindex, versindex, infoindex, pcpuindex;
2086 struct module *mod; 2021 struct module *mod;
2087 long err = 0; 2022 long err = 0;
2088 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2023 void *ptr = NULL; /* Stops spurious gcc warning */
2089 unsigned long symoffs, stroffs, *strmap; 2024 unsigned long symoffs, stroffs, *strmap;
2090 2025
2091 mm_segment_t old_fs; 2026 mm_segment_t old_fs;
@@ -2225,15 +2160,11 @@ static noinline struct module *load_module(void __user *umod,
2225 2160
2226 if (pcpuindex) { 2161 if (pcpuindex) {
2227 /* We have a special allocation for this section. */ 2162 /* We have a special allocation for this section. */
2228 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 2163 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
2229 sechdrs[pcpuindex].sh_addralign, 2164 sechdrs[pcpuindex].sh_addralign);
2230 mod->name); 2165 if (err)
2231 if (!percpu) {
2232 err = -ENOMEM;
2233 goto free_mod; 2166 goto free_mod;
2234 }
2235 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2167 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2236 mod->percpu = percpu;
2237 } 2168 }
2238 2169
2239 /* Determine total sizes, and put offsets in sh_entsize. For now 2170 /* Determine total sizes, and put offsets in sh_entsize. For now
@@ -2298,9 +2229,8 @@ static noinline struct module *load_module(void __user *umod,
2298 mod = (void *)sechdrs[modindex].sh_addr; 2229 mod = (void *)sechdrs[modindex].sh_addr;
2299 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2230 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2300 2231
2301#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2232#if defined(CONFIG_MODULE_UNLOAD)
2302 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2233 mod->refptr = alloc_percpu(struct module_ref);
2303 mod->name);
2304 if (!mod->refptr) { 2234 if (!mod->refptr) {
2305 err = -ENOMEM; 2235 err = -ENOMEM;
2306 goto free_init; 2236 goto free_init;
@@ -2386,6 +2316,12 @@ static noinline struct module *load_module(void __user *umod,
2386 "_ftrace_events", 2316 "_ftrace_events",
2387 sizeof(*mod->trace_events), 2317 sizeof(*mod->trace_events),
2388 &mod->num_trace_events); 2318 &mod->num_trace_events);
2319 /*
2320 * This section contains pointers to allocated objects in the trace
2321 * code and not scanning it leads to false positives.
2322 */
2323 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2324 mod->num_trace_events, GFP_KERNEL);
2389#endif 2325#endif
2390#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2326#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2391 /* sechdrs[0].sh_size is always zero */ 2327 /* sechdrs[0].sh_size is always zero */
@@ -2443,7 +2379,7 @@ static noinline struct module *load_module(void __user *umod,
2443 sort_extable(mod->extable, mod->extable + mod->num_exentries); 2379 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2444 2380
2445 /* Finally, copy percpu area over. */ 2381 /* Finally, copy percpu area over. */
2446 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2382 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2447 sechdrs[pcpuindex].sh_size); 2383 sechdrs[pcpuindex].sh_size);
2448 2384
2449 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, 2385 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
@@ -2526,8 +2462,8 @@ static noinline struct module *load_module(void __user *umod,
2526 kobject_put(&mod->mkobj.kobj); 2462 kobject_put(&mod->mkobj.kobj);
2527 free_unload: 2463 free_unload:
2528 module_unload_free(mod); 2464 module_unload_free(mod);
2529#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2465#if defined(CONFIG_MODULE_UNLOAD)
2530 percpu_modfree(mod->refptr); 2466 free_percpu(mod->refptr);
2531 free_init: 2467 free_init:
2532#endif 2468#endif
2533 module_free(mod, mod->module_init); 2469 module_free(mod, mod->module_init);
@@ -2535,8 +2471,7 @@ static noinline struct module *load_module(void __user *umod,
2535 module_free(mod, mod->module_core); 2471 module_free(mod, mod->module_core);
2536 /* mod will be freed with core. Don't access it beyond this line! */ 2472 /* mod will be freed with core. Don't access it beyond this line! */
2537 free_percpu: 2473 free_percpu:
2538 if (percpu) 2474 percpu_modfree(mod);
2539 percpu_modfree(percpu);
2540 free_mod: 2475 free_mod:
2541 kfree(args); 2476 kfree(args);
2542 kfree(strmap); 2477 kfree(strmap);
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 6b2d735846a5..57d527a16f9d 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock)
43 \ 43 \
44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \ 44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \
45 local_irq_save(flags); \ 45 local_irq_save(flags); \
46 __raw_spin_lock(&(lock)->raw_lock); \ 46 arch_spin_lock(&(lock)->rlock.raw_lock);\
47 DEBUG_LOCKS_WARN_ON(l->magic != l); \ 47 DEBUG_LOCKS_WARN_ON(l->magic != l); \
48 } while (0) 48 } while (0)
49 49
50#define spin_unlock_mutex(lock, flags) \ 50#define spin_unlock_mutex(lock, flags) \
51 do { \ 51 do { \
52 __raw_spin_unlock(&(lock)->raw_lock); \ 52 arch_spin_unlock(&(lock)->rlock.raw_lock); \
53 local_irq_restore(flags); \ 53 local_irq_restore(flags); \
54 preempt_check_resched(); \ 54 preempt_check_resched(); \
55 } while (0) 55 } while (0)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 947b3ad551f8..632f04c57d82 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
148 148
149 preempt_disable(); 149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip); 150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ 151
152 !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) 152#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
153 /* 153 /*
154 * Optimistic spinning. 154 * Optimistic spinning.
155 * 155 *
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
558 558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 559static ATOMIC_NOTIFIER_HEAD(die_chain);
560 560
561int notrace notify_die(enum die_val val, const char *str, 561int notrace __kprobes notify_die(enum die_val val, const char *str,
562 struct pt_regs *regs, long err, int trap, int sig) 562 struct pt_regs *regs, long err, int trap, int sig)
563{ 563{
564 struct die_args args = { 564 struct die_args args = {
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..f74e6c00e26d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
13 * Pavel Emelianov <xemul@openvz.org> 13 * Pavel Emelianov <xemul@openvz.org>
14 */ 14 */
15 15
16#include <linux/slab.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
18#include <linux/init_task.h> 19#include <linux/init_task.h>
@@ -24,7 +25,18 @@
24 25
25static struct kmem_cache *nsproxy_cachep; 26static struct kmem_cache *nsproxy_cachep;
26 27
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 28struct nsproxy init_nsproxy = {
29 .count = ATOMIC_INIT(1),
30 .uts_ns = &init_uts_ns,
31#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
32 .ipc_ns = &init_ipc_ns,
33#endif
34 .mnt_ns = NULL,
35 .pid_ns = &init_pid_ns,
36#ifdef CONFIG_NET
37 .net_ns = &init_net,
38#endif
39};
28 40
29static inline struct nsproxy *create_nsproxy(void) 41static inline struct nsproxy *create_nsproxy(void)
30{ 42{
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..fd03513c7327
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,697 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/rcupdate.h>
30
/*
 * Upper bound for sequence numbers, keeping NR_CPUS of headroom below
 * INT_MAX.  Both expansions are parenthesized so that they stay a single
 * operand wherever they are used, e.g. in "MAX_SEQ_NR / num_cpus".
 */
#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
/* pd->refcnt value at which padata_do_parallel() refuses new objects. */
#define MAX_OBJ_NUM (10000 * NR_CPUS)
33
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{
36 int cpu, target_cpu;
37
38 target_cpu = cpumask_first(pd->cpumask);
39 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask);
41
42 return target_cpu;
43}
44
45static int padata_cpu_hash(struct padata_priv *padata)
46{
47 int cpu_index;
48 struct parallel_data *pd;
49
50 pd = padata->pd;
51
52 /*
53 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use.
55 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
57
58 return padata_index_to_cpu(pd, cpu_index);
59}
60
61static void padata_parallel_worker(struct work_struct *work)
62{
63 struct padata_queue *queue;
64 struct parallel_data *pd;
65 struct padata_instance *pinst;
66 LIST_HEAD(local_list);
67
68 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork);
70 pd = queue->pd;
71 pinst = pd->pinst;
72
73 spin_lock(&queue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock);
76
77 while (!list_empty(&local_list)) {
78 struct padata_priv *padata;
79
80 padata = list_entry(local_list.next,
81 struct padata_priv, list);
82
83 list_del_init(&padata->list);
84
85 padata->parallel(padata);
86 }
87
88 local_bh_enable();
89}
90
91/*
92 * padata_do_parallel - padata parallelization function
93 *
94 * @pinst: padata instance
95 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on,
97 * must be in the cpumask of padata.
98 *
99 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel
101 * must be seen by padata_do_serial.
102 */
103int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu)
105{
106 int target_cpu, err;
107 struct padata_queue *queue;
108 struct parallel_data *pd;
109
110 rcu_read_lock_bh();
111
112 pd = rcu_dereference(pinst->pd);
113
114 err = 0;
115 if (!(pinst->flags & PADATA_INIT))
116 goto out;
117
118 err = -EBUSY;
119 if ((pinst->flags & PADATA_RESET))
120 goto out;
121
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out;
124
125 err = -EINVAL;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt);
131 padata->pd = pd;
132 padata->cb_cpu = cb_cpu;
133
134 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
135 atomic_set(&pd->seq_nr, -1);
136
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138
139 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu);
141
142 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock);
145
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
147
148out:
149 rcu_read_unlock_bh();
150
151 return err;
152}
153EXPORT_SYMBOL(padata_do_parallel);
154
155static struct padata_priv *padata_get_next(struct parallel_data *pd)
156{
157 int cpu, num_cpus, empty, calc_seq_nr;
158 int seq_nr, next_nr, overrun, next_overrun;
159 struct padata_queue *queue, *next_queue;
160 struct padata_priv *padata;
161 struct padata_list *reorder;
162
163 empty = 0;
164 next_nr = -1;
165 next_overrun = 0;
166 next_queue = NULL;
167
168 num_cpus = cpumask_weight(pd->cpumask);
169
170 for_each_cpu(cpu, pd->cpumask) {
171 queue = per_cpu_ptr(pd->queue, cpu);
172 reorder = &queue->reorder;
173
174 /*
175 * Calculate the seq_nr of the object that should be
176 * next in this queue.
177 */
178 overrun = 0;
179 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
180 + queue->cpu_index;
181
182 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
183 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
184 overrun = 1;
185 }
186
187 if (!list_empty(&reorder->list)) {
188 padata = list_entry(reorder->list.next,
189 struct padata_priv, list);
190
191 seq_nr = padata->seq_nr;
192 BUG_ON(calc_seq_nr != seq_nr);
193 } else {
194 seq_nr = calc_seq_nr;
195 empty++;
196 }
197
198 if (next_nr < 0 || seq_nr < next_nr
199 || (next_overrun && !overrun)) {
200 next_nr = seq_nr;
201 next_overrun = overrun;
202 next_queue = queue;
203 }
204 }
205
206 padata = NULL;
207
208 if (empty == num_cpus)
209 goto out;
210
211 reorder = &next_queue->reorder;
212
213 if (!list_empty(&reorder->list)) {
214 padata = list_entry(reorder->list.next,
215 struct padata_priv, list);
216
217 if (unlikely(next_overrun)) {
218 for_each_cpu(cpu, pd->cpumask) {
219 queue = per_cpu_ptr(pd->queue, cpu);
220 atomic_set(&queue->num_obj, 0);
221 }
222 }
223
224 spin_lock(&reorder->lock);
225 list_del_init(&padata->list);
226 atomic_dec(&pd->reorder_objects);
227 spin_unlock(&reorder->lock);
228
229 atomic_inc(&next_queue->num_obj);
230
231 goto out;
232 }
233
234 if (next_nr % num_cpus == next_queue->cpu_index) {
235 padata = ERR_PTR(-ENODATA);
236 goto out;
237 }
238
239 padata = ERR_PTR(-EINPROGRESS);
240out:
241 return padata;
242}
243
244static void padata_reorder(struct parallel_data *pd)
245{
246 struct padata_priv *padata;
247 struct padata_queue *queue;
248 struct padata_instance *pinst = pd->pinst;
249
250try_again:
251 if (!spin_trylock_bh(&pd->lock))
252 goto out;
253
254 while (1) {
255 padata = padata_get_next(pd);
256
257 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
258 break;
259
260 if (PTR_ERR(padata) == -ENODATA) {
261 spin_unlock_bh(&pd->lock);
262 goto out;
263 }
264
265 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
266
267 spin_lock(&queue->serial.lock);
268 list_add_tail(&padata->list, &queue->serial.list);
269 spin_unlock(&queue->serial.lock);
270
271 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
272 }
273
274 spin_unlock_bh(&pd->lock);
275
276 if (atomic_read(&pd->reorder_objects))
277 goto try_again;
278
279out:
280 return;
281}
282
283static void padata_serial_worker(struct work_struct *work)
284{
285 struct padata_queue *queue;
286 struct parallel_data *pd;
287 LIST_HEAD(local_list);
288
289 local_bh_disable();
290 queue = container_of(work, struct padata_queue, swork);
291 pd = queue->pd;
292
293 spin_lock(&queue->serial.lock);
294 list_replace_init(&queue->serial.list, &local_list);
295 spin_unlock(&queue->serial.lock);
296
297 while (!list_empty(&local_list)) {
298 struct padata_priv *padata;
299
300 padata = list_entry(local_list.next,
301 struct padata_priv, list);
302
303 list_del_init(&padata->list);
304
305 padata->serial(padata);
306 atomic_dec(&pd->refcnt);
307 }
308 local_bh_enable();
309}
310
311/*
312 * padata_do_serial - padata serialization function
313 *
314 * @padata: object to be serialized.
315 *
316 * padata_do_serial must be called for every parallelized object.
317 * The serialization callback function will run with BHs off.
318 */
319void padata_do_serial(struct padata_priv *padata)
320{
321 int cpu;
322 struct padata_queue *queue;
323 struct parallel_data *pd;
324
325 pd = padata->pd;
326
327 cpu = get_cpu();
328 queue = per_cpu_ptr(pd->queue, cpu);
329
330 spin_lock(&queue->reorder.lock);
331 atomic_inc(&pd->reorder_objects);
332 list_add_tail(&padata->list, &queue->reorder.list);
333 spin_unlock(&queue->reorder.lock);
334
335 put_cpu();
336
337 padata_reorder(pd);
338}
339EXPORT_SYMBOL(padata_do_serial);
340
341static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
342 const struct cpumask *cpumask)
343{
344 int cpu, cpu_index, num_cpus;
345 struct padata_queue *queue;
346 struct parallel_data *pd;
347
348 cpu_index = 0;
349
350 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
351 if (!pd)
352 goto err;
353
354 pd->queue = alloc_percpu(struct padata_queue);
355 if (!pd->queue)
356 goto err_free_pd;
357
358 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
359 goto err_free_queue;
360
361 for_each_possible_cpu(cpu) {
362 queue = per_cpu_ptr(pd->queue, cpu);
363
364 queue->pd = pd;
365
366 if (cpumask_test_cpu(cpu, cpumask)
367 && cpumask_test_cpu(cpu, cpu_active_mask)) {
368 queue->cpu_index = cpu_index;
369 cpu_index++;
370 } else
371 queue->cpu_index = -1;
372
373 INIT_LIST_HEAD(&queue->reorder.list);
374 INIT_LIST_HEAD(&queue->parallel.list);
375 INIT_LIST_HEAD(&queue->serial.list);
376 spin_lock_init(&queue->reorder.lock);
377 spin_lock_init(&queue->parallel.lock);
378 spin_lock_init(&queue->serial.lock);
379
380 INIT_WORK(&queue->pwork, padata_parallel_worker);
381 INIT_WORK(&queue->swork, padata_serial_worker);
382 atomic_set(&queue->num_obj, 0);
383 }
384
385 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
386
387 num_cpus = cpumask_weight(pd->cpumask);
388 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
389
390 atomic_set(&pd->seq_nr, -1);
391 atomic_set(&pd->reorder_objects, 0);
392 atomic_set(&pd->refcnt, 0);
393 pd->pinst = pinst;
394 spin_lock_init(&pd->lock);
395
396 return pd;
397
398err_free_queue:
399 free_percpu(pd->queue);
400err_free_pd:
401 kfree(pd);
402err:
403 return NULL;
404}
405
406static void padata_free_pd(struct parallel_data *pd)
407{
408 free_cpumask_var(pd->cpumask);
409 free_percpu(pd->queue);
410 kfree(pd);
411}
412
413static void padata_replace(struct padata_instance *pinst,
414 struct parallel_data *pd_new)
415{
416 struct parallel_data *pd_old = pinst->pd;
417
418 pinst->flags |= PADATA_RESET;
419
420 rcu_assign_pointer(pinst->pd, pd_new);
421
422 synchronize_rcu();
423
424 while (atomic_read(&pd_old->refcnt) != 0)
425 yield();
426
427 flush_workqueue(pinst->wq);
428
429 padata_free_pd(pd_old);
430
431 pinst->flags &= ~PADATA_RESET;
432}
433
434/*
435 * padata_set_cpumask - set the cpumask that padata should use
436 *
437 * @pinst: padata instance
438 * @cpumask: the cpumask to use
439 */
440int padata_set_cpumask(struct padata_instance *pinst,
441 cpumask_var_t cpumask)
442{
443 struct parallel_data *pd;
444 int err = 0;
445
446 might_sleep();
447
448 mutex_lock(&pinst->lock);
449
450 pd = padata_alloc_pd(pinst, cpumask);
451 if (!pd) {
452 err = -ENOMEM;
453 goto out;
454 }
455
456 cpumask_copy(pinst->cpumask, cpumask);
457
458 padata_replace(pinst, pd);
459
460out:
461 mutex_unlock(&pinst->lock);
462
463 return err;
464}
465EXPORT_SYMBOL(padata_set_cpumask);
466
467static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
468{
469 struct parallel_data *pd;
470
471 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
472 pd = padata_alloc_pd(pinst, pinst->cpumask);
473 if (!pd)
474 return -ENOMEM;
475
476 padata_replace(pinst, pd);
477 }
478
479 return 0;
480}
481
482/*
483 * padata_add_cpu - add a cpu to the padata cpumask
484 *
485 * @pinst: padata instance
486 * @cpu: cpu to add
487 */
488int padata_add_cpu(struct padata_instance *pinst, int cpu)
489{
490 int err;
491
492 might_sleep();
493
494 mutex_lock(&pinst->lock);
495
496 cpumask_set_cpu(cpu, pinst->cpumask);
497 err = __padata_add_cpu(pinst, cpu);
498
499 mutex_unlock(&pinst->lock);
500
501 return err;
502}
503EXPORT_SYMBOL(padata_add_cpu);
504
505static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
506{
507 struct parallel_data *pd;
508
509 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
510 pd = padata_alloc_pd(pinst, pinst->cpumask);
511 if (!pd)
512 return -ENOMEM;
513
514 padata_replace(pinst, pd);
515 }
516
517 return 0;
518}
519
520/*
521 * padata_remove_cpu - remove a cpu from the padata cpumask
522 *
523 * @pinst: padata instance
524 * @cpu: cpu to remove
525 */
526int padata_remove_cpu(struct padata_instance *pinst, int cpu)
527{
528 int err;
529
530 might_sleep();
531
532 mutex_lock(&pinst->lock);
533
534 cpumask_clear_cpu(cpu, pinst->cpumask);
535 err = __padata_remove_cpu(pinst, cpu);
536
537 mutex_unlock(&pinst->lock);
538
539 return err;
540}
541EXPORT_SYMBOL(padata_remove_cpu);
542
543/*
544 * padata_start - start the parallel processing
545 *
546 * @pinst: padata instance to start
547 */
548void padata_start(struct padata_instance *pinst)
549{
550 might_sleep();
551
552 mutex_lock(&pinst->lock);
553 pinst->flags |= PADATA_INIT;
554 mutex_unlock(&pinst->lock);
555}
556EXPORT_SYMBOL(padata_start);
557
558/*
559 * padata_stop - stop the parallel processing
560 *
561 * @pinst: padata instance to stop
562 */
563void padata_stop(struct padata_instance *pinst)
564{
565 might_sleep();
566
567 mutex_lock(&pinst->lock);
568 pinst->flags &= ~PADATA_INIT;
569 mutex_unlock(&pinst->lock);
570}
571EXPORT_SYMBOL(padata_stop);
572
573static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
574 unsigned long action, void *hcpu)
575{
576 int err;
577 struct padata_instance *pinst;
578 int cpu = (unsigned long)hcpu;
579
580 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
581
582 switch (action) {
583 case CPU_ONLINE:
584 case CPU_ONLINE_FROZEN:
585 if (!cpumask_test_cpu(cpu, pinst->cpumask))
586 break;
587 mutex_lock(&pinst->lock);
588 err = __padata_add_cpu(pinst, cpu);
589 mutex_unlock(&pinst->lock);
590 if (err)
591 return NOTIFY_BAD;
592 break;
593
594 case CPU_DOWN_PREPARE:
595 case CPU_DOWN_PREPARE_FROZEN:
596 if (!cpumask_test_cpu(cpu, pinst->cpumask))
597 break;
598 mutex_lock(&pinst->lock);
599 err = __padata_remove_cpu(pinst, cpu);
600 mutex_unlock(&pinst->lock);
601 if (err)
602 return NOTIFY_BAD;
603 break;
604
605 case CPU_UP_CANCELED:
606 case CPU_UP_CANCELED_FROZEN:
607 if (!cpumask_test_cpu(cpu, pinst->cpumask))
608 break;
609 mutex_lock(&pinst->lock);
610 __padata_remove_cpu(pinst, cpu);
611 mutex_unlock(&pinst->lock);
612
613 case CPU_DOWN_FAILED:
614 case CPU_DOWN_FAILED_FROZEN:
615 if (!cpumask_test_cpu(cpu, pinst->cpumask))
616 break;
617 mutex_lock(&pinst->lock);
618 __padata_add_cpu(pinst, cpu);
619 mutex_unlock(&pinst->lock);
620 }
621
622 return NOTIFY_OK;
623}
624
625/*
626 * padata_alloc - allocate and initialize a padata instance
627 *
628 * @cpumask: cpumask that padata uses for parallelization
629 * @wq: workqueue to use for the allocated padata instance
630 */
631struct padata_instance *padata_alloc(const struct cpumask *cpumask,
632 struct workqueue_struct *wq)
633{
634 int err;
635 struct padata_instance *pinst;
636 struct parallel_data *pd;
637
638 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
639 if (!pinst)
640 goto err;
641
642 pd = padata_alloc_pd(pinst, cpumask);
643 if (!pd)
644 goto err_free_inst;
645
646 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
647 goto err_free_pd;
648
649 rcu_assign_pointer(pinst->pd, pd);
650
651 pinst->wq = wq;
652
653 cpumask_copy(pinst->cpumask, cpumask);
654
655 pinst->flags = 0;
656
657 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
658 pinst->cpu_notifier.priority = 0;
659 err = register_hotcpu_notifier(&pinst->cpu_notifier);
660 if (err)
661 goto err_free_cpumask;
662
663 mutex_init(&pinst->lock);
664
665 return pinst;
666
667err_free_cpumask:
668 free_cpumask_var(pinst->cpumask);
669err_free_pd:
670 padata_free_pd(pd);
671err_free_inst:
672 kfree(pinst);
673err:
674 return NULL;
675}
676EXPORT_SYMBOL(padata_alloc);
677
678/*
679 * padata_free - free a padata instance
680 *
681 * @ padata_inst: padata instance to free
682 */
683void padata_free(struct padata_instance *pinst)
684{
685 padata_stop(pinst);
686
687 synchronize_rcu();
688
689 while (atomic_read(&pinst->pd->refcnt) != 0)
690 yield();
691
692 unregister_hotcpu_notifier(&pinst->cpu_notifier);
693 padata_free_pd(pinst->pd);
694 free_cpumask_var(pinst->cpumask);
695 kfree(pinst);
696}
697EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index 96b45d0b4ba5..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/debug_locks.h> 11#include <linux/debug_locks.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/kmsg_dump.h>
13#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
14#include <linux/notifier.h> 15#include <linux/notifier.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -35,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
35 36
36EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
37 38
38static long no_blink(long time)
39{
40 return 0;
41}
42
43/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
44long (*panic_blink)(long time); 40long (*panic_blink)(long time);
45EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
46 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
47/** 69/**
48 * panic - halt the system 70 * panic - halt the system
49 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -81,6 +103,8 @@ NORET_TYPE void panic(const char * fmt, ...)
81 */ 103 */
82 crash_kexec(NULL); 104 crash_kexec(NULL);
83 105
106 kmsg_dump(KMSG_DUMP_PANIC);
107
84 /* 108 /*
85 * Note smp_send_stop is the usual smp shutdown function, which 109 * Note smp_send_stop is the usual smp shutdown function, which
86 * unfortunately means it may not be hardened to work in a panic 110 * unfortunately means it may not be hardened to work in a panic
@@ -92,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
92 116
93 bust_spinlocks(0); 117 bust_spinlocks(0);
94 118
95 if (!panic_blink)
96 panic_blink = no_blink;
97
98 if (panic_timeout > 0) { 119 if (panic_timeout > 0) {
99 /* 120 /*
100 * Delay timeout seconds before rebooting the machine. 121 * Delay timeout seconds before rebooting the machine.
@@ -102,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
102 */ 123 */
103 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
104 125
105 for (i = 0; i < panic_timeout*1000; ) { 126 for (i = 0; i < panic_timeout; i++) {
106 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
107 i += panic_blink(i); 128 panic_blink_one_second();
108 mdelay(1);
109 i++;
110 } 129 }
111 /* 130 /*
112 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
@@ -132,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
132 } 151 }
133#endif 152#endif
134 local_irq_enable(); 153 local_irq_enable();
135 for (i = 0; ; ) { 154 while (1) {
136 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
137 i += panic_blink(i); 156 panic_blink_one_second();
138 mdelay(1);
139 i++;
140 } 157 }
141} 158}
142 159
@@ -339,6 +356,7 @@ void oops_exit(void)
339{ 356{
340 do_oops_enter_exit(); 357 do_oops_enter_exit();
341 print_oops_end_marker(); 358 print_oops_end_marker();
359 kmsg_dump(KMSG_DUMP_OOPS);
342} 360}
343 361
344#ifdef WANT_WARN_ON_SLOWPATH 362#ifdef WANT_WARN_ON_SLOWPATH
diff --git a/kernel/params.c b/kernel/params.c
index d656c276508d..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -122,9 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
122 next = args + i; 122 next = args + i;
123 123
124 /* Chew up trailing spaces. */ 124 /* Chew up trailing spaces. */
125 while (isspace(*next)) 125 return skip_spaces(next);
126 next++;
127 return next;
128} 126}
129 127
130/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 128/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
@@ -139,8 +137,7 @@ int parse_args(const char *name,
139 DEBUGP("Parsing ARGS: %s\n", args); 137 DEBUGP("Parsing ARGS: %s\n", args);
140 138
141 /* Chew leading spaces */ 139 /* Chew leading spaces */
142 while (isspace(*args)) 140 args = skip_spaces(args);
143 args++;
144 141
145 while (*args) { 142 while (*args) {
146 int ret; 143 int ret;
@@ -404,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
404} 401}
405 402
406/* sysfs output in /sys/modules/XYZ/parameters/ */ 403/* sysfs output in /sys/modules/XYZ/parameters/ */
407#define to_module_attr(n) container_of(n, struct module_attribute, attr); 404#define to_module_attr(n) container_of(n, struct module_attribute, attr)
408#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
409 406
410extern struct kernel_param __start___param[], __stop___param[]; 407extern struct kernel_param __start___param[], __stop___param[];
411 408
@@ -423,7 +420,7 @@ struct module_param_attrs
423}; 420};
424 421
425#ifdef CONFIG_SYSFS 422#ifdef CONFIG_SYSFS
426#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 423#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
427 424
428static ssize_t param_attr_show(struct module_attribute *mattr, 425static ssize_t param_attr_show(struct module_attribute *mattr,
429 struct module *mod, char *buf) 426 struct module *mod, char *buf)
@@ -519,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
519 new->grp.attrs = attrs; 516 new->grp.attrs = attrs;
520 517
521 /* Tack new one on the end. */ 518 /* Tack new one on the end. */
519 sysfs_attr_init(&new->attrs[num].mattr.attr);
522 new->attrs[num].param = kp; 520 new->attrs[num].param = kp;
523 new->attrs[num].mattr.show = param_attr_show; 521 new->attrs[num].mattr.show = param_attr_show;
524 new->attrs[num].mattr.store = param_attr_store; 522 new->attrs[num].mattr.store = param_attr_store;
@@ -725,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
725 return ret; 723 return ret;
726} 724}
727 725
728static struct sysfs_ops module_sysfs_ops = { 726static const struct sysfs_ops module_sysfs_ops = {
729 .show = module_attr_show, 727 .show = module_attr_show,
730 .store = module_attr_store, 728 .store = module_attr_store,
731}; 729};
@@ -739,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
739 return 0; 737 return 0;
740} 738}
741 739
742static struct kset_uevent_ops module_uevent_ops = { 740static const struct kset_uevent_ops module_uevent_ops = {
743 .filter = uevent_filter, 741 .filter = uevent_filter,
744}; 742};
745 743
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7f29643c8985..3d1552d3c12b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,7 @@
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h>
18#include <linux/sysfs.h> 19#include <linux/sysfs.h>
19#include <linux/dcache.h> 20#include <linux/dcache.h>
20#include <linux/percpu.h> 21#include <linux/percpu.h>
@@ -28,13 +29,15 @@
28#include <linux/anon_inodes.h> 29#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h> 30#include <linux/kernel_stat.h>
30#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/ftrace_event.h>
33#include <linux/hw_breakpoint.h>
31 34
32#include <asm/irq_regs.h> 35#include <asm/irq_regs.h>
33 36
34/* 37/*
35 * Each CPU has a list of per CPU events: 38 * Each CPU has a list of per CPU events:
36 */ 39 */
37DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 40static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
38 41
39int perf_max_events __read_mostly = 1; 42int perf_max_events __read_mostly = 1;
40static int perf_reserved_percpu __read_mostly; 43static int perf_reserved_percpu __read_mostly;
@@ -54,21 +57,6 @@ static atomic_t nr_task_events __read_mostly;
54 */ 57 */
55int sysctl_perf_event_paranoid __read_mostly = 1; 58int sysctl_perf_event_paranoid __read_mostly = 1;
56 59
57static inline bool perf_paranoid_tracepoint_raw(void)
58{
59 return sysctl_perf_event_paranoid > -1;
60}
61
62static inline bool perf_paranoid_cpu(void)
63{
64 return sysctl_perf_event_paranoid > 0;
65}
66
67static inline bool perf_paranoid_kernel(void)
68{
69 return sysctl_perf_event_paranoid > 1;
70}
71
72int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 60int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
73 61
74/* 62/*
@@ -94,13 +82,10 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
94void __weak hw_perf_disable(void) { barrier(); } 82void __weak hw_perf_disable(void) { barrier(); }
95void __weak hw_perf_enable(void) { barrier(); } 83void __weak hw_perf_enable(void) { barrier(); }
96 84
97void __weak hw_perf_event_setup(int cpu) { barrier(); }
98void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
99
100int __weak 85int __weak
101hw_perf_group_sched_in(struct perf_event *group_leader, 86hw_perf_group_sched_in(struct perf_event *group_leader,
102 struct perf_cpu_context *cpuctx, 87 struct perf_cpu_context *cpuctx,
103 struct perf_event_context *ctx, int cpu) 88 struct perf_event_context *ctx)
104{ 89{
105 return 0; 90 return 0;
106} 91}
@@ -109,25 +94,15 @@ void __weak perf_event_print_debug(void) { }
109 94
110static DEFINE_PER_CPU(int, perf_disable_count); 95static DEFINE_PER_CPU(int, perf_disable_count);
111 96
112void __perf_disable(void)
113{
114 __get_cpu_var(perf_disable_count)++;
115}
116
117bool __perf_enable(void)
118{
119 return !--__get_cpu_var(perf_disable_count);
120}
121
122void perf_disable(void) 97void perf_disable(void)
123{ 98{
124 __perf_disable(); 99 if (!__get_cpu_var(perf_disable_count)++)
125 hw_perf_disable(); 100 hw_perf_disable();
126} 101}
127 102
128void perf_enable(void) 103void perf_enable(void)
129{ 104{
130 if (__perf_enable()) 105 if (!--__get_cpu_var(perf_disable_count))
131 hw_perf_enable(); 106 hw_perf_enable();
132} 107}
133 108
@@ -201,14 +176,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
201 * if so. If we locked the right context, then it 176 * if so. If we locked the right context, then it
202 * can't get swapped on us any more. 177 * can't get swapped on us any more.
203 */ 178 */
204 spin_lock_irqsave(&ctx->lock, *flags); 179 raw_spin_lock_irqsave(&ctx->lock, *flags);
205 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 180 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
206 spin_unlock_irqrestore(&ctx->lock, *flags); 181 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
207 goto retry; 182 goto retry;
208 } 183 }
209 184
210 if (!atomic_inc_not_zero(&ctx->refcount)) { 185 if (!atomic_inc_not_zero(&ctx->refcount)) {
211 spin_unlock_irqrestore(&ctx->lock, *flags); 186 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
212 ctx = NULL; 187 ctx = NULL;
213 } 188 }
214 } 189 }
@@ -229,7 +204,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task
229 ctx = perf_lock_task_context(task, &flags); 204 ctx = perf_lock_task_context(task, &flags);
230 if (ctx) { 205 if (ctx) {
231 ++ctx->pin_count; 206 ++ctx->pin_count;
232 spin_unlock_irqrestore(&ctx->lock, flags); 207 raw_spin_unlock_irqrestore(&ctx->lock, flags);
233 } 208 }
234 return ctx; 209 return ctx;
235} 210}
@@ -238,12 +213,64 @@ static void perf_unpin_context(struct perf_event_context *ctx)
238{ 213{
239 unsigned long flags; 214 unsigned long flags;
240 215
241 spin_lock_irqsave(&ctx->lock, flags); 216 raw_spin_lock_irqsave(&ctx->lock, flags);
242 --ctx->pin_count; 217 --ctx->pin_count;
243 spin_unlock_irqrestore(&ctx->lock, flags); 218 raw_spin_unlock_irqrestore(&ctx->lock, flags);
244 put_ctx(ctx); 219 put_ctx(ctx);
245} 220}
246 221
222static inline u64 perf_clock(void)
223{
224 return cpu_clock(raw_smp_processor_id());
225}
226
227/*
228 * Update the record of the current time in a context.
229 */
230static void update_context_time(struct perf_event_context *ctx)
231{
232 u64 now = perf_clock();
233
234 ctx->time += now - ctx->timestamp;
235 ctx->timestamp = now;
236}
237
238/*
239 * Update the total_time_enabled and total_time_running fields for a event.
240 */
241static void update_event_times(struct perf_event *event)
242{
243 struct perf_event_context *ctx = event->ctx;
244 u64 run_end;
245
246 if (event->state < PERF_EVENT_STATE_INACTIVE ||
247 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
248 return;
249
250 if (ctx->is_active)
251 run_end = ctx->time;
252 else
253 run_end = event->tstamp_stopped;
254
255 event->total_time_enabled = run_end - event->tstamp_enabled;
256
257 if (event->state == PERF_EVENT_STATE_INACTIVE)
258 run_end = event->tstamp_stopped;
259 else
260 run_end = ctx->time;
261
262 event->total_time_running = run_end - event->tstamp_running;
263}
264
265static struct list_head *
266ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
267{
268 if (event->attr.pinned)
269 return &ctx->pinned_groups;
270 else
271 return &ctx->flexible_groups;
272}
273
247/* 274/*
248 * Add a event from the lists for its context. 275 * Add a event from the lists for its context.
249 * Must be called with ctx->mutex and ctx->lock held. 276 * Must be called with ctx->mutex and ctx->lock held.
@@ -258,9 +285,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
258 * add it straight to the context's event list, or to the group 285 * add it straight to the context's event list, or to the group
259 * leader's sibling list: 286 * leader's sibling list:
260 */ 287 */
261 if (group_leader == event) 288 if (group_leader == event) {
262 list_add_tail(&event->group_entry, &ctx->group_list); 289 struct list_head *list;
263 else { 290
291 if (is_software_event(event))
292 event->group_flags |= PERF_GROUP_SOFTWARE;
293
294 list = ctx_group_list(event, ctx);
295 list_add_tail(&event->group_entry, list);
296 } else {
297 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
298 !is_software_event(event))
299 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
300
264 list_add_tail(&event->group_entry, &group_leader->sibling_list); 301 list_add_tail(&event->group_entry, &group_leader->sibling_list);
265 group_leader->nr_siblings++; 302 group_leader->nr_siblings++;
266 } 303 }
@@ -292,15 +329,32 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
292 if (event->group_leader != event) 329 if (event->group_leader != event)
293 event->group_leader->nr_siblings--; 330 event->group_leader->nr_siblings--;
294 331
332 update_event_times(event);
333
334 /*
335 * If event was in error state, then keep it
336 * that way, otherwise bogus counts will be
337 * returned on read(). The only way to get out
338 * of error state is by explicit re-enabling
339 * of the event
340 */
341 if (event->state > PERF_EVENT_STATE_OFF)
342 event->state = PERF_EVENT_STATE_OFF;
343
295 /* 344 /*
296 * If this was a group event with sibling events then 345 * If this was a group event with sibling events then
297 * upgrade the siblings to singleton events by adding them 346 * upgrade the siblings to singleton events by adding them
298 * to the context list directly: 347 * to the context list directly:
299 */ 348 */
300 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 349 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
350 struct list_head *list;
301 351
302 list_move_tail(&sibling->group_entry, &ctx->group_list); 352 list = ctx_group_list(event, ctx);
353 list_move_tail(&sibling->group_entry, list);
303 sibling->group_leader = sibling; 354 sibling->group_leader = sibling;
355
356 /* Inherit group flags from the previous leader */
357 sibling->group_flags = event->group_flags;
304 } 358 }
305} 359}
306 360
@@ -370,7 +424,7 @@ static void __perf_event_remove_from_context(void *info)
370 if (ctx->task && cpuctx->task_ctx != ctx) 424 if (ctx->task && cpuctx->task_ctx != ctx)
371 return; 425 return;
372 426
373 spin_lock(&ctx->lock); 427 raw_spin_lock(&ctx->lock);
374 /* 428 /*
375 * Protect the list operation against NMI by disabling the 429 * Protect the list operation against NMI by disabling the
376 * events on a global level. 430 * events on a global level.
@@ -392,7 +446,7 @@ static void __perf_event_remove_from_context(void *info)
392 } 446 }
393 447
394 perf_enable(); 448 perf_enable();
395 spin_unlock(&ctx->lock); 449 raw_spin_unlock(&ctx->lock);
396} 450}
397 451
398 452
@@ -419,7 +473,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
419 if (!task) { 473 if (!task) {
420 /* 474 /*
421 * Per cpu events are removed via an smp call and 475 * Per cpu events are removed via an smp call and
422 * the removal is always sucessful. 476 * the removal is always successful.
423 */ 477 */
424 smp_call_function_single(event->cpu, 478 smp_call_function_single(event->cpu,
425 __perf_event_remove_from_context, 479 __perf_event_remove_from_context,
@@ -431,12 +485,12 @@ retry:
431 task_oncpu_function_call(task, __perf_event_remove_from_context, 485 task_oncpu_function_call(task, __perf_event_remove_from_context,
432 event); 486 event);
433 487
434 spin_lock_irq(&ctx->lock); 488 raw_spin_lock_irq(&ctx->lock);
435 /* 489 /*
436 * If the context is active we need to retry the smp call. 490 * If the context is active we need to retry the smp call.
437 */ 491 */
438 if (ctx->nr_active && !list_empty(&event->group_entry)) { 492 if (ctx->nr_active && !list_empty(&event->group_entry)) {
439 spin_unlock_irq(&ctx->lock); 493 raw_spin_unlock_irq(&ctx->lock);
440 goto retry; 494 goto retry;
441 } 495 }
442 496
@@ -445,48 +499,9 @@ retry:
445 * can remove the event safely, if the call above did not 499 * can remove the event safely, if the call above did not
446 * succeed. 500 * succeed.
447 */ 501 */
448 if (!list_empty(&event->group_entry)) { 502 if (!list_empty(&event->group_entry))
449 list_del_event(event, ctx); 503 list_del_event(event, ctx);
450 } 504 raw_spin_unlock_irq(&ctx->lock);
451 spin_unlock_irq(&ctx->lock);
452}
453
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_event_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a event.
472 */
473static void update_event_times(struct perf_event *event)
474{
475 struct perf_event_context *ctx = event->ctx;
476 u64 run_end;
477
478 if (event->state < PERF_EVENT_STATE_INACTIVE ||
479 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
480 return;
481
482 event->total_time_enabled = ctx->time - event->tstamp_enabled;
483
484 if (event->state == PERF_EVENT_STATE_INACTIVE)
485 run_end = event->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 event->total_time_running = run_end - event->tstamp_running;
490} 505}
491 506
492/* 507/*
@@ -517,7 +532,7 @@ static void __perf_event_disable(void *info)
517 if (ctx->task && cpuctx->task_ctx != ctx) 532 if (ctx->task && cpuctx->task_ctx != ctx)
518 return; 533 return;
519 534
520 spin_lock(&ctx->lock); 535 raw_spin_lock(&ctx->lock);
521 536
522 /* 537 /*
523 * If the event is on, turn it off. 538 * If the event is on, turn it off.
@@ -533,7 +548,7 @@ static void __perf_event_disable(void *info)
533 event->state = PERF_EVENT_STATE_OFF; 548 event->state = PERF_EVENT_STATE_OFF;
534 } 549 }
535 550
536 spin_unlock(&ctx->lock); 551 raw_spin_unlock(&ctx->lock);
537} 552}
538 553
539/* 554/*
@@ -549,7 +564,7 @@ static void __perf_event_disable(void *info)
549 * is the current context on this CPU and preemption is disabled, 564 * is the current context on this CPU and preemption is disabled,
550 * hence we can't get into perf_event_task_sched_out for this context. 565 * hence we can't get into perf_event_task_sched_out for this context.
551 */ 566 */
552static void perf_event_disable(struct perf_event *event) 567void perf_event_disable(struct perf_event *event)
553{ 568{
554 struct perf_event_context *ctx = event->ctx; 569 struct perf_event_context *ctx = event->ctx;
555 struct task_struct *task = ctx->task; 570 struct task_struct *task = ctx->task;
@@ -566,12 +581,12 @@ static void perf_event_disable(struct perf_event *event)
566 retry: 581 retry:
567 task_oncpu_function_call(task, __perf_event_disable, event); 582 task_oncpu_function_call(task, __perf_event_disable, event);
568 583
569 spin_lock_irq(&ctx->lock); 584 raw_spin_lock_irq(&ctx->lock);
570 /* 585 /*
571 * If the event is still active, we need to retry the cross-call. 586 * If the event is still active, we need to retry the cross-call.
572 */ 587 */
573 if (event->state == PERF_EVENT_STATE_ACTIVE) { 588 if (event->state == PERF_EVENT_STATE_ACTIVE) {
574 spin_unlock_irq(&ctx->lock); 589 raw_spin_unlock_irq(&ctx->lock);
575 goto retry; 590 goto retry;
576 } 591 }
577 592
@@ -584,20 +599,19 @@ static void perf_event_disable(struct perf_event *event)
584 event->state = PERF_EVENT_STATE_OFF; 599 event->state = PERF_EVENT_STATE_OFF;
585 } 600 }
586 601
587 spin_unlock_irq(&ctx->lock); 602 raw_spin_unlock_irq(&ctx->lock);
588} 603}
589 604
590static int 605static int
591event_sched_in(struct perf_event *event, 606event_sched_in(struct perf_event *event,
592 struct perf_cpu_context *cpuctx, 607 struct perf_cpu_context *cpuctx,
593 struct perf_event_context *ctx, 608 struct perf_event_context *ctx)
594 int cpu)
595{ 609{
596 if (event->state <= PERF_EVENT_STATE_OFF) 610 if (event->state <= PERF_EVENT_STATE_OFF)
597 return 0; 611 return 0;
598 612
599 event->state = PERF_EVENT_STATE_ACTIVE; 613 event->state = PERF_EVENT_STATE_ACTIVE;
600 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 614 event->oncpu = smp_processor_id();
601 /* 615 /*
602 * The new state must be visible before we turn it on in the hardware: 616 * The new state must be visible before we turn it on in the hardware:
603 */ 617 */
@@ -624,8 +638,7 @@ event_sched_in(struct perf_event *event,
624static int 638static int
625group_sched_in(struct perf_event *group_event, 639group_sched_in(struct perf_event *group_event,
626 struct perf_cpu_context *cpuctx, 640 struct perf_cpu_context *cpuctx,
627 struct perf_event_context *ctx, 641 struct perf_event_context *ctx)
628 int cpu)
629{ 642{
630 struct perf_event *event, *partial_group; 643 struct perf_event *event, *partial_group;
631 int ret; 644 int ret;
@@ -633,18 +646,18 @@ group_sched_in(struct perf_event *group_event,
633 if (group_event->state == PERF_EVENT_STATE_OFF) 646 if (group_event->state == PERF_EVENT_STATE_OFF)
634 return 0; 647 return 0;
635 648
636 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 649 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
637 if (ret) 650 if (ret)
638 return ret < 0 ? ret : 0; 651 return ret < 0 ? ret : 0;
639 652
640 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 653 if (event_sched_in(group_event, cpuctx, ctx))
641 return -EAGAIN; 654 return -EAGAIN;
642 655
643 /* 656 /*
644 * Schedule in siblings as one group (if any): 657 * Schedule in siblings as one group (if any):
645 */ 658 */
646 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 659 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
647 if (event_sched_in(event, cpuctx, ctx, cpu)) { 660 if (event_sched_in(event, cpuctx, ctx)) {
648 partial_group = event; 661 partial_group = event;
649 goto group_error; 662 goto group_error;
650 } 663 }
@@ -668,24 +681,6 @@ group_error:
668} 681}
669 682
670/* 683/*
671 * Return 1 for a group consisting entirely of software events,
672 * 0 if the group contains any hardware events.
673 */
674static int is_software_only_group(struct perf_event *leader)
675{
676 struct perf_event *event;
677
678 if (!is_software_event(leader))
679 return 0;
680
681 list_for_each_entry(event, &leader->sibling_list, group_entry)
682 if (!is_software_event(event))
683 return 0;
684
685 return 1;
686}
687
688/*
689 * Work out whether we can put this event group on the CPU now. 684 * Work out whether we can put this event group on the CPU now.
690 */ 685 */
691static int group_can_go_on(struct perf_event *event, 686static int group_can_go_on(struct perf_event *event,
@@ -695,7 +690,7 @@ static int group_can_go_on(struct perf_event *event,
695 /* 690 /*
696 * Groups consisting entirely of software events can always go on. 691 * Groups consisting entirely of software events can always go on.
697 */ 692 */
698 if (is_software_only_group(event)) 693 if (event->group_flags & PERF_GROUP_SOFTWARE)
699 return 1; 694 return 1;
700 /* 695 /*
701 * If an exclusive group is already on, no other hardware 696 * If an exclusive group is already on, no other hardware
@@ -736,7 +731,6 @@ static void __perf_install_in_context(void *info)
736 struct perf_event *event = info; 731 struct perf_event *event = info;
737 struct perf_event_context *ctx = event->ctx; 732 struct perf_event_context *ctx = event->ctx;
738 struct perf_event *leader = event->group_leader; 733 struct perf_event *leader = event->group_leader;
739 int cpu = smp_processor_id();
740 int err; 734 int err;
741 735
742 /* 736 /*
@@ -752,7 +746,7 @@ static void __perf_install_in_context(void *info)
752 cpuctx->task_ctx = ctx; 746 cpuctx->task_ctx = ctx;
753 } 747 }
754 748
755 spin_lock(&ctx->lock); 749 raw_spin_lock(&ctx->lock);
756 ctx->is_active = 1; 750 ctx->is_active = 1;
757 update_context_time(ctx); 751 update_context_time(ctx);
758 752
@@ -764,6 +758,9 @@ static void __perf_install_in_context(void *info)
764 758
765 add_event_to_ctx(event, ctx); 759 add_event_to_ctx(event, ctx);
766 760
761 if (event->cpu != -1 && event->cpu != smp_processor_id())
762 goto unlock;
763
767 /* 764 /*
768 * Don't put the event on if it is disabled or if 765 * Don't put the event on if it is disabled or if
769 * it is in a group and the group isn't on. 766 * it is in a group and the group isn't on.
@@ -780,7 +777,7 @@ static void __perf_install_in_context(void *info)
780 if (!group_can_go_on(event, cpuctx, 1)) 777 if (!group_can_go_on(event, cpuctx, 1))
781 err = -EEXIST; 778 err = -EEXIST;
782 else 779 else
783 err = event_sched_in(event, cpuctx, ctx, cpu); 780 err = event_sched_in(event, cpuctx, ctx);
784 781
785 if (err) { 782 if (err) {
786 /* 783 /*
@@ -802,7 +799,7 @@ static void __perf_install_in_context(void *info)
802 unlock: 799 unlock:
803 perf_enable(); 800 perf_enable();
804 801
805 spin_unlock(&ctx->lock); 802 raw_spin_unlock(&ctx->lock);
806} 803}
807 804
808/* 805/*
@@ -827,7 +824,7 @@ perf_install_in_context(struct perf_event_context *ctx,
827 if (!task) { 824 if (!task) {
828 /* 825 /*
829 * Per cpu events are installed via an smp call and 826 * Per cpu events are installed via an smp call and
830 * the install is always sucessful. 827 * the install is always successful.
831 */ 828 */
832 smp_call_function_single(cpu, __perf_install_in_context, 829 smp_call_function_single(cpu, __perf_install_in_context,
833 event, 1); 830 event, 1);
@@ -838,12 +835,12 @@ retry:
838 task_oncpu_function_call(task, __perf_install_in_context, 835 task_oncpu_function_call(task, __perf_install_in_context,
839 event); 836 event);
840 837
841 spin_lock_irq(&ctx->lock); 838 raw_spin_lock_irq(&ctx->lock);
842 /* 839 /*
843 * we need to retry the smp call. 840 * we need to retry the smp call.
844 */ 841 */
845 if (ctx->is_active && list_empty(&event->group_entry)) { 842 if (ctx->is_active && list_empty(&event->group_entry)) {
846 spin_unlock_irq(&ctx->lock); 843 raw_spin_unlock_irq(&ctx->lock);
847 goto retry; 844 goto retry;
848 } 845 }
849 846
@@ -854,7 +851,7 @@ retry:
854 */ 851 */
855 if (list_empty(&event->group_entry)) 852 if (list_empty(&event->group_entry))
856 add_event_to_ctx(event, ctx); 853 add_event_to_ctx(event, ctx);
857 spin_unlock_irq(&ctx->lock); 854 raw_spin_unlock_irq(&ctx->lock);
858} 855}
859 856
860/* 857/*
@@ -899,7 +896,7 @@ static void __perf_event_enable(void *info)
899 cpuctx->task_ctx = ctx; 896 cpuctx->task_ctx = ctx;
900 } 897 }
901 898
902 spin_lock(&ctx->lock); 899 raw_spin_lock(&ctx->lock);
903 ctx->is_active = 1; 900 ctx->is_active = 1;
904 update_context_time(ctx); 901 update_context_time(ctx);
905 902
@@ -907,6 +904,9 @@ static void __perf_event_enable(void *info)
907 goto unlock; 904 goto unlock;
908 __perf_event_mark_enabled(event, ctx); 905 __perf_event_mark_enabled(event, ctx);
909 906
907 if (event->cpu != -1 && event->cpu != smp_processor_id())
908 goto unlock;
909
910 /* 910 /*
911 * If the event is in a group and isn't the group leader, 911 * If the event is in a group and isn't the group leader,
912 * then don't put it on unless the group is on. 912 * then don't put it on unless the group is on.
@@ -919,11 +919,9 @@ static void __perf_event_enable(void *info)
919 } else { 919 } else {
920 perf_disable(); 920 perf_disable();
921 if (event == leader) 921 if (event == leader)
922 err = group_sched_in(event, cpuctx, ctx, 922 err = group_sched_in(event, cpuctx, ctx);
923 smp_processor_id());
924 else 923 else
925 err = event_sched_in(event, cpuctx, ctx, 924 err = event_sched_in(event, cpuctx, ctx);
926 smp_processor_id());
927 perf_enable(); 925 perf_enable();
928 } 926 }
929 927
@@ -941,7 +939,7 @@ static void __perf_event_enable(void *info)
941 } 939 }
942 940
943 unlock: 941 unlock:
944 spin_unlock(&ctx->lock); 942 raw_spin_unlock(&ctx->lock);
945} 943}
946 944
947/* 945/*
@@ -953,7 +951,7 @@ static void __perf_event_enable(void *info)
953 * perf_event_for_each_child or perf_event_for_each as described 951 * perf_event_for_each_child or perf_event_for_each as described
954 * for perf_event_disable. 952 * for perf_event_disable.
955 */ 953 */
956static void perf_event_enable(struct perf_event *event) 954void perf_event_enable(struct perf_event *event)
957{ 955{
958 struct perf_event_context *ctx = event->ctx; 956 struct perf_event_context *ctx = event->ctx;
959 struct task_struct *task = ctx->task; 957 struct task_struct *task = ctx->task;
@@ -967,7 +965,7 @@ static void perf_event_enable(struct perf_event *event)
967 return; 965 return;
968 } 966 }
969 967
970 spin_lock_irq(&ctx->lock); 968 raw_spin_lock_irq(&ctx->lock);
971 if (event->state >= PERF_EVENT_STATE_INACTIVE) 969 if (event->state >= PERF_EVENT_STATE_INACTIVE)
972 goto out; 970 goto out;
973 971
@@ -982,10 +980,10 @@ static void perf_event_enable(struct perf_event *event)
982 event->state = PERF_EVENT_STATE_OFF; 980 event->state = PERF_EVENT_STATE_OFF;
983 981
984 retry: 982 retry:
985 spin_unlock_irq(&ctx->lock); 983 raw_spin_unlock_irq(&ctx->lock);
986 task_oncpu_function_call(task, __perf_event_enable, event); 984 task_oncpu_function_call(task, __perf_event_enable, event);
987 985
988 spin_lock_irq(&ctx->lock); 986 raw_spin_lock_irq(&ctx->lock);
989 987
990 /* 988 /*
991 * If the context is active and the event is still off, 989 * If the context is active and the event is still off,
@@ -1002,7 +1000,7 @@ static void perf_event_enable(struct perf_event *event)
1002 __perf_event_mark_enabled(event, ctx); 1000 __perf_event_mark_enabled(event, ctx);
1003 1001
1004 out: 1002 out:
1005 spin_unlock_irq(&ctx->lock); 1003 raw_spin_unlock_irq(&ctx->lock);
1006} 1004}
1007 1005
1008static int perf_event_refresh(struct perf_event *event, int refresh) 1006static int perf_event_refresh(struct perf_event *event, int refresh)
@@ -1019,25 +1017,40 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1019 return 0; 1017 return 0;
1020} 1018}
1021 1019
1022void __perf_event_sched_out(struct perf_event_context *ctx, 1020enum event_type_t {
1023 struct perf_cpu_context *cpuctx) 1021 EVENT_FLEXIBLE = 0x1,
1022 EVENT_PINNED = 0x2,
1023 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1024};
1025
1026static void ctx_sched_out(struct perf_event_context *ctx,
1027 struct perf_cpu_context *cpuctx,
1028 enum event_type_t event_type)
1024{ 1029{
1025 struct perf_event *event; 1030 struct perf_event *event;
1026 1031
1027 spin_lock(&ctx->lock); 1032 raw_spin_lock(&ctx->lock);
1028 ctx->is_active = 0; 1033 ctx->is_active = 0;
1029 if (likely(!ctx->nr_events)) 1034 if (likely(!ctx->nr_events))
1030 goto out; 1035 goto out;
1031 update_context_time(ctx); 1036 update_context_time(ctx);
1032 1037
1033 perf_disable(); 1038 perf_disable();
1034 if (ctx->nr_active) 1039 if (!ctx->nr_active)
1035 list_for_each_entry(event, &ctx->group_list, group_entry) 1040 goto out_enable;
1041
1042 if (event_type & EVENT_PINNED)
1043 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1036 group_sched_out(event, cpuctx, ctx); 1044 group_sched_out(event, cpuctx, ctx);
1037 1045
1046 if (event_type & EVENT_FLEXIBLE)
1047 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1048 group_sched_out(event, cpuctx, ctx);
1049
1050 out_enable:
1038 perf_enable(); 1051 perf_enable();
1039 out: 1052 out:
1040 spin_unlock(&ctx->lock); 1053 raw_spin_unlock(&ctx->lock);
1041} 1054}
1042 1055
1043/* 1056/*
@@ -1059,8 +1072,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1059 && !ctx1->pin_count && !ctx2->pin_count; 1072 && !ctx1->pin_count && !ctx2->pin_count;
1060} 1073}
1061 1074
1062static void __perf_event_read(void *event);
1063
1064static void __perf_event_sync_stat(struct perf_event *event, 1075static void __perf_event_sync_stat(struct perf_event *event,
1065 struct perf_event *next_event) 1076 struct perf_event *next_event)
1066{ 1077{
@@ -1078,8 +1089,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1078 */ 1089 */
1079 switch (event->state) { 1090 switch (event->state) {
1080 case PERF_EVENT_STATE_ACTIVE: 1091 case PERF_EVENT_STATE_ACTIVE:
1081 __perf_event_read(event); 1092 event->pmu->read(event);
1082 break; 1093 /* fall-through */
1083 1094
1084 case PERF_EVENT_STATE_INACTIVE: 1095 case PERF_EVENT_STATE_INACTIVE:
1085 update_event_times(event); 1096 update_event_times(event);
@@ -1118,6 +1129,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1118 if (!ctx->nr_stat) 1129 if (!ctx->nr_stat)
1119 return; 1130 return;
1120 1131
1132 update_context_time(ctx);
1133
1121 event = list_first_entry(&ctx->event_list, 1134 event = list_first_entry(&ctx->event_list,
1122 struct perf_event, event_entry); 1135 struct perf_event, event_entry);
1123 1136
@@ -1146,23 +1159,19 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1146 * not restart the event. 1159 * not restart the event.
1147 */ 1160 */
1148void perf_event_task_sched_out(struct task_struct *task, 1161void perf_event_task_sched_out(struct task_struct *task,
1149 struct task_struct *next, int cpu) 1162 struct task_struct *next)
1150{ 1163{
1151 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1164 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1152 struct perf_event_context *ctx = task->perf_event_ctxp; 1165 struct perf_event_context *ctx = task->perf_event_ctxp;
1153 struct perf_event_context *next_ctx; 1166 struct perf_event_context *next_ctx;
1154 struct perf_event_context *parent; 1167 struct perf_event_context *parent;
1155 struct pt_regs *regs;
1156 int do_switch = 1; 1168 int do_switch = 1;
1157 1169
1158 regs = task_pt_regs(task); 1170 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1159 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1160 1171
1161 if (likely(!ctx || !cpuctx->task_ctx)) 1172 if (likely(!ctx || !cpuctx->task_ctx))
1162 return; 1173 return;
1163 1174
1164 update_context_time(ctx);
1165
1166 rcu_read_lock(); 1175 rcu_read_lock();
1167 parent = rcu_dereference(ctx->parent_ctx); 1176 parent = rcu_dereference(ctx->parent_ctx);
1168 next_ctx = next->perf_event_ctxp; 1177 next_ctx = next->perf_event_ctxp;
@@ -1177,8 +1186,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1177 * order we take the locks because no other cpu could 1186 * order we take the locks because no other cpu could
1178 * be trying to lock both of these tasks. 1187 * be trying to lock both of these tasks.
1179 */ 1188 */
1180 spin_lock(&ctx->lock); 1189 raw_spin_lock(&ctx->lock);
1181 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 1190 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1182 if (context_equiv(ctx, next_ctx)) { 1191 if (context_equiv(ctx, next_ctx)) {
1183 /* 1192 /*
1184 * XXX do we need a memory barrier of sorts 1193 * XXX do we need a memory barrier of sorts
@@ -1192,21 +1201,19 @@ void perf_event_task_sched_out(struct task_struct *task,
1192 1201
1193 perf_event_sync_stat(ctx, next_ctx); 1202 perf_event_sync_stat(ctx, next_ctx);
1194 } 1203 }
1195 spin_unlock(&next_ctx->lock); 1204 raw_spin_unlock(&next_ctx->lock);
1196 spin_unlock(&ctx->lock); 1205 raw_spin_unlock(&ctx->lock);
1197 } 1206 }
1198 rcu_read_unlock(); 1207 rcu_read_unlock();
1199 1208
1200 if (do_switch) { 1209 if (do_switch) {
1201 __perf_event_sched_out(ctx, cpuctx); 1210 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1202 cpuctx->task_ctx = NULL; 1211 cpuctx->task_ctx = NULL;
1203 } 1212 }
1204} 1213}
1205 1214
1206/* 1215static void task_ctx_sched_out(struct perf_event_context *ctx,
1207 * Called with IRQs disabled 1216 enum event_type_t event_type)
1208 */
1209static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1210{ 1217{
1211 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1212 1219
@@ -1216,47 +1223,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1216 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1223 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1217 return; 1224 return;
1218 1225
1219 __perf_event_sched_out(ctx, cpuctx); 1226 ctx_sched_out(ctx, cpuctx, event_type);
1220 cpuctx->task_ctx = NULL; 1227 cpuctx->task_ctx = NULL;
1221} 1228}
1222 1229
1223/* 1230/*
1224 * Called with IRQs disabled 1231 * Called with IRQs disabled
1225 */ 1232 */
1226static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1233static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1234{
1235 task_ctx_sched_out(ctx, EVENT_ALL);
1236}
1237
1238/*
1239 * Called with IRQs disabled
1240 */
1241static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1242 enum event_type_t event_type)
1227{ 1243{
1228 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1244 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1229} 1245}
1230 1246
1231static void 1247static void
1232__perf_event_sched_in(struct perf_event_context *ctx, 1248ctx_pinned_sched_in(struct perf_event_context *ctx,
1233 struct perf_cpu_context *cpuctx, int cpu) 1249 struct perf_cpu_context *cpuctx)
1234{ 1250{
1235 struct perf_event *event; 1251 struct perf_event *event;
1236 int can_add_hw = 1;
1237
1238 spin_lock(&ctx->lock);
1239 ctx->is_active = 1;
1240 if (likely(!ctx->nr_events))
1241 goto out;
1242 1252
1243 ctx->timestamp = perf_clock(); 1253 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1244 1254 if (event->state <= PERF_EVENT_STATE_OFF)
1245 perf_disable();
1246
1247 /*
1248 * First go through the list and put on any pinned groups
1249 * in order to give them the best chance of going on.
1250 */
1251 list_for_each_entry(event, &ctx->group_list, group_entry) {
1252 if (event->state <= PERF_EVENT_STATE_OFF ||
1253 !event->attr.pinned)
1254 continue; 1255 continue;
1255 if (event->cpu != -1 && event->cpu != cpu) 1256 if (event->cpu != -1 && event->cpu != smp_processor_id())
1256 continue; 1257 continue;
1257 1258
1258 if (group_can_go_on(event, cpuctx, 1)) 1259 if (group_can_go_on(event, cpuctx, 1))
1259 group_sched_in(event, cpuctx, ctx, cpu); 1260 group_sched_in(event, cpuctx, ctx);
1260 1261
1261 /* 1262 /*
1262 * If this pinned group hasn't been scheduled, 1263 * If this pinned group hasn't been scheduled,
@@ -1267,32 +1268,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1267 event->state = PERF_EVENT_STATE_ERROR; 1268 event->state = PERF_EVENT_STATE_ERROR;
1268 } 1269 }
1269 } 1270 }
1271}
1270 1272
1271 list_for_each_entry(event, &ctx->group_list, group_entry) { 1273static void
1272 /* 1274ctx_flexible_sched_in(struct perf_event_context *ctx,
1273 * Ignore events in OFF or ERROR state, and 1275 struct perf_cpu_context *cpuctx)
1274 * ignore pinned events since we did them already. 1276{
1275 */ 1277 struct perf_event *event;
1276 if (event->state <= PERF_EVENT_STATE_OFF || 1278 int can_add_hw = 1;
1277 event->attr.pinned)
1278 continue;
1279 1279
1280 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1281 /* Ignore events in OFF or ERROR state */
1282 if (event->state <= PERF_EVENT_STATE_OFF)
1283 continue;
1280 /* 1284 /*
1281 * Listen to the 'cpu' scheduling filter constraint 1285 * Listen to the 'cpu' scheduling filter constraint
1282 * of events: 1286 * of events:
1283 */ 1287 */
1284 if (event->cpu != -1 && event->cpu != cpu) 1288 if (event->cpu != -1 && event->cpu != smp_processor_id())
1285 continue; 1289 continue;
1286 1290
1287 if (group_can_go_on(event, cpuctx, can_add_hw)) 1291 if (group_can_go_on(event, cpuctx, can_add_hw))
1288 if (group_sched_in(event, cpuctx, ctx, cpu)) 1292 if (group_sched_in(event, cpuctx, ctx))
1289 can_add_hw = 0; 1293 can_add_hw = 0;
1290 } 1294 }
1295}
1296
1297static void
1298ctx_sched_in(struct perf_event_context *ctx,
1299 struct perf_cpu_context *cpuctx,
1300 enum event_type_t event_type)
1301{
1302 raw_spin_lock(&ctx->lock);
1303 ctx->is_active = 1;
1304 if (likely(!ctx->nr_events))
1305 goto out;
1306
1307 ctx->timestamp = perf_clock();
1308
1309 perf_disable();
1310
1311 /*
1312 * First go through the list and put on any pinned groups
1313 * in order to give them the best chance of going on.
1314 */
1315 if (event_type & EVENT_PINNED)
1316 ctx_pinned_sched_in(ctx, cpuctx);
1317
1318 /* Then walk through the lower prio flexible groups */
1319 if (event_type & EVENT_FLEXIBLE)
1320 ctx_flexible_sched_in(ctx, cpuctx);
1321
1291 perf_enable(); 1322 perf_enable();
1292 out: 1323 out:
1293 spin_unlock(&ctx->lock); 1324 raw_spin_unlock(&ctx->lock);
1294} 1325}
1295 1326
1327static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1328 enum event_type_t event_type)
1329{
1330 struct perf_event_context *ctx = &cpuctx->ctx;
1331
1332 ctx_sched_in(ctx, cpuctx, event_type);
1333}
1334
1335static void task_ctx_sched_in(struct task_struct *task,
1336 enum event_type_t event_type)
1337{
1338 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1339 struct perf_event_context *ctx = task->perf_event_ctxp;
1340
1341 if (likely(!ctx))
1342 return;
1343 if (cpuctx->task_ctx == ctx)
1344 return;
1345 ctx_sched_in(ctx, cpuctx, event_type);
1346 cpuctx->task_ctx = ctx;
1347}
1296/* 1348/*
1297 * Called from scheduler to add the events of the current task 1349 * Called from scheduler to add the events of the current task
1298 * with interrupts disabled. 1350 * with interrupts disabled.
@@ -1304,38 +1356,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1304 * accessing the event control register. If a NMI hits, then it will 1356 * accessing the event control register. If a NMI hits, then it will
1305 * keep the event running. 1357 * keep the event running.
1306 */ 1358 */
1307void perf_event_task_sched_in(struct task_struct *task, int cpu) 1359void perf_event_task_sched_in(struct task_struct *task)
1308{ 1360{
1309 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1361 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1310 struct perf_event_context *ctx = task->perf_event_ctxp; 1362 struct perf_event_context *ctx = task->perf_event_ctxp;
1311 1363
1312 if (likely(!ctx)) 1364 if (likely(!ctx))
1313 return; 1365 return;
1366
1314 if (cpuctx->task_ctx == ctx) 1367 if (cpuctx->task_ctx == ctx)
1315 return; 1368 return;
1316 __perf_event_sched_in(ctx, cpuctx, cpu); 1369
1370 /*
1371 * We want to keep the following priority order:
1372 * cpu pinned (that don't need to move), task pinned,
1373 * cpu flexible, task flexible.
1374 */
1375 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1376
1377 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1378 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1379 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1380
1317 cpuctx->task_ctx = ctx; 1381 cpuctx->task_ctx = ctx;
1318} 1382}
1319 1383
1320static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1384#define MAX_INTERRUPTS (~0ULL)
1385
1386static void perf_log_throttle(struct perf_event *event, int enable);
1387
1388static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1321{ 1389{
1322 struct perf_event_context *ctx = &cpuctx->ctx; 1390 u64 frequency = event->attr.sample_freq;
1391 u64 sec = NSEC_PER_SEC;
1392 u64 divisor, dividend;
1393
1394 int count_fls, nsec_fls, frequency_fls, sec_fls;
1395
1396 count_fls = fls64(count);
1397 nsec_fls = fls64(nsec);
1398 frequency_fls = fls64(frequency);
1399 sec_fls = 30;
1400
1401 /*
1402 * We got @count in @nsec, with a target of sample_freq HZ
1403 * the target period becomes:
1404 *
1405 * @count * 10^9
1406 * period = -------------------
1407 * @nsec * sample_freq
1408 *
1409 */
1410
1411 /*
1412 * Reduce accuracy by one bit such that @a and @b converge
1413 * to a similar magnitude.
1414 */
1415#define REDUCE_FLS(a, b) \
1416do { \
1417 if (a##_fls > b##_fls) { \
1418 a >>= 1; \
1419 a##_fls--; \
1420 } else { \
1421 b >>= 1; \
1422 b##_fls--; \
1423 } \
1424} while (0)
1425
1426 /*
1427 * Reduce accuracy until either term fits in a u64, then proceed with
1428 * the other, so that finally we can do a u64/u64 division.
1429 */
1430 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1431 REDUCE_FLS(nsec, frequency);
1432 REDUCE_FLS(sec, count);
1433 }
1323 1434
1324 __perf_event_sched_in(ctx, cpuctx, cpu); 1435 if (count_fls + sec_fls > 64) {
1436 divisor = nsec * frequency;
1437
1438 while (count_fls + sec_fls > 64) {
1439 REDUCE_FLS(count, sec);
1440 divisor >>= 1;
1441 }
1442
1443 dividend = count * sec;
1444 } else {
1445 dividend = count * sec;
1446
1447 while (nsec_fls + frequency_fls > 64) {
1448 REDUCE_FLS(nsec, frequency);
1449 dividend >>= 1;
1450 }
1451
1452 divisor = nsec * frequency;
1453 }
1454
1455 return div64_u64(dividend, divisor);
1325} 1456}
1326 1457
1327#define MAX_INTERRUPTS (~0ULL) 1458static void perf_event_stop(struct perf_event *event)
1459{
1460 if (!event->pmu->stop)
1461 return event->pmu->disable(event);
1328 1462
1329static void perf_log_throttle(struct perf_event *event, int enable); 1463 return event->pmu->stop(event);
1464}
1465
1466static int perf_event_start(struct perf_event *event)
1467{
1468 if (!event->pmu->start)
1469 return event->pmu->enable(event);
1470
1471 return event->pmu->start(event);
1472}
1330 1473
1331static void perf_adjust_period(struct perf_event *event, u64 events) 1474static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1332{ 1475{
1333 struct hw_perf_event *hwc = &event->hw; 1476 struct hw_perf_event *hwc = &event->hw;
1334 u64 period, sample_period; 1477 u64 period, sample_period;
1335 s64 delta; 1478 s64 delta;
1336 1479
1337 events *= hwc->sample_period; 1480 period = perf_calculate_period(event, nsec, count);
1338 period = div64_u64(events, event->attr.sample_freq);
1339 1481
1340 delta = (s64)(period - hwc->sample_period); 1482 delta = (s64)(period - hwc->sample_period);
1341 delta = (delta + 7) / 8; /* low pass filter */ 1483 delta = (delta + 7) / 8; /* low pass filter */
@@ -1346,19 +1488,31 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1346 sample_period = 1; 1488 sample_period = 1;
1347 1489
1348 hwc->sample_period = sample_period; 1490 hwc->sample_period = sample_period;
1491
1492 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1493 perf_disable();
1494 perf_event_stop(event);
1495 atomic64_set(&hwc->period_left, 0);
1496 perf_event_start(event);
1497 perf_enable();
1498 }
1349} 1499}
1350 1500
1351static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1501static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1352{ 1502{
1353 struct perf_event *event; 1503 struct perf_event *event;
1354 struct hw_perf_event *hwc; 1504 struct hw_perf_event *hwc;
1355 u64 interrupts, freq; 1505 u64 interrupts, now;
1506 s64 delta;
1356 1507
1357 spin_lock(&ctx->lock); 1508 raw_spin_lock(&ctx->lock);
1358 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1509 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1359 if (event->state != PERF_EVENT_STATE_ACTIVE) 1510 if (event->state != PERF_EVENT_STATE_ACTIVE)
1360 continue; 1511 continue;
1361 1512
1513 if (event->cpu != -1 && event->cpu != smp_processor_id())
1514 continue;
1515
1362 hwc = &event->hw; 1516 hwc = &event->hw;
1363 1517
1364 interrupts = hwc->interrupts; 1518 interrupts = hwc->interrupts;
@@ -1369,47 +1523,25 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1369 */ 1523 */
1370 if (interrupts == MAX_INTERRUPTS) { 1524 if (interrupts == MAX_INTERRUPTS) {
1371 perf_log_throttle(event, 1); 1525 perf_log_throttle(event, 1);
1526 perf_disable();
1372 event->pmu->unthrottle(event); 1527 event->pmu->unthrottle(event);
1373 interrupts = 2*sysctl_perf_event_sample_rate/HZ; 1528 perf_enable();
1374 } 1529 }
1375 1530
1376 if (!event->attr.freq || !event->attr.sample_freq) 1531 if (!event->attr.freq || !event->attr.sample_freq)
1377 continue; 1532 continue;
1378 1533
1379 /* 1534 perf_disable();
1380 * if the specified freq < HZ then we need to skip ticks 1535 event->pmu->read(event);
1381 */ 1536 now = atomic64_read(&event->count);
1382 if (event->attr.sample_freq < HZ) { 1537 delta = now - hwc->freq_count_stamp;
1383 freq = event->attr.sample_freq; 1538 hwc->freq_count_stamp = now;
1384
1385 hwc->freq_count += freq;
1386 hwc->freq_interrupts += interrupts;
1387
1388 if (hwc->freq_count < HZ)
1389 continue;
1390
1391 interrupts = hwc->freq_interrupts;
1392 hwc->freq_interrupts = 0;
1393 hwc->freq_count -= HZ;
1394 } else
1395 freq = HZ;
1396
1397 perf_adjust_period(event, freq * interrupts);
1398 1539
1399 /* 1540 if (delta > 0)
1400 * In order to avoid being stalled by an (accidental) huge 1541 perf_adjust_period(event, TICK_NSEC, delta);
1401 * sample period, force reset the sample period if we didn't 1542 perf_enable();
1402 * get any events in this freq period.
1403 */
1404 if (!interrupts) {
1405 perf_disable();
1406 event->pmu->disable(event);
1407 atomic64_set(&hwc->period_left, 0);
1408 event->pmu->enable(event);
1409 perf_enable();
1410 }
1411 } 1543 }
1412 spin_unlock(&ctx->lock); 1544 raw_spin_unlock(&ctx->lock);
1413} 1545}
1414 1546
1415/* 1547/*
@@ -1417,51 +1549,67 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1417 */ 1549 */
1418static void rotate_ctx(struct perf_event_context *ctx) 1550static void rotate_ctx(struct perf_event_context *ctx)
1419{ 1551{
1420 struct perf_event *event; 1552 raw_spin_lock(&ctx->lock);
1421 1553
1422 if (!ctx->nr_events) 1554 /* Rotate the first entry last of non-pinned groups */
1423 return; 1555 list_rotate_left(&ctx->flexible_groups);
1424
1425 spin_lock(&ctx->lock);
1426 /*
1427 * Rotate the first entry last (works just fine for group events too):
1428 */
1429 perf_disable();
1430 list_for_each_entry(event, &ctx->group_list, group_entry) {
1431 list_move_tail(&event->group_entry, &ctx->group_list);
1432 break;
1433 }
1434 perf_enable();
1435 1556
1436 spin_unlock(&ctx->lock); 1557 raw_spin_unlock(&ctx->lock);
1437} 1558}
1438 1559
1439void perf_event_task_tick(struct task_struct *curr, int cpu) 1560void perf_event_task_tick(struct task_struct *curr)
1440{ 1561{
1441 struct perf_cpu_context *cpuctx; 1562 struct perf_cpu_context *cpuctx;
1442 struct perf_event_context *ctx; 1563 struct perf_event_context *ctx;
1564 int rotate = 0;
1443 1565
1444 if (!atomic_read(&nr_events)) 1566 if (!atomic_read(&nr_events))
1445 return; 1567 return;
1446 1568
1447 cpuctx = &per_cpu(perf_cpu_context, cpu); 1569 cpuctx = &__get_cpu_var(perf_cpu_context);
1570 if (cpuctx->ctx.nr_events &&
1571 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1572 rotate = 1;
1573
1448 ctx = curr->perf_event_ctxp; 1574 ctx = curr->perf_event_ctxp;
1575 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1576 rotate = 1;
1449 1577
1450 perf_ctx_adjust_freq(&cpuctx->ctx); 1578 perf_ctx_adjust_freq(&cpuctx->ctx);
1451 if (ctx) 1579 if (ctx)
1452 perf_ctx_adjust_freq(ctx); 1580 perf_ctx_adjust_freq(ctx);
1453 1581
1454 perf_event_cpu_sched_out(cpuctx); 1582 if (!rotate)
1583 return;
1584
1585 perf_disable();
1586 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1455 if (ctx) 1587 if (ctx)
1456 __perf_event_task_sched_out(ctx); 1588 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1457 1589
1458 rotate_ctx(&cpuctx->ctx); 1590 rotate_ctx(&cpuctx->ctx);
1459 if (ctx) 1591 if (ctx)
1460 rotate_ctx(ctx); 1592 rotate_ctx(ctx);
1461 1593
1462 perf_event_cpu_sched_in(cpuctx, cpu); 1594 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1463 if (ctx) 1595 if (ctx)
1464 perf_event_task_sched_in(curr, cpu); 1596 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1597 perf_enable();
1598}
1599
1600static int event_enable_on_exec(struct perf_event *event,
1601 struct perf_event_context *ctx)
1602{
1603 if (!event->attr.enable_on_exec)
1604 return 0;
1605
1606 event->attr.enable_on_exec = 0;
1607 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1608 return 0;
1609
1610 __perf_event_mark_enabled(event, ctx);
1611
1612 return 1;
1465} 1613}
1466 1614
1467/* 1615/*
@@ -1474,6 +1622,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1474 struct perf_event *event; 1622 struct perf_event *event;
1475 unsigned long flags; 1623 unsigned long flags;
1476 int enabled = 0; 1624 int enabled = 0;
1625 int ret;
1477 1626
1478 local_irq_save(flags); 1627 local_irq_save(flags);
1479 ctx = task->perf_event_ctxp; 1628 ctx = task->perf_event_ctxp;
@@ -1482,16 +1631,18 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1482 1631
1483 __perf_event_task_sched_out(ctx); 1632 __perf_event_task_sched_out(ctx);
1484 1633
1485 spin_lock(&ctx->lock); 1634 raw_spin_lock(&ctx->lock);
1486 1635
1487 list_for_each_entry(event, &ctx->group_list, group_entry) { 1636 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1488 if (!event->attr.enable_on_exec) 1637 ret = event_enable_on_exec(event, ctx);
1489 continue; 1638 if (ret)
1490 event->attr.enable_on_exec = 0; 1639 enabled = 1;
1491 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1640 }
1492 continue; 1641
1493 __perf_event_mark_enabled(event, ctx); 1642 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1494 enabled = 1; 1643 ret = event_enable_on_exec(event, ctx);
1644 if (ret)
1645 enabled = 1;
1495 } 1646 }
1496 1647
1497 /* 1648 /*
@@ -1500,9 +1651,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1500 if (enabled) 1651 if (enabled)
1501 unclone_ctx(ctx); 1652 unclone_ctx(ctx);
1502 1653
1503 spin_unlock(&ctx->lock); 1654 raw_spin_unlock(&ctx->lock);
1504 1655
1505 perf_event_task_sched_in(task, smp_processor_id()); 1656 perf_event_task_sched_in(task);
1506 out: 1657 out:
1507 local_irq_restore(flags); 1658 local_irq_restore(flags);
1508} 1659}
@@ -1515,7 +1666,6 @@ static void __perf_event_read(void *info)
1515 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1666 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1516 struct perf_event *event = info; 1667 struct perf_event *event = info;
1517 struct perf_event_context *ctx = event->ctx; 1668 struct perf_event_context *ctx = event->ctx;
1518 unsigned long flags;
1519 1669
1520 /* 1670 /*
1521 * If this is a task context, we need to check whether it is 1671 * If this is a task context, we need to check whether it is
@@ -1527,12 +1677,12 @@ static void __perf_event_read(void *info)
1527 if (ctx->task && cpuctx->task_ctx != ctx) 1677 if (ctx->task && cpuctx->task_ctx != ctx)
1528 return; 1678 return;
1529 1679
1530 local_irq_save(flags); 1680 raw_spin_lock(&ctx->lock);
1531 if (ctx->is_active) 1681 update_context_time(ctx);
1532 update_context_time(ctx);
1533 event->pmu->read(event);
1534 update_event_times(event); 1682 update_event_times(event);
1535 local_irq_restore(flags); 1683 raw_spin_unlock(&ctx->lock);
1684
1685 event->pmu->read(event);
1536} 1686}
1537 1687
1538static u64 perf_event_read(struct perf_event *event) 1688static u64 perf_event_read(struct perf_event *event)
@@ -1545,7 +1695,13 @@ static u64 perf_event_read(struct perf_event *event)
1545 smp_call_function_single(event->oncpu, 1695 smp_call_function_single(event->oncpu,
1546 __perf_event_read, event, 1); 1696 __perf_event_read, event, 1);
1547 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1697 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1698 struct perf_event_context *ctx = event->ctx;
1699 unsigned long flags;
1700
1701 raw_spin_lock_irqsave(&ctx->lock, flags);
1702 update_context_time(ctx);
1548 update_event_times(event); 1703 update_event_times(event);
1704 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1549 } 1705 }
1550 1706
1551 return atomic64_read(&event->count); 1707 return atomic64_read(&event->count);
@@ -1558,10 +1714,10 @@ static void
1558__perf_event_init_context(struct perf_event_context *ctx, 1714__perf_event_init_context(struct perf_event_context *ctx,
1559 struct task_struct *task) 1715 struct task_struct *task)
1560{ 1716{
1561 memset(ctx, 0, sizeof(*ctx)); 1717 raw_spin_lock_init(&ctx->lock);
1562 spin_lock_init(&ctx->lock);
1563 mutex_init(&ctx->mutex); 1718 mutex_init(&ctx->mutex);
1564 INIT_LIST_HEAD(&ctx->group_list); 1719 INIT_LIST_HEAD(&ctx->pinned_groups);
1720 INIT_LIST_HEAD(&ctx->flexible_groups);
1565 INIT_LIST_HEAD(&ctx->event_list); 1721 INIT_LIST_HEAD(&ctx->event_list);
1566 atomic_set(&ctx->refcount, 1); 1722 atomic_set(&ctx->refcount, 1);
1567 ctx->task = task; 1723 ctx->task = task;
@@ -1575,15 +1731,12 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1575 unsigned long flags; 1731 unsigned long flags;
1576 int err; 1732 int err;
1577 1733
1578 /* 1734 if (pid == -1 && cpu != -1) {
1579 * If cpu is not a wildcard then this is a percpu event:
1580 */
1581 if (cpu != -1) {
1582 /* Must be root to operate on a CPU event: */ 1735 /* Must be root to operate on a CPU event: */
1583 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 1736 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1584 return ERR_PTR(-EACCES); 1737 return ERR_PTR(-EACCES);
1585 1738
1586 if (cpu < 0 || cpu > num_possible_cpus()) 1739 if (cpu < 0 || cpu >= nr_cpumask_bits)
1587 return ERR_PTR(-EINVAL); 1740 return ERR_PTR(-EINVAL);
1588 1741
1589 /* 1742 /*
@@ -1591,7 +1744,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1591 * offline CPU and activate it when the CPU comes up, but 1744 * offline CPU and activate it when the CPU comes up, but
1592 * that's for later. 1745 * that's for later.
1593 */ 1746 */
1594 if (!cpu_isset(cpu, cpu_online_map)) 1747 if (!cpu_online(cpu))
1595 return ERR_PTR(-ENODEV); 1748 return ERR_PTR(-ENODEV);
1596 1749
1597 cpuctx = &per_cpu(perf_cpu_context, cpu); 1750 cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -1629,11 +1782,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1629 ctx = perf_lock_task_context(task, &flags); 1782 ctx = perf_lock_task_context(task, &flags);
1630 if (ctx) { 1783 if (ctx) {
1631 unclone_ctx(ctx); 1784 unclone_ctx(ctx);
1632 spin_unlock_irqrestore(&ctx->lock, flags); 1785 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1633 } 1786 }
1634 1787
1635 if (!ctx) { 1788 if (!ctx) {
1636 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1789 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1637 err = -ENOMEM; 1790 err = -ENOMEM;
1638 if (!ctx) 1791 if (!ctx)
1639 goto errout; 1792 goto errout;
@@ -1658,6 +1811,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1658 return ERR_PTR(err); 1811 return ERR_PTR(err);
1659} 1812}
1660 1813
1814static void perf_event_free_filter(struct perf_event *event);
1815
1661static void free_event_rcu(struct rcu_head *head) 1816static void free_event_rcu(struct rcu_head *head)
1662{ 1817{
1663 struct perf_event *event; 1818 struct perf_event *event;
@@ -1665,6 +1820,7 @@ static void free_event_rcu(struct rcu_head *head)
1665 event = container_of(head, struct perf_event, rcu_head); 1820 event = container_of(head, struct perf_event, rcu_head);
1666 if (event->ns) 1821 if (event->ns)
1667 put_pid_ns(event->ns); 1822 put_pid_ns(event->ns);
1823 perf_event_free_filter(event);
1668 kfree(event); 1824 kfree(event);
1669} 1825}
1670 1826
@@ -1696,16 +1852,10 @@ static void free_event(struct perf_event *event)
1696 call_rcu(&event->rcu_head, free_event_rcu); 1852 call_rcu(&event->rcu_head, free_event_rcu);
1697} 1853}
1698 1854
1699/* 1855int perf_event_release_kernel(struct perf_event *event)
1700 * Called when the last reference to the file is gone.
1701 */
1702static int perf_release(struct inode *inode, struct file *file)
1703{ 1856{
1704 struct perf_event *event = file->private_data;
1705 struct perf_event_context *ctx = event->ctx; 1857 struct perf_event_context *ctx = event->ctx;
1706 1858
1707 file->private_data = NULL;
1708
1709 WARN_ON_ONCE(ctx->parent_ctx); 1859 WARN_ON_ONCE(ctx->parent_ctx);
1710 mutex_lock(&ctx->mutex); 1860 mutex_lock(&ctx->mutex);
1711 perf_event_remove_from_context(event); 1861 perf_event_remove_from_context(event);
@@ -1720,6 +1870,19 @@ static int perf_release(struct inode *inode, struct file *file)
1720 1870
1721 return 0; 1871 return 0;
1722} 1872}
1873EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1874
1875/*
1876 * Called when the last reference to the file is gone.
1877 */
1878static int perf_release(struct inode *inode, struct file *file)
1879{
1880 struct perf_event *event = file->private_data;
1881
1882 file->private_data = NULL;
1883
1884 return perf_event_release_kernel(event);
1885}
1723 1886
1724static int perf_event_read_size(struct perf_event *event) 1887static int perf_event_read_size(struct perf_event *event)
1725{ 1888{
@@ -1746,91 +1909,94 @@ static int perf_event_read_size(struct perf_event *event)
1746 return size; 1909 return size;
1747} 1910}
1748 1911
1749static u64 perf_event_read_value(struct perf_event *event) 1912u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1750{ 1913{
1751 struct perf_event *child; 1914 struct perf_event *child;
1752 u64 total = 0; 1915 u64 total = 0;
1753 1916
1917 *enabled = 0;
1918 *running = 0;
1919
1920 mutex_lock(&event->child_mutex);
1754 total += perf_event_read(event); 1921 total += perf_event_read(event);
1755 list_for_each_entry(child, &event->child_list, child_list) 1922 *enabled += event->total_time_enabled +
1923 atomic64_read(&event->child_total_time_enabled);
1924 *running += event->total_time_running +
1925 atomic64_read(&event->child_total_time_running);
1926
1927 list_for_each_entry(child, &event->child_list, child_list) {
1756 total += perf_event_read(child); 1928 total += perf_event_read(child);
1929 *enabled += child->total_time_enabled;
1930 *running += child->total_time_running;
1931 }
1932 mutex_unlock(&event->child_mutex);
1757 1933
1758 return total; 1934 return total;
1759} 1935}
1760 1936EXPORT_SYMBOL_GPL(perf_event_read_value);
1761static int perf_event_read_entry(struct perf_event *event,
1762 u64 read_format, char __user *buf)
1763{
1764 int n = 0, count = 0;
1765 u64 values[2];
1766
1767 values[n++] = perf_event_read_value(event);
1768 if (read_format & PERF_FORMAT_ID)
1769 values[n++] = primary_event_id(event);
1770
1771 count = n * sizeof(u64);
1772
1773 if (copy_to_user(buf, values, count))
1774 return -EFAULT;
1775
1776 return count;
1777}
1778 1937
1779static int perf_event_read_group(struct perf_event *event, 1938static int perf_event_read_group(struct perf_event *event,
1780 u64 read_format, char __user *buf) 1939 u64 read_format, char __user *buf)
1781{ 1940{
1782 struct perf_event *leader = event->group_leader, *sub; 1941 struct perf_event *leader = event->group_leader, *sub;
1783 int n = 0, size = 0, err = -EFAULT; 1942 int n = 0, size = 0, ret = -EFAULT;
1784 u64 values[3]; 1943 struct perf_event_context *ctx = leader->ctx;
1944 u64 values[5];
1945 u64 count, enabled, running;
1946
1947 mutex_lock(&ctx->mutex);
1948 count = perf_event_read_value(leader, &enabled, &running);
1785 1949
1786 values[n++] = 1 + leader->nr_siblings; 1950 values[n++] = 1 + leader->nr_siblings;
1787 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1951 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1788 values[n++] = leader->total_time_enabled + 1952 values[n++] = enabled;
1789 atomic64_read(&leader->child_total_time_enabled); 1953 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1790 } 1954 values[n++] = running;
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1955 values[n++] = count;
1792 values[n++] = leader->total_time_running + 1956 if (read_format & PERF_FORMAT_ID)
1793 atomic64_read(&leader->child_total_time_running); 1957 values[n++] = primary_event_id(leader);
1794 }
1795 1958
1796 size = n * sizeof(u64); 1959 size = n * sizeof(u64);
1797 1960
1798 if (copy_to_user(buf, values, size)) 1961 if (copy_to_user(buf, values, size))
1799 return -EFAULT; 1962 goto unlock;
1800
1801 err = perf_event_read_entry(leader, read_format, buf + size);
1802 if (err < 0)
1803 return err;
1804 1963
1805 size += err; 1964 ret = size;
1806 1965
1807 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1966 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1808 err = perf_event_read_entry(sub, read_format, 1967 n = 0;
1809 buf + size); 1968
1810 if (err < 0) 1969 values[n++] = perf_event_read_value(sub, &enabled, &running);
1811 return err; 1970 if (read_format & PERF_FORMAT_ID)
1971 values[n++] = primary_event_id(sub);
1812 1972
1813 size += err; 1973 size = n * sizeof(u64);
1974
1975 if (copy_to_user(buf + ret, values, size)) {
1976 ret = -EFAULT;
1977 goto unlock;
1978 }
1979
1980 ret += size;
1814 } 1981 }
1982unlock:
1983 mutex_unlock(&ctx->mutex);
1815 1984
1816 return size; 1985 return ret;
1817} 1986}
1818 1987
1819static int perf_event_read_one(struct perf_event *event, 1988static int perf_event_read_one(struct perf_event *event,
1820 u64 read_format, char __user *buf) 1989 u64 read_format, char __user *buf)
1821{ 1990{
1991 u64 enabled, running;
1822 u64 values[4]; 1992 u64 values[4];
1823 int n = 0; 1993 int n = 0;
1824 1994
1825 values[n++] = perf_event_read_value(event); 1995 values[n++] = perf_event_read_value(event, &enabled, &running);
1826 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1996 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1827 values[n++] = event->total_time_enabled + 1997 values[n++] = enabled;
1828 atomic64_read(&event->child_total_time_enabled); 1998 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1829 } 1999 values[n++] = running;
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1831 values[n++] = event->total_time_running +
1832 atomic64_read(&event->child_total_time_running);
1833 }
1834 if (read_format & PERF_FORMAT_ID) 2000 if (read_format & PERF_FORMAT_ID)
1835 values[n++] = primary_event_id(event); 2001 values[n++] = primary_event_id(event);
1836 2002
@@ -1861,12 +2027,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861 return -ENOSPC; 2027 return -ENOSPC;
1862 2028
1863 WARN_ON_ONCE(event->ctx->parent_ctx); 2029 WARN_ON_ONCE(event->ctx->parent_ctx);
1864 mutex_lock(&event->child_mutex);
1865 if (read_format & PERF_FORMAT_GROUP) 2030 if (read_format & PERF_FORMAT_GROUP)
1866 ret = perf_event_read_group(event, read_format, buf); 2031 ret = perf_event_read_group(event, read_format, buf);
1867 else 2032 else
1868 ret = perf_event_read_one(event, read_format, buf); 2033 ret = perf_event_read_one(event, read_format, buf);
1869 mutex_unlock(&event->child_mutex);
1870 2034
1871 return ret; 2035 return ret;
1872} 2036}
@@ -1956,7 +2120,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1956 if (!value) 2120 if (!value)
1957 return -EINVAL; 2121 return -EINVAL;
1958 2122
1959 spin_lock_irq(&ctx->lock); 2123 raw_spin_lock_irq(&ctx->lock);
1960 if (event->attr.freq) { 2124 if (event->attr.freq) {
1961 if (value > sysctl_perf_event_sample_rate) { 2125 if (value > sysctl_perf_event_sample_rate) {
1962 ret = -EINVAL; 2126 ret = -EINVAL;
@@ -1969,12 +2133,13 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1969 event->hw.sample_period = value; 2133 event->hw.sample_period = value;
1970 } 2134 }
1971unlock: 2135unlock:
1972 spin_unlock_irq(&ctx->lock); 2136 raw_spin_unlock_irq(&ctx->lock);
1973 2137
1974 return ret; 2138 return ret;
1975} 2139}
1976 2140
1977int perf_event_set_output(struct perf_event *event, int output_fd); 2141static int perf_event_set_output(struct perf_event *event, int output_fd);
2142static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1978 2143
1979static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2144static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1980{ 2145{
@@ -2002,6 +2167,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2002 case PERF_EVENT_IOC_SET_OUTPUT: 2167 case PERF_EVENT_IOC_SET_OUTPUT:
2003 return perf_event_set_output(event, arg); 2168 return perf_event_set_output(event, arg);
2004 2169
2170 case PERF_EVENT_IOC_SET_FILTER:
2171 return perf_event_set_filter(event, (void __user *)arg);
2172
2005 default: 2173 default:
2006 return -ENOTTY; 2174 return -ENOTTY;
2007 } 2175 }
@@ -2174,6 +2342,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2174 perf_mmap_free_page((unsigned long)data->user_page); 2342 perf_mmap_free_page((unsigned long)data->user_page);
2175 for (i = 0; i < data->nr_pages; i++) 2343 for (i = 0; i < data->nr_pages; i++)
2176 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2344 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2345 kfree(data);
2177} 2346}
2178 2347
2179#else 2348#else
@@ -2214,6 +2383,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2214 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2383 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2215 2384
2216 vfree(base); 2385 vfree(base);
2386 kfree(data);
2217} 2387}
2218 2388
2219static void perf_mmap_data_free(struct perf_mmap_data *data) 2389static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2307,7 +2477,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2307 } 2477 }
2308 2478
2309 if (!data->watermark) 2479 if (!data->watermark)
2310 data->watermark = max_t(long, PAGE_SIZE, max_size / 2); 2480 data->watermark = max_size / 2;
2311 2481
2312 2482
2313 rcu_assign_pointer(event->data, data); 2483 rcu_assign_pointer(event->data, data);
@@ -2319,7 +2489,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2319 2489
2320 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2490 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2321 perf_mmap_data_free(data); 2491 perf_mmap_data_free(data);
2322 kfree(data);
2323} 2492}
2324 2493
2325static void perf_mmap_data_release(struct perf_event *event) 2494static void perf_mmap_data_release(struct perf_event *event)
@@ -2420,7 +2589,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2420 if (user_locked > user_lock_limit) 2589 if (user_locked > user_lock_limit)
2421 extra = user_locked - user_lock_limit; 2590 extra = user_locked - user_lock_limit;
2422 2591
2423 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2592 lock_limit = rlimit(RLIMIT_MEMLOCK);
2424 lock_limit >>= PAGE_SHIFT; 2593 lock_limit >>= PAGE_SHIFT;
2425 locked = vma->vm_mm->locked_vm + extra; 2594 locked = vma->vm_mm->locked_vm + extra;
2426 2595
@@ -2616,6 +2785,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2616 return NULL; 2785 return NULL;
2617} 2786}
2618 2787
2788__weak
2789void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2790{
2791}
2792
2793
2619/* 2794/*
2620 * Output 2795 * Output
2621 */ 2796 */
@@ -2666,20 +2841,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2666static void perf_output_lock(struct perf_output_handle *handle) 2841static void perf_output_lock(struct perf_output_handle *handle)
2667{ 2842{
2668 struct perf_mmap_data *data = handle->data; 2843 struct perf_mmap_data *data = handle->data;
2669 int cpu; 2844 int cur, cpu = get_cpu();
2670 2845
2671 handle->locked = 0; 2846 handle->locked = 0;
2672 2847
2673 local_irq_save(handle->flags); 2848 for (;;) {
2674 cpu = smp_processor_id(); 2849 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2675 2850 if (cur == -1) {
2676 if (in_nmi() && atomic_read(&data->lock) == cpu) 2851 handle->locked = 1;
2677 return; 2852 break;
2853 }
2854 if (cur == cpu)
2855 break;
2678 2856
2679 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2680 cpu_relax(); 2857 cpu_relax();
2681 2858 }
2682 handle->locked = 1;
2683} 2859}
2684 2860
2685static void perf_output_unlock(struct perf_output_handle *handle) 2861static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2725,7 +2901,7 @@ again:
2725 if (atomic_xchg(&data->wakeup, 0)) 2901 if (atomic_xchg(&data->wakeup, 0))
2726 perf_output_wakeup(handle); 2902 perf_output_wakeup(handle);
2727out: 2903out:
2728 local_irq_restore(handle->flags); 2904 put_cpu();
2729} 2905}
2730 2906
2731void perf_output_copy(struct perf_output_handle *handle, 2907void perf_output_copy(struct perf_output_handle *handle,
@@ -3200,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event,
3200 struct perf_task_event *task_event) 3376 struct perf_task_event *task_event)
3201{ 3377{
3202 struct perf_output_handle handle; 3378 struct perf_output_handle handle;
3203 int size;
3204 struct task_struct *task = task_event->task; 3379 struct task_struct *task = task_event->task;
3205 int ret; 3380 unsigned long flags;
3381 int size, ret;
3382
3383 /*
3384 * If this CPU attempts to acquire an rq lock held by a CPU spinning
3385 * in perf_output_lock() from interrupt context, it's game over.
3386 */
3387 local_irq_save(flags);
3206 3388
3207 size = task_event->event_id.header.size; 3389 size = task_event->event_id.header.size;
3208 ret = perf_output_begin(&handle, event, size, 0, 0); 3390 ret = perf_output_begin(&handle, event, size, 0, 0);
3209 3391
3210 if (ret) 3392 if (ret) {
3393 local_irq_restore(flags);
3211 return; 3394 return;
3395 }
3212 3396
3213 task_event->event_id.pid = perf_event_pid(event, task); 3397 task_event->event_id.pid = perf_event_pid(event, task);
3214 task_event->event_id.ppid = perf_event_pid(event, current); 3398 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3216,15 +3400,20 @@ static void perf_event_task_output(struct perf_event *event,
3216 task_event->event_id.tid = perf_event_tid(event, task); 3400 task_event->event_id.tid = perf_event_tid(event, task);
3217 task_event->event_id.ptid = perf_event_tid(event, current); 3401 task_event->event_id.ptid = perf_event_tid(event, current);
3218 3402
3219 task_event->event_id.time = perf_clock();
3220
3221 perf_output_put(&handle, task_event->event_id); 3403 perf_output_put(&handle, task_event->event_id);
3222 3404
3223 perf_output_end(&handle); 3405 perf_output_end(&handle);
3406 local_irq_restore(flags);
3224} 3407}
3225 3408
3226static int perf_event_task_match(struct perf_event *event) 3409static int perf_event_task_match(struct perf_event *event)
3227{ 3410{
3411 if (event->state < PERF_EVENT_STATE_INACTIVE)
3412 return 0;
3413
3414 if (event->cpu != -1 && event->cpu != smp_processor_id())
3415 return 0;
3416
3228 if (event->attr.comm || event->attr.mmap || event->attr.task) 3417 if (event->attr.comm || event->attr.mmap || event->attr.task)
3229 return 1; 3418 return 1;
3230 3419
@@ -3236,15 +3425,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3236{ 3425{
3237 struct perf_event *event; 3426 struct perf_event *event;
3238 3427
3239 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3240 return;
3241
3242 rcu_read_lock();
3243 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3428 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3244 if (perf_event_task_match(event)) 3429 if (perf_event_task_match(event))
3245 perf_event_task_output(event, task_event); 3430 perf_event_task_output(event, task_event);
3246 } 3431 }
3247 rcu_read_unlock();
3248} 3432}
3249 3433
3250static void perf_event_task_event(struct perf_task_event *task_event) 3434static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3252,15 +3436,14 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3252 struct perf_cpu_context *cpuctx; 3436 struct perf_cpu_context *cpuctx;
3253 struct perf_event_context *ctx = task_event->task_ctx; 3437 struct perf_event_context *ctx = task_event->task_ctx;
3254 3438
3439 rcu_read_lock();
3255 cpuctx = &get_cpu_var(perf_cpu_context); 3440 cpuctx = &get_cpu_var(perf_cpu_context);
3256 perf_event_task_ctx(&cpuctx->ctx, task_event); 3441 perf_event_task_ctx(&cpuctx->ctx, task_event);
3257 put_cpu_var(perf_cpu_context);
3258
3259 rcu_read_lock();
3260 if (!ctx) 3442 if (!ctx)
3261 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3443 ctx = rcu_dereference(current->perf_event_ctxp);
3262 if (ctx) 3444 if (ctx)
3263 perf_event_task_ctx(ctx, task_event); 3445 perf_event_task_ctx(ctx, task_event);
3446 put_cpu_var(perf_cpu_context);
3264 rcu_read_unlock(); 3447 rcu_read_unlock();
3265} 3448}
3266 3449
@@ -3288,6 +3471,7 @@ static void perf_event_task(struct task_struct *task,
3288 /* .ppid */ 3471 /* .ppid */
3289 /* .tid */ 3472 /* .tid */
3290 /* .ptid */ 3473 /* .ptid */
3474 .time = perf_clock(),
3291 }, 3475 },
3292 }; 3476 };
3293 3477
@@ -3337,6 +3521,12 @@ static void perf_event_comm_output(struct perf_event *event,
3337 3521
3338static int perf_event_comm_match(struct perf_event *event) 3522static int perf_event_comm_match(struct perf_event *event)
3339{ 3523{
3524 if (event->state < PERF_EVENT_STATE_INACTIVE)
3525 return 0;
3526
3527 if (event->cpu != -1 && event->cpu != smp_processor_id())
3528 return 0;
3529
3340 if (event->attr.comm) 3530 if (event->attr.comm)
3341 return 1; 3531 return 1;
3342 3532
@@ -3348,15 +3538,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3348{ 3538{
3349 struct perf_event *event; 3539 struct perf_event *event;
3350 3540
3351 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3352 return;
3353
3354 rcu_read_lock();
3355 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3541 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3356 if (perf_event_comm_match(event)) 3542 if (perf_event_comm_match(event))
3357 perf_event_comm_output(event, comm_event); 3543 perf_event_comm_output(event, comm_event);
3358 } 3544 }
3359 rcu_read_unlock();
3360} 3545}
3361 3546
3362static void perf_event_comm_event(struct perf_comm_event *comm_event) 3547static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3367,7 +3552,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3367 char comm[TASK_COMM_LEN]; 3552 char comm[TASK_COMM_LEN];
3368 3553
3369 memset(comm, 0, sizeof(comm)); 3554 memset(comm, 0, sizeof(comm));
3370 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3555 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3371 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3556 size = ALIGN(strlen(comm)+1, sizeof(u64));
3372 3557
3373 comm_event->comm = comm; 3558 comm_event->comm = comm;
@@ -3375,18 +3560,13 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3375 3560
3376 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3561 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3377 3562
3563 rcu_read_lock();
3378 cpuctx = &get_cpu_var(perf_cpu_context); 3564 cpuctx = &get_cpu_var(perf_cpu_context);
3379 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3565 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3380 put_cpu_var(perf_cpu_context);
3381
3382 rcu_read_lock();
3383 /*
3384 * doesn't really matter which of the child contexts the
3385 * events ends up in.
3386 */
3387 ctx = rcu_dereference(current->perf_event_ctxp); 3566 ctx = rcu_dereference(current->perf_event_ctxp);
3388 if (ctx) 3567 if (ctx)
3389 perf_event_comm_ctx(ctx, comm_event); 3568 perf_event_comm_ctx(ctx, comm_event);
3569 put_cpu_var(perf_cpu_context);
3390 rcu_read_unlock(); 3570 rcu_read_unlock();
3391} 3571}
3392 3572
@@ -3461,6 +3641,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3461static int perf_event_mmap_match(struct perf_event *event, 3641static int perf_event_mmap_match(struct perf_event *event,
3462 struct perf_mmap_event *mmap_event) 3642 struct perf_mmap_event *mmap_event)
3463{ 3643{
3644 if (event->state < PERF_EVENT_STATE_INACTIVE)
3645 return 0;
3646
3647 if (event->cpu != -1 && event->cpu != smp_processor_id())
3648 return 0;
3649
3464 if (event->attr.mmap) 3650 if (event->attr.mmap)
3465 return 1; 3651 return 1;
3466 3652
@@ -3472,15 +3658,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3472{ 3658{
3473 struct perf_event *event; 3659 struct perf_event *event;
3474 3660
3475 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3476 return;
3477
3478 rcu_read_lock();
3479 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3661 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3480 if (perf_event_mmap_match(event, mmap_event)) 3662 if (perf_event_mmap_match(event, mmap_event))
3481 perf_event_mmap_output(event, mmap_event); 3663 perf_event_mmap_output(event, mmap_event);
3482 } 3664 }
3483 rcu_read_unlock();
3484} 3665}
3485 3666
3486static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3667static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3536,18 +3717,13 @@ got_name:
3536 3717
3537 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3718 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3538 3719
3720 rcu_read_lock();
3539 cpuctx = &get_cpu_var(perf_cpu_context); 3721 cpuctx = &get_cpu_var(perf_cpu_context);
3540 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3722 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3541 put_cpu_var(perf_cpu_context);
3542
3543 rcu_read_lock();
3544 /*
3545 * doesn't really matter which of the child contexts the
3546 * events ends up in.
3547 */
3548 ctx = rcu_dereference(current->perf_event_ctxp); 3723 ctx = rcu_dereference(current->perf_event_ctxp);
3549 if (ctx) 3724 if (ctx)
3550 perf_event_mmap_ctx(ctx, mmap_event); 3725 perf_event_mmap_ctx(ctx, mmap_event);
3726 put_cpu_var(perf_cpu_context);
3551 rcu_read_unlock(); 3727 rcu_read_unlock();
3552 3728
3553 kfree(buf); 3729 kfree(buf);
@@ -3574,7 +3750,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3574 /* .tid */ 3750 /* .tid */
3575 .start = vma->vm_start, 3751 .start = vma->vm_start,
3576 .len = vma->vm_end - vma->vm_start, 3752 .len = vma->vm_end - vma->vm_start,
3577 .pgoff = vma->vm_pgoff, 3753 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3578 }, 3754 },
3579 }; 3755 };
3580 3756
@@ -3654,12 +3830,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3654 3830
3655 if (event->attr.freq) { 3831 if (event->attr.freq) {
3656 u64 now = perf_clock(); 3832 u64 now = perf_clock();
3657 s64 delta = now - hwc->freq_stamp; 3833 s64 delta = now - hwc->freq_time_stamp;
3658 3834
3659 hwc->freq_stamp = now; 3835 hwc->freq_time_stamp = now;
3660 3836
3661 if (delta > 0 && delta < TICK_NSEC) 3837 if (delta > 0 && delta < 2*TICK_NSEC)
3662 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3838 perf_adjust_period(event, delta, hwc->last_period);
3663 } 3839 }
3664 3840
3665 /* 3841 /*
@@ -3679,7 +3855,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3679 perf_event_disable(event); 3855 perf_event_disable(event);
3680 } 3856 }
3681 3857
3682 perf_event_output(event, nmi, data, regs); 3858 if (event->overflow_handler)
3859 event->overflow_handler(event, nmi, data, regs);
3860 else
3861 perf_event_output(event, nmi, data, regs);
3862
3683 return ret; 3863 return ret;
3684} 3864}
3685 3865
@@ -3724,16 +3904,16 @@ again:
3724 return nr; 3904 return nr;
3725} 3905}
3726 3906
3727static void perf_swevent_overflow(struct perf_event *event, 3907static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3728 int nmi, struct perf_sample_data *data, 3908 int nmi, struct perf_sample_data *data,
3729 struct pt_regs *regs) 3909 struct pt_regs *regs)
3730{ 3910{
3731 struct hw_perf_event *hwc = &event->hw; 3911 struct hw_perf_event *hwc = &event->hw;
3732 int throttle = 0; 3912 int throttle = 0;
3733 u64 overflow;
3734 3913
3735 data->period = event->hw.last_period; 3914 data->period = event->hw.last_period;
3736 overflow = perf_swevent_set_period(event); 3915 if (!overflow)
3916 overflow = perf_swevent_set_period(event);
3737 3917
3738 if (hwc->interrupts == MAX_INTERRUPTS) 3918 if (hwc->interrupts == MAX_INTERRUPTS)
3739 return; 3919 return;
@@ -3766,14 +3946,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3766 3946
3767 atomic64_add(nr, &event->count); 3947 atomic64_add(nr, &event->count);
3768 3948
3949 if (!regs)
3950 return;
3951
3769 if (!hwc->sample_period) 3952 if (!hwc->sample_period)
3770 return; 3953 return;
3771 3954
3772 if (!regs) 3955 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3956 return perf_swevent_overflow(event, 1, nmi, data, regs);
3957
3958 if (atomic64_add_negative(nr, &hwc->period_left))
3773 return; 3959 return;
3774 3960
3775 if (!atomic64_add_negative(nr, &hwc->period_left)) 3961 perf_swevent_overflow(event, 0, nmi, data, regs);
3776 perf_swevent_overflow(event, nmi, data, regs);
3777} 3962}
3778 3963
3779static int perf_swevent_is_counting(struct perf_event *event) 3964static int perf_swevent_is_counting(struct perf_event *event)
@@ -3806,25 +3991,47 @@ static int perf_swevent_is_counting(struct perf_event *event)
3806 return 1; 3991 return 1;
3807} 3992}
3808 3993
3994static int perf_tp_event_match(struct perf_event *event,
3995 struct perf_sample_data *data);
3996
3997static int perf_exclude_event(struct perf_event *event,
3998 struct pt_regs *regs)
3999{
4000 if (regs) {
4001 if (event->attr.exclude_user && user_mode(regs))
4002 return 1;
4003
4004 if (event->attr.exclude_kernel && !user_mode(regs))
4005 return 1;
4006 }
4007
4008 return 0;
4009}
4010
3809static int perf_swevent_match(struct perf_event *event, 4011static int perf_swevent_match(struct perf_event *event,
3810 enum perf_type_id type, 4012 enum perf_type_id type,
3811 u32 event_id, struct pt_regs *regs) 4013 u32 event_id,
4014 struct perf_sample_data *data,
4015 struct pt_regs *regs)
3812{ 4016{
4017 if (event->cpu != -1 && event->cpu != smp_processor_id())
4018 return 0;
4019
3813 if (!perf_swevent_is_counting(event)) 4020 if (!perf_swevent_is_counting(event))
3814 return 0; 4021 return 0;
3815 4022
3816 if (event->attr.type != type) 4023 if (event->attr.type != type)
3817 return 0; 4024 return 0;
4025
3818 if (event->attr.config != event_id) 4026 if (event->attr.config != event_id)
3819 return 0; 4027 return 0;
3820 4028
3821 if (regs) { 4029 if (perf_exclude_event(event, regs))
3822 if (event->attr.exclude_user && user_mode(regs)) 4030 return 0;
3823 return 0;
3824 4031
3825 if (event->attr.exclude_kernel && !user_mode(regs)) 4032 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3826 return 0; 4033 !perf_tp_event_match(event, data))
3827 } 4034 return 0;
3828 4035
3829 return 1; 4036 return 1;
3830} 4037}
@@ -3837,49 +4044,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3837{ 4044{
3838 struct perf_event *event; 4045 struct perf_event *event;
3839 4046
3840 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3841 return;
3842
3843 rcu_read_lock();
3844 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4047 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3845 if (perf_swevent_match(event, type, event_id, regs)) 4048 if (perf_swevent_match(event, type, event_id, data, regs))
3846 perf_swevent_add(event, nr, nmi, data, regs); 4049 perf_swevent_add(event, nr, nmi, data, regs);
3847 } 4050 }
3848 rcu_read_unlock();
3849} 4051}
3850 4052
3851static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 4053int perf_swevent_get_recursion_context(void)
3852{ 4054{
4055 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
4056 int rctx;
4057
3853 if (in_nmi()) 4058 if (in_nmi())
3854 return &cpuctx->recursion[3]; 4059 rctx = 3;
4060 else if (in_irq())
4061 rctx = 2;
4062 else if (in_softirq())
4063 rctx = 1;
4064 else
4065 rctx = 0;
4066
4067 if (cpuctx->recursion[rctx]) {
4068 put_cpu_var(perf_cpu_context);
4069 return -1;
4070 }
3855 4071
3856 if (in_irq()) 4072 cpuctx->recursion[rctx]++;
3857 return &cpuctx->recursion[2]; 4073 barrier();
3858 4074
3859 if (in_softirq()) 4075 return rctx;
3860 return &cpuctx->recursion[1]; 4076}
4077EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3861 4078
3862 return &cpuctx->recursion[0]; 4079void perf_swevent_put_recursion_context(int rctx)
4080{
4081 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4082 barrier();
4083 cpuctx->recursion[rctx]--;
4084 put_cpu_var(perf_cpu_context);
3863} 4085}
4086EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3864 4087
3865static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4088static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3866 u64 nr, int nmi, 4089 u64 nr, int nmi,
3867 struct perf_sample_data *data, 4090 struct perf_sample_data *data,
3868 struct pt_regs *regs) 4091 struct pt_regs *regs)
3869{ 4092{
3870 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 4093 struct perf_cpu_context *cpuctx;
3871 int *recursion = perf_swevent_recursion_context(cpuctx);
3872 struct perf_event_context *ctx; 4094 struct perf_event_context *ctx;
3873 4095
3874 if (*recursion) 4096 cpuctx = &__get_cpu_var(perf_cpu_context);
3875 goto out; 4097 rcu_read_lock();
3876
3877 (*recursion)++;
3878 barrier();
3879
3880 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 4098 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3881 nr, nmi, data, regs); 4099 nr, nmi, data, regs);
3882 rcu_read_lock();
3883 /* 4100 /*
3884 * doesn't really matter which of the child contexts the 4101 * doesn't really matter which of the child contexts the
3885 * events ends up in. 4102 * events ends up in.
@@ -3888,23 +4105,23 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3888 if (ctx) 4105 if (ctx)
3889 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 4106 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3890 rcu_read_unlock(); 4107 rcu_read_unlock();
3891
3892 barrier();
3893 (*recursion)--;
3894
3895out:
3896 put_cpu_var(perf_cpu_context);
3897} 4108}
3898 4109
3899void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4110void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3900 struct pt_regs *regs, u64 addr) 4111 struct pt_regs *regs, u64 addr)
3901{ 4112{
3902 struct perf_sample_data data = { 4113 struct perf_sample_data data;
3903 .addr = addr, 4114 int rctx;
3904 };
3905 4115
3906 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 4116 rctx = perf_swevent_get_recursion_context();
3907 &data, regs); 4117 if (rctx < 0)
4118 return;
4119
4120 perf_sample_data_init(&data, addr);
4121
4122 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4123
4124 perf_swevent_put_recursion_context(rctx);
3908} 4125}
3909 4126
3910static void perf_swevent_read(struct perf_event *event) 4127static void perf_swevent_read(struct perf_event *event)
@@ -3945,10 +4162,11 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3945 struct perf_event *event; 4162 struct perf_event *event;
3946 u64 period; 4163 u64 period;
3947 4164
3948 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4165 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3949 event->pmu->read(event); 4166 event->pmu->read(event);
3950 4167
3951 data.addr = 0; 4168 perf_sample_data_init(&data, 0);
4169 data.period = event->hw.last_period;
3952 regs = get_irq_regs(); 4170 regs = get_irq_regs();
3953 /* 4171 /*
3954 * In case we exclude kernel IPs or are somehow not in interrupt 4172 * In case we exclude kernel IPs or are somehow not in interrupt
@@ -4017,8 +4235,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4017 u64 now; 4235 u64 now;
4018 4236
4019 now = cpu_clock(cpu); 4237 now = cpu_clock(cpu);
4020 prev = atomic64_read(&event->hw.prev_count); 4238 prev = atomic64_xchg(&event->hw.prev_count, now);
4021 atomic64_set(&event->hw.prev_count, now);
4022 atomic64_add(now - prev, &event->count); 4239 atomic64_add(now - prev, &event->count);
4023} 4240}
4024 4241
@@ -4107,36 +4324,39 @@ static const struct pmu perf_ops_task_clock = {
4107 .read = task_clock_perf_event_read, 4324 .read = task_clock_perf_event_read,
4108}; 4325};
4109 4326
4110#ifdef CONFIG_EVENT_PROFILE 4327#ifdef CONFIG_EVENT_TRACING
4328
4111void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4329void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4112 int entry_size) 4330 int entry_size, struct pt_regs *regs)
4113{ 4331{
4332 struct perf_sample_data data;
4114 struct perf_raw_record raw = { 4333 struct perf_raw_record raw = {
4115 .size = entry_size, 4334 .size = entry_size,
4116 .data = record, 4335 .data = record,
4117 }; 4336 };
4118 4337
4119 struct perf_sample_data data = { 4338 perf_sample_data_init(&data, addr);
4120 .addr = addr, 4339 data.raw = &raw;
4121 .raw = &raw,
4122 };
4123
4124 struct pt_regs *regs = get_irq_regs();
4125
4126 if (!regs)
4127 regs = task_pt_regs(current);
4128 4340
4341 /* Trace events already protected against recursion */
4129 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4342 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4130 &data, regs); 4343 &data, regs);
4131} 4344}
4132EXPORT_SYMBOL_GPL(perf_tp_event); 4345EXPORT_SYMBOL_GPL(perf_tp_event);
4133 4346
4134extern int ftrace_profile_enable(int); 4347static int perf_tp_event_match(struct perf_event *event,
4135extern void ftrace_profile_disable(int); 4348 struct perf_sample_data *data)
4349{
4350 void *record = data->raw->data;
4351
4352 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4353 return 1;
4354 return 0;
4355}
4136 4356
4137static void tp_perf_event_destroy(struct perf_event *event) 4357static void tp_perf_event_destroy(struct perf_event *event)
4138{ 4358{
4139 ftrace_profile_disable(event->attr.config); 4359 perf_trace_disable(event->attr.config);
4140} 4360}
4141 4361
4142static const struct pmu *tp_perf_event_init(struct perf_event *event) 4362static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4150,18 +4370,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4150 !capable(CAP_SYS_ADMIN)) 4370 !capable(CAP_SYS_ADMIN))
4151 return ERR_PTR(-EPERM); 4371 return ERR_PTR(-EPERM);
4152 4372
4153 if (ftrace_profile_enable(event->attr.config)) 4373 if (perf_trace_enable(event->attr.config))
4154 return NULL; 4374 return NULL;
4155 4375
4156 event->destroy = tp_perf_event_destroy; 4376 event->destroy = tp_perf_event_destroy;
4157 4377
4158 return &perf_ops_generic; 4378 return &perf_ops_generic;
4159} 4379}
4380
4381static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4382{
4383 char *filter_str;
4384 int ret;
4385
4386 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4387 return -EINVAL;
4388
4389 filter_str = strndup_user(arg, PAGE_SIZE);
4390 if (IS_ERR(filter_str))
4391 return PTR_ERR(filter_str);
4392
4393 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4394
4395 kfree(filter_str);
4396 return ret;
4397}
4398
4399static void perf_event_free_filter(struct perf_event *event)
4400{
4401 ftrace_profile_free_filter(event);
4402}
4403
4160#else 4404#else
4405
4406static int perf_tp_event_match(struct perf_event *event,
4407 struct perf_sample_data *data)
4408{
4409 return 1;
4410}
4411
4161static const struct pmu *tp_perf_event_init(struct perf_event *event) 4412static const struct pmu *tp_perf_event_init(struct perf_event *event)
4162{ 4413{
4163 return NULL; 4414 return NULL;
4164} 4415}
4416
4417static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4418{
4419 return -ENOENT;
4420}
4421
4422static void perf_event_free_filter(struct perf_event *event)
4423{
4424}
4425
4426#endif /* CONFIG_EVENT_TRACING */
4427
4428#ifdef CONFIG_HAVE_HW_BREAKPOINT
4429static void bp_perf_event_destroy(struct perf_event *event)
4430{
4431 release_bp_slot(event);
4432}
4433
4434static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4435{
4436 int err;
4437
4438 err = register_perf_hw_breakpoint(bp);
4439 if (err)
4440 return ERR_PTR(err);
4441
4442 bp->destroy = bp_perf_event_destroy;
4443
4444 return &perf_ops_bp;
4445}
4446
4447void perf_bp_event(struct perf_event *bp, void *data)
4448{
4449 struct perf_sample_data sample;
4450 struct pt_regs *regs = data;
4451
4452 perf_sample_data_init(&sample, bp->attr.bp_addr);
4453
4454 if (!perf_exclude_event(bp, regs))
4455 perf_swevent_add(bp, 1, 1, &sample, regs);
4456}
4457#else
4458static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4459{
4460 return NULL;
4461}
4462
4463void perf_bp_event(struct perf_event *bp, void *regs)
4464{
4465}
4165#endif 4466#endif
4166 4467
4167atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4468atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4208,6 +4509,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4208 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4509 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4209 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4510 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4210 case PERF_COUNT_SW_CPU_MIGRATIONS: 4511 case PERF_COUNT_SW_CPU_MIGRATIONS:
4512 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4513 case PERF_COUNT_SW_EMULATION_FAULTS:
4211 if (!event->parent) { 4514 if (!event->parent) {
4212 atomic_inc(&perf_swevent_enabled[event_id]); 4515 atomic_inc(&perf_swevent_enabled[event_id]);
4213 event->destroy = sw_perf_event_destroy; 4516 event->destroy = sw_perf_event_destroy;
@@ -4228,6 +4531,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4228 struct perf_event_context *ctx, 4531 struct perf_event_context *ctx,
4229 struct perf_event *group_leader, 4532 struct perf_event *group_leader,
4230 struct perf_event *parent_event, 4533 struct perf_event *parent_event,
4534 perf_overflow_handler_t overflow_handler,
4231 gfp_t gfpflags) 4535 gfp_t gfpflags)
4232{ 4536{
4233 const struct pmu *pmu; 4537 const struct pmu *pmu;
@@ -4270,6 +4574,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4270 4574
4271 event->state = PERF_EVENT_STATE_INACTIVE; 4575 event->state = PERF_EVENT_STATE_INACTIVE;
4272 4576
4577 if (!overflow_handler && parent_event)
4578 overflow_handler = parent_event->overflow_handler;
4579
4580 event->overflow_handler = overflow_handler;
4581
4273 if (attr->disabled) 4582 if (attr->disabled)
4274 event->state = PERF_EVENT_STATE_OFF; 4583 event->state = PERF_EVENT_STATE_OFF;
4275 4584
@@ -4304,6 +4613,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4304 pmu = tp_perf_event_init(event); 4613 pmu = tp_perf_event_init(event);
4305 break; 4614 break;
4306 4615
4616 case PERF_TYPE_BREAKPOINT:
4617 pmu = bp_perf_event_init(event);
4618 break;
4619
4620
4307 default: 4621 default:
4308 break; 4622 break;
4309 } 4623 }
@@ -4398,7 +4712,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4398 if (attr->type >= PERF_TYPE_MAX) 4712 if (attr->type >= PERF_TYPE_MAX)
4399 return -EINVAL; 4713 return -EINVAL;
4400 4714
4401 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) 4715 if (attr->__reserved_1)
4402 return -EINVAL; 4716 return -EINVAL;
4403 4717
4404 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4718 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4416,7 +4730,7 @@ err_size:
4416 goto out; 4730 goto out;
4417} 4731}
4418 4732
4419int perf_event_set_output(struct perf_event *event, int output_fd) 4733static int perf_event_set_output(struct perf_event *event, int output_fd)
4420{ 4734{
4421 struct perf_event *output_event = NULL; 4735 struct perf_event *output_event = NULL;
4422 struct file *output_file = NULL; 4736 struct file *output_file = NULL;
@@ -4546,12 +4860,12 @@ SYSCALL_DEFINE5(perf_event_open,
4546 } 4860 }
4547 4861
4548 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4862 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4549 NULL, GFP_KERNEL); 4863 NULL, NULL, GFP_KERNEL);
4550 err = PTR_ERR(event); 4864 err = PTR_ERR(event);
4551 if (IS_ERR(event)) 4865 if (IS_ERR(event))
4552 goto err_put_context; 4866 goto err_put_context;
4553 4867
4554 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); 4868 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4555 if (err < 0) 4869 if (err < 0)
4556 goto err_free_put_context; 4870 goto err_free_put_context;
4557 4871
@@ -4583,7 +4897,7 @@ err_fput_free_put_context:
4583 4897
4584err_free_put_context: 4898err_free_put_context:
4585 if (err < 0) 4899 if (err < 0)
4586 kfree(event); 4900 free_event(event);
4587 4901
4588err_put_context: 4902err_put_context:
4589 if (err < 0) 4903 if (err < 0)
@@ -4594,6 +4908,61 @@ err_put_context:
4594 return err; 4908 return err;
4595} 4909}
4596 4910
4911/**
4912 * perf_event_create_kernel_counter
4913 *
4914 * @attr: attributes of the counter to create
4915 * @cpu: cpu in which the counter is bound
4916 * @pid: task to profile
4917 */
4918struct perf_event *
4919perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4920 pid_t pid,
4921 perf_overflow_handler_t overflow_handler)
4922{
4923 struct perf_event *event;
4924 struct perf_event_context *ctx;
4925 int err;
4926
4927 /*
4928 * Get the target context (task or percpu):
4929 */
4930
4931 ctx = find_get_context(pid, cpu);
4932 if (IS_ERR(ctx)) {
4933 err = PTR_ERR(ctx);
4934 goto err_exit;
4935 }
4936
4937 event = perf_event_alloc(attr, cpu, ctx, NULL,
4938 NULL, overflow_handler, GFP_KERNEL);
4939 if (IS_ERR(event)) {
4940 err = PTR_ERR(event);
4941 goto err_put_context;
4942 }
4943
4944 event->filp = NULL;
4945 WARN_ON_ONCE(ctx->parent_ctx);
4946 mutex_lock(&ctx->mutex);
4947 perf_install_in_context(ctx, event, cpu);
4948 ++ctx->generation;
4949 mutex_unlock(&ctx->mutex);
4950
4951 event->owner = current;
4952 get_task_struct(current);
4953 mutex_lock(&current->perf_event_mutex);
4954 list_add_tail(&event->owner_entry, &current->perf_event_list);
4955 mutex_unlock(&current->perf_event_mutex);
4956
4957 return event;
4958
4959 err_put_context:
4960 put_ctx(ctx);
4961 err_exit:
4962 return ERR_PTR(err);
4963}
4964EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4965
4597/* 4966/*
4598 * inherit a event from parent task to child task: 4967 * inherit a event from parent task to child task:
4599 */ 4968 */
@@ -4619,7 +4988,7 @@ inherit_event(struct perf_event *parent_event,
4619 child_event = perf_event_alloc(&parent_event->attr, 4988 child_event = perf_event_alloc(&parent_event->attr,
4620 parent_event->cpu, child_ctx, 4989 parent_event->cpu, child_ctx,
4621 group_leader, parent_event, 4990 group_leader, parent_event,
4622 GFP_KERNEL); 4991 NULL, GFP_KERNEL);
4623 if (IS_ERR(child_event)) 4992 if (IS_ERR(child_event))
4624 return child_event; 4993 return child_event;
4625 get_ctx(child_ctx); 4994 get_ctx(child_ctx);
@@ -4634,8 +5003,17 @@ inherit_event(struct perf_event *parent_event,
4634 else 5003 else
4635 child_event->state = PERF_EVENT_STATE_OFF; 5004 child_event->state = PERF_EVENT_STATE_OFF;
4636 5005
4637 if (parent_event->attr.freq) 5006 if (parent_event->attr.freq) {
4638 child_event->hw.sample_period = parent_event->hw.sample_period; 5007 u64 sample_period = parent_event->hw.sample_period;
5008 struct hw_perf_event *hwc = &child_event->hw;
5009
5010 hwc->sample_period = sample_period;
5011 hwc->last_period = sample_period;
5012
5013 atomic64_set(&hwc->period_left, sample_period);
5014 }
5015
5016 child_event->overflow_handler = parent_event->overflow_handler;
4639 5017
4640 /* 5018 /*
4641 * Link it up in the child's context: 5019 * Link it up in the child's context:
@@ -4726,7 +5104,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4726{ 5104{
4727 struct perf_event *parent_event; 5105 struct perf_event *parent_event;
4728 5106
4729 update_event_times(child_event);
4730 perf_event_remove_from_context(child_event); 5107 perf_event_remove_from_context(child_event);
4731 5108
4732 parent_event = child_event->parent; 5109 parent_event = child_event->parent;
@@ -4770,7 +5147,7 @@ void perf_event_exit_task(struct task_struct *child)
4770 * reading child->perf_event_ctxp, we wait until it has 5147 * reading child->perf_event_ctxp, we wait until it has
4771 * incremented the context's refcount before we do put_ctx below. 5148 * incremented the context's refcount before we do put_ctx below.
4772 */ 5149 */
4773 spin_lock(&child_ctx->lock); 5150 raw_spin_lock(&child_ctx->lock);
4774 child->perf_event_ctxp = NULL; 5151 child->perf_event_ctxp = NULL;
4775 /* 5152 /*
4776 * If this context is a clone; unclone it so it can't get 5153 * If this context is a clone; unclone it so it can't get
@@ -4778,7 +5155,8 @@ void perf_event_exit_task(struct task_struct *child)
4778 * the events from it. 5155 * the events from it.
4779 */ 5156 */
4780 unclone_ctx(child_ctx); 5157 unclone_ctx(child_ctx);
4781 spin_unlock_irqrestore(&child_ctx->lock, flags); 5158 update_context_time(child_ctx);
5159 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
4782 5160
4783 /* 5161 /*
4784 * Report the task dead after unscheduling the events so that we 5162 * Report the task dead after unscheduling the events so that we
@@ -4801,7 +5179,11 @@ void perf_event_exit_task(struct task_struct *child)
4801 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5179 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4802 5180
4803again: 5181again:
4804 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5182 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5183 group_entry)
5184 __perf_event_exit_task(child_event, child_ctx, child);
5185
5186 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
4805 group_entry) 5187 group_entry)
4806 __perf_event_exit_task(child_event, child_ctx, child); 5188 __perf_event_exit_task(child_event, child_ctx, child);
4807 5189
@@ -4810,7 +5192,8 @@ again:
4810 * its siblings to the list, but we obtained 'tmp' before that which 5192 * its siblings to the list, but we obtained 'tmp' before that which
4811 * will still point to the list head terminating the iteration. 5193 * will still point to the list head terminating the iteration.
4812 */ 5194 */
4813 if (!list_empty(&child_ctx->group_list)) 5195 if (!list_empty(&child_ctx->pinned_groups) ||
5196 !list_empty(&child_ctx->flexible_groups))
4814 goto again; 5197 goto again;
4815 5198
4816 mutex_unlock(&child_ctx->mutex); 5199 mutex_unlock(&child_ctx->mutex);
@@ -4818,6 +5201,24 @@ again:
4818 put_ctx(child_ctx); 5201 put_ctx(child_ctx);
4819} 5202}
4820 5203
5204static void perf_free_event(struct perf_event *event,
5205 struct perf_event_context *ctx)
5206{
5207 struct perf_event *parent = event->parent;
5208
5209 if (WARN_ON_ONCE(!parent))
5210 return;
5211
5212 mutex_lock(&parent->child_mutex);
5213 list_del_init(&event->child_list);
5214 mutex_unlock(&parent->child_mutex);
5215
5216 fput(parent->filp);
5217
5218 list_del_event(event, ctx);
5219 free_event(event);
5220}
5221
4821/* 5222/*
4822 * free an unexposed, unused context as created by inheritance by 5223 * free an unexposed, unused context as created by inheritance by
4823 * init_task below, used by fork() in case of fail. 5224 * init_task below, used by fork() in case of fail.
@@ -4832,30 +5233,64 @@ void perf_event_free_task(struct task_struct *task)
4832 5233
4833 mutex_lock(&ctx->mutex); 5234 mutex_lock(&ctx->mutex);
4834again: 5235again:
4835 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5236 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
4836 struct perf_event *parent = event->parent; 5237 perf_free_event(event, ctx);
4837 5238
4838 if (WARN_ON_ONCE(!parent)) 5239 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
4839 continue; 5240 group_entry)
5241 perf_free_event(event, ctx);
4840 5242
4841 mutex_lock(&parent->child_mutex); 5243 if (!list_empty(&ctx->pinned_groups) ||
4842 list_del_init(&event->child_list); 5244 !list_empty(&ctx->flexible_groups))
4843 mutex_unlock(&parent->child_mutex); 5245 goto again;
4844 5246
4845 fput(parent->filp); 5247 mutex_unlock(&ctx->mutex);
4846 5248
4847 list_del_event(event, ctx); 5249 put_ctx(ctx);
4848 free_event(event); 5250}
5251
5252static int
5253inherit_task_group(struct perf_event *event, struct task_struct *parent,
5254 struct perf_event_context *parent_ctx,
5255 struct task_struct *child,
5256 int *inherited_all)
5257{
5258 int ret;
5259 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5260
5261 if (!event->attr.inherit) {
5262 *inherited_all = 0;
5263 return 0;
4849 } 5264 }
4850 5265
4851 if (!list_empty(&ctx->group_list)) 5266 if (!child_ctx) {
4852 goto again; 5267 /*
5268 * This is executed from the parent task context, so
5269 * inherit events that have been marked for cloning.
5270 * First allocate and initialize a context for the
5271 * child.
5272 */
4853 5273
4854 mutex_unlock(&ctx->mutex); 5274 child_ctx = kzalloc(sizeof(struct perf_event_context),
5275 GFP_KERNEL);
5276 if (!child_ctx)
5277 return -ENOMEM;
4855 5278
4856 put_ctx(ctx); 5279 __perf_event_init_context(child_ctx, child);
5280 child->perf_event_ctxp = child_ctx;
5281 get_task_struct(child);
5282 }
5283
5284 ret = inherit_group(event, parent, parent_ctx,
5285 child, child_ctx);
5286
5287 if (ret)
5288 *inherited_all = 0;
5289
5290 return ret;
4857} 5291}
4858 5292
5293
4859/* 5294/*
4860 * Initialize the perf_event context in task_struct 5295 * Initialize the perf_event context in task_struct
4861 */ 5296 */
@@ -4877,20 +5312,6 @@ int perf_event_init_task(struct task_struct *child)
4877 return 0; 5312 return 0;
4878 5313
4879 /* 5314 /*
4880 * This is executed from the parent task context, so inherit
4881 * events that have been marked for cloning.
4882 * First allocate and initialize a context for the child.
4883 */
4884
4885 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4886 if (!child_ctx)
4887 return -ENOMEM;
4888
4889 __perf_event_init_context(child_ctx, child);
4890 child->perf_event_ctxp = child_ctx;
4891 get_task_struct(child);
4892
4893 /*
4894 * If the parent's context is a clone, pin it so it won't get 5315 * If the parent's context is a clone, pin it so it won't get
4895 * swapped under us. 5316 * swapped under us.
4896 */ 5317 */
@@ -4913,22 +5334,23 @@ int perf_event_init_task(struct task_struct *child)
4913 * We dont have to disable NMIs - we are only looking at 5334 * We dont have to disable NMIs - we are only looking at
4914 * the list, not manipulating it: 5335 * the list, not manipulating it:
4915 */ 5336 */
4916 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5337 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
4917 5338 ret = inherit_task_group(event, parent, parent_ctx, child,
4918 if (!event->attr.inherit) { 5339 &inherited_all);
4919 inherited_all = 0; 5340 if (ret)
4920 continue; 5341 break;
4921 } 5342 }
4922 5343
4923 ret = inherit_group(event, parent, parent_ctx, 5344 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
4924 child, child_ctx); 5345 ret = inherit_task_group(event, parent, parent_ctx, child,
4925 if (ret) { 5346 &inherited_all);
4926 inherited_all = 0; 5347 if (ret)
4927 break; 5348 break;
4928 }
4929 } 5349 }
4930 5350
4931 if (inherited_all) { 5351 child_ctx = child->perf_event_ctxp;
5352
5353 if (child_ctx && inherited_all) {
4932 /* 5354 /*
4933 * Mark the child context as a clone of the parent 5355 * Mark the child context as a clone of the parent
4934 * context, or of whatever the parent is a clone of. 5356 * context, or of whatever the parent is a clone of.
@@ -4955,18 +5377,26 @@ int perf_event_init_task(struct task_struct *child)
4955 return ret; 5377 return ret;
4956} 5378}
4957 5379
5380static void __init perf_event_init_all_cpus(void)
5381{
5382 int cpu;
5383 struct perf_cpu_context *cpuctx;
5384
5385 for_each_possible_cpu(cpu) {
5386 cpuctx = &per_cpu(perf_cpu_context, cpu);
5387 __perf_event_init_context(&cpuctx->ctx, NULL);
5388 }
5389}
5390
4958static void __cpuinit perf_event_init_cpu(int cpu) 5391static void __cpuinit perf_event_init_cpu(int cpu)
4959{ 5392{
4960 struct perf_cpu_context *cpuctx; 5393 struct perf_cpu_context *cpuctx;
4961 5394
4962 cpuctx = &per_cpu(perf_cpu_context, cpu); 5395 cpuctx = &per_cpu(perf_cpu_context, cpu);
4963 __perf_event_init_context(&cpuctx->ctx, NULL);
4964 5396
4965 spin_lock(&perf_resource_lock); 5397 spin_lock(&perf_resource_lock);
4966 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5398 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
4967 spin_unlock(&perf_resource_lock); 5399 spin_unlock(&perf_resource_lock);
4968
4969 hw_perf_event_setup(cpu);
4970} 5400}
4971 5401
4972#ifdef CONFIG_HOTPLUG_CPU 5402#ifdef CONFIG_HOTPLUG_CPU
@@ -4976,7 +5406,9 @@ static void __perf_event_exit_cpu(void *info)
4976 struct perf_event_context *ctx = &cpuctx->ctx; 5406 struct perf_event_context *ctx = &cpuctx->ctx;
4977 struct perf_event *event, *tmp; 5407 struct perf_event *event, *tmp;
4978 5408
4979 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5409 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5410 __perf_event_remove_from_context(event);
5411 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
4980 __perf_event_remove_from_context(event); 5412 __perf_event_remove_from_context(event);
4981} 5413}
4982static void perf_event_exit_cpu(int cpu) 5414static void perf_event_exit_cpu(int cpu)
@@ -5004,11 +5436,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5004 perf_event_init_cpu(cpu); 5436 perf_event_init_cpu(cpu);
5005 break; 5437 break;
5006 5438
5007 case CPU_ONLINE:
5008 case CPU_ONLINE_FROZEN:
5009 hw_perf_event_setup_online(cpu);
5010 break;
5011
5012 case CPU_DOWN_PREPARE: 5439 case CPU_DOWN_PREPARE:
5013 case CPU_DOWN_PREPARE_FROZEN: 5440 case CPU_DOWN_PREPARE_FROZEN:
5014 perf_event_exit_cpu(cpu); 5441 perf_event_exit_cpu(cpu);
@@ -5031,6 +5458,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
5031 5458
5032void __init perf_event_init(void) 5459void __init perf_event_init(void)
5033{ 5460{
5461 perf_event_init_all_cpus();
5034 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 5462 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5035 (void *)(long)smp_processor_id()); 5463 (void *)(long)smp_processor_id());
5036 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 5464 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5038,13 +5466,16 @@ void __init perf_event_init(void)
5038 register_cpu_notifier(&perf_cpu_nb); 5466 register_cpu_notifier(&perf_cpu_nb);
5039} 5467}
5040 5468
5041static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5469static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5470 struct sysdev_class_attribute *attr,
5471 char *buf)
5042{ 5472{
5043 return sprintf(buf, "%d\n", perf_reserved_percpu); 5473 return sprintf(buf, "%d\n", perf_reserved_percpu);
5044} 5474}
5045 5475
5046static ssize_t 5476static ssize_t
5047perf_set_reserve_percpu(struct sysdev_class *class, 5477perf_set_reserve_percpu(struct sysdev_class *class,
5478 struct sysdev_class_attribute *attr,
5048 const char *buf, 5479 const char *buf,
5049 size_t count) 5480 size_t count)
5050{ 5481{
@@ -5062,24 +5493,28 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5062 perf_reserved_percpu = val; 5493 perf_reserved_percpu = val;
5063 for_each_online_cpu(cpu) { 5494 for_each_online_cpu(cpu) {
5064 cpuctx = &per_cpu(perf_cpu_context, cpu); 5495 cpuctx = &per_cpu(perf_cpu_context, cpu);
5065 spin_lock_irq(&cpuctx->ctx.lock); 5496 raw_spin_lock_irq(&cpuctx->ctx.lock);
5066 mpt = min(perf_max_events - cpuctx->ctx.nr_events, 5497 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5067 perf_max_events - perf_reserved_percpu); 5498 perf_max_events - perf_reserved_percpu);
5068 cpuctx->max_pertask = mpt; 5499 cpuctx->max_pertask = mpt;
5069 spin_unlock_irq(&cpuctx->ctx.lock); 5500 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5070 } 5501 }
5071 spin_unlock(&perf_resource_lock); 5502 spin_unlock(&perf_resource_lock);
5072 5503
5073 return count; 5504 return count;
5074} 5505}
5075 5506
5076static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5507static ssize_t perf_show_overcommit(struct sysdev_class *class,
5508 struct sysdev_class_attribute *attr,
5509 char *buf)
5077{ 5510{
5078 return sprintf(buf, "%d\n", perf_overcommit); 5511 return sprintf(buf, "%d\n", perf_overcommit);
5079} 5512}
5080 5513
5081static ssize_t 5514static ssize_t
5082perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5515perf_set_overcommit(struct sysdev_class *class,
5516 struct sysdev_class_attribute *attr,
5517 const char *buf, size_t count)
5083{ 5518{
5084 unsigned long val; 5519 unsigned long val;
5085 int err; 5520 int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d20f9c..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
141 * installing it: 141 * installing it:
142 */ 142 */
143 spin_lock_irq(&pidmap_lock); 143 spin_lock_irq(&pidmap_lock);
144 if (map->page) 144 if (!map->page) {
145 kfree(page);
146 else
147 map->page = page; 145 map->page = page;
146 page = NULL;
147 }
148 spin_unlock_irq(&pidmap_lock); 148 spin_unlock_irq(&pidmap_lock);
149 kfree(page);
149 if (unlikely(!map->page)) 150 if (unlikely(!map->page))
150 break; 151 break;
151 } 152 }
@@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
268 for (type = 0; type < PIDTYPE_MAX; ++type) 269 for (type = 0; type < PIDTYPE_MAX; ++type)
269 INIT_HLIST_HEAD(&pid->tasks[type]); 270 INIT_HLIST_HEAD(&pid->tasks[type]);
270 271
272 upid = pid->numbers + ns->level;
271 spin_lock_irq(&pidmap_lock); 273 spin_lock_irq(&pidmap_lock);
272 for (i = ns->level; i >= 0; i--) { 274 for ( ; upid >= pid->numbers; --upid)
273 upid = &pid->numbers[i];
274 hlist_add_head_rcu(&upid->pid_chain, 275 hlist_add_head_rcu(&upid->pid_chain,
275 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 276 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
276 }
277 spin_unlock_irq(&pidmap_lock); 277 spin_unlock_irq(&pidmap_lock);
278 278
279out: 279out:
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first,
371 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held());
371 if (first) 373 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 374 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 375 }
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 378EXPORT_SYMBOL(pid_task);
377 379
378/* 380/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 381 * Must be called under rcu_read_lock().
380 */ 382 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 384{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..a5aff94e1f0b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,7 @@
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h>
16 17
17#define BITS_PER_PAGE (PAGE_SIZE*8) 18#define BITS_PER_PAGE (PAGE_SIZE*8)
18 19
@@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
161 rcu_read_lock(); 162 rcu_read_lock();
162 163
163 /* 164 /*
164 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring 165 * Any nested-container's init processes won't ignore the
165 * any nested-container's init processes don't ignore the 166 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
166 * signal
167 */ 167 */
168 task = pid_task(find_vpid(nr), PIDTYPE_PID); 168 task = pid_task(find_vpid(nr), PIDTYPE_PID);
169 if (task) 169 if (task)
170 force_sig(SIGKILL, task); 170 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
171 171
172 rcu_read_unlock(); 172 rcu_read_unlock();
173 173
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/time.h> 34#include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344} 343}
345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
346 345
347#define PID_NAME_LEN sizeof("process_1234567890") 346#define PID_NAME_LEN 32
348static char name[PID_NAME_LEN];
349 347
350static int pm_qos_power_open(struct inode *inode, struct file *filp) 348static int pm_qos_power_open(struct inode *inode, struct file *filp)
351{ 349{
352 int ret; 350 int ret;
353 long pm_qos_class; 351 long pm_qos_class;
352 char name[PID_NAME_LEN];
354 353
355 lock_kernel();
356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
357 if (pm_qos_class >= 0) { 355 if (pm_qos_class >= 0) {
358 filp->private_data = (void *)pm_qos_class; 356 filp->private_data = (void *)pm_qos_class;
359 sprintf(name, "process_%d", current->pid); 357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
360 ret = pm_qos_add_requirement(pm_qos_class, name, 358 ret = pm_qos_add_requirement(pm_qos_class, name,
361 PM_QOS_DEFAULT_VALUE); 359 PM_QOS_DEFAULT_VALUE);
362 if (ret >= 0) { 360 if (ret >= 0)
363 unlock_kernel();
364 return 0; 361 return 0;
365 }
366 } 362 }
367 unlock_kernel();
368
369 return -EPERM; 363 return -EPERM;
370} 364}
371 365
372static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
373{ 367{
374 int pm_qos_class; 368 int pm_qos_class;
369 char name[PID_NAME_LEN];
375 370
376 pm_qos_class = (long)filp->private_data; 371 pm_qos_class = (long)filp->private_data;
377 sprintf(name, "process_%d", current->pid); 372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
378 pm_qos_remove_requirement(pm_qos_class, name); 373 pm_qos_remove_requirement(pm_qos_class, name);
379 374
380 return 0; 375 return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
385{ 380{
386 s32 value; 381 s32 value;
387 int pm_qos_class; 382 int pm_qos_class;
383 char name[PID_NAME_LEN];
388 384
389 pm_qos_class = (long)filp->private_data; 385 pm_qos_class = (long)filp->private_data;
390 if (count != sizeof(s32)) 386 if (count != sizeof(s32))
391 return -EINVAL; 387 return -EINVAL;
392 if (copy_from_user(&value, buf, sizeof(s32))) 388 if (copy_from_user(&value, buf, sizeof(s32)))
393 return -EFAULT; 389 return -EFAULT;
394 sprintf(name, "process_%d", current->pid); 390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
395 pm_qos_update_requirement(pm_qos_class, name, value); 391 pm_qos_update_requirement(pm_qos_class, name, value);
396 392
397 return sizeof(s32); 393 return sizeof(s32);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747b..bc7704b3a443 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
384 384
385/* 385/*
386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. 386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
387 * This is called from sys_timer_create with the new timer already locked. 387 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
388 * new timer already all-zeros initialized.
388 */ 389 */
389int posix_cpu_timer_create(struct k_itimer *new_timer) 390int posix_cpu_timer_create(struct k_itimer *new_timer)
390{ 391{
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
396 return -EINVAL; 397 return -EINVAL;
397 398
398 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 399 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
399 new_timer->it.cpu.incr.sched = 0;
400 new_timer->it.cpu.expires.sched = 0;
401 400
402 read_lock(&tasklist_lock); 401 read_lock(&tasklist_lock);
403 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 402 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
@@ -983,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
983 int maxfire; 982 int maxfire;
984 struct list_head *timers = tsk->cpu_timers; 983 struct list_head *timers = tsk->cpu_timers;
985 struct signal_struct *const sig = tsk->signal; 984 struct signal_struct *const sig = tsk->signal;
985 unsigned long soft;
986 986
987 maxfire = 20; 987 maxfire = 20;
988 tsk->cputime_expires.prof_exp = cputime_zero; 988 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1031,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
1031 /* 1031 /*
1032 * Check for the special case thread timers. 1032 * Check for the special case thread timers.
1033 */ 1033 */
1034 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 1034 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1035 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 1035 if (soft != RLIM_INFINITY) {
1036 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 1036 unsigned long hard =
1037 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1037 1038
1038 if (hard != RLIM_INFINITY && 1039 if (hard != RLIM_INFINITY &&
1039 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 1040 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1044,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
1044 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1045 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1045 return; 1046 return;
1046 } 1047 }
1047 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 1048 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1048 /* 1049 /*
1049 * At the soft limit, send a SIGXCPU every second. 1050 * At the soft limit, send a SIGXCPU every second.
1050 */ 1051 */
1051 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 1052 if (soft < hard) {
1052 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 1053 soft += USEC_PER_SEC;
1053 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1054 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1054 USEC_PER_SEC;
1055 } 1055 }
1056 printk(KERN_INFO 1056 printk(KERN_INFO
1057 "RT Watchdog Timeout: %s[%d]\n", 1057 "RT Watchdog Timeout: %s[%d]\n",
@@ -1061,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk,
1061 } 1061 }
1062} 1062}
1063 1063
1064static void stop_process_timers(struct task_struct *tsk) 1064static void stop_process_timers(struct signal_struct *sig)
1065{ 1065{
1066 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 1066 struct thread_group_cputimer *cputimer = &sig->cputimer;
1067 unsigned long flags; 1067 unsigned long flags;
1068 1068
1069 if (!cputimer->running) 1069 if (!cputimer->running)
@@ -1072,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk)
1072 spin_lock_irqsave(&cputimer->lock, flags); 1072 spin_lock_irqsave(&cputimer->lock, flags);
1073 cputimer->running = 0; 1073 cputimer->running = 0;
1074 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1075} 1079}
1076 1080
1077static u32 onecputick; 1081static u32 onecputick;
@@ -1122,6 +1126,7 @@ static void check_process_timers(struct task_struct *tsk,
1122 unsigned long long sum_sched_runtime, sched_expires; 1126 unsigned long long sum_sched_runtime, sched_expires;
1123 struct list_head *timers = sig->cpu_timers; 1127 struct list_head *timers = sig->cpu_timers;
1124 struct task_cputime cputime; 1128 struct task_cputime cputime;
1129 unsigned long soft;
1125 1130
1126 /* 1131 /*
1127 * Don't sample the current process CPU clocks if there are no timers. 1132 * Don't sample the current process CPU clocks if there are no timers.
@@ -1132,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk,
1132 list_empty(&timers[CPUCLOCK_VIRT]) && 1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1133 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && 1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1134 list_empty(&timers[CPUCLOCK_SCHED])) { 1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1135 stop_process_timers(tsk); 1140 stop_process_timers(sig);
1136 return; 1141 return;
1137 } 1142 }
1138 1143
@@ -1194,11 +1199,13 @@ static void check_process_timers(struct task_struct *tsk,
1194 SIGPROF); 1199 SIGPROF);
1195 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1200 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1196 SIGVTALRM); 1201 SIGVTALRM);
1197 1202 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1198 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1203 if (soft != RLIM_INFINITY) {
1199 unsigned long psecs = cputime_to_secs(ptime); 1204 unsigned long psecs = cputime_to_secs(ptime);
1205 unsigned long hard =
1206 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1200 cputime_t x; 1207 cputime_t x;
1201 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1208 if (psecs >= hard) {
1202 /* 1209 /*
1203 * At the hard limit, we just die. 1210 * At the hard limit, we just die.
1204 * No need to calculate anything else now. 1211 * No need to calculate anything else now.
@@ -1206,17 +1213,17 @@ static void check_process_timers(struct task_struct *tsk,
1206 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1213 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1207 return; 1214 return;
1208 } 1215 }
1209 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1216 if (psecs >= soft) {
1210 /* 1217 /*
1211 * At the soft limit, send a SIGXCPU every second. 1218 * At the soft limit, send a SIGXCPU every second.
1212 */ 1219 */
1213 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1220 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1214 if (sig->rlim[RLIMIT_CPU].rlim_cur 1221 if (soft < hard) {
1215 < sig->rlim[RLIMIT_CPU].rlim_max) { 1222 soft++;
1216 sig->rlim[RLIMIT_CPU].rlim_cur++; 1223 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1217 } 1224 }
1218 } 1225 }
1219 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1226 x = secs_to_cputime(soft);
1220 if (cputime_eq(prof_expires, cputime_zero) || 1227 if (cputime_eq(prof_expires, cputime_zero) ||
1221 cputime_lt(x, prof_expires)) { 1228 cputime_lt(x, prof_expires)) {
1222 prof_expires = x; 1229 prof_expires = x;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..00d1fda58ab6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
88config SUSPEND 102config SUSPEND
89 bool "Suspend to RAM and standby" 103 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 104 depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 236 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 237 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 238 wake-up events.
239
240config PM_OPS
241 bool
242 depends on PM_SLEEP || PM_RUNTIME
243 default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c3b81c30e5d5..43191815f874 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 13
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 5187136fe1de..218e5af90156 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/vt.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
@@ -21,8 +21,7 @@ int pm_prepare_console(void)
21 if (orig_fgconsole < 0) 21 if (orig_fgconsole < 0)
22 return 1; 22 return 1;
23 23
24 orig_kmsg = kmsg_redirect; 24 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
25 kmsg_redirect = SUSPEND_CONSOLE;
26 return 0; 25 return 0;
27} 26}
28 27
@@ -30,7 +29,7 @@ void pm_restore_console(void)
30{ 29{
31 if (orig_fgconsole >= 0) { 30 if (orig_fgconsole >= 0) {
32 vt_move_to_console(orig_fgconsole, 0); 31 vt_move_to_console(orig_fgconsole, 0);
33 kmsg_redirect = orig_kmsg; 32 vt_kmsg_redirect(orig_kmsg);
34 } 33 }
35} 34}
36#endif 35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04a9e90d248f..aa9e916da4d5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h>
25#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
26#include <asm/suspend.h> 27#include <asm/suspend.h>
27 28
@@ -32,6 +33,7 @@ static int noresume = 0;
32static char resume_file[256] = CONFIG_PM_STD_PARTITION; 33static char resume_file[256] = CONFIG_PM_STD_PARTITION;
33dev_t swsusp_resume_device; 34dev_t swsusp_resume_device;
34sector_t swsusp_resume_block; 35sector_t swsusp_resume_block;
36int in_suspend __nosavedata = 0;
35 37
36enum { 38enum {
37 HIBERNATION_INVALID, 39 HIBERNATION_INVALID,
@@ -202,6 +204,35 @@ static void platform_recover(int platform_mode)
202} 204}
203 205
204/** 206/**
207 * swsusp_show_speed - print the time elapsed between two events.
208 * @start: Starting event.
209 * @stop: Final event.
210 * @nr_pages: Number of pages processed between @start and @stop.
211 * @msg: Introductory message to print.
212 */
213
214void swsusp_show_speed(struct timeval *start, struct timeval *stop,
215 unsigned nr_pages, char *msg)
216{
217 s64 elapsed_centisecs64;
218 int centisecs;
219 int k;
220 int kps;
221
222 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
223 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
224 centisecs = elapsed_centisecs64;
225 if (centisecs == 0)
226 centisecs = 1; /* avoid div-by-zero */
227 k = nr_pages * (PAGE_SIZE / 1024);
228 kps = (k * 100) / centisecs;
229 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
230 msg, k,
231 centisecs / 100, centisecs % 100,
232 kps / 1000, (kps % 1000) / 10);
233}
234
235/**
205 * create_image - freeze devices that need to be frozen with interrupts 236 * create_image - freeze devices that need to be frozen with interrupts
206 * off, create the hibernation image and thaw those devices. Control 237 * off, create the hibernation image and thaw those devices. Control
207 * reappears in this routine after a restore. 238 * reappears in this routine after a restore.
@@ -293,6 +324,7 @@ static int create_image(int platform_mode)
293int hibernation_snapshot(int platform_mode) 324int hibernation_snapshot(int platform_mode)
294{ 325{
295 int error; 326 int error;
327 gfp_t saved_mask;
296 328
297 error = platform_begin(platform_mode); 329 error = platform_begin(platform_mode);
298 if (error) 330 if (error)
@@ -304,6 +336,7 @@ int hibernation_snapshot(int platform_mode)
304 goto Close; 336 goto Close;
305 337
306 suspend_console(); 338 suspend_console();
339 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
307 error = dpm_suspend_start(PMSG_FREEZE); 340 error = dpm_suspend_start(PMSG_FREEZE);
308 if (error) 341 if (error)
309 goto Recover_platform; 342 goto Recover_platform;
@@ -321,6 +354,7 @@ int hibernation_snapshot(int platform_mode)
321 354
322 dpm_resume_end(in_suspend ? 355 dpm_resume_end(in_suspend ?
323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 356 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
357 set_gfp_allowed_mask(saved_mask);
324 resume_console(); 358 resume_console();
325 Close: 359 Close:
326 platform_end(platform_mode); 360 platform_end(platform_mode);
@@ -415,14 +449,17 @@ static int resume_target_kernel(bool platform_mode)
415int hibernation_restore(int platform_mode) 449int hibernation_restore(int platform_mode)
416{ 450{
417 int error; 451 int error;
452 gfp_t saved_mask;
418 453
419 pm_prepare_console(); 454 pm_prepare_console();
420 suspend_console(); 455 suspend_console();
456 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
421 error = dpm_suspend_start(PMSG_QUIESCE); 457 error = dpm_suspend_start(PMSG_QUIESCE);
422 if (!error) { 458 if (!error) {
423 error = resume_target_kernel(platform_mode); 459 error = resume_target_kernel(platform_mode);
424 dpm_resume_end(PMSG_RECOVER); 460 dpm_resume_end(PMSG_RECOVER);
425 } 461 }
462 set_gfp_allowed_mask(saved_mask);
426 resume_console(); 463 resume_console();
427 pm_restore_console(); 464 pm_restore_console();
428 return error; 465 return error;
@@ -436,6 +473,7 @@ int hibernation_restore(int platform_mode)
436int hibernation_platform_enter(void) 473int hibernation_platform_enter(void)
437{ 474{
438 int error; 475 int error;
476 gfp_t saved_mask;
439 477
440 if (!hibernation_ops) 478 if (!hibernation_ops)
441 return -ENOSYS; 479 return -ENOSYS;
@@ -451,6 +489,7 @@ int hibernation_platform_enter(void)
451 489
452 entering_platform_hibernation = true; 490 entering_platform_hibernation = true;
453 suspend_console(); 491 suspend_console();
492 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
454 error = dpm_suspend_start(PMSG_HIBERNATE); 493 error = dpm_suspend_start(PMSG_HIBERNATE);
455 if (error) { 494 if (error) {
456 if (hibernation_ops->recover) 495 if (hibernation_ops->recover)
@@ -488,6 +527,7 @@ int hibernation_platform_enter(void)
488 Resume_devices: 527 Resume_devices:
489 entering_platform_hibernation = false; 528 entering_platform_hibernation = false;
490 dpm_resume_end(PMSG_RESTORE); 529 dpm_resume_end(PMSG_RESTORE);
530 set_gfp_allowed_mask(saved_mask);
491 resume_console(); 531 resume_console();
492 532
493 Close: 533 Close:
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
index 39ac698ef836..fdcad9ed5a7b 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/hibernate_nvs.c
@@ -10,6 +10,7 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
13#include <linux/suspend.h> 14#include <linux/suspend.h>
14 15
15/* 16/*
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 347d2cc88cd0..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
@@ -220,6 +249,7 @@ static struct attribute_group attr_group = {
220 249
221#ifdef CONFIG_PM_RUNTIME 250#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq; 251struct workqueue_struct *pm_wq;
252EXPORT_SYMBOL_GPL(pm_wq);
223 253
224static int __init pm_start_workqueue(void) 254static int __init pm_start_workqueue(void)
225{ 255{
diff --git a/kernel/power/process.c b/kernel/power/process.c
index cc2e55373b68..71ae29052ab6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h>
17 18
18/* 19/*
19 * Timeout for stopping processes 20 * Timeout for stopping processes
@@ -41,7 +42,7 @@ static int try_to_freeze_tasks(bool sig_only)
41 do_gettimeofday(&start); 42 do_gettimeofday(&start);
42 43
43 end_time = jiffies + TIMEOUT; 44 end_time = jiffies + TIMEOUT;
44 do { 45 while (true) {
45 todo = 0; 46 todo = 0;
46 read_lock(&tasklist_lock); 47 read_lock(&tasklist_lock);
47 do_each_thread(g, p) { 48 do_each_thread(g, p) {
@@ -62,10 +63,15 @@ static int try_to_freeze_tasks(bool sig_only)
62 todo++; 63 todo++;
63 } while_each_thread(g, p); 64 } while_each_thread(g, p);
64 read_unlock(&tasklist_lock); 65 read_unlock(&tasklist_lock);
65 yield(); /* Yield is okay here */ 66 if (!todo || time_after(jiffies, end_time))
66 if (time_after(jiffies, end_time))
67 break; 67 break;
68 } while (todo); 68
69 /*
70 * We need to retry, but first give the freezing tasks some
71 * time to enter the refrigerator.
72 */
73 msleep(10);
74 }
69 75
70 do_gettimeofday(&end); 76 do_gettimeofday(&end);
71 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); 77 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
@@ -82,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)
82 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
83 "(%d tasks refusing to freeze):\n", 89 "(%d tasks refusing to freeze):\n",
84 elapsed_csecs / 100, elapsed_csecs % 100, todo); 90 elapsed_csecs / 100, elapsed_csecs % 100, todo);
85 show_state();
86 read_lock(&tasklist_lock); 91 read_lock(&tasklist_lock);
87 do_each_thread(g, p) { 92 do_each_thread(g, p) {
88 task_lock(p); 93 task_lock(p);
89 if (freezing(p) && !freezer_should_skip(p)) 94 if (freezing(p) && !freezer_should_skip(p))
90 printk(KERN_ERR " %s\n", p->comm); 95 sched_show_task(p);
91 cancel_freezing(p); 96 cancel_freezing(p);
92 task_unlock(p); 97 task_unlock(p);
93 } while_each_thread(g, p); 98 } while_each_thread(g, p);
@@ -139,7 +144,7 @@ static void thaw_tasks(bool nosig_only)
139 if (nosig_only && should_send_signal(p)) 144 if (nosig_only && should_send_signal(p))
140 continue; 145 continue;
141 146
142 if (cgroup_frozen(p)) 147 if (cgroup_freezing_or_frozen(p))
143 continue; 148 continue;
144 149
145 thaw_process(p); 150 thaw_process(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..be861c26dda7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -26,6 +26,7 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/mmu_context.h> 32#include <asm/mmu_context.h>
@@ -1181,7 +1182,7 @@ static void free_unnecessary_pages(void)
1181 1182
1182 memory_bm_position_reset(&copy_bm); 1183 memory_bm_position_reset(&copy_bm);
1183 1184
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1185 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1186 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1187 struct page *page = pfn_to_page(pfn);
1187 1188
@@ -1500,7 +1501,7 @@ asmlinkage int swsusp_save(void)
1500{ 1501{
1501 unsigned int nr_pages, nr_highmem; 1502 unsigned int nr_pages, nr_highmem;
1502 1503
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1504 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1505
1505 drain_local_pages(NULL); 1506 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1507 nr_pages = count_data_pages();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..56e7dbb8b996 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -15,6 +15,7 @@
15#include <linux/console.h> 15#include <linux/console.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/gfp.h>
18 19
19#include "power.h" 20#include "power.h"
20 21
@@ -189,6 +190,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 190int suspend_devices_and_enter(suspend_state_t state)
190{ 191{
191 int error; 192 int error;
193 gfp_t saved_mask;
192 194
193 if (!suspend_ops) 195 if (!suspend_ops)
194 return -ENOSYS; 196 return -ENOSYS;
@@ -199,6 +201,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 201 goto Close;
200 } 202 }
201 suspend_console(); 203 suspend_console();
204 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 205 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 206 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 207 if (error) {
@@ -215,6 +218,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 218 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 219 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 220 suspend_test_finish("resume devices");
221 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 222 resume_console();
219 Close: 223 Close:
220 if (suspend_ops->end) 224 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 890f6b11b1d3..66824d71983a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h>
26 27
27#include "power.h" 28#include "power.h"
28 29
@@ -38,6 +39,107 @@ struct swsusp_header {
38 39
39static struct swsusp_header *swsusp_header; 40static struct swsusp_header *swsusp_header;
40 41
42/**
43 * The following functions are used for tracing the allocated
44 * swap pages, so that they can be freed in case of an error.
45 */
46
47struct swsusp_extent {
48 struct rb_node node;
49 unsigned long start;
50 unsigned long end;
51};
52
53static struct rb_root swsusp_extents = RB_ROOT;
54
55static int swsusp_extents_insert(unsigned long swap_offset)
56{
57 struct rb_node **new = &(swsusp_extents.rb_node);
58 struct rb_node *parent = NULL;
59 struct swsusp_extent *ext;
60
61 /* Figure out where to put the new node */
62 while (*new) {
63 ext = container_of(*new, struct swsusp_extent, node);
64 parent = *new;
65 if (swap_offset < ext->start) {
66 /* Try to merge */
67 if (swap_offset == ext->start - 1) {
68 ext->start--;
69 return 0;
70 }
71 new = &((*new)->rb_left);
72 } else if (swap_offset > ext->end) {
73 /* Try to merge */
74 if (swap_offset == ext->end + 1) {
75 ext->end++;
76 return 0;
77 }
78 new = &((*new)->rb_right);
79 } else {
80 /* It already is in the tree */
81 return -EINVAL;
82 }
83 }
84 /* Add the new node and rebalance the tree. */
85 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
86 if (!ext)
87 return -ENOMEM;
88
89 ext->start = swap_offset;
90 ext->end = swap_offset;
91 rb_link_node(&ext->node, parent, new);
92 rb_insert_color(&ext->node, &swsusp_extents);
93 return 0;
94}
95
96/**
97 * alloc_swapdev_block - allocate a swap page and register that it has
98 * been allocated, so that it can be freed in case of an error.
99 */
100
101sector_t alloc_swapdev_block(int swap)
102{
103 unsigned long offset;
104
105 offset = swp_offset(get_swap_page_of_type(swap));
106 if (offset) {
107 if (swsusp_extents_insert(offset))
108 swap_free(swp_entry(swap, offset));
109 else
110 return swapdev_block(swap, offset);
111 }
112 return 0;
113}
114
115/**
116 * free_all_swap_pages - free swap pages allocated for saving image data.
117 * It also frees the extents used to register which swap entries had been
118 * allocated.
119 */
120
121void free_all_swap_pages(int swap)
122{
123 struct rb_node *node;
124
125 while ((node = swsusp_extents.rb_node)) {
126 struct swsusp_extent *ext;
127 unsigned long offset;
128
129 ext = container_of(node, struct swsusp_extent, node);
130 rb_erase(node, &swsusp_extents);
131 for (offset = ext->start; offset <= ext->end; offset++)
132 swap_free(swp_entry(swap, offset));
133
134 kfree(ext);
135 }
136}
137
138int swsusp_swap_in_use(void)
139{
140 return (swsusp_extents.rb_node != NULL);
141}
142
41/* 143/*
42 * General things 144 * General things
43 */ 145 */
@@ -336,7 +438,7 @@ static int save_image(struct swap_map_handle *handle,
336 if (ret) 438 if (ret)
337 break; 439 break;
338 if (!(nr_pages % m)) 440 if (!(nr_pages % m))
339 printk("\b\b\b\b%3d%%", nr_pages / m); 441 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
340 nr_pages++; 442 nr_pages++;
341 } 443 }
342 err2 = wait_on_bio_chain(&bio); 444 err2 = wait_on_bio_chain(&bio);
@@ -344,9 +446,9 @@ static int save_image(struct swap_map_handle *handle,
344 if (!ret) 446 if (!ret)
345 ret = err2; 447 ret = err2;
346 if (!ret) 448 if (!ret)
347 printk("\b\b\b\bdone\n"); 449 printk(KERN_CONT "\b\b\b\bdone\n");
348 else 450 else
349 printk("\n"); 451 printk(KERN_CONT "\n");
350 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 452 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
351 return ret; 453 return ret;
352} 454}
@@ -556,10 +658,6 @@ int swsusp_read(unsigned int *flags_p)
556 struct swsusp_info *header; 658 struct swsusp_info *header;
557 659
558 *flags_p = swsusp_header->flags; 660 *flags_p = swsusp_header->flags;
559 if (IS_ERR(resume_bdev)) {
560 pr_debug("PM: Image device not initialised\n");
561 return PTR_ERR(resume_bdev);
562 }
563 661
564 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 662 memset(&snapshot, 0, sizeof(struct snapshot_handle));
565 error = snapshot_write_next(&snapshot, PAGE_SIZE); 663 error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 6a07f4dbf2f8..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,188 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
59
60/**
61 * The following functions are used for tracing the allocated
62 * swap pages, so that they can be freed in case of an error.
63 */
64
65struct swsusp_extent {
66 struct rb_node node;
67 unsigned long start;
68 unsigned long end;
69};
70
71static struct rb_root swsusp_extents = RB_ROOT;
72
73static int swsusp_extents_insert(unsigned long swap_offset)
74{
75 struct rb_node **new = &(swsusp_extents.rb_node);
76 struct rb_node *parent = NULL;
77 struct swsusp_extent *ext;
78
79 /* Figure out where to put the new node */
80 while (*new) {
81 ext = container_of(*new, struct swsusp_extent, node);
82 parent = *new;
83 if (swap_offset < ext->start) {
84 /* Try to merge */
85 if (swap_offset == ext->start - 1) {
86 ext->start--;
87 return 0;
88 }
89 new = &((*new)->rb_left);
90 } else if (swap_offset > ext->end) {
91 /* Try to merge */
92 if (swap_offset == ext->end + 1) {
93 ext->end++;
94 return 0;
95 }
96 new = &((*new)->rb_right);
97 } else {
98 /* It already is in the tree */
99 return -EINVAL;
100 }
101 }
102 /* Add the new node and rebalance the tree. */
103 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
104 if (!ext)
105 return -ENOMEM;
106
107 ext->start = swap_offset;
108 ext->end = swap_offset;
109 rb_link_node(&ext->node, parent, new);
110 rb_insert_color(&ext->node, &swsusp_extents);
111 return 0;
112}
113
114/**
115 * alloc_swapdev_block - allocate a swap page and register that it has
116 * been allocated, so that it can be freed in case of an error.
117 */
118
119sector_t alloc_swapdev_block(int swap)
120{
121 unsigned long offset;
122
123 offset = swp_offset(get_swap_page_of_type(swap));
124 if (offset) {
125 if (swsusp_extents_insert(offset))
126 swap_free(swp_entry(swap, offset));
127 else
128 return swapdev_block(swap, offset);
129 }
130 return 0;
131}
132
133/**
134 * free_all_swap_pages - free swap pages allocated for saving image data.
135 * It also frees the extents used to register which swap entres had been
136 * allocated.
137 */
138
139void free_all_swap_pages(int swap)
140{
141 struct rb_node *node;
142
143 while ((node = swsusp_extents.rb_node)) {
144 struct swsusp_extent *ext;
145 unsigned long offset;
146
147 ext = container_of(node, struct swsusp_extent, node);
148 rb_erase(node, &swsusp_extents);
149 for (offset = ext->start; offset <= ext->end; offset++)
150 swap_free(swp_entry(swap, offset));
151
152 kfree(ext);
153 }
154}
155
156int swsusp_swap_in_use(void)
157{
158 return (swsusp_extents.rb_node != NULL);
159}
160
161/**
162 * swsusp_show_speed - print the time elapsed between two events represented by
163 * @start and @stop
164 *
165 * @nr_pages - number of pages processed between @start and @stop
166 * @msg - introductory message to print
167 */
168
169void swsusp_show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
185 msg, k,
186 centisecs / 100, centisecs % 100,
187 kps / 1000, (kps % 1000) / 10);
188}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..a8c96212bc1b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
195 return res; 195 return res;
196} 196}
197 197
198static void snapshot_deprecated_ioctl(unsigned int cmd)
199{
200 if (printk_ratelimit())
201 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
202 "be removed soon, update your suspend-to-disk "
203 "utilities\n",
204 __builtin_return_address(0), cmd);
205}
206
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 207static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 208 unsigned long arg)
200{ 209{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 255 data->frozen = 0;
247 break; 256 break;
248 257
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 258 case SNAPSHOT_ATOMIC_SNAPSHOT:
259 snapshot_deprecated_ioctl(cmd);
260 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 261 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 262 error = -EPERM;
253 break; 263 break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 285 data->ready = 0;
276 break; 286 break;
277 287
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 288 case SNAPSHOT_SET_IMAGE_SIZE:
289 snapshot_deprecated_ioctl(cmd);
290 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 291 image_size = arg;
281 break; 292 break;
282 293
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
291 break; 302 break;
292 303
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 304 case SNAPSHOT_AVAIL_SWAP:
305 snapshot_deprecated_ioctl(cmd);
306 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 307 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 308 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 309 error = put_user(size, (loff_t __user *)arg);
298 break; 310 break;
299 311
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 312 case SNAPSHOT_GET_SWAP_PAGE:
313 snapshot_deprecated_ioctl(cmd);
314 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 315 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 316 error = -ENODEV;
304 break; 317 break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 334 break;
322 335
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 336 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
337 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 338 if (!swsusp_swap_in_use()) {
325 /* 339 /*
326 * User space encodes device types as two-byte values, 340 * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 376 break;
363 377
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 378 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
379 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 380 error = -EINVAL;
366 381
367 switch (arg) { 382 switch (arg) {
@@ -405,7 +420,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
405 * User space encodes device types as two-byte values, 420 * User space encodes device types as two-byte values,
406 * so we need to recode them 421 * so we need to recode them
407 */ 422 */
408 swdev = old_decode_dev(swap_area.dev); 423 swdev = new_decode_dev(swap_area.dev);
409 if (swdev) { 424 if (swdev) {
410 offset = swap_area.offset; 425 offset = swap_area.offset;
411 data->swap = swap_type_of(swdev, offset, NULL); 426 data->swap = swap_type_of(swdev, offset, NULL);
diff --git a/kernel/printk.c b/kernel/printk.c
index 6712a252b306..ee54355cfdf1 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,9 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h>
38#include <linux/syslog.h>
36 39
37#include <asm/uaccess.h> 40#include <asm/uaccess.h>
38 41
@@ -67,8 +70,6 @@ int console_printk[4] = {
67 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 70 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
68}; 71};
69 72
70static int saved_console_loglevel = -1;
71
72/* 73/*
73 * divert printk() messages when there is a LITMUS^RT debug listener 74 * divert printk() messages when there is a LITMUS^RT debug listener
74 */ 75 */
@@ -150,6 +151,7 @@ static char __log_buf[__LOG_BUF_LEN];
150static char *log_buf = __log_buf; 151static char *log_buf = __log_buf;
151static int log_buf_len = __LOG_BUF_LEN; 152static int log_buf_len = __LOG_BUF_LEN;
152static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 153static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
154static int saved_console_loglevel = -1;
153 155
154#ifdef CONFIG_KEXEC 156#ifdef CONFIG_KEXEC
155/* 157/*
@@ -263,38 +265,23 @@ static inline void boot_delay_msec(void)
263} 265}
264#endif 266#endif
265 267
266/* 268int do_syslog(int type, char __user *buf, int len, bool from_file)
267 * Commands to do_syslog:
268 *
269 * 0 -- Close the log. Currently a NOP.
270 * 1 -- Open the log. Currently a NOP.
271 * 2 -- Read from the log.
272 * 3 -- Read all messages remaining in the ring buffer.
273 * 4 -- Read and clear all messages remaining in the ring buffer
274 * 5 -- Clear ring buffer.
275 * 6 -- Disable printk's to console
276 * 7 -- Enable printk's to console
277 * 8 -- Set level of messages printed to console
278 * 9 -- Return number of unread characters in the log buffer
279 * 10 -- Return size of the log buffer
280 */
281int do_syslog(int type, char __user *buf, int len)
282{ 269{
283 unsigned i, j, limit, count; 270 unsigned i, j, limit, count;
284 int do_clear = 0; 271 int do_clear = 0;
285 char c; 272 char c;
286 int error = 0; 273 int error = 0;
287 274
288 error = security_syslog(type); 275 error = security_syslog(type, from_file);
289 if (error) 276 if (error)
290 return error; 277 return error;
291 278
292 switch (type) { 279 switch (type) {
293 case 0: /* Close log */ 280 case SYSLOG_ACTION_CLOSE: /* Close log */
294 break; 281 break;
295 case 1: /* Open log */ 282 case SYSLOG_ACTION_OPEN: /* Open log */
296 break; 283 break;
297 case 2: /* Read from log */ 284 case SYSLOG_ACTION_READ: /* Read from log */
298 error = -EINVAL; 285 error = -EINVAL;
299 if (!buf || len < 0) 286 if (!buf || len < 0)
300 goto out; 287 goto out;
@@ -325,10 +312,12 @@ int do_syslog(int type, char __user *buf, int len)
325 if (!error) 312 if (!error)
326 error = i; 313 error = i;
327 break; 314 break;
328 case 4: /* Read/clear last kernel messages */ 315 /* Read/clear last kernel messages */
316 case SYSLOG_ACTION_READ_CLEAR:
329 do_clear = 1; 317 do_clear = 1;
330 /* FALL THRU */ 318 /* FALL THRU */
331 case 3: /* Read last kernel messages */ 319 /* Read last kernel messages */
320 case SYSLOG_ACTION_READ_ALL:
332 error = -EINVAL; 321 error = -EINVAL;
333 if (!buf || len < 0) 322 if (!buf || len < 0)
334 goto out; 323 goto out;
@@ -381,21 +370,25 @@ int do_syslog(int type, char __user *buf, int len)
381 } 370 }
382 } 371 }
383 break; 372 break;
384 case 5: /* Clear ring buffer */ 373 /* Clear ring buffer */
374 case SYSLOG_ACTION_CLEAR:
385 logged_chars = 0; 375 logged_chars = 0;
386 break; 376 break;
387 case 6: /* Disable logging to console */ 377 /* Disable logging to console */
378 case SYSLOG_ACTION_CONSOLE_OFF:
388 if (saved_console_loglevel == -1) 379 if (saved_console_loglevel == -1)
389 saved_console_loglevel = console_loglevel; 380 saved_console_loglevel = console_loglevel;
390 console_loglevel = minimum_console_loglevel; 381 console_loglevel = minimum_console_loglevel;
391 break; 382 break;
392 case 7: /* Enable logging to console */ 383 /* Enable logging to console */
384 case SYSLOG_ACTION_CONSOLE_ON:
393 if (saved_console_loglevel != -1) { 385 if (saved_console_loglevel != -1) {
394 console_loglevel = saved_console_loglevel; 386 console_loglevel = saved_console_loglevel;
395 saved_console_loglevel = -1; 387 saved_console_loglevel = -1;
396 } 388 }
397 break; 389 break;
398 case 8: /* Set level of messages printed to console */ 390 /* Set level of messages printed to console */
391 case SYSLOG_ACTION_CONSOLE_LEVEL:
399 error = -EINVAL; 392 error = -EINVAL;
400 if (len < 1 || len > 8) 393 if (len < 1 || len > 8)
401 goto out; 394 goto out;
@@ -406,10 +399,12 @@ int do_syslog(int type, char __user *buf, int len)
406 saved_console_loglevel = -1; 399 saved_console_loglevel = -1;
407 error = 0; 400 error = 0;
408 break; 401 break;
409 case 9: /* Number of chars in the log buffer */ 402 /* Number of chars in the log buffer */
403 case SYSLOG_ACTION_SIZE_UNREAD:
410 error = log_end - log_start; 404 error = log_end - log_start;
411 break; 405 break;
412 case 10: /* Size of the log buffer */ 406 /* Size of the log buffer */
407 case SYSLOG_ACTION_SIZE_BUFFER:
413 error = log_buf_len; 408 error = log_buf_len;
414 break; 409 break;
415 default: 410 default:
@@ -422,7 +417,7 @@ out:
422 417
423SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
424{ 419{
425 return do_syslog(type, buf, len); 420 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
426} 421}
427 422
428/* 423/*
@@ -1386,11 +1381,11 @@ late_initcall(disable_boot_consoles);
1386 */ 1381 */
1387DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); 1382DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1388 1383
1389int printk_ratelimit(void) 1384int __printk_ratelimit(const char *func)
1390{ 1385{
1391 return __ratelimit(&printk_ratelimit_state); 1386 return ___ratelimit(&printk_ratelimit_state, func);
1392} 1387}
1393EXPORT_SYMBOL(printk_ratelimit); 1388EXPORT_SYMBOL(__printk_ratelimit);
1394 1389
1395/** 1390/**
1396 * printk_timed_ratelimit - caller-controlled printk ratelimiting 1391 * printk_timed_ratelimit - caller-controlled printk ratelimiting
@@ -1414,4 +1409,123 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1414 return false; 1409 return false;
1415} 1410}
1416EXPORT_SYMBOL(printk_timed_ratelimit); 1411EXPORT_SYMBOL(printk_timed_ratelimit);
1412
1413static DEFINE_SPINLOCK(dump_list_lock);
1414static LIST_HEAD(dump_list);
1415
1416/**
1417 * kmsg_dump_register - register a kernel log dumper.
1418 * @dumper: pointer to the kmsg_dumper structure
1419 *
1420 * Adds a kernel log dumper to the system. The dump callback in the
1421 * structure will be called when the kernel oopses or panics and must be
1422 * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
1423 */
1424int kmsg_dump_register(struct kmsg_dumper *dumper)
1425{
1426 unsigned long flags;
1427 int err = -EBUSY;
1428
1429 /* The dump callback needs to be set */
1430 if (!dumper->dump)
1431 return -EINVAL;
1432
1433 spin_lock_irqsave(&dump_list_lock, flags);
1434 /* Don't allow registering multiple times */
1435 if (!dumper->registered) {
1436 dumper->registered = 1;
1437 list_add_tail(&dumper->list, &dump_list);
1438 err = 0;
1439 }
1440 spin_unlock_irqrestore(&dump_list_lock, flags);
1441
1442 return err;
1443}
1444EXPORT_SYMBOL_GPL(kmsg_dump_register);
1445
1446/**
1447 * kmsg_dump_unregister - unregister a kmsg dumper.
1448 * @dumper: pointer to the kmsg_dumper structure
1449 *
1450 * Removes a dump device from the system. Returns zero on success and
1451 * %-EINVAL otherwise.
1452 */
1453int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1454{
1455 unsigned long flags;
1456 int err = -EINVAL;
1457
1458 spin_lock_irqsave(&dump_list_lock, flags);
1459 if (dumper->registered) {
1460 dumper->registered = 0;
1461 list_del(&dumper->list);
1462 err = 0;
1463 }
1464 spin_unlock_irqrestore(&dump_list_lock, flags);
1465
1466 return err;
1467}
1468EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1469
1470static const char const *kmsg_reasons[] = {
1471 [KMSG_DUMP_OOPS] = "oops",
1472 [KMSG_DUMP_PANIC] = "panic",
1473 [KMSG_DUMP_KEXEC] = "kexec",
1474};
1475
1476static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1477{
1478 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1479 return "unknown";
1480
1481 return kmsg_reasons[reason];
1482}
1483
1484/**
1485 * kmsg_dump - dump kernel log to kernel message dumpers.
1486 * @reason: the reason (oops, panic etc) for dumping
1487 *
1488 * Iterate through each of the dump devices and call the oops/panic
1489 * callbacks with the log buffer.
1490 */
1491void kmsg_dump(enum kmsg_dump_reason reason)
1492{
1493 unsigned long end;
1494 unsigned chars;
1495 struct kmsg_dumper *dumper;
1496 const char *s1, *s2;
1497 unsigned long l1, l2;
1498 unsigned long flags;
1499
1500 /* Theoretically, the log could move on after we do this, but
1501 there's not a lot we can do about that. The new messages
1502 will overwrite the start of what we dump. */
1503 spin_lock_irqsave(&logbuf_lock, flags);
1504 end = log_end & LOG_BUF_MASK;
1505 chars = logged_chars;
1506 spin_unlock_irqrestore(&logbuf_lock, flags);
1507
1508 if (logged_chars > end) {
1509 s1 = log_buf + log_buf_len - logged_chars + end;
1510 l1 = logged_chars - end;
1511
1512 s2 = log_buf;
1513 l2 = end;
1514 } else {
1515 s1 = "";
1516 l1 = 0;
1517
1518 s2 = log_buf + end - logged_chars;
1519 l2 = logged_chars;
1520 }
1521
1522 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
1523 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
1524 kmsg_to_str(reason));
1525 return;
1526 }
1527 list_for_each_entry(dumper, &dump_list, list)
1528 dumper->dump(dumper, reason, s1, l1, s2, l2);
1529 spin_unlock_irqrestore(&dump_list_lock, flags);
1530}
1417#endif 1531#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..dfadc5b729f1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vmalloc(buffer_bytes);
130 if (prof_buffer) 130 if (prof_buffer) {
131 memset(prof_buffer, 0, buffer_bytes);
131 return 0; 132 return 0;
133 }
132 134
133 free_cpumask_var(prof_cpu_mask); 135 free_cpumask_var(prof_cpu_mask);
134 return -ENOMEM; 136 return -ENOMEM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/regset.h>
25 26
26 27
27/* 28/*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 512 return 0;
512} 513}
513 514
515#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
516
517static const struct user_regset *
518find_regset(const struct user_regset_view *view, unsigned int type)
519{
520 const struct user_regset *regset;
521 int n;
522
523 for (n = 0; n < view->n; ++n) {
524 regset = view->regsets + n;
525 if (regset->core_note_type == type)
526 return regset;
527 }
528
529 return NULL;
530}
531
532static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
533 struct iovec *kiov)
534{
535 const struct user_regset_view *view = task_user_regset_view(task);
536 const struct user_regset *regset = find_regset(view, type);
537 int regset_no;
538
539 if (!regset || (kiov->iov_len % regset->size) != 0)
540 return -EINVAL;
541
542 regset_no = regset - view->regsets;
543 kiov->iov_len = min(kiov->iov_len,
544 (__kernel_size_t) (regset->n * regset->size));
545
546 if (req == PTRACE_GETREGSET)
547 return copy_regset_to_user(task, view, regset_no, 0,
548 kiov->iov_len, kiov->iov_base);
549 else
550 return copy_regset_from_user(task, view, regset_no, 0,
551 kiov->iov_len, kiov->iov_base);
552}
553
554#endif
555
514int ptrace_request(struct task_struct *child, long request, 556int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 557 long addr, long data)
516{ 558{
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 615 return 0;
574 return ptrace_resume(child, request, SIGKILL); 616 return ptrace_resume(child, request, SIGKILL);
575 617
618#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
619 case PTRACE_GETREGSET:
620 case PTRACE_SETREGSET:
621 {
622 struct iovec kiov;
623 struct iovec __user *uiov = (struct iovec __user *) data;
624
625 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
626 return -EFAULT;
627
628 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
629 __get_user(kiov.iov_len, &uiov->iov_len))
630 return -EFAULT;
631
632 ret = ptrace_regset(child, request, addr, &kiov);
633 if (!ret)
634 ret = __put_user(kiov.iov_len, &uiov->iov_len);
635 break;
636 }
637#endif
576 default: 638 default:
577 break; 639 break;
578 } 640 }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 773 else
712 ret = ptrace_setsiginfo(child, &siginfo); 774 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 775 break;
776#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
777 case PTRACE_GETREGSET:
778 case PTRACE_SETREGSET:
779 {
780 struct iovec kiov;
781 struct compat_iovec __user *uiov =
782 (struct compat_iovec __user *) datap;
783 compat_uptr_t ptr;
784 compat_size_t len;
785
786 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
787 return -EFAULT;
788
789 if (__get_user(ptr, &uiov->iov_base) ||
790 __get_user(len, &uiov->iov_len))
791 return -EFAULT;
792
793 kiov.iov_base = compat_ptr(ptr);
794 kiov.iov_len = len;
795
796 ret = ptrace_regset(child, request, addr, &kiov);
797 if (!ret)
798 ret = __put_user(kiov.iov_len, &uiov->iov_len);
799 break;
800 }
801#endif
714 802
715 default: 803 default:
716 ret = ptrace_request(child, request, addr, data); 804 ret = ptrace_request(child, request, addr, data);
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "run of slot in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 400183346ad2..49d808e833b0 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,143 +45,91 @@
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h> 47#include <linux/kernel_stat.h>
48#include <linux/hardirq.h>
48 49
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 50#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 51static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 52struct lockdep_map rcu_lock_map =
52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 53 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53EXPORT_SYMBOL_GPL(rcu_lock_map); 54EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
55 55
56int rcu_scheduler_active __read_mostly; 56static struct lock_class_key rcu_bh_lock_key;
57struct lockdep_map rcu_bh_lock_map =
58 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
59EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
57 60
58/* 61static struct lock_class_key rcu_sched_lock_key;
59 * Awaken the corresponding synchronize_rcu() instance now that a 62struct lockdep_map rcu_sched_lock_map =
60 * grace period has elapsed. 63 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
61 */ 64EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
62void wakeme_after_rcu(struct rcu_head *head) 65#endif
63{
64 struct rcu_synchronize *rcu;
65 66
66 rcu = container_of(head, struct rcu_synchronize, head); 67int rcu_scheduler_active __read_mostly;
67 complete(&rcu->completion); 68EXPORT_SYMBOL_GPL(rcu_scheduler_active);
68}
69 69
70#ifdef CONFIG_TREE_PREEMPT_RCU 70#ifdef CONFIG_DEBUG_LOCK_ALLOC
71 71
72/** 72int debug_lockdep_rcu_enabled(void)
73 * synchronize_rcu - wait until a grace period has elapsed.
74 *
75 * Control will return to the caller some time after a full grace
76 * period has elapsed, in other words after all currently executing RCU
77 * read-side critical sections have completed. RCU read-side critical
78 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
79 * and may be nested.
80 */
81void synchronize_rcu(void)
82{ 73{
83 struct rcu_synchronize rcu; 74 return rcu_scheduler_active && debug_locks &&
84 75 current->lockdep_recursion == 0;
85 if (!rcu_scheduler_active)
86 return;
87
88 init_completion(&rcu.completion);
89 /* Will wake me after RCU finished. */
90 call_rcu(&rcu.head, wakeme_after_rcu);
91 /* Wait for it. */
92 wait_for_completion(&rcu.completion);
93} 76}
94EXPORT_SYMBOL_GPL(synchronize_rcu); 77EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
95
96#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
97 78
98/** 79/**
99 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 80 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
100 *
101 * Control will return to the caller some time after a full rcu-sched
102 * grace period has elapsed, in other words after all currently executing
103 * rcu-sched read-side critical sections have completed. These read-side
104 * critical sections are delimited by rcu_read_lock_sched() and
105 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
106 * local_irq_disable(), and so on may be used in place of
107 * rcu_read_lock_sched().
108 * 81 *
109 * This means that all preempt_disable code sequences, including NMI and 82 * Check for bottom half being disabled, which covers both the
110 * hardware-interrupt handlers, in progress on entry will have completed 83 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
111 * before this primitive returns. However, this does not guarantee that 84 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
112 * softirq handlers will have completed, since in some kernels, these 85 * will show the situation.
113 * handlers can run in process context, and can block.
114 * 86 *
115 * This primitive provides the guarantees made by the (now removed) 87 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
116 * synchronize_kernel() API. In contrast, synchronize_rcu() only
117 * guarantees that rcu_read_lock() sections will have completed.
118 * In "classic RCU", these two guarantees happen to be one and
119 * the same, but can differ in realtime RCU implementations.
120 */ 88 */
121void synchronize_sched(void) 89int rcu_read_lock_bh_held(void)
122{ 90{
123 struct rcu_synchronize rcu; 91 if (!debug_lockdep_rcu_enabled())
124 92 return 1;
125 if (rcu_blocking_is_gp()) 93 return in_softirq();
126 return;
127
128 init_completion(&rcu.completion);
129 /* Will wake me after RCU finished. */
130 call_rcu_sched(&rcu.head, wakeme_after_rcu);
131 /* Wait for it. */
132 wait_for_completion(&rcu.completion);
133} 94}
134EXPORT_SYMBOL_GPL(synchronize_sched); 95EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
135
136/**
137 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
138 *
139 * Control will return to the caller some time after a full rcu_bh grace
140 * period has elapsed, in other words after all currently executing rcu_bh
141 * read-side critical sections have completed. RCU read-side critical
142 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
143 * and may be nested.
144 */
145void synchronize_rcu_bh(void)
146{
147 struct rcu_synchronize rcu;
148 96
149 if (rcu_blocking_is_gp()) 97#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
150 return;
151 98
152 init_completion(&rcu.completion); 99/*
153 /* Will wake me after RCU finished. */ 100 * This function is invoked towards the end of the scheduler's initialization
154 call_rcu_bh(&rcu.head, wakeme_after_rcu); 101 * process. Before this is called, the idle task might contain
155 /* Wait for it. */ 102 * RCU read-side critical sections (during which time, this idle
156 wait_for_completion(&rcu.completion); 103 * task is booting the system). After this function is called, the
157} 104 * idle tasks are prohibited from containing RCU read-side critical
158EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 105 * sections.
159 106 */
160static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 107void rcu_scheduler_starting(void)
161 unsigned long action, void *hcpu)
162{ 108{
163 return rcu_cpu_notify(self, action, hcpu); 109 WARN_ON(num_online_cpus() != 1);
110 WARN_ON(nr_context_switches() > 0);
111 rcu_scheduler_active = 1;
164} 112}
165 113
166void __init rcu_init(void) 114/*
115 * Awaken the corresponding synchronize_rcu() instance now that a
116 * grace period has elapsed.
117 */
118void wakeme_after_rcu(struct rcu_head *head)
167{ 119{
168 int i; 120 struct rcu_synchronize *rcu;
169
170 __rcu_init();
171 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
172 121
173 /* 122 rcu = container_of(head, struct rcu_synchronize, head);
174 * We don't need protection against CPU-hotplug here because 123 complete(&rcu->completion);
175 * this is called early in boot, before either interrupts
176 * or the scheduler are operational.
177 */
178 for_each_online_cpu(i)
179 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
180} 124}
181 125
182void rcu_scheduler_starting(void) 126#ifdef CONFIG_PROVE_RCU
127/*
128 * wrapper function to avoid #include problems.
129 */
130int rcu_my_thread_group_empty(void)
183{ 131{
184 WARN_ON(num_online_cpus() != 1); 132 return thread_group_empty(current);
185 WARN_ON(nr_context_switches() > 0);
186 rcu_scheduler_active = 1;
187} 133}
134EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
135#endif /* #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..9f6d9ff2572c
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,282 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h>
27#include <linux/interrupt.h>
28#include <linux/notifier.h>
29#include <linux/rcupdate.h>
30#include <linux/kernel.h>
31#include <linux/module.h>
32#include <linux/mutex.h>
33#include <linux/sched.h>
34#include <linux/types.h>
35#include <linux/init.h>
36#include <linux/time.h>
37#include <linux/cpu.h>
38
/*
 * Control block for one flavor of RCU callback handling: a singly linked
 * callback list plus two tail pointers into it.
 */
struct rcu_ctrlblk {
	/* List of pending callbacks (CBs). */
	struct rcu_head *rcucblist;
	/* ->next pointer of last "done" CB. */
	struct rcu_head **donetail;
	/* ->next pointer of last CB. */
	struct rcu_head **curtail;
};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_NO_HZ
58
/*
 * Nesting counter decremented by rcu_enter_nohz() and incremented by
 * rcu_exit_nohz(); it starts at 1 and reaching 0 marks dynticks-idle mode.
 */
static long rcu_dynticks_nesting = 1;
60
61/*
62 * Enter dynticks-idle mode, which is an extended quiescent state
63 * if we have fully entered that mode (i.e., if the new value of
64 * dynticks_nesting is zero).
65 */
66void rcu_enter_nohz(void)
67{
68 if (--rcu_dynticks_nesting == 0)
69 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
70}
71
72/*
73 * Exit dynticks-idle mode, so that we are no longer in an extended
74 * quiescent state.
75 */
76void rcu_exit_nohz(void)
77{
78 rcu_dynticks_nesting++;
79}
80
81#endif /* #ifdef CONFIG_NO_HZ */
82
83/*
84 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
85 * Also disable irqs to avoid confusion due to interrupt handlers
86 * invoking call_rcu().
87 */
88static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
89{
90 unsigned long flags;
91
92 local_irq_save(flags);
93 if (rcp->rcucblist != NULL &&
94 rcp->donetail != rcp->curtail) {
95 rcp->donetail = rcp->curtail;
96 local_irq_restore(flags);
97 return 1;
98 }
99 local_irq_restore(flags);
100
101 return 0;
102}
103
104/*
105 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
106 * are at it, given that any rcu quiescent state is also an rcu_bh
107 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
108 */
109void rcu_sched_qs(int cpu)
110{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ);
113}
114
115/*
116 * Record an rcu_bh quiescent state.
117 */
118void rcu_bh_qs(int cpu)
119{
120 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
121 raise_softirq(RCU_SOFTIRQ);
122}
123
124/*
125 * Check to see if the scheduling-clock interrupt came from an extended
126 * quiescent state, and, if so, tell RCU about it.
127 */
128void rcu_check_callbacks(int cpu, int user)
129{
130 if (user ||
131 (idle_cpu(cpu) &&
132 !in_softirq() &&
133 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
134 rcu_sched_qs(cpu);
135 else if (!in_softirq())
136 rcu_bh_qs(cpu);
137}
138
139/*
140 * Helper function for rcu_process_callbacks() that operates on the
141 * specified rcu_ctrlkblk structure.
142 */
143static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
144{
145 struct rcu_head *next, *list;
146 unsigned long flags;
147
148 /* If no RCU callbacks ready to invoke, just return. */
149 if (&rcp->rcucblist == rcp->donetail)
150 return;
151
152 /* Move the ready-to-invoke callbacks to a local list. */
153 local_irq_save(flags);
154 list = rcp->rcucblist;
155 rcp->rcucblist = *rcp->donetail;
156 *rcp->donetail = NULL;
157 if (rcp->curtail == rcp->donetail)
158 rcp->curtail = &rcp->rcucblist;
159 rcp->donetail = &rcp->rcucblist;
160 local_irq_restore(flags);
161
162 /* Invoke the callbacks on the local list. */
163 while (list) {
164 next = list->next;
165 prefetch(next);
166 list->func(list);
167 list = next;
168 }
169}
170
171/*
172 * Invoke any callbacks whose grace period has completed.
173 */
174static void rcu_process_callbacks(struct softirq_action *unused)
175{
176 __rcu_process_callbacks(&rcu_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178}
179
/*
 * Wait for a grace period to elapse.
 *
 * It is illegal to invoke synchronize_sched() from within an RCU
 * read-side critical section, so any legal call is itself a quiescent
 * state and, on a UP system, there is nothing to wait for.  The same
 * holds for synchronize_rcu_bh().  (Idea due to Josh Triplett; Lai
 * Jiangshan points out the benefits of doing might_sleep() to reduce
 * latency.)
 *
 * But we want to make this a static inline later.
 */
void synchronize_sched(void)
{
	cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
197
/*
 * Wait for an rcu_bh grace period; implemented by calling
 * synchronize_sched().
 */
void synchronize_rcu_bh(void)
{
	synchronize_sched();
}
EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/*
205 * Helper function for call_rcu() and call_rcu_bh().
206 */
207static void __call_rcu(struct rcu_head *head,
208 void (*func)(struct rcu_head *rcu),
209 struct rcu_ctrlblk *rcp)
210{
211 unsigned long flags;
212
213 head->func = func;
214 head->next = NULL;
215
216 local_irq_save(flags);
217 *rcp->curtail = head;
218 rcp->curtail = &head->next;
219 local_irq_restore(flags);
220}
221
222/*
223 * Post an RCU callback to be invoked after the end of an RCU grace
224 * period. But since we have but one CPU, that would be after any
225 * quiescent state.
226 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{
229 __call_rcu(head, func, &rcu_ctrlblk);
230}
231EXPORT_SYMBOL_GPL(call_rcu);
232
233/*
234 * Post an RCU bottom-half callback to be invoked after any subsequent
235 * quiescent state.
236 */
237void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
238{
239 __call_rcu(head, func, &rcu_bh_ctrlblk);
240}
241EXPORT_SYMBOL_GPL(call_rcu_bh);
242
243void rcu_barrier(void)
244{
245 struct rcu_synchronize rcu;
246
247 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */
251 wait_for_completion(&rcu.completion);
252}
253EXPORT_SYMBOL_GPL(rcu_barrier);
254
255void rcu_barrier_bh(void)
256{
257 struct rcu_synchronize rcu;
258
259 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */
263 wait_for_completion(&rcu.completion);
264}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266
267void rcu_barrier_sched(void)
268{
269 struct rcu_synchronize rcu;
270
271 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */
275 wait_for_completion(&rcu.completion);
276}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278
279void __init rcu_init(void)
280{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282}
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 697c0a0229d4..58df55bf83ed 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -327,6 +338,11 @@ rcu_torture_cb(struct rcu_head *p)
327 cur_ops->deferred_free(rp); 338 cur_ops->deferred_free(rp);
328} 339}
329 340
/*
 * ->completed() stand-in that always reports zero.
 */
static int rcu_no_completed(void)
{
	const int completed = 0;

	return completed;
}
345
330static void rcu_torture_deferred_free(struct rcu_torture *p) 346static void rcu_torture_deferred_free(struct rcu_torture *p)
331{ 347{
332 call_rcu(&p->rtort_rcu, rcu_torture_cb); 348 call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -342,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
342 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
343 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
344 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
345 .stats = NULL, 362 .stats = NULL,
346 .irq_capable = 1, 363 .irq_capable = 1,
347 .name = "rcu" 364 .name = "rcu"
@@ -383,11 +400,28 @@ static struct rcu_torture_ops rcu_sync_ops = {
383 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
384 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
385 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
386 .stats = NULL, 404 .stats = NULL,
387 .irq_capable = 1, 405 .irq_capable = 1,
388 .name = "rcu_sync" 406 .name = "rcu_sync"
389}; 407};
390 408
409static struct rcu_torture_ops rcu_expedited_ops = {
410 .init = rcu_sync_torture_init,
411 .cleanup = NULL,
412 .readlock = rcu_torture_read_lock,
413 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
414 .readunlock = rcu_torture_read_unlock,
415 .completed = rcu_no_completed,
416 .deferred_free = rcu_sync_torture_deferred_free,
417 .sync = synchronize_rcu_expedited,
418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
420 .stats = NULL,
421 .irq_capable = 1,
422 .name = "rcu_expedited"
423};
424
391/* 425/*
392 * Definitions for rcu_bh torture testing. 426 * Definitions for rcu_bh torture testing.
393 */ 427 */
@@ -445,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
445 .deferred_free = rcu_bh_torture_deferred_free, 479 .deferred_free = rcu_bh_torture_deferred_free,
446 .sync = rcu_bh_torture_synchronize, 480 .sync = rcu_bh_torture_synchronize,
447 .cb_barrier = rcu_barrier_bh, 481 .cb_barrier = rcu_barrier_bh,
482 .fqs = rcu_bh_force_quiescent_state,
448 .stats = NULL, 483 .stats = NULL,
449 .irq_capable = 1, 484 .irq_capable = 1,
450 .name = "rcu_bh" 485 .name = "rcu_bh"
@@ -460,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
460 .deferred_free = rcu_sync_torture_deferred_free, 495 .deferred_free = rcu_sync_torture_deferred_free,
461 .sync = rcu_bh_torture_synchronize, 496 .sync = rcu_bh_torture_synchronize,
462 .cb_barrier = NULL, 497 .cb_barrier = NULL,
498 .fqs = rcu_bh_force_quiescent_state,
463 .stats = NULL, 499 .stats = NULL,
464 .irq_capable = 1, 500 .irq_capable = 1,
465 .name = "rcu_bh_sync" 501 .name = "rcu_bh_sync"
@@ -547,6 +583,25 @@ static struct rcu_torture_ops srcu_ops = {
547 .name = "srcu" 583 .name = "srcu"
548}; 584};
549 585
586static void srcu_torture_synchronize_expedited(void)
587{
588 synchronize_srcu_expedited(&srcu_ctl);
589}
590
591static struct rcu_torture_ops srcu_expedited_ops = {
592 .init = srcu_torture_init,
593 .cleanup = srcu_torture_cleanup,
594 .readlock = srcu_torture_read_lock,
595 .read_delay = srcu_read_delay,
596 .readunlock = srcu_torture_read_unlock,
597 .completed = srcu_torture_completed,
598 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = srcu_torture_synchronize_expedited,
600 .cb_barrier = NULL,
601 .stats = srcu_torture_stats,
602 .name = "srcu_expedited"
603};
604
550/* 605/*
551 * Definitions for sched torture testing. 606 * Definitions for sched torture testing.
552 */ 607 */
@@ -562,11 +617,6 @@ static void sched_torture_read_unlock(int idx)
562 preempt_enable(); 617 preempt_enable();
563} 618}
564 619
565static int sched_torture_completed(void)
566{
567 return 0;
568}
569
570static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 620static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
571{ 621{
572 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 622 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -583,25 +633,27 @@ static struct rcu_torture_ops sched_ops = {
583 .readlock = sched_torture_read_lock, 633 .readlock = sched_torture_read_lock,
584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 634 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
585 .readunlock = sched_torture_read_unlock, 635 .readunlock = sched_torture_read_unlock,
586 .completed = sched_torture_completed, 636 .completed = rcu_no_completed,
587 .deferred_free = rcu_sched_torture_deferred_free, 637 .deferred_free = rcu_sched_torture_deferred_free,
588 .sync = sched_torture_synchronize, 638 .sync = sched_torture_synchronize,
589 .cb_barrier = rcu_barrier_sched, 639 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state,
590 .stats = NULL, 641 .stats = NULL,
591 .irq_capable = 1, 642 .irq_capable = 1,
592 .name = "sched" 643 .name = "sched"
593}; 644};
594 645
595static struct rcu_torture_ops sched_ops_sync = { 646static struct rcu_torture_ops sched_sync_ops = {
596 .init = rcu_sync_torture_init, 647 .init = rcu_sync_torture_init,
597 .cleanup = NULL, 648 .cleanup = NULL,
598 .readlock = sched_torture_read_lock, 649 .readlock = sched_torture_read_lock,
599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 650 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
600 .readunlock = sched_torture_read_unlock, 651 .readunlock = sched_torture_read_unlock,
601 .completed = sched_torture_completed, 652 .completed = rcu_no_completed,
602 .deferred_free = rcu_sync_torture_deferred_free, 653 .deferred_free = rcu_sync_torture_deferred_free,
603 .sync = sched_torture_synchronize, 654 .sync = sched_torture_synchronize,
604 .cb_barrier = NULL, 655 .cb_barrier = NULL,
656 .fqs = rcu_sched_force_quiescent_state,
605 .stats = NULL, 657 .stats = NULL,
606 .name = "sched_sync" 658 .name = "sched_sync"
607}; 659};
@@ -612,16 +664,49 @@ static struct rcu_torture_ops sched_expedited_ops = {
612 .readlock = sched_torture_read_lock, 664 .readlock = sched_torture_read_lock,
613 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 665 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
614 .readunlock = sched_torture_read_unlock, 666 .readunlock = sched_torture_read_unlock,
615 .completed = sched_torture_completed, 667 .completed = rcu_no_completed,
616 .deferred_free = rcu_sync_torture_deferred_free, 668 .deferred_free = rcu_sync_torture_deferred_free,
617 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
618 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state,
619 .stats = rcu_expedited_torture_stats, 672 .stats = rcu_expedited_torture_stats,
620 .irq_capable = 1, 673 .irq_capable = 1,
621 .name = "sched_expedited" 674 .name = "sched_expedited"
622}; 675};
623 676
624/* 677/*
678 * RCU torture force-quiescent-state kthread. Repeatedly induces
679 * bursts of calls to force_quiescent_state(), increasing the probability
680 * of occurrence of some important types of race conditions.
681 */
682static int
683rcu_torture_fqs(void *arg)
684{
685 unsigned long fqs_resume_time;
686 int fqs_burst_remaining;
687
688 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
689 do {
690 fqs_resume_time = jiffies + fqs_stutter * HZ;
691 while (jiffies - fqs_resume_time > LONG_MAX) {
692 schedule_timeout_interruptible(1);
693 }
694 fqs_burst_remaining = fqs_duration;
695 while (fqs_burst_remaining > 0) {
696 cur_ops->fqs();
697 udelay(fqs_holdoff);
698 fqs_burst_remaining -= fqs_holdoff;
699 }
700 rcu_stutter_wait("rcu_torture_fqs");
701 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
702 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
703 rcutorture_shutdown_absorb("rcu_torture_fqs");
704 while (!kthread_should_stop())
705 schedule_timeout_uninterruptible(1);
706 return 0;
707}
708
709/*
625 * RCU torture writer kthread. Repeatedly substitutes a new structure 710 * RCU torture writer kthread. Repeatedly substitutes a new structure
626 * for that pointed to by rcu_torture_current, freeing the old structure 711 * for that pointed to by rcu_torture_current, freeing the old structure
627 * after a series of grace periods (the "pipeline"). 712 * after a series of grace periods (the "pipeline").
@@ -711,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused)
711 796
712 idx = cur_ops->readlock(); 797 idx = cur_ops->readlock();
713 completed = cur_ops->completed(); 798 completed = cur_ops->completed();
714 p = rcu_dereference(rcu_torture_current); 799 p = rcu_dereference_check(rcu_torture_current,
800 rcu_read_lock_held() ||
801 rcu_read_lock_bh_held() ||
802 rcu_read_lock_sched_held() ||
803 srcu_read_lock_held(&srcu_ctl));
715 if (p == NULL) { 804 if (p == NULL) {
716 /* Leave because rcu_torture_writer is not yet underway */ 805 /* Leave because rcu_torture_writer is not yet underway */
717 cur_ops->readunlock(idx); 806 cur_ops->readunlock(idx);
@@ -729,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
729 /* Should not happen, but... */ 818 /* Should not happen, but... */
730 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
731 } 820 }
732 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 821 __this_cpu_inc(rcu_torture_count[pipe_count]);
733 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
734 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
735 /* Should not happen, but... */ 824 /* Should not happen, but... */
736 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
737 } 826 }
738 ++__get_cpu_var(rcu_torture_batch)[completed]; 827 __this_cpu_inc(rcu_torture_batch[completed]);
739 preempt_enable(); 828 preempt_enable();
740 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
741} 830}
@@ -764,11 +853,15 @@ rcu_torture_reader(void *arg)
764 do { 853 do {
765 if (irqreader && cur_ops->irq_capable) { 854 if (irqreader && cur_ops->irq_capable) {
766 if (!timer_pending(&t)) 855 if (!timer_pending(&t))
767 mod_timer(&t, 1); 856 mod_timer(&t, jiffies + 1);
768 } 857 }
769 idx = cur_ops->readlock(); 858 idx = cur_ops->readlock();
770 completed = cur_ops->completed(); 859 completed = cur_ops->completed();
771 p = rcu_dereference(rcu_torture_current); 860 p = rcu_dereference_check(rcu_torture_current,
861 rcu_read_lock_held() ||
862 rcu_read_lock_bh_held() ||
863 rcu_read_lock_sched_held() ||
864 srcu_read_lock_held(&srcu_ctl));
772 if (p == NULL) { 865 if (p == NULL) {
773 /* Wait for rcu_torture_writer to get underway */ 866 /* Wait for rcu_torture_writer to get underway */
774 cur_ops->readunlock(idx); 867 cur_ops->readunlock(idx);
@@ -784,13 +877,13 @@ rcu_torture_reader(void *arg)
784 /* Should not happen, but... */ 877 /* Should not happen, but... */
785 pipe_count = RCU_TORTURE_PIPE_LEN; 878 pipe_count = RCU_TORTURE_PIPE_LEN;
786 } 879 }
787 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 880 __this_cpu_inc(rcu_torture_count[pipe_count]);
788 completed = cur_ops->completed() - completed; 881 completed = cur_ops->completed() - completed;
789 if (completed > RCU_TORTURE_PIPE_LEN) { 882 if (completed > RCU_TORTURE_PIPE_LEN) {
790 /* Should not happen, but... */ 883 /* Should not happen, but... */
791 completed = RCU_TORTURE_PIPE_LEN; 884 completed = RCU_TORTURE_PIPE_LEN;
792 } 885 }
793 ++__get_cpu_var(rcu_torture_batch)[completed]; 886 __this_cpu_inc(rcu_torture_batch[completed]);
794 preempt_enable(); 887 preempt_enable();
795 cur_ops->readunlock(idx); 888 cur_ops->readunlock(idx);
796 schedule(); 889 schedule();
@@ -996,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag)
996 printk(KERN_ALERT "%s" TORTURE_FLAG 1089 printk(KERN_ALERT "%s" TORTURE_FLAG
997 "--- %s: nreaders=%d nfakewriters=%d " 1090 "--- %s: nreaders=%d nfakewriters=%d "
998 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1091 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
999 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1092 "shuffle_interval=%d stutter=%d irqreader=%d "
1093 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1000 torture_type, tag, nrealreaders, nfakewriters, 1094 torture_type, tag, nrealreaders, nfakewriters,
1001 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1095 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1002 stutter, irqreader); 1096 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1003} 1097}
1004 1098
1005static struct notifier_block rcutorture_nb = { 1099static struct notifier_block rcutorture_nb = {
@@ -1075,6 +1169,12 @@ rcu_torture_cleanup(void)
1075 } 1169 }
1076 stats_task = NULL; 1170 stats_task = NULL;
1077 1171
1172 if (fqs_task) {
1173 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1174 kthread_stop(fqs_task);
1175 }
1176 fqs_task = NULL;
1177
1078 /* Wait for all RCU callbacks to fire. */ 1178 /* Wait for all RCU callbacks to fire. */
1079 1179
1080 if (cur_ops->cb_barrier != NULL) 1180 if (cur_ops->cb_barrier != NULL)
@@ -1097,9 +1197,10 @@ rcu_torture_init(void)
1097 int cpu; 1197 int cpu;
1098 int firsterr = 0; 1198 int firsterr = 0;
1099 static struct rcu_torture_ops *torture_ops[] = 1199 static struct rcu_torture_ops *torture_ops[] =
1100 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1200 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1101 &sched_expedited_ops, 1201 &rcu_bh_ops, &rcu_bh_sync_ops,
1102 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1202 &srcu_ops, &srcu_expedited_ops,
1203 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1103 1204
1104 mutex_lock(&fullstop_mutex); 1205 mutex_lock(&fullstop_mutex);
1105 1206
@@ -1110,11 +1211,20 @@ rcu_torture_init(void)
1110 break; 1211 break;
1111 } 1212 }
1112 if (i == ARRAY_SIZE(torture_ops)) { 1213 if (i == ARRAY_SIZE(torture_ops)) {
1113 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1214 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1114 torture_type); 1215 torture_type);
1216 printk(KERN_ALERT "rcu-torture types:");
1217 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1218 printk(KERN_ALERT " %s", torture_ops[i]->name);
1219 printk(KERN_ALERT "\n");
1115 mutex_unlock(&fullstop_mutex); 1220 mutex_unlock(&fullstop_mutex);
1116 return -EINVAL; 1221 return -EINVAL;
1117 } 1222 }
1223 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1224 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1225 "fqs_duration, fqs disabled.\n");
1226 fqs_duration = 0;
1227 }
1118 if (cur_ops->init) 1228 if (cur_ops->init)
1119 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1229 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1120 1230
@@ -1243,6 +1353,19 @@ rcu_torture_init(void)
1243 goto unwind; 1353 goto unwind;
1244 } 1354 }
1245 } 1355 }
1356 if (fqs_duration < 0)
1357 fqs_duration = 0;
1358 if (fqs_duration) {
1359 /* Create the stutter thread */
1360 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1361 "rcu_torture_fqs");
1362 if (IS_ERR(fqs_task)) {
1363 firsterr = PTR_ERR(fqs_task);
1364 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1365 fqs_task = NULL;
1366 goto unwind;
1367 }
1368 }
1246 register_reboot_notifier(&rcutorture_nb); 1369 register_reboot_notifier(&rcutorture_nb);
1247 mutex_unlock(&fullstop_mutex); 1370 mutex_unlock(&fullstop_mutex);
1248 return 0; 1371 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f3077c0ab181..3ec8160fc75f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -51,22 +51,25 @@
51 51
52/* Data structures. */ 52/* Data structures. */
53 53
54static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55
54#define RCU_STATE_INITIALIZER(name) { \ 56#define RCU_STATE_INITIALIZER(name) { \
55 .level = { &name.node[0] }, \ 57 .level = { &name.node[0] }, \
56 .levelcnt = { \ 58 .levelcnt = { \
57 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 59 NUM_RCU_LVL_0, /* root of hierarchy. */ \
58 NUM_RCU_LVL_1, \ 60 NUM_RCU_LVL_1, \
59 NUM_RCU_LVL_2, \ 61 NUM_RCU_LVL_2, \
60 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ 62 NUM_RCU_LVL_3, \
63 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
61 }, \ 64 }, \
62 .signaled = RCU_GP_IDLE, \ 65 .signaled = RCU_GP_IDLE, \
63 .gpnum = -300, \ 66 .gpnum = -300, \
64 .completed = -300, \ 67 .completed = -300, \
65 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
66 .orphan_cbs_list = NULL, \ 69 .orphan_cbs_list = NULL, \
67 .orphan_cbs_tail = &name.orphan_cbs_list, \ 70 .orphan_cbs_tail = &name.orphan_cbs_list, \
68 .orphan_qlen = 0, \ 71 .orphan_qlen = 0, \
69 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
70 .n_force_qs = 0, \ 73 .n_force_qs = 0, \
71 .n_force_qs_ngp = 0, \ 74 .n_force_qs_ngp = 0, \
72} 75}
@@ -77,7 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
78DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79 82
80
81/* 83/*
82 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
83 * permit this function to be invoked without holding the root rcu_node 85 * permit this function to be invoked without holding the root rcu_node
@@ -98,7 +100,7 @@ void rcu_sched_qs(int cpu)
98 struct rcu_data *rdp; 100 struct rcu_data *rdp;
99 101
100 rdp = &per_cpu(rcu_sched_data, cpu); 102 rdp = &per_cpu(rcu_sched_data, cpu);
101 rdp->passed_quiesc_completed = rdp->completed; 103 rdp->passed_quiesc_completed = rdp->gpnum - 1;
102 barrier(); 104 barrier();
103 rdp->passed_quiesc = 1; 105 rdp->passed_quiesc = 1;
104 rcu_preempt_note_context_switch(cpu); 106 rcu_preempt_note_context_switch(cpu);
@@ -109,7 +111,7 @@ void rcu_bh_qs(int cpu)
109 struct rcu_data *rdp; 111 struct rcu_data *rdp;
110 112
111 rdp = &per_cpu(rcu_bh_data, cpu); 113 rdp = &per_cpu(rcu_bh_data, cpu);
112 rdp->passed_quiesc_completed = rdp->completed; 114 rdp->passed_quiesc_completed = rdp->gpnum - 1;
113 barrier(); 115 barrier();
114 rdp->passed_quiesc = 1; 116 rdp->passed_quiesc = 1;
115} 117}
@@ -151,6 +153,24 @@ long rcu_batches_completed_bh(void)
151EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 153EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
152 154
153/* 155/*
156 * Force a quiescent state for RCU BH.
157 */
158void rcu_bh_force_quiescent_state(void)
159{
160 force_quiescent_state(&rcu_bh_state, 0);
161}
162EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
163
164/*
165 * Force a quiescent state for RCU-sched.
166 */
167void rcu_sched_force_quiescent_state(void)
168{
169 force_quiescent_state(&rcu_sched_state, 0);
170}
171EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
172
173/*
154 * Does the CPU have callbacks ready to be invoked? 174 * Does the CPU have callbacks ready to be invoked?
155 */ 175 */
156static int 176static int
@@ -335,28 +355,9 @@ void rcu_irq_exit(void)
335 set_need_resched(); 355 set_need_resched();
336} 356}
337 357
338/*
339 * Record the specified "completed" value, which is later used to validate
340 * dynticks counter manipulations. Specify "rsp->completed - 1" to
341 * unconditionally invalidate any future dynticks manipulations (which is
342 * useful at the beginning of a grace period).
343 */
344static void dyntick_record_completed(struct rcu_state *rsp, long comp)
345{
346 rsp->dynticks_completed = comp;
347}
348
349#ifdef CONFIG_SMP 358#ifdef CONFIG_SMP
350 359
351/* 360/*
352 * Recall the previously recorded value of the completion for dynticks.
353 */
354static long dyntick_recall_completed(struct rcu_state *rsp)
355{
356 return rsp->dynticks_completed;
357}
358
359/*
360 * Snapshot the specified CPU's dynticks counter so that we can later 361 * Snapshot the specified CPU's dynticks counter so that we can later
361 * credit them with an implicit quiescent state. Return 1 if this CPU 362 * credit them with an implicit quiescent state. Return 1 if this CPU
362 * is in dynticks idle mode, which is an extended quiescent state. 363 * is in dynticks idle mode, which is an extended quiescent state.
@@ -419,24 +420,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
419 420
420#else /* #ifdef CONFIG_NO_HZ */ 421#else /* #ifdef CONFIG_NO_HZ */
421 422
422static void dyntick_record_completed(struct rcu_state *rsp, long comp)
423{
424}
425
426#ifdef CONFIG_SMP 423#ifdef CONFIG_SMP
427 424
428/*
429 * If there are no dynticks, then the only way that a CPU can passively
430 * be in a quiescent state is to be offline. Unlike dynticks idle, which
431 * is a point in time during the prior (already finished) grace period,
432 * an offline CPU is always in a quiescent state, and thus can be
433 * unconditionally applied. So just return the current value of completed.
434 */
435static long dyntick_recall_completed(struct rcu_state *rsp)
436{
437 return rsp->completed;
438}
439
440static int dyntick_save_progress_counter(struct rcu_data *rdp) 425static int dyntick_save_progress_counter(struct rcu_data *rdp)
441{ 426{
442 return 0; 427 return 0;
@@ -468,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
468 453
469 /* Only let one CPU complain about others per time interval. */ 454 /* Only let one CPU complain about others per time interval. */
470 455
471 spin_lock_irqsave(&rnp->lock, flags); 456 raw_spin_lock_irqsave(&rnp->lock, flags);
472 delta = jiffies - rsp->jiffies_stall; 457 delta = jiffies - rsp->jiffies_stall;
473 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 458 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
474 spin_unlock_irqrestore(&rnp->lock, flags); 459 raw_spin_unlock_irqrestore(&rnp->lock, flags);
475 return; 460 return;
476 } 461 }
477 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 462 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
@@ -481,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
481 * due to CPU offlining. 466 * due to CPU offlining.
482 */ 467 */
483 rcu_print_task_stall(rnp); 468 rcu_print_task_stall(rnp);
484 spin_unlock_irqrestore(&rnp->lock, flags); 469 raw_spin_unlock_irqrestore(&rnp->lock, flags);
485 470
486 /* OK, time to rat on our buddy... */ 471 /* OK, time to rat on our buddy... */
487 472
488 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 473 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
489 rcu_for_each_leaf_node(rsp, rnp) { 474 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags);
490 rcu_print_task_stall(rnp); 476 rcu_print_task_stall(rnp);
477 raw_spin_unlock_irqrestore(&rnp->lock, flags);
491 if (rnp->qsmask == 0) 478 if (rnp->qsmask == 0)
492 continue; 479 continue;
493 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 480 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -498,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
498 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 485 smp_processor_id(), (long)(jiffies - rsp->gp_start));
499 trigger_all_cpu_backtrace(); 486 trigger_all_cpu_backtrace();
500 487
488 /* If so configured, complain about tasks blocking the grace period. */
489
490 rcu_print_detail_task_stall(rsp);
491
501 force_quiescent_state(rsp, 0); /* Kick them all. */ 492 force_quiescent_state(rsp, 0); /* Kick them all. */
502} 493}
503 494
@@ -510,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
510 smp_processor_id(), jiffies - rsp->gp_start); 501 smp_processor_id(), jiffies - rsp->gp_start);
511 trigger_all_cpu_backtrace(); 502 trigger_all_cpu_backtrace();
512 503
513 spin_lock_irqsave(&rnp->lock, flags); 504 raw_spin_lock_irqsave(&rnp->lock, flags);
514 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 505 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
515 rsp->jiffies_stall = 506 rsp->jiffies_stall =
516 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 507 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
517 spin_unlock_irqrestore(&rnp->lock, flags); 508 raw_spin_unlock_irqrestore(&rnp->lock, flags);
518 509
519 set_need_resched(); /* kick ourselves to get things going. */ 510 set_need_resched(); /* kick ourselves to get things going. */
520} 511}
@@ -553,13 +544,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
553/* 544/*
554 * Update CPU-local rcu_data state to record the newly noticed grace period. 545 * Update CPU-local rcu_data state to record the newly noticed grace period.
555 * This is used both when we started the grace period and when we notice 546 * This is used both when we started the grace period and when we notice
556 * that someone else started the grace period. 547 * that someone else started the grace period. The caller must hold the
548 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
549 * and must have irqs disabled.
557 */ 550 */
551static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
552{
553 if (rdp->gpnum != rnp->gpnum) {
554 rdp->qs_pending = 1;
555 rdp->passed_quiesc = 0;
556 rdp->gpnum = rnp->gpnum;
557 }
558}
559
558static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) 560static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
559{ 561{
560 rdp->qs_pending = 1; 562 unsigned long flags;
561 rdp->passed_quiesc = 0; 563 struct rcu_node *rnp;
562 rdp->gpnum = rsp->gpnum; 564
565 local_irq_save(flags);
566 rnp = rdp->mynode;
567 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
568 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
569 local_irq_restore(flags);
570 return;
571 }
572 __note_new_gpnum(rsp, rnp, rdp);
573 raw_spin_unlock_irqrestore(&rnp->lock, flags);
563} 574}
564 575
565/* 576/*
@@ -583,31 +594,59 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
583} 594}
584 595
585/* 596/*
586 * Start a new RCU grace period if warranted, re-initializing the hierarchy 597 * Advance this CPU's callbacks, but only if the current grace period
587 * in preparation for detecting the next grace period. The caller must hold 598 * has ended. This may be called only from the CPU to whom the rdp
588 * the root node's ->lock, which is released before return. Hard irqs must 599 * belongs. In addition, the corresponding leaf rcu_node structure's
589 * be disabled. 600 * ->lock must be held by the caller, with irqs disabled.
590 */ 601 */
591static void 602static void
592rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 603__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
593 __releases(rcu_get_root(rsp)->lock)
594{ 604{
595 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 605 /* Did another grace period end? */
596 struct rcu_node *rnp = rcu_get_root(rsp); 606 if (rdp->completed != rnp->completed) {
607
608 /* Advance callbacks. No harm if list empty. */
609 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
610 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
611 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
612
613 /* Remember that we saw this grace-period completion. */
614 rdp->completed = rnp->completed;
615 }
616}
617
618/*
619 * Advance this CPU's callbacks, but only if the current grace period
620 * has ended. This may be called only from the CPU to whom the rdp
621 * belongs.
622 */
623static void
624rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
625{
626 unsigned long flags;
627 struct rcu_node *rnp;
597 628
598 if (!cpu_needs_another_gp(rsp, rdp)) { 629 local_irq_save(flags);
599 spin_unlock_irqrestore(&rnp->lock, flags); 630 rnp = rdp->mynode;
631 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
632 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
633 local_irq_restore(flags);
600 return; 634 return;
601 } 635 }
636 __rcu_process_gp_end(rsp, rnp, rdp);
637 raw_spin_unlock_irqrestore(&rnp->lock, flags);
638}
602 639
603 /* Advance to a new grace period and initialize state. */ 640/*
604 rsp->gpnum++; 641 * Do per-CPU grace-period initialization for running CPU. The caller
605 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 642 * must hold the lock of the leaf rcu_node structure corresponding to
606 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 643 * this CPU.
607 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 644 */
608 record_gp_stall_check_time(rsp); 645static void
609 dyntick_record_completed(rsp, rsp->completed - 1); 646rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
610 note_new_gpnum(rsp, rdp); 647{
648 /* Prior grace period ended, so advance callbacks for current CPU. */
649 __rcu_process_gp_end(rsp, rnp, rdp);
611 650
612 /* 651 /*
613 * Because this CPU just now started the new grace period, we know 652 * Because this CPU just now started the new grace period, we know
@@ -623,21 +662,70 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
623 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 662 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
624 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 663 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
625 664
665 /* Set state so that this CPU will detect the next quiescent state. */
666 __note_new_gpnum(rsp, rnp, rdp);
667}
668
669/*
670 * Start a new RCU grace period if warranted, re-initializing the hierarchy
671 * in preparation for detecting the next grace period. The caller must hold
672 * the root node's ->lock, which is released before return. Hard irqs must
673 * be disabled.
674 */
675static void
676rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
677 __releases(rcu_get_root(rsp)->lock)
678{
679 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
680 struct rcu_node *rnp = rcu_get_root(rsp);
681
682 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
683 if (cpu_needs_another_gp(rsp, rdp))
684 rsp->fqs_need_gp = 1;
685 if (rnp->completed == rsp->completed) {
686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
687 return;
688 }
689 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
690
691 /*
692 * Propagate new ->completed value to rcu_node structures
693 * so that other CPUs don't have to wait until the start
694 * of the next grace period to process their callbacks.
695 */
696 rcu_for_each_node_breadth_first(rsp, rnp) {
697 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
698 rnp->completed = rsp->completed;
699 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
700 }
701 local_irq_restore(flags);
702 return;
703 }
704
705 /* Advance to a new grace period and initialize state. */
706 rsp->gpnum++;
707 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
708 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
709 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
710 record_gp_stall_check_time(rsp);
711
626 /* Special-case the common single-level case. */ 712 /* Special-case the common single-level case. */
627 if (NUM_RCU_NODES == 1) { 713 if (NUM_RCU_NODES == 1) {
628 rcu_preempt_check_blocked_tasks(rnp); 714 rcu_preempt_check_blocked_tasks(rnp);
629 rnp->qsmask = rnp->qsmaskinit; 715 rnp->qsmask = rnp->qsmaskinit;
630 rnp->gpnum = rsp->gpnum; 716 rnp->gpnum = rsp->gpnum;
717 rnp->completed = rsp->completed;
631 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 718 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
632 spin_unlock_irqrestore(&rnp->lock, flags); 719 rcu_start_gp_per_cpu(rsp, rnp, rdp);
720 raw_spin_unlock_irqrestore(&rnp->lock, flags);
633 return; 721 return;
634 } 722 }
635 723
636 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 724 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
637 725
638 726
639 /* Exclude any concurrent CPU-hotplug operations. */ 727 /* Exclude any concurrent CPU-hotplug operations. */
640 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 728 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
641 729
642 /* 730 /*
643 * Set the quiescent-state-needed bits in all the rcu_node 731 * Set the quiescent-state-needed bits in all the rcu_node
@@ -657,73 +745,50 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
657 * irqs disabled. 745 * irqs disabled.
658 */ 746 */
659 rcu_for_each_node_breadth_first(rsp, rnp) { 747 rcu_for_each_node_breadth_first(rsp, rnp) {
660 spin_lock(&rnp->lock); /* irqs already disabled. */ 748 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
661 rcu_preempt_check_blocked_tasks(rnp); 749 rcu_preempt_check_blocked_tasks(rnp);
662 rnp->qsmask = rnp->qsmaskinit; 750 rnp->qsmask = rnp->qsmaskinit;
663 rnp->gpnum = rsp->gpnum; 751 rnp->gpnum = rsp->gpnum;
664 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 752 rnp->completed = rsp->completed;
753 if (rnp == rdp->mynode)
754 rcu_start_gp_per_cpu(rsp, rnp, rdp);
755 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
665 } 756 }
666 757
667 rnp = rcu_get_root(rsp); 758 rnp = rcu_get_root(rsp);
668 spin_lock(&rnp->lock); /* irqs already disabled. */ 759 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
669 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 760 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
670 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 761 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
671 spin_unlock_irqrestore(&rsp->onofflock, flags); 762 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
672} 763}
673 764
674/* 765/*
675 * Advance this CPU's callbacks, but only if the current grace period 766 * Report a full set of quiescent states to the specified rcu_state
676 * has ended. This may be called only from the CPU to whom the rdp 767 * data structure. This involves cleaning up after the prior grace
677 * belongs. 768 * period and letting rcu_start_gp() start up the next grace period
769 * if one is needed. Note that the caller must hold rnp->lock, as
770 * required by rcu_start_gp(), which will release it.
678 */ 771 */
679static void 772static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
680rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
681{
682 long completed_snap;
683 unsigned long flags;
684
685 local_irq_save(flags);
686 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
687
688 /* Did another grace period end? */
689 if (rdp->completed != completed_snap) {
690
691 /* Advance callbacks. No harm if list empty. */
692 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
693 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
694 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
695
696 /* Remember that we saw this grace-period completion. */
697 rdp->completed = completed_snap;
698 }
699 local_irq_restore(flags);
700}
701
702/*
703 * Clean up after the prior grace period and let rcu_start_gp() start up
704 * the next grace period if one is needed. Note that the caller must
705 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
706 */
707static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
708 __releases(rcu_get_root(rsp)->lock) 773 __releases(rcu_get_root(rsp)->lock)
709{ 774{
710 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 775 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
711 rsp->completed = rsp->gpnum; 776 rsp->completed = rsp->gpnum;
712 rsp->signaled = RCU_GP_IDLE; 777 rsp->signaled = RCU_GP_IDLE;
713 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
714 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 778 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
715} 779}
716 780
717/* 781/*
718 * Similar to cpu_quiet(), for which it is a helper function. Allows 782 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
719 * a group of CPUs to be quieted at one go, though all the CPUs in the 783 * Allows quiescent states for a group of CPUs to be reported at one go
720 * group must be represented by the same leaf rcu_node structure. 784 * to the specified rcu_node structure, though all the CPUs in the group
721 * That structure's lock must be held upon entry, and it is released 785 * must be represented by the same rcu_node structure (which need not be
722 * before return. 786 * a leaf rcu_node structure, though it often will be). That structure's
787 * lock must be held upon entry, and it is released before return.
723 */ 788 */
724static void 789static void
725cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, 790rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
726 unsigned long flags) 791 struct rcu_node *rnp, unsigned long flags)
727 __releases(rnp->lock) 792 __releases(rnp->lock)
728{ 793{
729 struct rcu_node *rnp_c; 794 struct rcu_node *rnp_c;
@@ -733,14 +798,14 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
733 if (!(rnp->qsmask & mask)) { 798 if (!(rnp->qsmask & mask)) {
734 799
735 /* Our bit has already been cleared, so done. */ 800 /* Our bit has already been cleared, so done. */
736 spin_unlock_irqrestore(&rnp->lock, flags); 801 raw_spin_unlock_irqrestore(&rnp->lock, flags);
737 return; 802 return;
738 } 803 }
739 rnp->qsmask &= ~mask; 804 rnp->qsmask &= ~mask;
740 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 805 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
741 806
742 /* Other bits still set at this level, so done. */ 807 /* Other bits still set at this level, so done. */
743 spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
744 return; 809 return;
745 } 810 }
746 mask = rnp->grpmask; 811 mask = rnp->grpmask;
@@ -750,54 +815,56 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
750 815
751 break; 816 break;
752 } 817 }
753 spin_unlock_irqrestore(&rnp->lock, flags); 818 raw_spin_unlock_irqrestore(&rnp->lock, flags);
754 rnp_c = rnp; 819 rnp_c = rnp;
755 rnp = rnp->parent; 820 rnp = rnp->parent;
756 spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
757 WARN_ON_ONCE(rnp_c->qsmask); 822 WARN_ON_ONCE(rnp_c->qsmask);
758 } 823 }
759 824
760 /* 825 /*
761 * Get here if we are the last CPU to pass through a quiescent 826 * Get here if we are the last CPU to pass through a quiescent
762 * state for this grace period. Invoke cpu_quiet_msk_finish() 827 * state for this grace period. Invoke rcu_report_qs_rsp()
763 * to clean up and start the next grace period if one is needed. 828 * to clean up and start the next grace period if one is needed.
764 */ 829 */
765 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ 830 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
766} 831}
767 832
768/* 833/*
769 * Record a quiescent state for the specified CPU, which must either be 834 * Record a quiescent state for the specified CPU to that CPU's rcu_data
770 * the current CPU. The lastcomp argument is used to make sure we are 835 * structure. This must be either called from the specified CPU, or
771 * still in the grace period of interest. We don't want to end the current 836 * called when the specified CPU is known to be offline (and when it is
772 * grace period based on quiescent states detected in an earlier grace 837 * also known that no other CPU is concurrently trying to help the offline
773 * period! 838 * CPU). The lastcomp argument is used to make sure we are still in the
839 * grace period of interest. We don't want to end the current grace period
840 * based on quiescent states detected in an earlier grace period!
774 */ 841 */
775static void 842static void
776cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 843rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
777{ 844{
778 unsigned long flags; 845 unsigned long flags;
779 unsigned long mask; 846 unsigned long mask;
780 struct rcu_node *rnp; 847 struct rcu_node *rnp;
781 848
782 rnp = rdp->mynode; 849 rnp = rdp->mynode;
783 spin_lock_irqsave(&rnp->lock, flags); 850 raw_spin_lock_irqsave(&rnp->lock, flags);
784 if (lastcomp != ACCESS_ONCE(rsp->completed)) { 851 if (lastcomp != rnp->completed) {
785 852
786 /* 853 /*
787 * Someone beat us to it for this grace period, so leave. 854 * Someone beat us to it for this grace period, so leave.
788 * The race with GP start is resolved by the fact that we 855 * The race with GP start is resolved by the fact that we
789 * hold the leaf rcu_node lock, so that the per-CPU bits 856 * hold the leaf rcu_node lock, so that the per-CPU bits
790 * cannot yet be initialized -- so we would simply find our 857 * cannot yet be initialized -- so we would simply find our
791 * CPU's bit already cleared in cpu_quiet_msk() if this race 858 * CPU's bit already cleared in rcu_report_qs_rnp() if this
792 * occurred. 859 * race occurred.
793 */ 860 */
794 rdp->passed_quiesc = 0; /* try again later! */ 861 rdp->passed_quiesc = 0; /* try again later! */
795 spin_unlock_irqrestore(&rnp->lock, flags); 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
796 return; 863 return;
797 } 864 }
798 mask = rdp->grpmask; 865 mask = rdp->grpmask;
799 if ((rnp->qsmask & mask) == 0) { 866 if ((rnp->qsmask & mask) == 0) {
800 spin_unlock_irqrestore(&rnp->lock, flags); 867 raw_spin_unlock_irqrestore(&rnp->lock, flags);
801 } else { 868 } else {
802 rdp->qs_pending = 0; 869 rdp->qs_pending = 0;
803 870
@@ -807,7 +874,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
807 */ 874 */
808 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 875 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
809 876
810 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 877 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
811 } 878 }
812} 879}
813 880
@@ -838,8 +905,11 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
838 if (!rdp->passed_quiesc) 905 if (!rdp->passed_quiesc)
839 return; 906 return;
840 907
841 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ 908 /*
842 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 909 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
910 * judge of that).
911 */
912 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
843} 913}
844 914
845#ifdef CONFIG_HOTPLUG_CPU 915#ifdef CONFIG_HOTPLUG_CPU
@@ -858,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
858 928
859 if (rdp->nxtlist == NULL) 929 if (rdp->nxtlist == NULL)
860 return; /* irqs disabled, so comparison is stable. */ 930 return; /* irqs disabled, so comparison is stable. */
861 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 931 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
862 *rsp->orphan_cbs_tail = rdp->nxtlist; 932 *rsp->orphan_cbs_tail = rdp->nxtlist;
863 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 933 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
864 rdp->nxtlist = NULL; 934 rdp->nxtlist = NULL;
@@ -866,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
866 rdp->nxttail[i] = &rdp->nxtlist; 936 rdp->nxttail[i] = &rdp->nxtlist;
867 rsp->orphan_qlen += rdp->qlen; 937 rsp->orphan_qlen += rdp->qlen;
868 rdp->qlen = 0; 938 rdp->qlen = 0;
869 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 939 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
870} 940}
871 941
872/* 942/*
@@ -877,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
877 unsigned long flags; 947 unsigned long flags;
878 struct rcu_data *rdp; 948 struct rcu_data *rdp;
879 949
880 spin_lock_irqsave(&rsp->onofflock, flags); 950 raw_spin_lock_irqsave(&rsp->onofflock, flags);
881 rdp = rsp->rda[smp_processor_id()]; 951 rdp = rsp->rda[smp_processor_id()];
882 if (rsp->orphan_cbs_list == NULL) { 952 if (rsp->orphan_cbs_list == NULL) {
883 spin_unlock_irqrestore(&rsp->onofflock, flags); 953 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
884 return; 954 return;
885 } 955 }
886 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 956 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -889,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
889 rsp->orphan_cbs_list = NULL; 959 rsp->orphan_cbs_list = NULL;
890 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 960 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
891 rsp->orphan_qlen = 0; 961 rsp->orphan_qlen = 0;
892 spin_unlock_irqrestore(&rsp->onofflock, flags); 962 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
893} 963}
894 964
895/* 965/*
@@ -899,45 +969,47 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
899static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 969static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
900{ 970{
901 unsigned long flags; 971 unsigned long flags;
902 long lastcomp;
903 unsigned long mask; 972 unsigned long mask;
973 int need_report = 0;
904 struct rcu_data *rdp = rsp->rda[cpu]; 974 struct rcu_data *rdp = rsp->rda[cpu];
905 struct rcu_node *rnp; 975 struct rcu_node *rnp;
906 976
907 /* Exclude any attempts to start a new grace period. */ 977 /* Exclude any attempts to start a new grace period. */
908 spin_lock_irqsave(&rsp->onofflock, flags); 978 raw_spin_lock_irqsave(&rsp->onofflock, flags);
909 979
910 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 980 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
911 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 981 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
912 mask = rdp->grpmask; /* rnp->grplo is constant. */ 982 mask = rdp->grpmask; /* rnp->grplo is constant. */
913 do { 983 do {
914 spin_lock(&rnp->lock); /* irqs already disabled. */ 984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
915 rnp->qsmaskinit &= ~mask; 985 rnp->qsmaskinit &= ~mask;
916 if (rnp->qsmaskinit != 0) { 986 if (rnp->qsmaskinit != 0) {
917 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 987 if (rnp != rdp->mynode)
988 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
918 break; 989 break;
919 } 990 }
920 991 if (rnp == rdp->mynode)
921 /* 992 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
922 * If there was a task blocking the current grace period, 993 else
923 * and if all CPUs have checked in, we need to propagate 994 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
924 * the quiescent state up the rcu_node hierarchy. But that
925 * is inconvenient at the moment due to deadlock issues if
926 * this should end the current grace period. So set the
927 * offlined CPU's bit in ->qsmask in order to force the
928 * next force_quiescent_state() invocation to clean up this
929 * mess in a deadlock-free manner.
930 */
931 if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
932 rnp->qsmask |= mask;
933
934 mask = rnp->grpmask; 995 mask = rnp->grpmask;
935 spin_unlock(&rnp->lock); /* irqs remain disabled. */
936 rnp = rnp->parent; 996 rnp = rnp->parent;
937 } while (rnp != NULL); 997 } while (rnp != NULL);
938 lastcomp = rsp->completed;
939 998
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 999 /*
1000 * We still hold the leaf rcu_node structure lock here, and
1001 * irqs are still disabled. The reason for this subterfuge is
1002 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
1003 * held leads to deadlock.
1004 */
1005 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1006 rnp = rdp->mynode;
1007 if (need_report & RCU_OFL_TASKS_NORM_GP)
1008 rcu_report_unblock_qs_rnp(rnp, flags);
1009 else
1010 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1011 if (need_report & RCU_OFL_TASKS_EXP_GP)
1012 rcu_report_exp_rnp(rsp, rnp);
941 1013
942 rcu_adopt_orphan_cbs(rsp); 1014 rcu_adopt_orphan_cbs(rsp);
943} 1015}
@@ -1094,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user)
1094/* 1166/*
1095 * Scan the leaf rcu_node structures, processing dyntick state for any that 1167 * Scan the leaf rcu_node structures, processing dyntick state for any that
1096 * have not yet encountered a quiescent state, using the function specified. 1168 * have not yet encountered a quiescent state, using the function specified.
1097 * Returns 1 if the current grace period ends while scanning (possibly 1169 * The caller must have suppressed start of new grace periods.
1098 * because we made it end).
1099 */ 1170 */
1100static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1171static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1101 int (*f)(struct rcu_data *))
1102{ 1172{
1103 unsigned long bit; 1173 unsigned long bit;
1104 int cpu; 1174 int cpu;
@@ -1108,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1108 1178
1109 rcu_for_each_leaf_node(rsp, rnp) { 1179 rcu_for_each_leaf_node(rsp, rnp) {
1110 mask = 0; 1180 mask = 0;
1111 spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1112 if (rsp->completed != lastcomp) { 1182 if (!rcu_gp_in_progress(rsp)) {
1113 spin_unlock_irqrestore(&rnp->lock, flags); 1183 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1114 return 1; 1184 return;
1115 } 1185 }
1116 if (rnp->qsmask == 0) { 1186 if (rnp->qsmask == 0) {
1117 spin_unlock_irqrestore(&rnp->lock, flags); 1187 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1118 continue; 1188 continue;
1119 } 1189 }
1120 cpu = rnp->grplo; 1190 cpu = rnp->grplo;
@@ -1123,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1123 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1193 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1124 mask |= bit; 1194 mask |= bit;
1125 } 1195 }
1126 if (mask != 0 && rsp->completed == lastcomp) { 1196 if (mask != 0) {
1127 1197
1128 /* cpu_quiet_msk() releases rnp->lock. */ 1198 /* rcu_report_qs_rnp() releases rnp->lock. */
1129 cpu_quiet_msk(mask, rsp, rnp, flags); 1199 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1130 continue; 1200 continue;
1131 } 1201 }
1132 spin_unlock_irqrestore(&rnp->lock, flags); 1202 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1133 } 1203 }
1134 return 0;
1135} 1204}
1136 1205
1137/* 1206/*
@@ -1141,31 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1141static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1210static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1142{ 1211{
1143 unsigned long flags; 1212 unsigned long flags;
1144 long lastcomp;
1145 struct rcu_node *rnp = rcu_get_root(rsp); 1213 struct rcu_node *rnp = rcu_get_root(rsp);
1146 u8 signaled;
1147 1214
1148 if (!rcu_gp_in_progress(rsp)) 1215 if (!rcu_gp_in_progress(rsp))
1149 return; /* No grace period in progress, nothing to force. */ 1216 return; /* No grace period in progress, nothing to force. */
1150 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1217 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1151 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1218 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1152 return; /* Someone else is already on the job. */ 1219 return; /* Someone else is already on the job. */
1153 } 1220 }
1154 if (relaxed && 1221 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1155 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1222 goto unlock_fqs_ret; /* no emergency and done recently. */
1156 goto unlock_ret; /* no emergency and done recently. */
1157 rsp->n_force_qs++; 1223 rsp->n_force_qs++;
1158 spin_lock(&rnp->lock); 1224 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1159 lastcomp = rsp->completed;
1160 signaled = rsp->signaled;
1161 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1225 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1162 if (lastcomp == rsp->gpnum) { 1226 if(!rcu_gp_in_progress(rsp)) {
1163 rsp->n_force_qs_ngp++; 1227 rsp->n_force_qs_ngp++;
1164 spin_unlock(&rnp->lock); 1228 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1165 goto unlock_ret; /* no GP in progress, time updated. */ 1229 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1166 } 1230 }
1167 spin_unlock(&rnp->lock); 1231 rsp->fqs_active = 1;
1168 switch (signaled) { 1232 switch (rsp->signaled) {
1169 case RCU_GP_IDLE: 1233 case RCU_GP_IDLE:
1170 case RCU_GP_INIT: 1234 case RCU_GP_INIT:
1171 1235
@@ -1173,37 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1173 1237
1174 case RCU_SAVE_DYNTICK: 1238 case RCU_SAVE_DYNTICK:
1175 1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1176 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1177 break; /* So gcc recognizes the dead code. */ 1242 break; /* So gcc recognizes the dead code. */
1178 1243
1179 /* Record dyntick-idle state. */ 1244 /* Record dyntick-idle state. */
1180 if (rcu_process_dyntick(rsp, lastcomp, 1245 force_qs_rnp(rsp, dyntick_save_progress_counter);
1181 dyntick_save_progress_counter)) 1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1182 goto unlock_ret; 1247 if (rcu_gp_in_progress(rsp))
1183
1184 /* Update state, record completion counter. */
1185 spin_lock(&rnp->lock);
1186 if (lastcomp == rsp->completed &&
1187 rsp->signaled == RCU_SAVE_DYNTICK) {
1188 rsp->signaled = RCU_FORCE_QS; 1248 rsp->signaled = RCU_FORCE_QS;
1189 dyntick_record_completed(rsp, lastcomp);
1190 }
1191 spin_unlock(&rnp->lock);
1192 break; 1249 break;
1193 1250
1194 case RCU_FORCE_QS: 1251 case RCU_FORCE_QS:
1195 1252
1196 /* Check dyntick-idle state, send IPI to laggarts. */ 1253 /* Check dyntick-idle state, send IPI to laggarts. */
1197 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), 1254 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1198 rcu_implicit_dynticks_qs)) 1255 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1199 goto unlock_ret;
1200 1256
1201 /* Leave state in case more forcing is required. */ 1257 /* Leave state in case more forcing is required. */
1202 1258
1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1203 break; 1260 break;
1204 } 1261 }
1205unlock_ret: 1262 rsp->fqs_active = 0;
1206 spin_unlock_irqrestore(&rsp->fqslock, flags); 1263 if (rsp->fqs_need_gp) {
1264 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1265 rsp->fqs_need_gp = 0;
1266 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1267 return;
1268 }
1269 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1270unlock_fqs_ret:
1271 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1207} 1272}
1208 1273
1209#else /* #ifdef CONFIG_SMP */ 1274#else /* #ifdef CONFIG_SMP */
@@ -1231,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1231 * If an RCU GP has gone long enough, go check for dyntick 1296 * If an RCU GP has gone long enough, go check for dyntick
1232 * idle CPUs and, if needed, send resched IPIs. 1297 * idle CPUs and, if needed, send resched IPIs.
1233 */ 1298 */
1234 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1299 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1235 force_quiescent_state(rsp, 1); 1300 force_quiescent_state(rsp, 1);
1236 1301
1237 /* 1302 /*
@@ -1245,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1245 1310
1246 /* Does this CPU require a not-yet-started grace period? */ 1311 /* Does this CPU require a not-yet-started grace period? */
1247 if (cpu_needs_another_gp(rsp, rdp)) { 1312 if (cpu_needs_another_gp(rsp, rdp)) {
1248 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1313 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1249 rcu_start_gp(rsp, flags); /* releases above lock */ 1314 rcu_start_gp(rsp, flags); /* releases above lock */
1250 } 1315 }
1251 1316
@@ -1276,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1276 * grace-period manipulations above. 1341 * grace-period manipulations above.
1277 */ 1342 */
1278 smp_mb(); /* See above block comment. */ 1343 smp_mb(); /* See above block comment. */
1344
1345 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1346 rcu_needs_cpu_flush();
1279} 1347}
1280 1348
1281static void 1349static void
@@ -1310,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1310 unsigned long nestflag; 1378 unsigned long nestflag;
1311 struct rcu_node *rnp_root = rcu_get_root(rsp); 1379 struct rcu_node *rnp_root = rcu_get_root(rsp);
1312 1380
1313 spin_lock_irqsave(&rnp_root->lock, nestflag); 1381 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1314 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1382 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1315 } 1383 }
1316 1384
@@ -1328,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1328 force_quiescent_state(rsp, 0); 1396 force_quiescent_state(rsp, 0);
1329 rdp->n_force_qs_snap = rsp->n_force_qs; 1397 rdp->n_force_qs_snap = rsp->n_force_qs;
1330 rdp->qlen_last_fqs_check = rdp->qlen; 1398 rdp->qlen_last_fqs_check = rdp->qlen;
1331 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1399 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1332 force_quiescent_state(rsp, 1); 1400 force_quiescent_state(rsp, 1);
1333 local_irq_restore(flags); 1401 local_irq_restore(flags);
1334} 1402}
@@ -1351,6 +1419,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1351} 1419}
1352EXPORT_SYMBOL_GPL(call_rcu_bh); 1420EXPORT_SYMBOL_GPL(call_rcu_bh);
1353 1421
1422/**
1423 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1424 *
1425 * Control will return to the caller some time after a full rcu-sched
1426 * grace period has elapsed, in other words after all currently executing
1427 * rcu-sched read-side critical sections have completed. These read-side
1428 * critical sections are delimited by rcu_read_lock_sched() and
1429 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
1430 * local_irq_disable(), and so on may be used in place of
1431 * rcu_read_lock_sched().
1432 *
1433 * This means that all preempt_disable code sequences, including NMI and
1434 * hardware-interrupt handlers, in progress on entry will have completed
1435 * before this primitive returns. However, this does not guarantee that
1436 * softirq handlers will have completed, since in some kernels, these
1437 * handlers can run in process context, and can block.
1438 *
1439 * This primitive provides the guarantees made by the (now removed)
1440 * synchronize_kernel() API. In contrast, synchronize_rcu() only
1441 * guarantees that rcu_read_lock() sections will have completed.
1442 * In "classic RCU", these two guarantees happen to be one and
1443 * the same, but can differ in realtime RCU implementations.
1444 */
1445void synchronize_sched(void)
1446{
1447 struct rcu_synchronize rcu;
1448
1449 if (rcu_blocking_is_gp())
1450 return;
1451
1452 init_completion(&rcu.completion);
1453 /* Will wake me after RCU finished. */
1454 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1455 /* Wait for it. */
1456 wait_for_completion(&rcu.completion);
1457}
1458EXPORT_SYMBOL_GPL(synchronize_sched);
1459
1460/**
1461 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
1462 *
1463 * Control will return to the caller some time after a full rcu_bh grace
1464 * period has elapsed, in other words after all currently executing rcu_bh
1465 * read-side critical sections have completed. RCU read-side critical
1466 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
1467 * and may be nested.
1468 */
1469void synchronize_rcu_bh(void)
1470{
1471 struct rcu_synchronize rcu;
1472
1473 if (rcu_blocking_is_gp())
1474 return;
1475
1476 init_completion(&rcu.completion);
1477 /* Will wake me after RCU finished. */
1478 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1479 /* Wait for it. */
1480 wait_for_completion(&rcu.completion);
1481}
1482EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1483
1354/* 1484/*
1355 * Check to see if there is any immediate RCU-related work to be done 1485 * Check to see if there is any immediate RCU-related work to be done
1356 * by the current CPU, for the specified type of RCU, returning 1 if so. 1486 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1360,6 +1490,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1360 */ 1490 */
1361static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 1491static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1362{ 1492{
1493 struct rcu_node *rnp = rdp->mynode;
1494
1363 rdp->n_rcu_pending++; 1495 rdp->n_rcu_pending++;
1364 1496
1365 /* Check for CPU stalls, if enabled. */ 1497 /* Check for CPU stalls, if enabled. */
@@ -1384,20 +1516,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1384 } 1516 }
1385 1517
1386 /* Has another RCU grace period completed? */ 1518 /* Has another RCU grace period completed? */
1387 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ 1519 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
1388 rdp->n_rp_gp_completed++; 1520 rdp->n_rp_gp_completed++;
1389 return 1; 1521 return 1;
1390 } 1522 }
1391 1523
1392 /* Has a new RCU grace period started? */ 1524 /* Has a new RCU grace period started? */
1393 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ 1525 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
1394 rdp->n_rp_gp_started++; 1526 rdp->n_rp_gp_started++;
1395 return 1; 1527 return 1;
1396 } 1528 }
1397 1529
1398 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1530 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1399 if (rcu_gp_in_progress(rsp) && 1531 if (rcu_gp_in_progress(rsp) &&
1400 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1532 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1401 rdp->n_rp_need_fqs++; 1533 rdp->n_rp_need_fqs++;
1402 return 1; 1534 return 1;
1403 } 1535 }
@@ -1422,10 +1554,9 @@ static int rcu_pending(int cpu)
1422/* 1554/*
1423 * Check to see if any future RCU-related work will need to be done 1555 * Check to see if any future RCU-related work will need to be done
1424 * by the current CPU, even if none need be done immediately, returning 1556 * by the current CPU, even if none need be done immediately, returning
1425 * 1 if so. This function is part of the RCU implementation; it is -not- 1557 * 1 if so.
1426 * an exported member of the RCU API.
1427 */ 1558 */
1428int rcu_needs_cpu(int cpu) 1559static int rcu_needs_cpu_quick_check(int cpu)
1429{ 1560{
1430 /* RCU callbacks either ready or pending? */ 1561 /* RCU callbacks either ready or pending? */
1431 return per_cpu(rcu_sched_data, cpu).nxtlist || 1562 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1521,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1521 struct rcu_node *rnp = rcu_get_root(rsp); 1652 struct rcu_node *rnp = rcu_get_root(rsp);
1522 1653
1523 /* Set up local state, ensuring consistent view of global state. */ 1654 /* Set up local state, ensuring consistent view of global state. */
1524 spin_lock_irqsave(&rnp->lock, flags); 1655 raw_spin_lock_irqsave(&rnp->lock, flags);
1525 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1656 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1526 rdp->nxtlist = NULL; 1657 rdp->nxtlist = NULL;
1527 for (i = 0; i < RCU_NEXT_SIZE; i++) 1658 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1531,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1531 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1662 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1532#endif /* #ifdef CONFIG_NO_HZ */ 1663#endif /* #ifdef CONFIG_NO_HZ */
1533 rdp->cpu = cpu; 1664 rdp->cpu = cpu;
1534 spin_unlock_irqrestore(&rnp->lock, flags); 1665 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1535} 1666}
1536 1667
1537/* 1668/*
@@ -1544,25 +1675,20 @@ static void __cpuinit
1544rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1675rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1545{ 1676{
1546 unsigned long flags; 1677 unsigned long flags;
1547 long lastcomp;
1548 unsigned long mask; 1678 unsigned long mask;
1549 struct rcu_data *rdp = rsp->rda[cpu]; 1679 struct rcu_data *rdp = rsp->rda[cpu];
1550 struct rcu_node *rnp = rcu_get_root(rsp); 1680 struct rcu_node *rnp = rcu_get_root(rsp);
1551 1681
1552 /* Set up local state, ensuring consistent view of global state. */ 1682 /* Set up local state, ensuring consistent view of global state. */
1553 spin_lock_irqsave(&rnp->lock, flags); 1683 raw_spin_lock_irqsave(&rnp->lock, flags);
1554 lastcomp = rsp->completed;
1555 rdp->completed = lastcomp;
1556 rdp->gpnum = lastcomp;
1557 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1684 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1558 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1685 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1559 rdp->beenonline = 1; /* We have now been online. */ 1686 rdp->beenonline = 1; /* We have now been online. */
1560 rdp->preemptable = preemptable; 1687 rdp->preemptable = preemptable;
1561 rdp->passed_quiesc_completed = lastcomp - 1;
1562 rdp->qlen_last_fqs_check = 0; 1688 rdp->qlen_last_fqs_check = 0;
1563 rdp->n_force_qs_snap = rsp->n_force_qs; 1689 rdp->n_force_qs_snap = rsp->n_force_qs;
1564 rdp->blimit = blimit; 1690 rdp->blimit = blimit;
1565 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1691 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1566 1692
1567 /* 1693 /*
1568 * A new grace period might start here. If so, we won't be part 1694 * A new grace period might start here. If so, we won't be part
@@ -1570,21 +1696,26 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1570 */ 1696 */
1571 1697
1572 /* Exclude any attempts to start a new GP on large systems. */ 1698 /* Exclude any attempts to start a new GP on large systems. */
1573 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1699 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1574 1700
1575 /* Add CPU to rcu_node bitmasks. */ 1701 /* Add CPU to rcu_node bitmasks. */
1576 rnp = rdp->mynode; 1702 rnp = rdp->mynode;
1577 mask = rdp->grpmask; 1703 mask = rdp->grpmask;
1578 do { 1704 do {
1579 /* Exclude any attempts to start a new GP on small systems. */ 1705 /* Exclude any attempts to start a new GP on small systems. */
1580 spin_lock(&rnp->lock); /* irqs already disabled. */ 1706 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1581 rnp->qsmaskinit |= mask; 1707 rnp->qsmaskinit |= mask;
1582 mask = rnp->grpmask; 1708 mask = rnp->grpmask;
1583 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1709 if (rnp == rdp->mynode) {
1710 rdp->gpnum = rnp->completed; /* if GP in progress... */
1711 rdp->completed = rnp->completed;
1712 rdp->passed_quiesc_completed = rnp->completed - 1;
1713 }
1714 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1584 rnp = rnp->parent; 1715 rnp = rnp->parent;
1585 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1716 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1586 1717
1587 spin_unlock_irqrestore(&rsp->onofflock, flags); 1718 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1588} 1719}
1589 1720
1590static void __cpuinit rcu_online_cpu(int cpu) 1721static void __cpuinit rcu_online_cpu(int cpu)
@@ -1597,8 +1728,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
1597/* 1728/*
1598 * Handle CPU online/offline notification events. 1729 * Handle CPU online/offline notification events.
1599 */ 1730 */
1600int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1731static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1601 unsigned long action, void *hcpu) 1732 unsigned long action, void *hcpu)
1602{ 1733{
1603 long cpu = (long)hcpu; 1734 long cpu = (long)hcpu;
1604 1735
@@ -1668,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1668 */ 1799 */
1669static void __init rcu_init_one(struct rcu_state *rsp) 1800static void __init rcu_init_one(struct rcu_state *rsp)
1670{ 1801{
1802 static char *buf[] = { "rcu_node_level_0",
1803 "rcu_node_level_1",
1804 "rcu_node_level_2",
1805 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1671 int cpustride = 1; 1806 int cpustride = 1;
1672 int i; 1807 int i;
1673 int j; 1808 int j;
1674 struct rcu_node *rnp; 1809 struct rcu_node *rnp;
1675 1810
1811 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1812
1676 /* Initialize the level-tracking arrays. */ 1813 /* Initialize the level-tracking arrays. */
1677 1814
1678 for (i = 1; i < NUM_RCU_LVLS; i++) 1815 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1685,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1685 cpustride *= rsp->levelspread[i]; 1822 cpustride *= rsp->levelspread[i];
1686 rnp = rsp->level[i]; 1823 rnp = rsp->level[i];
1687 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1824 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1688 if (rnp != rcu_get_root(rsp)) 1825 raw_spin_lock_init(&rnp->lock);
1689 spin_lock_init(&rnp->lock); 1826 lockdep_set_class_and_name(&rnp->lock,
1827 &rcu_node_class[i], buf[i]);
1690 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1691 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1692 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1707,9 +1845,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1707 rnp->level = i; 1845 rnp->level = i;
1708 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 1846 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1709 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1847 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1848 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1710 } 1850 }
1711 } 1851 }
1712 spin_lock_init(&rcu_get_root(rsp)->lock);
1713} 1852}
1714 1853
1715/* 1854/*
@@ -1735,16 +1874,30 @@ do { \
1735 } \ 1874 } \
1736} while (0) 1875} while (0)
1737 1876
1738void __init __rcu_init(void) 1877void __init rcu_init(void)
1739{ 1878{
1879 int cpu;
1880
1740 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1741#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1742 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1743#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1744 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1745 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1746 __rcu_init_preempt(); 1890 __rcu_init_preempt();
1747 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1891 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1892
1893 /*
1894 * We don't need protection against CPU-hotplug here because
1895 * this is called early in boot, before either interrupts
1896 * or the scheduler are operational.
1897 */
1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1748} 1901}
1749 1902
1750#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1899023b0962..4a525a30e08e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere. 35 * bug somewhere.
36 */ 36 */
37#define MAX_RCU_LVLS 3 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
41 42
42#if NR_CPUS <= RCU_FANOUT 43#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1 44# define NUM_RCU_LVLS 1
@@ -45,23 +46,33 @@
45# define NUM_RCU_LVL_1 (NR_CPUS) 46# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0 47# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0 48# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0
48#elif NR_CPUS <= RCU_FANOUT_SQ 50#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 51# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 52# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 54# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 55# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 57#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 58# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 59# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 62# define NUM_RCU_LVL_3 NR_CPUS
63# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH
65# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70# define NUM_RCU_LVL_4 NR_CPUS
60#else 71#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 73#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63 74
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) 75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66 77
67/* 78/*
@@ -79,9 +90,12 @@ struct rcu_dynticks {
79 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
80 */ 91 */
81struct rcu_node { 92struct rcu_node {
82 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
83 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
84 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */
98 unsigned long completed; /* Last GP completed for this node. */
85 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
86 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
87 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -90,8 +104,12 @@ struct rcu_node {
90 /* an rcu_data structure, otherwise, each */ 104 /* an rcu_data structure, otherwise, each */
91 /* bit corresponds to a child rcu_node */ 105 /* bit corresponds to a child rcu_node */
92 /* structure. */ 106 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */
108 /* elements that need to drain to allow the */
109 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */
93 unsigned long qsmaskinit; 111 unsigned long qsmaskinit;
94 /* Per-GP initialization for qsmask. */ 112 /* Per-GP initial value for qsmask & expmask. */
95 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 113 unsigned long grpmask; /* Mask to apply to parent qsmask. */
96 /* Only one bit will be set in this mask. */ 114 /* Only one bit will be set in this mask. */
97 int grplo; /* lowest-numbered CPU or group here. */ 115 int grplo; /* lowest-numbered CPU or group here. */
@@ -99,7 +117,7 @@ struct rcu_node {
99 u8 grpnum; /* CPU/group number for next level up. */ 117 u8 grpnum; /* CPU/group number for next level up. */
100 u8 level; /* root is at level 0. */ 118 u8 level; /* root is at level 0. */
101 struct rcu_node *parent; 119 struct rcu_node *parent;
102 struct list_head blocked_tasks[2]; 120 struct list_head blocked_tasks[4];
103 /* Tasks blocked in RCU read-side critsect. */ 121 /* Tasks blocked in RCU read-side critsect. */
104 /* Grace period number (->gpnum) x blocked */ 122 /* Grace period number (->gpnum) x blocked */
105 /* by tasks on the (x & 0x1) element of the */ 123 /* by tasks on the (x & 0x1) element of the */
@@ -114,6 +132,21 @@ struct rcu_node {
114 for ((rnp) = &(rsp)->node[0]; \ 132 for ((rnp) = &(rsp)->node[0]; \
115 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 133 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
116 134
135/*
136 * Do a breadth-first scan of the non-leaf rcu_node structures for the
137 * specified rcu_state structure. Note that if there is a singleton
138 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
139 */
140#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
141 for ((rnp) = &(rsp)->node[0]; \
142 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
143
144/*
145 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
146 * structure. Note that if there is a singleton rcu_node tree with but
147 * one rcu_node structure, this loop -will- visit the rcu_node structure.
148 * It is still a leaf node, even if it is also the root node.
149 */
117#define rcu_for_each_leaf_node(rsp, rnp) \ 150#define rcu_for_each_leaf_node(rsp, rnp) \
118 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 151 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
119 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 152 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
@@ -128,11 +161,11 @@ struct rcu_node {
128/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
129struct rcu_data { 162struct rcu_data {
130 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
131 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
132 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
133 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
134 /* is aware of having started. */ 167 /* is aware of having started. */
135 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
136 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
137 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
138 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -188,14 +221,14 @@ struct rcu_data {
188 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
189 222
190 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
191 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
192 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
193 long n_rp_cb_ready; 226 unsigned long n_rp_cb_ready;
194 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cpu_needs_gp;
195 long n_rp_gp_completed; 228 unsigned long n_rp_gp_completed;
196 long n_rp_gp_started; 229 unsigned long n_rp_gp_started;
197 long n_rp_need_fqs; 230 unsigned long n_rp_need_fqs;
198 long n_rp_need_nothing; 231 unsigned long n_rp_need_nothing;
199 232
200 int cpu; 233 int cpu;
201}; 234};
@@ -213,15 +246,27 @@ struct rcu_data {
213 246
214#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
215#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 248#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
216#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ 249
217#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ 250#ifdef CONFIG_PROVE_RCU
218#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 251#define RCU_STALL_DELAY_DELTA (5 * HZ)
219 /* to take at least one */ 252#else
220 /* scheduling clock irq */ 253#define RCU_STALL_DELAY_DELTA 0
221 /* before ratting on them. */ 254#endif
255
256#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
257 /* for rsp->jiffies_stall */
258#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
259 /* for rsp->jiffies_stall */
260#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
261 /* to take at least one */
262 /* scheduling clock irq */
263 /* before ratting on them. */
222 264
223#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 265#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
224 266
267#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
268#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
269
225/* 270/*
226 * RCU global state, including node hierarchy. This hierarchy is 271 * RCU global state, including node hierarchy. This hierarchy is
227 * represented in "heap" form in a dense array. The root (first level) 272 * represented in "heap" form in a dense array. The root (first level)
@@ -243,12 +288,19 @@ struct rcu_state {
243 288
244 u8 signaled ____cacheline_internodealigned_in_smp; 289 u8 signaled ____cacheline_internodealigned_in_smp;
245 /* Force QS state. */ 290 /* Force QS state. */
246 long gpnum; /* Current gp number. */ 291 u8 fqs_active; /* force_quiescent_state() */
247 long completed; /* # of last completed gp. */ 292 /* is running. */
293 u8 fqs_need_gp; /* A CPU was prevented from */
294 /* starting a new grace */
295 /* period because */
296 /* force_quiescent_state() */
297 /* was running. */
298 unsigned long gpnum; /* Current gp number. */
299 unsigned long completed; /* # of last completed gp. */
248 300
249 /* End of fields guarded by root rcu_node's lock. */ 301 /* End of fields guarded by root rcu_node's lock. */
250 302
251 spinlock_t onofflock; /* exclude on/offline and */ 303 raw_spinlock_t onofflock; /* exclude on/offline and */
252 /* starting new GP. Also */ 304 /* starting new GP. Also */
253 /* protects the following */ 305 /* protects the following */
254 /* orphan_cbs fields. */ 306 /* orphan_cbs fields. */
@@ -258,7 +310,7 @@ struct rcu_state {
258 /* going offline. */ 310 /* going offline. */
259 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 311 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
260 long orphan_qlen; /* Number of orphaned cbs. */ 312 long orphan_qlen; /* Number of orphaned cbs. */
261 spinlock_t fqslock; /* Only one task forcing */ 313 raw_spinlock_t fqslock; /* Only one task forcing */
262 /* quiescent states. */ 314 /* quiescent states. */
263 unsigned long jiffies_force_qs; /* Time at which to invoke */ 315 unsigned long jiffies_force_qs; /* Time at which to invoke */
264 /* force_quiescent_state(). */ 316 /* force_quiescent_state(). */
@@ -274,12 +326,14 @@ struct rcu_state {
274 unsigned long jiffies_stall; /* Time at which to check */ 326 unsigned long jiffies_stall; /* Time at which to check */
275 /* for CPU stalls. */ 327 /* for CPU stalls. */
276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 328#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
277#ifdef CONFIG_NO_HZ
278 long dynticks_completed; /* Value of completed @ snap. */
279#endif /* #ifdef CONFIG_NO_HZ */
280}; 329};
281 330
282#ifdef RCU_TREE_NONCORE 331/* Return values for rcu_preempt_offline_tasks(). */
332
333#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
334 /* GP were moved to root. */
335#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
336 /* GP were moved to root. */
283 337
284/* 338/*
285 * RCU implementation internal declarations: 339 * RCU implementation internal declarations:
@@ -295,14 +349,19 @@ extern struct rcu_state rcu_preempt_state;
295DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 349DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
296#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 350#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
297 351
298#else /* #ifdef RCU_TREE_NONCORE */ 352#ifndef RCU_TREE_NONCORE
299 353
300/* Forward declarations for rcutree_plugin.h */ 354/* Forward declarations for rcutree_plugin.h */
301static inline void rcu_bootup_announce(void); 355static void rcu_bootup_announce(void);
302long rcu_batches_completed(void); 356long rcu_batches_completed(void);
303static void rcu_preempt_note_context_switch(int cpu); 357static void rcu_preempt_note_context_switch(int cpu);
304static int rcu_preempted_readers(struct rcu_node *rnp); 358static int rcu_preempted_readers(struct rcu_node *rnp);
359#ifdef CONFIG_HOTPLUG_CPU
360static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
361 unsigned long flags);
362#endif /* #ifdef CONFIG_HOTPLUG_CPU */
305#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 363#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
364static void rcu_print_detail_task_stall(struct rcu_state *rsp);
306static void rcu_print_task_stall(struct rcu_node *rnp); 365static void rcu_print_task_stall(struct rcu_node *rnp);
307#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 366#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
308static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 367static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -315,10 +374,14 @@ static void rcu_preempt_offline_cpu(int cpu);
315static void rcu_preempt_check_callbacks(int cpu); 374static void rcu_preempt_check_callbacks(int cpu);
316static void rcu_preempt_process_callbacks(void); 375static void rcu_preempt_process_callbacks(void);
317void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 376void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
377#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
378static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
379#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
318static int rcu_preempt_pending(int cpu); 380static int rcu_preempt_pending(int cpu);
319static int rcu_preempt_needs_cpu(int cpu); 381static int rcu_preempt_needs_cpu(int cpu);
320static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 382static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
321static void rcu_preempt_send_cbs_to_orphanage(void); 383static void rcu_preempt_send_cbs_to_orphanage(void);
322static void __init __rcu_init_preempt(void); 384static void __init __rcu_init_preempt(void);
385static void rcu_needs_cpu_flush(void);
323 386
324#endif /* #else #ifdef RCU_TREE_NONCORE */ 387#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ef2a58c2b9d5..79b53bda8943 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,16 +24,19 @@
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */ 25 */
26 26
27#include <linux/delay.h>
27 28
28#ifdef CONFIG_TREE_PREEMPT_RCU 29#ifdef CONFIG_TREE_PREEMPT_RCU
29 30
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 32DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32 33
34static int rcu_preempted_readers_exp(struct rcu_node *rnp);
35
33/* 36/*
34 * Tell them what RCU they are running. 37 * Tell them what RCU they are running.
35 */ 38 */
36static inline void rcu_bootup_announce(void) 39static void __init rcu_bootup_announce(void)
37{ 40{
38 printk(KERN_INFO 41 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n"); 42 "Experimental preemptable hierarchical RCU implementation.\n");
@@ -59,6 +62,15 @@ long rcu_batches_completed(void)
59EXPORT_SYMBOL_GPL(rcu_batches_completed); 62EXPORT_SYMBOL_GPL(rcu_batches_completed);
60 63
61/* 64/*
65 * Force a quiescent state for preemptible RCU.
66 */
67void rcu_force_quiescent_state(void)
68{
69 force_quiescent_state(&rcu_preempt_state, 0);
70}
71EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
72
73/*
62 * Record a preemptable-RCU quiescent state for the specified CPU. Note 74 * Record a preemptable-RCU quiescent state for the specified CPU. Note
63 * that this just means that the task currently running on the CPU is 75 * that this just means that the task currently running on the CPU is
64 * not in a quiescent state. There might be any number of tasks blocked 76 * not in a quiescent state. There might be any number of tasks blocked
@@ -67,7 +79,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
67static void rcu_preempt_qs(int cpu) 79static void rcu_preempt_qs(int cpu)
68{ 80{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 81 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed; 82 rdp->passed_quiesc_completed = rdp->gpnum - 1;
71 barrier(); 83 barrier();
72 rdp->passed_quiesc = 1; 84 rdp->passed_quiesc = 1;
73} 85}
@@ -99,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu)
99 /* Possibly blocking in an RCU read-side critical section. */ 111 /* Possibly blocking in an RCU read-side critical section. */
100 rdp = rcu_preempt_state.rda[cpu]; 112 rdp = rcu_preempt_state.rda[cpu];
101 rnp = rdp->mynode; 113 rnp = rdp->mynode;
102 spin_lock_irqsave(&rnp->lock, flags); 114 raw_spin_lock_irqsave(&rnp->lock, flags);
103 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 115 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
104 t->rcu_blocked_node = rnp; 116 t->rcu_blocked_node = rnp;
105 117
@@ -120,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu)
120 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 132 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
121 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 133 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
122 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 134 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
123 spin_unlock_irqrestore(&rnp->lock, flags); 135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
124 } 136 }
125 137
126 /* 138 /*
@@ -157,14 +169,58 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
157 */ 169 */
158static int rcu_preempted_readers(struct rcu_node *rnp) 170static int rcu_preempted_readers(struct rcu_node *rnp)
159{ 171{
160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 172 int phase = rnp->gpnum & 0x1;
173
174 return !list_empty(&rnp->blocked_tasks[phase]) ||
175 !list_empty(&rnp->blocked_tasks[phase + 2]);
161} 176}
162 177
178/*
179 * Record a quiescent state for all tasks that were previously queued
180 * on the specified rcu_node structure and that were blocking the current
181 * RCU grace period. The caller must hold the specified rnp->lock with
182 * irqs disabled, and this lock is released upon return, but irqs remain
183 * disabled.
184 */
185static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
186 __releases(rnp->lock)
187{
188 unsigned long mask;
189 struct rcu_node *rnp_p;
190
191 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
193 return; /* Still need more quiescent states! */
194 }
195
196 rnp_p = rnp->parent;
197 if (rnp_p == NULL) {
198 /*
199 * Either there is only one rcu_node in the tree,
200 * or tasks were kicked up to root rcu_node due to
201 * CPUs going offline.
202 */
203 rcu_report_qs_rsp(&rcu_preempt_state, flags);
204 return;
205 }
206
207 /* Report up the rest of the hierarchy. */
208 mask = rnp->grpmask;
209 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
210 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
211 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
212}
213
214/*
215 * Handle special cases during rcu_read_unlock(), such as needing to
216 * notify RCU core processing or task having blocked during the RCU
217 * read-side critical section.
218 */
163static void rcu_read_unlock_special(struct task_struct *t) 219static void rcu_read_unlock_special(struct task_struct *t)
164{ 220{
165 int empty; 221 int empty;
222 int empty_exp;
166 unsigned long flags; 223 unsigned long flags;
167 unsigned long mask;
168 struct rcu_node *rnp; 224 struct rcu_node *rnp;
169 int special; 225 int special;
170 226
@@ -201,42 +257,36 @@ static void rcu_read_unlock_special(struct task_struct *t)
201 */ 257 */
202 for (;;) { 258 for (;;) {
203 rnp = t->rcu_blocked_node; 259 rnp = t->rcu_blocked_node;
204 spin_lock(&rnp->lock); /* irqs already disabled. */ 260 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
205 if (rnp == t->rcu_blocked_node) 261 if (rnp == t->rcu_blocked_node)
206 break; 262 break;
207 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
208 } 264 }
209 empty = !rcu_preempted_readers(rnp); 265 empty = !rcu_preempted_readers(rnp);
266 empty_exp = !rcu_preempted_readers_exp(rnp);
267 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
210 list_del_init(&t->rcu_node_entry); 268 list_del_init(&t->rcu_node_entry);
211 t->rcu_blocked_node = NULL; 269 t->rcu_blocked_node = NULL;
212 270
213 /* 271 /*
214 * If this was the last task on the current list, and if 272 * If this was the last task on the current list, and if
215 * we aren't waiting on any CPUs, report the quiescent state. 273 * we aren't waiting on any CPUs, report the quiescent state.
216 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() 274 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
217 * drop rnp->lock and restore irq.
218 */ 275 */
219 if (!empty && rnp->qsmask == 0 && 276 if (empty)
220 !rcu_preempted_readers(rnp)) { 277 raw_spin_unlock_irqrestore(&rnp->lock, flags);
221 struct rcu_node *rnp_p; 278 else
222 279 rcu_report_unblock_qs_rnp(rnp, flags);
223 if (rnp->parent == NULL) { 280
224 /* Only one rcu_node in the tree. */ 281 /*
225 cpu_quiet_msk_finish(&rcu_preempt_state, flags); 282 * If this was the last task on the expedited lists,
226 return; 283 * then we need to report up the rcu_node hierarchy.
227 } 284 */
228 /* Report up the rest of the hierarchy. */ 285 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
229 mask = rnp->grpmask; 286 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
230 spin_unlock_irqrestore(&rnp->lock, flags); 287 } else {
231 rnp_p = rnp->parent; 288 local_irq_restore(flags);
232 spin_lock_irqsave(&rnp_p->lock, flags);
233 WARN_ON_ONCE(rnp->qsmask);
234 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
235 return;
236 }
237 spin_unlock(&rnp->lock);
238 } 289 }
239 local_irq_restore(flags);
240} 290}
241 291
242/* 292/*
@@ -254,29 +304,73 @@ void __rcu_read_unlock(void)
254 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 304 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
255 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 305 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
256 rcu_read_unlock_special(t); 306 rcu_read_unlock_special(t);
307#ifdef CONFIG_PROVE_LOCKING
308 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
309#endif /* #ifdef CONFIG_PROVE_LOCKING */
257} 310}
258EXPORT_SYMBOL_GPL(__rcu_read_unlock); 311EXPORT_SYMBOL_GPL(__rcu_read_unlock);
259 312
260#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 313#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
261 314
315#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
316
317/*
318 * Dump detailed information for all tasks blocking the current RCU
319 * grace period on the specified rcu_node structure.
320 */
321static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
322{
323 unsigned long flags;
324 struct list_head *lp;
325 int phase;
326 struct task_struct *t;
327
328 if (rcu_preempted_readers(rnp)) {
329 raw_spin_lock_irqsave(&rnp->lock, flags);
330 phase = rnp->gpnum & 0x1;
331 lp = &rnp->blocked_tasks[phase];
332 list_for_each_entry(t, lp, rcu_node_entry)
333 sched_show_task(t);
334 raw_spin_unlock_irqrestore(&rnp->lock, flags);
335 }
336}
337
338/*
339 * Dump detailed information for all tasks blocking the current RCU
340 * grace period.
341 */
342static void rcu_print_detail_task_stall(struct rcu_state *rsp)
343{
344 struct rcu_node *rnp = rcu_get_root(rsp);
345
346 rcu_print_detail_task_stall_rnp(rnp);
347 rcu_for_each_leaf_node(rsp, rnp)
348 rcu_print_detail_task_stall_rnp(rnp);
349}
350
351#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
352
353static void rcu_print_detail_task_stall(struct rcu_state *rsp)
354{
355}
356
357#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
358
262/* 359/*
263 * Scan the current list of tasks blocked within RCU read-side critical 360 * Scan the current list of tasks blocked within RCU read-side critical
264 * sections, printing out the tid of each. 361 * sections, printing out the tid of each.
265 */ 362 */
266static void rcu_print_task_stall(struct rcu_node *rnp) 363static void rcu_print_task_stall(struct rcu_node *rnp)
267{ 364{
268 unsigned long flags;
269 struct list_head *lp; 365 struct list_head *lp;
270 int phase; 366 int phase;
271 struct task_struct *t; 367 struct task_struct *t;
272 368
273 if (rcu_preempted_readers(rnp)) { 369 if (rcu_preempted_readers(rnp)) {
274 spin_lock_irqsave(&rnp->lock, flags);
275 phase = rnp->gpnum & 0x1; 370 phase = rnp->gpnum & 0x1;
276 lp = &rnp->blocked_tasks[phase]; 371 lp = &rnp->blocked_tasks[phase];
277 list_for_each_entry(t, lp, rcu_node_entry) 372 list_for_each_entry(t, lp, rcu_node_entry)
278 printk(" P%d", t->pid); 373 printk(" P%d", t->pid);
279 spin_unlock_irqrestore(&rnp->lock, flags);
280 } 374 }
281} 375}
282 376
@@ -303,6 +397,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
303 * rcu_node. The reason for not just moving them to the immediate 397 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to 398 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock. 399 * make more than two attempts to acquire the target rcu_node's lock.
400 * Returns true if there were tasks blocking the current RCU grace
401 * period.
306 * 402 *
307 * Returns 1 if there was previously a task blocking the current grace 403 * Returns 1 if there was previously a task blocking the current grace
308 * period on the specified rcu_node structure. 404 * period on the specified rcu_node structure.
@@ -316,7 +412,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
316 int i; 412 int i;
317 struct list_head *lp; 413 struct list_head *lp;
318 struct list_head *lp_root; 414 struct list_head *lp_root;
319 int retval = rcu_preempted_readers(rnp); 415 int retval = 0;
320 struct rcu_node *rnp_root = rcu_get_root(rsp); 416 struct rcu_node *rnp_root = rcu_get_root(rsp);
321 struct task_struct *tp; 417 struct task_struct *tp;
322 418
@@ -326,7 +422,9 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
326 } 422 }
327 WARN_ON_ONCE(rnp != rdp->mynode && 423 WARN_ON_ONCE(rnp != rdp->mynode &&
328 (!list_empty(&rnp->blocked_tasks[0]) || 424 (!list_empty(&rnp->blocked_tasks[0]) ||
329 !list_empty(&rnp->blocked_tasks[1]))); 425 !list_empty(&rnp->blocked_tasks[1]) ||
426 !list_empty(&rnp->blocked_tasks[2]) ||
427 !list_empty(&rnp->blocked_tasks[3])));
330 428
331 /* 429 /*
332 * Move tasks up to root rcu_node. Rely on the fact that the 430 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -334,19 +432,22 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
334 * rcu_nodes in terms of gp_num value. This fact allows us to 432 * rcu_nodes in terms of gp_num value. This fact allows us to
335 * move the blocked_tasks[] array directly, element by element. 433 * move the blocked_tasks[] array directly, element by element.
336 */ 434 */
337 for (i = 0; i < 2; i++) { 435 if (rcu_preempted_readers(rnp))
436 retval |= RCU_OFL_TASKS_NORM_GP;
437 if (rcu_preempted_readers_exp(rnp))
438 retval |= RCU_OFL_TASKS_EXP_GP;
439 for (i = 0; i < 4; i++) {
338 lp = &rnp->blocked_tasks[i]; 440 lp = &rnp->blocked_tasks[i];
339 lp_root = &rnp_root->blocked_tasks[i]; 441 lp_root = &rnp_root->blocked_tasks[i];
340 while (!list_empty(lp)) { 442 while (!list_empty(lp)) {
341 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 443 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
342 spin_lock(&rnp_root->lock); /* irqs already disabled */ 444 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
343 list_del(&tp->rcu_node_entry); 445 list_del(&tp->rcu_node_entry);
344 tp->rcu_blocked_node = rnp_root; 446 tp->rcu_blocked_node = rnp_root;
345 list_add(&tp->rcu_node_entry, lp_root); 447 list_add(&tp->rcu_node_entry, lp_root);
346 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 448 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
347 } 449 }
348 } 450 }
349
350 return retval; 451 return retval;
351} 452}
352 453
@@ -398,14 +499,183 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
398} 499}
399EXPORT_SYMBOL_GPL(call_rcu); 500EXPORT_SYMBOL_GPL(call_rcu);
400 501
502/**
503 * synchronize_rcu - wait until a grace period has elapsed.
504 *
505 * Control will return to the caller some time after a full grace
506 * period has elapsed, in other words after all currently executing RCU
507 * read-side critical sections have completed. RCU read-side critical
508 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
509 * and may be nested.
510 */
511void synchronize_rcu(void)
512{
513 struct rcu_synchronize rcu;
514
515 if (!rcu_scheduler_active)
516 return;
517
518 init_completion(&rcu.completion);
519 /* Will wake me after RCU finished. */
520 call_rcu(&rcu.head, wakeme_after_rcu);
521 /* Wait for it. */
522 wait_for_completion(&rcu.completion);
523}
524EXPORT_SYMBOL_GPL(synchronize_rcu);
525
526static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
527static long sync_rcu_preempt_exp_count;
528static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
529
530/*
531 * Return non-zero if there are any tasks in RCU read-side critical
532 * sections blocking the current preemptible-RCU expedited grace period.
533 * If there is no preemptible-RCU expedited grace period currently in
534 * progress, returns zero unconditionally.
535 */
536static int rcu_preempted_readers_exp(struct rcu_node *rnp)
537{
538 return !list_empty(&rnp->blocked_tasks[2]) ||
539 !list_empty(&rnp->blocked_tasks[3]);
540}
541
542/*
543 * Return non-zero if there is no RCU expedited grace period in progress
544 * for the specified rcu_node structure, in other words, if all CPUs and
545 * tasks covered by the specified rcu_node structure have done their bit
546 * for the current expedited grace period. Works only for preemptible
547 * RCU -- other RCU implementations use other means.
548 *
549 * Caller must hold sync_rcu_preempt_exp_mutex.
550 */
551static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
552{
553 return !rcu_preempted_readers_exp(rnp) &&
554 ACCESS_ONCE(rnp->expmask) == 0;
555}
556
401/* 557/*
402 * Wait for an rcu-preempt grace period. We are supposed to expedite the 558 * Report the exit from RCU read-side critical section for the last task
403 * grace period, but this is the crude slow compatability hack, so just 559 * that queued itself during or before the current expedited preemptible-RCU
404 * invoke synchronize_rcu(). 560 * grace period. This event is reported either to the rcu_node structure on
561 * which the task was queued or to one of that rcu_node structure's ancestors,
562 * recursively up the tree. (Calm down, calm down, we do the recursion
563 * iteratively!)
564 *
565 * Caller must hold sync_rcu_preempt_exp_mutex.
566 */
567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
568{
569 unsigned long flags;
570 unsigned long mask;
571
572 raw_spin_lock_irqsave(&rnp->lock, flags);
573 for (;;) {
574 if (!sync_rcu_preempt_exp_done(rnp))
575 break;
576 if (rnp->parent == NULL) {
577 wake_up(&sync_rcu_preempt_exp_wq);
578 break;
579 }
580 mask = rnp->grpmask;
581 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
582 rnp = rnp->parent;
583 raw_spin_lock(&rnp->lock); /* irqs already disabled */
584 rnp->expmask &= ~mask;
585 }
586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
587}
588
589/*
590 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
591 * grace period for the specified rcu_node structure. If there are no such
592 * tasks, report it up the rcu_node hierarchy.
593 *
594 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
595 */
596static void
597sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
598{
599 int must_wait;
600
601 raw_spin_lock(&rnp->lock); /* irqs already disabled */
602 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
603 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
604 must_wait = rcu_preempted_readers_exp(rnp);
605 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
606 if (!must_wait)
607 rcu_report_exp_rnp(rsp, rnp);
608}
609
610/*
611 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
612 * is to invoke synchronize_sched_expedited() to push all the tasks to
613 * the ->blocked_tasks[] lists, move all entries from the first set of
614 * ->blocked_tasks[] lists to the second set, and finally wait for this
615 * second set to drain.
405 */ 616 */
406void synchronize_rcu_expedited(void) 617void synchronize_rcu_expedited(void)
407{ 618{
408 synchronize_rcu(); 619 unsigned long flags;
620 struct rcu_node *rnp;
621 struct rcu_state *rsp = &rcu_preempt_state;
622 long snap;
623 int trycount = 0;
624
625 smp_mb(); /* Caller's modifications seen first by other CPUs. */
626 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
627 smp_mb(); /* Above access cannot bleed into critical section. */
628
629 /*
630 * Acquire lock, falling back to synchronize_rcu() if too many
631 * lock-acquisition failures. Of course, if someone does the
632 * expedited grace period for us, just leave.
633 */
634 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
635 if (trycount++ < 10)
636 udelay(trycount * num_online_cpus());
637 else {
638 synchronize_rcu();
639 return;
640 }
641 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
642 goto mb_ret; /* Others did our work for us. */
643 }
644 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
645 goto unlock_mb_ret; /* Others did our work for us. */
646
647 /* force all RCU readers onto blocked_tasks[]. */
648 synchronize_sched_expedited();
649
650 raw_spin_lock_irqsave(&rsp->onofflock, flags);
651
652 /* Initialize ->expmask for all non-leaf rcu_node structures. */
653 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
654 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
655 rnp->expmask = rnp->qsmaskinit;
656 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
657 }
658
659 /* Snapshot current state of ->blocked_tasks[] lists. */
660 rcu_for_each_leaf_node(rsp, rnp)
661 sync_rcu_preempt_exp_init(rsp, rnp);
662 if (NUM_RCU_NODES > 1)
663 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
664
665 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
666
667 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
668 rnp = rcu_get_root(rsp);
669 wait_event(sync_rcu_preempt_exp_wq,
670 sync_rcu_preempt_exp_done(rnp));
671
672 /* Clean up and exit. */
673 smp_mb(); /* ensure expedited GP seen before counter increment. */
674 ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
675unlock_mb_ret:
676 mutex_unlock(&sync_rcu_preempt_exp_mutex);
677mb_ret:
678 smp_mb(); /* ensure subsequent action seen after grace period. */
409} 679}
410EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 680EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
411 681
@@ -481,7 +751,7 @@ void exit_rcu(void)
481/* 751/*
482 * Tell them what RCU they are running. 752 * Tell them what RCU they are running.
483 */ 753 */
484static inline void rcu_bootup_announce(void) 754static void __init rcu_bootup_announce(void)
485{ 755{
486 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 756 printk(KERN_INFO "Hierarchical RCU implementation.\n");
487} 757}
@@ -496,6 +766,16 @@ long rcu_batches_completed(void)
496EXPORT_SYMBOL_GPL(rcu_batches_completed); 766EXPORT_SYMBOL_GPL(rcu_batches_completed);
497 767
498/* 768/*
769 * Force a quiescent state for RCU, which, because there is no preemptible
770 * RCU, becomes the same as rcu-sched.
771 */
772void rcu_force_quiescent_state(void)
773{
774 rcu_sched_force_quiescent_state();
775}
776EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
777
778/*
499 * Because preemptable RCU does not exist, we never have to check for 779 * Because preemptable RCU does not exist, we never have to check for
500 * CPUs being in quiescent states. 780 * CPUs being in quiescent states.
501 */ 781 */
@@ -512,12 +792,30 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
512 return 0; 792 return 0;
513} 793}
514 794
795#ifdef CONFIG_HOTPLUG_CPU
796
797/* Because preemptible RCU does not exist, no quieting of tasks. */
798static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
799{
800 raw_spin_unlock_irqrestore(&rnp->lock, flags);
801}
802
803#endif /* #ifdef CONFIG_HOTPLUG_CPU */
804
515#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 805#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
516 806
517/* 807/*
518 * Because preemptable RCU does not exist, we never have to check for 808 * Because preemptable RCU does not exist, we never have to check for
519 * tasks blocked within RCU read-side critical sections. 809 * tasks blocked within RCU read-side critical sections.
520 */ 810 */
811static void rcu_print_detail_task_stall(struct rcu_state *rsp)
812{
813}
814
815/*
816 * Because preemptable RCU does not exist, we never have to check for
817 * tasks blocked within RCU read-side critical sections.
818 */
521static void rcu_print_task_stall(struct rcu_node *rnp) 819static void rcu_print_task_stall(struct rcu_node *rnp)
522{ 820{
523} 821}
@@ -594,6 +892,20 @@ void synchronize_rcu_expedited(void)
594} 892}
595EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 893EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
596 894
895#ifdef CONFIG_HOTPLUG_CPU
896
897/*
898 * Because preemptable RCU does not exist, there is never any need to
899 * report on tasks preempted in RCU read-side critical sections during
900 * expedited RCU grace periods.
901 */
902static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
903{
904 return;
905}
906
907#endif /* #ifdef CONFIG_HOTPLUG_CPU */
908
597/* 909/*
598 * Because preemptable RCU does not exist, it never has any work to do. 910 * Because preemptable RCU does not exist, it never has any work to do.
599 */ 911 */
@@ -643,3 +955,115 @@ static void __init __rcu_init_preempt(void)
643} 955}
644 956
645#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 957#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
958
959#if !defined(CONFIG_RCU_FAST_NO_HZ)
960
961/*
962 * Check to see if any future RCU-related work will need to be done
963 * by the current CPU, even if none need be done immediately, returning
964 * 1 if so. This function is part of the RCU implementation; it is -not-
965 * an exported member of the RCU API.
966 *
967 * Because we have preemptible RCU, just check whether this CPU needs
968 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
969 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
970 */
971int rcu_needs_cpu(int cpu)
972{
973 return rcu_needs_cpu_quick_check(cpu);
974}
975
976/*
977 * Check to see if we need to continue a callback-flush operations to
978 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
979 * entry is not configured, so we never do need to.
980 */
981static void rcu_needs_cpu_flush(void)
982{
983}
984
985#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
986
987#define RCU_NEEDS_CPU_FLUSHES 5
988static DEFINE_PER_CPU(int, rcu_dyntick_drain);
989static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
990
991/*
992 * Check to see if any future RCU-related work will need to be done
993 * by the current CPU, even if none need be done immediately, returning
994 * 1 if so. This function is part of the RCU implementation; it is -not-
995 * an exported member of the RCU API.
996 *
997 * Because we are not supporting preemptible RCU, attempt to accelerate
998 * any current grace periods so that RCU no longer needs this CPU, but
999 * only if all other CPUs are already in dynticks-idle mode. This will
1000 * allow the CPU cores to be powered down immediately, as opposed to after
1001 * waiting many milliseconds for grace periods to elapse.
1002 *
1003 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1004 * disabled, we do one pass of force_quiescent_state(), then do a
1005 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1006 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1007 */
1008int rcu_needs_cpu(int cpu)
1009{
1010 int c = 0;
1011 int thatcpu;
1012
1013 /* Check for being in the holdoff period. */
1014 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1015 return rcu_needs_cpu_quick_check(cpu);
1016
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1019 if (thatcpu != cpu) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu);
1023 }
1024
1025 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1027 /* First time through, initialize the counter. */
1028 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1029 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1030 /* We have hit the limit, so time to give up. */
1031 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1032 return rcu_needs_cpu_quick_check(cpu);
1033 }
1034
1035 /* Do one step pushing remaining RCU callbacks through. */
1036 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1037 rcu_sched_qs(cpu);
1038 force_quiescent_state(&rcu_sched_state, 0);
1039 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1040 }
1041 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1042 rcu_bh_qs(cpu);
1043 force_quiescent_state(&rcu_bh_state, 0);
1044 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1045 }
1046
1047 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1048 if (c)
1049 raise_softirq(RCU_SOFTIRQ);
1050 return c;
1051}
1052
1053/*
1054 * Check to see if we need to continue a callback-flush operations to
1055 * allow the last CPU to enter dyntick-idle mode.
1056 */
1057static void rcu_needs_cpu_flush(void)
1058{
1059 int cpu = smp_processor_id();
1060 unsigned long flags;
1061
1062 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1063 return;
1064 local_irq_save(flags);
1065 (void)rcu_needs_cpu(cpu);
1066 local_irq_restore(flags);
1067}
1068
1069#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b31c779e62e..d45db2e35d27 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,12 +155,15 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 unsigned long gpnum;
158 int level = 0; 159 int level = 0;
160 int phase;
159 struct rcu_node *rnp; 161 struct rcu_node *rnp;
160 162
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 168 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 169 rsp->n_force_qs, rsp->n_force_qs_ngp,
@@ -171,8 +174,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 seq_puts(m, "\n"); 174 seq_puts(m, "\n");
172 level = rnp->level; 175 level = rnp->level;
173 } 176 }
174 seq_printf(m, "%lx/%lx %d:%d ^%d ", 177 phase = gpnum & 0x1;
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
175 rnp->qsmask, rnp->qsmaskinit, 179 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
176 rnp->grplo, rnp->grphi, rnp->grpnum); 184 rnp->grplo, rnp->grphi, rnp->grpnum);
177 } 185 }
178 seq_puts(m, "\n"); 186 seq_puts(m, "\n");
@@ -207,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
207static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
208{ 216{
209#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
210 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
211 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
212#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
213 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
214 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
215 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
216 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
217 return 0; 225 return 0;
218} 226}
diff --git a/kernel/relay.c b/kernel/relay.c
index 760c26209a3c..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1198 relay_consume_bytes(rbuf, buf->private); 1198 relay_consume_bytes(rbuf, buf->private);
1199} 1199}
1200 1200
1201static struct pipe_buf_operations relay_pipe_buf_ops = { 1201static const struct pipe_buf_operations relay_pipe_buf_ops = {
1202 .can_merge = 0, 1202 .can_merge = 0,
1203 .map = generic_pipe_buf_map, 1203 .map = generic_pipe_buf_map,
1204 .unmap = generic_pipe_buf_unmap, 1204 .unmap = generic_pipe_buf_unmap,
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..c7eaa37a768b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,7 +10,6 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/res_counter.h> 13#include <linux/res_counter.h>
15#include <linux/uaccess.h> 14#include <linux/uaccess.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..9c358e263534 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,20 +188,65 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 188 return -EINVAL;
189} 189}
190 190
191static void __release_child_resources(struct resource *r)
192{
193 struct resource *tmp, *p;
194 resource_size_t size;
195
196 p = r->child;
197 r->child = NULL;
198 while (p) {
199 tmp = p;
200 p = p->sibling;
201
202 tmp->parent = NULL;
203 tmp->sibling = NULL;
204 __release_child_resources(tmp);
205
206 printk(KERN_DEBUG "release child resource %pR\n", tmp);
207 /* need to restore size, and keep flags */
208 size = resource_size(tmp);
209 tmp->start = 0;
210 tmp->end = size - 1;
211 }
212}
213
214void release_child_resources(struct resource *r)
215{
216 write_lock(&resource_lock);
217 __release_child_resources(r);
218 write_unlock(&resource_lock);
219}
220
191/** 221/**
192 * request_resource - request and reserve an I/O or memory resource 222 * request_resource_conflict - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 223 * @root: root resource descriptor
194 * @new: resource descriptor desired by caller 224 * @new: resource descriptor desired by caller
195 * 225 *
196 * Returns 0 for success, negative error code on error. 226 * Returns 0 for success, conflict resource on error.
197 */ 227 */
198int request_resource(struct resource *root, struct resource *new) 228struct resource *request_resource_conflict(struct resource *root, struct resource *new)
199{ 229{
200 struct resource *conflict; 230 struct resource *conflict;
201 231
202 write_lock(&resource_lock); 232 write_lock(&resource_lock);
203 conflict = __request_resource(root, new); 233 conflict = __request_resource(root, new);
204 write_unlock(&resource_lock); 234 write_unlock(&resource_lock);
235 return conflict;
236}
237
238/**
239 * request_resource - request and reserve an I/O or memory resource
240 * @root: root resource descriptor
241 * @new: resource descriptor desired by caller
242 *
243 * Returns 0 for success, negative error code on error.
244 */
245int request_resource(struct resource *root, struct resource *new)
246{
247 struct resource *conflict;
248
249 conflict = request_resource_conflict(root, new);
205 return conflict ? -EBUSY : 0; 250 return conflict ? -EBUSY : 0;
206} 251}
207 252
@@ -274,7 +319,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *)) 319 void *arg, int (*func)(unsigned long, unsigned long, void *))
275{ 320{
276 struct resource res; 321 struct resource res;
277 unsigned long pfn, len; 322 unsigned long pfn, end_pfn;
278 u64 orig_end; 323 u64 orig_end;
279 int ret = -1; 324 int ret = -1;
280 325
@@ -284,9 +329,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
284 orig_end = res.end; 329 orig_end = res.end;
285 while ((res.start < res.end) && 330 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) { 331 (find_next_system_ram(&res, "System RAM") >= 0)) {
287 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 332 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 333 end_pfn = (res.end + 1) >> PAGE_SHIFT;
289 ret = (*func)(pfn, len, arg); 334 if (end_pfn > pfn)
335 ret = (*func)(pfn, end_pfn - pfn, arg);
290 if (ret) 336 if (ret)
291 break; 337 break;
292 res.start = res.end + 1; 338 res.start = res.end + 1;
@@ -297,46 +343,63 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 343
298#endif 344#endif
299 345
346static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
347{
348 return 1;
349}
350/*
351 * This generic page_is_ram() returns true if specified address is
352 * registered as "System RAM" in iomem_resource list.
353 */
354int __weak page_is_ram(unsigned long pfn)
355{
356 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
357}
358
300/* 359/*
301 * Find empty slot in the resource tree given range and alignment. 360 * Find empty slot in the resource tree given range and alignment.
302 */ 361 */
303static int find_resource(struct resource *root, struct resource *new, 362static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 363 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 364 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 365 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 366 const struct resource *,
367 resource_size_t,
368 resource_size_t),
308 void *alignf_data) 369 void *alignf_data)
309{ 370{
310 struct resource *this = root->child; 371 struct resource *this = root->child;
372 struct resource tmp = *new;
311 373
312 new->start = root->start; 374 tmp.start = root->start;
313 /* 375 /*
314 * Skip past an allocated resource that starts at 0, since the assignment 376 * Skip past an allocated resource that starts at 0, since the assignment
315 * of this->start - 1 to new->end below would cause an underflow. 377 * of this->start - 1 to tmp->end below would cause an underflow.
316 */ 378 */
317 if (this && this->start == 0) { 379 if (this && this->start == 0) {
318 new->start = this->end + 1; 380 tmp.start = this->end + 1;
319 this = this->sibling; 381 this = this->sibling;
320 } 382 }
321 for(;;) { 383 for(;;) {
322 if (this) 384 if (this)
323 new->end = this->start - 1; 385 tmp.end = this->start - 1;
324 else 386 else
325 new->end = root->end; 387 tmp.end = root->end;
326 if (new->start < min) 388 if (tmp.start < min)
327 new->start = min; 389 tmp.start = min;
328 if (new->end > max) 390 if (tmp.end > max)
329 new->end = max; 391 tmp.end = max;
330 new->start = ALIGN(new->start, align); 392 tmp.start = ALIGN(tmp.start, align);
331 if (alignf) 393 if (alignf)
332 alignf(alignf_data, new, size, align); 394 tmp.start = alignf(alignf_data, &tmp, size, align);
333 if (new->start < new->end && new->end - new->start >= size - 1) { 395 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
334 new->end = new->start + size - 1; 396 new->start = tmp.start;
397 new->end = tmp.start + size - 1;
335 return 0; 398 return 0;
336 } 399 }
337 if (!this) 400 if (!this)
338 break; 401 break;
339 new->start = this->end + 1; 402 tmp.start = this->end + 1;
340 this = this->sibling; 403 this = this->sibling;
341 } 404 }
342 return -EBUSY; 405 return -EBUSY;
@@ -356,8 +419,10 @@ static int find_resource(struct resource *root, struct resource *new,
356int allocate_resource(struct resource *root, struct resource *new, 419int allocate_resource(struct resource *root, struct resource *new,
357 resource_size_t size, resource_size_t min, 420 resource_size_t size, resource_size_t min,
358 resource_size_t max, resource_size_t align, 421 resource_size_t max, resource_size_t align,
359 void (*alignf)(void *, struct resource *, 422 resource_size_t (*alignf)(void *,
360 resource_size_t, resource_size_t), 423 const struct resource *,
424 resource_size_t,
425 resource_size_t),
361 void *alignf_data) 426 void *alignf_data)
362{ 427{
363 int err; 428 int err;
@@ -424,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
424} 489}
425 490
426/** 491/**
427 * insert_resource - Inserts a resource in the resource tree 492 * insert_resource_conflict - Inserts resource in the resource tree
428 * @parent: parent of the new resource 493 * @parent: parent of the new resource
429 * @new: new resource to insert 494 * @new: new resource to insert
430 * 495 *
431 * Returns 0 on success, -EBUSY if the resource can't be inserted. 496 * Returns 0 on success, conflict resource if the resource can't be inserted.
432 * 497 *
433 * This function is equivalent to request_resource when no conflict 498 * This function is equivalent to request_resource_conflict when no conflict
434 * happens. If a conflict happens, and the conflicting resources 499 * happens. If a conflict happens, and the conflicting resources
435 * entirely fit within the range of the new resource, then the new 500 * entirely fit within the range of the new resource, then the new
436 * resource is inserted and the conflicting resources become children of 501 * resource is inserted and the conflicting resources become children of
437 * the new resource. 502 * the new resource.
438 */ 503 */
439int insert_resource(struct resource *parent, struct resource *new) 504struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
440{ 505{
441 struct resource *conflict; 506 struct resource *conflict;
442 507
443 write_lock(&resource_lock); 508 write_lock(&resource_lock);
444 conflict = __insert_resource(parent, new); 509 conflict = __insert_resource(parent, new);
445 write_unlock(&resource_lock); 510 write_unlock(&resource_lock);
511 return conflict;
512}
513
514/**
515 * insert_resource - Inserts a resource in the resource tree
516 * @parent: parent of the new resource
517 * @new: new resource to insert
518 *
519 * Returns 0 on success, -EBUSY if the resource can't be inserted.
520 */
521int insert_resource(struct resource *parent, struct resource *new)
522{
523 struct resource *conflict;
524
525 conflict = insert_resource_conflict(parent, new);
446 return conflict ? -EBUSY : 0; 526 return conflict ? -EBUSY : 0;
447} 527}
448 528
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 5fcb4fe645e2..ddabb54bb5c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -37,8 +37,8 @@ do { \
37 if (rt_trace_on) { \ 37 if (rt_trace_on) { \
38 rt_trace_on = 0; \ 38 rt_trace_on = 0; \
39 console_verbose(); \ 39 console_verbose(); \
40 if (spin_is_locked(&current->pi_lock)) \ 40 if (raw_spin_is_locked(&current->pi_lock)) \
41 spin_unlock(&current->pi_lock); \ 41 raw_spin_unlock(&current->pi_lock); \
42 } \ 42 } \
43} while (0) 43} while (0)
44 44
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 29bd4baf9e75..a9604815786a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
138{ 138{
139 unsigned long flags; 139 unsigned long flags;
140 140
141 spin_lock_irqsave(&task->pi_lock, flags); 141 raw_spin_lock_irqsave(&task->pi_lock, flags);
142 __rt_mutex_adjust_prio(task); 142 __rt_mutex_adjust_prio(task);
143 spin_unlock_irqrestore(&task->pi_lock, flags); 143 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
144} 144}
145 145
146/* 146/*
@@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
195 /* 195 /*
196 * Task can not go away as we did a get_task() before ! 196 * Task can not go away as we did a get_task() before !
197 */ 197 */
198 spin_lock_irqsave(&task->pi_lock, flags); 198 raw_spin_lock_irqsave(&task->pi_lock, flags);
199 199
200 waiter = task->pi_blocked_on; 200 waiter = task->pi_blocked_on;
201 /* 201 /*
@@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
231 goto out_unlock_pi; 231 goto out_unlock_pi;
232 232
233 lock = waiter->lock; 233 lock = waiter->lock;
234 if (!spin_trylock(&lock->wait_lock)) { 234 if (!raw_spin_trylock(&lock->wait_lock)) {
235 spin_unlock_irqrestore(&task->pi_lock, flags); 235 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
236 cpu_relax(); 236 cpu_relax();
237 goto retry; 237 goto retry;
238 } 238 }
@@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
240 /* Deadlock detection */ 240 /* Deadlock detection */
241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
243 spin_unlock(&lock->wait_lock); 243 raw_spin_unlock(&lock->wait_lock);
244 ret = deadlock_detect ? -EDEADLK : 0; 244 ret = deadlock_detect ? -EDEADLK : 0;
245 goto out_unlock_pi; 245 goto out_unlock_pi;
246 } 246 }
@@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
253 plist_add(&waiter->list_entry, &lock->wait_list); 253 plist_add(&waiter->list_entry, &lock->wait_list);
254 254
255 /* Release the task */ 255 /* Release the task */
256 spin_unlock_irqrestore(&task->pi_lock, flags); 256 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
257 put_task_struct(task); 257 put_task_struct(task);
258 258
259 /* Grab the next task */ 259 /* Grab the next task */
260 task = rt_mutex_owner(lock); 260 task = rt_mutex_owner(lock);
261 get_task_struct(task); 261 get_task_struct(task);
262 spin_lock_irqsave(&task->pi_lock, flags); 262 raw_spin_lock_irqsave(&task->pi_lock, flags);
263 263
264 if (waiter == rt_mutex_top_waiter(lock)) { 264 if (waiter == rt_mutex_top_waiter(lock)) {
265 /* Boost the owner */ 265 /* Boost the owner */
@@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
277 __rt_mutex_adjust_prio(task); 277 __rt_mutex_adjust_prio(task);
278 } 278 }
279 279
280 spin_unlock_irqrestore(&task->pi_lock, flags); 280 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
281 281
282 top_waiter = rt_mutex_top_waiter(lock); 282 top_waiter = rt_mutex_top_waiter(lock);
283 spin_unlock(&lock->wait_lock); 283 raw_spin_unlock(&lock->wait_lock);
284 284
285 if (!detect_deadlock && waiter != top_waiter) 285 if (!detect_deadlock && waiter != top_waiter)
286 goto out_put_task; 286 goto out_put_task;
@@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
288 goto again; 288 goto again;
289 289
290 out_unlock_pi: 290 out_unlock_pi:
291 spin_unlock_irqrestore(&task->pi_lock, flags); 291 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
292 out_put_task: 292 out_put_task:
293 put_task_struct(task); 293 put_task_struct(task);
294 294
@@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
313 if (pendowner == task) 313 if (pendowner == task)
314 return 1; 314 return 1;
315 315
316 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0; 319 return 0;
320 } 320 }
321 321
@@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
325 * priority. 325 * priority.
326 */ 326 */
327 if (likely(!rt_mutex_has_waiters(lock))) { 327 if (likely(!rt_mutex_has_waiters(lock))) {
328 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1; 329 return 1;
330 } 330 }
331 331
@@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
333 next = rt_mutex_top_waiter(lock); 333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters); 334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner); 335 __rt_mutex_adjust_prio(pendowner);
336 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337 337
338 /* 338 /*
339 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
@@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
350 * might be task: 350 * might be task:
351 */ 351 */
352 if (likely(next->task != task)) { 352 if (likely(next->task != task)) {
353 spin_lock_irqsave(&task->pi_lock, flags); 353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task); 355 __rt_mutex_adjust_prio(task);
356 spin_unlock_irqrestore(&task->pi_lock, flags); 356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 } 357 }
358 return 1; 358 return 1;
359} 359}
@@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
420 unsigned long flags; 420 unsigned long flags;
421 int chain_walk = 0, res; 421 int chain_walk = 0, res;
422 422
423 spin_lock_irqsave(&task->pi_lock, flags); 423 raw_spin_lock_irqsave(&task->pi_lock, flags);
424 __rt_mutex_adjust_prio(task); 424 __rt_mutex_adjust_prio(task);
425 waiter->task = task; 425 waiter->task = task;
426 waiter->lock = lock; 426 waiter->lock = lock;
@@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
434 434
435 task->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
436 436
437 spin_unlock_irqrestore(&task->pi_lock, flags); 437 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 438
439 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
440 spin_lock_irqsave(&owner->pi_lock, flags); 440 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
443 443
444 __rt_mutex_adjust_prio(owner); 444 __rt_mutex_adjust_prio(owner);
445 if (owner->pi_blocked_on) 445 if (owner->pi_blocked_on)
446 chain_walk = 1; 446 chain_walk = 1;
447 spin_unlock_irqrestore(&owner->pi_lock, flags); 447 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
448 } 448 }
449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) 449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
450 chain_walk = 1; 450 chain_walk = 1;
@@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
459 */ 459 */
460 get_task_struct(owner); 460 get_task_struct(owner);
461 461
462 spin_unlock(&lock->wait_lock); 462 raw_spin_unlock(&lock->wait_lock);
463 463
464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
465 task); 465 task);
466 466
467 spin_lock(&lock->wait_lock); 467 raw_spin_lock(&lock->wait_lock);
468 468
469 return res; 469 return res;
470} 470}
@@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
483 struct task_struct *pendowner; 483 struct task_struct *pendowner;
484 unsigned long flags; 484 unsigned long flags;
485 485
486 spin_lock_irqsave(&current->pi_lock, flags); 486 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 487
488 waiter = rt_mutex_top_waiter(lock); 488 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list); 489 plist_del(&waiter->list_entry, &lock->wait_list);
@@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
500 500
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
502 502
503 spin_unlock_irqrestore(&current->pi_lock, flags); 503 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 504
505 /* 505 /*
506 * Clear the pi_blocked_on variable and enqueue a possible 506 * Clear the pi_blocked_on variable and enqueue a possible
@@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
509 * waiter with higher priority than pending-owner->normal_prio 509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner. 510 * is blocked on the unboosted (pending) owner.
511 */ 511 */
512 spin_lock_irqsave(&pendowner->pi_lock, flags); 512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513 513
514 WARN_ON(!pendowner->pi_blocked_on); 514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter); 515 WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
523 next = rt_mutex_top_waiter(lock); 523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters); 524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 } 525 }
526 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527 527
528 wake_up_process(pendowner); 528 wake_up_process(pendowner);
529} 529}
@@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock,
541 unsigned long flags; 541 unsigned long flags;
542 int chain_walk = 0; 542 int chain_walk = 0;
543 543
544 spin_lock_irqsave(&current->pi_lock, flags); 544 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 545 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL; 546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 547 current->pi_blocked_on = NULL;
548 spin_unlock_irqrestore(&current->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 549
550 if (first && owner != current) { 550 if (first && owner != current) {
551 551
552 spin_lock_irqsave(&owner->pi_lock, flags); 552 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 553
554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
555 555
@@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock,
564 if (owner->pi_blocked_on) 564 if (owner->pi_blocked_on)
565 chain_walk = 1; 565 chain_walk = 1;
566 566
567 spin_unlock_irqrestore(&owner->pi_lock, flags); 567 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
568 } 568 }
569 569
570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock,
575 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 575 /* gets dropped in rt_mutex_adjust_prio_chain()! */
576 get_task_struct(owner); 576 get_task_struct(owner);
577 577
578 spin_unlock(&lock->wait_lock); 578 raw_spin_unlock(&lock->wait_lock);
579 579
580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
581 581
582 spin_lock(&lock->wait_lock); 582 raw_spin_lock(&lock->wait_lock);
583} 583}
584 584
585/* 585/*
@@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
592 struct rt_mutex_waiter *waiter; 592 struct rt_mutex_waiter *waiter;
593 unsigned long flags; 593 unsigned long flags;
594 594
595 spin_lock_irqsave(&task->pi_lock, flags); 595 raw_spin_lock_irqsave(&task->pi_lock, flags);
596 596
597 waiter = task->pi_blocked_on; 597 waiter = task->pi_blocked_on;
598 if (!waiter || waiter->list_entry.prio == task->prio) { 598 if (!waiter || waiter->list_entry.prio == task->prio) {
599 spin_unlock_irqrestore(&task->pi_lock, flags); 599 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
600 return; 600 return;
601 } 601 }
602 602
603 spin_unlock_irqrestore(&task->pi_lock, flags); 603 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
604 604
605 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 605 /* gets dropped in rt_mutex_adjust_prio_chain()! */
606 get_task_struct(task); 606 get_task_struct(task);
@@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
672 break; 672 break;
673 } 673 }
674 674
675 spin_unlock(&lock->wait_lock); 675 raw_spin_unlock(&lock->wait_lock);
676 676
677 debug_rt_mutex_print_deadlock(waiter); 677 debug_rt_mutex_print_deadlock(waiter);
678 678
679 if (waiter->task) 679 if (waiter->task)
680 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
681 681
682 spin_lock(&lock->wait_lock); 682 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 683 set_current_state(state);
684 } 684 }
685 685
@@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
700 debug_rt_mutex_init_waiter(&waiter); 700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL; 701 waiter.task = NULL;
702 702
703 spin_lock(&lock->wait_lock); 703 raw_spin_lock(&lock->wait_lock);
704 704
705 /* Try to acquire the lock again: */ 705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock); 707 raw_spin_unlock(&lock->wait_lock);
708 return 0; 708 return 0;
709 } 709 }
710 710
@@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
731 */ 731 */
732 fixup_rt_mutex_waiters(lock); 732 fixup_rt_mutex_waiters(lock);
733 733
734 spin_unlock(&lock->wait_lock); 734 raw_spin_unlock(&lock->wait_lock);
735 735
736 /* Remove pending timer: */ 736 /* Remove pending timer: */
737 if (unlikely(timeout)) 737 if (unlikely(timeout))
@@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
758{ 758{
759 int ret = 0; 759 int ret = 0;
760 760
761 spin_lock(&lock->wait_lock); 761 raw_spin_lock(&lock->wait_lock);
762 762
763 if (likely(rt_mutex_owner(lock) != current)) { 763 if (likely(rt_mutex_owner(lock) != current)) {
764 764
@@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
770 fixup_rt_mutex_waiters(lock); 770 fixup_rt_mutex_waiters(lock);
771 } 771 }
772 772
773 spin_unlock(&lock->wait_lock); 773 raw_spin_unlock(&lock->wait_lock);
774 774
775 return ret; 775 return ret;
776} 776}
@@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
781static void __sched 781static void __sched
782rt_mutex_slowunlock(struct rt_mutex *lock) 782rt_mutex_slowunlock(struct rt_mutex *lock)
783{ 783{
784 spin_lock(&lock->wait_lock); 784 raw_spin_lock(&lock->wait_lock);
785 785
786 debug_rt_mutex_unlock(lock); 786 debug_rt_mutex_unlock(lock);
787 787
@@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
789 789
790 if (!rt_mutex_has_waiters(lock)) { 790 if (!rt_mutex_has_waiters(lock)) {
791 lock->owner = NULL; 791 lock->owner = NULL;
792 spin_unlock(&lock->wait_lock); 792 raw_spin_unlock(&lock->wait_lock);
793 return; 793 return;
794 } 794 }
795 795
796 wakeup_next_waiter(lock); 796 wakeup_next_waiter(lock);
797 797
798 spin_unlock(&lock->wait_lock); 798 raw_spin_unlock(&lock->wait_lock);
799 799
800 /* Undo pi boosting if necessary: */ 800 /* Undo pi boosting if necessary: */
801 rt_mutex_adjust_prio(current); 801 rt_mutex_adjust_prio(current);
@@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
970void __rt_mutex_init(struct rt_mutex *lock, const char *name) 970void __rt_mutex_init(struct rt_mutex *lock, const char *name)
971{ 971{
972 lock->owner = NULL; 972 lock->owner = NULL;
973 spin_lock_init(&lock->wait_lock); 973 raw_spin_lock_init(&lock->wait_lock);
974 plist_head_init(&lock->wait_list, &lock->wait_lock); 974 plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
975 975
976 debug_rt_mutex_init(lock, name); 976 debug_rt_mutex_init(lock, name);
977} 977}
@@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1032{ 1032{
1033 int ret; 1033 int ret;
1034 1034
1035 spin_lock(&lock->wait_lock); 1035 raw_spin_lock(&lock->wait_lock);
1036 1036
1037 mark_rt_mutex_waiters(lock); 1037 mark_rt_mutex_waiters(lock);
1038 1038
@@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1043 spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 1045 return 1;
1046 } 1046 }
@@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 1056 */
1057 ret = 0; 1057 ret = 0;
1058 } 1058 }
1059 spin_unlock(&lock->wait_lock); 1059 raw_spin_unlock(&lock->wait_lock);
1060 1060
1061 debug_rt_mutex_print_deadlock(waiter); 1061 debug_rt_mutex_print_deadlock(waiter);
1062 1062
@@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1106{ 1106{
1107 int ret; 1107 int ret;
1108 1108
1109 spin_lock(&lock->wait_lock); 1109 raw_spin_lock(&lock->wait_lock);
1110 1110
1111 set_current_state(TASK_INTERRUPTIBLE); 1111 set_current_state(TASK_INTERRUPTIBLE);
1112 1112
@@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1124 */ 1124 */
1125 fixup_rt_mutex_waiters(lock); 1125 fixup_rt_mutex_waiters(lock);
1126 1126
1127 spin_unlock(&lock->wait_lock); 1127 raw_spin_unlock(&lock->wait_lock);
1128 1128
1129 /* 1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been 1130 * Readjust priority, when we did not get the lock. We might have been
diff --git a/kernel/sched.c b/kernel/sched.c
index adb5e923cc61..5e3c509e0efe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h> 73#include <linux/ftrace.h>
74#include <linux/slab.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -144,7 +145,7 @@ struct rt_prio_array {
144 145
145struct rt_bandwidth { 146struct rt_bandwidth {
146 /* nests inside the rq lock: */ 147 /* nests inside the rq lock: */
147 spinlock_t rt_runtime_lock; 148 raw_spinlock_t rt_runtime_lock;
148 ktime_t rt_period; 149 ktime_t rt_period;
149 u64 rt_runtime; 150 u64 rt_runtime;
150 struct hrtimer rt_period_timer; 151 struct hrtimer rt_period_timer;
@@ -181,7 +182,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
181 rt_b->rt_period = ns_to_ktime(period); 182 rt_b->rt_period = ns_to_ktime(period);
182 rt_b->rt_runtime = runtime; 183 rt_b->rt_runtime = runtime;
183 184
184 spin_lock_init(&rt_b->rt_runtime_lock); 185 raw_spin_lock_init(&rt_b->rt_runtime_lock);
185 186
186 hrtimer_init(&rt_b->rt_period_timer, 187 hrtimer_init(&rt_b->rt_period_timer,
187 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 188 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -203,7 +204,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
203 if (hrtimer_active(&rt_b->rt_period_timer)) 204 if (hrtimer_active(&rt_b->rt_period_timer))
204 return; 205 return;
205 206
206 spin_lock(&rt_b->rt_runtime_lock); 207 raw_spin_lock(&rt_b->rt_runtime_lock);
207 for (;;) { 208 for (;;) {
208 unsigned long delta; 209 unsigned long delta;
209 ktime_t soft, hard; 210 ktime_t soft, hard;
@@ -220,7 +221,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
220 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 221 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
221 HRTIMER_MODE_ABS_PINNED, 0); 222 HRTIMER_MODE_ABS_PINNED, 0);
222 } 223 }
223 spin_unlock(&rt_b->rt_runtime_lock); 224 raw_spin_unlock(&rt_b->rt_runtime_lock);
224} 225}
225 226
226#ifdef CONFIG_RT_GROUP_SCHED 227#ifdef CONFIG_RT_GROUP_SCHED
@@ -236,7 +237,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
236 */ 237 */
237static DEFINE_MUTEX(sched_domains_mutex); 238static DEFINE_MUTEX(sched_domains_mutex);
238 239
239#ifdef CONFIG_GROUP_SCHED 240#ifdef CONFIG_CGROUP_SCHED
240 241
241#include <linux/cgroup.h> 242#include <linux/cgroup.h>
242 243
@@ -246,13 +247,7 @@ static LIST_HEAD(task_groups);
246 247
247/* task group related information */ 248/* task group related information */
248struct task_group { 249struct task_group {
249#ifdef CONFIG_CGROUP_SCHED
250 struct cgroup_subsys_state css; 250 struct cgroup_subsys_state css;
251#endif
252
253#ifdef CONFIG_USER_SCHED
254 uid_t uid;
255#endif
256 251
257#ifdef CONFIG_FAIR_GROUP_SCHED 252#ifdef CONFIG_FAIR_GROUP_SCHED
258 /* schedulable entities of this group on each cpu */ 253 /* schedulable entities of this group on each cpu */
@@ -277,35 +272,7 @@ struct task_group {
277 struct list_head children; 272 struct list_head children;
278}; 273};
279 274
280#ifdef CONFIG_USER_SCHED
281
282/* Helper function to pass uid information to create_sched_user() */
283void set_tg_uid(struct user_struct *user)
284{
285 user->tg->uid = user->uid;
286}
287
288/*
289 * Root task group.
290 * Every UID task group (including init_task_group aka UID-0) will
291 * be a child to this group.
292 */
293struct task_group root_task_group;
294
295#ifdef CONFIG_FAIR_GROUP_SCHED
296/* Default task group's sched entity on each cpu */
297static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
298/* Default task group's cfs_rq on each cpu */
299static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
300#endif /* CONFIG_FAIR_GROUP_SCHED */
301
302#ifdef CONFIG_RT_GROUP_SCHED
303static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
304static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
305#endif /* CONFIG_RT_GROUP_SCHED */
306#else /* !CONFIG_USER_SCHED */
307#define root_task_group init_task_group 275#define root_task_group init_task_group
308#endif /* CONFIG_USER_SCHED */
309 276
310/* task_group_lock serializes add/remove of task groups and also changes to 277/* task_group_lock serializes add/remove of task groups and also changes to
311 * a task group's cpu shares. 278 * a task group's cpu shares.
@@ -321,11 +288,7 @@ static int root_task_group_empty(void)
321} 288}
322#endif 289#endif
323 290
324#ifdef CONFIG_USER_SCHED
325# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
326#else /* !CONFIG_USER_SCHED */
327# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 291# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
328#endif /* CONFIG_USER_SCHED */
329 292
330/* 293/*
331 * A weight of 0 or 1 can cause arithmetics problems. 294 * A weight of 0 or 1 can cause arithmetics problems.
@@ -351,11 +314,7 @@ static inline struct task_group *task_group(struct task_struct *p)
351{ 314{
352 struct task_group *tg; 315 struct task_group *tg;
353 316
354#ifdef CONFIG_USER_SCHED 317#ifdef CONFIG_CGROUP_SCHED
355 rcu_read_lock();
356 tg = __task_cred(p)->user->tg;
357 rcu_read_unlock();
358#elif defined(CONFIG_CGROUP_SCHED)
359 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 318 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
360 struct task_group, css); 319 struct task_group, css);
361#else 320#else
@@ -367,6 +326,15 @@ static inline struct task_group *task_group(struct task_struct *p)
367/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 326/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
368static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 327static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
369{ 328{
329 /*
330 * Strictly speaking this rcu_read_lock() is not needed since the
331 * task_group is tied to the cgroup, which in turn can never go away
332 * as long as there are tasks attached to it.
333 *
334 * However since task_group() uses task_subsys_state() which is an
335 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
336 */
337 rcu_read_lock();
370#ifdef CONFIG_FAIR_GROUP_SCHED 338#ifdef CONFIG_FAIR_GROUP_SCHED
371 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 339 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
372 p->se.parent = task_group(p)->se[cpu]; 340 p->se.parent = task_group(p)->se[cpu];
@@ -376,6 +344,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
376 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 344 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
377 p->rt.parent = task_group(p)->rt_se[cpu]; 345 p->rt.parent = task_group(p)->rt_se[cpu];
378#endif 346#endif
347 rcu_read_unlock();
379} 348}
380 349
381#else 350#else
@@ -386,7 +355,7 @@ static inline struct task_group *task_group(struct task_struct *p)
386 return NULL; 355 return NULL;
387} 356}
388 357
389#endif /* CONFIG_GROUP_SCHED */ 358#endif /* CONFIG_CGROUP_SCHED */
390 359
391/* CFS-related fields in a runqueue */ 360/* CFS-related fields in a runqueue */
392struct cfs_rq { 361struct cfs_rq {
@@ -473,7 +442,7 @@ struct rt_rq {
473 u64 rt_time; 442 u64 rt_time;
474 u64 rt_runtime; 443 u64 rt_runtime;
475 /* Nests inside the rq lock: */ 444 /* Nests inside the rq lock: */
476 spinlock_t rt_runtime_lock; 445 raw_spinlock_t rt_runtime_lock;
477 446
478#ifdef CONFIG_RT_GROUP_SCHED 447#ifdef CONFIG_RT_GROUP_SCHED
479 unsigned long rt_nr_boosted; 448 unsigned long rt_nr_boosted;
@@ -481,7 +450,6 @@ struct rt_rq {
481 struct rq *rq; 450 struct rq *rq;
482 struct list_head leaf_rt_rq_list; 451 struct list_head leaf_rt_rq_list;
483 struct task_group *tg; 452 struct task_group *tg;
484 struct sched_rt_entity *rt_se;
485#endif 453#endif
486}; 454};
487 455
@@ -534,7 +502,7 @@ static struct root_domain def_root_domain;
534 */ 502 */
535struct rq { 503struct rq {
536 /* runqueue lock: */ 504 /* runqueue lock: */
537 spinlock_t lock; 505 raw_spinlock_t lock;
538 506
539 /* 507 /*
540 * nr_running and cpu_load should be in the same cacheline because 508 * nr_running and cpu_load should be in the same cacheline because
@@ -544,14 +512,12 @@ struct rq {
544 #define CPU_LOAD_IDX_MAX 5 512 #define CPU_LOAD_IDX_MAX 5
545 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 513 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
546#ifdef CONFIG_NO_HZ 514#ifdef CONFIG_NO_HZ
547 unsigned long last_tick_seen;
548 unsigned char in_nohz_recently; 515 unsigned char in_nohz_recently;
549#endif 516#endif
550 /* capture load from *all* tasks on this cpu: */ 517 /* capture load from *all* tasks on this cpu: */
551 struct load_weight load; 518 struct load_weight load;
552 unsigned long nr_load_updates; 519 unsigned long nr_load_updates;
553 u64 nr_switches; 520 u64 nr_switches;
554 u64 nr_migrations_in;
555 521
556 struct cfs_rq cfs; 522 struct cfs_rq cfs;
557 struct rt_rq rt; 523 struct rt_rq rt;
@@ -601,6 +567,8 @@ struct rq {
601 567
602 u64 rt_avg; 568 u64 rt_avg;
603 u64 age_stamp; 569 u64 age_stamp;
570 u64 idle_stamp;
571 u64 avg_idle;
604#endif 572#endif
605 573
606 /* calc_load related fields */ 574 /* calc_load related fields */
@@ -655,6 +623,11 @@ static inline int cpu_of(struct rq *rq)
655#endif 623#endif
656} 624}
657 625
626#define rcu_dereference_check_sched_domain(p) \
627 rcu_dereference_check((p), \
628 rcu_read_lock_sched_held() || \
629 lockdep_is_held(&sched_domains_mutex))
630
658/* 631/*
659 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 632 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
660 * See detach_destroy_domains: synchronize_sched for details. 633 * See detach_destroy_domains: synchronize_sched for details.
@@ -663,7 +636,7 @@ static inline int cpu_of(struct rq *rq)
663 * preempt-disabled sections. 636 * preempt-disabled sections.
664 */ 637 */
665#define for_each_domain(cpu, __sd) \ 638#define for_each_domain(cpu, __sd) \
666 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 639 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
667 640
668#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 641#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
669#define this_rq() (&__get_cpu_var(runqueues)) 642#define this_rq() (&__get_cpu_var(runqueues))
@@ -695,7 +668,7 @@ inline void update_rq_clock(struct rq *rq)
695 */ 668 */
696int runqueue_is_locked(int cpu) 669int runqueue_is_locked(int cpu)
697{ 670{
698 return spin_is_locked(&cpu_rq(cpu)->lock); 671 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
699} 672}
700 673
701/* 674/*
@@ -782,7 +755,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
782 if (!sched_feat_names[i]) 755 if (!sched_feat_names[i])
783 return -EINVAL; 756 return -EINVAL;
784 757
785 filp->f_pos += cnt; 758 *ppos += cnt;
786 759
787 return cnt; 760 return cnt;
788} 761}
@@ -824,6 +797,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
824 * default: 0.25ms 797 * default: 0.25ms
825 */ 798 */
826unsigned int sysctl_sched_shares_ratelimit = 250000; 799unsigned int sysctl_sched_shares_ratelimit = 250000;
800unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
827 801
828/* 802/*
829 * Inject some fuzzyness into changing the per-cpu group shares 803 * Inject some fuzzyness into changing the per-cpu group shares
@@ -902,7 +876,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
902 */ 876 */
903 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 877 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
904 878
905 spin_unlock_irq(&rq->lock); 879 raw_spin_unlock_irq(&rq->lock);
906} 880}
907 881
908#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 882#else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -926,9 +900,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
926 next->oncpu = 1; 900 next->oncpu = 1;
927#endif 901#endif
928#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 902#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
929 spin_unlock_irq(&rq->lock); 903 raw_spin_unlock_irq(&rq->lock);
930#else 904#else
931 spin_unlock(&rq->lock); 905 raw_spin_unlock(&rq->lock);
932#endif 906#endif
933} 907}
934 908
@@ -950,18 +924,35 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
950#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 924#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
951 925
952/* 926/*
927 * Check whether the task is waking, we use this to synchronize against
928 * ttwu() so that task_cpu() reports a stable number.
929 *
930 * We need to make an exception for PF_STARTING tasks because the fork
931 * path might require task_rq_lock() to work, eg. it can call
932 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
933 */
934static inline int task_is_waking(struct task_struct *p)
935{
936 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
937}
938
939/*
953 * __task_rq_lock - lock the runqueue a given task resides on. 940 * __task_rq_lock - lock the runqueue a given task resides on.
954 * Must be called interrupts disabled. 941 * Must be called interrupts disabled.
955 */ 942 */
956static inline struct rq *__task_rq_lock(struct task_struct *p) 943static inline struct rq *__task_rq_lock(struct task_struct *p)
957 __acquires(rq->lock) 944 __acquires(rq->lock)
958{ 945{
946 struct rq *rq;
947
959 for (;;) { 948 for (;;) {
960 struct rq *rq = task_rq(p); 949 while (task_is_waking(p))
961 spin_lock(&rq->lock); 950 cpu_relax();
962 if (likely(rq == task_rq(p))) 951 rq = task_rq(p);
952 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p) && !task_is_waking(p)))
963 return rq; 954 return rq;
964 spin_unlock(&rq->lock); 955 raw_spin_unlock(&rq->lock);
965 } 956 }
966} 957}
967 958
@@ -976,12 +967,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
976 struct rq *rq; 967 struct rq *rq;
977 968
978 for (;;) { 969 for (;;) {
970 while (task_is_waking(p))
971 cpu_relax();
979 local_irq_save(*flags); 972 local_irq_save(*flags);
980 rq = task_rq(p); 973 rq = task_rq(p);
981 spin_lock(&rq->lock); 974 raw_spin_lock(&rq->lock);
982 if (likely(rq == task_rq(p))) 975 if (likely(rq == task_rq(p) && !task_is_waking(p)))
983 return rq; 976 return rq;
984 spin_unlock_irqrestore(&rq->lock, *flags); 977 raw_spin_unlock_irqrestore(&rq->lock, *flags);
985 } 978 }
986} 979}
987 980
@@ -990,19 +983,19 @@ void task_rq_unlock_wait(struct task_struct *p)
990 struct rq *rq = task_rq(p); 983 struct rq *rq = task_rq(p);
991 984
992 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 985 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
993 spin_unlock_wait(&rq->lock); 986 raw_spin_unlock_wait(&rq->lock);
994} 987}
995 988
996static void __task_rq_unlock(struct rq *rq) 989static void __task_rq_unlock(struct rq *rq)
997 __releases(rq->lock) 990 __releases(rq->lock)
998{ 991{
999 spin_unlock(&rq->lock); 992 raw_spin_unlock(&rq->lock);
1000} 993}
1001 994
1002static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 995static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1003 __releases(rq->lock) 996 __releases(rq->lock)
1004{ 997{
1005 spin_unlock_irqrestore(&rq->lock, *flags); 998 raw_spin_unlock_irqrestore(&rq->lock, *flags);
1006} 999}
1007 1000
1008/* 1001/*
@@ -1015,7 +1008,7 @@ static struct rq *this_rq_lock(void)
1015 1008
1016 local_irq_disable(); 1009 local_irq_disable();
1017 rq = this_rq(); 1010 rq = this_rq();
1018 spin_lock(&rq->lock); 1011 raw_spin_lock(&rq->lock);
1019 1012
1020 return rq; 1013 return rq;
1021} 1014}
@@ -1062,10 +1055,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1062 1055
1063 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1056 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1064 1057
1065 spin_lock(&rq->lock); 1058 raw_spin_lock(&rq->lock);
1066 update_rq_clock(rq); 1059 update_rq_clock(rq);
1067 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1060 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1068 spin_unlock(&rq->lock); 1061 raw_spin_unlock(&rq->lock);
1069 1062
1070 return HRTIMER_NORESTART; 1063 return HRTIMER_NORESTART;
1071} 1064}
@@ -1078,10 +1071,10 @@ static void __hrtick_start(void *arg)
1078{ 1071{
1079 struct rq *rq = arg; 1072 struct rq *rq = arg;
1080 1073
1081 spin_lock(&rq->lock); 1074 raw_spin_lock(&rq->lock);
1082 hrtimer_restart(&rq->hrtick_timer); 1075 hrtimer_restart(&rq->hrtick_timer);
1083 rq->hrtick_csd_pending = 0; 1076 rq->hrtick_csd_pending = 0;
1084 spin_unlock(&rq->lock); 1077 raw_spin_unlock(&rq->lock);
1085} 1078}
1086 1079
1087/* 1080/*
@@ -1188,7 +1181,7 @@ static void resched_task(struct task_struct *p)
1188{ 1181{
1189 int cpu; 1182 int cpu;
1190 1183
1191 assert_spin_locked(&task_rq(p)->lock); 1184 assert_raw_spin_locked(&task_rq(p)->lock);
1192 1185
1193 if (test_tsk_need_resched(p)) 1186 if (test_tsk_need_resched(p))
1194 return; 1187 return;
@@ -1210,10 +1203,10 @@ static void resched_cpu(int cpu)
1210 struct rq *rq = cpu_rq(cpu); 1203 struct rq *rq = cpu_rq(cpu);
1211 unsigned long flags; 1204 unsigned long flags;
1212 1205
1213 if (!spin_trylock_irqsave(&rq->lock, flags)) 1206 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1214 return; 1207 return;
1215 resched_task(cpu_curr(cpu)); 1208 resched_task(cpu_curr(cpu));
1216 spin_unlock_irqrestore(&rq->lock, flags); 1209 raw_spin_unlock_irqrestore(&rq->lock, flags);
1217} 1210}
1218 1211
1219#ifdef CONFIG_NO_HZ 1212#ifdef CONFIG_NO_HZ
@@ -1282,7 +1275,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1282#else /* !CONFIG_SMP */ 1275#else /* !CONFIG_SMP */
1283static void resched_task(struct task_struct *p) 1276static void resched_task(struct task_struct *p)
1284{ 1277{
1285 assert_spin_locked(&task_rq(p)->lock); 1278 assert_raw_spin_locked(&task_rq(p)->lock);
1286 set_tsk_need_resched(p); 1279 set_tsk_need_resched(p);
1287} 1280}
1288 1281
@@ -1399,32 +1392,6 @@ static const u32 prio_to_wmult[40] = {
1399 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1392 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1400}; 1393};
1401 1394
1402static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1403
1404/*
1405 * runqueue iterator, to support SMP load-balancing between different
1406 * scheduling classes, without having to expose their internal data
1407 * structures to the load-balancing proper:
1408 */
1409struct rq_iterator {
1410 void *arg;
1411 struct task_struct *(*start)(void *);
1412 struct task_struct *(*next)(void *);
1413};
1414
1415#ifdef CONFIG_SMP
1416static unsigned long
1417balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1418 unsigned long max_load_move, struct sched_domain *sd,
1419 enum cpu_idle_type idle, int *all_pinned,
1420 int *this_best_prio, struct rq_iterator *iterator);
1421
1422static int
1423iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1424 struct sched_domain *sd, enum cpu_idle_type idle,
1425 struct rq_iterator *iterator);
1426#endif
1427
1428/* Time spent by the tasks of the cpu accounting group executing in ... */ 1395/* Time spent by the tasks of the cpu accounting group executing in ... */
1429enum cpuacct_stat_index { 1396enum cpuacct_stat_index {
1430 CPUACCT_STAT_USER, /* ... user mode */ 1397 CPUACCT_STAT_USER, /* ... user mode */
@@ -1540,7 +1507,7 @@ static unsigned long target_load(int cpu, int type)
1540 1507
1541static struct sched_group *group_of(int cpu) 1508static struct sched_group *group_of(int cpu)
1542{ 1509{
1543 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1510 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1544 1511
1545 if (!sd) 1512 if (!sd)
1546 return NULL; 1513 return NULL;
@@ -1575,7 +1542,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1575 1542
1576#ifdef CONFIG_FAIR_GROUP_SCHED 1543#ifdef CONFIG_FAIR_GROUP_SCHED
1577 1544
1578static __read_mostly unsigned long *update_shares_data; 1545static __read_mostly unsigned long __percpu *update_shares_data;
1579 1546
1580static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1547static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1581 1548
@@ -1609,11 +1576,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1609 struct rq *rq = cpu_rq(cpu); 1576 struct rq *rq = cpu_rq(cpu);
1610 unsigned long flags; 1577 unsigned long flags;
1611 1578
1612 spin_lock_irqsave(&rq->lock, flags); 1579 raw_spin_lock_irqsave(&rq->lock, flags);
1613 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1580 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1614 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1581 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1615 __set_se_shares(tg->se[cpu], shares); 1582 __set_se_shares(tg->se[cpu], shares);
1616 spin_unlock_irqrestore(&rq->lock, flags); 1583 raw_spin_unlock_irqrestore(&rq->lock, flags);
1617 } 1584 }
1618} 1585}
1619 1586
@@ -1624,7 +1591,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1624 */ 1591 */
1625static int tg_shares_up(struct task_group *tg, void *data) 1592static int tg_shares_up(struct task_group *tg, void *data)
1626{ 1593{
1627 unsigned long weight, rq_weight = 0, shares = 0; 1594 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1628 unsigned long *usd_rq_weight; 1595 unsigned long *usd_rq_weight;
1629 struct sched_domain *sd = data; 1596 struct sched_domain *sd = data;
1630 unsigned long flags; 1597 unsigned long flags;
@@ -1640,6 +1607,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1640 weight = tg->cfs_rq[i]->load.weight; 1607 weight = tg->cfs_rq[i]->load.weight;
1641 usd_rq_weight[i] = weight; 1608 usd_rq_weight[i] = weight;
1642 1609
1610 rq_weight += weight;
1643 /* 1611 /*
1644 * If there are currently no tasks on the cpu pretend there 1612 * If there are currently no tasks on the cpu pretend there
1645 * is one of average load so that when a new task gets to 1613 * is one of average load so that when a new task gets to
@@ -1648,10 +1616,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1648 if (!weight) 1616 if (!weight)
1649 weight = NICE_0_LOAD; 1617 weight = NICE_0_LOAD;
1650 1618
1651 rq_weight += weight; 1619 sum_weight += weight;
1652 shares += tg->cfs_rq[i]->shares; 1620 shares += tg->cfs_rq[i]->shares;
1653 } 1621 }
1654 1622
1623 if (!rq_weight)
1624 rq_weight = sum_weight;
1625
1655 if ((!shares && rq_weight) || shares > tg->shares) 1626 if ((!shares && rq_weight) || shares > tg->shares)
1656 shares = tg->shares; 1627 shares = tg->shares;
1657 1628
@@ -1706,16 +1677,6 @@ static void update_shares(struct sched_domain *sd)
1706 } 1677 }
1707} 1678}
1708 1679
1709static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1710{
1711 if (root_task_group_empty())
1712 return;
1713
1714 spin_unlock(&rq->lock);
1715 update_shares(sd);
1716 spin_lock(&rq->lock);
1717}
1718
1719static void update_h_load(long cpu) 1680static void update_h_load(long cpu)
1720{ 1681{
1721 if (root_task_group_empty()) 1682 if (root_task_group_empty())
@@ -1730,10 +1691,6 @@ static inline void update_shares(struct sched_domain *sd)
1730{ 1691{
1731} 1692}
1732 1693
1733static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1734{
1735}
1736
1737#endif 1694#endif
1738 1695
1739#ifdef CONFIG_PREEMPT 1696#ifdef CONFIG_PREEMPT
@@ -1753,7 +1710,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1753 __acquires(busiest->lock) 1710 __acquires(busiest->lock)
1754 __acquires(this_rq->lock) 1711 __acquires(this_rq->lock)
1755{ 1712{
1756 spin_unlock(&this_rq->lock); 1713 raw_spin_unlock(&this_rq->lock);
1757 double_rq_lock(this_rq, busiest); 1714 double_rq_lock(this_rq, busiest);
1758 1715
1759 return 1; 1716 return 1;
@@ -1774,14 +1731,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1774{ 1731{
1775 int ret = 0; 1732 int ret = 0;
1776 1733
1777 if (unlikely(!spin_trylock(&busiest->lock))) { 1734 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1778 if (busiest < this_rq) { 1735 if (busiest < this_rq) {
1779 spin_unlock(&this_rq->lock); 1736 raw_spin_unlock(&this_rq->lock);
1780 spin_lock(&busiest->lock); 1737 raw_spin_lock(&busiest->lock);
1781 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1738 raw_spin_lock_nested(&this_rq->lock,
1739 SINGLE_DEPTH_NESTING);
1782 ret = 1; 1740 ret = 1;
1783 } else 1741 } else
1784 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1742 raw_spin_lock_nested(&busiest->lock,
1743 SINGLE_DEPTH_NESTING);
1785 } 1744 }
1786 return ret; 1745 return ret;
1787} 1746}
@@ -1795,7 +1754,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1795{ 1754{
1796 if (unlikely(!irqs_disabled())) { 1755 if (unlikely(!irqs_disabled())) {
1797 /* printk() doesn't work good under rq->lock */ 1756 /* printk() doesn't work good under rq->lock */
1798 spin_unlock(&this_rq->lock); 1757 raw_spin_unlock(&this_rq->lock);
1799 BUG_ON(1); 1758 BUG_ON(1);
1800 } 1759 }
1801 1760
@@ -1805,9 +1764,54 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1805static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1764static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1806 __releases(busiest->lock) 1765 __releases(busiest->lock)
1807{ 1766{
1808 spin_unlock(&busiest->lock); 1767 raw_spin_unlock(&busiest->lock);
1809 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1768 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1810} 1769}
1770
1771/*
1772 * double_rq_lock - safely lock two runqueues
1773 *
1774 * Note this does not disable interrupts like task_rq_lock,
1775 * you need to do so manually before calling.
1776 */
1777static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1778 __acquires(rq1->lock)
1779 __acquires(rq2->lock)
1780{
1781 BUG_ON(!irqs_disabled());
1782 if (rq1 == rq2) {
1783 raw_spin_lock(&rq1->lock);
1784 __acquire(rq2->lock); /* Fake it out ;) */
1785 } else {
1786 if (rq1 < rq2) {
1787 raw_spin_lock(&rq1->lock);
1788 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1789 } else {
1790 raw_spin_lock(&rq2->lock);
1791 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1792 }
1793 }
1794 update_rq_clock(rq1);
1795 update_rq_clock(rq2);
1796}
1797
1798/*
1799 * double_rq_unlock - safely unlock two runqueues
1800 *
1801 * Note this does not restore interrupts like task_rq_unlock,
1802 * you need to do so manually after calling.
1803 */
1804static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1805 __releases(rq1->lock)
1806 __releases(rq2->lock)
1807{
1808 raw_spin_unlock(&rq1->lock);
1809 if (rq1 != rq2)
1810 raw_spin_unlock(&rq2->lock);
1811 else
1812 __release(rq2->lock);
1813}
1814
1811#endif 1815#endif
1812 1816
1813#ifdef CONFIG_FAIR_GROUP_SCHED 1817#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1820,20 +1824,31 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1820#endif 1824#endif
1821 1825
1822static void calc_load_account_active(struct rq *this_rq); 1826static void calc_load_account_active(struct rq *this_rq);
1827static void update_sysctl(void);
1828static int get_update_sysctl_factor(void);
1823 1829
1824#include "sched_stats.h" 1830static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1825#include "sched_idletask.c" 1831{
1826#include "sched_fair.c" 1832 set_task_rq(p, cpu);
1827#include "sched_rt.c" 1833#ifdef CONFIG_SMP
1828#include "../litmus/sched_litmus.c" 1834 /*
1829#ifdef CONFIG_SCHED_DEBUG 1835 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1830# include "sched_debug.c" 1836 * successfuly executed on another CPU. We must ensure that updates of
1837 * per-task data have been completed by this moment.
1838 */
1839 smp_wmb();
1840 task_thread_info(p)->cpu = cpu;
1831#endif 1841#endif
1842}
1843
1844static const struct sched_class rt_sched_class;
1832 1845
1833#define sched_class_highest (&litmus_sched_class) 1846#define sched_class_highest (&litmus_sched_class)
1834#define for_each_class(class) \ 1847#define for_each_class(class) \
1835 for (class = sched_class_highest; class; class = class->next) 1848 for (class = sched_class_highest; class; class = class->next)
1836 1849
1850#include "sched_stats.h"
1851
1837static void inc_nr_running(struct rq *rq) 1852static void inc_nr_running(struct rq *rq)
1838{ 1853{
1839 rq->nr_running++; 1854 rq->nr_running++;
@@ -1871,13 +1886,14 @@ static void update_avg(u64 *avg, u64 sample)
1871 *avg += diff >> 3; 1886 *avg += diff >> 3;
1872} 1887}
1873 1888
1874static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1889static void
1890enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1875{ 1891{
1876 if (wakeup) 1892 if (wakeup)
1877 p->se.start_runtime = p->se.sum_exec_runtime; 1893 p->se.start_runtime = p->se.sum_exec_runtime;
1878 1894
1879 sched_info_queued(p); 1895 sched_info_queued(p);
1880 p->sched_class->enqueue_task(rq, p, wakeup); 1896 p->sched_class->enqueue_task(rq, p, wakeup, head);
1881 p->se.on_rq = 1; 1897 p->se.on_rq = 1;
1882} 1898}
1883 1899
@@ -1900,6 +1916,38 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1900} 1916}
1901 1917
1902/* 1918/*
1919 * activate_task - move a task to the runqueue.
1920 */
1921static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1922{
1923 if (task_contributes_to_load(p))
1924 rq->nr_uninterruptible--;
1925
1926 enqueue_task(rq, p, wakeup, false);
1927 inc_nr_running(rq);
1928}
1929
1930/*
1931 * deactivate_task - remove a task from the runqueue.
1932 */
1933static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1934{
1935 if (task_contributes_to_load(p))
1936 rq->nr_uninterruptible++;
1937
1938 dequeue_task(rq, p, sleep);
1939 dec_nr_running(rq);
1940}
1941
1942#include "sched_idletask.c"
1943#include "sched_fair.c"
1944#include "sched_rt.c"
1945#include "../litmus/sched_litmus.c"
1946#ifdef CONFIG_SCHED_DEBUG
1947# include "sched_debug.c"
1948#endif
1949
1950/*
1903 * __normal_prio - return the priority that is based on the static prio 1951 * __normal_prio - return the priority that is based on the static prio
1904 */ 1952 */
1905static inline int __normal_prio(struct task_struct *p) 1953static inline int __normal_prio(struct task_struct *p)
@@ -1945,30 +1993,6 @@ static int effective_prio(struct task_struct *p)
1945 return p->prio; 1993 return p->prio;
1946} 1994}
1947 1995
1948/*
1949 * activate_task - move a task to the runqueue.
1950 */
1951static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1952{
1953 if (task_contributes_to_load(p))
1954 rq->nr_uninterruptible--;
1955
1956 enqueue_task(rq, p, wakeup);
1957 inc_nr_running(rq);
1958}
1959
1960/*
1961 * deactivate_task - remove a task from the runqueue.
1962 */
1963static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible++;
1967
1968 dequeue_task(rq, p, sleep);
1969 dec_nr_running(rq);
1970}
1971
1972/** 1996/**
1973 * task_curr - is this task currently executing on a CPU? 1997 * task_curr - is this task currently executing on a CPU?
1974 * @p: the task in question. 1998 * @p: the task in question.
@@ -1978,20 +2002,6 @@ inline int task_curr(const struct task_struct *p)
1978 return cpu_curr(task_cpu(p)) == p; 2002 return cpu_curr(task_cpu(p)) == p;
1979} 2003}
1980 2004
1981static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1982{
1983 set_task_rq(p, cpu);
1984#ifdef CONFIG_SMP
1985 /*
1986 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1987 * successfuly executed on another CPU. We must ensure that updates of
1988 * per-task data have been completed by this moment.
1989 */
1990 smp_wmb();
1991 task_thread_info(p)->cpu = cpu;
1992#endif
1993}
1994
1995static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2005static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1996 const struct sched_class *prev_class, 2006 const struct sched_class *prev_class,
1997 int oldprio, int running) 2007 int oldprio, int running)
@@ -2004,38 +2014,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2004 p->sched_class->prio_changed(rq, p, oldprio, running); 2014 p->sched_class->prio_changed(rq, p, oldprio, running);
2005} 2015}
2006 2016
2007/**
2008 * kthread_bind - bind a just-created kthread to a cpu.
2009 * @p: thread created by kthread_create().
2010 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2011 *
2012 * Description: This function is equivalent to set_cpus_allowed(),
2013 * except that @cpu doesn't need to be online, and the thread must be
2014 * stopped (i.e., just returned from kthread_create()).
2015 *
2016 * Function lives here instead of kthread.c because it messes with
2017 * scheduler internals which require locking.
2018 */
2019void kthread_bind(struct task_struct *p, unsigned int cpu)
2020{
2021 struct rq *rq = cpu_rq(cpu);
2022 unsigned long flags;
2023
2024 /* Must have done schedule() in kthread() before we set_task_cpu */
2025 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2026 WARN_ON(1);
2027 return;
2028 }
2029
2030 spin_lock_irqsave(&rq->lock, flags);
2031 set_task_cpu(p, cpu);
2032 p->cpus_allowed = cpumask_of_cpu(cpu);
2033 p->rt.nr_cpus_allowed = 1;
2034 p->flags |= PF_THREAD_BOUND;
2035 spin_unlock_irqrestore(&rq->lock, flags);
2036}
2037EXPORT_SYMBOL(kthread_bind);
2038
2039#ifdef CONFIG_SMP 2017#ifdef CONFIG_SMP
2040/* 2018/*
2041 * Is this task likely cache-hot: 2019 * Is this task likely cache-hot:
@@ -2045,6 +2023,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2045{ 2023{
2046 s64 delta; 2024 s64 delta;
2047 2025
2026 if (p->sched_class != &fair_sched_class)
2027 return 0;
2028
2048 /* 2029 /*
2049 * Buddy candidates are cache hot: 2030 * Buddy candidates are cache hot:
2050 */ 2031 */
@@ -2053,9 +2034,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2053 &p->se == cfs_rq_of(&p->se)->last)) 2034 &p->se == cfs_rq_of(&p->se)->last))
2054 return 1; 2035 return 1;
2055 2036
2056 if (p->sched_class != &fair_sched_class)
2057 return 0;
2058
2059 if (sysctl_sched_migration_cost == -1) 2037 if (sysctl_sched_migration_cost == -1)
2060 return 1; 2038 return 1;
2061 if (sysctl_sched_migration_cost == 0) 2039 if (sysctl_sched_migration_cost == 0)
@@ -2066,39 +2044,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2066 return delta < (s64)sysctl_sched_migration_cost; 2044 return delta < (s64)sysctl_sched_migration_cost;
2067} 2045}
2068 2046
2069
2070void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2047void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2071{ 2048{
2072 int old_cpu = task_cpu(p); 2049#ifdef CONFIG_SCHED_DEBUG
2073 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 2050 /*
2074 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2051 * We should never call set_task_cpu() on a blocked task,
2075 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2052 * ttwu() will sort out the placement.
2076 u64 clock_offset; 2053 */
2077 2054 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2078 clock_offset = old_rq->clock - new_rq->clock; 2055 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2056#endif
2079 2057
2080 trace_sched_migrate_task(p, new_cpu); 2058 trace_sched_migrate_task(p, new_cpu);
2081 2059
2082#ifdef CONFIG_SCHEDSTATS 2060 if (task_cpu(p) != new_cpu) {
2083 if (p->se.wait_start)
2084 p->se.wait_start -= clock_offset;
2085 if (p->se.sleep_start)
2086 p->se.sleep_start -= clock_offset;
2087 if (p->se.block_start)
2088 p->se.block_start -= clock_offset;
2089#endif
2090 if (old_cpu != new_cpu) {
2091 p->se.nr_migrations++; 2061 p->se.nr_migrations++;
2092 new_rq->nr_migrations_in++; 2062 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2093#ifdef CONFIG_SCHEDSTATS
2094 if (task_hot(p, old_rq->clock, NULL))
2095 schedstat_inc(p, se.nr_forced2_migrations);
2096#endif
2097 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2098 1, 1, NULL, 0);
2099 } 2063 }
2100 p->se.vruntime -= old_cfsrq->min_vruntime -
2101 new_cfsrq->min_vruntime;
2102 2064
2103 __set_task_cpu(p, new_cpu); 2065 __set_task_cpu(p, new_cpu);
2104} 2066}
@@ -2123,12 +2085,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2123 2085
2124 /* 2086 /*
2125 * If the task is not on a runqueue (and not running), then 2087 * If the task is not on a runqueue (and not running), then
2126 * it is sufficient to simply update the task's cpu field. 2088 * the next wake-up will properly place the task.
2127 */ 2089 */
2128 if (!p->se.on_rq && !task_running(rq, p)) { 2090 if (!p->se.on_rq && !task_running(rq, p))
2129 set_task_cpu(p, dest_cpu);
2130 return 0; 2091 return 0;
2131 }
2132 2092
2133 init_completion(&req->done); 2093 init_completion(&req->done);
2134 req->task = p; 2094 req->task = p;
@@ -2333,6 +2293,75 @@ void task_oncpu_function_call(struct task_struct *p,
2333 preempt_enable(); 2293 preempt_enable();
2334} 2294}
2335 2295
2296#ifdef CONFIG_SMP
2297static int select_fallback_rq(int cpu, struct task_struct *p)
2298{
2299 int dest_cpu;
2300 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2301
2302 /* Look for allowed, online CPU in same node. */
2303 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2304 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2305 return dest_cpu;
2306
2307 /* Any allowed, online CPU? */
2308 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2309 if (dest_cpu < nr_cpu_ids)
2310 return dest_cpu;
2311
2312 /* No more Mr. Nice Guy. */
2313 if (dest_cpu >= nr_cpu_ids) {
2314 rcu_read_lock();
2315 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2316 rcu_read_unlock();
2317 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2318
2319 /*
2320 * Don't tell them about moving exiting tasks or
2321 * kernel threads (both mm NULL), since they never
2322 * leave kernel.
2323 */
2324 if (p->mm && printk_ratelimit()) {
2325 printk(KERN_INFO "process %d (%s) no "
2326 "longer affine to cpu%d\n",
2327 task_pid_nr(p), p->comm, cpu);
2328 }
2329 }
2330
2331 return dest_cpu;
2332}
2333
2334/*
2335 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2336 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2337 * by:
2338 *
2339 * exec: is unstable, retry loop
2340 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2341 */
2342static inline
2343int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2344{
2345 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2346
2347 /*
2348 * In order not to call set_task_cpu() on a blocking task we need
2349 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2350 * cpu.
2351 *
2352 * Since this is common to all placement strategies, this lives here.
2353 *
2354 * [ this allows ->select_task() to simply return task_cpu(p) and
2355 * not worry about this generic constraint ]
2356 */
2357 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2358 !cpu_online(cpu)))
2359 cpu = select_fallback_rq(task_cpu(p), p);
2360
2361 return cpu;
2362}
2363#endif
2364
2336/*** 2365/***
2337 * try_to_wake_up - wake up a thread 2366 * try_to_wake_up - wake up a thread
2338 * @p: the to-be-woken-up thread 2367 * @p: the to-be-woken-up thread
@@ -2352,7 +2381,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2352{ 2381{
2353 int cpu, orig_cpu, this_cpu, success = 0; 2382 int cpu, orig_cpu, this_cpu, success = 0;
2354 unsigned long flags; 2383 unsigned long flags;
2355 struct rq *rq, *orig_rq; 2384 struct rq *rq;
2356 2385
2357 if (is_realtime(p)) 2386 if (is_realtime(p))
2358 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); 2387 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
@@ -2363,7 +2392,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2363 this_cpu = get_cpu(); 2392 this_cpu = get_cpu();
2364 2393
2365 smp_wmb(); 2394 smp_wmb();
2366 rq = orig_rq = task_rq_lock(p, &flags); 2395 rq = task_rq_lock(p, &flags);
2367 update_rq_clock(rq); 2396 update_rq_clock(rq);
2368 if (!(p->state & state)) 2397 if (!(p->state & state))
2369 goto out; 2398 goto out;
@@ -2387,19 +2416,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2387 if (task_contributes_to_load(p)) 2416 if (task_contributes_to_load(p))
2388 rq->nr_uninterruptible--; 2417 rq->nr_uninterruptible--;
2389 p->state = TASK_WAKING; 2418 p->state = TASK_WAKING;
2390 task_rq_unlock(rq, &flags);
2391 2419
2392 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2420 if (p->sched_class->task_waking)
2393 if (cpu != orig_cpu) 2421 p->sched_class->task_waking(rq, p);
2394 set_task_cpu(p, cpu);
2395 2422
2396 rq = task_rq_lock(p, &flags); 2423 __task_rq_unlock(rq);
2397 2424
2398 if (rq != orig_rq) 2425 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2399 update_rq_clock(rq); 2426 if (cpu != orig_cpu) {
2427 /*
2428 * Since we migrate the task without holding any rq->lock,
2429 * we need to be careful with task_rq_lock(), since that
2430 * might end up locking an invalid rq.
2431 */
2432 set_task_cpu(p, cpu);
2433 }
2400 2434
2435 rq = cpu_rq(cpu);
2436 raw_spin_lock(&rq->lock);
2437 update_rq_clock(rq);
2438
2439 /*
2440 * We migrated the task without holding either rq->lock, however
2441 * since the task is not on the task list itself, nobody else
2442 * will try and migrate the task, hence the rq should match the
2443 * cpu we just moved it to.
2444 */
2445 WARN_ON(task_cpu(p) != cpu);
2401 WARN_ON(p->state != TASK_WAKING); 2446 WARN_ON(p->state != TASK_WAKING);
2402 cpu = task_cpu(p);
2403 2447
2404#ifdef CONFIG_SCHEDSTATS 2448#ifdef CONFIG_SCHEDSTATS
2405 schedstat_inc(rq, ttwu_count); 2449 schedstat_inc(rq, ttwu_count);
@@ -2452,8 +2496,19 @@ out_running:
2452 2496
2453 p->state = TASK_RUNNING; 2497 p->state = TASK_RUNNING;
2454#ifdef CONFIG_SMP 2498#ifdef CONFIG_SMP
2455 if (p->sched_class->task_wake_up) 2499 if (p->sched_class->task_woken)
2456 p->sched_class->task_wake_up(rq, p); 2500 p->sched_class->task_woken(rq, p);
2501
2502 if (unlikely(rq->idle_stamp)) {
2503 u64 delta = rq->clock - rq->idle_stamp;
2504 u64 max = 2*sysctl_sched_migration_cost;
2505
2506 if (delta > max)
2507 rq->avg_idle = max;
2508 else
2509 update_avg(&rq->avg_idle, delta);
2510 rq->idle_stamp = 0;
2511 }
2457#endif 2512#endif
2458out: 2513out:
2459 if (is_realtime(p)) 2514 if (is_realtime(p))
@@ -2502,7 +2557,6 @@ static void __sched_fork(struct task_struct *p)
2502 p->se.avg_overlap = 0; 2557 p->se.avg_overlap = 0;
2503 p->se.start_runtime = 0; 2558 p->se.start_runtime = 0;
2504 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2559 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2505 p->se.avg_running = 0;
2506 2560
2507#ifdef CONFIG_SCHEDSTATS 2561#ifdef CONFIG_SCHEDSTATS
2508 p->se.wait_start = 0; 2562 p->se.wait_start = 0;
@@ -2524,7 +2578,6 @@ static void __sched_fork(struct task_struct *p)
2524 p->se.nr_failed_migrations_running = 0; 2578 p->se.nr_failed_migrations_running = 0;
2525 p->se.nr_failed_migrations_hot = 0; 2579 p->se.nr_failed_migrations_hot = 0;
2526 p->se.nr_forced_migrations = 0; 2580 p->se.nr_forced_migrations = 0;
2527 p->se.nr_forced2_migrations = 0;
2528 2581
2529 p->se.nr_wakeups = 0; 2582 p->se.nr_wakeups = 0;
2530 p->se.nr_wakeups_sync = 0; 2583 p->se.nr_wakeups_sync = 0;
@@ -2545,14 +2598,6 @@ static void __sched_fork(struct task_struct *p)
2545#ifdef CONFIG_PREEMPT_NOTIFIERS 2598#ifdef CONFIG_PREEMPT_NOTIFIERS
2546 INIT_HLIST_HEAD(&p->preempt_notifiers); 2599 INIT_HLIST_HEAD(&p->preempt_notifiers);
2547#endif 2600#endif
2548
2549 /*
2550 * We mark the process as running here, but have not actually
2551 * inserted it onto the runqueue yet. This guarantees that
2552 * nobody will actually run it, and a signal or other external
2553 * event cannot wake it up and insert it on the runqueue either.
2554 */
2555 p->state = TASK_RUNNING;
2556} 2601}
2557 2602
2558/* 2603/*
@@ -2563,6 +2608,12 @@ void sched_fork(struct task_struct *p, int clone_flags)
2563 int cpu = get_cpu(); 2608 int cpu = get_cpu();
2564 2609
2565 __sched_fork(p); 2610 __sched_fork(p);
2611 /*
2612 * We mark the process as waking here. This guarantees that
2613 * nobody will actually run it, and a signal or other external
2614 * event cannot wake it up and insert it on the runqueue either.
2615 */
2616 p->state = TASK_WAKING;
2566 2617
2567 /* 2618 /*
2568 * Revert to default priority/policy on fork if requested. 2619 * Revert to default priority/policy on fork if requested.
@@ -2594,9 +2645,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2594 if (!rt_prio(p->prio)) 2645 if (!rt_prio(p->prio))
2595 p->sched_class = &fair_sched_class; 2646 p->sched_class = &fair_sched_class;
2596 2647
2597#ifdef CONFIG_SMP 2648 if (p->sched_class->task_fork)
2598 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2649 p->sched_class->task_fork(p);
2599#endif 2650
2600 set_task_cpu(p, cpu); 2651 set_task_cpu(p, cpu);
2601 2652
2602#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2653#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2626,28 +2677,41 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2626{ 2677{
2627 unsigned long flags; 2678 unsigned long flags;
2628 struct rq *rq; 2679 struct rq *rq;
2680 int cpu __maybe_unused = get_cpu();
2629 2681
2630 rq = task_rq_lock(p, &flags); 2682#ifdef CONFIG_SMP
2631 BUG_ON(p->state != TASK_RUNNING); 2683 /*
2632 update_rq_clock(rq); 2684 * Fork balancing, do it here and not earlier because:
2685 * - cpus_allowed can change in the fork path
2686 * - any previously selected cpu might disappear through hotplug
2687 *
2688 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2689 * ->cpus_allowed is stable, we have preemption disabled, meaning
2690 * cpu_online_mask is stable.
2691 */
2692 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2693 set_task_cpu(p, cpu);
2694#endif
2633 2695
2634 if (!p->sched_class->task_new || !current->se.on_rq) { 2696 /*
2635 activate_task(rq, p, 0); 2697 * Since the task is not on the rq and we still have TASK_WAKING set
2636 } else { 2698 * nobody else will migrate this task.
2637 /* 2699 */
2638 * Let the scheduling class do new task startup 2700 rq = cpu_rq(cpu);
2639 * management (if any): 2701 raw_spin_lock_irqsave(&rq->lock, flags);
2640 */ 2702
2641 p->sched_class->task_new(rq, p); 2703 BUG_ON(p->state != TASK_WAKING);
2642 inc_nr_running(rq); 2704 p->state = TASK_RUNNING;
2643 } 2705 update_rq_clock(rq);
2706 activate_task(rq, p, 0);
2644 trace_sched_wakeup_new(rq, p, 1); 2707 trace_sched_wakeup_new(rq, p, 1);
2645 check_preempt_curr(rq, p, WF_FORK); 2708 check_preempt_curr(rq, p, WF_FORK);
2646#ifdef CONFIG_SMP 2709#ifdef CONFIG_SMP
2647 if (p->sched_class->task_wake_up) 2710 if (p->sched_class->task_woken)
2648 p->sched_class->task_wake_up(rq, p); 2711 p->sched_class->task_woken(rq, p);
2649#endif 2712#endif
2650 task_rq_unlock(rq, &flags); 2713 task_rq_unlock(rq, &flags);
2714 put_cpu();
2651} 2715}
2652 2716
2653#ifdef CONFIG_PREEMPT_NOTIFIERS 2717#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2768,7 +2832,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2768 finish_arch_switch(prev); 2832 finish_arch_switch(prev);
2769 litmus->finish_switch(prev); 2833 litmus->finish_switch(prev);
2770 prev->rt_param.stack_in_use = NO_CPU; 2834 prev->rt_param.stack_in_use = NO_CPU;
2771 perf_event_task_sched_in(current, cpu_of(rq)); 2835#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2836 local_irq_disable();
2837#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2838 perf_event_task_sched_in(current);
2839#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2840 local_irq_enable();
2841#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2772 finish_lock_switch(rq, prev); 2842 finish_lock_switch(rq, prev);
2773 2843
2774 fire_sched_in_preempt_notifiers(current); 2844 fire_sched_in_preempt_notifiers(current);
@@ -2808,10 +2878,10 @@ static inline void post_schedule(struct rq *rq)
2808 if (rq->post_schedule) { 2878 if (rq->post_schedule) {
2809 unsigned long flags; 2879 unsigned long flags;
2810 2880
2811 spin_lock_irqsave(&rq->lock, flags); 2881 raw_spin_lock_irqsave(&rq->lock, flags);
2812 if (rq->curr->sched_class->post_schedule) 2882 if (rq->curr->sched_class->post_schedule)
2813 rq->curr->sched_class->post_schedule(rq); 2883 rq->curr->sched_class->post_schedule(rq);
2814 spin_unlock_irqrestore(&rq->lock, flags); 2884 raw_spin_unlock_irqrestore(&rq->lock, flags);
2815 2885
2816 rq->post_schedule = 0; 2886 rq->post_schedule = 0;
2817 } 2887 }
@@ -2875,14 +2945,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2875 */ 2945 */
2876 arch_start_context_switch(prev); 2946 arch_start_context_switch(prev);
2877 2947
2878 if (unlikely(!mm)) { 2948 if (likely(!mm)) {
2879 next->active_mm = oldmm; 2949 next->active_mm = oldmm;
2880 atomic_inc(&oldmm->mm_count); 2950 atomic_inc(&oldmm->mm_count);
2881 enter_lazy_tlb(oldmm, next); 2951 enter_lazy_tlb(oldmm, next);
2882 } else 2952 } else
2883 switch_mm(oldmm, mm, next); 2953 switch_mm(oldmm, mm, next);
2884 2954
2885 if (unlikely(!prev->mm)) { 2955 if (likely(!prev->mm)) {
2886 prev->active_mm = NULL; 2956 prev->active_mm = NULL;
2887 rq->prev_mm = oldmm; 2957 rq->prev_mm = oldmm;
2888 } 2958 }
@@ -3045,15 +3115,6 @@ static void calc_load_account_active(struct rq *this_rq)
3045} 3115}
3046 3116
3047/* 3117/*
3048 * Externally visible per-cpu scheduler statistics:
3049 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3050 */
3051u64 cpu_nr_migrations(int cpu)
3052{
3053 return cpu_rq(cpu)->nr_migrations_in;
3054}
3055
3056/*
3057 * Update rq->cpu_load[] statistics. This function is usually called every 3118 * Update rq->cpu_load[] statistics. This function is usually called every
3058 * scheduler tick (TICK_NSEC). 3119 * scheduler tick (TICK_NSEC).
3059 */ 3120 */
@@ -3091,65 +3152,36 @@ static void update_cpu_load(struct rq *this_rq)
3091#ifdef CONFIG_SMP 3152#ifdef CONFIG_SMP
3092 3153
3093/* 3154/*
3094 * double_rq_lock - safely lock two runqueues 3155 * sched_exec - execve() is a valuable balancing opportunity, because at
3095 * 3156 * this point the task has the smallest effective memory and cache footprint.
3096 * Note this does not disable interrupts like task_rq_lock,
3097 * you need to do so manually before calling.
3098 */
3099static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3100 __acquires(rq1->lock)
3101 __acquires(rq2->lock)
3102{
3103 BUG_ON(!irqs_disabled());
3104 if (rq1 == rq2) {
3105 spin_lock(&rq1->lock);
3106 __acquire(rq2->lock); /* Fake it out ;) */
3107 } else {
3108 if (rq1 < rq2) {
3109 spin_lock(&rq1->lock);
3110 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3111 } else {
3112 spin_lock(&rq2->lock);
3113 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3114 }
3115 }
3116 update_rq_clock(rq1);
3117 update_rq_clock(rq2);
3118}
3119
3120/*
3121 * double_rq_unlock - safely unlock two runqueues
3122 *
3123 * Note this does not restore interrupts like task_rq_unlock,
3124 * you need to do so manually after calling.
3125 */
3126static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3127 __releases(rq1->lock)
3128 __releases(rq2->lock)
3129{
3130 spin_unlock(&rq1->lock);
3131 if (rq1 != rq2)
3132 spin_unlock(&rq2->lock);
3133 else
3134 __release(rq2->lock);
3135}
3136
3137/*
3138 * If dest_cpu is allowed for this process, migrate the task to it.
3139 * This is accomplished by forcing the cpu_allowed mask to only
3140 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3141 * the cpu_allowed mask is restored.
3142 */ 3157 */
3143static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3158void sched_exec(void)
3144{ 3159{
3160 struct task_struct *p = current;
3145 struct migration_req req; 3161 struct migration_req req;
3162 int dest_cpu, this_cpu;
3146 unsigned long flags; 3163 unsigned long flags;
3147 struct rq *rq; 3164 struct rq *rq;
3148 3165
3166again:
3167 this_cpu = get_cpu();
3168 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3169 if (dest_cpu == this_cpu) {
3170 put_cpu();
3171 return;
3172 }
3173
3149 rq = task_rq_lock(p, &flags); 3174 rq = task_rq_lock(p, &flags);
3175 put_cpu();
3176
3177 /*
3178 * select_task_rq() can race against ->cpus_allowed
3179 */
3150 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3180 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3151 || unlikely(!cpu_active(dest_cpu))) 3181 || unlikely(!cpu_active(dest_cpu))) {
3152 goto out; 3182 task_rq_unlock(rq, &flags);
3183 goto again;
3184 }
3153 3185
3154 /* force the process onto the specified CPU */ 3186 /* force the process onto the specified CPU */
3155 if (migrate_task(p, dest_cpu, &req)) { 3187 if (migrate_task(p, dest_cpu, &req)) {
@@ -3164,1784 +3196,9 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3164 3196
3165 return; 3197 return;
3166 } 3198 }
3167out:
3168 task_rq_unlock(rq, &flags); 3199 task_rq_unlock(rq, &flags);
3169} 3200}
3170 3201
3171/*
3172 * sched_exec - execve() is a valuable balancing opportunity, because at
3173 * this point the task has the smallest effective memory and cache footprint.
3174 */
3175void sched_exec(void)
3176{
3177 int new_cpu, this_cpu = get_cpu();
3178 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3179 put_cpu();
3180 if (new_cpu != this_cpu)
3181 sched_migrate_task(current, new_cpu);
3182}
3183
3184/*
3185 * pull_task - move a task from a remote runqueue to the local runqueue.
3186 * Both runqueues must be locked.
3187 */
3188static void pull_task(struct rq *src_rq, struct task_struct *p,
3189 struct rq *this_rq, int this_cpu)
3190{
3191 deactivate_task(src_rq, p, 0);
3192 set_task_cpu(p, this_cpu);
3193 activate_task(this_rq, p, 0);
3194 /*
3195 * Note that idle threads have a prio of MAX_PRIO, for this test
3196 * to be always true for them.
3197 */
3198 check_preempt_curr(this_rq, p, 0);
3199}
3200
3201/*
3202 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3203 */
3204static
3205int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3206 struct sched_domain *sd, enum cpu_idle_type idle,
3207 int *all_pinned)
3208{
3209 int tsk_cache_hot = 0;
3210 /*
3211 * We do not migrate tasks that are:
3212 * 1) running (obviously), or
3213 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3214 * 3) are cache-hot on their current CPU.
3215 */
3216 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3217 schedstat_inc(p, se.nr_failed_migrations_affine);
3218 return 0;
3219 }
3220 *all_pinned = 0;
3221
3222 if (task_running(rq, p)) {
3223 schedstat_inc(p, se.nr_failed_migrations_running);
3224 return 0;
3225 }
3226
3227 /*
3228 * Aggressive migration if:
3229 * 1) task is cache cold, or
3230 * 2) too many balance attempts have failed.
3231 */
3232
3233 tsk_cache_hot = task_hot(p, rq->clock, sd);
3234 if (!tsk_cache_hot ||
3235 sd->nr_balance_failed > sd->cache_nice_tries) {
3236#ifdef CONFIG_SCHEDSTATS
3237 if (tsk_cache_hot) {
3238 schedstat_inc(sd, lb_hot_gained[idle]);
3239 schedstat_inc(p, se.nr_forced_migrations);
3240 }
3241#endif
3242 return 1;
3243 }
3244
3245 if (tsk_cache_hot) {
3246 schedstat_inc(p, se.nr_failed_migrations_hot);
3247 return 0;
3248 }
3249 return 1;
3250}
3251
3252static unsigned long
3253balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3254 unsigned long max_load_move, struct sched_domain *sd,
3255 enum cpu_idle_type idle, int *all_pinned,
3256 int *this_best_prio, struct rq_iterator *iterator)
3257{
3258 int loops = 0, pulled = 0, pinned = 0;
3259 struct task_struct *p;
3260 long rem_load_move = max_load_move;
3261
3262 if (max_load_move == 0)
3263 goto out;
3264
3265 pinned = 1;
3266
3267 /*
3268 * Start the load-balancing iterator:
3269 */
3270 p = iterator->start(iterator->arg);
3271next:
3272 if (!p || loops++ > sysctl_sched_nr_migrate)
3273 goto out;
3274
3275 if ((p->se.load.weight >> 1) > rem_load_move ||
3276 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3277 p = iterator->next(iterator->arg);
3278 goto next;
3279 }
3280
3281 pull_task(busiest, p, this_rq, this_cpu);
3282 pulled++;
3283 rem_load_move -= p->se.load.weight;
3284
3285#ifdef CONFIG_PREEMPT
3286 /*
3287 * NEWIDLE balancing is a source of latency, so preemptible kernels
3288 * will stop after the first task is pulled to minimize the critical
3289 * section.
3290 */
3291 if (idle == CPU_NEWLY_IDLE)
3292 goto out;
3293#endif
3294
3295 /*
3296 * We only want to steal up to the prescribed amount of weighted load.
3297 */
3298 if (rem_load_move > 0) {
3299 if (p->prio < *this_best_prio)
3300 *this_best_prio = p->prio;
3301 p = iterator->next(iterator->arg);
3302 goto next;
3303 }
3304out:
3305 /*
3306 * Right now, this is one of only two places pull_task() is called,
3307 * so we can safely collect pull_task() stats here rather than
3308 * inside pull_task().
3309 */
3310 schedstat_add(sd, lb_gained[idle], pulled);
3311
3312 if (all_pinned)
3313 *all_pinned = pinned;
3314
3315 return max_load_move - rem_load_move;
3316}
3317
3318/*
3319 * move_tasks tries to move up to max_load_move weighted load from busiest to
3320 * this_rq, as part of a balancing operation within domain "sd".
3321 * Returns 1 if successful and 0 otherwise.
3322 *
3323 * Called with both runqueues locked.
3324 */
3325static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3326 unsigned long max_load_move,
3327 struct sched_domain *sd, enum cpu_idle_type idle,
3328 int *all_pinned)
3329{
3330 const struct sched_class *class = sched_class_highest;
3331 unsigned long total_load_moved = 0;
3332 int this_best_prio = this_rq->curr->prio;
3333
3334 do {
3335 total_load_moved +=
3336 class->load_balance(this_rq, this_cpu, busiest,
3337 max_load_move - total_load_moved,
3338 sd, idle, all_pinned, &this_best_prio);
3339 class = class->next;
3340
3341#ifdef CONFIG_PREEMPT
3342 /*
3343 * NEWIDLE balancing is a source of latency, so preemptible
3344 * kernels will stop after the first task is pulled to minimize
3345 * the critical section.
3346 */
3347 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3348 break;
3349#endif
3350 } while (class && max_load_move > total_load_moved);
3351
3352 return total_load_moved > 0;
3353}
3354
3355static int
3356iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3357 struct sched_domain *sd, enum cpu_idle_type idle,
3358 struct rq_iterator *iterator)
3359{
3360 struct task_struct *p = iterator->start(iterator->arg);
3361 int pinned = 0;
3362
3363 while (p) {
3364 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3365 pull_task(busiest, p, this_rq, this_cpu);
3366 /*
3367 * Right now, this is only the second place pull_task()
3368 * is called, so we can safely collect pull_task()
3369 * stats here rather than inside pull_task().
3370 */
3371 schedstat_inc(sd, lb_gained[idle]);
3372
3373 return 1;
3374 }
3375 p = iterator->next(iterator->arg);
3376 }
3377
3378 return 0;
3379}
3380
3381/*
3382 * move_one_task tries to move exactly one task from busiest to this_rq, as
3383 * part of active balancing operations within "domain".
3384 * Returns 1 if successful and 0 otherwise.
3385 *
3386 * Called with both runqueues locked.
3387 */
3388static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3389 struct sched_domain *sd, enum cpu_idle_type idle)
3390{
3391 const struct sched_class *class;
3392
3393 for_each_class(class) {
3394 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3395 return 1;
3396 }
3397
3398 return 0;
3399}
3400/********** Helpers for find_busiest_group ************************/
3401/*
3402 * sd_lb_stats - Structure to store the statistics of a sched_domain
3403 * during load balancing.
3404 */
3405struct sd_lb_stats {
3406 struct sched_group *busiest; /* Busiest group in this sd */
3407 struct sched_group *this; /* Local group in this sd */
3408 unsigned long total_load; /* Total load of all groups in sd */
3409 unsigned long total_pwr; /* Total power of all groups in sd */
3410 unsigned long avg_load; /* Average load across all groups in sd */
3411
3412 /** Statistics of this group */
3413 unsigned long this_load;
3414 unsigned long this_load_per_task;
3415 unsigned long this_nr_running;
3416
3417 /* Statistics of the busiest group */
3418 unsigned long max_load;
3419 unsigned long busiest_load_per_task;
3420 unsigned long busiest_nr_running;
3421
3422 int group_imb; /* Is there imbalance in this sd */
3423#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3424 int power_savings_balance; /* Is powersave balance needed for this sd */
3425 struct sched_group *group_min; /* Least loaded group in sd */
3426 struct sched_group *group_leader; /* Group which relieves group_min */
3427 unsigned long min_load_per_task; /* load_per_task in group_min */
3428 unsigned long leader_nr_running; /* Nr running of group_leader */
3429 unsigned long min_nr_running; /* Nr running of group_min */
3430#endif
3431};
3432
3433/*
3434 * sg_lb_stats - stats of a sched_group required for load_balancing
3435 */
3436struct sg_lb_stats {
3437 unsigned long avg_load; /*Avg load across the CPUs of the group */
3438 unsigned long group_load; /* Total load over the CPUs of the group */
3439 unsigned long sum_nr_running; /* Nr tasks running in the group */
3440 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3441 unsigned long group_capacity;
3442 int group_imb; /* Is there an imbalance in the group ? */
3443};
3444
3445/**
3446 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3447 * @group: The group whose first cpu is to be returned.
3448 */
3449static inline unsigned int group_first_cpu(struct sched_group *group)
3450{
3451 return cpumask_first(sched_group_cpus(group));
3452}
3453
3454/**
3455 * get_sd_load_idx - Obtain the load index for a given sched domain.
3456 * @sd: The sched_domain whose load_idx is to be obtained.
3457 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
3458 */
3459static inline int get_sd_load_idx(struct sched_domain *sd,
3460 enum cpu_idle_type idle)
3461{
3462 int load_idx;
3463
3464 switch (idle) {
3465 case CPU_NOT_IDLE:
3466 load_idx = sd->busy_idx;
3467 break;
3468
3469 case CPU_NEWLY_IDLE:
3470 load_idx = sd->newidle_idx;
3471 break;
3472 default:
3473 load_idx = sd->idle_idx;
3474 break;
3475 }
3476
3477 return load_idx;
3478}
3479
3480
3481#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3482/**
3483 * init_sd_power_savings_stats - Initialize power savings statistics for
3484 * the given sched_domain, during load balancing.
3485 *
3486 * @sd: Sched domain whose power-savings statistics are to be initialized.
3487 * @sds: Variable containing the statistics for sd.
3488 * @idle: Idle status of the CPU at which we're performing load-balancing.
3489 */
3490static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3491 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3492{
3493 /*
3494 * Busy processors will not participate in power savings
3495 * balance.
3496 */
3497 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3498 sds->power_savings_balance = 0;
3499 else {
3500 sds->power_savings_balance = 1;
3501 sds->min_nr_running = ULONG_MAX;
3502 sds->leader_nr_running = 0;
3503 }
3504}
3505
3506/**
3507 * update_sd_power_savings_stats - Update the power saving stats for a
3508 * sched_domain while performing load balancing.
3509 *
3510 * @group: sched_group belonging to the sched_domain under consideration.
3511 * @sds: Variable containing the statistics of the sched_domain
3512 * @local_group: Does group contain the CPU for which we're performing
3513 * load balancing ?
3514 * @sgs: Variable containing the statistics of the group.
3515 */
3516static inline void update_sd_power_savings_stats(struct sched_group *group,
3517 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3518{
3519
3520 if (!sds->power_savings_balance)
3521 return;
3522
3523 /*
3524 * If the local group is idle or completely loaded
3525 * no need to do power savings balance at this domain
3526 */
3527 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3528 !sds->this_nr_running))
3529 sds->power_savings_balance = 0;
3530
3531 /*
3532 * If a group is already running at full capacity or idle,
3533 * don't include that group in power savings calculations
3534 */
3535 if (!sds->power_savings_balance ||
3536 sgs->sum_nr_running >= sgs->group_capacity ||
3537 !sgs->sum_nr_running)
3538 return;
3539
3540 /*
3541 * Calculate the group which has the least non-idle load.
3542 * This is the group from where we need to pick up the load
3543 * for saving power
3544 */
3545 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3546 (sgs->sum_nr_running == sds->min_nr_running &&
3547 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3548 sds->group_min = group;
3549 sds->min_nr_running = sgs->sum_nr_running;
3550 sds->min_load_per_task = sgs->sum_weighted_load /
3551 sgs->sum_nr_running;
3552 }
3553
3554 /*
3555 * Calculate the group which is almost near its
3556 * capacity but still has some space to pick up some load
3557 * from other group and save more power
3558 */
3559 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3560 return;
3561
3562 if (sgs->sum_nr_running > sds->leader_nr_running ||
3563 (sgs->sum_nr_running == sds->leader_nr_running &&
3564 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3565 sds->group_leader = group;
3566 sds->leader_nr_running = sgs->sum_nr_running;
3567 }
3568}
3569
3570/**
3571 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3572 * @sds: Variable containing the statistics of the sched_domain
3573 * under consideration.
3574 * @this_cpu: Cpu at which we're currently performing load-balancing.
3575 * @imbalance: Variable to store the imbalance.
3576 *
3577 * Description:
3578 * Check if we have potential to perform some power-savings balance.
3579 * If yes, set the busiest group to be the least loaded group in the
3580 * sched_domain, so that its CPUs can be put to idle.
3581 *
3582 * Returns 1 if there is potential to perform power-savings balance.
3583 * Else returns 0.
3584 */
3585static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3586 int this_cpu, unsigned long *imbalance)
3587{
3588 if (!sds->power_savings_balance)
3589 return 0;
3590
3591 if (sds->this != sds->group_leader ||
3592 sds->group_leader == sds->group_min)
3593 return 0;
3594
3595 *imbalance = sds->min_load_per_task;
3596 sds->busiest = sds->group_min;
3597
3598 return 1;
3599
3600}
3601#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3602static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3603 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3604{
3605 return;
3606}
3607
3608static inline void update_sd_power_savings_stats(struct sched_group *group,
3609 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3610{
3611 return;
3612}
3613
3614static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3615 int this_cpu, unsigned long *imbalance)
3616{
3617 return 0;
3618}
3619#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3620
3621
3622unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3623{
3624 return SCHED_LOAD_SCALE;
3625}
3626
3627unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3628{
3629 return default_scale_freq_power(sd, cpu);
3630}
3631
3632unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3633{
3634 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3635 unsigned long smt_gain = sd->smt_gain;
3636
3637 smt_gain /= weight;
3638
3639 return smt_gain;
3640}
3641
3642unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3643{
3644 return default_scale_smt_power(sd, cpu);
3645}
3646
3647unsigned long scale_rt_power(int cpu)
3648{
3649 struct rq *rq = cpu_rq(cpu);
3650 u64 total, available;
3651
3652 sched_avg_update(rq);
3653
3654 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3655 available = total - rq->rt_avg;
3656
3657 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3658 total = SCHED_LOAD_SCALE;
3659
3660 total >>= SCHED_LOAD_SHIFT;
3661
3662 return div_u64(available, total);
3663}
3664
3665static void update_cpu_power(struct sched_domain *sd, int cpu)
3666{
3667 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3668 unsigned long power = SCHED_LOAD_SCALE;
3669 struct sched_group *sdg = sd->groups;
3670
3671 if (sched_feat(ARCH_POWER))
3672 power *= arch_scale_freq_power(sd, cpu);
3673 else
3674 power *= default_scale_freq_power(sd, cpu);
3675
3676 power >>= SCHED_LOAD_SHIFT;
3677
3678 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3679 if (sched_feat(ARCH_POWER))
3680 power *= arch_scale_smt_power(sd, cpu);
3681 else
3682 power *= default_scale_smt_power(sd, cpu);
3683
3684 power >>= SCHED_LOAD_SHIFT;
3685 }
3686
3687 power *= scale_rt_power(cpu);
3688 power >>= SCHED_LOAD_SHIFT;
3689
3690 if (!power)
3691 power = 1;
3692
3693 sdg->cpu_power = power;
3694}
3695
3696static void update_group_power(struct sched_domain *sd, int cpu)
3697{
3698 struct sched_domain *child = sd->child;
3699 struct sched_group *group, *sdg = sd->groups;
3700 unsigned long power;
3701
3702 if (!child) {
3703 update_cpu_power(sd, cpu);
3704 return;
3705 }
3706
3707 power = 0;
3708
3709 group = child->groups;
3710 do {
3711 power += group->cpu_power;
3712 group = group->next;
3713 } while (group != child->groups);
3714
3715 sdg->cpu_power = power;
3716}
3717
3718/**
3719 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3720 * @sd: The sched_domain whose statistics are to be updated.
3721 * @group: sched_group whose statistics are to be updated.
3722 * @this_cpu: Cpu for which load balance is currently performed.
3723 * @idle: Idle status of this_cpu
3724 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3725 * @sd_idle: Idle status of the sched_domain containing group.
3726 * @local_group: Does group contain this_cpu.
3727 * @cpus: Set of cpus considered for load balancing.
3728 * @balance: Should we balance.
3729 * @sgs: variable to hold the statistics for this group.
3730 */
3731static inline void update_sg_lb_stats(struct sched_domain *sd,
3732 struct sched_group *group, int this_cpu,
3733 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3734 int local_group, const struct cpumask *cpus,
3735 int *balance, struct sg_lb_stats *sgs)
3736{
3737 unsigned long load, max_cpu_load, min_cpu_load;
3738 int i;
3739 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3740 unsigned long sum_avg_load_per_task;
3741 unsigned long avg_load_per_task;
3742
3743 if (local_group) {
3744 balance_cpu = group_first_cpu(group);
3745 if (balance_cpu == this_cpu)
3746 update_group_power(sd, this_cpu);
3747 }
3748
3749 /* Tally up the load of all CPUs in the group */
3750 sum_avg_load_per_task = avg_load_per_task = 0;
3751 max_cpu_load = 0;
3752 min_cpu_load = ~0UL;
3753
3754 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3755 struct rq *rq = cpu_rq(i);
3756
3757 if (*sd_idle && rq->nr_running)
3758 *sd_idle = 0;
3759
3760 /* Bias balancing toward cpus of our domain */
3761 if (local_group) {
3762 if (idle_cpu(i) && !first_idle_cpu) {
3763 first_idle_cpu = 1;
3764 balance_cpu = i;
3765 }
3766
3767 load = target_load(i, load_idx);
3768 } else {
3769 load = source_load(i, load_idx);
3770 if (load > max_cpu_load)
3771 max_cpu_load = load;
3772 if (min_cpu_load > load)
3773 min_cpu_load = load;
3774 }
3775
3776 sgs->group_load += load;
3777 sgs->sum_nr_running += rq->nr_running;
3778 sgs->sum_weighted_load += weighted_cpuload(i);
3779
3780 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3781 }
3782
3783 /*
3784 * First idle cpu or the first cpu(busiest) in this sched group
3785 * is eligible for doing load balancing at this and above
3786 * domains. In the newly idle case, we will allow all the cpu's
3787 * to do the newly idle load balance.
3788 */
3789 if (idle != CPU_NEWLY_IDLE && local_group &&
3790 balance_cpu != this_cpu && balance) {
3791 *balance = 0;
3792 return;
3793 }
3794
3795 /* Adjust by relative CPU power of the group */
3796 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3797
3798
3799 /*
3800 * Consider the group unbalanced when the imbalance is larger
3801 * than the average weight of two tasks.
3802 *
3803 * APZ: with cgroup the avg task weight can vary wildly and
3804 * might not be a suitable number - should we keep a
3805 * normalized nr_running number somewhere that negates
3806 * the hierarchy?
3807 */
3808 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3809 group->cpu_power;
3810
3811 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3812 sgs->group_imb = 1;
3813
3814 sgs->group_capacity =
3815 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3816}
3817
3818/**
3819 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3820 * @sd: sched_domain whose statistics are to be updated.
3821 * @this_cpu: Cpu for which load balance is currently performed.
3822 * @idle: Idle status of this_cpu
3823 * @sd_idle: Idle status of the sched_domain containing group.
3824 * @cpus: Set of cpus considered for load balancing.
3825 * @balance: Should we balance.
3826 * @sds: variable to hold the statistics for this sched_domain.
3827 */
3828static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3829 enum cpu_idle_type idle, int *sd_idle,
3830 const struct cpumask *cpus, int *balance,
3831 struct sd_lb_stats *sds)
3832{
3833 struct sched_domain *child = sd->child;
3834 struct sched_group *group = sd->groups;
3835 struct sg_lb_stats sgs;
3836 int load_idx, prefer_sibling = 0;
3837
3838 if (child && child->flags & SD_PREFER_SIBLING)
3839 prefer_sibling = 1;
3840
3841 init_sd_power_savings_stats(sd, sds, idle);
3842 load_idx = get_sd_load_idx(sd, idle);
3843
3844 do {
3845 int local_group;
3846
3847 local_group = cpumask_test_cpu(this_cpu,
3848 sched_group_cpus(group));
3849 memset(&sgs, 0, sizeof(sgs));
3850 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3851 local_group, cpus, balance, &sgs);
3852
3853 if (local_group && balance && !(*balance))
3854 return;
3855
3856 sds->total_load += sgs.group_load;
3857 sds->total_pwr += group->cpu_power;
3858
3859 /*
3860 * In case the child domain prefers tasks go to siblings
3861 * first, lower the group capacity to one so that we'll try
3862 * and move all the excess tasks away.
3863 */
3864 if (prefer_sibling)
3865 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3866
3867 if (local_group) {
3868 sds->this_load = sgs.avg_load;
3869 sds->this = group;
3870 sds->this_nr_running = sgs.sum_nr_running;
3871 sds->this_load_per_task = sgs.sum_weighted_load;
3872 } else if (sgs.avg_load > sds->max_load &&
3873 (sgs.sum_nr_running > sgs.group_capacity ||
3874 sgs.group_imb)) {
3875 sds->max_load = sgs.avg_load;
3876 sds->busiest = group;
3877 sds->busiest_nr_running = sgs.sum_nr_running;
3878 sds->busiest_load_per_task = sgs.sum_weighted_load;
3879 sds->group_imb = sgs.group_imb;
3880 }
3881
3882 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3883 group = group->next;
3884 } while (group != sd->groups);
3885}
3886
3887/**
3888 * fix_small_imbalance - Calculate the minor imbalance that exists
3889 * amongst the groups of a sched_domain, during
3890 * load balancing.
3891 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3892 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3893 * @imbalance: Variable to store the imbalance.
3894 */
3895static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3896 int this_cpu, unsigned long *imbalance)
3897{
3898 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3899 unsigned int imbn = 2;
3900
3901 if (sds->this_nr_running) {
3902 sds->this_load_per_task /= sds->this_nr_running;
3903 if (sds->busiest_load_per_task >
3904 sds->this_load_per_task)
3905 imbn = 1;
3906 } else
3907 sds->this_load_per_task =
3908 cpu_avg_load_per_task(this_cpu);
3909
3910 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3911 sds->busiest_load_per_task * imbn) {
3912 *imbalance = sds->busiest_load_per_task;
3913 return;
3914 }
3915
3916 /*
3917 * OK, we don't have enough imbalance to justify moving tasks,
3918 * however we may be able to increase total CPU power used by
3919 * moving them.
3920 */
3921
3922 pwr_now += sds->busiest->cpu_power *
3923 min(sds->busiest_load_per_task, sds->max_load);
3924 pwr_now += sds->this->cpu_power *
3925 min(sds->this_load_per_task, sds->this_load);
3926 pwr_now /= SCHED_LOAD_SCALE;
3927
3928 /* Amount of load we'd subtract */
3929 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3930 sds->busiest->cpu_power;
3931 if (sds->max_load > tmp)
3932 pwr_move += sds->busiest->cpu_power *
3933 min(sds->busiest_load_per_task, sds->max_load - tmp);
3934
3935 /* Amount of load we'd add */
3936 if (sds->max_load * sds->busiest->cpu_power <
3937 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3938 tmp = (sds->max_load * sds->busiest->cpu_power) /
3939 sds->this->cpu_power;
3940 else
3941 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3942 sds->this->cpu_power;
3943 pwr_move += sds->this->cpu_power *
3944 min(sds->this_load_per_task, sds->this_load + tmp);
3945 pwr_move /= SCHED_LOAD_SCALE;
3946
3947 /* Move if we gain throughput */
3948 if (pwr_move > pwr_now)
3949 *imbalance = sds->busiest_load_per_task;
3950}
3951
3952/**
3953 * calculate_imbalance - Calculate the amount of imbalance present within the
3954 * groups of a given sched_domain during load balance.
3955 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3956 * @this_cpu: Cpu for which currently load balance is being performed.
3957 * @imbalance: The variable to store the imbalance.
3958 */
3959static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3960 unsigned long *imbalance)
3961{
3962 unsigned long max_pull;
3963 /*
3964 * In the presence of smp nice balancing, certain scenarios can have
3965 * max load less than avg load(as we skip the groups at or below
3966 * its cpu_power, while calculating max_load..)
3967 */
3968 if (sds->max_load < sds->avg_load) {
3969 *imbalance = 0;
3970 return fix_small_imbalance(sds, this_cpu, imbalance);
3971 }
3972
3973 /* Don't want to pull so many tasks that a group would go idle */
3974 max_pull = min(sds->max_load - sds->avg_load,
3975 sds->max_load - sds->busiest_load_per_task);
3976
3977 /* How much load to actually move to equalise the imbalance */
3978 *imbalance = min(max_pull * sds->busiest->cpu_power,
3979 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3980 / SCHED_LOAD_SCALE;
3981
3982 /*
3983 * if *imbalance is less than the average load per runnable task
3984 * there is no guarantee that any tasks will be moved so we'll have
3985 * a think about bumping its value to force at least one task to be
3986 * moved
3987 */
3988 if (*imbalance < sds->busiest_load_per_task)
3989 return fix_small_imbalance(sds, this_cpu, imbalance);
3990
3991}
3992/******* find_busiest_group() helpers end here *********************/
3993
3994/**
3995 * find_busiest_group - Returns the busiest group within the sched_domain
3996 * if there is an imbalance. If there isn't an imbalance, and
3997 * the user has opted for power-savings, it returns a group whose
3998 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3999 * such a group exists.
4000 *
4001 * Also calculates the amount of weighted load which should be moved
4002 * to restore balance.
4003 *
4004 * @sd: The sched_domain whose busiest group is to be returned.
4005 * @this_cpu: The cpu for which load balancing is currently being performed.
4006 * @imbalance: Variable which stores amount of weighted load which should
4007 * be moved to restore balance/put a group to idle.
4008 * @idle: The idle status of this_cpu.
4009 * @sd_idle: The idleness of sd
4010 * @cpus: The set of CPUs under consideration for load-balancing.
4011 * @balance: Pointer to a variable indicating if this_cpu
4012 * is the appropriate cpu to perform load balancing at this_level.
4013 *
4014 * Returns: - the busiest group if imbalance exists.
4015 * - If no imbalance and user has opted for power-savings balance,
4016 * return the least loaded group whose CPUs can be
4017 * put to idle by rebalancing its tasks onto our group.
4018 */
4019static struct sched_group *
4020find_busiest_group(struct sched_domain *sd, int this_cpu,
4021 unsigned long *imbalance, enum cpu_idle_type idle,
4022 int *sd_idle, const struct cpumask *cpus, int *balance)
4023{
4024 struct sd_lb_stats sds;
4025
4026 memset(&sds, 0, sizeof(sds));
4027
4028 /*
4029 * Compute the various statistics relevant for load balancing at
4030 * this level.
4031 */
4032 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4033 balance, &sds);
4034
4035 /* Cases where imbalance does not exist from POV of this_cpu */
4036 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4037 * at this level.
4038 * 2) There is no busy sibling group to pull from.
4039 * 3) This group is the busiest group.
4040 * 4) This group is more busy than the avg busyness at this
4041 * sched_domain.
4042 * 5) The imbalance is within the specified limit.
4043 * 6) Any rebalance would lead to ping-pong
4044 */
4045 if (balance && !(*balance))
4046 goto ret;
4047
4048 if (!sds.busiest || sds.busiest_nr_running == 0)
4049 goto out_balanced;
4050
4051 if (sds.this_load >= sds.max_load)
4052 goto out_balanced;
4053
4054 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4055
4056 if (sds.this_load >= sds.avg_load)
4057 goto out_balanced;
4058
4059 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4060 goto out_balanced;
4061
4062 sds.busiest_load_per_task /= sds.busiest_nr_running;
4063 if (sds.group_imb)
4064 sds.busiest_load_per_task =
4065 min(sds.busiest_load_per_task, sds.avg_load);
4066
4067 /*
4068 * We're trying to get all the cpus to the average_load, so we don't
4069 * want to push ourselves above the average load, nor do we wish to
4070 * reduce the max loaded cpu below the average load, as either of these
4071 * actions would just result in more rebalancing later, and ping-pong
4072 * tasks around. Thus we look for the minimum possible imbalance.
4073 * Negative imbalances (*we* are more loaded than anyone else) will
4074 * be counted as no imbalance for these purposes -- we can't fix that
4075 * by pulling tasks to us. Be careful of negative numbers as they'll
4076 * appear as very large values with unsigned longs.
4077 */
4078 if (sds.max_load <= sds.busiest_load_per_task)
4079 goto out_balanced;
4080
4081 /* Looks like there is an imbalance. Compute it */
4082 calculate_imbalance(&sds, this_cpu, imbalance);
4083 return sds.busiest;
4084
4085out_balanced:
4086 /*
4087 * There is no obvious imbalance. But check if we can do some balancing
4088 * to save power.
4089 */
4090 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4091 return sds.busiest;
4092ret:
4093 *imbalance = 0;
4094 return NULL;
4095}
4096
4097/*
4098 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4099 */
4100static struct rq *
4101find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4102 unsigned long imbalance, const struct cpumask *cpus)
4103{
4104 struct rq *busiest = NULL, *rq;
4105 unsigned long max_load = 0;
4106 int i;
4107
4108 for_each_cpu(i, sched_group_cpus(group)) {
4109 unsigned long power = power_of(i);
4110 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4111 unsigned long wl;
4112
4113 if (!cpumask_test_cpu(i, cpus))
4114 continue;
4115
4116 rq = cpu_rq(i);
4117 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4118 wl /= power;
4119
4120 if (capacity && rq->nr_running == 1 && wl > imbalance)
4121 continue;
4122
4123 if (wl > max_load) {
4124 max_load = wl;
4125 busiest = rq;
4126 }
4127 }
4128
4129 return busiest;
4130}
4131
4132/*
4133 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4134 * so long as it is large enough.
4135 */
4136#define MAX_PINNED_INTERVAL 512
4137
4138/* Working cpumask for load_balance and load_balance_newidle. */
4139static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4140
4141/*
4142 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4143 * tasks if there is an imbalance.
4144 */
4145static int load_balance(int this_cpu, struct rq *this_rq,
4146 struct sched_domain *sd, enum cpu_idle_type idle,
4147 int *balance)
4148{
4149 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4150 struct sched_group *group;
4151 unsigned long imbalance;
4152 struct rq *busiest;
4153 unsigned long flags;
4154 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4155
4156 cpumask_setall(cpus);
4157
4158 /*
4159 * When power savings policy is enabled for the parent domain, idle
4160 * sibling can pick up load irrespective of busy siblings. In this case,
4161 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4162 * portraying it as CPU_NOT_IDLE.
4163 */
4164 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4165 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4166 sd_idle = 1;
4167
4168 schedstat_inc(sd, lb_count[idle]);
4169
4170redo:
4171 update_shares(sd);
4172 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4173 cpus, balance);
4174
4175 if (*balance == 0)
4176 goto out_balanced;
4177
4178 if (!group) {
4179 schedstat_inc(sd, lb_nobusyg[idle]);
4180 goto out_balanced;
4181 }
4182
4183 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4184 if (!busiest) {
4185 schedstat_inc(sd, lb_nobusyq[idle]);
4186 goto out_balanced;
4187 }
4188
4189 BUG_ON(busiest == this_rq);
4190
4191 schedstat_add(sd, lb_imbalance[idle], imbalance);
4192
4193 ld_moved = 0;
4194 if (busiest->nr_running > 1) {
4195 /*
4196 * Attempt to move tasks. If find_busiest_group has found
4197 * an imbalance but busiest->nr_running <= 1, the group is
4198 * still unbalanced. ld_moved simply stays zero, so it is
4199 * correctly treated as an imbalance.
4200 */
4201 local_irq_save(flags);
4202 double_rq_lock(this_rq, busiest);
4203 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4204 imbalance, sd, idle, &all_pinned);
4205 double_rq_unlock(this_rq, busiest);
4206 local_irq_restore(flags);
4207
4208 /*
4209 * some other cpu did the load balance for us.
4210 */
4211 if (ld_moved && this_cpu != smp_processor_id())
4212 resched_cpu(this_cpu);
4213
4214 /* All tasks on this runqueue were pinned by CPU affinity */
4215 if (unlikely(all_pinned)) {
4216 cpumask_clear_cpu(cpu_of(busiest), cpus);
4217 if (!cpumask_empty(cpus))
4218 goto redo;
4219 goto out_balanced;
4220 }
4221 }
4222
4223 if (!ld_moved) {
4224 schedstat_inc(sd, lb_failed[idle]);
4225 sd->nr_balance_failed++;
4226
4227 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4228
4229 spin_lock_irqsave(&busiest->lock, flags);
4230
4231 /* don't kick the migration_thread, if the curr
4232 * task on busiest cpu can't be moved to this_cpu
4233 */
4234 if (!cpumask_test_cpu(this_cpu,
4235 &busiest->curr->cpus_allowed)) {
4236 spin_unlock_irqrestore(&busiest->lock, flags);
4237 all_pinned = 1;
4238 goto out_one_pinned;
4239 }
4240
4241 if (!busiest->active_balance) {
4242 busiest->active_balance = 1;
4243 busiest->push_cpu = this_cpu;
4244 active_balance = 1;
4245 }
4246 spin_unlock_irqrestore(&busiest->lock, flags);
4247 if (active_balance)
4248 wake_up_process(busiest->migration_thread);
4249
4250 /*
4251 * We've kicked active balancing, reset the failure
4252 * counter.
4253 */
4254 sd->nr_balance_failed = sd->cache_nice_tries+1;
4255 }
4256 } else
4257 sd->nr_balance_failed = 0;
4258
4259 if (likely(!active_balance)) {
4260 /* We were unbalanced, so reset the balancing interval */
4261 sd->balance_interval = sd->min_interval;
4262 } else {
4263 /*
4264 * If we've begun active balancing, start to back off. This
4265 * case may not be covered by the all_pinned logic if there
4266 * is only 1 task on the busy runqueue (because we don't call
4267 * move_tasks).
4268 */
4269 if (sd->balance_interval < sd->max_interval)
4270 sd->balance_interval *= 2;
4271 }
4272
4273 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4274 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4275 ld_moved = -1;
4276
4277 goto out;
4278
4279out_balanced:
4280 schedstat_inc(sd, lb_balanced[idle]);
4281
4282 sd->nr_balance_failed = 0;
4283
4284out_one_pinned:
4285 /* tune up the balancing interval */
4286 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4287 (sd->balance_interval < sd->max_interval))
4288 sd->balance_interval *= 2;
4289
4290 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4291 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4292 ld_moved = -1;
4293 else
4294 ld_moved = 0;
4295out:
4296 if (ld_moved)
4297 update_shares(sd);
4298 return ld_moved;
4299}
4300
4301/*
4302 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4303 * tasks if there is an imbalance.
4304 *
4305 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4306 * this_rq is locked.
4307 */
4308static int
4309load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4310{
4311 struct sched_group *group;
4312 struct rq *busiest = NULL;
4313 unsigned long imbalance;
4314 int ld_moved = 0;
4315 int sd_idle = 0;
4316 int all_pinned = 0;
4317 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4318
4319 cpumask_setall(cpus);
4320
4321 /*
4322 * When power savings policy is enabled for the parent domain, idle
4323 * sibling can pick up load irrespective of busy siblings. In this case,
4324 * let the state of idle sibling percolate up as IDLE, instead of
4325 * portraying it as CPU_NOT_IDLE.
4326 */
4327 if (sd->flags & SD_SHARE_CPUPOWER &&
4328 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4329 sd_idle = 1;
4330
4331 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4332redo:
4333 update_shares_locked(this_rq, sd);
4334 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4335 &sd_idle, cpus, NULL);
4336 if (!group) {
4337 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4338 goto out_balanced;
4339 }
4340
4341 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4342 if (!busiest) {
4343 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4344 goto out_balanced;
4345 }
4346
4347 BUG_ON(busiest == this_rq);
4348
4349 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4350
4351 ld_moved = 0;
4352 if (busiest->nr_running > 1) {
4353 /* Attempt to move tasks */
4354 double_lock_balance(this_rq, busiest);
4355 /* this_rq->clock is already updated */
4356 update_rq_clock(busiest);
4357 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4358 imbalance, sd, CPU_NEWLY_IDLE,
4359 &all_pinned);
4360 double_unlock_balance(this_rq, busiest);
4361
4362 if (unlikely(all_pinned)) {
4363 cpumask_clear_cpu(cpu_of(busiest), cpus);
4364 if (!cpumask_empty(cpus))
4365 goto redo;
4366 }
4367 }
4368
4369 if (!ld_moved) {
4370 int active_balance = 0;
4371
4372 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4373 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4374 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4375 return -1;
4376
4377 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4378 return -1;
4379
4380 if (sd->nr_balance_failed++ < 2)
4381 return -1;
4382
4383 /*
4384 * The only task running in a non-idle cpu can be moved to this
4385 * cpu in an attempt to completely freeup the other CPU
4386 * package. The same method used to move task in load_balance()
4387 * have been extended for load_balance_newidle() to speedup
4388 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4389 *
4390 * The package power saving logic comes from
4391 * find_busiest_group(). If there are no imbalance, then
4392 * f_b_g() will return NULL. However when sched_mc={1,2} then
4393 * f_b_g() will select a group from which a running task may be
4394 * pulled to this cpu in order to make the other package idle.
4395 * If there is no opportunity to make a package idle and if
4396 * there are no imbalance, then f_b_g() will return NULL and no
4397 * action will be taken in load_balance_newidle().
4398 *
4399 * Under normal task pull operation due to imbalance, there
4400 * will be more than one task in the source run queue and
4401 * move_tasks() will succeed. ld_moved will be true and this
4402 * active balance code will not be triggered.
4403 */
4404
4405 /* Lock busiest in correct order while this_rq is held */
4406 double_lock_balance(this_rq, busiest);
4407
4408 /*
4409 * don't kick the migration_thread, if the curr
4410 * task on busiest cpu can't be moved to this_cpu
4411 */
4412 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4413 double_unlock_balance(this_rq, busiest);
4414 all_pinned = 1;
4415 return ld_moved;
4416 }
4417
4418 if (!busiest->active_balance) {
4419 busiest->active_balance = 1;
4420 busiest->push_cpu = this_cpu;
4421 active_balance = 1;
4422 }
4423
4424 double_unlock_balance(this_rq, busiest);
4425 /*
4426 * Should not call ttwu while holding a rq->lock
4427 */
4428 spin_unlock(&this_rq->lock);
4429 if (active_balance)
4430 wake_up_process(busiest->migration_thread);
4431 spin_lock(&this_rq->lock);
4432
4433 } else
4434 sd->nr_balance_failed = 0;
4435
4436 update_shares_locked(this_rq, sd);
4437 return ld_moved;
4438
4439out_balanced:
4440 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4441 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4442 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4443 return -1;
4444 sd->nr_balance_failed = 0;
4445
4446 return 0;
4447}
4448
4449/*
4450 * idle_balance is called by schedule() if this_cpu is about to become
4451 * idle. Attempts to pull tasks from other CPUs.
4452 */
4453static void idle_balance(int this_cpu, struct rq *this_rq)
4454{
4455 struct sched_domain *sd;
4456 int pulled_task = 0;
4457 unsigned long next_balance = jiffies + HZ;
4458
4459 for_each_domain(this_cpu, sd) {
4460 unsigned long interval;
4461
4462 if (!(sd->flags & SD_LOAD_BALANCE))
4463 continue;
4464
4465 if (sd->flags & SD_BALANCE_NEWIDLE)
4466 /* If we've pulled tasks over stop searching: */
4467 pulled_task = load_balance_newidle(this_cpu, this_rq,
4468 sd);
4469
4470 interval = msecs_to_jiffies(sd->balance_interval);
4471 if (time_after(next_balance, sd->last_balance + interval))
4472 next_balance = sd->last_balance + interval;
4473 if (pulled_task)
4474 break;
4475 }
4476 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4477 /*
4478 * We are going idle. next_balance may be set based on
4479 * a busy processor. So reset next_balance.
4480 */
4481 this_rq->next_balance = next_balance;
4482 }
4483}
4484
4485/*
4486 * active_load_balance is run by migration threads. It pushes running tasks
4487 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4488 * running on each physical CPU where possible, and avoids physical /
4489 * logical imbalances.
4490 *
4491 * Called with busiest_rq locked.
4492 */
4493static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4494{
4495 int target_cpu = busiest_rq->push_cpu;
4496 struct sched_domain *sd;
4497 struct rq *target_rq;
4498
4499 /* Is there any task to move? */
4500 if (busiest_rq->nr_running <= 1)
4501 return;
4502
4503 target_rq = cpu_rq(target_cpu);
4504
4505 /*
4506 * This condition is "impossible", if it occurs
4507 * we need to fix it. Originally reported by
4508 * Bjorn Helgaas on a 128-cpu setup.
4509 */
4510 BUG_ON(busiest_rq == target_rq);
4511
4512 /* move a task from busiest_rq to target_rq */
4513 double_lock_balance(busiest_rq, target_rq);
4514 update_rq_clock(busiest_rq);
4515 update_rq_clock(target_rq);
4516
4517 /* Search for an sd spanning us and the target CPU. */
4518 for_each_domain(target_cpu, sd) {
4519 if ((sd->flags & SD_LOAD_BALANCE) &&
4520 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4521 break;
4522 }
4523
4524 if (likely(sd)) {
4525 schedstat_inc(sd, alb_count);
4526
4527 if (move_one_task(target_rq, target_cpu, busiest_rq,
4528 sd, CPU_IDLE))
4529 schedstat_inc(sd, alb_pushed);
4530 else
4531 schedstat_inc(sd, alb_failed);
4532 }
4533 double_unlock_balance(busiest_rq, target_rq);
4534}
4535
4536#ifdef CONFIG_NO_HZ
4537static struct {
4538 atomic_t load_balancer;
4539 cpumask_var_t cpu_mask;
4540 cpumask_var_t ilb_grp_nohz_mask;
4541} nohz ____cacheline_aligned = {
4542 .load_balancer = ATOMIC_INIT(-1),
4543};
4544
4545int get_nohz_load_balancer(void)
4546{
4547 return atomic_read(&nohz.load_balancer);
4548}
4549
4550#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4551/**
4552 * lowest_flag_domain - Return lowest sched_domain containing flag.
4553 * @cpu: The cpu whose lowest level of sched domain is to
4554 * be returned.
4555 * @flag: The flag to check for the lowest sched_domain
4556 * for the given cpu.
4557 *
4558 * Returns the lowest sched_domain of a cpu which contains the given flag.
4559 */
4560static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4561{
4562 struct sched_domain *sd;
4563
4564 for_each_domain(cpu, sd)
4565 if (sd && (sd->flags & flag))
4566 break;
4567
4568 return sd;
4569}
4570
/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		variable holding the value of the power_savings_sd
 *		for cpu.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
 * set, starting from the lowest sched_domain to the highest. The walk stops
 * at the first domain on the parent chain that does not have 'flag' set.
 *
 * Every macro parameter is parenthesized in the expansion so that compound
 * arguments (for example "A | B" as @flag) keep their intended meaning.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for ((sd) = lowest_flag_domain((cpu), (flag)); \
	     ((sd) && ((sd)->flags & (flag))); (sd) = (sd)->parent)
4584
4585/**
4586 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4587 * @ilb_group: group to be checked for semi-idleness
4588 *
4589 * Returns: 1 if the group is semi-idle. 0 otherwise.
4590 *
4591 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4592 * and atleast one non-idle CPU. This helper function checks if the given
4593 * sched_group is semi-idle or not.
4594 */
4595static inline int is_semi_idle_group(struct sched_group *ilb_group)
4596{
4597 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4598 sched_group_cpus(ilb_group));
4599
4600 /*
4601 * A sched_group is semi-idle when it has atleast one busy cpu
4602 * and atleast one idle cpu.
4603 */
4604 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4605 return 0;
4606
4607 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4608 return 0;
4609
4610 return 1;
4611}
4612/**
4613 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4614 * @cpu: The cpu which is nominating a new idle_load_balancer.
4615 *
4616 * Returns: Returns the id of the idle load balancer if it exists,
4617 * Else, returns >= nr_cpu_ids.
4618 *
4619 * This algorithm picks the idle load balancer such that it belongs to a
4620 * semi-idle powersavings sched_domain. The idea is to try and avoid
4621 * completely idle packages/cores just for the purpose of idle load balancing
4622 * when there are other idle cpu's which are better suited for that job.
4623 */
4624static int find_new_ilb(int cpu)
4625{
4626 struct sched_domain *sd;
4627 struct sched_group *ilb_group;
4628
4629 /*
4630 * Have idle load balancer selection from semi-idle packages only
4631 * when power-aware load balancing is enabled
4632 */
4633 if (!(sched_smt_power_savings || sched_mc_power_savings))
4634 goto out_done;
4635
4636 /*
4637 * Optimize for the case when we have no idle CPUs or only one
4638 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4639 */
4640 if (cpumask_weight(nohz.cpu_mask) < 2)
4641 goto out_done;
4642
4643 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4644 ilb_group = sd->groups;
4645
4646 do {
4647 if (is_semi_idle_group(ilb_group))
4648 return cpumask_first(nohz.ilb_grp_nohz_mask);
4649
4650 ilb_group = ilb_group->next;
4651
4652 } while (ilb_group != sd->groups);
4653 }
4654
4655out_done:
4656 return cpumask_first(nohz.cpu_mask);
4657}
4658#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4659static inline int find_new_ilb(int call_cpu)
4660{
4661 return cpumask_first(nohz.cpu_mask);
4662}
4663#endif
4664
4665/*
4666 * This routine will try to nominate the ilb (idle load balancing)
4667 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4668 * load balancing on behalf of all those cpus. If all the cpus in the system
4669 * go into this tickless mode, then there will be no ilb owner (as there is
4670 * no need for one) and all the cpus will sleep till the next wakeup event
4671 * arrives...
4672 *
4673 * For the ilb owner, tick is not stopped. And this tick will be used
4674 * for idle load balancing. ilb owner will still be part of
4675 * nohz.cpu_mask..
4676 *
4677 * While stopping the tick, this cpu will become the ilb owner if there
4678 * is no other owner. And will be the owner till that cpu becomes busy
4679 * or if all cpus in the system stop their ticks at which point
4680 * there is no need for ilb owner.
4681 *
4682 * When the ilb owner becomes busy, it nominates another owner, during the
4683 * next busy scheduler_tick()
4684 */
4685int select_nohz_load_balancer(int stop_tick)
4686{
4687 int cpu = smp_processor_id();
4688
4689 if (stop_tick) {
4690 cpu_rq(cpu)->in_nohz_recently = 1;
4691
4692 if (!cpu_active(cpu)) {
4693 if (atomic_read(&nohz.load_balancer) != cpu)
4694 return 0;
4695
4696 /*
4697 * If we are going offline and still the leader,
4698 * give up!
4699 */
4700 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4701 BUG();
4702
4703 return 0;
4704 }
4705
4706 cpumask_set_cpu(cpu, nohz.cpu_mask);
4707
4708 /* time for ilb owner also to sleep */
4709 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4710 if (atomic_read(&nohz.load_balancer) == cpu)
4711 atomic_set(&nohz.load_balancer, -1);
4712 return 0;
4713 }
4714
4715 if (atomic_read(&nohz.load_balancer) == -1) {
4716 /* make me the ilb owner */
4717 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4718 return 1;
4719 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4720 int new_ilb;
4721
4722 if (!(sched_smt_power_savings ||
4723 sched_mc_power_savings))
4724 return 1;
4725 /*
4726 * Check to see if there is a more power-efficient
4727 * ilb.
4728 */
4729 new_ilb = find_new_ilb(cpu);
4730 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4731 atomic_set(&nohz.load_balancer, -1);
4732 resched_cpu(new_ilb);
4733 return 0;
4734 }
4735 return 1;
4736 }
4737 } else {
4738 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4739 return 0;
4740
4741 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4742
4743 if (atomic_read(&nohz.load_balancer) == cpu)
4744 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4745 BUG();
4746 }
4747 return 0;
4748}
4749#endif
4750
4751static DEFINE_SPINLOCK(balancing);
4752
4753/*
4754 * It checks each scheduling domain to see if it is due to be balanced,
4755 * and initiates a balancing operation if so.
4756 *
4757 * Balancing parameters are set up in arch_init_sched_domains.
4758 */
4759static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4760{
4761 int balance = 1;
4762 struct rq *rq = cpu_rq(cpu);
4763 unsigned long interval;
4764 struct sched_domain *sd;
4765 /* Earliest time when we have to do rebalance again */
4766 unsigned long next_balance = jiffies + 60*HZ;
4767 int update_next_balance = 0;
4768 int need_serialize;
4769
4770 for_each_domain(cpu, sd) {
4771 if (!(sd->flags & SD_LOAD_BALANCE))
4772 continue;
4773
4774 interval = sd->balance_interval;
4775 if (idle != CPU_IDLE)
4776 interval *= sd->busy_factor;
4777
4778 /* scale ms to jiffies */
4779 interval = msecs_to_jiffies(interval);
4780 if (unlikely(!interval))
4781 interval = 1;
4782 if (interval > HZ*NR_CPUS/10)
4783 interval = HZ*NR_CPUS/10;
4784
4785 need_serialize = sd->flags & SD_SERIALIZE;
4786
4787 if (need_serialize) {
4788 if (!spin_trylock(&balancing))
4789 goto out;
4790 }
4791
4792 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4793 if (load_balance(cpu, rq, sd, idle, &balance)) {
4794 /*
4795 * We've pulled tasks over so either we're no
4796 * longer idle, or one of our SMT siblings is
4797 * not idle.
4798 */
4799 idle = CPU_NOT_IDLE;
4800 }
4801 sd->last_balance = jiffies;
4802 }
4803 if (need_serialize)
4804 spin_unlock(&balancing);
4805out:
4806 if (time_after(next_balance, sd->last_balance + interval)) {
4807 next_balance = sd->last_balance + interval;
4808 update_next_balance = 1;
4809 }
4810
4811 /*
4812 * Stop the load balance at this level. There is another
4813 * CPU in our sched group which is doing load balancing more
4814 * actively.
4815 */
4816 if (!balance)
4817 break;
4818 }
4819
4820 /*
4821 * next_balance will be updated only when there is a need.
4822 * When the cpu is attached to null domain for ex, it will not be
4823 * updated.
4824 */
4825 if (likely(update_next_balance))
4826 rq->next_balance = next_balance;
4827}
4828
4829/*
4830 * run_rebalance_domains is triggered when needed from the scheduler tick.
4831 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4832 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4833 */
4834static void run_rebalance_domains(struct softirq_action *h)
4835{
4836 int this_cpu = smp_processor_id();
4837 struct rq *this_rq = cpu_rq(this_cpu);
4838 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4839 CPU_IDLE : CPU_NOT_IDLE;
4840
4841 rebalance_domains(this_cpu, idle);
4842
4843#ifdef CONFIG_NO_HZ
4844 /*
4845 * If this cpu is the owner for idle load balancing, then do the
4846 * balancing on behalf of the other idle cpus whose ticks are
4847 * stopped.
4848 */
4849 if (this_rq->idle_at_tick &&
4850 atomic_read(&nohz.load_balancer) == this_cpu) {
4851 struct rq *rq;
4852 int balance_cpu;
4853
4854 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4855 if (balance_cpu == this_cpu)
4856 continue;
4857
4858 /*
4859 * If this cpu gets work to do, stop the load balancing
4860 * work being done for other cpus. Next load
4861 * balancing owner will pick it up.
4862 */
4863 if (need_resched())
4864 break;
4865
4866 rebalance_domains(balance_cpu, CPU_IDLE);
4867
4868 rq = cpu_rq(balance_cpu);
4869 if (time_after(this_rq->next_balance, rq->next_balance))
4870 this_rq->next_balance = rq->next_balance;
4871 }
4872 }
4873#endif
4874}
4875
4876static inline int on_null_domain(int cpu)
4877{
4878 return !rcu_dereference(cpu_rq(cpu)->sd);
4879}
4880
4881/*
4882 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4883 *
4884 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4885 * idle load balancing owner or decide to stop the periodic load balancing,
4886 * if the whole system is idle.
4887 */
4888static inline void trigger_load_balance(struct rq *rq, int cpu)
4889{
4890#ifdef CONFIG_NO_HZ
4891 /*
4892 * If we were in the nohz mode recently and busy at the current
4893 * scheduler tick, then check if we need to nominate new idle
4894 * load balancer.
4895 */
4896 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4897 rq->in_nohz_recently = 0;
4898
4899 if (atomic_read(&nohz.load_balancer) == cpu) {
4900 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4901 atomic_set(&nohz.load_balancer, -1);
4902 }
4903
4904 if (atomic_read(&nohz.load_balancer) == -1) {
4905 int ilb = find_new_ilb(cpu);
4906
4907 if (ilb < nr_cpu_ids)
4908 resched_cpu(ilb);
4909 }
4910 }
4911
4912 /*
4913 * If this cpu is idle and doing idle load balancing for all the
4914 * cpus with ticks stopped, is it time for that to stop?
4915 */
4916 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4917 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4918 resched_cpu(cpu);
4919 return;
4920 }
4921
4922 /*
4923 * If this cpu is idle and the idle load balancing is done by
4924 * someone else, then no need raise the SCHED_SOFTIRQ
4925 */
4926 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4927 cpumask_test_cpu(cpu, nohz.cpu_mask))
4928 return;
4929#endif
4930 /* Don't need to rebalance while attached to NULL domain */
4931 if (time_after_eq(jiffies, rq->next_balance) &&
4932 likely(!on_null_domain(cpu)))
4933 raise_softirq(SCHED_SOFTIRQ);
4934}
4935
4936#else /* CONFIG_SMP */
4937
/*
 * Uniprocessor build: there is no need to balance between CPUs, so
 * idle_balance() is an empty stub.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}
4944
4945#endif 3202#endif
4946 3203
4947DEFINE_PER_CPU(struct kernel_stat, kstat); 3204DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5073,8 +3330,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5073 p->gtime = cputime_add(p->gtime, cputime); 3330 p->gtime = cputime_add(p->gtime, cputime);
5074 3331
5075 /* Add guest time to cpustat. */ 3332 /* Add guest time to cpustat. */
5076 cpustat->user = cputime64_add(cpustat->user, tmp); 3333 if (TASK_NICE(p) > 0) {
5077 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3334 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3335 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3336 } else {
3337 cpustat->user = cputime64_add(cpustat->user, tmp);
3338 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3339 }
5078} 3340}
5079 3341
5080/* 3342/*
@@ -5189,60 +3451,86 @@ void account_idle_ticks(unsigned long ticks)
5189 * Use precise platform statistics if available: 3451 * Use precise platform statistics if available:
5190 */ 3452 */
5191#ifdef CONFIG_VIRT_CPU_ACCOUNTING 3453#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5192cputime_t task_utime(struct task_struct *p) 3454void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5193{ 3455{
5194 return p->utime; 3456 *ut = p->utime;
3457 *st = p->stime;
5195} 3458}
5196 3459
5197cputime_t task_stime(struct task_struct *p) 3460void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5198{ 3461{
5199 return p->stime; 3462 struct task_cputime cputime;
3463
3464 thread_group_cputime(p, &cputime);
3465
3466 *ut = cputime.utime;
3467 *st = cputime.stime;
5200} 3468}
5201#else 3469#else
5202cputime_t task_utime(struct task_struct *p) 3470
3471#ifndef nsecs_to_cputime
3472# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3473#endif
3474
3475void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5203{ 3476{
5204 clock_t utime = cputime_to_clock_t(p->utime), 3477 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5205 total = utime + cputime_to_clock_t(p->stime);
5206 u64 temp;
5207 3478
5208 /* 3479 /*
5209 * Use CFS's precise accounting: 3480 * Use CFS's precise accounting:
5210 */ 3481 */
5211 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 3482 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5212 3483
5213 if (total) { 3484 if (total) {
5214 temp *= utime; 3485 u64 temp;
3486
3487 temp = (u64)(rtime * utime);
5215 do_div(temp, total); 3488 do_div(temp, total);
5216 } 3489 utime = (cputime_t)temp;
5217 utime = (clock_t)temp; 3490 } else
3491 utime = rtime;
3492
3493 /*
3494 * Compare with previous values, to keep monotonicity:
3495 */
3496 p->prev_utime = max(p->prev_utime, utime);
3497 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5218 3498
5219 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 3499 *ut = p->prev_utime;
5220 return p->prev_utime; 3500 *st = p->prev_stime;
5221} 3501}
5222 3502
5223cputime_t task_stime(struct task_struct *p) 3503/*
3504 * Must be called with siglock held.
3505 */
3506void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5224{ 3507{
5225 clock_t stime; 3508 struct signal_struct *sig = p->signal;
3509 struct task_cputime cputime;
3510 cputime_t rtime, utime, total;
5226 3511
5227 /* 3512 thread_group_cputime(p, &cputime);
5228 * Use CFS's precise accounting. (we subtract utime from
5229 * the total, to make sure the total observed by userspace
5230 * grows monotonically - apps rely on that):
5231 */
5232 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5233 cputime_to_clock_t(task_utime(p));
5234 3513
5235 if (stime >= 0) 3514 total = cputime_add(cputime.utime, cputime.stime);
5236 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 3515 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5237 3516
5238 return p->prev_stime; 3517 if (total) {
5239} 3518 u64 temp;
5240#endif
5241 3519
5242inline cputime_t task_gtime(struct task_struct *p) 3520 temp = (u64)(rtime * cputime.utime);
5243{ 3521 do_div(temp, total);
5244 return p->gtime; 3522 utime = (cputime_t)temp;
3523 } else
3524 utime = rtime;
3525
3526 sig->prev_utime = max(sig->prev_utime, utime);
3527 sig->prev_stime = max(sig->prev_stime,
3528 cputime_sub(rtime, sig->prev_utime));
3529
3530 *ut = sig->prev_utime;
3531 *st = sig->prev_stime;
5245} 3532}
3533#endif
5246 3534
5247/* 3535/*
5248 * This function gets called by the timer code, with HZ frequency. 3536 * This function gets called by the timer code, with HZ frequency.
@@ -5261,7 +3549,7 @@ void scheduler_tick(void)
5261 3549
5262 TS_TICK_START(current); 3550 TS_TICK_START(current);
5263 3551
5264 spin_lock(&rq->lock); 3552 raw_spin_lock(&rq->lock);
5265 update_rq_clock(rq); 3553 update_rq_clock(rq);
5266 update_cpu_load(rq); 3554 update_cpu_load(rq);
5267 curr->sched_class->task_tick(rq, curr, 0); 3555 curr->sched_class->task_tick(rq, curr, 0);
@@ -5269,9 +3557,9 @@ void scheduler_tick(void)
5269 /* litmus_tick may force current to resched */ 3557 /* litmus_tick may force current to resched */
5270 litmus_tick(rq, curr); 3558 litmus_tick(rq, curr);
5271 3559
5272 spin_unlock(&rq->lock); 3560 raw_spin_unlock(&rq->lock);
5273 3561
5274 perf_event_task_tick(curr, cpu); 3562 perf_event_task_tick(curr);
5275 3563
5276#ifdef CONFIG_SMP 3564#ifdef CONFIG_SMP
5277 rq->idle_at_tick = idle_cpu(cpu); 3565 rq->idle_at_tick = idle_cpu(cpu);
@@ -5385,13 +3673,14 @@ static inline void schedule_debug(struct task_struct *prev)
5385#endif 3673#endif
5386} 3674}
5387 3675
5388static void put_prev_task(struct rq *rq, struct task_struct *p) 3676static void put_prev_task(struct rq *rq, struct task_struct *prev)
5389{ 3677{
5390 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 3678 if (prev->state == TASK_RUNNING) {
3679 u64 runtime = prev->se.sum_exec_runtime;
5391 3680
5392 update_avg(&p->se.avg_running, runtime); 3681 runtime -= prev->se.prev_sum_exec_runtime;
3682 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5393 3683
5394 if (p->state == TASK_RUNNING) {
5395 /* 3684 /*
5396 * In order to avoid avg_overlap growing stale when we are 3685 * In order to avoid avg_overlap growing stale when we are
5397 * indeed overlapping and hence not getting put to sleep, grow 3686 * indeed overlapping and hence not getting put to sleep, grow
@@ -5401,12 +3690,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5401 * correlates to the amount of cache footprint a task can 3690 * correlates to the amount of cache footprint a task can
5402 * build up. 3691 * build up.
5403 */ 3692 */
5404 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 3693 update_avg(&prev->se.avg_overlap, runtime);
5405 update_avg(&p->se.avg_overlap, runtime);
5406 } else {
5407 update_avg(&p->se.avg_running, 0);
5408 } 3694 }
5409 p->sched_class->put_prev_task(rq, p); 3695 prev->sched_class->put_prev_task(rq, prev);
5410} 3696}
5411 3697
5412/* 3698/*
@@ -5477,7 +3763,7 @@ need_resched_nonpreemptible:
5477 if (sched_feat(HRTICK)) 3763 if (sched_feat(HRTICK))
5478 hrtick_clear(rq); 3764 hrtick_clear(rq);
5479 3765
5480 spin_lock_irq(&rq->lock); 3766 raw_spin_lock_irq(&rq->lock);
5481 update_rq_clock(rq); 3767 update_rq_clock(rq);
5482 clear_tsk_need_resched(prev); 3768 clear_tsk_need_resched(prev);
5483 3769
@@ -5499,7 +3785,7 @@ need_resched_nonpreemptible:
5499 3785
5500 if (likely(prev != next)) { 3786 if (likely(prev != next)) {
5501 sched_info_switch(prev, next); 3787 sched_info_switch(prev, next);
5502 perf_event_task_sched_out(prev, next, cpu); 3788 perf_event_task_sched_out(prev, next);
5503 3789
5504 rq->nr_switches++; 3790 rq->nr_switches++;
5505 rq->curr = next; 3791 rq->curr = next;
@@ -5517,7 +3803,7 @@ need_resched_nonpreemptible:
5517 rq = cpu_rq(cpu); 3803 rq = cpu_rq(cpu);
5518 } else { 3804 } else {
5519 TS_SCHED_END(prev); 3805 TS_SCHED_END(prev);
5520 spin_unlock_irq(&rq->lock); 3806 raw_spin_unlock_irq(&rq->lock);
5521 } 3807 }
5522 3808
5523 sched_trace_task_switch_to(current); 3809 sched_trace_task_switch_to(current);
@@ -5525,11 +3811,12 @@ need_resched_nonpreemptible:
5525 post_schedule(rq); 3811 post_schedule(rq);
5526 3812
5527 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3813 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3814 prev = rq->curr;
3815 switch_count = &prev->nivcsw;
5528 goto need_resched_nonpreemptible; 3816 goto need_resched_nonpreemptible;
5529 } 3817 }
5530 3818
5531 preempt_enable_no_resched(); 3819 preempt_enable_no_resched();
5532
5533 if (need_resched()) 3820 if (need_resched())
5534 goto need_resched; 3821 goto need_resched;
5535 3822
@@ -5538,7 +3825,7 @@ need_resched_nonpreemptible:
5538} 3825}
5539EXPORT_SYMBOL(schedule); 3826EXPORT_SYMBOL(schedule);
5540 3827
5541#ifdef CONFIG_SMP 3828#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5542/* 3829/*
5543 * Look out! "owner" is an entirely speculative pointer 3830 * Look out! "owner" is an entirely speculative pointer
5544 * access and not reliable. 3831 * access and not reliable.
@@ -5558,7 +3845,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5558 * the mutex owner just released it and exited. 3845 * the mutex owner just released it and exited.
5559 */ 3846 */
5560 if (probe_kernel_address(&owner->cpu, cpu)) 3847 if (probe_kernel_address(&owner->cpu, cpu))
5561 goto out; 3848 return 0;
5562#else 3849#else
5563 cpu = owner->cpu; 3850 cpu = owner->cpu;
5564#endif 3851#endif
@@ -5568,14 +3855,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5568 * the cpu field may no longer be valid. 3855 * the cpu field may no longer be valid.
5569 */ 3856 */
5570 if (cpu >= nr_cpumask_bits) 3857 if (cpu >= nr_cpumask_bits)
5571 goto out; 3858 return 0;
5572 3859
5573 /* 3860 /*
5574 * We need to validate that we can do a 3861 * We need to validate that we can do a
5575 * get_cpu() and that we have the percpu area. 3862 * get_cpu() and that we have the percpu area.
5576 */ 3863 */
5577 if (!cpu_online(cpu)) 3864 if (!cpu_online(cpu))
5578 goto out; 3865 return 0;
5579 3866
5580 rq = cpu_rq(cpu); 3867 rq = cpu_rq(cpu);
5581 3868
@@ -5594,7 +3881,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5594 3881
5595 cpu_relax(); 3882 cpu_relax();
5596 } 3883 }
5597out: 3884
5598 return 1; 3885 return 1;
5599} 3886}
5600#endif 3887#endif
@@ -5953,14 +4240,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
5953 */ 4240 */
5954bool try_wait_for_completion(struct completion *x) 4241bool try_wait_for_completion(struct completion *x)
5955{ 4242{
4243 unsigned long flags;
5956 int ret = 1; 4244 int ret = 1;
5957 4245
5958 spin_lock_irq(&x->wait.lock); 4246 spin_lock_irqsave(&x->wait.lock, flags);
5959 if (!x->done) 4247 if (!x->done)
5960 ret = 0; 4248 ret = 0;
5961 else 4249 else
5962 x->done--; 4250 x->done--;
5963 spin_unlock_irq(&x->wait.lock); 4251 spin_unlock_irqrestore(&x->wait.lock, flags);
5964 return ret; 4252 return ret;
5965} 4253}
5966EXPORT_SYMBOL(try_wait_for_completion); 4254EXPORT_SYMBOL(try_wait_for_completion);
@@ -5975,12 +4263,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
5975 */ 4263 */
5976bool completion_done(struct completion *x) 4264bool completion_done(struct completion *x)
5977{ 4265{
4266 unsigned long flags;
5978 int ret = 1; 4267 int ret = 1;
5979 4268
5980 spin_lock_irq(&x->wait.lock); 4269 spin_lock_irqsave(&x->wait.lock, flags);
5981 if (!x->done) 4270 if (!x->done)
5982 ret = 0; 4271 ret = 0;
5983 spin_unlock_irq(&x->wait.lock); 4272 spin_unlock_irqrestore(&x->wait.lock, flags);
5984 return ret; 4273 return ret;
5985} 4274}
5986EXPORT_SYMBOL(completion_done); 4275EXPORT_SYMBOL(completion_done);
@@ -6048,7 +4337,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6048 unsigned long flags; 4337 unsigned long flags;
6049 int oldprio, on_rq, running; 4338 int oldprio, on_rq, running;
6050 struct rq *rq; 4339 struct rq *rq;
6051 const struct sched_class *prev_class = p->sched_class; 4340 const struct sched_class *prev_class;
6052 4341
6053 BUG_ON(prio < 0 || prio > MAX_PRIO); 4342 BUG_ON(prio < 0 || prio > MAX_PRIO);
6054 4343
@@ -6056,6 +4345,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6056 update_rq_clock(rq); 4345 update_rq_clock(rq);
6057 4346
6058 oldprio = p->prio; 4347 oldprio = p->prio;
4348 prev_class = p->sched_class;
6059 on_rq = p->se.on_rq; 4349 on_rq = p->se.on_rq;
6060 running = task_current(rq, p); 4350 running = task_current(rq, p);
6061 if (on_rq) 4351 if (on_rq)
@@ -6073,7 +4363,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6073 if (running) 4363 if (running)
6074 p->sched_class->set_curr_task(rq); 4364 p->sched_class->set_curr_task(rq);
6075 if (on_rq) { 4365 if (on_rq) {
6076 enqueue_task(rq, p, 0); 4366 enqueue_task(rq, p, 0, oldprio < prio);
6077 4367
6078 check_class_changed(rq, p, prev_class, oldprio, running); 4368 check_class_changed(rq, p, prev_class, oldprio, running);
6079 } 4369 }
@@ -6117,7 +4407,7 @@ void set_user_nice(struct task_struct *p, long nice)
6117 delta = p->prio - old_prio; 4407 delta = p->prio - old_prio;
6118 4408
6119 if (on_rq) { 4409 if (on_rq) {
6120 enqueue_task(rq, p, 0); 4410 enqueue_task(rq, p, 0, false);
6121 /* 4411 /*
6122 * If the task increased its priority or is running and 4412 * If the task increased its priority or is running and
6123 * lowered its priority, then reschedule its CPU: 4413 * lowered its priority, then reschedule its CPU:
@@ -6140,7 +4430,7 @@ int can_nice(const struct task_struct *p, const int nice)
6140 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4430 /* convert nice value [19,-20] to rlimit style value [1,40] */
6141 int nice_rlim = 20 - nice; 4431 int nice_rlim = 20 - nice;
6142 4432
6143 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4433 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6144 capable(CAP_SYS_NICE)); 4434 capable(CAP_SYS_NICE));
6145} 4435}
6146 4436
@@ -6243,25 +4533,16 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6243 BUG_ON(p->se.on_rq); 4533 BUG_ON(p->se.on_rq);
6244 4534
6245 p->policy = policy; 4535 p->policy = policy;
6246 switch (p->policy) {
6247 case SCHED_NORMAL:
6248 case SCHED_BATCH:
6249 case SCHED_IDLE:
6250 p->sched_class = &fair_sched_class;
6251 break;
6252 case SCHED_FIFO:
6253 case SCHED_RR:
6254 p->sched_class = &rt_sched_class;
6255 break;
6256 case SCHED_LITMUS:
6257 p->sched_class = &litmus_sched_class;
6258 break;
6259 }
6260
6261 p->rt_priority = prio; 4536 p->rt_priority = prio;
6262 p->normal_prio = normal_prio(p); 4537 p->normal_prio = normal_prio(p);
6263 /* we are holding p->pi_lock already */ 4538 /* we are holding p->pi_lock already */
6264 p->prio = rt_mutex_getprio(p); 4539 p->prio = rt_mutex_getprio(p);
4540 if (p->policy == SCHED_LITMUS)
4541 p->sched_class = &litmus_sched_class;
4542 else if (rt_prio(p->prio))
4543 p->sched_class = &rt_sched_class;
4544 else
4545 p->sched_class = &fair_sched_class;
6265 set_load_weight(p); 4546 set_load_weight(p);
6266} 4547}
6267 4548
@@ -6286,7 +4567,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6286{ 4567{
6287 int retval, oldprio, oldpolicy = -1, on_rq, running; 4568 int retval, oldprio, oldpolicy = -1, on_rq, running;
6288 unsigned long flags; 4569 unsigned long flags;
6289 const struct sched_class *prev_class = p->sched_class; 4570 const struct sched_class *prev_class;
6290 struct rq *rq; 4571 struct rq *rq;
6291 int reset_on_fork; 4572 int reset_on_fork;
6292 4573
@@ -6330,7 +4611,7 @@ recheck:
6330 4611
6331 if (!lock_task_sighand(p, &flags)) 4612 if (!lock_task_sighand(p, &flags))
6332 return -ESRCH; 4613 return -ESRCH;
6333 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4614 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6334 unlock_task_sighand(p, &flags); 4615 unlock_task_sighand(p, &flags);
6335 4616
6336 /* can't set/change the rt policy */ 4617 /* can't set/change the rt policy */
@@ -6384,7 +4665,7 @@ recheck:
6384 * make sure no PI-waiters arrive (or leave) while we are 4665 * make sure no PI-waiters arrive (or leave) while we are
6385 * changing the priority of the task: 4666 * changing the priority of the task:
6386 */ 4667 */
6387 spin_lock_irqsave(&p->pi_lock, flags); 4668 raw_spin_lock_irqsave(&p->pi_lock, flags);
6388 /* 4669 /*
6389 * To be able to change p->policy safely, the apropriate 4670 * To be able to change p->policy safely, the apropriate
6390 * runqueue lock must be held. 4671 * runqueue lock must be held.
@@ -6394,7 +4675,7 @@ recheck:
6394 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4675 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6395 policy = oldpolicy = -1; 4676 policy = oldpolicy = -1;
6396 __task_rq_unlock(rq); 4677 __task_rq_unlock(rq);
6397 spin_unlock_irqrestore(&p->pi_lock, flags); 4678 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6398 goto recheck; 4679 goto recheck;
6399 } 4680 }
6400 update_rq_clock(rq); 4681 update_rq_clock(rq);
@@ -6411,6 +4692,7 @@ recheck:
6411 litmus_exit_task(p); 4692 litmus_exit_task(p);
6412 4693
6413 oldprio = p->prio; 4694 oldprio = p->prio;
4695 prev_class = p->sched_class;
6414 __setscheduler(rq, p, policy, param->sched_priority); 4696 __setscheduler(rq, p, policy, param->sched_priority);
6415 4697
6416 if (policy == SCHED_LITMUS) { 4698 if (policy == SCHED_LITMUS) {
@@ -6427,7 +4709,7 @@ recheck:
6427 check_class_changed(rq, p, prev_class, oldprio, running); 4709 check_class_changed(rq, p, prev_class, oldprio, running);
6428 } 4710 }
6429 __task_rq_unlock(rq); 4711 __task_rq_unlock(rq);
6430 spin_unlock_irqrestore(&p->pi_lock, flags); 4712 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6431 4713
6432 rt_mutex_adjust_pi(p); 4714 rt_mutex_adjust_pi(p);
6433 4715
@@ -6527,7 +4809,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6527 return -EINVAL; 4809 return -EINVAL;
6528 4810
6529 retval = -ESRCH; 4811 retval = -ESRCH;
6530 read_lock(&tasklist_lock); 4812 rcu_read_lock();
6531 p = find_process_by_pid(pid); 4813 p = find_process_by_pid(pid);
6532 if (p) { 4814 if (p) {
6533 retval = security_task_getscheduler(p); 4815 retval = security_task_getscheduler(p);
@@ -6535,7 +4817,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6535 retval = p->policy 4817 retval = p->policy
6536 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4818 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6537 } 4819 }
6538 read_unlock(&tasklist_lock); 4820 rcu_read_unlock();
6539 return retval; 4821 return retval;
6540} 4822}
6541 4823
@@ -6553,7 +4835,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6553 if (!param || pid < 0) 4835 if (!param || pid < 0)
6554 return -EINVAL; 4836 return -EINVAL;
6555 4837
6556 read_lock(&tasklist_lock); 4838 rcu_read_lock();
6557 p = find_process_by_pid(pid); 4839 p = find_process_by_pid(pid);
6558 retval = -ESRCH; 4840 retval = -ESRCH;
6559 if (!p) 4841 if (!p)
@@ -6564,7 +4846,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6564 goto out_unlock; 4846 goto out_unlock;
6565 4847
6566 lp.sched_priority = p->rt_priority; 4848 lp.sched_priority = p->rt_priority;
6567 read_unlock(&tasklist_lock); 4849 rcu_read_unlock();
6568 4850
6569 /* 4851 /*
6570 * This one might sleep, we cannot do it with a spinlock held ... 4852 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6574,7 +4856,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6574 return retval; 4856 return retval;
6575 4857
6576out_unlock: 4858out_unlock:
6577 read_unlock(&tasklist_lock); 4859 rcu_read_unlock();
6578 return retval; 4860 return retval;
6579} 4861}
6580 4862
@@ -6585,23 +4867,19 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6585 int retval; 4867 int retval;
6586 4868
6587 get_online_cpus(); 4869 get_online_cpus();
6588 read_lock(&tasklist_lock); 4870 rcu_read_lock();
6589 4871
6590 p = find_process_by_pid(pid); 4872 p = find_process_by_pid(pid);
6591 /* Don't set affinity if task not found and for LITMUS tasks */ 4873 /* Don't set affinity if task not found and for LITMUS tasks */
6592 if (!p || is_realtime(p)) { 4874 if (!p || is_realtime(p)) {
6593 read_unlock(&tasklist_lock); 4875 rcu_read_unlock();
6594 put_online_cpus(); 4876 put_online_cpus();
6595 return p ? -EPERM : -ESRCH; 4877 return p ? -EPERM : -ESRCH;
6596 } 4878 }
6597 4879
6598 /* 4880 /* Prevent p going away */
6599 * It is not safe to call set_cpus_allowed with the
6600 * tasklist_lock held. We will bump the task_struct's
6601 * usage count and then drop tasklist_lock.
6602 */
6603 get_task_struct(p); 4881 get_task_struct(p);
6604 read_unlock(&tasklist_lock); 4882 rcu_read_unlock();
6605 4883
6606 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4884 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6607 retval = -ENOMEM; 4885 retval = -ENOMEM;
@@ -6682,10 +4960,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6682long sched_getaffinity(pid_t pid, struct cpumask *mask) 4960long sched_getaffinity(pid_t pid, struct cpumask *mask)
6683{ 4961{
6684 struct task_struct *p; 4962 struct task_struct *p;
4963 unsigned long flags;
4964 struct rq *rq;
6685 int retval; 4965 int retval;
6686 4966
6687 get_online_cpus(); 4967 get_online_cpus();
6688 read_lock(&tasklist_lock); 4968 rcu_read_lock();
6689 4969
6690 retval = -ESRCH; 4970 retval = -ESRCH;
6691 p = find_process_by_pid(pid); 4971 p = find_process_by_pid(pid);
@@ -6696,10 +4976,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6696 if (retval) 4976 if (retval)
6697 goto out_unlock; 4977 goto out_unlock;
6698 4978
4979 rq = task_rq_lock(p, &flags);
6699 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 4980 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4981 task_rq_unlock(rq, &flags);
6700 4982
6701out_unlock: 4983out_unlock:
6702 read_unlock(&tasklist_lock); 4984 rcu_read_unlock();
6703 put_online_cpus(); 4985 put_online_cpus();
6704 4986
6705 return retval; 4987 return retval;
@@ -6717,7 +4999,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6717 int ret; 4999 int ret;
6718 cpumask_var_t mask; 5000 cpumask_var_t mask;
6719 5001
6720 if (len < cpumask_size()) 5002 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5003 return -EINVAL;
5004 if (len & (sizeof(unsigned long)-1))
6721 return -EINVAL; 5005 return -EINVAL;
6722 5006
6723 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 5007 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -6725,10 +5009,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6725 5009
6726 ret = sched_getaffinity(pid, mask); 5010 ret = sched_getaffinity(pid, mask);
6727 if (ret == 0) { 5011 if (ret == 0) {
6728 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 5012 size_t retlen = min_t(size_t, len, cpumask_size());
5013
5014 if (copy_to_user(user_mask_ptr, mask, retlen))
6729 ret = -EFAULT; 5015 ret = -EFAULT;
6730 else 5016 else
6731 ret = cpumask_size(); 5017 ret = retlen;
6732 } 5018 }
6733 free_cpumask_var(mask); 5019 free_cpumask_var(mask);
6734 5020
@@ -6754,7 +5040,7 @@ SYSCALL_DEFINE0(sched_yield)
6754 */ 5040 */
6755 __release(rq->lock); 5041 __release(rq->lock);
6756 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 5042 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6757 _raw_spin_unlock(&rq->lock); 5043 do_raw_spin_unlock(&rq->lock);
6758 preempt_enable_no_resched(); 5044 preempt_enable_no_resched();
6759 5045
6760 schedule(); 5046 schedule();
@@ -6934,6 +5220,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6934{ 5220{
6935 struct task_struct *p; 5221 struct task_struct *p;
6936 unsigned int time_slice; 5222 unsigned int time_slice;
5223 unsigned long flags;
5224 struct rq *rq;
6937 int retval; 5225 int retval;
6938 struct timespec t; 5226 struct timespec t;
6939 5227
@@ -6941,7 +5229,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6941 return -EINVAL; 5229 return -EINVAL;
6942 5230
6943 retval = -ESRCH; 5231 retval = -ESRCH;
6944 read_lock(&tasklist_lock); 5232 rcu_read_lock();
6945 p = find_process_by_pid(pid); 5233 p = find_process_by_pid(pid);
6946 if (!p) 5234 if (!p)
6947 goto out_unlock; 5235 goto out_unlock;
@@ -6950,15 +5238,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6950 if (retval) 5238 if (retval)
6951 goto out_unlock; 5239 goto out_unlock;
6952 5240
6953 time_slice = p->sched_class->get_rr_interval(p); 5241 rq = task_rq_lock(p, &flags);
5242 time_slice = p->sched_class->get_rr_interval(rq, p);
5243 task_rq_unlock(rq, &flags);
6954 5244
6955 read_unlock(&tasklist_lock); 5245 rcu_read_unlock();
6956 jiffies_to_timespec(time_slice, &t); 5246 jiffies_to_timespec(time_slice, &t);
6957 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5247 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6958 return retval; 5248 return retval;
6959 5249
6960out_unlock: 5250out_unlock:
6961 read_unlock(&tasklist_lock); 5251 rcu_read_unlock();
6962 return retval; 5252 return retval;
6963} 5253}
6964 5254
@@ -7024,7 +5314,7 @@ void show_state_filter(unsigned long state_filter)
7024 /* 5314 /*
7025 * Only show locks if all tasks are dumped: 5315 * Only show locks if all tasks are dumped:
7026 */ 5316 */
7027 if (state_filter == -1) 5317 if (!state_filter)
7028 debug_show_all_locks(); 5318 debug_show_all_locks();
7029} 5319}
7030 5320
@@ -7046,12 +5336,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
7046 struct rq *rq = cpu_rq(cpu); 5336 struct rq *rq = cpu_rq(cpu);
7047 unsigned long flags; 5337 unsigned long flags;
7048 5338
7049 spin_lock_irqsave(&rq->lock, flags); 5339 raw_spin_lock_irqsave(&rq->lock, flags);
7050 5340
7051 __sched_fork(idle); 5341 __sched_fork(idle);
5342 idle->state = TASK_RUNNING;
7052 idle->se.exec_start = sched_clock(); 5343 idle->se.exec_start = sched_clock();
7053 5344
7054 idle->prio = idle->normal_prio = MAX_PRIO;
7055 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5345 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
7056 __set_task_cpu(idle, cpu); 5346 __set_task_cpu(idle, cpu);
7057 5347
@@ -7059,7 +5349,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
7059#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5349#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
7060 idle->oncpu = 1; 5350 idle->oncpu = 1;
7061#endif 5351#endif
7062 spin_unlock_irqrestore(&rq->lock, flags); 5352 raw_spin_unlock_irqrestore(&rq->lock, flags);
7063 5353
7064 /* Set the preempt count _outside_ the spinlocks! */ 5354 /* Set the preempt count _outside_ the spinlocks! */
7065#if defined(CONFIG_PREEMPT) 5355#if defined(CONFIG_PREEMPT)
@@ -7092,22 +5382,43 @@ cpumask_var_t nohz_cpu_mask;
7092 * 5382 *
7093 * This idea comes from the SD scheduler of Con Kolivas: 5383 * This idea comes from the SD scheduler of Con Kolivas:
7094 */ 5384 */
7095static inline void sched_init_granularity(void) 5385static int get_update_sysctl_factor(void)
7096{ 5386{
7097 unsigned int factor = 1 + ilog2(num_online_cpus()); 5387 unsigned int cpus = min_t(int, num_online_cpus(), 8);
7098 const unsigned long limit = 200000000; 5388 unsigned int factor;
5389
5390 switch (sysctl_sched_tunable_scaling) {
5391 case SCHED_TUNABLESCALING_NONE:
5392 factor = 1;
5393 break;
5394 case SCHED_TUNABLESCALING_LINEAR:
5395 factor = cpus;
5396 break;
5397 case SCHED_TUNABLESCALING_LOG:
5398 default:
5399 factor = 1 + ilog2(cpus);
5400 break;
5401 }
7099 5402
7100 sysctl_sched_min_granularity *= factor; 5403 return factor;
7101 if (sysctl_sched_min_granularity > limit) 5404}
7102 sysctl_sched_min_granularity = limit;
7103 5405
7104 sysctl_sched_latency *= factor; 5406static void update_sysctl(void)
7105 if (sysctl_sched_latency > limit) 5407{
7106 sysctl_sched_latency = limit; 5408 unsigned int factor = get_update_sysctl_factor();
7107 5409
7108 sysctl_sched_wakeup_granularity *= factor; 5410#define SET_SYSCTL(name) \
5411 (sysctl_##name = (factor) * normalized_sysctl_##name)
5412 SET_SYSCTL(sched_min_granularity);
5413 SET_SYSCTL(sched_latency);
5414 SET_SYSCTL(sched_wakeup_granularity);
5415 SET_SYSCTL(sched_shares_ratelimit);
5416#undef SET_SYSCTL
5417}
7109 5418
7110 sysctl_sched_shares_ratelimit *= factor; 5419static inline void sched_init_granularity(void)
5420{
5421 update_sysctl();
7111} 5422}
7112 5423
7113#ifdef CONFIG_SMP 5424#ifdef CONFIG_SMP
@@ -7144,7 +5455,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7144 int ret = 0; 5455 int ret = 0;
7145 5456
7146 rq = task_rq_lock(p, &flags); 5457 rq = task_rq_lock(p, &flags);
7147 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 5458
5459 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7148 ret = -EINVAL; 5460 ret = -EINVAL;
7149 goto out; 5461 goto out;
7150 } 5462 }
@@ -7166,13 +5478,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7166 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5478 if (cpumask_test_cpu(task_cpu(p), new_mask))
7167 goto out; 5479 goto out;
7168 5480
7169 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 5481 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7170 /* Need help from migration thread: drop lock and wait. */ 5482 /* Need help from migration thread: drop lock and wait. */
7171 struct task_struct *mt = rq->migration_thread; 5483 struct task_struct *mt = rq->migration_thread;
7172 5484
7173 get_task_struct(mt); 5485 get_task_struct(mt);
7174 task_rq_unlock(rq, &flags); 5486 task_rq_unlock(rq, &flags);
7175 wake_up_process(rq->migration_thread); 5487 wake_up_process(mt);
7176 put_task_struct(mt); 5488 put_task_struct(mt);
7177 wait_for_completion(&req.done); 5489 wait_for_completion(&req.done);
7178 tlb_migrate_finish(p->mm); 5490 tlb_migrate_finish(p->mm);
@@ -7199,7 +5511,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7199static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5511static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7200{ 5512{
7201 struct rq *rq_dest, *rq_src; 5513 struct rq *rq_dest, *rq_src;
7202 int ret = 0, on_rq; 5514 int ret = 0;
7203 5515
7204 if (unlikely(!cpu_active(dest_cpu))) 5516 if (unlikely(!cpu_active(dest_cpu)))
7205 return ret; 5517 return ret;
@@ -7215,12 +5527,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7215 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 5527 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7216 goto fail; 5528 goto fail;
7217 5529
7218 on_rq = p->se.on_rq; 5530 /*
7219 if (on_rq) 5531 * If we're not on a rq, the next wake-up will ensure we're
5532 * placed properly.
5533 */
5534 if (p->se.on_rq) {
7220 deactivate_task(rq_src, p, 0); 5535 deactivate_task(rq_src, p, 0);
7221 5536 set_task_cpu(p, dest_cpu);
7222 set_task_cpu(p, dest_cpu);
7223 if (on_rq) {
7224 activate_task(rq_dest, p, 0); 5537 activate_task(rq_dest, p, 0);
7225 check_preempt_curr(rq_dest, p, 0); 5538 check_preempt_curr(rq_dest, p, 0);
7226 } 5539 }
@@ -7255,10 +5568,10 @@ static int migration_thread(void *data)
7255 struct migration_req *req; 5568 struct migration_req *req;
7256 struct list_head *head; 5569 struct list_head *head;
7257 5570
7258 spin_lock_irq(&rq->lock); 5571 raw_spin_lock_irq(&rq->lock);
7259 5572
7260 if (cpu_is_offline(cpu)) { 5573 if (cpu_is_offline(cpu)) {
7261 spin_unlock_irq(&rq->lock); 5574 raw_spin_unlock_irq(&rq->lock);
7262 break; 5575 break;
7263 } 5576 }
7264 5577
@@ -7270,7 +5583,7 @@ static int migration_thread(void *data)
7270 head = &rq->migration_queue; 5583 head = &rq->migration_queue;
7271 5584
7272 if (list_empty(head)) { 5585 if (list_empty(head)) {
7273 spin_unlock_irq(&rq->lock); 5586 raw_spin_unlock_irq(&rq->lock);
7274 schedule(); 5587 schedule();
7275 set_current_state(TASK_INTERRUPTIBLE); 5588 set_current_state(TASK_INTERRUPTIBLE);
7276 continue; 5589 continue;
@@ -7279,14 +5592,14 @@ static int migration_thread(void *data)
7279 list_del_init(head->next); 5592 list_del_init(head->next);
7280 5593
7281 if (req->task != NULL) { 5594 if (req->task != NULL) {
7282 spin_unlock(&rq->lock); 5595 raw_spin_unlock(&rq->lock);
7283 __migrate_task(req->task, cpu, req->dest_cpu); 5596 __migrate_task(req->task, cpu, req->dest_cpu);
7284 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 5597 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7285 req->dest_cpu = RCU_MIGRATION_GOT_QS; 5598 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7286 spin_unlock(&rq->lock); 5599 raw_spin_unlock(&rq->lock);
7287 } else { 5600 } else {
7288 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 5601 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7289 spin_unlock(&rq->lock); 5602 raw_spin_unlock(&rq->lock);
7290 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 5603 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7291 } 5604 }
7292 local_irq_enable(); 5605 local_irq_enable();
@@ -7316,37 +5629,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7316static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5629static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7317{ 5630{
7318 int dest_cpu; 5631 int dest_cpu;
7319 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7320 5632
7321again: 5633again:
7322 /* Look for allowed, online CPU in same node. */ 5634 dest_cpu = select_fallback_rq(dead_cpu, p);
7323 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
7324 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7325 goto move;
7326
7327 /* Any allowed, online CPU? */
7328 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
7329 if (dest_cpu < nr_cpu_ids)
7330 goto move;
7331
7332 /* No more Mr. Nice Guy. */
7333 if (dest_cpu >= nr_cpu_ids) {
7334 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7335 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
7336
7337 /*
7338 * Don't tell them about moving exiting tasks or
7339 * kernel threads (both mm NULL), since they never
7340 * leave kernel.
7341 */
7342 if (p->mm && printk_ratelimit()) {
7343 printk(KERN_INFO "process %d (%s) no "
7344 "longer affine to cpu%d\n",
7345 task_pid_nr(p), p->comm, dead_cpu);
7346 }
7347 }
7348 5635
7349move:
7350 /* It can have affinity changed while we were choosing. */ 5636 /* It can have affinity changed while we were choosing. */
7351 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5637 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7352 goto again; 5638 goto again;
@@ -7361,7 +5647,7 @@ move:
7361 */ 5647 */
7362static void migrate_nr_uninterruptible(struct rq *rq_src) 5648static void migrate_nr_uninterruptible(struct rq *rq_src)
7363{ 5649{
7364 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 5650 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7365 unsigned long flags; 5651 unsigned long flags;
7366 5652
7367 local_irq_save(flags); 5653 local_irq_save(flags);
@@ -7409,14 +5695,14 @@ void sched_idle_next(void)
7409 * Strictly not necessary since rest of the CPUs are stopped by now 5695 * Strictly not necessary since rest of the CPUs are stopped by now
7410 * and interrupts disabled on the current cpu. 5696 * and interrupts disabled on the current cpu.
7411 */ 5697 */
7412 spin_lock_irqsave(&rq->lock, flags); 5698 raw_spin_lock_irqsave(&rq->lock, flags);
7413 5699
7414 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5700 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7415 5701
7416 update_rq_clock(rq); 5702 update_rq_clock(rq);
7417 activate_task(rq, p, 0); 5703 activate_task(rq, p, 0);
7418 5704
7419 spin_unlock_irqrestore(&rq->lock, flags); 5705 raw_spin_unlock_irqrestore(&rq->lock, flags);
7420} 5706}
7421 5707
7422/* 5708/*
@@ -7452,9 +5738,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7452 * that's OK. No task can be added to this CPU, so iteration is 5738 * that's OK. No task can be added to this CPU, so iteration is
7453 * fine. 5739 * fine.
7454 */ 5740 */
7455 spin_unlock_irq(&rq->lock); 5741 raw_spin_unlock_irq(&rq->lock);
7456 move_task_off_dead_cpu(dead_cpu, p); 5742 move_task_off_dead_cpu(dead_cpu, p);
7457 spin_lock_irq(&rq->lock); 5743 raw_spin_lock_irq(&rq->lock);
7458 5744
7459 put_task_struct(p); 5745 put_task_struct(p);
7460} 5746}
@@ -7495,17 +5781,16 @@ static struct ctl_table sd_ctl_dir[] = {
7495 .procname = "sched_domain", 5781 .procname = "sched_domain",
7496 .mode = 0555, 5782 .mode = 0555,
7497 }, 5783 },
7498 {0, }, 5784 {}
7499}; 5785};
7500 5786
7501static struct ctl_table sd_ctl_root[] = { 5787static struct ctl_table sd_ctl_root[] = {
7502 { 5788 {
7503 .ctl_name = CTL_KERN,
7504 .procname = "kernel", 5789 .procname = "kernel",
7505 .mode = 0555, 5790 .mode = 0555,
7506 .child = sd_ctl_dir, 5791 .child = sd_ctl_dir,
7507 }, 5792 },
7508 {0, }, 5793 {}
7509}; 5794};
7510 5795
7511static struct ctl_table *sd_alloc_ctl_entry(int n) 5796static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7615,7 +5900,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7615static struct ctl_table_header *sd_sysctl_header; 5900static struct ctl_table_header *sd_sysctl_header;
7616static void register_sched_domain_sysctl(void) 5901static void register_sched_domain_sysctl(void)
7617{ 5902{
7618 int i, cpu_num = num_online_cpus(); 5903 int i, cpu_num = num_possible_cpus();
7619 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5904 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7620 char buf[32]; 5905 char buf[32];
7621 5906
@@ -7625,7 +5910,7 @@ static void register_sched_domain_sysctl(void)
7625 if (entry == NULL) 5910 if (entry == NULL)
7626 return; 5911 return;
7627 5912
7628 for_each_online_cpu(i) { 5913 for_each_possible_cpu(i) {
7629 snprintf(buf, 32, "cpu%d", i); 5914 snprintf(buf, 32, "cpu%d", i);
7630 entry->procname = kstrdup(buf, GFP_KERNEL); 5915 entry->procname = kstrdup(buf, GFP_KERNEL);
7631 entry->mode = 0555; 5916 entry->mode = 0555;
@@ -7721,13 +6006,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7721 6006
7722 /* Update our root-domain */ 6007 /* Update our root-domain */
7723 rq = cpu_rq(cpu); 6008 rq = cpu_rq(cpu);
7724 spin_lock_irqsave(&rq->lock, flags); 6009 raw_spin_lock_irqsave(&rq->lock, flags);
7725 if (rq->rd) { 6010 if (rq->rd) {
7726 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6011 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7727 6012
7728 set_rq_online(rq); 6013 set_rq_online(rq);
7729 } 6014 }
7730 spin_unlock_irqrestore(&rq->lock, flags); 6015 raw_spin_unlock_irqrestore(&rq->lock, flags);
7731 break; 6016 break;
7732 6017
7733#ifdef CONFIG_HOTPLUG_CPU 6018#ifdef CONFIG_HOTPLUG_CPU
@@ -7752,14 +6037,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7752 put_task_struct(rq->migration_thread); 6037 put_task_struct(rq->migration_thread);
7753 rq->migration_thread = NULL; 6038 rq->migration_thread = NULL;
7754 /* Idle task back to normal (off runqueue, low prio) */ 6039 /* Idle task back to normal (off runqueue, low prio) */
7755 spin_lock_irq(&rq->lock); 6040 raw_spin_lock_irq(&rq->lock);
7756 update_rq_clock(rq); 6041 update_rq_clock(rq);
7757 deactivate_task(rq, rq->idle, 0); 6042 deactivate_task(rq, rq->idle, 0);
7758 rq->idle->static_prio = MAX_PRIO;
7759 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 6043 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7760 rq->idle->sched_class = &idle_sched_class; 6044 rq->idle->sched_class = &idle_sched_class;
7761 migrate_dead_tasks(cpu); 6045 migrate_dead_tasks(cpu);
7762 spin_unlock_irq(&rq->lock); 6046 raw_spin_unlock_irq(&rq->lock);
7763 cpuset_unlock(); 6047 cpuset_unlock();
7764 migrate_nr_uninterruptible(rq); 6048 migrate_nr_uninterruptible(rq);
7765 BUG_ON(rq->nr_running != 0); 6049 BUG_ON(rq->nr_running != 0);
@@ -7769,30 +6053,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7769 * they didn't take sched_hotcpu_mutex. Just wake up 6053 * they didn't take sched_hotcpu_mutex. Just wake up
7770 * the requestors. 6054 * the requestors.
7771 */ 6055 */
7772 spin_lock_irq(&rq->lock); 6056 raw_spin_lock_irq(&rq->lock);
7773 while (!list_empty(&rq->migration_queue)) { 6057 while (!list_empty(&rq->migration_queue)) {
7774 struct migration_req *req; 6058 struct migration_req *req;
7775 6059
7776 req = list_entry(rq->migration_queue.next, 6060 req = list_entry(rq->migration_queue.next,
7777 struct migration_req, list); 6061 struct migration_req, list);
7778 list_del_init(&req->list); 6062 list_del_init(&req->list);
7779 spin_unlock_irq(&rq->lock); 6063 raw_spin_unlock_irq(&rq->lock);
7780 complete(&req->done); 6064 complete(&req->done);
7781 spin_lock_irq(&rq->lock); 6065 raw_spin_lock_irq(&rq->lock);
7782 } 6066 }
7783 spin_unlock_irq(&rq->lock); 6067 raw_spin_unlock_irq(&rq->lock);
7784 break; 6068 break;
7785 6069
7786 case CPU_DYING: 6070 case CPU_DYING:
7787 case CPU_DYING_FROZEN: 6071 case CPU_DYING_FROZEN:
7788 /* Update our root-domain */ 6072 /* Update our root-domain */
7789 rq = cpu_rq(cpu); 6073 rq = cpu_rq(cpu);
7790 spin_lock_irqsave(&rq->lock, flags); 6074 raw_spin_lock_irqsave(&rq->lock, flags);
7791 if (rq->rd) { 6075 if (rq->rd) {
7792 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6076 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7793 set_rq_offline(rq); 6077 set_rq_offline(rq);
7794 } 6078 }
7795 spin_unlock_irqrestore(&rq->lock, flags); 6079 raw_spin_unlock_irqrestore(&rq->lock, flags);
7796 break; 6080 break;
7797#endif 6081#endif
7798 } 6082 }
@@ -7829,6 +6113,16 @@ early_initcall(migration_init);
7829 6113
7830#ifdef CONFIG_SCHED_DEBUG 6114#ifdef CONFIG_SCHED_DEBUG
7831 6115
6116static __read_mostly int sched_domain_debug_enabled;
6117
6118static int __init sched_domain_debug_setup(char *str)
6119{
6120 sched_domain_debug_enabled = 1;
6121
6122 return 0;
6123}
6124early_param("sched_debug", sched_domain_debug_setup);
6125
7832static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6126static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7833 struct cpumask *groupmask) 6127 struct cpumask *groupmask)
7834{ 6128{
@@ -7915,6 +6209,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7915 cpumask_var_t groupmask; 6209 cpumask_var_t groupmask;
7916 int level = 0; 6210 int level = 0;
7917 6211
6212 if (!sched_domain_debug_enabled)
6213 return;
6214
7918 if (!sd) { 6215 if (!sd) {
7919 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6216 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7920 return; 6217 return;
@@ -7994,6 +6291,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7994 6291
7995static void free_rootdomain(struct root_domain *rd) 6292static void free_rootdomain(struct root_domain *rd)
7996{ 6293{
6294 synchronize_sched();
6295
7997 cpupri_cleanup(&rd->cpupri); 6296 cpupri_cleanup(&rd->cpupri);
7998 6297
7999 free_cpumask_var(rd->rto_mask); 6298 free_cpumask_var(rd->rto_mask);
@@ -8007,7 +6306,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
8007 struct root_domain *old_rd = NULL; 6306 struct root_domain *old_rd = NULL;
8008 unsigned long flags; 6307 unsigned long flags;
8009 6308
8010 spin_lock_irqsave(&rq->lock, flags); 6309 raw_spin_lock_irqsave(&rq->lock, flags);
8011 6310
8012 if (rq->rd) { 6311 if (rq->rd) {
8013 old_rd = rq->rd; 6312 old_rd = rq->rd;
@@ -8033,7 +6332,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
8033 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6332 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
8034 set_rq_online(rq); 6333 set_rq_online(rq);
8035 6334
8036 spin_unlock_irqrestore(&rq->lock, flags); 6335 raw_spin_unlock_irqrestore(&rq->lock, flags);
8037 6336
8038 if (old_rd) 6337 if (old_rd)
8039 free_rootdomain(old_rd); 6338 free_rootdomain(old_rd);
@@ -8134,6 +6433,7 @@ static cpumask_var_t cpu_isolated_map;
8134/* Setup the mask of cpus configured for isolated domains */ 6433/* Setup the mask of cpus configured for isolated domains */
8135static int __init isolated_cpu_setup(char *str) 6434static int __init isolated_cpu_setup(char *str)
8136{ 6435{
6436 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8137 cpulist_parse(str, cpu_isolated_map); 6437 cpulist_parse(str, cpu_isolated_map);
8138 return 1; 6438 return 1;
8139} 6439}
@@ -8318,14 +6618,14 @@ enum s_alloc {
8318 */ 6618 */
8319#ifdef CONFIG_SCHED_SMT 6619#ifdef CONFIG_SCHED_SMT
8320static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 6620static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8321static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 6621static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
8322 6622
8323static int 6623static int
8324cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6624cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8325 struct sched_group **sg, struct cpumask *unused) 6625 struct sched_group **sg, struct cpumask *unused)
8326{ 6626{
8327 if (sg) 6627 if (sg)
8328 *sg = &per_cpu(sched_group_cpus, cpu).sg; 6628 *sg = &per_cpu(sched_groups, cpu).sg;
8329 return cpu; 6629 return cpu;
8330} 6630}
8331#endif /* CONFIG_SCHED_SMT */ 6631#endif /* CONFIG_SCHED_SMT */
@@ -8970,7 +7270,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8970 return __build_sched_domains(cpu_map, NULL); 7270 return __build_sched_domains(cpu_map, NULL);
8971} 7271}
8972 7272
8973static struct cpumask *doms_cur; /* current sched domains */ 7273static cpumask_var_t *doms_cur; /* current sched domains */
8974static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7274static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8975static struct sched_domain_attr *dattr_cur; 7275static struct sched_domain_attr *dattr_cur;
8976 /* attribues of custom domains in 'doms_cur' */ 7276 /* attribues of custom domains in 'doms_cur' */
@@ -8992,6 +7292,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8992 return 0; 7292 return 0;
8993} 7293}
8994 7294
7295cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7296{
7297 int i;
7298 cpumask_var_t *doms;
7299
7300 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7301 if (!doms)
7302 return NULL;
7303 for (i = 0; i < ndoms; i++) {
7304 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7305 free_sched_domains(doms, i);
7306 return NULL;
7307 }
7308 }
7309 return doms;
7310}
7311
7312void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7313{
7314 unsigned int i;
7315 for (i = 0; i < ndoms; i++)
7316 free_cpumask_var(doms[i]);
7317 kfree(doms);
7318}
7319
8995/* 7320/*
8996 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7321 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8997 * For now this just excludes isolated cpus, but could be used to 7322 * For now this just excludes isolated cpus, but could be used to
@@ -9003,12 +7328,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
9003 7328
9004 arch_update_cpu_topology(); 7329 arch_update_cpu_topology();
9005 ndoms_cur = 1; 7330 ndoms_cur = 1;
9006 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 7331 doms_cur = alloc_sched_domains(ndoms_cur);
9007 if (!doms_cur) 7332 if (!doms_cur)
9008 doms_cur = fallback_doms; 7333 doms_cur = &fallback_doms;
9009 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 7334 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
9010 dattr_cur = NULL; 7335 dattr_cur = NULL;
9011 err = build_sched_domains(doms_cur); 7336 err = build_sched_domains(doms_cur[0]);
9012 register_sched_domain_sysctl(); 7337 register_sched_domain_sysctl();
9013 7338
9014 return err; 7339 return err;
@@ -9058,19 +7383,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
9058 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7383 * doms_new[] to the current sched domain partitioning, doms_cur[].
9059 * It destroys each deleted domain and builds each new domain. 7384 * It destroys each deleted domain and builds each new domain.
9060 * 7385 *
9061 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 7386 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
9062 * The masks don't intersect (don't overlap.) We should setup one 7387 * The masks don't intersect (don't overlap.) We should setup one
9063 * sched domain for each mask. CPUs not in any of the cpumasks will 7388 * sched domain for each mask. CPUs not in any of the cpumasks will
9064 * not be load balanced. If the same cpumask appears both in the 7389 * not be load balanced. If the same cpumask appears both in the
9065 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7390 * current 'doms_cur' domains and in the new 'doms_new', we can leave
9066 * it as it is. 7391 * it as it is.
9067 * 7392 *
9068 * The passed in 'doms_new' should be kmalloc'd. This routine takes 7393 * The passed in 'doms_new' should be allocated using
9069 * ownership of it and will kfree it when done with it. If the caller 7394 * alloc_sched_domains. This routine takes ownership of it and will
9070 * failed the kmalloc call, then it can pass in doms_new == NULL && 7395 * free_sched_domains it when done with it. If the caller failed the
9071 * ndoms_new == 1, and partition_sched_domains() will fallback to 7396 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
9072 * the single partition 'fallback_doms', it also forces the domains 7397 * and partition_sched_domains() will fallback to the single partition
9073 * to be rebuilt. 7398 * 'fallback_doms', it also forces the domains to be rebuilt.
9074 * 7399 *
9075 * If doms_new == NULL it will be replaced with cpu_online_mask. 7400 * If doms_new == NULL it will be replaced with cpu_online_mask.
9076 * ndoms_new == 0 is a special case for destroying existing domains, 7401 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -9078,8 +7403,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
9078 * 7403 *
9079 * Call with hotplug lock held 7404 * Call with hotplug lock held
9080 */ 7405 */
9081/* FIXME: Change to struct cpumask *doms_new[] */ 7406void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
9082void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
9083 struct sched_domain_attr *dattr_new) 7407 struct sched_domain_attr *dattr_new)
9084{ 7408{
9085 int i, j, n; 7409 int i, j, n;
@@ -9098,40 +7422,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
9098 /* Destroy deleted domains */ 7422 /* Destroy deleted domains */
9099 for (i = 0; i < ndoms_cur; i++) { 7423 for (i = 0; i < ndoms_cur; i++) {
9100 for (j = 0; j < n && !new_topology; j++) { 7424 for (j = 0; j < n && !new_topology; j++) {
9101 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 7425 if (cpumask_equal(doms_cur[i], doms_new[j])
9102 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7426 && dattrs_equal(dattr_cur, i, dattr_new, j))
9103 goto match1; 7427 goto match1;
9104 } 7428 }
9105 /* no match - a current sched domain not in new doms_new[] */ 7429 /* no match - a current sched domain not in new doms_new[] */
9106 detach_destroy_domains(doms_cur + i); 7430 detach_destroy_domains(doms_cur[i]);
9107match1: 7431match1:
9108 ; 7432 ;
9109 } 7433 }
9110 7434
9111 if (doms_new == NULL) { 7435 if (doms_new == NULL) {
9112 ndoms_cur = 0; 7436 ndoms_cur = 0;
9113 doms_new = fallback_doms; 7437 doms_new = &fallback_doms;
9114 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 7438 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
9115 WARN_ON_ONCE(dattr_new); 7439 WARN_ON_ONCE(dattr_new);
9116 } 7440 }
9117 7441
9118 /* Build new domains */ 7442 /* Build new domains */
9119 for (i = 0; i < ndoms_new; i++) { 7443 for (i = 0; i < ndoms_new; i++) {
9120 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7444 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9121 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 7445 if (cpumask_equal(doms_new[i], doms_cur[j])
9122 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7446 && dattrs_equal(dattr_new, i, dattr_cur, j))
9123 goto match2; 7447 goto match2;
9124 } 7448 }
9125 /* no match - add a new doms_new */ 7449 /* no match - add a new doms_new */
9126 __build_sched_domains(doms_new + i, 7450 __build_sched_domains(doms_new[i],
9127 dattr_new ? dattr_new + i : NULL); 7451 dattr_new ? dattr_new + i : NULL);
9128match2: 7452match2:
9129 ; 7453 ;
9130 } 7454 }
9131 7455
9132 /* Remember the new sched domains */ 7456 /* Remember the new sched domains */
9133 if (doms_cur != fallback_doms) 7457 if (doms_cur != &fallback_doms)
9134 kfree(doms_cur); 7458 free_sched_domains(doms_cur, ndoms_cur);
9135 kfree(dattr_cur); /* kfree(NULL) is safe */ 7459 kfree(dattr_cur); /* kfree(NULL) is safe */
9136 doms_cur = doms_new; 7460 doms_cur = doms_new;
9137 dattr_cur = dattr_new; 7461 dattr_cur = dattr_new;
@@ -9183,11 +7507,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9183 7507
9184#ifdef CONFIG_SCHED_MC 7508#ifdef CONFIG_SCHED_MC
9185static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7509static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7510 struct sysdev_class_attribute *attr,
9186 char *page) 7511 char *page)
9187{ 7512{
9188 return sprintf(page, "%u\n", sched_mc_power_savings); 7513 return sprintf(page, "%u\n", sched_mc_power_savings);
9189} 7514}
9190static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7515static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7516 struct sysdev_class_attribute *attr,
9191 const char *buf, size_t count) 7517 const char *buf, size_t count)
9192{ 7518{
9193 return sched_power_savings_store(buf, count, 0); 7519 return sched_power_savings_store(buf, count, 0);
@@ -9199,11 +7525,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9199 7525
9200#ifdef CONFIG_SCHED_SMT 7526#ifdef CONFIG_SCHED_SMT
9201static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7527static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7528 struct sysdev_class_attribute *attr,
9202 char *page) 7529 char *page)
9203{ 7530{
9204 return sprintf(page, "%u\n", sched_smt_power_savings); 7531 return sprintf(page, "%u\n", sched_smt_power_savings);
9205} 7532}
9206static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7533static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7534 struct sysdev_class_attribute *attr,
9207 const char *buf, size_t count) 7535 const char *buf, size_t count)
9208{ 7536{
9209 return sched_power_savings_store(buf, count, 1); 7537 return sched_power_savings_store(buf, count, 1);
@@ -9242,8 +7570,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9242 switch (action) { 7570 switch (action) {
9243 case CPU_ONLINE: 7571 case CPU_ONLINE:
9244 case CPU_ONLINE_FROZEN: 7572 case CPU_ONLINE_FROZEN:
9245 case CPU_DEAD: 7573 case CPU_DOWN_PREPARE:
9246 case CPU_DEAD_FROZEN: 7574 case CPU_DOWN_PREPARE_FROZEN:
7575 case CPU_DOWN_FAILED:
7576 case CPU_DOWN_FAILED_FROZEN:
9247 partition_sched_domains(1, NULL, NULL); 7577 partition_sched_domains(1, NULL, NULL);
9248 return NOTIFY_OK; 7578 return NOTIFY_OK;
9249 7579
@@ -9290,7 +7620,7 @@ void __init sched_init_smp(void)
9290#endif 7620#endif
9291 get_online_cpus(); 7621 get_online_cpus();
9292 mutex_lock(&sched_domains_mutex); 7622 mutex_lock(&sched_domains_mutex);
9293 arch_init_sched_domains(cpu_online_mask); 7623 arch_init_sched_domains(cpu_active_mask);
9294 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7624 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9295 if (cpumask_empty(non_isolated_cpus)) 7625 if (cpumask_empty(non_isolated_cpus))
9296 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7626 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9363,13 +7693,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9363#ifdef CONFIG_SMP 7693#ifdef CONFIG_SMP
9364 rt_rq->rt_nr_migratory = 0; 7694 rt_rq->rt_nr_migratory = 0;
9365 rt_rq->overloaded = 0; 7695 rt_rq->overloaded = 0;
9366 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 7696 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
9367#endif 7697#endif
9368 7698
9369 rt_rq->rt_time = 0; 7699 rt_rq->rt_time = 0;
9370 rt_rq->rt_throttled = 0; 7700 rt_rq->rt_throttled = 0;
9371 rt_rq->rt_runtime = 0; 7701 rt_rq->rt_runtime = 0;
9372 spin_lock_init(&rt_rq->rt_runtime_lock); 7702 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
9373 7703
9374#ifdef CONFIG_RT_GROUP_SCHED 7704#ifdef CONFIG_RT_GROUP_SCHED
9375 rt_rq->rt_nr_boosted = 0; 7705 rt_rq->rt_nr_boosted = 0;
@@ -9416,7 +7746,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9416 tg->rt_rq[cpu] = rt_rq; 7746 tg->rt_rq[cpu] = rt_rq;
9417 init_rt_rq(rt_rq, rq); 7747 init_rt_rq(rt_rq, rq);
9418 rt_rq->tg = tg; 7748 rt_rq->tg = tg;
9419 rt_rq->rt_se = rt_se;
9420 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7749 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9421 if (add) 7750 if (add)
9422 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7751 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9447,16 +7776,9 @@ void __init sched_init(void)
9447#ifdef CONFIG_RT_GROUP_SCHED 7776#ifdef CONFIG_RT_GROUP_SCHED
9448 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7777 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9449#endif 7778#endif
9450#ifdef CONFIG_USER_SCHED
9451 alloc_size *= 2;
9452#endif
9453#ifdef CONFIG_CPUMASK_OFFSTACK 7779#ifdef CONFIG_CPUMASK_OFFSTACK
9454 alloc_size += num_possible_cpus() * cpumask_size(); 7780 alloc_size += num_possible_cpus() * cpumask_size();
9455#endif 7781#endif
9456 /*
9457 * As sched_init() is called before page_alloc is setup,
9458 * we use alloc_bootmem().
9459 */
9460 if (alloc_size) { 7782 if (alloc_size) {
9461 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7783 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9462 7784
@@ -9467,13 +7789,6 @@ void __init sched_init(void)
9467 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7789 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9468 ptr += nr_cpu_ids * sizeof(void **); 7790 ptr += nr_cpu_ids * sizeof(void **);
9469 7791
9470#ifdef CONFIG_USER_SCHED
9471 root_task_group.se = (struct sched_entity **)ptr;
9472 ptr += nr_cpu_ids * sizeof(void **);
9473
9474 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9475 ptr += nr_cpu_ids * sizeof(void **);
9476#endif /* CONFIG_USER_SCHED */
9477#endif /* CONFIG_FAIR_GROUP_SCHED */ 7792#endif /* CONFIG_FAIR_GROUP_SCHED */
9478#ifdef CONFIG_RT_GROUP_SCHED 7793#ifdef CONFIG_RT_GROUP_SCHED
9479 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7794 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9482,13 +7797,6 @@ void __init sched_init(void)
9482 init_task_group.rt_rq = (struct rt_rq **)ptr; 7797 init_task_group.rt_rq = (struct rt_rq **)ptr;
9483 ptr += nr_cpu_ids * sizeof(void **); 7798 ptr += nr_cpu_ids * sizeof(void **);
9484 7799
9485#ifdef CONFIG_USER_SCHED
9486 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9487 ptr += nr_cpu_ids * sizeof(void **);
9488
9489 root_task_group.rt_rq = (struct rt_rq **)ptr;
9490 ptr += nr_cpu_ids * sizeof(void **);
9491#endif /* CONFIG_USER_SCHED */
9492#endif /* CONFIG_RT_GROUP_SCHED */ 7800#endif /* CONFIG_RT_GROUP_SCHED */
9493#ifdef CONFIG_CPUMASK_OFFSTACK 7801#ifdef CONFIG_CPUMASK_OFFSTACK
9494 for_each_possible_cpu(i) { 7802 for_each_possible_cpu(i) {
@@ -9508,22 +7816,13 @@ void __init sched_init(void)
9508#ifdef CONFIG_RT_GROUP_SCHED 7816#ifdef CONFIG_RT_GROUP_SCHED
9509 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7817 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9510 global_rt_period(), global_rt_runtime()); 7818 global_rt_period(), global_rt_runtime());
9511#ifdef CONFIG_USER_SCHED
9512 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9513 global_rt_period(), RUNTIME_INF);
9514#endif /* CONFIG_USER_SCHED */
9515#endif /* CONFIG_RT_GROUP_SCHED */ 7819#endif /* CONFIG_RT_GROUP_SCHED */
9516 7820
9517#ifdef CONFIG_GROUP_SCHED 7821#ifdef CONFIG_CGROUP_SCHED
9518 list_add(&init_task_group.list, &task_groups); 7822 list_add(&init_task_group.list, &task_groups);
9519 INIT_LIST_HEAD(&init_task_group.children); 7823 INIT_LIST_HEAD(&init_task_group.children);
9520 7824
9521#ifdef CONFIG_USER_SCHED 7825#endif /* CONFIG_CGROUP_SCHED */
9522 INIT_LIST_HEAD(&root_task_group.children);
9523 init_task_group.parent = &root_task_group;
9524 list_add(&init_task_group.siblings, &root_task_group.children);
9525#endif /* CONFIG_USER_SCHED */
9526#endif /* CONFIG_GROUP_SCHED */
9527 7826
9528#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7827#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9529 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7828 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9533,7 +7832,7 @@ void __init sched_init(void)
9533 struct rq *rq; 7832 struct rq *rq;
9534 7833
9535 rq = cpu_rq(i); 7834 rq = cpu_rq(i);
9536 spin_lock_init(&rq->lock); 7835 raw_spin_lock_init(&rq->lock);
9537 rq->nr_running = 0; 7836 rq->nr_running = 0;
9538 rq->calc_load_active = 0; 7837 rq->calc_load_active = 0;
9539 rq->calc_load_update = jiffies + LOAD_FREQ; 7838 rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9563,25 +7862,6 @@ void __init sched_init(void)
9563 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7862 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9564 */ 7863 */
9565 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7864 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9566#elif defined CONFIG_USER_SCHED
9567 root_task_group.shares = NICE_0_LOAD;
9568 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9569 /*
9570 * In case of task-groups formed thr' the user id of tasks,
9571 * init_task_group represents tasks belonging to root user.
9572 * Hence it forms a sibling of all subsequent groups formed.
9573 * In this case, init_task_group gets only a fraction of overall
9574 * system cpu resource, based on the weight assigned to root
9575 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9576 * by letting tasks of init_task_group sit in a separate cfs_rq
9577 * (init_tg_cfs_rq) and having one entity represent this group of
9578 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9579 */
9580 init_tg_cfs_entry(&init_task_group,
9581 &per_cpu(init_tg_cfs_rq, i),
9582 &per_cpu(init_sched_entity, i), i, 1,
9583 root_task_group.se[i]);
9584
9585#endif 7865#endif
9586#endif /* CONFIG_FAIR_GROUP_SCHED */ 7866#endif /* CONFIG_FAIR_GROUP_SCHED */
9587 7867
@@ -9590,12 +7870,6 @@ void __init sched_init(void)
9590 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7870 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9591#ifdef CONFIG_CGROUP_SCHED 7871#ifdef CONFIG_CGROUP_SCHED
9592 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7872 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9593#elif defined CONFIG_USER_SCHED
9594 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9595 init_tg_rt_entry(&init_task_group,
9596 &per_cpu(init_rt_rq, i),
9597 &per_cpu(init_sched_rt_entity, i), i, 1,
9598 root_task_group.rt_se[i]);
9599#endif 7873#endif
9600#endif 7874#endif
9601 7875
@@ -9611,6 +7885,8 @@ void __init sched_init(void)
9611 rq->cpu = i; 7885 rq->cpu = i;
9612 rq->online = 0; 7886 rq->online = 0;
9613 rq->migration_thread = NULL; 7887 rq->migration_thread = NULL;
7888 rq->idle_stamp = 0;
7889 rq->avg_idle = 2*sysctl_sched_migration_cost;
9614 INIT_LIST_HEAD(&rq->migration_queue); 7890 INIT_LIST_HEAD(&rq->migration_queue);
9615 rq_attach_root(rq, &def_root_domain); 7891 rq_attach_root(rq, &def_root_domain);
9616#endif 7892#endif
@@ -9629,7 +7905,7 @@ void __init sched_init(void)
9629#endif 7905#endif
9630 7906
9631#ifdef CONFIG_RT_MUTEXES 7907#ifdef CONFIG_RT_MUTEXES
9632 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 7908 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
9633#endif 7909#endif
9634 7910
9635 /* 7911 /*
@@ -9660,7 +7936,9 @@ void __init sched_init(void)
9660 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7936 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9661 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7937 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9662#endif 7938#endif
9663 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7939 /* May be allocated at isolcpus cmdline parse time */
7940 if (cpu_isolated_map == NULL)
7941 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9664#endif /* SMP */ 7942#endif /* SMP */
9665 7943
9666 perf_event_init(); 7944 perf_event_init();
@@ -9671,12 +7949,12 @@ void __init sched_init(void)
9671#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7949#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9672static inline int preempt_count_equals(int preempt_offset) 7950static inline int preempt_count_equals(int preempt_offset)
9673{ 7951{
9674 int nested = preempt_count() & ~PREEMPT_ACTIVE; 7952 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
9675 7953
9676 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7954 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9677} 7955}
9678 7956
9679void __might_sleep(char *file, int line, int preempt_offset) 7957void __might_sleep(const char *file, int line, int preempt_offset)
9680{ 7958{
9681#ifdef in_atomic 7959#ifdef in_atomic
9682 static unsigned long prev_jiffy; /* ratelimiting */ 7960 static unsigned long prev_jiffy; /* ratelimiting */
@@ -9752,13 +8030,13 @@ void normalize_rt_tasks(void)
9752 continue; 8030 continue;
9753 } 8031 }
9754 8032
9755 spin_lock(&p->pi_lock); 8033 raw_spin_lock(&p->pi_lock);
9756 rq = __task_rq_lock(p); 8034 rq = __task_rq_lock(p);
9757 8035
9758 normalize_task(rq, p); 8036 normalize_task(rq, p);
9759 8037
9760 __task_rq_unlock(rq); 8038 __task_rq_unlock(rq);
9761 spin_unlock(&p->pi_lock); 8039 raw_spin_unlock(&p->pi_lock);
9762 } while_each_thread(g, p); 8040 } while_each_thread(g, p);
9763 8041
9764 read_unlock_irqrestore(&tasklist_lock, flags); 8042 read_unlock_irqrestore(&tasklist_lock, flags);
@@ -9854,13 +8132,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9854 se = kzalloc_node(sizeof(struct sched_entity), 8132 se = kzalloc_node(sizeof(struct sched_entity),
9855 GFP_KERNEL, cpu_to_node(i)); 8133 GFP_KERNEL, cpu_to_node(i));
9856 if (!se) 8134 if (!se)
9857 goto err; 8135 goto err_free_rq;
9858 8136
9859 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8137 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9860 } 8138 }
9861 8139
9862 return 1; 8140 return 1;
9863 8141
8142 err_free_rq:
8143 kfree(cfs_rq);
9864 err: 8144 err:
9865 return 0; 8145 return 0;
9866} 8146}
@@ -9942,13 +8222,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9942 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8222 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9943 GFP_KERNEL, cpu_to_node(i)); 8223 GFP_KERNEL, cpu_to_node(i));
9944 if (!rt_se) 8224 if (!rt_se)
9945 goto err; 8225 goto err_free_rq;
9946 8226
9947 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8227 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9948 } 8228 }
9949 8229
9950 return 1; 8230 return 1;
9951 8231
8232 err_free_rq:
8233 kfree(rt_rq);
9952 err: 8234 err:
9953 return 0; 8235 return 0;
9954} 8236}
@@ -9983,7 +8265,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9983} 8265}
9984#endif /* CONFIG_RT_GROUP_SCHED */ 8266#endif /* CONFIG_RT_GROUP_SCHED */
9985 8267
9986#ifdef CONFIG_GROUP_SCHED 8268#ifdef CONFIG_CGROUP_SCHED
9987static void free_sched_group(struct task_group *tg) 8269static void free_sched_group(struct task_group *tg)
9988{ 8270{
9989 free_fair_sched_group(tg); 8271 free_fair_sched_group(tg);
@@ -10082,17 +8364,17 @@ void sched_move_task(struct task_struct *tsk)
10082 8364
10083#ifdef CONFIG_FAIR_GROUP_SCHED 8365#ifdef CONFIG_FAIR_GROUP_SCHED
10084 if (tsk->sched_class->moved_group) 8366 if (tsk->sched_class->moved_group)
10085 tsk->sched_class->moved_group(tsk); 8367 tsk->sched_class->moved_group(tsk, on_rq);
10086#endif 8368#endif
10087 8369
10088 if (unlikely(running)) 8370 if (unlikely(running))
10089 tsk->sched_class->set_curr_task(rq); 8371 tsk->sched_class->set_curr_task(rq);
10090 if (on_rq) 8372 if (on_rq)
10091 enqueue_task(rq, tsk, 0); 8373 enqueue_task(rq, tsk, 0, false);
10092 8374
10093 task_rq_unlock(rq, &flags); 8375 task_rq_unlock(rq, &flags);
10094} 8376}
10095#endif /* CONFIG_GROUP_SCHED */ 8377#endif /* CONFIG_CGROUP_SCHED */
10096 8378
10097#ifdef CONFIG_FAIR_GROUP_SCHED 8379#ifdef CONFIG_FAIR_GROUP_SCHED
10098static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8380static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10117,9 +8399,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
10117 struct rq *rq = cfs_rq->rq; 8399 struct rq *rq = cfs_rq->rq;
10118 unsigned long flags; 8400 unsigned long flags;
10119 8401
10120 spin_lock_irqsave(&rq->lock, flags); 8402 raw_spin_lock_irqsave(&rq->lock, flags);
10121 __set_se_shares(se, shares); 8403 __set_se_shares(se, shares);
10122 spin_unlock_irqrestore(&rq->lock, flags); 8404 raw_spin_unlock_irqrestore(&rq->lock, flags);
10123} 8405}
10124 8406
10125static DEFINE_MUTEX(shares_mutex); 8407static DEFINE_MUTEX(shares_mutex);
@@ -10234,13 +8516,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10234 runtime = d->rt_runtime; 8516 runtime = d->rt_runtime;
10235 } 8517 }
10236 8518
10237#ifdef CONFIG_USER_SCHED
10238 if (tg == &root_task_group) {
10239 period = global_rt_period();
10240 runtime = global_rt_runtime();
10241 }
10242#endif
10243
10244 /* 8519 /*
10245 * Cannot have more runtime than the period. 8520 * Cannot have more runtime than the period.
10246 */ 8521 */
@@ -10304,18 +8579,18 @@ static int tg_set_bandwidth(struct task_group *tg,
10304 if (err) 8579 if (err)
10305 goto unlock; 8580 goto unlock;
10306 8581
10307 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8582 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10308 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8583 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10309 tg->rt_bandwidth.rt_runtime = rt_runtime; 8584 tg->rt_bandwidth.rt_runtime = rt_runtime;
10310 8585
10311 for_each_possible_cpu(i) { 8586 for_each_possible_cpu(i) {
10312 struct rt_rq *rt_rq = tg->rt_rq[i]; 8587 struct rt_rq *rt_rq = tg->rt_rq[i];
10313 8588
10314 spin_lock(&rt_rq->rt_runtime_lock); 8589 raw_spin_lock(&rt_rq->rt_runtime_lock);
10315 rt_rq->rt_runtime = rt_runtime; 8590 rt_rq->rt_runtime = rt_runtime;
10316 spin_unlock(&rt_rq->rt_runtime_lock); 8591 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10317 } 8592 }
10318 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8593 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10319 unlock: 8594 unlock:
10320 read_unlock(&tasklist_lock); 8595 read_unlock(&tasklist_lock);
10321 mutex_unlock(&rt_constraints_mutex); 8596 mutex_unlock(&rt_constraints_mutex);
@@ -10420,15 +8695,15 @@ static int sched_rt_global_constraints(void)
10420 if (sysctl_sched_rt_runtime == 0) 8695 if (sysctl_sched_rt_runtime == 0)
10421 return -EBUSY; 8696 return -EBUSY;
10422 8697
10423 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8698 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10424 for_each_possible_cpu(i) { 8699 for_each_possible_cpu(i) {
10425 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8700 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10426 8701
10427 spin_lock(&rt_rq->rt_runtime_lock); 8702 raw_spin_lock(&rt_rq->rt_runtime_lock);
10428 rt_rq->rt_runtime = global_rt_runtime(); 8703 rt_rq->rt_runtime = global_rt_runtime();
10429 spin_unlock(&rt_rq->rt_runtime_lock); 8704 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10430 } 8705 }
10431 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 8706 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10432 8707
10433 return 0; 8708 return 0;
10434} 8709}
@@ -10643,7 +8918,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10643struct cpuacct { 8918struct cpuacct {
10644 struct cgroup_subsys_state css; 8919 struct cgroup_subsys_state css;
10645 /* cpuusage holds pointer to a u64-type object on every cpu */ 8920 /* cpuusage holds pointer to a u64-type object on every cpu */
10646 u64 *cpuusage; 8921 u64 __percpu *cpuusage;
10647 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8922 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10648 struct cpuacct *parent; 8923 struct cpuacct *parent;
10649}; 8924};
@@ -10719,9 +8994,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10719 /* 8994 /*
10720 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 8995 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10721 */ 8996 */
10722 spin_lock_irq(&cpu_rq(cpu)->lock); 8997 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10723 data = *cpuusage; 8998 data = *cpuusage;
10724 spin_unlock_irq(&cpu_rq(cpu)->lock); 8999 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10725#else 9000#else
10726 data = *cpuusage; 9001 data = *cpuusage;
10727#endif 9002#endif
@@ -10737,9 +9012,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10737 /* 9012 /*
10738 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 9013 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10739 */ 9014 */
10740 spin_lock_irq(&cpu_rq(cpu)->lock); 9015 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10741 *cpuusage = val; 9016 *cpuusage = val;
10742 spin_unlock_irq(&cpu_rq(cpu)->lock); 9017 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10743#else 9018#else
10744 *cpuusage = val; 9019 *cpuusage = val;
10745#endif 9020#endif
@@ -10860,12 +9135,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10860} 9135}
10861 9136
10862/* 9137/*
9138 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9139 * in cputime_t units. As a result, cpuacct_update_stats calls
9140 * percpu_counter_add with values large enough to always overflow the
9141 * per cpu batch limit causing bad SMP scalability.
9142 *
9143 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9144 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9145 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9146 */
9147#ifdef CONFIG_SMP
9148#define CPUACCT_BATCH \
9149 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9150#else
9151#define CPUACCT_BATCH 0
9152#endif
9153
9154/*
10863 * Charge the system/user time to the task's accounting group. 9155 * Charge the system/user time to the task's accounting group.
10864 */ 9156 */
10865static void cpuacct_update_stats(struct task_struct *tsk, 9157static void cpuacct_update_stats(struct task_struct *tsk,
10866 enum cpuacct_stat_index idx, cputime_t val) 9158 enum cpuacct_stat_index idx, cputime_t val)
10867{ 9159{
10868 struct cpuacct *ca; 9160 struct cpuacct *ca;
9161 int batch = CPUACCT_BATCH;
10869 9162
10870 if (unlikely(!cpuacct_subsys.active)) 9163 if (unlikely(!cpuacct_subsys.active))
10871 return; 9164 return;
@@ -10874,7 +9167,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10874 ca = task_ca(tsk); 9167 ca = task_ca(tsk);
10875 9168
10876 do { 9169 do {
10877 percpu_counter_add(&ca->cpustat[idx], val); 9170 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10878 ca = ca->parent; 9171 ca = ca->parent;
10879 } while (ca); 9172 } while (ca);
10880 rcu_read_unlock(); 9173 rcu_read_unlock();
@@ -10973,9 +9266,9 @@ void synchronize_sched_expedited(void)
10973 init_completion(&req->done); 9266 init_completion(&req->done);
10974 req->task = NULL; 9267 req->task = NULL;
10975 req->dest_cpu = RCU_MIGRATION_NEED_QS; 9268 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10976 spin_lock_irqsave(&rq->lock, flags); 9269 raw_spin_lock_irqsave(&rq->lock, flags);
10977 list_add(&req->list, &rq->migration_queue); 9270 list_add(&req->list, &rq->migration_queue);
10978 spin_unlock_irqrestore(&rq->lock, flags); 9271 raw_spin_unlock_irqrestore(&rq->lock, flags);
10979 wake_up_process(rq->migration_thread); 9272 wake_up_process(rq->migration_thread);
10980 } 9273 }
10981 for_each_online_cpu(cpu) { 9274 for_each_online_cpu(cpu) {
@@ -10983,13 +9276,14 @@ void synchronize_sched_expedited(void)
10983 req = &per_cpu(rcu_migration_req, cpu); 9276 req = &per_cpu(rcu_migration_req, cpu);
10984 rq = cpu_rq(cpu); 9277 rq = cpu_rq(cpu);
10985 wait_for_completion(&req->done); 9278 wait_for_completion(&req->done);
10986 spin_lock_irqsave(&rq->lock, flags); 9279 raw_spin_lock_irqsave(&rq->lock, flags);
10987 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 9280 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10988 need_full_sync = 1; 9281 need_full_sync = 1;
10989 req->dest_cpu = RCU_MIGRATION_IDLE; 9282 req->dest_cpu = RCU_MIGRATION_IDLE;
10990 spin_unlock_irqrestore(&rq->lock, flags); 9283 raw_spin_unlock_irqrestore(&rq->lock, flags);
10991 } 9284 }
10992 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9285 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9286 synchronize_sched_expedited_count++;
10993 mutex_unlock(&rcu_sched_expedited_mutex); 9287 mutex_unlock(&rcu_sched_expedited_mutex);
10994 put_online_cpus(); 9288 put_online_cpus();
10995 if (need_full_sync) 9289 if (need_full_sync)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 479ce5682d7c..5b496132c28a 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
236} 236}
237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
238 238
239unsigned long long cpu_clock(int cpu)
240{
241 unsigned long long clock;
242 unsigned long flags;
243
244 local_irq_save(flags);
245 clock = sched_clock_cpu(cpu);
246 local_irq_restore(flags);
247
248 return clock;
249}
250
239#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 251#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
240 252
241void sched_clock_init(void) 253void sched_clock_init(void)
@@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu)
251 return sched_clock(); 263 return sched_clock();
252} 264}
253 265
254#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
255 266
256unsigned long long cpu_clock(int cpu) 267unsigned long long cpu_clock(int cpu)
257{ 268{
258 unsigned long long clock; 269 return sched_clock_cpu(cpu);
259 unsigned long flags; 270}
260 271
261 local_irq_save(flags); 272#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
262 clock = sched_clock_cpu(cpu);
263 local_irq_restore(flags);
264 273
265 return clock;
266}
267EXPORT_SYMBOL_GPL(cpu_clock); 274EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 0f052fc674d5..e6871cb3fc83 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -27,6 +27,7 @@
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
30#include <linux/gfp.h>
30#include "sched_cpupri.h" 31#include "sched_cpupri.h"
31 32
32/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -47,9 +48,7 @@ static int convert_prio(int prio)
47} 48}
48 49
49#define for_each_cpupri_active(array, idx) \ 50#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 52
54/** 53/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 54 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -58,7 +57,7 @@ static int convert_prio(int prio)
58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 57 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 58 *
60 * Note: This function returns the recommended CPUs as calculated during the 59 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in 60 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not 61 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct 62 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current 63 * any discrepancies created by racing against the uncertainty of the current
@@ -135,26 +134,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
135 if (likely(newpri != CPUPRI_INVALID)) { 134 if (likely(newpri != CPUPRI_INVALID)) {
136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
137 136
138 spin_lock_irqsave(&vec->lock, flags); 137 raw_spin_lock_irqsave(&vec->lock, flags);
139 138
140 cpumask_set_cpu(cpu, vec->mask); 139 cpumask_set_cpu(cpu, vec->mask);
141 vec->count++; 140 vec->count++;
142 if (vec->count == 1) 141 if (vec->count == 1)
143 set_bit(newpri, cp->pri_active); 142 set_bit(newpri, cp->pri_active);
144 143
145 spin_unlock_irqrestore(&vec->lock, flags); 144 raw_spin_unlock_irqrestore(&vec->lock, flags);
146 } 145 }
147 if (likely(oldpri != CPUPRI_INVALID)) { 146 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149 148
150 spin_lock_irqsave(&vec->lock, flags); 149 raw_spin_lock_irqsave(&vec->lock, flags);
151 150
152 vec->count--; 151 vec->count--;
153 if (!vec->count) 152 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active); 153 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask); 154 cpumask_clear_cpu(cpu, vec->mask);
156 155
157 spin_unlock_irqrestore(&vec->lock, flags); 156 raw_spin_unlock_irqrestore(&vec->lock, flags);
158 } 157 }
159 158
160 *currpri = newpri; 159 *currpri = newpri;
@@ -180,7 +179,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 179 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
181 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 180 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
182 181
183 spin_lock_init(&vec->lock); 182 raw_spin_lock_init(&vec->lock);
184 vec->count = 0; 183 vec->count = 0;
185 if (!zalloc_cpumask_var(&vec->mask, gfp)) 184 if (!zalloc_cpumask_var(&vec->mask, gfp))
186 goto cleanup; 185 goto cleanup;
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9a7e859b8fbf..7cb5bb6b95be 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -12,7 +12,7 @@
12/* values 2-101 are RT priorities 0-99 */ 12/* values 2-101 are RT priorities 0-99 */
13 13
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 raw_spinlock_t lock;
16 int count; 16 int count;
17 cpumask_var_t mask; 17 cpumask_var_t mask;
18}; 18};
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc43..19be00ba6123 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
@@ -184,7 +186,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 186 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
185 SPLIT_NS(cfs_rq->exec_clock)); 187 SPLIT_NS(cfs_rq->exec_clock));
186 188
187 spin_lock_irqsave(&rq->lock, flags); 189 raw_spin_lock_irqsave(&rq->lock, flags);
188 if (cfs_rq->rb_leftmost) 190 if (cfs_rq->rb_leftmost)
189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 191 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
190 last = __pick_last_entity(cfs_rq); 192 last = __pick_last_entity(cfs_rq);
@@ -192,7 +194,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
192 max_vruntime = last->vruntime; 194 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 195 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; 196 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 197 raw_spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 199 SPLIT_NS(MIN_vruntime));
198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", 200 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -285,12 +287,16 @@ static void print_cpu(struct seq_file *m, int cpu)
285 287
286#ifdef CONFIG_SCHEDSTATS 288#ifdef CONFIG_SCHEDSTATS
287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 289#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
290#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
288 291
289 P(yld_count); 292 P(yld_count);
290 293
291 P(sched_switch); 294 P(sched_switch);
292 P(sched_count); 295 P(sched_count);
293 P(sched_goidle); 296 P(sched_goidle);
297#ifdef CONFIG_SMP
298 P64(avg_idle);
299#endif
294 300
295 P(ttwu_count); 301 P(ttwu_count);
296 P(ttwu_local); 302 P(ttwu_local);
@@ -305,6 +311,12 @@ static void print_cpu(struct seq_file *m, int cpu)
305 print_rq(m, rq, cpu); 311 print_rq(m, rq, cpu);
306} 312}
307 313
314static const char *sched_tunable_scaling_names[] = {
315 "none",
316 "logaritmic",
317 "linear"
318};
319
308static int sched_debug_show(struct seq_file *m, void *v) 320static int sched_debug_show(struct seq_file *m, void *v)
309{ 321{
310 u64 now = ktime_to_ns(ktime_get()); 322 u64 now = ktime_to_ns(ktime_get());
@@ -330,6 +342,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
330#undef PN 342#undef PN
331#undef P 343#undef P
332 344
345 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
346 sysctl_sched_tunable_scaling,
347 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
348
333 for_each_online_cpu(cpu) 349 for_each_online_cpu(cpu)
334 print_cpu(m, cpu); 350 print_cpu(m, cpu);
335 351
@@ -395,7 +411,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 411 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 412 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 413 PN(se.avg_wakeup);
398 PN(se.avg_running);
399 414
400 nr_switches = p->nvcsw + p->nivcsw; 415 nr_switches = p->nvcsw + p->nivcsw;
401 416
@@ -419,7 +434,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
419 P(se.nr_failed_migrations_running); 434 P(se.nr_failed_migrations_running);
420 P(se.nr_failed_migrations_hot); 435 P(se.nr_failed_migrations_hot);
421 P(se.nr_forced_migrations); 436 P(se.nr_forced_migrations);
422 P(se.nr_forced2_migrations);
423 P(se.nr_wakeups); 437 P(se.nr_wakeups);
424 P(se.nr_wakeups_sync); 438 P(se.nr_wakeups_sync);
425 P(se.nr_wakeups_migrate); 439 P(se.nr_wakeups_migrate);
@@ -495,7 +509,6 @@ void proc_sched_set_task(struct task_struct *p)
495 p->se.nr_failed_migrations_running = 0; 509 p->se.nr_failed_migrations_running = 0;
496 p->se.nr_failed_migrations_hot = 0; 510 p->se.nr_failed_migrations_hot = 0;
497 p->se.nr_forced_migrations = 0; 511 p->se.nr_forced_migrations = 0;
498 p->se.nr_forced2_migrations = 0;
499 p->se.nr_wakeups = 0; 512 p->se.nr_wakeups = 0;
500 p->se.nr_wakeups_sync = 0; 513 p->se.nr_wakeups_sync = 0;
501 p->se.nr_wakeups_migrate = 0; 514 p->se.nr_wakeups_migrate = 0;
@@ -507,8 +520,4 @@ void proc_sched_set_task(struct task_struct *p)
507 p->se.nr_wakeups_idle = 0; 520 p->se.nr_wakeups_idle = 0;
508 p->sched_info.bkl_count = 0; 521 p->sched_info.bkl_count = 0;
509#endif 522#endif
510 p->se.sum_exec_runtime = 0;
511 p->se.prev_sum_exec_runtime = 0;
512 p->nvcsw = 0;
513 p->nivcsw = 0;
514} 523}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ef43ff95999d..b1af6d42c024 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h>
24 25
25/* 26/*
26 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
36 */ 37 */
37unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 5000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL;
40
41/*
42 * The initial- and re-scaling of tunables is configurable
43 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
44 *
45 * Options are:
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
48 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
49 */
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
38 52
39/* 53/*
40 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 56 */
43unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 1000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44 59
45/* 60/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70 * have immediate wakeup/sleep latencies. 85 * have immediate wakeup/sleep latencies.
71 */ 86 */
72unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73 89
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 91
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
383 */ 399 */
384 400
385#ifdef CONFIG_SCHED_DEBUG 401#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 402int sched_proc_update_handler(struct ctl_table *table, int write,
387 void __user *buffer, size_t *lenp, 403 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 404 loff_t *ppos)
389{ 405{
390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 406 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
407 int factor = get_update_sysctl_factor();
391 408
392 if (ret || !write) 409 if (ret || !write)
393 return ret; 410 return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 412 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396 sysctl_sched_min_granularity); 413 sysctl_sched_min_granularity);
397 414
415#define WRT_SYSCTL(name) \
416 (normalized_sysctl_##name = sysctl_##name / (factor))
417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL
422
398 return 0; 423 return 0;
399} 424}
400#endif 425#endif
@@ -485,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
485 curr->sum_exec_runtime += delta_exec; 510 curr->sum_exec_runtime += delta_exec;
486 schedstat_add(cfs_rq, exec_clock, delta_exec); 511 schedstat_add(cfs_rq, exec_clock, delta_exec);
487 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 512 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
513
488 curr->vruntime += delta_exec_weighted; 514 curr->vruntime += delta_exec_weighted;
489 update_min_vruntime(cfs_rq); 515 update_min_vruntime(cfs_rq);
490} 516}
@@ -740,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
740 se->vruntime = vruntime; 766 se->vruntime = vruntime;
741} 767}
742 768
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
743static void 772static void
744enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
745{ 774{
746 /* 775 /*
776 * Update the normalized vruntime before updating min_vruntime
777 * through calling update_curr().
778 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
780 se->vruntime += cfs_rq->min_vruntime;
781
782 /*
747 * Update run-time statistics of the 'current'. 783 * Update run-time statistics of the 'current'.
748 */ 784 */
749 update_curr(cfs_rq); 785 update_curr(cfs_rq);
750 account_entity_enqueue(cfs_rq, se); 786 account_entity_enqueue(cfs_rq, se);
751 787
752 if (wakeup) { 788 if (flags & ENQUEUE_WAKEUP) {
753 place_entity(cfs_rq, se, 0); 789 place_entity(cfs_rq, se, 0);
754 enqueue_sleeper(cfs_rq, se); 790 enqueue_sleeper(cfs_rq, se);
755 } 791 }
@@ -803,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
803 __dequeue_entity(cfs_rq, se); 839 __dequeue_entity(cfs_rq, se);
804 account_entity_dequeue(cfs_rq, se); 840 account_entity_dequeue(cfs_rq, se);
805 update_min_vruntime(cfs_rq); 841 update_min_vruntime(cfs_rq);
842
843 /*
844 * Normalize the entity after updating the min_vruntime because the
845 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position.
847 */
848 if (!sleep)
849 se->vruntime -= cfs_rq->min_vruntime;
806} 850}
807 851
808/* 852/*
@@ -1009,17 +1053,24 @@ static inline void hrtick_update(struct rq *rq)
1009 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1010 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1011 */ 1055 */
1012static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1013{ 1058{
1014 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1015 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1016 1067
1017 for_each_sched_entity(se) { 1068 for_each_sched_entity(se) {
1018 if (se->on_rq) 1069 if (se->on_rq)
1019 break; 1070 break;
1020 cfs_rq = cfs_rq_of(se); 1071 cfs_rq = cfs_rq_of(se);
1021 enqueue_entity(cfs_rq, se, wakeup); 1072 enqueue_entity(cfs_rq, se, flags);
1022 wakeup = 1; 1073 flags = ENQUEUE_WAKEUP;
1023 } 1074 }
1024 1075
1025 hrtick_update(rq); 1076 hrtick_update(rq);
@@ -1095,6 +1146,14 @@ static void yield_task_fair(struct rq *rq)
1095 1146
1096#ifdef CONFIG_SMP 1147#ifdef CONFIG_SMP
1097 1148
1149static void task_waking_fair(struct rq *rq, struct task_struct *p)
1150{
1151 struct sched_entity *se = &p->se;
1152 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1153
1154 se->vruntime -= cfs_rq->min_vruntime;
1155}
1156
1098#ifdef CONFIG_FAIR_GROUP_SCHED 1157#ifdef CONFIG_FAIR_GROUP_SCHED
1099/* 1158/*
1100 * effective_load() calculates the load change as seen from the root_task_group 1159 * effective_load() calculates the load change as seen from the root_task_group
@@ -1345,6 +1404,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1345} 1404}
1346 1405
1347/* 1406/*
1407 * Try and locate an idle CPU in the sched_domain.
1408 */
1409static int
1410select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1411{
1412 int cpu = smp_processor_id();
1413 int prev_cpu = task_cpu(p);
1414 int i;
1415
1416 /*
1417 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
1418 * test in select_task_rq_fair) and the prev_cpu is idle then that's
1419 * always a better target than the current cpu.
1420 */
1421 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
1422 return prev_cpu;
1423
1424 /*
1425 * Otherwise, iterate the domain and find an eligible idle cpu.
1426 */
1427 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1428 if (!cpu_rq(i)->cfs.nr_running) {
1429 target = i;
1430 break;
1431 }
1432 }
1433
1434 return target;
1435}
1436
1437/*
1348 * sched_balance_self: balance the current task (running on cpu) in domains 1438 * sched_balance_self: balance the current task (running on cpu) in domains
1349 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1439 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1350 * SD_BALANCE_EXEC. 1440 * SD_BALANCE_EXEC.
@@ -1372,8 +1462,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1372 new_cpu = prev_cpu; 1462 new_cpu = prev_cpu;
1373 } 1463 }
1374 1464
1375 rcu_read_lock();
1376 for_each_domain(cpu, tmp) { 1465 for_each_domain(cpu, tmp) {
1466 if (!(tmp->flags & SD_LOAD_BALANCE))
1467 continue;
1468
1377 /* 1469 /*
1378 * If power savings logic is enabled for a domain, see if we 1470 * If power savings logic is enabled for a domain, see if we
1379 * are not overloaded, if so, don't balance wider. 1471 * are not overloaded, if so, don't balance wider.
@@ -1398,11 +1490,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1398 want_sd = 0; 1490 want_sd = 0;
1399 } 1491 }
1400 1492
1401 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 1493 /*
1402 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 1494 * While iterating the domains looking for a spanning
1495 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
1496 * in cache sharing domains along the way.
1497 */
1498 if (want_affine) {
1499 int target = -1;
1500
1501 /*
1502 * If both cpu and prev_cpu are part of this domain,
1503 * cpu is a valid SD_WAKE_AFFINE target.
1504 */
1505 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1506 target = cpu;
1403 1507
1404 affine_sd = tmp; 1508 /*
1405 want_affine = 0; 1509 * If there's an idle sibling in this domain, make that
1510 * the wake_affine target instead of the current cpu.
1511 */
1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1513 target = select_idle_sibling(p, tmp, target);
1514
1515 if (target >= 0) {
1516 if (tmp->flags & SD_WAKE_AFFINE) {
1517 affine_sd = tmp;
1518 want_affine = 0;
1519 }
1520 cpu = target;
1521 }
1406 } 1522 }
1407 1523
1408 if (!want_sd && !want_affine) 1524 if (!want_sd && !want_affine)
@@ -1429,10 +1545,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1429 update_shares(tmp); 1545 update_shares(tmp);
1430 } 1546 }
1431 1547
1432 if (affine_sd && wake_affine(affine_sd, p, sync)) { 1548 if (affine_sd && wake_affine(affine_sd, p, sync))
1433 new_cpu = cpu; 1549 return cpu;
1434 goto out;
1435 }
1436 1550
1437 while (sd) { 1551 while (sd) {
1438 int load_idx = sd->forkexec_idx; 1552 int load_idx = sd->forkexec_idx;
@@ -1473,8 +1587,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1473 /* while loop will break here if sd == NULL */ 1587 /* while loop will break here if sd == NULL */
1474 } 1588 }
1475 1589
1476out:
1477 rcu_read_unlock();
1478 return new_cpu; 1590 return new_cpu;
1479} 1591}
1480#endif /* CONFIG_SMP */ 1592#endif /* CONFIG_SMP */
@@ -1596,12 +1708,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1596 int sync = wake_flags & WF_SYNC; 1708 int sync = wake_flags & WF_SYNC;
1597 int scale = cfs_rq->nr_running >= sched_nr_latency; 1709 int scale = cfs_rq->nr_running >= sched_nr_latency;
1598 1710
1599 update_curr(cfs_rq); 1711 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
1600 1712 goto preempt;
1601 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) {
1602 resched_task(curr);
1603 return;
1604 }
1605 1713
1606 if (unlikely(p->sched_class != &fair_sched_class)) 1714 if (unlikely(p->sched_class != &fair_sched_class))
1607 return; 1715 return;
@@ -1627,50 +1735,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1627 return; 1735 return;
1628 1736
1629 /* Idle tasks are by definition preempted by everybody. */ 1737 /* Idle tasks are by definition preempted by everybody. */
1630 if (unlikely(curr->policy == SCHED_IDLE)) { 1738 if (unlikely(curr->policy == SCHED_IDLE))
1631 resched_task(curr); 1739 goto preempt;
1632 return;
1633 }
1634 1740
1635 if ((sched_feat(WAKEUP_SYNC) && sync) || 1741 if (sched_feat(WAKEUP_SYNC) && sync)
1636 (sched_feat(WAKEUP_OVERLAP) && 1742 goto preempt;
1637 (se->avg_overlap < sysctl_sched_migration_cost &&
1638 pse->avg_overlap < sysctl_sched_migration_cost))) {
1639 resched_task(curr);
1640 return;
1641 }
1642 1743
1643 if (sched_feat(WAKEUP_RUNNING)) { 1744 if (sched_feat(WAKEUP_OVERLAP) &&
1644 if (pse->avg_running < se->avg_running) { 1745 se->avg_overlap < sysctl_sched_migration_cost &&
1645 set_next_buddy(pse); 1746 pse->avg_overlap < sysctl_sched_migration_cost)
1646 resched_task(curr); 1747 goto preempt;
1647 return;
1648 }
1649 }
1650 1748
1651 if (!sched_feat(WAKEUP_PREEMPT)) 1749 if (!sched_feat(WAKEUP_PREEMPT))
1652 return; 1750 return;
1653 1751
1752 update_curr(cfs_rq);
1654 find_matching_se(&se, &pse); 1753 find_matching_se(&se, &pse);
1655
1656 BUG_ON(!pse); 1754 BUG_ON(!pse);
1755 if (wakeup_preempt_entity(se, pse) == 1)
1756 goto preempt;
1657 1757
1658 if (wakeup_preempt_entity(se, pse) == 1) { 1758 return;
1659 resched_task(curr); 1759
1660 /* 1760preempt:
1661 * Only set the backward buddy when the current task is still 1761 resched_task(curr);
1662 * on the rq. This can happen when a wakeup gets interleaved 1762 /*
1663 * with schedule on the ->pre_schedule() or idle_balance() 1763 * Only set the backward buddy when the current task is still
1664 * point, either of which can drop the rq lock. 1764 * on the rq. This can happen when a wakeup gets interleaved
1665 * 1765 * with schedule on the ->pre_schedule() or idle_balance()
1666 * Also, during early boot the idle thread is in the fair class, 1766 * point, either of which can drop the rq lock.
1667 * for obvious reasons its a bad idea to schedule back to it. 1767 *
1668 */ 1768 * Also, during early boot the idle thread is in the fair class,
1669 if (unlikely(!se->on_rq || curr == rq->idle)) 1769 * for obvious reasons its a bad idea to schedule back to it.
1670 return; 1770 */
1671 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) 1771 if (unlikely(!se->on_rq || curr == rq->idle))
1672 set_last_buddy(se); 1772 return;
1673 } 1773
1774 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1775 set_last_buddy(se);
1674} 1776}
1675 1777
1676static struct task_struct *pick_next_task_fair(struct rq *rq) 1778static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1679,7 +1781,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1679 struct cfs_rq *cfs_rq = &rq->cfs; 1781 struct cfs_rq *cfs_rq = &rq->cfs;
1680 struct sched_entity *se; 1782 struct sched_entity *se;
1681 1783
1682 if (unlikely(!cfs_rq->nr_running)) 1784 if (!cfs_rq->nr_running)
1683 return NULL; 1785 return NULL;
1684 1786
1685 do { 1787 do {
@@ -1714,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1714 */ 1816 */
1715 1817
1716/* 1818/*
1717 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1718 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1719 * dequeued so the iterator has to be dequeue-safe. Here we
1720 * achieve that by always pre-iterating before returning
1721 * the current task:
1722 */ 1821 */
1723static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1724__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1725{ 1824{
1726 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1727 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1728 1830
1729 if (next == &cfs_rq->tasks) 1831/*
1730 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1731 1851
1732 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1733 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1734 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1735 1856
1736 return p; 1857 /*
1737} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1738 1862
1739static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1740{ 1864 if (!tsk_cache_hot ||
1741 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1742 1874
1743 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1744} 1880}
1745 1881
1746static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1747{ 1892{
1748 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1749 1896
1750 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1903
1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1751} 1916}
1752 1917
1753static unsigned long 1918static unsigned long
1754__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1755 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1756 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1757 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1758{ 1923{
1759 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1760 1927
1761 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1762 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1763 cfs_rq_iterator.arg = cfs_rq;
1764 1930
1765 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1766 max_load_move, sd, idle, all_pinned, 1932
1767 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1768} 1977}
1769 1978
1770#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1796,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1796 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1797 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1798 2007
1799 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1800 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1801 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1802 2011
1803 if (!moved_load) 2012 if (!moved_load)
1804 continue; 2013 continue;
@@ -1821,35 +2030,1529 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1821 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1822 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1823{ 2032{
1824 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1825 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1826 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1827} 2036}
1828#endif 2037#endif
1829 2038
1830static int 2039/*
1831move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1832 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1833{ 2050{
1834 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1835 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
1836 2053
1837 cfs_rq_iterator.start = load_balance_start_fair; 2054 do {
1838 cfs_rq_iterator.next = load_balance_next_fair; 2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1839 2058
1840 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
1841 /* 2062 /*
1842 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1843 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1844 */ 2066 */
1845 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1846 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1847 &cfs_rq_iterator)) 2069
1848 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
2117 unsigned long avg_load; /*Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
2137 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
1849 } 2155 }
1850 2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
2235 * Calculate the group which is almost near its
2236 * capacity but still has some space to pick up some load
2237 * from other group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
2260 * sched_domain, so that it's CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
2288static inline void update_sd_power_savings_stats(struct sched_group *group,
2289 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2290{
2291 return;
2292}
2293
2294static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2295 int this_cpu, unsigned long *imbalance)
2296{
1851 return 0; 2297 return 0;
1852} 2298}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
2458 * First idle cpu or the first cpu(busiest) in this sched group
2459 * is eligible for doing load balancing at this and above
2460 * domains. In the newly idle case, we will allow all the cpu's
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
2494 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
2653 * max load less than avg load(as we skip the groups at or below
2654 * its cpu_power, while calculating max_load..)
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
2692 * there is no gaurantee that any tasks will be moved so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
2737 * Compute the various statistics relevant for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
2748 * 4) This group is more busy than the avg busyness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
2833 * it just needs to be large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2841{
2842 if (idle == CPU_NEWLY_IDLE) {
2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
2845 * cpu in an attempt to completely free up the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
2849 * find_busiest_group(). If there is no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
2854 * there is no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
3172/**
3173 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3174 * @cpu: The cpu whose domains we're iterating over.
3175 * @sd: variable holding the value of the power_savings_sd
3176 * for cpu.
3177 * @flag: The flag to filter the sched_domains to be iterated.
3178 *
3179 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3180 * set, starting from the lowest sched_domain to the highest.
3181 */
3182#define for_each_flag_domain(cpu, sd, flag) \
3183 for (sd = lowest_flag_domain(cpu, flag); \
3184 (sd && (sd->flags & flag)); sd = sd->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
3192 * We define a sched_group to be semi idle if it has at least one idle-CPU
3193 * and at least one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
3202 * A sched_group is semi-idle when it has at least one busy cpu
3203 * and at least one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
3217 * Returns: The id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
3223 * when there are other idle cpu's which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask.
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
3348 return 0;
3349}
3350#endif
3351
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423 * When the cpu is attached to null domain, for example, it will not be
3424 * updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
3429
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
3536
3537static void rq_online_fair(struct rq *rq)
3538{
3539 update_sysctl();
3540}
3541
3542static void rq_offline_fair(struct rq *rq)
3543{
3544 update_sysctl();
3545}
3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1853#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1854 3557
1855/* 3558/*
@@ -1867,28 +3570,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1867} 3570}
1868 3571
1869/* 3572/*
1870 * Share the fairness runtime between parent and child, thus the 3573 * called on fork with the child task as argument from the parent's context
1871 * total amount of pressure for CPU stays equal - new tasks 3574 * - child not yet on the tasklist
1872 * get a chance to run but frequent forkers are not allowed to 3575 * - preemption disabled
1873 * monopolize the CPU. Note: the parent runqueue is locked,
1874 * the child is not running yet.
1875 */ 3576 */
1876static void task_new_fair(struct rq *rq, struct task_struct *p) 3577static void task_fork_fair(struct task_struct *p)
1877{ 3578{
1878 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3579 struct cfs_rq *cfs_rq = task_cfs_rq(current);
1879 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 3580 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1880 int this_cpu = smp_processor_id(); 3581 int this_cpu = smp_processor_id();
3582 struct rq *rq = this_rq();
3583 unsigned long flags;
1881 3584
1882 sched_info_queued(p); 3585 raw_spin_lock_irqsave(&rq->lock, flags);
3586
3587 if (unlikely(task_cpu(p) != this_cpu))
3588 __set_task_cpu(p, this_cpu);
1883 3589
1884 update_curr(cfs_rq); 3590 update_curr(cfs_rq);
3591
1885 if (curr) 3592 if (curr)
1886 se->vruntime = curr->vruntime; 3593 se->vruntime = curr->vruntime;
1887 place_entity(cfs_rq, se, 1); 3594 place_entity(cfs_rq, se, 1);
1888 3595
1889 /* 'curr' will be NULL if the child belongs to a different group */ 3596 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1890 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1891 curr && entity_before(curr, se)) {
1892 /* 3597 /*
1893 * Upon rescheduling, sched_class::put_prev_task() will place 3598 * Upon rescheduling, sched_class::put_prev_task() will place
1894 * 'current' within the tree based on its new key value. 3599 * 'current' within the tree based on its new key value.
@@ -1897,7 +3602,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1897 resched_task(rq->curr); 3602 resched_task(rq->curr);
1898 } 3603 }
1899 3604
1900 enqueue_task_fair(rq, p, 0); 3605 se->vruntime -= cfs_rq->min_vruntime;
3606
3607 raw_spin_unlock_irqrestore(&rq->lock, flags);
1901} 3608}
1902 3609
1903/* 3610/*
@@ -1950,30 +3657,27 @@ static void set_curr_task_fair(struct rq *rq)
1950} 3657}
1951 3658
1952#ifdef CONFIG_FAIR_GROUP_SCHED 3659#ifdef CONFIG_FAIR_GROUP_SCHED
1953static void moved_group_fair(struct task_struct *p) 3660static void moved_group_fair(struct task_struct *p, int on_rq)
1954{ 3661{
1955 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3662 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1956 3663
1957 update_curr(cfs_rq); 3664 update_curr(cfs_rq);
1958 place_entity(cfs_rq, &p->se, 1); 3665 if (!on_rq)
3666 place_entity(cfs_rq, &p->se, 1);
1959} 3667}
1960#endif 3668#endif
1961 3669
1962unsigned int get_rr_interval_fair(struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
1963{ 3671{
1964 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
1965 unsigned long flags;
1966 struct rq *rq;
1967 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
1968 3674
1969 /* 3675 /*
1970 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 3676 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1971 * idle runqueue: 3677 * idle runqueue:
1972 */ 3678 */
1973 rq = task_rq_lock(task, &flags);
1974 if (rq->cfs.load.weight) 3679 if (rq->cfs.load.weight)
1975 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 3680 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1976 task_rq_unlock(rq, &flags);
1977 3681
1978 return rr_interval; 3682 return rr_interval;
1979} 3683}
@@ -1995,13 +3699,15 @@ static const struct sched_class fair_sched_class = {
1995#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
1996 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
1997 3701
1998 .load_balance = load_balance_fair, 3702 .rq_online = rq_online_fair,
1999 .move_one_task = move_one_task_fair, 3703 .rq_offline = rq_offline_fair,
3704
3705 .task_waking = task_waking_fair,
2000#endif 3706#endif
2001 3707
2002 .set_curr_task = set_curr_task_fair, 3708 .set_curr_task = set_curr_task_fair,
2003 .task_tick = task_tick_fair, 3709 .task_tick = task_tick_fair,
2004 .task_new = task_new_fair, 3710 .task_fork = task_fork_fair,
2005 3711
2006 .prio_changed = prio_changed_fair, 3712 .prio_changed = prio_changed_fair,
2007 .switched_to = switched_to_fair, 3713 .switched_to = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c7..d5059fd761d9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
54SCHED_FEAT(WAKEUP_OVERLAP, 0) 54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55 55
56/* 56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate 57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and 58 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see 59 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde3..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -34,34 +34,16 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
36{ 36{
37 spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
39 dump_stack(); 39 dump_stack();
40 spin_lock_irq(&rq->lock); 40 raw_spin_lock_irq(&rq->lock);
41} 41}
42 42
43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f622880e918f..c2fbb02c1b54 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -327,7 +333,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
327 333
328 weight = cpumask_weight(rd->span); 334 weight = cpumask_weight(rd->span);
329 335
330 spin_lock(&rt_b->rt_runtime_lock); 336 raw_spin_lock(&rt_b->rt_runtime_lock);
331 rt_period = ktime_to_ns(rt_b->rt_period); 337 rt_period = ktime_to_ns(rt_b->rt_period);
332 for_each_cpu(i, rd->span) { 338 for_each_cpu(i, rd->span) {
333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 339 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
@@ -336,7 +342,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
336 if (iter == rt_rq) 342 if (iter == rt_rq)
337 continue; 343 continue;
338 344
339 spin_lock(&iter->rt_runtime_lock); 345 raw_spin_lock(&iter->rt_runtime_lock);
340 /* 346 /*
341 * Either all rqs have inf runtime and there's nothing to steal 347 * Either all rqs have inf runtime and there's nothing to steal
342 * or __disable_runtime() below sets a specific rq to inf to 348 * or __disable_runtime() below sets a specific rq to inf to
@@ -358,14 +364,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
358 rt_rq->rt_runtime += diff; 364 rt_rq->rt_runtime += diff;
359 more = 1; 365 more = 1;
360 if (rt_rq->rt_runtime == rt_period) { 366 if (rt_rq->rt_runtime == rt_period) {
361 spin_unlock(&iter->rt_runtime_lock); 367 raw_spin_unlock(&iter->rt_runtime_lock);
362 break; 368 break;
363 } 369 }
364 } 370 }
365next: 371next:
366 spin_unlock(&iter->rt_runtime_lock); 372 raw_spin_unlock(&iter->rt_runtime_lock);
367 } 373 }
368 spin_unlock(&rt_b->rt_runtime_lock); 374 raw_spin_unlock(&rt_b->rt_runtime_lock);
369 375
370 return more; 376 return more;
371} 377}
@@ -386,8 +392,8 @@ static void __disable_runtime(struct rq *rq)
386 s64 want; 392 s64 want;
387 int i; 393 int i;
388 394
389 spin_lock(&rt_b->rt_runtime_lock); 395 raw_spin_lock(&rt_b->rt_runtime_lock);
390 spin_lock(&rt_rq->rt_runtime_lock); 396 raw_spin_lock(&rt_rq->rt_runtime_lock);
391 /* 397 /*
392 * Either we're all inf and nobody needs to borrow, or we're 398 * Either we're all inf and nobody needs to borrow, or we're
393 * already disabled and thus have nothing to do, or we have 399 * already disabled and thus have nothing to do, or we have
@@ -396,7 +402,7 @@ static void __disable_runtime(struct rq *rq)
396 if (rt_rq->rt_runtime == RUNTIME_INF || 402 if (rt_rq->rt_runtime == RUNTIME_INF ||
397 rt_rq->rt_runtime == rt_b->rt_runtime) 403 rt_rq->rt_runtime == rt_b->rt_runtime)
398 goto balanced; 404 goto balanced;
399 spin_unlock(&rt_rq->rt_runtime_lock); 405 raw_spin_unlock(&rt_rq->rt_runtime_lock);
400 406
401 /* 407 /*
402 * Calculate the difference between what we started out with 408 * Calculate the difference between what we started out with
@@ -418,7 +424,7 @@ static void __disable_runtime(struct rq *rq)
418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 424 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
419 continue; 425 continue;
420 426
421 spin_lock(&iter->rt_runtime_lock); 427 raw_spin_lock(&iter->rt_runtime_lock);
422 if (want > 0) { 428 if (want > 0) {
423 diff = min_t(s64, iter->rt_runtime, want); 429 diff = min_t(s64, iter->rt_runtime, want);
424 iter->rt_runtime -= diff; 430 iter->rt_runtime -= diff;
@@ -427,13 +433,13 @@ static void __disable_runtime(struct rq *rq)
427 iter->rt_runtime -= want; 433 iter->rt_runtime -= want;
428 want -= want; 434 want -= want;
429 } 435 }
430 spin_unlock(&iter->rt_runtime_lock); 436 raw_spin_unlock(&iter->rt_runtime_lock);
431 437
432 if (!want) 438 if (!want)
433 break; 439 break;
434 } 440 }
435 441
436 spin_lock(&rt_rq->rt_runtime_lock); 442 raw_spin_lock(&rt_rq->rt_runtime_lock);
437 /* 443 /*
438 * We cannot be left wanting - that would mean some runtime 444 * We cannot be left wanting - that would mean some runtime
439 * leaked out of the system. 445 * leaked out of the system.
@@ -445,8 +451,8 @@ balanced:
445 * runtime - in which case borrowing doesn't make sense. 451 * runtime - in which case borrowing doesn't make sense.
446 */ 452 */
447 rt_rq->rt_runtime = RUNTIME_INF; 453 rt_rq->rt_runtime = RUNTIME_INF;
448 spin_unlock(&rt_rq->rt_runtime_lock); 454 raw_spin_unlock(&rt_rq->rt_runtime_lock);
449 spin_unlock(&rt_b->rt_runtime_lock); 455 raw_spin_unlock(&rt_b->rt_runtime_lock);
450 } 456 }
451} 457}
452 458
@@ -454,9 +460,9 @@ static void disable_runtime(struct rq *rq)
454{ 460{
455 unsigned long flags; 461 unsigned long flags;
456 462
457 spin_lock_irqsave(&rq->lock, flags); 463 raw_spin_lock_irqsave(&rq->lock, flags);
458 __disable_runtime(rq); 464 __disable_runtime(rq);
459 spin_unlock_irqrestore(&rq->lock, flags); 465 raw_spin_unlock_irqrestore(&rq->lock, flags);
460} 466}
461 467
462static void __enable_runtime(struct rq *rq) 468static void __enable_runtime(struct rq *rq)
@@ -472,13 +478,13 @@ static void __enable_runtime(struct rq *rq)
472 for_each_leaf_rt_rq(rt_rq, rq) { 478 for_each_leaf_rt_rq(rt_rq, rq) {
473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 479 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
474 480
475 spin_lock(&rt_b->rt_runtime_lock); 481 raw_spin_lock(&rt_b->rt_runtime_lock);
476 spin_lock(&rt_rq->rt_runtime_lock); 482 raw_spin_lock(&rt_rq->rt_runtime_lock);
477 rt_rq->rt_runtime = rt_b->rt_runtime; 483 rt_rq->rt_runtime = rt_b->rt_runtime;
478 rt_rq->rt_time = 0; 484 rt_rq->rt_time = 0;
479 rt_rq->rt_throttled = 0; 485 rt_rq->rt_throttled = 0;
480 spin_unlock(&rt_rq->rt_runtime_lock); 486 raw_spin_unlock(&rt_rq->rt_runtime_lock);
481 spin_unlock(&rt_b->rt_runtime_lock); 487 raw_spin_unlock(&rt_b->rt_runtime_lock);
482 } 488 }
483} 489}
484 490
@@ -486,9 +492,9 @@ static void enable_runtime(struct rq *rq)
486{ 492{
487 unsigned long flags; 493 unsigned long flags;
488 494
489 spin_lock_irqsave(&rq->lock, flags); 495 raw_spin_lock_irqsave(&rq->lock, flags);
490 __enable_runtime(rq); 496 __enable_runtime(rq);
491 spin_unlock_irqrestore(&rq->lock, flags); 497 raw_spin_unlock_irqrestore(&rq->lock, flags);
492} 498}
493 499
494static int balance_runtime(struct rt_rq *rt_rq) 500static int balance_runtime(struct rt_rq *rt_rq)
@@ -496,9 +502,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
496 int more = 0; 502 int more = 0;
497 503
498 if (rt_rq->rt_time > rt_rq->rt_runtime) { 504 if (rt_rq->rt_time > rt_rq->rt_runtime) {
499 spin_unlock(&rt_rq->rt_runtime_lock); 505 raw_spin_unlock(&rt_rq->rt_runtime_lock);
500 more = do_balance_runtime(rt_rq); 506 more = do_balance_runtime(rt_rq);
501 spin_lock(&rt_rq->rt_runtime_lock); 507 raw_spin_lock(&rt_rq->rt_runtime_lock);
502 } 508 }
503 509
504 return more; 510 return more;
@@ -524,11 +530,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 530 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
525 struct rq *rq = rq_of_rt_rq(rt_rq); 531 struct rq *rq = rq_of_rt_rq(rt_rq);
526 532
527 spin_lock(&rq->lock); 533 raw_spin_lock(&rq->lock);
528 if (rt_rq->rt_time) { 534 if (rt_rq->rt_time) {
529 u64 runtime; 535 u64 runtime;
530 536
531 spin_lock(&rt_rq->rt_runtime_lock); 537 raw_spin_lock(&rt_rq->rt_runtime_lock);
532 if (rt_rq->rt_throttled) 538 if (rt_rq->rt_throttled)
533 balance_runtime(rt_rq); 539 balance_runtime(rt_rq);
534 runtime = rt_rq->rt_runtime; 540 runtime = rt_rq->rt_runtime;
@@ -539,13 +545,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
539 } 545 }
540 if (rt_rq->rt_time || rt_rq->rt_nr_running) 546 if (rt_rq->rt_time || rt_rq->rt_nr_running)
541 idle = 0; 547 idle = 0;
542 spin_unlock(&rt_rq->rt_runtime_lock); 548 raw_spin_unlock(&rt_rq->rt_runtime_lock);
543 } else if (rt_rq->rt_nr_running) 549 } else if (rt_rq->rt_nr_running)
544 idle = 0; 550 idle = 0;
545 551
546 if (enqueue) 552 if (enqueue)
547 sched_rt_rq_enqueue(rt_rq); 553 sched_rt_rq_enqueue(rt_rq);
548 spin_unlock(&rq->lock); 554 raw_spin_unlock(&rq->lock);
549 } 555 }
550 556
551 return idle; 557 return idle;
@@ -624,11 +630,11 @@ static void update_curr_rt(struct rq *rq)
624 rt_rq = rt_rq_of_se(rt_se); 630 rt_rq = rt_rq_of_se(rt_se);
625 631
626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 632 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
627 spin_lock(&rt_rq->rt_runtime_lock); 633 raw_spin_lock(&rt_rq->rt_runtime_lock);
628 rt_rq->rt_time += delta_exec; 634 rt_rq->rt_time += delta_exec;
629 if (sched_rt_runtime_exceeded(rt_rq)) 635 if (sched_rt_runtime_exceeded(rt_rq))
630 resched_task(curr); 636 resched_task(curr);
631 spin_unlock(&rt_rq->rt_runtime_lock); 637 raw_spin_unlock(&rt_rq->rt_runtime_lock);
632 } 638 }
633 } 639 }
634} 640}
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1136,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1136 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
1137 continue; 1147 continue;
1138 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1148 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1139 struct task_struct *p = rt_task_of(rt_se); 1149 struct task_struct *p;
1150
1151 if (!rt_entity_is_task(rt_se))
1152 continue;
1153
1154 p = rt_task_of(rt_se);
1140 if (pick_rt_task(rq, p, cpu)) { 1155 if (pick_rt_task(rq, p, cpu)) {
1141 next = p; 1156 next = p;
1142 break; 1157 break;
@@ -1153,29 +1168,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1153 1168
1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1169static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1155 1170
1156static inline int pick_optimal_cpu(int this_cpu,
1157 const struct cpumask *mask)
1158{
1159 int first;
1160
1161 /* "this_cpu" is cheaper to preempt than a remote processor */
1162 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1163 return this_cpu;
1164
1165 first = cpumask_first(mask);
1166 if (first < nr_cpu_ids)
1167 return first;
1168
1169 return -1;
1170}
1171
1172static int find_lowest_rq(struct task_struct *task) 1171static int find_lowest_rq(struct task_struct *task)
1173{ 1172{
1174 struct sched_domain *sd; 1173 struct sched_domain *sd;
1175 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1174 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1176 int this_cpu = smp_processor_id(); 1175 int this_cpu = smp_processor_id();
1177 int cpu = task_cpu(task); 1176 int cpu = task_cpu(task);
1178 cpumask_var_t domain_mask;
1179 1177
1180 if (task->rt.nr_cpus_allowed == 1) 1178 if (task->rt.nr_cpus_allowed == 1)
1181 return -1; /* No other targets possible */ 1179 return -1; /* No other targets possible */
@@ -1198,28 +1196,26 @@ static int find_lowest_rq(struct task_struct *task)
1198 * Otherwise, we consult the sched_domains span maps to figure 1196 * Otherwise, we consult the sched_domains span maps to figure
1199 * out which cpu is logically closest to our hot cache data. 1197 * out which cpu is logically closest to our hot cache data.
1200 */ 1198 */
1201 if (this_cpu == cpu) 1199 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1202 this_cpu = -1; /* Skip this_cpu opt if the same */ 1200 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1203
1204 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1205 for_each_domain(cpu, sd) {
1206 if (sd->flags & SD_WAKE_AFFINE) {
1207 int best_cpu;
1208
1209 cpumask_and(domain_mask,
1210 sched_domain_span(sd),
1211 lowest_mask);
1212 1201
1213 best_cpu = pick_optimal_cpu(this_cpu, 1202 for_each_domain(cpu, sd) {
1214 domain_mask); 1203 if (sd->flags & SD_WAKE_AFFINE) {
1204 int best_cpu;
1215 1205
1216 if (best_cpu != -1) { 1206 /*
1217 free_cpumask_var(domain_mask); 1207 * "this_cpu" is cheaper to preempt than a
1218 return best_cpu; 1208 * remote processor.
1219 } 1209 */
1220 } 1210 if (this_cpu != -1 &&
1211 cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
1212 return this_cpu;
1213
1214 best_cpu = cpumask_first_and(lowest_mask,
1215 sched_domain_span(sd));
1216 if (best_cpu < nr_cpu_ids)
1217 return best_cpu;
1221 } 1218 }
1222 free_cpumask_var(domain_mask);
1223 } 1219 }
1224 1220
1225 /* 1221 /*
@@ -1227,7 +1223,13 @@ static int find_lowest_rq(struct task_struct *task)
1227 * just give the caller *something* to work with from the compatible 1223 * just give the caller *something* to work with from the compatible
1228 * locations. 1224 * locations.
1229 */ 1225 */
1230 return pick_optimal_cpu(this_cpu, lowest_mask); 1226 if (this_cpu != -1)
1227 return this_cpu;
1228
1229 cpu = cpumask_any(lowest_mask);
1230 if (cpu < nr_cpu_ids)
1231 return cpu;
1232 return -1;
1231} 1233}
1232 1234
1233/* Will lock the rq it finds */ 1235/* Will lock the rq it finds */
@@ -1259,7 +1261,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1259 task_running(rq, task) || 1261 task_running(rq, task) ||
1260 !task->se.on_rq)) { 1262 !task->se.on_rq)) {
1261 1263
1262 spin_unlock(&lowest_rq->lock); 1264 raw_spin_unlock(&lowest_rq->lock);
1263 lowest_rq = NULL; 1265 lowest_rq = NULL;
1264 break; 1266 break;
1265 } 1267 }
@@ -1485,7 +1487,7 @@ static void post_schedule_rt(struct rq *rq)
1485 * If we are not running and we are not going to reschedule soon, we should 1487 * If we are not running and we are not going to reschedule soon, we should
1486 * try to push tasks away now 1488 * try to push tasks away now
1487 */ 1489 */
1488static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1490static void task_woken_rt(struct rq *rq, struct task_struct *p)
1489{ 1491{
1490 if (!task_running(rq, p) && 1492 if (!task_running(rq, p) &&
1491 !test_tsk_need_resched(rq->curr) && 1493 !test_tsk_need_resched(rq->curr) &&
@@ -1494,24 +1496,6 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1494 push_rt_tasks(rq); 1496 push_rt_tasks(rq);
1495} 1497}
1496 1498
1497static unsigned long
1498load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1499 unsigned long max_load_move,
1500 struct sched_domain *sd, enum cpu_idle_type idle,
1501 int *all_pinned, int *this_best_prio)
1502{
1503 /* don't touch RT tasks */
1504 return 0;
1505}
1506
1507static int
1508move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1509 struct sched_domain *sd, enum cpu_idle_type idle)
1510{
1511 /* don't touch RT tasks */
1512 return 0;
1513}
1514
1515static void set_cpus_allowed_rt(struct task_struct *p, 1499static void set_cpus_allowed_rt(struct task_struct *p,
1516 const struct cpumask *new_mask) 1500 const struct cpumask *new_mask)
1517{ 1501{
@@ -1683,8 +1667,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1683 if (!p->signal) 1667 if (!p->signal)
1684 return; 1668 return;
1685 1669
1686 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1670 /* max may change after cur was read, this will be fixed next tick */
1687 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1671 soft = task_rlimit(p, RLIMIT_RTTIME);
1672 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1688 1673
1689 if (soft != RLIM_INFINITY) { 1674 if (soft != RLIM_INFINITY) {
1690 unsigned long next; 1675 unsigned long next;
@@ -1734,7 +1719,7 @@ static void set_curr_task_rt(struct rq *rq)
1734 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
1735} 1720}
1736 1721
1737unsigned int get_rr_interval_rt(struct task_struct *task) 1722static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1738{ 1723{
1739 /* 1724 /*
1740 * Time slice is 0 for SCHED_FIFO tasks 1725 * Time slice is 0 for SCHED_FIFO tasks
@@ -1759,14 +1744,12 @@ static const struct sched_class rt_sched_class = {
1759#ifdef CONFIG_SMP 1744#ifdef CONFIG_SMP
1760 .select_task_rq = select_task_rq_rt, 1745 .select_task_rq = select_task_rq_rt,
1761 1746
1762 .load_balance = load_balance_rt,
1763 .move_one_task = move_one_task_rt,
1764 .set_cpus_allowed = set_cpus_allowed_rt, 1747 .set_cpus_allowed = set_cpus_allowed_rt,
1765 .rq_online = rq_online_rt, 1748 .rq_online = rq_online_rt,
1766 .rq_offline = rq_offline_rt, 1749 .rq_offline = rq_offline_rt,
1767 .pre_schedule = pre_schedule_rt, 1750 .pre_schedule = pre_schedule_rt,
1768 .post_schedule = post_schedule_rt, 1751 .post_schedule = post_schedule_rt,
1769 .task_wake_up = task_wake_up_rt, 1752 .task_woken = task_woken_rt,
1770 .switched_from = switched_from_rt, 1753 .switched_from = switched_from_rt,
1771#endif 1754#endif
1772 1755
diff --git a/kernel/signal.c b/kernel/signal.c
index 6705320784fd..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,12 +22,14 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/ratelimit.h>
25#include <linux/tracehook.h> 26#include <linux/tracehook.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
27#include <linux/freezer.h> 28#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
30#include <trace/events/sched.h> 31#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h>
31 33
32#include <asm/param.h> 34#include <asm/param.h>
33#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -41,6 +43,8 @@
41 43
42static struct kmem_cache *sigqueue_cachep; 44static struct kmem_cache *sigqueue_cachep;
43 45
46int print_fatal_signals __read_mostly;
47
44static void __user *sig_handler(struct task_struct *t, int sig) 48static void __user *sig_handler(struct task_struct *t, int sig)
45{ 49{
46 return t->sighand->action[sig - 1].sa.sa_handler; 50 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -155,62 +159,98 @@ void recalc_sigpending(void)
155 159
156/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
157 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
158int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
159{ 167{
160 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
161 int sig = 0; 169 int sig = 0;
162 170
163 s = pending->signal.sig; 171 s = pending->signal.sig;
164 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
165 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
166 default: 187 default:
167 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
168 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
169 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
170 break; 191 continue;
171 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
172 break; 195 break;
173 196
174 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
175 sig = 1; 198 x = s[1] &~ m[1];
176 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
177 sig = _NSIG_BPW + 1;
178 else
179 break; 200 break;
180 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
181 break; 202 break;
182 203
183 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
184 sig = ffz(~x) + 1; 205 /* Nothing to do */
185 break; 206 break;
186 } 207 }
187 208
188 return sig; 209 return sig;
189} 210}
190 211
212static inline void print_dropped_signal(int sig)
213{
214 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
215
216 if (!print_fatal_signals)
217 return;
218
219 if (!__ratelimit(&ratelimit_state))
220 return;
221
222 printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
223 current->comm, current->pid, sig);
224}
225
191/* 226/*
192 * allocate a new signal queue record 227 * allocate a new signal queue record
193 * - this may be called without locks if and only if t == current, otherwise an 228 * - this may be called without locks if and only if t == current, otherwise an
194 * appopriate lock must be held to stop the target task from exiting 229 * appopriate lock must be held to stop the target task from exiting
195 */ 230 */
196static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 231static struct sigqueue *
197 int override_rlimit) 232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
198{ 233{
199 struct sigqueue *q = NULL; 234 struct sigqueue *q = NULL;
200 struct user_struct *user; 235 struct user_struct *user;
201 236
202 /* 237 /*
203 * We won't get problems with the target's UID changing under us 238 * Protect access to @t credentials. This can go away when all
204 * because changing it requires RCU be used, and if t != current, the 239 * callers hold rcu read lock.
205 * caller must be holding the RCU readlock (by way of a spinlock) and
206 * we use RCU protection here
207 */ 240 */
241 rcu_read_lock();
208 user = get_uid(__task_cred(t)->user); 242 user = get_uid(__task_cred(t)->user);
209 atomic_inc(&user->sigpending); 243 atomic_inc(&user->sigpending);
244 rcu_read_unlock();
245
210 if (override_rlimit || 246 if (override_rlimit ||
211 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
212 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
213 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
250 } else {
251 print_dropped_signal(sig);
252 }
253
214 if (unlikely(q == NULL)) { 254 if (unlikely(q == NULL)) {
215 atomic_dec(&user->sigpending); 255 atomic_dec(&user->sigpending);
216 free_uid(user); 256 free_uid(user);
@@ -400,7 +440,7 @@ still_pending:
400 */ 440 */
401 info->si_signo = sig; 441 info->si_signo = sig;
402 info->si_errno = 0; 442 info->si_errno = 0;
403 info->si_code = 0; 443 info->si_code = SI_USER;
404 info->si_pid = 0; 444 info->si_pid = 0;
405 info->si_uid = 0; 445 info->si_uid = 0;
406 } 446 }
@@ -584,6 +624,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
584 return 1; 624 return 1;
585} 625}
586 626
627static inline int is_si_special(const struct siginfo *info)
628{
629 return info <= SEND_SIG_FORCED;
630}
631
632static inline bool si_fromuser(const struct siginfo *info)
633{
634 return info == SEND_SIG_NOINFO ||
635 (!is_si_special(info) && SI_FROMUSER(info));
636}
637
587/* 638/*
588 * Bad permissions for sending the signal 639 * Bad permissions for sending the signal
589 * - the caller must hold at least the RCU read lock 640 * - the caller must hold at least the RCU read lock
@@ -598,7 +649,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
598 if (!valid_signal(sig)) 649 if (!valid_signal(sig))
599 return -EINVAL; 650 return -EINVAL;
600 651
601 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) 652 if (!si_fromuser(info))
602 return 0; 653 return 0;
603 654
604 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 655 error = audit_signal_info(sig, t); /* Let audit system see the signal */
@@ -834,7 +885,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
834 struct sigqueue *q; 885 struct sigqueue *q;
835 int override_rlimit; 886 int override_rlimit;
836 887
837 trace_sched_signal_send(sig, t); 888 trace_signal_generate(sig, info, t);
838 889
839 assert_spin_locked(&t->sighand->siglock); 890 assert_spin_locked(&t->sighand->siglock);
840 891
@@ -869,7 +920,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
869 else 920 else
870 override_rlimit = 0; 921 override_rlimit = 0;
871 922
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, 923 q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit); 924 override_rlimit);
874 if (q) { 925 if (q) {
875 list_add_tail(&q->list, &pending->list); 926 list_add_tail(&q->list, &pending->list);
@@ -896,12 +947,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
896 break; 947 break;
897 } 948 }
898 } else if (!is_si_special(info)) { 949 } else if (!is_si_special(info)) {
899 if (sig >= SIGRTMIN && info->si_code != SI_USER) 950 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
900 /* 951 /*
901 * Queue overflow, abort. We may abort if the signal was rt 952 * Queue overflow, abort. We may abort if the
902 * and sent by user using something other than kill(). 953 * signal was rt and sent by user using something
903 */ 954 * other than kill().
955 */
956 trace_signal_overflow_fail(sig, group, info);
904 return -EAGAIN; 957 return -EAGAIN;
958 } else {
959 /*
960 * This is a silent loss of information. We still
961 * send the signal, but the *info bits are lost.
962 */
963 trace_signal_lose_info(sig, group, info);
964 }
905 } 965 }
906 966
907out_set: 967out_set:
@@ -917,16 +977,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
917 int from_ancestor_ns = 0; 977 int from_ancestor_ns = 0;
918 978
919#ifdef CONFIG_PID_NS 979#ifdef CONFIG_PID_NS
920 if (!is_si_special(info) && SI_FROMUSER(info) && 980 from_ancestor_ns = si_fromuser(info) &&
921 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) 981 !task_pid_nr_ns(current, task_active_pid_ns(t));
922 from_ancestor_ns = 1;
923#endif 982#endif
924 983
925 return __send_signal(sig, info, t, group, from_ancestor_ns); 984 return __send_signal(sig, info, t, group, from_ancestor_ns);
926} 985}
927 986
928int print_fatal_signals;
929
930static void print_fatal_signal(struct pt_regs *regs, int signr) 987static void print_fatal_signal(struct pt_regs *regs, int signr)
931{ 988{
932 printk("%s/%d: potentially unexpected fatal signal %d.\n", 989 printk("%s/%d: potentially unexpected fatal signal %d.\n",
@@ -939,7 +996,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
939 for (i = 0; i < 16; i++) { 996 for (i = 0; i < 16; i++) {
940 unsigned char insn; 997 unsigned char insn;
941 998
942 __get_user(insn, (unsigned char *)(regs->ip + i)); 999 if (get_user(insn, (unsigned char *)(regs->ip + i)))
1000 break;
943 printk("%02x ", insn); 1001 printk("%02x ", insn);
944 } 1002 }
945 } 1003 }
@@ -1022,12 +1080,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1022 return ret; 1080 return ret;
1023} 1081}
1024 1082
1025void
1026force_sig_specific(int sig, struct task_struct *t)
1027{
1028 force_sig_info(sig, SEND_SIG_FORCED, t);
1029}
1030
1031/* 1083/*
1032 * Nuke all other threads in the group. 1084 * Nuke all other threads in the group.
1033 */ 1085 */
@@ -1145,19 +1197,19 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1145 int ret = -EINVAL; 1197 int ret = -EINVAL;
1146 struct task_struct *p; 1198 struct task_struct *p;
1147 const struct cred *pcred; 1199 const struct cred *pcred;
1200 unsigned long flags;
1148 1201
1149 if (!valid_signal(sig)) 1202 if (!valid_signal(sig))
1150 return ret; 1203 return ret;
1151 1204
1152 read_lock(&tasklist_lock); 1205 rcu_read_lock();
1153 p = pid_task(pid, PIDTYPE_PID); 1206 p = pid_task(pid, PIDTYPE_PID);
1154 if (!p) { 1207 if (!p) {
1155 ret = -ESRCH; 1208 ret = -ESRCH;
1156 goto out_unlock; 1209 goto out_unlock;
1157 } 1210 }
1158 pcred = __task_cred(p); 1211 pcred = __task_cred(p);
1159 if ((info == SEND_SIG_NOINFO || 1212 if (si_fromuser(info) &&
1160 (!is_si_special(info) && SI_FROMUSER(info))) &&
1161 euid != pcred->suid && euid != pcred->uid && 1213 euid != pcred->suid && euid != pcred->uid &&
1162 uid != pcred->suid && uid != pcred->uid) { 1214 uid != pcred->suid && uid != pcred->uid) {
1163 ret = -EPERM; 1215 ret = -EPERM;
@@ -1166,14 +1218,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1166 ret = security_task_kill(p, info, sig, secid); 1218 ret = security_task_kill(p, info, sig, secid);
1167 if (ret) 1219 if (ret)
1168 goto out_unlock; 1220 goto out_unlock;
1169 if (sig && p->sighand) { 1221
1170 unsigned long flags; 1222 if (sig) {
1171 spin_lock_irqsave(&p->sighand->siglock, flags); 1223 if (lock_task_sighand(p, &flags)) {
1172 ret = __send_signal(sig, info, p, 1, 0); 1224 ret = __send_signal(sig, info, p, 1, 0);
1173 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1225 unlock_task_sighand(p, &flags);
1226 } else
1227 ret = -ESRCH;
1174 } 1228 }
1175out_unlock: 1229out_unlock:
1176 read_unlock(&tasklist_lock); 1230 rcu_read_unlock();
1177 return ret; 1231 return ret;
1178} 1232}
1179EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1233EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
@@ -1293,19 +1347,19 @@ EXPORT_SYMBOL(kill_pid);
1293 * These functions support sending signals using preallocated sigqueue 1347 * These functions support sending signals using preallocated sigqueue
1294 * structures. This is needed "because realtime applications cannot 1348 * structures. This is needed "because realtime applications cannot
1295 * afford to lose notifications of asynchronous events, like timer 1349 * afford to lose notifications of asynchronous events, like timer
1296 * expirations or I/O completions". In the case of Posix Timers 1350 * expirations or I/O completions". In the case of Posix Timers
1297 * we allocate the sigqueue structure from the timer_create. If this 1351 * we allocate the sigqueue structure from the timer_create. If this
1298 * allocation fails we are able to report the failure to the application 1352 * allocation fails we are able to report the failure to the application
1299 * with an EAGAIN error. 1353 * with an EAGAIN error.
1300 */ 1354 */
1301
1302struct sigqueue *sigqueue_alloc(void) 1355struct sigqueue *sigqueue_alloc(void)
1303{ 1356{
1304 struct sigqueue *q; 1357 struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
1305 1358
1306 if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) 1359 if (q)
1307 q->flags |= SIGQUEUE_PREALLOC; 1360 q->flags |= SIGQUEUE_PREALLOC;
1308 return(q); 1361
1362 return q;
1309} 1363}
1310 1364
1311void sigqueue_free(struct sigqueue *q) 1365void sigqueue_free(struct sigqueue *q)
@@ -1807,11 +1861,6 @@ relock:
1807 1861
1808 for (;;) { 1862 for (;;) {
1809 struct k_sigaction *ka; 1863 struct k_sigaction *ka;
1810
1811 if (unlikely(signal->group_stop_count > 0) &&
1812 do_signal_stop(0))
1813 goto relock;
1814
1815 /* 1864 /*
1816 * Tracing can induce an artifical signal and choose sigaction. 1865 * Tracing can induce an artifical signal and choose sigaction.
1817 * The return value in @signr determines the default action, 1866 * The return value in @signr determines the default action,
@@ -1823,6 +1872,10 @@ relock:
1823 if (unlikely(signr != 0)) 1872 if (unlikely(signr != 0))
1824 ka = return_ka; 1873 ka = return_ka;
1825 else { 1874 else {
1875 if (unlikely(signal->group_stop_count > 0) &&
1876 do_signal_stop(0))
1877 goto relock;
1878
1826 signr = dequeue_signal(current, &current->blocked, 1879 signr = dequeue_signal(current, &current->blocked,
1827 info); 1880 info);
1828 1881
@@ -1839,6 +1892,9 @@ relock:
1839 ka = &sighand->action[signr-1]; 1892 ka = &sighand->action[signr-1];
1840 } 1893 }
1841 1894
1895 /* Trace actually delivered signals. */
1896 trace_signal_deliver(signr, info, ka);
1897
1842 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1898 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1843 continue; 1899 continue;
1844 if (ka->sa.sa_handler != SIG_DFL) { 1900 if (ka->sa.sa_handler != SIG_DFL) {
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 00889bd3c590..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -49,7 +49,6 @@ static const int slow_work_max_vslow = 99;
49 49
50ctl_table slow_work_sysctls[] = { 50ctl_table slow_work_sysctls[] = {
51 { 51 {
52 .ctl_name = CTL_UNNUMBERED,
53 .procname = "min-threads", 52 .procname = "min-threads",
54 .data = &slow_work_min_threads, 53 .data = &slow_work_min_threads,
55 .maxlen = sizeof(unsigned), 54 .maxlen = sizeof(unsigned),
@@ -59,7 +58,6 @@ ctl_table slow_work_sysctls[] = {
59 .extra2 = &slow_work_max_threads, 58 .extra2 = &slow_work_max_threads,
60 }, 59 },
61 { 60 {
62 .ctl_name = CTL_UNNUMBERED,
63 .procname = "max-threads", 61 .procname = "max-threads",
64 .data = &slow_work_max_threads, 62 .data = &slow_work_max_threads,
65 .maxlen = sizeof(unsigned), 63 .maxlen = sizeof(unsigned),
@@ -69,16 +67,15 @@ ctl_table slow_work_sysctls[] = {
69 .extra2 = (void *) &slow_work_max_max_threads, 67 .extra2 = (void *) &slow_work_max_max_threads,
70 }, 68 },
71 { 69 {
72 .ctl_name = CTL_UNNUMBERED,
73 .procname = "vslow-percentage", 70 .procname = "vslow-percentage",
74 .data = &vslow_work_proportion, 71 .data = &vslow_work_proportion,
75 .maxlen = sizeof(unsigned), 72 .maxlen = sizeof(unsigned),
76 .mode = 0644, 73 .mode = 0644,
77 .proc_handler = &proc_dointvec_minmax, 74 .proc_handler = proc_dointvec_minmax,
78 .extra1 = (void *) &slow_work_min_vslow, 75 .extra1 = (void *) &slow_work_min_vslow,
79 .extra2 = (void *) &slow_work_max_vslow, 76 .extra2 = (void *) &slow_work_max_vslow,
80 }, 77 },
81 { .ctl_name = 0 } 78 {}
82}; 79};
83#endif 80#endif
84 81
@@ -640,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
640 goto cancelled; 637 goto cancelled;
641 638
642 /* the timer holds a reference whilst it is pending */ 639 /* the timer holds a reference whilst it is pending */
643 ret = work->ops->get_ref(work); 640 ret = slow_work_get_ref(work);
644 if (ret < 0) 641 if (ret < 0)
645 goto cant_get_ref; 642 goto cant_get_ref;
646 643
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
43 */ 43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid) 44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{ 45{
46#ifdef CONFIG_SLOW_WORK_PROC 46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid; 47 slow_work_pids[id] = pid;
48#endif 48#endif
49} 49}
50 50
51static inline void slow_work_mark_time(struct slow_work *work) 51static inline void slow_work_mark_time(struct slow_work *work)
52{ 52{
53#ifdef CONFIG_SLOW_WORK_PROC 53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME; 54 work->mark = CURRENT_TIME;
55#endif 55#endif
56} 56}
57 57
58static inline void slow_work_begin_exec(int id, struct slow_work *work) 58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{ 59{
60#ifdef CONFIG_SLOW_WORK_PROC 60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work; 61 slow_work_execs[id] = work;
62#endif 62#endif
63} 63}
64 64
65static inline void slow_work_end_exec(int id, struct slow_work *work) 65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{ 66{
67#ifdef CONFIG_SLOW_WORK_PROC 67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock); 68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL; 69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock); 70 write_unlock(&slow_work_execs_lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index c9d1c7835c2f..3fc697336183 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -9,18 +9,17 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h>
12#include <linux/smp.h> 13#include <linux/smp.h>
13#include <linux/cpu.h> 14#include <linux/cpu.h>
14 15
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 16static struct {
18 struct list_head queue; 17 struct list_head queue;
19 spinlock_t lock; 18 raw_spinlock_t lock;
20} call_function __cacheline_aligned_in_smp = 19} call_function __cacheline_aligned_in_smp =
21 { 20 {
22 .queue = LIST_HEAD_INIT(call_function.queue), 21 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), 22 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
24 }; 23 };
25 24
26enum { 25enum {
@@ -33,12 +32,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 32 cpumask_var_t cpumask;
34}; 33};
35 34
35static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
36
36struct call_single_queue { 37struct call_single_queue {
37 struct list_head list; 38 struct list_head list;
38 spinlock_t lock; 39 raw_spinlock_t lock;
39}; 40};
40 41
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 42static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 43
43static int 44static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 45hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -80,7 +81,7 @@ static int __cpuinit init_call_single_data(void)
80 for_each_possible_cpu(i) { 81 for_each_possible_cpu(i) {
81 struct call_single_queue *q = &per_cpu(call_single_queue, i); 82 struct call_single_queue *q = &per_cpu(call_single_queue, i);
82 83
83 spin_lock_init(&q->lock); 84 raw_spin_lock_init(&q->lock);
84 INIT_LIST_HEAD(&q->list); 85 INIT_LIST_HEAD(&q->list);
85 } 86 }
86 87
@@ -141,10 +142,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
141 unsigned long flags; 142 unsigned long flags;
142 int ipi; 143 int ipi;
143 144
144 spin_lock_irqsave(&dst->lock, flags); 145 raw_spin_lock_irqsave(&dst->lock, flags);
145 ipi = list_empty(&dst->list); 146 ipi = list_empty(&dst->list);
146 list_add_tail(&data->list, &dst->list); 147 list_add_tail(&data->list, &dst->list);
147 spin_unlock_irqrestore(&dst->lock, flags); 148 raw_spin_unlock_irqrestore(&dst->lock, flags);
148 149
149 /* 150 /*
150 * The list addition should be visible before sending the IPI 151 * The list addition should be visible before sending the IPI
@@ -171,7 +172,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171void generic_smp_call_function_interrupt(void) 172void generic_smp_call_function_interrupt(void)
172{ 173{
173 struct call_function_data *data; 174 struct call_function_data *data;
174 int cpu = get_cpu(); 175 int cpu = smp_processor_id();
175 176
176 /* 177 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online. 178 * Shouldn't receive this interrupt on a cpu that is not yet online.
@@ -201,9 +202,9 @@ void generic_smp_call_function_interrupt(void)
201 refs = atomic_dec_return(&data->refs); 202 refs = atomic_dec_return(&data->refs);
202 WARN_ON(refs < 0); 203 WARN_ON(refs < 0);
203 if (!refs) { 204 if (!refs) {
204 spin_lock(&call_function.lock); 205 raw_spin_lock(&call_function.lock);
205 list_del_rcu(&data->csd.list); 206 list_del_rcu(&data->csd.list);
206 spin_unlock(&call_function.lock); 207 raw_spin_unlock(&call_function.lock);
207 } 208 }
208 209
209 if (refs) 210 if (refs)
@@ -212,7 +213,6 @@ void generic_smp_call_function_interrupt(void)
212 csd_unlock(&data->csd); 213 csd_unlock(&data->csd);
213 } 214 }
214 215
215 put_cpu();
216} 216}
217 217
218/* 218/*
@@ -230,9 +230,9 @@ void generic_smp_call_function_single_interrupt(void)
230 */ 230 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 231 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232 232
233 spin_lock(&q->lock); 233 raw_spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 234 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 235 raw_spin_unlock(&q->lock);
236 236
237 while (!list_empty(&list)) { 237 while (!list_empty(&list)) {
238 struct call_single_data *data; 238 struct call_single_data *data;
@@ -257,7 +257,7 @@ void generic_smp_call_function_single_interrupt(void)
257 } 257 }
258} 258}
259 259
260static DEFINE_PER_CPU(struct call_single_data, csd_data); 260static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
261 261
262/* 262/*
263 * smp_call_function_single - Run a function on a specific CPU 263 * smp_call_function_single - Run a function on a specific CPU
@@ -265,9 +265,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
265 * @info: An arbitrary pointer to pass to the function. 265 * @info: An arbitrary pointer to pass to the function.
266 * @wait: If true, wait until function has completed on other CPUs. 266 * @wait: If true, wait until function has completed on other CPUs.
267 * 267 *
268 * Returns 0 on success, else a negative status code. Note that @wait 268 * Returns 0 on success, else a negative status code.
269 * will be implicitly turned on in case of allocation failures, since
270 * we fall back to on-stack allocation.
271 */ 269 */
272int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
273 int wait) 271 int wait)
@@ -321,6 +319,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
321} 319}
322EXPORT_SYMBOL(smp_call_function_single); 320EXPORT_SYMBOL(smp_call_function_single);
323 321
322/*
323 * smp_call_function_any - Run a function on any of the given cpus
324 * @mask: The mask of cpus it can run on.
325 * @func: The function to run. This must be fast and non-blocking.
326 * @info: An arbitrary pointer to pass to the function.
327 * @wait: If true, wait until function has completed.
328 *
329 * Returns 0 on success, else a negative status code (if no cpus were online).
330 * Note that @wait will be implicitly turned on in case of allocation failures,
331 * since we fall back to on-stack allocation.
332 *
333 * Selection preference:
334 * 1) current cpu if in @mask
335 * 2) any cpu of current node if in @mask
336 * 3) any other online cpu in @mask
337 */
338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait)
340{
341 unsigned int cpu;
342 const struct cpumask *nodemask;
343 int ret;
344
345 /* Try for same CPU (cheapest) */
346 cpu = get_cpu();
347 if (cpumask_test_cpu(cpu, mask))
348 goto call;
349
350 /* Try for same node. */
351 nodemask = cpumask_of_node(cpu_to_node(cpu));
352 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
353 cpu = cpumask_next_and(cpu, nodemask, mask)) {
354 if (cpu_online(cpu))
355 goto call;
356 }
357
358 /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
359 cpu = cpumask_any_and(mask, cpu_online_mask);
360call:
361 ret = smp_call_function_single(cpu, func, info, wait);
362 put_cpu();
363 return ret;
364}
365EXPORT_SYMBOL_GPL(smp_call_function_any);
366
324/** 367/**
325 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on another CPU
326 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
@@ -355,9 +398,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
355 * @wait: If true, wait (atomically) until function has completed 398 * @wait: If true, wait (atomically) until function has completed
356 * on other CPUs. 399 * on other CPUs.
357 * 400 *
358 * If @wait is true, then returns once @func has returned. Note that @wait 401 * If @wait is true, then returns once @func has returned.
359 * will be implicitly turned on in case of allocation failures, since
360 * we fall back to on-stack allocation.
361 * 402 *
362 * You must not call this function with disabled interrupts or from a 403 * You must not call this function with disabled interrupts or from a
363 * hardware interrupt handler or from a bottom half handler. Preemption 404 * hardware interrupt handler or from a bottom half handler. Preemption
@@ -408,14 +449,14 @@ void smp_call_function_many(const struct cpumask *mask,
408 cpumask_clear_cpu(this_cpu, data->cpumask); 449 cpumask_clear_cpu(this_cpu, data->cpumask);
409 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 450 atomic_set(&data->refs, cpumask_weight(data->cpumask));
410 451
411 spin_lock_irqsave(&call_function.lock, flags); 452 raw_spin_lock_irqsave(&call_function.lock, flags);
412 /* 453 /*
413 * Place entry at the _HEAD_ of the list, so that any cpu still 454 * Place entry at the _HEAD_ of the list, so that any cpu still
414 * observing the entry in generic_smp_call_function_interrupt() 455 * observing the entry in generic_smp_call_function_interrupt()
415 * will not miss any other list entries: 456 * will not miss any other list entries:
416 */ 457 */
417 list_add_rcu(&data->csd.list, &call_function.queue); 458 list_add_rcu(&data->csd.list, &call_function.queue);
418 spin_unlock_irqrestore(&call_function.lock, flags); 459 raw_spin_unlock_irqrestore(&call_function.lock, flags);
419 460
420 /* 461 /*
421 * Make the list addition visible before sending the ipi. 462 * Make the list addition visible before sending the ipi.
@@ -443,8 +484,7 @@ EXPORT_SYMBOL(smp_call_function_many);
443 * Returns 0. 484 * Returns 0.
444 * 485 *
445 * If @wait is true, then returns once @func has returned; otherwise 486 * If @wait is true, then returns once @func has returned; otherwise
446 * it returns just before the target cpu calls @func. In case of allocation 487 * it returns just before the target cpu calls @func.
447 * failure, @wait will be implicitly turned on.
448 * 488 *
449 * You must not call this function with disabled interrupts or from a 489 * You must not call this function with disabled interrupts or from a
450 * hardware interrupt handler or from a bottom half handler. 490 * hardware interrupt handler or from a bottom half handler.
@@ -461,20 +501,20 @@ EXPORT_SYMBOL(smp_call_function);
461 501
462void ipi_call_lock(void) 502void ipi_call_lock(void)
463{ 503{
464 spin_lock(&call_function.lock); 504 raw_spin_lock(&call_function.lock);
465} 505}
466 506
467void ipi_call_unlock(void) 507void ipi_call_unlock(void)
468{ 508{
469 spin_unlock(&call_function.lock); 509 raw_spin_unlock(&call_function.lock);
470} 510}
471 511
472void ipi_call_lock_irq(void) 512void ipi_call_lock_irq(void)
473{ 513{
474 spin_lock_irq(&call_function.lock); 514 raw_spin_lock_irq(&call_function.lock);
475} 515}
476 516
477void ipi_call_unlock_irq(void) 517void ipi_call_unlock_irq(void)
478{ 518{
479 spin_unlock_irq(&call_function.lock); 519 raw_spin_unlock_irq(&call_function.lock);
480} 520}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5216e0..7c1a67ef0274 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -302,9 +302,9 @@ void irq_exit(void)
302 if (!in_interrupt() && local_softirq_pending()) 302 if (!in_interrupt() && local_softirq_pending())
303 invoke_softirq(); 303 invoke_softirq();
304 304
305 rcu_irq_exit();
305#ifdef CONFIG_NO_HZ 306#ifdef CONFIG_NO_HZ
306 /* Make sure that timer wheel updates are propagated */ 307 /* Make sure that timer wheel updates are propagated */
307 rcu_irq_exit();
308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
309 tick_nohz_stop_sched_tick(0); 309 tick_nohz_stop_sched_tick(0);
310#endif 310#endif
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
@@ -697,7 +692,7 @@ void __init softirq_init(void)
697 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 692 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
698} 693}
699 694
700static int ksoftirqd(void * __bind_cpu) 695static int run_ksoftirqd(void * __bind_cpu)
701{ 696{
702 set_current_state(TASK_INTERRUPTIBLE); 697 set_current_state(TASK_INTERRUPTIBLE);
703 698
@@ -810,7 +805,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
810 switch (action) { 805 switch (action) {
811 case CPU_UP_PREPARE: 806 case CPU_UP_PREPARE:
812 case CPU_UP_PREPARE_FROZEN: 807 case CPU_UP_PREPARE_FROZEN:
813 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
814 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
815 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
816 return NOTIFY_BAD; 811 return NOTIFY_BAD;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 81324d12eb35..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -22,9 +22,10 @@
22 22
23static DEFINE_SPINLOCK(print_lock); 23static DEFINE_SPINLOCK(print_lock);
24 24
25static DEFINE_PER_CPU(unsigned long, touch_timestamp); 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, print_timestamp); 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -70,22 +71,28 @@ static void __touch_softlockup_watchdog(void)
70{ 71{
71 int this_cpu = raw_smp_processor_id(); 72 int this_cpu = raw_smp_processor_id();
72 73
73 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); 74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
74} 75}
75 76
76void touch_softlockup_watchdog(void) 77void touch_softlockup_watchdog(void)
77{ 78{
78 __raw_get_cpu_var(touch_timestamp) = 0; 79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
85 92
86 /* Cause each CPU to re-update its timestamp rather than complain */ 93 /* Cause each CPU to re-update its timestamp rather than complain */
87 for_each_online_cpu(cpu) 94 for_each_online_cpu(cpu)
88 per_cpu(touch_timestamp, cpu) = 0; 95 per_cpu(softlockup_touch_ts, cpu) = 0;
89} 96}
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 98
@@ -104,28 +111,36 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
104void softlockup_tick(void) 111void softlockup_tick(void)
105{ 112{
106 int this_cpu = smp_processor_id(); 113 int this_cpu = smp_processor_id();
107 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
108 unsigned long print_timestamp; 115 unsigned long print_ts;
109 struct pt_regs *regs = get_irq_regs(); 116 struct pt_regs *regs = get_irq_regs();
110 unsigned long now; 117 unsigned long now;
111 118
112 /* Is detection switched off? */ 119 /* Is detection switched off? */
113 if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { 120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
114 /* Be sure we don't false trigger if switched back on */ 121 /* Be sure we don't false trigger if switched back on */
115 if (touch_timestamp) 122 if (touch_ts)
116 per_cpu(touch_timestamp, this_cpu) = 0; 123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
117 return; 124 return;
118 } 125 }
119 126
120 if (touch_timestamp == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
124 139
125 print_timestamp = per_cpu(print_timestamp, this_cpu); 140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
126 141
127 /* report at most once a second */ 142 /* report at most once a second */
128 if (print_timestamp == touch_timestamp || did_panic) 143 if (print_ts == touch_ts || did_panic)
129 return; 144 return;
130 145
131 /* do not print during early bootup: */ 146 /* do not print during early bootup: */
@@ -140,18 +155,18 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 156 * threshold timespan.
142 */ 157 */
143 if (now > touch_timestamp + softlockup_thresh/2) 158 if (time_after(now - softlockup_thresh/2, touch_ts))
144 wake_up_process(per_cpu(watchdog_task, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 160
146 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
147 if (now <= (touch_timestamp + softlockup_thresh)) 162 if (time_before_eq(now - softlockup_thresh, touch_ts))
148 return; 163 return;
149 164
150 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
151 166
152 spin_lock(&print_lock); 167 spin_lock(&print_lock);
153 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
154 this_cpu, now - touch_timestamp, 169 this_cpu, now - touch_ts,
155 current->comm, task_pid_nr(current)); 170 current->comm, task_pid_nr(current));
156 print_modules(); 171 print_modules();
157 print_irqtrace_events(current); 172 print_irqtrace_events(current);
@@ -209,32 +224,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
209 switch (action) { 224 switch (action) {
210 case CPU_UP_PREPARE: 225 case CPU_UP_PREPARE:
211 case CPU_UP_PREPARE_FROZEN: 226 case CPU_UP_PREPARE_FROZEN:
212 BUG_ON(per_cpu(watchdog_task, hotcpu)); 227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
213 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
214 if (IS_ERR(p)) { 229 if (IS_ERR(p)) {
215 printk(KERN_ERR "watchdog for %i failed\n", hotcpu); 230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
216 return NOTIFY_BAD; 231 return NOTIFY_BAD;
217 } 232 }
218 per_cpu(touch_timestamp, hotcpu) = 0; 233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
219 per_cpu(watchdog_task, hotcpu) = p; 234 per_cpu(softlockup_watchdog, hotcpu) = p;
220 kthread_bind(p, hotcpu); 235 kthread_bind(p, hotcpu);
221 break; 236 break;
222 case CPU_ONLINE: 237 case CPU_ONLINE:
223 case CPU_ONLINE_FROZEN: 238 case CPU_ONLINE_FROZEN:
224 wake_up_process(per_cpu(watchdog_task, hotcpu)); 239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
225 break; 240 break;
226#ifdef CONFIG_HOTPLUG_CPU 241#ifdef CONFIG_HOTPLUG_CPU
227 case CPU_UP_CANCELED: 242 case CPU_UP_CANCELED:
228 case CPU_UP_CANCELED_FROZEN: 243 case CPU_UP_CANCELED_FROZEN:
229 if (!per_cpu(watchdog_task, hotcpu)) 244 if (!per_cpu(softlockup_watchdog, hotcpu))
230 break; 245 break;
231 /* Unbind so it can run. Fall thru. */ 246 /* Unbind so it can run. Fall thru. */
232 kthread_bind(per_cpu(watchdog_task, hotcpu), 247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
233 cpumask_any(cpu_online_mask)); 248 cpumask_any(cpu_online_mask));
234 case CPU_DEAD: 249 case CPU_DEAD:
235 case CPU_DEAD_FROZEN: 250 case CPU_DEAD_FROZEN:
236 p = per_cpu(watchdog_task, hotcpu); 251 p = per_cpu(softlockup_watchdog, hotcpu);
237 per_cpu(watchdog_task, hotcpu) = NULL; 252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
238 kthread_stop(p); 253 kthread_stop(p);
239 break; 254 break;
240#endif /* CONFIG_HOTPLUG_CPU */ 255#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2f..be6517fb9c14 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,193 +21,72 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
25int __lockfunc _spin_trylock(spinlock_t *lock)
26{
27 return __spin_trylock(lock);
28}
29EXPORT_SYMBOL(_spin_trylock);
30#endif
31
32#ifndef _read_trylock
33int __lockfunc _read_trylock(rwlock_t *lock)
34{
35 return __read_trylock(lock);
36}
37EXPORT_SYMBOL(_read_trylock);
38#endif
39
40#ifndef _write_trylock
41int __lockfunc _write_trylock(rwlock_t *lock)
42{
43 return __write_trylock(lock);
44}
45EXPORT_SYMBOL(_write_trylock);
46#endif
47
48/* 24/*
49 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
50 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 26 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
51 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 27 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
52 */ 28 */
53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 29#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
54
55#ifndef _read_lock
56void __lockfunc _read_lock(rwlock_t *lock)
57{
58 __read_lock(lock);
59}
60EXPORT_SYMBOL(_read_lock);
61#endif
62
63#ifndef _spin_lock_irqsave
64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
65{
66 return __spin_lock_irqsave(lock);
67}
68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
70
71#ifndef _spin_lock_irq
72void __lockfunc _spin_lock_irq(spinlock_t *lock)
73{
74 __spin_lock_irq(lock);
75}
76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
78
79#ifndef _spin_lock_bh
80void __lockfunc _spin_lock_bh(spinlock_t *lock)
81{
82 __spin_lock_bh(lock);
83}
84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
86
87#ifndef _read_lock_irqsave
88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
89{
90 return __read_lock_irqsave(lock);
91}
92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
94
95#ifndef _read_lock_irq
96void __lockfunc _read_lock_irq(rwlock_t *lock)
97{
98 __read_lock_irq(lock);
99}
100EXPORT_SYMBOL(_read_lock_irq);
101#endif
102
103#ifndef _read_lock_bh
104void __lockfunc _read_lock_bh(rwlock_t *lock)
105{
106 __read_lock_bh(lock);
107}
108EXPORT_SYMBOL(_read_lock_bh);
109#endif
110
111#ifndef _write_lock_irqsave
112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
113{
114 return __write_lock_irqsave(lock);
115}
116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
118
119#ifndef _write_lock_irq
120void __lockfunc _write_lock_irq(rwlock_t *lock)
121{
122 __write_lock_irq(lock);
123}
124EXPORT_SYMBOL(_write_lock_irq);
125#endif
126
127#ifndef _write_lock_bh
128void __lockfunc _write_lock_bh(rwlock_t *lock)
129{
130 __write_lock_bh(lock);
131}
132EXPORT_SYMBOL(_write_lock_bh);
133#endif
134
135#ifndef _spin_lock
136void __lockfunc _spin_lock(spinlock_t *lock)
137{
138 __spin_lock(lock);
139}
140EXPORT_SYMBOL(_spin_lock);
141#endif
142
143#ifndef _write_lock
144void __lockfunc _write_lock(rwlock_t *lock)
145{
146 __write_lock(lock);
147}
148EXPORT_SYMBOL(_write_lock);
149#endif
150
151#else /* CONFIG_PREEMPT: */
152
153/* 30/*
31 * The __lock_function inlines are taken from
32 * include/linux/spinlock_api_smp.h
33 */
34#else
35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l)
37/*
38 * We build the __lock_function inlines here. They are too large for
39 * inlining all over the place, but here is only one user per function
40 * which embedds them into the calling _lock_function below.
41 *
154 * This could be a long-held lock. We both prepare to spin for a long 42 * This could be a long-held lock. We both prepare to spin for a long
155 * time (making _this_ CPU preemptable if possible), and we also signal 43 * time (making _this_ CPU preemptable if possible), and we also signal
156 * towards that other CPU that it should break the lock ASAP. 44 * towards that other CPU that it should break the lock ASAP.
157 *
158 * (We do this in a function because inlining it would be excessive.)
159 */ 45 */
160
161#define BUILD_LOCK_OPS(op, locktype) \ 46#define BUILD_LOCK_OPS(op, locktype) \
162void __lockfunc _##op##_lock(locktype##_t *lock) \ 47void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
163{ \ 48{ \
164 for (;;) { \ 49 for (;;) { \
165 preempt_disable(); \ 50 preempt_disable(); \
166 if (likely(_raw_##op##_trylock(lock))) \ 51 if (likely(do_raw_##op##_trylock(lock))) \
167 break; \ 52 break; \
168 preempt_enable(); \ 53 preempt_enable(); \
169 \ 54 \
170 if (!(lock)->break_lock) \ 55 if (!(lock)->break_lock) \
171 (lock)->break_lock = 1; \ 56 (lock)->break_lock = 1; \
172 while (!op##_can_lock(lock) && (lock)->break_lock) \ 57 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
173 _raw_##op##_relax(&lock->raw_lock); \ 58 arch_##op##_relax(&lock->raw_lock); \
174 } \ 59 } \
175 (lock)->break_lock = 0; \ 60 (lock)->break_lock = 0; \
176} \ 61} \
177 \ 62 \
178EXPORT_SYMBOL(_##op##_lock); \ 63unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
179 \
180unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
181{ \ 64{ \
182 unsigned long flags; \ 65 unsigned long flags; \
183 \ 66 \
184 for (;;) { \ 67 for (;;) { \
185 preempt_disable(); \ 68 preempt_disable(); \
186 local_irq_save(flags); \ 69 local_irq_save(flags); \
187 if (likely(_raw_##op##_trylock(lock))) \ 70 if (likely(do_raw_##op##_trylock(lock))) \
188 break; \ 71 break; \
189 local_irq_restore(flags); \ 72 local_irq_restore(flags); \
190 preempt_enable(); \ 73 preempt_enable(); \
191 \ 74 \
192 if (!(lock)->break_lock) \ 75 if (!(lock)->break_lock) \
193 (lock)->break_lock = 1; \ 76 (lock)->break_lock = 1; \
194 while (!op##_can_lock(lock) && (lock)->break_lock) \ 77 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
195 _raw_##op##_relax(&lock->raw_lock); \ 78 arch_##op##_relax(&lock->raw_lock); \
196 } \ 79 } \
197 (lock)->break_lock = 0; \ 80 (lock)->break_lock = 0; \
198 return flags; \ 81 return flags; \
199} \ 82} \
200 \ 83 \
201EXPORT_SYMBOL(_##op##_lock_irqsave); \ 84void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
202 \
203void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
204{ \ 85{ \
205 _##op##_lock_irqsave(lock); \ 86 _raw_##op##_lock_irqsave(lock); \
206} \ 87} \
207 \ 88 \
208EXPORT_SYMBOL(_##op##_lock_irq); \ 89void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
209 \
210void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
211{ \ 90{ \
212 unsigned long flags; \ 91 unsigned long flags; \
213 \ 92 \
@@ -216,164 +95,283 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
216 /* irq-disabling. We use the generic preemption-aware */ \ 95 /* irq-disabling. We use the generic preemption-aware */ \
217 /* function: */ \ 96 /* function: */ \
218 /**/ \ 97 /**/ \
219 flags = _##op##_lock_irqsave(lock); \ 98 flags = _raw_##op##_lock_irqsave(lock); \
220 local_bh_disable(); \ 99 local_bh_disable(); \
221 local_irq_restore(flags); \ 100 local_irq_restore(flags); \
222} \ 101} \
223 \
224EXPORT_SYMBOL(_##op##_lock_bh)
225 102
226/* 103/*
227 * Build preemption-friendly versions of the following 104 * Build preemption-friendly versions of the following
228 * lock-spinning functions: 105 * lock-spinning functions:
229 * 106 *
230 * _[spin|read|write]_lock() 107 * __[spin|read|write]_lock()
231 * _[spin|read|write]_lock_irq() 108 * __[spin|read|write]_lock_irq()
232 * _[spin|read|write]_lock_irqsave() 109 * __[spin|read|write]_lock_irqsave()
233 * _[spin|read|write]_lock_bh() 110 * __[spin|read|write]_lock_bh()
234 */ 111 */
235BUILD_LOCK_OPS(spin, spinlock); 112BUILD_LOCK_OPS(spin, raw_spinlock);
236BUILD_LOCK_OPS(read, rwlock); 113BUILD_LOCK_OPS(read, rwlock);
237BUILD_LOCK_OPS(write, rwlock); 114BUILD_LOCK_OPS(write, rwlock);
238 115
239#endif /* CONFIG_PREEMPT */ 116#endif
240 117
241#ifdef CONFIG_DEBUG_LOCK_ALLOC 118#ifndef CONFIG_INLINE_SPIN_TRYLOCK
119int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
120{
121 return __raw_spin_trylock(lock);
122}
123EXPORT_SYMBOL(_raw_spin_trylock);
124#endif
242 125
243void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) 126#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
127int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
244{ 128{
245 preempt_disable(); 129 return __raw_spin_trylock_bh(lock);
246 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
247 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
248} 130}
249EXPORT_SYMBOL(_spin_lock_nested); 131EXPORT_SYMBOL(_raw_spin_trylock_bh);
132#endif
250 133
251unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 134#ifndef CONFIG_INLINE_SPIN_LOCK
135void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
252{ 136{
253 unsigned long flags; 137 __raw_spin_lock(lock);
138}
139EXPORT_SYMBOL(_raw_spin_lock);
140#endif
254 141
255 local_irq_save(flags); 142#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
256 preempt_disable(); 143unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
257 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 144{
258 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, 145 return __raw_spin_lock_irqsave(lock);
259 _raw_spin_lock_flags, &flags);
260 return flags;
261} 146}
262EXPORT_SYMBOL(_spin_lock_irqsave_nested); 147EXPORT_SYMBOL(_raw_spin_lock_irqsave);
148#endif
263 149
264void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, 150#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
265 struct lockdep_map *nest_lock) 151void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
266{ 152{
267 preempt_disable(); 153 __raw_spin_lock_irq(lock);
268 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
269 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
270} 154}
271EXPORT_SYMBOL(_spin_lock_nest_lock); 155EXPORT_SYMBOL(_raw_spin_lock_irq);
156#endif
272 157
158#ifndef CONFIG_INLINE_SPIN_LOCK_BH
159void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
160{
161 __raw_spin_lock_bh(lock);
162}
163EXPORT_SYMBOL(_raw_spin_lock_bh);
273#endif 164#endif
274 165
275#ifndef _spin_unlock 166#ifndef CONFIG_INLINE_SPIN_UNLOCK
276void __lockfunc _spin_unlock(spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
277{ 168{
278 __spin_unlock(lock); 169 __raw_spin_unlock(lock);
279} 170}
280EXPORT_SYMBOL(_spin_unlock); 171EXPORT_SYMBOL(_raw_spin_unlock);
281#endif 172#endif
282 173
283#ifndef _write_unlock 174#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
284void __lockfunc _write_unlock(rwlock_t *lock) 175void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
285{ 176{
286 __write_unlock(lock); 177 __raw_spin_unlock_irqrestore(lock, flags);
287} 178}
288EXPORT_SYMBOL(_write_unlock); 179EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
289#endif 180#endif
290 181
291#ifndef _read_unlock 182#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
292void __lockfunc _read_unlock(rwlock_t *lock) 183void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
293{ 184{
294 __read_unlock(lock); 185 __raw_spin_unlock_irq(lock);
295} 186}
296EXPORT_SYMBOL(_read_unlock); 187EXPORT_SYMBOL(_raw_spin_unlock_irq);
297#endif 188#endif
298 189
299#ifndef _spin_unlock_irqrestore 190#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 191void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
301{ 192{
302 __spin_unlock_irqrestore(lock, flags); 193 __raw_spin_unlock_bh(lock);
303} 194}
304EXPORT_SYMBOL(_spin_unlock_irqrestore); 195EXPORT_SYMBOL(_raw_spin_unlock_bh);
305#endif 196#endif
306 197
307#ifndef _spin_unlock_irq 198#ifndef CONFIG_INLINE_READ_TRYLOCK
308void __lockfunc _spin_unlock_irq(spinlock_t *lock) 199int __lockfunc _raw_read_trylock(rwlock_t *lock)
309{ 200{
310 __spin_unlock_irq(lock); 201 return __raw_read_trylock(lock);
311} 202}
312EXPORT_SYMBOL(_spin_unlock_irq); 203EXPORT_SYMBOL(_raw_read_trylock);
313#endif 204#endif
314 205
315#ifndef _spin_unlock_bh 206#ifndef CONFIG_INLINE_READ_LOCK
316void __lockfunc _spin_unlock_bh(spinlock_t *lock) 207void __lockfunc _raw_read_lock(rwlock_t *lock)
317{ 208{
318 __spin_unlock_bh(lock); 209 __raw_read_lock(lock);
319} 210}
320EXPORT_SYMBOL(_spin_unlock_bh); 211EXPORT_SYMBOL(_raw_read_lock);
321#endif 212#endif
322 213
323#ifndef _read_unlock_irqrestore 214#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 215unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
325{ 216{
326 __read_unlock_irqrestore(lock, flags); 217 return __raw_read_lock_irqsave(lock);
327} 218}
328EXPORT_SYMBOL(_read_unlock_irqrestore); 219EXPORT_SYMBOL(_raw_read_lock_irqsave);
329#endif 220#endif
330 221
331#ifndef _read_unlock_irq 222#ifndef CONFIG_INLINE_READ_LOCK_IRQ
332void __lockfunc _read_unlock_irq(rwlock_t *lock) 223void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
333{ 224{
334 __read_unlock_irq(lock); 225 __raw_read_lock_irq(lock);
335} 226}
336EXPORT_SYMBOL(_read_unlock_irq); 227EXPORT_SYMBOL(_raw_read_lock_irq);
337#endif 228#endif
338 229
339#ifndef _read_unlock_bh 230#ifndef CONFIG_INLINE_READ_LOCK_BH
340void __lockfunc _read_unlock_bh(rwlock_t *lock) 231void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
341{ 232{
342 __read_unlock_bh(lock); 233 __raw_read_lock_bh(lock);
343} 234}
344EXPORT_SYMBOL(_read_unlock_bh); 235EXPORT_SYMBOL(_raw_read_lock_bh);
345#endif 236#endif
346 237
347#ifndef _write_unlock_irqrestore 238#ifndef CONFIG_INLINE_READ_UNLOCK
348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 239void __lockfunc _raw_read_unlock(rwlock_t *lock)
349{ 240{
350 __write_unlock_irqrestore(lock, flags); 241 __raw_read_unlock(lock);
351} 242}
352EXPORT_SYMBOL(_write_unlock_irqrestore); 243EXPORT_SYMBOL(_raw_read_unlock);
353#endif 244#endif
354 245
355#ifndef _write_unlock_irq 246#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
356void __lockfunc _write_unlock_irq(rwlock_t *lock) 247void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
357{ 248{
358 __write_unlock_irq(lock); 249 __raw_read_unlock_irqrestore(lock, flags);
359} 250}
360EXPORT_SYMBOL(_write_unlock_irq); 251EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
361#endif 252#endif
362 253
363#ifndef _write_unlock_bh 254#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
364void __lockfunc _write_unlock_bh(rwlock_t *lock) 255void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
365{ 256{
366 __write_unlock_bh(lock); 257 __raw_read_unlock_irq(lock);
367} 258}
368EXPORT_SYMBOL(_write_unlock_bh); 259EXPORT_SYMBOL(_raw_read_unlock_irq);
369#endif 260#endif
370 261
371#ifndef _spin_trylock_bh 262#ifndef CONFIG_INLINE_READ_UNLOCK_BH
372int __lockfunc _spin_trylock_bh(spinlock_t *lock) 263void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
373{ 264{
374 return __spin_trylock_bh(lock); 265 __raw_read_unlock_bh(lock);
375} 266}
376EXPORT_SYMBOL(_spin_trylock_bh); 267EXPORT_SYMBOL(_raw_read_unlock_bh);
268#endif
269
270#ifndef CONFIG_INLINE_WRITE_TRYLOCK
271int __lockfunc _raw_write_trylock(rwlock_t *lock)
272{
273 return __raw_write_trylock(lock);
274}
275EXPORT_SYMBOL(_raw_write_trylock);
276#endif
277
278#ifndef CONFIG_INLINE_WRITE_LOCK
279void __lockfunc _raw_write_lock(rwlock_t *lock)
280{
281 __raw_write_lock(lock);
282}
283EXPORT_SYMBOL(_raw_write_lock);
284#endif
285
286#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
287unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
288{
289 return __raw_write_lock_irqsave(lock);
290}
291EXPORT_SYMBOL(_raw_write_lock_irqsave);
292#endif
293
294#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
295void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
296{
297 __raw_write_lock_irq(lock);
298}
299EXPORT_SYMBOL(_raw_write_lock_irq);
300#endif
301
302#ifndef CONFIG_INLINE_WRITE_LOCK_BH
303void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
304{
305 __raw_write_lock_bh(lock);
306}
307EXPORT_SYMBOL(_raw_write_lock_bh);
308#endif
309
310#ifndef CONFIG_INLINE_WRITE_UNLOCK
311void __lockfunc _raw_write_unlock(rwlock_t *lock)
312{
313 __raw_write_unlock(lock);
314}
315EXPORT_SYMBOL(_raw_write_unlock);
316#endif
317
318#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
319void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
320{
321 __raw_write_unlock_irqrestore(lock, flags);
322}
323EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
324#endif
325
326#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
327void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
328{
329 __raw_write_unlock_irq(lock);
330}
331EXPORT_SYMBOL(_raw_write_unlock_irq);
332#endif
333
334#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
335void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
336{
337 __raw_write_unlock_bh(lock);
338}
339EXPORT_SYMBOL(_raw_write_unlock_bh);
340#endif
341
342#ifdef CONFIG_DEBUG_LOCK_ALLOC
343
344void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
345{
346 preempt_disable();
347 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
348 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
349}
350EXPORT_SYMBOL(_raw_spin_lock_nested);
351
352unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
353 int subclass)
354{
355 unsigned long flags;
356
357 local_irq_save(flags);
358 preempt_disable();
359 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
360 LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
361 do_raw_spin_lock_flags, &flags);
362 return flags;
363}
364EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
365
366void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
367 struct lockdep_map *nest_lock)
368{
369 preempt_disable();
370 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
371 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
372}
373EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
374
377#endif 375#endif
378 376
379notrace int in_lock_functions(unsigned long addr) 377notrace int in_lock_functions(unsigned long addr)
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..2980da3fd509 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -30,10 +30,33 @@
30#include <linux/preempt.h> 30#include <linux/preempt.h>
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/slab.h>
34#include <linux/smp.h> 33#include <linux/smp.h>
35#include <linux/srcu.h> 34#include <linux/srcu.h>
36 35
36static int init_srcu_struct_fields(struct srcu_struct *sp)
37{
38 sp->completed = 0;
39 mutex_init(&sp->mutex);
40 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
41 return sp->per_cpu_ref ? 0 : -ENOMEM;
42}
43
44#ifdef CONFIG_DEBUG_LOCK_ALLOC
45
46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key)
48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp);
55}
56EXPORT_SYMBOL_GPL(__init_srcu_struct);
57
58#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
59
37/** 60/**
38 * init_srcu_struct - initialize a sleep-RCU structure 61 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 62 * @sp: structure to initialize.
@@ -44,11 +67,11 @@
44 */ 67 */
45int init_srcu_struct(struct srcu_struct *sp) 68int init_srcu_struct(struct srcu_struct *sp)
46{ 69{
47 sp->completed = 0; 70 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 71}
72EXPORT_SYMBOL_GPL(init_srcu_struct);
73
74#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
52 75
53/* 76/*
54 * srcu_readers_active_idx -- returns approximate number of readers 77 * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,16 +120,14 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
97 free_percpu(sp->per_cpu_ref); 120 free_percpu(sp->per_cpu_ref);
98 sp->per_cpu_ref = NULL; 121 sp->per_cpu_ref = NULL;
99} 122}
123EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
100 124
101/** 125/*
102 * srcu_read_lock - register a new reader for an SRCU-protected structure.
103 * @sp: srcu_struct in which to register the new reader.
104 *
105 * Counts the new reader in the appropriate per-CPU element of the 126 * Counts the new reader in the appropriate per-CPU element of the
106 * srcu_struct. Must be called from process context. 127 * srcu_struct. Must be called from process context.
107 * Returns an index that must be passed to the matching srcu_read_unlock(). 128 * Returns an index that must be passed to the matching srcu_read_unlock().
108 */ 129 */
109int srcu_read_lock(struct srcu_struct *sp) 130int __srcu_read_lock(struct srcu_struct *sp)
110{ 131{
111 int idx; 132 int idx;
112 133
@@ -118,40 +139,27 @@ int srcu_read_lock(struct srcu_struct *sp)
118 preempt_enable(); 139 preempt_enable();
119 return idx; 140 return idx;
120} 141}
142EXPORT_SYMBOL_GPL(__srcu_read_lock);
121 143
122/** 144/*
123 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
124 * @sp: srcu_struct in which to unregister the old reader.
125 * @idx: return value from corresponding srcu_read_lock().
126 *
127 * Removes the count for the old reader from the appropriate per-CPU 145 * Removes the count for the old reader from the appropriate per-CPU
128 * element of the srcu_struct. Note that this may well be a different 146 * element of the srcu_struct. Note that this may well be a different
129 * CPU than that which was incremented by the corresponding srcu_read_lock(). 147 * CPU than that which was incremented by the corresponding srcu_read_lock().
130 * Must be called from process context. 148 * Must be called from process context.
131 */ 149 */
132void srcu_read_unlock(struct srcu_struct *sp, int idx) 150void __srcu_read_unlock(struct srcu_struct *sp, int idx)
133{ 151{
134 preempt_disable(); 152 preempt_disable();
135 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 153 srcu_barrier(); /* ensure compiler won't misorder critical section. */
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 154 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
137 preempt_enable(); 155 preempt_enable();
138} 156}
157EXPORT_SYMBOL_GPL(__srcu_read_unlock);
139 158
140/** 159/*
141 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 160 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
142 * @sp: srcu_struct with which to synchronize.
143 *
144 * Flip the completed counter, and wait for the old count to drain to zero.
145 * As with classic RCU, the updater must use some separate means of
146 * synchronizing concurrent updates. Can block; must be called from
147 * process context.
148 *
149 * Note that it is illegal to call synchornize_srcu() from the corresponding
150 * SRCU read-side critical section; doing so will result in deadlock.
151 * However, it is perfectly legal to call synchronize_srcu() on one
152 * srcu_struct from some other srcu_struct's read-side critical section.
153 */ 161 */
154void synchronize_srcu(struct srcu_struct *sp) 162static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
155{ 163{
156 int idx; 164 int idx;
157 165
@@ -173,7 +181,7 @@ void synchronize_srcu(struct srcu_struct *sp)
173 return; 181 return;
174 } 182 }
175 183
176 synchronize_sched(); /* Force memory barrier on all CPUs. */ 184 sync_func(); /* Force memory barrier on all CPUs. */
177 185
178 /* 186 /*
179 * The preceding synchronize_sched() ensures that any CPU that 187 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +198,7 @@ void synchronize_srcu(struct srcu_struct *sp)
190 idx = sp->completed & 0x1; 198 idx = sp->completed & 0x1;
191 sp->completed++; 199 sp->completed++;
192 200
193 synchronize_sched(); /* Force memory barrier on all CPUs. */ 201 sync_func(); /* Force memory barrier on all CPUs. */
194 202
195 /* 203 /*
196 * At this point, because of the preceding synchronize_sched(), 204 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +211,7 @@ void synchronize_srcu(struct srcu_struct *sp)
203 while (srcu_readers_active_idx(sp, idx)) 211 while (srcu_readers_active_idx(sp, idx))
204 schedule_timeout_interruptible(1); 212 schedule_timeout_interruptible(1);
205 213
206 synchronize_sched(); /* Force memory barrier on all CPUs. */ 214 sync_func(); /* Force memory barrier on all CPUs. */
207 215
208 /* 216 /*
209 * The preceding synchronize_sched() forces all srcu_read_unlock() 217 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -237,6 +245,47 @@ void synchronize_srcu(struct srcu_struct *sp)
237} 245}
238 246
239/** 247/**
248 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
249 * @sp: srcu_struct with which to synchronize.
250 *
251 * Flip the completed counter, and wait for the old count to drain to zero.
252 * As with classic RCU, the updater must use some separate means of
253 * synchronizing concurrent updates. Can block; must be called from
254 * process context.
255 *
256 * Note that it is illegal to call synchronize_srcu() from the corresponding
257 * SRCU read-side critical section; doing so will result in deadlock.
258 * However, it is perfectly legal to call synchronize_srcu() on one
259 * srcu_struct from some other srcu_struct's read-side critical section.
260 */
261void synchronize_srcu(struct srcu_struct *sp)
262{
263 __synchronize_srcu(sp, synchronize_sched);
264}
265EXPORT_SYMBOL_GPL(synchronize_srcu);
266
267/**
268 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
269 * @sp: srcu_struct with which to synchronize.
270 *
271 * Flip the completed counter, and wait for the old count to drain to zero.
272 * As with classic RCU, the updater must use some separate means of
273 * synchronizing concurrent updates. Can block; must be called from
274 * process context.
275 *
276 * Note that it is illegal to call synchronize_srcu_expedited()
277 * from the corresponding SRCU read-side critical section; doing so
278 * will result in deadlock. However, it is perfectly legal to call
279 * synchronize_srcu_expedited() on one srcu_struct from some other
280 * srcu_struct's read-side critical section.
281 */
282void synchronize_srcu_expedited(struct srcu_struct *sp)
283{
284 __synchronize_srcu(sp, synchronize_sched_expedited);
285}
286EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
287
288/**
240 * srcu_batches_completed - return batches completed. 289 * srcu_batches_completed - return batches completed.
241 * @sp: srcu_struct on which to report batch completion. 290 * @sp: srcu_struct on which to report batch completion.
242 * 291 *
@@ -248,10 +297,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
248{ 297{
249 return sp->completed; 298 return sp->completed;
250} 299}
251
252EXPORT_SYMBOL_GPL(init_srcu_struct);
253EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
254EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 300EXPORT_SYMBOL_GPL(srcu_batches_completed);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..9bb9fb1bd79c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -45,7 +45,7 @@ static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void __percpu *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
51{ 51{
diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760d9c51..7cb426a58965 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/smp_lock.h>
12#include <linux/notifier.h> 11#include <linux/notifier.h>
13#include <linux/reboot.h> 12#include <linux/reboot.h>
14#include <linux/prctl.h> 13#include <linux/prctl.h>
@@ -34,8 +33,10 @@
34#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
35#include <linux/seccomp.h> 34#include <linux/seccomp.h>
36#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/personality.h>
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h>
39 40
40#include <linux/compat.h> 41#include <linux/compat.h>
41#include <linux/syscalls.h> 42#include <linux/syscalls.h>
@@ -163,6 +164,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
163 if (niceval > 19) 164 if (niceval > 19)
164 niceval = 19; 165 niceval = 19;
165 166
167 rcu_read_lock();
166 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
167 switch (which) { 169 switch (which) {
168 case PRIO_PROCESS: 170 case PRIO_PROCESS:
@@ -190,16 +192,17 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
190 !(user = find_user(who))) 192 !(user = find_user(who)))
191 goto out_unlock; /* No processes for this user */ 193 goto out_unlock; /* No processes for this user */
192 194
193 do_each_thread(g, p) 195 do_each_thread(g, p) {
194 if (__task_cred(p)->uid == who) 196 if (__task_cred(p)->uid == who)
195 error = set_one_prio(p, niceval, error); 197 error = set_one_prio(p, niceval, error);
196 while_each_thread(g, p); 198 } while_each_thread(g, p);
197 if (who != cred->uid) 199 if (who != cred->uid)
198 free_uid(user); /* For find_user() */ 200 free_uid(user); /* For find_user() */
199 break; 201 break;
200 } 202 }
201out_unlock: 203out_unlock:
202 read_unlock(&tasklist_lock); 204 read_unlock(&tasklist_lock);
205 rcu_read_unlock();
203out: 206out:
204 return error; 207 return error;
205} 208}
@@ -221,6 +224,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
221 if (which > PRIO_USER || which < PRIO_PROCESS) 224 if (which > PRIO_USER || which < PRIO_PROCESS)
222 return -EINVAL; 225 return -EINVAL;
223 226
227 rcu_read_lock();
224 read_lock(&tasklist_lock); 228 read_lock(&tasklist_lock);
225 switch (which) { 229 switch (which) {
226 case PRIO_PROCESS: 230 case PRIO_PROCESS:
@@ -253,19 +257,20 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
253 !(user = find_user(who))) 257 !(user = find_user(who)))
254 goto out_unlock; /* No processes for this user */ 258 goto out_unlock; /* No processes for this user */
255 259
256 do_each_thread(g, p) 260 do_each_thread(g, p) {
257 if (__task_cred(p)->uid == who) { 261 if (__task_cred(p)->uid == who) {
258 niceval = 20 - task_nice(p); 262 niceval = 20 - task_nice(p);
259 if (niceval > retval) 263 if (niceval > retval)
260 retval = niceval; 264 retval = niceval;
261 } 265 }
262 while_each_thread(g, p); 266 } while_each_thread(g, p);
263 if (who != cred->uid) 267 if (who != cred->uid)
264 free_uid(user); /* for find_user() */ 268 free_uid(user); /* for find_user() */
265 break; 269 break;
266 } 270 }
267out_unlock: 271out_unlock:
268 read_unlock(&tasklist_lock); 272 read_unlock(&tasklist_lock);
273 rcu_read_unlock();
269 274
270 return retval; 275 return retval;
271} 276}
@@ -349,6 +354,9 @@ void kernel_power_off(void)
349 machine_power_off(); 354 machine_power_off();
350} 355}
351EXPORT_SYMBOL_GPL(kernel_power_off); 356EXPORT_SYMBOL_GPL(kernel_power_off);
357
358static DEFINE_MUTEX(reboot_mutex);
359
352/* 360/*
353 * Reboot system call: for obvious reasons only root may call it, 361 * Reboot system call: for obvious reasons only root may call it,
354 * and even root needs to set up some magic numbers in the registers 362 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +389,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
381 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 389 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
382 cmd = LINUX_REBOOT_CMD_HALT; 390 cmd = LINUX_REBOOT_CMD_HALT;
383 391
384 lock_kernel(); 392 mutex_lock(&reboot_mutex);
385 switch (cmd) { 393 switch (cmd) {
386 case LINUX_REBOOT_CMD_RESTART: 394 case LINUX_REBOOT_CMD_RESTART:
387 kernel_restart(NULL); 395 kernel_restart(NULL);
@@ -397,20 +405,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 405
398 case LINUX_REBOOT_CMD_HALT: 406 case LINUX_REBOOT_CMD_HALT:
399 kernel_halt(); 407 kernel_halt();
400 unlock_kernel();
401 do_exit(0); 408 do_exit(0);
402 panic("cannot halt"); 409 panic("cannot halt");
403 410
404 case LINUX_REBOOT_CMD_POWER_OFF: 411 case LINUX_REBOOT_CMD_POWER_OFF:
405 kernel_power_off(); 412 kernel_power_off();
406 unlock_kernel();
407 do_exit(0); 413 do_exit(0);
408 break; 414 break;
409 415
410 case LINUX_REBOOT_CMD_RESTART2: 416 case LINUX_REBOOT_CMD_RESTART2:
411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 417 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
412 unlock_kernel(); 418 ret = -EFAULT;
413 return -EFAULT; 419 break;
414 } 420 }
415 buffer[sizeof(buffer) - 1] = '\0'; 421 buffer[sizeof(buffer) - 1] = '\0';
416 422
@@ -433,7 +439,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
433 ret = -EINVAL; 439 ret = -EINVAL;
434 break; 440 break;
435 } 441 }
436 unlock_kernel(); 442 mutex_unlock(&reboot_mutex);
437 return ret; 443 return ret;
438} 444}
439 445
@@ -567,13 +573,7 @@ static int set_user(struct cred *new)
567 if (!new_user) 573 if (!new_user)
568 return -EAGAIN; 574 return -EAGAIN;
569 575
570 if (!task_can_switch_user(new_user, current)) { 576 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
571 free_uid(new_user);
572 return -EINVAL;
573 }
574
575 if (atomic_read(&new_user->processes) >=
576 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
577 new_user != INIT_USER) { 577 new_user != INIT_USER) {
578 free_uid(new_user); 578 free_uid(new_user);
579 return -EAGAIN; 579 return -EAGAIN;
@@ -911,16 +911,15 @@ change_okay:
911 911
912void do_sys_times(struct tms *tms) 912void do_sys_times(struct tms *tms)
913{ 913{
914 struct task_cputime cputime; 914 cputime_t tgutime, tgstime, cutime, cstime;
915 cputime_t cutime, cstime;
916 915
917 thread_group_cputime(current, &cputime);
918 spin_lock_irq(&current->sighand->siglock); 916 spin_lock_irq(&current->sighand->siglock);
917 thread_group_times(current, &tgutime, &tgstime);
919 cutime = current->signal->cutime; 918 cutime = current->signal->cutime;
920 cstime = current->signal->cstime; 919 cstime = current->signal->cstime;
921 spin_unlock_irq(&current->sighand->siglock); 920 spin_unlock_irq(&current->sighand->siglock);
922 tms->tms_utime = cputime_to_clock_t(cputime.utime); 921 tms->tms_utime = cputime_to_clock_t(tgutime);
923 tms->tms_stime = cputime_to_clock_t(cputime.stime); 922 tms->tms_stime = cputime_to_clock_t(tgstime);
924 tms->tms_cutime = cputime_to_clock_t(cutime); 923 tms->tms_cutime = cputime_to_clock_t(cutime);
925 tms->tms_cstime = cputime_to_clock_t(cstime); 924 tms->tms_cstime = cputime_to_clock_t(cstime);
926} 925}
@@ -1117,6 +1116,15 @@ out:
1117 1116
1118DECLARE_RWSEM(uts_sem); 1117DECLARE_RWSEM(uts_sem);
1119 1118
1119#ifdef COMPAT_UTS_MACHINE
1120#define override_architecture(name) \
1121 (personality(current->personality) == PER_LINUX32 && \
1122 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1123 sizeof(COMPAT_UTS_MACHINE)))
1124#else
1125#define override_architecture(name) 0
1126#endif
1127
1120SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1128SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1121{ 1129{
1122 int errno = 0; 1130 int errno = 0;
@@ -1125,9 +1133,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1125 if (copy_to_user(name, utsname(), sizeof *name)) 1133 if (copy_to_user(name, utsname(), sizeof *name))
1126 errno = -EFAULT; 1134 errno = -EFAULT;
1127 up_read(&uts_sem); 1135 up_read(&uts_sem);
1136
1137 if (!errno && override_architecture(name))
1138 errno = -EFAULT;
1128 return errno; 1139 return errno;
1129} 1140}
1130 1141
1142#ifdef __ARCH_WANT_SYS_OLD_UNAME
1143/*
1144 * Old cruft
1145 */
1146SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1147{
1148 int error = 0;
1149
1150 if (!name)
1151 return -EFAULT;
1152
1153 down_read(&uts_sem);
1154 if (copy_to_user(name, utsname(), sizeof(*name)))
1155 error = -EFAULT;
1156 up_read(&uts_sem);
1157
1158 if (!error && override_architecture(name))
1159 error = -EFAULT;
1160 return error;
1161}
1162
1163SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1164{
1165 int error;
1166
1167 if (!name)
1168 return -EFAULT;
1169 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1170 return -EFAULT;
1171
1172 down_read(&uts_sem);
1173 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1174 __OLD_UTS_LEN);
1175 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1176 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1177 __OLD_UTS_LEN);
1178 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1179 error |= __copy_to_user(&name->release, &utsname()->release,
1180 __OLD_UTS_LEN);
1181 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1182 error |= __copy_to_user(&name->version, &utsname()->version,
1183 __OLD_UTS_LEN);
1184 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1185 error |= __copy_to_user(&name->machine, &utsname()->machine,
1186 __OLD_UTS_LEN);
1187 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1188 up_read(&uts_sem);
1189
1190 if (!error && override_architecture(name))
1191 error = -EFAULT;
1192 return error ? -EFAULT : 0;
1193}
1194#endif
1195
1131SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1196SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1132{ 1197{
1133 int errno; 1198 int errno;
@@ -1338,16 +1403,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1338{ 1403{
1339 struct task_struct *t; 1404 struct task_struct *t;
1340 unsigned long flags; 1405 unsigned long flags;
1341 cputime_t utime, stime; 1406 cputime_t tgutime, tgstime, utime, stime;
1342 struct task_cputime cputime;
1343 unsigned long maxrss = 0; 1407 unsigned long maxrss = 0;
1344 1408
1345 memset((char *) r, 0, sizeof *r); 1409 memset((char *) r, 0, sizeof *r);
1346 utime = stime = cputime_zero; 1410 utime = stime = cputime_zero;
1347 1411
1348 if (who == RUSAGE_THREAD) { 1412 if (who == RUSAGE_THREAD) {
1349 utime = task_utime(current); 1413 task_times(current, &utime, &stime);
1350 stime = task_stime(current);
1351 accumulate_thread_rusage(p, r); 1414 accumulate_thread_rusage(p, r);
1352 maxrss = p->signal->maxrss; 1415 maxrss = p->signal->maxrss;
1353 goto out; 1416 goto out;
@@ -1373,9 +1436,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1373 break; 1436 break;
1374 1437
1375 case RUSAGE_SELF: 1438 case RUSAGE_SELF:
1376 thread_group_cputime(p, &cputime); 1439 thread_group_times(p, &tgutime, &tgstime);
1377 utime = cputime_add(utime, cputime.utime); 1440 utime = cputime_add(utime, tgutime);
1378 stime = cputime_add(stime, cputime.stime); 1441 stime = cputime_add(stime, tgstime);
1379 r->ru_nvcsw += p->signal->nvcsw; 1442 r->ru_nvcsw += p->signal->nvcsw;
1380 r->ru_nivcsw += p->signal->nivcsw; 1443 r->ru_nivcsw += p->signal->nivcsw;
1381 r->ru_minflt += p->signal->min_flt; 1444 r->ru_minflt += p->signal->min_flt;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e06d0b8d1951..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,8 +48,10 @@ cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(compat_sys_sendmsg); 49cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg);
51cond_syscall(compat_sys_recvmsg); 52cond_syscall(compat_sys_recvmsg);
52cond_syscall(compat_sys_recvfrom); 53cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg);
53cond_syscall(sys_socketcall); 55cond_syscall(sys_socketcall);
54cond_syscall(sys_futex); 56cond_syscall(sys_futex);
55cond_syscall(compat_sys_futex); 57cond_syscall(compat_sys_futex);
@@ -124,6 +126,7 @@ cond_syscall(sys_setreuid16);
124cond_syscall(sys_setuid16); 126cond_syscall(sys_setuid16);
125cond_syscall(sys_vm86old); 127cond_syscall(sys_vm86old);
126cond_syscall(sys_vm86); 128cond_syscall(sys_vm86);
129cond_syscall(sys_ipc);
127cond_syscall(compat_sys_ipc); 130cond_syscall(compat_sys_ipc);
128cond_syscall(compat_sys_sysctl); 131cond_syscall(compat_sys_sysctl);
129cond_syscall(sys_flock); 132cond_syscall(sys_flock);
@@ -139,7 +142,6 @@ cond_syscall(sys_pciconfig_read);
139cond_syscall(sys_pciconfig_write); 142cond_syscall(sys_pciconfig_write);
140cond_syscall(sys_pciconfig_iobase); 143cond_syscall(sys_pciconfig_iobase);
141cond_syscall(sys32_ipc); 144cond_syscall(sys32_ipc);
142cond_syscall(sys32_sysctl);
143cond_syscall(ppc_rtas); 145cond_syscall(ppc_rtas);
144cond_syscall(sys_spu_run); 146cond_syscall(sys_spu_run);
145cond_syscall(sys_spu_create); 147cond_syscall(sys_spu_create);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c517412..8686b0f5fc12 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,11 +23,11 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h>
26#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
27#include <linux/security.h> 28#include <linux/security.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
29#include <linux/kmemcheck.h> 30#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/kernel.h> 33#include <linux/kernel.h>
@@ -36,6 +36,7 @@
36#include <linux/sysrq.h> 36#include <linux/sysrq.h>
37#include <linux/highuid.h> 37#include <linux/highuid.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/ratelimit.h>
39#include <linux/hugetlb.h> 40#include <linux/hugetlb.h>
40#include <linux/initrd.h> 41#include <linux/initrd.h>
41#include <linux/key.h> 42#include <linux/key.h>
@@ -50,6 +51,7 @@
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
51#include <linux/slow-work.h> 52#include <linux/slow-work.h>
52#include <linux/perf_event.h> 53#include <linux/perf_event.h>
54#include <linux/kprobes.h>
53 55
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/processor.h> 57#include <asm/processor.h>
@@ -59,14 +61,23 @@
59#include <asm/stacktrace.h> 61#include <asm/stacktrace.h>
60#include <asm/io.h> 62#include <asm/io.h>
61#endif 63#endif
64#ifdef CONFIG_BSD_PROCESS_ACCT
65#include <linux/acct.h>
66#endif
67#ifdef CONFIG_RT_MUTEXES
68#include <linux/rtmutex.h>
69#endif
70#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
71#include <linux/lockdep.h>
72#endif
73#ifdef CONFIG_CHR_DEV_SG
74#include <scsi/sg.h>
75#endif
62 76
63static int deprecated_sysctl_warning(struct __sysctl_args *args);
64 77
65#if defined(CONFIG_SYSCTL) 78#if defined(CONFIG_SYSCTL)
66 79
67/* External variables not in a header file. */ 80/* External variables not in a header file. */
68extern int C_A_D;
69extern int print_fatal_signals;
70extern int sysctl_overcommit_memory; 81extern int sysctl_overcommit_memory;
71extern int sysctl_overcommit_ratio; 82extern int sysctl_overcommit_ratio;
72extern int sysctl_panic_on_oom; 83extern int sysctl_panic_on_oom;
@@ -88,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
88#ifndef CONFIG_MMU 99#ifndef CONFIG_MMU
89extern int sysctl_nr_trim_pages; 100extern int sysctl_nr_trim_pages;
90#endif 101#endif
91#ifdef CONFIG_RCU_TORTURE_TEST
92extern int rcutorture_runnable;
93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK 102#ifdef CONFIG_BLOCK
95extern int blk_iopoll_enabled; 103extern int blk_iopoll_enabled;
96#endif 104#endif
@@ -120,14 +128,6 @@ static int min_percpu_pagelist_fract = 8;
120 128
121static int ngroups_max = NGROUPS_MAX; 129static int ngroups_max = NGROUPS_MAX;
122 130
123#ifdef CONFIG_MODULES
124extern char modprobe_path[];
125extern int modules_disabled;
126#endif
127#ifdef CONFIG_CHR_DEV_SG
128extern int sg_big_buff;
129#endif
130
131#ifdef CONFIG_SPARC 131#ifdef CONFIG_SPARC
132#include <asm/system.h> 132#include <asm/system.h>
133#endif 133#endif
@@ -149,18 +149,12 @@ extern int sysctl_userprocess_debug;
149extern int spin_retry; 149extern int spin_retry;
150#endif 150#endif
151 151
152#ifdef CONFIG_BSD_PROCESS_ACCT
153extern int acct_parm[];
154#endif
155
156#ifdef CONFIG_IA64 152#ifdef CONFIG_IA64
157extern int no_unaligned_warning; 153extern int no_unaligned_warning;
158extern int unaligned_dump_stack; 154extern int unaligned_dump_stack;
159#endif 155#endif
160 156
161#ifdef CONFIG_RT_MUTEXES 157extern struct ratelimit_state printk_ratelimit_state;
162extern int max_lock_depth;
163#endif
164 158
165#ifdef CONFIG_PROC_SYSCTL 159#ifdef CONFIG_PROC_SYSCTL
166static int proc_do_cad_pid(struct ctl_table *table, int write, 160static int proc_do_cad_pid(struct ctl_table *table, int write,
@@ -200,38 +194,30 @@ extern struct ctl_table epoll_table[];
200int sysctl_legacy_va_layout; 194int sysctl_legacy_va_layout;
201#endif 195#endif
202 196
203extern int prove_locking;
204extern int lock_stat;
205
206/* The default sysctl tables: */ 197/* The default sysctl tables: */
207 198
208static struct ctl_table root_table[] = { 199static struct ctl_table root_table[] = {
209 { 200 {
210 .ctl_name = CTL_KERN,
211 .procname = "kernel", 201 .procname = "kernel",
212 .mode = 0555, 202 .mode = 0555,
213 .child = kern_table, 203 .child = kern_table,
214 }, 204 },
215 { 205 {
216 .ctl_name = CTL_VM,
217 .procname = "vm", 206 .procname = "vm",
218 .mode = 0555, 207 .mode = 0555,
219 .child = vm_table, 208 .child = vm_table,
220 }, 209 },
221 { 210 {
222 .ctl_name = CTL_FS,
223 .procname = "fs", 211 .procname = "fs",
224 .mode = 0555, 212 .mode = 0555,
225 .child = fs_table, 213 .child = fs_table,
226 }, 214 },
227 { 215 {
228 .ctl_name = CTL_DEBUG,
229 .procname = "debug", 216 .procname = "debug",
230 .mode = 0555, 217 .mode = 0555,
231 .child = debug_table, 218 .child = debug_table,
232 }, 219 },
233 { 220 {
234 .ctl_name = CTL_DEV,
235 .procname = "dev", 221 .procname = "dev",
236 .mode = 0555, 222 .mode = 0555,
237 .child = dev_table, 223 .child = dev_table,
@@ -240,7 +226,7 @@ static struct ctl_table root_table[] = {
240 * NOTE: do not add new entries to this table unless you have read 226 * NOTE: do not add new entries to this table unless you have read
241 * Documentation/sysctl/ctl_unnumbered.txt 227 * Documentation/sysctl/ctl_unnumbered.txt
242 */ 228 */
243 { .ctl_name = 0 } 229 { }
244}; 230};
245 231
246#ifdef CONFIG_SCHED_DEBUG 232#ifdef CONFIG_SCHED_DEBUG
@@ -248,196 +234,178 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
248static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 234static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
249static int min_wakeup_granularity_ns; /* 0 usecs */ 235static int min_wakeup_granularity_ns; /* 0 usecs */
250static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 236static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
237static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
238static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
239static int min_sched_shares_ratelimit = 100000; /* 100 usec */
240static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
251#endif 241#endif
252 242
253static struct ctl_table kern_table[] = { 243static struct ctl_table kern_table[] = {
254 { 244 {
255 .ctl_name = CTL_UNNUMBERED,
256 .procname = "sched_child_runs_first", 245 .procname = "sched_child_runs_first",
257 .data = &sysctl_sched_child_runs_first, 246 .data = &sysctl_sched_child_runs_first,
258 .maxlen = sizeof(unsigned int), 247 .maxlen = sizeof(unsigned int),
259 .mode = 0644, 248 .mode = 0644,
260 .proc_handler = &proc_dointvec, 249 .proc_handler = proc_dointvec,
261 }, 250 },
262#ifdef CONFIG_SCHED_DEBUG 251#ifdef CONFIG_SCHED_DEBUG
263 { 252 {
264 .ctl_name = CTL_UNNUMBERED,
265 .procname = "sched_min_granularity_ns", 253 .procname = "sched_min_granularity_ns",
266 .data = &sysctl_sched_min_granularity, 254 .data = &sysctl_sched_min_granularity,
267 .maxlen = sizeof(unsigned int), 255 .maxlen = sizeof(unsigned int),
268 .mode = 0644, 256 .mode = 0644,
269 .proc_handler = &sched_nr_latency_handler, 257 .proc_handler = sched_proc_update_handler,
270 .strategy = &sysctl_intvec,
271 .extra1 = &min_sched_granularity_ns, 258 .extra1 = &min_sched_granularity_ns,
272 .extra2 = &max_sched_granularity_ns, 259 .extra2 = &max_sched_granularity_ns,
273 }, 260 },
274 { 261 {
275 .ctl_name = CTL_UNNUMBERED,
276 .procname = "sched_latency_ns", 262 .procname = "sched_latency_ns",
277 .data = &sysctl_sched_latency, 263 .data = &sysctl_sched_latency,
278 .maxlen = sizeof(unsigned int), 264 .maxlen = sizeof(unsigned int),
279 .mode = 0644, 265 .mode = 0644,
280 .proc_handler = &sched_nr_latency_handler, 266 .proc_handler = sched_proc_update_handler,
281 .strategy = &sysctl_intvec,
282 .extra1 = &min_sched_granularity_ns, 267 .extra1 = &min_sched_granularity_ns,
283 .extra2 = &max_sched_granularity_ns, 268 .extra2 = &max_sched_granularity_ns,
284 }, 269 },
285 { 270 {
286 .ctl_name = CTL_UNNUMBERED,
287 .procname = "sched_wakeup_granularity_ns", 271 .procname = "sched_wakeup_granularity_ns",
288 .data = &sysctl_sched_wakeup_granularity, 272 .data = &sysctl_sched_wakeup_granularity,
289 .maxlen = sizeof(unsigned int), 273 .maxlen = sizeof(unsigned int),
290 .mode = 0644, 274 .mode = 0644,
291 .proc_handler = &proc_dointvec_minmax, 275 .proc_handler = sched_proc_update_handler,
292 .strategy = &sysctl_intvec,
293 .extra1 = &min_wakeup_granularity_ns, 276 .extra1 = &min_wakeup_granularity_ns,
294 .extra2 = &max_wakeup_granularity_ns, 277 .extra2 = &max_wakeup_granularity_ns,
295 }, 278 },
296 { 279 {
297 .ctl_name = CTL_UNNUMBERED,
298 .procname = "sched_shares_ratelimit", 280 .procname = "sched_shares_ratelimit",
299 .data = &sysctl_sched_shares_ratelimit, 281 .data = &sysctl_sched_shares_ratelimit,
300 .maxlen = sizeof(unsigned int), 282 .maxlen = sizeof(unsigned int),
301 .mode = 0644, 283 .mode = 0644,
302 .proc_handler = &proc_dointvec, 284 .proc_handler = sched_proc_update_handler,
285 .extra1 = &min_sched_shares_ratelimit,
286 .extra2 = &max_sched_shares_ratelimit,
303 }, 287 },
304 { 288 {
305 .ctl_name = CTL_UNNUMBERED, 289 .procname = "sched_tunable_scaling",
306 .procname = "sched_shares_thresh", 290 .data = &sysctl_sched_tunable_scaling,
307 .data = &sysctl_sched_shares_thresh, 291 .maxlen = sizeof(enum sched_tunable_scaling),
308 .maxlen = sizeof(unsigned int),
309 .mode = 0644, 292 .mode = 0644,
310 .proc_handler = &proc_dointvec_minmax, 293 .proc_handler = sched_proc_update_handler,
311 .strategy = &sysctl_intvec, 294 .extra1 = &min_sched_tunable_scaling,
312 .extra1 = &zero, 295 .extra2 = &max_sched_tunable_scaling,
313 }, 296 },
314 { 297 {
315 .ctl_name = CTL_UNNUMBERED, 298 .procname = "sched_shares_thresh",
316 .procname = "sched_features", 299 .data = &sysctl_sched_shares_thresh,
317 .data = &sysctl_sched_features,
318 .maxlen = sizeof(unsigned int), 300 .maxlen = sizeof(unsigned int),
319 .mode = 0644, 301 .mode = 0644,
320 .proc_handler = &proc_dointvec, 302 .proc_handler = proc_dointvec_minmax,
303 .extra1 = &zero,
321 }, 304 },
322 { 305 {
323 .ctl_name = CTL_UNNUMBERED,
324 .procname = "sched_migration_cost", 306 .procname = "sched_migration_cost",
325 .data = &sysctl_sched_migration_cost, 307 .data = &sysctl_sched_migration_cost,
326 .maxlen = sizeof(unsigned int), 308 .maxlen = sizeof(unsigned int),
327 .mode = 0644, 309 .mode = 0644,
328 .proc_handler = &proc_dointvec, 310 .proc_handler = proc_dointvec,
329 }, 311 },
330 { 312 {
331 .ctl_name = CTL_UNNUMBERED,
332 .procname = "sched_nr_migrate", 313 .procname = "sched_nr_migrate",
333 .data = &sysctl_sched_nr_migrate, 314 .data = &sysctl_sched_nr_migrate,
334 .maxlen = sizeof(unsigned int), 315 .maxlen = sizeof(unsigned int),
335 .mode = 0644, 316 .mode = 0644,
336 .proc_handler = &proc_dointvec, 317 .proc_handler = proc_dointvec,
337 }, 318 },
338 { 319 {
339 .ctl_name = CTL_UNNUMBERED,
340 .procname = "sched_time_avg", 320 .procname = "sched_time_avg",
341 .data = &sysctl_sched_time_avg, 321 .data = &sysctl_sched_time_avg,
342 .maxlen = sizeof(unsigned int), 322 .maxlen = sizeof(unsigned int),
343 .mode = 0644, 323 .mode = 0644,
344 .proc_handler = &proc_dointvec, 324 .proc_handler = proc_dointvec,
345 }, 325 },
346 { 326 {
347 .ctl_name = CTL_UNNUMBERED,
348 .procname = "timer_migration", 327 .procname = "timer_migration",
349 .data = &sysctl_timer_migration, 328 .data = &sysctl_timer_migration,
350 .maxlen = sizeof(unsigned int), 329 .maxlen = sizeof(unsigned int),
351 .mode = 0644, 330 .mode = 0644,
352 .proc_handler = &proc_dointvec_minmax, 331 .proc_handler = proc_dointvec_minmax,
353 .strategy = &sysctl_intvec,
354 .extra1 = &zero, 332 .extra1 = &zero,
355 .extra2 = &one, 333 .extra2 = &one,
356 }, 334 },
357#endif 335#endif
358 { 336 {
359 .ctl_name = CTL_UNNUMBERED,
360 .procname = "sched_rt_period_us", 337 .procname = "sched_rt_period_us",
361 .data = &sysctl_sched_rt_period, 338 .data = &sysctl_sched_rt_period,
362 .maxlen = sizeof(unsigned int), 339 .maxlen = sizeof(unsigned int),
363 .mode = 0644, 340 .mode = 0644,
364 .proc_handler = &sched_rt_handler, 341 .proc_handler = sched_rt_handler,
365 }, 342 },
366 { 343 {
367 .ctl_name = CTL_UNNUMBERED,
368 .procname = "sched_rt_runtime_us", 344 .procname = "sched_rt_runtime_us",
369 .data = &sysctl_sched_rt_runtime, 345 .data = &sysctl_sched_rt_runtime,
370 .maxlen = sizeof(int), 346 .maxlen = sizeof(int),
371 .mode = 0644, 347 .mode = 0644,
372 .proc_handler = &sched_rt_handler, 348 .proc_handler = sched_rt_handler,
373 }, 349 },
374 { 350 {
375 .ctl_name = CTL_UNNUMBERED,
376 .procname = "sched_compat_yield", 351 .procname = "sched_compat_yield",
377 .data = &sysctl_sched_compat_yield, 352 .data = &sysctl_sched_compat_yield,
378 .maxlen = sizeof(unsigned int), 353 .maxlen = sizeof(unsigned int),
379 .mode = 0644, 354 .mode = 0644,
380 .proc_handler = &proc_dointvec, 355 .proc_handler = proc_dointvec,
381 }, 356 },
382#ifdef CONFIG_PROVE_LOCKING 357#ifdef CONFIG_PROVE_LOCKING
383 { 358 {
384 .ctl_name = CTL_UNNUMBERED,
385 .procname = "prove_locking", 359 .procname = "prove_locking",
386 .data = &prove_locking, 360 .data = &prove_locking,
387 .maxlen = sizeof(int), 361 .maxlen = sizeof(int),
388 .mode = 0644, 362 .mode = 0644,
389 .proc_handler = &proc_dointvec, 363 .proc_handler = proc_dointvec,
390 }, 364 },
391#endif 365#endif
392#ifdef CONFIG_LOCK_STAT 366#ifdef CONFIG_LOCK_STAT
393 { 367 {
394 .ctl_name = CTL_UNNUMBERED,
395 .procname = "lock_stat", 368 .procname = "lock_stat",
396 .data = &lock_stat, 369 .data = &lock_stat,
397 .maxlen = sizeof(int), 370 .maxlen = sizeof(int),
398 .mode = 0644, 371 .mode = 0644,
399 .proc_handler = &proc_dointvec, 372 .proc_handler = proc_dointvec,
400 }, 373 },
401#endif 374#endif
402 { 375 {
403 .ctl_name = KERN_PANIC,
404 .procname = "panic", 376 .procname = "panic",
405 .data = &panic_timeout, 377 .data = &panic_timeout,
406 .maxlen = sizeof(int), 378 .maxlen = sizeof(int),
407 .mode = 0644, 379 .mode = 0644,
408 .proc_handler = &proc_dointvec, 380 .proc_handler = proc_dointvec,
409 }, 381 },
410 { 382 {
411 .ctl_name = KERN_CORE_USES_PID,
412 .procname = "core_uses_pid", 383 .procname = "core_uses_pid",
413 .data = &core_uses_pid, 384 .data = &core_uses_pid,
414 .maxlen = sizeof(int), 385 .maxlen = sizeof(int),
415 .mode = 0644, 386 .mode = 0644,
416 .proc_handler = &proc_dointvec, 387 .proc_handler = proc_dointvec,
417 }, 388 },
418 { 389 {
419 .ctl_name = KERN_CORE_PATTERN,
420 .procname = "core_pattern", 390 .procname = "core_pattern",
421 .data = core_pattern, 391 .data = core_pattern,
422 .maxlen = CORENAME_MAX_SIZE, 392 .maxlen = CORENAME_MAX_SIZE,
423 .mode = 0644, 393 .mode = 0644,
424 .proc_handler = &proc_dostring, 394 .proc_handler = proc_dostring,
425 .strategy = &sysctl_string,
426 }, 395 },
427 { 396 {
428 .ctl_name = CTL_UNNUMBERED,
429 .procname = "core_pipe_limit", 397 .procname = "core_pipe_limit",
430 .data = &core_pipe_limit, 398 .data = &core_pipe_limit,
431 .maxlen = sizeof(unsigned int), 399 .maxlen = sizeof(unsigned int),
432 .mode = 0644, 400 .mode = 0644,
433 .proc_handler = &proc_dointvec, 401 .proc_handler = proc_dointvec,
434 }, 402 },
435#ifdef CONFIG_PROC_SYSCTL 403#ifdef CONFIG_PROC_SYSCTL
436 { 404 {
437 .procname = "tainted", 405 .procname = "tainted",
438 .maxlen = sizeof(long), 406 .maxlen = sizeof(long),
439 .mode = 0644, 407 .mode = 0644,
440 .proc_handler = &proc_taint, 408 .proc_handler = proc_taint,
441 }, 409 },
442#endif 410#endif
443#ifdef CONFIG_LATENCYTOP 411#ifdef CONFIG_LATENCYTOP
@@ -446,181 +414,160 @@ static struct ctl_table kern_table[] = {
446 .data = &latencytop_enabled, 414 .data = &latencytop_enabled,
447 .maxlen = sizeof(int), 415 .maxlen = sizeof(int),
448 .mode = 0644, 416 .mode = 0644,
449 .proc_handler = &proc_dointvec, 417 .proc_handler = proc_dointvec,
450 }, 418 },
451#endif 419#endif
452#ifdef CONFIG_BLK_DEV_INITRD 420#ifdef CONFIG_BLK_DEV_INITRD
453 { 421 {
454 .ctl_name = KERN_REALROOTDEV,
455 .procname = "real-root-dev", 422 .procname = "real-root-dev",
456 .data = &real_root_dev, 423 .data = &real_root_dev,
457 .maxlen = sizeof(int), 424 .maxlen = sizeof(int),
458 .mode = 0644, 425 .mode = 0644,
459 .proc_handler = &proc_dointvec, 426 .proc_handler = proc_dointvec,
460 }, 427 },
461#endif 428#endif
462 { 429 {
463 .ctl_name = CTL_UNNUMBERED,
464 .procname = "print-fatal-signals", 430 .procname = "print-fatal-signals",
465 .data = &print_fatal_signals, 431 .data = &print_fatal_signals,
466 .maxlen = sizeof(int), 432 .maxlen = sizeof(int),
467 .mode = 0644, 433 .mode = 0644,
468 .proc_handler = &proc_dointvec, 434 .proc_handler = proc_dointvec,
469 }, 435 },
470#ifdef CONFIG_SPARC 436#ifdef CONFIG_SPARC
471 { 437 {
472 .ctl_name = KERN_SPARC_REBOOT,
473 .procname = "reboot-cmd", 438 .procname = "reboot-cmd",
474 .data = reboot_command, 439 .data = reboot_command,
475 .maxlen = 256, 440 .maxlen = 256,
476 .mode = 0644, 441 .mode = 0644,
477 .proc_handler = &proc_dostring, 442 .proc_handler = proc_dostring,
478 .strategy = &sysctl_string,
479 }, 443 },
480 { 444 {
481 .ctl_name = KERN_SPARC_STOP_A,
482 .procname = "stop-a", 445 .procname = "stop-a",
483 .data = &stop_a_enabled, 446 .data = &stop_a_enabled,
484 .maxlen = sizeof (int), 447 .maxlen = sizeof (int),
485 .mode = 0644, 448 .mode = 0644,
486 .proc_handler = &proc_dointvec, 449 .proc_handler = proc_dointvec,
487 }, 450 },
488 { 451 {
489 .ctl_name = KERN_SPARC_SCONS_PWROFF,
490 .procname = "scons-poweroff", 452 .procname = "scons-poweroff",
491 .data = &scons_pwroff, 453 .data = &scons_pwroff,
492 .maxlen = sizeof (int), 454 .maxlen = sizeof (int),
493 .mode = 0644, 455 .mode = 0644,
494 .proc_handler = &proc_dointvec, 456 .proc_handler = proc_dointvec,
495 }, 457 },
496#endif 458#endif
497#ifdef CONFIG_SPARC64 459#ifdef CONFIG_SPARC64
498 { 460 {
499 .ctl_name = CTL_UNNUMBERED,
500 .procname = "tsb-ratio", 461 .procname = "tsb-ratio",
501 .data = &sysctl_tsb_ratio, 462 .data = &sysctl_tsb_ratio,
502 .maxlen = sizeof (int), 463 .maxlen = sizeof (int),
503 .mode = 0644, 464 .mode = 0644,
504 .proc_handler = &proc_dointvec, 465 .proc_handler = proc_dointvec,
505 }, 466 },
506#endif 467#endif
507#ifdef __hppa__ 468#ifdef __hppa__
508 { 469 {
509 .ctl_name = KERN_HPPA_PWRSW,
510 .procname = "soft-power", 470 .procname = "soft-power",
511 .data = &pwrsw_enabled, 471 .data = &pwrsw_enabled,
512 .maxlen = sizeof (int), 472 .maxlen = sizeof (int),
513 .mode = 0644, 473 .mode = 0644,
514 .proc_handler = &proc_dointvec, 474 .proc_handler = proc_dointvec,
515 }, 475 },
516 { 476 {
517 .ctl_name = KERN_HPPA_UNALIGNED,
518 .procname = "unaligned-trap", 477 .procname = "unaligned-trap",
519 .data = &unaligned_enabled, 478 .data = &unaligned_enabled,
520 .maxlen = sizeof (int), 479 .maxlen = sizeof (int),
521 .mode = 0644, 480 .mode = 0644,
522 .proc_handler = &proc_dointvec, 481 .proc_handler = proc_dointvec,
523 }, 482 },
524#endif 483#endif
525 { 484 {
526 .ctl_name = KERN_CTLALTDEL,
527 .procname = "ctrl-alt-del", 485 .procname = "ctrl-alt-del",
528 .data = &C_A_D, 486 .data = &C_A_D,
529 .maxlen = sizeof(int), 487 .maxlen = sizeof(int),
530 .mode = 0644, 488 .mode = 0644,
531 .proc_handler = &proc_dointvec, 489 .proc_handler = proc_dointvec,
532 }, 490 },
533#ifdef CONFIG_FUNCTION_TRACER 491#ifdef CONFIG_FUNCTION_TRACER
534 { 492 {
535 .ctl_name = CTL_UNNUMBERED,
536 .procname = "ftrace_enabled", 493 .procname = "ftrace_enabled",
537 .data = &ftrace_enabled, 494 .data = &ftrace_enabled,
538 .maxlen = sizeof(int), 495 .maxlen = sizeof(int),
539 .mode = 0644, 496 .mode = 0644,
540 .proc_handler = &ftrace_enable_sysctl, 497 .proc_handler = ftrace_enable_sysctl,
541 }, 498 },
542#endif 499#endif
543#ifdef CONFIG_STACK_TRACER 500#ifdef CONFIG_STACK_TRACER
544 { 501 {
545 .ctl_name = CTL_UNNUMBERED,
546 .procname = "stack_tracer_enabled", 502 .procname = "stack_tracer_enabled",
547 .data = &stack_tracer_enabled, 503 .data = &stack_tracer_enabled,
548 .maxlen = sizeof(int), 504 .maxlen = sizeof(int),
549 .mode = 0644, 505 .mode = 0644,
550 .proc_handler = &stack_trace_sysctl, 506 .proc_handler = stack_trace_sysctl,
551 }, 507 },
552#endif 508#endif
553#ifdef CONFIG_TRACING 509#ifdef CONFIG_TRACING
554 { 510 {
555 .ctl_name = CTL_UNNUMBERED,
556 .procname = "ftrace_dump_on_oops", 511 .procname = "ftrace_dump_on_oops",
557 .data = &ftrace_dump_on_oops, 512 .data = &ftrace_dump_on_oops,
558 .maxlen = sizeof(int), 513 .maxlen = sizeof(int),
559 .mode = 0644, 514 .mode = 0644,
560 .proc_handler = &proc_dointvec, 515 .proc_handler = proc_dointvec,
561 }, 516 },
562#endif 517#endif
563#ifdef CONFIG_MODULES 518#ifdef CONFIG_MODULES
564 { 519 {
565 .ctl_name = KERN_MODPROBE,
566 .procname = "modprobe", 520 .procname = "modprobe",
567 .data = &modprobe_path, 521 .data = &modprobe_path,
568 .maxlen = KMOD_PATH_LEN, 522 .maxlen = KMOD_PATH_LEN,
569 .mode = 0644, 523 .mode = 0644,
570 .proc_handler = &proc_dostring, 524 .proc_handler = proc_dostring,
571 .strategy = &sysctl_string,
572 }, 525 },
573 { 526 {
574 .ctl_name = CTL_UNNUMBERED,
575 .procname = "modules_disabled", 527 .procname = "modules_disabled",
576 .data = &modules_disabled, 528 .data = &modules_disabled,
577 .maxlen = sizeof(int), 529 .maxlen = sizeof(int),
578 .mode = 0644, 530 .mode = 0644,
579 /* only handle a transition from default "0" to "1" */ 531 /* only handle a transition from default "0" to "1" */
580 .proc_handler = &proc_dointvec_minmax, 532 .proc_handler = proc_dointvec_minmax,
581 .extra1 = &one, 533 .extra1 = &one,
582 .extra2 = &one, 534 .extra2 = &one,
583 }, 535 },
584#endif 536#endif
585#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 537#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
586 { 538 {
587 .ctl_name = KERN_HOTPLUG,
588 .procname = "hotplug", 539 .procname = "hotplug",
589 .data = &uevent_helper, 540 .data = &uevent_helper,
590 .maxlen = UEVENT_HELPER_PATH_LEN, 541 .maxlen = UEVENT_HELPER_PATH_LEN,
591 .mode = 0644, 542 .mode = 0644,
592 .proc_handler = &proc_dostring, 543 .proc_handler = proc_dostring,
593 .strategy = &sysctl_string,
594 }, 544 },
595#endif 545#endif
596#ifdef CONFIG_CHR_DEV_SG 546#ifdef CONFIG_CHR_DEV_SG
597 { 547 {
598 .ctl_name = KERN_SG_BIG_BUFF,
599 .procname = "sg-big-buff", 548 .procname = "sg-big-buff",
600 .data = &sg_big_buff, 549 .data = &sg_big_buff,
601 .maxlen = sizeof (int), 550 .maxlen = sizeof (int),
602 .mode = 0444, 551 .mode = 0444,
603 .proc_handler = &proc_dointvec, 552 .proc_handler = proc_dointvec,
604 }, 553 },
605#endif 554#endif
606#ifdef CONFIG_BSD_PROCESS_ACCT 555#ifdef CONFIG_BSD_PROCESS_ACCT
607 { 556 {
608 .ctl_name = KERN_ACCT,
609 .procname = "acct", 557 .procname = "acct",
610 .data = &acct_parm, 558 .data = &acct_parm,
611 .maxlen = 3*sizeof(int), 559 .maxlen = 3*sizeof(int),
612 .mode = 0644, 560 .mode = 0644,
613 .proc_handler = &proc_dointvec, 561 .proc_handler = proc_dointvec,
614 }, 562 },
615#endif 563#endif
616#ifdef CONFIG_MAGIC_SYSRQ 564#ifdef CONFIG_MAGIC_SYSRQ
617 { 565 {
618 .ctl_name = KERN_SYSRQ,
619 .procname = "sysrq", 566 .procname = "sysrq",
620 .data = &__sysrq_enabled, 567 .data = &__sysrq_enabled,
621 .maxlen = sizeof (int), 568 .maxlen = sizeof (int),
622 .mode = 0644, 569 .mode = 0644,
623 .proc_handler = &proc_dointvec, 570 .proc_handler = proc_dointvec,
624 }, 571 },
625#endif 572#endif
626#ifdef CONFIG_PROC_SYSCTL 573#ifdef CONFIG_PROC_SYSCTL
@@ -629,215 +576,188 @@ static struct ctl_table kern_table[] = {
629 .data = NULL, 576 .data = NULL,
630 .maxlen = sizeof (int), 577 .maxlen = sizeof (int),
631 .mode = 0600, 578 .mode = 0600,
632 .proc_handler = &proc_do_cad_pid, 579 .proc_handler = proc_do_cad_pid,
633 }, 580 },
634#endif 581#endif
635 { 582 {
636 .ctl_name = KERN_MAX_THREADS,
637 .procname = "threads-max", 583 .procname = "threads-max",
638 .data = &max_threads, 584 .data = &max_threads,
639 .maxlen = sizeof(int), 585 .maxlen = sizeof(int),
640 .mode = 0644, 586 .mode = 0644,
641 .proc_handler = &proc_dointvec, 587 .proc_handler = proc_dointvec,
642 }, 588 },
643 { 589 {
644 .ctl_name = KERN_RANDOM,
645 .procname = "random", 590 .procname = "random",
646 .mode = 0555, 591 .mode = 0555,
647 .child = random_table, 592 .child = random_table,
648 }, 593 },
649 { 594 {
650 .ctl_name = KERN_OVERFLOWUID,
651 .procname = "overflowuid", 595 .procname = "overflowuid",
652 .data = &overflowuid, 596 .data = &overflowuid,
653 .maxlen = sizeof(int), 597 .maxlen = sizeof(int),
654 .mode = 0644, 598 .mode = 0644,
655 .proc_handler = &proc_dointvec_minmax, 599 .proc_handler = proc_dointvec_minmax,
656 .strategy = &sysctl_intvec,
657 .extra1 = &minolduid, 600 .extra1 = &minolduid,
658 .extra2 = &maxolduid, 601 .extra2 = &maxolduid,
659 }, 602 },
660 { 603 {
661 .ctl_name = KERN_OVERFLOWGID,
662 .procname = "overflowgid", 604 .procname = "overflowgid",
663 .data = &overflowgid, 605 .data = &overflowgid,
664 .maxlen = sizeof(int), 606 .maxlen = sizeof(int),
665 .mode = 0644, 607 .mode = 0644,
666 .proc_handler = &proc_dointvec_minmax, 608 .proc_handler = proc_dointvec_minmax,
667 .strategy = &sysctl_intvec,
668 .extra1 = &minolduid, 609 .extra1 = &minolduid,
669 .extra2 = &maxolduid, 610 .extra2 = &maxolduid,
670 }, 611 },
671#ifdef CONFIG_S390 612#ifdef CONFIG_S390
672#ifdef CONFIG_MATHEMU 613#ifdef CONFIG_MATHEMU
673 { 614 {
674 .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
675 .procname = "ieee_emulation_warnings", 615 .procname = "ieee_emulation_warnings",
676 .data = &sysctl_ieee_emulation_warnings, 616 .data = &sysctl_ieee_emulation_warnings,
677 .maxlen = sizeof(int), 617 .maxlen = sizeof(int),
678 .mode = 0644, 618 .mode = 0644,
679 .proc_handler = &proc_dointvec, 619 .proc_handler = proc_dointvec,
680 }, 620 },
681#endif 621#endif
682 { 622 {
683 .ctl_name = KERN_S390_USER_DEBUG_LOGGING,
684 .procname = "userprocess_debug", 623 .procname = "userprocess_debug",
685 .data = &sysctl_userprocess_debug, 624 .data = &sysctl_userprocess_debug,
686 .maxlen = sizeof(int), 625 .maxlen = sizeof(int),
687 .mode = 0644, 626 .mode = 0644,
688 .proc_handler = &proc_dointvec, 627 .proc_handler = proc_dointvec,
689 }, 628 },
690#endif 629#endif
691 { 630 {
692 .ctl_name = KERN_PIDMAX,
693 .procname = "pid_max", 631 .procname = "pid_max",
694 .data = &pid_max, 632 .data = &pid_max,
695 .maxlen = sizeof (int), 633 .maxlen = sizeof (int),
696 .mode = 0644, 634 .mode = 0644,
697 .proc_handler = &proc_dointvec_minmax, 635 .proc_handler = proc_dointvec_minmax,
698 .strategy = sysctl_intvec,
699 .extra1 = &pid_max_min, 636 .extra1 = &pid_max_min,
700 .extra2 = &pid_max_max, 637 .extra2 = &pid_max_max,
701 }, 638 },
702 { 639 {
703 .ctl_name = KERN_PANIC_ON_OOPS,
704 .procname = "panic_on_oops", 640 .procname = "panic_on_oops",
705 .data = &panic_on_oops, 641 .data = &panic_on_oops,
706 .maxlen = sizeof(int), 642 .maxlen = sizeof(int),
707 .mode = 0644, 643 .mode = 0644,
708 .proc_handler = &proc_dointvec, 644 .proc_handler = proc_dointvec,
709 }, 645 },
710#if defined CONFIG_PRINTK 646#if defined CONFIG_PRINTK
711 { 647 {
712 .ctl_name = KERN_PRINTK,
713 .procname = "printk", 648 .procname = "printk",
714 .data = &console_loglevel, 649 .data = &console_loglevel,
715 .maxlen = 4*sizeof(int), 650 .maxlen = 4*sizeof(int),
716 .mode = 0644, 651 .mode = 0644,
717 .proc_handler = &proc_dointvec, 652 .proc_handler = proc_dointvec,
718 }, 653 },
719 { 654 {
720 .ctl_name = KERN_PRINTK_RATELIMIT,
721 .procname = "printk_ratelimit", 655 .procname = "printk_ratelimit",
722 .data = &printk_ratelimit_state.interval, 656 .data = &printk_ratelimit_state.interval,
723 .maxlen = sizeof(int), 657 .maxlen = sizeof(int),
724 .mode = 0644, 658 .mode = 0644,
725 .proc_handler = &proc_dointvec_jiffies, 659 .proc_handler = proc_dointvec_jiffies,
726 .strategy = &sysctl_jiffies,
727 }, 660 },
728 { 661 {
729 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
730 .procname = "printk_ratelimit_burst", 662 .procname = "printk_ratelimit_burst",
731 .data = &printk_ratelimit_state.burst, 663 .data = &printk_ratelimit_state.burst,
732 .maxlen = sizeof(int), 664 .maxlen = sizeof(int),
733 .mode = 0644, 665 .mode = 0644,
734 .proc_handler = &proc_dointvec, 666 .proc_handler = proc_dointvec,
735 }, 667 },
736 { 668 {
737 .ctl_name = CTL_UNNUMBERED,
738 .procname = "printk_delay", 669 .procname = "printk_delay",
739 .data = &printk_delay_msec, 670 .data = &printk_delay_msec,
740 .maxlen = sizeof(int), 671 .maxlen = sizeof(int),
741 .mode = 0644, 672 .mode = 0644,
742 .proc_handler = &proc_dointvec_minmax, 673 .proc_handler = proc_dointvec_minmax,
743 .strategy = &sysctl_intvec,
744 .extra1 = &zero, 674 .extra1 = &zero,
745 .extra2 = &ten_thousand, 675 .extra2 = &ten_thousand,
746 }, 676 },
747#endif 677#endif
748 { 678 {
749 .ctl_name = KERN_NGROUPS_MAX,
750 .procname = "ngroups_max", 679 .procname = "ngroups_max",
751 .data = &ngroups_max, 680 .data = &ngroups_max,
752 .maxlen = sizeof (int), 681 .maxlen = sizeof (int),
753 .mode = 0444, 682 .mode = 0444,
754 .proc_handler = &proc_dointvec, 683 .proc_handler = proc_dointvec,
755 }, 684 },
756#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 685#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
757 { 686 {
758 .ctl_name = KERN_UNKNOWN_NMI_PANIC,
759 .procname = "unknown_nmi_panic", 687 .procname = "unknown_nmi_panic",
760 .data = &unknown_nmi_panic, 688 .data = &unknown_nmi_panic,
761 .maxlen = sizeof (int), 689 .maxlen = sizeof (int),
762 .mode = 0644, 690 .mode = 0644,
763 .proc_handler = &proc_dointvec, 691 .proc_handler = proc_dointvec,
764 }, 692 },
765 { 693 {
766 .procname = "nmi_watchdog", 694 .procname = "nmi_watchdog",
767 .data = &nmi_watchdog_enabled, 695 .data = &nmi_watchdog_enabled,
768 .maxlen = sizeof (int), 696 .maxlen = sizeof (int),
769 .mode = 0644, 697 .mode = 0644,
770 .proc_handler = &proc_nmi_enabled, 698 .proc_handler = proc_nmi_enabled,
771 }, 699 },
772#endif 700#endif
773#if defined(CONFIG_X86) 701#if defined(CONFIG_X86)
774 { 702 {
775 .ctl_name = KERN_PANIC_ON_NMI,
776 .procname = "panic_on_unrecovered_nmi", 703 .procname = "panic_on_unrecovered_nmi",
777 .data = &panic_on_unrecovered_nmi, 704 .data = &panic_on_unrecovered_nmi,
778 .maxlen = sizeof(int), 705 .maxlen = sizeof(int),
779 .mode = 0644, 706 .mode = 0644,
780 .proc_handler = &proc_dointvec, 707 .proc_handler = proc_dointvec,
781 }, 708 },
782 { 709 {
783 .ctl_name = CTL_UNNUMBERED,
784 .procname = "panic_on_io_nmi", 710 .procname = "panic_on_io_nmi",
785 .data = &panic_on_io_nmi, 711 .data = &panic_on_io_nmi,
786 .maxlen = sizeof(int), 712 .maxlen = sizeof(int),
787 .mode = 0644, 713 .mode = 0644,
788 .proc_handler = &proc_dointvec, 714 .proc_handler = proc_dointvec,
789 }, 715 },
790 { 716 {
791 .ctl_name = KERN_BOOTLOADER_TYPE,
792 .procname = "bootloader_type", 717 .procname = "bootloader_type",
793 .data = &bootloader_type, 718 .data = &bootloader_type,
794 .maxlen = sizeof (int), 719 .maxlen = sizeof (int),
795 .mode = 0444, 720 .mode = 0444,
796 .proc_handler = &proc_dointvec, 721 .proc_handler = proc_dointvec,
797 }, 722 },
798 { 723 {
799 .ctl_name = CTL_UNNUMBERED,
800 .procname = "bootloader_version", 724 .procname = "bootloader_version",
801 .data = &bootloader_version, 725 .data = &bootloader_version,
802 .maxlen = sizeof (int), 726 .maxlen = sizeof (int),
803 .mode = 0444, 727 .mode = 0444,
804 .proc_handler = &proc_dointvec, 728 .proc_handler = proc_dointvec,
805 }, 729 },
806 { 730 {
807 .ctl_name = CTL_UNNUMBERED,
808 .procname = "kstack_depth_to_print", 731 .procname = "kstack_depth_to_print",
809 .data = &kstack_depth_to_print, 732 .data = &kstack_depth_to_print,
810 .maxlen = sizeof(int), 733 .maxlen = sizeof(int),
811 .mode = 0644, 734 .mode = 0644,
812 .proc_handler = &proc_dointvec, 735 .proc_handler = proc_dointvec,
813 }, 736 },
814 { 737 {
815 .ctl_name = CTL_UNNUMBERED,
816 .procname = "io_delay_type", 738 .procname = "io_delay_type",
817 .data = &io_delay_type, 739 .data = &io_delay_type,
818 .maxlen = sizeof(int), 740 .maxlen = sizeof(int),
819 .mode = 0644, 741 .mode = 0644,
820 .proc_handler = &proc_dointvec, 742 .proc_handler = proc_dointvec,
821 }, 743 },
822#endif 744#endif
823#if defined(CONFIG_MMU) 745#if defined(CONFIG_MMU)
824 { 746 {
825 .ctl_name = KERN_RANDOMIZE,
826 .procname = "randomize_va_space", 747 .procname = "randomize_va_space",
827 .data = &randomize_va_space, 748 .data = &randomize_va_space,
828 .maxlen = sizeof(int), 749 .maxlen = sizeof(int),
829 .mode = 0644, 750 .mode = 0644,
830 .proc_handler = &proc_dointvec, 751 .proc_handler = proc_dointvec,
831 }, 752 },
832#endif 753#endif
833#if defined(CONFIG_S390) && defined(CONFIG_SMP) 754#if defined(CONFIG_S390) && defined(CONFIG_SMP)
834 { 755 {
835 .ctl_name = KERN_SPIN_RETRY,
836 .procname = "spin_retry", 756 .procname = "spin_retry",
837 .data = &spin_retry, 757 .data = &spin_retry,
838 .maxlen = sizeof (int), 758 .maxlen = sizeof (int),
839 .mode = 0644, 759 .mode = 0644,
840 .proc_handler = &proc_dointvec, 760 .proc_handler = proc_dointvec,
841 }, 761 },
842#endif 762#endif
843#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) 763#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
@@ -846,123 +766,104 @@ static struct ctl_table kern_table[] = {
846 .data = &acpi_realmode_flags, 766 .data = &acpi_realmode_flags,
847 .maxlen = sizeof (unsigned long), 767 .maxlen = sizeof (unsigned long),
848 .mode = 0644, 768 .mode = 0644,
849 .proc_handler = &proc_doulongvec_minmax, 769 .proc_handler = proc_doulongvec_minmax,
850 }, 770 },
851#endif 771#endif
852#ifdef CONFIG_IA64 772#ifdef CONFIG_IA64
853 { 773 {
854 .ctl_name = KERN_IA64_UNALIGNED,
855 .procname = "ignore-unaligned-usertrap", 774 .procname = "ignore-unaligned-usertrap",
856 .data = &no_unaligned_warning, 775 .data = &no_unaligned_warning,
857 .maxlen = sizeof (int), 776 .maxlen = sizeof (int),
858 .mode = 0644, 777 .mode = 0644,
859 .proc_handler = &proc_dointvec, 778 .proc_handler = proc_dointvec,
860 }, 779 },
861 { 780 {
862 .ctl_name = CTL_UNNUMBERED,
863 .procname = "unaligned-dump-stack", 781 .procname = "unaligned-dump-stack",
864 .data = &unaligned_dump_stack, 782 .data = &unaligned_dump_stack,
865 .maxlen = sizeof (int), 783 .maxlen = sizeof (int),
866 .mode = 0644, 784 .mode = 0644,
867 .proc_handler = &proc_dointvec, 785 .proc_handler = proc_dointvec,
868 }, 786 },
869#endif 787#endif
870#ifdef CONFIG_DETECT_SOFTLOCKUP 788#ifdef CONFIG_DETECT_SOFTLOCKUP
871 { 789 {
872 .ctl_name = CTL_UNNUMBERED,
873 .procname = "softlockup_panic", 790 .procname = "softlockup_panic",
874 .data = &softlockup_panic, 791 .data = &softlockup_panic,
875 .maxlen = sizeof(int), 792 .maxlen = sizeof(int),
876 .mode = 0644, 793 .mode = 0644,
877 .proc_handler = &proc_dointvec_minmax, 794 .proc_handler = proc_dointvec_minmax,
878 .strategy = &sysctl_intvec,
879 .extra1 = &zero, 795 .extra1 = &zero,
880 .extra2 = &one, 796 .extra2 = &one,
881 }, 797 },
882 { 798 {
883 .ctl_name = CTL_UNNUMBERED,
884 .procname = "softlockup_thresh", 799 .procname = "softlockup_thresh",
885 .data = &softlockup_thresh, 800 .data = &softlockup_thresh,
886 .maxlen = sizeof(int), 801 .maxlen = sizeof(int),
887 .mode = 0644, 802 .mode = 0644,
888 .proc_handler = &proc_dosoftlockup_thresh, 803 .proc_handler = proc_dosoftlockup_thresh,
889 .strategy = &sysctl_intvec,
890 .extra1 = &neg_one, 804 .extra1 = &neg_one,
891 .extra2 = &sixty, 805 .extra2 = &sixty,
892 }, 806 },
893#endif 807#endif
894#ifdef CONFIG_DETECT_HUNG_TASK 808#ifdef CONFIG_DETECT_HUNG_TASK
895 { 809 {
896 .ctl_name = CTL_UNNUMBERED,
897 .procname = "hung_task_panic", 810 .procname = "hung_task_panic",
898 .data = &sysctl_hung_task_panic, 811 .data = &sysctl_hung_task_panic,
899 .maxlen = sizeof(int), 812 .maxlen = sizeof(int),
900 .mode = 0644, 813 .mode = 0644,
901 .proc_handler = &proc_dointvec_minmax, 814 .proc_handler = proc_dointvec_minmax,
902 .strategy = &sysctl_intvec,
903 .extra1 = &zero, 815 .extra1 = &zero,
904 .extra2 = &one, 816 .extra2 = &one,
905 }, 817 },
906 { 818 {
907 .ctl_name = CTL_UNNUMBERED,
908 .procname = "hung_task_check_count", 819 .procname = "hung_task_check_count",
909 .data = &sysctl_hung_task_check_count, 820 .data = &sysctl_hung_task_check_count,
910 .maxlen = sizeof(unsigned long), 821 .maxlen = sizeof(unsigned long),
911 .mode = 0644, 822 .mode = 0644,
912 .proc_handler = &proc_doulongvec_minmax, 823 .proc_handler = proc_doulongvec_minmax,
913 .strategy = &sysctl_intvec,
914 }, 824 },
915 { 825 {
916 .ctl_name = CTL_UNNUMBERED,
917 .procname = "hung_task_timeout_secs", 826 .procname = "hung_task_timeout_secs",
918 .data = &sysctl_hung_task_timeout_secs, 827 .data = &sysctl_hung_task_timeout_secs,
919 .maxlen = sizeof(unsigned long), 828 .maxlen = sizeof(unsigned long),
920 .mode = 0644, 829 .mode = 0644,
921 .proc_handler = &proc_dohung_task_timeout_secs, 830 .proc_handler = proc_dohung_task_timeout_secs,
922 .strategy = &sysctl_intvec,
923 }, 831 },
924 { 832 {
925 .ctl_name = CTL_UNNUMBERED,
926 .procname = "hung_task_warnings", 833 .procname = "hung_task_warnings",
927 .data = &sysctl_hung_task_warnings, 834 .data = &sysctl_hung_task_warnings,
928 .maxlen = sizeof(unsigned long), 835 .maxlen = sizeof(unsigned long),
929 .mode = 0644, 836 .mode = 0644,
930 .proc_handler = &proc_doulongvec_minmax, 837 .proc_handler = proc_doulongvec_minmax,
931 .strategy = &sysctl_intvec,
932 }, 838 },
933#endif 839#endif
934#ifdef CONFIG_COMPAT 840#ifdef CONFIG_COMPAT
935 { 841 {
936 .ctl_name = KERN_COMPAT_LOG,
937 .procname = "compat-log", 842 .procname = "compat-log",
938 .data = &compat_log, 843 .data = &compat_log,
939 .maxlen = sizeof (int), 844 .maxlen = sizeof (int),
940 .mode = 0644, 845 .mode = 0644,
941 .proc_handler = &proc_dointvec, 846 .proc_handler = proc_dointvec,
942 }, 847 },
943#endif 848#endif
944#ifdef CONFIG_RT_MUTEXES 849#ifdef CONFIG_RT_MUTEXES
945 { 850 {
946 .ctl_name = KERN_MAX_LOCK_DEPTH,
947 .procname = "max_lock_depth", 851 .procname = "max_lock_depth",
948 .data = &max_lock_depth, 852 .data = &max_lock_depth,
949 .maxlen = sizeof(int), 853 .maxlen = sizeof(int),
950 .mode = 0644, 854 .mode = 0644,
951 .proc_handler = &proc_dointvec, 855 .proc_handler = proc_dointvec,
952 }, 856 },
953#endif 857#endif
954 { 858 {
955 .ctl_name = CTL_UNNUMBERED,
956 .procname = "poweroff_cmd", 859 .procname = "poweroff_cmd",
957 .data = &poweroff_cmd, 860 .data = &poweroff_cmd,
958 .maxlen = POWEROFF_CMD_PATH_LEN, 861 .maxlen = POWEROFF_CMD_PATH_LEN,
959 .mode = 0644, 862 .mode = 0644,
960 .proc_handler = &proc_dostring, 863 .proc_handler = proc_dostring,
961 .strategy = &sysctl_string,
962 }, 864 },
963#ifdef CONFIG_KEYS 865#ifdef CONFIG_KEYS
964 { 866 {
965 .ctl_name = CTL_UNNUMBERED,
966 .procname = "keys", 867 .procname = "keys",
967 .mode = 0555, 868 .mode = 0555,
968 .child = key_sysctls, 869 .child = key_sysctls,
@@ -970,17 +871,15 @@ static struct ctl_table kern_table[] = {
970#endif 871#endif
971#ifdef CONFIG_RCU_TORTURE_TEST 872#ifdef CONFIG_RCU_TORTURE_TEST
972 { 873 {
973 .ctl_name = CTL_UNNUMBERED,
974 .procname = "rcutorture_runnable", 874 .procname = "rcutorture_runnable",
975 .data = &rcutorture_runnable, 875 .data = &rcutorture_runnable,
976 .maxlen = sizeof(int), 876 .maxlen = sizeof(int),
977 .mode = 0644, 877 .mode = 0644,
978 .proc_handler = &proc_dointvec, 878 .proc_handler = proc_dointvec,
979 }, 879 },
980#endif 880#endif
981#ifdef CONFIG_SLOW_WORK 881#ifdef CONFIG_SLOW_WORK
982 { 882 {
983 .ctl_name = CTL_UNNUMBERED,
984 .procname = "slow-work", 883 .procname = "slow-work",
985 .mode = 0555, 884 .mode = 0555,
986 .child = slow_work_sysctls, 885 .child = slow_work_sysctls,
@@ -988,146 +887,127 @@ static struct ctl_table kern_table[] = {
988#endif 887#endif
989#ifdef CONFIG_PERF_EVENTS 888#ifdef CONFIG_PERF_EVENTS
990 { 889 {
991 .ctl_name = CTL_UNNUMBERED,
992 .procname = "perf_event_paranoid", 890 .procname = "perf_event_paranoid",
993 .data = &sysctl_perf_event_paranoid, 891 .data = &sysctl_perf_event_paranoid,
994 .maxlen = sizeof(sysctl_perf_event_paranoid), 892 .maxlen = sizeof(sysctl_perf_event_paranoid),
995 .mode = 0644, 893 .mode = 0644,
996 .proc_handler = &proc_dointvec, 894 .proc_handler = proc_dointvec,
997 }, 895 },
998 { 896 {
999 .ctl_name = CTL_UNNUMBERED,
1000 .procname = "perf_event_mlock_kb", 897 .procname = "perf_event_mlock_kb",
1001 .data = &sysctl_perf_event_mlock, 898 .data = &sysctl_perf_event_mlock,
1002 .maxlen = sizeof(sysctl_perf_event_mlock), 899 .maxlen = sizeof(sysctl_perf_event_mlock),
1003 .mode = 0644, 900 .mode = 0644,
1004 .proc_handler = &proc_dointvec, 901 .proc_handler = proc_dointvec,
1005 }, 902 },
1006 { 903 {
1007 .ctl_name = CTL_UNNUMBERED,
1008 .procname = "perf_event_max_sample_rate", 904 .procname = "perf_event_max_sample_rate",
1009 .data = &sysctl_perf_event_sample_rate, 905 .data = &sysctl_perf_event_sample_rate,
1010 .maxlen = sizeof(sysctl_perf_event_sample_rate), 906 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1011 .mode = 0644, 907 .mode = 0644,
1012 .proc_handler = &proc_dointvec, 908 .proc_handler = proc_dointvec,
1013 }, 909 },
1014#endif 910#endif
1015#ifdef CONFIG_KMEMCHECK 911#ifdef CONFIG_KMEMCHECK
1016 { 912 {
1017 .ctl_name = CTL_UNNUMBERED,
1018 .procname = "kmemcheck", 913 .procname = "kmemcheck",
1019 .data = &kmemcheck_enabled, 914 .data = &kmemcheck_enabled,
1020 .maxlen = sizeof(int), 915 .maxlen = sizeof(int),
1021 .mode = 0644, 916 .mode = 0644,
1022 .proc_handler = &proc_dointvec, 917 .proc_handler = proc_dointvec,
1023 }, 918 },
1024#endif 919#endif
1025#ifdef CONFIG_BLOCK 920#ifdef CONFIG_BLOCK
1026 { 921 {
1027 .ctl_name = CTL_UNNUMBERED,
1028 .procname = "blk_iopoll", 922 .procname = "blk_iopoll",
1029 .data = &blk_iopoll_enabled, 923 .data = &blk_iopoll_enabled,
1030 .maxlen = sizeof(int), 924 .maxlen = sizeof(int),
1031 .mode = 0644, 925 .mode = 0644,
1032 .proc_handler = &proc_dointvec, 926 .proc_handler = proc_dointvec,
1033 }, 927 },
1034#endif 928#endif
1035/* 929/*
1036 * NOTE: do not add new entries to this table unless you have read 930 * NOTE: do not add new entries to this table unless you have read
1037 * Documentation/sysctl/ctl_unnumbered.txt 931 * Documentation/sysctl/ctl_unnumbered.txt
1038 */ 932 */
1039 { .ctl_name = 0 } 933 { }
1040}; 934};
1041 935
1042static struct ctl_table vm_table[] = { 936static struct ctl_table vm_table[] = {
1043 { 937 {
1044 .ctl_name = VM_OVERCOMMIT_MEMORY,
1045 .procname = "overcommit_memory", 938 .procname = "overcommit_memory",
1046 .data = &sysctl_overcommit_memory, 939 .data = &sysctl_overcommit_memory,
1047 .maxlen = sizeof(sysctl_overcommit_memory), 940 .maxlen = sizeof(sysctl_overcommit_memory),
1048 .mode = 0644, 941 .mode = 0644,
1049 .proc_handler = &proc_dointvec, 942 .proc_handler = proc_dointvec,
1050 }, 943 },
1051 { 944 {
1052 .ctl_name = VM_PANIC_ON_OOM,
1053 .procname = "panic_on_oom", 945 .procname = "panic_on_oom",
1054 .data = &sysctl_panic_on_oom, 946 .data = &sysctl_panic_on_oom,
1055 .maxlen = sizeof(sysctl_panic_on_oom), 947 .maxlen = sizeof(sysctl_panic_on_oom),
1056 .mode = 0644, 948 .mode = 0644,
1057 .proc_handler = &proc_dointvec, 949 .proc_handler = proc_dointvec,
1058 }, 950 },
1059 { 951 {
1060 .ctl_name = CTL_UNNUMBERED,
1061 .procname = "oom_kill_allocating_task", 952 .procname = "oom_kill_allocating_task",
1062 .data = &sysctl_oom_kill_allocating_task, 953 .data = &sysctl_oom_kill_allocating_task,
1063 .maxlen = sizeof(sysctl_oom_kill_allocating_task), 954 .maxlen = sizeof(sysctl_oom_kill_allocating_task),
1064 .mode = 0644, 955 .mode = 0644,
1065 .proc_handler = &proc_dointvec, 956 .proc_handler = proc_dointvec,
1066 }, 957 },
1067 { 958 {
1068 .ctl_name = CTL_UNNUMBERED,
1069 .procname = "oom_dump_tasks", 959 .procname = "oom_dump_tasks",
1070 .data = &sysctl_oom_dump_tasks, 960 .data = &sysctl_oom_dump_tasks,
1071 .maxlen = sizeof(sysctl_oom_dump_tasks), 961 .maxlen = sizeof(sysctl_oom_dump_tasks),
1072 .mode = 0644, 962 .mode = 0644,
1073 .proc_handler = &proc_dointvec, 963 .proc_handler = proc_dointvec,
1074 }, 964 },
1075 { 965 {
1076 .ctl_name = VM_OVERCOMMIT_RATIO,
1077 .procname = "overcommit_ratio", 966 .procname = "overcommit_ratio",
1078 .data = &sysctl_overcommit_ratio, 967 .data = &sysctl_overcommit_ratio,
1079 .maxlen = sizeof(sysctl_overcommit_ratio), 968 .maxlen = sizeof(sysctl_overcommit_ratio),
1080 .mode = 0644, 969 .mode = 0644,
1081 .proc_handler = &proc_dointvec, 970 .proc_handler = proc_dointvec,
1082 }, 971 },
1083 { 972 {
1084 .ctl_name = VM_PAGE_CLUSTER,
1085 .procname = "page-cluster", 973 .procname = "page-cluster",
1086 .data = &page_cluster, 974 .data = &page_cluster,
1087 .maxlen = sizeof(int), 975 .maxlen = sizeof(int),
1088 .mode = 0644, 976 .mode = 0644,
1089 .proc_handler = &proc_dointvec, 977 .proc_handler = proc_dointvec,
1090 }, 978 },
1091 { 979 {
1092 .ctl_name = VM_DIRTY_BACKGROUND,
1093 .procname = "dirty_background_ratio", 980 .procname = "dirty_background_ratio",
1094 .data = &dirty_background_ratio, 981 .data = &dirty_background_ratio,
1095 .maxlen = sizeof(dirty_background_ratio), 982 .maxlen = sizeof(dirty_background_ratio),
1096 .mode = 0644, 983 .mode = 0644,
1097 .proc_handler = &dirty_background_ratio_handler, 984 .proc_handler = dirty_background_ratio_handler,
1098 .strategy = &sysctl_intvec,
1099 .extra1 = &zero, 985 .extra1 = &zero,
1100 .extra2 = &one_hundred, 986 .extra2 = &one_hundred,
1101 }, 987 },
1102 { 988 {
1103 .ctl_name = CTL_UNNUMBERED,
1104 .procname = "dirty_background_bytes", 989 .procname = "dirty_background_bytes",
1105 .data = &dirty_background_bytes, 990 .data = &dirty_background_bytes,
1106 .maxlen = sizeof(dirty_background_bytes), 991 .maxlen = sizeof(dirty_background_bytes),
1107 .mode = 0644, 992 .mode = 0644,
1108 .proc_handler = &dirty_background_bytes_handler, 993 .proc_handler = dirty_background_bytes_handler,
1109 .strategy = &sysctl_intvec,
1110 .extra1 = &one_ul, 994 .extra1 = &one_ul,
1111 }, 995 },
1112 { 996 {
1113 .ctl_name = VM_DIRTY_RATIO,
1114 .procname = "dirty_ratio", 997 .procname = "dirty_ratio",
1115 .data = &vm_dirty_ratio, 998 .data = &vm_dirty_ratio,
1116 .maxlen = sizeof(vm_dirty_ratio), 999 .maxlen = sizeof(vm_dirty_ratio),
1117 .mode = 0644, 1000 .mode = 0644,
1118 .proc_handler = &dirty_ratio_handler, 1001 .proc_handler = dirty_ratio_handler,
1119 .strategy = &sysctl_intvec,
1120 .extra1 = &zero, 1002 .extra1 = &zero,
1121 .extra2 = &one_hundred, 1003 .extra2 = &one_hundred,
1122 }, 1004 },
1123 { 1005 {
1124 .ctl_name = CTL_UNNUMBERED,
1125 .procname = "dirty_bytes", 1006 .procname = "dirty_bytes",
1126 .data = &vm_dirty_bytes, 1007 .data = &vm_dirty_bytes,
1127 .maxlen = sizeof(vm_dirty_bytes), 1008 .maxlen = sizeof(vm_dirty_bytes),
1128 .mode = 0644, 1009 .mode = 0644,
1129 .proc_handler = &dirty_bytes_handler, 1010 .proc_handler = dirty_bytes_handler,
1130 .strategy = &sysctl_intvec,
1131 .extra1 = &dirty_bytes_min, 1011 .extra1 = &dirty_bytes_min,
1132 }, 1012 },
1133 { 1013 {
@@ -1135,289 +1015,258 @@ static struct ctl_table vm_table[] = {
1135 .data = &dirty_writeback_interval, 1015 .data = &dirty_writeback_interval,
1136 .maxlen = sizeof(dirty_writeback_interval), 1016 .maxlen = sizeof(dirty_writeback_interval),
1137 .mode = 0644, 1017 .mode = 0644,
1138 .proc_handler = &dirty_writeback_centisecs_handler, 1018 .proc_handler = dirty_writeback_centisecs_handler,
1139 }, 1019 },
1140 { 1020 {
1141 .procname = "dirty_expire_centisecs", 1021 .procname = "dirty_expire_centisecs",
1142 .data = &dirty_expire_interval, 1022 .data = &dirty_expire_interval,
1143 .maxlen = sizeof(dirty_expire_interval), 1023 .maxlen = sizeof(dirty_expire_interval),
1144 .mode = 0644, 1024 .mode = 0644,
1145 .proc_handler = &proc_dointvec, 1025 .proc_handler = proc_dointvec,
1146 }, 1026 },
1147 { 1027 {
1148 .ctl_name = VM_NR_PDFLUSH_THREADS,
1149 .procname = "nr_pdflush_threads", 1028 .procname = "nr_pdflush_threads",
1150 .data = &nr_pdflush_threads, 1029 .data = &nr_pdflush_threads,
1151 .maxlen = sizeof nr_pdflush_threads, 1030 .maxlen = sizeof nr_pdflush_threads,
1152 .mode = 0444 /* read-only*/, 1031 .mode = 0444 /* read-only*/,
1153 .proc_handler = &proc_dointvec, 1032 .proc_handler = proc_dointvec,
1154 }, 1033 },
1155 { 1034 {
1156 .ctl_name = VM_SWAPPINESS,
1157 .procname = "swappiness", 1035 .procname = "swappiness",
1158 .data = &vm_swappiness, 1036 .data = &vm_swappiness,
1159 .maxlen = sizeof(vm_swappiness), 1037 .maxlen = sizeof(vm_swappiness),
1160 .mode = 0644, 1038 .mode = 0644,
1161 .proc_handler = &proc_dointvec_minmax, 1039 .proc_handler = proc_dointvec_minmax,
1162 .strategy = &sysctl_intvec,
1163 .extra1 = &zero, 1040 .extra1 = &zero,
1164 .extra2 = &one_hundred, 1041 .extra2 = &one_hundred,
1165 }, 1042 },
1166#ifdef CONFIG_HUGETLB_PAGE 1043#ifdef CONFIG_HUGETLB_PAGE
1167 { 1044 {
1168 .procname = "nr_hugepages", 1045 .procname = "nr_hugepages",
1169 .data = NULL, 1046 .data = NULL,
1170 .maxlen = sizeof(unsigned long), 1047 .maxlen = sizeof(unsigned long),
1171 .mode = 0644, 1048 .mode = 0644,
1172 .proc_handler = &hugetlb_sysctl_handler, 1049 .proc_handler = hugetlb_sysctl_handler,
1173 .extra1 = (void *)&hugetlb_zero, 1050 .extra1 = (void *)&hugetlb_zero,
1174 .extra2 = (void *)&hugetlb_infinity, 1051 .extra2 = (void *)&hugetlb_infinity,
1175 }, 1052 },
1053#ifdef CONFIG_NUMA
1054 {
1055 .procname = "nr_hugepages_mempolicy",
1056 .data = NULL,
1057 .maxlen = sizeof(unsigned long),
1058 .mode = 0644,
1059 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1060 .extra1 = (void *)&hugetlb_zero,
1061 .extra2 = (void *)&hugetlb_infinity,
1062 },
1063#endif
1176 { 1064 {
1177 .ctl_name = VM_HUGETLB_GROUP,
1178 .procname = "hugetlb_shm_group", 1065 .procname = "hugetlb_shm_group",
1179 .data = &sysctl_hugetlb_shm_group, 1066 .data = &sysctl_hugetlb_shm_group,
1180 .maxlen = sizeof(gid_t), 1067 .maxlen = sizeof(gid_t),
1181 .mode = 0644, 1068 .mode = 0644,
1182 .proc_handler = &proc_dointvec, 1069 .proc_handler = proc_dointvec,
1183 }, 1070 },
1184 { 1071 {
1185 .ctl_name = CTL_UNNUMBERED,
1186 .procname = "hugepages_treat_as_movable", 1072 .procname = "hugepages_treat_as_movable",
1187 .data = &hugepages_treat_as_movable, 1073 .data = &hugepages_treat_as_movable,
1188 .maxlen = sizeof(int), 1074 .maxlen = sizeof(int),
1189 .mode = 0644, 1075 .mode = 0644,
1190 .proc_handler = &hugetlb_treat_movable_handler, 1076 .proc_handler = hugetlb_treat_movable_handler,
1191 }, 1077 },
1192 { 1078 {
1193 .ctl_name = CTL_UNNUMBERED,
1194 .procname = "nr_overcommit_hugepages", 1079 .procname = "nr_overcommit_hugepages",
1195 .data = NULL, 1080 .data = NULL,
1196 .maxlen = sizeof(unsigned long), 1081 .maxlen = sizeof(unsigned long),
1197 .mode = 0644, 1082 .mode = 0644,
1198 .proc_handler = &hugetlb_overcommit_handler, 1083 .proc_handler = hugetlb_overcommit_handler,
1199 .extra1 = (void *)&hugetlb_zero, 1084 .extra1 = (void *)&hugetlb_zero,
1200 .extra2 = (void *)&hugetlb_infinity, 1085 .extra2 = (void *)&hugetlb_infinity,
1201 }, 1086 },
1202#endif 1087#endif
1203 { 1088 {
1204 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
1205 .procname = "lowmem_reserve_ratio", 1089 .procname = "lowmem_reserve_ratio",
1206 .data = &sysctl_lowmem_reserve_ratio, 1090 .data = &sysctl_lowmem_reserve_ratio,
1207 .maxlen = sizeof(sysctl_lowmem_reserve_ratio), 1091 .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
1208 .mode = 0644, 1092 .mode = 0644,
1209 .proc_handler = &lowmem_reserve_ratio_sysctl_handler, 1093 .proc_handler = lowmem_reserve_ratio_sysctl_handler,
1210 .strategy = &sysctl_intvec,
1211 }, 1094 },
1212 { 1095 {
1213 .ctl_name = VM_DROP_PAGECACHE,
1214 .procname = "drop_caches", 1096 .procname = "drop_caches",
1215 .data = &sysctl_drop_caches, 1097 .data = &sysctl_drop_caches,
1216 .maxlen = sizeof(int), 1098 .maxlen = sizeof(int),
1217 .mode = 0644, 1099 .mode = 0644,
1218 .proc_handler = drop_caches_sysctl_handler, 1100 .proc_handler = drop_caches_sysctl_handler,
1219 .strategy = &sysctl_intvec,
1220 }, 1101 },
1221 { 1102 {
1222 .ctl_name = VM_MIN_FREE_KBYTES,
1223 .procname = "min_free_kbytes", 1103 .procname = "min_free_kbytes",
1224 .data = &min_free_kbytes, 1104 .data = &min_free_kbytes,
1225 .maxlen = sizeof(min_free_kbytes), 1105 .maxlen = sizeof(min_free_kbytes),
1226 .mode = 0644, 1106 .mode = 0644,
1227 .proc_handler = &min_free_kbytes_sysctl_handler, 1107 .proc_handler = min_free_kbytes_sysctl_handler,
1228 .strategy = &sysctl_intvec,
1229 .extra1 = &zero, 1108 .extra1 = &zero,
1230 }, 1109 },
1231 { 1110 {
1232 .ctl_name = VM_PERCPU_PAGELIST_FRACTION,
1233 .procname = "percpu_pagelist_fraction", 1111 .procname = "percpu_pagelist_fraction",
1234 .data = &percpu_pagelist_fraction, 1112 .data = &percpu_pagelist_fraction,
1235 .maxlen = sizeof(percpu_pagelist_fraction), 1113 .maxlen = sizeof(percpu_pagelist_fraction),
1236 .mode = 0644, 1114 .mode = 0644,
1237 .proc_handler = &percpu_pagelist_fraction_sysctl_handler, 1115 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
1238 .strategy = &sysctl_intvec,
1239 .extra1 = &min_percpu_pagelist_fract, 1116 .extra1 = &min_percpu_pagelist_fract,
1240 }, 1117 },
1241#ifdef CONFIG_MMU 1118#ifdef CONFIG_MMU
1242 { 1119 {
1243 .ctl_name = VM_MAX_MAP_COUNT,
1244 .procname = "max_map_count", 1120 .procname = "max_map_count",
1245 .data = &sysctl_max_map_count, 1121 .data = &sysctl_max_map_count,
1246 .maxlen = sizeof(sysctl_max_map_count), 1122 .maxlen = sizeof(sysctl_max_map_count),
1247 .mode = 0644, 1123 .mode = 0644,
1248 .proc_handler = &proc_dointvec 1124 .proc_handler = proc_dointvec_minmax,
1125 .extra1 = &zero,
1249 }, 1126 },
1250#else 1127#else
1251 { 1128 {
1252 .ctl_name = CTL_UNNUMBERED,
1253 .procname = "nr_trim_pages", 1129 .procname = "nr_trim_pages",
1254 .data = &sysctl_nr_trim_pages, 1130 .data = &sysctl_nr_trim_pages,
1255 .maxlen = sizeof(sysctl_nr_trim_pages), 1131 .maxlen = sizeof(sysctl_nr_trim_pages),
1256 .mode = 0644, 1132 .mode = 0644,
1257 .proc_handler = &proc_dointvec_minmax, 1133 .proc_handler = proc_dointvec_minmax,
1258 .strategy = &sysctl_intvec,
1259 .extra1 = &zero, 1134 .extra1 = &zero,
1260 }, 1135 },
1261#endif 1136#endif
1262 { 1137 {
1263 .ctl_name = VM_LAPTOP_MODE,
1264 .procname = "laptop_mode", 1138 .procname = "laptop_mode",
1265 .data = &laptop_mode, 1139 .data = &laptop_mode,
1266 .maxlen = sizeof(laptop_mode), 1140 .maxlen = sizeof(laptop_mode),
1267 .mode = 0644, 1141 .mode = 0644,
1268 .proc_handler = &proc_dointvec_jiffies, 1142 .proc_handler = proc_dointvec_jiffies,
1269 .strategy = &sysctl_jiffies,
1270 }, 1143 },
1271 { 1144 {
1272 .ctl_name = VM_BLOCK_DUMP,
1273 .procname = "block_dump", 1145 .procname = "block_dump",
1274 .data = &block_dump, 1146 .data = &block_dump,
1275 .maxlen = sizeof(block_dump), 1147 .maxlen = sizeof(block_dump),
1276 .mode = 0644, 1148 .mode = 0644,
1277 .proc_handler = &proc_dointvec, 1149 .proc_handler = proc_dointvec,
1278 .strategy = &sysctl_intvec,
1279 .extra1 = &zero, 1150 .extra1 = &zero,
1280 }, 1151 },
1281 { 1152 {
1282 .ctl_name = VM_VFS_CACHE_PRESSURE,
1283 .procname = "vfs_cache_pressure", 1153 .procname = "vfs_cache_pressure",
1284 .data = &sysctl_vfs_cache_pressure, 1154 .data = &sysctl_vfs_cache_pressure,
1285 .maxlen = sizeof(sysctl_vfs_cache_pressure), 1155 .maxlen = sizeof(sysctl_vfs_cache_pressure),
1286 .mode = 0644, 1156 .mode = 0644,
1287 .proc_handler = &proc_dointvec, 1157 .proc_handler = proc_dointvec,
1288 .strategy = &sysctl_intvec,
1289 .extra1 = &zero, 1158 .extra1 = &zero,
1290 }, 1159 },
1291#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1160#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
1292 { 1161 {
1293 .ctl_name = VM_LEGACY_VA_LAYOUT,
1294 .procname = "legacy_va_layout", 1162 .procname = "legacy_va_layout",
1295 .data = &sysctl_legacy_va_layout, 1163 .data = &sysctl_legacy_va_layout,
1296 .maxlen = sizeof(sysctl_legacy_va_layout), 1164 .maxlen = sizeof(sysctl_legacy_va_layout),
1297 .mode = 0644, 1165 .mode = 0644,
1298 .proc_handler = &proc_dointvec, 1166 .proc_handler = proc_dointvec,
1299 .strategy = &sysctl_intvec,
1300 .extra1 = &zero, 1167 .extra1 = &zero,
1301 }, 1168 },
1302#endif 1169#endif
1303#ifdef CONFIG_NUMA 1170#ifdef CONFIG_NUMA
1304 { 1171 {
1305 .ctl_name = VM_ZONE_RECLAIM_MODE,
1306 .procname = "zone_reclaim_mode", 1172 .procname = "zone_reclaim_mode",
1307 .data = &zone_reclaim_mode, 1173 .data = &zone_reclaim_mode,
1308 .maxlen = sizeof(zone_reclaim_mode), 1174 .maxlen = sizeof(zone_reclaim_mode),
1309 .mode = 0644, 1175 .mode = 0644,
1310 .proc_handler = &proc_dointvec, 1176 .proc_handler = proc_dointvec,
1311 .strategy = &sysctl_intvec,
1312 .extra1 = &zero, 1177 .extra1 = &zero,
1313 }, 1178 },
1314 { 1179 {
1315 .ctl_name = VM_MIN_UNMAPPED,
1316 .procname = "min_unmapped_ratio", 1180 .procname = "min_unmapped_ratio",
1317 .data = &sysctl_min_unmapped_ratio, 1181 .data = &sysctl_min_unmapped_ratio,
1318 .maxlen = sizeof(sysctl_min_unmapped_ratio), 1182 .maxlen = sizeof(sysctl_min_unmapped_ratio),
1319 .mode = 0644, 1183 .mode = 0644,
1320 .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, 1184 .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
1321 .strategy = &sysctl_intvec,
1322 .extra1 = &zero, 1185 .extra1 = &zero,
1323 .extra2 = &one_hundred, 1186 .extra2 = &one_hundred,
1324 }, 1187 },
1325 { 1188 {
1326 .ctl_name = VM_MIN_SLAB,
1327 .procname = "min_slab_ratio", 1189 .procname = "min_slab_ratio",
1328 .data = &sysctl_min_slab_ratio, 1190 .data = &sysctl_min_slab_ratio,
1329 .maxlen = sizeof(sysctl_min_slab_ratio), 1191 .maxlen = sizeof(sysctl_min_slab_ratio),
1330 .mode = 0644, 1192 .mode = 0644,
1331 .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, 1193 .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
1332 .strategy = &sysctl_intvec,
1333 .extra1 = &zero, 1194 .extra1 = &zero,
1334 .extra2 = &one_hundred, 1195 .extra2 = &one_hundred,
1335 }, 1196 },
1336#endif 1197#endif
1337#ifdef CONFIG_SMP 1198#ifdef CONFIG_SMP
1338 { 1199 {
1339 .ctl_name = CTL_UNNUMBERED,
1340 .procname = "stat_interval", 1200 .procname = "stat_interval",
1341 .data = &sysctl_stat_interval, 1201 .data = &sysctl_stat_interval,
1342 .maxlen = sizeof(sysctl_stat_interval), 1202 .maxlen = sizeof(sysctl_stat_interval),
1343 .mode = 0644, 1203 .mode = 0644,
1344 .proc_handler = &proc_dointvec_jiffies, 1204 .proc_handler = proc_dointvec_jiffies,
1345 .strategy = &sysctl_jiffies,
1346 }, 1205 },
1347#endif 1206#endif
1207#ifdef CONFIG_MMU
1348 { 1208 {
1349 .ctl_name = CTL_UNNUMBERED,
1350 .procname = "mmap_min_addr", 1209 .procname = "mmap_min_addr",
1351 .data = &dac_mmap_min_addr, 1210 .data = &dac_mmap_min_addr,
1352 .maxlen = sizeof(unsigned long), 1211 .maxlen = sizeof(unsigned long),
1353 .mode = 0644, 1212 .mode = 0644,
1354 .proc_handler = &mmap_min_addr_handler, 1213 .proc_handler = mmap_min_addr_handler,
1355 }, 1214 },
1215#endif
1356#ifdef CONFIG_NUMA 1216#ifdef CONFIG_NUMA
1357 { 1217 {
1358 .ctl_name = CTL_UNNUMBERED,
1359 .procname = "numa_zonelist_order", 1218 .procname = "numa_zonelist_order",
1360 .data = &numa_zonelist_order, 1219 .data = &numa_zonelist_order,
1361 .maxlen = NUMA_ZONELIST_ORDER_LEN, 1220 .maxlen = NUMA_ZONELIST_ORDER_LEN,
1362 .mode = 0644, 1221 .mode = 0644,
1363 .proc_handler = &numa_zonelist_order_handler, 1222 .proc_handler = numa_zonelist_order_handler,
1364 .strategy = &sysctl_string,
1365 }, 1223 },
1366#endif 1224#endif
1367#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ 1225#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
1368 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1226 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1369 { 1227 {
1370 .ctl_name = VM_VDSO_ENABLED,
1371 .procname = "vdso_enabled", 1228 .procname = "vdso_enabled",
1372 .data = &vdso_enabled, 1229 .data = &vdso_enabled,
1373 .maxlen = sizeof(vdso_enabled), 1230 .maxlen = sizeof(vdso_enabled),
1374 .mode = 0644, 1231 .mode = 0644,
1375 .proc_handler = &proc_dointvec, 1232 .proc_handler = proc_dointvec,
1376 .strategy = &sysctl_intvec,
1377 .extra1 = &zero, 1233 .extra1 = &zero,
1378 }, 1234 },
1379#endif 1235#endif
1380#ifdef CONFIG_HIGHMEM 1236#ifdef CONFIG_HIGHMEM
1381 { 1237 {
1382 .ctl_name = CTL_UNNUMBERED,
1383 .procname = "highmem_is_dirtyable", 1238 .procname = "highmem_is_dirtyable",
1384 .data = &vm_highmem_is_dirtyable, 1239 .data = &vm_highmem_is_dirtyable,
1385 .maxlen = sizeof(vm_highmem_is_dirtyable), 1240 .maxlen = sizeof(vm_highmem_is_dirtyable),
1386 .mode = 0644, 1241 .mode = 0644,
1387 .proc_handler = &proc_dointvec_minmax, 1242 .proc_handler = proc_dointvec_minmax,
1388 .strategy = &sysctl_intvec,
1389 .extra1 = &zero, 1243 .extra1 = &zero,
1390 .extra2 = &one, 1244 .extra2 = &one,
1391 }, 1245 },
1392#endif 1246#endif
1393 { 1247 {
1394 .ctl_name = CTL_UNNUMBERED,
1395 .procname = "scan_unevictable_pages", 1248 .procname = "scan_unevictable_pages",
1396 .data = &scan_unevictable_pages, 1249 .data = &scan_unevictable_pages,
1397 .maxlen = sizeof(scan_unevictable_pages), 1250 .maxlen = sizeof(scan_unevictable_pages),
1398 .mode = 0644, 1251 .mode = 0644,
1399 .proc_handler = &scan_unevictable_handler, 1252 .proc_handler = scan_unevictable_handler,
1400 }, 1253 },
1401#ifdef CONFIG_MEMORY_FAILURE 1254#ifdef CONFIG_MEMORY_FAILURE
1402 { 1255 {
1403 .ctl_name = CTL_UNNUMBERED,
1404 .procname = "memory_failure_early_kill", 1256 .procname = "memory_failure_early_kill",
1405 .data = &sysctl_memory_failure_early_kill, 1257 .data = &sysctl_memory_failure_early_kill,
1406 .maxlen = sizeof(sysctl_memory_failure_early_kill), 1258 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1407 .mode = 0644, 1259 .mode = 0644,
1408 .proc_handler = &proc_dointvec_minmax, 1260 .proc_handler = proc_dointvec_minmax,
1409 .strategy = &sysctl_intvec,
1410 .extra1 = &zero, 1261 .extra1 = &zero,
1411 .extra2 = &one, 1262 .extra2 = &one,
1412 }, 1263 },
1413 { 1264 {
1414 .ctl_name = CTL_UNNUMBERED,
1415 .procname = "memory_failure_recovery", 1265 .procname = "memory_failure_recovery",
1416 .data = &sysctl_memory_failure_recovery, 1266 .data = &sysctl_memory_failure_recovery,
1417 .maxlen = sizeof(sysctl_memory_failure_recovery), 1267 .maxlen = sizeof(sysctl_memory_failure_recovery),
1418 .mode = 0644, 1268 .mode = 0644,
1419 .proc_handler = &proc_dointvec_minmax, 1269 .proc_handler = proc_dointvec_minmax,
1420 .strategy = &sysctl_intvec,
1421 .extra1 = &zero, 1270 .extra1 = &zero,
1422 .extra2 = &one, 1271 .extra2 = &one,
1423 }, 1272 },
@@ -1427,116 +1276,104 @@ static struct ctl_table vm_table[] = {
1427 * NOTE: do not add new entries to this table unless you have read 1276 * NOTE: do not add new entries to this table unless you have read
1428 * Documentation/sysctl/ctl_unnumbered.txt 1277 * Documentation/sysctl/ctl_unnumbered.txt
1429 */ 1278 */
1430 { .ctl_name = 0 } 1279 { }
1431}; 1280};
1432 1281
1433#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1282#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1434static struct ctl_table binfmt_misc_table[] = { 1283static struct ctl_table binfmt_misc_table[] = {
1435 { .ctl_name = 0 } 1284 { }
1436}; 1285};
1437#endif 1286#endif
1438 1287
1439static struct ctl_table fs_table[] = { 1288static struct ctl_table fs_table[] = {
1440 { 1289 {
1441 .ctl_name = FS_NRINODE,
1442 .procname = "inode-nr", 1290 .procname = "inode-nr",
1443 .data = &inodes_stat, 1291 .data = &inodes_stat,
1444 .maxlen = 2*sizeof(int), 1292 .maxlen = 2*sizeof(int),
1445 .mode = 0444, 1293 .mode = 0444,
1446 .proc_handler = &proc_dointvec, 1294 .proc_handler = proc_dointvec,
1447 }, 1295 },
1448 { 1296 {
1449 .ctl_name = FS_STATINODE,
1450 .procname = "inode-state", 1297 .procname = "inode-state",
1451 .data = &inodes_stat, 1298 .data = &inodes_stat,
1452 .maxlen = 7*sizeof(int), 1299 .maxlen = 7*sizeof(int),
1453 .mode = 0444, 1300 .mode = 0444,
1454 .proc_handler = &proc_dointvec, 1301 .proc_handler = proc_dointvec,
1455 }, 1302 },
1456 { 1303 {
1457 .procname = "file-nr", 1304 .procname = "file-nr",
1458 .data = &files_stat, 1305 .data = &files_stat,
1459 .maxlen = 3*sizeof(int), 1306 .maxlen = 3*sizeof(int),
1460 .mode = 0444, 1307 .mode = 0444,
1461 .proc_handler = &proc_nr_files, 1308 .proc_handler = proc_nr_files,
1462 }, 1309 },
1463 { 1310 {
1464 .ctl_name = FS_MAXFILE,
1465 .procname = "file-max", 1311 .procname = "file-max",
1466 .data = &files_stat.max_files, 1312 .data = &files_stat.max_files,
1467 .maxlen = sizeof(int), 1313 .maxlen = sizeof(int),
1468 .mode = 0644, 1314 .mode = 0644,
1469 .proc_handler = &proc_dointvec, 1315 .proc_handler = proc_dointvec,
1470 }, 1316 },
1471 { 1317 {
1472 .ctl_name = CTL_UNNUMBERED,
1473 .procname = "nr_open", 1318 .procname = "nr_open",
1474 .data = &sysctl_nr_open, 1319 .data = &sysctl_nr_open,
1475 .maxlen = sizeof(int), 1320 .maxlen = sizeof(int),
1476 .mode = 0644, 1321 .mode = 0644,
1477 .proc_handler = &proc_dointvec_minmax, 1322 .proc_handler = proc_dointvec_minmax,
1478 .extra1 = &sysctl_nr_open_min, 1323 .extra1 = &sysctl_nr_open_min,
1479 .extra2 = &sysctl_nr_open_max, 1324 .extra2 = &sysctl_nr_open_max,
1480 }, 1325 },
1481 { 1326 {
1482 .ctl_name = FS_DENTRY,
1483 .procname = "dentry-state", 1327 .procname = "dentry-state",
1484 .data = &dentry_stat, 1328 .data = &dentry_stat,
1485 .maxlen = 6*sizeof(int), 1329 .maxlen = 6*sizeof(int),
1486 .mode = 0444, 1330 .mode = 0444,
1487 .proc_handler = &proc_dointvec, 1331 .proc_handler = proc_dointvec,
1488 }, 1332 },
1489 { 1333 {
1490 .ctl_name = FS_OVERFLOWUID,
1491 .procname = "overflowuid", 1334 .procname = "overflowuid",
1492 .data = &fs_overflowuid, 1335 .data = &fs_overflowuid,
1493 .maxlen = sizeof(int), 1336 .maxlen = sizeof(int),
1494 .mode = 0644, 1337 .mode = 0644,
1495 .proc_handler = &proc_dointvec_minmax, 1338 .proc_handler = proc_dointvec_minmax,
1496 .strategy = &sysctl_intvec,
1497 .extra1 = &minolduid, 1339 .extra1 = &minolduid,
1498 .extra2 = &maxolduid, 1340 .extra2 = &maxolduid,
1499 }, 1341 },
1500 { 1342 {
1501 .ctl_name = FS_OVERFLOWGID,
1502 .procname = "overflowgid", 1343 .procname = "overflowgid",
1503 .data = &fs_overflowgid, 1344 .data = &fs_overflowgid,
1504 .maxlen = sizeof(int), 1345 .maxlen = sizeof(int),
1505 .mode = 0644, 1346 .mode = 0644,
1506 .proc_handler = &proc_dointvec_minmax, 1347 .proc_handler = proc_dointvec_minmax,
1507 .strategy = &sysctl_intvec,
1508 .extra1 = &minolduid, 1348 .extra1 = &minolduid,
1509 .extra2 = &maxolduid, 1349 .extra2 = &maxolduid,
1510 }, 1350 },
1511#ifdef CONFIG_FILE_LOCKING 1351#ifdef CONFIG_FILE_LOCKING
1512 { 1352 {
1513 .ctl_name = FS_LEASES,
1514 .procname = "leases-enable", 1353 .procname = "leases-enable",
1515 .data = &leases_enable, 1354 .data = &leases_enable,
1516 .maxlen = sizeof(int), 1355 .maxlen = sizeof(int),
1517 .mode = 0644, 1356 .mode = 0644,
1518 .proc_handler = &proc_dointvec, 1357 .proc_handler = proc_dointvec,
1519 }, 1358 },
1520#endif 1359#endif
1521#ifdef CONFIG_DNOTIFY 1360#ifdef CONFIG_DNOTIFY
1522 { 1361 {
1523 .ctl_name = FS_DIR_NOTIFY,
1524 .procname = "dir-notify-enable", 1362 .procname = "dir-notify-enable",
1525 .data = &dir_notify_enable, 1363 .data = &dir_notify_enable,
1526 .maxlen = sizeof(int), 1364 .maxlen = sizeof(int),
1527 .mode = 0644, 1365 .mode = 0644,
1528 .proc_handler = &proc_dointvec, 1366 .proc_handler = proc_dointvec,
1529 }, 1367 },
1530#endif 1368#endif
1531#ifdef CONFIG_MMU 1369#ifdef CONFIG_MMU
1532#ifdef CONFIG_FILE_LOCKING 1370#ifdef CONFIG_FILE_LOCKING
1533 { 1371 {
1534 .ctl_name = FS_LEASE_TIME,
1535 .procname = "lease-break-time", 1372 .procname = "lease-break-time",
1536 .data = &lease_break_time, 1373 .data = &lease_break_time,
1537 .maxlen = sizeof(int), 1374 .maxlen = sizeof(int),
1538 .mode = 0644, 1375 .mode = 0644,
1539 .proc_handler = &proc_dointvec, 1376 .proc_handler = proc_dointvec,
1540 }, 1377 },
1541#endif 1378#endif
1542#ifdef CONFIG_AIO 1379#ifdef CONFIG_AIO
@@ -1545,19 +1382,18 @@ static struct ctl_table fs_table[] = {
1545 .data = &aio_nr, 1382 .data = &aio_nr,
1546 .maxlen = sizeof(aio_nr), 1383 .maxlen = sizeof(aio_nr),
1547 .mode = 0444, 1384 .mode = 0444,
1548 .proc_handler = &proc_doulongvec_minmax, 1385 .proc_handler = proc_doulongvec_minmax,
1549 }, 1386 },
1550 { 1387 {
1551 .procname = "aio-max-nr", 1388 .procname = "aio-max-nr",
1552 .data = &aio_max_nr, 1389 .data = &aio_max_nr,
1553 .maxlen = sizeof(aio_max_nr), 1390 .maxlen = sizeof(aio_max_nr),
1554 .mode = 0644, 1391 .mode = 0644,
1555 .proc_handler = &proc_doulongvec_minmax, 1392 .proc_handler = proc_doulongvec_minmax,
1556 }, 1393 },
1557#endif /* CONFIG_AIO */ 1394#endif /* CONFIG_AIO */
1558#ifdef CONFIG_INOTIFY_USER 1395#ifdef CONFIG_INOTIFY_USER
1559 { 1396 {
1560 .ctl_name = FS_INOTIFY,
1561 .procname = "inotify", 1397 .procname = "inotify",
1562 .mode = 0555, 1398 .mode = 0555,
1563 .child = inotify_table, 1399 .child = inotify_table,
@@ -1572,19 +1408,16 @@ static struct ctl_table fs_table[] = {
1572#endif 1408#endif
1573#endif 1409#endif
1574 { 1410 {
1575 .ctl_name = KERN_SETUID_DUMPABLE,
1576 .procname = "suid_dumpable", 1411 .procname = "suid_dumpable",
1577 .data = &suid_dumpable, 1412 .data = &suid_dumpable,
1578 .maxlen = sizeof(int), 1413 .maxlen = sizeof(int),
1579 .mode = 0644, 1414 .mode = 0644,
1580 .proc_handler = &proc_dointvec_minmax, 1415 .proc_handler = proc_dointvec_minmax,
1581 .strategy = &sysctl_intvec,
1582 .extra1 = &zero, 1416 .extra1 = &zero,
1583 .extra2 = &two, 1417 .extra2 = &two,
1584 }, 1418 },
1585#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1419#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1586 { 1420 {
1587 .ctl_name = CTL_UNNUMBERED,
1588 .procname = "binfmt_misc", 1421 .procname = "binfmt_misc",
1589 .mode = 0555, 1422 .mode = 0555,
1590 .child = binfmt_misc_table, 1423 .child = binfmt_misc_table,
@@ -1594,13 +1427,12 @@ static struct ctl_table fs_table[] = {
1594 * NOTE: do not add new entries to this table unless you have read 1427 * NOTE: do not add new entries to this table unless you have read
1595 * Documentation/sysctl/ctl_unnumbered.txt 1428 * Documentation/sysctl/ctl_unnumbered.txt
1596 */ 1429 */
1597 { .ctl_name = 0 } 1430 { }
1598}; 1431};
1599 1432
1600static struct ctl_table debug_table[] = { 1433static struct ctl_table debug_table[] = {
1601#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
1602 { 1435 {
1603 .ctl_name = CTL_UNNUMBERED,
1604 .procname = "exception-trace", 1436 .procname = "exception-trace",
1605 .data = &show_unhandled_signals, 1437 .data = &show_unhandled_signals,
1606 .maxlen = sizeof(int), 1438 .maxlen = sizeof(int),
@@ -1608,11 +1440,22 @@ static struct ctl_table debug_table[] = {
1608 .proc_handler = proc_dointvec 1440 .proc_handler = proc_dointvec
1609 }, 1441 },
1610#endif 1442#endif
1611 { .ctl_name = 0 } 1443#if defined(CONFIG_OPTPROBES)
1444 {
1445 .procname = "kprobes-optimization",
1446 .data = &sysctl_kprobes_optimization,
1447 .maxlen = sizeof(int),
1448 .mode = 0644,
1449 .proc_handler = proc_kprobes_optimization_handler,
1450 .extra1 = &zero,
1451 .extra2 = &one,
1452 },
1453#endif
1454 { }
1612}; 1455};
1613 1456
1614static struct ctl_table dev_table[] = { 1457static struct ctl_table dev_table[] = {
1615 { .ctl_name = 0 } 1458 { }
1616}; 1459};
1617 1460
1618static DEFINE_SPINLOCK(sysctl_lock); 1461static DEFINE_SPINLOCK(sysctl_lock);
@@ -1766,122 +1609,6 @@ void register_sysctl_root(struct ctl_table_root *root)
1766 spin_unlock(&sysctl_lock); 1609 spin_unlock(&sysctl_lock);
1767} 1610}
1768 1611
1769#ifdef CONFIG_SYSCTL_SYSCALL
1770/* Perform the actual read/write of a sysctl table entry. */
1771static int do_sysctl_strategy(struct ctl_table_root *root,
1772 struct ctl_table *table,
1773 void __user *oldval, size_t __user *oldlenp,
1774 void __user *newval, size_t newlen)
1775{
1776 int op = 0, rc;
1777
1778 if (oldval)
1779 op |= MAY_READ;
1780 if (newval)
1781 op |= MAY_WRITE;
1782 if (sysctl_perm(root, table, op))
1783 return -EPERM;
1784
1785 if (table->strategy) {
1786 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1787 if (rc < 0)
1788 return rc;
1789 if (rc > 0)
1790 return 0;
1791 }
1792
1793 /* If there is no strategy routine, or if the strategy returns
1794 * zero, proceed with automatic r/w */
1795 if (table->data && table->maxlen) {
1796 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1797 if (rc < 0)
1798 return rc;
1799 }
1800 return 0;
1801}
1802
1803static int parse_table(int __user *name, int nlen,
1804 void __user *oldval, size_t __user *oldlenp,
1805 void __user *newval, size_t newlen,
1806 struct ctl_table_root *root,
1807 struct ctl_table *table)
1808{
1809 int n;
1810repeat:
1811 if (!nlen)
1812 return -ENOTDIR;
1813 if (get_user(n, name))
1814 return -EFAULT;
1815 for ( ; table->ctl_name || table->procname; table++) {
1816 if (!table->ctl_name)
1817 continue;
1818 if (n == table->ctl_name) {
1819 int error;
1820 if (table->child) {
1821 if (sysctl_perm(root, table, MAY_EXEC))
1822 return -EPERM;
1823 name++;
1824 nlen--;
1825 table = table->child;
1826 goto repeat;
1827 }
1828 error = do_sysctl_strategy(root, table,
1829 oldval, oldlenp,
1830 newval, newlen);
1831 return error;
1832 }
1833 }
1834 return -ENOTDIR;
1835}
1836
1837int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1838 void __user *newval, size_t newlen)
1839{
1840 struct ctl_table_header *head;
1841 int error = -ENOTDIR;
1842
1843 if (nlen <= 0 || nlen >= CTL_MAXNAME)
1844 return -ENOTDIR;
1845 if (oldval) {
1846 int old_len;
1847 if (!oldlenp || get_user(old_len, oldlenp))
1848 return -EFAULT;
1849 }
1850
1851 for (head = sysctl_head_next(NULL); head;
1852 head = sysctl_head_next(head)) {
1853 error = parse_table(name, nlen, oldval, oldlenp,
1854 newval, newlen,
1855 head->root, head->ctl_table);
1856 if (error != -ENOTDIR) {
1857 sysctl_head_finish(head);
1858 break;
1859 }
1860 }
1861 return error;
1862}
1863
1864SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1865{
1866 struct __sysctl_args tmp;
1867 int error;
1868
1869 if (copy_from_user(&tmp, args, sizeof(tmp)))
1870 return -EFAULT;
1871
1872 error = deprecated_sysctl_warning(&tmp);
1873 if (error)
1874 goto out;
1875
1876 lock_kernel();
1877 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
1878 tmp.newval, tmp.newlen);
1879 unlock_kernel();
1880out:
1881 return error;
1882}
1883#endif /* CONFIG_SYSCTL_SYSCALL */
1884
1885/* 1612/*
1886 * sysctl_perm does NOT grant the superuser all rights automatically, because 1613 * sysctl_perm does NOT grant the superuser all rights automatically, because
1887 * some sysctl variables are readonly even to root. 1614 * some sysctl variables are readonly even to root.
@@ -1917,7 +1644,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1917 1644
1918static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) 1645static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1919{ 1646{
1920 for (; table->ctl_name || table->procname; table++) { 1647 for (; table->procname; table++) {
1921 table->parent = parent; 1648 table->parent = parent;
1922 if (table->child) 1649 if (table->child)
1923 sysctl_set_parent(table, table->child); 1650 sysctl_set_parent(table, table->child);
@@ -1949,11 +1676,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch,
1949 return NULL; 1676 return NULL;
1950 1677
1951 /* ... and nothing else */ 1678 /* ... and nothing else */
1952 if (branch[1].procname || branch[1].ctl_name) 1679 if (branch[1].procname)
1953 return NULL; 1680 return NULL;
1954 1681
1955 /* table should contain subdirectory with the same name */ 1682 /* table should contain subdirectory with the same name */
1956 for (p = table; p->procname || p->ctl_name; p++) { 1683 for (p = table; p->procname; p++) {
1957 if (!p->child) 1684 if (!p->child)
1958 continue; 1685 continue;
1959 if (p->procname && strcmp(p->procname, s) == 0) 1686 if (p->procname && strcmp(p->procname, s) == 0)
@@ -1998,9 +1725,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1998 * 1725 *
1999 * The members of the &struct ctl_table structure are used as follows: 1726 * The members of the &struct ctl_table structure are used as follows:
2000 * 1727 *
2001 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
2002 * must be unique within that level of sysctl
2003 *
2004 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not 1728 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
2005 * enter a sysctl file 1729 * enter a sysctl file
2006 * 1730 *
@@ -2015,8 +1739,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
2015 * 1739 *
2016 * proc_handler - the text handler routine (described below) 1740 * proc_handler - the text handler routine (described below)
2017 * 1741 *
2018 * strategy - the strategy routine (described below)
2019 *
2020 * de - for internal use by the sysctl routines 1742 * de - for internal use by the sysctl routines
2021 * 1743 *
2022 * extra1, extra2 - extra pointers usable by the proc handler routines 1744 * extra1, extra2 - extra pointers usable by the proc handler routines
@@ -2029,19 +1751,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
2029 * struct enable minimal validation of the values being written to be 1751 * struct enable minimal validation of the values being written to be
2030 * performed, and the mode field allows minimal authentication. 1752 * performed, and the mode field allows minimal authentication.
2031 * 1753 *
2032 * More sophisticated management can be enabled by the provision of a
2033 * strategy routine with the table entry. This will be called before
2034 * any automatic read or write of the data is performed.
2035 *
2036 * The strategy routine may return
2037 *
2038 * < 0 - Error occurred (error is passed to user process)
2039 *
2040 * 0 - OK - proceed with automatic read or write.
2041 *
2042 * > 0 - OK - read or write has been done by the strategy routine, so
2043 * return immediately.
2044 *
2045 * There must be a proc_handler routine for any terminal nodes 1754 * There must be a proc_handler routine for any terminal nodes
2046 * mirrored under /proc/sys (non-terminals are handled by a built-in 1755 * mirrored under /proc/sys (non-terminals are handled by a built-in
2047 * directory handler). Several default handlers are available to 1756 * directory handler). Several default handlers are available to
@@ -2068,13 +1777,13 @@ struct ctl_table_header *__register_sysctl_paths(
2068 struct ctl_table_set *set; 1777 struct ctl_table_set *set;
2069 1778
2070 /* Count the path components */ 1779 /* Count the path components */
2071 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) 1780 for (npath = 0; path[npath].procname; ++npath)
2072 ; 1781 ;
2073 1782
2074 /* 1783 /*
2075 * For each path component, allocate a 2-element ctl_table array. 1784 * For each path component, allocate a 2-element ctl_table array.
2076 * The first array element will be filled with the sysctl entry 1785 * The first array element will be filled with the sysctl entry
2077 * for this, the second will be the sentinel (ctl_name == 0). 1786 * for this, the second will be the sentinel (procname == 0).
2078 * 1787 *
2079 * We allocate everything in one go so that we don't have to 1788 * We allocate everything in one go so that we don't have to
2080 * worry about freeing additional memory in unregister_sysctl_table. 1789 * worry about freeing additional memory in unregister_sysctl_table.
@@ -2091,7 +1800,6 @@ struct ctl_table_header *__register_sysctl_paths(
2091 for (n = 0; n < npath; ++n, ++path) { 1800 for (n = 0; n < npath; ++n, ++path) {
2092 /* Copy the procname */ 1801 /* Copy the procname */
2093 new->procname = path->procname; 1802 new->procname = path->procname;
2094 new->ctl_name = path->ctl_name;
2095 new->mode = 0555; 1803 new->mode = 0555;
2096 1804
2097 *prevp = new; 1805 *prevp = new;
@@ -2953,286 +2661,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2953 2661
2954#endif /* CONFIG_PROC_FS */ 2662#endif /* CONFIG_PROC_FS */
2955 2663
2956
2957#ifdef CONFIG_SYSCTL_SYSCALL
2958/*
2959 * General sysctl support routines
2960 */
2961
2962/* The generic sysctl data routine (used if no strategy routine supplied) */
2963int sysctl_data(struct ctl_table *table,
2964 void __user *oldval, size_t __user *oldlenp,
2965 void __user *newval, size_t newlen)
2966{
2967 size_t len;
2968
2969 /* Get out of I don't have a variable */
2970 if (!table->data || !table->maxlen)
2971 return -ENOTDIR;
2972
2973 if (oldval && oldlenp) {
2974 if (get_user(len, oldlenp))
2975 return -EFAULT;
2976 if (len) {
2977 if (len > table->maxlen)
2978 len = table->maxlen;
2979 if (copy_to_user(oldval, table->data, len))
2980 return -EFAULT;
2981 if (put_user(len, oldlenp))
2982 return -EFAULT;
2983 }
2984 }
2985
2986 if (newval && newlen) {
2987 if (newlen > table->maxlen)
2988 newlen = table->maxlen;
2989
2990 if (copy_from_user(table->data, newval, newlen))
2991 return -EFAULT;
2992 }
2993 return 1;
2994}
2995
2996/* The generic string strategy routine: */
2997int sysctl_string(struct ctl_table *table,
2998 void __user *oldval, size_t __user *oldlenp,
2999 void __user *newval, size_t newlen)
3000{
3001 if (!table->data || !table->maxlen)
3002 return -ENOTDIR;
3003
3004 if (oldval && oldlenp) {
3005 size_t bufsize;
3006 if (get_user(bufsize, oldlenp))
3007 return -EFAULT;
3008 if (bufsize) {
3009 size_t len = strlen(table->data), copied;
3010
3011 /* This shouldn't trigger for a well-formed sysctl */
3012 if (len > table->maxlen)
3013 len = table->maxlen;
3014
3015 /* Copy up to a max of bufsize-1 bytes of the string */
3016 copied = (len >= bufsize) ? bufsize - 1 : len;
3017
3018 if (copy_to_user(oldval, table->data, copied) ||
3019 put_user(0, (char __user *)(oldval + copied)))
3020 return -EFAULT;
3021 if (put_user(len, oldlenp))
3022 return -EFAULT;
3023 }
3024 }
3025 if (newval && newlen) {
3026 size_t len = newlen;
3027 if (len > table->maxlen)
3028 len = table->maxlen;
3029 if(copy_from_user(table->data, newval, len))
3030 return -EFAULT;
3031 if (len == table->maxlen)
3032 len--;
3033 ((char *) table->data)[len] = 0;
3034 }
3035 return 1;
3036}
3037
3038/*
3039 * This function makes sure that all of the integers in the vector
3040 * are between the minimum and maximum values given in the arrays
3041 * table->extra1 and table->extra2, respectively.
3042 */
3043int sysctl_intvec(struct ctl_table *table,
3044 void __user *oldval, size_t __user *oldlenp,
3045 void __user *newval, size_t newlen)
3046{
3047
3048 if (newval && newlen) {
3049 int __user *vec = (int __user *) newval;
3050 int *min = (int *) table->extra1;
3051 int *max = (int *) table->extra2;
3052 size_t length;
3053 int i;
3054
3055 if (newlen % sizeof(int) != 0)
3056 return -EINVAL;
3057
3058 if (!table->extra1 && !table->extra2)
3059 return 0;
3060
3061 if (newlen > table->maxlen)
3062 newlen = table->maxlen;
3063 length = newlen / sizeof(int);
3064
3065 for (i = 0; i < length; i++) {
3066 int value;
3067 if (get_user(value, vec + i))
3068 return -EFAULT;
3069 if (min && value < min[i])
3070 return -EINVAL;
3071 if (max && value > max[i])
3072 return -EINVAL;
3073 }
3074 }
3075 return 0;
3076}
3077
3078/* Strategy function to convert jiffies to seconds */
3079int sysctl_jiffies(struct ctl_table *table,
3080 void __user *oldval, size_t __user *oldlenp,
3081 void __user *newval, size_t newlen)
3082{
3083 if (oldval && oldlenp) {
3084 size_t olen;
3085
3086 if (get_user(olen, oldlenp))
3087 return -EFAULT;
3088 if (olen) {
3089 int val;
3090
3091 if (olen < sizeof(int))
3092 return -EINVAL;
3093
3094 val = *(int *)(table->data) / HZ;
3095 if (put_user(val, (int __user *)oldval))
3096 return -EFAULT;
3097 if (put_user(sizeof(int), oldlenp))
3098 return -EFAULT;
3099 }
3100 }
3101 if (newval && newlen) {
3102 int new;
3103 if (newlen != sizeof(int))
3104 return -EINVAL;
3105 if (get_user(new, (int __user *)newval))
3106 return -EFAULT;
3107 *(int *)(table->data) = new*HZ;
3108 }
3109 return 1;
3110}
3111
3112/* Strategy function to convert jiffies to seconds */
3113int sysctl_ms_jiffies(struct ctl_table *table,
3114 void __user *oldval, size_t __user *oldlenp,
3115 void __user *newval, size_t newlen)
3116{
3117 if (oldval && oldlenp) {
3118 size_t olen;
3119
3120 if (get_user(olen, oldlenp))
3121 return -EFAULT;
3122 if (olen) {
3123 int val;
3124
3125 if (olen < sizeof(int))
3126 return -EINVAL;
3127
3128 val = jiffies_to_msecs(*(int *)(table->data));
3129 if (put_user(val, (int __user *)oldval))
3130 return -EFAULT;
3131 if (put_user(sizeof(int), oldlenp))
3132 return -EFAULT;
3133 }
3134 }
3135 if (newval && newlen) {
3136 int new;
3137 if (newlen != sizeof(int))
3138 return -EINVAL;
3139 if (get_user(new, (int __user *)newval))
3140 return -EFAULT;
3141 *(int *)(table->data) = msecs_to_jiffies(new);
3142 }
3143 return 1;
3144}
3145
3146
3147
3148#else /* CONFIG_SYSCTL_SYSCALL */
3149
3150
3151SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
3152{
3153 struct __sysctl_args tmp;
3154 int error;
3155
3156 if (copy_from_user(&tmp, args, sizeof(tmp)))
3157 return -EFAULT;
3158
3159 error = deprecated_sysctl_warning(&tmp);
3160
3161 /* If no error reading the parameters then just -ENOSYS ... */
3162 if (!error)
3163 error = -ENOSYS;
3164
3165 return error;
3166}
3167
3168int sysctl_data(struct ctl_table *table,
3169 void __user *oldval, size_t __user *oldlenp,
3170 void __user *newval, size_t newlen)
3171{
3172 return -ENOSYS;
3173}
3174
3175int sysctl_string(struct ctl_table *table,
3176 void __user *oldval, size_t __user *oldlenp,
3177 void __user *newval, size_t newlen)
3178{
3179 return -ENOSYS;
3180}
3181
3182int sysctl_intvec(struct ctl_table *table,
3183 void __user *oldval, size_t __user *oldlenp,
3184 void __user *newval, size_t newlen)
3185{
3186 return -ENOSYS;
3187}
3188
3189int sysctl_jiffies(struct ctl_table *table,
3190 void __user *oldval, size_t __user *oldlenp,
3191 void __user *newval, size_t newlen)
3192{
3193 return -ENOSYS;
3194}
3195
3196int sysctl_ms_jiffies(struct ctl_table *table,
3197 void __user *oldval, size_t __user *oldlenp,
3198 void __user *newval, size_t newlen)
3199{
3200 return -ENOSYS;
3201}
3202
3203#endif /* CONFIG_SYSCTL_SYSCALL */
3204
3205static int deprecated_sysctl_warning(struct __sysctl_args *args)
3206{
3207 static int msg_count;
3208 int name[CTL_MAXNAME];
3209 int i;
3210
3211 /* Check args->nlen. */
3212 if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
3213 return -ENOTDIR;
3214
3215 /* Read in the sysctl name for better debug message logging */
3216 for (i = 0; i < args->nlen; i++)
3217 if (get_user(name[i], args->name + i))
3218 return -EFAULT;
3219
3220 /* Ignore accesses to kernel.version */
3221 if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
3222 return 0;
3223
3224 if (msg_count < 5) {
3225 msg_count++;
3226 printk(KERN_INFO
3227 "warning: process `%s' used the deprecated sysctl "
3228 "system call with ", current->comm);
3229 for (i = 0; i < args->nlen; i++)
3230 printk("%d.", name[i]);
3231 printk("\n");
3232 }
3233 return 0;
3234}
3235
3236/* 2664/*
3237 * No sense putting this after each symbol definition, twice, 2665 * No sense putting this after each symbol definition, twice,
3238 * exception granted :-) 2666 * exception granted :-)
@@ -3247,9 +2675,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax);
3247EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2675EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3248EXPORT_SYMBOL(register_sysctl_table); 2676EXPORT_SYMBOL(register_sysctl_table);
3249EXPORT_SYMBOL(register_sysctl_paths); 2677EXPORT_SYMBOL(register_sysctl_paths);
3250EXPORT_SYMBOL(sysctl_intvec);
3251EXPORT_SYMBOL(sysctl_jiffies);
3252EXPORT_SYMBOL(sysctl_ms_jiffies);
3253EXPORT_SYMBOL(sysctl_string);
3254EXPORT_SYMBOL(sysctl_data);
3255EXPORT_SYMBOL(unregister_sysctl_table); 2678EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
new file mode 100644
index 000000000000..59030570f5ca
--- /dev/null
+++ b/kernel/sysctl_binary.c
@@ -0,0 +1,1541 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7#include <linux/syscalls.h>
8#include <linux/namei.h>
9#include <linux/mount.h>
10#include <linux/fs.h>
11#include <linux/nsproxy.h>
12#include <linux/pid_namespace.h>
13#include <linux/file.h>
14#include <linux/ctype.h>
15#include <linux/netdevice.h>
16#include <linux/slab.h>
17
18#ifdef CONFIG_SYSCTL_SYSCALL
19
/* Forward declaration; the full definition of struct bin_table follows below. */
20struct bin_table;
/*
 * Common function type of the value converters declared below: each takes a
 * struct file pointer plus old/new user buffers with their lengths and
 * returns an ssize_t.
 */
21typedef ssize_t bin_convert_t(struct file *file,
22 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
23
/* Converter functions, declared here through the typedef; their bodies are defined elsewhere in the file. */
24static bin_convert_t bin_dir;
25static bin_convert_t bin_string;
26static bin_convert_t bin_intvec;
27static bin_convert_t bin_ulongvec;
28static bin_convert_t bin_uuid;
29static bin_convert_t bin_dn_node_address;
30
/* Short aliases for the converters, used as the first field of every table row below. */
31#define CTL_DIR bin_dir
32#define CTL_STR bin_string
33#define CTL_INT bin_intvec
34#define CTL_ULONG bin_ulongvec
35#define CTL_UUID bin_uuid
36#define CTL_DNADR bin_dn_node_address
37
/* NOTE(review): BUFSZ is not used by any table below; presumably a scratch-buffer size for the converters - confirm. */
38#define BUFSZ 256
39
/*
 * One row of a translation table.  Rows are written positionally in the
 * tables below as { converter, numeric id, name [, child table] }, and every
 * table ends with an empty {} row (presumably the end marker for the lookup
 * code - confirm).
 */
40struct bin_table {
/* Converter for this row (one of the CTL_* aliases). */
41 bin_convert_t *convert;
/* Numeric sysctl id of the row. */
42 int ctl_name;
/* Textual name paired with the id; NULL in the unnamed rows of some tables. */
43 const char *procname;
/* Nested table; only set on CTL_DIR rows in the tables below. */
44 const struct bin_table *child;
45};
46
/* Rows of the "random" directory (child of the KERN_RANDOM row in bin_kern_table): four ints and two UUID values. */
47static const struct bin_table bin_random_table[] = {
48 { CTL_INT, RANDOM_POOLSIZE, "poolsize" },
49 { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" },
50 { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" },
51 { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
52 { CTL_UUID, RANDOM_BOOT_ID, "boot_id" },
53 { CTL_UUID, RANDOM_UUID, "uuid" },
54 {}
55};
56
/* Rows of the "pty" directory (child of the KERN_PTY row in bin_kern_table). */
57static const struct bin_table bin_pty_table[] = {
58 { CTL_INT, PTY_MAX, "max" },
59 { CTL_INT, PTY_NR, "nr" },
60 {}
61};
62
/*
 * KERN_* rows.  Ids that appear only in comments ("not used" / "no longer
 * used") deliberately have no row.  Nests the "random" and "pty" directories.
 */
63static const struct bin_table bin_kern_table[] = {
64 { CTL_STR, KERN_OSTYPE, "ostype" },
65 { CTL_STR, KERN_OSRELEASE, "osrelease" },
66 /* KERN_OSREV not used */
67 { CTL_STR, KERN_VERSION, "version" },
68 /* KERN_SECUREMASK not used */
69 /* KERN_PROF not used */
70 { CTL_STR, KERN_NODENAME, "hostname" },
71 { CTL_STR, KERN_DOMAINNAME, "domainname" },
72
73 { CTL_INT, KERN_PANIC, "panic" },
74 { CTL_INT, KERN_REALROOTDEV, "real-root-dev" },
75
76 { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" },
77 { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" },
78 { CTL_INT, KERN_PRINTK, "printk" },
79
80 /* KERN_NAMETRANS not used */
81 /* KERN_PPC_HTABRECLAIM not used */
82 /* KERN_PPC_ZEROPAGED not used */
83 { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
84
85 { CTL_STR, KERN_MODPROBE, "modprobe" },
86 { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" },
87 { CTL_INT, KERN_ACCT, "acct" },
88 /* KERN_PPC_L2CR "l2cr" no longer used */
89
90 /* KERN_RTSIGNR not used */
91 /* KERN_RTSIGMAX not used */
92
93 { CTL_ULONG, KERN_SHMMAX, "shmmax" },
94 { CTL_INT, KERN_MSGMAX, "msgmax" },
95 { CTL_INT, KERN_MSGMNB, "msgmnb" },
96 /* KERN_MSGPOOL not used*/
97 { CTL_INT, KERN_SYSRQ, "sysrq" },
98 { CTL_INT, KERN_MAX_THREADS, "threads-max" },
99 { CTL_DIR, KERN_RANDOM, "random", bin_random_table },
100 { CTL_ULONG, KERN_SHMALL, "shmall" },
101 { CTL_INT, KERN_MSGMNI, "msgmni" },
102 { CTL_INT, KERN_SEM, "sem" },
103 { CTL_INT, KERN_SPARC_STOP_A, "stop-a" },
104 { CTL_INT, KERN_SHMMNI, "shmmni" },
105
106 { CTL_INT, KERN_OVERFLOWUID, "overflowuid" },
107 { CTL_INT, KERN_OVERFLOWGID, "overflowgid" },
108
109 { CTL_STR, KERN_HOTPLUG, "hotplug", },
110 { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
111
112 { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
113 { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" },
114 /* KERN_TAINTED "tainted" no longer used */
115 { CTL_INT, KERN_CADPID, "cad_pid" },
116 { CTL_INT, KERN_PIDMAX, "pid_max" },
117 { CTL_STR, KERN_CORE_PATTERN, "core_pattern" },
118 { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" },
119 { CTL_INT, KERN_HPPA_PWRSW, "soft-power" },
120 { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" },
121
122 { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
123 { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
124
125 { CTL_DIR, KERN_PTY, "pty", bin_pty_table },
126 { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" },
127 { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
128 /* KERN_HZ_TIMER "hz_timer" no longer used */
129 { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
130 { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" },
131 { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" },
132
133 { CTL_INT, KERN_SPIN_RETRY, "spin_retry" },
134 /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
135 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
136 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
137 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
138 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
140 {}
141};
142
/* VM_* rows; every row uses the CTL_INT converter.  Ids named only in comments have no row. */
143static const struct bin_table bin_vm_table[] = {
144 { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
145 { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" },
146 { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
154 { CTL_INT, VM_SWAPPINESS, "swappiness" },
155 { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
156 { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" },
157 { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" },
158 { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" },
159 { CTL_INT, VM_BLOCK_DUMP, "block_dump" },
160 { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" },
161 { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
162 { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
163 /* VM_SWAP_TOKEN_TIMEOUT unused */
164 { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" },
165 { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
166 { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
167 { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" },
168 { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" },
169 { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" },
170 { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" },
171
172 {}
173};
174
/* NET_CORE_* rows; every row uses the CTL_INT converter.  Ids marked "unused" have no row. */
175static const struct bin_table bin_net_core_table[] = {
176 { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" },
177 { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" },
178 { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" },
179 { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" },
180 /* NET_CORE_DESTROY_DELAY unused */
181 { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
182 /* NET_CORE_FASTROUTE unused */
183 { CTL_INT, NET_CORE_MSG_COST, "message_cost" },
184 { CTL_INT, NET_CORE_MSG_BURST, "message_burst" },
185 { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" },
186 /* NET_CORE_HOT_LIST_LENGTH unused */
187 /* NET_CORE_DIVERT_VERSION unused */
188 /* NET_CORE_NO_CONG_THRESH unused */
189 /* NET_CORE_NO_CONG unused */
190 /* NET_CORE_LO_CONG unused */
191 /* NET_CORE_MOD_CONG unused */
192 { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" },
193 { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" },
194 { CTL_INT, NET_CORE_BUDGET, "netdev_budget" },
195 { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
196 { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
197 { CTL_INT, NET_CORE_WARNINGS, "warnings" },
198 {},
199};
200
/* NET_UNIX_* rows; only max_dgram_qlen has a row, the other ids are marked unused. */
201static const struct bin_table bin_net_unix_table[] = {
202 /* NET_UNIX_DESTROY_DELAY unused */
203 /* NET_UNIX_DELETE_DELAY unused */
204 { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
205 {}
206};
207
/* NET_IPV4_ROUTE_* rows of the IPv4 "route" directory (child of NET_IPV4_ROUTE in bin_net_ipv4_table). */
208static const struct bin_table bin_net_ipv4_route_table[] = {
209 { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" },
210 /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
211 /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
212 { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
213 { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
217 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
221 { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
222 { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
223 { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
224 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
226 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
227 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
228 {}
229};
230
/* NET_IPV4_CONF_* variables; this one table is the child of every row in bin_net_ipv4_conf_table. */
231static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
232 { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" },
233 { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
234
235 { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
236 { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
237 { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
238 { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
239 { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" },
240 { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
241 { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
242 { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
243 { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
244 { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
245 { CTL_INT, NET_IPV4_CONF_TAG, "tag" },
246 { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" },
247 { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
248 { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
249 { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
250 { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
251
252 { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
253 { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
254 { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
255 { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
256 {}
257};
258
/*
 * IPv4 "conf" directory: "all", "default" and one unnamed row, all pointing
 * at bin_net_ipv4_conf_vars_table.
 * NOTE(review): the row with id 0 and a NULL name is presumably a catch-all
 * for per-device directories - confirm against the lookup code.
 */
259static const struct bin_table bin_net_ipv4_conf_table[] = {
260 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table },
261 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table },
262 { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table },
263 {}
264};
265
/* NET_NEIGH_* variables; child of every row in bin_net_neigh_table.  Ids named only in comments have no row. */
266static const struct bin_table bin_net_neigh_vars_table[] = {
267 { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
268 { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
269 { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" },
270 /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
271 { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
272 { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
273 { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
274 { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" },
275 { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
276 /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
277 /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
278 /* NET_NEIGH_LOCKTIME "locktime" no longer used */
279 { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" },
280 { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" },
281 { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" },
282 { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" },
283 { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
284 { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
285 {}
286};
287
/*
 * "neigh" directory, referenced from both bin_net_ipv4_table and
 * bin_net_ipv6_table: "default" plus one unnamed row.
 * NOTE(review): the row with id 0 and a NULL name is presumably a catch-all
 * for per-device directories - confirm against the lookup code.
 */
288static const struct bin_table bin_net_neigh_table[] = {
289 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
290 { CTL_DIR, 0, NULL, bin_net_neigh_vars_table },
291 {}
292};
293
/*
 * NET_IPV4_NF_CONNTRACK_* rows of the IPv4 "netfilter" directory (child of
 * NET_IPV4_NETFILTER in bin_net_ipv4_table).  The timeout ids are listed in
 * comments only and have no row.
 */
294static const struct bin_table bin_net_ipv4_netfilter_table[] = {
295 { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
296
297 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
298 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
299 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
300 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
301 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */
302 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
303 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
304 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
305
306 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
307 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
308 /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
309 /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
310
311 { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
312 { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
313 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
314 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
315 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
316 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
317
318 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
319 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
320 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
321 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
322 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
323 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
324 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
325
326 { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
327 { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
328 {}
329};
330
/*
 * IPv4 rows: the "conf", "neigh", "route" and "netfilter" directories plus
 * the flat ip_*, tcp_*, igmp_*, inet_peer_*, cipso_*, icmp_* and ipfrag_*
 * values.  All values are CTL_INT except the two congestion-control strings.
 * Ids marked "unused" / "no longer used" have no row.
 */
331static const struct bin_table bin_net_ipv4_table[] = {
332 {CTL_INT, NET_IPV4_FORWARD, "ip_forward" },
333
334 { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table },
335 { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table },
336 { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table },
337 /* NET_IPV4_FIB_HASH unused */
338 { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table },
339
340 { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
341 { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
342 { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" },
343 { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
344 { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
345 /* NET_IPV4_AUTOCONFIG unused */
346 { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
347 { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
348 { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
349 { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
350 { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
351 { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
352 { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" },
353 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
354 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
355 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
356 { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
357 { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
358 { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
359 { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" },
360 { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
361 { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
362 { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" },
363 { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" },
364 { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
365 { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
366 { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
367 { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
368 { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
369 { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
370 { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
371 { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
372 { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
373 { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
374 { CTL_INT, NET_TCP_FACK, "tcp_fack" },
375 { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
376 { CTL_INT, NET_TCP_ECN, "tcp_ecn" },
377 { CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
378 { CTL_INT, NET_TCP_MEM, "tcp_mem" },
379 { CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
380 { CTL_INT, NET_TCP_RMEM, "tcp_rmem" },
381 { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" },
382 { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
383 { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" },
384 { CTL_INT, NET_TCP_FRTO, "tcp_frto" },
385 { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
386 { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" },
387 { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
388 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
389 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
390 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
391 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
392 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
393 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
394 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
395 { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
396 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
397 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
398 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
399 { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
400 { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
401 /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
402 { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
403 { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
404
405 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
406 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
407 { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
408 { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
409 { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
410 { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
411
412 { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
413 { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
414 { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
415
416 { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
417 /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
418
/* The id is written as a literal here; the inline comment names the constant it stands for. */
419 { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
420
421 /* NET_TCP_DEFAULT_WIN_SCALE unused */
422 /* NET_TCP_BIC_BETA unused */
423 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
424 /* NET_IPV4_IP_MASQ_DEBUG unused */
425 /* NET_TCP_SYN_TAILDROP unused */
426 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
427 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
428 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
429 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
430 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
431 /* NET_IPV4_ALWAYS_DEFRAG unused */
432 {}
433};
434
/* NET_IPX_* rows; only ipx_pprop_broadcasting has a row. */
435static const struct bin_table bin_net_ipx_table[] = {
436 { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
437 /* NET_IPX_FORWARDING unused */
438 {}
439};
440
/* NET_ATALK_AARP_* rows; every row uses the CTL_INT converter. */
441static const struct bin_table bin_net_atalk_table[] = {
442 { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
443 { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
444 { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
445 { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
446 {},
447};
448
/* NET_NETROM_* rows; every row uses the CTL_INT converter. */
449static const struct bin_table bin_net_netrom_table[] = {
450 { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
451 { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
452 { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
453 { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
454 { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
455 { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
456 { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
457 { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
458 { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
459 { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" },
460 { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
461 { CTL_INT, NET_NETROM_RESET, "reset" },
462 {}
463};
464
/* NET_AX25_* parameters; child of the single unnamed row in bin_net_ax25_table. */
465static const struct bin_table bin_net_ax25_param_table[] = {
466 { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
467 { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
468 { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" },
469 { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" },
470 { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" },
471 { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
472 { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" },
473 { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" },
474 { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" },
475 { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
476 { CTL_INT, NET_AX25_N2, "maximum_retry_count" },
477 { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" },
478 { CTL_INT, NET_AX25_PROTOCOL, "protocol" },
479 { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
480 {}
481};
482
/*
 * AX.25 directory: a single unnamed row pointing at bin_net_ax25_param_table.
 * NOTE(review): the row with id 0 and a NULL name is presumably a catch-all
 * for per-device directories - confirm against the lookup code.
 */
483static const struct bin_table bin_net_ax25_table[] = {
484 { CTL_DIR, 0, NULL, bin_net_ax25_param_table },
485 {}
486};
487
/* NET_ROSE_* rows; every row uses the CTL_INT converter. */
488static const struct bin_table bin_net_rose_table[] = {
489 { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
490 { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
491 { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
492 { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
493 { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
494 { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" },
495 { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
496 { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
497 { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" },
498 { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
499 {}
500};
501
/* NET_IPV6_* per-configuration variables; this one table is the child of every row in bin_net_ipv6_conf_table. */
502static const struct bin_table bin_net_ipv6_conf_var_table[] = {
503 { CTL_INT, NET_IPV6_FORWARDING, "forwarding" },
504 { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" },
505 { CTL_INT, NET_IPV6_MTU, "mtu" },
506 { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" },
507 { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
508 { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" },
509 { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
510 { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" },
511 { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
512 { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
513 { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
514 { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
515 { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
516 { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
517 { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
518 { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" },
519 { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
520 { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
521 { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
522 { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
523 { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
524 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
525 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
526 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
527 {}
528};
529
/*
 * IPv6 "conf" directory: "all", "default" and one unnamed row, all pointing
 * at bin_net_ipv6_conf_var_table.
 * NOTE(review): the row with id 0 and a NULL name is presumably a catch-all
 * for per-device directories - confirm against the lookup code.
 */
530static const struct bin_table bin_net_ipv6_conf_table[] = {
531 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table },
532 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table },
533 { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table },
534 {}
535};
536
/* NET_IPV6_ROUTE_* rows of the IPv6 "route" directory (child of NET_IPV6_ROUTE in bin_net_ipv6_table); "flush" has no row. */
537static const struct bin_table bin_net_ipv6_route_table[] = {
538 /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */
539 { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
540 { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
541 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
542 { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
543 { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
544 { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
545 { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
546 { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
547 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
548 {}
549};
550
/* Single-row IPv6 "icmp" directory (child of NET_IPV6_ICMP in bin_net_ipv6_table). */
551static const struct bin_table bin_net_ipv6_icmp_table[] = {
552 { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
553 {}
554};
555
/*
 * IPv6 rows: the "conf", "neigh", "route" and "icmp" directories plus flat
 * integer values.  The "neigh" row points at the same bin_net_neigh_table
 * that the IPv4 table uses.
 */
556static const struct bin_table bin_net_ipv6_table[] = {
557 { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
558 { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
559 { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
560 { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
561 { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
562 { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
563 { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
564 { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
565 { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
566 { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
/* Same literal id (2088) that bin_net_ipv4_table uses for "ip_queue_maxlen". */
567 { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip6_queue_maxlen" },
568 {}
569};
570
/* NET_X25_* rows; every row uses the CTL_INT converter. */
571static const struct bin_table bin_net_x25_table[] = {
572 { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
573 { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
574 { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
575 { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
576 { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
577 { CTL_INT, NET_X25_FORWARD, "x25_forward" },
578 {}
579};
580
/* Single-row NET_TR_* table. */
581static const struct bin_table bin_net_tr_table[] = {
582 { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
583 {}
584};
585
586
/* NET_DECNET_CONF_DEV_* variables; this one table is the child of every row in bin_net_decnet_conf. */
587static const struct bin_table bin_net_decnet_conf_vars[] = {
588 { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
589 { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
590 { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
591 { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
592 {}
593};
594
/*
 * DECnet "conf" directory: six named link-type rows plus one unnamed row,
 * all pointing at bin_net_decnet_conf_vars.
 * NOTE(review): the row with id 0 and a NULL name is presumably a catch-all
 * for other device directories - confirm against the lookup code.
 */
595static const struct bin_table bin_net_decnet_conf[] = {
596 { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
597 { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
598 { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
599 { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
600 { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
601 { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
602 { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
603 {}
604};
605
/*
 * NET_DECNET_* rows: the "conf" directory, two strings, integer values, and
 * "node_address", the only row in the visible tables that uses the
 * CTL_DNADR converter.
 */
606static const struct bin_table bin_net_decnet_table[] = {
607 { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
608 { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
609 { CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
610 { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
611 { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
612 { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
613 { CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
614 { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
615 { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
616 { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
617 { CTL_INT, NET_DECNET_MEM, "decnet_mem" },
618 { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
619 { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
620 { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
621 {}
622};
623
/* NET_SCTP_* rows; every row uses the CTL_INT converter. */
624static const struct bin_table bin_net_sctp_table[] = {
625 { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
626 { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
627 { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
628 { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
629 { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
630 { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
631 { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
632 { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
633 { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
634 { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
635 { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
636 { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
637 { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
638 { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
639 { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
640 { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
641 { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
642 {}
643};
644
/* NET_LLC2_*_TIMEOUT rows of the "timeout" directory (child of the row in bin_net_llc_llc2_table). */
645static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
646 { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
647 { CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
648 { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
649 { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
650 {}
651};
652
/* Single-row "station" directory (child of NET_LLC_STATION in bin_net_llc_table). */
653static const struct bin_table bin_net_llc_station_table[] = {
654 { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
655 {}
656};
657
/*
 * "llc2" directory: a single "timeout" subdirectory.
 * NOTE(review): the "timeout" row is keyed by NET_LLC2, the same constant
 * bin_net_llc_table uses for the parent "llc2" directory - confirm whether a
 * dedicated id for "timeout" was intended.
 */
658static const struct bin_table bin_net_llc_llc2_table[] = {
659 { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
660 {}
661};
662
663static const struct bin_table bin_net_llc_table[] = {
664 { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
665 { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
666 {}
667};
668
669static const struct bin_table bin_net_netfilter_table[] = {
670 { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
671 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
672 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
673 /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
674 /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
675 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
676 /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
677 /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
678 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
679 /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
680 /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
681 /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
682 /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
683 { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
684 { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
685 /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
686 { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
687 { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
688 { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
689 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
690 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
691 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
692 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
693 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
694 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
695 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
696 { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
697 /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
698 /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
699 { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
700 { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
701 { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
702
703 {}
704};
705
706static const struct bin_table bin_net_irda_table[] = {
707 { CTL_INT, NET_IRDA_DISCOVERY, "discovery" },
708 { CTL_STR, NET_IRDA_DEVNAME, "devname" },
709 { CTL_INT, NET_IRDA_DEBUG, "debug" },
710 { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" },
711 { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
712 { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
713 { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
714 { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
715 { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
716 { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
717 { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
718 { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
719 { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
720 { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
721 {}
722};
723
724static const struct bin_table bin_net_table[] = {
725 { CTL_DIR, NET_CORE, "core", bin_net_core_table },
726 /* NET_ETHER not used */
727 /* NET_802 not used */
728 { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
729 { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
730 { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
731 { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
732 { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
733 { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
734 /* NET_BRIDGE "bridge" no longer used */
735 { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
736 { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
737 { CTL_DIR, NET_X25, "x25", bin_net_x25_table },
738 { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
739 { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
740 /* NET_ECONET not used */
741 { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
742 { CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
743 { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
744 /* NET_DCCP "dccp" no longer used */
745 { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table },
746 { CTL_INT, 2089, "nf_conntrack_max" },
747 {}
748};
749
750static const struct bin_table bin_fs_quota_table[] = {
751 { CTL_INT, FS_DQ_LOOKUPS, "lookups" },
752 { CTL_INT, FS_DQ_DROPS, "drops" },
753 { CTL_INT, FS_DQ_READS, "reads" },
754 { CTL_INT, FS_DQ_WRITES, "writes" },
755 { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
756 { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
757 { CTL_INT, FS_DQ_FREE, "free_dquots" },
758 { CTL_INT, FS_DQ_SYNCS, "syncs" },
759 { CTL_INT, FS_DQ_WARNINGS, "warnings" },
760 {}
761};
762
763static const struct bin_table bin_fs_xfs_table[] = {
764 { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
765 { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
766 { CTL_INT, XFS_PANIC_MASK, "panic_mask" },
767
768 { CTL_INT, XFS_ERRLEVEL, "error_level" },
769 { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
770 { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
771 { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
772 { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
773 { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
774 { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
775 { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
776 { CTL_INT, XFS_ROTORSTEP, "rotorstep" },
777 { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
778 { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
779 { CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
780 {}
781};
782
783static const struct bin_table bin_fs_ocfs2_nm_table[] = {
784 { CTL_STR, 1, "hb_ctl_path" },
785 {}
786};
787
788static const struct bin_table bin_fs_ocfs2_table[] = {
789 { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
790 {}
791};
792
793static const struct bin_table bin_inotify_table[] = {
794 { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
795 { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
796 { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
797 {}
798};
799
800static const struct bin_table bin_fs_table[] = {
801 { CTL_INT, FS_NRINODE, "inode-nr" },
802 { CTL_INT, FS_STATINODE, "inode-state" },
803 /* FS_MAXINODE unused */
804 /* FS_NRDQUOT unused */
805 /* FS_MAXDQUOT unused */
806 /* FS_NRFILE "file-nr" no longer used */
807 { CTL_INT, FS_MAXFILE, "file-max" },
808 { CTL_INT, FS_DENTRY, "dentry-state" },
809 /* FS_NRSUPER unused */
810 /* FS_MAXUPSER unused */
811 { CTL_INT, FS_OVERFLOWUID, "overflowuid" },
812 { CTL_INT, FS_OVERFLOWGID, "overflowgid" },
813 { CTL_INT, FS_LEASES, "leases-enable" },
814 { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
815 { CTL_INT, FS_LEASE_TIME, "lease-break-time" },
816 { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
817 { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
818 { CTL_ULONG, FS_AIO_NR, "aio-nr" },
819 { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
820 { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
821 { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
822 { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
823 {}
824};
825
826static const struct bin_table bin_ipmi_table[] = {
827 { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
828 {}
829};
830
831static const struct bin_table bin_mac_hid_files[] = {
832 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
833 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
834 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
835 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
836 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
837 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
838 {}
839};
840
841static const struct bin_table bin_raid_table[] = {
842 { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
843 { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
844 {}
845};
846
847static const struct bin_table bin_scsi_table[] = {
848 { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
849 {}
850};
851
852static const struct bin_table bin_dev_table[] = {
853 /* DEV_CDROM "cdrom" no longer used */
854 /* DEV_HWMON unused */
855 /* DEV_PARPORT "parport" no longer used */
856 { CTL_DIR, DEV_RAID, "raid", bin_raid_table },
857 { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
858 { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
859 { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
860 {}
861};
862
863static const struct bin_table bin_bus_isa_table[] = {
864 { CTL_INT, BUS_ISA_MEM_BASE, "membase" },
865 { CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
866 { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
867 {}
868};
869
870static const struct bin_table bin_bus_table[] = {
871 { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
872 {}
873};
874
875
876static const struct bin_table bin_s390dbf_table[] = {
877 { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
878 { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
879 {}
880};
881
882static const struct bin_table bin_sunrpc_table[] = {
883 /* CTL_RPCDEBUG "rpc_debug" no longer used */
884 /* CTL_NFSDEBUG "nfs_debug" no longer used */
885 /* CTL_NFSDDEBUG "nfsd_debug" no longer used */
886 /* CTL_NLMDEBUG "nlm_debug" no longer used */
887
888 { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
889 { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
890 { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
891 { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
892 {}
893};
894
895static const struct bin_table bin_pm_table[] = {
896 /* frv specific */
897 /* 1 == CTL_PM_SUSPEND "suspend" no longer used */
898 { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
899 { CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
900 { CTL_INT, 4 /* CTL_PM_CM */, "cm" },
901 {}
902};
903
904static const struct bin_table bin_root_table[] = {
905 { CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
906 { CTL_DIR, CTL_VM, "vm", bin_vm_table },
907 { CTL_DIR, CTL_NET, "net", bin_net_table },
908 /* CTL_PROC not used */
909 { CTL_DIR, CTL_FS, "fs", bin_fs_table },
910 /* CTL_DEBUG "debug" no longer used */
911 { CTL_DIR, CTL_DEV, "dev", bin_dev_table },
912 { CTL_DIR, CTL_BUS, "bus", bin_bus_table },
913 { CTL_DIR, CTL_ABI, "abi" },
914 /* CTL_CPU not used */
915 /* CTL_ARLAN "arlan" no longer used */
916 { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
917 { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
918 { CTL_DIR, CTL_PM, "pm", bin_pm_table },
919 {}
920};
921
922static ssize_t bin_dir(struct file *file,
923 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
924{
925 return -ENOTDIR;
926}
927
928
929static ssize_t bin_string(struct file *file,
930 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
931{
932 ssize_t result, copied = 0;
933
934 if (oldval && oldlen) {
935 char __user *lastp;
936 loff_t pos = 0;
937 int ch;
938
939 result = vfs_read(file, oldval, oldlen, &pos);
940 if (result < 0)
941 goto out;
942
943 copied = result;
944 lastp = oldval + copied - 1;
945
946 result = -EFAULT;
947 if (get_user(ch, lastp))
948 goto out;
949
950 /* Trim off the trailing newline */
951 if (ch == '\n') {
952 result = -EFAULT;
953 if (put_user('\0', lastp))
954 goto out;
955 copied -= 1;
956 }
957 }
958
959 if (newval && newlen) {
960 loff_t pos = 0;
961
962 result = vfs_write(file, newval, newlen, &pos);
963 if (result < 0)
964 goto out;
965 }
966
967 result = copied;
968out:
969 return result;
970}
971
972static ssize_t bin_intvec(struct file *file,
973 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
974{
975 mm_segment_t old_fs = get_fs();
976 ssize_t copied = 0;
977 char *buffer;
978 ssize_t result;
979
980 result = -ENOMEM;
981 buffer = kmalloc(BUFSZ, GFP_KERNEL);
982 if (!buffer)
983 goto out;
984
985 if (oldval && oldlen) {
986 unsigned __user *vec = oldval;
987 size_t length = oldlen / sizeof(*vec);
988 loff_t pos = 0;
989 char *str, *end;
990 int i;
991
992 set_fs(KERNEL_DS);
993 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
994 set_fs(old_fs);
995 if (result < 0)
996 goto out_kfree;
997
998 str = buffer;
999 end = str + result;
1000 *end++ = '\0';
1001 for (i = 0; i < length; i++) {
1002 unsigned long value;
1003
1004 value = simple_strtoul(str, &str, 10);
1005 while (isspace(*str))
1006 str++;
1007
1008 result = -EFAULT;
1009 if (put_user(value, vec + i))
1010 goto out_kfree;
1011
1012 copied += sizeof(*vec);
1013 if (!isdigit(*str))
1014 break;
1015 }
1016 }
1017
1018 if (newval && newlen) {
1019 unsigned __user *vec = newval;
1020 size_t length = newlen / sizeof(*vec);
1021 loff_t pos = 0;
1022 char *str, *end;
1023 int i;
1024
1025 str = buffer;
1026 end = str + BUFSZ;
1027 for (i = 0; i < length; i++) {
1028 unsigned long value;
1029
1030 result = -EFAULT;
1031 if (get_user(value, vec + i))
1032 goto out_kfree;
1033
1034 str += snprintf(str, end - str, "%lu\t", value);
1035 }
1036
1037 set_fs(KERNEL_DS);
1038 result = vfs_write(file, buffer, str - buffer, &pos);
1039 set_fs(old_fs);
1040 if (result < 0)
1041 goto out_kfree;
1042 }
1043 result = copied;
1044out_kfree:
1045 kfree(buffer);
1046out:
1047 return result;
1048}
1049
1050static ssize_t bin_ulongvec(struct file *file,
1051 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1052{
1053 mm_segment_t old_fs = get_fs();
1054 ssize_t copied = 0;
1055 char *buffer;
1056 ssize_t result;
1057
1058 result = -ENOMEM;
1059 buffer = kmalloc(BUFSZ, GFP_KERNEL);
1060 if (!buffer)
1061 goto out;
1062
1063 if (oldval && oldlen) {
1064 unsigned long __user *vec = oldval;
1065 size_t length = oldlen / sizeof(*vec);
1066 loff_t pos = 0;
1067 char *str, *end;
1068 int i;
1069
1070 set_fs(KERNEL_DS);
1071 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1072 set_fs(old_fs);
1073 if (result < 0)
1074 goto out_kfree;
1075
1076 str = buffer;
1077 end = str + result;
1078 *end++ = '\0';
1079 for (i = 0; i < length; i++) {
1080 unsigned long value;
1081
1082 value = simple_strtoul(str, &str, 10);
1083 while (isspace(*str))
1084 str++;
1085
1086 result = -EFAULT;
1087 if (put_user(value, vec + i))
1088 goto out_kfree;
1089
1090 copied += sizeof(*vec);
1091 if (!isdigit(*str))
1092 break;
1093 }
1094 }
1095
1096 if (newval && newlen) {
1097 unsigned long __user *vec = newval;
1098 size_t length = newlen / sizeof(*vec);
1099 loff_t pos = 0;
1100 char *str, *end;
1101 int i;
1102
1103 str = buffer;
1104 end = str + BUFSZ;
1105 for (i = 0; i < length; i++) {
1106 unsigned long value;
1107
1108 result = -EFAULT;
1109 if (get_user(value, vec + i))
1110 goto out_kfree;
1111
1112 str += snprintf(str, end - str, "%lu\t", value);
1113 }
1114
1115 set_fs(KERNEL_DS);
1116 result = vfs_write(file, buffer, str - buffer, &pos);
1117 set_fs(old_fs);
1118 if (result < 0)
1119 goto out_kfree;
1120 }
1121 result = copied;
1122out_kfree:
1123 kfree(buffer);
1124out:
1125 return result;
1126}
1127
/*
 * Numeric value of one hexadecimal digit character.
 *
 * Decimal digits map to 0..9; for anything else the character is
 * folded to lower case (by setting bit 0x20) and measured from 'a',
 * so 'a'/'A' yield 10 and 'f'/'F' yield 15.  No validation is done
 * here; callers are expected to have checked the character already.
 */
static unsigned hex_value(int ch)
{
	if (isdigit(ch))
		return ch - '0';

	return ((ch | 0x20) - 'a') + 10;
}
1132
1133static ssize_t bin_uuid(struct file *file,
1134 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1135{
1136 mm_segment_t old_fs = get_fs();
1137 ssize_t result, copied = 0;
1138
1139 /* Only supports reads */
1140 if (oldval && oldlen) {
1141 loff_t pos = 0;
1142 char buf[40], *str = buf;
1143 unsigned char uuid[16];
1144 int i;
1145
1146 set_fs(KERNEL_DS);
1147 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1148 set_fs(old_fs);
1149 if (result < 0)
1150 goto out;
1151
1152 buf[result] = '\0';
1153
1154 /* Convert the uuid to from a string to binary */
1155 for (i = 0; i < 16; i++) {
1156 result = -EIO;
1157 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1158 goto out;
1159
1160 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]);
1161 str += 2;
1162 if (*str == '-')
1163 str++;
1164 }
1165
1166 if (oldlen > 16)
1167 oldlen = 16;
1168
1169 result = -EFAULT;
1170 if (copy_to_user(oldval, uuid, oldlen))
1171 goto out;
1172
1173 copied = oldlen;
1174 }
1175 result = copied;
1176out:
1177 return result;
1178}
1179
1180static ssize_t bin_dn_node_address(struct file *file,
1181 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1182{
1183 mm_segment_t old_fs = get_fs();
1184 ssize_t result, copied = 0;
1185
1186 if (oldval && oldlen) {
1187 loff_t pos = 0;
1188 char buf[15], *nodep;
1189 unsigned long area, node;
1190 __le16 dnaddr;
1191
1192 set_fs(KERNEL_DS);
1193 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1194 set_fs(old_fs);
1195 if (result < 0)
1196 goto out;
1197
1198 buf[result] = '\0';
1199
1200 /* Convert the decnet addresss to binary */
1201 result = -EIO;
1202 nodep = strchr(buf, '.') + 1;
1203 if (!nodep)
1204 goto out;
1205
1206 area = simple_strtoul(buf, NULL, 10);
1207 node = simple_strtoul(nodep, NULL, 10);
1208
1209 result = -EIO;
1210 if ((area > 63)||(node > 1023))
1211 goto out;
1212
1213 dnaddr = cpu_to_le16((area << 10) | node);
1214
1215 result = -EFAULT;
1216 if (put_user(dnaddr, (__le16 __user *)oldval))
1217 goto out;
1218
1219 copied = sizeof(dnaddr);
1220 }
1221
1222 if (newval && newlen) {
1223 loff_t pos = 0;
1224 __le16 dnaddr;
1225 char buf[15];
1226 int len;
1227
1228 result = -EINVAL;
1229 if (newlen != sizeof(dnaddr))
1230 goto out;
1231
1232 result = -EFAULT;
1233 if (get_user(dnaddr, (__le16 __user *)newval))
1234 goto out;
1235
1236 len = snprintf(buf, sizeof(buf), "%hu.%hu",
1237 le16_to_cpu(dnaddr) >> 10,
1238 le16_to_cpu(dnaddr) & 0x3ff);
1239
1240 set_fs(KERNEL_DS);
1241 result = vfs_write(file, buf, len, &pos);
1242 set_fs(old_fs);
1243 if (result < 0)
1244 goto out;
1245 }
1246
1247 result = copied;
1248out:
1249 return result;
1250}
1251
1252static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
1253{
1254 const struct bin_table *table = &bin_root_table[0];
1255 int ctl_name;
1256
1257 /* The binary sysctl tables have a small maximum depth so
1258 * there is no danger of overflowing our path as it PATH_MAX
1259 * bytes long.
1260 */
1261 memcpy(path, "sys/", 4);
1262 path += 4;
1263
1264repeat:
1265 if (!nlen)
1266 return ERR_PTR(-ENOTDIR);
1267 ctl_name = *name;
1268 name++;
1269 nlen--;
1270 for ( ; table->convert; table++) {
1271 int len = 0;
1272
1273 /*
1274 * For a wild card entry map from ifindex to network
1275 * device name.
1276 */
1277 if (!table->ctl_name) {
1278#ifdef CONFIG_NET
1279 struct net *net = current->nsproxy->net_ns;
1280 struct net_device *dev;
1281 dev = dev_get_by_index(net, ctl_name);
1282 if (dev) {
1283 len = strlen(dev->name);
1284 memcpy(path, dev->name, len);
1285 dev_put(dev);
1286 }
1287#endif
1288 /* Use the well known sysctl number to proc name mapping */
1289 } else if (ctl_name == table->ctl_name) {
1290 len = strlen(table->procname);
1291 memcpy(path, table->procname, len);
1292 }
1293 if (len) {
1294 path += len;
1295 if (table->child) {
1296 *path++ = '/';
1297 table = table->child;
1298 goto repeat;
1299 }
1300 *path = '\0';
1301 return table;
1302 }
1303 }
1304 return ERR_PTR(-ENOTDIR);
1305}
1306
1307static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
1308{
1309 char *tmp, *result;
1310
1311 result = ERR_PTR(-ENOMEM);
1312 tmp = __getname();
1313 if (tmp) {
1314 const struct bin_table *table = get_sysctl(name, nlen, tmp);
1315 result = tmp;
1316 *tablep = table;
1317 if (IS_ERR(table)) {
1318 __putname(tmp);
1319 result = ERR_CAST(table);
1320 }
1321 }
1322 return result;
1323}
1324
1325static ssize_t binary_sysctl(const int *name, int nlen,
1326 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1327{
1328 const struct bin_table *table = NULL;
1329 struct nameidata nd;
1330 struct vfsmount *mnt;
1331 struct file *file;
1332 ssize_t result;
1333 char *pathname;
1334 int flags;
1335 int acc_mode;
1336
1337 pathname = sysctl_getname(name, nlen, &table);
1338 result = PTR_ERR(pathname);
1339 if (IS_ERR(pathname))
1340 goto out;
1341
1342 /* How should the sysctl be accessed? */
1343 if (oldval && oldlen && newval && newlen) {
1344 flags = O_RDWR;
1345 acc_mode = MAY_READ | MAY_WRITE;
1346 } else if (newval && newlen) {
1347 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE;
1349 } else if (oldval && oldlen) {
1350 flags = O_RDONLY;
1351 acc_mode = MAY_READ;
1352 } else {
1353 result = 0;
1354 goto out_putname;
1355 }
1356
1357 mnt = current->nsproxy->pid_ns->proc_mnt;
1358 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
1359 if (result)
1360 goto out_putname;
1361
1362 result = may_open(&nd.path, acc_mode, flags);
1363 if (result)
1364 goto out_putpath;
1365
1366 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1367 result = PTR_ERR(file);
1368 if (IS_ERR(file))
1369 goto out_putname;
1370
1371 result = table->convert(file, oldval, oldlen, newval, newlen);
1372
1373 fput(file);
1374out_putname:
1375 putname(pathname);
1376out:
1377 return result;
1378
1379out_putpath:
1380 path_put(&nd.path);
1381 goto out_putname;
1382}
1383
1384
1385#else /* CONFIG_SYSCTL_SYSCALL */
1386
1387static ssize_t binary_sysctl(const int *name, int nlen,
1388 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1389{
1390 return -ENOSYS;
1391}
1392
1393#endif /* CONFIG_SYSCTL_SYSCALL */
1394
1395
1396static void deprecated_sysctl_warning(const int *name, int nlen)
1397{
1398 int i;
1399
1400 /*
1401 * CTL_KERN/KERN_VERSION is used by older glibc and cannot
1402 * ever go away.
1403 */
1404 if (name[0] == CTL_KERN && name[1] == KERN_VERSION)
1405 return;
1406
1407 if (printk_ratelimit()) {
1408 printk(KERN_INFO
1409 "warning: process `%s' used the deprecated sysctl "
1410 "system call with ", current->comm);
1411 for (i = 0; i < nlen; i++)
1412 printk("%d.", name[i]);
1413 printk("\n");
1414 }
1415 return;
1416}
1417
1418#define WARN_ONCE_HASH_BITS 8
1419#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
1420
1421static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
1422
1423#define FNV32_OFFSET 2166136261U
1424#define FNV32_PRIME 0x01000193
1425
1426/*
1427 * Print each legacy sysctl (approximately) only once.
1428 * To avoid making the tables non-const use a external
1429 * hash-table instead.
1430 * Worst case hash collision: 6, but very rarely.
1431 * NOTE! We don't use the SMP-safe bit tests. We simply
1432 * don't care enough.
1433 */
1434static void warn_on_bintable(const int *name, int nlen)
1435{
1436 int i;
1437 u32 hash = FNV32_OFFSET;
1438
1439 for (i = 0; i < nlen; i++)
1440 hash = (hash ^ name[i]) * FNV32_PRIME;
1441 hash %= WARN_ONCE_HASH_SIZE;
1442 if (__test_and_set_bit(hash, warn_once_bitmap))
1443 return;
1444 deprecated_sysctl_warning(name, nlen);
1445}
1446
1447static ssize_t do_sysctl(int __user *args_name, int nlen,
1448 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1449{
1450 int name[CTL_MAXNAME];
1451 int i;
1452
1453 /* Check args->nlen. */
1454 if (nlen < 0 || nlen > CTL_MAXNAME)
1455 return -ENOTDIR;
1456 /* Read in the sysctl name for simplicity */
1457 for (i = 0; i < nlen; i++)
1458 if (get_user(name[i], args_name + i))
1459 return -EFAULT;
1460
1461 warn_on_bintable(name, nlen);
1462
1463 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1464}
1465
1466SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1467{
1468 struct __sysctl_args tmp;
1469 size_t oldlen = 0;
1470 ssize_t result;
1471
1472 if (copy_from_user(&tmp, args, sizeof(tmp)))
1473 return -EFAULT;
1474
1475 if (tmp.oldval && !tmp.oldlenp)
1476 return -EFAULT;
1477
1478 if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
1479 return -EFAULT;
1480
1481 result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
1482 tmp.newval, tmp.newlen);
1483
1484 if (result >= 0) {
1485 oldlen = result;
1486 result = 0;
1487 }
1488
1489 if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
1490 return -EFAULT;
1491
1492 return result;
1493}
1494
1495
1496#ifdef CONFIG_COMPAT
1497#include <asm/compat.h>
1498
1499struct compat_sysctl_args {
1500 compat_uptr_t name;
1501 int nlen;
1502 compat_uptr_t oldval;
1503 compat_uptr_t oldlenp;
1504 compat_uptr_t newval;
1505 compat_size_t newlen;
1506 compat_ulong_t __unused[4];
1507};
1508
1509asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
1510{
1511 struct compat_sysctl_args tmp;
1512 compat_size_t __user *compat_oldlenp;
1513 size_t oldlen = 0;
1514 ssize_t result;
1515
1516 if (copy_from_user(&tmp, args, sizeof(tmp)))
1517 return -EFAULT;
1518
1519 if (tmp.oldval && !tmp.oldlenp)
1520 return -EFAULT;
1521
1522 compat_oldlenp = compat_ptr(tmp.oldlenp);
1523 if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
1524 return -EFAULT;
1525
1526 result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
1527 compat_ptr(tmp.oldval), oldlen,
1528 compat_ptr(tmp.newval), tmp.newlen);
1529
1530 if (result >= 0) {
1531 oldlen = result;
1532 result = 0;
1533 }
1534
1535 if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
1536 return -EFAULT;
1537
1538 return result;
1539}
1540
1541#endif /* CONFIG_COMPAT */
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b6e7aaea4604..04cdcf72c827 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -5,1239 +5,6 @@
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
7 7
8struct trans_ctl_table {
9 int ctl_name;
10 const char *procname;
11 const struct trans_ctl_table *child;
12};
13
14static const struct trans_ctl_table trans_random_table[] = {
15 { RANDOM_POOLSIZE, "poolsize" },
16 { RANDOM_ENTROPY_COUNT, "entropy_avail" },
17 { RANDOM_READ_THRESH, "read_wakeup_threshold" },
18 { RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
19 { RANDOM_BOOT_ID, "boot_id" },
20 { RANDOM_UUID, "uuid" },
21 {}
22};
23
24static const struct trans_ctl_table trans_pty_table[] = {
25 { PTY_MAX, "max" },
26 { PTY_NR, "nr" },
27 {}
28};
29
30static const struct trans_ctl_table trans_kern_table[] = {
31 { KERN_OSTYPE, "ostype" },
32 { KERN_OSRELEASE, "osrelease" },
33 /* KERN_OSREV not used */
34 { KERN_VERSION, "version" },
35 /* KERN_SECUREMASK not used */
36 /* KERN_PROF not used */
37 { KERN_NODENAME, "hostname" },
38 { KERN_DOMAINNAME, "domainname" },
39
40 { KERN_PANIC, "panic" },
41 { KERN_REALROOTDEV, "real-root-dev" },
42
43 { KERN_SPARC_REBOOT, "reboot-cmd" },
44 { KERN_CTLALTDEL, "ctrl-alt-del" },
45 { KERN_PRINTK, "printk" },
46
47 /* KERN_NAMETRANS not used */
48 /* KERN_PPC_HTABRECLAIM not used */
49 /* KERN_PPC_ZEROPAGED not used */
50 { KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
51
52 { KERN_MODPROBE, "modprobe" },
53 { KERN_SG_BIG_BUFF, "sg-big-buff" },
54 { KERN_ACCT, "acct" },
55 { KERN_PPC_L2CR, "l2cr" },
56
57 /* KERN_RTSIGNR not used */
58 /* KERN_RTSIGMAX not used */
59
60 { KERN_SHMMAX, "shmmax" },
61 { KERN_MSGMAX, "msgmax" },
62 { KERN_MSGMNB, "msgmnb" },
63 /* KERN_MSGPOOL not used*/
64 { KERN_SYSRQ, "sysrq" },
65 { KERN_MAX_THREADS, "threads-max" },
66 { KERN_RANDOM, "random", trans_random_table },
67 { KERN_SHMALL, "shmall" },
68 { KERN_MSGMNI, "msgmni" },
69 { KERN_SEM, "sem" },
70 { KERN_SPARC_STOP_A, "stop-a" },
71 { KERN_SHMMNI, "shmmni" },
72
73 { KERN_OVERFLOWUID, "overflowuid" },
74 { KERN_OVERFLOWGID, "overflowgid" },
75
76 { KERN_HOTPLUG, "hotplug", },
77 { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
78
79 { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
80 { KERN_CORE_USES_PID, "core_uses_pid" },
81 { KERN_TAINTED, "tainted" },
82 { KERN_CADPID, "cad_pid" },
83 { KERN_PIDMAX, "pid_max" },
84 { KERN_CORE_PATTERN, "core_pattern" },
85 { KERN_PANIC_ON_OOPS, "panic_on_oops" },
86 { KERN_HPPA_PWRSW, "soft-power" },
87 { KERN_HPPA_UNALIGNED, "unaligned-trap" },
88
89 { KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
90 { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
91
92 { KERN_PTY, "pty", trans_pty_table },
93 { KERN_NGROUPS_MAX, "ngroups_max" },
94 { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
95 { KERN_HZ_TIMER, "hz_timer" },
96 { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
97 { KERN_BOOTLOADER_TYPE, "bootloader_type" },
98 { KERN_RANDOMIZE, "randomize_va_space" },
99
100 { KERN_SPIN_RETRY, "spin_retry" },
101 { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" },
102 { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
103 { KERN_COMPAT_LOG, "compat-log" },
104 { KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
105 { KERN_NMI_WATCHDOG, "nmi_watchdog" },
106 { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
107 {}
108};
109
110static const struct trans_ctl_table trans_vm_table[] = {
111 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
112 { VM_PAGE_CLUSTER, "page-cluster" },
113 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
114 { VM_DIRTY_RATIO, "dirty_ratio" },
115 { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
116 { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" },
117 { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
118 { VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
119 /* VM_PAGEBUF unused */
120 { VM_HUGETLB_PAGES, "nr_hugepages" },
121 { VM_SWAPPINESS, "swappiness" },
122 { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
123 { VM_MIN_FREE_KBYTES, "min_free_kbytes" },
124 { VM_MAX_MAP_COUNT, "max_map_count" },
125 { VM_LAPTOP_MODE, "laptop_mode" },
126 { VM_BLOCK_DUMP, "block_dump" },
127 { VM_HUGETLB_GROUP, "hugetlb_shm_group" },
128 { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
129 { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
130 /* VM_SWAP_TOKEN_TIMEOUT unused */
131 { VM_DROP_PAGECACHE, "drop_caches" },
132 { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
133 { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
134 { VM_MIN_UNMAPPED, "min_unmapped_ratio" },
135 { VM_PANIC_ON_OOM, "panic_on_oom" },
136 { VM_VDSO_ENABLED, "vdso_enabled" },
137 { VM_MIN_SLAB, "min_slab_ratio" },
138
139 {}
140};
141
142static const struct trans_ctl_table trans_net_core_table[] = {
143 { NET_CORE_WMEM_MAX, "wmem_max" },
144 { NET_CORE_RMEM_MAX, "rmem_max" },
145 { NET_CORE_WMEM_DEFAULT, "wmem_default" },
146 { NET_CORE_RMEM_DEFAULT, "rmem_default" },
147 /* NET_CORE_DESTROY_DELAY unused */
148 { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
149 /* NET_CORE_FASTROUTE unused */
150 { NET_CORE_MSG_COST, "message_cost" },
151 { NET_CORE_MSG_BURST, "message_burst" },
152 { NET_CORE_OPTMEM_MAX, "optmem_max" },
153 /* NET_CORE_HOT_LIST_LENGTH unused */
154 /* NET_CORE_DIVERT_VERSION unused */
155 /* NET_CORE_NO_CONG_THRESH unused */
156 /* NET_CORE_NO_CONG unused */
157 /* NET_CORE_LO_CONG unused */
158 /* NET_CORE_MOD_CONG unused */
159 { NET_CORE_DEV_WEIGHT, "dev_weight" },
160 { NET_CORE_SOMAXCONN, "somaxconn" },
161 { NET_CORE_BUDGET, "netdev_budget" },
162 { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
163 { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
164 { NET_CORE_WARNINGS, "warnings" },
165 {},
166};
167
/*
 * Name table for the NET_UNIX_* ids; referenced as the "unix" child of
 * trans_net_table. Only one id has an entry; the two "unused" ids are
 * intentionally omitted. Ends with an empty entry (presumably the
 * terminator).
 */
168static const struct trans_ctl_table trans_net_unix_table[] = {
169 /* NET_UNIX_DESTROY_DELAY unused */
170 /* NET_UNIX_DELETE_DELAY unused */
171 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
172 {}
173};
174
/*
 * Name table for the NET_IPV4_ROUTE_* ids; referenced as the "route" child
 * of trans_net_ipv4_table. Each entry pairs one id with its name string.
 * Ends with an empty entry (presumably the terminator).
 */
175static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
176 { NET_IPV4_ROUTE_FLUSH, "flush" },
177 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
178 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
179 { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
180 { NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
181 { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
182 { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
183 { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
184 { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
185 { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
186 { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
187 { NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
188 { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
189 { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
190 { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
191 { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
192 { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
193 { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
194 { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
195 {}
196};
197
/*
 * Name table for the NET_IPV4_CONF_* ids. It is the shared child of every
 * entry in trans_net_ipv4_conf_table ("all", "default" and the nameless
 * entry). Ends with an empty entry (presumably the terminator).
 */
198static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
199 { NET_IPV4_CONF_FORWARDING, "forwarding" },
200 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
201
202 { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
203 { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
204 { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
205 { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
206 { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
207 { NET_IPV4_CONF_RP_FILTER, "rp_filter" },
208 { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
209 { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
210 { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
211 { NET_IPV4_CONF_TAG, "tag" },
212 { NET_IPV4_CONF_ARPFILTER, "arp_filter" },
213 { NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
214 { NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
215 { NET_IPV4_CONF_NOPOLICY, "disable_policy" },
216 { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
217
218 { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
222 { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
223 {}
224};
225
/*
 * Directory-level table referenced as the "conf" child of
 * trans_net_ipv4_table. All three entries point at the same child table,
 * trans_net_ipv4_conf_vars_table.
 * NOTE(review): the { 0, NULL, ... } entry has neither id nor name;
 * presumably it stands for any other child (e.g. a per-interface
 * directory) -- confirm against the code that walks these tables.
 */
226static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
227 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
228 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
229 { 0, NULL, trans_net_ipv4_conf_vars_table },
230 {}
231};
232
/*
 * Name table for the NET_NEIGH_* ids; the child of both entries in
 * trans_net_neigh_table. Ends with an empty entry (presumably the
 * terminator).
 */
233static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
234 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
235 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
236 { NET_NEIGH_APP_SOLICIT, "app_solicit" },
237 { NET_NEIGH_RETRANS_TIME, "retrans_time" },
238 { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
239 { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
240 { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
241 { NET_NEIGH_UNRES_QLEN, "unres_qlen" },
242 { NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
243 { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" },
244 { NET_NEIGH_PROXY_DELAY, "proxy_delay" },
245 { NET_NEIGH_LOCKTIME, "locktime" },
246 { NET_NEIGH_GC_INTERVAL, "gc_interval" },
247 { NET_NEIGH_GC_THRESH1, "gc_thresh1" },
248 { NET_NEIGH_GC_THRESH2, "gc_thresh2" },
249 { NET_NEIGH_GC_THRESH3, "gc_thresh3" },
250 { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
251 { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
252 {}
253};
254
/*
 * Directory-level table used as the "neigh" child of both
 * trans_net_ipv4_table and trans_net_ipv6_table. Both entries lead to
 * trans_net_neigh_vars_table.
 * NOTE(review): the { 0, NULL, ... } entry has neither id nor name;
 * presumably it stands for any other child -- confirm against the lookup
 * code.
 */
255static const struct trans_ctl_table trans_net_neigh_table[] = {
256 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
257 { 0, NULL, trans_net_neigh_vars_table },
258 {}
259};
260
/*
 * Name table for the NET_IPV4_NF_CONNTRACK_* ids; referenced as the
 * "netfilter" child of trans_net_ipv4_table. Every name here carries the
 * "ip_conntrack_" prefix. Ends with an empty entry (presumably the
 * terminator).
 */
261static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
262 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
263
264 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
265 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" },
266 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" },
267 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" },
268 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" },
269 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" },
270 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" },
271 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" },
272
273 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" },
274 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" },
275 { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" },
276 { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" },
277
278 { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
279 { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
280 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" },
281 { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
282 { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
283 { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
284
285 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" },
286 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" },
287 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" },
288 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" },
289 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" },
290 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" },
291 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" },
292
293 { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
294 { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
295 {}
296};
297
/*
 * Table for the "ipv4" child of trans_net_table. It mixes two kinds of
 * entries: leaf entries (id + name) and directory entries whose third
 * field points at a child table ("conf", "neigh", "route", "netfilter").
 * Ids listed as "unused" in the comments deliberately have no entry.
 * Ends with an empty entry (presumably the terminator).
 */
298static const struct trans_ctl_table trans_net_ipv4_table[] = {
299 { NET_IPV4_FORWARD, "ip_forward" },
300 { NET_IPV4_DYNADDR, "ip_dynaddr" },
301
    /* Sub-directories: the third field is the child table. */
302 { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table },
303 { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table },
304 { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table },
305 /* NET_IPV4_FIB_HASH unused */
306 { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table },
307
308 { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
309 { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
310 { NET_IPV4_TCP_SACK, "tcp_sack" },
311 { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
312 { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
313 /* NET_IPV4_AUTOCONFIG unused */
314 { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
315 { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
316 { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
317 { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
318 { NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
319 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
320 { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
321 { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
322 { NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
323 { NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
324 { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
325 /* NET_IPV4_IP_MASQ_DEBUG unused */
326 { NET_TCP_SYNCOOKIES, "tcp_syncookies" },
327 { NET_TCP_STDURG, "tcp_stdurg" },
328 { NET_TCP_RFC1337, "tcp_rfc1337" },
329 /* NET_TCP_SYN_TAILDROP unused */
330 { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
331 { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
332 { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
333 { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
334 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
335 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
336 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
337 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
338 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
339 { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
340 { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
341 { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
342 /* NET_IPV4_ALWAYS_DEFRAG unused */
343 { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
344 { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
345 { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
346 { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
347 { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
348 { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
349 { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
350 { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
351 { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
352 { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
353 { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
354 { NET_TCP_FACK, "tcp_fack" },
355 { NET_TCP_REORDERING, "tcp_reordering" },
356 { NET_TCP_ECN, "tcp_ecn" },
357 { NET_TCP_DSACK, "tcp_dsack" },
358 { NET_TCP_MEM, "tcp_mem" },
359 { NET_TCP_WMEM, "tcp_wmem" },
360 { NET_TCP_RMEM, "tcp_rmem" },
361 { NET_TCP_APP_WIN, "tcp_app_win" },
362 { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
363 { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
364 { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
365 { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
366 { NET_TCP_TW_REUSE, "tcp_tw_reuse" },
367 { NET_TCP_FRTO, "tcp_frto" },
368 { NET_TCP_LOW_LATENCY, "tcp_low_latency" },
369 { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
370 { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
371 { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
372 /* NET_TCP_DEFAULT_WIN_SCALE unused */
373 { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
374 { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
375 /* NET_TCP_BIC_BETA unused */
376 { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
377 { NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
378 { NET_TCP_ABC, "tcp_abc" },
379 { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" },
380 { NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
381 { NET_TCP_BASE_MSS, "tcp_base_mss" },
382 { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
383 { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
384 { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
385 { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
386 { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
387 { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
388 { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
389 { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" },
390 { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
391 { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
392 { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
    /* Written as a raw number; the inline comment names the constant it stands for. */
393 { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
394 {}
395};
396
/*
 * Name table for the NET_IPX_* ids; referenced as the "ipx" child of
 * trans_net_table. NET_IPX_FORWARDING is intentionally omitted. Ends with
 * an empty entry (presumably the terminator).
 */
397static const struct trans_ctl_table trans_net_ipx_table[] = {
398 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
399 /* NET_IPX_FORWARDING unused */
400 {}
401};
402
/*
 * Name table for the NET_ATALK_* ids; referenced as the "appletalk" child
 * of trans_net_table. Names use '-' rather than '_' as the separator.
 * Ends with an empty entry (presumably the terminator).
 */
403static const struct trans_ctl_table trans_net_atalk_table[] = {
404 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
405 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
406 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
407 { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
408 {},
409};
410
/*
 * Name table for the NET_NETROM_* ids; referenced as the "netrom" child of
 * trans_net_table. Ends with an empty entry (presumably the terminator).
 */
411static const struct trans_ctl_table trans_net_netrom_table[] = {
412 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
413 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
414 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
415 { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
416 { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
417 { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
418 { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
419 { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
420 { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
421 { NET_NETROM_ROUTING_CONTROL, "routing_control" },
422 { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
423 { NET_NETROM_RESET, "reset" },
424 {}
425};
426
/*
 * Name table for the NET_AX25_* ids; the child of the single nameless
 * entry in trans_net_ax25_table. Ends with an empty entry (presumably the
 * terminator).
 */
427static const struct trans_ctl_table trans_net_ax25_param_table[] = {
428 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
429 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
430 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
431 { NET_AX25_CONNECT_MODE, "connect_mode" },
432 { NET_AX25_STANDARD_WINDOW, "standard_window_size" },
433 { NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
434 { NET_AX25_T1_TIMEOUT, "t1_timeout" },
435 { NET_AX25_T2_TIMEOUT, "t2_timeout" },
436 { NET_AX25_T3_TIMEOUT, "t3_timeout" },
437 { NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
438 { NET_AX25_N2, "maximum_retry_count" },
439 { NET_AX25_PACLEN, "maximum_packet_length" },
440 { NET_AX25_PROTOCOL, "protocol" },
441 { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
442 {}
443};
444
/*
 * Table for the "ax25" child of trans_net_table. Its only real entry has
 * no id and no name and leads to trans_net_ax25_param_table.
 * NOTE(review): presumably the nameless entry matches any child directory
 * (e.g. one per device) -- confirm against the lookup code.
 */
445static const struct trans_ctl_table trans_net_ax25_table[] = {
446 { 0, NULL, trans_net_ax25_param_table },
447 {}
448};
449
/*
 * Name table for the NET_BRIDGE_NF_* ids; referenced as the "bridge" child
 * of trans_net_table. Names use '-' as the separator. Ends with an empty
 * entry (presumably the terminator).
 */
450static const struct trans_ctl_table trans_net_bridge_table[] = {
451 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
452 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
453 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
454 { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" },
455 { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" },
456 {}
457};
458
/*
 * Name table for the NET_ROSE_* ids; referenced as the "rose" child of
 * trans_net_table. Ends with an empty entry (presumably the terminator).
 */
459static const struct trans_ctl_table trans_net_rose_table[] = {
460 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
461 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
462 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
463 { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
464 { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
465 { NET_ROSE_ROUTING_CONTROL, "routing_control" },
466 { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
467 { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
468 { NET_ROSE_WINDOW_SIZE, "window_size" },
469 { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
470 {}
471};
472
/*
 * Name table for the per-configuration NET_IPV6_* ids. It is the shared
 * child of every entry in trans_net_ipv6_conf_table ("all", "default" and
 * the nameless entry). Ends with an empty entry (presumably the
 * terminator).
 */
473static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
474 { NET_IPV6_FORWARDING, "forwarding" },
475 { NET_IPV6_HOP_LIMIT, "hop_limit" },
476 { NET_IPV6_MTU, "mtu" },
477 { NET_IPV6_ACCEPT_RA, "accept_ra" },
478 { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
479 { NET_IPV6_AUTOCONF, "autoconf" },
480 { NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
481 { NET_IPV6_RTR_SOLICITS, "router_solicitations" },
482 { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
483 { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
484 { NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
485 { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
486 { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
487 { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
488 { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
489 { NET_IPV6_MAX_ADDRESSES, "max_addresses" },
490 { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
491 { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
492 { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
493 { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
494 { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
495 { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
496 { NET_IPV6_PROXY_NDP, "proxy_ndp" },
497 { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
498 {}
499};
500
/*
 * Directory-level table referenced as the "conf" child of
 * trans_net_ipv6_table. All three entries point at the same child table,
 * trans_net_ipv6_conf_var_table.
 * NOTE(review): the { 0, NULL, ... } entry has neither id nor name;
 * presumably it stands for any other child (e.g. a per-interface
 * directory) -- confirm against the code that walks these tables.
 */
501static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
502 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
503 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
504 { 0, NULL, trans_net_ipv6_conf_var_table },
505 {}
506};
507
/*
 * Name table for the NET_IPV6_ROUTE_* ids; referenced as the "route" child
 * of trans_net_ipv6_table. Ends with an empty entry (presumably the
 * terminator).
 */
508static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
509 { NET_IPV6_ROUTE_FLUSH, "flush" },
510 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
511 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
512 { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
513 { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
514 { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
515 { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
516 { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
517 { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
518 { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
519 {}
520};
521
/*
 * Single-entry name table referenced as the "icmp" child of
 * trans_net_ipv6_table. Ends with an empty entry (presumably the
 * terminator).
 */
522static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
523 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
524 {}
525};
526
/*
 * Table for the "ipv6" child of trans_net_table. The first four entries
 * are directories (third field = child table); the rest are leaf entries.
 * The "neigh" child is trans_net_neigh_table, the same table the ipv4
 * table uses. Ends with an empty entry (presumably the terminator).
 */
527static const struct trans_ctl_table trans_net_ipv6_table[] = {
528 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
529 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
530 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
531 { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table },
532 { NET_IPV6_BINDV6ONLY, "bindv6only" },
533 { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
534 { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
535 { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
536 { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
537 { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
    /* Written as a raw number; the inline comment names the constant it stands for. */
538 { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
539 {}
540};
541
/*
 * Name table for the NET_X25_* ids; referenced as the "x25" child of
 * trans_net_table. Ends with an empty entry (presumably the terminator).
 */
542static const struct trans_ctl_table trans_net_x25_table[] = {
543 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
544 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
545 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
546 { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
547 { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
548 { NET_X25_FORWARD, "x25_forward" },
549 {}
550};
551
/*
 * Single-entry name table referenced as the "token-ring" child of
 * trans_net_table. Ends with an empty entry (presumably the terminator).
 */
552static const struct trans_ctl_table trans_net_tr_table[] = {
553 { NET_TR_RIF_TIMEOUT, "rif_timeout" },
554 {}
555};
556
557
/*
 * Name table for the NET_DECNET_CONF_DEV_* ids; the child of the nameless
 * entry in trans_net_decnet_conf. Ends with an empty entry (presumably the
 * terminator).
 */
558static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
559 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
560 { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
561 { NET_DECNET_CONF_DEV_T2, "t2" },
562 { NET_DECNET_CONF_DEV_T3, "t3" },
563 {}
564};
565
/*
 * Table referenced as the "conf" child of trans_net_decnet_table. Its only
 * real entry has no id and no name and leads to
 * trans_net_decnet_conf_vars.
 * NOTE(review): presumably the nameless entry matches any child directory
 * (e.g. one per device) -- confirm against the lookup code.
 */
566static const struct trans_ctl_table trans_net_decnet_conf[] = {
567 { 0, NULL, trans_net_decnet_conf_vars },
568 {}
569};
570
/*
 * Table for the "decnet" child of trans_net_table. The first entry is a
 * directory ("conf" -> trans_net_decnet_conf); the remaining entries are
 * leaf id/name pairs. Ends with an empty entry (presumably the
 * terminator).
 */
571static const struct trans_ctl_table trans_net_decnet_table[] = {
572 { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
573 { NET_DECNET_NODE_ADDRESS, "node_address" },
574 { NET_DECNET_NODE_NAME, "node_name" },
575 { NET_DECNET_DEFAULT_DEVICE, "default_device" },
576 { NET_DECNET_TIME_WAIT, "time_wait" },
577 { NET_DECNET_DN_COUNT, "dn_count" },
578 { NET_DECNET_DI_COUNT, "di_count" },
579 { NET_DECNET_DR_COUNT, "dr_count" },
580 { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
581 { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
582 { NET_DECNET_MEM, "decnet_mem" },
583 { NET_DECNET_RMEM, "decnet_rmem" },
584 { NET_DECNET_WMEM, "decnet_wmem" },
585 { NET_DECNET_DEBUG_LEVEL, "debug" },
586 {}
587};
588
/*
 * Name table for the NET_SCTP_* ids; referenced as the "sctp" child of
 * trans_net_table. Ends with an empty entry (presumably the terminator).
 */
589static const struct trans_ctl_table trans_net_sctp_table[] = {
590 { NET_SCTP_RTO_INITIAL, "rto_initial" },
591 { NET_SCTP_RTO_MIN, "rto_min" },
592 { NET_SCTP_RTO_MAX, "rto_max" },
593 { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
594 { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
595 { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
596 { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
597 { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
598 { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
599 { NET_SCTP_HB_INTERVAL, "hb_interval" },
600 { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
601 { NET_SCTP_MAX_BURST, "max_burst" },
602 { NET_SCTP_ADDIP_ENABLE, "addip_enable" },
603 { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
604 { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
605 { NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
606 { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
607 {}
608};
609
/*
 * Name table for the NET_LLC2_*_TIMEOUT ids; referenced as the "timeout"
 * child of trans_net_llc_llc2_table. Ends with an empty entry (presumably
 * the terminator).
 */
610static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
611 { NET_LLC2_ACK_TIMEOUT, "ack" },
612 { NET_LLC2_P_TIMEOUT, "p" },
613 { NET_LLC2_REJ_TIMEOUT, "rej" },
614 { NET_LLC2_BUSY_TIMEOUT, "busy" },
615 {}
616};
617
/*
 * Single-entry name table referenced as the "station" child of
 * trans_net_llc_table. Ends with an empty entry (presumably the
 * terminator).
 */
618static const struct trans_ctl_table trans_net_llc_station_table[] = {
619 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
620 {}
621};
622
/*
 * Table referenced as the "llc2" child of trans_net_llc_table; its single
 * entry is the "timeout" directory, backed by
 * trans_net_llc_llc2_timeout_table.
 * NOTE(review): the id used for "timeout" is NET_LLC2, the same constant
 * trans_net_llc_table uses for the parent "llc2" directory. This may be
 * intentional (equal numeric values) or a naming slip -- confirm against
 * the header that defines the NET_LLC* constants.
 */
623static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
624 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
625 {}
626};
627
/*
 * Table for the "llc" child of trans_net_table. Both entries are
 * directories: "llc2" and "station", each with its own child table.
 * Ends with an empty entry (presumably the terminator).
 */
628static const struct trans_ctl_table trans_net_llc_table[] = {
629 { NET_LLC2, "llc2", trans_net_llc_llc2_table },
630 { NET_LLC_STATION, "station", trans_net_llc_station_table },
631 {}
632};
633
/*
 * Name table for the NET_NF_CONNTRACK_* ids; referenced as the "netfilter"
 * child of trans_net_table. Every name here carries the "nf_conntrack_"
 * prefix (compare the "ip_conntrack_" names in
 * trans_net_ipv4_netfilter_table). Ends with an empty entry (presumably
 * the terminator).
 */
634static const struct trans_ctl_table trans_net_netfilter_table[] = {
635 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
636 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
637 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
638 { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" },
639 { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" },
640 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" },
641 { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" },
642 { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" },
643 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" },
644 { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" },
645 { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" },
646 { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" },
647 { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" },
648 { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
649 { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
650 { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" },
651 { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
652 { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
653 { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
654 { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" },
655 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" },
656 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" },
657 { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" },
658 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" },
659 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" },
660 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" },
661 { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
662 { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" },
663 { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" },
664 { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
665 { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
666 { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
667
668 {}
669};
670
/*
 * Single-entry table referenced as the "dccp" child of trans_net_table.
 * The "default" entry has no child table here. Ends with an empty entry
 * (presumably the terminator).
 */
671static const struct trans_ctl_table trans_net_dccp_table[] = {
672 { NET_DCCP_DEFAULT, "default" },
673 {}
674};
675
/*
 * Name table for the NET_IRDA_* ids; referenced as the "irda" child of
 * trans_net_table. Ends with an empty entry (presumably the terminator).
 */
676static const struct trans_ctl_table trans_net_irda_table[] = {
677 { NET_IRDA_DISCOVERY, "discovery" },
678 { NET_IRDA_DEVNAME, "devname" },
679 { NET_IRDA_DEBUG, "debug" },
680 { NET_IRDA_FAST_POLL, "fast_poll_increase" },
681 { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
682 { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
683 { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
684 { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
685 { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
686 { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
687 { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
688 { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
689 { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
690 { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
691 {}
692};
693
/*
 * Top of the "net" sub-tree: one directory entry per protocol family, each
 * pointing at that family's table defined above. Families noted as
 * "not used" have no entry. The table that refers to this one is outside
 * this part of the file. Ends with an empty entry (presumably the
 * terminator).
 */
694static const struct trans_ctl_table trans_net_table[] = {
695 { NET_CORE, "core", trans_net_core_table },
696 /* NET_ETHER not used */
697 /* NET_802 not used */
698 { NET_UNIX, "unix", trans_net_unix_table },
699 { NET_IPV4, "ipv4", trans_net_ipv4_table },
700 { NET_IPX, "ipx", trans_net_ipx_table },
701 { NET_ATALK, "appletalk", trans_net_atalk_table },
702 { NET_NETROM, "netrom", trans_net_netrom_table },
703 { NET_AX25, "ax25", trans_net_ax25_table },
704 { NET_BRIDGE, "bridge", trans_net_bridge_table },
705 { NET_ROSE, "rose", trans_net_rose_table },
706 { NET_IPV6, "ipv6", trans_net_ipv6_table },
707 { NET_X25, "x25", trans_net_x25_table },
708 { NET_TR, "token-ring", trans_net_tr_table },
709 { NET_DECNET, "decnet", trans_net_decnet_table },
710 /* NET_ECONET not used */
711 { NET_SCTP, "sctp", trans_net_sctp_table },
712 { NET_LLC, "llc", trans_net_llc_table },
713 { NET_NETFILTER, "netfilter", trans_net_netfilter_table },
714 { NET_DCCP, "dccp", trans_net_dccp_table },
715 { NET_IRDA, "irda", trans_net_irda_table },
    /*
     * Leaf entry directly under "net", given as a raw number.
     * NOTE(review): no symbolic constant is named for 2089 here; confirm
     * which constant it corresponds to.
     */
716 { 2089, "nf_conntrack_max" },
717 {}
718};
719
/*
 * Name table for the FS_DQ_* ids; referenced as the "quota" child of
 * trans_fs_table. Ends with an empty entry (presumably the terminator).
 */
720static const struct trans_ctl_table trans_fs_quota_table[] = {
721 { FS_DQ_LOOKUPS, "lookups" },
722 { FS_DQ_DROPS, "drops" },
723 { FS_DQ_READS, "reads" },
724 { FS_DQ_WRITES, "writes" },
725 { FS_DQ_CACHE_HITS, "cache_hits" },
726 { FS_DQ_ALLOCATED, "allocated_dquots" },
727 { FS_DQ_FREE, "free_dquots" },
728 { FS_DQ_SYNCS, "syncs" },
729 { FS_DQ_WARNINGS, "warnings" },
730 {}
731};
732
/*
 * Name table for the XFS_* ids; referenced as the "xfs" child of
 * trans_fs_table. Ends with an empty entry (presumably the terminator).
 */
733static const struct trans_ctl_table trans_fs_xfs_table[] = {
734 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
735 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
736 { XFS_PANIC_MASK, "panic_mask" },
737
738 { XFS_ERRLEVEL, "error_level" },
739 { XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
740 { XFS_INHERIT_SYNC, "inherit_sync" },
741 { XFS_INHERIT_NODUMP, "inherit_nodump" },
742 { XFS_INHERIT_NOATIME, "inherit_noatime" },
743 { XFS_BUF_TIMER, "xfsbufd_centisecs" },
744 { XFS_BUF_AGE, "age_buffer_centisecs" },
745 { XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
746 { XFS_ROTORSTEP, "rotorstep" },
747 { XFS_INHERIT_NODFRG, "inherit_nodefrag" },
748 { XFS_FILESTREAM_TIMER, "filestream_centisecs" },
749 { XFS_STATS_CLEAR, "stats_clear" },
750 {}
751};
752
/*
 * Single-entry table referenced as the "nm" child of trans_fs_ocfs2_table.
 * The id is given as the raw number 1 rather than a named constant.
 * Ends with an empty entry (presumably the terminator).
 */
753static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
754 { 1, "hb_ctl_path" },
755 {}
756};
757
/*
 * Table referenced as the "ocfs2" child of trans_fs_table; its single
 * entry is the "nm" directory (raw id 1) backed by
 * trans_fs_ocfs2_nm_table. Ends with an empty entry (presumably the
 * terminator).
 */
758static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
759 { 1, "nm", trans_fs_ocfs2_nm_table },
760 {}
761};
762
/*
 * Name table for the INOTIFY_* ids; referenced as the "inotify" child of
 * trans_fs_table. Ends with an empty entry (presumably the terminator).
 */
763static const struct trans_ctl_table trans_inotify_table[] = {
764 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
765 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
766 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
767 {}
768};
769
/*
 * Top of the "fs" sub-tree: leaf id/name pairs plus four directory entries
 * ("quota", "xfs", "inotify", "ocfs2") whose third field is the child
 * table. Ids listed as "unused" deliberately have no entry. The table that
 * refers to this one is outside this part of the file. Ends with an empty
 * entry (presumably the terminator).
 */
770static const struct trans_ctl_table trans_fs_table[] = {
771 { FS_NRINODE, "inode-nr" },
772 { FS_STATINODE, "inode-state" },
773 /* FS_MAXINODE unused */
774 /* FS_NRDQUOT unused */
775 /* FS_MAXDQUOT unused */
776 { FS_NRFILE, "file-nr" },
777 { FS_MAXFILE, "file-max" },
778 { FS_DENTRY, "dentry-state" },
779 /* FS_NRSUPER unused */
780 /* FS_MAXSUPER unused */
781 { FS_OVERFLOWUID, "overflowuid" },
782 { FS_OVERFLOWGID, "overflowgid" },
783 { FS_LEASES, "leases-enable" },
784 { FS_DIR_NOTIFY, "dir-notify-enable" },
785 { FS_LEASE_TIME, "lease-break-time" },
786 { FS_DQSTATS, "quota", trans_fs_quota_table },
787 { FS_XFS, "xfs", trans_fs_xfs_table },
788 { FS_AIO_NR, "aio-nr" },
789 { FS_AIO_MAX_NR, "aio-max-nr" },
790 { FS_INOTIFY, "inotify", trans_inotify_table },
791 { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table },
    /* NOTE(review): this id has a KERN_ prefix, unlike the FS_* ids above -- confirm it is intended. */
792 { KERN_SETUID_DUMPABLE, "suid_dumpable" },
793 {}
794};
795
/*
 * Table with no id/name entries at all: it contains only the empty entry
 * that every other table here ends with. Its user is outside this part of
 * the file.
 */
796static const struct trans_ctl_table trans_debug_table[] = {
797 {}
798};
799
/*
 * Name table for the DEV_CDROM_* ids; referenced as the "cdrom" child of
 * trans_dev_table. Ends with an empty entry (presumably the terminator).
 */
800static const struct trans_ctl_table trans_cdrom_table[] = {
801 { DEV_CDROM_INFO, "info" },
802 { DEV_CDROM_AUTOCLOSE, "autoclose" },
803 { DEV_CDROM_AUTOEJECT, "autoeject" },
804 { DEV_CDROM_DEBUG, "debug" },
805 { DEV_CDROM_LOCK, "lock" },
806 { DEV_CDROM_CHECK_MEDIA, "check_media" },
807 {}
808};
809
/*
 * Single-entry name table referenced as the "ipmi" child of
 * trans_dev_table. Ends with an empty entry (presumably the terminator).
 */
810static const struct trans_ctl_table trans_ipmi_table[] = {
811 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
812 {}
813};
814
/*
 * Name table for the DEV_MAC_HID_* ids; referenced as the "mac_hid" child
 * of trans_dev_table. Only the three mouse-button ids have entries; the
 * ids listed as "unused" are intentionally omitted. Ends with an empty
 * entry (presumably the terminator).
 */
815static const struct trans_ctl_table trans_mac_hid_files[] = {
816 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
817 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
818 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
819 { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
820 { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
821 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
822 {}
823};
824
/*
 * Name table for the DEV_RAID_* ids; referenced as the "raid" child of
 * trans_dev_table. Ends with an empty entry (presumably the terminator).
 */
825static const struct trans_ctl_table trans_raid_table[] = {
826 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
827 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
828 {}
829};
830
/*
 * Single-entry name table referenced as the "scsi" child of
 * trans_dev_table. Ends with an empty entry (presumably the terminator).
 */
831static const struct trans_ctl_table trans_scsi_table[] = {
832 { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
833 {}
834};
835
/*
 * Name table for the DEV_PARPORT_DEFAULT_* ids; referenced as the
 * "default" child of trans_parport_table. Ends with an empty entry
 * (presumably the terminator).
 */
836static const struct trans_ctl_table trans_parport_default_table[] = {
837 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
838 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
839 {}
840};
841
/*
 * Single-entry name table; the child of the nameless entry in
 * trans_parport_devices_table. Ends with an empty entry (presumably the
 * terminator).
 */
842static const struct trans_ctl_table trans_parport_device_table[] = {
843 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
844 {}
845};
846
/*
 * Table referenced as the "devices" child of trans_parport_parport_table.
 * It has one named leaf ("active") and one nameless entry leading to
 * trans_parport_device_table.
 * NOTE(review): presumably the { 0, NULL, ... } entry matches any other
 * child directory (e.g. one per attached device) -- confirm against the
 * lookup code.
 */
847static const struct trans_ctl_table trans_parport_devices_table[] = {
848 { DEV_PARPORT_DEVICES_ACTIVE, "active" },
849 { 0, NULL, trans_parport_device_table },
850 {}
851};
852
/*
 * Per-port table; the child of the nameless entry in trans_parport_table.
 * Contains leaf id/name pairs and one directory ("devices"). The four
 * "autoprobeN" entries use ids computed as DEV_PARPORT_AUTOPROBE + 1..4,
 * mapping to the names "autoprobe0".."autoprobe3". Ends with an empty
 * entry (presumably the terminator).
 */
853static const struct trans_ctl_table trans_parport_parport_table[] = {
854 { DEV_PARPORT_SPINTIME, "spintime" },
855 { DEV_PARPORT_BASE_ADDR, "base-addr" },
856 { DEV_PARPORT_IRQ, "irq" },
857 { DEV_PARPORT_DMA, "dma" },
858 { DEV_PARPORT_MODES, "modes" },
859 { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table },
860 { DEV_PARPORT_AUTOPROBE, "autoprobe" },
861 { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" },
862 { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" },
863 { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" },
864 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
865 {}
866};
/*
 * Table referenced as the "parport" child of trans_dev_table. It has the
 * named "default" directory and one nameless entry leading to
 * trans_parport_parport_table.
 * NOTE(review): presumably the { 0, NULL, ... } entry matches any other
 * child directory (e.g. one per port) -- confirm against the lookup code.
 */
867static const struct trans_ctl_table trans_parport_table[] = {
868 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
869 { 0, NULL, trans_parport_parport_table },
870 {}
871};
872
/*
 * Top of the "dev" sub-tree: every entry is a directory pointing at the
 * matching device table defined above. DEV_HWMON has no entry. The table
 * that refers to this one is outside this part of the file. Ends with an
 * empty entry (presumably the terminator).
 */
873static const struct trans_ctl_table trans_dev_table[] = {
874 { DEV_CDROM, "cdrom", trans_cdrom_table },
875 /* DEV_HWMON unused */
876 { DEV_PARPORT, "parport", trans_parport_table },
877 { DEV_RAID, "raid", trans_raid_table },
878 { DEV_MAC_HID, "mac_hid", trans_mac_hid_files },
879 { DEV_SCSI, "scsi", trans_scsi_table },
880 { DEV_IPMI, "ipmi", trans_ipmi_table },
881 {}
882};
883
/*
 * Name table for the BUS_ISA_* ids; referenced as the "isa" child of
 * trans_bus_table. Ends with an empty entry (presumably the terminator).
 */
884static const struct trans_ctl_table trans_bus_isa_table[] = {
885 { BUS_ISA_MEM_BASE, "membase" },
886 { BUS_ISA_PORT_BASE, "portbase" },
887 { BUS_ISA_PORT_SHIFT, "portshift" },
888 {}
889};
890
/*
 * Top of the "bus" sub-tree; its single entry is the "isa" directory
 * backed by trans_bus_isa_table. The table that refers to this one is
 * outside this part of the file. Ends with an empty entry (presumably the
 * terminator).
 */
891static const struct trans_ctl_table trans_bus_table[] = {
892 { CTL_BUS_ISA, "isa", trans_bus_isa_table },
893 {}
894};
895
/*
 * Name table keyed by raw numeric ids instead of named constants.
 * Ids 1-60 are listed in order with 35 absent, followed by a second group
 * 150-156 whose names mostly carry an "arlan0"/"0" device suffix.
 * The table that refers to this one is outside this part of the file.
 * NOTE(review): whether the missing id 35 is deliberate cannot be told
 * from this table alone.
 * Ends with an empty entry (presumably the terminator).
 */
896static const struct trans_ctl_table trans_arlan_conf_table0[] = {
897 { 1, "spreadingCode" },
898 { 2, "channelNumber" },
899 { 3, "scramblingDisable" },
900 { 4, "txAttenuation" },
901 { 5, "systemId" },
902 { 6, "maxDatagramSize" },
903 { 7, "maxFrameSize" },
904 { 8, "maxRetries" },
905 { 9, "receiveMode" },
906 { 10, "priority" },
907 { 11, "rootOrRepeater" },
908 { 12, "SID" },
909 { 13, "registrationMode" },
910 { 14, "registrationFill" },
911 { 15, "localTalkAddress" },
912 { 16, "codeFormat" },
913 { 17, "numChannels" },
914 { 18, "channel1" },
915 { 19, "channel2" },
916 { 20, "channel3" },
917 { 21, "channel4" },
918 { 22, "txClear" },
919 { 23, "txRetries" },
920 { 24, "txRouting" },
921 { 25, "txScrambled" },
922 { 26, "rxParameter" },
923 { 27, "txTimeoutMs" },
924 { 28, "waitCardTimeout" },
925 { 29, "channelSet" },
926 { 30, "name" },
927 { 31, "waitTime" },
928 { 32, "lParameter" },
929 { 33, "_15" },
930 { 34, "headerSize" },
931 { 36, "tx_delay_ms" },
932 { 37, "retries" },
933 { 38, "ReTransmitPacketMaxSize" },
934 { 39, "waitReTransmitPacketMaxSize" },
935 { 40, "fastReTransCount" },
936 { 41, "driverRetransmissions" },
937 { 42, "txAckTimeoutMs" },
938 { 43, "registrationInterrupts" },
939 { 44, "hardwareType" },
940 { 45, "radioType" },
941 { 46, "writeEEPROM" },
942 { 47, "writeRadioType" },
943 { 48, "entry_exit_debug" },
944 { 49, "debug" },
945 { 50, "in_speed" },
946 { 51, "out_speed" },
947 { 52, "in_speed10" },
948 { 53, "out_speed10" },
949 { 54, "in_speed_max" },
950 { 55, "out_speed_max" },
951 { 56, "measure_rate" },
952 { 57, "pre_Command_Wait" },
953 { 58, "rx_tweak1" },
954 { 59, "rx_tweak2" },
955 { 60, "tx_queue_len" },
956
957 { 150, "arlan0-txRing" },
958 { 151, "arlan0-rxRing" },
959 { 152, "arlan0-18" },
960 { 153, "arlan0-ring" },
961 { 154, "arlan0-shm-cpy" },
962 { 155, "config0" },
963 { 156, "reset0" },
964 {}
965};
966
967static const struct trans_ctl_table trans_arlan_conf_table1[] = {
968 { 1, "spreadingCode" },
969 { 2, "channelNumber" },
970 { 3, "scramblingDisable" },
971 { 4, "txAttenuation" },
972 { 5, "systemId" },
973 { 6, "maxDatagramSize" },
974 { 7, "maxFrameSize" },
975 { 8, "maxRetries" },
976 { 9, "receiveMode" },
977 { 10, "priority" },
978 { 11, "rootOrRepeater" },
979 { 12, "SID" },
980 { 13, "registrationMode" },
981 { 14, "registrationFill" },
982 { 15, "localTalkAddress" },
983 { 16, "codeFormat" },
984 { 17, "numChannels" },
985 { 18, "channel1" },
986 { 19, "channel2" },
987 { 20, "channel3" },
988 { 21, "channel4" },
989 { 22, "txClear" },
990 { 23, "txRetries" },
991 { 24, "txRouting" },
992 { 25, "txScrambled" },
993 { 26, "rxParameter" },
994 { 27, "txTimeoutMs" },
995 { 28, "waitCardTimeout" },
996 { 29, "channelSet" },
997 { 30, "name" },
998 { 31, "waitTime" },
999 { 32, "lParameter" },
1000 { 33, "_15" },
1001 { 34, "headerSize" },
1002 { 36, "tx_delay_ms" },
1003 { 37, "retries" },
1004 { 38, "ReTransmitPacketMaxSize" },
1005 { 39, "waitReTransmitPacketMaxSize" },
1006 { 40, "fastReTransCount" },
1007 { 41, "driverRetransmissions" },
1008 { 42, "txAckTimeoutMs" },
1009 { 43, "registrationInterrupts" },
1010 { 44, "hardwareType" },
1011 { 45, "radioType" },
1012 { 46, "writeEEPROM" },
1013 { 47, "writeRadioType" },
1014 { 48, "entry_exit_debug" },
1015 { 49, "debug" },
1016 { 50, "in_speed" },
1017 { 51, "out_speed" },
1018 { 52, "in_speed10" },
1019 { 53, "out_speed10" },
1020 { 54, "in_speed_max" },
1021 { 55, "out_speed_max" },
1022 { 56, "measure_rate" },
1023 { 57, "pre_Command_Wait" },
1024 { 58, "rx_tweak1" },
1025 { 59, "rx_tweak2" },
1026 { 60, "tx_queue_len" },
1027
1028 { 150, "arlan1-txRing" },
1029 { 151, "arlan1-rxRing" },
1030 { 152, "arlan1-18" },
1031 { 153, "arlan1-ring" },
1032 { 154, "arlan1-shm-cpy" },
1033 { 155, "config1" },
1034 { 156, "reset1" },
1035 {}
1036};
1037
1038static const struct trans_ctl_table trans_arlan_conf_table2[] = {
1039 { 1, "spreadingCode" },
1040 { 2, "channelNumber" },
1041 { 3, "scramblingDisable" },
1042 { 4, "txAttenuation" },
1043 { 5, "systemId" },
1044 { 6, "maxDatagramSize" },
1045 { 7, "maxFrameSize" },
1046 { 8, "maxRetries" },
1047 { 9, "receiveMode" },
1048 { 10, "priority" },
1049 { 11, "rootOrRepeater" },
1050 { 12, "SID" },
1051 { 13, "registrationMode" },
1052 { 14, "registrationFill" },
1053 { 15, "localTalkAddress" },
1054 { 16, "codeFormat" },
1055 { 17, "numChannels" },
1056 { 18, "channel1" },
1057 { 19, "channel2" },
1058 { 20, "channel3" },
1059 { 21, "channel4" },
1060 { 22, "txClear" },
1061 { 23, "txRetries" },
1062 { 24, "txRouting" },
1063 { 25, "txScrambled" },
1064 { 26, "rxParameter" },
1065 { 27, "txTimeoutMs" },
1066 { 28, "waitCardTimeout" },
1067 { 29, "channelSet" },
1068 { 30, "name" },
1069 { 31, "waitTime" },
1070 { 32, "lParameter" },
1071 { 33, "_15" },
1072 { 34, "headerSize" },
1073 { 36, "tx_delay_ms" },
1074 { 37, "retries" },
1075 { 38, "ReTransmitPacketMaxSize" },
1076 { 39, "waitReTransmitPacketMaxSize" },
1077 { 40, "fastReTransCount" },
1078 { 41, "driverRetransmissions" },
1079 { 42, "txAckTimeoutMs" },
1080 { 43, "registrationInterrupts" },
1081 { 44, "hardwareType" },
1082 { 45, "radioType" },
1083 { 46, "writeEEPROM" },
1084 { 47, "writeRadioType" },
1085 { 48, "entry_exit_debug" },
1086 { 49, "debug" },
1087 { 50, "in_speed" },
1088 { 51, "out_speed" },
1089 { 52, "in_speed10" },
1090 { 53, "out_speed10" },
1091 { 54, "in_speed_max" },
1092 { 55, "out_speed_max" },
1093 { 56, "measure_rate" },
1094 { 57, "pre_Command_Wait" },
1095 { 58, "rx_tweak1" },
1096 { 59, "rx_tweak2" },
1097 { 60, "tx_queue_len" },
1098
1099 { 150, "arlan2-txRing" },
1100 { 151, "arlan2-rxRing" },
1101 { 152, "arlan2-18" },
1102 { 153, "arlan2-ring" },
1103 { 154, "arlan2-shm-cpy" },
1104 { 155, "config2" },
1105 { 156, "reset2" },
1106 {}
1107};
1108
1109static const struct trans_ctl_table trans_arlan_conf_table3[] = {
1110 { 1, "spreadingCode" },
1111 { 2, "channelNumber" },
1112 { 3, "scramblingDisable" },
1113 { 4, "txAttenuation" },
1114 { 5, "systemId" },
1115 { 6, "maxDatagramSize" },
1116 { 7, "maxFrameSize" },
1117 { 8, "maxRetries" },
1118 { 9, "receiveMode" },
1119 { 10, "priority" },
1120 { 11, "rootOrRepeater" },
1121 { 12, "SID" },
1122 { 13, "registrationMode" },
1123 { 14, "registrationFill" },
1124 { 15, "localTalkAddress" },
1125 { 16, "codeFormat" },
1126 { 17, "numChannels" },
1127 { 18, "channel1" },
1128 { 19, "channel2" },
1129 { 20, "channel3" },
1130 { 21, "channel4" },
1131 { 22, "txClear" },
1132 { 23, "txRetries" },
1133 { 24, "txRouting" },
1134 { 25, "txScrambled" },
1135 { 26, "rxParameter" },
1136 { 27, "txTimeoutMs" },
1137 { 28, "waitCardTimeout" },
1138 { 29, "channelSet" },
1139 { 30, "name" },
1140 { 31, "waitTime" },
1141 { 32, "lParameter" },
1142 { 33, "_15" },
1143 { 34, "headerSize" },
1144 { 36, "tx_delay_ms" },
1145 { 37, "retries" },
1146 { 38, "ReTransmitPacketMaxSize" },
1147 { 39, "waitReTransmitPacketMaxSize" },
1148 { 40, "fastReTransCount" },
1149 { 41, "driverRetransmissions" },
1150 { 42, "txAckTimeoutMs" },
1151 { 43, "registrationInterrupts" },
1152 { 44, "hardwareType" },
1153 { 45, "radioType" },
1154 { 46, "writeEEPROM" },
1155 { 47, "writeRadioType" },
1156 { 48, "entry_exit_debug" },
1157 { 49, "debug" },
1158 { 50, "in_speed" },
1159 { 51, "out_speed" },
1160 { 52, "in_speed10" },
1161 { 53, "out_speed10" },
1162 { 54, "in_speed_max" },
1163 { 55, "out_speed_max" },
1164 { 56, "measure_rate" },
1165 { 57, "pre_Command_Wait" },
1166 { 58, "rx_tweak1" },
1167 { 59, "rx_tweak2" },
1168 { 60, "tx_queue_len" },
1169
1170 { 150, "arlan3-txRing" },
1171 { 151, "arlan3-rxRing" },
1172 { 152, "arlan3-18" },
1173 { 153, "arlan3-ring" },
1174 { 154, "arlan3-shm-cpy" },
1175 { 155, "config3" },
1176 { 156, "reset3" },
1177 {}
1178};
1179
1180static const struct trans_ctl_table trans_arlan_table[] = {
1181 { 1, "arlan0", trans_arlan_conf_table0 },
1182 { 2, "arlan1", trans_arlan_conf_table1 },
1183 { 3, "arlan2", trans_arlan_conf_table2 },
1184 { 4, "arlan3", trans_arlan_conf_table3 },
1185 {}
1186};
1187
1188static const struct trans_ctl_table trans_s390dbf_table[] = {
1189 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1190 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
1191 {}
1192};
1193
1194static const struct trans_ctl_table trans_sunrpc_table[] = {
1195 { CTL_RPCDEBUG, "rpc_debug" },
1196 { CTL_NFSDEBUG, "nfs_debug" },
1197 { CTL_NFSDDEBUG, "nfsd_debug" },
1198 { CTL_NLMDEBUG, "nlm_debug" },
1199 { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
1200 { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
1201 { CTL_MIN_RESVPORT, "min_resvport" },
1202 { CTL_MAX_RESVPORT, "max_resvport" },
1203 {}
1204};
1205
1206static const struct trans_ctl_table trans_pm_table[] = {
1207 { 1 /* CTL_PM_SUSPEND */, "suspend" },
1208 { 2 /* CTL_PM_CMODE */, "cmode" },
1209 { 3 /* CTL_PM_P0 */, "p0" },
1210 { 4 /* CTL_PM_CM */, "cm" },
1211 {}
1212};
1213
1214static const struct trans_ctl_table trans_frv_table[] = {
1215 { 1, "cache-mode" },
1216 { 2, "pin-cxnr" },
1217 {}
1218};
1219
1220static const struct trans_ctl_table trans_root_table[] = {
1221 { CTL_KERN, "kernel", trans_kern_table },
1222 { CTL_VM, "vm", trans_vm_table },
1223 { CTL_NET, "net", trans_net_table },
1224 /* CTL_PROC not used */
1225 { CTL_FS, "fs", trans_fs_table },
1226 { CTL_DEBUG, "debug", trans_debug_table },
1227 { CTL_DEV, "dev", trans_dev_table },
1228 { CTL_BUS, "bus", trans_bus_table },
1229 { CTL_ABI, "abi" },
1230 /* CTL_CPU not used */
1231 { CTL_ARLAN, "arlan", trans_arlan_table },
1232 { CTL_S390DBF, "s390dbf", trans_s390dbf_table },
1233 { CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
1234 { CTL_PM, "pm", trans_pm_table },
1235 { CTL_FRV, "frv", trans_frv_table },
1236 {}
1237};
1238
1239
1240
1241 8
1242static int sysctl_depth(struct ctl_table *table) 9static int sysctl_depth(struct ctl_table *table)
1243{ 10{
@@ -1261,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
1261 return table; 28 return table;
1262} 29}
1263 30
1264static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
1265{
1266 struct ctl_table *test;
1267 const struct trans_ctl_table *ref;
1268 int cur_depth;
1269
1270 cur_depth = sysctl_depth(table);
1271
1272 ref = trans_root_table;
1273repeat:
1274 test = sysctl_parent(table, cur_depth);
1275 for (; ref->ctl_name || ref->procname || ref->child; ref++) {
1276 int match = 0;
1277
1278 if (cur_depth && !ref->child)
1279 continue;
1280
1281 if (test->procname && ref->procname &&
1282 (strcmp(test->procname, ref->procname) == 0))
1283 match++;
1284
1285 if (test->ctl_name && ref->ctl_name &&
1286 (test->ctl_name == ref->ctl_name))
1287 match++;
1288
1289 if (!ref->ctl_name && !ref->procname)
1290 match++;
1291
1292 if (match) {
1293 if (cur_depth != 0) {
1294 cur_depth--;
1295 ref = ref->child;
1296 goto repeat;
1297 }
1298 goto out;
1299 }
1300 }
1301 ref = NULL;
1302out:
1303 return ref;
1304}
1305 31
1306static void sysctl_print_path(struct ctl_table *table) 32static void sysctl_print_path(struct ctl_table *table)
1307{ 33{
@@ -1315,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table)
1315 } 41 }
1316 } 42 }
1317 printk(" "); 43 printk(" ");
1318 if (table->ctl_name) {
1319 for (i = depth; i >= 0; i--) {
1320 tmp = sysctl_parent(table, i);
1321 printk(".%d", tmp->ctl_name);
1322 }
1323 }
1324}
1325
1326static void sysctl_repair_table(struct ctl_table *table)
1327{
1328 /* Don't complain about the classic default
1329 * sysctl strategy routine. Maybe later we
1330 * can get the tables fixed and complain about
1331 * this.
1332 */
1333 if (table->ctl_name && table->procname &&
1334 (table->proc_handler == proc_dointvec) &&
1335 (!table->strategy)) {
1336 table->strategy = sysctl_data;
1337 }
1338} 44}
1339 45
1340static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, 46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
@@ -1352,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
1352 ref = head->ctl_table; 58 ref = head->ctl_table;
1353repeat: 59repeat:
1354 test = sysctl_parent(table, cur_depth); 60 test = sysctl_parent(table, cur_depth);
1355 for (; ref->ctl_name || ref->procname; ref++) { 61 for (; ref->procname; ref++) {
1356 int match = 0; 62 int match = 0;
1357 if (cur_depth && !ref->child) 63 if (cur_depth && !ref->child)
1358 continue; 64 continue;
@@ -1361,10 +67,6 @@ repeat:
1361 (strcmp(test->procname, ref->procname) == 0)) 67 (strcmp(test->procname, ref->procname) == 0))
1362 match++; 68 match++;
1363 69
1364 if (test->ctl_name && ref->ctl_name &&
1365 (test->ctl_name == ref->ctl_name))
1366 match++;
1367
1368 if (match) { 70 if (match) {
1369 if (cur_depth != 0) { 71 if (cur_depth != 0) {
1370 cur_depth--; 72 cur_depth--;
@@ -1392,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
1392 *fail = str; 94 *fail = str;
1393} 95}
1394 96
1395static int sysctl_check_dir(struct nsproxy *namespaces,
1396 struct ctl_table *table)
1397{
1398 struct ctl_table *ref;
1399 int error;
1400
1401 error = 0;
1402 ref = sysctl_check_lookup(namespaces, table);
1403 if (ref) {
1404 int match = 0;
1405 if ((!table->procname && !ref->procname) ||
1406 (table->procname && ref->procname &&
1407 (strcmp(table->procname, ref->procname) == 0)))
1408 match++;
1409
1410 if ((!table->ctl_name && !ref->ctl_name) ||
1411 (table->ctl_name && ref->ctl_name &&
1412 (table->ctl_name == ref->ctl_name)))
1413 match++;
1414
1415 if (match != 2) {
1416 printk(KERN_ERR "%s: failed: ", __func__);
1417 sysctl_print_path(table);
1418 printk(" ref: ");
1419 sysctl_print_path(ref);
1420 printk("\n");
1421 error = -EINVAL;
1422 }
1423 }
1424 return error;
1425}
1426
1427static void sysctl_check_leaf(struct nsproxy *namespaces, 97static void sysctl_check_leaf(struct nsproxy *namespaces,
1428 struct ctl_table *table, const char **fail) 98 struct ctl_table *table, const char **fail)
1429{ 99{
@@ -1434,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
1434 set_fail(fail, table, "Sysctl already exists"); 104 set_fail(fail, table, "Sysctl already exists");
1435} 105}
1436 106
1437static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1438{
1439 const struct trans_ctl_table *ref;
1440
1441 ref = sysctl_binary_lookup(table);
1442 if (table->ctl_name && !ref)
1443 set_fail(fail, table, "Unknown sysctl binary path");
1444 if (ref) {
1445 if (ref->procname &&
1446 (!table->procname ||
1447 (strcmp(table->procname, ref->procname) != 0)))
1448 set_fail(fail, table, "procname does not match binary path procname");
1449
1450 if (ref->ctl_name && table->ctl_name &&
1451 (table->ctl_name != ref->ctl_name))
1452 set_fail(fail, table, "ctl_name does not match binary path ctl_name");
1453 }
1454}
1455
1456int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) 107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1457{ 108{
1458 int error = 0; 109 int error = 0;
1459 for (; table->ctl_name || table->procname; table++) { 110 for (; table->procname; table++) {
1460 const char *fail = NULL; 111 const char *fail = NULL;
1461 112
1462 sysctl_repair_table(table);
1463 if (table->parent) { 113 if (table->parent) {
1464 if (table->procname && !table->parent->procname) 114 if (table->procname && !table->parent->procname)
1465 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
1466 if (table->ctl_name && !table->parent->ctl_name)
1467 set_fail(&fail, table, "Parent without ctl_name");
1468 } 116 }
1469 if (!table->procname) 117 if (!table->procname)
1470 set_fail(&fail, table, "No procname"); 118 set_fail(&fail, table, "No procname");
@@ -1477,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1477 set_fail(&fail, table, "Writable sysctl directory"); 125 set_fail(&fail, table, "Writable sysctl directory");
1478 if (table->proc_handler) 126 if (table->proc_handler)
1479 set_fail(&fail, table, "Directory with proc_handler"); 127 set_fail(&fail, table, "Directory with proc_handler");
1480 if (table->strategy)
1481 set_fail(&fail, table, "Directory with strategy");
1482 if (table->extra1) 128 if (table->extra1)
1483 set_fail(&fail, table, "Directory with extra1"); 129 set_fail(&fail, table, "Directory with extra1");
1484 if (table->extra2) 130 if (table->extra2)
1485 set_fail(&fail, table, "Directory with extra2"); 131 set_fail(&fail, table, "Directory with extra2");
1486 if (sysctl_check_dir(namespaces, table))
1487 set_fail(&fail, table, "Inconsistent directory names");
1488 } else { 132 } else {
1489 if ((table->strategy == sysctl_data) || 133 if ((table->proc_handler == proc_dostring) ||
1490 (table->strategy == sysctl_string) ||
1491 (table->strategy == sysctl_intvec) ||
1492 (table->strategy == sysctl_jiffies) ||
1493 (table->strategy == sysctl_ms_jiffies) ||
1494 (table->proc_handler == proc_dostring) ||
1495 (table->proc_handler == proc_dointvec) || 134 (table->proc_handler == proc_dointvec) ||
1496 (table->proc_handler == proc_dointvec_minmax) || 135 (table->proc_handler == proc_dointvec_minmax) ||
1497 (table->proc_handler == proc_dointvec_jiffies) || 136 (table->proc_handler == proc_dointvec_jiffies) ||
@@ -1513,14 +152,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1513 set_fail(&fail, table, "No max"); 152 set_fail(&fail, table, "No max");
1514 } 153 }
1515 } 154 }
1516#ifdef CONFIG_SYSCTL_SYSCALL
1517 if (table->ctl_name && !table->strategy)
1518 set_fail(&fail, table, "Missing strategy");
1519#endif
1520#if 0
1521 if (!table->ctl_name && table->strategy)
1522 set_fail(&fail, table, "Strategy without ctl_name");
1523#endif
1524#ifdef CONFIG_PROC_SYSCTL 155#ifdef CONFIG_PROC_SYSCTL
1525 if (table->procname && !table->proc_handler) 156 if (table->procname && !table->proc_handler)
1526 set_fail(&fail, table, "No proc_handler"); 157 set_fail(&fail, table, "No proc_handler");
@@ -1531,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1531#endif 162#endif
1532 sysctl_check_leaf(namespaces, table, &fail); 163 sysctl_check_leaf(namespaces, table, &fail);
1533 } 164 }
1534 sysctl_check_bin_path(table, &fail);
1535 if (table->mode > 0777) 165 if (table->mode > 0777)
1536 set_fail(&fail, table, "bogus .mode"); 166 set_fail(&fail, table, "bogus .mode");
1537 if (fail) { 167 if (fail) {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..11281d5792bd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,7 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/slab.h>
25#include <linux/cgroupstats.h> 26#include <linux/cgroupstats.h>
26#include <linux/cgroup.h> 27#include <linux/cgroup.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
@@ -46,15 +47,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 48};
48 49
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 50static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 55
56static struct nla_policy 56static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 58};
60 59
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fec..656dccfe1cbb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,7 +35,6 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h> 38#include <linux/math64.h>
40#include <linux/ptrace.h> 39#include <linux/ptrace.h>
41 40
@@ -662,6 +661,36 @@ u64 nsec_to_clock_t(u64 x)
662#endif 661#endif
663} 662}
664 663
664/**
665 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
666 *
667 * @n: nsecs in u64
668 *
669 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
670 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
671 * for scheduler, not for use in device drivers to calculate timeout value.
672 *
673 * note:
674 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
675 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
676 */
677unsigned long nsecs_to_jiffies(u64 n)
678{
679#if (NSEC_PER_SEC % HZ) == 0
680 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
681 return div_u64(n, NSEC_PER_SEC / HZ);
682#elif (HZ % 512) == 0
683 /* overflow after 292 years if HZ = 1024 */
684 return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
685#else
686 /*
687 * Generic case - optimized for cases where HZ is a multiple of 3.
688 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
689 */
690 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
691#endif
692}
693
665#if (BITS_PER_LONG < 64) 694#if (BITS_PER_LONG < 64)
666u64 get_jiffies_64(void) 695u64 get_jiffies_64(void)
667{ 696{
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc32..d7395fdfb9f3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h> 21#include <linux/tick.h>
22 22
23#include "tick-internal.h"
24
23/* The registered clock event devices */ 25/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 26static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 27static LIST_HEAD(clockevents_released);
@@ -28,7 +30,7 @@ static LIST_HEAD(clockevents_released);
28static RAW_NOTIFIER_HEAD(clockevents_chain); 30static RAW_NOTIFIER_HEAD(clockevents_chain);
29 31
30/* Protection for the above */ 32/* Protection for the above */
31static DEFINE_SPINLOCK(clockevents_lock); 33static DEFINE_RAW_SPINLOCK(clockevents_lock);
32 34
33/** 35/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 36 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
37 * 39 *
38 * Math helper, returns latch value converted to nanoseconds (bound checked) 40 * Math helper, returns latch value converted to nanoseconds (bound checked)
39 */ 41 */
40unsigned long clockevent_delta2ns(unsigned long latch, 42u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
41 struct clock_event_device *evt)
42{ 43{
43 u64 clc = ((u64) latch << evt->shift); 44 u64 clc = (u64) latch << evt->shift;
44 45
45 if (unlikely(!evt->mult)) { 46 if (unlikely(!evt->mult)) {
46 evt->mult = 1; 47 evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
50 do_div(clc, evt->mult); 51 do_div(clc, evt->mult);
51 if (clc < 1000) 52 if (clc < 1000)
52 clc = 1000; 53 clc = 1000;
53 if (clc > LONG_MAX) 54 if (clc > KTIME_MAX)
54 clc = LONG_MAX; 55 clc = KTIME_MAX;
55 56
56 return (unsigned long) clc; 57 return clc;
57} 58}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns); 59EXPORT_SYMBOL_GPL(clockevent_delta2ns);
59 60
@@ -140,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb)
140 unsigned long flags; 141 unsigned long flags;
141 int ret; 142 int ret;
142 143
143 spin_lock_irqsave(&clockevents_lock, flags); 144 raw_spin_lock_irqsave(&clockevents_lock, flags);
144 ret = raw_notifier_chain_register(&clockevents_chain, nb); 145 ret = raw_notifier_chain_register(&clockevents_chain, nb);
145 spin_unlock_irqrestore(&clockevents_lock, flags); 146 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
146 147
147 return ret; 148 return ret;
148} 149}
@@ -184,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev)
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 186 BUG_ON(!dev->cpumask);
186 187
187 spin_lock_irqsave(&clockevents_lock, flags); 188 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 189
189 list_add(&dev->list, &clockevent_devices); 190 list_add(&dev->list, &clockevent_devices);
190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 191 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
191 clockevents_notify_released(); 192 clockevents_notify_released();
192 193
193 spin_unlock_irqrestore(&clockevents_lock, flags); 194 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
194} 195}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 196EXPORT_SYMBOL_GPL(clockevents_register_device);
196 197
@@ -237,10 +238,11 @@ void clockevents_exchange_device(struct clock_event_device *old,
237 */ 238 */
238void clockevents_notify(unsigned long reason, void *arg) 239void clockevents_notify(unsigned long reason, void *arg)
239{ 240{
240 struct list_head *node, *tmp; 241 struct clock_event_device *dev, *tmp;
241 unsigned long flags; 242 unsigned long flags;
243 int cpu;
242 244
243 spin_lock_irqsave(&clockevents_lock, flags); 245 raw_spin_lock_irqsave(&clockevents_lock, flags);
244 clockevents_do_notify(reason, arg); 246 clockevents_do_notify(reason, arg);
245 247
246 switch (reason) { 248 switch (reason) {
@@ -249,13 +251,25 @@ void clockevents_notify(unsigned long reason, void *arg)
249 * Unregister the clock event devices which were 251 * Unregister the clock event devices which were
250 * released from the users in the notify chain. 252 * released from the users in the notify chain.
251 */ 253 */
252 list_for_each_safe(node, tmp, &clockevents_released) 254 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
253 list_del(node); 255 list_del(&dev->list);
256 /*
257 * Now check whether the CPU has left unused per cpu devices
258 */
259 cpu = *((int *)arg);
260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
261 if (cpumask_test_cpu(cpu, dev->cpumask) &&
262 cpumask_weight(dev->cpumask) == 1 &&
263 !tick_is_broadcast_device(dev)) {
264 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
265 list_del(&dev->list);
266 }
267 }
254 break; 268 break;
255 default: 269 default:
256 break; 270 break;
257 } 271 }
258 spin_unlock_irqrestore(&clockevents_lock, flags); 272 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
259} 273}
260EXPORT_SYMBOL_GPL(clockevents_notify); 274EXPORT_SYMBOL_GPL(clockevents_notify);
261#endif 275#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5e18c6ab2c6a..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -39,7 +39,7 @@ void timecounter_init(struct timecounter *tc,
39 tc->cycle_last = cc->read(cc); 39 tc->cycle_last = cc->read(cc);
40 tc->nsec = start_tstamp; 40 tc->nsec = start_tstamp;
41} 41}
42EXPORT_SYMBOL(timecounter_init); 42EXPORT_SYMBOL_GPL(timecounter_init);
43 43
44/** 44/**
45 * timecounter_read_delta - get nanoseconds since last call of this function 45 * timecounter_read_delta - get nanoseconds since last call of this function
@@ -83,7 +83,7 @@ u64 timecounter_read(struct timecounter *tc)
83 83
84 return nsec; 84 return nsec;
85} 85}
86EXPORT_SYMBOL(timecounter_read); 86EXPORT_SYMBOL_GPL(timecounter_read);
87 87
88u64 timecounter_cyc2time(struct timecounter *tc, 88u64 timecounter_cyc2time(struct timecounter *tc,
89 cycle_t cycle_tstamp) 89 cycle_t cycle_tstamp)
@@ -105,7 +105,60 @@ u64 timecounter_cyc2time(struct timecounter *tc,
105 105
106 return nsec; 106 return nsec;
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL_GPL(timecounter_cyc2time);
109
110/**
111 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
112 * @mult: pointer to mult variable
113 * @shift: pointer to shift variable
114 * @from: frequency to convert from
115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds
117 *
118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents.
120 *
121 * @to and @from are frequency values in HZ. For clock sources @to is
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 *
125 * The @minsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is
129 * multiplied with the calculated mult factor. Larger ranges may
130 * reduce the conversion accuracy by choosing smaller mult and shift
131 * factors.
132 */
133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
135{
136 u64 tmp;
137 u32 sft, sftacc= 32;
138
139 /*
140 * Calculate the shift factor which is limiting the conversion
141 * range:
142 */
143 tmp = ((u64)minsec * from) >> 32;
144 while (tmp) {
145 tmp >>=1;
146 sftacc--;
147 }
148
149 /*
150 * Find the conversion shift/mult pair which has the best
151 * accuracy and fits the minsec conversion range:
152 */
153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft;
155 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0)
157 break;
158 }
159 *mult = tmp;
160 *shift = sft;
161}
109 162
110/*[Clocksource internal variables]--------- 163/*[Clocksource internal variables]---------
111 * curr_clocksource: 164 * curr_clocksource:
@@ -290,7 +343,19 @@ static void clocksource_resume_watchdog(void)
290{ 343{
291 unsigned long flags; 344 unsigned long flags;
292 345
293 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential deadlock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
294 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
296} 361}
@@ -388,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
388#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
389 454
390/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
391 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
392 */ 469 */
393void clocksource_resume(void) 470void clocksource_resume(void)
@@ -396,7 +473,7 @@ void clocksource_resume(void)
396 473
397 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
398 if (cs->resume) 475 if (cs->resume)
399 cs->resume(); 476 cs->resume(cs);
400 477
401 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
402} 479}
@@ -405,14 +482,55 @@ void clocksource_resume(void)
405 * clocksource_touch_watchdog - Update watchdog 482 * clocksource_touch_watchdog - Update watchdog
406 * 483 *
407 * Update the watchdog after exception contexts such as kgdb so as not 484 * Update the watchdog after exception contexts such as kgdb so as not
408 * to incorrectly trip the watchdog. 485 * to incorrectly trip the watchdog. This might fail when the kernel
409 * 486 * was stopped in code which holds watchdog_lock.
410 */ 487 */
411void clocksource_touch_watchdog(void) 488void clocksource_touch_watchdog(void)
412{ 489{
413 clocksource_resume_watchdog(); 490 clocksource_resume_watchdog();
414} 491}
415 492
493/**
494 * clocksource_max_deferment - Returns max time the clocksource can be deferred
495 * @cs: Pointer to clocksource
496 *
497 */
498static u64 clocksource_max_deferment(struct clocksource *cs)
499{
500 u64 max_nsecs, max_cycles;
501
502 /*
503 * Calculate the maximum number of cycles that we can pass to the
504 * cyc2ns function without overflowing a 64-bit signed result. The
505 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
506 * is equivalent to the below.
507 * max_cycles < (2^63)/cs->mult
508 * max_cycles < 2^(log2((2^63)/cs->mult))
509 * max_cycles < 2^(log2(2^63) - log2(cs->mult))
510 * max_cycles < 2^(63 - log2(cs->mult))
511 * max_cycles < 1 << (63 - log2(cs->mult))
512 * Please note that we add 1 to the result of the log2 to account for
513 * any rounding errors, ensure the above inequality is satisfied and
514 * no overflow will occur.
515 */
516 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
517
518 /*
519 * The actual maximum number of cycles we can defer the clocksource is
520 * determined by the minimum of max_cycles and cs->mask.
521 */
522 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
523 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
524
525 /*
526 * To ensure that the clocksource does not wrap whilst we are idle,
527 * limit the time the clocksource can be deferred by 12.5%. Please
528 * note a margin of 12.5% is used because this can be computed with
529 * a shift, versus say 10% which would require division.
530 */
531 return max_nsecs - (max_nsecs >> 5);
532}
533
416#ifdef CONFIG_GENERIC_TIME 534#ifdef CONFIG_GENERIC_TIME
417 535
418/** 536/**
@@ -474,6 +592,10 @@ static inline void clocksource_select(void) { }
474 */ 592 */
475static int __init clocksource_done_booting(void) 593static int __init clocksource_done_booting(void)
476{ 594{
595 mutex_lock(&clocksource_mutex);
596 curr_clocksource = clocksource_default_clock();
597 mutex_unlock(&clocksource_mutex);
598
477 finished_booting = 1; 599 finished_booting = 1;
478 600
479 /* 601 /*
@@ -511,6 +633,9 @@ static void clocksource_enqueue(struct clocksource *cs)
511 */ 633 */
512int clocksource_register(struct clocksource *cs) 634int clocksource_register(struct clocksource *cs)
513{ 635{
636 /* calculate max idle time permitted for this clocksource */
637 cs->max_idle_ns = clocksource_max_deferment(cs);
638
514 mutex_lock(&clocksource_mutex); 639 mutex_lock(&clocksource_mutex);
515 clocksource_enqueue(cs); 640 clocksource_enqueue(cs);
516 clocksource_select(); 641 clocksource_select();
@@ -580,7 +705,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
580 * @count: length of buffer 705 * @count: length of buffer
581 * 706 *
582 * Takes input from sysfs interface for manually overriding the default 707 * Takes input from sysfs interface for manually overriding the default
583 * clocksource selction. 708 * clocksource selection.
584 */ 709 */
585static ssize_t sysfs_override_clocksource(struct sys_device *dev, 710static ssize_t sysfs_override_clocksource(struct sys_device *dev,
586 struct sysdev_attribute *attr, 711 struct sysdev_attribute *attr,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..7c0f180d6e9d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c2ec25087a35..b3bafd5fc66d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 35static int tick_broadcast_force;
36 36
37#ifdef CONFIG_TICK_ONESHOT 37#ifdef CONFIG_TICK_ONESHOT
@@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
96 unsigned long flags; 96 unsigned long flags;
97 int ret = 0; 97 int ret = 0;
98 98
99 spin_lock_irqsave(&tick_broadcast_lock, flags); 99 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
100 100
101 /* 101 /*
102 * Devices might be registered with both periodic and oneshot 102 * Devices might be registered with both periodic and oneshot
@@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
122 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
123 } 123 }
124 } 124 }
125 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 125 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
126 return ret; 126 return ret;
127} 127}
128 128
@@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask)
161 */ 161 */
162static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
163{ 163{
164 spin_lock(&tick_broadcast_lock); 164 raw_spin_lock(&tick_broadcast_lock);
165 165
166 cpumask_and(to_cpumask(tmpmask), 166 cpumask_and(to_cpumask(tmpmask),
167 cpu_online_mask, tick_get_broadcast_mask()); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask)); 168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 raw_spin_unlock(&tick_broadcast_lock);
171} 171}
172 172
173/* 173/*
@@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
212 unsigned long flags; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
216 216
217 cpu = smp_processor_id(); 217 cpu = smp_processor_id();
218 td = &per_cpu(tick_cpu_device, cpu); 218 td = &per_cpu(tick_cpu_device, cpu);
@@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 tick_broadcast_setup_oneshot(bc); 263 tick_broadcast_setup_oneshot(bc);
264 } 264 }
265out: 265out:
266 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 266 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
267} 267}
268 268
269/* 269/*
@@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
299 unsigned long flags; 299 unsigned long flags;
300 unsigned int cpu = *cpup; 300 unsigned int cpu = *cpup;
301 301
302 spin_lock_irqsave(&tick_broadcast_lock, flags); 302 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
303 303
304 bc = tick_broadcast_device.evtdev; 304 bc = tick_broadcast_device.evtdev;
305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
@@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
309 clockevents_shutdown(bc); 309 clockevents_shutdown(bc);
310 } 310 }
311 311
312 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 312 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
313} 313}
314 314
315void tick_suspend_broadcast(void) 315void tick_suspend_broadcast(void)
@@ -317,13 +317,13 @@ void tick_suspend_broadcast(void)
317 struct clock_event_device *bc; 317 struct clock_event_device *bc;
318 unsigned long flags; 318 unsigned long flags;
319 319
320 spin_lock_irqsave(&tick_broadcast_lock, flags); 320 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
321 321
322 bc = tick_broadcast_device.evtdev; 322 bc = tick_broadcast_device.evtdev;
323 if (bc) 323 if (bc)
324 clockevents_shutdown(bc); 324 clockevents_shutdown(bc);
325 325
326 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 326 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
327} 327}
328 328
329int tick_resume_broadcast(void) 329int tick_resume_broadcast(void)
@@ -332,7 +332,7 @@ int tick_resume_broadcast(void)
332 unsigned long flags; 332 unsigned long flags;
333 int broadcast = 0; 333 int broadcast = 0;
334 334
335 spin_lock_irqsave(&tick_broadcast_lock, flags); 335 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
336 336
337 bc = tick_broadcast_device.evtdev; 337 bc = tick_broadcast_device.evtdev;
338 338
@@ -351,7 +351,7 @@ int tick_resume_broadcast(void)
351 break; 351 break;
352 } 352 }
353 } 353 }
354 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 354 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
355 355
356 return broadcast; 356 return broadcast;
357} 357}
@@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
405 ktime_t now, next_event; 405 ktime_t now, next_event;
406 int cpu; 406 int cpu;
407 407
408 spin_lock(&tick_broadcast_lock); 408 raw_spin_lock(&tick_broadcast_lock);
409again: 409again:
410 dev->next_event.tv64 = KTIME_MAX; 410 dev->next_event.tv64 = KTIME_MAX;
411 next_event.tv64 = KTIME_MAX; 411 next_event.tv64 = KTIME_MAX;
@@ -443,7 +443,7 @@ again:
443 if (tick_broadcast_set_event(next_event, 0)) 443 if (tick_broadcast_set_event(next_event, 0))
444 goto again; 444 goto again;
445 } 445 }
446 spin_unlock(&tick_broadcast_lock); 446 raw_spin_unlock(&tick_broadcast_lock);
447} 447}
448 448
449/* 449/*
@@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 457 unsigned long flags;
458 int cpu; 458 int cpu;
459 459
460 spin_lock_irqsave(&tick_broadcast_lock, flags); 460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461 461
462 /* 462 /*
463 * Periodic mode does not care about the enter/exit of power 463 * Periodic mode does not care about the enter/exit of power
@@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
492 } 492 }
493 493
494out: 494out:
495 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 496}
497 497
498/* 498/*
@@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void)
563 struct clock_event_device *bc; 563 struct clock_event_device *bc;
564 unsigned long flags; 564 unsigned long flags;
565 565
566 spin_lock_irqsave(&tick_broadcast_lock, flags); 566 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
567 567
568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
569 bc = tick_broadcast_device.evtdev; 569 bc = tick_broadcast_device.evtdev;
570 if (bc) 570 if (bc)
571 tick_broadcast_setup_oneshot(bc); 571 tick_broadcast_setup_oneshot(bc);
572 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 572 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
573} 573}
574 574
575 575
@@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
581 unsigned long flags; 581 unsigned long flags;
582 unsigned int cpu = *cpup; 582 unsigned int cpu = *cpup;
583 583
584 spin_lock_irqsave(&tick_broadcast_lock, flags); 584 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
585 585
586 /* 586 /*
587 * Clear the broadcast mask flag for the dead cpu, but do not 587 * Clear the broadcast mask flag for the dead cpu, but do not
@@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
589 */ 589 */
590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
591 591
592 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 592 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
593} 593}
594 594
595/* 595/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 83c4417b6a3c..b6b898d2eeef 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37static DEFINE_RAW_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
40 * Debugging: see timer_list.c 40 * Debugging: see timer_list.c
@@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
209 int cpu, ret = NOTIFY_OK; 209 int cpu, ret = NOTIFY_OK;
210 unsigned long flags; 210 unsigned long flags;
211 211
212 spin_lock_irqsave(&tick_device_lock, flags); 212 raw_spin_lock_irqsave(&tick_device_lock, flags);
213 213
214 cpu = smp_processor_id(); 214 cpu = smp_processor_id();
215 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 215 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
269 tick_oneshot_notify(); 269 tick_oneshot_notify();
270 270
271 spin_unlock_irqrestore(&tick_device_lock, flags); 271 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
272 return NOTIFY_STOP; 272 return NOTIFY_STOP;
273 273
274out_bc: 274out_bc:
@@ -278,7 +278,7 @@ out_bc:
278 if (tick_check_broadcast_device(newdev)) 278 if (tick_check_broadcast_device(newdev))
279 ret = NOTIFY_STOP; 279 ret = NOTIFY_STOP;
280 280
281 spin_unlock_irqrestore(&tick_device_lock, flags); 281 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
282 282
283 return ret; 283 return ret;
284} 284}
@@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup)
311 struct clock_event_device *dev = td->evtdev; 311 struct clock_event_device *dev = td->evtdev;
312 unsigned long flags; 312 unsigned long flags;
313 313
314 spin_lock_irqsave(&tick_device_lock, flags); 314 raw_spin_lock_irqsave(&tick_device_lock, flags);
315 td->mode = TICKDEV_MODE_PERIODIC; 315 td->mode = TICKDEV_MODE_PERIODIC;
316 if (dev) { 316 if (dev) {
317 /* 317 /*
@@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup)
322 clockevents_exchange_device(dev, NULL); 322 clockevents_exchange_device(dev, NULL);
323 td->evtdev = NULL; 323 td->evtdev = NULL;
324 } 324 }
325 spin_unlock_irqrestore(&tick_device_lock, flags); 325 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
326} 326}
327 327
328static void tick_suspend(void) 328static void tick_suspend(void)
@@ -330,9 +330,9 @@ static void tick_suspend(void)
330 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 330 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
331 unsigned long flags; 331 unsigned long flags;
332 332
333 spin_lock_irqsave(&tick_device_lock, flags); 333 raw_spin_lock_irqsave(&tick_device_lock, flags);
334 clockevents_shutdown(td->evtdev); 334 clockevents_shutdown(td->evtdev);
335 spin_unlock_irqrestore(&tick_device_lock, flags); 335 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
336} 336}
337 337
338static void tick_resume(void) 338static void tick_resume(void)
@@ -341,7 +341,7 @@ static void tick_resume(void)
341 unsigned long flags; 341 unsigned long flags;
342 int broadcast = tick_resume_broadcast(); 342 int broadcast = tick_resume_broadcast();
343 343
344 spin_lock_irqsave(&tick_device_lock, flags); 344 raw_spin_lock_irqsave(&tick_device_lock, flags);
345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
346 346
347 if (!broadcast) { 347 if (!broadcast) {
@@ -350,7 +350,7 @@ static void tick_resume(void)
350 else 350 else
351 tick_resume_oneshot(); 351 tick_resume_oneshot();
352 } 352 }
353 spin_unlock_irqrestore(&tick_device_lock, flags); 353 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
354} 354}
355 355
356/* 356/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b1c05bf75ee0..290eefbc1f60 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,7 +6,6 @@
6#define TICK_DO_TIMER_BOOT -2 6#define TICK_DO_TIMER_BOOT -2
7 7
8DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
9extern spinlock_t tick_device_lock;
10extern ktime_t tick_next_period; 9extern ktime_t tick_next_period;
11extern ktime_t tick_period; 10extern ktime_t tick_period;
12extern int tick_do_timer_cpu __read_mostly; 11extern int tick_do_timer_cpu __read_mostly;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89cf..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
22 22
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/* Limit min_delta to a jiffie */
26#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
27
28static int tick_increase_min_delta(struct clock_event_device *dev)
29{
30 /* Nothing to do if we already reached the limit */
31 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
32 return -ETIME;
33
34 if (dev->min_delta_ns < 5000)
35 dev->min_delta_ns = 5000;
36 else
37 dev->min_delta_ns += dev->min_delta_ns >> 1;
38
39 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
40 dev->min_delta_ns = MIN_DELTA_LIMIT;
41
42 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
43 dev->name ? dev->name : "?",
44 (unsigned long long) dev->min_delta_ns);
45 return 0;
46}
47
25/** 48/**
26 * tick_program_event internal worker function 49 * tick_program_event internal worker function
27 */ 50 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
37 if (!ret || !force) 60 if (!ret || !force)
38 return ret; 61 return ret;
39 62
63 dev->retries++;
40 /* 64 /*
41 * We tried 2 times to program the device with the given 65 * We tried 3 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it 66 * min_delta_ns. If that's not working then we increase it
43 * and emit a warning. 67 * and emit a warning.
44 */ 68 */
45 if (++i > 2) { 69 if (++i > 2) {
46 /* Increase the min. delta and try again */ 70 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns) 71 if (tick_increase_min_delta(dev)) {
48 dev->min_delta_ns = 5000; 72 /*
49 else 73 * Get out of the loop if min_delta_ns
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 74 * hit the limit already. That's
51 75 * better than staying here forever.
52 printk(KERN_WARNING 76 *
53 "CE: %s increasing min_delta_ns to %lu nsec\n", 77 * We clear next_event so we have a
54 dev->name ? dev->name : "?", 78 * chance that the box survives.
55 dev->min_delta_ns << 1); 79 */
56 80 printk(KERN_WARNING
81 "CE: Reprogramming failure. Giving up\n");
82 dev->next_event.tv64 = KTIME_MAX;
83 return -ETIME;
84 }
57 i = 0; 85 i = 0;
58 } 86 }
59 87
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index dcbff7515489..0adc54bd7c7c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137static void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(ktime_t now)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
141 unsigned long flags; 141 unsigned long flags;
142 ktime_t now;
143
144 if (!ts->tick_stopped)
145 return;
146 142
147 cpumask_clear_cpu(cpu, nohz_cpu_mask); 143 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get();
149 ts->idle_waketime = now; 144 ts->idle_waketime = now;
150 145
151 local_irq_save(flags); 146 local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
155 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
156} 151}
157 152
158static void tick_nohz_stop_idle(int cpu) 153static void tick_nohz_stop_idle(int cpu, ktime_t now)
159{ 154{
160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
161 157
162 if (ts->idle_active) { 158 delta = ktime_sub(now, ts->idle_entrytime);
163 ktime_t now, delta; 159 ts->idle_lastupdate = now;
164 now = ktime_get(); 160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 delta = ktime_sub(now, ts->idle_entrytime); 161 ts->idle_active = 0;
166 ts->idle_lastupdate = now;
167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
168 ts->idle_active = 0;
169 162
170 sched_clock_idle_wakeup_event(0); 163 sched_clock_idle_wakeup_event(0);
171 }
172} 164}
173 165
174static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 166static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
216 struct tick_sched *ts; 208 struct tick_sched *ts;
217 ktime_t last_update, expires, now; 209 ktime_t last_update, expires, now;
218 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 210 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
211 u64 time_delta;
219 int cpu; 212 int cpu;
220 213
221 local_irq_save(flags); 214 local_irq_save(flags);
@@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
263 256
264 if (ratelimit < 10) { 257 if (ratelimit < 10) {
265 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 258 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
266 local_softirq_pending()); 259 (unsigned int) local_softirq_pending());
267 ratelimit++; 260 ratelimit++;
268 } 261 }
269 goto end; 262 goto end;
@@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
275 seq = read_seqbegin(&xtime_lock); 268 seq = read_seqbegin(&xtime_lock);
276 last_update = last_jiffies_update; 269 last_update = last_jiffies_update;
277 last_jiffies = jiffies; 270 last_jiffies = jiffies;
271 time_delta = timekeeping_max_deferment();
278 } while (read_seqretry(&xtime_lock, seq)); 272 } while (read_seqretry(&xtime_lock, seq));
279 273
280 /* Get the next timer wheel timer */ 274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
281 next_jiffies = get_next_timer_interrupt(last_jiffies); 275 arch_needs_cpu(cpu)) {
282 delta_jiffies = next_jiffies - last_jiffies; 276 next_jiffies = last_jiffies + 1;
283
284 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
285 delta_jiffies = 1; 277 delta_jiffies = 1;
278 } else {
279 /* Get the next timer wheel timer */
280 next_jiffies = get_next_timer_interrupt(last_jiffies);
281 delta_jiffies = next_jiffies - last_jiffies;
282 }
286 /* 283 /*
287 * Do not stop the tick, if we are only one off 284 * Do not stop the tick, if we are only one off
288 * or if the cpu is required for rcu 285 * or if the cpu is required for rcu
@@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
294 if ((long)delta_jiffies >= 1) { 291 if ((long)delta_jiffies >= 1) {
295 292
296 /* 293 /*
297 * calculate the expiry time for the next timer wheel
298 * timer
299 */
300 expires = ktime_add_ns(last_update, tick_period.tv64 *
301 delta_jiffies);
302
303 /*
304 * If this cpu is the one which updates jiffies, then 294 * If this cpu is the one which updates jiffies, then
305 * give up the assignment and let it be taken by the 295 * give up the assignment and let it be taken by the
306 * cpu which runs the tick timer next, which might be 296 * cpu which runs the tick timer next, which might be
307 * this cpu as well. If we don't drop this here the 297 * this cpu as well. If we don't drop this here the
308 * jiffies might be stale and do_timer() never 298 * jiffies might be stale and do_timer() never
309 * invoked. 299 * invoked. Keep track of the fact that it was the one
300 * which had the do_timer() duty last. If this cpu is
301 * the one which had the do_timer() duty last, we
302 * limit the sleep time to the timekeeping
303 * max_deferement value which we retrieved
304 * above. Otherwise we can sleep as long as we want.
310 */ 305 */
311 if (cpu == tick_do_timer_cpu) 306 if (cpu == tick_do_timer_cpu) {
312 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 307 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
308 ts->do_timer_last = 1;
309 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
310 time_delta = KTIME_MAX;
311 ts->do_timer_last = 0;
312 } else if (!ts->do_timer_last) {
313 time_delta = KTIME_MAX;
314 }
315
316 /*
317 * calculate the expiry time for the next timer wheel
318 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
319 * that there is no timer pending or at least extremely
320 * far into the future (12 days for HZ=1000). In this
321 * case we set the expiry to the end of time.
322 */
323 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
324 /*
325 * Calculate the time delta for the next timer event.
326 * If the time delta exceeds the maximum time delta
327 * permitted by the current clocksource then adjust
328 * the time delta accordingly to ensure the
329 * clocksource does not wrap.
330 */
331 time_delta = min_t(u64, time_delta,
332 tick_period.tv64 * delta_jiffies);
333 }
334
335 if (time_delta < KTIME_MAX)
336 expires = ktime_add_ns(last_update, time_delta);
337 else
338 expires.tv64 = KTIME_MAX;
313 339
314 if (delta_jiffies > 1) 340 if (delta_jiffies > 1)
315 cpumask_set_cpu(cpu, nohz_cpu_mask); 341 cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
342 368
343 ts->idle_sleeps++; 369 ts->idle_sleeps++;
344 370
371 /* Mark expires */
372 ts->idle_expires = expires;
373
345 /* 374 /*
346 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 375 * If the expiration time == KTIME_MAX, then
347 * there is no timer pending or at least extremly far 376 * in this case we simply stop the tick timer.
348 * into the future (12 days for HZ=1000). In this case
349 * we simply stop the tick timer:
350 */ 377 */
351 if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { 378 if (unlikely(expires.tv64 == KTIME_MAX)) {
352 ts->idle_expires.tv64 = KTIME_MAX;
353 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 379 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
354 hrtimer_cancel(&ts->sched_timer); 380 hrtimer_cancel(&ts->sched_timer);
355 goto out; 381 goto out;
356 } 382 }
357 383
358 /* Mark expiries */
359 ts->idle_expires = expires;
360
361 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 384 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
362 hrtimer_start(&ts->sched_timer, expires, 385 hrtimer_start(&ts->sched_timer, expires,
363 HRTIMER_MODE_ABS_PINNED); 386 HRTIMER_MODE_ABS_PINNED);
@@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
436 ktime_t now; 459 ktime_t now;
437 460
438 local_irq_disable(); 461 local_irq_disable();
439 tick_nohz_stop_idle(cpu); 462 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
463 now = ktime_get();
464
465 if (ts->idle_active)
466 tick_nohz_stop_idle(cpu, now);
440 467
441 if (!ts->inidle || !ts->tick_stopped) { 468 if (!ts->inidle || !ts->tick_stopped) {
442 ts->inidle = 0; 469 ts->inidle = 0;
@@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
450 477
451 /* Update jiffies first */ 478 /* Update jiffies first */
452 select_nohz_load_balancer(0); 479 select_nohz_load_balancer(0);
453 now = ktime_get();
454 tick_do_update_jiffies64(now); 480 tick_do_update_jiffies64(now);
455 cpumask_clear_cpu(cpu, nohz_cpu_mask); 481 cpumask_clear_cpu(cpu, nohz_cpu_mask);
456 482
@@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
584 * timer and do not touch the other magic bits which need to be done 610 * timer and do not touch the other magic bits which need to be done
585 * when idle is left. 611 * when idle is left.
586 */ 612 */
587static void tick_nohz_kick_tick(int cpu) 613static void tick_nohz_kick_tick(int cpu, ktime_t now)
588{ 614{
589#if 0 615#if 0
590 /* Switch back to 2.6.27 behaviour */ 616 /* Switch back to 2.6.27 behaviour */
591 617
592 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 618 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
593 ktime_t delta, now; 619 ktime_t delta;
594
595 if (!ts->tick_stopped)
596 return;
597 620
598 /* 621 /*
599 * Do not touch the tick device, when the next expiry is either 622 * Do not touch the tick device, when the next expiry is either
600 * already reached or less/equal than the tick period. 623 * already reached or less/equal than the tick period.
601 */ 624 */
602 now = ktime_get();
603 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 625 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
604 if (delta.tv64 <= tick_period.tv64) 626 if (delta.tv64 <= tick_period.tv64)
605 return; 627 return;
@@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
608#endif 630#endif
609} 631}
610 632
633static inline void tick_check_nohz(int cpu)
634{
635 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
636 ktime_t now;
637
638 if (!ts->idle_active && !ts->tick_stopped)
639 return;
640 now = ktime_get();
641 if (ts->idle_active)
642 tick_nohz_stop_idle(cpu, now);
643 if (ts->tick_stopped) {
644 tick_nohz_update_jiffies(now);
645 tick_nohz_kick_tick(cpu, now);
646 }
647}
648
611#else 649#else
612 650
613static inline void tick_nohz_switch_to_nohz(void) { } 651static inline void tick_nohz_switch_to_nohz(void) { }
652static inline void tick_check_nohz(int cpu) { }
614 653
615#endif /* NO_HZ */ 654#endif /* NO_HZ */
616 655
@@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
620void tick_check_idle(int cpu) 659void tick_check_idle(int cpu)
621{ 660{
622 tick_check_oneshot_broadcast(cpu); 661 tick_check_oneshot_broadcast(cpu);
623#ifdef CONFIG_NO_HZ 662 tick_check_nohz(cpu);
624 tick_nohz_stop_idle(cpu);
625 tick_nohz_update_jiffies();
626 tick_nohz_kick_tick(cpu);
627#endif
628} 663}
629 664
630/* 665/*
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 71e7f1a19156..ac38fbb176cc 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/timecompare.h> 20#include <linux/timecompare.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h>
22#include <linux/math64.h> 23#include <linux/math64.h>
23 24
24/* 25/*
@@ -40,7 +41,7 @@ ktime_t timecompare_transform(struct timecompare *sync,
40 41
41 return ns_to_ktime(nsec); 42 return ns_to_ktime(nsec);
42} 43}
43EXPORT_SYMBOL(timecompare_transform); 44EXPORT_SYMBOL_GPL(timecompare_transform);
44 45
45int timecompare_offset(struct timecompare *sync, 46int timecompare_offset(struct timecompare *sync,
46 s64 *offset, 47 s64 *offset,
@@ -89,7 +90,7 @@ int timecompare_offset(struct timecompare *sync,
89 * source time 90 * source time
90 */ 91 */
91 sample.offset = 92 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 - 93 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
93 ts; 94 ts;
94 95
95 /* simple insertion sort based on duration */ 96 /* simple insertion sort based on duration */
@@ -131,7 +132,7 @@ int timecompare_offset(struct timecompare *sync,
131 132
132 return used; 133 return used;
133} 134}
134EXPORT_SYMBOL(timecompare_offset); 135EXPORT_SYMBOL_GPL(timecompare_offset);
135 136
136void __timecompare_update(struct timecompare *sync, 137void __timecompare_update(struct timecompare *sync,
137 u64 source_tstamp) 138 u64 source_tstamp)
@@ -188,4 +189,4 @@ void __timecompare_update(struct timecompare *sync,
188 } 189 }
189 } 190 }
190} 191}
191EXPORT_SYMBOL(__timecompare_update); 192EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907eaa..39f6177fafac 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -177,7 +177,7 @@ void timekeeping_leap_insert(int leapsecond)
177{ 177{
178 xtime.tv_sec += leapsecond; 178 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond; 179 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock); 180 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
181} 181}
182 182
183#ifdef CONFIG_GENERIC_TIME 183#ifdef CONFIG_GENERIC_TIME
@@ -337,7 +337,7 @@ int do_settimeofday(struct timespec *tv)
337 timekeeper.ntp_error = 0; 337 timekeeper.ntp_error = 0;
338 ntp_clear(); 338 ntp_clear();
339 339
340 update_vsyscall(&xtime, timekeeper.clock); 340 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
341 341
342 write_sequnlock_irqrestore(&xtime_lock, flags); 342 write_sequnlock_irqrestore(&xtime_lock, flags);
343 343
@@ -488,6 +488,17 @@ int timekeeping_valid_for_hres(void)
488} 488}
489 489
490/** 490/**
491 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
492 *
493 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
494 * ensure that the clocksource does not change!
495 */
496u64 timekeeping_max_deferment(void)
497{
498 return timekeeper.clock->max_idle_ns;
499}
500
501/**
491 * read_persistent_clock - Return time from the persistent clock. 502 * read_persistent_clock - Return time from the persistent clock.
492 * 503 *
493 * Weak dummy function for arches that do not yet support it. 504 * Weak dummy function for arches that do not yet support it.
@@ -611,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
611 write_sequnlock_irqrestore(&xtime_lock, flags); 622 write_sequnlock_irqrestore(&xtime_lock, flags);
612 623
613 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
625 clocksource_suspend();
614 626
615 return 0; 627 return 0;
616} 628}
@@ -722,6 +734,51 @@ static void timekeeping_adjust(s64 offset)
722 timekeeper.ntp_error_shift; 734 timekeeper.ntp_error_shift;
723} 735}
724 736
737
738/**
739 * logarithmic_accumulation - shifted accumulation of cycles
740 *
741 * This function accumulates a shifted interval of cycles into
742 * a shifted interval of nanoseconds. Allows for an O(log) accumulation
743 * loop.
744 *
745 * Returns the unconsumed cycles.
746 */
747static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
748{
749 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
750
751 /* If the offset is smaller than a shifted interval, do nothing */
752 if (offset < timekeeper.cycle_interval<<shift)
753 return offset;
754
755 /* Accumulate one shifted interval */
756 offset -= timekeeper.cycle_interval << shift;
757 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
758
759 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
760 while (timekeeper.xtime_nsec >= nsecps) {
761 timekeeper.xtime_nsec -= nsecps;
762 xtime.tv_sec++;
763 second_overflow();
764 }
765
766 /* Accumulate into raw time */
767 raw_time.tv_nsec += timekeeper.raw_interval << shift;;
768 while (raw_time.tv_nsec >= NSEC_PER_SEC) {
769 raw_time.tv_nsec -= NSEC_PER_SEC;
770 raw_time.tv_sec++;
771 }
772
773 /* Accumulate error between NTP and clock interval */
774 timekeeper.ntp_error += tick_length << shift;
775 timekeeper.ntp_error -= timekeeper.xtime_interval <<
776 (timekeeper.ntp_error_shift + shift);
777
778 return offset;
779}
780
781
725/** 782/**
726 * update_wall_time - Uses the current clocksource to increment the wall time 783 * update_wall_time - Uses the current clocksource to increment the wall time
727 * 784 *
@@ -732,6 +789,7 @@ void update_wall_time(void)
732 struct clocksource *clock; 789 struct clocksource *clock;
733 cycle_t offset; 790 cycle_t offset;
734 u64 nsecs; 791 u64 nsecs;
792 int shift = 0, maxshift;
735 793
736 /* Make sure we're fully resumed: */ 794 /* Make sure we're fully resumed: */
737 if (unlikely(timekeeping_suspended)) 795 if (unlikely(timekeeping_suspended))
@@ -745,33 +803,23 @@ void update_wall_time(void)
745#endif 803#endif
746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 804 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
747 805
748 /* normally this loop will run just once, however in the 806 /*
749 * case of lost or late ticks, it will accumulate correctly. 807 * With NO_HZ we may have to accumulate many cycle_intervals
808 * (think "ticks") worth of time at once. To do this efficiently,
809 * we calculate the largest doubling multiple of cycle_intervals
810 * that is smaller than the offset. We then accumulate that
811 * chunk in one go, and then try to consume the next smaller
812 * doubled multiple.
750 */ 813 */
814 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
815 shift = max(0, shift);
816 /* Bound shift to one less than what overflows tick_length */
817 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
818 shift = min(shift, maxshift);
751 while (offset >= timekeeper.cycle_interval) { 819 while (offset >= timekeeper.cycle_interval) {
752 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 820 offset = logarithmic_accumulation(offset, shift);
753 821 if(offset < timekeeper.cycle_interval<<shift)
754 /* accumulate one interval */ 822 shift--;
755 offset -= timekeeper.cycle_interval;
756 clock->cycle_last += timekeeper.cycle_interval;
757
758 timekeeper.xtime_nsec += timekeeper.xtime_interval;
759 if (timekeeper.xtime_nsec >= nsecps) {
760 timekeeper.xtime_nsec -= nsecps;
761 xtime.tv_sec++;
762 second_overflow();
763 }
764
765 raw_time.tv_nsec += timekeeper.raw_interval;
766 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
767 raw_time.tv_nsec -= NSEC_PER_SEC;
768 raw_time.tv_sec++;
769 }
770
771 /* accumulate error between NTP and clock interval */
772 timekeeper.ntp_error += tick_length;
773 timekeeper.ntp_error -= timekeeper.xtime_interval <<
774 timekeeper.ntp_error_shift;
775 } 823 }
776 824
777 /* correct the clock when NTP error is too big */ 825 /* correct the clock when NTP error is too big */
@@ -811,7 +859,7 @@ void update_wall_time(void)
811 update_xtime_cache(nsecs); 859 update_xtime_cache(nsecs);
812 860
813 /* check to see if there is a new clocksource to use */ 861 /* check to see if there is a new clocksource to use */
814 update_vsyscall(&xtime, timekeeper.clock); 862 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
815} 863}
816 864
817/** 865/**
@@ -834,6 +882,7 @@ void getboottime(struct timespec *ts)
834 882
835 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 883 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
836} 884}
885EXPORT_SYMBOL_GPL(getboottime);
837 886
838/** 887/**
839 * monotonic_to_bootbased - Convert the monotonic time to boot based. 888 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -843,6 +892,7 @@ void monotonic_to_bootbased(struct timespec *ts)
843{ 892{
844 *ts = timespec_add_safe(*ts, total_sleep_time); 893 *ts = timespec_add_safe(*ts, total_sleep_time);
845} 894}
895EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
846 896
847unsigned long get_seconds(void) 897unsigned long get_seconds(void)
848{ 898{
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdfd..1a4a7dd78777 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = base->first;
90 /* 90 /*
@@ -100,13 +100,13 @@ next_one:
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = rb_entry(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
105 print_timer(m, timer, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
106 next++; 106 next++;
107 goto next_one; 107 goto next_one;
108 } 108 }
109 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 109 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
110} 110}
111 111
112static void 112static void
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
204 return; 207 return;
205 } 208 }
206 SEQ_printf(m, "%s\n", dev->name); 209 SEQ_printf(m, "%s\n", dev->name);
207 SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); 210 SEQ_printf(m, " max_delta_ns: %llu\n",
208 SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); 211 (unsigned long long) dev->max_delta_ns);
209 SEQ_printf(m, " mult: %lu\n", dev->mult); 212 SEQ_printf(m, " min_delta_ns: %llu\n",
210 SEQ_printf(m, " shift: %d\n", dev->shift); 213 (unsigned long long) dev->min_delta_ns);
214 SEQ_printf(m, " mult: %u\n", dev->mult);
215 SEQ_printf(m, " shift: %u\n", dev->shift);
211 SEQ_printf(m, " mode: %d\n", dev->mode); 216 SEQ_printf(m, " mode: %d\n", dev->mode);
212 SEQ_printf(m, " next_event: %Ld nsecs\n", 217 SEQ_printf(m, " next_event: %Ld nsecs\n",
213 (unsigned long long) ktime_to_ns(dev->next_event)); 218 (unsigned long long) ktime_to_ns(dev->next_event));
@@ -223,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
223 SEQ_printf(m, " event_handler: "); 228 SEQ_printf(m, " event_handler: ");
224 print_name_offset(m, dev->event_handler); 229 print_name_offset(m, dev->event_handler);
225 SEQ_printf(m, "\n"); 230 SEQ_printf(m, "\n");
231 SEQ_printf(m, " retries: %lu\n", dev->retries);
226} 232}
227 233
228static void timer_list_show_tickdevices(struct seq_file *m) 234static void timer_list_show_tickdevices(struct seq_file *m)
@@ -232,10 +238,10 @@ static void timer_list_show_tickdevices(struct seq_file *m)
232#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 238#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
233 print_tickdevice(m, tick_get_broadcast_device(), -1); 239 print_tickdevice(m, tick_get_broadcast_device(), -1);
234 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 240 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
235 tick_get_broadcast_mask()->bits[0]); 241 cpumask_bits(tick_get_broadcast_mask())[0]);
236#ifdef CONFIG_TICK_ONESHOT 242#ifdef CONFIG_TICK_ONESHOT
237 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", 243 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
238 tick_get_broadcast_oneshot_mask()->bits[0]); 244 cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
239#endif 245#endif
240 SEQ_printf(m, "\n"); 246 SEQ_printf(m, "\n");
241#endif 247#endif
@@ -252,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v)
252 u64 now = ktime_to_ns(ktime_get()); 258 u64 now = ktime_to_ns(ktime_get());
253 int cpu; 259 int cpu;
254 260
255 SEQ_printf(m, "Timer List Version: v0.4\n"); 261 SEQ_printf(m, "Timer List Version: v0.6\n");
256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 262 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 263 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
258 264
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index ee5681f8d7ec..2f3b585b8d7d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock);
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
88 */ 88 */
89static DEFINE_PER_CPU(spinlock_t, lookup_lock); 89static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
90 90
91/* 91/*
92 * Mutex to serialize state changes with show-stats activities: 92 * Mutex to serialize state changes with show-stats activities:
@@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesnt matter which lock we take:
240 */ 240 */
241 spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!timer_stats_active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
249 249
250 input.timer = timer; 250 input.timer = timer;
251 input.start_func = startf; 251 input.start_func = startf;
@@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
253 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 raw_spin_lock_irqsave(lock, flags);
257 if (!timer_stats_active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
@@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
264 atomic_inc(&overflow_count); 264 atomic_inc(&overflow_count);
265 265
266 out_unlock: 266 out_unlock:
267 spin_unlock_irqrestore(lock, flags); 267 raw_spin_unlock_irqrestore(lock, flags);
268} 268}
269 269
270static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
@@ -348,9 +348,11 @@ static void sync_access(void)
348 int cpu; 348 int cpu;
349 349
350 for_each_online_cpu(cpu) { 350 for_each_online_cpu(cpu) {
351 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); 351 raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
352
353 raw_spin_lock_irqsave(lock, flags);
352 /* nothing */ 354 /* nothing */
353 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); 355 raw_spin_unlock_irqrestore(lock, flags);
354 } 356 }
355} 357}
356 358
@@ -408,7 +410,7 @@ void __init init_timer_stats(void)
408 int cpu; 410 int cpu;
409 411
410 for_each_possible_cpu(cpu) 412 for_each_possible_cpu(cpu)
411 spin_lock_init(&per_cpu(lookup_lock, cpu)); 413 raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
412} 414}
413 415
414static int __init init_tstats_procfs(void) 416static int __init init_tstats_procfs(void)
diff --git a/kernel/timer.c b/kernel/timer.c
index 5db5a8d26811..aeb6a54f2771 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/unistd.h> 45#include <asm/unistd.h>
@@ -656,8 +657,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
656 657
657 debug_activate(timer, expires); 658 debug_activate(timer, expires);
658 659
659 new_base = __get_cpu_var(tvec_bases);
660
661 cpu = smp_processor_id(); 660 cpu = smp_processor_id();
662 661
663#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 662#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
@@ -882,6 +881,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
882 if (base->running_timer == timer) 881 if (base->running_timer == timer)
883 goto out; 882 goto out;
884 883
884 timer_stats_timer_clear_start_info(timer);
885 ret = 0; 885 ret = 0;
886 if (timer_pending(timer)) { 886 if (timer_pending(timer)) {
887 detach_timer(timer, 1); 887 detach_timer(timer, 1);
@@ -1200,6 +1200,7 @@ void update_process_times(int user_tick)
1200 run_local_timers(); 1200 run_local_timers();
1201 rcu_check_callbacks(cpu, user_tick); 1201 rcu_check_callbacks(cpu, user_tick);
1202 printk_tick(); 1202 printk_tick();
1203 perf_event_do_pending();
1203 scheduler_tick(); 1204 scheduler_tick();
1204 run_posix_cpu_timers(p); 1205 run_posix_cpu_timers(p);
1205} 1206}
@@ -1211,8 +1212,6 @@ static void run_timer_softirq(struct softirq_action *h)
1211{ 1212{
1212 struct tvec_base *base = __get_cpu_var(tvec_bases); 1213 struct tvec_base *base = __get_cpu_var(tvec_bases);
1213 1214
1214 perf_event_do_pending();
1215
1216 hrtimer_run_pending(); 1215 hrtimer_run_pending();
1217 1216
1218 if (time_after_eq(jiffies, base->timer_jiffies)) 1217 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b416512ad17f..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,39 +12,37 @@ config NOP_TRACER
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help 14 help
15 See Documentation/trace/ftrace-implementation.txt 15 See Documentation/trace/ftrace-design.txt
16 16
17config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
18 bool 18 bool
19 help 19 help
20 See Documentation/trace/ftrace-implementation.txt 20 See Documentation/trace/ftrace-design.txt
21 21
22config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
23 bool 23 bool
24 help 24 help
25 See Documentation/trace/ftrace-implementation.txt 25 See Documentation/trace/ftrace-design.txt
26 26
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
36 help 34 help
37 See Documentation/trace/ftrace-implementation.txt 35 See Documentation/trace/ftrace-design.txt
38 36
39config HAVE_DYNAMIC_FTRACE 37config HAVE_DYNAMIC_FTRACE
40 bool 38 bool
41 help 39 help
42 See Documentation/trace/ftrace-implementation.txt 40 See Documentation/trace/ftrace-design.txt
43 41
44config HAVE_FTRACE_MCOUNT_RECORD 42config HAVE_FTRACE_MCOUNT_RECORD
45 bool 43 bool
46 help 44 help
47 See Documentation/trace/ftrace-implementation.txt 45 See Documentation/trace/ftrace-design.txt
48 46
49config HAVE_HW_BRANCH_TRACER 47config HAVE_HW_BRANCH_TRACER
50 bool 48 bool
@@ -52,7 +50,7 @@ config HAVE_HW_BRANCH_TRACER
52config HAVE_SYSCALL_TRACEPOINTS 50config HAVE_SYSCALL_TRACEPOINTS
53 bool 51 bool
54 help 52 help
55 See Documentation/trace/ftrace-implementation.txt 53 See Documentation/trace/ftrace-design.txt
56 54
57config TRACER_MAX_TRACE 55config TRACER_MAX_TRACE
58 bool 56 bool
@@ -83,7 +81,7 @@ config RING_BUFFER_ALLOW_SWAP
83# This allows those options to appear when no other tracer is selected. But the 81# This allows those options to appear when no other tracer is selected. But the
84# options do not appear when something else selects it. We need the two options 82# options do not appear when something else selects it. We need the two options
85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 83# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
86# hidding of the automatic options. 84# hiding of the automatic options.
87 85
88config TRACING 86config TRACING
89 bool 87 bool
@@ -119,7 +117,7 @@ menuconfig FTRACE
119 bool "Tracers" 117 bool "Tracers"
120 default y if DEBUG_KERNEL 118 default y if DEBUG_KERNEL
121 help 119 help
122 Enable the kernel tracing infrastructure. 120 Enable the kernel tracing infrastructure.
123 121
124if FTRACE 122if FTRACE
125 123
@@ -133,7 +131,7 @@ config FUNCTION_TRACER
133 help 131 help
134 Enable the kernel to trace every kernel function. This is done 132 Enable the kernel to trace every kernel function. This is done
135 by using a compiler feature to insert a small, 5-byte No-Operation 133 by using a compiler feature to insert a small, 5-byte No-Operation
136 instruction to the beginning of every kernel function, which NOP 134 instruction at the beginning of every kernel function, which NOP
137 sequence is then dynamically patched into a tracer call when 135 sequence is then dynamically patched into a tracer call when
138 tracing is enabled by the administrator. If it's runtime disabled 136 tracing is enabled by the administrator. If it's runtime disabled
139 (the bootup default), then the overhead of the instructions is very 137 (the bootup default), then the overhead of the instructions is very
@@ -150,7 +148,7 @@ config FUNCTION_GRAPH_TRACER
150 and its entry. 148 and its entry.
151 Its first purpose is to trace the duration of functions and 149 Its first purpose is to trace the duration of functions and
152 draw a call graph for each thread with some information like 150 draw a call graph for each thread with some information like
153 the return value. This is done by setting the current return 151 the return value. This is done by setting the current return
154 address on the current task structure into a stack of calls. 152 address on the current task structure into a stack of calls.
155 153
156 154
@@ -173,7 +171,7 @@ config IRQSOFF_TRACER
173 171
174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 172 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
175 173
176 (Note that kernel size and overhead increases with this option 174 (Note that kernel size and overhead increase with this option
177 enabled. This option and the preempt-off timing option can be 175 enabled. This option and the preempt-off timing option can be
178 used together or separately.) 176 used together or separately.)
179 177
@@ -186,7 +184,7 @@ config PREEMPT_TRACER
186 select TRACER_MAX_TRACE 184 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP 185 select RING_BUFFER_ALLOW_SWAP
188 help 186 help
189 This option measures the time spent in preemption off critical 187 This option measures the time spent in preemption-off critical
190 sections, with microsecond accuracy. 188 sections, with microsecond accuracy.
191 189
192 The default measurement method is a maximum search, which is 190 The default measurement method is a maximum search, which is
@@ -195,7 +193,7 @@ config PREEMPT_TRACER
195 193
196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 194 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
197 195
198 (Note that kernel size and overhead increases with this option 196 (Note that kernel size and overhead increase with this option
199 enabled. This option and the irqs-off timing option can be 197 enabled. This option and the irqs-off timing option can be
200 used together or separately.) 198 used together or separately.)
201 199
@@ -222,7 +220,7 @@ config ENABLE_DEFAULT_TRACERS
222 depends on !GENERIC_TRACER 220 depends on !GENERIC_TRACER
223 select TRACING 221 select TRACING
224 help 222 help
225 This tracer hooks to various trace points in the kernel 223 This tracer hooks to various trace points in the kernel,
226 allowing the user to pick and choose which trace point they 224 allowing the user to pick and choose which trace point they
227 want to trace. It also includes the sched_switch tracer plugin. 225 want to trace. It also includes the sched_switch tracer plugin.
228 226
@@ -265,19 +263,19 @@ choice
265 The likely/unlikely profiler only looks at the conditions that 263 The likely/unlikely profiler only looks at the conditions that
266 are annotated with a likely or unlikely macro. 264 are annotated with a likely or unlikely macro.
267 265
268 The "all branch" profiler will profile every if statement in the 266 The "all branch" profiler will profile every if-statement in the
269 kernel. This profiler will also enable the likely/unlikely 267 kernel. This profiler will also enable the likely/unlikely
270 profiler as well. 268 profiler.
271 269
272 Either of the above profilers add a bit of overhead to the system. 270 Either of the above profilers adds a bit of overhead to the system.
273 If unsure choose "No branch profiling". 271 If unsure, choose "No branch profiling".
274 272
275config BRANCH_PROFILE_NONE 273config BRANCH_PROFILE_NONE
276 bool "No branch profiling" 274 bool "No branch profiling"
277 help 275 help
278 No branch profiling. Branch profiling adds a bit of overhead. 276 No branch profiling. Branch profiling adds a bit of overhead.
279 Only enable it if you want to analyse the branching behavior. 277 Only enable it if you want to analyse the branching behavior.
280 Otherwise keep it disabled. 278 Otherwise keep it disabled.
281 279
282config PROFILE_ANNOTATED_BRANCHES 280config PROFILE_ANNOTATED_BRANCHES
283 bool "Trace likely/unlikely profiler" 281 bool "Trace likely/unlikely profiler"
@@ -288,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES
288 286
289 /sys/kernel/debug/tracing/profile_annotated_branch 287 /sys/kernel/debug/tracing/profile_annotated_branch
290 288
291 Note: this will add a significant overhead, only turn this 289 Note: this will add a significant overhead; only turn this
292 on if you need to profile the system's use of these macros. 290 on if you need to profile the system's use of these macros.
293 291
294config PROFILE_ALL_BRANCHES 292config PROFILE_ALL_BRANCHES
@@ -305,7 +303,7 @@ config PROFILE_ALL_BRANCHES
305 303
306 This configuration, when enabled, will impose a great overhead 304 This configuration, when enabled, will impose a great overhead
307 on the system. This should only be enabled when the system 305 on the system. This should only be enabled when the system
308 is to be analyzed 306 is to be analyzed in much detail.
309endchoice 307endchoice
310 308
311config TRACING_BRANCHES 309config TRACING_BRANCHES
@@ -330,15 +328,27 @@ config BRANCH_TRACER
330 328
331 Say N if unsure. 329 Say N if unsure.
332 330
333config POWER_TRACER 331config KSYM_TRACER
334 bool "Trace power consumption behavior" 332 bool "Trace read and write access on kernel memory locations"
335 depends on X86 333 depends on HAVE_HW_BREAKPOINT
336 select GENERIC_TRACER 334 select TRACING
335 help
336 This tracer helps find read and write operations on any given kernel
337 symbol i.e. /proc/kallsyms.
338
339config PROFILE_KSYM_TRACER
340 bool "Profile all kernel memory accesses on 'watched' variables"
341 depends on KSYM_TRACER
337 help 342 help
338 This tracer helps developers to analyze and optimize the kernels 343 This tracer profiles kernel accesses on variables watched through the
339 power management decisions, specifically the C-state and P-state 344 ksym tracer ftrace plugin. Depending upon the hardware, all read
340 behavior. 345 and write operations on kernel variables can be monitored for
346 accesses.
347
348 The results will be displayed in:
349 /debugfs/tracing/profile_ksym
341 350
351 Say N if unsure.
342 352
343config STACK_TRACER 353config STACK_TRACER
344 bool "Trace max stack" 354 bool "Trace max stack"
@@ -370,14 +380,14 @@ config HW_BRANCH_TRACER
370 select GENERIC_TRACER 380 select GENERIC_TRACER
371 help 381 help
372 This tracer records all branches on the system in a circular 382 This tracer records all branches on the system in a circular
373 buffer giving access to the last N branches for each cpu. 383 buffer, giving access to the last N branches for each cpu.
374 384
375config KMEMTRACE 385config KMEMTRACE
376 bool "Trace SLAB allocations" 386 bool "Trace SLAB allocations"
377 select GENERIC_TRACER 387 select GENERIC_TRACER
378 help 388 help
379 kmemtrace provides tracing for slab allocator functions, such as 389 kmemtrace provides tracing for slab allocator functions, such as
380 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 390 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
381 data is then fed to the userspace application in order to analyse 391 data is then fed to the userspace application in order to analyse
382 allocation hotspots, internal fragmentation and so on, making it 392 allocation hotspots, internal fragmentation and so on, making it
383 possible to see how well an allocator performs, as well as debug 393 possible to see how well an allocator performs, as well as debug
@@ -396,15 +406,15 @@ config WORKQUEUE_TRACER
396 bool "Trace workqueues" 406 bool "Trace workqueues"
397 select GENERIC_TRACER 407 select GENERIC_TRACER
398 help 408 help
399 The workqueue tracer provides some statistical informations 409 The workqueue tracer provides some statistical information
400 about each cpu workqueue thread such as the number of the 410 about each cpu workqueue thread such as the number of the
401 works inserted and executed since their creation. It can help 411 works inserted and executed since their creation. It can help
402 to evaluate the amount of work each of them have to perform. 412 to evaluate the amount of work each of them has to perform.
403 For example it can help a developer to decide whether he should 413 For example it can help a developer to decide whether he should
404 choose a per cpu workqueue instead of a singlethreaded one. 414 choose a per-cpu workqueue instead of a singlethreaded one.
405 415
406config BLK_DEV_IO_TRACE 416config BLK_DEV_IO_TRACE
407 bool "Support for tracing block io actions" 417 bool "Support for tracing block IO actions"
408 depends on SYSFS 418 depends on SYSFS
409 depends on BLOCK 419 depends on BLOCK
410 select RELAY 420 select RELAY
@@ -428,38 +438,55 @@ config BLK_DEV_IO_TRACE
428 438
429 If unsure, say N. 439 If unsure, say N.
430 440
441config KPROBE_EVENT
442 depends on KPROBES
443 depends on HAVE_REGS_AND_STACK_ACCESS_API
444 bool "Enable kprobes-based dynamic events"
445 select TRACING
446 default y
447 help
448 This allows the user to add tracing events (similar to tracepoints)
449 on the fly via the ftrace interface. See
450 Documentation/trace/kprobetrace.txt for more details.
451
452 Those events can be inserted wherever kprobes can probe, and record
453 various register and memory values.
454
455 This option is also required by perf-probe subcommand of perf tools.
456 If you want to use perf tools, this option is strongly recommended.
457
431config DYNAMIC_FTRACE 458config DYNAMIC_FTRACE
432 bool "enable/disable ftrace tracepoints dynamically" 459 bool "enable/disable ftrace tracepoints dynamically"
433 depends on FUNCTION_TRACER 460 depends on FUNCTION_TRACER
434 depends on HAVE_DYNAMIC_FTRACE 461 depends on HAVE_DYNAMIC_FTRACE
435 default y 462 default y
436 help 463 help
437 This option will modify all the calls to ftrace dynamically 464 This option will modify all the calls to ftrace dynamically
438 (will patch them out of the binary image and replaces them 465 (will patch them out of the binary image and replace them
439 with a No-Op instruction) as they are called. A table is 466 with a No-Op instruction) as they are called. A table is
440 created to dynamically enable them again. 467 created to dynamically enable them again.
441 468
442 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise 469 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
443 has native performance as long as no tracing is active. 470 otherwise has native performance as long as no tracing is active.
444 471
445 The changes to the code are done by a kernel thread that 472 The changes to the code are done by a kernel thread that
446 wakes up once a second and checks to see if any ftrace calls 473 wakes up once a second and checks to see if any ftrace calls
447 were made. If so, it runs stop_machine (stops all CPUS) 474 were made. If so, it runs stop_machine (stops all CPUS)
448 and modifies the code to jump over the call to ftrace. 475 and modifies the code to jump over the call to ftrace.
449 476
450config FUNCTION_PROFILER 477config FUNCTION_PROFILER
451 bool "Kernel function profiler" 478 bool "Kernel function profiler"
452 depends on FUNCTION_TRACER 479 depends on FUNCTION_TRACER
453 default n 480 default n
454 help 481 help
455 This option enables the kernel function profiler. A file is created 482 This option enables the kernel function profiler. A file is created
456 in debugfs called function_profile_enabled which defaults to zero. 483 in debugfs called function_profile_enabled which defaults to zero.
457 When a 1 is echoed into this file profiling begins, and when a 484 When a 1 is echoed into this file profiling begins, and when a
458 zero is entered, profiling stops. A file in the trace_stats 485 zero is entered, profiling stops. A "functions" file is created in
459 directory called functions, that show the list of functions that 486 the trace_stats directory; this file shows the list of functions that
460 have been hit and their counters. 487 have been hit and their counters.
461 488
462 If in doubt, say N 489 If in doubt, say N.
463 490
464config FTRACE_MCOUNT_RECORD 491config FTRACE_MCOUNT_RECORD
465 def_bool y 492 def_bool y
@@ -518,8 +545,8 @@ config RING_BUFFER_BENCHMARK
518 tristate "Ring buffer benchmark stress tester" 545 tristate "Ring buffer benchmark stress tester"
519 depends on RING_BUFFER 546 depends on RING_BUFFER
520 help 547 help
521 This option creates a test to stress the ring buffer and bench mark it. 548 This option creates a test to stress the ring buffer and benchmark it.
522 It creates its own ring buffer such that it will not interfer with 549 It creates its own ring buffer such that it will not interfere with
523 any other users of the ring buffer (such as ftrace). It then creates 550 any other users of the ring buffer (such as ftrace). It then creates
524 a producer and consumer that will run for 10 seconds and sleep for 551 a producer and consumer that will run for 10 seconds and sleep for
525 10 seconds. Each interval it will print out the number of events 552 10 seconds. Each interval it will print out the number of events
@@ -528,7 +555,7 @@ config RING_BUFFER_BENCHMARK
528 It does not disable interrupts or raise its priority, so it may be 555 It does not disable interrupts or raise its priority, so it may be
529 affected by processes that are running. 556 affected by processes that are running.
530 557
531 If unsure, say N 558 If unsure, say N.
532 559
533endif # FTRACE 560endif # FTRACE
534 561
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 26f03ac07c2b..78edc6490038 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,8 +51,12 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 51obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
56endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
59obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o 60obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 61
58libftrace-y := ftrace.o 62libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..b3bc91a3f510 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/debugfs.h> 25#include <linux/debugfs.h>
25#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
26#include <linux/time.h> 27#include <linux/time.h>
@@ -540,9 +541,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
540 if (ret) 541 if (ret)
541 return ret; 542 return ret;
542 543
543 if (copy_to_user(arg, &buts, sizeof(buts))) 544 if (copy_to_user(arg, &buts, sizeof(buts))) {
545 blk_trace_remove(q);
544 return -EFAULT; 546 return -EFAULT;
545 547 }
546 return 0; 548 return 0;
547} 549}
548EXPORT_SYMBOL_GPL(blk_trace_setup); 550EXPORT_SYMBOL_GPL(blk_trace_setup);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6dc4e5ef7a01..2404b59b3097 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,12 +22,13 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/slab.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h> 30#include <linux/hash.h>
31#include <linux/rcupdate.h>
31 32
32#include <trace/events/sched.h> 33#include <trace/events/sched.h>
33 34
@@ -60,6 +61,13 @@ static int last_ftrace_enabled;
60/* Quick disabling of function tracer. */ 61/* Quick disabling of function tracer. */
61int function_trace_stop; 62int function_trace_stop;
62 63
64/* List for set_ftrace_pid's pids. */
65LIST_HEAD(ftrace_pids);
66struct ftrace_pid {
67 struct list_head list;
68 struct pid *pid;
69};
70
63/* 71/*
64 * ftrace_disabled is set when an anomaly is discovered. 72 * ftrace_disabled is set when an anomaly is discovered.
65 * ftrace_disabled is much stronger than ftrace_enabled. 73 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -78,18 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
78ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
79ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
80 88
89/*
90 * Traverse the ftrace_list, invoking all entries. The reason that we
91 * can use rcu_dereference_raw() is that elements removed from this list
92 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list.
95 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */
81static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
82{ 99{
83 struct ftrace_ops *op = ftrace_list; 100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
84
85 /* in case someone actually ports this to alpha! */
86 read_barrier_depends();
87 101
88 while (op != &ftrace_list_end) { 102 while (op != &ftrace_list_end) {
89 /* silly alpha */
90 read_barrier_depends();
91 op->func(ip, parent_ip); 103 op->func(ip, parent_ip);
92 op = op->next; 104 op = rcu_dereference_raw(op->next); /*see above*/
93 }; 105 };
94} 106}
95 107
@@ -144,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
144 * the ops->next pointer is valid before another CPU sees 156 * the ops->next pointer is valid before another CPU sees
145 * the ops pointer included into the ftrace_list. 157 * the ops pointer included into the ftrace_list.
146 */ 158 */
147 smp_wmb(); 159 rcu_assign_pointer(ftrace_list, ops);
148 ftrace_list = ops;
149 160
150 if (ftrace_enabled) { 161 if (ftrace_enabled) {
151 ftrace_func_t func; 162 ftrace_func_t func;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 else 166 else
156 func = ftrace_list_func; 167 func = ftrace_list_func;
157 168
158 if (ftrace_pid_trace) { 169 if (!list_empty(&ftrace_pids)) {
159 set_ftrace_pid_function(func); 170 set_ftrace_pid_function(func);
160 func = ftrace_pid_func; 171 func = ftrace_pid_func;
161 } 172 }
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
203 if (ftrace_list->next == &ftrace_list_end) { 214 if (ftrace_list->next == &ftrace_list_end) {
204 ftrace_func_t func = ftrace_list->func; 215 ftrace_func_t func = ftrace_list->func;
205 216
206 if (ftrace_pid_trace) { 217 if (!list_empty(&ftrace_pids)) {
207 set_ftrace_pid_function(func); 218 set_ftrace_pid_function(func);
208 func = ftrace_pid_func; 219 func = ftrace_pid_func;
209 } 220 }
@@ -231,7 +242,7 @@ static void ftrace_update_pid_func(void)
231 func = __ftrace_trace_function; 242 func = __ftrace_trace_function;
232#endif 243#endif
233 244
234 if (ftrace_pid_trace) { 245 if (!list_empty(&ftrace_pids)) {
235 set_ftrace_pid_function(func); 246 set_ftrace_pid_function(func);
236 func = ftrace_pid_func; 247 func = ftrace_pid_func;
237 } else { 248 } else {
@@ -821,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
821} 832}
822#endif /* CONFIG_FUNCTION_PROFILER */ 833#endif /* CONFIG_FUNCTION_PROFILER */
823 834
824/* set when tracing only a pid */
825struct pid *ftrace_pid_trace;
826static struct pid * const ftrace_swapper_pid = &init_struct_pid; 835static struct pid * const ftrace_swapper_pid = &init_struct_pid;
827 836
828#ifdef CONFIG_DYNAMIC_FTRACE 837#ifdef CONFIG_DYNAMIC_FTRACE
@@ -889,36 +898,6 @@ static struct dyn_ftrace *ftrace_free_records;
889 } \ 898 } \
890 } 899 }
891 900
892#ifdef CONFIG_KPROBES
893
894static int frozen_record_count;
895
896static inline void freeze_record(struct dyn_ftrace *rec)
897{
898 if (!(rec->flags & FTRACE_FL_FROZEN)) {
899 rec->flags |= FTRACE_FL_FROZEN;
900 frozen_record_count++;
901 }
902}
903
904static inline void unfreeze_record(struct dyn_ftrace *rec)
905{
906 if (rec->flags & FTRACE_FL_FROZEN) {
907 rec->flags &= ~FTRACE_FL_FROZEN;
908 frozen_record_count--;
909 }
910}
911
912static inline int record_frozen(struct dyn_ftrace *rec)
913{
914 return rec->flags & FTRACE_FL_FROZEN;
915}
916#else
917# define freeze_record(rec) ({ 0; })
918# define unfreeze_record(rec) ({ 0; })
919# define record_frozen(rec) ({ 0; })
920#endif /* CONFIG_KPROBES */
921
922static void ftrace_free_rec(struct dyn_ftrace *rec) 901static void ftrace_free_rec(struct dyn_ftrace *rec)
923{ 902{
924 rec->freelist = ftrace_free_records; 903 rec->freelist = ftrace_free_records;
@@ -1016,6 +995,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1016} 995}
1017 996
1018 997
998/* Return 1 if the address range is reserved for ftrace */
999int ftrace_text_reserved(void *start, void *end)
1000{
1001 struct dyn_ftrace *rec;
1002 struct ftrace_page *pg;
1003
1004 do_for_each_ftrace_rec(pg, rec) {
1005 if (rec->ip <= (unsigned long)end &&
1006 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1007 return 1;
1008 } while_for_each_ftrace_rec();
1009 return 0;
1010}
1011
1012
1019static int 1013static int
1020__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1014__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1021{ 1015{
@@ -1067,14 +1061,6 @@ static void ftrace_replace_code(int enable)
1067 !(rec->flags & FTRACE_FL_CONVERTED)) 1061 !(rec->flags & FTRACE_FL_CONVERTED))
1068 continue; 1062 continue;
1069 1063
1070 /* ignore updates to this record's mcount site */
1071 if (get_kprobe((void *)rec->ip)) {
1072 freeze_record(rec);
1073 continue;
1074 } else {
1075 unfreeze_record(rec);
1076 }
1077
1078 failed = __ftrace_replace_code(rec, enable); 1064 failed = __ftrace_replace_code(rec, enable);
1079 if (failed) { 1065 if (failed) {
1080 rec->flags |= FTRACE_FL_FAILED; 1066 rec->flags |= FTRACE_FL_FAILED;
@@ -1261,12 +1247,34 @@ static int ftrace_update_code(struct module *mod)
1261 ftrace_new_addrs = p->newlist; 1247 ftrace_new_addrs = p->newlist;
1262 p->flags = 0L; 1248 p->flags = 0L;
1263 1249
1264 /* convert record (i.e, patch mcount-call with NOP) */ 1250 /*
1265 if (ftrace_code_disable(mod, p)) { 1251 * Do the initial record conversion from mcount jump
1266 p->flags |= FTRACE_FL_CONVERTED; 1252 * to the NOP instructions.
1267 ftrace_update_cnt++; 1253 */
1268 } else 1254 if (!ftrace_code_disable(mod, p)) {
1269 ftrace_free_rec(p); 1255 ftrace_free_rec(p);
1256 continue;
1257 }
1258
1259 p->flags |= FTRACE_FL_CONVERTED;
1260 ftrace_update_cnt++;
1261
1262 /*
1263 * If the tracing is enabled, go ahead and enable the record.
1264 *
1265 * The reason not to enable the record immediately is the
1266 * inherent check of ftrace_make_nop/ftrace_make_call for
1267 * correct previous instructions. Making first the NOP
1268 * conversion puts the module to the correct state, thus
1269 * passing the ftrace_make_call check.
1270 */
1271 if (ftrace_start_up) {
1272 int failed = __ftrace_replace_code(p, 1);
1273 if (failed) {
1274 ftrace_bug(failed, p->ip);
1275 ftrace_free_rec(p);
1276 }
1277 }
1270 } 1278 }
1271 1279
1272 stop = ftrace_now(raw_smp_processor_id()); 1280 stop = ftrace_now(raw_smp_processor_id());
@@ -1656,64 +1664,10 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1656 return ret; 1664 return ret;
1657} 1665}
1658 1666
1659enum {
1660 MATCH_FULL,
1661 MATCH_FRONT_ONLY,
1662 MATCH_MIDDLE_ONLY,
1663 MATCH_END_ONLY,
1664};
1665
1666/*
1667 * (static function - no need for kernel doc)
1668 *
1669 * Pass in a buffer containing a glob and this function will
1670 * set search to point to the search part of the buffer and
1671 * return the type of search it is (see enum above).
1672 * This does modify buff.
1673 *
1674 * Returns enum type.
1675 * search returns the pointer to use for comparison.
1676 * not returns 1 if buff started with a '!'
1677 * 0 otherwise.
1678 */
1679static int
1680ftrace_setup_glob(char *buff, int len, char **search, int *not)
1681{
1682 int type = MATCH_FULL;
1683 int i;
1684
1685 if (buff[0] == '!') {
1686 *not = 1;
1687 buff++;
1688 len--;
1689 } else
1690 *not = 0;
1691
1692 *search = buff;
1693
1694 for (i = 0; i < len; i++) {
1695 if (buff[i] == '*') {
1696 if (!i) {
1697 *search = buff + 1;
1698 type = MATCH_END_ONLY;
1699 } else {
1700 if (type == MATCH_END_ONLY)
1701 type = MATCH_MIDDLE_ONLY;
1702 else
1703 type = MATCH_FRONT_ONLY;
1704 buff[i] = 0;
1705 break;
1706 }
1707 }
1708 }
1709
1710 return type;
1711}
1712
1713static int ftrace_match(char *str, char *regex, int len, int type) 1667static int ftrace_match(char *str, char *regex, int len, int type)
1714{ 1668{
1715 int matched = 0; 1669 int matched = 0;
1716 char *ptr; 1670 int slen;
1717 1671
1718 switch (type) { 1672 switch (type) {
1719 case MATCH_FULL: 1673 case MATCH_FULL:
@@ -1729,8 +1683,8 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1729 matched = 1; 1683 matched = 1;
1730 break; 1684 break;
1731 case MATCH_END_ONLY: 1685 case MATCH_END_ONLY:
1732 ptr = strstr(str, regex); 1686 slen = strlen(str);
1733 if (ptr && (ptr[len] == 0)) 1687 if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
1734 matched = 1; 1688 matched = 1;
1735 break; 1689 break;
1736 } 1690 }
@@ -1747,7 +1701,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
1747 return ftrace_match(str, regex, len, type); 1701 return ftrace_match(str, regex, len, type);
1748} 1702}
1749 1703
1750static void ftrace_match_records(char *buff, int len, int enable) 1704static int ftrace_match_records(char *buff, int len, int enable)
1751{ 1705{
1752 unsigned int search_len; 1706 unsigned int search_len;
1753 struct ftrace_page *pg; 1707 struct ftrace_page *pg;
@@ -1756,9 +1710,10 @@ static void ftrace_match_records(char *buff, int len, int enable)
1756 char *search; 1710 char *search;
1757 int type; 1711 int type;
1758 int not; 1712 int not;
1713 int found = 0;
1759 1714
1760 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1715 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1761 type = ftrace_setup_glob(buff, len, &search, &not); 1716 type = filter_parse_regex(buff, len, &search, &not);
1762 1717
1763 search_len = strlen(search); 1718 search_len = strlen(search);
1764 1719
@@ -1773,6 +1728,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1773 rec->flags &= ~flag; 1728 rec->flags &= ~flag;
1774 else 1729 else
1775 rec->flags |= flag; 1730 rec->flags |= flag;
1731 found = 1;
1776 } 1732 }
1777 /* 1733 /*
1778 * Only enable filtering if we have a function that 1734 * Only enable filtering if we have a function that
@@ -1782,6 +1738,8 @@ static void ftrace_match_records(char *buff, int len, int enable)
1782 ftrace_filtered = 1; 1738 ftrace_filtered = 1;
1783 } while_for_each_ftrace_rec(); 1739 } while_for_each_ftrace_rec();
1784 mutex_unlock(&ftrace_lock); 1740 mutex_unlock(&ftrace_lock);
1741
1742 return found;
1785} 1743}
1786 1744
1787static int 1745static int
@@ -1803,7 +1761,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
1803 return 1; 1761 return 1;
1804} 1762}
1805 1763
1806static void ftrace_match_module_records(char *buff, char *mod, int enable) 1764static int ftrace_match_module_records(char *buff, char *mod, int enable)
1807{ 1765{
1808 unsigned search_len = 0; 1766 unsigned search_len = 0;
1809 struct ftrace_page *pg; 1767 struct ftrace_page *pg;
@@ -1812,6 +1770,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1812 char *search = buff; 1770 char *search = buff;
1813 unsigned long flag; 1771 unsigned long flag;
1814 int not = 0; 1772 int not = 0;
1773 int found = 0;
1815 1774
1816 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1775 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1817 1776
@@ -1826,7 +1785,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1826 } 1785 }
1827 1786
1828 if (strlen(buff)) { 1787 if (strlen(buff)) {
1829 type = ftrace_setup_glob(buff, strlen(buff), &search, &not); 1788 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1830 search_len = strlen(search); 1789 search_len = strlen(search);
1831 } 1790 }
1832 1791
@@ -1842,12 +1801,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1842 rec->flags &= ~flag; 1801 rec->flags &= ~flag;
1843 else 1802 else
1844 rec->flags |= flag; 1803 rec->flags |= flag;
1804 found = 1;
1845 } 1805 }
1846 if (enable && (rec->flags & FTRACE_FL_FILTER)) 1806 if (enable && (rec->flags & FTRACE_FL_FILTER))
1847 ftrace_filtered = 1; 1807 ftrace_filtered = 1;
1848 1808
1849 } while_for_each_ftrace_rec(); 1809 } while_for_each_ftrace_rec();
1850 mutex_unlock(&ftrace_lock); 1810 mutex_unlock(&ftrace_lock);
1811
1812 return found;
1851} 1813}
1852 1814
1853/* 1815/*
@@ -1876,8 +1838,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1876 if (!strlen(mod)) 1838 if (!strlen(mod))
1877 return -EINVAL; 1839 return -EINVAL;
1878 1840
1879 ftrace_match_module_records(func, mod, enable); 1841 if (ftrace_match_module_records(func, mod, enable))
1880 return 0; 1842 return 0;
1843 return -EINVAL;
1881} 1844}
1882 1845
1883static struct ftrace_func_command ftrace_mod_cmd = { 1846static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1991,7 +1954,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1991 int count = 0; 1954 int count = 0;
1992 char *search; 1955 char *search;
1993 1956
1994 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 1957 type = filter_parse_regex(glob, strlen(glob), &search, &not);
1995 len = strlen(search); 1958 len = strlen(search);
1996 1959
1997 /* we do not support '!' for function probes */ 1960 /* we do not support '!' for function probes */
@@ -2068,7 +2031,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2068 else if (glob) { 2031 else if (glob) {
2069 int not; 2032 int not;
2070 2033
2071 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2034 type = filter_parse_regex(glob, strlen(glob), &search, &not);
2072 len = strlen(search); 2035 len = strlen(search);
2073 2036
2074 /* we do not support '!' for function probes */ 2037 /* we do not support '!' for function probes */
@@ -2174,8 +2137,9 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2174 func = strsep(&next, ":"); 2137 func = strsep(&next, ":");
2175 2138
2176 if (!next) { 2139 if (!next) {
2177 ftrace_match_records(func, len, enable); 2140 if (ftrace_match_records(func, len, enable))
2178 return 0; 2141 return 0;
2142 return ret;
2179 } 2143 }
2180 2144
2181 /* command found */ 2145 /* command found */
@@ -2221,10 +2185,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2221 !trace_parser_cont(parser)) { 2185 !trace_parser_cont(parser)) {
2222 ret = ftrace_process_regex(parser->buffer, 2186 ret = ftrace_process_regex(parser->buffer,
2223 parser->idx, enable); 2187 parser->idx, enable);
2188 trace_parser_clear(parser);
2224 if (ret) 2189 if (ret)
2225 goto out_unlock; 2190 goto out_unlock;
2226
2227 trace_parser_clear(parser);
2228 } 2191 }
2229 2192
2230 ret = read; 2193 ret = read;
@@ -2312,6 +2275,34 @@ static int __init set_ftrace_filter(char *str)
2312} 2275}
2313__setup("ftrace_filter=", set_ftrace_filter); 2276__setup("ftrace_filter=", set_ftrace_filter);
2314 2277
2278#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2279static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2280static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
2281
2282static int __init set_graph_function(char *str)
2283{
2284 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
2285 return 1;
2286}
2287__setup("ftrace_graph_filter=", set_graph_function);
2288
2289static void __init set_ftrace_early_graph(char *buf)
2290{
2291 int ret;
2292 char *func;
2293
2294 while (buf) {
2295 func = strsep(&buf, ",");
2296 /* we allow only one expression at a time */
2297 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2298 func);
2299 if (ret)
2300 printk(KERN_DEBUG "ftrace: function %s not "
2301 "traceable\n", func);
2302 }
2303}
2304#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2305
2315static void __init set_ftrace_early_filter(char *buf, int enable) 2306static void __init set_ftrace_early_filter(char *buf, int enable)
2316{ 2307{
2317 char *func; 2308 char *func;
@@ -2328,6 +2319,10 @@ static void __init set_ftrace_early_filters(void)
2328 set_ftrace_early_filter(ftrace_filter_buf, 1); 2319 set_ftrace_early_filter(ftrace_filter_buf, 1);
2329 if (ftrace_notrace_buf[0]) 2320 if (ftrace_notrace_buf[0])
2330 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2321 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2322#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2323 if (ftrace_graph_buf[0])
2324 set_ftrace_early_graph(ftrace_graph_buf);
2325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2331} 2326}
2332 2327
2333static int 2328static int
@@ -2410,6 +2405,7 @@ static const struct file_operations ftrace_notrace_fops = {
2410static DEFINE_MUTEX(graph_lock); 2405static DEFINE_MUTEX(graph_lock);
2411 2406
2412int ftrace_graph_count; 2407int ftrace_graph_count;
2408int ftrace_graph_filter_enabled;
2413unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2409unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2414 2410
2415static void * 2411static void *
@@ -2432,7 +2428,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2432 mutex_lock(&graph_lock); 2428 mutex_lock(&graph_lock);
2433 2429
2434 /* Nothing, tell g_show to print all functions are enabled */ 2430 /* Nothing, tell g_show to print all functions are enabled */
2435 if (!ftrace_graph_count && !*pos) 2431 if (!ftrace_graph_filter_enabled && !*pos)
2436 return (void *)1; 2432 return (void *)1;
2437 2433
2438 return __g_next(m, pos); 2434 return __g_next(m, pos);
@@ -2478,6 +2474,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2478 mutex_lock(&graph_lock); 2474 mutex_lock(&graph_lock);
2479 if ((file->f_mode & FMODE_WRITE) && 2475 if ((file->f_mode & FMODE_WRITE) &&
2480 (file->f_flags & O_TRUNC)) { 2476 (file->f_flags & O_TRUNC)) {
2477 ftrace_graph_filter_enabled = 0;
2481 ftrace_graph_count = 0; 2478 ftrace_graph_count = 0;
2482 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2479 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2483 } 2480 }
@@ -2503,7 +2500,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2503 struct dyn_ftrace *rec; 2500 struct dyn_ftrace *rec;
2504 struct ftrace_page *pg; 2501 struct ftrace_page *pg;
2505 int search_len; 2502 int search_len;
2506 int found = 0; 2503 int fail = 1;
2507 int type, not; 2504 int type, not;
2508 char *search; 2505 char *search;
2509 bool exists; 2506 bool exists;
@@ -2513,39 +2510,52 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2513 return -ENODEV; 2510 return -ENODEV;
2514 2511
2515 /* decode regex */ 2512 /* decode regex */
2516 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not); 2513 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2517 if (not) 2514 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2518 return -EINVAL; 2515 return -EBUSY;
2519 2516
2520 search_len = strlen(search); 2517 search_len = strlen(search);
2521 2518
2522 mutex_lock(&ftrace_lock); 2519 mutex_lock(&ftrace_lock);
2523 do_for_each_ftrace_rec(pg, rec) { 2520 do_for_each_ftrace_rec(pg, rec) {
2524 2521
2525 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2526 break;
2527
2528 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2522 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2529 continue; 2523 continue;
2530 2524
2531 if (ftrace_match_record(rec, search, search_len, type)) { 2525 if (ftrace_match_record(rec, search, search_len, type)) {
2532 /* ensure it is not already in the array */ 2526 /* if it is in the array */
2533 exists = false; 2527 exists = false;
2534 for (i = 0; i < *idx; i++) 2528 for (i = 0; i < *idx; i++) {
2535 if (array[i] == rec->ip) { 2529 if (array[i] == rec->ip) {
2536 exists = true; 2530 exists = true;
2537 break; 2531 break;
2538 } 2532 }
2539 if (!exists) { 2533 }
2540 array[(*idx)++] = rec->ip; 2534
2541 found = 1; 2535 if (!not) {
2536 fail = 0;
2537 if (!exists) {
2538 array[(*idx)++] = rec->ip;
2539 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2540 goto out;
2541 }
2542 } else {
2543 if (exists) {
2544 array[i] = array[--(*idx)];
2545 array[*idx] = 0;
2546 fail = 0;
2547 }
2542 } 2548 }
2543 } 2549 }
2544 } while_for_each_ftrace_rec(); 2550 } while_for_each_ftrace_rec();
2545 2551out:
2546 mutex_unlock(&ftrace_lock); 2552 mutex_unlock(&ftrace_lock);
2547 2553
2548 return found ? 0 : -EINVAL; 2554 if (fail)
2555 return -EINVAL;
2556
2557 ftrace_graph_filter_enabled = 1;
2558 return 0;
2549} 2559}
2550 2560
2551static ssize_t 2561static ssize_t
@@ -2555,16 +2565,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2555 struct trace_parser parser; 2565 struct trace_parser parser;
2556 ssize_t read, ret; 2566 ssize_t read, ret;
2557 2567
2558 if (!cnt || cnt < 0) 2568 if (!cnt)
2559 return 0; 2569 return 0;
2560 2570
2561 mutex_lock(&graph_lock); 2571 mutex_lock(&graph_lock);
2562 2572
2563 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2564 ret = -EBUSY;
2565 goto out_unlock;
2566 }
2567
2568 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2573 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2569 ret = -ENOMEM; 2574 ret = -ENOMEM;
2570 goto out_unlock; 2575 goto out_unlock;
@@ -2624,7 +2629,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2624 return 0; 2629 return 0;
2625} 2630}
2626 2631
2627static int ftrace_convert_nops(struct module *mod, 2632static int ftrace_process_locs(struct module *mod,
2628 unsigned long *start, 2633 unsigned long *start,
2629 unsigned long *end) 2634 unsigned long *end)
2630{ 2635{
@@ -2684,7 +2689,7 @@ static void ftrace_init_module(struct module *mod,
2684{ 2689{
2685 if (ftrace_disabled || start == end) 2690 if (ftrace_disabled || start == end)
2686 return; 2691 return;
2687 ftrace_convert_nops(mod, start, end); 2692 ftrace_process_locs(mod, start, end);
2688} 2693}
2689 2694
2690static int ftrace_module_notify(struct notifier_block *self, 2695static int ftrace_module_notify(struct notifier_block *self,
@@ -2745,7 +2750,7 @@ void __init ftrace_init(void)
2745 2750
2746 last_ftrace_enabled = ftrace_enabled = 1; 2751 last_ftrace_enabled = ftrace_enabled = 1;
2747 2752
2748 ret = ftrace_convert_nops(NULL, 2753 ret = ftrace_process_locs(NULL,
2749 __start_mcount_loc, 2754 __start_mcount_loc,
2750 __stop_mcount_loc); 2755 __stop_mcount_loc);
2751 2756
@@ -2778,23 +2783,6 @@ static inline void ftrace_startup_enable(int command) { }
2778# define ftrace_shutdown_sysctl() do { } while (0) 2783# define ftrace_shutdown_sysctl() do { } while (0)
2779#endif /* CONFIG_DYNAMIC_FTRACE */ 2784#endif /* CONFIG_DYNAMIC_FTRACE */
2780 2785
2781static ssize_t
2782ftrace_pid_read(struct file *file, char __user *ubuf,
2783 size_t cnt, loff_t *ppos)
2784{
2785 char buf[64];
2786 int r;
2787
2788 if (ftrace_pid_trace == ftrace_swapper_pid)
2789 r = sprintf(buf, "swapper tasks\n");
2790 else if (ftrace_pid_trace)
2791 r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
2792 else
2793 r = sprintf(buf, "no pid\n");
2794
2795 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2796}
2797
2798static void clear_ftrace_swapper(void) 2786static void clear_ftrace_swapper(void)
2799{ 2787{
2800 struct task_struct *p; 2788 struct task_struct *p;
@@ -2845,14 +2833,12 @@ static void set_ftrace_pid(struct pid *pid)
2845 rcu_read_unlock(); 2833 rcu_read_unlock();
2846} 2834}
2847 2835
2848static void clear_ftrace_pid_task(struct pid **pid) 2836static void clear_ftrace_pid_task(struct pid *pid)
2849{ 2837{
2850 if (*pid == ftrace_swapper_pid) 2838 if (pid == ftrace_swapper_pid)
2851 clear_ftrace_swapper(); 2839 clear_ftrace_swapper();
2852 else 2840 else
2853 clear_ftrace_pid(*pid); 2841 clear_ftrace_pid(pid);
2854
2855 *pid = NULL;
2856} 2842}
2857 2843
2858static void set_ftrace_pid_task(struct pid *pid) 2844static void set_ftrace_pid_task(struct pid *pid)
@@ -2863,74 +2849,184 @@ static void set_ftrace_pid_task(struct pid *pid)
2863 set_ftrace_pid(pid); 2849 set_ftrace_pid(pid);
2864} 2850}
2865 2851
2866static ssize_t 2852static int ftrace_pid_add(int p)
2867ftrace_pid_write(struct file *filp, const char __user *ubuf,
2868 size_t cnt, loff_t *ppos)
2869{ 2853{
2870 struct pid *pid; 2854 struct pid *pid;
2871 char buf[64]; 2855 struct ftrace_pid *fpid;
2872 long val; 2856 int ret = -EINVAL;
2873 int ret;
2874 2857
2875 if (cnt >= sizeof(buf)) 2858 mutex_lock(&ftrace_lock);
2876 return -EINVAL;
2877 2859
2878 if (copy_from_user(&buf, ubuf, cnt)) 2860 if (!p)
2879 return -EFAULT; 2861 pid = ftrace_swapper_pid;
2862 else
2863 pid = find_get_pid(p);
2880 2864
2881 buf[cnt] = 0; 2865 if (!pid)
2866 goto out;
2882 2867
2883 ret = strict_strtol(buf, 10, &val); 2868 ret = 0;
2884 if (ret < 0)
2885 return ret;
2886 2869
2887 mutex_lock(&ftrace_lock); 2870 list_for_each_entry(fpid, &ftrace_pids, list)
2888 if (val < 0) { 2871 if (fpid->pid == pid)
2889 /* disable pid tracing */ 2872 goto out_put;
2890 if (!ftrace_pid_trace)
2891 goto out;
2892 2873
2893 clear_ftrace_pid_task(&ftrace_pid_trace); 2874 ret = -ENOMEM;
2894 2875
2895 } else { 2876 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
2896 /* swapper task is special */ 2877 if (!fpid)
2897 if (!val) { 2878 goto out_put;
2898 pid = ftrace_swapper_pid;
2899 if (pid == ftrace_pid_trace)
2900 goto out;
2901 } else {
2902 pid = find_get_pid(val);
2903 2879
2904 if (pid == ftrace_pid_trace) { 2880 list_add(&fpid->list, &ftrace_pids);
2905 put_pid(pid); 2881 fpid->pid = pid;
2906 goto out;
2907 }
2908 }
2909 2882
2910 if (ftrace_pid_trace) 2883 set_ftrace_pid_task(pid);
2911 clear_ftrace_pid_task(&ftrace_pid_trace);
2912 2884
2913 if (!pid) 2885 ftrace_update_pid_func();
2914 goto out; 2886 ftrace_startup_enable(0);
2887
2888 mutex_unlock(&ftrace_lock);
2889 return 0;
2890
2891out_put:
2892 if (pid != ftrace_swapper_pid)
2893 put_pid(pid);
2894
2895out:
2896 mutex_unlock(&ftrace_lock);
2897 return ret;
2898}
2899
2900static void ftrace_pid_reset(void)
2901{
2902 struct ftrace_pid *fpid, *safe;
2903
2904 mutex_lock(&ftrace_lock);
2905 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
2906 struct pid *pid = fpid->pid;
2915 2907
2916 ftrace_pid_trace = pid; 2908 clear_ftrace_pid_task(pid);
2917 2909
2918 set_ftrace_pid_task(ftrace_pid_trace); 2910 list_del(&fpid->list);
2911 kfree(fpid);
2919 } 2912 }
2920 2913
2921 /* update the function call */
2922 ftrace_update_pid_func(); 2914 ftrace_update_pid_func();
2923 ftrace_startup_enable(0); 2915 ftrace_startup_enable(0);
2924 2916
2925 out:
2926 mutex_unlock(&ftrace_lock); 2917 mutex_unlock(&ftrace_lock);
2918}
2927 2919
2928 return cnt; 2920static void *fpid_start(struct seq_file *m, loff_t *pos)
2921{
2922 mutex_lock(&ftrace_lock);
2923
2924 if (list_empty(&ftrace_pids) && (!*pos))
2925 return (void *) 1;
2926
2927 return seq_list_start(&ftrace_pids, *pos);
2928}
2929
2930static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
2931{
2932 if (v == (void *)1)
2933 return NULL;
2934
2935 return seq_list_next(v, &ftrace_pids, pos);
2936}
2937
2938static void fpid_stop(struct seq_file *m, void *p)
2939{
2940 mutex_unlock(&ftrace_lock);
2941}
2942
2943static int fpid_show(struct seq_file *m, void *v)
2944{
2945 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
2946
2947 if (v == (void *)1) {
2948 seq_printf(m, "no pid\n");
2949 return 0;
2950 }
2951
2952 if (fpid->pid == ftrace_swapper_pid)
2953 seq_printf(m, "swapper tasks\n");
2954 else
2955 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
2956
2957 return 0;
2958}
2959
2960static const struct seq_operations ftrace_pid_sops = {
2961 .start = fpid_start,
2962 .next = fpid_next,
2963 .stop = fpid_stop,
2964 .show = fpid_show,
2965};
2966
2967static int
2968ftrace_pid_open(struct inode *inode, struct file *file)
2969{
2970 int ret = 0;
2971
2972 if ((file->f_mode & FMODE_WRITE) &&
2973 (file->f_flags & O_TRUNC))
2974 ftrace_pid_reset();
2975
2976 if (file->f_mode & FMODE_READ)
2977 ret = seq_open(file, &ftrace_pid_sops);
2978
2979 return ret;
2980}
2981
2982static ssize_t
2983ftrace_pid_write(struct file *filp, const char __user *ubuf,
2984 size_t cnt, loff_t *ppos)
2985{
2986 char buf[64], *tmp;
2987 long val;
2988 int ret;
2989
2990 if (cnt >= sizeof(buf))
2991 return -EINVAL;
2992
2993 if (copy_from_user(&buf, ubuf, cnt))
2994 return -EFAULT;
2995
2996 buf[cnt] = 0;
2997
2998 /*
2999 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
3000 * to clean the filter quietly.
3001 */
3002 tmp = strstrip(buf);
3003 if (strlen(tmp) == 0)
3004 return 1;
3005
3006 ret = strict_strtol(tmp, 10, &val);
3007 if (ret < 0)
3008 return ret;
3009
3010 ret = ftrace_pid_add(val);
3011
3012 return ret ? ret : cnt;
3013}
3014
3015static int
3016ftrace_pid_release(struct inode *inode, struct file *file)
3017{
3018 if (file->f_mode & FMODE_READ)
3019 seq_release(inode, file);
3020
3021 return 0;
2929} 3022}
2930 3023
2931static const struct file_operations ftrace_pid_fops = { 3024static const struct file_operations ftrace_pid_fops = {
2932 .read = ftrace_pid_read, 3025 .open = ftrace_pid_open,
2933 .write = ftrace_pid_write, 3026 .write = ftrace_pid_write,
3027 .read = seq_read,
3028 .llseek = seq_lseek,
3029 .release = ftrace_pid_release,
2934}; 3030};
2935 3031
2936static __init int ftrace_init_debugfs(void) 3032static __init int ftrace_init_debugfs(void)
@@ -3258,6 +3354,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3258{ 3354{
3259 /* Make sure we do not use the parent ret_stack */ 3355 /* Make sure we do not use the parent ret_stack */
3260 t->ret_stack = NULL; 3356 t->ret_stack = NULL;
3357 t->curr_ret_stack = -1;
3261 3358
3262 if (ftrace_graph_active) { 3359 if (ftrace_graph_active) {
3263 struct ftrace_ret_stack *ret_stack; 3360 struct ftrace_ret_stack *ret_stack;
@@ -3267,7 +3364,6 @@ void ftrace_graph_init_task(struct task_struct *t)
3267 GFP_KERNEL); 3364 GFP_KERNEL);
3268 if (!ret_stack) 3365 if (!ret_stack)
3269 return; 3366 return;
3270 t->curr_ret_stack = -1;
3271 atomic_set(&t->tracing_graph_pause, 0); 3367 atomic_set(&t->tracing_graph_pause, 0);
3272 atomic_set(&t->trace_overrun, 0); 3368 atomic_set(&t->trace_overrun, 0);
3273 t->ftrace_timestamp = 0; 3369 t->ftrace_timestamp = 0;
@@ -3293,4 +3389,3 @@ void ftrace_graph_stop(void)
3293 ftrace_stop(); 3389 ftrace_stop();
3294} 3390}
3295#endif 3391#endif
3296
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index e06c6e3d56a3..a22582a06161 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -9,12 +9,9 @@
9#include <linux/workqueue.h> 9#include <linux/workqueue.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
13 12
14#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 14#include <trace/events/power.h>
16 15
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 16EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20 17
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5dd017fea6f5..41ca394feb22 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,12 +14,14 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/hash.h> 19#include <linux/hash.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/fs.h> 22#include <linux/fs.h>
22 23
24#include <asm/local.h>
23#include "trace.h" 25#include "trace.h"
24 26
25/* 27/*
@@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
207#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
208 210
211#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
212# define RB_FORCE_8BYTE_ALIGNMENT 0
213# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
214#else
215# define RB_FORCE_8BYTE_ALIGNMENT 1
216# define RB_ARCH_ALIGNMENT 8U
217#endif
218
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 219/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 220#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
211 221
@@ -397,18 +407,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
397 int ret; 407 int ret;
398 408
399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 409 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
400 "offset:0;\tsize:%u;\n", 410 "offset:0;\tsize:%u;\tsigned:%u;\n",
401 (unsigned int)sizeof(field.time_stamp)); 411 (unsigned int)sizeof(field.time_stamp),
412 (unsigned int)is_signed_type(u64));
402 413
403 ret = trace_seq_printf(s, "\tfield: local_t commit;\t" 414 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
404 "offset:%u;\tsize:%u;\n", 415 "offset:%u;\tsize:%u;\tsigned:%u;\n",
405 (unsigned int)offsetof(typeof(field), commit), 416 (unsigned int)offsetof(typeof(field), commit),
406 (unsigned int)sizeof(field.commit)); 417 (unsigned int)sizeof(field.commit),
418 (unsigned int)is_signed_type(long));
407 419
408 ret = trace_seq_printf(s, "\tfield: char data;\t" 420 ret = trace_seq_printf(s, "\tfield: char data;\t"
409 "offset:%u;\tsize:%u;\n", 421 "offset:%u;\tsize:%u;\tsigned:%u;\n",
410 (unsigned int)offsetof(typeof(field), data), 422 (unsigned int)offsetof(typeof(field), data),
411 (unsigned int)BUF_PAGE_SIZE); 423 (unsigned int)BUF_PAGE_SIZE,
424 (unsigned int)is_signed_type(char));
412 425
413 return ret; 426 return ret;
414} 427}
@@ -420,7 +433,7 @@ struct ring_buffer_per_cpu {
420 int cpu; 433 int cpu;
421 struct ring_buffer *buffer; 434 struct ring_buffer *buffer;
422 spinlock_t reader_lock; /* serialize readers */ 435 spinlock_t reader_lock; /* serialize readers */
423 raw_spinlock_t lock; 436 arch_spinlock_t lock;
424 struct lock_class_key lock_key; 437 struct lock_class_key lock_key;
425 struct list_head *pages; 438 struct list_head *pages;
426 struct buffer_page *head_page; /* read from head */ 439 struct buffer_page *head_page; /* read from head */
@@ -461,6 +474,8 @@ struct ring_buffer_iter {
461 struct ring_buffer_per_cpu *cpu_buffer; 474 struct ring_buffer_per_cpu *cpu_buffer;
462 unsigned long head; 475 unsigned long head;
463 struct buffer_page *head_page; 476 struct buffer_page *head_page;
477 struct buffer_page *cache_reader_page;
478 unsigned long cache_read;
464 u64 read_stamp; 479 u64 read_stamp;
465}; 480};
466 481
@@ -995,7 +1010,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
995 cpu_buffer->buffer = buffer; 1010 cpu_buffer->buffer = buffer;
996 spin_lock_init(&cpu_buffer->reader_lock); 1011 spin_lock_init(&cpu_buffer->reader_lock);
997 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1012 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
998 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1013 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
999 1014
1000 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1015 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1001 GFP_KERNEL, cpu_to_node(cpu)); 1016 GFP_KERNEL, cpu_to_node(cpu));
@@ -1190,30 +1205,25 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1190 struct list_head *p; 1205 struct list_head *p;
1191 unsigned i; 1206 unsigned i;
1192 1207
1193 atomic_inc(&cpu_buffer->record_disabled);
1194 synchronize_sched();
1195
1196 spin_lock_irq(&cpu_buffer->reader_lock); 1208 spin_lock_irq(&cpu_buffer->reader_lock);
1197 rb_head_page_deactivate(cpu_buffer); 1209 rb_head_page_deactivate(cpu_buffer);
1198 1210
1199 for (i = 0; i < nr_pages; i++) { 1211 for (i = 0; i < nr_pages; i++) {
1200 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1212 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1201 return; 1213 goto out;
1202 p = cpu_buffer->pages->next; 1214 p = cpu_buffer->pages->next;
1203 bpage = list_entry(p, struct buffer_page, list); 1215 bpage = list_entry(p, struct buffer_page, list);
1204 list_del_init(&bpage->list); 1216 list_del_init(&bpage->list);
1205 free_buffer_page(bpage); 1217 free_buffer_page(bpage);
1206 } 1218 }
1207 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1219 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1208 return; 1220 goto out;
1209 1221
1210 rb_reset_cpu(cpu_buffer); 1222 rb_reset_cpu(cpu_buffer);
1211 spin_unlock_irq(&cpu_buffer->reader_lock);
1212
1213 rb_check_pages(cpu_buffer); 1223 rb_check_pages(cpu_buffer);
1214 1224
1215 atomic_dec(&cpu_buffer->record_disabled); 1225out:
1216 1226 spin_unlock_irq(&cpu_buffer->reader_lock);
1217} 1227}
1218 1228
1219static void 1229static void
@@ -1224,26 +1234,22 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1224 struct list_head *p; 1234 struct list_head *p;
1225 unsigned i; 1235 unsigned i;
1226 1236
1227 atomic_inc(&cpu_buffer->record_disabled);
1228 synchronize_sched();
1229
1230 spin_lock_irq(&cpu_buffer->reader_lock); 1237 spin_lock_irq(&cpu_buffer->reader_lock);
1231 rb_head_page_deactivate(cpu_buffer); 1238 rb_head_page_deactivate(cpu_buffer);
1232 1239
1233 for (i = 0; i < nr_pages; i++) { 1240 for (i = 0; i < nr_pages; i++) {
1234 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1241 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1235 return; 1242 goto out;
1236 p = pages->next; 1243 p = pages->next;
1237 bpage = list_entry(p, struct buffer_page, list); 1244 bpage = list_entry(p, struct buffer_page, list);
1238 list_del_init(&bpage->list); 1245 list_del_init(&bpage->list);
1239 list_add_tail(&bpage->list, cpu_buffer->pages); 1246 list_add_tail(&bpage->list, cpu_buffer->pages);
1240 } 1247 }
1241 rb_reset_cpu(cpu_buffer); 1248 rb_reset_cpu(cpu_buffer);
1242 spin_unlock_irq(&cpu_buffer->reader_lock);
1243
1244 rb_check_pages(cpu_buffer); 1249 rb_check_pages(cpu_buffer);
1245 1250
1246 atomic_dec(&cpu_buffer->record_disabled); 1251out:
1252 spin_unlock_irq(&cpu_buffer->reader_lock);
1247} 1253}
1248 1254
1249/** 1255/**
@@ -1251,11 +1257,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1251 * @buffer: the buffer to resize. 1257 * @buffer: the buffer to resize.
1252 * @size: the new size. 1258 * @size: the new size.
1253 * 1259 *
1254 * The tracer is responsible for making sure that the buffer is
1255 * not being used while changing the size.
1256 * Note: We may be able to change the above requirement by using
1257 * RCU synchronizations.
1258 *
1259 * Minimum size is 2 * BUF_PAGE_SIZE. 1260 * Minimum size is 2 * BUF_PAGE_SIZE.
1260 * 1261 *
1261 * Returns -1 on failure. 1262 * Returns -1 on failure.
@@ -1287,6 +1288,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1287 if (size == buffer_size) 1288 if (size == buffer_size)
1288 return size; 1289 return size;
1289 1290
1291 atomic_inc(&buffer->record_disabled);
1292
1293 /* Make sure all writers are done with this buffer. */
1294 synchronize_sched();
1295
1290 mutex_lock(&buffer->mutex); 1296 mutex_lock(&buffer->mutex);
1291 get_online_cpus(); 1297 get_online_cpus();
1292 1298
@@ -1349,6 +1355,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1349 put_online_cpus(); 1355 put_online_cpus();
1350 mutex_unlock(&buffer->mutex); 1356 mutex_unlock(&buffer->mutex);
1351 1357
1358 atomic_dec(&buffer->record_disabled);
1359
1352 return size; 1360 return size;
1353 1361
1354 free_pages: 1362 free_pages:
@@ -1358,6 +1366,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1358 } 1366 }
1359 put_online_cpus(); 1367 put_online_cpus();
1360 mutex_unlock(&buffer->mutex); 1368 mutex_unlock(&buffer->mutex);
1369 atomic_dec(&buffer->record_disabled);
1361 return -ENOMEM; 1370 return -ENOMEM;
1362 1371
1363 /* 1372 /*
@@ -1367,6 +1376,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1367 out_fail: 1376 out_fail:
1368 put_online_cpus(); 1377 put_online_cpus();
1369 mutex_unlock(&buffer->mutex); 1378 mutex_unlock(&buffer->mutex);
1379 atomic_dec(&buffer->record_disabled);
1370 return -1; 1380 return -1;
1371} 1381}
1372EXPORT_SYMBOL_GPL(ring_buffer_resize); 1382EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -1548,7 +1558,7 @@ rb_update_event(struct ring_buffer_event *event,
1548 1558
1549 case 0: 1559 case 0:
1550 length -= RB_EVNT_HDR_SIZE; 1560 length -= RB_EVNT_HDR_SIZE;
1551 if (length > RB_MAX_SMALL_DATA) 1561 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1552 event->array[0] = length; 1562 event->array[0] = length;
1553 else 1563 else
1554 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1564 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1723,11 +1733,11 @@ static unsigned rb_calculate_event_length(unsigned length)
1723 if (!length) 1733 if (!length)
1724 length = 1; 1734 length = 1;
1725 1735
1726 if (length > RB_MAX_SMALL_DATA) 1736 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1727 length += sizeof(event.array[0]); 1737 length += sizeof(event.array[0]);
1728 1738
1729 length += RB_EVNT_HDR_SIZE; 1739 length += RB_EVNT_HDR_SIZE;
1730 length = ALIGN(length, RB_ALIGNMENT); 1740 length = ALIGN(length, RB_ARCH_ALIGNMENT);
1731 1741
1732 return length; 1742 return length;
1733} 1743}
@@ -1787,9 +1797,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1787static struct ring_buffer_event * 1797static struct ring_buffer_event *
1788rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1798rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1789 unsigned long length, unsigned long tail, 1799 unsigned long length, unsigned long tail,
1790 struct buffer_page *commit_page,
1791 struct buffer_page *tail_page, u64 *ts) 1800 struct buffer_page *tail_page, u64 *ts)
1792{ 1801{
1802 struct buffer_page *commit_page = cpu_buffer->commit_page;
1793 struct ring_buffer *buffer = cpu_buffer->buffer; 1803 struct ring_buffer *buffer = cpu_buffer->buffer;
1794 struct buffer_page *next_page; 1804 struct buffer_page *next_page;
1795 int ret; 1805 int ret;
@@ -1892,13 +1902,10 @@ static struct ring_buffer_event *
1892__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1902__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1893 unsigned type, unsigned long length, u64 *ts) 1903 unsigned type, unsigned long length, u64 *ts)
1894{ 1904{
1895 struct buffer_page *tail_page, *commit_page; 1905 struct buffer_page *tail_page;
1896 struct ring_buffer_event *event; 1906 struct ring_buffer_event *event;
1897 unsigned long tail, write; 1907 unsigned long tail, write;
1898 1908
1899 commit_page = cpu_buffer->commit_page;
1900 /* we just need to protect against interrupts */
1901 barrier();
1902 tail_page = cpu_buffer->tail_page; 1909 tail_page = cpu_buffer->tail_page;
1903 write = local_add_return(length, &tail_page->write); 1910 write = local_add_return(length, &tail_page->write);
1904 1911
@@ -1909,7 +1916,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1909 /* See if we shot pass the end of this buffer page */ 1916 /* See if we shot pass the end of this buffer page */
1910 if (write > BUF_PAGE_SIZE) 1917 if (write > BUF_PAGE_SIZE)
1911 return rb_move_tail(cpu_buffer, length, tail, 1918 return rb_move_tail(cpu_buffer, length, tail,
1912 commit_page, tail_page, ts); 1919 tail_page, ts);
1913 1920
1914 /* We reserved something on the buffer */ 1921 /* We reserved something on the buffer */
1915 1922
@@ -2237,12 +2244,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2237 if (ring_buffer_flags != RB_BUFFERS_ON) 2244 if (ring_buffer_flags != RB_BUFFERS_ON)
2238 return NULL; 2245 return NULL;
2239 2246
2240 if (atomic_read(&buffer->record_disabled))
2241 return NULL;
2242
2243 /* If we are tracing schedule, we don't want to recurse */ 2247 /* If we are tracing schedule, we don't want to recurse */
2244 resched = ftrace_preempt_disable(); 2248 resched = ftrace_preempt_disable();
2245 2249
2250 if (atomic_read(&buffer->record_disabled))
2251 goto out_nocheck;
2252
2246 if (trace_recursive_lock()) 2253 if (trace_recursive_lock())
2247 goto out_nocheck; 2254 goto out_nocheck;
2248 2255
@@ -2474,11 +2481,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
2474 if (ring_buffer_flags != RB_BUFFERS_ON) 2481 if (ring_buffer_flags != RB_BUFFERS_ON)
2475 return -EBUSY; 2482 return -EBUSY;
2476 2483
2477 if (atomic_read(&buffer->record_disabled))
2478 return -EBUSY;
2479
2480 resched = ftrace_preempt_disable(); 2484 resched = ftrace_preempt_disable();
2481 2485
2486 if (atomic_read(&buffer->record_disabled))
2487 goto out;
2488
2482 cpu = raw_smp_processor_id(); 2489 cpu = raw_smp_processor_id();
2483 2490
2484 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2491 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2546,7 +2553,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2546 * @buffer: The ring buffer to enable writes 2553 * @buffer: The ring buffer to enable writes
2547 * 2554 *
2548 * Note, multiple disables will need the same number of enables 2555 * Note, multiple disables will need the same number of enables
2549 * to truely enable the writing (much like preempt_disable). 2556 * to truly enable the writing (much like preempt_disable).
2550 */ 2557 */
2551void ring_buffer_record_enable(struct ring_buffer *buffer) 2558void ring_buffer_record_enable(struct ring_buffer *buffer)
2552{ 2559{
@@ -2582,7 +2589,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2582 * @cpu: The CPU to enable. 2589 * @cpu: The CPU to enable.
2583 * 2590 *
2584 * Note, multiple disables will need the same number of enables 2591 * Note, multiple disables will need the same number of enables
2585 * to truely enable the writing (much like preempt_disable). 2592 * to truly enable the writing (much like preempt_disable).
2586 */ 2593 */
2587void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2594void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2588{ 2595{
@@ -2723,6 +2730,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2723 iter->read_stamp = cpu_buffer->read_stamp; 2730 iter->read_stamp = cpu_buffer->read_stamp;
2724 else 2731 else
2725 iter->read_stamp = iter->head_page->page->time_stamp; 2732 iter->read_stamp = iter->head_page->page->time_stamp;
2733 iter->cache_reader_page = cpu_buffer->reader_page;
2734 iter->cache_read = cpu_buffer->read;
2726} 2735}
2727 2736
2728/** 2737/**
@@ -2834,7 +2843,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2834 int ret; 2843 int ret;
2835 2844
2836 local_irq_save(flags); 2845 local_irq_save(flags);
2837 __raw_spin_lock(&cpu_buffer->lock); 2846 arch_spin_lock(&cpu_buffer->lock);
2838 2847
2839 again: 2848 again:
2840 /* 2849 /*
@@ -2876,7 +2885,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2876 * Splice the empty reader page into the list around the head. 2885 * Splice the empty reader page into the list around the head.
2877 */ 2886 */
2878 reader = rb_set_head_page(cpu_buffer); 2887 reader = rb_set_head_page(cpu_buffer);
2879 cpu_buffer->reader_page->list.next = reader->list.next; 2888 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
2880 cpu_buffer->reader_page->list.prev = reader->list.prev; 2889 cpu_buffer->reader_page->list.prev = reader->list.prev;
2881 2890
2882 /* 2891 /*
@@ -2913,7 +2922,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2913 * 2922 *
2914 * Now make the new head point back to the reader page. 2923 * Now make the new head point back to the reader page.
2915 */ 2924 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list; 2925 rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2926 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2918 2927
2919 /* Finally update the reader page to the new head */ 2928 /* Finally update the reader page to the new head */
@@ -2923,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2923 goto again; 2932 goto again;
2924 2933
2925 out: 2934 out:
2926 __raw_spin_unlock(&cpu_buffer->lock); 2935 arch_spin_unlock(&cpu_buffer->lock);
2927 local_irq_restore(flags); 2936 local_irq_restore(flags);
2928 2937
2929 return reader; 2938 return reader;
@@ -3067,13 +3076,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3067 struct ring_buffer_event *event; 3076 struct ring_buffer_event *event;
3068 int nr_loops = 0; 3077 int nr_loops = 0;
3069 3078
3070 if (ring_buffer_iter_empty(iter))
3071 return NULL;
3072
3073 cpu_buffer = iter->cpu_buffer; 3079 cpu_buffer = iter->cpu_buffer;
3074 buffer = cpu_buffer->buffer; 3080 buffer = cpu_buffer->buffer;
3075 3081
3082 /*
3083 * Check if someone performed a consuming read to
3084 * the buffer. A consuming read invalidates the iterator
3085 * and we need to reset the iterator in this case.
3086 */
3087 if (unlikely(iter->cache_read != cpu_buffer->read ||
3088 iter->cache_reader_page != cpu_buffer->reader_page))
3089 rb_iter_reset(iter);
3090
3076 again: 3091 again:
3092 if (ring_buffer_iter_empty(iter))
3093 return NULL;
3094
3077 /* 3095 /*
3078 * We repeat when a timestamp is encountered. 3096 * We repeat when a timestamp is encountered.
3079 * We can get multiple timestamps by nested interrupts or also 3097 * We can get multiple timestamps by nested interrupts or also
@@ -3088,6 +3106,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3088 if (rb_per_cpu_empty(cpu_buffer)) 3106 if (rb_per_cpu_empty(cpu_buffer))
3089 return NULL; 3107 return NULL;
3090 3108
3109 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3110 rb_inc_iter(iter);
3111 goto again;
3112 }
3113
3091 event = rb_iter_head_event(iter); 3114 event = rb_iter_head_event(iter);
3092 3115
3093 switch (event->type_len) { 3116 switch (event->type_len) {
@@ -3286,9 +3309,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3286 synchronize_sched(); 3309 synchronize_sched();
3287 3310
3288 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3311 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3289 __raw_spin_lock(&cpu_buffer->lock); 3312 arch_spin_lock(&cpu_buffer->lock);
3290 rb_iter_reset(iter); 3313 rb_iter_reset(iter);
3291 __raw_spin_unlock(&cpu_buffer->lock); 3314 arch_spin_unlock(&cpu_buffer->lock);
3292 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3315 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3293 3316
3294 return iter; 3317 return iter;
@@ -3408,11 +3431,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3408 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3431 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3409 goto out; 3432 goto out;
3410 3433
3411 __raw_spin_lock(&cpu_buffer->lock); 3434 arch_spin_lock(&cpu_buffer->lock);
3412 3435
3413 rb_reset_cpu(cpu_buffer); 3436 rb_reset_cpu(cpu_buffer);
3414 3437
3415 __raw_spin_unlock(&cpu_buffer->lock); 3438 arch_spin_unlock(&cpu_buffer->lock);
3416 3439
3417 out: 3440 out:
3418 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3441 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c3..df74c7982255 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
@@ -35,6 +36,28 @@ static int disable_reader;
35module_param(disable_reader, uint, 0644); 36module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer"); 37MODULE_PARM_DESC(disable_reader, "only run producer");
37 38
39static int write_iteration = 50;
40module_param(write_iteration, uint, 0644);
41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
42
43static int producer_nice = 19;
44static int consumer_nice = 19;
45
46static int producer_fifo = -1;
47static int consumer_fifo = -1;
48
49module_param(producer_nice, uint, 0644);
50MODULE_PARM_DESC(producer_nice, "nice prio for producer");
51
52module_param(consumer_nice, uint, 0644);
53MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
54
55module_param(producer_fifo, uint, 0644);
56MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
57
58module_param(consumer_fifo, uint, 0644);
59MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
60
38static int read_events; 61static int read_events;
39 62
40static int kill_test; 63static int kill_test;
@@ -208,15 +231,18 @@ static void ring_buffer_producer(void)
208 do { 231 do {
209 struct ring_buffer_event *event; 232 struct ring_buffer_event *event;
210 int *entry; 233 int *entry;
211 234 int i;
212 event = ring_buffer_lock_reserve(buffer, 10); 235
213 if (!event) { 236 for (i = 0; i < write_iteration; i++) {
214 missed++; 237 event = ring_buffer_lock_reserve(buffer, 10);
215 } else { 238 if (!event) {
216 hit++; 239 missed++;
217 entry = ring_buffer_event_data(event); 240 } else {
218 *entry = smp_processor_id(); 241 hit++;
219 ring_buffer_unlock_commit(buffer, event); 242 entry = ring_buffer_event_data(event);
243 *entry = smp_processor_id();
244 ring_buffer_unlock_commit(buffer, event);
245 }
220 } 246 }
221 do_gettimeofday(&end_tv); 247 do_gettimeofday(&end_tv);
222 248
@@ -263,6 +289,27 @@ static void ring_buffer_producer(void)
263 289
264 if (kill_test) 290 if (kill_test)
265 trace_printk("ERROR!\n"); 291 trace_printk("ERROR!\n");
292
293 if (!disable_reader) {
294 if (consumer_fifo < 0)
295 trace_printk("Running Consumer at nice: %d\n",
296 consumer_nice);
297 else
298 trace_printk("Running Consumer at SCHED_FIFO %d\n",
299 consumer_fifo);
300 }
301 if (producer_fifo < 0)
302 trace_printk("Running Producer at nice: %d\n",
303 producer_nice);
304 else
305 trace_printk("Running Producer at SCHED_FIFO %d\n",
306 producer_fifo);
307
308 /* Let the user know that the test is running at low priority */
309 if (producer_fifo < 0 && consumer_fifo < 0 &&
310 producer_nice == 19 && consumer_nice == 19)
311 trace_printk("WARNING!!! This test is running at lowest priority.\n");
312
266 trace_printk("Time: %lld (usecs)\n", time); 313 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns); 314 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader) 315 if (disable_reader)
@@ -392,6 +439,27 @@ static int __init ring_buffer_benchmark_init(void)
392 if (IS_ERR(producer)) 439 if (IS_ERR(producer))
393 goto out_kill; 440 goto out_kill;
394 441
442 /*
443 * Run them as low-prio background tasks by default:
444 */
445 if (!disable_reader) {
446 if (consumer_fifo >= 0) {
447 struct sched_param param = {
448 .sched_priority = consumer_fifo
449 };
450 sched_setscheduler(consumer, SCHED_FIFO, &param);
451 } else
452 set_user_nice(consumer, consumer_nice);
453 }
454
455 if (producer_fifo >= 0) {
456 struct sched_param param = {
457 .sched_priority = producer_fifo /* producer's own priority, not the consumer's */
458 };
459 sched_setscheduler(producer, SCHED_FIFO, &param);
460 } else
461 set_user_nice(producer, producer_nice);
462
395 return 0; 463 return 0;
396 464
397 out_kill: 465 out_kill:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b20d3ec75de9..44f916a04065 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -12,7 +12,7 @@
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <linux/utsrelease.h> 15#include <generated/utsrelease.h>
16#include <linux/stacktrace.h> 16#include <linux/stacktrace.h>
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
@@ -32,10 +32,11 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
36#include <linux/slab.h>
35#include <linux/ctype.h> 37#include <linux/ctype.h>
36#include <linux/init.h> 38#include <linux/init.h>
37#include <linux/poll.h> 39#include <linux/poll.h>
38#include <linux/gfp.h>
39#include <linux/fs.h> 40#include <linux/fs.h>
40 41
41#include "trace.h" 42#include "trace.h"
@@ -86,25 +87,22 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
86 */ 87 */
87static int tracing_disabled = 1; 88static int tracing_disabled = 1;
88 89
89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 90DEFINE_PER_CPU(int, ftrace_cpu_disabled);
90 91
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 local_inc(&__get_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 local_dec(&__get_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -129,7 +127,7 @@ static int tracing_set_tracer(const char *buf);
129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 127static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 128static char *default_bootup_tracer;
131 129
132static int __init set_ftrace(char *str) 130static int __init set_cmdline_ftrace(char *str)
133{ 131{
134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 132 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 133 default_bootup_tracer = bootup_tracer_buf;
@@ -137,7 +135,7 @@ static int __init set_ftrace(char *str)
137 ring_buffer_expanded = 1; 135 ring_buffer_expanded = 1;
138 return 1; 136 return 1;
139} 137}
140__setup("ftrace=", set_ftrace); 138__setup("ftrace=", set_cmdline_ftrace);
141 139
142static int __init set_ftrace_dump_on_oops(char *str) 140static int __init set_ftrace_dump_on_oops(char *str)
143{ 141{
@@ -203,7 +201,7 @@ cycle_t ftrace_now(int cpu)
203 */ 201 */
204static struct trace_array max_tr; 202static struct trace_array max_tr;
205 203
206static DEFINE_PER_CPU(struct trace_array_cpu, max_data); 204static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
207 205
208/* tracer_enabled is used to toggle activation of a tracer */ 206/* tracer_enabled is used to toggle activation of a tracer */
209static int tracer_enabled = 1; 207static int tracer_enabled = 1;
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
243 241
244/* 242/*
245 * trace_types_lock is used to protect the trace_types list. 243 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 244 */
250static DEFINE_MUTEX(trace_types_lock); 245static DEFINE_MUTEX(trace_types_lock);
251 246
247/*
248 * serialize the access of the ring buffer
249 *
250 * The ring buffer serializes readers, but that is only low-level protection.
251 * The validity of the events (as returned by ring_buffer_peek() etc.)
252 * is not protected by the ring buffer.
253 *
254 * The content of events may become garbage if we allow another process to
255 * consume these events concurrently:
256 * A) The page of the consumed events may become a normal page
257 * (not reader page) in the ring buffer, and this page will be rewritten
258 * by the events producer.
259 * B) The page of the consumed events may become a page for splice_read,
260 * and this page will be returned to the system.
261 *
262 * These primitives allow multiple processes to access different cpu ring
263 * buffers concurrently.
264 *
265 * These primitives don't distinguish read-only and read-consume access.
266 * Multiple read-only accesses are also serialized.
267 */
268
269#ifdef CONFIG_SMP
270static DECLARE_RWSEM(all_cpu_access_lock);
271static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
272
273static inline void trace_access_lock(int cpu)
274{
275 if (cpu == TRACE_PIPE_ALL_CPU) {
276 /* gain it for accessing the whole ring buffer. */
277 down_write(&all_cpu_access_lock);
278 } else {
279 /* gain it for accessing a cpu ring buffer. */
280
281 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
282 down_read(&all_cpu_access_lock);
283
284 /* Secondly block other access to this @cpu ring buffer. */
285 mutex_lock(&per_cpu(cpu_access_lock, cpu));
286 }
287}
288
289static inline void trace_access_unlock(int cpu)
290{
291 if (cpu == TRACE_PIPE_ALL_CPU) {
292 up_write(&all_cpu_access_lock);
293 } else {
294 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
295 up_read(&all_cpu_access_lock);
296 }
297}
298
299static inline void trace_access_lock_init(void)
300{
301 int cpu;
302
303 for_each_possible_cpu(cpu)
304 mutex_init(&per_cpu(cpu_access_lock, cpu));
305}
306
307#else
308
309static DEFINE_MUTEX(access_lock);
310
311static inline void trace_access_lock(int cpu)
312{
313 (void)cpu;
314 mutex_lock(&access_lock);
315}
316
317static inline void trace_access_unlock(int cpu)
318{
319 (void)cpu;
320 mutex_unlock(&access_lock);
321}
322
323static inline void trace_access_lock_init(void)
324{
325}
326
327#endif
328
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 329/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 330static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 331
@@ -297,6 +374,21 @@ static int __init set_buf_size(char *str)
297} 374}
298__setup("trace_buf_size=", set_buf_size); 375__setup("trace_buf_size=", set_buf_size);
299 376
377static int __init set_tracing_thresh(char *str)
378{
379 unsigned long threshhold;
380 int ret;
381
382 if (!str)
383 return 0;
384 ret = strict_strtoul(str, 0, &threshhold);
385 if (ret < 0)
386 return 0;
387 tracing_thresh = threshhold * 1000;
388 return 1;
389}
390__setup("tracing_thresh=", set_tracing_thresh);
391
300unsigned long nsecs_to_usecs(unsigned long nsecs) 392unsigned long nsecs_to_usecs(unsigned long nsecs)
301{ 393{
302 return nsecs / 1000; 394 return nsecs / 1000;
@@ -313,7 +405,6 @@ static const char *trace_options[] = {
313 "bin", 405 "bin",
314 "block", 406 "block",
315 "stacktrace", 407 "stacktrace",
316 "sched-tree",
317 "trace_printk", 408 "trace_printk",
318 "ftrace_preempt", 409 "ftrace_preempt",
319 "branch", 410 "branch",
@@ -493,19 +584,20 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
493 * protected by per_cpu spinlocks. But the action of the swap 584 * protected by per_cpu spinlocks. But the action of the swap
494 * needs its own lock. 585 * needs its own lock.
495 * 586 *
496 * This is defined as a raw_spinlock_t in order to help 587 * This is defined as an arch_spinlock_t in order to help
497 * with performance when lockdep debugging is enabled. 588 * with performance when lockdep debugging is enabled.
498 * 589 *
499 * It is also used in other places outside the update_max_tr 590 * It is also used in other places outside the update_max_tr
500 * so it needs to be defined outside of the 591 * so it needs to be defined outside of the
501 * CONFIG_TRACER_MAX_TRACE. 592 * CONFIG_TRACER_MAX_TRACE.
502 */ 593 */
503static raw_spinlock_t ftrace_max_lock = 594static arch_spinlock_t ftrace_max_lock =
504 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 595 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
596
597unsigned long __read_mostly tracing_thresh;
505 598
506#ifdef CONFIG_TRACER_MAX_TRACE 599#ifdef CONFIG_TRACER_MAX_TRACE
507unsigned long __read_mostly tracing_max_latency; 600unsigned long __read_mostly tracing_max_latency;
508unsigned long __read_mostly tracing_thresh;
509 601
510/* 602/*
511 * Copy the new maximum trace into the separate maximum-trace 603 * Copy the new maximum trace into the separate maximum-trace
@@ -516,7 +608,7 @@ static void
516__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 608__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
517{ 609{
518 struct trace_array_cpu *data = tr->data[cpu]; 610 struct trace_array_cpu *data = tr->data[cpu];
519 struct trace_array_cpu *max_data = tr->data[cpu]; 611 struct trace_array_cpu *max_data;
520 612
521 max_tr.cpu = cpu; 613 max_tr.cpu = cpu;
522 max_tr.time_start = data->preempt_timestamp; 614 max_tr.time_start = data->preempt_timestamp;
@@ -526,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
526 max_data->critical_start = data->critical_start; 618 max_data->critical_start = data->critical_start;
527 max_data->critical_end = data->critical_end; 619 max_data->critical_end = data->critical_end;
528 620
529 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 621 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
530 max_data->pid = tsk->pid; 622 max_data->pid = tsk->pid;
531 max_data->uid = task_uid(tsk); 623 max_data->uid = task_uid(tsk);
532 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 624 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -555,13 +647,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
555 return; 647 return;
556 648
557 WARN_ON_ONCE(!irqs_disabled()); 649 WARN_ON_ONCE(!irqs_disabled());
558 __raw_spin_lock(&ftrace_max_lock); 650 arch_spin_lock(&ftrace_max_lock);
559 651
560 tr->buffer = max_tr.buffer; 652 tr->buffer = max_tr.buffer;
561 max_tr.buffer = buf; 653 max_tr.buffer = buf;
562 654
563 __update_max_tr(tr, tsk, cpu); 655 __update_max_tr(tr, tsk, cpu);
564 __raw_spin_unlock(&ftrace_max_lock); 656 arch_spin_unlock(&ftrace_max_lock);
565} 657}
566 658
567/** 659/**
@@ -581,7 +673,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
581 return; 673 return;
582 674
583 WARN_ON_ONCE(!irqs_disabled()); 675 WARN_ON_ONCE(!irqs_disabled());
584 __raw_spin_lock(&ftrace_max_lock); 676 arch_spin_lock(&ftrace_max_lock);
585 677
586 ftrace_disable_cpu(); 678 ftrace_disable_cpu();
587 679
@@ -603,7 +695,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 695 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
604 696
605 __update_max_tr(tr, tsk, cpu); 697 __update_max_tr(tr, tsk, cpu);
606 __raw_spin_unlock(&ftrace_max_lock); 698 arch_spin_unlock(&ftrace_max_lock);
607} 699}
608#endif /* CONFIG_TRACER_MAX_TRACE */ 700#endif /* CONFIG_TRACER_MAX_TRACE */
609 701
@@ -748,10 +840,10 @@ out:
748 mutex_unlock(&trace_types_lock); 840 mutex_unlock(&trace_types_lock);
749} 841}
750 842
751static void __tracing_reset(struct trace_array *tr, int cpu) 843static void __tracing_reset(struct ring_buffer *buffer, int cpu)
752{ 844{
753 ftrace_disable_cpu(); 845 ftrace_disable_cpu();
754 ring_buffer_reset_cpu(tr->buffer, cpu); 846 ring_buffer_reset_cpu(buffer, cpu);
755 ftrace_enable_cpu(); 847 ftrace_enable_cpu();
756} 848}
757 849
@@ -763,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
763 855
764 /* Make sure all commits have finished */ 856 /* Make sure all commits have finished */
765 synchronize_sched(); 857 synchronize_sched();
766 __tracing_reset(tr, cpu); 858 __tracing_reset(buffer, cpu);
767 859
768 ring_buffer_record_enable(buffer); 860 ring_buffer_record_enable(buffer);
769} 861}
@@ -781,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
781 tr->time_start = ftrace_now(tr->cpu); 873 tr->time_start = ftrace_now(tr->cpu);
782 874
783 for_each_online_cpu(cpu) 875 for_each_online_cpu(cpu)
784 __tracing_reset(tr, cpu); 876 __tracing_reset(buffer, cpu);
785 877
786 ring_buffer_record_enable(buffer); 878 ring_buffer_record_enable(buffer);
787} 879}
@@ -802,7 +894,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
802static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 894static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
803static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 895static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
804static int cmdline_idx; 896static int cmdline_idx;
805static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; 897static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
806 898
807/* temporary disable recording */ 899/* temporary disable recording */
808static atomic_t trace_record_cmdline_disabled __read_mostly; 900static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -858,6 +950,8 @@ void tracing_start(void)
858 goto out; 950 goto out;
859 } 951 }
860 952
953 /* Prevent the buffers from switching */
954 arch_spin_lock(&ftrace_max_lock);
861 955
862 buffer = global_trace.buffer; 956 buffer = global_trace.buffer;
863 if (buffer) 957 if (buffer)
@@ -867,6 +961,8 @@ void tracing_start(void)
867 if (buffer) 961 if (buffer)
868 ring_buffer_record_enable(buffer); 962 ring_buffer_record_enable(buffer);
869 963
964 arch_spin_unlock(&ftrace_max_lock);
965
870 ftrace_start(); 966 ftrace_start();
871 out: 967 out:
872 spin_unlock_irqrestore(&tracing_start_lock, flags); 968 spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -888,6 +984,9 @@ void tracing_stop(void)
888 if (trace_stop_count++) 984 if (trace_stop_count++)
889 goto out; 985 goto out;
890 986
987 /* Prevent the buffers from switching */
988 arch_spin_lock(&ftrace_max_lock);
989
891 buffer = global_trace.buffer; 990 buffer = global_trace.buffer;
892 if (buffer) 991 if (buffer)
893 ring_buffer_record_disable(buffer); 992 ring_buffer_record_disable(buffer);
@@ -896,6 +995,8 @@ void tracing_stop(void)
896 if (buffer) 995 if (buffer)
897 ring_buffer_record_disable(buffer); 996 ring_buffer_record_disable(buffer);
898 997
998 arch_spin_unlock(&ftrace_max_lock);
999
899 out: 1000 out:
900 spin_unlock_irqrestore(&tracing_start_lock, flags); 1001 spin_unlock_irqrestore(&tracing_start_lock, flags);
901} 1002}
@@ -915,7 +1016,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
915 * nor do we want to disable interrupts, 1016 * nor do we want to disable interrupts,
916 * so if we miss here, then better luck next time. 1017 * so if we miss here, then better luck next time.
917 */ 1018 */
918 if (!__raw_spin_trylock(&trace_cmdline_lock)) 1019 if (!arch_spin_trylock(&trace_cmdline_lock))
919 return; 1020 return;
920 1021
921 idx = map_pid_to_cmdline[tsk->pid]; 1022 idx = map_pid_to_cmdline[tsk->pid];
@@ -940,7 +1041,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
940 1041
941 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 1042 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
942 1043
943 __raw_spin_unlock(&trace_cmdline_lock); 1044 arch_spin_unlock(&trace_cmdline_lock);
944} 1045}
945 1046
946void trace_find_cmdline(int pid, char comm[]) 1047void trace_find_cmdline(int pid, char comm[])
@@ -952,20 +1053,25 @@ void trace_find_cmdline(int pid, char comm[])
952 return; 1053 return;
953 } 1054 }
954 1055
1056 if (WARN_ON_ONCE(pid < 0)) {
1057 strcpy(comm, "<XXX>");
1058 return;
1059 }
1060
955 if (pid > PID_MAX_DEFAULT) { 1061 if (pid > PID_MAX_DEFAULT) {
956 strcpy(comm, "<...>"); 1062 strcpy(comm, "<...>");
957 return; 1063 return;
958 } 1064 }
959 1065
960 preempt_disable(); 1066 preempt_disable();
961 __raw_spin_lock(&trace_cmdline_lock); 1067 arch_spin_lock(&trace_cmdline_lock);
962 map = map_pid_to_cmdline[pid]; 1068 map = map_pid_to_cmdline[pid];
963 if (map != NO_CMDLINE_MAP) 1069 if (map != NO_CMDLINE_MAP)
964 strcpy(comm, saved_cmdlines[map]); 1070 strcpy(comm, saved_cmdlines[map]);
965 else 1071 else
966 strcpy(comm, "<...>"); 1072 strcpy(comm, "<...>");
967 1073
968 __raw_spin_unlock(&trace_cmdline_lock); 1074 arch_spin_unlock(&trace_cmdline_lock);
969 preempt_enable(); 1075 preempt_enable();
970} 1076}
971 1077
@@ -1085,7 +1191,7 @@ trace_function(struct trace_array *tr,
1085 struct ftrace_entry *entry; 1191 struct ftrace_entry *entry;
1086 1192
1087 /* If we are reading the ring buffer, don't trace */ 1193 /* If we are reading the ring buffer, don't trace */
1088 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1194 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1089 return; 1195 return;
1090 1196
1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1197 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1151,6 +1257,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1151 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1257 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1152} 1258}
1153 1259
1260/**
1261 * trace_dump_stack - record a stack back trace in the trace buffer
1262 */
1263void trace_dump_stack(void)
1264{
1265 unsigned long flags;
1266
1267 if (tracing_disabled || tracing_selftest_running)
1268 return;
1269
1270 local_save_flags(flags);
1271
1272 /* skipping 3 traces, seems to get us at the caller of this function */
1273 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1274}
1275
1154void 1276void
1155ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1277ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1156{ 1278{
@@ -1162,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1162 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1284 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1163 return; 1285 return;
1164 1286
1287 /*
1288 * NMIs can not handle page faults, even with fix ups.
1289 * The save user stack can (and often does) fault.
1290 */
1291 if (unlikely(in_nmi()))
1292 return;
1293
1165 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1294 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1166 sizeof(*entry), flags, pc); 1295 sizeof(*entry), flags, pc);
1167 if (!event) 1296 if (!event)
@@ -1251,8 +1380,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1251 */ 1380 */
1252int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1381int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1253{ 1382{
1254 static raw_spinlock_t trace_buf_lock = 1383 static arch_spinlock_t trace_buf_lock =
1255 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1384 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1256 static u32 trace_buf[TRACE_BUF_SIZE]; 1385 static u32 trace_buf[TRACE_BUF_SIZE];
1257 1386
1258 struct ftrace_event_call *call = &event_bprint; 1387 struct ftrace_event_call *call = &event_bprint;
@@ -1283,7 +1412,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1283 1412
1284 /* Lockdep uses trace_printk for lock tracing */ 1413 /* Lockdep uses trace_printk for lock tracing */
1285 local_irq_save(flags); 1414 local_irq_save(flags);
1286 __raw_spin_lock(&trace_buf_lock); 1415 arch_spin_lock(&trace_buf_lock);
1287 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1416 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1288 1417
1289 if (len > TRACE_BUF_SIZE || len < 0) 1418 if (len > TRACE_BUF_SIZE || len < 0)
@@ -1300,11 +1429,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1300 entry->fmt = fmt; 1429 entry->fmt = fmt;
1301 1430
1302 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1431 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1303 if (!filter_check_discard(call, entry, buffer, event)) 1432 if (!filter_check_discard(call, entry, buffer, event)) {
1304 ring_buffer_unlock_commit(buffer, event); 1433 ring_buffer_unlock_commit(buffer, event);
1434 ftrace_trace_stack(buffer, flags, 6, pc);
1435 }
1305 1436
1306out_unlock: 1437out_unlock:
1307 __raw_spin_unlock(&trace_buf_lock); 1438 arch_spin_unlock(&trace_buf_lock);
1308 local_irq_restore(flags); 1439 local_irq_restore(flags);
1309 1440
1310out: 1441out:
@@ -1334,7 +1465,7 @@ int trace_array_printk(struct trace_array *tr,
1334int trace_array_vprintk(struct trace_array *tr, 1465int trace_array_vprintk(struct trace_array *tr,
1335 unsigned long ip, const char *fmt, va_list args) 1466 unsigned long ip, const char *fmt, va_list args)
1336{ 1467{
1337 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1468 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1338 static char trace_buf[TRACE_BUF_SIZE]; 1469 static char trace_buf[TRACE_BUF_SIZE];
1339 1470
1340 struct ftrace_event_call *call = &event_print; 1471 struct ftrace_event_call *call = &event_print;
@@ -1360,12 +1491,9 @@ int trace_array_vprintk(struct trace_array *tr,
1360 1491
1361 pause_graph_tracing(); 1492 pause_graph_tracing();
1362 raw_local_irq_save(irq_flags); 1493 raw_local_irq_save(irq_flags);
1363 __raw_spin_lock(&trace_buf_lock); 1494 arch_spin_lock(&trace_buf_lock);
1364 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1495 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1365 1496
1366 len = min(len, TRACE_BUF_SIZE-1);
1367 trace_buf[len] = 0;
1368
1369 size = sizeof(*entry) + len + 1; 1497 size = sizeof(*entry) + len + 1;
1370 buffer = tr->buffer; 1498 buffer = tr->buffer;
1371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1499 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
@@ -1373,15 +1501,17 @@ int trace_array_vprintk(struct trace_array *tr,
1373 if (!event) 1501 if (!event)
1374 goto out_unlock; 1502 goto out_unlock;
1375 entry = ring_buffer_event_data(event); 1503 entry = ring_buffer_event_data(event);
1376 entry->ip = ip; 1504 entry->ip = ip;
1377 1505
1378 memcpy(&entry->buf, trace_buf, len); 1506 memcpy(&entry->buf, trace_buf, len);
1379 entry->buf[len] = 0; 1507 entry->buf[len] = '\0';
1380 if (!filter_check_discard(call, entry, buffer, event)) 1508 if (!filter_check_discard(call, entry, buffer, event)) {
1381 ring_buffer_unlock_commit(buffer, event); 1509 ring_buffer_unlock_commit(buffer, event);
1510 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1511 }
1382 1512
1383 out_unlock: 1513 out_unlock:
1384 __raw_spin_unlock(&trace_buf_lock); 1514 arch_spin_unlock(&trace_buf_lock);
1385 raw_local_irq_restore(irq_flags); 1515 raw_local_irq_restore(irq_flags);
1386 unpause_graph_tracing(); 1516 unpause_graph_tracing();
1387 out: 1517 out:
@@ -1515,6 +1645,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1515 int i = (int)*pos; 1645 int i = (int)*pos;
1516 void *ent; 1646 void *ent;
1517 1647
1648 WARN_ON_ONCE(iter->leftover);
1649
1518 (*pos)++; 1650 (*pos)++;
1519 1651
1520 /* can't go backwards */ 1652 /* can't go backwards */
@@ -1566,12 +1698,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1566} 1698}
1567 1699
1568/* 1700/*
1569 * No necessary locking here. The worst thing which can
1570 * happen is loosing events consumed at the same time
1571 * by a trace_pipe reader.
1572 * Other than that, we don't risk to crash the ring buffer
1573 * because it serializes the readers.
1574 *
1575 * The current tracer is copied to avoid a global locking 1701 * The current tracer is copied to avoid a global locking
1576 * all around. 1702 * all around.
1577 */ 1703 */
@@ -1609,21 +1735,34 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1609 1735
1610 ftrace_enable_cpu(); 1736 ftrace_enable_cpu();
1611 1737
1738 iter->leftover = 0;
1612 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1739 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1613 ; 1740 ;
1614 1741
1615 } else { 1742 } else {
1616 l = *pos - 1; 1743 /*
1617 p = s_next(m, p, &l); 1744 * If we overflowed the seq_file before, then we want
1745 * to just reuse the trace_seq buffer again.
1746 */
1747 if (iter->leftover)
1748 p = iter;
1749 else {
1750 l = *pos - 1;
1751 p = s_next(m, p, &l);
1752 }
1618 } 1753 }
1619 1754
1620 trace_event_read_lock(); 1755 trace_event_read_lock();
1756 trace_access_lock(cpu_file);
1621 return p; 1757 return p;
1622} 1758}
1623 1759
1624static void s_stop(struct seq_file *m, void *p) 1760static void s_stop(struct seq_file *m, void *p)
1625{ 1761{
1762 struct trace_iterator *iter = m->private;
1763
1626 atomic_dec(&trace_record_cmdline_disabled); 1764 atomic_dec(&trace_record_cmdline_disabled);
1765 trace_access_unlock(iter->cpu_file);
1627 trace_event_read_unlock(); 1766 trace_event_read_unlock();
1628} 1767}
1629 1768
@@ -1922,6 +2061,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1922static int s_show(struct seq_file *m, void *v) 2061static int s_show(struct seq_file *m, void *v)
1923{ 2062{
1924 struct trace_iterator *iter = v; 2063 struct trace_iterator *iter = v;
2064 int ret;
1925 2065
1926 if (iter->ent == NULL) { 2066 if (iter->ent == NULL) {
1927 if (iter->tr) { 2067 if (iter->tr) {
@@ -1941,9 +2081,27 @@ static int s_show(struct seq_file *m, void *v)
1941 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2081 if (!(trace_flags & TRACE_ITER_VERBOSE))
1942 print_func_help_header(m); 2082 print_func_help_header(m);
1943 } 2083 }
2084 } else if (iter->leftover) {
2085 /*
2086 * If we filled the seq_file buffer earlier, we
2087 * want to just show it now.
2088 */
2089 ret = trace_print_seq(m, &iter->seq);
2090
2091 /* ret should this time be zero, but you never know */
2092 iter->leftover = ret;
2093
1944 } else { 2094 } else {
1945 print_trace_line(iter); 2095 print_trace_line(iter);
1946 trace_print_seq(m, &iter->seq); 2096 ret = trace_print_seq(m, &iter->seq);
2097 /*
2098 * If we overflow the seq_file buffer, then it will
2099 * ask us for this data again at start up.
2100 * Use that instead.
2101 * ret is 0 if seq_file write succeeded.
2102 * -1 otherwise.
2103 */
2104 iter->leftover = ret;
1947 } 2105 }
1948 2106
1949 return 0; 2107 return 0;
@@ -2253,7 +2411,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2253 mutex_lock(&tracing_cpumask_update_lock); 2411 mutex_lock(&tracing_cpumask_update_lock);
2254 2412
2255 local_irq_disable(); 2413 local_irq_disable();
2256 __raw_spin_lock(&ftrace_max_lock); 2414 arch_spin_lock(&ftrace_max_lock);
2257 for_each_tracing_cpu(cpu) { 2415 for_each_tracing_cpu(cpu) {
2258 /* 2416 /*
2259 * Increase/decrease the disabled counter if we are 2417 * Increase/decrease the disabled counter if we are
@@ -2268,7 +2426,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2268 atomic_dec(&global_trace.data[cpu]->disabled); 2426 atomic_dec(&global_trace.data[cpu]->disabled);
2269 } 2427 }
2270 } 2428 }
2271 __raw_spin_unlock(&ftrace_max_lock); 2429 arch_spin_unlock(&ftrace_max_lock);
2272 local_irq_enable(); 2430 local_irq_enable();
2273 2431
2274 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 2432 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@ -2290,67 +2448,49 @@ static const struct file_operations tracing_cpumask_fops = {
2290 .write = tracing_cpumask_write, 2448 .write = tracing_cpumask_write,
2291}; 2449};
2292 2450
2293static ssize_t 2451static int tracing_trace_options_show(struct seq_file *m, void *v)
2294tracing_trace_options_read(struct file *filp, char __user *ubuf,
2295 size_t cnt, loff_t *ppos)
2296{ 2452{
2297 struct tracer_opt *trace_opts; 2453 struct tracer_opt *trace_opts;
2298 u32 tracer_flags; 2454 u32 tracer_flags;
2299 int len = 0;
2300 char *buf;
2301 int r = 0;
2302 int i; 2455 int i;
2303 2456
2304
2305 /* calculate max size */
2306 for (i = 0; trace_options[i]; i++) {
2307 len += strlen(trace_options[i]);
2308 len += 3; /* "no" and newline */
2309 }
2310
2311 mutex_lock(&trace_types_lock); 2457 mutex_lock(&trace_types_lock);
2312 tracer_flags = current_trace->flags->val; 2458 tracer_flags = current_trace->flags->val;
2313 trace_opts = current_trace->flags->opts; 2459 trace_opts = current_trace->flags->opts;
2314 2460
2315 /*
2316 * Increase the size with names of options specific
2317 * of the current tracer.
2318 */
2319 for (i = 0; trace_opts[i].name; i++) {
2320 len += strlen(trace_opts[i].name);
2321 len += 3; /* "no" and newline */
2322 }
2323
2324 /* +1 for \0 */
2325 buf = kmalloc(len + 1, GFP_KERNEL);
2326 if (!buf) {
2327 mutex_unlock(&trace_types_lock);
2328 return -ENOMEM;
2329 }
2330
2331 for (i = 0; trace_options[i]; i++) { 2461 for (i = 0; trace_options[i]; i++) {
2332 if (trace_flags & (1 << i)) 2462 if (trace_flags & (1 << i))
2333 r += sprintf(buf + r, "%s\n", trace_options[i]); 2463 seq_printf(m, "%s\n", trace_options[i]);
2334 else 2464 else
2335 r += sprintf(buf + r, "no%s\n", trace_options[i]); 2465 seq_printf(m, "no%s\n", trace_options[i]);
2336 } 2466 }
2337 2467
2338 for (i = 0; trace_opts[i].name; i++) { 2468 for (i = 0; trace_opts[i].name; i++) {
2339 if (tracer_flags & trace_opts[i].bit) 2469 if (tracer_flags & trace_opts[i].bit)
2340 r += sprintf(buf + r, "%s\n", 2470 seq_printf(m, "%s\n", trace_opts[i].name);
2341 trace_opts[i].name);
2342 else 2471 else
2343 r += sprintf(buf + r, "no%s\n", 2472 seq_printf(m, "no%s\n", trace_opts[i].name);
2344 trace_opts[i].name);
2345 } 2473 }
2346 mutex_unlock(&trace_types_lock); 2474 mutex_unlock(&trace_types_lock);
2347 2475
2348 WARN_ON(r >= len + 1); 2476 return 0;
2477}
2349 2478
2350 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2479static int __set_tracer_option(struct tracer *trace,
2480 struct tracer_flags *tracer_flags,
2481 struct tracer_opt *opts, int neg)
2482{
2483 int ret;
2351 2484
2352 kfree(buf); 2485 ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
2353 return r; 2486 if (ret)
2487 return ret;
2488
2489 if (neg)
2490 tracer_flags->val &= ~opts->bit;
2491 else
2492 tracer_flags->val |= opts->bit;
2493 return 0;
2354} 2494}
2355 2495
2356/* Try to assign a tracer specific option */ 2496/* Try to assign a tracer specific option */
@@ -2358,33 +2498,17 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2358{ 2498{
2359 struct tracer_flags *tracer_flags = trace->flags; 2499 struct tracer_flags *tracer_flags = trace->flags;
2360 struct tracer_opt *opts = NULL; 2500 struct tracer_opt *opts = NULL;
2361 int ret = 0, i = 0; 2501 int i;
2362 int len;
2363 2502
2364 for (i = 0; tracer_flags->opts[i].name; i++) { 2503 for (i = 0; tracer_flags->opts[i].name; i++) {
2365 opts = &tracer_flags->opts[i]; 2504 opts = &tracer_flags->opts[i];
2366 len = strlen(opts->name);
2367 2505
2368 if (strncmp(cmp, opts->name, len) == 0) { 2506 if (strcmp(cmp, opts->name) == 0)
2369 ret = trace->set_flag(tracer_flags->val, 2507 return __set_tracer_option(trace, trace->flags,
2370 opts->bit, !neg); 2508 opts, neg);
2371 break;
2372 }
2373 } 2509 }
2374 /* Not found */
2375 if (!tracer_flags->opts[i].name)
2376 return -EINVAL;
2377
2378 /* Refused to handle */
2379 if (ret)
2380 return ret;
2381
2382 if (neg)
2383 tracer_flags->val &= ~opts->bit;
2384 else
2385 tracer_flags->val |= opts->bit;
2386 2510
2387 return 0; 2511 return -EINVAL;
2388} 2512}
2389 2513
2390static void set_tracer_flags(unsigned int mask, int enabled) 2514static void set_tracer_flags(unsigned int mask, int enabled)
@@ -2404,7 +2528,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2404 size_t cnt, loff_t *ppos) 2528 size_t cnt, loff_t *ppos)
2405{ 2529{
2406 char buf[64]; 2530 char buf[64];
2407 char *cmp = buf; 2531 char *cmp;
2408 int neg = 0; 2532 int neg = 0;
2409 int ret; 2533 int ret;
2410 int i; 2534 int i;
@@ -2416,16 +2540,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2416 return -EFAULT; 2540 return -EFAULT;
2417 2541
2418 buf[cnt] = 0; 2542 buf[cnt] = 0;
2543 cmp = strstrip(buf);
2419 2544
2420 if (strncmp(buf, "no", 2) == 0) { 2545 if (strncmp(cmp, "no", 2) == 0) {
2421 neg = 1; 2546 neg = 1;
2422 cmp += 2; 2547 cmp += 2;
2423 } 2548 }
2424 2549
2425 for (i = 0; trace_options[i]; i++) { 2550 for (i = 0; trace_options[i]; i++) {
2426 int len = strlen(trace_options[i]); 2551 if (strcmp(cmp, trace_options[i]) == 0) {
2427
2428 if (strncmp(cmp, trace_options[i], len) == 0) {
2429 set_tracer_flags(1 << i, !neg); 2552 set_tracer_flags(1 << i, !neg);
2430 break; 2553 break;
2431 } 2554 }
@@ -2445,9 +2568,18 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2445 return cnt; 2568 return cnt;
2446} 2569}
2447 2570
2571static int tracing_trace_options_open(struct inode *inode, struct file *file)
2572{
2573 if (tracing_disabled)
2574 return -ENODEV;
2575 return single_open(file, tracing_trace_options_show, NULL);
2576}
2577
2448static const struct file_operations tracing_iter_fops = { 2578static const struct file_operations tracing_iter_fops = {
2449 .open = tracing_open_generic, 2579 .open = tracing_trace_options_open,
2450 .read = tracing_trace_options_read, 2580 .read = seq_read,
2581 .llseek = seq_lseek,
2582 .release = single_release,
2451 .write = tracing_trace_options_write, 2583 .write = tracing_trace_options_write,
2452}; 2584};
2453 2585
@@ -2821,22 +2953,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2821 2953
2822 mutex_lock(&trace_types_lock); 2954 mutex_lock(&trace_types_lock);
2823 2955
2824 /* We only allow one reader per cpu */
2825 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2826 if (!cpumask_empty(tracing_reader_cpumask)) {
2827 ret = -EBUSY;
2828 goto out;
2829 }
2830 cpumask_setall(tracing_reader_cpumask);
2831 } else {
2832 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2833 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2834 else {
2835 ret = -EBUSY;
2836 goto out;
2837 }
2838 }
2839
2840 /* create a buffer to store the information to pass to userspace */ 2956 /* create a buffer to store the information to pass to userspace */
2841 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2957 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2842 if (!iter) { 2958 if (!iter) {
@@ -2892,10 +3008,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2892 3008
2893 mutex_lock(&trace_types_lock); 3009 mutex_lock(&trace_types_lock);
2894 3010
2895 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) 3011 if (iter->trace->pipe_close)
2896 cpumask_clear(tracing_reader_cpumask); 3012 iter->trace->pipe_close(iter);
2897 else
2898 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2899 3013
2900 mutex_unlock(&trace_types_lock); 3014 mutex_unlock(&trace_types_lock);
2901 3015
@@ -3055,6 +3169,7 @@ waitagain:
3055 iter->pos = -1; 3169 iter->pos = -1;
3056 3170
3057 trace_event_read_lock(); 3171 trace_event_read_lock();
3172 trace_access_lock(iter->cpu_file);
3058 while (find_next_entry_inc(iter) != NULL) { 3173 while (find_next_entry_inc(iter) != NULL) {
3059 enum print_line_t ret; 3174 enum print_line_t ret;
3060 int len = iter->seq.len; 3175 int len = iter->seq.len;
@@ -3071,6 +3186,7 @@ waitagain:
3071 if (iter->seq.len >= cnt) 3186 if (iter->seq.len >= cnt)
3072 break; 3187 break;
3073 } 3188 }
3189 trace_access_unlock(iter->cpu_file);
3074 trace_event_read_unlock(); 3190 trace_event_read_unlock();
3075 3191
3076 /* Now copy what we have to the user */ 3192 /* Now copy what we have to the user */
@@ -3103,7 +3219,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
3103 __free_page(spd->pages[idx]); 3219 __free_page(spd->pages[idx]);
3104} 3220}
3105 3221
3106static struct pipe_buf_operations tracing_pipe_buf_ops = { 3222static const struct pipe_buf_operations tracing_pipe_buf_ops = {
3107 .can_merge = 0, 3223 .can_merge = 0,
3108 .map = generic_pipe_buf_map, 3224 .map = generic_pipe_buf_map,
3109 .unmap = generic_pipe_buf_unmap, 3225 .unmap = generic_pipe_buf_unmap,
@@ -3196,6 +3312,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3196 } 3312 }
3197 3313
3198 trace_event_read_lock(); 3314 trace_event_read_lock();
3315 trace_access_lock(iter->cpu_file);
3199 3316
3200 /* Fill as many pages as possible. */ 3317 /* Fill as many pages as possible. */
3201 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3318 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3219,6 +3336,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3219 trace_seq_init(&iter->seq); 3336 trace_seq_init(&iter->seq);
3220 } 3337 }
3221 3338
3339 trace_access_unlock(iter->cpu_file);
3222 trace_event_read_unlock(); 3340 trace_event_read_unlock();
3223 mutex_unlock(&iter->mutex); 3341 mutex_unlock(&iter->mutex);
3224 3342
@@ -3334,7 +3452,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3334 size_t cnt, loff_t *fpos) 3452 size_t cnt, loff_t *fpos)
3335{ 3453{
3336 char *buf; 3454 char *buf;
3337 char *end;
3338 3455
3339 if (tracing_disabled) 3456 if (tracing_disabled)
3340 return -EINVAL; 3457 return -EINVAL;
@@ -3342,7 +3459,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3342 if (cnt > TRACE_BUF_SIZE) 3459 if (cnt > TRACE_BUF_SIZE)
3343 cnt = TRACE_BUF_SIZE; 3460 cnt = TRACE_BUF_SIZE;
3344 3461
3345 buf = kmalloc(cnt + 1, GFP_KERNEL); 3462 buf = kmalloc(cnt + 2, GFP_KERNEL);
3346 if (buf == NULL) 3463 if (buf == NULL)
3347 return -ENOMEM; 3464 return -ENOMEM;
3348 3465
@@ -3350,35 +3467,31 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3350 kfree(buf); 3467 kfree(buf);
3351 return -EFAULT; 3468 return -EFAULT;
3352 } 3469 }
3470 if (buf[cnt-1] != '\n') {
3471 buf[cnt] = '\n';
3472 buf[cnt+1] = '\0';
3473 } else
3474 buf[cnt] = '\0';
3353 3475
3354 /* Cut from the first nil or newline. */ 3476 cnt = mark_printk("%s", buf);
3355 buf[cnt] = '\0';
3356 end = strchr(buf, '\n');
3357 if (end)
3358 *end = '\0';
3359
3360 cnt = mark_printk("%s\n", buf);
3361 kfree(buf); 3477 kfree(buf);
3362 *fpos += cnt; 3478 *fpos += cnt;
3363 3479
3364 return cnt; 3480 return cnt;
3365} 3481}
3366 3482
3367static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, 3483static int tracing_clock_show(struct seq_file *m, void *v)
3368 size_t cnt, loff_t *ppos)
3369{ 3484{
3370 char buf[64];
3371 int bufiter = 0;
3372 int i; 3485 int i;
3373 3486
3374 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 3487 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3375 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, 3488 seq_printf(m,
3376 "%s%s%s%s", i ? " " : "", 3489 "%s%s%s%s", i ? " " : "",
3377 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 3490 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3378 i == trace_clock_id ? "]" : ""); 3491 i == trace_clock_id ? "]" : "");
3379 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); 3492 seq_putc(m, '\n');
3380 3493
3381 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); 3494 return 0;
3382} 3495}
3383 3496
3384static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 3497static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
@@ -3420,6 +3533,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3420 return cnt; 3533 return cnt;
3421} 3534}
3422 3535
3536static int tracing_clock_open(struct inode *inode, struct file *file)
3537{
3538 if (tracing_disabled)
3539 return -ENODEV;
3540 return single_open(file, tracing_clock_show, NULL);
3541}
3542
3423static const struct file_operations tracing_max_lat_fops = { 3543static const struct file_operations tracing_max_lat_fops = {
3424 .open = tracing_open_generic, 3544 .open = tracing_open_generic,
3425 .read = tracing_max_lat_read, 3545 .read = tracing_max_lat_read,
@@ -3458,8 +3578,10 @@ static const struct file_operations tracing_mark_fops = {
3458}; 3578};
3459 3579
3460static const struct file_operations trace_clock_fops = { 3580static const struct file_operations trace_clock_fops = {
3461 .open = tracing_open_generic, 3581 .open = tracing_clock_open,
3462 .read = tracing_clock_read, 3582 .read = seq_read,
3583 .llseek = seq_lseek,
3584 .release = single_release,
3463 .write = tracing_clock_write, 3585 .write = tracing_clock_write,
3464}; 3586};
3465 3587
@@ -3516,10 +3638,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3516 3638
3517 info->read = 0; 3639 info->read = 0;
3518 3640
3641 trace_access_lock(info->cpu);
3519 ret = ring_buffer_read_page(info->tr->buffer, 3642 ret = ring_buffer_read_page(info->tr->buffer,
3520 &info->spare, 3643 &info->spare,
3521 count, 3644 count,
3522 info->cpu, 0); 3645 info->cpu, 0);
3646 trace_access_unlock(info->cpu);
3523 if (ret < 0) 3647 if (ret < 0)
3524 return 0; 3648 return 0;
3525 3649
@@ -3589,7 +3713,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3589} 3713}
3590 3714
3591/* Pipe buffer operations for a buffer. */ 3715/* Pipe buffer operations for a buffer. */
3592static struct pipe_buf_operations buffer_pipe_buf_ops = { 3716static const struct pipe_buf_operations buffer_pipe_buf_ops = {
3593 .can_merge = 0, 3717 .can_merge = 0,
3594 .map = generic_pipe_buf_map, 3718 .map = generic_pipe_buf_map,
3595 .unmap = generic_pipe_buf_unmap, 3719 .unmap = generic_pipe_buf_unmap,
@@ -3647,6 +3771,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3647 len &= PAGE_MASK; 3771 len &= PAGE_MASK;
3648 } 3772 }
3649 3773
3774 trace_access_lock(info->cpu);
3650 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3775 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3651 3776
3652 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3777 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3694,6 +3819,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3694 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3819 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3695 } 3820 }
3696 3821
3822 trace_access_unlock(info->cpu);
3697 spd.nr_pages = i; 3823 spd.nr_pages = i;
3698 3824
3699 /* did we read anything? */ 3825 /* did we read anything? */
@@ -3730,7 +3856,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3730 3856
3731 s = kmalloc(sizeof(*s), GFP_KERNEL); 3857 s = kmalloc(sizeof(*s), GFP_KERNEL);
3732 if (!s) 3858 if (!s)
3733 return ENOMEM; 3859 return -ENOMEM;
3734 3860
3735 trace_seq_init(s); 3861 trace_seq_init(s);
3736 3862
@@ -3920,39 +4046,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3920 if (ret < 0) 4046 if (ret < 0)
3921 return ret; 4047 return ret;
3922 4048
3923 ret = 0; 4049 if (val != 0 && val != 1)
3924 switch (val) { 4050 return -EINVAL;
3925 case 0:
3926 /* do nothing if already cleared */
3927 if (!(topt->flags->val & topt->opt->bit))
3928 break;
3929
3930 mutex_lock(&trace_types_lock);
3931 if (current_trace->set_flag)
3932 ret = current_trace->set_flag(topt->flags->val,
3933 topt->opt->bit, 0);
3934 mutex_unlock(&trace_types_lock);
3935 if (ret)
3936 return ret;
3937 topt->flags->val &= ~topt->opt->bit;
3938 break;
3939 case 1:
3940 /* do nothing if already set */
3941 if (topt->flags->val & topt->opt->bit)
3942 break;
3943 4051
4052 if (!!(topt->flags->val & topt->opt->bit) != val) {
3944 mutex_lock(&trace_types_lock); 4053 mutex_lock(&trace_types_lock);
3945 if (current_trace->set_flag) 4054 ret = __set_tracer_option(current_trace, topt->flags,
3946 ret = current_trace->set_flag(topt->flags->val, 4055 topt->opt, !val);
3947 topt->opt->bit, 1);
3948 mutex_unlock(&trace_types_lock); 4056 mutex_unlock(&trace_types_lock);
3949 if (ret) 4057 if (ret)
3950 return ret; 4058 return ret;
3951 topt->flags->val |= topt->opt->bit;
3952 break;
3953
3954 default:
3955 return -EINVAL;
3956 } 4059 }
3957 4060
3958 *ppos += cnt; 4061 *ppos += cnt;
@@ -4153,6 +4256,8 @@ static __init int tracer_init_debugfs(void)
4153 struct dentry *d_tracer; 4256 struct dentry *d_tracer;
4154 int cpu; 4257 int cpu;
4155 4258
4259 trace_access_lock_init();
4260
4156 d_tracer = tracing_init_dentry(); 4261 d_tracer = tracing_init_dentry();
4157 4262
4158 trace_create_file("tracing_enabled", 0644, d_tracer, 4263 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4176,10 +4281,10 @@ static __init int tracer_init_debugfs(void)
4176#ifdef CONFIG_TRACER_MAX_TRACE 4281#ifdef CONFIG_TRACER_MAX_TRACE
4177 trace_create_file("tracing_max_latency", 0644, d_tracer, 4282 trace_create_file("tracing_max_latency", 0644, d_tracer,
4178 &tracing_max_latency, &tracing_max_lat_fops); 4283 &tracing_max_latency, &tracing_max_lat_fops);
4284#endif
4179 4285
4180 trace_create_file("tracing_thresh", 0644, d_tracer, 4286 trace_create_file("tracing_thresh", 0644, d_tracer,
4181 &tracing_thresh, &tracing_max_lat_fops); 4287 &tracing_thresh, &tracing_max_lat_fops);
4182#endif
4183 4288
4184 trace_create_file("README", 0444, d_tracer, 4289 trace_create_file("README", 0444, d_tracer,
4185 NULL, &tracing_readme_fops); 4290 NULL, &tracing_readme_fops);
@@ -4279,8 +4384,8 @@ trace_printk_seq(struct trace_seq *s)
4279 4384
4280static void __ftrace_dump(bool disable_tracing) 4385static void __ftrace_dump(bool disable_tracing)
4281{ 4386{
4282 static raw_spinlock_t ftrace_dump_lock = 4387 static arch_spinlock_t ftrace_dump_lock =
4283 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 4388 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
4284 /* use static because iter can be a bit big for the stack */ 4389 /* use static because iter can be a bit big for the stack */
4285 static struct trace_iterator iter; 4390 static struct trace_iterator iter;
4286 unsigned int old_userobj; 4391 unsigned int old_userobj;
@@ -4290,7 +4395,7 @@ static void __ftrace_dump(bool disable_tracing)
4290 4395
4291 /* only one dump */ 4396 /* only one dump */
4292 local_irq_save(flags); 4397 local_irq_save(flags);
4293 __raw_spin_lock(&ftrace_dump_lock); 4398 arch_spin_lock(&ftrace_dump_lock);
4294 if (dump_ran) 4399 if (dump_ran)
4295 goto out; 4400 goto out;
4296 4401
@@ -4365,7 +4470,7 @@ static void __ftrace_dump(bool disable_tracing)
4365 } 4470 }
4366 4471
4367 out: 4472 out:
4368 __raw_spin_unlock(&ftrace_dump_lock); 4473 arch_spin_unlock(&ftrace_dump_lock);
4369 local_irq_restore(flags); 4474 local_irq_restore(flags);
4370} 4475}
4371 4476
@@ -4387,9 +4492,6 @@ __init static int tracer_alloc_buffers(void)
4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4492 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4388 goto out_free_buffer_mask; 4493 goto out_free_buffer_mask;
4389 4494
4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4391 goto out_free_tracing_cpumask;
4392
4393 /* To save memory, keep the ring buffer size to its minimum */ 4495 /* To save memory, keep the ring buffer size to its minimum */
4394 if (ring_buffer_expanded) 4496 if (ring_buffer_expanded)
4395 ring_buf_size = trace_buf_size; 4497 ring_buf_size = trace_buf_size;
@@ -4426,7 +4528,7 @@ __init static int tracer_alloc_buffers(void)
4426 /* Allocate the first page for all buffers */ 4528 /* Allocate the first page for all buffers */
4427 for_each_tracing_cpu(i) { 4529 for_each_tracing_cpu(i) {
4428 global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4530 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4429 max_tr.data[i] = &per_cpu(max_data, i); 4531 max_tr.data[i] = &per_cpu(max_tr_data, i);
4430 } 4532 }
4431 4533
4432 trace_init_cmdlines(); 4534 trace_init_cmdlines();
@@ -4447,8 +4549,6 @@ __init static int tracer_alloc_buffers(void)
4447 return 0; 4549 return 0;
4448 4550
4449out_free_cpumask: 4551out_free_cpumask:
4450 free_cpumask_var(tracing_reader_cpumask);
4451out_free_tracing_cpumask:
4452 free_cpumask_var(tracing_cpumask); 4552 free_cpumask_var(tracing_cpumask);
4453out_free_buffer_mask: 4553out_free_buffer_mask:
4454 free_cpumask_var(tracing_buffer_mask); 4554 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 405cb850b75d..2825ef2c0b15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
40 42
41 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
42}; 44};
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
98struct syscall_trace_exit { 100struct syscall_trace_exit {
99 struct trace_entry ent; 101 struct trace_entry ent;
100 int nr; 102 int nr;
101 unsigned long ret; 103 long ret;
102}; 104};
103 105
106struct kprobe_trace_entry {
107 struct trace_entry ent;
108 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111};
112
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent;
119 unsigned long func;
120 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123};
124
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
104/* 129/*
105 * trace_flag_type is an enumeration that holds different 130 * trace_flag_type is an enumeration that holds different
106 * states when a trace occurs. These are: 131 * states when a trace occurs. These are:
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
209 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
211 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
212 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
213 } while (0) 239 } while (0)
214 240
@@ -246,6 +272,7 @@ struct tracer_flags {
246 * @pipe_open: called when the trace_pipe file is opened 272 * @pipe_open: called when the trace_pipe file is opened
247 * @wait_pipe: override how the user waits for traces on trace_pipe 273 * @wait_pipe: override how the user waits for traces on trace_pipe
248 * @close: called when the trace file is released 274 * @close: called when the trace file is released
275 * @pipe_close: called when the trace_pipe file is released
249 * @read: override the default read callback on trace_pipe 276 * @read: override the default read callback on trace_pipe
250 * @splice_read: override the default splice_read callback on trace_pipe 277 * @splice_read: override the default splice_read callback on trace_pipe
251 * @selftest: selftest to run on boot (see trace_selftest.c) 278 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -264,6 +291,7 @@ struct tracer {
264 void (*pipe_open)(struct trace_iterator *iter); 291 void (*pipe_open)(struct trace_iterator *iter);
265 void (*wait_pipe)(struct trace_iterator *iter); 292 void (*wait_pipe)(struct trace_iterator *iter);
266 void (*close)(struct trace_iterator *iter); 293 void (*close)(struct trace_iterator *iter);
294 void (*pipe_close)(struct trace_iterator *iter);
267 ssize_t (*read)(struct trace_iterator *iter, 295 ssize_t (*read)(struct trace_iterator *iter,
268 struct file *filp, char __user *ubuf, 296 struct file *filp, char __user *ubuf,
269 size_t cnt, loff_t *ppos); 297 size_t cnt, loff_t *ppos);
@@ -364,11 +392,14 @@ int register_tracer(struct tracer *type);
364void unregister_tracer(struct tracer *type); 392void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void); 393int is_tracing_stopped(void);
366 394
395extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396
367extern unsigned long nsecs_to_usecs(unsigned long nsecs); 397extern unsigned long nsecs_to_usecs(unsigned long nsecs);
368 398
399extern unsigned long tracing_thresh;
400
369#ifdef CONFIG_TRACER_MAX_TRACE 401#ifdef CONFIG_TRACER_MAX_TRACE
370extern unsigned long tracing_max_latency; 402extern unsigned long tracing_max_latency;
371extern unsigned long tracing_thresh;
372 403
373void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 404void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
374void update_max_tr_single(struct trace_array *tr, 405void update_max_tr_single(struct trace_array *tr,
@@ -413,7 +444,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
413 444
414extern int ring_buffer_expanded; 445extern int ring_buffer_expanded;
415extern bool tracing_selftest_disabled; 446extern bool tracing_selftest_disabled;
416DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); 447DECLARE_PER_CPU(int, ftrace_cpu_disabled);
417 448
418#ifdef CONFIG_FTRACE_STARTUP_TEST 449#ifdef CONFIG_FTRACE_STARTUP_TEST
419extern int trace_selftest_startup_function(struct tracer *trace, 450extern int trace_selftest_startup_function(struct tracer *trace,
@@ -438,6 +469,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
438 struct trace_array *tr); 469 struct trace_array *tr);
439extern int trace_selftest_startup_hw_branches(struct tracer *trace, 470extern int trace_selftest_startup_hw_branches(struct tracer *trace,
440 struct trace_array *tr); 471 struct trace_array *tr);
472extern int trace_selftest_startup_ksym(struct tracer *trace,
473 struct trace_array *tr);
441#endif /* CONFIG_FTRACE_STARTUP_TEST */ 474#endif /* CONFIG_FTRACE_STARTUP_TEST */
442 475
443extern void *head_page(struct trace_array_cpu *data); 476extern void *head_page(struct trace_array_cpu *data);
@@ -465,6 +498,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
465#ifdef CONFIG_DYNAMIC_FTRACE 498#ifdef CONFIG_DYNAMIC_FTRACE
466/* TODO: make this variable */ 499/* TODO: make this variable */
467#define FTRACE_GRAPH_MAX_FUNCS 32 500#define FTRACE_GRAPH_MAX_FUNCS 32
501extern int ftrace_graph_filter_enabled;
468extern int ftrace_graph_count; 502extern int ftrace_graph_count;
469extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 503extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
470 504
@@ -472,7 +506,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
472{ 506{
473 int i; 507 int i;
474 508
475 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 509 if (!ftrace_graph_filter_enabled)
476 return 1; 510 return 1;
477 511
478 for (i = 0; i < ftrace_graph_count; i++) { 512 for (i = 0; i < ftrace_graph_count; i++) {
@@ -483,10 +517,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
483 return 0; 517 return 0;
484} 518}
485#else 519#else
486static inline int ftrace_trace_addr(unsigned long addr)
487{
488 return 1;
489}
490static inline int ftrace_graph_addr(unsigned long addr) 520static inline int ftrace_graph_addr(unsigned long addr)
491{ 521{
492 return 1; 522 return 1;
@@ -500,12 +530,12 @@ print_graph_function(struct trace_iterator *iter)
500} 530}
501#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 531#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
502 532
503extern struct pid *ftrace_pid_trace; 533extern struct list_head ftrace_pids;
504 534
505#ifdef CONFIG_FUNCTION_TRACER 535#ifdef CONFIG_FUNCTION_TRACER
506static inline int ftrace_trace_task(struct task_struct *task) 536static inline int ftrace_trace_task(struct task_struct *task)
507{ 537{
508 if (!ftrace_pid_trace) 538 if (list_empty(&ftrace_pids))
509 return 1; 539 return 1;
510 540
511 return test_tsk_trace_trace(task); 541 return test_tsk_trace_trace(task);
@@ -521,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
521 * struct trace_parser - servers for reading the user input separated by spaces 551 * struct trace_parser - servers for reading the user input separated by spaces
522 * @cont: set if the input is not complete - no final space char was found 552 * @cont: set if the input is not complete - no final space char was found
523 * @buffer: holds the parsed user input 553 * @buffer: holds the parsed user input
524 * @idx: user input lenght 554 * @idx: user input length
525 * @size: buffer size 555 * @size: buffer size
526 */ 556 */
527struct trace_parser { 557struct trace_parser {
@@ -569,18 +599,17 @@ enum trace_iterator_flags {
569 TRACE_ITER_BIN = 0x40, 599 TRACE_ITER_BIN = 0x40,
570 TRACE_ITER_BLOCK = 0x80, 600 TRACE_ITER_BLOCK = 0x80,
571 TRACE_ITER_STACKTRACE = 0x100, 601 TRACE_ITER_STACKTRACE = 0x100,
572 TRACE_ITER_SCHED_TREE = 0x200, 602 TRACE_ITER_PRINTK = 0x200,
573 TRACE_ITER_PRINTK = 0x400, 603 TRACE_ITER_PREEMPTONLY = 0x400,
574 TRACE_ITER_PREEMPTONLY = 0x800, 604 TRACE_ITER_BRANCH = 0x800,
575 TRACE_ITER_BRANCH = 0x1000, 605 TRACE_ITER_ANNOTATE = 0x1000,
576 TRACE_ITER_ANNOTATE = 0x2000, 606 TRACE_ITER_USERSTACKTRACE = 0x2000,
577 TRACE_ITER_USERSTACKTRACE = 0x4000, 607 TRACE_ITER_SYM_USEROBJ = 0x4000,
578 TRACE_ITER_SYM_USEROBJ = 0x8000, 608 TRACE_ITER_PRINTK_MSGONLY = 0x8000,
579 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 609 TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
580 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 610 TRACE_ITER_LATENCY_FMT = 0x20000,
581 TRACE_ITER_LATENCY_FMT = 0x40000, 611 TRACE_ITER_SLEEP_TIME = 0x40000,
582 TRACE_ITER_SLEEP_TIME = 0x80000, 612 TRACE_ITER_GRAPH_TIME = 0x80000,
583 TRACE_ITER_GRAPH_TIME = 0x100000,
584}; 613};
585 614
586/* 615/*
@@ -687,7 +716,6 @@ struct event_filter {
687 int n_preds; 716 int n_preds;
688 struct filter_pred **preds; 717 struct filter_pred **preds;
689 char *filter_string; 718 char *filter_string;
690 bool no_reset;
691}; 719};
692 720
693struct event_subsystem { 721struct event_subsystem {
@@ -699,22 +727,40 @@ struct event_subsystem {
699}; 727};
700 728
701struct filter_pred; 729struct filter_pred;
730struct regex;
702 731
703typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 732typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
704 int val1, int val2); 733 int val1, int val2);
705 734
735typedef int (*regex_match_func)(char *str, struct regex *r, int len);
736
737enum regex_type {
738 MATCH_FULL = 0,
739 MATCH_FRONT_ONLY,
740 MATCH_MIDDLE_ONLY,
741 MATCH_END_ONLY,
742};
743
744struct regex {
745 char pattern[MAX_FILTER_STR_VAL];
746 int len;
747 int field_len;
748 regex_match_func match;
749};
750
706struct filter_pred { 751struct filter_pred {
707 filter_pred_fn_t fn; 752 filter_pred_fn_t fn;
708 u64 val; 753 u64 val;
709 char str_val[MAX_FILTER_STR_VAL]; 754 struct regex regex;
710 int str_len; 755 char *field_name;
711 char *field_name; 756 int offset;
712 int offset; 757 int not;
713 int not; 758 int op;
714 int op; 759 int pop_n;
715 int pop_n;
716}; 760};
717 761
762extern enum regex_type
763filter_parse_regex(char *buff, int len, char **search, int *not);
718extern void print_event_filter(struct ftrace_event_call *call, 764extern void print_event_filter(struct ftrace_event_call *call,
719 struct trace_seq *s); 765 struct trace_seq *s);
720extern int apply_event_filter(struct ftrace_event_call *call, 766extern int apply_event_filter(struct ftrace_event_call *call,
@@ -730,7 +776,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
730 struct ring_buffer *buffer, 776 struct ring_buffer *buffer,
731 struct ring_buffer_event *event) 777 struct ring_buffer_event *event)
732{ 778{
733 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { 779 if (unlikely(call->filter_active) &&
780 !filter_match_preds(call->filter, rec)) {
734 ring_buffer_discard_commit(buffer, event); 781 ring_buffer_discard_commit(buffer, event);
735 return 1; 782 return 1;
736 } 783 }
@@ -746,7 +793,8 @@ extern const char *__stop___trace_bprintk_fmt[];
746 793
747#undef FTRACE_ENTRY 794#undef FTRACE_ENTRY
748#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 795#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
749 extern struct ftrace_event_call event_##call; 796 extern struct ftrace_event_call \
797 __attribute__((__aligned__(4))) event_##call;
750#undef FTRACE_ENTRY_DUP 798#undef FTRACE_ENTRY_DUP
751#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 799#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
752 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 800 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 307 return -1;
308 if (percent_a > percent_b) 308 if (percent_a > percent_b)
309 return 1; 309 return 1;
310 else 310
311 return 0; 311 if (a->incorrect < b->incorrect)
312 return -1;
313 if (a->incorrect > b->incorrect)
314 return 1;
315
316 /*
317 * Since the above shows worse (incorrect) cases
318 * first, we continue that by showing best (correct)
319 * cases last.
320 */
321 if (a->correct > b->correct)
322 return -1;
323 if (a->correct < b->correct)
324 return 1;
325
326 return 0;
312} 327}
313 328
314static struct tracer_stat annotated_branch_stats = { 329static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 20c5f92e28a8..9d589d8dcd1a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
13 * Tracer plugins will chose a default from these clocks. 13 * Tracer plugins will chose a default from these clocks.
14 */ 14 */
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/irqflags.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
@@ -20,6 +21,8 @@
20#include <linux/ktime.h> 21#include <linux/ktime.h>
21#include <linux/trace_clock.h> 22#include <linux/trace_clock.h>
22 23
24#include "trace.h"
25
23/* 26/*
24 * trace_clock_local(): the simplest and least coherent tracing clock. 27 * trace_clock_local(): the simplest and least coherent tracing clock.
25 * 28 *
@@ -28,17 +31,17 @@
28 */ 31 */
29u64 notrace trace_clock_local(void) 32u64 notrace trace_clock_local(void)
30{ 33{
31 unsigned long flags;
32 u64 clock; 34 u64 clock;
35 int resched;
33 36
34 /* 37 /*
35 * sched_clock() is an architecture implemented, fast, scalable, 38 * sched_clock() is an architecture implemented, fast, scalable,
36 * lockless clock. It is not guaranteed to be coherent across 39 * lockless clock. It is not guaranteed to be coherent across
37 * CPUs, nor across CPU idle events. 40 * CPUs, nor across CPU idle events.
38 */ 41 */
39 raw_local_irq_save(flags); 42 resched = ftrace_preempt_disable();
40 clock = sched_clock(); 43 clock = sched_clock();
41 raw_local_irq_restore(flags); 44 ftrace_preempt_enable(resched);
42 45
43 return clock; 46 return clock;
44} 47}
@@ -69,10 +72,10 @@ u64 notrace trace_clock(void)
69/* keep prev_time and lock in the same cacheline. */ 72/* keep prev_time and lock in the same cacheline. */
70static struct { 73static struct {
71 u64 prev_time; 74 u64 prev_time;
72 raw_spinlock_t lock; 75 arch_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp = 76} trace_clock_struct ____cacheline_aligned_in_smp =
74 { 77 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, 78 .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
76 }; 79 };
77 80
78u64 notrace trace_clock_global(void) 81u64 notrace trace_clock_global(void)
@@ -81,7 +84,7 @@ u64 notrace trace_clock_global(void)
81 int this_cpu; 84 int this_cpu;
82 u64 now; 85 u64 now;
83 86
84 raw_local_irq_save(flags); 87 local_irq_save(flags);
85 88
86 this_cpu = raw_smp_processor_id(); 89 this_cpu = raw_smp_processor_id();
87 now = cpu_clock(this_cpu); 90 now = cpu_clock(this_cpu);
@@ -92,7 +95,7 @@ u64 notrace trace_clock_global(void)
92 if (unlikely(in_nmi())) 95 if (unlikely(in_nmi()))
93 goto out; 96 goto out;
94 97
95 __raw_spin_lock(&trace_clock_struct.lock); 98 arch_spin_lock(&trace_clock_struct.lock);
96 99
97 /* 100 /*
98 * TODO: if this happens often then maybe we should reset 101 * TODO: if this happens often then maybe we should reset
@@ -104,10 +107,10 @@ u64 notrace trace_clock_global(void)
104 107
105 trace_clock_struct.prev_time = now; 108 trace_clock_struct.prev_time = now;
106 109
107 __raw_spin_unlock(&trace_clock_struct.lock); 110 arch_spin_unlock(&trace_clock_struct.lock);
108 111
109 out: 112 out:
110 raw_local_irq_restore(flags); 113 local_irq_restore(flags);
111 114
112 return now; 115 return now;
113} 116}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599d..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
364 F_printk("type:%u call_site:%lx ptr:%p", 364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr) 365 __entry->type_id, __entry->call_site, __entry->ptr)
366); 366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
new file mode 100644
index 000000000000..0565bb42566f
--- /dev/null
+++ b/kernel/trace/trace_event_perf.c
@@ -0,0 +1,175 @@
1/*
2 * trace event based perf event profiling/tracing
3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
6 */
7
8#include <linux/module.h>
9#include <linux/kprobes.h>
10#include "trace.h"
11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
16
17static char *perf_trace_buf;
18static char *perf_trace_buf_nmi;
19
20/*
21 * Force it to be aligned to unsigned long to avoid misaligned accesses
22 * suprises
23 */
24typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
25 perf_trace_t;
26
27/* Count the events in use (per event id, not per instance) */
28static int total_ref_count;
29
30static int perf_trace_event_enable(struct ftrace_event_call *event)
31{
32 char *buf;
33 int ret = -ENOMEM;
34
35 if (event->perf_refcount++ > 0)
36 return 0;
37
38 if (!total_ref_count) {
39 buf = (char *)alloc_percpu(perf_trace_t);
40 if (!buf)
41 goto fail_buf;
42
43 rcu_assign_pointer(perf_trace_buf, buf);
44
45 buf = (char *)alloc_percpu(perf_trace_t);
46 if (!buf)
47 goto fail_buf_nmi;
48
49 rcu_assign_pointer(perf_trace_buf_nmi, buf);
50 }
51
52 ret = event->perf_event_enable(event);
53 if (!ret) {
54 total_ref_count++;
55 return 0;
56 }
57
58fail_buf_nmi:
59 if (!total_ref_count) {
60 free_percpu(perf_trace_buf_nmi);
61 free_percpu(perf_trace_buf);
62 perf_trace_buf_nmi = NULL;
63 perf_trace_buf = NULL;
64 }
65fail_buf:
66 event->perf_refcount--;
67
68 return ret;
69}
70
71int perf_trace_enable(int event_id)
72{
73 struct ftrace_event_call *event;
74 int ret = -EINVAL;
75
76 mutex_lock(&event_mutex);
77 list_for_each_entry(event, &ftrace_events, list) {
78 if (event->id == event_id && event->perf_event_enable &&
79 try_module_get(event->mod)) {
80 ret = perf_trace_event_enable(event);
81 break;
82 }
83 }
84 mutex_unlock(&event_mutex);
85
86 return ret;
87}
88
89static void perf_trace_event_disable(struct ftrace_event_call *event)
90{
91 char *buf, *nmi_buf;
92
93 if (--event->perf_refcount > 0)
94 return;
95
96 event->perf_event_disable(event);
97
98 if (!--total_ref_count) {
99 buf = perf_trace_buf;
100 rcu_assign_pointer(perf_trace_buf, NULL);
101
102 nmi_buf = perf_trace_buf_nmi;
103 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
104
105 /*
106 * Ensure every event being profiled has finished before
107 * releasing the buffers
108 */
109 synchronize_sched();
110
111 free_percpu(buf);
112 free_percpu(nmi_buf);
113 }
114}
115
116void perf_trace_disable(int event_id)
117{
118 struct ftrace_event_call *event;
119
120 mutex_lock(&event_mutex);
121 list_for_each_entry(event, &ftrace_events, list) {
122 if (event->id == event_id) {
123 perf_trace_event_disable(event);
124 module_put(event->mod);
125 break;
126 }
127 }
128 mutex_unlock(&event_mutex);
129}
130
131__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
132 int *rctxp, unsigned long *irq_flags)
133{
134 struct trace_entry *entry;
135 char *trace_buf, *raw_data;
136 int pc, cpu;
137
138 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
139
140 pc = preempt_count();
141
142 /* Protect the per cpu buffer, begin the rcu read side */
143 local_irq_save(*irq_flags);
144
145 *rctxp = perf_swevent_get_recursion_context();
146 if (*rctxp < 0)
147 goto err_recursion;
148
149 cpu = smp_processor_id();
150
151 if (in_nmi())
152 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
153 else
154 trace_buf = rcu_dereference_sched(perf_trace_buf);
155
156 if (!trace_buf)
157 goto err;
158
159 raw_data = per_cpu_ptr(trace_buf, cpu);
160
161 /* zero the dead bytes from align to not leak stack to user */
162 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
163
164 entry = (struct trace_entry *)raw_data;
165 tracing_generic_entry_update(entry, *irq_flags, pc);
166 entry->type = type;
167
168 return raw_data;
169err:
170 perf_swevent_put_recursion_context(*rctxp);
171err_recursion:
172 local_irq_restore(*irq_flags);
173 return NULL;
174}
175EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
deleted file mode 100644
index 8d5c171cc998..000000000000
--- a/kernel/trace/trace_event_profile.c
+++ /dev/null
@@ -1,125 +0,0 @@
1/*
2 * trace event based perf counter profiling
3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 */
7
8#include <linux/module.h>
9#include "trace.h"
10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16
17char *trace_profile_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf);
19
20char *trace_profile_buf_nmi;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22
23/* Count the events in use (per event id, not per instance) */
24static int total_profile_count;
25
26static int ftrace_profile_enable_event(struct ftrace_event_call *event)
27{
28 char *buf;
29 int ret = -ENOMEM;
30
31 if (atomic_inc_return(&event->profile_count))
32 return 0;
33
34 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t);
36 if (!buf)
37 goto fail_buf;
38
39 rcu_assign_pointer(trace_profile_buf, buf);
40
41 buf = (char *)alloc_percpu(profile_buf_t);
42 if (!buf)
43 goto fail_buf_nmi;
44
45 rcu_assign_pointer(trace_profile_buf_nmi, buf);
46 }
47
48 ret = event->profile_enable();
49 if (!ret) {
50 total_profile_count++;
51 return 0;
52 }
53
54fail_buf_nmi:
55 if (!total_profile_count) {
56 free_percpu(trace_profile_buf_nmi);
57 free_percpu(trace_profile_buf);
58 trace_profile_buf_nmi = NULL;
59 trace_profile_buf = NULL;
60 }
61fail_buf:
62 atomic_dec(&event->profile_count);
63
64 return ret;
65}
66
67int ftrace_profile_enable(int event_id)
68{
69 struct ftrace_event_call *event;
70 int ret = -EINVAL;
71
72 mutex_lock(&event_mutex);
73 list_for_each_entry(event, &ftrace_events, list) {
74 if (event->id == event_id && event->profile_enable &&
75 try_module_get(event->mod)) {
76 ret = ftrace_profile_enable_event(event);
77 break;
78 }
79 }
80 mutex_unlock(&event_mutex);
81
82 return ret;
83}
84
85static void ftrace_profile_disable_event(struct ftrace_event_call *event)
86{
87 char *buf, *nmi_buf;
88
89 if (!atomic_add_negative(-1, &event->profile_count))
90 return;
91
92 event->profile_disable();
93
94 if (!--total_profile_count) {
95 buf = trace_profile_buf;
96 rcu_assign_pointer(trace_profile_buf, NULL);
97
98 nmi_buf = trace_profile_buf_nmi;
99 rcu_assign_pointer(trace_profile_buf_nmi, NULL);
100
101 /*
102 * Ensure every events in profiling have finished before
103 * releasing the buffers
104 */
105 synchronize_sched();
106
107 free_percpu(buf);
108 free_percpu(nmi_buf);
109 }
110}
111
112void ftrace_profile_disable(int event_id)
113{
114 struct ftrace_event_call *event;
115
116 mutex_lock(&event_mutex);
117 list_for_each_entry(event, &ftrace_events, list) {
118 if (event->id == event_id) {
119 ftrace_profile_disable_event(event);
120 module_put(event->mod);
121 break;
122 }
123 }
124 mutex_unlock(&event_mutex);
125}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d128f65778e6..c697c7043349 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,6 +15,7 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/slab.h>
18#include <linux/delay.h> 19#include <linux/delay.h>
19 20
20#include <asm/setup.h> 21#include <asm/setup.h>
@@ -60,10 +61,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
60 return 0; 61 return 0;
61 62
62err: 63err:
63 if (field) { 64 if (field)
64 kfree(field->name); 65 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 66 kfree(field);
68 67
69 return -ENOMEM; 68 return -ENOMEM;
@@ -78,7 +77,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
78 if (ret) \ 77 if (ret) \
79 return ret; 78 return ret;
80 79
81int trace_define_common_fields(struct ftrace_event_call *call) 80static int trace_define_common_fields(struct ftrace_event_call *call)
82{ 81{
83 int ret; 82 int ret;
84 struct trace_entry ent; 83 struct trace_entry ent;
@@ -91,11 +90,8 @@ int trace_define_common_fields(struct ftrace_event_call *call)
91 90
92 return ret; 91 return ret;
93} 92}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95
96#ifdef CONFIG_MODULES
97 93
98static void trace_destroy_fields(struct ftrace_event_call *call) 94void trace_destroy_fields(struct ftrace_event_call *call)
99{ 95{
100 struct ftrace_event_field *field, *next; 96 struct ftrace_event_field *field, *next;
101 97
@@ -107,27 +103,49 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
107 } 103 }
108} 104}
109 105
110#endif /* CONFIG_MODULES */ 106int trace_event_raw_init(struct ftrace_event_call *call)
107{
108 int id;
111 109
112static void ftrace_event_enable_disable(struct ftrace_event_call *call, 110 id = register_ftrace_event(call->event);
111 if (!id)
112 return -ENODEV;
113 call->id = id;
114 INIT_LIST_HEAD(&call->fields);
115
116 return 0;
117}
118EXPORT_SYMBOL_GPL(trace_event_raw_init);
119
120static int ftrace_event_enable_disable(struct ftrace_event_call *call,
113 int enable) 121 int enable)
114{ 122{
123 int ret = 0;
124
115 switch (enable) { 125 switch (enable) {
116 case 0: 126 case 0:
117 if (call->enabled) { 127 if (call->enabled) {
118 call->enabled = 0; 128 call->enabled = 0;
119 tracing_stop_cmdline_record(); 129 tracing_stop_cmdline_record();
120 call->unregfunc(call->data); 130 call->unregfunc(call);
121 } 131 }
122 break; 132 break;
123 case 1: 133 case 1:
124 if (!call->enabled) { 134 if (!call->enabled) {
125 call->enabled = 1;
126 tracing_start_cmdline_record(); 135 tracing_start_cmdline_record();
127 call->regfunc(call->data); 136 ret = call->regfunc(call);
137 if (ret) {
138 tracing_stop_cmdline_record();
139 pr_info("event trace: Could not enable event "
140 "%s\n", call->name);
141 break;
142 }
143 call->enabled = 1;
128 } 144 }
129 break; 145 break;
130 } 146 }
147
148 return ret;
131} 149}
132 150
133static void ftrace_clear_events(void) 151static void ftrace_clear_events(void)
@@ -406,7 +424,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
406 case 0: 424 case 0:
407 case 1: 425 case 1:
408 mutex_lock(&event_mutex); 426 mutex_lock(&event_mutex);
409 ftrace_event_enable_disable(call, val); 427 ret = ftrace_event_enable_disable(call, val);
410 mutex_unlock(&event_mutex); 428 mutex_unlock(&event_mutex);
411 break; 429 break;
412 430
@@ -416,7 +434,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
416 434
417 *ppos += cnt; 435 *ppos += cnt;
418 436
419 return cnt; 437 return ret ? ret : cnt;
420} 438}
421 439
422static ssize_t 440static ssize_t
@@ -501,41 +519,16 @@ out:
501 return ret; 519 return ret;
502} 520}
503 521
504extern char *__bad_type_size(void);
505
506#undef FIELD
507#define FIELD(type, name) \
508 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
509 #type, "common_" #name, offsetof(typeof(field), name), \
510 sizeof(field.name)
511
512static int trace_write_header(struct trace_seq *s)
513{
514 struct trace_entry field;
515
516 /* struct trace_entry */
517 return trace_seq_printf(s,
518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
519 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
520 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
521 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
522 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
523 "\n",
524 FIELD(unsigned short, type),
525 FIELD(unsigned char, flags),
526 FIELD(unsigned char, preempt_count),
527 FIELD(int, pid),
528 FIELD(int, lock_depth));
529}
530
531static ssize_t 522static ssize_t
532event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 523event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
533 loff_t *ppos) 524 loff_t *ppos)
534{ 525{
535 struct ftrace_event_call *call = filp->private_data; 526 struct ftrace_event_call *call = filp->private_data;
527 struct ftrace_event_field *field;
536 struct trace_seq *s; 528 struct trace_seq *s;
529 int common_field_count = 5;
537 char *buf; 530 char *buf;
538 int r; 531 int r = 0;
539 532
540 if (*ppos) 533 if (*ppos)
541 return 0; 534 return 0;
@@ -546,14 +539,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
546 539
547 trace_seq_init(s); 540 trace_seq_init(s);
548 541
549 /* If any of the first writes fail, so will the show_format. */
550
551 trace_seq_printf(s, "name: %s\n", call->name); 542 trace_seq_printf(s, "name: %s\n", call->name);
552 trace_seq_printf(s, "ID: %d\n", call->id); 543 trace_seq_printf(s, "ID: %d\n", call->id);
553 trace_seq_printf(s, "format:\n"); 544 trace_seq_printf(s, "format:\n");
554 trace_write_header(s);
555 545
556 r = call->show_format(call, s); 546 list_for_each_entry_reverse(field, &call->fields, link) {
547 /*
548 * Smartly show the array type (except for dynamic arrays).
549 * Normal:
550 * field:TYPE VAR
551 * If TYPE := TYPE[LEN], it is shown:
552 * field:TYPE VAR[LEN]
553 */
554 const char *array_descriptor = strchr(field->type, '[');
555
556 if (!strncmp(field->type, "__data_loc", 10))
557 array_descriptor = NULL;
558
559 if (!array_descriptor) {
560 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
561 "\tsize:%u;\tsigned:%d;\n",
562 field->type, field->name, field->offset,
563 field->size, !!field->is_signed);
564 } else {
565 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
566 "\tsize:%u;\tsigned:%d;\n",
567 (int)(array_descriptor - field->type),
568 field->type, field->name,
569 array_descriptor, field->offset,
570 field->size, !!field->is_signed);
571 }
572
573 if (--common_field_count == 0)
574 r = trace_seq_printf(s, "\n");
575
576 if (!r)
577 break;
578 }
579
580 if (r)
581 r = trace_seq_printf(s, "\nprint fmt: %s\n",
582 call->print_fmt);
583
557 if (!r) { 584 if (!r) {
558 /* 585 /*
559 * ug! The format output is bigger than a PAGE!! 586 * ug! The format output is bigger than a PAGE!!
@@ -878,9 +905,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
878 "'%s/filter' entry\n", name); 905 "'%s/filter' entry\n", name);
879 } 906 }
880 907
881 entry = trace_create_file("enable", 0644, system->entry, 908 trace_create_file("enable", 0644, system->entry,
882 (void *)system->name, 909 (void *)system->name,
883 &ftrace_system_enable_fops); 910 &ftrace_system_enable_fops);
884 911
885 return system->entry; 912 return system->entry;
886} 913}
@@ -892,7 +919,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
892 const struct file_operations *filter, 919 const struct file_operations *filter,
893 const struct file_operations *format) 920 const struct file_operations *format)
894{ 921{
895 struct dentry *entry;
896 int ret; 922 int ret;
897 923
898 /* 924 /*
@@ -910,55 +936,72 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
910 } 936 }
911 937
912 if (call->regfunc) 938 if (call->regfunc)
913 entry = trace_create_file("enable", 0644, call->dir, call, 939 trace_create_file("enable", 0644, call->dir, call,
914 enable); 940 enable);
915 941
916 if (call->id && call->profile_enable) 942 if (call->id && call->perf_event_enable)
917 entry = trace_create_file("id", 0444, call->dir, call, 943 trace_create_file("id", 0444, call->dir, call,
918 id); 944 id);
919 945
920 if (call->define_fields) { 946 if (call->define_fields) {
921 ret = call->define_fields(call); 947 ret = trace_define_common_fields(call);
948 if (!ret)
949 ret = call->define_fields(call);
922 if (ret < 0) { 950 if (ret < 0) {
923 pr_warning("Could not initialize trace point" 951 pr_warning("Could not initialize trace point"
924 " events/%s\n", call->name); 952 " events/%s\n", call->name);
925 return ret; 953 return ret;
926 } 954 }
927 entry = trace_create_file("filter", 0644, call->dir, call, 955 trace_create_file("filter", 0644, call->dir, call,
928 filter); 956 filter);
929 } 957 }
930 958
931 /* A trace may not want to export its format */ 959 trace_create_file("format", 0444, call->dir, call,
932 if (!call->show_format) 960 format);
933 return 0;
934
935 entry = trace_create_file("format", 0444, call->dir, call,
936 format);
937 961
938 return 0; 962 return 0;
939} 963}
940 964
941#define for_each_event(event, start, end) \ 965static int __trace_add_event_call(struct ftrace_event_call *call)
942 for (event = start; \ 966{
943 (unsigned long)event < (unsigned long)end; \ 967 struct dentry *d_events;
944 event++) 968 int ret;
945 969
946#ifdef CONFIG_MODULES 970 if (!call->name)
971 return -EINVAL;
947 972
948static LIST_HEAD(ftrace_module_file_list); 973 if (call->raw_init) {
974 ret = call->raw_init(call);
975 if (ret < 0) {
976 if (ret != -ENOSYS)
977 pr_warning("Could not initialize trace "
978 "events/%s\n", call->name);
979 return ret;
980 }
981 }
949 982
950/* 983 d_events = event_trace_events_dir();
951 * Modules must own their file_operations to keep up with 984 if (!d_events)
952 * reference counting. 985 return -ENOENT;
953 */ 986
954struct ftrace_module_file_ops { 987 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
955 struct list_head list; 988 &ftrace_enable_fops, &ftrace_event_filter_fops,
956 struct module *mod; 989 &ftrace_event_format_fops);
957 struct file_operations id; 990 if (!ret)
958 struct file_operations enable; 991 list_add(&call->list, &ftrace_events);
959 struct file_operations format; 992
960 struct file_operations filter; 993 return ret;
961}; 994}
995
996/* Add an additional event_call dynamically */
997int trace_add_event_call(struct ftrace_event_call *call)
998{
999 int ret;
1000 mutex_lock(&event_mutex);
1001 ret = __trace_add_event_call(call);
1002 mutex_unlock(&event_mutex);
1003 return ret;
1004}
962 1005
963static void remove_subsystem_dir(const char *name) 1006static void remove_subsystem_dir(const char *name)
964{ 1007{
@@ -986,6 +1029,53 @@ static void remove_subsystem_dir(const char *name)
986 } 1029 }
987} 1030}
988 1031
1032/*
1033 * Must be called under locking both of event_mutex and trace_event_mutex.
1034 */
1035static void __trace_remove_event_call(struct ftrace_event_call *call)
1036{
1037 ftrace_event_enable_disable(call, 0);
1038 if (call->event)
1039 __unregister_ftrace_event(call->event);
1040 debugfs_remove_recursive(call->dir);
1041 list_del(&call->list);
1042 trace_destroy_fields(call);
1043 destroy_preds(call);
1044 remove_subsystem_dir(call->system);
1045}
1046
1047/* Remove an event_call */
1048void trace_remove_event_call(struct ftrace_event_call *call)
1049{
1050 mutex_lock(&event_mutex);
1051 down_write(&trace_event_mutex);
1052 __trace_remove_event_call(call);
1053 up_write(&trace_event_mutex);
1054 mutex_unlock(&event_mutex);
1055}
1056
1057#define for_each_event(event, start, end) \
1058 for (event = start; \
1059 (unsigned long)event < (unsigned long)end; \
1060 event++)
1061
1062#ifdef CONFIG_MODULES
1063
1064static LIST_HEAD(ftrace_module_file_list);
1065
1066/*
1067 * Modules must own their file_operations to keep up with
1068 * reference counting.
1069 */
1070struct ftrace_module_file_ops {
1071 struct list_head list;
1072 struct module *mod;
1073 struct file_operations id;
1074 struct file_operations enable;
1075 struct file_operations format;
1076 struct file_operations filter;
1077};
1078
989static struct ftrace_module_file_ops * 1079static struct ftrace_module_file_ops *
990trace_create_file_ops(struct module *mod) 1080trace_create_file_ops(struct module *mod)
991{ 1081{
@@ -1043,7 +1133,7 @@ static void trace_module_add_events(struct module *mod)
1043 if (!call->name) 1133 if (!call->name)
1044 continue; 1134 continue;
1045 if (call->raw_init) { 1135 if (call->raw_init) {
1046 ret = call->raw_init(); 1136 ret = call->raw_init(call);
1047 if (ret < 0) { 1137 if (ret < 0) {
1048 if (ret != -ENOSYS) 1138 if (ret != -ENOSYS)
1049 pr_warning("Could not initialize trace " 1139 pr_warning("Could not initialize trace "
@@ -1061,10 +1151,11 @@ static void trace_module_add_events(struct module *mod)
1061 return; 1151 return;
1062 } 1152 }
1063 call->mod = mod; 1153 call->mod = mod;
1064 list_add(&call->list, &ftrace_events); 1154 ret = event_create_dir(call, d_events,
1065 event_create_dir(call, d_events, 1155 &file_ops->id, &file_ops->enable,
1066 &file_ops->id, &file_ops->enable, 1156 &file_ops->filter, &file_ops->format);
1067 &file_ops->filter, &file_ops->format); 1157 if (!ret)
1158 list_add(&call->list, &ftrace_events);
1068 } 1159 }
1069} 1160}
1070 1161
@@ -1078,14 +1169,7 @@ static void trace_module_remove_events(struct module *mod)
1078 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1169 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1079 if (call->mod == mod) { 1170 if (call->mod == mod) {
1080 found = true; 1171 found = true;
1081 ftrace_event_enable_disable(call, 0); 1172 __trace_remove_event_call(call);
1082 if (call->event)
1083 __unregister_ftrace_event(call->event);
1084 debugfs_remove_recursive(call->dir);
1085 list_del(&call->list);
1086 trace_destroy_fields(call);
1087 destroy_preds(call);
1088 remove_subsystem_dir(call->system);
1089 } 1173 }
1090 } 1174 }
1091 1175
@@ -1203,7 +1287,7 @@ static __init int event_trace_init(void)
1203 if (!call->name) 1287 if (!call->name)
1204 continue; 1288 continue;
1205 if (call->raw_init) { 1289 if (call->raw_init) {
1206 ret = call->raw_init(); 1290 ret = call->raw_init(call);
1207 if (ret < 0) { 1291 if (ret < 0) {
1208 if (ret != -ENOSYS) 1292 if (ret != -ENOSYS)
1209 pr_warning("Could not initialize trace " 1293 pr_warning("Could not initialize trace "
@@ -1211,10 +1295,12 @@ static __init int event_trace_init(void)
1211 continue; 1295 continue;
1212 } 1296 }
1213 } 1297 }
1214 list_add(&call->list, &ftrace_events); 1298 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1215 event_create_dir(call, d_events, &ftrace_event_id_fops, 1299 &ftrace_enable_fops,
1216 &ftrace_enable_fops, &ftrace_event_filter_fops, 1300 &ftrace_event_filter_fops,
1217 &ftrace_event_format_fops); 1301 &ftrace_event_format_fops);
1302 if (!ret)
1303 list_add(&call->list, &ftrace_events);
1218 } 1304 }
1219 1305
1220 while (true) { 1306 while (true) {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 98a6cc5c64ed..88c0b6dbd7fe 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,11 @@
18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> 18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
19 */ 19 */
20 20
21#include <linux/debugfs.h>
22#include <linux/uaccess.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/ctype.h> 22#include <linux/ctype.h>
25#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h>
25#include <linux/slab.h>
26 26
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
@@ -31,6 +31,7 @@ enum filter_op_ids
31{ 31{
32 OP_OR, 32 OP_OR,
33 OP_AND, 33 OP_AND,
34 OP_GLOB,
34 OP_NE, 35 OP_NE,
35 OP_EQ, 36 OP_EQ,
36 OP_LT, 37 OP_LT,
@@ -48,16 +49,17 @@ struct filter_op {
48}; 49};
49 50
50static struct filter_op filter_ops[] = { 51static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 }, 52 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 }, 53 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 }, 54 { OP_GLOB, "~", 4 },
54 { OP_EQ, "==", 4 }, 55 { OP_NE, "!=", 4 },
55 { OP_LT, "<", 5 }, 56 { OP_EQ, "==", 4 },
56 { OP_LE, "<=", 5 }, 57 { OP_LT, "<", 5 },
57 { OP_GT, ">", 5 }, 58 { OP_LE, "<=", 5 },
58 { OP_GE, ">=", 5 }, 59 { OP_GT, ">", 5 },
59 { OP_NONE, "OP_NONE", 0 }, 60 { OP_GE, ">=", 5 },
60 { OP_OPEN_PAREN, "(", 0 }, 61 { OP_NONE, "OP_NONE", 0 },
62 { OP_OPEN_PAREN, "(", 0 },
61}; 63};
62 64
63enum { 65enum {
@@ -197,9 +199,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
197 char *addr = (char *)(event + pred->offset); 199 char *addr = (char *)(event + pred->offset);
198 int cmp, match; 200 int cmp, match;
199 201
200 cmp = strncmp(addr, pred->str_val, pred->str_len); 202 cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
201 203
202 match = (!cmp) ^ pred->not; 204 match = cmp ^ pred->not;
203 205
204 return match; 206 return match;
205} 207}
@@ -210,10 +212,11 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
210{ 212{
211 char **addr = (char **)(event + pred->offset); 213 char **addr = (char **)(event + pred->offset);
212 int cmp, match; 214 int cmp, match;
215 int len = strlen(*addr) + 1; /* including tailing '\0' */
213 216
214 cmp = strncmp(*addr, pred->str_val, pred->str_len); 217 cmp = pred->regex.match(*addr, &pred->regex, len);
215 218
216 match = (!cmp) ^ pred->not; 219 match = cmp ^ pred->not;
217 220
218 return match; 221 return match;
219} 222}
@@ -237,9 +240,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
237 char *addr = (char *)(event + str_loc); 240 char *addr = (char *)(event + str_loc);
238 int cmp, match; 241 int cmp, match;
239 242
240 cmp = strncmp(addr, pred->str_val, str_len); 243 cmp = pred->regex.match(addr, &pred->regex, str_len);
241 244
242 match = (!cmp) ^ pred->not; 245 match = cmp ^ pred->not;
243 246
244 return match; 247 return match;
245} 248}
@@ -250,10 +253,133 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
250 return 0; 253 return 0;
251} 254}
252 255
256/*
257 * regex_match_foo - Basic regex callbacks
258 *
259 * @str: the string to be searched
260 * @r: the regex structure containing the pattern string
261 * @len: the length of the string to be searched (including '\0')
262 *
263 * Note:
264 * - @str might not be NULL-terminated if it's of type DYN_STRING
265 * or STATIC_STRING
266 */
267
268static int regex_match_full(char *str, struct regex *r, int len)
269{
270 if (strncmp(str, r->pattern, len) == 0)
271 return 1;
272 return 0;
273}
274
275static int regex_match_front(char *str, struct regex *r, int len)
276{
277 if (strncmp(str, r->pattern, r->len) == 0)
278 return 1;
279 return 0;
280}
281
282static int regex_match_middle(char *str, struct regex *r, int len)
283{
284 if (strnstr(str, r->pattern, len))
285 return 1;
286 return 0;
287}
288
289static int regex_match_end(char *str, struct regex *r, int len)
290{
291 int strlen = len - 1;
292
293 if (strlen >= r->len &&
294 memcmp(str + strlen - r->len, r->pattern, r->len) == 0)
295 return 1;
296 return 0;
297}
298
299/**
300 * filter_parse_regex - parse a basic regex
301 * @buff: the raw regex
302 * @len: length of the regex
303 * @search: will point to the beginning of the string to compare
304 * @not: tell whether the match will have to be inverted
305 *
306 * This passes in a buffer containing a regex and this function will
307 * set search to point to the search part of the buffer and
308 * return the type of search it is (see enum above).
309 * This does modify buff.
310 *
311 * Returns enum type.
312 * search returns the pointer to use for comparison.
313 * not returns 1 if buff started with a '!'
314 * 0 otherwise.
315 */
316enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
317{
318 int type = MATCH_FULL;
319 int i;
320
321 if (buff[0] == '!') {
322 *not = 1;
323 buff++;
324 len--;
325 } else
326 *not = 0;
327
328 *search = buff;
329
330 for (i = 0; i < len; i++) {
331 if (buff[i] == '*') {
332 if (!i) {
333 *search = buff + 1;
334 type = MATCH_END_ONLY;
335 } else {
336 if (type == MATCH_END_ONLY)
337 type = MATCH_MIDDLE_ONLY;
338 else
339 type = MATCH_FRONT_ONLY;
340 buff[i] = 0;
341 break;
342 }
343 }
344 }
345
346 return type;
347}
348
349static void filter_build_regex(struct filter_pred *pred)
350{
351 struct regex *r = &pred->regex;
352 char *search;
353 enum regex_type type = MATCH_FULL;
354 int not = 0;
355
356 if (pred->op == OP_GLOB) {
357 type = filter_parse_regex(r->pattern, r->len, &search, &not);
358 r->len = strlen(search);
359 memmove(r->pattern, search, r->len+1);
360 }
361
362 switch (type) {
363 case MATCH_FULL:
364 r->match = regex_match_full;
365 break;
366 case MATCH_FRONT_ONLY:
367 r->match = regex_match_front;
368 break;
369 case MATCH_MIDDLE_ONLY:
370 r->match = regex_match_middle;
371 break;
372 case MATCH_END_ONLY:
373 r->match = regex_match_end;
374 break;
375 }
376
377 pred->not ^= not;
378}
379
253/* return 1 if event matches, 0 otherwise (discard) */ 380/* return 1 if event matches, 0 otherwise (discard) */
254int filter_match_preds(struct ftrace_event_call *call, void *rec) 381int filter_match_preds(struct event_filter *filter, void *rec)
255{ 382{
256 struct event_filter *filter = call->filter;
257 int match, top = 0, val1 = 0, val2 = 0; 383 int match, top = 0, val1 = 0, val2 = 0;
258 int stack[MAX_FILTER_PRED]; 384 int stack[MAX_FILTER_PRED];
259 struct filter_pred *pred; 385 struct filter_pred *pred;
@@ -396,7 +522,7 @@ static void filter_clear_pred(struct filter_pred *pred)
396{ 522{
397 kfree(pred->field_name); 523 kfree(pred->field_name);
398 pred->field_name = NULL; 524 pred->field_name = NULL;
399 pred->str_len = 0; 525 pred->regex.len = 0;
400} 526}
401 527
402static int filter_set_pred(struct filter_pred *dest, 528static int filter_set_pred(struct filter_pred *dest,
@@ -426,9 +552,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
426 filter->preds[i]->fn = filter_pred_none; 552 filter->preds[i]->fn = filter_pred_none;
427} 553}
428 554
429void destroy_preds(struct ftrace_event_call *call) 555static void __free_preds(struct event_filter *filter)
430{ 556{
431 struct event_filter *filter = call->filter;
432 int i; 557 int i;
433 558
434 if (!filter) 559 if (!filter)
@@ -441,21 +566,24 @@ void destroy_preds(struct ftrace_event_call *call)
441 kfree(filter->preds); 566 kfree(filter->preds);
442 kfree(filter->filter_string); 567 kfree(filter->filter_string);
443 kfree(filter); 568 kfree(filter);
569}
570
571void destroy_preds(struct ftrace_event_call *call)
572{
573 __free_preds(call->filter);
444 call->filter = NULL; 574 call->filter = NULL;
575 call->filter_active = 0;
445} 576}
446 577
447static int init_preds(struct ftrace_event_call *call) 578static struct event_filter *__alloc_preds(void)
448{ 579{
449 struct event_filter *filter; 580 struct event_filter *filter;
450 struct filter_pred *pred; 581 struct filter_pred *pred;
451 int i; 582 int i;
452 583
453 if (call->filter) 584 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
454 return 0; 585 if (!filter)
455 586 return ERR_PTR(-ENOMEM);
456 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
457 if (!call->filter)
458 return -ENOMEM;
459 587
460 filter->n_preds = 0; 588 filter->n_preds = 0;
461 589
@@ -471,12 +599,24 @@ static int init_preds(struct ftrace_event_call *call)
471 filter->preds[i] = pred; 599 filter->preds[i] = pred;
472 } 600 }
473 601
474 return 0; 602 return filter;
475 603
476oom: 604oom:
477 destroy_preds(call); 605 __free_preds(filter);
606 return ERR_PTR(-ENOMEM);
607}
608
609static int init_preds(struct ftrace_event_call *call)
610{
611 if (call->filter)
612 return 0;
478 613
479 return -ENOMEM; 614 call->filter_active = 0;
615 call->filter = __alloc_preds();
616 if (IS_ERR(call->filter))
617 return PTR_ERR(call->filter);
618
619 return 0;
480} 620}
481 621
482static int init_subsystem_preds(struct event_subsystem *system) 622static int init_subsystem_preds(struct event_subsystem *system)
@@ -499,14 +639,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
499 return 0; 639 return 0;
500} 640}
501 641
502enum { 642static void filter_free_subsystem_preds(struct event_subsystem *system)
503 FILTER_DISABLE_ALL,
504 FILTER_INIT_NO_RESET,
505 FILTER_SKIP_NO_RESET,
506};
507
508static void filter_free_subsystem_preds(struct event_subsystem *system,
509 int flag)
510{ 643{
511 struct ftrace_event_call *call; 644 struct ftrace_event_call *call;
512 645
@@ -517,14 +650,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
517 if (strcmp(call->system, system->name) != 0) 650 if (strcmp(call->system, system->name) != 0)
518 continue; 651 continue;
519 652
520 if (flag == FILTER_INIT_NO_RESET) {
521 call->filter->no_reset = false;
522 continue;
523 }
524
525 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
526 continue;
527
528 filter_disable_preds(call); 653 filter_disable_preds(call);
529 remove_filter_string(call->filter); 654 remove_filter_string(call->filter);
530 } 655 }
@@ -532,10 +657,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
532 657
533static int filter_add_pred_fn(struct filter_parse_state *ps, 658static int filter_add_pred_fn(struct filter_parse_state *ps,
534 struct ftrace_event_call *call, 659 struct ftrace_event_call *call,
660 struct event_filter *filter,
535 struct filter_pred *pred, 661 struct filter_pred *pred,
536 filter_pred_fn_t fn) 662 filter_pred_fn_t fn)
537{ 663{
538 struct event_filter *filter = call->filter;
539 int idx, err; 664 int idx, err;
540 665
541 if (filter->n_preds == MAX_FILTER_PRED) { 666 if (filter->n_preds == MAX_FILTER_PRED) {
@@ -550,7 +675,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
550 return err; 675 return err;
551 676
552 filter->n_preds++; 677 filter->n_preds++;
553 call->filter_active = 1;
554 678
555 return 0; 679 return 0;
556} 680}
@@ -575,7 +699,10 @@ static bool is_string_field(struct ftrace_event_field *field)
575 699
576static int is_legal_op(struct ftrace_event_field *field, int op) 700static int is_legal_op(struct ftrace_event_field *field, int op)
577{ 701{
578 if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) 702 if (is_string_field(field) &&
703 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
704 return 0;
705 if (!is_string_field(field) && op == OP_GLOB)
579 return 0; 706 return 0;
580 707
581 return 1; 708 return 1;
@@ -626,6 +753,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
626 753
627static int filter_add_pred(struct filter_parse_state *ps, 754static int filter_add_pred(struct filter_parse_state *ps,
628 struct ftrace_event_call *call, 755 struct ftrace_event_call *call,
756 struct event_filter *filter,
629 struct filter_pred *pred, 757 struct filter_pred *pred,
630 bool dry_run) 758 bool dry_run)
631{ 759{
@@ -660,21 +788,20 @@ static int filter_add_pred(struct filter_parse_state *ps,
660 } 788 }
661 789
662 if (is_string_field(field)) { 790 if (is_string_field(field)) {
663 pred->str_len = field->size; 791 filter_build_regex(pred);
664 792
665 if (field->filter_type == FILTER_STATIC_STRING) 793 if (field->filter_type == FILTER_STATIC_STRING) {
666 fn = filter_pred_string; 794 fn = filter_pred_string;
667 else if (field->filter_type == FILTER_DYN_STRING) 795 pred->regex.field_len = field->size;
796 } else if (field->filter_type == FILTER_DYN_STRING)
668 fn = filter_pred_strloc; 797 fn = filter_pred_strloc;
669 else { 798 else
670 fn = filter_pred_pchar; 799 fn = filter_pred_pchar;
671 pred->str_len = strlen(pred->str_val);
672 }
673 } else { 800 } else {
674 if (field->is_signed) 801 if (field->is_signed)
675 ret = strict_strtoll(pred->str_val, 0, &val); 802 ret = strict_strtoll(pred->regex.pattern, 0, &val);
676 else 803 else
677 ret = strict_strtoull(pred->str_val, 0, &val); 804 ret = strict_strtoull(pred->regex.pattern, 0, &val);
678 if (ret) { 805 if (ret) {
679 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 806 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
680 return -EINVAL; 807 return -EINVAL;
@@ -694,45 +821,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
694 821
695add_pred_fn: 822add_pred_fn:
696 if (!dry_run) 823 if (!dry_run)
697 return filter_add_pred_fn(ps, call, pred, fn); 824 return filter_add_pred_fn(ps, call, filter, pred, fn);
698 return 0;
699}
700
701static int filter_add_subsystem_pred(struct filter_parse_state *ps,
702 struct event_subsystem *system,
703 struct filter_pred *pred,
704 char *filter_string,
705 bool dry_run)
706{
707 struct ftrace_event_call *call;
708 int err = 0;
709 bool fail = true;
710
711 list_for_each_entry(call, &ftrace_events, list) {
712
713 if (!call->define_fields)
714 continue;
715
716 if (strcmp(call->system, system->name))
717 continue;
718
719 if (call->filter->no_reset)
720 continue;
721
722 err = filter_add_pred(ps, call, pred, dry_run);
723 if (err)
724 call->filter->no_reset = true;
725 else
726 fail = false;
727
728 if (!dry_run)
729 replace_filter_string(call->filter, filter_string);
730 }
731
732 if (fail) {
733 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
734 return err;
735 }
736 return 0; 825 return 0;
737} 826}
738 827
@@ -1045,8 +1134,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
1045 return NULL; 1134 return NULL;
1046 } 1135 }
1047 1136
1048 strcpy(pred->str_val, operand2); 1137 strcpy(pred->regex.pattern, operand2);
1049 pred->str_len = strlen(operand2); 1138 pred->regex.len = strlen(pred->regex.pattern);
1050 1139
1051 pred->op = op; 1140 pred->op = op;
1052 1141
@@ -1090,8 +1179,8 @@ static int check_preds(struct filter_parse_state *ps)
1090 return 0; 1179 return 0;
1091} 1180}
1092 1181
1093static int replace_preds(struct event_subsystem *system, 1182static int replace_preds(struct ftrace_event_call *call,
1094 struct ftrace_event_call *call, 1183 struct event_filter *filter,
1095 struct filter_parse_state *ps, 1184 struct filter_parse_state *ps,
1096 char *filter_string, 1185 char *filter_string,
1097 bool dry_run) 1186 bool dry_run)
@@ -1138,11 +1227,7 @@ static int replace_preds(struct event_subsystem *system,
1138add_pred: 1227add_pred:
1139 if (!pred) 1228 if (!pred)
1140 return -ENOMEM; 1229 return -ENOMEM;
1141 if (call) 1230 err = filter_add_pred(ps, call, filter, pred, dry_run);
1142 err = filter_add_pred(ps, call, pred, false);
1143 else
1144 err = filter_add_subsystem_pred(ps, system, pred,
1145 filter_string, dry_run);
1146 filter_free_pred(pred); 1231 filter_free_pred(pred);
1147 if (err) 1232 if (err)
1148 return err; 1233 return err;
@@ -1153,10 +1238,50 @@ add_pred:
1153 return 0; 1238 return 0;
1154} 1239}
1155 1240
1156int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1241static int replace_system_preds(struct event_subsystem *system,
1242 struct filter_parse_state *ps,
1243 char *filter_string)
1157{ 1244{
1245 struct ftrace_event_call *call;
1246 bool fail = true;
1158 int err; 1247 int err;
1159 1248
1249 list_for_each_entry(call, &ftrace_events, list) {
1250 struct event_filter *filter = call->filter;
1251
1252 if (!call->define_fields)
1253 continue;
1254
1255 if (strcmp(call->system, system->name) != 0)
1256 continue;
1257
1258 /* try to see if the filter can be applied */
1259 err = replace_preds(call, filter, ps, filter_string, true);
1260 if (err)
1261 continue;
1262
1263 /* really apply the filter */
1264 filter_disable_preds(call);
1265 err = replace_preds(call, filter, ps, filter_string, false);
1266 if (err)
1267 filter_disable_preds(call);
1268 else {
1269 call->filter_active = 1;
1270 replace_filter_string(filter, filter_string);
1271 }
1272 fail = false;
1273 }
1274
1275 if (fail) {
1276 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1277 return -EINVAL;
1278 }
1279 return 0;
1280}
1281
1282int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1283{
1284 int err;
1160 struct filter_parse_state *ps; 1285 struct filter_parse_state *ps;
1161 1286
1162 mutex_lock(&event_mutex); 1287 mutex_lock(&event_mutex);
@@ -1168,8 +1293,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1168 if (!strcmp(strstrip(filter_string), "0")) { 1293 if (!strcmp(strstrip(filter_string), "0")) {
1169 filter_disable_preds(call); 1294 filter_disable_preds(call);
1170 remove_filter_string(call->filter); 1295 remove_filter_string(call->filter);
1171 mutex_unlock(&event_mutex); 1296 goto out_unlock;
1172 return 0;
1173 } 1297 }
1174 1298
1175 err = -ENOMEM; 1299 err = -ENOMEM;
@@ -1187,10 +1311,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1187 goto out; 1311 goto out;
1188 } 1312 }
1189 1313
1190 err = replace_preds(NULL, call, ps, filter_string, false); 1314 err = replace_preds(call, call->filter, ps, filter_string, false);
1191 if (err) 1315 if (err)
1192 append_filter_err(ps, call->filter); 1316 append_filter_err(ps, call->filter);
1193 1317 else
1318 call->filter_active = 1;
1194out: 1319out:
1195 filter_opstack_clear(ps); 1320 filter_opstack_clear(ps);
1196 postfix_clear(ps); 1321 postfix_clear(ps);
@@ -1205,7 +1330,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1205 char *filter_string) 1330 char *filter_string)
1206{ 1331{
1207 int err; 1332 int err;
1208
1209 struct filter_parse_state *ps; 1333 struct filter_parse_state *ps;
1210 1334
1211 mutex_lock(&event_mutex); 1335 mutex_lock(&event_mutex);
@@ -1215,10 +1339,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1215 goto out_unlock; 1339 goto out_unlock;
1216 1340
1217 if (!strcmp(strstrip(filter_string), "0")) { 1341 if (!strcmp(strstrip(filter_string), "0")) {
1218 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); 1342 filter_free_subsystem_preds(system);
1219 remove_filter_string(system->filter); 1343 remove_filter_string(system->filter);
1220 mutex_unlock(&event_mutex); 1344 goto out_unlock;
1221 return 0;
1222 } 1345 }
1223 1346
1224 err = -ENOMEM; 1347 err = -ENOMEM;
@@ -1235,31 +1358,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1235 goto out; 1358 goto out;
1236 } 1359 }
1237 1360
1238 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); 1361 err = replace_system_preds(system, ps, filter_string);
1239 1362 if (err)
1240 /* try to see the filter can be applied to which events */
1241 err = replace_preds(system, NULL, ps, filter_string, true);
1242 if (err) {
1243 append_filter_err(ps, system->filter); 1363 append_filter_err(ps, system->filter);
1244 goto out; 1364
1365out:
1366 filter_opstack_clear(ps);
1367 postfix_clear(ps);
1368 kfree(ps);
1369out_unlock:
1370 mutex_unlock(&event_mutex);
1371
1372 return err;
1373}
1374
1375#ifdef CONFIG_PERF_EVENTS
1376
1377void ftrace_profile_free_filter(struct perf_event *event)
1378{
1379 struct event_filter *filter = event->filter;
1380
1381 event->filter = NULL;
1382 __free_preds(filter);
1383}
1384
1385int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1386 char *filter_str)
1387{
1388 int err;
1389 struct event_filter *filter;
1390 struct filter_parse_state *ps;
1391 struct ftrace_event_call *call = NULL;
1392
1393 mutex_lock(&event_mutex);
1394
1395 list_for_each_entry(call, &ftrace_events, list) {
1396 if (call->id == event_id)
1397 break;
1245 } 1398 }
1246 1399
1247 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); 1400 err = -EINVAL;
1401 if (!call)
1402 goto out_unlock;
1403
1404 err = -EEXIST;
1405 if (event->filter)
1406 goto out_unlock;
1248 1407
1249 /* really apply the filter to the events */ 1408 filter = __alloc_preds();
1250 err = replace_preds(system, NULL, ps, filter_string, false); 1409 if (IS_ERR(filter)) {
1251 if (err) { 1410 err = PTR_ERR(filter);
1252 append_filter_err(ps, system->filter); 1411 goto out_unlock;
1253 filter_free_subsystem_preds(system, 2);
1254 } 1412 }
1255 1413
1256out: 1414 err = -ENOMEM;
1415 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1416 if (!ps)
1417 goto free_preds;
1418
1419 parse_init(ps, filter_ops, filter_str);
1420 err = filter_parse(ps);
1421 if (err)
1422 goto free_ps;
1423
1424 err = replace_preds(call, filter, ps, filter_str, false);
1425 if (!err)
1426 event->filter = filter;
1427
1428free_ps:
1257 filter_opstack_clear(ps); 1429 filter_opstack_clear(ps);
1258 postfix_clear(ps); 1430 postfix_clear(ps);
1259 kfree(ps); 1431 kfree(ps);
1432
1433free_preds:
1434 if (err)
1435 __free_preds(filter);
1436
1260out_unlock: 1437out_unlock:
1261 mutex_unlock(&event_mutex); 1438 mutex_unlock(&event_mutex);
1262 1439
1263 return err; 1440 return err;
1264} 1441}
1265 1442
1443#endif /* CONFIG_PERF_EVENTS */
1444
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 9753fcc61bc5..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -48,11 +48,11 @@
48struct ____ftrace_##name { \ 48struct ____ftrace_##name { \
49 tstruct \ 49 tstruct \
50}; \ 50}; \
51static void __used ____ftrace_check_##name(void) \ 51static void __always_unused ____ftrace_check_##name(void) \
52{ \ 52{ \
53 struct ____ftrace_##name *__entry = NULL; \ 53 struct ____ftrace_##name *__entry = NULL; \
54 \ 54 \
55 /* force cmpile-time check on F_printk() */ \ 55 /* force compile-time check on F_printk() */ \
56 printk(print); \ 56 printk(print); \
57} 57}
58 58
@@ -62,76 +62,6 @@ static void __used ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \
81 if (!ret) \
82 return 0;
83
84#undef __array
85#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \
88 offsetof(typeof(field), item), \
89 sizeof(field.item)); \
90 if (!ret) \
91 return 0;
92
93#undef __array_desc
94#define __array_desc(type, container, item, len) \
95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
96 "offset:%zu;\tsize:%zu;\n", \
97 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \
99 if (!ret) \
100 return 0;
101
102#undef __dynamic_array
103#define __dynamic_array(type, item) \
104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \
106 offsetof(typeof(field), item)); \
107 if (!ret) \
108 return 0;
109
110#undef F_printk
111#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
112
113#undef __entry
114#define __entry REC
115
116#undef FTRACE_ENTRY
117#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
118static int \
119ftrace_format_##name(struct ftrace_event_call *unused, \
120 struct trace_seq *s) \
121{ \
122 struct struct_name field __attribute__((unused)); \
123 int ret = 0; \
124 \
125 tstruct; \
126 \
127 trace_seq_printf(s, "\nprint fmt: " print); \
128 \
129 return ret; \
130}
131
132#include "trace_entries.h"
133
134
135#undef __field 65#undef __field
136#define __field(type, item) \ 66#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -156,7 +86,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
156 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
157 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
158 offsetof(typeof(field), item), \ 88 offsetof(typeof(field), item), \
159 sizeof(field.item), 0, FILTER_OTHER); \ 89 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \
160 if (ret) \ 91 if (ret) \
161 return ret; 92 return ret;
162 93
@@ -166,13 +97,18 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
166 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 97 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
167 offsetof(typeof(field), \ 98 offsetof(typeof(field), \
168 container.item), \ 99 container.item), \
169 sizeof(field.container.item), 0, \ 100 sizeof(field.container.item), \
170 FILTER_OTHER); \ 101 is_signed_type(type), FILTER_OTHER); \
171 if (ret) \ 102 if (ret) \
172 return ret; 103 return ret;
173 104
174#undef __dynamic_array 105#undef __dynamic_array
175#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
176 112
177#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
178#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -182,10 +118,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
182 struct struct_name field; \ 118 struct struct_name field; \
183 int ret; \ 119 int ret; \
184 \ 120 \
185 ret = trace_define_common_fields(event_call); \
186 if (ret) \
187 return ret; \
188 \
189 tstruct; \ 121 tstruct; \
190 \ 122 \
191 return ret; \ 123 return ret; \
@@ -193,6 +125,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
193 125
194#include "trace_entries.h" 126#include "trace_entries.h"
195 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{
130 INIT_LIST_HEAD(&call->fields);
131 return 0;
132}
133
134#undef __entry
135#define __entry REC
196 136
197#undef __field 137#undef __field
198#define __field(type, item) 138#define __field(type, item)
@@ -209,9 +149,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
209#undef __dynamic_array 149#undef __dynamic_array
210#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
211 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
212#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
214static int ftrace_raw_init_event_##call(void); \
215 \ 157 \
216struct ftrace_event_call __used \ 158struct ftrace_event_call __used \
217__attribute__((__aligned__(4))) \ 159__attribute__((__aligned__(4))) \
@@ -219,14 +161,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
219 .name = #call, \ 161 .name = #call, \
220 .id = type, \ 162 .id = type, \
221 .system = __stringify(TRACE_SYSTEM), \ 163 .system = __stringify(TRACE_SYSTEM), \
222 .raw_init = ftrace_raw_init_event_##call, \ 164 .raw_init = ftrace_raw_init_event, \
223 .show_format = ftrace_format_##call, \ 165 .print_fmt = print, \
224 .define_fields = ftrace_define_fields_##call, \ 166 .define_fields = ftrace_define_fields_##call, \
225}; \ 167}; \
226static int ftrace_raw_init_event_##call(void) \
227{ \
228 INIT_LIST_HEAD(&event_##call.fields); \
229 return 0; \
230} \
231 168
232#include "trace_entries.h" 169#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4d..9aed1a5cf553 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,14 +9,27 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/slab.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14#include "trace.h" 15#include "trace.h"
15#include "trace_output.h" 16#include "trace_output.h"
16 17
17struct fgraph_data { 18struct fgraph_cpu_data {
18 pid_t last_pid; 19 pid_t last_pid;
19 int depth; 20 int depth;
21 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23};
24
25struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data;
27
28 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent;
30 struct ftrace_graph_ret_entry ret;
31 int failed;
32 int cpu;
20}; 33};
21 34
22#define TRACE_GRAPH_INDENT 2 35#define TRACE_GRAPH_INDENT 2
@@ -176,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr,
176 struct ring_buffer *buffer = tr->buffer; 189 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry; 190 struct ftrace_graph_ent_entry *entry;
178 191
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 192 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
180 return 0; 193 return 0;
181 194
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 195 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -201,13 +214,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
201 int cpu; 214 int cpu;
202 int pc; 215 int pc;
203 216
204 if (unlikely(!tr))
205 return 0;
206
207 if (!ftrace_trace_task(current)) 217 if (!ftrace_trace_task(current))
208 return 0; 218 return 0;
209 219
210 if (!ftrace_graph_addr(trace->func)) 220 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func)))
211 return 0; 222 return 0;
212 223
213 local_irq_save(flags); 224 local_irq_save(flags);
@@ -220,9 +231,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
220 } else { 231 } else {
221 ret = 0; 232 ret = 0;
222 } 233 }
223 /* Only do the atomic if it is not already set */
224 if (!test_tsk_trace_graph(current))
225 set_tsk_trace_graph(current);
226 234
227 atomic_dec(&data->disabled); 235 atomic_dec(&data->disabled);
228 local_irq_restore(flags); 236 local_irq_restore(flags);
@@ -230,6 +238,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
230 return ret; 238 return ret;
231} 239}
232 240
241int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
242{
243 if (tracing_thresh)
244 return 1;
245 else
246 return trace_graph_entry(trace);
247}
248
233static void __trace_graph_return(struct trace_array *tr, 249static void __trace_graph_return(struct trace_array *tr,
234 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
235 unsigned long flags, 251 unsigned long flags,
@@ -240,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr,
240 struct ring_buffer *buffer = tr->buffer; 256 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry; 257 struct ftrace_graph_ret_entry *entry;
242 258
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 259 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
244 return; 260 return;
245 261
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 262 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -270,19 +286,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
270 pc = preempt_count(); 286 pc = preempt_count();
271 __trace_graph_return(tr, trace, flags, pc); 287 __trace_graph_return(tr, trace, flags, pc);
272 } 288 }
273 if (!trace->depth)
274 clear_tsk_trace_graph(current);
275 atomic_dec(&data->disabled); 289 atomic_dec(&data->disabled);
276 local_irq_restore(flags); 290 local_irq_restore(flags);
277} 291}
278 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296
297 /* Make graph_array visible before we start tracing */
298
299 smp_mb();
300}
301
302void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
303{
304 if (tracing_thresh &&
305 (trace->rettime - trace->calltime < tracing_thresh))
306 return;
307 else
308 trace_graph_return(trace);
309}
310
279static int graph_trace_init(struct trace_array *tr) 311static int graph_trace_init(struct trace_array *tr)
280{ 312{
281 int ret; 313 int ret;
282 314
283 graph_array = tr; 315 set_graph_array(tr);
284 ret = register_ftrace_graph(&trace_graph_return, 316 if (tracing_thresh)
285 &trace_graph_entry); 317 ret = register_ftrace_graph(&trace_graph_thresh_return,
318 &trace_graph_thresh_entry);
319 else
320 ret = register_ftrace_graph(&trace_graph_return,
321 &trace_graph_entry);
286 if (ret) 322 if (ret)
287 return ret; 323 return ret;
288 tracing_start_cmdline_record(); 324 tracing_start_cmdline_record();
@@ -290,11 +326,6 @@ static int graph_trace_init(struct trace_array *tr)
290 return 0; 326 return 0;
291} 327}
292 328
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296}
297
298static void graph_trace_reset(struct trace_array *tr) 329static void graph_trace_reset(struct trace_array *tr)
299{ 330{
300 tracing_stop_cmdline_record(); 331 tracing_stop_cmdline_record();
@@ -384,7 +415,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
384 if (!data) 415 if (!data)
385 return TRACE_TYPE_HANDLED; 416 return TRACE_TYPE_HANDLED;
386 417
387 last_pid = &(per_cpu_ptr(data, cpu)->last_pid); 418 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
388 419
389 if (*last_pid == pid) 420 if (*last_pid == pid)
390 return TRACE_TYPE_HANDLED; 421 return TRACE_TYPE_HANDLED;
@@ -435,26 +466,49 @@ static struct ftrace_graph_ret_entry *
435get_return_for_leaf(struct trace_iterator *iter, 466get_return_for_leaf(struct trace_iterator *iter,
436 struct ftrace_graph_ent_entry *curr) 467 struct ftrace_graph_ent_entry *curr)
437{ 468{
438 struct ring_buffer_iter *ring_iter; 469 struct fgraph_data *data = iter->private;
470 struct ring_buffer_iter *ring_iter = NULL;
439 struct ring_buffer_event *event; 471 struct ring_buffer_event *event;
440 struct ftrace_graph_ret_entry *next; 472 struct ftrace_graph_ret_entry *next;
441 473
442 ring_iter = iter->buffer_iter[iter->cpu]; 474 /*
475 * If the previous output failed to write to the seq buffer,
476 * then we just reuse the data from before.
477 */
478 if (data && data->failed) {
479 curr = &data->ent;
480 next = &data->ret;
481 } else {
443 482
444 /* First peek to compare current entry and the next one */ 483 ring_iter = iter->buffer_iter[iter->cpu];
445 if (ring_iter) 484
446 event = ring_buffer_iter_peek(ring_iter, NULL); 485 /* First peek to compare current entry and the next one */
447 else { 486 if (ring_iter)
448 /* We need to consume the current entry to see the next one */ 487 event = ring_buffer_iter_peek(ring_iter, NULL);
449 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 488 else {
450 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 489 /*
451 NULL); 490 * We need to consume the current entry to see
452 } 491 * the next one.
492 */
493 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
494 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
495 NULL);
496 }
453 497
454 if (!event) 498 if (!event)
455 return NULL; 499 return NULL;
456 500
457 next = ring_buffer_event_data(event); 501 next = ring_buffer_event_data(event);
502
503 if (data) {
504 /*
505 * Save current and next entries for later reference
506 * if the output fails.
507 */
508 data->ent = *curr;
509 data->ret = *next;
510 }
511 }
458 512
459 if (next->ent.type != TRACE_GRAPH_RET) 513 if (next->ent.type != TRACE_GRAPH_RET)
460 return NULL; 514 return NULL;
@@ -639,15 +693,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
639 duration = graph_ret->rettime - graph_ret->calltime; 693 duration = graph_ret->rettime - graph_ret->calltime;
640 694
641 if (data) { 695 if (data) {
696 struct fgraph_cpu_data *cpu_data;
642 int cpu = iter->cpu; 697 int cpu = iter->cpu;
643 int *depth = &(per_cpu_ptr(data, cpu)->depth); 698
699 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
644 700
645 /* 701 /*
646 * Comments display at + 1 to depth. Since 702 * Comments display at + 1 to depth. Since
647 * this is a leaf function, keep the comments 703 * this is a leaf function, keep the comments
648 * equal to this depth. 704 * equal to this depth.
649 */ 705 */
650 *depth = call->depth - 1; 706 cpu_data->depth = call->depth - 1;
707
708 /* No need to keep this function around for this depth */
709 if (call->depth < FTRACE_RETFUNC_DEPTH)
710 cpu_data->enter_funcs[call->depth] = 0;
651 } 711 }
652 712
653 /* Overhead */ 713 /* Overhead */
@@ -687,10 +747,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
687 int i; 747 int i;
688 748
689 if (data) { 749 if (data) {
750 struct fgraph_cpu_data *cpu_data;
690 int cpu = iter->cpu; 751 int cpu = iter->cpu;
691 int *depth = &(per_cpu_ptr(data, cpu)->depth);
692 752
693 *depth = call->depth; 753 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
754 cpu_data->depth = call->depth;
755
756 /* Save this function pointer to see if the exit matches */
757 if (call->depth < FTRACE_RETFUNC_DEPTH)
758 cpu_data->enter_funcs[call->depth] = call->func;
694 } 759 }
695 760
696 /* No overhead */ 761 /* No overhead */
@@ -782,19 +847,34 @@ static enum print_line_t
782print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 847print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
783 struct trace_iterator *iter) 848 struct trace_iterator *iter)
784{ 849{
785 int cpu = iter->cpu; 850 struct fgraph_data *data = iter->private;
786 struct ftrace_graph_ent *call = &field->graph_ent; 851 struct ftrace_graph_ent *call = &field->graph_ent;
787 struct ftrace_graph_ret_entry *leaf_ret; 852 struct ftrace_graph_ret_entry *leaf_ret;
853 static enum print_line_t ret;
854 int cpu = iter->cpu;
788 855
789 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 856 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
790 return TRACE_TYPE_PARTIAL_LINE; 857 return TRACE_TYPE_PARTIAL_LINE;
791 858
792 leaf_ret = get_return_for_leaf(iter, field); 859 leaf_ret = get_return_for_leaf(iter, field);
793 if (leaf_ret) 860 if (leaf_ret)
794 return print_graph_entry_leaf(iter, field, leaf_ret, s); 861 ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
795 else 862 else
796 return print_graph_entry_nested(iter, field, s, cpu); 863 ret = print_graph_entry_nested(iter, field, s, cpu);
864
865 if (data) {
866 /*
867 * If we failed to write our output, then we need to make
868 * note of it. Because we already consumed our entry.
869 */
870 if (s->full) {
871 data->failed = 1;
872 data->cpu = cpu;
873 } else
874 data->failed = 0;
875 }
797 876
877 return ret;
798} 878}
799 879
800static enum print_line_t 880static enum print_line_t
@@ -805,19 +885,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
805 struct fgraph_data *data = iter->private; 885 struct fgraph_data *data = iter->private;
806 pid_t pid = ent->pid; 886 pid_t pid = ent->pid;
807 int cpu = iter->cpu; 887 int cpu = iter->cpu;
888 int func_match = 1;
808 int ret; 889 int ret;
809 int i; 890 int i;
810 891
811 if (data) { 892 if (data) {
893 struct fgraph_cpu_data *cpu_data;
812 int cpu = iter->cpu; 894 int cpu = iter->cpu;
813 int *depth = &(per_cpu_ptr(data, cpu)->depth); 895
896 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
814 897
815 /* 898 /*
816 * Comments display at + 1 to depth. This is the 899 * Comments display at + 1 to depth. This is the
817 * return from a function, we now want the comments 900 * return from a function, we now want the comments
818 * to display at the same level of the bracket. 901 * to display at the same level of the bracket.
819 */ 902 */
820 *depth = trace->depth - 1; 903 cpu_data->depth = trace->depth - 1;
904
905 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
906 if (cpu_data->enter_funcs[trace->depth] != trace->func)
907 func_match = 0;
908 cpu_data->enter_funcs[trace->depth] = 0;
909 }
821 } 910 }
822 911
823 if (print_graph_prologue(iter, s, 0, 0)) 912 if (print_graph_prologue(iter, s, 0, 0))
@@ -842,9 +931,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
842 return TRACE_TYPE_PARTIAL_LINE; 931 return TRACE_TYPE_PARTIAL_LINE;
843 } 932 }
844 933
845 ret = trace_seq_printf(s, "}\n"); 934 /*
846 if (!ret) 935 * If the return function does not have a matching entry,
847 return TRACE_TYPE_PARTIAL_LINE; 936 * then the entry was lost. Instead of just printing
937 * the '}' and letting the user guess what function this
938 * belongs to, write out the function name.
939 */
940 if (func_match) {
941 ret = trace_seq_printf(s, "}\n");
942 if (!ret)
943 return TRACE_TYPE_PARTIAL_LINE;
944 } else {
945 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE;
948 }
848 949
849 /* Overrun */ 950 /* Overrun */
850 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 951 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
@@ -873,7 +974,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
873 int i; 974 int i;
874 975
875 if (data) 976 if (data)
876 depth = per_cpu_ptr(data, iter->cpu)->depth; 977 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
877 978
878 if (print_graph_prologue(iter, s, 0, 0)) 979 if (print_graph_prologue(iter, s, 0, 0))
879 return TRACE_TYPE_PARTIAL_LINE; 980 return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +1042,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
941enum print_line_t 1042enum print_line_t
942print_graph_function(struct trace_iterator *iter) 1043print_graph_function(struct trace_iterator *iter)
943{ 1044{
1045 struct ftrace_graph_ent_entry *field;
1046 struct fgraph_data *data = iter->private;
944 struct trace_entry *entry = iter->ent; 1047 struct trace_entry *entry = iter->ent;
945 struct trace_seq *s = &iter->seq; 1048 struct trace_seq *s = &iter->seq;
1049 int cpu = iter->cpu;
1050 int ret;
1051
1052 if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
1053 per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
1054 return TRACE_TYPE_HANDLED;
1055 }
1056
1057 /*
1058 * If the last output failed, there's a possibility we need
1059 * to print out the missing entry which would never go out.
1060 */
1061 if (data && data->failed) {
1062 field = &data->ent;
1063 iter->cpu = data->cpu;
1064 ret = print_graph_entry(field, s, iter);
1065 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1066 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1067 ret = TRACE_TYPE_NO_CONSUME;
1068 }
1069 iter->cpu = cpu;
1070 return ret;
1071 }
946 1072
947 switch (entry->type) { 1073 switch (entry->type) {
948 case TRACE_GRAPH_ENT: { 1074 case TRACE_GRAPH_ENT: {
@@ -952,7 +1078,7 @@ print_graph_function(struct trace_iterator *iter)
952 * sizeof(struct ftrace_graph_ent_entry) is very small, 1078 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * it can be safely saved at the stack. 1079 * it can be safely saved at the stack.
954 */ 1080 */
955 struct ftrace_graph_ent_entry *field, saved; 1081 struct ftrace_graph_ent_entry saved;
956 trace_assign_type(field, entry); 1082 trace_assign_type(field, entry);
957 saved = *field; 1083 saved = *field;
958 return print_graph_entry(&saved, s, iter); 1084 return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1156,54 @@ static void print_graph_headers(struct seq_file *s)
1030static void graph_trace_open(struct trace_iterator *iter) 1156static void graph_trace_open(struct trace_iterator *iter)
1031{ 1157{
1032 /* pid and depth on the last trace processed */ 1158 /* pid and depth on the last trace processed */
1033 struct fgraph_data *data = alloc_percpu(struct fgraph_data); 1159 struct fgraph_data *data;
1034 int cpu; 1160 int cpu;
1035 1161
1162 iter->private = NULL;
1163
1164 data = kzalloc(sizeof(*data), GFP_KERNEL);
1036 if (!data) 1165 if (!data)
1037 pr_warning("function graph tracer: not enough memory\n"); 1166 goto out_err;
1038 else 1167
1039 for_each_possible_cpu(cpu) { 1168 data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
1040 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); 1169 if (!data->cpu_data)
1041 int *depth = &(per_cpu_ptr(data, cpu)->depth); 1170 goto out_err_free;
1042 *pid = -1; 1171
1043 *depth = 0; 1172 for_each_possible_cpu(cpu) {
1044 } 1173 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1174 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1175 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1176 *pid = -1;
1177 *depth = 0;
1178 *ignore = 0;
1179 }
1045 1180
1046 iter->private = data; 1181 iter->private = data;
1182
1183 return;
1184
1185 out_err_free:
1186 kfree(data);
1187 out_err:
1188 pr_warning("function graph tracer: not enough memory\n");
1047} 1189}
1048 1190
1049static void graph_trace_close(struct trace_iterator *iter) 1191static void graph_trace_close(struct trace_iterator *iter)
1050{ 1192{
1051 free_percpu(iter->private); 1193 struct fgraph_data *data = iter->private;
1194
1195 if (data) {
1196 free_percpu(data->cpu_data);
1197 kfree(data);
1198 }
1052} 1199}
1053 1200
1054static struct tracer graph_trace __read_mostly = { 1201static struct tracer graph_trace __read_mostly = {
1055 .name = "function_graph", 1202 .name = "function_graph",
1056 .open = graph_trace_open, 1203 .open = graph_trace_open,
1204 .pipe_open = graph_trace_open,
1057 .close = graph_trace_close, 1205 .close = graph_trace_close,
1206 .pipe_close = graph_trace_close,
1058 .wait_pipe = poll_wait_pipe, 1207 .wait_pipe = poll_wait_pipe,
1059 .init = graph_trace_init, 1208 .init = graph_trace_init,
1060 .reset = graph_trace_reset, 1209 .reset = graph_trace_reset,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 69543a905cd5..7b97000745f5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -20,10 +20,10 @@
20 20
21#define BTS_BUFFER_SIZE (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
22 22
23static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25 25
26#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27 27
28static int trace_hw_branches_enabled __read_mostly; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly; 29static int trace_hw_branches_suspended __read_mostly;
@@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly;
32 32
33static void bts_trace_init_cpu(int cpu) 33static void bts_trace_init_cpu(int cpu)
34{ 34{
35 per_cpu(tracer, cpu) = 35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, 36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 NULL, (size_t)-1, BTS_KERNEL); 37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
38 39
39 if (IS_ERR(per_cpu(tracer, cpu))) 40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
40 per_cpu(tracer, cpu) = NULL; 41 per_cpu(hwb_tracer, cpu) = NULL;
41} 42}
42 43
43static int bts_trace_init(struct trace_array *tr) 44static int bts_trace_init(struct trace_array *tr)
@@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr)
51 for_each_online_cpu(cpu) { 52 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu); 53 bts_trace_init_cpu(cpu);
53 54
54 if (likely(per_cpu(tracer, cpu))) 55 if (likely(per_cpu(hwb_tracer, cpu)))
55 trace_hw_branches_enabled = 1; 56 trace_hw_branches_enabled = 1;
56 } 57 }
57 trace_hw_branches_suspended = 0; 58 trace_hw_branches_suspended = 0;
@@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr)
67 68
68 get_online_cpus(); 69 get_online_cpus();
69 for_each_online_cpu(cpu) { 70 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) { 71 if (likely(per_cpu(hwb_tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu)); 72 ds_release_bts(per_cpu(hwb_tracer, cpu));
72 per_cpu(tracer, cpu) = NULL; 73 per_cpu(hwb_tracer, cpu) = NULL;
73 } 74 }
74 } 75 }
75 trace_hw_branches_enabled = 0; 76 trace_hw_branches_enabled = 0;
@@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr)
83 84
84 get_online_cpus(); 85 get_online_cpus();
85 for_each_online_cpu(cpu) 86 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu))) 87 if (likely(per_cpu(hwb_tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu)); 88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
88 trace_hw_branches_suspended = 0; 89 trace_hw_branches_suspended = 0;
89 put_online_cpus(); 90 put_online_cpus();
90} 91}
@@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr)
95 96
96 get_online_cpus(); 97 get_online_cpus();
97 for_each_online_cpu(cpu) 98 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu))) 99 if (likely(per_cpu(hwb_tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu)); 100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
100 trace_hw_branches_suspended = 1; 101 trace_hw_branches_suspended = 1;
101 put_online_cpus(); 102 put_online_cpus();
102} 103}
@@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
114 bts_trace_init_cpu(cpu); 115 bts_trace_init_cpu(cpu);
115 116
116 if (trace_hw_branches_suspended && 117 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu))) 118 likely(per_cpu(hwb_tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu)); 119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
119 } 120 }
120 break; 121 break;
121 122
122 case CPU_DOWN_PREPARE: 123 case CPU_DOWN_PREPARE:
123 /* The notification is sent with interrupts enabled. */ 124 /* The notification is sent with interrupts enabled. */
124 if (likely(per_cpu(tracer, cpu))) { 125 if (likely(per_cpu(hwb_tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu)); 126 ds_release_bts(per_cpu(hwb_tracer, cpu));
126 per_cpu(tracer, cpu) = NULL; 127 per_cpu(hwb_tracer, cpu) = NULL;
127 } 128 }
128 } 129 }
129 130
@@ -258,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
258 259
259 get_online_cpus(); 260 get_online_cpus();
260 for_each_online_cpu(cpu) 261 for_each_online_cpu(cpu)
261 if (likely(per_cpu(tracer, cpu))) 262 if (likely(per_cpu(hwb_tracer, cpu)))
262 ds_suspend_bts(per_cpu(tracer, cpu)); 263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
263 /* 264 /*
264 * We need to collect the trace on the respective cpu since ftrace 265 * We need to collect the trace on the respective cpu since ftrace
265 * implicitly adds the record for the current cpu. 266 * implicitly adds the record for the current cpu.
@@ -268,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
268 on_each_cpu(trace_bts_cpu, iter->tr, 1); 269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
269 270
270 for_each_online_cpu(cpu) 271 for_each_online_cpu(cpu)
271 if (likely(per_cpu(tracer, cpu))) 272 if (likely(per_cpu(hwb_tracer, cpu)))
272 ds_resume_bts(per_cpu(tracer, cpu)); 273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
273 put_online_cpus(); 274 put_online_cpus();
274} 275}
275 276
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 3aa7eaa2114c..2974bc7538c7 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr,
151 goto out_unlock; 151 goto out_unlock;
152 152
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc);
154 156
155 if (data->critical_sequence != max_sequence) 157 if (data->critical_sequence != max_sequence)
156 goto out_unlock; 158 goto out_unlock;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..1251e367bae9
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1488 @@
1/*
2 * Kprobes-based tracing events
3 *
4 * Created by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32
33#include "trace.h"
34#include "trace_output.h"
35
36#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64
39#define KPROBE_EVENT_SYSTEM "kprobes"
40
41/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func"
46
47const char *reserved_field_names[] = {
48 "common_type",
49 "common_flags",
50 "common_preempt_count",
51 "common_pid",
52 "common_tgid",
53 "common_lock_depth",
54 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC,
58};
59
/*
 * A bound fetch operation: @func is invoked with the probed register
 * state and the opaque @data cookie and yields one unsigned long.
 */
struct fetch_func {
	unsigned long	(*func)(struct pt_regs *, void *);
	void		*data;	/* handler-specific cookie passed to @func */
};
64
65static __kprobes unsigned long call_fetch(struct fetch_func *f,
66 struct pt_regs *regs)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{
81 return regs_get_kernel_stack_nth(regs,
82 (unsigned int)((unsigned long)num));
83}
84
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
86{
87 unsigned long retval;
88
89 if (probe_kernel_address(addr, retval))
90 return 0;
91 return retval;
92}
93
94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
95 void *dummy)
96{
97 return regs_return_value(regs);
98}
99
100static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
101 void *dummy)
102{
103 return kernel_stack_pointer(regs);
104}
105
106/* Memory fetching by symbol */
/*
 * Cached resolution of "SYM+offset" for symbol-based memory fetches.
 * @addr is filled in by update_symbol_cache() and stays 0 when the
 * symbol lookup yields nothing.
 */
struct symbol_cache {
	char		*symbol;	/* allocated copy of the symbol name */
	long		offset;		/* added to the symbol's address */
	unsigned long	addr;		/* resolved address, 0 if unresolved */
};
112
113static unsigned long update_symbol_cache(struct symbol_cache *sc)
114{
115 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
116 if (sc->addr)
117 sc->addr += sc->offset;
118 return sc->addr;
119}
120
121static void free_symbol_cache(struct symbol_cache *sc)
122{
123 kfree(sc->symbol);
124 kfree(sc);
125}
126
127static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
128{
129 struct symbol_cache *sc;
130
131 if (!sym || strlen(sym) == 0)
132 return NULL;
133 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
134 if (!sc)
135 return NULL;
136
137 sc->symbol = kstrdup(sym, GFP_KERNEL);
138 if (!sc->symbol) {
139 kfree(sc);
140 return NULL;
141 }
142 sc->offset = offset;
143
144 update_symbol_cache(sc);
145 return sc;
146}
147
148static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
149{
150 struct symbol_cache *sc = data;
151
152 if (sc->addr)
153 return fetch_memory(regs, (void *)sc->addr);
154 else
155 return 0;
156}
157
158/* Special indirect memory access interface */
159struct indirect_fetch_data {
160 struct fetch_func orig;
161 long offset;
162};
163
164static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
165{
166 struct indirect_fetch_data *ind = data;
167 unsigned long addr;
168
169 addr = call_fetch(&ind->orig, regs);
170 if (addr) {
171 addr += ind->offset;
172 return fetch_memory(regs, (void *)addr);
173 } else
174 return 0;
175}
176
177static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
178{
179 if (data->orig.func == fetch_indirect)
180 free_indirect_fetch_data(data->orig.data);
181 else if (data->orig.func == fetch_symbol)
182 free_symbol_cache(data->orig.data);
183 kfree(data);
184}
185
186/**
187 * Kprobe event core functions
188 */
189
190struct probe_arg {
191 struct fetch_func fetch;
192 const char *name;
193};
194
195/* Flags for trace_probe */
196#define TP_FLAG_TRACE 1
197#define TP_FLAG_PROFILE 2
198
199struct trace_probe {
200 struct list_head list;
201 struct kretprobe rp; /* Use rp.kp for kprobe use */
202 unsigned long nhit;
203 unsigned int flags; /* For TP_FLAG_* */
204 const char *symbol; /* symbol name */
205 struct ftrace_event_call call;
206 struct trace_event event;
207 unsigned int nr_args;
208 struct probe_arg args[];
209};
210
211#define SIZEOF_TRACE_PROBE(n) \
212 (offsetof(struct trace_probe, args) + \
213 (sizeof(struct probe_arg) * (n)))
214
215static __kprobes int probe_is_return(struct trace_probe *tp)
216{
217 return tp->rp.handler != NULL;
218}
219
220static __kprobes const char *probe_symbol(struct trace_probe *tp)
221{
222 return tp->symbol ? tp->symbol : "unknown";
223}
224
225static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
226{
227 int ret = -EINVAL;
228
229 if (ff->func == fetch_register) {
230 const char *name;
231 name = regs_query_register_name((unsigned int)((long)ff->data));
232 ret = snprintf(buf, n, "%%%s", name);
233 } else if (ff->func == fetch_stack)
234 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
235 else if (ff->func == fetch_memory)
236 ret = snprintf(buf, n, "@0x%p", ff->data);
237 else if (ff->func == fetch_symbol) {
238 struct symbol_cache *sc = ff->data;
239 if (sc->offset)
240 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
241 sc->offset);
242 else
243 ret = snprintf(buf, n, "@%s", sc->symbol);
244 } else if (ff->func == fetch_retvalue)
245 ret = snprintf(buf, n, "$retval");
246 else if (ff->func == fetch_stack_address)
247 ret = snprintf(buf, n, "$stack");
248 else if (ff->func == fetch_indirect) {
249 struct indirect_fetch_data *id = ff->data;
250 size_t l = 0;
251 ret = snprintf(buf, n, "%+ld(", id->offset);
252 if (ret >= n)
253 goto end;
254 l += ret;
255 ret = probe_arg_string(buf + l, n - l, &id->orig);
256 if (ret < 0)
257 goto end;
258 l += ret;
259 ret = snprintf(buf + l, n - l, ")");
260 ret += l;
261 }
262end:
263 if (ret >= n)
264 return -ENOSPC;
265 return ret;
266}
267
268static int register_probe_event(struct trace_probe *tp);
269static void unregister_probe_event(struct trace_probe *tp);
270
271static DEFINE_MUTEX(probe_lock);
272static LIST_HEAD(probe_list);
273
274static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
275static int kretprobe_dispatcher(struct kretprobe_instance *ri,
276 struct pt_regs *regs);
277
/*
 * Check that @name is usable as an event or group name: it must start
 * with a letter or '_' and continue with letters, digits or '_' only.
 * Returns 1 when the name is acceptable, 0 otherwise (including "").
 */
static int check_event_name(const char *name)
{
	const char *p = name;

	if (*p != '_' && !isalpha(*p))
		return 0;

	for (p++; *p != '\0'; p++) {
		if (*p == '_' || isalpha(*p) || isdigit(*p))
			continue;
		return 0;
	}

	return 1;
}
289
290/*
291 * Allocate new trace_probe and initialize it (including kprobes).
292 */
293static struct trace_probe *alloc_trace_probe(const char *group,
294 const char *event,
295 void *addr,
296 const char *symbol,
297 unsigned long offs,
298 int nargs, int is_return)
299{
300 struct trace_probe *tp;
301 int ret = -ENOMEM;
302
303 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
304 if (!tp)
305 return ERR_PTR(ret);
306
307 if (symbol) {
308 tp->symbol = kstrdup(symbol, GFP_KERNEL);
309 if (!tp->symbol)
310 goto error;
311 tp->rp.kp.symbol_name = tp->symbol;
312 tp->rp.kp.offset = offs;
313 } else
314 tp->rp.kp.addr = addr;
315
316 if (is_return)
317 tp->rp.handler = kretprobe_dispatcher;
318 else
319 tp->rp.kp.pre_handler = kprobe_dispatcher;
320
321 if (!event || !check_event_name(event)) {
322 ret = -EINVAL;
323 goto error;
324 }
325
326 tp->call.name = kstrdup(event, GFP_KERNEL);
327 if (!tp->call.name)
328 goto error;
329
330 if (!group || !check_event_name(group)) {
331 ret = -EINVAL;
332 goto error;
333 }
334
335 tp->call.system = kstrdup(group, GFP_KERNEL);
336 if (!tp->call.system)
337 goto error;
338
339 INIT_LIST_HEAD(&tp->list);
340 return tp;
341error:
342 kfree(tp->call.name);
343 kfree(tp->symbol);
344 kfree(tp);
345 return ERR_PTR(ret);
346}
347
348static void free_probe_arg(struct probe_arg *arg)
349{
350 if (arg->fetch.func == fetch_symbol)
351 free_symbol_cache(arg->fetch.data);
352 else if (arg->fetch.func == fetch_indirect)
353 free_indirect_fetch_data(arg->fetch.data);
354 kfree(arg->name);
355}
356
357static void free_trace_probe(struct trace_probe *tp)
358{
359 int i;
360
361 for (i = 0; i < tp->nr_args; i++)
362 free_probe_arg(&tp->args[i]);
363
364 kfree(tp->call.system);
365 kfree(tp->call.name);
366 kfree(tp->symbol);
367 kfree(tp);
368}
369
370static struct trace_probe *find_probe_event(const char *event,
371 const char *group)
372{
373 struct trace_probe *tp;
374
375 list_for_each_entry(tp, &probe_list, list)
376 if (strcmp(tp->call.name, event) == 0 &&
377 strcmp(tp->call.system, group) == 0)
378 return tp;
379 return NULL;
380}
381
382/* Unregister a trace_probe and probe_event: call with locking probe_lock */
383static void unregister_trace_probe(struct trace_probe *tp)
384{
385 if (probe_is_return(tp))
386 unregister_kretprobe(&tp->rp);
387 else
388 unregister_kprobe(&tp->rp.kp);
389 list_del(&tp->list);
390 unregister_probe_event(tp);
391}
392
393/* Register a trace_probe and probe_event */
394static int register_trace_probe(struct trace_probe *tp)
395{
396 struct trace_probe *old_tp;
397 int ret;
398
399 mutex_lock(&probe_lock);
400
401 /* register as an event */
402 old_tp = find_probe_event(tp->call.name, tp->call.system);
403 if (old_tp) {
404 /* delete old event */
405 unregister_trace_probe(old_tp);
406 free_trace_probe(old_tp);
407 }
408 ret = register_probe_event(tp);
409 if (ret) {
410 pr_warning("Faild to register probe event(%d)\n", ret);
411 goto end;
412 }
413
414 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
415 if (probe_is_return(tp))
416 ret = register_kretprobe(&tp->rp);
417 else
418 ret = register_kprobe(&tp->rp.kp);
419
420 if (ret) {
421 pr_warning("Could not insert probe(%d)\n", ret);
422 if (ret == -EILSEQ) {
423 pr_warning("Probing address(0x%p) is not an "
424 "instruction boundary.\n",
425 tp->rp.kp.addr);
426 ret = -EINVAL;
427 }
428 unregister_probe_event(tp);
429 } else
430 list_add_tail(&tp->list, &probe_list);
431end:
432 mutex_unlock(&probe_lock);
433 return ret;
434}
435
436/* Split symbol and offset. */
437static int split_symbol_offset(char *symbol, unsigned long *offset)
438{
439 char *tmp;
440 int ret;
441
442 if (!offset)
443 return -EINVAL;
444
445 tmp = strchr(symbol, '+');
446 if (tmp) {
447 /* skip sign because strict_strtol doesn't accept '+' */
448 ret = strict_strtoul(tmp + 1, 0, offset);
449 if (ret)
450 return ret;
451 *tmp = '\0';
452 } else
453 *offset = 0;
454 return 0;
455}
456
457#define PARAM_MAX_ARGS 16
458#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
459
460static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
461{
462 int ret = 0;
463 unsigned long param;
464
465 if (strcmp(arg, "retval") == 0) {
466 if (is_return) {
467 ff->func = fetch_retvalue;
468 ff->data = NULL;
469 } else
470 ret = -EINVAL;
471 } else if (strncmp(arg, "stack", 5) == 0) {
472 if (arg[5] == '\0') {
473 ff->func = fetch_stack_address;
474 ff->data = NULL;
475 } else if (isdigit(arg[5])) {
476 ret = strict_strtoul(arg + 5, 10, &param);
477 if (ret || param > PARAM_MAX_STACK)
478 ret = -EINVAL;
479 else {
480 ff->func = fetch_stack;
481 ff->data = (void *)param;
482 }
483 } else
484 ret = -EINVAL;
485 } else
486 ret = -EINVAL;
487 return ret;
488}
489
490/* Recursive argument parser */
491static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
492{
493 int ret = 0;
494 unsigned long param;
495 long offset;
496 char *tmp;
497
498 switch (arg[0]) {
499 case '$':
500 ret = parse_probe_vars(arg + 1, ff, is_return);
501 break;
502 case '%': /* named register */
503 ret = regs_query_register_offset(arg + 1);
504 if (ret >= 0) {
505 ff->func = fetch_register;
506 ff->data = (void *)(unsigned long)ret;
507 ret = 0;
508 }
509 break;
510 case '@': /* memory or symbol */
511 if (isdigit(arg[1])) {
512 ret = strict_strtoul(arg + 1, 0, &param);
513 if (ret)
514 break;
515 ff->func = fetch_memory;
516 ff->data = (void *)param;
517 } else {
518 ret = split_symbol_offset(arg + 1, &offset);
519 if (ret)
520 break;
521 ff->data = alloc_symbol_cache(arg + 1, offset);
522 if (ff->data)
523 ff->func = fetch_symbol;
524 else
525 ret = -EINVAL;
526 }
527 break;
528 case '+': /* indirect memory */
529 case '-':
530 tmp = strchr(arg, '(');
531 if (!tmp) {
532 ret = -EINVAL;
533 break;
534 }
535 *tmp = '\0';
536 ret = strict_strtol(arg + 1, 0, &offset);
537 if (ret)
538 break;
539 if (arg[0] == '-')
540 offset = -offset;
541 arg = tmp + 1;
542 tmp = strrchr(arg, ')');
543 if (tmp) {
544 struct indirect_fetch_data *id;
545 *tmp = '\0';
546 id = kzalloc(sizeof(struct indirect_fetch_data),
547 GFP_KERNEL);
548 if (!id)
549 return -ENOMEM;
550 id->offset = offset;
551 ret = __parse_probe_arg(arg, &id->orig, is_return);
552 if (ret)
553 kfree(id);
554 else {
555 ff->func = fetch_indirect;
556 ff->data = (void *)id;
557 }
558 } else
559 ret = -EINVAL;
560 break;
561 default:
562 /* TODO: support custom handler */
563 ret = -EINVAL;
564 }
565 return ret;
566}
567
568/* String length checking wrapper */
569static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
570{
571 if (strlen(arg) > MAX_ARGSTR_LEN) {
572 pr_info("Argument is too long.: %s\n", arg);
573 return -ENOSPC;
574 }
575 return __parse_probe_arg(arg, ff, is_return);
576}
577
578/* Return 1 if name is reserved or already used by another argument */
579static int conflict_field_name(const char *name,
580 struct probe_arg *args, int narg)
581{
582 int i;
583 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
584 if (strcmp(reserved_field_names[i], name) == 0)
585 return 1;
586 for (i = 0; i < narg; i++)
587 if (strcmp(args[i].name, name) == 0)
588 return 1;
589 return 0;
590}
591
592static int create_trace_probe(int argc, char **argv)
593{
594 /*
595 * Argument syntax:
596 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
597 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
598 * Fetch args:
599 * $retval : fetch return value
600 * $stack : fetch stack address
601 * $stackN : fetch Nth of stack (N:0-)
602 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
603 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
604 * %REG : fetch register REG
605 * Indirect memory fetch:
606 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
607 * Alias name of args:
608 * NAME=FETCHARG : set NAME as alias of FETCHARG.
609 */
610 struct trace_probe *tp;
611 int i, ret = 0;
612 int is_return = 0, is_delete = 0;
613 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
614 unsigned long offset = 0;
615 void *addr = NULL;
616 char buf[MAX_EVENT_NAME_LEN];
617
618 /* argc must be >= 1 */
619 if (argv[0][0] == 'p')
620 is_return = 0;
621 else if (argv[0][0] == 'r')
622 is_return = 1;
623 else if (argv[0][0] == '-')
624 is_delete = 1;
625 else {
626 pr_info("Probe definition must be started with 'p', 'r' or"
627 " '-'.\n");
628 return -EINVAL;
629 }
630
631 if (argv[0][1] == ':') {
632 event = &argv[0][2];
633 if (strchr(event, '/')) {
634 group = event;
635 event = strchr(group, '/') + 1;
636 event[-1] = '\0';
637 if (strlen(group) == 0) {
638 pr_info("Group name is not specified\n");
639 return -EINVAL;
640 }
641 }
642 if (strlen(event) == 0) {
643 pr_info("Event name is not specified\n");
644 return -EINVAL;
645 }
646 }
647 if (!group)
648 group = KPROBE_EVENT_SYSTEM;
649
650 if (is_delete) {
651 if (!event) {
652 pr_info("Delete command needs an event name.\n");
653 return -EINVAL;
654 }
655 tp = find_probe_event(event, group);
656 if (!tp) {
657 pr_info("Event %s/%s doesn't exist.\n", group, event);
658 return -ENOENT;
659 }
660 /* delete an event */
661 unregister_trace_probe(tp);
662 free_trace_probe(tp);
663 return 0;
664 }
665
666 if (argc < 2) {
667 pr_info("Probe point is not specified.\n");
668 return -EINVAL;
669 }
670 if (isdigit(argv[1][0])) {
671 if (is_return) {
672 pr_info("Return probe point must be a symbol.\n");
673 return -EINVAL;
674 }
675 /* an address specified */
676 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
677 if (ret) {
678 pr_info("Failed to parse address.\n");
679 return ret;
680 }
681 } else {
682 /* a symbol specified */
683 symbol = argv[1];
684 /* TODO: support .init module functions */
685 ret = split_symbol_offset(symbol, &offset);
686 if (ret) {
687 pr_info("Failed to parse symbol.\n");
688 return ret;
689 }
690 if (offset && is_return) {
691 pr_info("Return probe must be used without offset.\n");
692 return -EINVAL;
693 }
694 }
695 argc -= 2; argv += 2;
696
697 /* setup a probe */
698 if (!event) {
699 /* Make a new event name */
700 if (symbol)
701 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
702 is_return ? 'r' : 'p', symbol, offset);
703 else
704 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
705 is_return ? 'r' : 'p', addr);
706 event = buf;
707 }
708 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
709 is_return);
710 if (IS_ERR(tp)) {
711 pr_info("Failed to allocate trace_probe.(%d)\n",
712 (int)PTR_ERR(tp));
713 return PTR_ERR(tp);
714 }
715
716 /* parse arguments */
717 ret = 0;
718 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
719 /* Parse argument name */
720 arg = strchr(argv[i], '=');
721 if (arg)
722 *arg++ = '\0';
723 else
724 arg = argv[i];
725
726 if (conflict_field_name(argv[i], tp->args, i)) {
727 pr_info("Argument%d name '%s' conflicts with "
728 "another field.\n", i, argv[i]);
729 ret = -EINVAL;
730 goto error;
731 }
732
733 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
734 if (!tp->args[i].name) {
735 pr_info("Failed to allocate argument%d name '%s'.\n",
736 i, argv[i]);
737 ret = -ENOMEM;
738 goto error;
739 }
740
741 /* Parse fetch argument */
742 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
743 if (ret) {
744 pr_info("Parse error at argument%d. (%d)\n", i, ret);
745 kfree(tp->args[i].name);
746 goto error;
747 }
748
749 tp->nr_args++;
750 }
751
752 ret = register_trace_probe(tp);
753 if (ret)
754 goto error;
755 return 0;
756
757error:
758 free_trace_probe(tp);
759 return ret;
760}
761
762static void cleanup_all_probes(void)
763{
764 struct trace_probe *tp;
765
766 mutex_lock(&probe_lock);
767 /* TODO: Use batch unregistration */
768 while (!list_empty(&probe_list)) {
769 tp = list_entry(probe_list.next, struct trace_probe, list);
770 unregister_trace_probe(tp);
771 free_trace_probe(tp);
772 }
773 mutex_unlock(&probe_lock);
774}
775
776
777/* Probes listing interfaces */
778static void *probes_seq_start(struct seq_file *m, loff_t *pos)
779{
780 mutex_lock(&probe_lock);
781 return seq_list_start(&probe_list, *pos);
782}
783
784static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
785{
786 return seq_list_next(v, &probe_list, pos);
787}
788
789static void probes_seq_stop(struct seq_file *m, void *v)
790{
791 mutex_unlock(&probe_lock);
792}
793
794static int probes_seq_show(struct seq_file *m, void *v)
795{
796 struct trace_probe *tp = v;
797 int i, ret;
798 char buf[MAX_ARGSTR_LEN + 1];
799
800 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
801 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
802
803 if (!tp->symbol)
804 seq_printf(m, " 0x%p", tp->rp.kp.addr);
805 else if (tp->rp.kp.offset)
806 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
807 else
808 seq_printf(m, " %s", probe_symbol(tp));
809
810 for (i = 0; i < tp->nr_args; i++) {
811 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
812 if (ret < 0) {
813 pr_warning("Argument%d decoding error(%d).\n", i, ret);
814 return ret;
815 }
816 seq_printf(m, " %s=%s", tp->args[i].name, buf);
817 }
818 seq_printf(m, "\n");
819 return 0;
820}
821
822static const struct seq_operations probes_seq_op = {
823 .start = probes_seq_start,
824 .next = probes_seq_next,
825 .stop = probes_seq_stop,
826 .show = probes_seq_show
827};
828
829static int probes_open(struct inode *inode, struct file *file)
830{
831 if ((file->f_mode & FMODE_WRITE) &&
832 (file->f_flags & O_TRUNC))
833 cleanup_all_probes();
834
835 return seq_open(file, &probes_seq_op);
836}
837
838static int command_trace_probe(const char *buf)
839{
840 char **argv;
841 int argc = 0, ret = 0;
842
843 argv = argv_split(GFP_KERNEL, buf, &argc);
844 if (!argv)
845 return -ENOMEM;
846
847 if (argc)
848 ret = create_trace_probe(argc, argv);
849
850 argv_free(argv);
851 return ret;
852}
853
854#define WRITE_BUFSIZE 128
855
856static ssize_t probes_write(struct file *file, const char __user *buffer,
857 size_t count, loff_t *ppos)
858{
859 char *kbuf, *tmp;
860 int ret;
861 size_t done;
862 size_t size;
863
864 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
865 if (!kbuf)
866 return -ENOMEM;
867
868 ret = done = 0;
869 while (done < count) {
870 size = count - done;
871 if (size >= WRITE_BUFSIZE)
872 size = WRITE_BUFSIZE - 1;
873 if (copy_from_user(kbuf, buffer + done, size)) {
874 ret = -EFAULT;
875 goto out;
876 }
877 kbuf[size] = '\0';
878 tmp = strchr(kbuf, '\n');
879 if (tmp) {
880 *tmp = '\0';
881 size = tmp - kbuf + 1;
882 } else if (done + size < count) {
883 pr_warning("Line length is too long: "
884 "Should be less than %d.", WRITE_BUFSIZE);
885 ret = -EINVAL;
886 goto out;
887 }
888 done += size;
889 /* Remove comments */
890 tmp = strchr(kbuf, '#');
891 if (tmp)
892 *tmp = '\0';
893
894 ret = command_trace_probe(kbuf);
895 if (ret)
896 goto out;
897 }
898 ret = done;
899out:
900 kfree(kbuf);
901 return ret;
902}
903
904static const struct file_operations kprobe_events_ops = {
905 .owner = THIS_MODULE,
906 .open = probes_open,
907 .read = seq_read,
908 .llseek = seq_lseek,
909 .release = seq_release,
910 .write = probes_write,
911};
912
913/* Probes profiling interfaces */
914static int probes_profile_seq_show(struct seq_file *m, void *v)
915{
916 struct trace_probe *tp = v;
917
918 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
919 tp->rp.kp.nmissed);
920
921 return 0;
922}
923
924static const struct seq_operations profile_seq_op = {
925 .start = probes_seq_start,
926 .next = probes_seq_next,
927 .stop = probes_seq_stop,
928 .show = probes_profile_seq_show
929};
930
931static int profile_open(struct inode *inode, struct file *file)
932{
933 return seq_open(file, &profile_seq_op);
934}
935
936static const struct file_operations kprobe_profile_ops = {
937 .owner = THIS_MODULE,
938 .open = profile_open,
939 .read = seq_read,
940 .llseek = seq_lseek,
941 .release = seq_release,
942};
943
944/* Kprobe handler */
945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
946{
947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
948 struct kprobe_trace_entry *entry;
949 struct ring_buffer_event *event;
950 struct ring_buffer *buffer;
951 int size, i, pc;
952 unsigned long irq_flags;
953 struct ftrace_event_call *call = &tp->call;
954
955 tp->nhit++;
956
957 local_save_flags(irq_flags);
958 pc = preempt_count();
959
960 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
961
962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
963 irq_flags, pc);
964 if (!event)
965 return;
966
967 entry = ring_buffer_event_data(event);
968 entry->nargs = tp->nr_args;
969 entry->ip = (unsigned long)kp->addr;
970 for (i = 0; i < tp->nr_args; i++)
971 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
972
973 if (!filter_current_check_discard(buffer, call, entry, event))
974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
975}
976
977/* Kretprobe handler */
978static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
979 struct pt_regs *regs)
980{
981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
982 struct kretprobe_trace_entry *entry;
983 struct ring_buffer_event *event;
984 struct ring_buffer *buffer;
985 int size, i, pc;
986 unsigned long irq_flags;
987 struct ftrace_event_call *call = &tp->call;
988
989 local_save_flags(irq_flags);
990 pc = preempt_count();
991
992 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
993
994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
995 irq_flags, pc);
996 if (!event)
997 return;
998
999 entry = ring_buffer_event_data(event);
1000 entry->nargs = tp->nr_args;
1001 entry->func = (unsigned long)tp->rp.kp.addr;
1002 entry->ret_ip = (unsigned long)ri->ret_addr;
1003 for (i = 0; i < tp->nr_args; i++)
1004 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1005
1006 if (!filter_current_check_discard(buffer, call, entry, event))
1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1008}
1009
1010/* Event entry printers */
1011enum print_line_t
1012print_kprobe_event(struct trace_iterator *iter, int flags)
1013{
1014 struct kprobe_trace_entry *field;
1015 struct trace_seq *s = &iter->seq;
1016 struct trace_event *event;
1017 struct trace_probe *tp;
1018 int i;
1019
1020 field = (struct kprobe_trace_entry *)iter->ent;
1021 event = ftrace_find_event(field->ent.type);
1022 tp = container_of(event, struct trace_probe, event);
1023
1024 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1025 goto partial;
1026
1027 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1028 goto partial;
1029
1030 if (!trace_seq_puts(s, ")"))
1031 goto partial;
1032
1033 for (i = 0; i < field->nargs; i++)
1034 if (!trace_seq_printf(s, " %s=%lx",
1035 tp->args[i].name, field->args[i]))
1036 goto partial;
1037
1038 if (!trace_seq_puts(s, "\n"))
1039 goto partial;
1040
1041 return TRACE_TYPE_HANDLED;
1042partial:
1043 return TRACE_TYPE_PARTIAL_LINE;
1044}
1045
1046enum print_line_t
1047print_kretprobe_event(struct trace_iterator *iter, int flags)
1048{
1049 struct kretprobe_trace_entry *field;
1050 struct trace_seq *s = &iter->seq;
1051 struct trace_event *event;
1052 struct trace_probe *tp;
1053 int i;
1054
1055 field = (struct kretprobe_trace_entry *)iter->ent;
1056 event = ftrace_find_event(field->ent.type);
1057 tp = container_of(event, struct trace_probe, event);
1058
1059 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1060 goto partial;
1061
1062 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1063 goto partial;
1064
1065 if (!trace_seq_puts(s, " <- "))
1066 goto partial;
1067
1068 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1069 goto partial;
1070
1071 if (!trace_seq_puts(s, ")"))
1072 goto partial;
1073
1074 for (i = 0; i < field->nargs; i++)
1075 if (!trace_seq_printf(s, " %s=%lx",
1076 tp->args[i].name, field->args[i]))
1077 goto partial;
1078
1079 if (!trace_seq_puts(s, "\n"))
1080 goto partial;
1081
1082 return TRACE_TYPE_HANDLED;
1083partial:
1084 return TRACE_TYPE_PARTIAL_LINE;
1085}
1086
1087static int probe_event_enable(struct ftrace_event_call *call)
1088{
1089 struct trace_probe *tp = (struct trace_probe *)call->data;
1090
1091 tp->flags |= TP_FLAG_TRACE;
1092 if (probe_is_return(tp))
1093 return enable_kretprobe(&tp->rp);
1094 else
1095 return enable_kprobe(&tp->rp.kp);
1096}
1097
1098static void probe_event_disable(struct ftrace_event_call *call)
1099{
1100 struct trace_probe *tp = (struct trace_probe *)call->data;
1101
1102 tp->flags &= ~TP_FLAG_TRACE;
1103 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1104 if (probe_is_return(tp))
1105 disable_kretprobe(&tp->rp);
1106 else
1107 disable_kprobe(&tp->rp.kp);
1108 }
1109}
1110
1111static int probe_event_raw_init(struct ftrace_event_call *event_call)
1112{
1113 INIT_LIST_HEAD(&event_call->fields);
1114
1115 return 0;
1116}
1117
/*
 * Register one member of the local 'field' record as an event field.
 *
 * Expects 'ret', 'event_call' and 'field' to exist in the calling
 * function, and returns from that function if trace_define_field() fails.
 */
#undef DEFINE_FIELD
#define DEFINE_FIELD(type, item, name, is_signed)			\
	do {								\
		ret = trace_define_field(event_call, #type, name,	\
					 offsetof(typeof(field), item),	\
					 sizeof(field.item), is_signed,	\
					 FILTER_OTHER);			\
		if (ret)						\
			return ret;					\
	} while (0)
1128
1129static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1130{
1131 int ret, i;
1132 struct kprobe_trace_entry field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134
1135 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1136 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1137 /* Set argument names as fields */
1138 for (i = 0; i < tp->nr_args; i++)
1139 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1140 return 0;
1141}
1142
1143static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1144{
1145 int ret, i;
1146 struct kretprobe_trace_entry field;
1147 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1148
1149 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1150 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1151 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1152 /* Set argument names as fields */
1153 for (i = 0; i < tp->nr_args; i++)
1154 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1155 return 0;
1156}
1157
1158static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1159{
1160 int i;
1161 int pos = 0;
1162
1163 const char *fmt, *arg;
1164
1165 if (!probe_is_return(tp)) {
1166 fmt = "(%lx)";
1167 arg = "REC->" FIELD_STRING_IP;
1168 } else {
1169 fmt = "(%lx <- %lx)";
1170 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1171 }
1172
1173 /* When len=0, we just calculate the needed length */
1174#define LEN_OR_ZERO (len ? len - pos : 0)
1175
1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1177
1178 for (i = 0; i < tp->nr_args; i++) {
1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
1180 tp->args[i].name);
1181 }
1182
1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1184
1185 for (i = 0; i < tp->nr_args; i++) {
1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1187 tp->args[i].name);
1188 }
1189
1190#undef LEN_OR_ZERO
1191
1192 /* return the length of print_fmt */
1193 return pos;
1194}
1195
1196static int set_print_fmt(struct trace_probe *tp)
1197{
1198 int len;
1199 char *print_fmt;
1200
1201 /* First: called with 0 length to calculate the needed length */
1202 len = __set_print_fmt(tp, NULL, 0);
1203 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1204 if (!print_fmt)
1205 return -ENOMEM;
1206
1207 /* Second: actually write the @print_fmt */
1208 __set_print_fmt(tp, print_fmt, len + 1);
1209 tp->call.print_fmt = print_fmt;
1210
1211 return 0;
1212}
1213
1214#ifdef CONFIG_PERF_EVENTS
1215
1216/* Kprobe profile handler */
1217static __kprobes void kprobe_perf_func(struct kprobe *kp,
1218 struct pt_regs *regs)
1219{
1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1221 struct ftrace_event_call *call = &tp->call;
1222 struct kprobe_trace_entry *entry;
1223 int size, __size, i;
1224 unsigned long irq_flags;
1225 int rctx;
1226
1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1228 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1229 size -= sizeof(u32);
1230 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1231 "profile buffer not large enough"))
1232 return;
1233
1234 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1235 if (!entry)
1236 return;
1237
1238 entry->nargs = tp->nr_args;
1239 entry->ip = (unsigned long)kp->addr;
1240 for (i = 0; i < tp->nr_args; i++)
1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1242
1243 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
1244}
1245
1246/* Kretprobe profile handler */
1247static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1248 struct pt_regs *regs)
1249{
1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1251 struct ftrace_event_call *call = &tp->call;
1252 struct kretprobe_trace_entry *entry;
1253 int size, __size, i;
1254 unsigned long irq_flags;
1255 int rctx;
1256
1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1258 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1259 size -= sizeof(u32);
1260 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1261 "profile buffer not large enough"))
1262 return;
1263
1264 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1265 if (!entry)
1266 return;
1267
1268 entry->nargs = tp->nr_args;
1269 entry->func = (unsigned long)tp->rp.kp.addr;
1270 entry->ret_ip = (unsigned long)ri->ret_addr;
1271 for (i = 0; i < tp->nr_args; i++)
1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1273
1274 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
1275 irq_flags, regs);
1276}
1277
1278static int probe_perf_enable(struct ftrace_event_call *call)
1279{
1280 struct trace_probe *tp = (struct trace_probe *)call->data;
1281
1282 tp->flags |= TP_FLAG_PROFILE;
1283
1284 if (probe_is_return(tp))
1285 return enable_kretprobe(&tp->rp);
1286 else
1287 return enable_kprobe(&tp->rp.kp);
1288}
1289
1290static void probe_perf_disable(struct ftrace_event_call *call)
1291{
1292 struct trace_probe *tp = (struct trace_probe *)call->data;
1293
1294 tp->flags &= ~TP_FLAG_PROFILE;
1295
1296 if (!(tp->flags & TP_FLAG_TRACE)) {
1297 if (probe_is_return(tp))
1298 disable_kretprobe(&tp->rp);
1299 else
1300 disable_kprobe(&tp->rp.kp);
1301 }
1302}
1303#endif /* CONFIG_PERF_EVENTS */
1304
1305
1306static __kprobes
1307int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1308{
1309 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1310
1311 if (tp->flags & TP_FLAG_TRACE)
1312 kprobe_trace_func(kp, regs);
1313#ifdef CONFIG_PERF_EVENTS
1314 if (tp->flags & TP_FLAG_PROFILE)
1315 kprobe_perf_func(kp, regs);
1316#endif
1317 return 0; /* We don't tweek kernel, so just return 0 */
1318}
1319
1320static __kprobes
1321int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1322{
1323 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1324
1325 if (tp->flags & TP_FLAG_TRACE)
1326 kretprobe_trace_func(ri, regs);
1327#ifdef CONFIG_PERF_EVENTS
1328 if (tp->flags & TP_FLAG_PROFILE)
1329 kretprobe_perf_func(ri, regs);
1330#endif
1331 return 0; /* We don't tweek kernel, so just return 0 */
1332}
1333
1334static int register_probe_event(struct trace_probe *tp)
1335{
1336 struct ftrace_event_call *call = &tp->call;
1337 int ret;
1338
1339 /* Initialize ftrace_event_call */
1340 if (probe_is_return(tp)) {
1341 tp->event.trace = print_kretprobe_event;
1342 call->raw_init = probe_event_raw_init;
1343 call->define_fields = kretprobe_event_define_fields;
1344 } else {
1345 tp->event.trace = print_kprobe_event;
1346 call->raw_init = probe_event_raw_init;
1347 call->define_fields = kprobe_event_define_fields;
1348 }
1349 if (set_print_fmt(tp) < 0)
1350 return -ENOMEM;
1351 call->event = &tp->event;
1352 call->id = register_ftrace_event(&tp->event);
1353 if (!call->id) {
1354 kfree(call->print_fmt);
1355 return -ENODEV;
1356 }
1357 call->enabled = 0;
1358 call->regfunc = probe_event_enable;
1359 call->unregfunc = probe_event_disable;
1360
1361#ifdef CONFIG_PERF_EVENTS
1362 call->perf_event_enable = probe_perf_enable;
1363 call->perf_event_disable = probe_perf_disable;
1364#endif
1365 call->data = tp;
1366 ret = trace_add_event_call(call);
1367 if (ret) {
1368 pr_info("Failed to register kprobe event: %s\n", call->name);
1369 kfree(call->print_fmt);
1370 unregister_ftrace_event(&tp->event);
1371 }
1372 return ret;
1373}
1374
1375static void unregister_probe_event(struct trace_probe *tp)
1376{
1377 /* tp->event is unregistered in trace_remove_event_call() */
1378 trace_remove_event_call(&tp->call);
1379 kfree(tp->call.print_fmt);
1380}
1381
1382/* Make a debugfs interface for controling probe points */
1383static __init int init_kprobe_trace(void)
1384{
1385 struct dentry *d_tracer;
1386 struct dentry *entry;
1387
1388 d_tracer = tracing_init_dentry();
1389 if (!d_tracer)
1390 return 0;
1391
1392 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
1393 NULL, &kprobe_events_ops);
1394
1395 /* Event list interface */
1396 if (!entry)
1397 pr_warning("Could not create debugfs "
1398 "'kprobe_events' entry\n");
1399
1400 /* Profile interface */
1401 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
1402 NULL, &kprobe_profile_ops);
1403
1404 if (!entry)
1405 pr_warning("Could not create debugfs "
1406 "'kprobe_profile' entry\n");
1407 return 0;
1408}
1409fs_initcall(init_kprobe_trace);
1410
1411
1412#ifdef CONFIG_FTRACE_STARTUP_TEST
1413
/* Probe target for the startup self test: returns the sum of its arguments. */
static int kprobe_trace_selftest_target(int a1, int a2, int a3,
					int a4, int a5, int a6)
{
	int sum = a1;

	sum += a2;
	sum += a3;
	sum += a4;
	sum += a5;
	sum += a6;

	return sum;
}
1419
1420static __init int kprobe_trace_self_tests_init(void)
1421{
1422 int ret, warn = 0;
1423 int (*target)(int, int, int, int, int, int);
1424 struct trace_probe *tp;
1425
1426 target = kprobe_trace_selftest_target;
1427
1428 pr_info("Testing kprobe tracing: ");
1429
1430 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1431 "$stack $stack0 +0($stack)");
1432 if (WARN_ON_ONCE(ret)) {
1433 pr_warning("error on probing function entry.\n");
1434 warn++;
1435 } else {
1436 /* Enable trace point */
1437 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1438 if (WARN_ON_ONCE(tp == NULL)) {
1439 pr_warning("error on getting new probe.\n");
1440 warn++;
1441 } else
1442 probe_event_enable(&tp->call);
1443 }
1444
1445 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1446 "$retval");
1447 if (WARN_ON_ONCE(ret)) {
1448 pr_warning("error on probing function return.\n");
1449 warn++;
1450 } else {
1451 /* Enable trace point */
1452 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1453 if (WARN_ON_ONCE(tp == NULL)) {
1454 pr_warning("error on getting new probe.\n");
1455 warn++;
1456 } else
1457 probe_event_enable(&tp->call);
1458 }
1459
1460 if (warn)
1461 goto end;
1462
1463 ret = target(1, 2, 3, 4, 5, 6);
1464
1465 ret = command_trace_probe("-:testprobe");
1466 if (WARN_ON_ONCE(ret)) {
1467 pr_warning("error on deleting a probe.\n");
1468 warn++;
1469 }
1470
1471 ret = command_trace_probe("-:testprobe2");
1472 if (WARN_ON_ONCE(ret)) {
1473 pr_warning("error on deleting a probe.\n");
1474 warn++;
1475 }
1476
1477end:
1478 cleanup_all_probes();
1479 if (warn)
1480 pr_cont("NG: Some tests are failed. Please check them.\n");
1481 else
1482 pr_cont("OK\n");
1483 return 0;
1484}
1485
1486late_initcall(kprobe_trace_self_tests_init);
1487
1488#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..d59cd6879477
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,520 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/fs.h>
28
29#include "trace_output.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35#include <asm/atomic.h>
36
37/*
38 * For now, let us restrict the no. of symbols traced simultaneously to number
39 * of available hardware breakpoint registers.
40 */
41#define KSYM_TRACER_MAX HBP_NUM
42
43#define KSYM_TRACER_OP_LEN 3 /* rw- */
44
45struct trace_ksym {
46 struct perf_event **ksym_hbp;
47 struct perf_event_attr attr;
48#ifdef CONFIG_PROFILE_KSYM_TRACER
49 atomic64_t counter;
50#endif
51 struct hlist_node ksym_hlist;
52};
53
/* Trace array recorded by ksym_trace_init(); the hit handler writes to it */
static struct trace_array *ksym_trace_array;

/* Number of entries currently on ksym_filter_head */
static unsigned int ksym_filter_entry_count;
/* Non-zero between ksym_trace_init() and ksym_trace_reset() */
static unsigned int ksym_tracing_enabled;

/* List of struct trace_ksym, one per traced symbol */
static HLIST_HEAD(ksym_filter_head);

/* Held by the filter read, write and reset paths */
static DEFINE_MUTEX(ksym_tracer_mutex);
62
63#ifdef CONFIG_PROFILE_KSYM_TRACER
64
65#define MAX_UL_INT 0xffffffff
66
67void ksym_collect_stats(unsigned long hbp_hit_addr)
68{
69 struct hlist_node *node;
70 struct trace_ksym *entry;
71
72 rcu_read_lock();
73 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
74 if (entry->attr.bp_addr == hbp_hit_addr) {
75 atomic64_inc(&entry->counter);
76 break;
77 }
78 }
79 rcu_read_unlock();
80}
81#endif /* CONFIG_PROFILE_KSYM_TRACER */
82
83void ksym_hbp_handler(struct perf_event *hbp, int nmi,
84 struct perf_sample_data *data,
85 struct pt_regs *regs)
86{
87 struct ring_buffer_event *event;
88 struct ksym_trace_entry *entry;
89 struct ring_buffer *buffer;
90 int pc;
91
92 if (!ksym_tracing_enabled)
93 return;
94
95 buffer = ksym_trace_array->buffer;
96
97 pc = preempt_count();
98
99 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
100 sizeof(*entry), 0, pc);
101 if (!event)
102 return;
103
104 entry = ring_buffer_event_data(event);
105 entry->ip = instruction_pointer(regs);
106 entry->type = hw_breakpoint_type(hbp);
107 entry->addr = hw_breakpoint_addr(hbp);
108 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
109
110#ifdef CONFIG_PROFILE_KSYM_TRACER
111 ksym_collect_stats(hw_breakpoint_addr(hbp));
112#endif /* CONFIG_PROFILE_KSYM_TRACER */
113
114 trace_buffer_unlock_commit(buffer, event, 0, pc);
115}
116
117/* Valid access types are represented as
118 *
119 * rw- : Set Read/Write Access Breakpoint
120 * -w- : Set Write Access Breakpoint
121 * --- : Clear Breakpoints
122 * --x : Set Execution Break points (Not available yet)
123 *
124 */
125static int ksym_trace_get_access_type(char *str)
126{
127 int access = 0;
128
129 if (str[0] == 'r')
130 access |= HW_BREAKPOINT_R;
131
132 if (str[1] == 'w')
133 access |= HW_BREAKPOINT_W;
134
135 if (str[2] == 'x')
136 access |= HW_BREAKPOINT_X;
137
138 switch (access) {
139 case HW_BREAKPOINT_R:
140 case HW_BREAKPOINT_W:
141 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
142 return access;
143 default:
144 return -EINVAL;
145 }
146}
147
148/*
149 * There can be several possible malformed requests and we attempt to capture
150 * all of them. We enumerate some of the rules
151 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
152 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
153 * <module>:<ksym_name>:<op>.
154 * 2. No delimiter symbol ':' in the input string
155 * 3. Spurious operator symbols or symbols not in their respective positions
156 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
157 * 5. Kernel symbol not a part of /proc/kallsyms
158 * 6. Duplicate requests
159 */
160static int parse_ksym_trace_str(char *input_string, char **ksymname,
161 unsigned long *addr)
162{
163 int ret;
164
165 *ksymname = strsep(&input_string, ":");
166 *addr = kallsyms_lookup_name(*ksymname);
167
168 /* Check for malformed request: (2), (1) and (5) */
169 if ((!input_string) ||
170 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
171 (*addr == 0))
172 return -EINVAL;;
173
174 ret = ksym_trace_get_access_type(input_string);
175
176 return ret;
177}
178
179int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
180{
181 struct trace_ksym *entry;
182 int ret = -ENOMEM;
183
184 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
185 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
186 " new requests for tracing can be accepted now.\n",
187 KSYM_TRACER_MAX);
188 return -ENOSPC;
189 }
190
191 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
192 if (!entry)
193 return -ENOMEM;
194
195 hw_breakpoint_init(&entry->attr);
196
197 entry->attr.bp_type = op;
198 entry->attr.bp_addr = addr;
199 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
200
201 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
202 ksym_hbp_handler);
203
204 if (IS_ERR(entry->ksym_hbp)) {
205 ret = PTR_ERR(entry->ksym_hbp);
206 printk(KERN_INFO "ksym_tracer request failed. Try again"
207 " later!!\n");
208 goto err;
209 }
210
211 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
212 ksym_filter_entry_count++;
213
214 return 0;
215
216err:
217 kfree(entry);
218
219 return ret;
220}
221
222static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
223 size_t count, loff_t *ppos)
224{
225 struct trace_ksym *entry;
226 struct hlist_node *node;
227 struct trace_seq *s;
228 ssize_t cnt = 0;
229 int ret;
230
231 s = kmalloc(sizeof(*s), GFP_KERNEL);
232 if (!s)
233 return -ENOMEM;
234 trace_seq_init(s);
235
236 mutex_lock(&ksym_tracer_mutex);
237
238 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
239 ret = trace_seq_printf(s, "%pS:",
240 (void *)(unsigned long)entry->attr.bp_addr);
241 if (entry->attr.bp_type == HW_BREAKPOINT_R)
242 ret = trace_seq_puts(s, "r--\n");
243 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
244 ret = trace_seq_puts(s, "-w-\n");
245 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
246 ret = trace_seq_puts(s, "rw-\n");
247 WARN_ON_ONCE(!ret);
248 }
249
250 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
251
252 mutex_unlock(&ksym_tracer_mutex);
253
254 kfree(s);
255
256 return cnt;
257}
258
259static void __ksym_trace_reset(void)
260{
261 struct trace_ksym *entry;
262 struct hlist_node *node, *node1;
263
264 mutex_lock(&ksym_tracer_mutex);
265 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
266 ksym_hlist) {
267 unregister_wide_hw_breakpoint(entry->ksym_hbp);
268 ksym_filter_entry_count--;
269 hlist_del_rcu(&(entry->ksym_hlist));
270 synchronize_rcu();
271 kfree(entry);
272 }
273 mutex_unlock(&ksym_tracer_mutex);
274}
275
276static ssize_t ksym_trace_filter_write(struct file *file,
277 const char __user *buffer,
278 size_t count, loff_t *ppos)
279{
280 struct trace_ksym *entry;
281 struct hlist_node *node;
282 char *buf, *input_string, *ksymname = NULL;
283 unsigned long ksym_addr = 0;
284 int ret, op, changed = 0;
285
286 buf = kzalloc(count + 1, GFP_KERNEL);
287 if (!buf)
288 return -ENOMEM;
289
290 ret = -EFAULT;
291 if (copy_from_user(buf, buffer, count))
292 goto out;
293
294 buf[count] = '\0';
295 input_string = strstrip(buf);
296
297 /*
298 * Clear all breakpoints if:
299 * 1: echo > ksym_trace_filter
300 * 2: echo 0 > ksym_trace_filter
301 * 3: echo "*:---" > ksym_trace_filter
302 */
303 if (!input_string[0] || !strcmp(input_string, "0") ||
304 !strcmp(input_string, "*:---")) {
305 __ksym_trace_reset();
306 ret = 0;
307 goto out;
308 }
309
310 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
311 if (ret < 0)
312 goto out;
313
314 mutex_lock(&ksym_tracer_mutex);
315
316 ret = -EINVAL;
317 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
318 if (entry->attr.bp_addr == ksym_addr) {
319 /* Check for malformed request: (6) */
320 if (entry->attr.bp_type != op)
321 changed = 1;
322 else
323 goto out_unlock;
324 break;
325 }
326 }
327 if (changed) {
328 unregister_wide_hw_breakpoint(entry->ksym_hbp);
329 entry->attr.bp_type = op;
330 ret = 0;
331 if (op > 0) {
332 entry->ksym_hbp =
333 register_wide_hw_breakpoint(&entry->attr,
334 ksym_hbp_handler);
335 if (IS_ERR(entry->ksym_hbp))
336 ret = PTR_ERR(entry->ksym_hbp);
337 else
338 goto out_unlock;
339 }
340 /* Error or "symbol:---" case: drop it */
341 ksym_filter_entry_count--;
342 hlist_del_rcu(&(entry->ksym_hlist));
343 synchronize_rcu();
344 kfree(entry);
345 goto out_unlock;
346 } else {
347 /* Check for malformed request: (4) */
348 if (op)
349 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
350 }
351out_unlock:
352 mutex_unlock(&ksym_tracer_mutex);
353out:
354 kfree(buf);
355 return !ret ? count : ret;
356}
357
358static const struct file_operations ksym_tracing_fops = {
359 .open = tracing_open_generic,
360 .read = ksym_trace_filter_read,
361 .write = ksym_trace_filter_write,
362};
363
364static void ksym_trace_reset(struct trace_array *tr)
365{
366 ksym_tracing_enabled = 0;
367 __ksym_trace_reset();
368}
369
370static int ksym_trace_init(struct trace_array *tr)
371{
372 int cpu, ret = 0;
373
374 for_each_online_cpu(cpu)
375 tracing_reset(tr, cpu);
376 ksym_tracing_enabled = 1;
377 ksym_trace_array = tr;
378
379 return ret;
380}
381
382static void ksym_trace_print_header(struct seq_file *m)
383{
384 seq_puts(m,
385 "# TASK-PID CPU# Symbol "
386 "Type Function\n");
387 seq_puts(m,
388 "# | | | "
389 " | |\n");
390}
391
392static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
393{
394 struct trace_entry *entry = iter->ent;
395 struct trace_seq *s = &iter->seq;
396 struct ksym_trace_entry *field;
397 char str[KSYM_SYMBOL_LEN];
398 int ret;
399
400 if (entry->type != TRACE_KSYM)
401 return TRACE_TYPE_UNHANDLED;
402
403 trace_assign_type(field, entry);
404
405 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
406 entry->pid, iter->cpu, (char *)field->addr);
407 if (!ret)
408 return TRACE_TYPE_PARTIAL_LINE;
409
410 switch (field->type) {
411 case HW_BREAKPOINT_R:
412 ret = trace_seq_printf(s, " R ");
413 break;
414 case HW_BREAKPOINT_W:
415 ret = trace_seq_printf(s, " W ");
416 break;
417 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
418 ret = trace_seq_printf(s, " RW ");
419 break;
420 default:
421 return TRACE_TYPE_PARTIAL_LINE;
422 }
423
424 if (!ret)
425 return TRACE_TYPE_PARTIAL_LINE;
426
427 sprint_symbol(str, field->ip);
428 ret = trace_seq_printf(s, "%s\n", str);
429 if (!ret)
430 return TRACE_TYPE_PARTIAL_LINE;
431
432 return TRACE_TYPE_HANDLED;
433}
434
435struct tracer ksym_tracer __read_mostly =
436{
437 .name = "ksym_tracer",
438 .init = ksym_trace_init,
439 .reset = ksym_trace_reset,
440#ifdef CONFIG_FTRACE_SELFTEST
441 .selftest = trace_selftest_startup_ksym,
442#endif
443 .print_header = ksym_trace_print_header,
444 .print_line = ksym_trace_output
445};
446
447#ifdef CONFIG_PROFILE_KSYM_TRACER
448static int ksym_profile_show(struct seq_file *m, void *v)
449{
450 struct hlist_node *node;
451 struct trace_ksym *entry;
452 int access_type = 0;
453 char fn_name[KSYM_NAME_LEN];
454
455 seq_puts(m, " Access Type ");
456 seq_puts(m, " Symbol Counter\n");
457 seq_puts(m, " ----------- ");
458 seq_puts(m, " ------ -------\n");
459
460 rcu_read_lock();
461 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
462
463 access_type = entry->attr.bp_type;
464
465 switch (access_type) {
466 case HW_BREAKPOINT_R:
467 seq_puts(m, " R ");
468 break;
469 case HW_BREAKPOINT_W:
470 seq_puts(m, " W ");
471 break;
472 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
473 seq_puts(m, " RW ");
474 break;
475 default:
476 seq_puts(m, " NA ");
477 }
478
479 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
480 seq_printf(m, " %-36s", fn_name);
481 else
482 seq_printf(m, " %-36s", "<NA>");
483 seq_printf(m, " %15llu\n",
484 (unsigned long long)atomic64_read(&entry->counter));
485 }
486 rcu_read_unlock();
487
488 return 0;
489}
490
491static int ksym_profile_open(struct inode *node, struct file *file)
492{
493 return single_open(file, ksym_profile_show, NULL);
494}
495
496static const struct file_operations ksym_profile_fops = {
497 .open = ksym_profile_open,
498 .read = seq_read,
499 .llseek = seq_lseek,
500 .release = single_release,
501};
502#endif /* CONFIG_PROFILE_KSYM_TRACER */
503
504__init static int init_ksym_trace(void)
505{
506 struct dentry *d_tracer;
507
508 d_tracer = tracing_init_dentry();
509
510 trace_create_file("ksym_trace_filter", 0644, d_tracer,
511 NULL, &ksym_tracing_fops);
512
513#ifdef CONFIG_PROFILE_KSYM_TRACER
514 trace_create_file("ksym_profile", 0444, d_tracer,
515 NULL, &ksym_profile_fops);
516#endif
517
518 return register_tracer(&ksym_tracer);
519}
520device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0acd834659ed..017fa376505d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/slab.h>
12#include <linux/time.h> 13#include <linux/time.h>
13 14
14#include <asm/atomic.h> 15#include <asm/atomic.h>
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b6c12c6a1bcd..8e46b3323cdc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 23
24static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
25 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s) 26int trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 int ret;
30
31 ret = seq_write(m, s->buffer, len);
29 32
30 seq_write(m, s->buffer, len); 33 /*
34 * Only reset this buffer if we successfully wrote to the
35 * seq_file buffer.
36 */
37 if (!ret)
38 trace_seq_init(s);
31 39
32 trace_seq_init(s); 40 return ret;
33} 41}
34 42
35enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 43enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
85 va_list ap; 93 va_list ap;
86 int ret; 94 int ret;
87 95
88 if (!len) 96 if (s->full || !len)
89 return 0; 97 return 0;
90 98
91 va_start(ap, fmt); 99 va_start(ap, fmt);
@@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
93 va_end(ap); 101 va_end(ap);
94 102
95 /* If we can't write it all, don't bother writing anything */ 103 /* If we can't write it all, don't bother writing anything */
96 if (ret >= len) 104 if (ret >= len) {
105 s->full = 1;
97 return 0; 106 return 0;
107 }
98 108
99 s->len += ret; 109 s->len += ret;
100 110
@@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
119 int len = (PAGE_SIZE - 1) - s->len; 129 int len = (PAGE_SIZE - 1) - s->len;
120 int ret; 130 int ret;
121 131
122 if (!len) 132 if (s->full || !len)
123 return 0; 133 return 0;
124 134
125 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 135 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
126 136
127 /* If we can't write it all, don't bother writing anything */ 137 /* If we can't write it all, don't bother writing anything */
128 if (ret >= len) 138 if (ret >= len) {
139 s->full = 1;
129 return 0; 140 return 0;
141 }
130 142
131 s->len += ret; 143 s->len += ret;
132 144
@@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
139 int len = (PAGE_SIZE - 1) - s->len; 151 int len = (PAGE_SIZE - 1) - s->len;
140 int ret; 152 int ret;
141 153
142 if (!len) 154 if (s->full || !len)
143 return 0; 155 return 0;
144 156
145 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 157 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
146 158
147 /* If we can't write it all, don't bother writing anything */ 159 /* If we can't write it all, don't bother writing anything */
148 if (ret >= len) 160 if (ret >= len) {
161 s->full = 1;
149 return 0; 162 return 0;
163 }
150 164
151 s->len += ret; 165 s->len += ret;
152 166
@@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
167{ 181{
168 int len = strlen(str); 182 int len = strlen(str);
169 183
170 if (len > ((PAGE_SIZE - 1) - s->len)) 184 if (s->full)
185 return 0;
186
187 if (len > ((PAGE_SIZE - 1) - s->len)) {
188 s->full = 1;
171 return 0; 189 return 0;
190 }
172 191
173 memcpy(s->buffer + s->len, str, len); 192 memcpy(s->buffer + s->len, str, len);
174 s->len += len; 193 s->len += len;
@@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
178 197
179int trace_seq_putc(struct trace_seq *s, unsigned char c) 198int trace_seq_putc(struct trace_seq *s, unsigned char c)
180{ 199{
181 if (s->len >= (PAGE_SIZE - 1)) 200 if (s->full)
182 return 0; 201 return 0;
183 202
203 if (s->len >= (PAGE_SIZE - 1)) {
204 s->full = 1;
205 return 0;
206 }
207
184 s->buffer[s->len++] = c; 208 s->buffer[s->len++] = c;
185 209
186 return 1; 210 return 1;
@@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
188 212
189int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
190{ 214{
191 if (len > ((PAGE_SIZE - 1) - s->len)) 215 if (s->full)
192 return 0; 216 return 0;
193 217
218 if (len > ((PAGE_SIZE - 1) - s->len)) {
219 s->full = 1;
220 return 0;
221 }
222
194 memcpy(s->buffer + s->len, mem, len); 223 memcpy(s->buffer + s->len, mem, len);
195 s->len += len; 224 s->len += len;
196 225
@@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
203 const unsigned char *data = mem; 232 const unsigned char *data = mem;
204 int i, j; 233 int i, j;
205 234
235 if (s->full)
236 return 0;
237
206#ifdef __BIG_ENDIAN 238#ifdef __BIG_ENDIAN
207 for (i = 0, j = 0; i < len; i++) { 239 for (i = 0, j = 0; i < len; i++) {
208#else 240#else
@@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
220{ 252{
221 void *ret; 253 void *ret;
222 254
223 if (len > ((PAGE_SIZE - 1) - s->len)) 255 if (s->full)
256 return 0;
257
258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1;
224 return NULL; 260 return NULL;
261 }
225 262
226 ret = s->buffer + s->len; 263 ret = s->buffer + s->len;
227 s->len += len; 264 s->len += len;
@@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
233{ 270{
234 unsigned char *p; 271 unsigned char *p;
235 272
236 if (s->len >= (PAGE_SIZE - 1)) 273 if (s->full)
274 return 0;
275
276 if (s->len >= (PAGE_SIZE - 1)) {
277 s->full = 1;
237 return 0; 278 return 0;
279 }
280
238 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 281 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
239 if (!IS_ERR(p)) { 282 if (!IS_ERR(p)) {
240 p = mangle_path(s->buffer + s->len, p, "\n"); 283 p = mangle_path(s->buffer + s->len, p, "\n");
@@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
247 return 1; 290 return 1;
248 } 291 }
249 292
293 s->full = 1;
250 return 0; 294 return 0;
251} 295}
252 296
@@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
373 unsigned long vmstart = 0; 417 unsigned long vmstart = 0;
374 int ret = 1; 418 int ret = 1;
375 419
420 if (s->full)
421 return 0;
422
376 if (mm) { 423 if (mm) {
377 const struct vm_area_struct *vma; 424 const struct vm_area_struct *vma;
378 425
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 26185d727676..0271742abb8d 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -28,8 +28,8 @@ static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
29static int wakeup_rt; 29static int wakeup_rt;
30 30
31static raw_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void __wakeup_reset(struct trace_array *tr); 34static void __wakeup_reset(struct trace_array *tr);
35 35
@@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
143 goto out; 143 goto out;
144 144
145 local_irq_save(flags); 145 local_irq_save(flags);
146 __raw_spin_lock(&wakeup_lock); 146 arch_spin_lock(&wakeup_lock);
147 147
148 /* We could race with grabbing wakeup_lock */ 148 /* We could race with grabbing wakeup_lock */
149 if (unlikely(!tracer_enabled || next != wakeup_task)) 149 if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
169 169
170out_unlock: 170out_unlock:
171 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
172 __raw_spin_unlock(&wakeup_lock); 172 arch_spin_unlock(&wakeup_lock);
173 local_irq_restore(flags); 173 local_irq_restore(flags);
174out: 174out:
175 atomic_dec(&wakeup_trace->data[cpu]->disabled); 175 atomic_dec(&wakeup_trace->data[cpu]->disabled);
@@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr)
193 tracing_reset_online_cpus(tr); 193 tracing_reset_online_cpus(tr);
194 194
195 local_irq_save(flags); 195 local_irq_save(flags);
196 __raw_spin_lock(&wakeup_lock); 196 arch_spin_lock(&wakeup_lock);
197 __wakeup_reset(tr); 197 __wakeup_reset(tr);
198 __raw_spin_unlock(&wakeup_lock); 198 arch_spin_unlock(&wakeup_lock);
199 local_irq_restore(flags); 199 local_irq_restore(flags);
200} 200}
201 201
@@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
225 goto out; 225 goto out;
226 226
227 /* interrupts should be off from try_to_wake_up */ 227 /* interrupts should be off from try_to_wake_up */
228 __raw_spin_lock(&wakeup_lock); 228 arch_spin_lock(&wakeup_lock);
229 229
230 /* check for races. */ 230 /* check for races. */
231 if (!tracer_enabled || p->prio >= wakeup_prio) 231 if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 256
257out_locked: 257out_locked:
258 __raw_spin_unlock(&wakeup_lock); 258 arch_spin_unlock(&wakeup_lock);
259out: 259out:
260 atomic_dec(&wakeup_trace->data[cpu]->disabled); 260 atomic_dec(&wakeup_trace->data[cpu]->disabled);
261} 261}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..81003b4d617f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
3#include <linux/stringify.h> 3#include <linux/stringify.h>
4#include <linux/kthread.h> 4#include <linux/kthread.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/slab.h>
6 7
7static inline int trace_valid_entry(struct trace_entry *entry) 8static inline int trace_valid_entry(struct trace_entry *entry)
8{ 9{
@@ -17,6 +18,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 18 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 19 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 20 case TRACE_HW_BRANCHES:
21 case TRACE_KSYM:
20 return 1; 22 return 1;
21 } 23 }
22 return 0; 24 return 0;
@@ -66,7 +68,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
66 68
67 /* Don't allow flipping of max traces now */ 69 /* Don't allow flipping of max traces now */
68 local_irq_save(flags); 70 local_irq_save(flags);
69 __raw_spin_lock(&ftrace_max_lock); 71 arch_spin_lock(&ftrace_max_lock);
70 72
71 cnt = ring_buffer_entries(tr->buffer); 73 cnt = ring_buffer_entries(tr->buffer);
72 74
@@ -84,7 +86,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
84 break; 86 break;
85 } 87 }
86 tracing_on(); 88 tracing_on();
87 __raw_spin_unlock(&ftrace_max_lock); 89 arch_spin_unlock(&ftrace_max_lock);
88 local_irq_restore(flags); 90 local_irq_restore(flags);
89 91
90 if (count) 92 if (count)
@@ -808,3 +810,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 810 return ret;
809} 811}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 812#endif /* CONFIG_HW_BRANCH_TRACER */
813
814#ifdef CONFIG_KSYM_TRACER
815static int ksym_selftest_dummy;
816
817int
818trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
819{
820 unsigned long count;
821 int ret;
822
823 /* start the tracing */
824 ret = tracer_init(trace, tr);
825 if (ret) {
826 warn_failed_init_tracer(trace, ret);
827 return ret;
828 }
829
830 ksym_selftest_dummy = 0;
831 /* Register the read-write tracing request */
832
833 ret = process_new_ksym_entry("ksym_selftest_dummy",
834 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
835 (unsigned long)(&ksym_selftest_dummy));
836
837 if (ret < 0) {
838 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
839 goto ret_path;
840 }
841 /* Perform a read and a write operation over the dummy variable to
842 * trigger the tracer
843 */
844 if (ksym_selftest_dummy == 0)
845 ksym_selftest_dummy++;
846
847 /* stop the tracing. */
848 tracing_stop();
849 /* check the trace buffer */
850 ret = trace_test_buffer(tr, &count);
851 trace->reset(tr);
852 tracing_start();
853
854 /* read & write operations - one each is performed on the dummy variable
855 * triggering two entries in the trace buffer
856 */
857 if (!ret && count != 2) {
858 printk(KERN_CONT "Ksym tracer startup test failed");
859 ret = -1;
860 }
861
862ret_path:
863 return ret;
864}
865#endif /* CONFIG_KSYM_TRACER */
866
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8504ac71e4e8..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = {
27}; 27};
28 28
29static unsigned long max_stack_size; 29static unsigned long max_stack_size;
30static raw_spinlock_t max_stack_lock = 30static arch_spinlock_t max_stack_lock =
31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 31 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
32 32
33static int stack_trace_disabled __read_mostly; 33static int stack_trace_disabled __read_mostly;
34static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
@@ -54,7 +54,7 @@ static inline void check_stack(void)
54 return; 54 return;
55 55
56 local_irq_save(flags); 56 local_irq_save(flags);
57 __raw_spin_lock(&max_stack_lock); 57 arch_spin_lock(&max_stack_lock);
58 58
59 /* a race could have already updated it */ 59 /* a race could have already updated it */
60 if (this_size <= max_stack_size) 60 if (this_size <= max_stack_size)
@@ -103,7 +103,7 @@ static inline void check_stack(void)
103 } 103 }
104 104
105 out: 105 out:
106 __raw_spin_unlock(&max_stack_lock); 106 arch_spin_unlock(&max_stack_lock);
107 local_irq_restore(flags); 107 local_irq_restore(flags);
108} 108}
109 109
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
174 __raw_spin_lock(&max_stack_lock); 175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 __raw_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,8 +218,14 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
210 __raw_spin_lock(&max_stack_lock); 224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
213 return SEQ_START_TOKEN; 231 return SEQ_START_TOKEN;
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
220 __raw_spin_unlock(&max_stack_lock); 238 int cpu;
239
240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index a4bb239eb987..96cffb269e73 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/slab.h>
13#include <linux/rbtree.h> 14#include <linux/rbtree.h>
14#include <linux/debugfs.h> 15#include <linux/debugfs.h>
15#include "trace_stat.h" 16#include "trace_stat.h"
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 527e17eae575..4d6d711717f2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h>
3#include <linux/kernel.h> 4#include <linux/kernel.h>
4#include <linux/ftrace.h> 5#include <linux/ftrace.h>
5#include <linux/perf_event.h> 6#include <linux/perf_event.h>
@@ -14,6 +15,43 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 17
18extern unsigned long __start_syscalls_metadata[];
19extern unsigned long __stop_syscalls_metadata[];
20
21static struct syscall_metadata **syscalls_metadata;
22
23static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
24{
25 struct syscall_metadata *start;
26 struct syscall_metadata *stop;
27 char str[KSYM_SYMBOL_LEN];
28
29
30 start = (struct syscall_metadata *)__start_syscalls_metadata;
31 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
32 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
33
34 for ( ; start < stop; start++) {
35 /*
36 * Only compare after the "sys" prefix. Archs that use
37 * syscall wrappers may have syscalls symbols aliases prefixed
38 * with "SyS" instead of "sys", leading to an unwanted
39 * mismatch.
40 */
41 if (start->name && !strcmp(start->name + 3, str + 3))
42 return start;
43 }
44 return NULL;
45}
46
47static struct syscall_metadata *syscall_nr_to_meta(int nr)
48{
49 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
50 return NULL;
51
52 return syscalls_metadata[nr];
53}
54
17enum print_line_t 55enum print_line_t
18print_syscall_enter(struct trace_iterator *iter, int flags) 56print_syscall_enter(struct trace_iterator *iter, int flags)
19{ 57{
@@ -30,7 +68,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
30 if (!entry) 68 if (!entry)
31 goto end; 69 goto end;
32 70
33 if (entry->enter_id != ent->type) { 71 if (entry->enter_event->id != ent->type) {
34 WARN_ON_ONCE(1); 72 WARN_ON_ONCE(1);
35 goto end; 73 goto end;
36 } 74 }
@@ -85,7 +123,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
85 return TRACE_TYPE_HANDLED; 123 return TRACE_TYPE_HANDLED;
86 } 124 }
87 125
88 if (entry->exit_id != ent->type) { 126 if (entry->exit_event->id != ent->type) {
89 WARN_ON_ONCE(1); 127 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED; 128 return TRACE_TYPE_UNHANDLED;
91 } 129 }
@@ -103,92 +141,79 @@ extern char *__bad_type_size(void);
103#define SYSCALL_FIELD(type, name) \ 141#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \ 142 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \ 143 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) 144 #type, #name, offsetof(typeof(trace), name), \
145 sizeof(trace.name), is_signed_type(type)
107 146
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 147static
148int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
109{ 149{
110 int i; 150 int i;
111 int nr; 151 int pos = 0;
112 int ret;
113 struct syscall_metadata *entry;
114 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args);
116 152
117 nr = syscall_name_to_nr(call->data); 153 /* When len=0, we just calculate the needed length */
118 entry = syscall_nr_to_meta(nr); 154#define LEN_OR_ZERO (len ? len - pos : 0)
119
120 if (!entry)
121 return 0;
122
123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
124 SYSCALL_FIELD(int, nr));
125 if (!ret)
126 return 0;
127 155
156 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
128 for (i = 0; i < entry->nb_args; i++) { 157 for (i = 0; i < entry->nb_args; i++) {
129 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 158 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
130 entry->args[i]); 159 entry->args[i], sizeof(unsigned long),
131 if (!ret) 160 i == entry->nb_args - 1 ? "" : ", ");
132 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
134 sizeof(unsigned long));
135 if (!ret)
136 return 0;
137 offset += sizeof(unsigned long);
138 } 161 }
162 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
139 163
140 trace_seq_puts(s, "\nprint fmt: \"");
141 for (i = 0; i < entry->nb_args; i++) { 164 for (i = 0; i < entry->nb_args; i++) {
142 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 165 pos += snprintf(buf + pos, LEN_OR_ZERO,
143 sizeof(unsigned long), 166 ", ((unsigned long)(REC->%s))", entry->args[i]);
144 i == entry->nb_args - 1 ? "" : ", ");
145 if (!ret)
146 return 0;
147 } 167 }
148 trace_seq_putc(s, '"');
149 168
150 for (i = 0; i < entry->nb_args; i++) { 169#undef LEN_OR_ZERO
151 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
152 entry->args[i]);
153 if (!ret)
154 return 0;
155 }
156 170
157 return trace_seq_putc(s, '\n'); 171 /* return the length of print_fmt */
172 return pos;
158} 173}
159 174
160int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 175static int set_syscall_print_fmt(struct ftrace_event_call *call)
161{ 176{
162 int ret; 177 char *print_fmt;
163 struct syscall_trace_exit trace; 178 int len;
179 struct syscall_metadata *entry = call->data;
164 180
165 ret = trace_seq_printf(s, 181 if (entry->enter_event != call) {
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 182 call->print_fmt = "\"0x%lx\", REC->ret";
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
168 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(long, ret));
170 if (!ret)
171 return 0; 183 return 0;
184 }
185
186 /* First: called with 0 length to calculate the needed length */
187 len = __set_enter_print_fmt(entry, NULL, 0);
188
189 print_fmt = kmalloc(len + 1, GFP_KERNEL);
190 if (!print_fmt)
191 return -ENOMEM;
192
193 /* Second: actually write the @print_fmt */
194 __set_enter_print_fmt(entry, print_fmt, len + 1);
195 call->print_fmt = print_fmt;
196
197 return 0;
198}
199
200static void free_syscall_print_fmt(struct ftrace_event_call *call)
201{
202 struct syscall_metadata *entry = call->data;
172 203
173 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 204 if (entry->enter_event == call)
205 kfree(call->print_fmt);
174} 206}
175 207
176int syscall_enter_define_fields(struct ftrace_event_call *call) 208int syscall_enter_define_fields(struct ftrace_event_call *call)
177{ 209{
178 struct syscall_trace_enter trace; 210 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta; 211 struct syscall_metadata *meta = call->data;
180 int ret; 212 int ret;
181 int nr;
182 int i; 213 int i;
183 int offset = offsetof(typeof(trace), args); 214 int offset = offsetof(typeof(trace), args);
184 215
185 nr = syscall_name_to_nr(call->data); 216 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call);
192 if (ret) 217 if (ret)
193 return ret; 218 return ret;
194 219
@@ -208,11 +233,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
208 struct syscall_trace_exit trace; 233 struct syscall_trace_exit trace;
209 int ret; 234 int ret;
210 235
211 ret = trace_define_common_fields(call); 236 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
212 if (ret) 237 if (ret)
213 return ret; 238 return ret;
214 239
215 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, 240 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
216 FILTER_OTHER); 241 FILTER_OTHER);
217 242
218 return ret; 243 return ret;
@@ -239,8 +264,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
239 264
240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 265 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
241 266
242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, 267 event = trace_current_buffer_lock_reserve(&buffer,
243 size, 0, 0); 268 sys_data->enter_event->id, size, 0, 0);
244 if (!event) 269 if (!event)
245 return; 270 return;
246 271
@@ -271,8 +296,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
271 if (!sys_data) 296 if (!sys_data)
272 return; 297 return;
273 298
274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, 299 event = trace_current_buffer_lock_reserve(&buffer,
275 sizeof(*entry), 0, 0); 300 sys_data->exit_event->id, sizeof(*entry), 0, 0);
276 if (!event) 301 if (!event)
277 return; 302 return;
278 303
@@ -285,23 +310,18 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 310 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
286} 311}
287 312
288int reg_event_syscall_enter(void *ptr) 313int reg_event_syscall_enter(struct ftrace_event_call *call)
289{ 314{
290 int ret = 0; 315 int ret = 0;
291 int num; 316 int num;
292 char *name;
293 317
294 name = (char *)ptr; 318 num = ((struct syscall_metadata *)call->data)->syscall_nr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls) 319 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS; 320 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock); 321 mutex_lock(&syscall_trace_lock);
299 if (!sys_refcount_enter) 322 if (!sys_refcount_enter)
300 ret = register_trace_sys_enter(ftrace_syscall_enter); 323 ret = register_trace_sys_enter(ftrace_syscall_enter);
301 if (ret) { 324 if (!ret) {
302 pr_info("event trace: Could not activate"
303 "syscall entry trace point");
304 } else {
305 set_bit(num, enabled_enter_syscalls); 325 set_bit(num, enabled_enter_syscalls);
306 sys_refcount_enter++; 326 sys_refcount_enter++;
307 } 327 }
@@ -309,13 +329,11 @@ int reg_event_syscall_enter(void *ptr)
309 return ret; 329 return ret;
310} 330}
311 331
312void unreg_event_syscall_enter(void *ptr) 332void unreg_event_syscall_enter(struct ftrace_event_call *call)
313{ 333{
314 int num; 334 int num;
315 char *name;
316 335
317 name = (char *)ptr; 336 num = ((struct syscall_metadata *)call->data)->syscall_nr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls) 337 if (num < 0 || num >= NR_syscalls)
320 return; 338 return;
321 mutex_lock(&syscall_trace_lock); 339 mutex_lock(&syscall_trace_lock);
@@ -326,23 +344,18 @@ void unreg_event_syscall_enter(void *ptr)
326 mutex_unlock(&syscall_trace_lock); 344 mutex_unlock(&syscall_trace_lock);
327} 345}
328 346
329int reg_event_syscall_exit(void *ptr) 347int reg_event_syscall_exit(struct ftrace_event_call *call)
330{ 348{
331 int ret = 0; 349 int ret = 0;
332 int num; 350 int num;
333 char *name;
334 351
335 name = (char *)ptr; 352 num = ((struct syscall_metadata *)call->data)->syscall_nr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls) 353 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS; 354 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock); 355 mutex_lock(&syscall_trace_lock);
340 if (!sys_refcount_exit) 356 if (!sys_refcount_exit)
341 ret = register_trace_sys_exit(ftrace_syscall_exit); 357 ret = register_trace_sys_exit(ftrace_syscall_exit);
342 if (ret) { 358 if (!ret) {
343 pr_info("event trace: Could not activate"
344 "syscall exit trace point");
345 } else {
346 set_bit(num, enabled_exit_syscalls); 359 set_bit(num, enabled_exit_syscalls);
347 sys_refcount_exit++; 360 sys_refcount_exit++;
348 } 361 }
@@ -350,13 +363,11 @@ int reg_event_syscall_exit(void *ptr)
350 return ret; 363 return ret;
351} 364}
352 365
353void unreg_event_syscall_exit(void *ptr) 366void unreg_event_syscall_exit(struct ftrace_event_call *call)
354{ 367{
355 int num; 368 int num;
356 char *name;
357 369
358 name = (char *)ptr; 370 num = ((struct syscall_metadata *)call->data)->syscall_nr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls) 371 if (num < 0 || num >= NR_syscalls)
361 return; 372 return;
362 mutex_lock(&syscall_trace_lock); 373 mutex_lock(&syscall_trace_lock);
@@ -367,33 +378,73 @@ void unreg_event_syscall_exit(void *ptr)
367 mutex_unlock(&syscall_trace_lock); 378 mutex_unlock(&syscall_trace_lock);
368} 379}
369 380
370struct trace_event event_syscall_enter = { 381int init_syscall_trace(struct ftrace_event_call *call)
371 .trace = print_syscall_enter, 382{
372}; 383 int id;
384
385 if (set_syscall_print_fmt(call) < 0)
386 return -ENOMEM;
373 387
374struct trace_event event_syscall_exit = { 388 id = trace_event_raw_init(call);
375 .trace = print_syscall_exit,
376};
377 389
378#ifdef CONFIG_EVENT_PROFILE 390 if (id < 0) {
391 free_syscall_print_fmt(call);
392 return id;
393 }
379 394
380static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 395 return id;
381static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 396}
382static int sys_prof_refcount_enter; 397
383static int sys_prof_refcount_exit; 398unsigned long __init arch_syscall_addr(int nr)
399{
400 return (unsigned long)sys_call_table[nr];
401}
384 402
385static void prof_syscall_enter(struct pt_regs *regs, long id) 403int __init init_ftrace_syscalls(void)
404{
405 struct syscall_metadata *meta;
406 unsigned long addr;
407 int i;
408
409 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
410 NR_syscalls, GFP_KERNEL);
411 if (!syscalls_metadata) {
412 WARN_ON(1);
413 return -ENOMEM;
414 }
415
416 for (i = 0; i < NR_syscalls; i++) {
417 addr = arch_syscall_addr(i);
418 meta = find_syscall_meta(addr);
419 if (!meta)
420 continue;
421
422 meta->syscall_nr = i;
423 syscalls_metadata[i] = meta;
424 }
425
426 return 0;
427}
428core_initcall(init_ftrace_syscalls);
429
430#ifdef CONFIG_PERF_EVENTS
431
432static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
433static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
434static int sys_perf_refcount_enter;
435static int sys_perf_refcount_exit;
436
437static void perf_syscall_enter(struct pt_regs *regs, long id)
386{ 438{
387 struct syscall_metadata *sys_data; 439 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec; 440 struct syscall_trace_enter *rec;
389 unsigned long flags; 441 unsigned long flags;
390 char *raw_data;
391 int syscall_nr; 442 int syscall_nr;
443 int rctx;
392 int size; 444 int size;
393 int cpu;
394 445
395 syscall_nr = syscall_get_nr(current, regs); 446 syscall_nr = syscall_get_nr(current, regs);
396 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 447 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
397 return; 448 return;
398 449
399 sys_data = syscall_nr_to_meta(syscall_nr); 450 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -405,91 +456,67 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
405 size = ALIGN(size + sizeof(u32), sizeof(u64)); 456 size = ALIGN(size + sizeof(u32), sizeof(u64));
406 size -= sizeof(u32); 457 size -= sizeof(u32);
407 458
408 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 459 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
409 "profile buffer not large enough")) 460 "perf buffer not large enough"))
410 return; 461 return;
411 462
412 /* Protect the per cpu buffer, begin the rcu read side */ 463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
413 local_irq_save(flags); 464 sys_data->enter_event->id, &rctx, &flags);
414 465 if (!rec)
415 cpu = smp_processor_id(); 466 return;
416
417 if (in_nmi())
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421
422 if (!raw_data)
423 goto end;
424
425 raw_data = per_cpu_ptr(raw_data, cpu);
426
427 /* zero the dead bytes from align to not leak stack to user */
428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
429 467
430 rec = (struct syscall_trace_enter *) raw_data;
431 tracing_generic_entry_update(&rec->ent, 0, 0);
432 rec->ent.type = sys_data->enter_id;
433 rec->nr = syscall_nr; 468 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 469 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args); 470 (unsigned long *)&rec->args);
436 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 471 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
437
438end:
439 local_irq_restore(flags);
440} 472}
441 473
442int reg_prof_syscall_enter(char *name) 474int perf_sysenter_enable(struct ftrace_event_call *call)
443{ 475{
444 int ret = 0; 476 int ret = 0;
445 int num; 477 int num;
446 478
447 num = syscall_name_to_nr(name); 479 num = ((struct syscall_metadata *)call->data)->syscall_nr;
448 if (num < 0 || num >= NR_syscalls)
449 return -ENOSYS;
450 480
451 mutex_lock(&syscall_trace_lock); 481 mutex_lock(&syscall_trace_lock);
452 if (!sys_prof_refcount_enter) 482 if (!sys_perf_refcount_enter)
453 ret = register_trace_sys_enter(prof_syscall_enter); 483 ret = register_trace_sys_enter(perf_syscall_enter);
454 if (ret) { 484 if (ret) {
455 pr_info("event trace: Could not activate" 485 pr_info("event trace: Could not activate"
456 "syscall entry trace point"); 486 "syscall entry trace point");
457 } else { 487 } else {
458 set_bit(num, enabled_prof_enter_syscalls); 488 set_bit(num, enabled_perf_enter_syscalls);
459 sys_prof_refcount_enter++; 489 sys_perf_refcount_enter++;
460 } 490 }
461 mutex_unlock(&syscall_trace_lock); 491 mutex_unlock(&syscall_trace_lock);
462 return ret; 492 return ret;
463} 493}
464 494
465void unreg_prof_syscall_enter(char *name) 495void perf_sysenter_disable(struct ftrace_event_call *call)
466{ 496{
467 int num; 497 int num;
468 498
469 num = syscall_name_to_nr(name); 499 num = ((struct syscall_metadata *)call->data)->syscall_nr;
470 if (num < 0 || num >= NR_syscalls)
471 return;
472 500
473 mutex_lock(&syscall_trace_lock); 501 mutex_lock(&syscall_trace_lock);
474 sys_prof_refcount_enter--; 502 sys_perf_refcount_enter--;
475 clear_bit(num, enabled_prof_enter_syscalls); 503 clear_bit(num, enabled_perf_enter_syscalls);
476 if (!sys_prof_refcount_enter) 504 if (!sys_perf_refcount_enter)
477 unregister_trace_sys_enter(prof_syscall_enter); 505 unregister_trace_sys_enter(perf_syscall_enter);
478 mutex_unlock(&syscall_trace_lock); 506 mutex_unlock(&syscall_trace_lock);
479} 507}
480 508
481static void prof_syscall_exit(struct pt_regs *regs, long ret) 509static void perf_syscall_exit(struct pt_regs *regs, long ret)
482{ 510{
483 struct syscall_metadata *sys_data; 511 struct syscall_metadata *sys_data;
484 struct syscall_trace_exit *rec; 512 struct syscall_trace_exit *rec;
485 unsigned long flags; 513 unsigned long flags;
486 int syscall_nr; 514 int syscall_nr;
487 char *raw_data; 515 int rctx;
488 int size; 516 int size;
489 int cpu;
490 517
491 syscall_nr = syscall_get_nr(current, regs); 518 syscall_nr = syscall_get_nr(current, regs);
492 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 519 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
493 return; 520 return;
494 521
495 sys_data = syscall_nr_to_meta(syscall_nr); 522 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -504,79 +531,55 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
504 * Impossible, but be paranoid with the future 531 * Impossible, but be paranoid with the future
505 * How to put this check outside runtime? 532 * How to put this check outside runtime?
506 */ 533 */
507 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 534 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
508 "exit event has grown above profile buffer size")) 535 "exit event has grown above perf buffer size"))
509 return; 536 return;
510 537
511 /* Protect the per cpu buffer, begin the rcu read side */ 538 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
512 local_irq_save(flags); 539 sys_data->exit_event->id, &rctx, &flags);
513 cpu = smp_processor_id(); 540 if (!rec)
514 541 return;
515 if (in_nmi())
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519
520 if (!raw_data)
521 goto end;
522
523 raw_data = per_cpu_ptr(raw_data, cpu);
524
525 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
527
528 rec = (struct syscall_trace_exit *)raw_data;
529 542
530 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id;
532 rec->nr = syscall_nr; 543 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs); 544 rec->ret = syscall_get_return_value(current, regs);
534 545
535 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 546 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
536
537end:
538 local_irq_restore(flags);
539} 547}
540 548
541int reg_prof_syscall_exit(char *name) 549int perf_sysexit_enable(struct ftrace_event_call *call)
542{ 550{
543 int ret = 0; 551 int ret = 0;
544 int num; 552 int num;
545 553
546 num = syscall_name_to_nr(name); 554 num = ((struct syscall_metadata *)call->data)->syscall_nr;
547 if (num < 0 || num >= NR_syscalls)
548 return -ENOSYS;
549 555
550 mutex_lock(&syscall_trace_lock); 556 mutex_lock(&syscall_trace_lock);
551 if (!sys_prof_refcount_exit) 557 if (!sys_perf_refcount_exit)
552 ret = register_trace_sys_exit(prof_syscall_exit); 558 ret = register_trace_sys_exit(perf_syscall_exit);
553 if (ret) { 559 if (ret) {
554 pr_info("event trace: Could not activate" 560 pr_info("event trace: Could not activate"
555 "syscall entry trace point"); 561 "syscall exit trace point");
556 } else { 562 } else {
557 set_bit(num, enabled_prof_exit_syscalls); 563 set_bit(num, enabled_perf_exit_syscalls);
558 sys_prof_refcount_exit++; 564 sys_perf_refcount_exit++;
559 } 565 }
560 mutex_unlock(&syscall_trace_lock); 566 mutex_unlock(&syscall_trace_lock);
561 return ret; 567 return ret;
562} 568}
563 569
564void unreg_prof_syscall_exit(char *name) 570void perf_sysexit_disable(struct ftrace_event_call *call)
565{ 571{
566 int num; 572 int num;
567 573
568 num = syscall_name_to_nr(name); 574 num = ((struct syscall_metadata *)call->data)->syscall_nr;
569 if (num < 0 || num >= NR_syscalls)
570 return;
571 575
572 mutex_lock(&syscall_trace_lock); 576 mutex_lock(&syscall_trace_lock);
573 sys_prof_refcount_exit--; 577 sys_perf_refcount_exit--;
574 clear_bit(num, enabled_prof_exit_syscalls); 578 clear_bit(num, enabled_perf_exit_syscalls);
575 if (!sys_prof_refcount_exit) 579 if (!sys_perf_refcount_exit)
576 unregister_trace_sys_exit(prof_syscall_exit); 580 unregister_trace_sys_exit(perf_syscall_exit);
577 mutex_unlock(&syscall_trace_lock); 581 mutex_unlock(&syscall_trace_lock);
578} 582}
579 583
580#endif 584#endif /* CONFIG_PERF_EVENTS */
581
582 585
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f6693969287d..a7974a552ca9 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -93,6 +93,7 @@ static const struct stacktrace_ops backtrace_ops = {
93 .warning_symbol = backtrace_warning_symbol, 93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack, 94 .stack = backtrace_stack,
95 .address = backtrace_address, 95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
96}; 97};
97 98
98static int 99static int
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 40cafb07dffd..cc2d2faa7d9e 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/slab.h>
12#include <linux/kref.h> 13#include <linux/kref.h>
13#include "trace_stat.h" 14#include "trace_stat.h"
14#include "trace.h" 15#include "trace.h"
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 000000000000..eb27fd3430a2
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,44 @@
1
2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h>
4#include <linux/sched.h>
5#include <linux/module.h>
6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8
9/*
10 * Request a notification when the current cpu returns to userspace. Must be
11 * called in atomic context. The notifier will also be called in atomic
12 * context.
13 */
14void user_return_notifier_register(struct user_return_notifier *urn)
15{
16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
17 hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
18}
19EXPORT_SYMBOL_GPL(user_return_notifier_register);
20
21/*
22 * Removes a registered user return notifier. Must be called from atomic
23 * context, and from the same cpu registration occured in.
24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{
27 hlist_del(&urn->link);
28 if (hlist_empty(&__get_cpu_var(return_notifier_list)))
29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
30}
31EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
32
33/* Calls registered user return notifiers */
34void fire_user_return_notifiers(void)
35{
36 struct user_return_notifier *urn;
37 struct hlist_node *tmp1, *tmp2;
38 struct hlist_head *head;
39
40 head = &get_cpu_var(return_notifier_list);
41 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
42 urn->on_user_return(urn);
43 put_cpu_var(return_notifier_list);
44}
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 69eae358a726..a2cd77e70d4d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write,
57#define proc_do_uts_string NULL 57#define proc_do_uts_string NULL
58#endif 58#endif
59 59
60
61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen)
66{
67 struct ctl_table uts_table;
68 int r, write;
69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 put_uts(table, write, uts_table.data);
74 return r;
75}
76#else
77#define sysctl_uts_string NULL
78#endif
79
80static struct ctl_table uts_kern_table[] = { 60static struct ctl_table uts_kern_table[] = {
81 { 61 {
82 .ctl_name = KERN_OSTYPE,
83 .procname = "ostype", 62 .procname = "ostype",
84 .data = init_uts_ns.name.sysname, 63 .data = init_uts_ns.name.sysname,
85 .maxlen = sizeof(init_uts_ns.name.sysname), 64 .maxlen = sizeof(init_uts_ns.name.sysname),
86 .mode = 0444, 65 .mode = 0444,
87 .proc_handler = proc_do_uts_string, 66 .proc_handler = proc_do_uts_string,
88 .strategy = sysctl_uts_string,
89 }, 67 },
90 { 68 {
91 .ctl_name = KERN_OSRELEASE,
92 .procname = "osrelease", 69 .procname = "osrelease",
93 .data = init_uts_ns.name.release, 70 .data = init_uts_ns.name.release,
94 .maxlen = sizeof(init_uts_ns.name.release), 71 .maxlen = sizeof(init_uts_ns.name.release),
95 .mode = 0444, 72 .mode = 0444,
96 .proc_handler = proc_do_uts_string, 73 .proc_handler = proc_do_uts_string,
97 .strategy = sysctl_uts_string,
98 }, 74 },
99 { 75 {
100 .ctl_name = KERN_VERSION,
101 .procname = "version", 76 .procname = "version",
102 .data = init_uts_ns.name.version, 77 .data = init_uts_ns.name.version,
103 .maxlen = sizeof(init_uts_ns.name.version), 78 .maxlen = sizeof(init_uts_ns.name.version),
104 .mode = 0444, 79 .mode = 0444,
105 .proc_handler = proc_do_uts_string, 80 .proc_handler = proc_do_uts_string,
106 .strategy = sysctl_uts_string,
107 }, 81 },
108 { 82 {
109 .ctl_name = KERN_NODENAME,
110 .procname = "hostname", 83 .procname = "hostname",
111 .data = init_uts_ns.name.nodename, 84 .data = init_uts_ns.name.nodename,
112 .maxlen = sizeof(init_uts_ns.name.nodename), 85 .maxlen = sizeof(init_uts_ns.name.nodename),
113 .mode = 0644, 86 .mode = 0644,
114 .proc_handler = proc_do_uts_string, 87 .proc_handler = proc_do_uts_string,
115 .strategy = sysctl_uts_string,
116 }, 88 },
117 { 89 {
118 .ctl_name = KERN_DOMAINNAME,
119 .procname = "domainname", 90 .procname = "domainname",
120 .data = init_uts_ns.name.domainname, 91 .data = init_uts_ns.name.domainname,
121 .maxlen = sizeof(init_uts_ns.name.domainname), 92 .maxlen = sizeof(init_uts_ns.name.domainname),
122 .mode = 0644, 93 .mode = 0644,
123 .proc_handler = proc_do_uts_string, 94 .proc_handler = proc_do_uts_string,
124 .strategy = sysctl_uts_string,
125 }, 95 },
126 {} 96 {}
127}; 97};
128 98
129static struct ctl_table uts_root_table[] = { 99static struct ctl_table uts_root_table[] = {
130 { 100 {
131 .ctl_name = CTL_KERN,
132 .procname = "kernel", 101 .procname = "kernel",
133 .mode = 0555, 102 .mode = 0555,
134 .child = uts_kern_table, 103 .child = uts_kern_table,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae81..5bfb213984b2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
68#endif 68#endif
69}; 69};
70 70
71#ifdef CONFIG_DEBUG_OBJECTS_WORK
72
73static struct debug_obj_descr work_debug_descr;
74
75/*
76 * fixup_init is called when:
77 * - an active object is initialized
78 */
79static int work_fixup_init(void *addr, enum debug_obj_state state)
80{
81 struct work_struct *work = addr;
82
83 switch (state) {
84 case ODEBUG_STATE_ACTIVE:
85 cancel_work_sync(work);
86 debug_object_init(work, &work_debug_descr);
87 return 1;
88 default:
89 return 0;
90 }
91}
92
93/*
94 * fixup_activate is called when:
95 * - an active object is activated
96 * - an unknown object is activated (might be a statically initialized object)
97 */
98static int work_fixup_activate(void *addr, enum debug_obj_state state)
99{
100 struct work_struct *work = addr;
101
102 switch (state) {
103
104 case ODEBUG_STATE_NOTAVAILABLE:
105 /*
106 * This is not really a fixup. The work struct was
107 * statically initialized. We just make sure that it
108 * is tracked in the object tracker.
109 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr);
113 return 0;
114 }
115 WARN_ON_ONCE(1);
116 return 0;
117
118 case ODEBUG_STATE_ACTIVE:
119 WARN_ON(1);
120
121 default:
122 return 0;
123 }
124}
125
126/*
127 * fixup_free is called when:
128 * - an active object is freed
129 */
130static int work_fixup_free(void *addr, enum debug_obj_state state)
131{
132 struct work_struct *work = addr;
133
134 switch (state) {
135 case ODEBUG_STATE_ACTIVE:
136 cancel_work_sync(work);
137 debug_object_free(work, &work_debug_descr);
138 return 1;
139 default:
140 return 0;
141 }
142}
143
144static struct debug_obj_descr work_debug_descr = {
145 .name = "work_struct",
146 .fixup_init = work_fixup_init,
147 .fixup_activate = work_fixup_activate,
148 .fixup_free = work_fixup_free,
149};
150
151static inline void debug_work_activate(struct work_struct *work)
152{
153 debug_object_activate(work, &work_debug_descr);
154}
155
156static inline void debug_work_deactivate(struct work_struct *work)
157{
158 debug_object_deactivate(work, &work_debug_descr);
159}
160
161void __init_work(struct work_struct *work, int onstack)
162{
163 if (onstack)
164 debug_object_init_on_stack(work, &work_debug_descr);
165 else
166 debug_object_init(work, &work_debug_descr);
167}
168EXPORT_SYMBOL_GPL(__init_work);
169
170void destroy_work_on_stack(struct work_struct *work)
171{
172 debug_object_free(work, &work_debug_descr);
173}
174EXPORT_SYMBOL_GPL(destroy_work_on_stack);
175
176#else
177static inline void debug_work_activate(struct work_struct *work) { }
178static inline void debug_work_deactivate(struct work_struct *work) { }
179#endif
180
71/* Serializes the accesses to the list of workqueues. */ 181/* Serializes the accesses to the list of workqueues. */
72static DEFINE_SPINLOCK(workqueue_lock); 182static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 183static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
145{ 255{
146 unsigned long flags; 256 unsigned long flags;
147 257
258 debug_work_activate(work);
148 spin_lock_irqsave(&cwq->lock, flags); 259 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, &cwq->worklist); 260 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 261 spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
280 struct lockdep_map lockdep_map = work->lockdep_map; 391 struct lockdep_map lockdep_map = work->lockdep_map;
281#endif 392#endif
282 trace_workqueue_execution(cwq->thread, work); 393 trace_workqueue_execution(cwq->thread, work);
394 debug_work_deactivate(work);
283 cwq->current_work = work; 395 cwq->current_work = work;
284 list_del_init(cwq->worklist.next); 396 list_del_init(cwq->worklist.next);
285 spin_unlock_irq(&cwq->lock); 397 spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
350static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 462static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
351 struct wq_barrier *barr, struct list_head *head) 463 struct wq_barrier *barr, struct list_head *head)
352{ 464{
353 INIT_WORK(&barr->work, wq_barrier_func); 465 /*
466 * debugobject calls are safe here even with cwq->lock locked
467 * as we know for sure that this will not trigger any of the
468 * checks and call back into the fixup functions where we
469 * might deadlock.
470 */
471 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
354 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 472 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
355 473
356 init_completion(&barr->done); 474 init_completion(&barr->done);
357 475
476 debug_work_activate(&barr->work);
358 insert_work(cwq, &barr->work, head); 477 insert_work(cwq, &barr->work, head);
359} 478}
360 479
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
372 } 491 }
373 spin_unlock_irq(&cwq->lock); 492 spin_unlock_irq(&cwq->lock);
374 493
375 if (active) 494 if (active) {
376 wait_for_completion(&barr.done); 495 wait_for_completion(&barr.done);
496 destroy_work_on_stack(&barr.work);
497 }
377 498
378 return active; 499 return active;
379} 500}
@@ -451,6 +572,7 @@ out:
451 return 0; 572 return 0;
452 573
453 wait_for_completion(&barr.done); 574 wait_for_completion(&barr.done);
575 destroy_work_on_stack(&barr.work);
454 return 1; 576 return 1;
455} 577}
456EXPORT_SYMBOL_GPL(flush_work); 578EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
485 */ 607 */
486 smp_rmb(); 608 smp_rmb();
487 if (cwq == get_wq_data(work)) { 609 if (cwq == get_wq_data(work)) {
610 debug_work_deactivate(work);
488 list_del_init(&work->entry); 611 list_del_init(&work->entry);
489 ret = 1; 612 ret = 1;
490 } 613 }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
507 } 630 }
508 spin_unlock_irq(&cwq->lock); 631 spin_unlock_irq(&cwq->lock);
509 632
510 if (unlikely(running)) 633 if (unlikely(running)) {
511 wait_for_completion(&barr.done); 634 wait_for_completion(&barr.done);
635 destroy_work_on_stack(&barr.work);
636 }
512} 637}
513 638
514static void wait_on_work(struct work_struct *work) 639static void wait_on_work(struct work_struct *work)
@@ -649,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork)
649{ 774{
650 if (del_timer_sync(&dwork->timer)) { 775 if (del_timer_sync(&dwork->timer)) {
651 struct cpu_workqueue_struct *cwq; 776 struct cpu_workqueue_struct *cwq;
652 cwq = wq_per_cpu(keventd_wq, get_cpu()); 777 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
653 __queue_work(cwq, &dwork->work); 778 __queue_work(cwq, &dwork->work);
654 put_cpu(); 779 put_cpu();
655 } 780 }