Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 202
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/acct.c | 3
-rw-r--r--  kernel/capability.c | 15
-rw-r--r--  kernel/cpu.c | 23
-rw-r--r--  kernel/cpuset.c | 45
-rw-r--r--  kernel/exit.c | 28
-rw-r--r--  kernel/fork.c | 23
-rw-r--r--  kernel/futex.c | 60
-rw-r--r--  kernel/hrtimer.c | 170
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/hw_breakpoint.c | 4
-rw-r--r--  kernel/irq/autoprobe.c | 20
-rw-r--r--  kernel/irq/chip.c | 92
-rw-r--r--  kernel/irq/handle.c | 22
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/irq/manage.c | 52
-rw-r--r--  kernel/irq/migration.c | 2
-rw-r--r--  kernel/irq/numa_migrate.c | 8
-rw-r--r--  kernel/irq/pm.c | 8
-rw-r--r--  kernel/irq/proc.c | 44
-rw-r--r--  kernel/irq/spurious.c | 30
-rw-r--r--  kernel/itimer.c | 7
-rw-r--r--  kernel/kexec.c | 59
-rw-r--r--  kernel/kgdb.c | 58
-rw-r--r--  kernel/kmod.c | 8
-rw-r--r--  kernel/kprobes.c | 4
-rw-r--r--  kernel/ksysfs.c | 21
-rw-r--r--  kernel/lockdep.c | 47
-rw-r--r--  kernel/module.c | 183
-rw-r--r--  kernel/mutex-debug.h | 12
-rw-r--r--  kernel/mutex.c | 4
-rw-r--r--  kernel/panic.c | 3
-rw-r--r--  kernel/params.c | 8
-rw-r--r--  kernel/perf_event.c | 110
-rw-r--r--  kernel/pid.c | 12
-rw-r--r--  kernel/pm_qos_params.c | 20
-rw-r--r--  kernel/posix-cpu-timers.c | 5
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/console.c | 7
-rw-r--r--  kernel/power/hibernate.c | 30
-rw-r--r--  kernel/power/main.c | 1
-rw-r--r--  kernel/power/process.c | 14
-rw-r--r--  kernel/power/swap.c | 107
-rw-r--r--  kernel/power/swsusp.c | 130
-rw-r--r--  kernel/printk.c | 126
-rw-r--r--  kernel/rcupdate.c | 122
-rw-r--r--  kernel/rcutiny.c | 282
-rw-r--r--  kernel/rcutorture.c | 73
-rw-r--r--  kernel/rcutree.c | 465
-rw-r--r--  kernel/rcutree.h | 69
-rw-r--r--  kernel/rcutree_plugin.h | 309
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/resource.c | 26
-rw-r--r--  kernel/rtmutex-debug.c | 4
-rw-r--r--  kernel/rtmutex.c | 106
-rw-r--r--  kernel/sched.c | 697
-rw-r--r--  kernel/sched_cpupri.c | 10
-rw-r--r--  kernel/sched_cpupri.h | 2
-rw-r--r--  kernel/sched_debug.c | 21
-rw-r--r--  kernel/sched_fair.c | 220
-rw-r--r--  kernel/sched_features.h | 5
-rw-r--r--  kernel/sched_idletask.c | 6
-rw-r--r--  kernel/sched_rt.c | 123
-rw-r--r--  kernel/signal.c | 84
-rw-r--r--  kernel/slow-work-debugfs.c | 227
-rw-r--r--  kernel/slow-work.c | 519
-rw-r--r--  kernel/slow-work.h | 72
-rw-r--r--  kernel/smp.c | 91
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/softlockup.c | 54
-rw-r--r--  kernel/spinlock.c | 448
-rw-r--r--  kernel/srcu.c | 74
-rw-r--r--  kernel/sys.c | 43
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 932
-rw-r--r--  kernel/sysctl_binary.c | 1507
-rw-r--r--  kernel/sysctl_check.c | 1376
-rw-r--r--  kernel/time.c | 31
-rw-r--r--  kernel/time/clockevents.c | 27
-rw-r--r--  kernel/time/clocksource.c | 105
-rw-r--r--  kernel/time/tick-broadcast.c | 42
-rw-r--r--  kernel/time/tick-common.c | 20
-rw-r--r--  kernel/time/tick-internal.h | 1
-rw-r--r--  kernel/time/tick-oneshot.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 141
-rw-r--r--  kernel/time/timecompare.c | 8
-rw-r--r--  kernel/time/timekeeping.c | 125
-rw-r--r--  kernel/time/timer_list.c | 21
-rw-r--r--  kernel/time/timer_stats.c | 18
-rw-r--r--  kernel/trace/ftrace.c | 41
-rw-r--r--  kernel/trace/power-traces.c | 2
-rw-r--r--  kernel/trace/ring_buffer.c | 54
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 85
-rw-r--r--  kernel/trace/trace.c | 312
-rw-r--r--  kernel/trace/trace.h | 27
-rw-r--r--  kernel/trace/trace_clock.c | 16
-rw-r--r--  kernel/trace/trace_event_profile.c | 6
-rw-r--r--  kernel/trace/trace_events.c | 41
-rw-r--r--  kernel/trace/trace_export.c | 8
-rw-r--r--  kernel/trace/trace_functions_graph.c | 169
-rw-r--r--  kernel/trace/trace_hw_branches.c | 51
-rw-r--r--  kernel/trace/trace_irqsoff.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 9
-rw-r--r--  kernel/trace/trace_ksym.c | 56
-rw-r--r--  kernel/trace/trace_output.c | 75
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 16
-rw-r--r--  kernel/trace/trace_selftest.c | 4
-rw-r--r--  kernel/trace/trace_stack.c | 16
-rw-r--r--  kernel/trace/trace_syscalls.c | 18
-rw-r--r--  kernel/user-return-notifier.c | 44
-rw-r--r--  kernel/utsname_sysctl.c | 31
-rw-r--r--  kernel/workqueue.c | 159
114 files changed, 6845 insertions, 4768 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000000..88c92fb44618
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,202 @@
1#
2# The ARCH_INLINE foo is necessary because select ignores "depends on"
3#
4config ARCH_INLINE_SPIN_TRYLOCK
5 bool
6
7config ARCH_INLINE_SPIN_TRYLOCK_BH
8 bool
9
10config ARCH_INLINE_SPIN_LOCK
11 bool
12
13config ARCH_INLINE_SPIN_LOCK_BH
14 bool
15
16config ARCH_INLINE_SPIN_LOCK_IRQ
17 bool
18
19config ARCH_INLINE_SPIN_LOCK_IRQSAVE
20 bool
21
22config ARCH_INLINE_SPIN_UNLOCK
23 bool
24
25config ARCH_INLINE_SPIN_UNLOCK_BH
26 bool
27
28config ARCH_INLINE_SPIN_UNLOCK_IRQ
29 bool
30
31config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
32 bool
33
34
35config ARCH_INLINE_READ_TRYLOCK
36 bool
37
38config ARCH_INLINE_READ_LOCK
39 bool
40
41config ARCH_INLINE_READ_LOCK_BH
42 bool
43
44config ARCH_INLINE_READ_LOCK_IRQ
45 bool
46
47config ARCH_INLINE_READ_LOCK_IRQSAVE
48 bool
49
50config ARCH_INLINE_READ_UNLOCK
51 bool
52
53config ARCH_INLINE_READ_UNLOCK_BH
54 bool
55
56config ARCH_INLINE_READ_UNLOCK_IRQ
57 bool
58
59config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
60 bool
61
62
63config ARCH_INLINE_WRITE_TRYLOCK
64 bool
65
66config ARCH_INLINE_WRITE_LOCK
67 bool
68
69config ARCH_INLINE_WRITE_LOCK_BH
70 bool
71
72config ARCH_INLINE_WRITE_LOCK_IRQ
73 bool
74
75config ARCH_INLINE_WRITE_LOCK_IRQSAVE
76 bool
77
78config ARCH_INLINE_WRITE_UNLOCK
79 bool
80
81config ARCH_INLINE_WRITE_UNLOCK_BH
82 bool
83
84config ARCH_INLINE_WRITE_UNLOCK_IRQ
85 bool
86
87config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
88 bool
89
90#
91# lock_* functions are inlined when:
92# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
93#
94# trylock_* functions are inlined when:
95# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
96#
97# unlock and unlock_irq functions are inlined when:
98# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
99# or
100# - DEBUG_SPINLOCK=n and PREEMPT=n
101#
102# unlock_bh and unlock_irqrestore functions are inlined when:
103# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
104#
105
106config INLINE_SPIN_TRYLOCK
107 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
108
109config INLINE_SPIN_TRYLOCK_BH
110 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
111
112config INLINE_SPIN_LOCK
113 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
114
115config INLINE_SPIN_LOCK_BH
116 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
117 ARCH_INLINE_SPIN_LOCK_BH
118
119config INLINE_SPIN_LOCK_IRQ
120 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
121 ARCH_INLINE_SPIN_LOCK_IRQ
122
123config INLINE_SPIN_LOCK_IRQSAVE
124 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
125 ARCH_INLINE_SPIN_LOCK_IRQSAVE
126
127config INLINE_SPIN_UNLOCK
128 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
129
130config INLINE_SPIN_UNLOCK_BH
131 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
132
133config INLINE_SPIN_UNLOCK_IRQ
134 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
135
136config INLINE_SPIN_UNLOCK_IRQRESTORE
137 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
138
139
140config INLINE_READ_TRYLOCK
141 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
142
143config INLINE_READ_LOCK
144 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
145
146config INLINE_READ_LOCK_BH
147 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
148 ARCH_INLINE_READ_LOCK_BH
149
150config INLINE_READ_LOCK_IRQ
151 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
152 ARCH_INLINE_READ_LOCK_IRQ
153
154config INLINE_READ_LOCK_IRQSAVE
155 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
156 ARCH_INLINE_READ_LOCK_IRQSAVE
157
158config INLINE_READ_UNLOCK
159 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
160
161config INLINE_READ_UNLOCK_BH
162 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
163
164config INLINE_READ_UNLOCK_IRQ
165 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
166
167config INLINE_READ_UNLOCK_IRQRESTORE
168 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
169
170
171config INLINE_WRITE_TRYLOCK
172 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
173
174config INLINE_WRITE_LOCK
175 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
176
177config INLINE_WRITE_LOCK_BH
178 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
179 ARCH_INLINE_WRITE_LOCK_BH
180
181config INLINE_WRITE_LOCK_IRQ
182 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
183 ARCH_INLINE_WRITE_LOCK_IRQ
184
185config INLINE_WRITE_LOCK_IRQSAVE
186 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
187 ARCH_INLINE_WRITE_LOCK_IRQSAVE
188
189config INLINE_WRITE_UNLOCK
190 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
191
192config INLINE_WRITE_UNLOCK_BH
193 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
194
195config INLINE_WRITE_UNLOCK_IRQ
196 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
197
198config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200
201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
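The Kconfig block above only decides whether each lock operation is emitted inline at the call site or as a single out-of-line function; the INLINE_* symbols simply AND an architecture's ARCH_INLINE_* opt-in with the debugging and preemption constraints listed in the comment. As a rough illustration, here is a minimal userspace C sketch of the pattern such a switch chooses between (toy names, not the kernel's real spinlock headers):

/* toy_spinlock.c - hedged sketch, not kernel code */
#include <stdio.h>

/* Stand-in for the generated CONFIG_INLINE_SPIN_LOCK from Kconfig.locks. */
#define CONFIG_INLINE_SPIN_LOCK 1

struct toy_spinlock { volatile int locked; };

/* The real body of the lock operation. */
static inline void __toy_spin_lock(struct toy_spinlock *l)
{
	while (__sync_lock_test_and_set(&l->locked, 1))
		;	/* spin until the previous holder releases */
}

#ifdef CONFIG_INLINE_SPIN_LOCK
/* Inlined at every call site: no call overhead, but duplicated code. */
#define toy_spin_lock(l) __toy_spin_lock(l)
#else
/* One out-of-line copy: a call instruction per site, smaller image. */
void toy_spin_lock(struct toy_spinlock *l)
{
	__toy_spin_lock(l);
}
#endif

int main(void)
{
	struct toy_spinlock l = { 0 };

	toy_spin_lock(&l);
	printf("locked = %d\n", l.locked);
	return 0;
}
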
diff --git a/kernel/Makefile b/kernel/Makefile
index 6b7ce8173dfd..864ff75d65f2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
@@ -83,6 +83,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
83obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
86obj-$(CONFIG_TINY_RCU) += rcutiny.o
86obj-$(CONFIG_RELAY) += relay.o 87obj-$(CONFIG_RELAY) += relay.o
87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 88obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 89obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -95,8 +96,10 @@ obj-$(CONFIG_X86_DS) += trace/
95obj-$(CONFIG_RING_BUFFER) += trace/ 96obj-$(CONFIG_RING_BUFFER) += trace/
96obj-$(CONFIG_SMP) += sched_cpupri.o 97obj-$(CONFIG_SMP) += sched_cpupri.o
97obj-$(CONFIG_SLOW_WORK) += slow-work.o 98obj-$(CONFIG_SLOW_WORK) += slow-work.o
99obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
98obj-$(CONFIG_PERF_EVENTS) += perf_event.o 100obj-$(CONFIG_PERF_EVENTS) += perf_event.o
99obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
100 103
101ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
102# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 9a4715a2f6bf..a6605ca921b6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -536,7 +536,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
536 do_div(elapsed, AHZ); 536 do_div(elapsed, AHZ);
537 ac.ac_btime = get_seconds() - elapsed; 537 ac.ac_btime = get_seconds() - elapsed;
538 /* we really need to bite the bullet and change layout */ 538 /* we really need to bite the bullet and change layout */
539 current_uid_gid(&ac.ac_uid, &ac.ac_gid); 539 ac.ac_uid = orig_cred->uid;
540 ac.ac_gid = orig_cred->gid;
540#if ACCT_VERSION==2 541#if ACCT_VERSION==2
541 ac.ac_ahz = AHZ; 542 ac.ac_ahz = AHZ;
542#endif 543#endif
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f5..7f876e60521f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
29EXPORT_SYMBOL(__cap_full_set); 29EXPORT_SYMBOL(__cap_full_set);
30EXPORT_SYMBOL(__cap_init_eff_set); 30EXPORT_SYMBOL(__cap_init_eff_set);
31 31
32#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
33int file_caps_enabled = 1; 32int file_caps_enabled = 1;
34 33
35static int __init file_caps_disable(char *str) 34static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
38 return 1; 37 return 1;
39} 38}
40__setup("no_file_caps", file_caps_disable); 39__setup("no_file_caps", file_caps_disable);
41#endif
42 40
43/* 41/*
44 * More recent versions of libcap are available from: 42 * More recent versions of libcap are available from:
@@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
169 kernel_cap_t pE, pI, pP; 167 kernel_cap_t pE, pI, pP;
170 168
171 ret = cap_validate_magic(header, &tocopy); 169 ret = cap_validate_magic(header, &tocopy);
172 if (ret != 0) 170 if ((dataptr == NULL) || (ret != 0))
173 return ret; 171 return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
174 172
175 if (get_user(pid, &header->pid)) 173 if (get_user(pid, &header->pid))
176 return -EFAULT; 174 return -EFAULT;
@@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
238SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) 236SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
239{ 237{
240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 238 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
241 unsigned i, tocopy; 239 unsigned i, tocopy, copybytes;
242 kernel_cap_t inheritable, permitted, effective; 240 kernel_cap_t inheritable, permitted, effective;
243 struct cred *new; 241 struct cred *new;
244 int ret; 242 int ret;
@@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
255 if (pid != 0 && pid != task_pid_vnr(current)) 253 if (pid != 0 && pid != task_pid_vnr(current))
256 return -EPERM; 254 return -EPERM;
257 255
258 if (copy_from_user(&kdata, data, 256 copybytes = tocopy * sizeof(struct __user_cap_data_struct);
259 tocopy * sizeof(struct __user_cap_data_struct))) 257 if (copybytes > sizeof(kdata))
258 return -EFAULT;
259
260 if (copy_from_user(&kdata, data, copybytes))
260 return -EFAULT; 261 return -EFAULT;
261 262
262 for (i = 0; i < tocopy; i++) { 263 for (i = 0; i < tocopy; i++) {
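The capset() hunk above adds a "copybytes > sizeof(kdata)" check so that a caller-derived element count can never drive copy_from_user() past the on-stack buffer. A minimal userspace sketch of that shape, with hypothetical names standing in for the syscall internals:

/* bounded_copy.c - hedged sketch of the capset() bounds check */
#include <stdio.h>
#include <string.h>
#include <errno.h>

struct cap_data { unsigned int effective, permitted, inheritable; };

static int copy_caps(struct cap_data *kdata, size_t kdata_elems,
		     const struct cap_data *user, size_t tocopy)
{
	size_t copybytes = tocopy * sizeof(*user);

	/* Reject before copying: tocopy comes from an untrusted header. */
	if (copybytes > kdata_elems * sizeof(*kdata))
		return -EFAULT;

	memcpy(kdata, user, copybytes);	/* stand-in for copy_from_user() */
	return 0;
}

int main(void)
{
	struct cap_data kdata[2];
	struct cap_data user[3] = { { 1, 1, 1 }, { 2, 2, 2 }, { 3, 3, 3 } };

	printf("tocopy=2: %d\n", copy_caps(kdata, 2, user, 2));	/* 0       */
	printf("tocopy=3: %d\n", copy_caps(kdata, 2, user, 3));	/* -EFAULT */
	return 0;
}
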
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb212..291ac586f37f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 213 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 214 if (err == NOTIFY_BAD) {
215 set_cpu_active(cpu, true);
216
215 nr_calls--; 217 nr_calls--;
216 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 218 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
217 hcpu, nr_calls, NULL); 219 hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 225
224 /* Ensure that we are not runnable on dying cpu */ 226 /* Ensure that we are not runnable on dying cpu */
225 cpumask_copy(old_allowed, &current->cpus_allowed); 227 cpumask_copy(old_allowed, &current->cpus_allowed);
226 set_cpus_allowed_ptr(current, 228 set_cpus_allowed_ptr(current, cpu_active_mask);
227 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
228 229
229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 230 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
230 if (err) { 231 if (err) {
232 set_cpu_active(cpu, true);
231 /* CPU didn't die: tell everyone. Can't complain. */ 233 /* CPU didn't die: tell everyone. Can't complain. */
232 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 234 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
233 hcpu) == NOTIFY_BAD) 235 hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)
292 294
293 err = _cpu_down(cpu, 0); 295 err = _cpu_down(cpu, 0);
294 296
295 if (cpu_online(cpu))
296 set_cpu_active(cpu, true);
297
298out: 297out:
299 cpu_maps_update_done(); 298 cpu_maps_update_done();
300 stop_machine_destroy(); 299 stop_machine_destroy();
@@ -387,15 +386,23 @@ int disable_nonboot_cpus(void)
387 * with the userspace trying to use the CPU hotplug at the same time 386 * with the userspace trying to use the CPU hotplug at the same time
388 */ 387 */
389 cpumask_clear(frozen_cpus); 388 cpumask_clear(frozen_cpus);
389
390 for_each_online_cpu(cpu) {
391 if (cpu == first_cpu)
392 continue;
393 set_cpu_active(cpu, false);
394 }
395
396 synchronize_sched();
397
390 printk("Disabling non-boot CPUs ...\n"); 398 printk("Disabling non-boot CPUs ...\n");
391 for_each_online_cpu(cpu) { 399 for_each_online_cpu(cpu) {
392 if (cpu == first_cpu) 400 if (cpu == first_cpu)
393 continue; 401 continue;
394 error = _cpu_down(cpu, 1); 402 error = _cpu_down(cpu, 1);
395 if (!error) { 403 if (!error)
396 cpumask_set_cpu(cpu, frozen_cpus); 404 cpumask_set_cpu(cpu, frozen_cpus);
397 printk("CPU%d is down\n", cpu); 405 else {
398 } else {
399 printk(KERN_ERR "Error taking CPU%d down: %d\n", 406 printk(KERN_ERR "Error taking CPU%d down: %d\n",
400 cpu, error); 407 cpu, error);
401 break; 408 break;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b5cb469d2545..ba401fab459f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
537 * element of the partition (one sched domain) to be passed to 537 * element of the partition (one sched domain) to be passed to
538 * partition_sched_domains(). 538 * partition_sched_domains().
539 */ 539 */
540/* FIXME: see the FIXME in partition_sched_domains() */ 540static int generate_sched_domains(cpumask_var_t **domains,
541static int generate_sched_domains(struct cpumask **domains,
542 struct sched_domain_attr **attributes) 541 struct sched_domain_attr **attributes)
543{ 542{
544 LIST_HEAD(q); /* queue of cpusets to be scanned */ 543 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
546 struct cpuset **csa; /* array of all cpuset ptrs */ 545 struct cpuset **csa; /* array of all cpuset ptrs */
547 int csn; /* how many cpuset ptrs in csa so far */ 546 int csn; /* how many cpuset ptrs in csa so far */
548 int i, j, k; /* indices for partition finding loops */ 547 int i, j, k; /* indices for partition finding loops */
549 struct cpumask *doms; /* resulting partition; i.e. sched domains */ 548 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
550 struct sched_domain_attr *dattr; /* attributes for custom domains */ 549 struct sched_domain_attr *dattr; /* attributes for custom domains */
551 int ndoms = 0; /* number of sched domains in result */ 550 int ndoms = 0; /* number of sched domains in result */
552 int nslot; /* next empty doms[] struct cpumask slot */ 551 int nslot; /* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
557 556
558 /* Special case for the 99% of systems with one, full, sched domain */ 557 /* Special case for the 99% of systems with one, full, sched domain */
559 if (is_sched_load_balance(&top_cpuset)) { 558 if (is_sched_load_balance(&top_cpuset)) {
560 doms = kmalloc(cpumask_size(), GFP_KERNEL); 559 ndoms = 1;
560 doms = alloc_sched_domains(ndoms);
561 if (!doms) 561 if (!doms)
562 goto done; 562 goto done;
563 563
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
566 *dattr = SD_ATTR_INIT; 566 *dattr = SD_ATTR_INIT;
567 update_domain_attr_tree(dattr, &top_cpuset); 567 update_domain_attr_tree(dattr, &top_cpuset);
568 } 568 }
569 cpumask_copy(doms, top_cpuset.cpus_allowed); 569 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
570 570
571 ndoms = 1;
572 goto done; 571 goto done;
573 } 572 }
574 573
@@ -636,7 +635,7 @@ restart:
636 * Now we know how many domains to create. 635 * Now we know how many domains to create.
637 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 636 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
638 */ 637 */
639 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); 638 doms = alloc_sched_domains(ndoms);
640 if (!doms) 639 if (!doms)
641 goto done; 640 goto done;
642 641
@@ -656,7 +655,7 @@ restart:
656 continue; 655 continue;
657 } 656 }
658 657
659 dp = doms + nslot; 658 dp = doms[nslot];
660 659
661 if (nslot == ndoms) { 660 if (nslot == ndoms) {
662 static int warnings = 10; 661 static int warnings = 10;
@@ -718,7 +717,7 @@ done:
718static void do_rebuild_sched_domains(struct work_struct *unused) 717static void do_rebuild_sched_domains(struct work_struct *unused)
719{ 718{
720 struct sched_domain_attr *attr; 719 struct sched_domain_attr *attr;
721 struct cpumask *doms; 720 cpumask_var_t *doms;
722 int ndoms; 721 int ndoms;
723 722
724 get_online_cpus(); 723 get_online_cpus();
@@ -738,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
738{ 737{
739} 738}
740 739
741static int generate_sched_domains(struct cpumask **domains, 740static int generate_sched_domains(cpumask_var_t **domains,
742 struct sched_domain_attr **attributes) 741 struct sched_domain_attr **attributes)
743{ 742{
744 *domains = NULL; 743 *domains = NULL;
@@ -873,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
873 if (retval < 0) 872 if (retval < 0)
874 return retval; 873 return retval;
875 874
876 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) 875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
877 return -EINVAL; 876 return -EINVAL;
878 } 877 }
879 retval = validate_change(cs, trialcs); 878 retval = validate_change(cs, trialcs);
@@ -2011,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2011 } 2010 }
2012 2011
2013 /* Continue past cpusets with all cpus, mems online */ 2012 /* Continue past cpusets with all cpus, mems online */
2014 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && 2013 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2015 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2016 continue; 2015 continue;
2017 2016
@@ -2020,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2020 /* Remove offline cpus and mems from this cpuset. */ 2019 /* Remove offline cpus and mems from this cpuset. */
2021 mutex_lock(&callback_mutex); 2020 mutex_lock(&callback_mutex);
2022 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2023 cpu_online_mask); 2022 cpu_active_mask);
2024 nodes_and(cp->mems_allowed, cp->mems_allowed, 2023 nodes_and(cp->mems_allowed, cp->mems_allowed,
2025 node_states[N_HIGH_MEMORY]); 2024 node_states[N_HIGH_MEMORY]);
2026 mutex_unlock(&callback_mutex); 2025 mutex_unlock(&callback_mutex);
@@ -2052,14 +2051,16 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2052 unsigned long phase, void *unused_cpu) 2051 unsigned long phase, void *unused_cpu)
2053{ 2052{
2054 struct sched_domain_attr *attr; 2053 struct sched_domain_attr *attr;
2055 struct cpumask *doms; 2054 cpumask_var_t *doms;
2056 int ndoms; 2055 int ndoms;
2057 2056
2058 switch (phase) { 2057 switch (phase) {
2059 case CPU_ONLINE: 2058 case CPU_ONLINE:
2060 case CPU_ONLINE_FROZEN: 2059 case CPU_ONLINE_FROZEN:
2061 case CPU_DEAD: 2060 case CPU_DOWN_PREPARE:
2062 case CPU_DEAD_FROZEN: 2061 case CPU_DOWN_PREPARE_FROZEN:
2062 case CPU_DOWN_FAILED:
2063 case CPU_DOWN_FAILED_FROZEN:
2063 break; 2064 break;
2064 2065
2065 default: 2066 default:
@@ -2068,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2068 2069
2069 cgroup_lock(); 2070 cgroup_lock();
2070 mutex_lock(&callback_mutex); 2071 mutex_lock(&callback_mutex);
2071 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2072 mutex_unlock(&callback_mutex); 2073 mutex_unlock(&callback_mutex);
2073 scan_for_empty_cpusets(&top_cpuset); 2074 scan_for_empty_cpusets(&top_cpuset);
2074 ndoms = generate_sched_domains(&doms, &attr); 2075 ndoms = generate_sched_domains(&doms, &attr);
@@ -2115,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2115 2116
2116void __init cpuset_init_smp(void) 2117void __init cpuset_init_smp(void)
2117{ 2118{
2118 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2119 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2119 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2120 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2120 2121
2121 hotcpu_notifier(cpuset_track_online_cpus, 0); 2122 hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2537,15 +2538,9 @@ const struct file_operations proc_cpuset_operations = {
2537}; 2538};
2538#endif /* CONFIG_PROC_PID_CPUSET */ 2539#endif /* CONFIG_PROC_PID_CPUSET */
2539 2540
2540/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2541/* Display task mems_allowed in /proc/<pid>/status file. */
2541void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2542void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2542{ 2543{
2543 seq_printf(m, "Cpus_allowed:\t");
2544 seq_cpumask(m, &task->cpus_allowed);
2545 seq_printf(m, "\n");
2546 seq_printf(m, "Cpus_allowed_list:\t");
2547 seq_cpumask_list(m, &task->cpus_allowed);
2548 seq_printf(m, "\n");
2549 seq_printf(m, "Mems_allowed:\t"); 2544 seq_printf(m, "Mems_allowed:\t");
2550 seq_nodemask(m, &task->mems_allowed); 2545 seq_nodemask(m, &task->mems_allowed);
2551 seq_printf(m, "\n"); 2546 seq_printf(m, "\n");
diff --git a/kernel/exit.c b/kernel/exit.c
index 3f45e3cf931d..5962d7ccf243 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -111,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk)
111 * We won't ever get here for the group leader, since it 111 * We won't ever get here for the group leader, since it
112 * will have been the last reference on the signal_struct. 112 * will have been the last reference on the signal_struct.
113 */ 113 */
114 sig->utime = cputime_add(sig->utime, task_utime(tsk)); 114 sig->utime = cputime_add(sig->utime, tsk->utime);
115 sig->stime = cputime_add(sig->stime, task_stime(tsk)); 115 sig->stime = cputime_add(sig->stime, tsk->stime);
116 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 116 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
117 sig->min_flt += tsk->min_flt; 117 sig->min_flt += tsk->min_flt;
118 sig->maj_flt += tsk->maj_flt; 118 sig->maj_flt += tsk->maj_flt;
119 sig->nvcsw += tsk->nvcsw; 119 sig->nvcsw += tsk->nvcsw;
@@ -933,7 +933,7 @@ NORET_TYPE void do_exit(long code)
933 * an exiting task cleaning up the robust pi futexes. 933 * an exiting task cleaning up the robust pi futexes.
934 */ 934 */
935 smp_mb(); 935 smp_mb();
936 spin_unlock_wait(&tsk->pi_lock); 936 raw_spin_unlock_wait(&tsk->pi_lock);
937 937
938 if (unlikely(in_atomic())) 938 if (unlikely(in_atomic()))
939 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 939 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -971,7 +971,7 @@ NORET_TYPE void do_exit(long code)
971 exit_thread(); 971 exit_thread();
972 cgroup_exit(tsk, 1); 972 cgroup_exit(tsk, 1);
973 973
974 if (group_dead && tsk->signal->leader) 974 if (group_dead)
975 disassociate_ctty(1); 975 disassociate_ctty(1);
976 976
977 module_put(task_thread_info(tsk)->exec_domain->module); 977 module_put(task_thread_info(tsk)->exec_domain->module);
@@ -1009,7 +1009,7 @@ NORET_TYPE void do_exit(long code)
1009 tsk->flags |= PF_EXITPIDONE; 1009 tsk->flags |= PF_EXITPIDONE;
1010 1010
1011 if (tsk->io_context) 1011 if (tsk->io_context)
1012 exit_io_context(); 1012 exit_io_context(tsk);
1013 1013
1014 if (tsk->splice_pipe) 1014 if (tsk->splice_pipe)
1015 __free_pipe_info(tsk->splice_pipe); 1015 __free_pipe_info(tsk->splice_pipe);
@@ -1210,6 +1210,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1210 struct signal_struct *psig; 1210 struct signal_struct *psig;
1211 struct signal_struct *sig; 1211 struct signal_struct *sig;
1212 unsigned long maxrss; 1212 unsigned long maxrss;
1213 cputime_t tgutime, tgstime;
1213 1214
1214 /* 1215 /*
1215 * The resource counters for the group leader are in its 1216 * The resource counters for the group leader are in its
@@ -1225,20 +1226,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1225 * need to protect the access to parent->signal fields, 1226 * need to protect the access to parent->signal fields,
1226 * as other threads in the parent group can be right 1227 * as other threads in the parent group can be right
1227 * here reaping other children at the same time. 1228 * here reaping other children at the same time.
1229 *
1230 * We use thread_group_times() to get times for the thread
1231 * group, which consolidates times for all threads in the
1232 * group including the group leader.
1228 */ 1233 */
1234 thread_group_times(p, &tgutime, &tgstime);
1229 spin_lock_irq(&p->real_parent->sighand->siglock); 1235 spin_lock_irq(&p->real_parent->sighand->siglock);
1230 psig = p->real_parent->signal; 1236 psig = p->real_parent->signal;
1231 sig = p->signal; 1237 sig = p->signal;
1232 psig->cutime = 1238 psig->cutime =
1233 cputime_add(psig->cutime, 1239 cputime_add(psig->cutime,
1234 cputime_add(p->utime, 1240 cputime_add(tgutime,
1235 cputime_add(sig->utime, 1241 sig->cutime));
1236 sig->cutime)));
1237 psig->cstime = 1242 psig->cstime =
1238 cputime_add(psig->cstime, 1243 cputime_add(psig->cstime,
1239 cputime_add(p->stime, 1244 cputime_add(tgstime,
1240 cputime_add(sig->stime, 1245 sig->cstime));
1241 sig->cstime)));
1242 psig->cgtime = 1246 psig->cgtime =
1243 cputime_add(psig->cgtime, 1247 cputime_add(psig->cgtime,
1244 cputime_add(p->gtime, 1248 cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c49257c..202a0ba63d3c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,7 @@
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h>
67 68
68#include <asm/pgtable.h> 69#include <asm/pgtable.h>
69#include <asm/pgalloc.h> 70#include <asm/pgalloc.h>
@@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
249 goto out; 250 goto out;
250 251
251 setup_thread_stack(tsk, orig); 252 setup_thread_stack(tsk, orig);
253 clear_user_return_notifier(tsk);
252 stackend = end_of_stack(tsk); 254 stackend = end_of_stack(tsk);
253 *stackend = STACK_END_MAGIC; /* for overflow detection */ 255 *stackend = STACK_END_MAGIC; /* for overflow detection */
254 256
@@ -884,6 +886,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
884 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 886 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
885 sig->gtime = cputime_zero; 887 sig->gtime = cputime_zero;
886 sig->cgtime = cputime_zero; 888 sig->cgtime = cputime_zero;
889#ifndef CONFIG_VIRT_CPU_ACCOUNTING
890 sig->prev_utime = sig->prev_stime = cputime_zero;
891#endif
887 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 892 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
888 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 893 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
889 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 894 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -934,9 +939,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
934 939
935static void rt_mutex_init_task(struct task_struct *p) 940static void rt_mutex_init_task(struct task_struct *p)
936{ 941{
937 spin_lock_init(&p->pi_lock); 942 raw_spin_lock_init(&p->pi_lock);
938#ifdef CONFIG_RT_MUTEXES 943#ifdef CONFIG_RT_MUTEXES
939 plist_head_init(&p->pi_waiters, &p->pi_lock); 944 plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
940 p->pi_blocked_on = NULL; 945 p->pi_blocked_on = NULL;
941#endif 946#endif
942} 947}
@@ -1066,8 +1071,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1066 p->gtime = cputime_zero; 1071 p->gtime = cputime_zero;
1067 p->utimescaled = cputime_zero; 1072 p->utimescaled = cputime_zero;
1068 p->stimescaled = cputime_zero; 1073 p->stimescaled = cputime_zero;
1074#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1069 p->prev_utime = cputime_zero; 1075 p->prev_utime = cputime_zero;
1070 p->prev_stime = cputime_zero; 1076 p->prev_stime = cputime_zero;
1077#endif
1071 1078
1072 p->default_timer_slack_ns = current->timer_slack_ns; 1079 p->default_timer_slack_ns = current->timer_slack_ns;
1073 1080
@@ -1120,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1120#ifdef CONFIG_DEBUG_MUTEXES 1127#ifdef CONFIG_DEBUG_MUTEXES
1121 p->blocked_on = NULL; /* not blocked yet */ 1128 p->blocked_on = NULL; /* not blocked yet */
1122#endif 1129#endif
1130#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1131 p->memcg_batch.do_batch = 0;
1132 p->memcg_batch.memcg = NULL;
1133#endif
1123 1134
1124 p->bts = NULL; 1135 p->bts = NULL;
1125 1136
@@ -1199,9 +1210,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1199 p->sas_ss_sp = p->sas_ss_size = 0; 1210 p->sas_ss_sp = p->sas_ss_size = 0;
1200 1211
1201 /* 1212 /*
1202 * Syscall tracing should be turned off in the child regardless 1213 * Syscall tracing and stepping should be turned off in the
1203 * of CLONE_PTRACE. 1214 * child regardless of CLONE_PTRACE.
1204 */ 1215 */
1216 user_disable_single_step(p);
1205 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1217 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1206#ifdef TIF_SYSCALL_EMU 1218#ifdef TIF_SYSCALL_EMU
1207 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1219 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
@@ -1310,7 +1322,8 @@ bad_fork_free_pid:
1310 if (pid != &init_struct_pid) 1322 if (pid != &init_struct_pid)
1311 free_pid(pid); 1323 free_pid(pid);
1312bad_fork_cleanup_io: 1324bad_fork_cleanup_io:
1313 put_io_context(p->io_context); 1325 if (p->io_context)
1326 exit_io_context(p);
1314bad_fork_cleanup_namespaces: 1327bad_fork_cleanup_namespaces:
1315 exit_task_namespaces(p); 1328 exit_task_namespaces(p);
1316bad_fork_cleanup_mm: 1329bad_fork_cleanup_mm:
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc41..8e3c3ffe1b9a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -304,8 +304,14 @@ void put_futex_key(int fshared, union futex_key *key)
304 */ 304 */
305static int fault_in_user_writeable(u32 __user *uaddr) 305static int fault_in_user_writeable(u32 __user *uaddr)
306{ 306{
307 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, 307 struct mm_struct *mm = current->mm;
308 1, 1, 0, NULL, NULL); 308 int ret;
309
310 down_read(&mm->mmap_sem);
311 ret = get_user_pages(current, mm, (unsigned long)uaddr,
312 1, 1, 0, NULL, NULL);
313 up_read(&mm->mmap_sem);
314
309 return ret < 0 ? ret : 0; 315 return ret < 0 ? ret : 0;
310} 316}
311 317
@@ -397,9 +403,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
397 * and has cleaned up the pi_state already 403 * and has cleaned up the pi_state already
398 */ 404 */
399 if (pi_state->owner) { 405 if (pi_state->owner) {
400 spin_lock_irq(&pi_state->owner->pi_lock); 406 raw_spin_lock_irq(&pi_state->owner->pi_lock);
401 list_del_init(&pi_state->list); 407 list_del_init(&pi_state->list);
402 spin_unlock_irq(&pi_state->owner->pi_lock); 408 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
403 409
404 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 410 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
405 } 411 }
@@ -464,18 +470,18 @@ void exit_pi_state_list(struct task_struct *curr)
464 * pi_state_list anymore, but we have to be careful 470 * pi_state_list anymore, but we have to be careful
465 * versus waiters unqueueing themselves: 471 * versus waiters unqueueing themselves:
466 */ 472 */
467 spin_lock_irq(&curr->pi_lock); 473 raw_spin_lock_irq(&curr->pi_lock);
468 while (!list_empty(head)) { 474 while (!list_empty(head)) {
469 475
470 next = head->next; 476 next = head->next;
471 pi_state = list_entry(next, struct futex_pi_state, list); 477 pi_state = list_entry(next, struct futex_pi_state, list);
472 key = pi_state->key; 478 key = pi_state->key;
473 hb = hash_futex(&key); 479 hb = hash_futex(&key);
474 spin_unlock_irq(&curr->pi_lock); 480 raw_spin_unlock_irq(&curr->pi_lock);
475 481
476 spin_lock(&hb->lock); 482 spin_lock(&hb->lock);
477 483
478 spin_lock_irq(&curr->pi_lock); 484 raw_spin_lock_irq(&curr->pi_lock);
479 /* 485 /*
480 * We dropped the pi-lock, so re-check whether this 486 * We dropped the pi-lock, so re-check whether this
481 * task still owns the PI-state: 487 * task still owns the PI-state:
@@ -489,15 +495,15 @@ void exit_pi_state_list(struct task_struct *curr)
489 WARN_ON(list_empty(&pi_state->list)); 495 WARN_ON(list_empty(&pi_state->list));
490 list_del_init(&pi_state->list); 496 list_del_init(&pi_state->list);
491 pi_state->owner = NULL; 497 pi_state->owner = NULL;
492 spin_unlock_irq(&curr->pi_lock); 498 raw_spin_unlock_irq(&curr->pi_lock);
493 499
494 rt_mutex_unlock(&pi_state->pi_mutex); 500 rt_mutex_unlock(&pi_state->pi_mutex);
495 501
496 spin_unlock(&hb->lock); 502 spin_unlock(&hb->lock);
497 503
498 spin_lock_irq(&curr->pi_lock); 504 raw_spin_lock_irq(&curr->pi_lock);
499 } 505 }
500 spin_unlock_irq(&curr->pi_lock); 506 raw_spin_unlock_irq(&curr->pi_lock);
501} 507}
502 508
503static int 509static int
@@ -552,7 +558,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
552 * change of the task flags, we do this protected by 558 * change of the task flags, we do this protected by
553 * p->pi_lock: 559 * p->pi_lock:
554 */ 560 */
555 spin_lock_irq(&p->pi_lock); 561 raw_spin_lock_irq(&p->pi_lock);
556 if (unlikely(p->flags & PF_EXITING)) { 562 if (unlikely(p->flags & PF_EXITING)) {
557 /* 563 /*
558 * The task is on the way out. When PF_EXITPIDONE is 564 * The task is on the way out. When PF_EXITPIDONE is
@@ -561,7 +567,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
561 */ 567 */
562 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 568 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
563 569
564 spin_unlock_irq(&p->pi_lock); 570 raw_spin_unlock_irq(&p->pi_lock);
565 put_task_struct(p); 571 put_task_struct(p);
566 return ret; 572 return ret;
567 } 573 }
@@ -580,7 +586,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
580 WARN_ON(!list_empty(&pi_state->list)); 586 WARN_ON(!list_empty(&pi_state->list));
581 list_add(&pi_state->list, &p->pi_state_list); 587 list_add(&pi_state->list, &p->pi_state_list);
582 pi_state->owner = p; 588 pi_state->owner = p;
583 spin_unlock_irq(&p->pi_lock); 589 raw_spin_unlock_irq(&p->pi_lock);
584 590
585 put_task_struct(p); 591 put_task_struct(p);
586 592
@@ -754,7 +760,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
754 if (!pi_state) 760 if (!pi_state)
755 return -EINVAL; 761 return -EINVAL;
756 762
757 spin_lock(&pi_state->pi_mutex.wait_lock); 763 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
758 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 764 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
759 765
760 /* 766 /*
@@ -783,23 +789,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
783 else if (curval != uval) 789 else if (curval != uval)
784 ret = -EINVAL; 790 ret = -EINVAL;
785 if (ret) { 791 if (ret) {
786 spin_unlock(&pi_state->pi_mutex.wait_lock); 792 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
787 return ret; 793 return ret;
788 } 794 }
789 } 795 }
790 796
791 spin_lock_irq(&pi_state->owner->pi_lock); 797 raw_spin_lock_irq(&pi_state->owner->pi_lock);
792 WARN_ON(list_empty(&pi_state->list)); 798 WARN_ON(list_empty(&pi_state->list));
793 list_del_init(&pi_state->list); 799 list_del_init(&pi_state->list);
794 spin_unlock_irq(&pi_state->owner->pi_lock); 800 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
795 801
796 spin_lock_irq(&new_owner->pi_lock); 802 raw_spin_lock_irq(&new_owner->pi_lock);
797 WARN_ON(!list_empty(&pi_state->list)); 803 WARN_ON(!list_empty(&pi_state->list));
798 list_add(&pi_state->list, &new_owner->pi_state_list); 804 list_add(&pi_state->list, &new_owner->pi_state_list);
799 pi_state->owner = new_owner; 805 pi_state->owner = new_owner;
800 spin_unlock_irq(&new_owner->pi_lock); 806 raw_spin_unlock_irq(&new_owner->pi_lock);
801 807
802 spin_unlock(&pi_state->pi_mutex.wait_lock); 808 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
803 rt_mutex_unlock(&pi_state->pi_mutex); 809 rt_mutex_unlock(&pi_state->pi_mutex);
804 810
805 return 0; 811 return 0;
@@ -1004,7 +1010,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1004 plist_add(&q->list, &hb2->chain); 1010 plist_add(&q->list, &hb2->chain);
1005 q->lock_ptr = &hb2->lock; 1011 q->lock_ptr = &hb2->lock;
1006#ifdef CONFIG_DEBUG_PI_LIST 1012#ifdef CONFIG_DEBUG_PI_LIST
1007 q->list.plist.lock = &hb2->lock; 1013 q->list.plist.spinlock = &hb2->lock;
1008#endif 1014#endif
1009 } 1015 }
1010 get_futex_key_refs(key2); 1016 get_futex_key_refs(key2);
@@ -1040,7 +1046,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1040 1046
1041 q->lock_ptr = &hb->lock; 1047 q->lock_ptr = &hb->lock;
1042#ifdef CONFIG_DEBUG_PI_LIST 1048#ifdef CONFIG_DEBUG_PI_LIST
1043 q->list.plist.lock = &hb->lock; 1049 q->list.plist.spinlock = &hb->lock;
1044#endif 1050#endif
1045 1051
1046 wake_up_state(q->task, TASK_NORMAL); 1052 wake_up_state(q->task, TASK_NORMAL);
@@ -1388,7 +1394,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1388 1394
1389 plist_node_init(&q->list, prio); 1395 plist_node_init(&q->list, prio);
1390#ifdef CONFIG_DEBUG_PI_LIST 1396#ifdef CONFIG_DEBUG_PI_LIST
1391 q->list.plist.lock = &hb->lock; 1397 q->list.plist.spinlock = &hb->lock;
1392#endif 1398#endif
1393 plist_add(&q->list, &hb->chain); 1399 plist_add(&q->list, &hb->chain);
1394 q->task = current; 1400 q->task = current;
@@ -1523,18 +1529,18 @@ retry:
1523 * itself. 1529 * itself.
1524 */ 1530 */
1525 if (pi_state->owner != NULL) { 1531 if (pi_state->owner != NULL) {
1526 spin_lock_irq(&pi_state->owner->pi_lock); 1532 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1527 WARN_ON(list_empty(&pi_state->list)); 1533 WARN_ON(list_empty(&pi_state->list));
1528 list_del_init(&pi_state->list); 1534 list_del_init(&pi_state->list);
1529 spin_unlock_irq(&pi_state->owner->pi_lock); 1535 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1530 } 1536 }
1531 1537
1532 pi_state->owner = newowner; 1538 pi_state->owner = newowner;
1533 1539
1534 spin_lock_irq(&newowner->pi_lock); 1540 raw_spin_lock_irq(&newowner->pi_lock);
1535 WARN_ON(!list_empty(&pi_state->list)); 1541 WARN_ON(!list_empty(&pi_state->list));
1536 list_add(&pi_state->list, &newowner->pi_state_list); 1542 list_add(&pi_state->list, &newowner->pi_state_list);
1537 spin_unlock_irq(&newowner->pi_lock); 1543 raw_spin_unlock_irq(&newowner->pi_lock);
1538 return 0; 1544 return 0;
1539 1545
1540 /* 1546 /*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3e1c36e7998f..0086628b6e97 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -127,11 +127,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
127 for (;;) { 127 for (;;) {
128 base = timer->base; 128 base = timer->base;
129 if (likely(base != NULL)) { 129 if (likely(base != NULL)) {
130 spin_lock_irqsave(&base->cpu_base->lock, *flags); 130 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
131 if (likely(base == timer->base)) 131 if (likely(base == timer->base))
132 return base; 132 return base;
133 /* The timer has migrated to another CPU: */ 133 /* The timer has migrated to another CPU: */
134 spin_unlock_irqrestore(&base->cpu_base->lock, *flags); 134 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
135 } 135 }
136 cpu_relax(); 136 cpu_relax();
137 } 137 }
@@ -208,13 +208,13 @@ again:
208 208
209 /* See the comment in lock_timer_base() */ 209 /* See the comment in lock_timer_base() */
210 timer->base = NULL; 210 timer->base = NULL;
211 spin_unlock(&base->cpu_base->lock); 211 raw_spin_unlock(&base->cpu_base->lock);
212 spin_lock(&new_base->cpu_base->lock); 212 raw_spin_lock(&new_base->cpu_base->lock);
213 213
214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { 214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
215 cpu = this_cpu; 215 cpu = this_cpu;
216 spin_unlock(&new_base->cpu_base->lock); 216 raw_spin_unlock(&new_base->cpu_base->lock);
217 spin_lock(&base->cpu_base->lock); 217 raw_spin_lock(&base->cpu_base->lock);
218 timer->base = base; 218 timer->base = base;
219 goto again; 219 goto again;
220 } 220 }
@@ -230,7 +230,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
230{ 230{
231 struct hrtimer_clock_base *base = timer->base; 231 struct hrtimer_clock_base *base = timer->base;
232 232
233 spin_lock_irqsave(&base->cpu_base->lock, *flags); 233 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
234 234
235 return base; 235 return base;
236} 236}
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
557static int hrtimer_reprogram(struct hrtimer *timer, 557static int hrtimer_reprogram(struct hrtimer *timer,
558 struct hrtimer_clock_base *base) 558 struct hrtimer_clock_base *base)
559{ 559{
560 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 560 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
562 int res; 562 int res;
563 563
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
582 if (expires.tv64 < 0) 582 if (expires.tv64 < 0)
583 return -ETIME; 583 return -ETIME;
584 584
585 if (expires.tv64 >= expires_next->tv64) 585 if (expires.tv64 >= cpu_base->expires_next.tv64)
586 return 0;
587
588 /*
589 * If a hang was detected in the last timer interrupt then we
590 * do not schedule a timer which is earlier than the expiry
591 * which we enforced in the hang detection. We want the system
592 * to make progress.
593 */
594 if (cpu_base->hang_detected)
586 return 0; 595 return 0;
587 596
588 /* 597 /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
590 */ 599 */
591 res = tick_program_event(expires, 0); 600 res = tick_program_event(expires, 0);
592 if (!IS_ERR_VALUE(res)) 601 if (!IS_ERR_VALUE(res))
593 *expires_next = expires; 602 cpu_base->expires_next = expires;
594 return res; 603 return res;
595} 604}
596 605
@@ -619,12 +628,12 @@ static void retrigger_next_event(void *arg)
619 base = &__get_cpu_var(hrtimer_bases); 628 base = &__get_cpu_var(hrtimer_bases);
620 629
621 /* Adjust CLOCK_REALTIME offset */ 630 /* Adjust CLOCK_REALTIME offset */
622 spin_lock(&base->lock); 631 raw_spin_lock(&base->lock);
623 base->clock_base[CLOCK_REALTIME].offset = 632 base->clock_base[CLOCK_REALTIME].offset =
624 timespec_to_ktime(realtime_offset); 633 timespec_to_ktime(realtime_offset);
625 634
626 hrtimer_force_reprogram(base, 0); 635 hrtimer_force_reprogram(base, 0);
627 spin_unlock(&base->lock); 636 raw_spin_unlock(&base->lock);
628} 637}
629 638
630/* 639/*
@@ -685,9 +694,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
685{ 694{
686 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 695 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
687 if (wakeup) { 696 if (wakeup) {
688 spin_unlock(&base->cpu_base->lock); 697 raw_spin_unlock(&base->cpu_base->lock);
689 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 698 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
690 spin_lock(&base->cpu_base->lock); 699 raw_spin_lock(&base->cpu_base->lock);
691 } else 700 } else
692 __raise_softirq_irqoff(HRTIMER_SOFTIRQ); 701 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
693 702
@@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
747 756
748#endif /* CONFIG_HIGH_RES_TIMERS */ 757#endif /* CONFIG_HIGH_RES_TIMERS */
749 758
750#ifdef CONFIG_TIMER_STATS 759static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
751void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
752{ 760{
761#ifdef CONFIG_TIMER_STATS
753 if (timer->start_site) 762 if (timer->start_site)
754 return; 763 return;
755 764 timer->start_site = __builtin_return_address(0);
756 timer->start_site = addr;
757 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 765 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
758 timer->start_pid = current->pid; 766 timer->start_pid = current->pid;
767#endif
759} 768}
769
770static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
771{
772#ifdef CONFIG_TIMER_STATS
773 timer->start_site = NULL;
774#endif
775}
776
777static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
778{
779#ifdef CONFIG_TIMER_STATS
780 if (likely(!timer_stats_active))
781 return;
782 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
783 timer->function, timer->start_comm, 0);
760#endif 784#endif
785}
761 786
762/* 787/*
763 * Counterpart to lock_hrtimer_base above: 788 * Counterpart to lock_hrtimer_base above:
@@ -765,7 +790,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
765static inline 790static inline
766void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 791void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
767{ 792{
768 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); 793 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
769} 794}
770 795
771/** 796/**
@@ -1098,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void)
1098 unsigned long flags; 1123 unsigned long flags;
1099 int i; 1124 int i;
1100 1125
1101 spin_lock_irqsave(&cpu_base->lock, flags); 1126 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1102 1127
1103 if (!hrtimer_hres_active()) { 1128 if (!hrtimer_hres_active()) {
1104 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
@@ -1115,7 +1140,7 @@ ktime_t hrtimer_get_next_event(void)
1115 } 1140 }
1116 } 1141 }
1117 1142
1118 spin_unlock_irqrestore(&cpu_base->lock, flags); 1143 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1119 1144
1120 if (mindelta.tv64 < 0) 1145 if (mindelta.tv64 < 0)
1121 mindelta.tv64 = 0; 1146 mindelta.tv64 = 0;
@@ -1197,11 +1222,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1197 * they get migrated to another cpu, therefore its safe to unlock 1222 * they get migrated to another cpu, therefore its safe to unlock
1198 * the timer base. 1223 * the timer base.
1199 */ 1224 */
1200 spin_unlock(&cpu_base->lock); 1225 raw_spin_unlock(&cpu_base->lock);
1201 trace_hrtimer_expire_entry(timer, now); 1226 trace_hrtimer_expire_entry(timer, now);
1202 restart = fn(timer); 1227 restart = fn(timer);
1203 trace_hrtimer_expire_exit(timer); 1228 trace_hrtimer_expire_exit(timer);
1204 spin_lock(&cpu_base->lock); 1229 raw_spin_lock(&cpu_base->lock);
1205 1230
1206 /* 1231 /*
1207 * Note: We clear the CALLBACK bit after enqueue_hrtimer and 1232 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
@@ -1217,29 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1217 1242
1218#ifdef CONFIG_HIGH_RES_TIMERS 1243#ifdef CONFIG_HIGH_RES_TIMERS
1219 1244
1220static int force_clock_reprogram;
1221
1222/*
1223 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1224 * is hanging, which could happen with something that slows the interrupt
1225 * such as the tracing. Then we force the clock reprogramming for each future
1226 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1227 * threshold that we will overwrite.
1228 * The next tick event will be scheduled to 3 times we currently spend on
1229 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1230 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1231 * let it running without serious starvation.
1232 */
1233
1234static inline void
1235hrtimer_interrupt_hanging(struct clock_event_device *dev,
1236 ktime_t try_time)
1237{
1238 force_clock_reprogram = 1;
1239 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1240 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1241 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1242}
1243/* 1245/*
1244 * High resolution timer interrupt 1246 * High resolution timer interrupt
1245 * Called with interrupts disabled 1247 * Called with interrupts disabled
@@ -1248,24 +1250,18 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1248{ 1250{
1249 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1251 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1250 struct hrtimer_clock_base *base; 1252 struct hrtimer_clock_base *base;
1251 ktime_t expires_next, now; 1253 ktime_t expires_next, now, entry_time, delta;
1252 int nr_retries = 0; 1254 int i, retries = 0;
1253 int i;
1254 1255
1255 BUG_ON(!cpu_base->hres_active); 1256 BUG_ON(!cpu_base->hres_active);
1256 cpu_base->nr_events++; 1257 cpu_base->nr_events++;
1257 dev->next_event.tv64 = KTIME_MAX; 1258 dev->next_event.tv64 = KTIME_MAX;
1258 1259
1259 retry: 1260 entry_time = now = ktime_get();
1260 /* 5 retries is enough to notice a hang */ 1261retry:
1261 if (!(++nr_retries % 5))
1262 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1263
1264 now = ktime_get();
1265
1266 expires_next.tv64 = KTIME_MAX; 1262 expires_next.tv64 = KTIME_MAX;
1267 1263
1268 spin_lock(&cpu_base->lock); 1264 raw_spin_lock(&cpu_base->lock);
1269 /* 1265 /*
1270 * We set expires_next to KTIME_MAX here with cpu_base->lock 1266 * We set expires_next to KTIME_MAX here with cpu_base->lock
1271 * held to prevent that a timer is enqueued in our queue via 1267 * held to prevent that a timer is enqueued in our queue via
@@ -1321,13 +1317,51 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1321 * against it. 1317 * against it.
1322 */ 1318 */
1323 cpu_base->expires_next = expires_next; 1319 cpu_base->expires_next = expires_next;
1324 spin_unlock(&cpu_base->lock); 1320 raw_spin_unlock(&cpu_base->lock);
1325 1321
1326 /* Reprogramming necessary ? */ 1322 /* Reprogramming necessary ? */
1327 if (expires_next.tv64 != KTIME_MAX) { 1323 if (expires_next.tv64 == KTIME_MAX ||
1328 if (tick_program_event(expires_next, force_clock_reprogram)) 1324 !tick_program_event(expires_next, 0)) {
1329 goto retry; 1325 cpu_base->hang_detected = 0;
1326 return;
1330 } 1327 }
1328
1329 /*
1330 * The next timer was already expired due to:
1331 * - tracing
1332 * - long lasting callbacks
1333 * - being scheduled away when running in a VM
1334 *
1335 * We need to prevent that we loop forever in the hrtimer
1336 * interrupt routine. We give it 3 attempts to avoid
1337 * overreacting on some spurious event.
1338 */
1339 now = ktime_get();
1340 cpu_base->nr_retries++;
1341 if (++retries < 3)
1342 goto retry;
1343 /*
1344 * Give the system a chance to do something else than looping
1345 * here. We stored the entry time, so we know exactly how long
1346 * we spent here. We schedule the next event this amount of
1347 * time away.
1348 */
1349 cpu_base->nr_hangs++;
1350 cpu_base->hang_detected = 1;
1351 delta = ktime_sub(now, entry_time);
1352 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1353 cpu_base->max_hang_time = delta;
1354 /*
1355 * Limit it to a sensible value as we enforce a longer
1356 * delay. Give the CPU at least 100ms to catch up.
1357 */
1358 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1359 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1360 else
1361 expires_next = ktime_add(now, delta);
1362 tick_program_event(expires_next, 1);
1363 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1364 ktime_to_ns(delta));
1331} 1365}
1332 1366
1333/* 1367/*
@@ -1423,7 +1457,7 @@ void hrtimer_run_queues(void)
1423 gettime = 0; 1457 gettime = 0;
1424 } 1458 }
1425 1459
1426 spin_lock(&cpu_base->lock); 1460 raw_spin_lock(&cpu_base->lock);
1427 1461
1428 while ((node = base->first)) { 1462 while ((node = base->first)) {
1429 struct hrtimer *timer; 1463 struct hrtimer *timer;
@@ -1435,7 +1469,7 @@ void hrtimer_run_queues(void)
1435 1469
1436 __run_hrtimer(timer, &base->softirq_time); 1470 __run_hrtimer(timer, &base->softirq_time);
1437 } 1471 }
1438 spin_unlock(&cpu_base->lock); 1472 raw_spin_unlock(&cpu_base->lock);
1439 } 1473 }
1440} 1474}
1441 1475
@@ -1591,7 +1625,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1625 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1592 int i; 1626 int i;
1593 1627
1594 spin_lock_init(&cpu_base->lock); 1628 raw_spin_lock_init(&cpu_base->lock);
1595 1629
1596 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1630 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1597 cpu_base->clock_base[i].cpu_base = cpu_base; 1631 cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1649,16 +1683,16 @@ static void migrate_hrtimers(int scpu)
1649 * The caller is globally serialized and nobody else 1683 * The caller is globally serialized and nobody else
1650 * takes two locks at once, deadlock is not possible. 1684 * takes two locks at once, deadlock is not possible.
1651 */ 1685 */
1652 spin_lock(&new_base->lock); 1686 raw_spin_lock(&new_base->lock);
1653 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1687 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1654 1688
1655 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1689 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1656 migrate_hrtimer_list(&old_base->clock_base[i], 1690 migrate_hrtimer_list(&old_base->clock_base[i],
1657 &new_base->clock_base[i]); 1691 &new_base->clock_base[i]);
1658 } 1692 }
1659 1693
1660 spin_unlock(&old_base->lock); 1694 raw_spin_unlock(&old_base->lock);
1661 spin_unlock(&new_base->lock); 1695 raw_spin_unlock(&new_base->lock);
1662 1696
1663 /* Check, if we got expired work to do */ 1697 /* Check, if we got expired work to do */
1664 __hrtimer_peek_ahead_timers(); 1698 __hrtimer_peek_ahead_timers();
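
Editor's note: the hrtimer_interrupt() rework above replaces the open-ended retry loop with a bounded one: three reprogram attempts, after which the hang is recorded and the next event is pushed out by the time already spent in the handler, capped at 100 ms. A minimal userspace sketch of that backoff rule follows; compute_backoff_ns() and the sample numbers are illustrative, not kernel API.

/*
 * Backoff rule from the hunk above: next event = now + min(delta, 100 ms),
 * where delta is the time spent since entering the interrupt.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000LL

static int64_t compute_backoff_ns(int64_t entry_ns, int64_t now_ns)
{
	int64_t delta = now_ns - entry_ns;	/* time spent handling timers */

	if (delta > 100 * NSEC_PER_MSEC)	/* never delay more than 100 ms */
		delta = 100 * NSEC_PER_MSEC;
	return now_ns + delta;			/* absolute expiry of next event */
}

int main(void)
{
	/* interrupt entered at t=0, we are now 250 ms in: clamped to 100 ms */
	printf("next event at %lld ns\n",
	       (long long)compute_backoff_ns(0, 250 * NSEC_PER_MSEC));
	return 0;
}

Running it with a 250 ms delta prints an expiry 100 ms past "now", matching the clamp in the patch.
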
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d4e841747400..0c642d51aac2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
144 144
145 rcu_read_lock(); 145 rcu_read_lock();
146 do_each_thread(g, t) { 146 do_each_thread(g, t) {
147 if (!--max_count) 147 if (!max_count--)
148 goto unlock; 148 goto unlock;
149 if (!--batch_count) { 149 if (!--batch_count) {
150 batch_count = HUNG_TASK_BATCHING; 150 batch_count = HUNG_TASK_BATCHING;
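
Editor's note: the hung_task change above swaps a pre-decrement for a post-decrement in the budget check, so the scan inspects the full max_count tasks instead of giving up one task early. A standalone demonstration of the difference, with made-up numbers:

/*
 * With a budget of N, "!--count" gives up before looking at the N-th task,
 * while "!count--" examines exactly N tasks.  Plain userspace C.
 */
#include <stdio.h>

static int visited_pre(int budget, int tasks)
{
	int seen = 0;

	for (int i = 0; i < tasks; i++) {
		if (!--budget)		/* old check: fires one task early */
			break;
		seen++;
	}
	return seen;
}

static int visited_post(int budget, int tasks)
{
	int seen = 0;

	for (int i = 0; i < tasks; i++) {
		if (!budget--)		/* new check: full budget is used */
			break;
		seen++;
	}
	return seen;
}

int main(void)
{
	printf("budget 3: pre-decrement visits %d, post-decrement visits %d\n",
	       visited_pre(3, 10), visited_post(3, 10));	/* prints 2 and 3 */
	return 0;
}
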
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 366eedf949c0..dbcbf6a33a08 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -96,7 +96,7 @@ static int task_bp_pinned(struct task_struct *tsk)
96 96
97 list = &ctx->event_list; 97 list = &ctx->event_list;
98 98
99 spin_lock_irqsave(&ctx->lock, flags); 99 raw_spin_lock_irqsave(&ctx->lock, flags);
100 100
101 /* 101 /*
102 * The current breakpoint counter is not included in the list 102 * The current breakpoint counter is not included in the list
@@ -107,7 +107,7 @@ static int task_bp_pinned(struct task_struct *tsk)
107 count++; 107 count++;
108 } 108 }
109 109
110 spin_unlock_irqrestore(&ctx->lock, flags); 110 raw_spin_unlock_irqrestore(&ctx->lock, flags);
111 111
112 return count; 112 return count;
113} 113}
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 1de9700f416e..2295a31ef110 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -45,7 +45,7 @@ unsigned long probe_irq_on(void)
45 * flush such a longstanding irq before considering it as spurious. 45 * flush such a longstanding irq before considering it as spurious.
46 */ 46 */
47 for_each_irq_desc_reverse(i, desc) { 47 for_each_irq_desc_reverse(i, desc) {
48 spin_lock_irq(&desc->lock); 48 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
50 /* 50 /*
51 * An old-style architecture might still have 51 * An old-style architecture might still have
@@ -61,7 +61,7 @@ unsigned long probe_irq_on(void)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->chip->set_type(i, IRQ_TYPE_PROBE);
62 desc->chip->startup(i); 62 desc->chip->startup(i);
63 } 63 }
64 spin_unlock_irq(&desc->lock); 64 raw_spin_unlock_irq(&desc->lock);
65 } 65 }
66 66
67 /* Wait for longstanding interrupts to trigger. */ 67 /* Wait for longstanding interrupts to trigger. */
@@ -73,13 +73,13 @@ unsigned long probe_irq_on(void)
73 * happened in the previous stage, it may have masked itself) 73 * happened in the previous stage, it may have masked itself)
74 */ 74 */
75 for_each_irq_desc_reverse(i, desc) { 75 for_each_irq_desc_reverse(i, desc) {
76 spin_lock_irq(&desc->lock); 76 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 79 if (desc->chip->startup(i))
80 desc->status |= IRQ_PENDING; 80 desc->status |= IRQ_PENDING;
81 } 81 }
82 spin_unlock_irq(&desc->lock); 82 raw_spin_unlock_irq(&desc->lock);
83 } 83 }
84 84
85 /* 85 /*
@@ -91,7 +91,7 @@ unsigned long probe_irq_on(void)
91 * Now filter out any obviously spurious interrupts 91 * Now filter out any obviously spurious interrupts
92 */ 92 */
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 spin_lock_irq(&desc->lock); 94 raw_spin_lock_irq(&desc->lock);
95 status = desc->status; 95 status = desc->status;
96 96
97 if (status & IRQ_AUTODETECT) { 97 if (status & IRQ_AUTODETECT) {
@@ -103,7 +103,7 @@ unsigned long probe_irq_on(void)
103 if (i < 32) 103 if (i < 32)
104 mask |= 1 << i; 104 mask |= 1 << i;
105 } 105 }
106 spin_unlock_irq(&desc->lock); 106 raw_spin_unlock_irq(&desc->lock);
107 } 107 }
108 108
109 return mask; 109 return mask;
@@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val)
129 int i; 129 int i;
130 130
131 for_each_irq_desc(i, desc) { 131 for_each_irq_desc(i, desc) {
132 spin_lock_irq(&desc->lock); 132 raw_spin_lock_irq(&desc->lock);
133 status = desc->status; 133 status = desc->status;
134 134
135 if (status & IRQ_AUTODETECT) { 135 if (status & IRQ_AUTODETECT) {
@@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val)
139 desc->status = status & ~IRQ_AUTODETECT; 139 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 140 desc->chip->shutdown(i);
141 } 141 }
142 spin_unlock_irq(&desc->lock); 142 raw_spin_unlock_irq(&desc->lock);
143 } 143 }
144 mutex_unlock(&probing_active); 144 mutex_unlock(&probing_active);
145 145
@@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val)
171 unsigned int status; 171 unsigned int status;
172 172
173 for_each_irq_desc(i, desc) { 173 for_each_irq_desc(i, desc) {
174 spin_lock_irq(&desc->lock); 174 raw_spin_lock_irq(&desc->lock);
175 status = desc->status; 175 status = desc->status;
176 176
177 if (status & IRQ_AUTODETECT) { 177 if (status & IRQ_AUTODETECT) {
@@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val)
183 desc->status = status & ~IRQ_AUTODETECT; 183 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 184 desc->chip->shutdown(i);
185 } 185 }
186 spin_unlock_irq(&desc->lock); 186 raw_spin_unlock_irq(&desc->lock);
187 } 187 }
188 mutex_unlock(&probing_active); 188 mutex_unlock(&probing_active);
189 189
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d115..ecc3fa28f666 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq)
34 } 34 }
35 35
36 /* Ensure we don't have left over values from a previous use of this irq */ 36 /* Ensure we don't have left over values from a previous use of this irq */
37 spin_lock_irqsave(&desc->lock, flags); 37 raw_spin_lock_irqsave(&desc->lock, flags);
38 desc->status = IRQ_DISABLED; 38 desc->status = IRQ_DISABLED;
39 desc->chip = &no_irq_chip; 39 desc->chip = &no_irq_chip;
40 desc->handle_irq = handle_bad_irq; 40 desc->handle_irq = handle_bad_irq;
@@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq)
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
52#endif 52#endif
53#endif 53#endif
54 spin_unlock_irqrestore(&desc->lock, flags); 54 raw_spin_unlock_irqrestore(&desc->lock, flags);
55} 55}
56 56
57/** 57/**
@@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq)
68 return; 68 return;
69 } 69 }
70 70
71 spin_lock_irqsave(&desc->lock, flags); 71 raw_spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 72 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 73 raw_spin_unlock_irqrestore(&desc->lock, flags);
74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", 74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 75 irq);
76 return; 76 return;
@@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq)
82 desc->chip = &no_irq_chip; 82 desc->chip = &no_irq_chip;
83 desc->name = NULL; 83 desc->name = NULL;
84 clear_kstat_irqs(desc); 84 clear_kstat_irqs(desc);
85 spin_unlock_irqrestore(&desc->lock, flags); 85 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 86}
87 87
88 88
@@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
104 if (!chip) 104 if (!chip)
105 chip = &no_irq_chip; 105 chip = &no_irq_chip;
106 106
107 spin_lock_irqsave(&desc->lock, flags); 107 raw_spin_lock_irqsave(&desc->lock, flags);
108 irq_chip_set_defaults(chip); 108 irq_chip_set_defaults(chip);
109 desc->chip = chip; 109 desc->chip = chip;
110 spin_unlock_irqrestore(&desc->lock, flags); 110 raw_spin_unlock_irqrestore(&desc->lock, flags);
111 111
112 return 0; 112 return 0;
113} 113}
@@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type)
133 if (type == IRQ_TYPE_NONE) 133 if (type == IRQ_TYPE_NONE)
134 return 0; 134 return 0;
135 135
136 spin_lock_irqsave(&desc->lock, flags); 136 raw_spin_lock_irqsave(&desc->lock, flags);
137 ret = __irq_set_trigger(desc, irq, type); 137 ret = __irq_set_trigger(desc, irq, type);
138 spin_unlock_irqrestore(&desc->lock, flags); 138 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return ret; 139 return ret;
140} 140}
141EXPORT_SYMBOL(set_irq_type); 141EXPORT_SYMBOL(set_irq_type);
@@ -158,19 +158,19 @@ int set_irq_data(unsigned int irq, void *data)
158 return -EINVAL; 158 return -EINVAL;
159 } 159 }
160 160
161 spin_lock_irqsave(&desc->lock, flags); 161 raw_spin_lock_irqsave(&desc->lock, flags);
162 desc->handler_data = data; 162 desc->handler_data = data;
163 spin_unlock_irqrestore(&desc->lock, flags); 163 raw_spin_unlock_irqrestore(&desc->lock, flags);
164 return 0; 164 return 0;
165} 165}
166EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
167 167
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_msi - set MSI descriptor data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @entry: Pointer to MSI descriptor data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the MSI descriptor entry for an irq
174 */ 174 */
175int set_irq_msi(unsigned int irq, struct msi_desc *entry) 175int set_irq_msi(unsigned int irq, struct msi_desc *entry)
176{ 176{
@@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
183 return -EINVAL; 183 return -EINVAL;
184 } 184 }
185 185
186 spin_lock_irqsave(&desc->lock, flags); 186 raw_spin_lock_irqsave(&desc->lock, flags);
187 desc->msi_desc = entry; 187 desc->msi_desc = entry;
188 if (entry) 188 if (entry)
189 entry->irq = irq; 189 entry->irq = irq;
190 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return 0; 191 return 0;
192} 192}
193 193
@@ -214,9 +214,9 @@ int set_irq_chip_data(unsigned int irq, void *data)
214 return -EINVAL; 214 return -EINVAL;
215 } 215 }
216 216
217 spin_lock_irqsave(&desc->lock, flags); 217 raw_spin_lock_irqsave(&desc->lock, flags);
218 desc->chip_data = data; 218 desc->chip_data = data;
219 spin_unlock_irqrestore(&desc->lock, flags); 219 raw_spin_unlock_irqrestore(&desc->lock, flags);
220 220
221 return 0; 221 return 0;
222} 222}
@@ -241,12 +241,12 @@ void set_irq_nested_thread(unsigned int irq, int nest)
241 if (!desc) 241 if (!desc)
242 return; 242 return;
243 243
244 spin_lock_irqsave(&desc->lock, flags); 244 raw_spin_lock_irqsave(&desc->lock, flags);
245 if (nest) 245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD; 246 desc->status |= IRQ_NESTED_THREAD;
247 else 247 else
248 desc->status &= ~IRQ_NESTED_THREAD; 248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags); 249 raw_spin_unlock_irqrestore(&desc->lock, flags);
250} 250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread); 251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252 252
@@ -343,7 +343,7 @@ void handle_nested_irq(unsigned int irq)
343 343
344 might_sleep(); 344 might_sleep();
345 345
346 spin_lock_irq(&desc->lock); 346 raw_spin_lock_irq(&desc->lock);
347 347
348 kstat_incr_irqs_this_cpu(irq, desc); 348 kstat_incr_irqs_this_cpu(irq, desc);
349 349
@@ -352,17 +352,17 @@ void handle_nested_irq(unsigned int irq)
352 goto out_unlock; 352 goto out_unlock;
353 353
354 desc->status |= IRQ_INPROGRESS; 354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock); 355 raw_spin_unlock_irq(&desc->lock);
356 356
357 action_ret = action->thread_fn(action->irq, action->dev_id); 357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug) 358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret); 359 note_interrupt(irq, desc, action_ret);
360 360
361 spin_lock_irq(&desc->lock); 361 raw_spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS; 362 desc->status &= ~IRQ_INPROGRESS;
363 363
364out_unlock: 364out_unlock:
365 spin_unlock_irq(&desc->lock); 365 raw_spin_unlock_irq(&desc->lock);
366} 366}
367EXPORT_SYMBOL_GPL(handle_nested_irq); 367EXPORT_SYMBOL_GPL(handle_nested_irq);
368 368
@@ -384,7 +384,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
384 struct irqaction *action; 384 struct irqaction *action;
385 irqreturn_t action_ret; 385 irqreturn_t action_ret;
386 386
387 spin_lock(&desc->lock); 387 raw_spin_lock(&desc->lock);
388 388
389 if (unlikely(desc->status & IRQ_INPROGRESS)) 389 if (unlikely(desc->status & IRQ_INPROGRESS))
390 goto out_unlock; 390 goto out_unlock;
@@ -396,16 +396,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
396 goto out_unlock; 396 goto out_unlock;
397 397
398 desc->status |= IRQ_INPROGRESS; 398 desc->status |= IRQ_INPROGRESS;
399 spin_unlock(&desc->lock); 399 raw_spin_unlock(&desc->lock);
400 400
401 action_ret = handle_IRQ_event(irq, action); 401 action_ret = handle_IRQ_event(irq, action);
402 if (!noirqdebug) 402 if (!noirqdebug)
403 note_interrupt(irq, desc, action_ret); 403 note_interrupt(irq, desc, action_ret);
404 404
405 spin_lock(&desc->lock); 405 raw_spin_lock(&desc->lock);
406 desc->status &= ~IRQ_INPROGRESS; 406 desc->status &= ~IRQ_INPROGRESS;
407out_unlock: 407out_unlock:
408 spin_unlock(&desc->lock); 408 raw_spin_unlock(&desc->lock);
409} 409}
410 410
411/** 411/**
@@ -424,7 +424,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
424 struct irqaction *action; 424 struct irqaction *action;
425 irqreturn_t action_ret; 425 irqreturn_t action_ret;
426 426
427 spin_lock(&desc->lock); 427 raw_spin_lock(&desc->lock);
428 mask_ack_irq(desc, irq); 428 mask_ack_irq(desc, irq);
429 429
430 if (unlikely(desc->status & IRQ_INPROGRESS)) 430 if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -441,13 +441,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
441 goto out_unlock; 441 goto out_unlock;
442 442
443 desc->status |= IRQ_INPROGRESS; 443 desc->status |= IRQ_INPROGRESS;
444 spin_unlock(&desc->lock); 444 raw_spin_unlock(&desc->lock);
445 445
446 action_ret = handle_IRQ_event(irq, action); 446 action_ret = handle_IRQ_event(irq, action);
447 if (!noirqdebug) 447 if (!noirqdebug)
448 note_interrupt(irq, desc, action_ret); 448 note_interrupt(irq, desc, action_ret);
449 449
450 spin_lock(&desc->lock); 450 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
452 452
453 if (unlikely(desc->status & IRQ_ONESHOT)) 453 if (unlikely(desc->status & IRQ_ONESHOT))
@@ -455,7 +455,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
457out_unlock: 457out_unlock:
458 spin_unlock(&desc->lock); 458 raw_spin_unlock(&desc->lock);
459} 459}
460EXPORT_SYMBOL_GPL(handle_level_irq); 460EXPORT_SYMBOL_GPL(handle_level_irq);
461 461
@@ -475,7 +475,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
475 struct irqaction *action; 475 struct irqaction *action;
476 irqreturn_t action_ret; 476 irqreturn_t action_ret;
477 477
478 spin_lock(&desc->lock); 478 raw_spin_lock(&desc->lock);
479 479
480 if (unlikely(desc->status & IRQ_INPROGRESS)) 480 if (unlikely(desc->status & IRQ_INPROGRESS))
481 goto out; 481 goto out;
@@ -497,18 +497,18 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
497 497
498 desc->status |= IRQ_INPROGRESS; 498 desc->status |= IRQ_INPROGRESS;
499 desc->status &= ~IRQ_PENDING; 499 desc->status &= ~IRQ_PENDING;
500 spin_unlock(&desc->lock); 500 raw_spin_unlock(&desc->lock);
501 501
502 action_ret = handle_IRQ_event(irq, action); 502 action_ret = handle_IRQ_event(irq, action);
503 if (!noirqdebug) 503 if (!noirqdebug)
504 note_interrupt(irq, desc, action_ret); 504 note_interrupt(irq, desc, action_ret);
505 505
506 spin_lock(&desc->lock); 506 raw_spin_lock(&desc->lock);
507 desc->status &= ~IRQ_INPROGRESS; 507 desc->status &= ~IRQ_INPROGRESS;
508out: 508out:
509 desc->chip->eoi(irq); 509 desc->chip->eoi(irq);
510 510
511 spin_unlock(&desc->lock); 511 raw_spin_unlock(&desc->lock);
512} 512}
513 513
514/** 514/**
@@ -530,7 +530,7 @@ out:
530void 530void
531handle_edge_irq(unsigned int irq, struct irq_desc *desc) 531handle_edge_irq(unsigned int irq, struct irq_desc *desc)
532{ 532{
533 spin_lock(&desc->lock); 533 raw_spin_lock(&desc->lock);
534 534
535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
536 536
@@ -576,21 +576,21 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
576 } 576 }
577 577
578 desc->status &= ~IRQ_PENDING; 578 desc->status &= ~IRQ_PENDING;
579 spin_unlock(&desc->lock); 579 raw_spin_unlock(&desc->lock);
580 action_ret = handle_IRQ_event(irq, action); 580 action_ret = handle_IRQ_event(irq, action);
581 if (!noirqdebug) 581 if (!noirqdebug)
582 note_interrupt(irq, desc, action_ret); 582 note_interrupt(irq, desc, action_ret);
583 spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
584 584
585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
586 586
587 desc->status &= ~IRQ_INPROGRESS; 587 desc->status &= ~IRQ_INPROGRESS;
588out_unlock: 588out_unlock:
589 spin_unlock(&desc->lock); 589 raw_spin_unlock(&desc->lock);
590} 590}
591 591
592/** 592/**
593 * handle_percpu_IRQ - Per CPU local irq handler 593 * handle_percpu_irq - Per CPU local irq handler
594 * @irq: the interrupt number 594 * @irq: the interrupt number
595 * @desc: the interrupt description structure for this irq 595 * @desc: the interrupt description structure for this irq
596 * 596 *
@@ -643,7 +643,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
643 } 643 }
644 644
645 chip_bus_lock(irq, desc); 645 chip_bus_lock(irq, desc);
646 spin_lock_irqsave(&desc->lock, flags); 646 raw_spin_lock_irqsave(&desc->lock, flags);
647 647
648 /* Uninstall? */ 648 /* Uninstall? */
649 if (handle == handle_bad_irq) { 649 if (handle == handle_bad_irq) {
@@ -661,7 +661,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
661 desc->depth = 0; 661 desc->depth = 0;
662 desc->chip->startup(irq); 662 desc->chip->startup(irq);
663 } 663 }
664 spin_unlock_irqrestore(&desc->lock, flags); 664 raw_spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc); 665 chip_bus_sync_unlock(irq, desc);
666} 666}
667EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -692,9 +692,9 @@ void __init set_irq_noprobe(unsigned int irq)
692 return; 692 return;
693 } 693 }
694 694
695 spin_lock_irqsave(&desc->lock, flags); 695 raw_spin_lock_irqsave(&desc->lock, flags);
696 desc->status |= IRQ_NOPROBE; 696 desc->status |= IRQ_NOPROBE;
697 spin_unlock_irqrestore(&desc->lock, flags); 697 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 698}
699 699
700void __init set_irq_probe(unsigned int irq) 700void __init set_irq_probe(unsigned int irq)
@@ -707,7 +707,7 @@ void __init set_irq_probe(unsigned int irq)
707 return; 707 return;
708 } 708 }
709 709
710 spin_lock_irqsave(&desc->lock, flags); 710 raw_spin_lock_irqsave(&desc->lock, flags);
711 desc->status &= ~IRQ_NOPROBE; 711 desc->status &= ~IRQ_NOPROBE;
712 spin_unlock_irqrestore(&desc->lock, flags); 712 raw_spin_unlock_irqrestore(&desc->lock, flags);
713} 713}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 17c71bb565c6..814940e7f485 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -80,7 +80,7 @@ static struct irq_desc irq_desc_init = {
80 .chip = &no_irq_chip, 80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq, 81 .handle_irq = handle_bad_irq,
82 .depth = 1, 82 .depth = 1,
83 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84}; 84};
85 85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) 86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
@@ -108,7 +108,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
108{ 108{
109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
110 110
111 spin_lock_init(&desc->lock); 111 raw_spin_lock_init(&desc->lock);
112 desc->irq = irq; 112 desc->irq = irq;
113#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
114 desc->node = node; 114 desc->node = node;
@@ -130,7 +130,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
130/* 130/*
131 * Protect the sparse_irqs: 131 * Protect the sparse_irqs:
132 */ 132 */
133DEFINE_SPINLOCK(sparse_irq_lock); 133DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 134
135struct irq_desc **irq_desc_ptrs __read_mostly; 135struct irq_desc **irq_desc_ptrs __read_mostly;
136 136
@@ -141,7 +141,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
141 .chip = &no_irq_chip, 141 .chip = &no_irq_chip,
142 .handle_irq = handle_bad_irq, 142 .handle_irq = handle_bad_irq,
143 .depth = 1, 143 .depth = 1,
144 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 144 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
145 } 145 }
146}; 146};
147 147
@@ -212,7 +212,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
212 if (desc) 212 if (desc)
213 return desc; 213 return desc;
214 214
215 spin_lock_irqsave(&sparse_irq_lock, flags); 215 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 216
217 /* We have to check it to avoid races with another CPU */ 217 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 218 desc = irq_desc_ptrs[irq];
@@ -234,7 +234,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
234 irq_desc_ptrs[irq] = desc; 234 irq_desc_ptrs[irq] = desc;
235 235
236out_unlock: 236out_unlock:
237 spin_unlock_irqrestore(&sparse_irq_lock, flags); 237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
238 238
239 return desc; 239 return desc;
240} 240}
@@ -247,7 +247,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
247 .chip = &no_irq_chip, 247 .chip = &no_irq_chip,
248 .handle_irq = handle_bad_irq, 248 .handle_irq = handle_bad_irq,
249 .depth = 1, 249 .depth = 1,
250 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 250 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
251 } 251 }
252}; 252};
253 253
@@ -473,7 +473,7 @@ unsigned int __do_IRQ(unsigned int irq)
473 return 1; 473 return 1;
474 } 474 }
475 475
476 spin_lock(&desc->lock); 476 raw_spin_lock(&desc->lock);
477 if (desc->chip->ack) 477 if (desc->chip->ack)
478 desc->chip->ack(irq); 478 desc->chip->ack(irq);
479 /* 479 /*
@@ -517,13 +517,13 @@ unsigned int __do_IRQ(unsigned int irq)
517 for (;;) { 517 for (;;) {
518 irqreturn_t action_ret; 518 irqreturn_t action_ret;
519 519
520 spin_unlock(&desc->lock); 520 raw_spin_unlock(&desc->lock);
521 521
522 action_ret = handle_IRQ_event(irq, action); 522 action_ret = handle_IRQ_event(irq, action);
523 if (!noirqdebug) 523 if (!noirqdebug)
524 note_interrupt(irq, desc, action_ret); 524 note_interrupt(irq, desc, action_ret);
525 525
526 spin_lock(&desc->lock); 526 raw_spin_lock(&desc->lock);
527 if (likely(!(desc->status & IRQ_PENDING))) 527 if (likely(!(desc->status & IRQ_PENDING)))
528 break; 528 break;
529 desc->status &= ~IRQ_PENDING; 529 desc->status &= ~IRQ_PENDING;
@@ -536,7 +536,7 @@ out:
536 * disabled while the handler was running. 536 * disabled while the handler was running.
537 */ 537 */
538 desc->chip->end(irq); 538 desc->chip->end(irq);
539 spin_unlock(&desc->lock); 539 raw_spin_unlock(&desc->lock);
540 540
541 return 1; 541 return 1;
542} 542}
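
Editor's note: most of the kernel/irq churn in this diff is the same mechanical conversion: the irq_desc lock and sparse_irq_lock move from spinlock_t to raw_spinlock_t, with every initializer and lock call switched to its raw_ counterpart. Raw spinlocks always busy-wait, even on configurations where ordinary spinlocks may become sleeping locks, which is what these low-level interrupt paths need. A sketch of the pattern with made-up names (my_desc, my_desc_lock); only the lock API mirrors the patch:

/* Sketch of the spinlock_t -> raw_spinlock_t conversion, not real kernel code. */
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(my_desc_lock);	/* was DEFINE_SPINLOCK() */
static unsigned int nr_my_descs;

struct my_desc {
	raw_spinlock_t	lock;			/* was spinlock_t */
	unsigned int	status;
};

static void my_desc_init(struct my_desc *desc)
{
	raw_spin_lock_init(&desc->lock);	/* was spin_lock_init() */
}

static void my_desc_set_status(struct my_desc *desc, unsigned int status)
{
	unsigned long flags;

	/* was spin_lock_irqsave()/spin_unlock_irqrestore() */
	raw_spin_lock_irqsave(&desc->lock, flags);
	desc->status = status;
	raw_spin_unlock_irqrestore(&desc->lock, flags);
}

static void my_desc_count(void)
{
	raw_spin_lock(&my_desc_lock);		/* was spin_lock() */
	nr_my_descs++;
	raw_spin_unlock(&my_desc_lock);
}
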
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 1b5d742c6a77..b2821f070a3d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24/* irq_desc_ptrs allocated at boot time */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24d..eb6078ca60c7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq)
46 cpu_relax(); 46 cpu_relax();
47 47
48 /* Ok, that indicated we're done: double-check carefully. */ 48 /* Ok, that indicated we're done: double-check carefully. */
49 spin_lock_irqsave(&desc->lock, flags); 49 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 50 status = desc->status;
51 spin_unlock_irqrestore(&desc->lock, flags); 51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 52
53 /* Oops, that failed? */ 53 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 54 } while (status & IRQ_INPROGRESS);
@@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
114 if (!desc->chip->set_affinity) 114 if (!desc->chip->set_affinity)
115 return -EINVAL; 115 return -EINVAL;
116 116
117 spin_lock_irqsave(&desc->lock, flags); 117 raw_spin_lock_irqsave(&desc->lock, flags);
118 118
119#ifdef CONFIG_GENERIC_PENDING_IRQ 119#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
@@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
134 } 134 }
135#endif 135#endif
136 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
137 spin_unlock_irqrestore(&desc->lock, flags); 137 raw_spin_unlock_irqrestore(&desc->lock, flags);
138 return 0; 138 return 0;
139} 139}
140 140
@@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq)
181 unsigned long flags; 181 unsigned long flags;
182 int ret; 182 int ret;
183 183
184 spin_lock_irqsave(&desc->lock, flags); 184 raw_spin_lock_irqsave(&desc->lock, flags);
185 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
186 if (!ret) 186 if (!ret)
187 irq_set_thread_affinity(desc); 187 irq_set_thread_affinity(desc);
188 spin_unlock_irqrestore(&desc->lock, flags); 188 raw_spin_unlock_irqrestore(&desc->lock, flags);
189 189
190 return ret; 190 return ret;
191} 191}
@@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc); 233 chip_bus_lock(irq, desc);
234 spin_lock_irqsave(&desc->lock, flags); 234 raw_spin_lock_irqsave(&desc->lock, flags);
235 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
236 spin_unlock_irqrestore(&desc->lock, flags); 236 raw_spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc); 237 chip_bus_sync_unlock(irq, desc);
238} 238}
239EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
@@ -308,9 +308,9 @@ void enable_irq(unsigned int irq)
308 return; 308 return;
309 309
310 chip_bus_lock(irq, desc); 310 chip_bus_lock(irq, desc);
311 spin_lock_irqsave(&desc->lock, flags); 311 raw_spin_lock_irqsave(&desc->lock, flags);
312 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
313 spin_unlock_irqrestore(&desc->lock, flags); 313 raw_spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc); 314 chip_bus_sync_unlock(irq, desc);
315} 315}
316EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
@@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
347 /* wakeup-capable irqs can be shared between drivers that 347 /* wakeup-capable irqs can be shared between drivers that
348 * don't need to have the same sleep mode behaviors. 348 * don't need to have the same sleep mode behaviors.
349 */ 349 */
350 spin_lock_irqsave(&desc->lock, flags); 350 raw_spin_lock_irqsave(&desc->lock, flags);
351 if (on) { 351 if (on) {
352 if (desc->wake_depth++ == 0) { 352 if (desc->wake_depth++ == 0) {
353 ret = set_irq_wake_real(irq, on); 353 ret = set_irq_wake_real(irq, on);
@@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
368 } 368 }
369 } 369 }
370 370
371 spin_unlock_irqrestore(&desc->lock, flags); 371 raw_spin_unlock_irqrestore(&desc->lock, flags);
372 return ret; 372 return ret;
373} 373}
374EXPORT_SYMBOL(set_irq_wake); 374EXPORT_SYMBOL(set_irq_wake);
@@ -484,12 +484,12 @@ static int irq_wait_for_interrupt(struct irqaction *action)
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 485{
486 chip_bus_lock(irq, desc); 486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock); 487 raw_spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 490 desc->chip->unmask(irq);
491 } 491 }
492 spin_unlock_irq(&desc->lock); 492 raw_spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc); 493 chip_bus_sync_unlock(irq, desc);
494} 494}
495 495
@@ -514,9 +514,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
514 return; 514 return;
515 } 515 }
516 516
517 spin_lock_irq(&desc->lock); 517 raw_spin_lock_irq(&desc->lock);
518 cpumask_copy(mask, desc->affinity); 518 cpumask_copy(mask, desc->affinity);
519 spin_unlock_irq(&desc->lock); 519 raw_spin_unlock_irq(&desc->lock);
520 520
521 set_cpus_allowed_ptr(current, mask); 521 set_cpus_allowed_ptr(current, mask);
522 free_cpumask_var(mask); 522 free_cpumask_var(mask);
@@ -545,7 +545,7 @@ static int irq_thread(void *data)
545 545
546 atomic_inc(&desc->threads_active); 546 atomic_inc(&desc->threads_active);
547 547
548 spin_lock_irq(&desc->lock); 548 raw_spin_lock_irq(&desc->lock);
549 if (unlikely(desc->status & IRQ_DISABLED)) { 549 if (unlikely(desc->status & IRQ_DISABLED)) {
550 /* 550 /*
551 * CHECKME: We might need a dedicated 551 * CHECKME: We might need a dedicated
@@ -555,9 +555,9 @@ static int irq_thread(void *data)
555 * retriggers the interrupt itself --- tglx 555 * retriggers the interrupt itself --- tglx
556 */ 556 */
557 desc->status |= IRQ_PENDING; 557 desc->status |= IRQ_PENDING;
558 spin_unlock_irq(&desc->lock); 558 raw_spin_unlock_irq(&desc->lock);
559 } else { 559 } else {
560 spin_unlock_irq(&desc->lock); 560 raw_spin_unlock_irq(&desc->lock);
561 561
562 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563 563
@@ -679,7 +679,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
679 /* 679 /*
680 * The following block of code has to be executed atomically 680 * The following block of code has to be executed atomically
681 */ 681 */
682 spin_lock_irqsave(&desc->lock, flags); 682 raw_spin_lock_irqsave(&desc->lock, flags);
683 old_ptr = &desc->action; 683 old_ptr = &desc->action;
684 old = *old_ptr; 684 old = *old_ptr;
685 if (old) { 685 if (old) {
@@ -775,7 +775,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
775 __enable_irq(desc, irq, false); 775 __enable_irq(desc, irq, false);
776 } 776 }
777 777
778 spin_unlock_irqrestore(&desc->lock, flags); 778 raw_spin_unlock_irqrestore(&desc->lock, flags);
779 779
780 /* 780 /*
781 * Strictly no need to wake it up, but hung_task complains 781 * Strictly no need to wake it up, but hung_task complains
@@ -802,7 +802,7 @@ mismatch:
802 ret = -EBUSY; 802 ret = -EBUSY;
803 803
804out_thread: 804out_thread:
805 spin_unlock_irqrestore(&desc->lock, flags); 805 raw_spin_unlock_irqrestore(&desc->lock, flags);
806 if (new->thread) { 806 if (new->thread) {
807 struct task_struct *t = new->thread; 807 struct task_struct *t = new->thread;
808 808
@@ -844,7 +844,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
844 if (!desc) 844 if (!desc)
845 return NULL; 845 return NULL;
846 846
847 spin_lock_irqsave(&desc->lock, flags); 847 raw_spin_lock_irqsave(&desc->lock, flags);
848 848
849 /* 849 /*
850 * There can be multiple actions per IRQ descriptor, find the right 850 * There can be multiple actions per IRQ descriptor, find the right
@@ -856,7 +856,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
856 856
857 if (!action) { 857 if (!action) {
858 WARN(1, "Trying to free already-free IRQ %d\n", irq); 858 WARN(1, "Trying to free already-free IRQ %d\n", irq);
859 spin_unlock_irqrestore(&desc->lock, flags); 859 raw_spin_unlock_irqrestore(&desc->lock, flags);
860 860
861 return NULL; 861 return NULL;
862 } 862 }
@@ -884,7 +884,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 884 desc->chip->disable(irq);
885 } 885 }
886 886
887 spin_unlock_irqrestore(&desc->lock, flags); 887 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 888
889 unregister_handler_proc(irq, action); 889 unregister_handler_proc(irq, action);
890 890
@@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1067 kfree(action); 1067 kfree(action);
1068 1068
1069#ifdef CONFIG_DEBUG_SHIRQ 1069#ifdef CONFIG_DEBUG_SHIRQ
1070 if (irqflags & IRQF_SHARED) { 1070 if (!retval && (irqflags & IRQF_SHARED)) {
1071 /* 1071 /*
1072 * It's a shared IRQ -- the driver ought to be prepared for it 1072 * It's a shared IRQ -- the driver ought to be prepared for it
1073 * to happen immediately, so let's make sure.... 1073 * to happen immediately, so let's make sure....
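
Editor's note: the last manage.c hunk fires the CONFIG_DEBUG_SHIRQ test interrupt only when request_threaded_irq() actually succeeded (!retval), so a handler can no longer be invoked for an irq that was never installed. From the driver's side the contract is unchanged: a shared handler must tolerate being called right after a successful registration with no work pending. A hypothetical driver-side sketch (my_irq_handler and my_dev are made-up names):

/* Driver-side view of the DEBUG_SHIRQ behaviour; sketch only. */
#include <linux/interrupt.h>
#include <linux/errno.h>

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	/*
	 * With IRQF_SHARED and CONFIG_DEBUG_SHIRQ this may run once as a
	 * test right after a successful request_irq(), so it must cope
	 * with finding no work pending.
	 */
	return IRQ_NONE;
}

static int my_probe_irq(unsigned int irq, void *my_dev)
{
	int ret = request_irq(irq, my_irq_handler, IRQF_SHARED,
			      "my_dev", my_dev);
	if (ret)
		return ret;	/* after this patch, no test fire on failure */
	return 0;
}
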
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index fcb6c96f2627..241962280836 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -27,7 +27,7 @@ void move_masked_irq(int irq)
27 if (!desc->chip->set_affinity) 27 if (!desc->chip->set_affinity)
28 return; 28 return;
29 29
30 assert_spin_locked(&desc->lock); 30 assert_raw_spin_locked(&desc->lock);
31 31
32 /* 32 /*
33 * If there was a valid mask to work with, please 33 * If there was a valid mask to work with, please
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 3fd30197da2e..26bac9d8f860 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 raw_spin_lock_init(&desc->lock);
46 desc->node = node; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
@@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
67 67
68 irq = old_desc->irq; 68 irq = old_desc->irq;
69 69
70 spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_desc_ptrs[irq];
@@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 irq_desc_ptrs[irq] = desc;
94 spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
97 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
@@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
100 return desc; 100 return desc;
101 101
102out_unlock: 102out_unlock:
103 spin_unlock_irqrestore(&sparse_irq_lock, flags); 103 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
104 104
105 return desc; 105 return desc;
106} 106}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index a0bb09e79867..0d4005d85b03 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -28,9 +28,9 @@ void suspend_device_irqs(void)
28 for_each_irq_desc(irq, desc) { 28 for_each_irq_desc(irq, desc) {
29 unsigned long flags; 29 unsigned long flags;
30 30
31 spin_lock_irqsave(&desc->lock, flags); 31 raw_spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true); 32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags); 33 raw_spin_unlock_irqrestore(&desc->lock, flags);
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
@@ -56,9 +56,9 @@ void resume_device_irqs(void)
56 if (!(desc->status & IRQ_SUSPENDED)) 56 if (!(desc->status & IRQ_SUSPENDED))
57 continue; 57 continue;
58 58
59 spin_lock_irqsave(&desc->lock, flags); 59 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags); 61 raw_spin_unlock_irqrestore(&desc->lock, flags);
62 } 62 }
63} 63}
64EXPORT_SYMBOL_GPL(resume_device_irqs); 64EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 692363dd591f..6f50eccc79c0 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -136,7 +136,7 @@ out:
136 136
137static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
138{ 138{
139 return single_open(file, default_affinity_show, NULL); 139 return single_open(file, default_affinity_show, PDE(inode)->data);
140} 140}
141 141
142static const struct file_operations default_affinity_proc_fops = { 142static const struct file_operations default_affinity_proc_fops = {
@@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = {
148}; 148};
149#endif 149#endif
150 150
151static int irq_spurious_read(char *page, char **start, off_t off, 151static int irq_spurious_proc_show(struct seq_file *m, void *v)
152 int count, int *eof, void *data)
153{ 152{
154 struct irq_desc *desc = irq_to_desc((long) data); 153 struct irq_desc *desc = irq_to_desc((long) m->private);
155 return sprintf(page, "count %u\n" 154
156 "unhandled %u\n" 155 seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
157 "last_unhandled %u ms\n", 156 desc->irq_count, desc->irqs_unhandled,
158 desc->irq_count, 157 jiffies_to_msecs(desc->last_unhandled));
159 desc->irqs_unhandled, 158 return 0;
160 jiffies_to_msecs(desc->last_unhandled)); 159}
160
161static int irq_spurious_proc_open(struct inode *inode, struct file *file)
162{
163 return single_open(file, irq_spurious_proc_show, NULL);
161} 164}
162 165
166static const struct file_operations irq_spurious_proc_fops = {
167 .open = irq_spurious_proc_open,
168 .read = seq_read,
169 .llseek = seq_lseek,
170 .release = single_release,
171};
172
163#define MAX_NAMELEN 128 173#define MAX_NAMELEN 128
164 174
165static int name_unique(unsigned int irq, struct irqaction *new_action) 175static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -169,7 +179,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
169 unsigned long flags; 179 unsigned long flags;
170 int ret = 1; 180 int ret = 1;
171 181
172 spin_lock_irqsave(&desc->lock, flags); 182 raw_spin_lock_irqsave(&desc->lock, flags);
173 for (action = desc->action ; action; action = action->next) { 183 for (action = desc->action ; action; action = action->next) {
174 if ((action != new_action) && action->name && 184 if ((action != new_action) && action->name &&
175 !strcmp(new_action->name, action->name)) { 185 !strcmp(new_action->name, action->name)) {
@@ -177,7 +187,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
177 break; 187 break;
178 } 188 }
179 } 189 }
180 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
181 return ret; 191 return ret;
182} 192}
183 193
@@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
204void register_irq_proc(unsigned int irq, struct irq_desc *desc) 214void register_irq_proc(unsigned int irq, struct irq_desc *desc)
205{ 215{
206 char name [MAX_NAMELEN]; 216 char name [MAX_NAMELEN];
207 struct proc_dir_entry *entry;
208 217
209 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 218 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
210 return; 219 return;
@@ -214,6 +223,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
214 223
215 /* create /proc/irq/1234 */ 224 /* create /proc/irq/1234 */
216 desc->dir = proc_mkdir(name, root_irq_dir); 225 desc->dir = proc_mkdir(name, root_irq_dir);
226 if (!desc->dir)
227 return;
217 228
218#ifdef CONFIG_SMP 229#ifdef CONFIG_SMP
219 /* create /proc/irq/<irq>/smp_affinity */ 230 /* create /proc/irq/<irq>/smp_affinity */
@@ -221,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
221 &irq_affinity_proc_fops, (void *)(long)irq); 232 &irq_affinity_proc_fops, (void *)(long)irq);
222#endif 233#endif
223 234
224 entry = create_proc_entry("spurious", 0444, desc->dir); 235 proc_create_data("spurious", 0444, desc->dir,
225 if (entry) { 236 &irq_spurious_proc_fops, (void *)(long)irq);
226 entry->data = (void *)(long)irq;
227 entry->read_proc = irq_spurious_read;
228 }
229} 237}
230 238
231#undef MAX_NAMELEN 239#undef MAX_NAMELEN
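
Editor's note: the proc.c hunks convert the old read_proc callback for /proc/irq/*/spurious into the seq_file single_open() pattern and register it with proc_create_data(). A generic, hedged sketch of that conversion for an arbitrary proc file; the my_stats_* names are hypothetical, the plumbing mirrors the patch:

/* read_proc -> seq_file conversion pattern; sketch only. */
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int my_stats_show(struct seq_file *m, void *v)
{
	unsigned long count = (unsigned long)m->private;

	seq_printf(m, "count %lu\n", count);	/* replaces sprintf(page, ...) */
	return 0;
}

static int my_stats_open(struct inode *inode, struct file *file)
{
	/* hand the proc entry's ->data through to the show routine */
	return single_open(file, my_stats_show, PDE(inode)->data);
}

static const struct file_operations my_stats_fops = {
	.open		= my_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void my_stats_register(struct proc_dir_entry *dir)
{
	proc_create_data("my_stats", 0444, dir, &my_stats_fops,
			 (void *)42UL);		/* arbitrary per-entry data */
}

Whatever pointer is passed as the last argument of proc_create_data() comes back as PDE(inode)->data in open and, via single_open(), as m->private in the show routine.
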
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd7273e6282e..89fb90ae534f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
28 struct irqaction *action; 28 struct irqaction *action;
29 int ok = 0, work = 0; 29 int ok = 0, work = 0;
30 30
31 spin_lock(&desc->lock); 31 raw_spin_lock(&desc->lock);
32 /* Already running on another processor */ 32 /* Already running on another processor */
33 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
34 /* 34 /*
@@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc)
37 */ 37 */
38 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
39 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
40 spin_unlock(&desc->lock); 40 raw_spin_unlock(&desc->lock);
41 return ok; 41 return ok;
42 } 42 }
43 /* Honour the normal IRQ locking */ 43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS; 44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action; 45 action = desc->action;
46 spin_unlock(&desc->lock); 46 raw_spin_unlock(&desc->lock);
47 47
48 while (action) { 48 while (action) {
49 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
@@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
56 } 56 }
57 local_irq_disable(); 57 local_irq_disable();
58 /* Now clean up the flags */ 58 /* Now clean up the flags */
59 spin_lock(&desc->lock); 59 raw_spin_lock(&desc->lock);
60 action = desc->action; 60 action = desc->action;
61 61
62 /* 62 /*
@@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc)
68 * Perform real IRQ processing for the IRQ we deferred 68 * Perform real IRQ processing for the IRQ we deferred
69 */ 69 */
70 work = 1; 70 work = 1;
71 spin_unlock(&desc->lock); 71 raw_spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action); 72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock); 73 raw_spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING; 74 desc->status &= ~IRQ_PENDING;
75 } 75 }
76 desc->status &= ~IRQ_INPROGRESS; 76 desc->status &= ~IRQ_INPROGRESS;
@@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
80 */ 80 */
81 if (work && desc->chip && desc->chip->end) 81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq); 82 desc->chip->end(irq);
83 spin_unlock(&desc->lock); 83 raw_spin_unlock(&desc->lock);
84 84
85 return ok; 85 return ok;
86} 86}
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_all_shared_irqs(void) 107static void poll_spurious_irqs(unsigned long dummy)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -125,23 +125,11 @@ static void poll_all_shared_irqs(void)
125 try_one_irq(i, desc); 125 try_one_irq(i, desc);
126 local_irq_enable(); 126 local_irq_enable();
127 } 127 }
128}
129
130static void poll_spurious_irqs(unsigned long dummy)
131{
132 poll_all_shared_irqs();
133 128
134 mod_timer(&poll_spurious_irq_timer, 129 mod_timer(&poll_spurious_irq_timer,
135 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 130 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
136} 131}
137 132
138#ifdef CONFIG_DEBUG_SHIRQ
139void debug_poll_all_shared_irqs(void)
140{
141 poll_all_shared_irqs();
142}
143#endif
144
145/* 133/*
146 * If 99,900 of the previous 100,000 interrupts have not been handled 134 * If 99,900 of the previous 100,000 interrupts have not been handled
147 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 135 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -232,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
232 /* 220 /*
233 * If we are seeing only the odd spurious IRQ caused by 221 * If we are seeing only the odd spurious IRQ caused by
234 * bus asynchronicity then don't eventually trigger an error, 222 * bus asynchronicity then don't eventually trigger an error,
235 * otherwise the couter becomes a doomsday timer for otherwise 223 * otherwise the counter becomes a doomsday timer for otherwise
236 * working systems 224 * working systems
237 */ 225 */
238 if (time_after(jiffies, desc->last_unhandled + HZ/10)) 226 if (time_after(jiffies, desc->last_unhandled + HZ/10))
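
Editor's note: with poll_all_shared_irqs() folded away, poll_spurious_irqs() is now a plain self-rearming timer callback: do the scan, then mod_timer() itself back onto the wheel. A sketch of that shape with hypothetical names, using the same pre-timer_setup() timer API the patch uses (callback takes an unsigned long):

/* Self-rearming poll timer; sketch only. */
#include <linux/timer.h>
#include <linux/jiffies.h>

#define MY_POLL_INTERVAL	(HZ / 10)

static struct timer_list my_poll_timer;

static void my_poll_fn(unsigned long dummy)
{
	/* ... scan for work here, as poll_spurious_irqs() scans the irqs ... */

	mod_timer(&my_poll_timer, jiffies + MY_POLL_INTERVAL);	/* re-arm */
}

static void my_poll_start(void)
{
	setup_timer(&my_poll_timer, my_poll_fn, 0);
	mod_timer(&my_poll_timer, jiffies + MY_POLL_INTERVAL);
}
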
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede528..d802883153da 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
146{ 146{
147 cputime_t cval, nval, cinterval, ninterval; 147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval; 148 s64 ns_ninterval, ns_nval;
149 u32 error, incr_error;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id]; 150 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150 151
151 nval = timeval_to_cputime(&value->it_value); 152 nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
153 ninterval = timeval_to_cputime(&value->it_interval); 154 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval); 155 ns_ninterval = timeval_to_ns(&value->it_interval);
155 156
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); 157 error = cputime_sub_ns(nval, ns_nval);
157 it->error = cputime_sub_ns(nval, ns_nval); 158 incr_error = cputime_sub_ns(ninterval, ns_ninterval);
158 159
159 spin_lock_irq(&tsk->sighand->siglock); 160 spin_lock_irq(&tsk->sighand->siglock);
160 161
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
168 } 169 }
169 it->expires = nval; 170 it->expires = nval;
170 it->incr = ninterval; 171 it->incr = ninterval;
172 it->error = error;
173 it->incr_error = incr_error;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ? 174 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval); 175 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173 176
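
Editor's note: the itimer.c change computes the two cputime error terms into locals before taking siglock and assigns the shared it->error / it->incr_error fields only while the lock is held, so other siglock holders never see the fields updated outside the lock. A plain userspace illustration of that ordering (pthreads, not kernel code; names are made up):

/* Compute outside the lock, publish under it. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static long state_error, state_incr_error;

static long derive(long x)
{
	return x * 1000 + 7;		/* stand-in for cputime_sub_ns() */
}

static void update_state(long nval, long ninterval)
{
	/* derive the new values before entering the critical section ... */
	long error = derive(nval);
	long incr_error = derive(ninterval);

	/* ... and only publish them under the lock */
	pthread_mutex_lock(&state_lock);
	state_error = error;
	state_incr_error = incr_error;
	pthread_mutex_unlock(&state_lock);
}

int main(void)
{
	update_state(3, 5);
	printf("error=%ld incr_error=%ld\n", state_error, state_incr_error);
	return 0;
}
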
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f336e2107f98..433e9fcc1fc5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -31,6 +31,7 @@
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h>
34 35
35#include <asm/page.h> 36#include <asm/page.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -1082,6 +1083,64 @@ void crash_kexec(struct pt_regs *regs)
1082 } 1083 }
1083} 1084}
1084 1085
1086size_t crash_get_memory_size(void)
1087{
1088 size_t size;
1089 mutex_lock(&kexec_mutex);
1090 size = crashk_res.end - crashk_res.start + 1;
1091 mutex_unlock(&kexec_mutex);
1092 return size;
1093}
1094
1095static void free_reserved_phys_range(unsigned long begin, unsigned long end)
1096{
1097 unsigned long addr;
1098
1099 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1100 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1101 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1102 free_page((unsigned long)__va(addr));
1103 totalram_pages++;
1104 }
1105}
1106
1107int crash_shrink_memory(unsigned long new_size)
1108{
1109 int ret = 0;
1110 unsigned long start, end;
1111
1112 mutex_lock(&kexec_mutex);
1113
1114 if (kexec_crash_image) {
1115 ret = -ENOENT;
1116 goto unlock;
1117 }
1118 start = crashk_res.start;
1119 end = crashk_res.end;
1120
1121 if (new_size >= end - start + 1) {
1122 ret = -EINVAL;
1123 if (new_size == end - start + 1)
1124 ret = 0;
1125 goto unlock;
1126 }
1127
1128 start = roundup(start, PAGE_SIZE);
1129 end = roundup(start + new_size, PAGE_SIZE);
1130
1131 free_reserved_phys_range(end, crashk_res.end);
1132
1133 if (start == end) {
1134 crashk_res.end = end;
1135 release_resource(&crashk_res);
1136 } else
1137 crashk_res.end = end - 1;
1138
1139unlock:
1140 mutex_unlock(&kexec_mutex);
1141 return ret;
1142}
1143
1085static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1144static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1086 size_t data_len) 1145 size_t data_len)
1087{ 1146{
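
Editor's note: crash_shrink_memory() keeps a page-aligned prefix of the crash kernel reservation and frees everything past the new end: start is rounded up to a page boundary, end is start + new_size rounded up, and free_reserved_phys_range(end, crashk_res.end) hands the tail back to the page allocator. A userspace walk-through of just that arithmetic, with sample numbers:

/* Page-rounding arithmetic from crash_shrink_memory(); sample values only. */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	unsigned long res_start = 0x1000000UL;	/* example crashk_res.start */
	unsigned long res_end   = 0x1ffffffUL;	/* example crashk_res.end (16 MB region) */
	unsigned long new_size  = 4UL << 20;	/* shrink reservation to 4 MB */

	unsigned long start = roundup(res_start, PAGE_SIZE);
	unsigned long end   = roundup(start + new_size, PAGE_SIZE);

	printf("keep  [%#lx, %#lx)\n", start, end);
	printf("free  [%#lx, %#lx]\n", end, res_end);
	return 0;
}
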
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..2eb517e23514 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread; 129struct task_struct *kgdb_contthread;
130 130
131int kgdb_single_step; 131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
132 133
133/* Our I/O buffers. */ 134/* Our I/O buffers. */
134static char remcom_in_buffer[BUFMAX]; 135static char remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
541 */ 542 */
542 if (tid == 0 || tid == -1) 543 if (tid == 0 || tid == -1)
543 tid = -atomic_read(&kgdb_active) - 2; 544 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) { 545 if (tid < -1 && tid > -NR_CPUS - 2) {
545 if (kgdb_info[-tid - 2].task) 546 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task; 547 return kgdb_info[-tid - 2].task;
547 else 548 else
548 return idle_task(-tid - 2); 549 return idle_task(-tid - 2);
549 } 550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
550 556
551 /* 557 /*
552 * find_task_by_pid_ns() does not take the tasklist lock anymore 558 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -619,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
619static int kgdb_activate_sw_breakpoints(void) 625static int kgdb_activate_sw_breakpoints(void)
620{ 626{
621 unsigned long addr; 627 unsigned long addr;
622 int error = 0; 628 int error;
629 int ret = 0;
623 int i; 630 int i;
624 631
625 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 632 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void)
629 addr = kgdb_break[i].bpt_addr; 636 addr = kgdb_break[i].bpt_addr;
630 error = kgdb_arch_set_breakpoint(addr, 637 error = kgdb_arch_set_breakpoint(addr,
631 kgdb_break[i].saved_instr); 638 kgdb_break[i].saved_instr);
632 if (error) 639 if (error) {
633 return error; 640 ret = error;
641 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
642 continue;
643 }
634 644
635 kgdb_flush_swbreak_addr(addr); 645 kgdb_flush_swbreak_addr(addr);
636 kgdb_break[i].state = BP_ACTIVE; 646 kgdb_break[i].state = BP_ACTIVE;
637 } 647 }
638 return 0; 648 return ret;
639} 649}
640 650
641static int kgdb_set_sw_break(unsigned long addr) 651static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr)
682static int kgdb_deactivate_sw_breakpoints(void) 692static int kgdb_deactivate_sw_breakpoints(void)
683{ 693{
684 unsigned long addr; 694 unsigned long addr;
685 int error = 0; 695 int error;
696 int ret = 0;
686 int i; 697 int i;
687 698
688 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 699 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
691 addr = kgdb_break[i].bpt_addr; 702 addr = kgdb_break[i].bpt_addr;
692 error = kgdb_arch_remove_breakpoint(addr, 703 error = kgdb_arch_remove_breakpoint(addr,
693 kgdb_break[i].saved_instr); 704 kgdb_break[i].saved_instr);
694 if (error) 705 if (error) {
695 return error; 706 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
707 ret = error;
708 }
696 709
697 kgdb_flush_swbreak_addr(addr); 710 kgdb_flush_swbreak_addr(addr);
698 kgdb_break[i].state = BP_SET; 711 kgdb_break[i].state = BP_SET;
699 } 712 }
700 return 0; 713 return ret;
701} 714}
702 715
703static int kgdb_remove_sw_break(unsigned long addr) 716static int kgdb_remove_sw_break(unsigned long addr)
@@ -870,7 +883,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
870 883
871 /* 884 /*
872 * All threads that don't have debuggerinfo should be 885 * All threads that don't have debuggerinfo should be
873 * in __schedule() sleeping, since all other CPUs 886 * in schedule() sleeping, since all other CPUs
874 * are in kgdb_wait, and thus have debuggerinfo. 887 * are in kgdb_wait, and thus have debuggerinfo.
875 */ 888 */
876 if (local_debuggerinfo) { 889 if (local_debuggerinfo) {
@@ -1204,8 +1217,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1204 return 1; 1217 return 1;
1205 1218
1206 } else { 1219 } else {
1207 error_packet(remcom_out_buffer, -EINVAL); 1220 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1208 return 0; 1221 " and 15 (pass and disconnect)\n"
1222 "Executing a continue without signal passing\n", 0);
1223 remcom_in_buffer[0] = 'c';
1209 } 1224 }
1210 1225
1211 /* Indicate fall through */ 1226 /* Indicate fall through */
@@ -1395,6 +1410,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1395 struct kgdb_state kgdb_var; 1410 struct kgdb_state kgdb_var;
1396 struct kgdb_state *ks = &kgdb_var; 1411 struct kgdb_state *ks = &kgdb_var;
1397 unsigned long flags; 1412 unsigned long flags;
1413 int sstep_tries = 100;
1398 int error = 0; 1414 int error = 0;
1399 int i, cpu; 1415 int i, cpu;
1400 1416
@@ -1425,13 +1441,14 @@ acquirelock:
1425 cpu_relax(); 1441 cpu_relax();
1426 1442
1427 /* 1443 /*
1428 * Do not start the debugger connection on this CPU if the last 1444 * For single stepping, try to only enter on the processor
1429 * instance of the exception handler wanted to come into the 1445 * that was single stepping. To guard against a deadlock, the
1430 * debugger on a different CPU via a single step 1446 * kernel will only try for the value of sstep_tries before
1447 * giving up and continuing on.
1431 */ 1448 */
1432 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && 1449 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1450 (kgdb_info[cpu].task &&
1434 1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1435 atomic_set(&kgdb_active, -1); 1452 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog(); 1453 touch_softlockup_watchdog();
1437 clocksource_touch_watchdog(); 1454 clocksource_touch_watchdog();
@@ -1524,6 +1541,13 @@ acquirelock:
1524 } 1541 }
1525 1542
1526kgdb_restore: 1543kgdb_restore:
1544 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1545 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1546 if (kgdb_info[sstep_cpu].task)
1547 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1548 else
1549 kgdb_sstep_pid = 0;
1550 }
1527 /* Free kgdb_active */ 1551 /* Free kgdb_active */
1528 atomic_set(&kgdb_active, -1); 1552 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog(); 1553 touch_softlockup_watchdog();
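The sstep_tries additions above make the other processors defer to the one known to be single-stepping, but only a bounded number of times, so a stale owner can never wedge entry into the debugger; kgdb_sstep_pid is refreshed at kgdb_restore so the next exception knows which task was stepping. A rough userspace sketch of the bounded back-off idea, using illustrative names (sstep_owner, try_enter_debugger) rather than the kernel's:

/*
 * A thread defers to the known single-stepping owner, but only a
 * limited number of times, so a stale owner value cannot block it
 * forever.  sched_yield() stands in for cpu_relax().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

static atomic_int sstep_owner = -1;	/* -1: nobody is single stepping */

static bool try_enter_debugger(int self)
{
	int tries = 100;

	for (;;) {
		int owner = atomic_load(&sstep_owner);

		if (owner == -1 || owner == self)
			return true;	/* we may enter */
		if (--tries == 0)
			return true;	/* stop deferring, enter anyway */
		sched_yield();
	}
}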
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9fcb53a11f87..25b103190364 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
81 static int kmod_loop_msg; 81 static int kmod_loop_msg;
82 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
87 va_start(args, fmt); 83 va_start(args, fmt);
88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 84 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
89 va_end(args); 85 va_end(args);
90 if (ret >= MODULE_NAME_LEN) 86 if (ret >= MODULE_NAME_LEN)
91 return -ENAMETOOLONG; 87 return -ENAMETOOLONG;
92 88
89 ret = security_kernel_module_request(module_name);
90 if (ret)
91 return ret;
92
93 /* If modprobe needs a service that is in a module, we get a recursive 93 /* If modprobe needs a service that is in a module, we get a recursive
94 * loop. Limit the number of running kmod threads to max_threads/2 or 94 * loop. Limit the number of running kmod threads to max_threads/2 or
95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 84495958e703..e5342a344c43 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1035,9 +1035,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1035 /* Pre-allocate memory for max kretprobe instances */ 1035 /* Pre-allocate memory for max kretprobe instances */
1036 if (rp->maxactive <= 0) { 1036 if (rp->maxactive <= 0) {
1037#ifdef CONFIG_PREEMPT 1037#ifdef CONFIG_PREEMPT
1038 rp->maxactive = max(10, 2 * NR_CPUS); 1038 rp->maxactive = max(10, 2 * num_possible_cpus());
1039#else 1039#else
1040 rp->maxactive = NR_CPUS; 1040 rp->maxactive = num_possible_cpus();
1041#endif 1041#endif
1042 } 1042 }
1043 spin_lock_init(&rp->lock); 1043 spin_lock_init(&rp->lock);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 528dd78e7e7e..3feaf5a74514 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
100} 100}
101KERNEL_ATTR_RO(kexec_crash_loaded); 101KERNEL_ATTR_RO(kexec_crash_loaded);
102 102
103static ssize_t kexec_crash_size_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf)
105{
106 return sprintf(buf, "%zu\n", crash_get_memory_size());
107}
108static ssize_t kexec_crash_size_store(struct kobject *kobj,
109 struct kobj_attribute *attr,
110 const char *buf, size_t count)
111{
112 unsigned long cnt;
113 int ret;
114
115 if (strict_strtoul(buf, 0, &cnt))
116 return -EINVAL;
117
118 ret = crash_shrink_memory(cnt);
119 return ret < 0 ? ret : count;
120}
121KERNEL_ATTR_RW(kexec_crash_size);
122
103static ssize_t vmcoreinfo_show(struct kobject *kobj, 123static ssize_t vmcoreinfo_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf) 124 struct kobj_attribute *attr, char *buf)
105{ 125{
@@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = {
147#ifdef CONFIG_KEXEC 167#ifdef CONFIG_KEXEC
148 &kexec_loaded_attr.attr, 168 &kexec_loaded_attr.attr,
149 &kexec_crash_loaded_attr.attr, 169 &kexec_crash_loaded_attr.attr,
170 &kexec_crash_size_attr.attr,
150 &vmcoreinfo_attr.attr, 171 &vmcoreinfo_attr.attr,
151#endif 172#endif
152 NULL 173 NULL
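The new kexec_crash_size attribute follows the usual sysfs read/write shape: show prints the current reservation, store parses the user's number, hands it to a helper, and returns either the helper's error or the full byte count. A userspace sketch of that parse-then-act shape, with shrink_to() as a hypothetical stand-in for crash_shrink_memory():

/*
 * Parse the user string, forward the value, and report either the
 * helper's error or "all bytes consumed".
 */
#include <errno.h>
#include <stdlib.h>

static int shrink_to(unsigned long bytes)
{
	(void)bytes;
	return 0;			/* pretend the resize always works */
}

static long store_size(const char *buf, size_t count)
{
	char *end;
	unsigned long cnt;
	int ret;

	errno = 0;
	cnt = strtoul(buf, &end, 0);
	if (errno || end == buf)
		return -EINVAL;		/* crude stand-in for strict_strtoul()'s validation */

	ret = shrink_to(cnt);
	return ret < 0 ? ret : (long)count;
}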
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f5dcd36d3151..5feaddcdbe49 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -73,11 +73,11 @@ module_param(lock_stat, int, 0644);
73 * to use a raw spinlock - we really dont want the spinlock 73 * to use a raw spinlock - we really dont want the spinlock
74 * code to recurse back into the lockdep code... 74 * code to recurse back into the lockdep code...
75 */ 75 */
76static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 76static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
77 77
78static int graph_lock(void) 78static int graph_lock(void)
79{ 79{
80 __raw_spin_lock(&lockdep_lock); 80 arch_spin_lock(&lockdep_lock);
81 /* 81 /*
82 * Make sure that if another CPU detected a bug while 82 * Make sure that if another CPU detected a bug while
83 * walking the graph we dont change it (while the other 83 * walking the graph we dont change it (while the other
@@ -85,7 +85,7 @@ static int graph_lock(void)
85 * dropped already) 85 * dropped already)
86 */ 86 */
87 if (!debug_locks) { 87 if (!debug_locks) {
88 __raw_spin_unlock(&lockdep_lock); 88 arch_spin_unlock(&lockdep_lock);
89 return 0; 89 return 0;
90 } 90 }
91 /* prevent any recursions within lockdep from causing deadlocks */ 91 /* prevent any recursions within lockdep from causing deadlocks */
@@ -95,11 +95,11 @@ static int graph_lock(void)
95 95
96static inline int graph_unlock(void) 96static inline int graph_unlock(void)
97{ 97{
98 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 98 if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
99 return DEBUG_LOCKS_WARN_ON(1); 99 return DEBUG_LOCKS_WARN_ON(1);
100 100
101 current->lockdep_recursion--; 101 current->lockdep_recursion--;
102 __raw_spin_unlock(&lockdep_lock); 102 arch_spin_unlock(&lockdep_lock);
103 return 0; 103 return 0;
104} 104}
105 105
@@ -111,7 +111,7 @@ static inline int debug_locks_off_graph_unlock(void)
111{ 111{
112 int ret = debug_locks_off(); 112 int ret = debug_locks_off();
113 113
114 __raw_spin_unlock(&lockdep_lock); 114 arch_spin_unlock(&lockdep_lock);
115 115
116 return ret; 116 return ret;
117} 117}
@@ -140,7 +140,8 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
140} 140}
141 141
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
144 cpu_lock_stats);
144 145
145static inline u64 lockstat_clock(void) 146static inline u64 lockstat_clock(void)
146{ 147{
@@ -168,7 +169,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
168 if (time > lt->max) 169 if (time > lt->max)
169 lt->max = time; 170 lt->max = time;
170 171
171 if (time < lt->min || !lt->min) 172 if (time < lt->min || !lt->nr)
172 lt->min = time; 173 lt->min = time;
173 174
174 lt->total += time; 175 lt->total += time;
@@ -177,8 +178,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
177 178
178static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) 179static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
179{ 180{
180 dst->min += src->min; 181 if (!src->nr)
181 dst->max += src->max; 182 return;
183
184 if (src->max > dst->max)
185 dst->max = src->max;
186
187 if (src->min < dst->min || !dst->nr)
188 dst->min = src->min;
189
182 dst->total += src->total; 190 dst->total += src->total;
183 dst->nr += src->nr; 191 dst->nr += src->nr;
184} 192}
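The lock_time changes above fix two statistics bugs: lock_time_inc() now treats "no samples yet" as !lt->nr rather than a zero minimum, and lock_time_add() merges minima and maxima instead of summing them, skipping empty sources. A standalone sketch of the corrected merge; the struct is re-declared here only so the snippet compiles on its own:

#include <stdint.h>

struct lock_time {
	uint64_t min, max, total;
	unsigned long nr;
};

static void lock_time_add(const struct lock_time *src, struct lock_time *dst)
{
	if (!src->nr)			/* nothing measured, nothing to merge */
		return;

	if (src->max > dst->max)
		dst->max = src->max;
	if (src->min < dst->min || !dst->nr)
		dst->min = src->min;	/* first real sample sets the minimum */

	dst->total += src->total;
	dst->nr += src->nr;
}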
@@ -191,7 +199,7 @@ struct lock_class_stats lock_stats(struct lock_class *class)
191 memset(&stats, 0, sizeof(struct lock_class_stats)); 199 memset(&stats, 0, sizeof(struct lock_class_stats));
192 for_each_possible_cpu(cpu) { 200 for_each_possible_cpu(cpu) {
193 struct lock_class_stats *pcs = 201 struct lock_class_stats *pcs =
194 &per_cpu(lock_stats, cpu)[class - lock_classes]; 202 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
195 203
196 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 204 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
197 stats.contention_point[i] += pcs->contention_point[i]; 205 stats.contention_point[i] += pcs->contention_point[i];
@@ -218,7 +226,7 @@ void clear_lock_stats(struct lock_class *class)
218 226
219 for_each_possible_cpu(cpu) { 227 for_each_possible_cpu(cpu) {
220 struct lock_class_stats *cpu_stats = 228 struct lock_class_stats *cpu_stats =
221 &per_cpu(lock_stats, cpu)[class - lock_classes]; 229 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
222 230
223 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 231 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
224 } 232 }
@@ -228,12 +236,12 @@ void clear_lock_stats(struct lock_class *class)
228 236
229static struct lock_class_stats *get_lock_stats(struct lock_class *class) 237static struct lock_class_stats *get_lock_stats(struct lock_class *class)
230{ 238{
231 return &get_cpu_var(lock_stats)[class - lock_classes]; 239 return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
232} 240}
233 241
234static void put_lock_stats(struct lock_class_stats *stats) 242static void put_lock_stats(struct lock_class_stats *stats)
235{ 243{
236 put_cpu_var(lock_stats); 244 put_cpu_var(cpu_lock_stats);
237} 245}
238 246
239static void lock_release_holdtime(struct held_lock *hlock) 247static void lock_release_holdtime(struct held_lock *hlock)
@@ -379,7 +387,8 @@ static int save_trace(struct stack_trace *trace)
379 * complete trace that maxes out the entries provided will be reported 387 * complete trace that maxes out the entries provided will be reported
380 * as incomplete, friggin useless </rant> 388 * as incomplete, friggin useless </rant>
381 */ 389 */
382 if (trace->entries[trace->nr_entries-1] == ULONG_MAX) 390 if (trace->nr_entries != 0 &&
391 trace->entries[trace->nr_entries-1] == ULONG_MAX)
383 trace->nr_entries--; 392 trace->nr_entries--;
384 393
385 trace->max_entries = trace->nr_entries; 394 trace->max_entries = trace->nr_entries;
@@ -1161,9 +1170,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1161 this.class = class; 1170 this.class = class;
1162 1171
1163 local_irq_save(flags); 1172 local_irq_save(flags);
1164 __raw_spin_lock(&lockdep_lock); 1173 arch_spin_lock(&lockdep_lock);
1165 ret = __lockdep_count_forward_deps(&this); 1174 ret = __lockdep_count_forward_deps(&this);
1166 __raw_spin_unlock(&lockdep_lock); 1175 arch_spin_unlock(&lockdep_lock);
1167 local_irq_restore(flags); 1176 local_irq_restore(flags);
1168 1177
1169 return ret; 1178 return ret;
@@ -1188,9 +1197,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1188 this.class = class; 1197 this.class = class;
1189 1198
1190 local_irq_save(flags); 1199 local_irq_save(flags);
1191 __raw_spin_lock(&lockdep_lock); 1200 arch_spin_lock(&lockdep_lock);
1192 ret = __lockdep_count_backward_deps(&this); 1201 ret = __lockdep_count_backward_deps(&this);
1193 __raw_spin_unlock(&lockdep_lock); 1202 arch_spin_unlock(&lockdep_lock);
1194 local_irq_restore(flags); 1203 local_irq_restore(flags);
1195 1204
1196 return ret; 1205 return ret;
diff --git a/kernel/module.c b/kernel/module.c
index 8b7d8805819d..a65dc787a27b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module);
370 370
371#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
372 372
373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
374
375static void *percpu_modalloc(unsigned long size, unsigned long align, 373static void *percpu_modalloc(unsigned long size, unsigned long align,
376 const char *name) 374 const char *name)
377{ 375{
@@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme)
395 free_percpu(freeme); 393 free_percpu(freeme);
396} 394}
397 395
398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
399
400/* Number of blocks used and allocated. */
401static unsigned int pcpu_num_used, pcpu_num_allocated;
402/* Size of each block. -ve means used. */
403static int *pcpu_size;
404
405static int split_block(unsigned int i, unsigned short size)
406{
407 /* Reallocation required? */
408 if (pcpu_num_used + 1 > pcpu_num_allocated) {
409 int *new;
410
411 new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
412 GFP_KERNEL);
413 if (!new)
414 return 0;
415
416 pcpu_num_allocated *= 2;
417 pcpu_size = new;
418 }
419
420 /* Insert a new subblock */
421 memmove(&pcpu_size[i+1], &pcpu_size[i],
422 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
423 pcpu_num_used++;
424
425 pcpu_size[i+1] -= size;
426 pcpu_size[i] = size;
427 return 1;
428}
429
430static inline unsigned int block_size(int val)
431{
432 if (val < 0)
433 return -val;
434 return val;
435}
436
437static void *percpu_modalloc(unsigned long size, unsigned long align,
438 const char *name)
439{
440 unsigned long extra;
441 unsigned int i;
442 void *ptr;
443 int cpu;
444
445 if (align > PAGE_SIZE) {
446 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
447 name, align, PAGE_SIZE);
448 align = PAGE_SIZE;
449 }
450
451 ptr = __per_cpu_start;
452 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
453 /* Extra for alignment requirement. */
454 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
455 BUG_ON(i == 0 && extra != 0);
456
457 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
458 continue;
459
460 /* Transfer extra to previous block. */
461 if (pcpu_size[i-1] < 0)
462 pcpu_size[i-1] -= extra;
463 else
464 pcpu_size[i-1] += extra;
465 pcpu_size[i] -= extra;
466 ptr += extra;
467
468 /* Split block if warranted */
469 if (pcpu_size[i] - size > sizeof(unsigned long))
470 if (!split_block(i, size))
471 return NULL;
472
473 /* add the per-cpu scanning areas */
474 for_each_possible_cpu(cpu)
475 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
476 GFP_KERNEL);
477
478 /* Mark allocated */
479 pcpu_size[i] = -pcpu_size[i];
480 return ptr;
481 }
482
483 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
484 size);
485 return NULL;
486}
487
488static void percpu_modfree(void *freeme)
489{
490 unsigned int i;
491 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
492 int cpu;
493
494 /* First entry is core kernel percpu data. */
495 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
496 if (ptr == freeme) {
497 pcpu_size[i] = -pcpu_size[i];
498 goto free;
499 }
500 }
501 BUG();
502
503 free:
504 /* remove the per-cpu scanning areas */
505 for_each_possible_cpu(cpu)
506 kmemleak_free(freeme + per_cpu_offset(cpu));
507
508 /* Merge with previous? */
509 if (pcpu_size[i-1] >= 0) {
510 pcpu_size[i-1] += pcpu_size[i];
511 pcpu_num_used--;
512 memmove(&pcpu_size[i], &pcpu_size[i+1],
513 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
514 i--;
515 }
516 /* Merge with next? */
517 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
518 pcpu_size[i] += pcpu_size[i+1];
519 pcpu_num_used--;
520 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
521 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
522 }
523}
524
525static int percpu_modinit(void)
526{
527 pcpu_num_used = 2;
528 pcpu_num_allocated = 2;
529 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
530 GFP_KERNEL);
531 /* Static in-kernel percpu data (used). */
532 pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
533 /* Free room. */
534 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
535 if (pcpu_size[1] < 0) {
536 printk(KERN_ERR "No per-cpu room for modules.\n");
537 pcpu_num_used = 1;
538 }
539
540 return 0;
541}
542__initcall(percpu_modinit);
543
544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
545
546static unsigned int find_pcpusec(Elf_Ehdr *hdr, 396static unsigned int find_pcpusec(Elf_Ehdr *hdr,
547 Elf_Shdr *sechdrs, 397 Elf_Shdr *sechdrs,
548 const char *secstrings) 398 const char *secstrings)
@@ -1030,11 +880,23 @@ static int try_to_force_load(struct module *mod, const char *reason)
1030} 880}
1031 881
1032#ifdef CONFIG_MODVERSIONS 882#ifdef CONFIG_MODVERSIONS
883/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
884static unsigned long maybe_relocated(unsigned long crc,
885 const struct module *crc_owner)
886{
887#ifdef ARCH_RELOCATES_KCRCTAB
888 if (crc_owner == NULL)
889 return crc - (unsigned long)reloc_start;
890#endif
891 return crc;
892}
893
1033static int check_version(Elf_Shdr *sechdrs, 894static int check_version(Elf_Shdr *sechdrs,
1034 unsigned int versindex, 895 unsigned int versindex,
1035 const char *symname, 896 const char *symname,
1036 struct module *mod, 897 struct module *mod,
1037 const unsigned long *crc) 898 const unsigned long *crc,
899 const struct module *crc_owner)
1038{ 900{
1039 unsigned int i, num_versions; 901 unsigned int i, num_versions;
1040 struct modversion_info *versions; 902 struct modversion_info *versions;
@@ -1055,10 +917,10 @@ static int check_version(Elf_Shdr *sechdrs,
1055 if (strcmp(versions[i].name, symname) != 0) 917 if (strcmp(versions[i].name, symname) != 0)
1056 continue; 918 continue;
1057 919
1058 if (versions[i].crc == *crc) 920 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 921 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 922 DEBUGP("Found checksum %lX vs module %lX\n",
1061 *crc, versions[i].crc); 923 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 924 goto bad_version;
1063 } 925 }
1064 926
@@ -1081,7 +943,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1081 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 943 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1082 &crc, true, false)) 944 &crc, true, false))
1083 BUG(); 945 BUG();
1084 return check_version(sechdrs, versindex, "module_layout", mod, crc); 946 return check_version(sechdrs, versindex, "module_layout", mod, crc,
947 NULL);
1085} 948}
1086 949
1087/* First part is kernel version, which we ignore if module has crcs. */ 950/* First part is kernel version, which we ignore if module has crcs. */
@@ -1099,7 +962,8 @@ static inline int check_version(Elf_Shdr *sechdrs,
1099 unsigned int versindex, 962 unsigned int versindex,
1100 const char *symname, 963 const char *symname,
1101 struct module *mod, 964 struct module *mod,
1102 const unsigned long *crc) 965 const unsigned long *crc,
966 const struct module *crc_owner)
1103{ 967{
1104 return 1; 968 return 1;
1105} 969}
@@ -1134,8 +998,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1134 /* use_module can fail due to OOM, 998 /* use_module can fail due to OOM,
1135 or module initialization or unloading */ 999 or module initialization or unloading */
1136 if (sym) { 1000 if (sym) {
1137 if (!check_version(sechdrs, versindex, name, mod, crc) || 1001 if (!check_version(sechdrs, versindex, name, mod, crc, owner)
1138 !use_module(mod, owner)) 1002 || !use_module(mod, owner))
1139 sym = NULL; 1003 sym = NULL;
1140 } 1004 }
1141 return sym; 1005 return sym;
@@ -1187,7 +1051,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1187 1051
1188 /* Count loaded sections and allocate structures */ 1052 /* Count loaded sections and allocate structures */
1189 for (i = 0; i < nsect; i++) 1053 for (i = 0; i < nsect; i++)
1190 if (sechdrs[i].sh_flags & SHF_ALLOC) 1054 if (sechdrs[i].sh_flags & SHF_ALLOC
1055 && sechdrs[i].sh_size)
1191 nloaded++; 1056 nloaded++;
1192 size[0] = ALIGN(sizeof(*sect_attrs) 1057 size[0] = ALIGN(sizeof(*sect_attrs)
1193 + nloaded * sizeof(sect_attrs->attrs[0]), 1058 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1207,6 +1072,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1207 for (i = 0; i < nsect; i++) { 1072 for (i = 0; i < nsect; i++) {
1208 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1073 if (! (sechdrs[i].sh_flags & SHF_ALLOC))
1209 continue; 1074 continue;
1075 if (!sechdrs[i].sh_size)
1076 continue;
1210 sattr->address = sechdrs[i].sh_addr; 1077 sattr->address = sechdrs[i].sh_addr;
1211 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1078 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
1212 GFP_KERNEL); 1079 GFP_KERNEL);
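check_version() now compares against maybe_relocated(*crc, crc_owner): on architectures that relocate the kernel's kcrctab, a CRC exported by the kernel itself (crc_owner == NULL) is shifted back before the comparison. A small sketch of that conditional normalization; RELOCATES_KCRCTAB and reloc_base are illustrative stand-ins for ARCH_RELOCATES_KCRCTAB and reloc_start:

#include <stdbool.h>

#define RELOCATES_KCRCTAB 1
static unsigned long reloc_base = 0x1000;	/* made-up relocation offset */

static unsigned long maybe_relocated(unsigned long crc, const void *owner)
{
#if RELOCATES_KCRCTAB
	if (owner == NULL)
		return crc - reloc_base;	/* built-in symbol: undo the relocation */
#endif
	return crc;
}

static bool crc_matches(unsigned long stored, unsigned long exported,
			const void *owner)
{
	return stored == maybe_relocated(exported, owner);
}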
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 6b2d735846a5..57d527a16f9d 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock)
43 \ 43 \
44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \ 44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \
45 local_irq_save(flags); \ 45 local_irq_save(flags); \
46 __raw_spin_lock(&(lock)->raw_lock); \ 46 arch_spin_lock(&(lock)->rlock.raw_lock);\
47 DEBUG_LOCKS_WARN_ON(l->magic != l); \ 47 DEBUG_LOCKS_WARN_ON(l->magic != l); \
48 } while (0) 48 } while (0)
49 49
50#define spin_unlock_mutex(lock, flags) \ 50#define spin_unlock_mutex(lock, flags) \
51 do { \ 51 do { \
52 __raw_spin_unlock(&(lock)->raw_lock); \ 52 arch_spin_unlock(&(lock)->rlock.raw_lock); \
53 local_irq_restore(flags); \ 53 local_irq_restore(flags); \
54 preempt_check_resched(); \ 54 preempt_check_resched(); \
55 } while (0) 55 } while (0)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 947b3ad551f8..632f04c57d82 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
148 148
149 preempt_disable(); 149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip); 150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ 151
152 !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) 152#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
153 /* 153 /*
154 * Optimistic spinning. 154 * Optimistic spinning.
155 * 155 *
diff --git a/kernel/panic.c b/kernel/panic.c
index 96b45d0b4ba5..5827f7b97254 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/debug_locks.h> 11#include <linux/debug_locks.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/kmsg_dump.h>
13#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
14#include <linux/notifier.h> 15#include <linux/notifier.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -74,6 +75,7 @@ NORET_TYPE void panic(const char * fmt, ...)
74 dump_stack(); 75 dump_stack();
75#endif 76#endif
76 77
78 kmsg_dump(KMSG_DUMP_PANIC);
77 /* 79 /*
78 * If we have crashed and we have a crash kernel loaded let it handle 80 * If we have crashed and we have a crash kernel loaded let it handle
79 * everything else. 81 * everything else.
@@ -339,6 +341,7 @@ void oops_exit(void)
339{ 341{
340 do_oops_enter_exit(); 342 do_oops_enter_exit();
341 print_oops_end_marker(); 343 print_oops_end_marker();
344 kmsg_dump(KMSG_DUMP_OOPS);
342} 345}
343 346
344#ifdef WANT_WARN_ON_SLOWPATH 347#ifdef WANT_WARN_ON_SLOWPATH
diff --git a/kernel/params.c b/kernel/params.c
index d656c276508d..cf1b69183127 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,7 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
27 28
28#if 0 29#if 0
29#define DEBUGP printk 30#define DEBUGP printk
@@ -122,9 +123,7 @@ static char *next_arg(char *args, char **param, char **val)
122 next = args + i; 123 next = args + i;
123 124
124 /* Chew up trailing spaces. */ 125 /* Chew up trailing spaces. */
125 while (isspace(*next)) 126 return skip_spaces(next);
126 next++;
127 return next;
128} 127}
129 128
130/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 129/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
@@ -139,8 +138,7 @@ int parse_args(const char *name,
139 DEBUGP("Parsing ARGS: %s\n", args); 138 DEBUGP("Parsing ARGS: %s\n", args);
140 139
141 /* Chew leading spaces */ 140 /* Chew leading spaces */
142 while (isspace(*args)) 141 args = skip_spaces(args);
143 args++;
144 142
145 while (*args) { 143 while (*args) {
146 int ret; 144 int ret;
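Both open-coded isspace() loops in the parameter parser are replaced by skip_spaces(). A minimal userspace equivalent, assuming the kernel semantics of returning a pointer to the first non-space character:

#include <ctype.h>

static char *skip_spaces(const char *str)
{
	while (isspace((unsigned char)*str))
		str++;
	return (char *)str;
}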
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2e0aaa34fc7e..8ab86988bd24 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -203,14 +203,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
203 * if so. If we locked the right context, then it 203 * if so. If we locked the right context, then it
204 * can't get swapped on us any more. 204 * can't get swapped on us any more.
205 */ 205 */
206 spin_lock_irqsave(&ctx->lock, *flags); 206 raw_spin_lock_irqsave(&ctx->lock, *flags);
207 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 207 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
208 spin_unlock_irqrestore(&ctx->lock, *flags); 208 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
209 goto retry; 209 goto retry;
210 } 210 }
211 211
212 if (!atomic_inc_not_zero(&ctx->refcount)) { 212 if (!atomic_inc_not_zero(&ctx->refcount)) {
213 spin_unlock_irqrestore(&ctx->lock, *flags); 213 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
214 ctx = NULL; 214 ctx = NULL;
215 } 215 }
216 } 216 }
@@ -231,7 +231,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task
231 ctx = perf_lock_task_context(task, &flags); 231 ctx = perf_lock_task_context(task, &flags);
232 if (ctx) { 232 if (ctx) {
233 ++ctx->pin_count; 233 ++ctx->pin_count;
234 spin_unlock_irqrestore(&ctx->lock, flags); 234 raw_spin_unlock_irqrestore(&ctx->lock, flags);
235 } 235 }
236 return ctx; 236 return ctx;
237} 237}
@@ -240,9 +240,9 @@ static void perf_unpin_context(struct perf_event_context *ctx)
240{ 240{
241 unsigned long flags; 241 unsigned long flags;
242 242
243 spin_lock_irqsave(&ctx->lock, flags); 243 raw_spin_lock_irqsave(&ctx->lock, flags);
244 --ctx->pin_count; 244 --ctx->pin_count;
245 spin_unlock_irqrestore(&ctx->lock, flags); 245 raw_spin_unlock_irqrestore(&ctx->lock, flags);
246 put_ctx(ctx); 246 put_ctx(ctx);
247} 247}
248 248
@@ -427,7 +427,7 @@ static void __perf_event_remove_from_context(void *info)
427 if (ctx->task && cpuctx->task_ctx != ctx) 427 if (ctx->task && cpuctx->task_ctx != ctx)
428 return; 428 return;
429 429
430 spin_lock(&ctx->lock); 430 raw_spin_lock(&ctx->lock);
431 /* 431 /*
432 * Protect the list operation against NMI by disabling the 432 * Protect the list operation against NMI by disabling the
433 * events on a global level. 433 * events on a global level.
@@ -449,7 +449,7 @@ static void __perf_event_remove_from_context(void *info)
449 } 449 }
450 450
451 perf_enable(); 451 perf_enable();
452 spin_unlock(&ctx->lock); 452 raw_spin_unlock(&ctx->lock);
453} 453}
454 454
455 455
@@ -476,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
476 if (!task) { 476 if (!task) {
477 /* 477 /*
478 * Per cpu events are removed via an smp call and 478 * Per cpu events are removed via an smp call and
479 * the removal is always sucessful. 479 * the removal is always successful.
480 */ 480 */
481 smp_call_function_single(event->cpu, 481 smp_call_function_single(event->cpu,
482 __perf_event_remove_from_context, 482 __perf_event_remove_from_context,
@@ -488,12 +488,12 @@ retry:
488 task_oncpu_function_call(task, __perf_event_remove_from_context, 488 task_oncpu_function_call(task, __perf_event_remove_from_context,
489 event); 489 event);
490 490
491 spin_lock_irq(&ctx->lock); 491 raw_spin_lock_irq(&ctx->lock);
492 /* 492 /*
493 * If the context is active we need to retry the smp call. 493 * If the context is active we need to retry the smp call.
494 */ 494 */
495 if (ctx->nr_active && !list_empty(&event->group_entry)) { 495 if (ctx->nr_active && !list_empty(&event->group_entry)) {
496 spin_unlock_irq(&ctx->lock); 496 raw_spin_unlock_irq(&ctx->lock);
497 goto retry; 497 goto retry;
498 } 498 }
499 499
@@ -504,7 +504,7 @@ retry:
504 */ 504 */
505 if (!list_empty(&event->group_entry)) 505 if (!list_empty(&event->group_entry))
506 list_del_event(event, ctx); 506 list_del_event(event, ctx);
507 spin_unlock_irq(&ctx->lock); 507 raw_spin_unlock_irq(&ctx->lock);
508} 508}
509 509
510/* 510/*
@@ -535,7 +535,7 @@ static void __perf_event_disable(void *info)
535 if (ctx->task && cpuctx->task_ctx != ctx) 535 if (ctx->task && cpuctx->task_ctx != ctx)
536 return; 536 return;
537 537
538 spin_lock(&ctx->lock); 538 raw_spin_lock(&ctx->lock);
539 539
540 /* 540 /*
541 * If the event is on, turn it off. 541 * If the event is on, turn it off.
@@ -551,7 +551,7 @@ static void __perf_event_disable(void *info)
551 event->state = PERF_EVENT_STATE_OFF; 551 event->state = PERF_EVENT_STATE_OFF;
552 } 552 }
553 553
554 spin_unlock(&ctx->lock); 554 raw_spin_unlock(&ctx->lock);
555} 555}
556 556
557/* 557/*
@@ -584,12 +584,12 @@ void perf_event_disable(struct perf_event *event)
584 retry: 584 retry:
585 task_oncpu_function_call(task, __perf_event_disable, event); 585 task_oncpu_function_call(task, __perf_event_disable, event);
586 586
587 spin_lock_irq(&ctx->lock); 587 raw_spin_lock_irq(&ctx->lock);
588 /* 588 /*
589 * If the event is still active, we need to retry the cross-call. 589 * If the event is still active, we need to retry the cross-call.
590 */ 590 */
591 if (event->state == PERF_EVENT_STATE_ACTIVE) { 591 if (event->state == PERF_EVENT_STATE_ACTIVE) {
592 spin_unlock_irq(&ctx->lock); 592 raw_spin_unlock_irq(&ctx->lock);
593 goto retry; 593 goto retry;
594 } 594 }
595 595
@@ -602,7 +602,7 @@ void perf_event_disable(struct perf_event *event)
602 event->state = PERF_EVENT_STATE_OFF; 602 event->state = PERF_EVENT_STATE_OFF;
603 } 603 }
604 604
605 spin_unlock_irq(&ctx->lock); 605 raw_spin_unlock_irq(&ctx->lock);
606} 606}
607 607
608static int 608static int
@@ -770,7 +770,7 @@ static void __perf_install_in_context(void *info)
770 cpuctx->task_ctx = ctx; 770 cpuctx->task_ctx = ctx;
771 } 771 }
772 772
773 spin_lock(&ctx->lock); 773 raw_spin_lock(&ctx->lock);
774 ctx->is_active = 1; 774 ctx->is_active = 1;
775 update_context_time(ctx); 775 update_context_time(ctx);
776 776
@@ -823,7 +823,7 @@ static void __perf_install_in_context(void *info)
823 unlock: 823 unlock:
824 perf_enable(); 824 perf_enable();
825 825
826 spin_unlock(&ctx->lock); 826 raw_spin_unlock(&ctx->lock);
827} 827}
828 828
829/* 829/*
@@ -848,7 +848,7 @@ perf_install_in_context(struct perf_event_context *ctx,
848 if (!task) { 848 if (!task) {
849 /* 849 /*
850 * Per cpu events are installed via an smp call and 850 * Per cpu events are installed via an smp call and
851 * the install is always sucessful. 851 * the install is always successful.
852 */ 852 */
853 smp_call_function_single(cpu, __perf_install_in_context, 853 smp_call_function_single(cpu, __perf_install_in_context,
854 event, 1); 854 event, 1);
@@ -859,12 +859,12 @@ retry:
859 task_oncpu_function_call(task, __perf_install_in_context, 859 task_oncpu_function_call(task, __perf_install_in_context,
860 event); 860 event);
861 861
862 spin_lock_irq(&ctx->lock); 862 raw_spin_lock_irq(&ctx->lock);
863 /* 863 /*
864 * we need to retry the smp call. 864 * we need to retry the smp call.
865 */ 865 */
866 if (ctx->is_active && list_empty(&event->group_entry)) { 866 if (ctx->is_active && list_empty(&event->group_entry)) {
867 spin_unlock_irq(&ctx->lock); 867 raw_spin_unlock_irq(&ctx->lock);
868 goto retry; 868 goto retry;
869 } 869 }
870 870
@@ -875,7 +875,7 @@ retry:
875 */ 875 */
876 if (list_empty(&event->group_entry)) 876 if (list_empty(&event->group_entry))
877 add_event_to_ctx(event, ctx); 877 add_event_to_ctx(event, ctx);
878 spin_unlock_irq(&ctx->lock); 878 raw_spin_unlock_irq(&ctx->lock);
879} 879}
880 880
881/* 881/*
@@ -920,7 +920,7 @@ static void __perf_event_enable(void *info)
920 cpuctx->task_ctx = ctx; 920 cpuctx->task_ctx = ctx;
921 } 921 }
922 922
923 spin_lock(&ctx->lock); 923 raw_spin_lock(&ctx->lock);
924 ctx->is_active = 1; 924 ctx->is_active = 1;
925 update_context_time(ctx); 925 update_context_time(ctx);
926 926
@@ -965,7 +965,7 @@ static void __perf_event_enable(void *info)
965 } 965 }
966 966
967 unlock: 967 unlock:
968 spin_unlock(&ctx->lock); 968 raw_spin_unlock(&ctx->lock);
969} 969}
970 970
971/* 971/*
@@ -991,7 +991,7 @@ void perf_event_enable(struct perf_event *event)
991 return; 991 return;
992 } 992 }
993 993
994 spin_lock_irq(&ctx->lock); 994 raw_spin_lock_irq(&ctx->lock);
995 if (event->state >= PERF_EVENT_STATE_INACTIVE) 995 if (event->state >= PERF_EVENT_STATE_INACTIVE)
996 goto out; 996 goto out;
997 997
@@ -1006,10 +1006,10 @@ void perf_event_enable(struct perf_event *event)
1006 event->state = PERF_EVENT_STATE_OFF; 1006 event->state = PERF_EVENT_STATE_OFF;
1007 1007
1008 retry: 1008 retry:
1009 spin_unlock_irq(&ctx->lock); 1009 raw_spin_unlock_irq(&ctx->lock);
1010 task_oncpu_function_call(task, __perf_event_enable, event); 1010 task_oncpu_function_call(task, __perf_event_enable, event);
1011 1011
1012 spin_lock_irq(&ctx->lock); 1012 raw_spin_lock_irq(&ctx->lock);
1013 1013
1014 /* 1014 /*
1015 * If the context is active and the event is still off, 1015 * If the context is active and the event is still off,
@@ -1026,7 +1026,7 @@ void perf_event_enable(struct perf_event *event)
1026 __perf_event_mark_enabled(event, ctx); 1026 __perf_event_mark_enabled(event, ctx);
1027 1027
1028 out: 1028 out:
1029 spin_unlock_irq(&ctx->lock); 1029 raw_spin_unlock_irq(&ctx->lock);
1030} 1030}
1031 1031
1032static int perf_event_refresh(struct perf_event *event, int refresh) 1032static int perf_event_refresh(struct perf_event *event, int refresh)
@@ -1048,7 +1048,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1048{ 1048{
1049 struct perf_event *event; 1049 struct perf_event *event;
1050 1050
1051 spin_lock(&ctx->lock); 1051 raw_spin_lock(&ctx->lock);
1052 ctx->is_active = 0; 1052 ctx->is_active = 0;
1053 if (likely(!ctx->nr_events)) 1053 if (likely(!ctx->nr_events))
1054 goto out; 1054 goto out;
@@ -1061,7 +1061,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1061 } 1061 }
1062 perf_enable(); 1062 perf_enable();
1063 out: 1063 out:
1064 spin_unlock(&ctx->lock); 1064 raw_spin_unlock(&ctx->lock);
1065} 1065}
1066 1066
1067/* 1067/*
@@ -1199,8 +1199,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1199 * order we take the locks because no other cpu could 1199 * order we take the locks because no other cpu could
1200 * be trying to lock both of these tasks. 1200 * be trying to lock both of these tasks.
1201 */ 1201 */
1202 spin_lock(&ctx->lock); 1202 raw_spin_lock(&ctx->lock);
1203 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 1203 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1204 if (context_equiv(ctx, next_ctx)) { 1204 if (context_equiv(ctx, next_ctx)) {
1205 /* 1205 /*
1206 * XXX do we need a memory barrier of sorts 1206 * XXX do we need a memory barrier of sorts
@@ -1214,8 +1214,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1214 1214
1215 perf_event_sync_stat(ctx, next_ctx); 1215 perf_event_sync_stat(ctx, next_ctx);
1216 } 1216 }
1217 spin_unlock(&next_ctx->lock); 1217 raw_spin_unlock(&next_ctx->lock);
1218 spin_unlock(&ctx->lock); 1218 raw_spin_unlock(&ctx->lock);
1219 } 1219 }
1220 rcu_read_unlock(); 1220 rcu_read_unlock();
1221 1221
@@ -1257,7 +1257,7 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1257 struct perf_event *event; 1257 struct perf_event *event;
1258 int can_add_hw = 1; 1258 int can_add_hw = 1;
1259 1259
1260 spin_lock(&ctx->lock); 1260 raw_spin_lock(&ctx->lock);
1261 ctx->is_active = 1; 1261 ctx->is_active = 1;
1262 if (likely(!ctx->nr_events)) 1262 if (likely(!ctx->nr_events))
1263 goto out; 1263 goto out;
@@ -1312,7 +1312,7 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1312 } 1312 }
1313 perf_enable(); 1313 perf_enable();
1314 out: 1314 out:
1315 spin_unlock(&ctx->lock); 1315 raw_spin_unlock(&ctx->lock);
1316} 1316}
1317 1317
1318/* 1318/*
@@ -1376,7 +1376,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1376 struct hw_perf_event *hwc; 1376 struct hw_perf_event *hwc;
1377 u64 interrupts, freq; 1377 u64 interrupts, freq;
1378 1378
1379 spin_lock(&ctx->lock); 1379 raw_spin_lock(&ctx->lock);
1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1381 if (event->state != PERF_EVENT_STATE_ACTIVE) 1381 if (event->state != PERF_EVENT_STATE_ACTIVE)
1382 continue; 1382 continue;
@@ -1431,7 +1431,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1431 perf_enable(); 1431 perf_enable();
1432 } 1432 }
1433 } 1433 }
1434 spin_unlock(&ctx->lock); 1434 raw_spin_unlock(&ctx->lock);
1435} 1435}
1436 1436
1437/* 1437/*
@@ -1444,7 +1444,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1444 if (!ctx->nr_events) 1444 if (!ctx->nr_events)
1445 return; 1445 return;
1446 1446
1447 spin_lock(&ctx->lock); 1447 raw_spin_lock(&ctx->lock);
1448 /* 1448 /*
1449 * Rotate the first entry last (works just fine for group events too): 1449 * Rotate the first entry last (works just fine for group events too):
1450 */ 1450 */
@@ -1455,7 +1455,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1455 } 1455 }
1456 perf_enable(); 1456 perf_enable();
1457 1457
1458 spin_unlock(&ctx->lock); 1458 raw_spin_unlock(&ctx->lock);
1459} 1459}
1460 1460
1461void perf_event_task_tick(struct task_struct *curr, int cpu) 1461void perf_event_task_tick(struct task_struct *curr, int cpu)
@@ -1504,7 +1504,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1504 1504
1505 __perf_event_task_sched_out(ctx); 1505 __perf_event_task_sched_out(ctx);
1506 1506
1507 spin_lock(&ctx->lock); 1507 raw_spin_lock(&ctx->lock);
1508 1508
1509 list_for_each_entry(event, &ctx->group_list, group_entry) { 1509 list_for_each_entry(event, &ctx->group_list, group_entry) {
1510 if (!event->attr.enable_on_exec) 1510 if (!event->attr.enable_on_exec)
@@ -1522,7 +1522,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1522 if (enabled) 1522 if (enabled)
1523 unclone_ctx(ctx); 1523 unclone_ctx(ctx);
1524 1524
1525 spin_unlock(&ctx->lock); 1525 raw_spin_unlock(&ctx->lock);
1526 1526
1527 perf_event_task_sched_in(task, smp_processor_id()); 1527 perf_event_task_sched_in(task, smp_processor_id());
1528 out: 1528 out:
@@ -1548,10 +1548,10 @@ static void __perf_event_read(void *info)
1548 if (ctx->task && cpuctx->task_ctx != ctx) 1548 if (ctx->task && cpuctx->task_ctx != ctx)
1549 return; 1549 return;
1550 1550
1551 spin_lock(&ctx->lock); 1551 raw_spin_lock(&ctx->lock);
1552 update_context_time(ctx); 1552 update_context_time(ctx);
1553 update_event_times(event); 1553 update_event_times(event);
1554 spin_unlock(&ctx->lock); 1554 raw_spin_unlock(&ctx->lock);
1555 1555
1556 event->pmu->read(event); 1556 event->pmu->read(event);
1557} 1557}
@@ -1569,10 +1569,10 @@ static u64 perf_event_read(struct perf_event *event)
1569 struct perf_event_context *ctx = event->ctx; 1569 struct perf_event_context *ctx = event->ctx;
1570 unsigned long flags; 1570 unsigned long flags;
1571 1571
1572 spin_lock_irqsave(&ctx->lock, flags); 1572 raw_spin_lock_irqsave(&ctx->lock, flags);
1573 update_context_time(ctx); 1573 update_context_time(ctx);
1574 update_event_times(event); 1574 update_event_times(event);
1575 spin_unlock_irqrestore(&ctx->lock, flags); 1575 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1576 } 1576 }
1577 1577
1578 return atomic64_read(&event->count); 1578 return atomic64_read(&event->count);
@@ -1585,7 +1585,7 @@ static void
1585__perf_event_init_context(struct perf_event_context *ctx, 1585__perf_event_init_context(struct perf_event_context *ctx,
1586 struct task_struct *task) 1586 struct task_struct *task)
1587{ 1587{
1588 spin_lock_init(&ctx->lock); 1588 raw_spin_lock_init(&ctx->lock);
1589 mutex_init(&ctx->mutex); 1589 mutex_init(&ctx->mutex);
1590 INIT_LIST_HEAD(&ctx->group_list); 1590 INIT_LIST_HEAD(&ctx->group_list);
1591 INIT_LIST_HEAD(&ctx->event_list); 1591 INIT_LIST_HEAD(&ctx->event_list);
@@ -1652,7 +1652,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1652 ctx = perf_lock_task_context(task, &flags); 1652 ctx = perf_lock_task_context(task, &flags);
1653 if (ctx) { 1653 if (ctx) {
1654 unclone_ctx(ctx); 1654 unclone_ctx(ctx);
1655 spin_unlock_irqrestore(&ctx->lock, flags); 1655 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1656 } 1656 }
1657 1657
1658 if (!ctx) { 1658 if (!ctx) {
@@ -1990,7 +1990,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1990 if (!value) 1990 if (!value)
1991 return -EINVAL; 1991 return -EINVAL;
1992 1992
1993 spin_lock_irq(&ctx->lock); 1993 raw_spin_lock_irq(&ctx->lock);
1994 if (event->attr.freq) { 1994 if (event->attr.freq) {
1995 if (value > sysctl_perf_event_sample_rate) { 1995 if (value > sysctl_perf_event_sample_rate) {
1996 ret = -EINVAL; 1996 ret = -EINVAL;
@@ -2003,7 +2003,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2003 event->hw.sample_period = value; 2003 event->hw.sample_period = value;
2004 } 2004 }
2005unlock: 2005unlock:
2006 spin_unlock_irq(&ctx->lock); 2006 raw_spin_unlock_irq(&ctx->lock);
2007 2007
2008 return ret; 2008 return ret;
2009} 2009}
@@ -4995,7 +4995,7 @@ void perf_event_exit_task(struct task_struct *child)
4995 * reading child->perf_event_ctxp, we wait until it has 4995 * reading child->perf_event_ctxp, we wait until it has
4996 * incremented the context's refcount before we do put_ctx below. 4996 * incremented the context's refcount before we do put_ctx below.
4997 */ 4997 */
4998 spin_lock(&child_ctx->lock); 4998 raw_spin_lock(&child_ctx->lock);
4999 child->perf_event_ctxp = NULL; 4999 child->perf_event_ctxp = NULL;
5000 /* 5000 /*
5001 * If this context is a clone; unclone it so it can't get 5001 * If this context is a clone; unclone it so it can't get
@@ -5004,7 +5004,7 @@ void perf_event_exit_task(struct task_struct *child)
5004 */ 5004 */
5005 unclone_ctx(child_ctx); 5005 unclone_ctx(child_ctx);
5006 update_context_time(child_ctx); 5006 update_context_time(child_ctx);
5007 spin_unlock_irqrestore(&child_ctx->lock, flags); 5007 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
5008 5008
5009 /* 5009 /*
5010 * Report the task dead after unscheduling the events so that we 5010 * Report the task dead after unscheduling the events so that we
@@ -5295,11 +5295,11 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5295 perf_reserved_percpu = val; 5295 perf_reserved_percpu = val;
5296 for_each_online_cpu(cpu) { 5296 for_each_online_cpu(cpu) {
5297 cpuctx = &per_cpu(perf_cpu_context, cpu); 5297 cpuctx = &per_cpu(perf_cpu_context, cpu);
5298 spin_lock_irq(&cpuctx->ctx.lock); 5298 raw_spin_lock_irq(&cpuctx->ctx.lock);
5299 mpt = min(perf_max_events - cpuctx->ctx.nr_events, 5299 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5300 perf_max_events - perf_reserved_percpu); 5300 perf_max_events - perf_reserved_percpu);
5301 cpuctx->max_pertask = mpt; 5301 cpuctx->max_pertask = mpt;
5302 spin_unlock_irq(&cpuctx->ctx.lock); 5302 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5303 } 5303 }
5304 spin_unlock(&perf_resource_lock); 5304 spin_unlock(&perf_resource_lock);
5305 5305
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d20f9c..2e17c9c92cbe 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
141 * installing it: 141 * installing it:
142 */ 142 */
143 spin_lock_irq(&pidmap_lock); 143 spin_lock_irq(&pidmap_lock);
144 if (map->page) 144 if (!map->page) {
145 kfree(page);
146 else
147 map->page = page; 145 map->page = page;
146 page = NULL;
147 }
148 spin_unlock_irq(&pidmap_lock); 148 spin_unlock_irq(&pidmap_lock);
149 kfree(page);
149 if (unlikely(!map->page)) 150 if (unlikely(!map->page))
150 break; 151 break;
151 } 152 }
@@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
268 for (type = 0; type < PIDTYPE_MAX; ++type) 269 for (type = 0; type < PIDTYPE_MAX; ++type)
269 INIT_HLIST_HEAD(&pid->tasks[type]); 270 INIT_HLIST_HEAD(&pid->tasks[type]);
270 271
272 upid = pid->numbers + ns->level;
271 spin_lock_irq(&pidmap_lock); 273 spin_lock_irq(&pidmap_lock);
272 for (i = ns->level; i >= 0; i--) { 274 for ( ; upid >= pid->numbers; --upid)
273 upid = &pid->numbers[i];
274 hlist_add_head_rcu(&upid->pid_chain, 275 hlist_add_head_rcu(&upid->pid_chain,
275 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 276 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
276 }
277 spin_unlock_irq(&pidmap_lock); 277 spin_unlock_irq(&pidmap_lock);
278 278
279out: 279out:
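The alloc_pidmap() change closes a small window without calling kfree() under pidmap_lock: the page is allocated outside the lock, published only if the slot is still empty, and the losing copy is freed after the lock is dropped. A userspace sketch of that allocate-outside/install-under-lock pattern (slot and lock are illustrative objects, not kernel ones):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;

static void *install_page(size_t size)
{
	void *page = calloc(1, size);	/* may sleep; done outside the lock */

	pthread_mutex_lock(&lock);
	if (!slot) {
		slot = page;		/* we won the race, publish it */
		page = NULL;
	}
	pthread_mutex_unlock(&lock);

	free(page);			/* no-op for the winner, cleanup for losers */
	return slot;
}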
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/time.h> 34#include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344} 343}
345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
346 345
347#define PID_NAME_LEN sizeof("process_1234567890") 346#define PID_NAME_LEN 32
348static char name[PID_NAME_LEN];
349 347
350static int pm_qos_power_open(struct inode *inode, struct file *filp) 348static int pm_qos_power_open(struct inode *inode, struct file *filp)
351{ 349{
352 int ret; 350 int ret;
353 long pm_qos_class; 351 long pm_qos_class;
352 char name[PID_NAME_LEN];
354 353
355 lock_kernel();
356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
357 if (pm_qos_class >= 0) { 355 if (pm_qos_class >= 0) {
358 filp->private_data = (void *)pm_qos_class; 356 filp->private_data = (void *)pm_qos_class;
359 sprintf(name, "process_%d", current->pid); 357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
360 ret = pm_qos_add_requirement(pm_qos_class, name, 358 ret = pm_qos_add_requirement(pm_qos_class, name,
361 PM_QOS_DEFAULT_VALUE); 359 PM_QOS_DEFAULT_VALUE);
362 if (ret >= 0) { 360 if (ret >= 0)
363 unlock_kernel();
364 return 0; 361 return 0;
365 }
366 } 362 }
367 unlock_kernel();
368
369 return -EPERM; 363 return -EPERM;
370} 364}
371 365
372static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
373{ 367{
374 int pm_qos_class; 368 int pm_qos_class;
369 char name[PID_NAME_LEN];
375 370
376 pm_qos_class = (long)filp->private_data; 371 pm_qos_class = (long)filp->private_data;
377 sprintf(name, "process_%d", current->pid); 372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
378 pm_qos_remove_requirement(pm_qos_class, name); 373 pm_qos_remove_requirement(pm_qos_class, name);
379 374
380 return 0; 375 return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
385{ 380{
386 s32 value; 381 s32 value;
387 int pm_qos_class; 382 int pm_qos_class;
383 char name[PID_NAME_LEN];
388 384
389 pm_qos_class = (long)filp->private_data; 385 pm_qos_class = (long)filp->private_data;
390 if (count != sizeof(s32)) 386 if (count != sizeof(s32))
391 return -EINVAL; 387 return -EINVAL;
392 if (copy_from_user(&value, buf, sizeof(s32))) 388 if (copy_from_user(&value, buf, sizeof(s32)))
393 return -EFAULT; 389 return -EFAULT;
394 sprintf(name, "process_%d", current->pid); 390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
395 pm_qos_update_requirement(pm_qos_class, name, value); 391 pm_qos_update_requirement(pm_qos_class, name, value);
396 392
397 return sizeof(s32); 393 return sizeof(s32);
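With the big kernel lock gone, pm_qos no longer needs a shared static name buffer: each caller formats "process_<pid>" into its own fixed-size stack buffer with snprintf(), which also bounds the write. A trivial userspace sketch of that per-call buffer:

#include <stdio.h>
#include <unistd.h>

#define PID_NAME_LEN 32

static void build_name(char name[PID_NAME_LEN])
{
	/* snprintf() truncates safely and always NUL-terminates */
	snprintf(name, PID_NAME_LEN, "process_%d", (int)getpid());
}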
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747b..438ff4523513 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
384 384
385/* 385/*
386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. 386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
387 * This is called from sys_timer_create with the new timer already locked. 387 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
388 * new timer already all-zeros initialized.
388 */ 389 */
389int posix_cpu_timer_create(struct k_itimer *new_timer) 390int posix_cpu_timer_create(struct k_itimer *new_timer)
390{ 391{
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
396 return -EINVAL; 397 return -EINVAL;
397 398
398 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 399 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
399 new_timer->it.cpu.incr.sched = 0;
400 new_timer->it.cpu.expires.sched = 0;
401 400
402 read_lock(&tasklist_lock); 401 read_lock(&tasklist_lock);
403 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 402 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c3b81c30e5d5..43191815f874 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 13
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 5187136fe1de..218e5af90156 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/vt.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
@@ -21,8 +21,7 @@ int pm_prepare_console(void)
21 if (orig_fgconsole < 0) 21 if (orig_fgconsole < 0)
22 return 1; 22 return 1;
23 23
24 orig_kmsg = kmsg_redirect; 24 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
25 kmsg_redirect = SUSPEND_CONSOLE;
26 return 0; 25 return 0;
27} 26}
28 27
@@ -30,7 +29,7 @@ void pm_restore_console(void)
30{ 29{
31 if (orig_fgconsole >= 0) { 30 if (orig_fgconsole >= 0) {
32 vt_move_to_console(orig_fgconsole, 0); 31 vt_move_to_console(orig_fgconsole, 0);
33 kmsg_redirect = orig_kmsg; 32 vt_kmsg_redirect(orig_kmsg);
34 } 33 }
35} 34}
36#endif 35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04a9e90d248f..bbfe472d7524 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -32,6 +32,7 @@ static int noresume = 0;
32static char resume_file[256] = CONFIG_PM_STD_PARTITION; 32static char resume_file[256] = CONFIG_PM_STD_PARTITION;
33dev_t swsusp_resume_device; 33dev_t swsusp_resume_device;
34sector_t swsusp_resume_block; 34sector_t swsusp_resume_block;
35int in_suspend __nosavedata = 0;
35 36
36enum { 37enum {
37 HIBERNATION_INVALID, 38 HIBERNATION_INVALID,
@@ -202,6 +203,35 @@ static void platform_recover(int platform_mode)
202} 203}
203 204
204/** 205/**
206 * swsusp_show_speed - print the time elapsed between two events.
207 * @start: Starting event.
208 * @stop: Final event.
 209 * @nr_pages: Number of pages processed between @start and @stop.
 210 * @msg: Introductory message to print.
211 */
212
213void swsusp_show_speed(struct timeval *start, struct timeval *stop,
214 unsigned nr_pages, char *msg)
215{
216 s64 elapsed_centisecs64;
217 int centisecs;
218 int k;
219 int kps;
220
221 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
222 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
223 centisecs = elapsed_centisecs64;
224 if (centisecs == 0)
225 centisecs = 1; /* avoid div-by-zero */
226 k = nr_pages * (PAGE_SIZE / 1024);
227 kps = (k * 100) / centisecs;
228 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
229 msg, k,
230 centisecs / 100, centisecs % 100,
231 kps / 1000, (kps % 1000) / 10);
232}
233
234/**
205 * create_image - freeze devices that need to be frozen with interrupts 235 * create_image - freeze devices that need to be frozen with interrupts
206 * off, create the hibernation image and thaw those devices. Control 236 * off, create the hibernation image and thaw those devices. Control
207 * reappears in this routine after a restore. 237 * reappears in this routine after a restore.
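swsusp_show_speed() reduces the two timestamps to centiseconds (clamped to at least one to avoid a division by zero), converts pages to kilobytes, and prints the rate as MB/s with two decimals. The same arithmetic as a standalone userspace function, assuming 4 KiB pages:

#include <stdio.h>
#include <sys/time.h>

static void show_speed(const struct timeval *start, const struct timeval *stop,
		       unsigned long nr_pages, const char *msg)
{
	long long centisecs = (stop->tv_sec - start->tv_sec) * 100LL +
			      (stop->tv_usec - start->tv_usec) / 10000LL;
	unsigned long k = nr_pages * 4;		/* assumes 4 KiB pages */
	long long kps;

	if (centisecs <= 0)
		centisecs = 1;			/* avoid div-by-zero */
	kps = (long long)k * 100 / centisecs;	/* KiB per second */

	printf("%s %lu kbytes in %lld.%02lld seconds (%lld.%02lld MB/s)\n",
	       msg, k, centisecs / 100, centisecs % 100,
	       kps / 1000, (kps % 1000) / 10);
}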
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 347d2cc88cd0..0998c7139053 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -220,6 +220,7 @@ static struct attribute_group attr_group = {
220 220
221#ifdef CONFIG_PM_RUNTIME 221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq; 222struct workqueue_struct *pm_wq;
223EXPORT_SYMBOL_GPL(pm_wq);
223 224
224static int __init pm_start_workqueue(void) 225static int __init pm_start_workqueue(void)
225{ 226{
diff --git a/kernel/power/process.c b/kernel/power/process.c
index cc2e55373b68..5ade1bdcf366 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h>
17 18
18/* 19/*
19 * Timeout for stopping processes 20 * Timeout for stopping processes
@@ -41,7 +42,7 @@ static int try_to_freeze_tasks(bool sig_only)
41 do_gettimeofday(&start); 42 do_gettimeofday(&start);
42 43
43 end_time = jiffies + TIMEOUT; 44 end_time = jiffies + TIMEOUT;
44 do { 45 while (true) {
45 todo = 0; 46 todo = 0;
46 read_lock(&tasklist_lock); 47 read_lock(&tasklist_lock);
47 do_each_thread(g, p) { 48 do_each_thread(g, p) {
@@ -62,10 +63,15 @@ static int try_to_freeze_tasks(bool sig_only)
62 todo++; 63 todo++;
63 } while_each_thread(g, p); 64 } while_each_thread(g, p);
64 read_unlock(&tasklist_lock); 65 read_unlock(&tasklist_lock);
65 yield(); /* Yield is okay here */ 66 if (!todo || time_after(jiffies, end_time))
66 if (time_after(jiffies, end_time))
67 break; 67 break;
68 } while (todo); 68
69 /*
70 * We need to retry, but first give the freezing tasks some
 71 * time to enter the refrigerator.
72 */
73 msleep(10);
74 }
69 75
70 do_gettimeofday(&end); 76 do_gettimeofday(&end);
71 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); 77 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
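The freezer loop is restructured so the exit condition is checked in one place, stopping when either nothing is left to freeze or the timeout expires, and sleeping 10 ms between passes instead of calling yield(). A compact userspace sketch of that poll loop; count_unfrozen() is a made-up stand-in for the tasklist walk:

#include <stdbool.h>
#include <time.h>

static int pending = 3;
static int count_unfrozen(void)
{
	return pending > 0 ? pending-- : 0;	/* pretend one task freezes per pass */
}

static bool freeze_all(time_t deadline)
{
	int todo = 0;

	while (true) {
		todo = count_unfrozen();
		if (!todo || time(NULL) > deadline)
			break;

		/* give the stragglers some time before the next pass */
		struct timespec ts = { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000 };
		nanosleep(&ts, NULL);
	}
	return todo == 0;
}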
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 890f6b11b1d3..09b2b0ae9e9d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -38,6 +38,107 @@ struct swsusp_header {
38 38
39static struct swsusp_header *swsusp_header; 39static struct swsusp_header *swsusp_header;
40 40
41/**
42 * The following functions are used for tracing the allocated
43 * swap pages, so that they can be freed in case of an error.
44 */
45
46struct swsusp_extent {
47 struct rb_node node;
48 unsigned long start;
49 unsigned long end;
50};
51
52static struct rb_root swsusp_extents = RB_ROOT;
53
54static int swsusp_extents_insert(unsigned long swap_offset)
55{
56 struct rb_node **new = &(swsusp_extents.rb_node);
57 struct rb_node *parent = NULL;
58 struct swsusp_extent *ext;
59
60 /* Figure out where to put the new node */
61 while (*new) {
62 ext = container_of(*new, struct swsusp_extent, node);
63 parent = *new;
64 if (swap_offset < ext->start) {
65 /* Try to merge */
66 if (swap_offset == ext->start - 1) {
67 ext->start--;
68 return 0;
69 }
70 new = &((*new)->rb_left);
71 } else if (swap_offset > ext->end) {
72 /* Try to merge */
73 if (swap_offset == ext->end + 1) {
74 ext->end++;
75 return 0;
76 }
77 new = &((*new)->rb_right);
78 } else {
79 /* It already is in the tree */
80 return -EINVAL;
81 }
82 }
83 /* Add the new node and rebalance the tree. */
84 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
85 if (!ext)
86 return -ENOMEM;
87
88 ext->start = swap_offset;
89 ext->end = swap_offset;
90 rb_link_node(&ext->node, parent, new);
91 rb_insert_color(&ext->node, &swsusp_extents);
92 return 0;
93}
94
95/**
96 * alloc_swapdev_block - allocate a swap page and register that it has
97 * been allocated, so that it can be freed in case of an error.
98 */
99
100sector_t alloc_swapdev_block(int swap)
101{
102 unsigned long offset;
103
104 offset = swp_offset(get_swap_page_of_type(swap));
105 if (offset) {
106 if (swsusp_extents_insert(offset))
107 swap_free(swp_entry(swap, offset));
108 else
109 return swapdev_block(swap, offset);
110 }
111 return 0;
112}
113
114/**
115 * free_all_swap_pages - free swap pages allocated for saving image data.
116 * It also frees the extents used to register which swap entries had been
117 * allocated.
118 */
119
120void free_all_swap_pages(int swap)
121{
122 struct rb_node *node;
123
124 while ((node = swsusp_extents.rb_node)) {
125 struct swsusp_extent *ext;
126 unsigned long offset;
127
128 ext = container_of(node, struct swsusp_extent, node);
129 rb_erase(node, &swsusp_extents);
130 for (offset = ext->start; offset <= ext->end; offset++)
131 swap_free(swp_entry(swap, offset));
132
133 kfree(ext);
134 }
135}
136
137int swsusp_swap_in_use(void)
138{
139 return (swsusp_extents.rb_node != NULL);
140}
141
41/* 142/*
42 * General things 143 * General things
43 */ 144 */
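Taken together, the functions moved into swap.c above form a small allocation-tracking API: alloc_swapdev_block() records each offset in the extent tree, and free_all_swap_pages() rolls everything back. A condensed, hypothetical caller inside kernel/power would use it roughly like this (write_one_page() is an illustration, not a function from the patch):

static int write_one_page(int swap)
{
	sector_t sect = alloc_swapdev_block(swap);	/* offset recorded in swsusp_extents */

	if (!sect) {
		free_all_swap_pages(swap);	/* undo every page allocated so far */
		return -ENOSPC;
	}
	/* ... submit one page of the image for I/O at 'sect' ... */
	return 0;
}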
@@ -336,7 +437,7 @@ static int save_image(struct swap_map_handle *handle,
336 if (ret) 437 if (ret)
337 break; 438 break;
338 if (!(nr_pages % m)) 439 if (!(nr_pages % m))
339 printk("\b\b\b\b%3d%%", nr_pages / m); 440 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
340 nr_pages++; 441 nr_pages++;
341 } 442 }
342 err2 = wait_on_bio_chain(&bio); 443 err2 = wait_on_bio_chain(&bio);
@@ -344,9 +445,9 @@ static int save_image(struct swap_map_handle *handle,
344 if (!ret) 445 if (!ret)
345 ret = err2; 446 ret = err2;
346 if (!ret) 447 if (!ret)
347 printk("\b\b\b\bdone\n"); 448 printk(KERN_CONT "\b\b\b\bdone\n");
348 else 449 else
349 printk("\n"); 450 printk(KERN_CONT "\n");
350 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 451 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
351 return ret; 452 return ret;
352} 453}
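The KERN_CONT additions matter because a printk() without an explicit level starts a new log record at the default level, which would break the in-place "%3d%% ... done" progress display. The idiom being relied on is, schematically (message text and helpers here are illustrative):

	printk(KERN_INFO "PM: Writing image:     ");
	while (more_pages_to_write()) {		/* placeholder condition */
		/* ... write a page ... */
		printk(KERN_CONT "\b\b\b\b%3d%%", percent_done);
	}
	printk(KERN_CONT "\b\b\b\bdone\n");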
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 6a07f4dbf2f8..5b3601bd1893 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -56,133 +56,3 @@
56#include "power.h" 56#include "power.h"
57 57
58int in_suspend __nosavedata = 0; 58int in_suspend __nosavedata = 0;
59
60/**
61 * The following functions are used for tracing the allocated
62 * swap pages, so that they can be freed in case of an error.
63 */
64
65struct swsusp_extent {
66 struct rb_node node;
67 unsigned long start;
68 unsigned long end;
69};
70
71static struct rb_root swsusp_extents = RB_ROOT;
72
73static int swsusp_extents_insert(unsigned long swap_offset)
74{
75 struct rb_node **new = &(swsusp_extents.rb_node);
76 struct rb_node *parent = NULL;
77 struct swsusp_extent *ext;
78
79 /* Figure out where to put the new node */
80 while (*new) {
81 ext = container_of(*new, struct swsusp_extent, node);
82 parent = *new;
83 if (swap_offset < ext->start) {
84 /* Try to merge */
85 if (swap_offset == ext->start - 1) {
86 ext->start--;
87 return 0;
88 }
89 new = &((*new)->rb_left);
90 } else if (swap_offset > ext->end) {
91 /* Try to merge */
92 if (swap_offset == ext->end + 1) {
93 ext->end++;
94 return 0;
95 }
96 new = &((*new)->rb_right);
97 } else {
98 /* It already is in the tree */
99 return -EINVAL;
100 }
101 }
102 /* Add the new node and rebalance the tree. */
103 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
104 if (!ext)
105 return -ENOMEM;
106
107 ext->start = swap_offset;
108 ext->end = swap_offset;
109 rb_link_node(&ext->node, parent, new);
110 rb_insert_color(&ext->node, &swsusp_extents);
111 return 0;
112}
113
114/**
115 * alloc_swapdev_block - allocate a swap page and register that it has
116 * been allocated, so that it can be freed in case of an error.
117 */
118
119sector_t alloc_swapdev_block(int swap)
120{
121 unsigned long offset;
122
123 offset = swp_offset(get_swap_page_of_type(swap));
124 if (offset) {
125 if (swsusp_extents_insert(offset))
126 swap_free(swp_entry(swap, offset));
127 else
128 return swapdev_block(swap, offset);
129 }
130 return 0;
131}
132
133/**
134 * free_all_swap_pages - free swap pages allocated for saving image data.
135 * It also frees the extents used to register which swap entries had been
136 * allocated.
137 */
138
139void free_all_swap_pages(int swap)
140{
141 struct rb_node *node;
142
143 while ((node = swsusp_extents.rb_node)) {
144 struct swsusp_extent *ext;
145 unsigned long offset;
146
147 ext = container_of(node, struct swsusp_extent, node);
148 rb_erase(node, &swsusp_extents);
149 for (offset = ext->start; offset <= ext->end; offset++)
150 swap_free(swp_entry(swap, offset));
151
152 kfree(ext);
153 }
154}
155
156int swsusp_swap_in_use(void)
157{
158 return (swsusp_extents.rb_node != NULL);
159}
160
161/**
162 * swsusp_show_speed - print the time elapsed between two events represented by
163 * @start and @stop
164 *
165 * @nr_pages - number of pages processed between @start and @stop
166 * @msg - introductory message to print
167 */
168
169void swsusp_show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
185 msg, k,
186 centisecs / 100, centisecs % 100,
187 kps / 1000, (kps % 1000) / 10);
188}
diff --git a/kernel/printk.c b/kernel/printk.c
index f38b07f78a4e..1ded8e7dd19b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,8 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h>
36 38
37#include <asm/uaccess.h> 39#include <asm/uaccess.h>
38 40
@@ -1376,11 +1378,11 @@ late_initcall(disable_boot_consoles);
1376 */ 1378 */
1377DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); 1379DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1378 1380
1379int printk_ratelimit(void) 1381int __printk_ratelimit(const char *func)
1380{ 1382{
1381 return __ratelimit(&printk_ratelimit_state); 1383 return ___ratelimit(&printk_ratelimit_state, func);
1382} 1384}
1383EXPORT_SYMBOL(printk_ratelimit); 1385EXPORT_SYMBOL(__printk_ratelimit);
1384 1386
1385/** 1387/**
1386 * printk_timed_ratelimit - caller-controlled printk ratelimiting 1388 * printk_timed_ratelimit - caller-controlled printk ratelimiting
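The rename to __printk_ratelimit(const char *func) suggests that printk_ratelimit() becomes a wrapper that passes __func__ so ___ratelimit() can name the throttled caller; that wrapper lives in the headers, not in this hunk, so treat it as an assumption. Callers keep the usual pattern:

static void report_drop(void)
{
	/* hypothetical driver message; state is shared via printk_ratelimit_state */
	if (printk_ratelimit())
		printk(KERN_WARNING "mydrv: dropping packet\n");
}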
@@ -1404,4 +1406,122 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1404 return false; 1406 return false;
1405} 1407}
1406EXPORT_SYMBOL(printk_timed_ratelimit); 1408EXPORT_SYMBOL(printk_timed_ratelimit);
1409
1410static DEFINE_SPINLOCK(dump_list_lock);
1411static LIST_HEAD(dump_list);
1412
1413/**
1414 * kmsg_dump_register - register a kernel log dumper.
1415 * @dumper: pointer to the kmsg_dumper structure
1416 *
1417 * Adds a kernel log dumper to the system. The dump callback in the
1418 * structure will be called when the kernel oopses or panics and must be
1419 * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
1420 */
1421int kmsg_dump_register(struct kmsg_dumper *dumper)
1422{
1423 unsigned long flags;
1424 int err = -EBUSY;
1425
1426 /* The dump callback needs to be set */
1427 if (!dumper->dump)
1428 return -EINVAL;
1429
1430 spin_lock_irqsave(&dump_list_lock, flags);
1431 /* Don't allow registering multiple times */
1432 if (!dumper->registered) {
1433 dumper->registered = 1;
1434 list_add_tail(&dumper->list, &dump_list);
1435 err = 0;
1436 }
1437 spin_unlock_irqrestore(&dump_list_lock, flags);
1438
1439 return err;
1440}
1441EXPORT_SYMBOL_GPL(kmsg_dump_register);
1442
1443/**
1444 * kmsg_dump_unregister - unregister a kmsg dumper.
1445 * @dumper: pointer to the kmsg_dumper structure
1446 *
1447 * Removes a dump device from the system. Returns zero on success and
1448 * %-EINVAL otherwise.
1449 */
1450int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1451{
1452 unsigned long flags;
1453 int err = -EINVAL;
1454
1455 spin_lock_irqsave(&dump_list_lock, flags);
1456 if (dumper->registered) {
1457 dumper->registered = 0;
1458 list_del(&dumper->list);
1459 err = 0;
1460 }
1461 spin_unlock_irqrestore(&dump_list_lock, flags);
1462
1463 return err;
1464}
1465EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1466
1467static const char * const kmsg_reasons[] = {
1468 [KMSG_DUMP_OOPS] = "oops",
1469 [KMSG_DUMP_PANIC] = "panic",
1470};
1471
1472static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1473{
1474 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1475 return "unknown";
1476
1477 return kmsg_reasons[reason];
1478}
1479
1480/**
1481 * kmsg_dump - dump kernel log to kernel message dumpers.
1482 * @reason: the reason (oops, panic etc) for dumping
1483 *
1484 * Iterate through each of the dump devices and call the oops/panic
1485 * callbacks with the log buffer.
1486 */
1487void kmsg_dump(enum kmsg_dump_reason reason)
1488{
1489 unsigned long end;
1490 unsigned chars;
1491 struct kmsg_dumper *dumper;
1492 const char *s1, *s2;
1493 unsigned long l1, l2;
1494 unsigned long flags;
1495
1496 /* Theoretically, the log could move on after we do this, but
1497 there's not a lot we can do about that. The new messages
1498 will overwrite the start of what we dump. */
1499 spin_lock_irqsave(&logbuf_lock, flags);
1500 end = log_end & LOG_BUF_MASK;
1501 chars = logged_chars;
1502 spin_unlock_irqrestore(&logbuf_lock, flags);
1503
1504	if (chars > end) {
1505		s1 = log_buf + log_buf_len - chars + end;
1506		l1 = chars - end;
1507
1508		s2 = log_buf;
1509		l2 = end;
1510	} else {
1511		s1 = "";
1512		l1 = 0;
1513
1514		s2 = log_buf + end - chars;
1515		l2 = chars;
1516	}
1517
1518 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
1519		printk(KERN_ERR "kmsg_dump: dump list lock is held during %s, skipping dump\n",
1520 kmsg_to_str(reason));
1521 return;
1522 }
1523 list_for_each_entry(dumper, &dump_list, list)
1524 dumper->dump(dumper, reason, s1, l1, s2, l2);
1525 spin_unlock_irqrestore(&dump_list_lock, flags);
1526}
1407#endif 1527#endif
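A minimal client of the new dumper interface might look like the sketch below. It assumes only what is visible here and in the <linux/kmsg_dump.h> include added above: a struct kmsg_dumper with a mandatory .dump callback, and the (s1, l1)/(s2, l2) pair being the two wrapped halves of the log ring buffer as computed in kmsg_dump(). The storage back end (my_store_write) is hypothetical.

#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason,
		    const char *s1, unsigned long l1,
		    const char *s2, unsigned long l2)
{
	/* push both segments, oldest first, to some persistent store */
	/* my_store_write(s1, l1); */
	/* my_store_write(s2, l2); */
}

static struct kmsg_dumper my_dumper = {
	.dump = my_dump,
};

static int __init my_dumper_init(void)
{
	return kmsg_dump_register(&my_dumper);
}

static void __exit my_dumper_exit(void)
{
	kmsg_dump_unregister(&my_dumper);
}

module_init(my_dumper_init);
module_exit(my_dumper_exit);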
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 400183346ad2..9b7fd4723878 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48 47
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 48#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 49static struct lock_class_key rcu_lock_key;
@@ -53,8 +52,6 @@ struct lockdep_map rcu_lock_map =
53EXPORT_SYMBOL_GPL(rcu_lock_map); 52EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif 53#endif
55 54
56int rcu_scheduler_active __read_mostly;
57
58/* 55/*
59 * Awaken the corresponding synchronize_rcu() instance now that a 56 * Awaken the corresponding synchronize_rcu() instance now that a
60 * grace period has elapsed. 57 * grace period has elapsed.
@@ -66,122 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head)
66 rcu = container_of(head, struct rcu_synchronize, head); 63 rcu = container_of(head, struct rcu_synchronize, head);
67 complete(&rcu->completion); 64 complete(&rcu->completion);
68} 65}
69
70#ifdef CONFIG_TREE_PREEMPT_RCU
71
72/**
73 * synchronize_rcu - wait until a grace period has elapsed.
74 *
75 * Control will return to the caller some time after a full grace
76 * period has elapsed, in other words after all currently executing RCU
77 * read-side critical sections have completed. RCU read-side critical
78 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
79 * and may be nested.
80 */
81void synchronize_rcu(void)
82{
83 struct rcu_synchronize rcu;
84
85 if (!rcu_scheduler_active)
86 return;
87
88 init_completion(&rcu.completion);
89 /* Will wake me after RCU finished. */
90 call_rcu(&rcu.head, wakeme_after_rcu);
91 /* Wait for it. */
92 wait_for_completion(&rcu.completion);
93}
94EXPORT_SYMBOL_GPL(synchronize_rcu);
95
96#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
97
98/**
99 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
100 *
101 * Control will return to the caller some time after a full rcu-sched
102 * grace period has elapsed, in other words after all currently executing
103 * rcu-sched read-side critical sections have completed. These read-side
104 * critical sections are delimited by rcu_read_lock_sched() and
105 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
106 * local_irq_disable(), and so on may be used in place of
107 * rcu_read_lock_sched().
108 *
109 * This means that all preempt_disable code sequences, including NMI and
110 * hardware-interrupt handlers, in progress on entry will have completed
111 * before this primitive returns. However, this does not guarantee that
112 * softirq handlers will have completed, since in some kernels, these
113 * handlers can run in process context, and can block.
114 *
115 * This primitive provides the guarantees made by the (now removed)
116 * synchronize_kernel() API. In contrast, synchronize_rcu() only
117 * guarantees that rcu_read_lock() sections will have completed.
118 * In "classic RCU", these two guarantees happen to be one and
119 * the same, but can differ in realtime RCU implementations.
120 */
121void synchronize_sched(void)
122{
123 struct rcu_synchronize rcu;
124
125 if (rcu_blocking_is_gp())
126 return;
127
128 init_completion(&rcu.completion);
129 /* Will wake me after RCU finished. */
130 call_rcu_sched(&rcu.head, wakeme_after_rcu);
131 /* Wait for it. */
132 wait_for_completion(&rcu.completion);
133}
134EXPORT_SYMBOL_GPL(synchronize_sched);
135
136/**
137 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
138 *
139 * Control will return to the caller some time after a full rcu_bh grace
140 * period has elapsed, in other words after all currently executing rcu_bh
141 * read-side critical sections have completed. RCU read-side critical
142 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
143 * and may be nested.
144 */
145void synchronize_rcu_bh(void)
146{
147 struct rcu_synchronize rcu;
148
149 if (rcu_blocking_is_gp())
150 return;
151
152 init_completion(&rcu.completion);
153 /* Will wake me after RCU finished. */
154 call_rcu_bh(&rcu.head, wakeme_after_rcu);
155 /* Wait for it. */
156 wait_for_completion(&rcu.completion);
157}
158EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
159
160static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
161 unsigned long action, void *hcpu)
162{
163 return rcu_cpu_notify(self, action, hcpu);
164}
165
166void __init rcu_init(void)
167{
168 int i;
169
170 __rcu_init();
171 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
172
173 /*
174 * We don't need protection against CPU-hotplug here because
175 * this is called early in boot, before either interrupts
176 * or the scheduler are operational.
177 */
178 for_each_online_cpu(i)
179 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
180}
181
182void rcu_scheduler_starting(void)
183{
184 WARN_ON(num_online_cpus() != 1);
185 WARN_ON(nr_context_switches() > 0);
186 rcu_scheduler_active = 1;
187}
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..9f6d9ff2572c
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,282 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h>
27#include <linux/interrupt.h>
28#include <linux/notifier.h>
29#include <linux/rcupdate.h>
30#include <linux/kernel.h>
31#include <linux/module.h>
32#include <linux/mutex.h>
33#include <linux/sched.h>
34#include <linux/types.h>
35#include <linux/init.h>
36#include <linux/time.h>
37#include <linux/cpu.h>
38
39/* Global control variables for rcupdate callback mechanism. */
40struct rcu_ctrlblk {
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43 struct rcu_head **curtail; /* ->next pointer of last CB. */
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_NO_HZ
58
59static long rcu_dynticks_nesting = 1;
60
61/*
62 * Enter dynticks-idle mode, which is an extended quiescent state
63 * if we have fully entered that mode (i.e., if the new value of
64 * dynticks_nesting is zero).
65 */
66void rcu_enter_nohz(void)
67{
68 if (--rcu_dynticks_nesting == 0)
69		rcu_sched_qs(0); /* implies rcu_bh_qs(0) */
70}
71
72/*
73 * Exit dynticks-idle mode, so that we are no longer in an extended
74 * quiescent state.
75 */
76void rcu_exit_nohz(void)
77{
78 rcu_dynticks_nesting++;
79}
80
81#endif /* #ifdef CONFIG_NO_HZ */
82
83/*
84 * Helper function for rcu_sched_qs() and rcu_bh_qs().
85 * Also disable irqs to avoid confusion due to interrupt handlers
86 * invoking call_rcu().
87 */
88static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
89{
90 unsigned long flags;
91
92 local_irq_save(flags);
93 if (rcp->rcucblist != NULL &&
94 rcp->donetail != rcp->curtail) {
95 rcp->donetail = rcp->curtail;
96 local_irq_restore(flags);
97 return 1;
98 }
99 local_irq_restore(flags);
100
101 return 0;
102}
103
104/*
105 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
106 * are at it, given that any rcu quiescent state is also an rcu_bh
107 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
108 */
109void rcu_sched_qs(int cpu)
110{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ);
113}
114
115/*
116 * Record an rcu_bh quiescent state.
117 */
118void rcu_bh_qs(int cpu)
119{
120 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
121 raise_softirq(RCU_SOFTIRQ);
122}
123
124/*
125 * Check to see if the scheduling-clock interrupt came from an extended
126 * quiescent state, and, if so, tell RCU about it.
127 */
128void rcu_check_callbacks(int cpu, int user)
129{
130 if (user ||
131 (idle_cpu(cpu) &&
132 !in_softirq() &&
133 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
134 rcu_sched_qs(cpu);
135 else if (!in_softirq())
136 rcu_bh_qs(cpu);
137}
138
139/*
140 * Helper function for rcu_process_callbacks() that operates on the
141 * specified rcu_ctrlblk structure.
142 */
143static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
144{
145 struct rcu_head *next, *list;
146 unsigned long flags;
147
148 /* If no RCU callbacks ready to invoke, just return. */
149 if (&rcp->rcucblist == rcp->donetail)
150 return;
151
152 /* Move the ready-to-invoke callbacks to a local list. */
153 local_irq_save(flags);
154 list = rcp->rcucblist;
155 rcp->rcucblist = *rcp->donetail;
156 *rcp->donetail = NULL;
157 if (rcp->curtail == rcp->donetail)
158 rcp->curtail = &rcp->rcucblist;
159 rcp->donetail = &rcp->rcucblist;
160 local_irq_restore(flags);
161
162 /* Invoke the callbacks on the local list. */
163 while (list) {
164 next = list->next;
165 prefetch(next);
166 list->func(list);
167 list = next;
168 }
169}
170
171/*
172 * Invoke any callbacks whose grace period has completed.
173 */
174static void rcu_process_callbacks(struct softirq_action *unused)
175{
176 __rcu_process_callbacks(&rcu_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178}
179
180/*
181 * Wait for a grace period to elapse. But it is illegal to invoke
182 * synchronize_sched() from within an RCU read-side critical section.
183 * Therefore, any legal call to synchronize_sched() is a quiescent
184 * state, and so on a UP system, synchronize_sched() need do nothing.
185 * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the
186 * benefits of doing might_sleep() to reduce latency.)
187 *
188 * Cool, huh? (Due to Josh Triplett.)
189 *
190 * But we want to make this a static inline later.
191 */
192void synchronize_sched(void)
193{
194 cond_resched();
195}
196EXPORT_SYMBOL_GPL(synchronize_sched);
197
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/*
205 * Helper function for call_rcu() and call_rcu_bh().
206 */
207static void __call_rcu(struct rcu_head *head,
208 void (*func)(struct rcu_head *rcu),
209 struct rcu_ctrlblk *rcp)
210{
211 unsigned long flags;
212
213 head->func = func;
214 head->next = NULL;
215
216 local_irq_save(flags);
217 *rcp->curtail = head;
218 rcp->curtail = &head->next;
219 local_irq_restore(flags);
220}
221
222/*
223 * Post an RCU callback to be invoked after the end of an RCU grace
224 * period. But since we have but one CPU, that would be after any
225 * quiescent state.
226 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{
229 __call_rcu(head, func, &rcu_ctrlblk);
230}
231EXPORT_SYMBOL_GPL(call_rcu);
232
233/*
234 * Post an RCU bottom-half callback to be invoked after any subsequent
235 * quiescent state.
236 */
237void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
238{
239 __call_rcu(head, func, &rcu_bh_ctrlblk);
240}
241EXPORT_SYMBOL_GPL(call_rcu_bh);
242
243void rcu_barrier(void)
244{
245 struct rcu_synchronize rcu;
246
247 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */
251 wait_for_completion(&rcu.completion);
252}
253EXPORT_SYMBOL_GPL(rcu_barrier);
254
255void rcu_barrier_bh(void)
256{
257 struct rcu_synchronize rcu;
258
259 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */
263 wait_for_completion(&rcu.completion);
264}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266
267void rcu_barrier_sched(void)
268{
269 struct rcu_synchronize rcu;
270
271 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */
275 wait_for_completion(&rcu.completion);
276}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278
279void __init rcu_init(void)
280{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282}
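Even in this single-CPU implementation the caller-visible contract of call_rcu() is the familiar one: the callback runs only after every reader that could still see the old data has finished. A typical illustrative deferred free (struct foo and its helpers are made up):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/kernel.h>

struct foo {
	int data;
	struct rcu_head rcu;
};

static void foo_reclaim(struct rcu_head *head)
{
	/* invoked from rcu_process_callbacks() once the grace period is over */
	kfree(container_of(head, struct foo, rcu));
}

static void foo_release(struct foo *fp)
{
	/* queues 'fp' on rcu_ctrlblk's ->curtail list shown above */
	call_rcu(&fp->rcu, foo_reclaim);
}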
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 697c0a0229d4..9bb52177af02 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p)
327 cur_ops->deferred_free(rp); 327 cur_ops->deferred_free(rp);
328} 328}
329 329
330static int rcu_no_completed(void)
331{
332 return 0;
333}
334
330static void rcu_torture_deferred_free(struct rcu_torture *p) 335static void rcu_torture_deferred_free(struct rcu_torture *p)
331{ 336{
332 call_rcu(&p->rtort_rcu, rcu_torture_cb); 337 call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .name = "rcu_sync" 393 .name = "rcu_sync"
389}; 394};
390 395
396static struct rcu_torture_ops rcu_expedited_ops = {
397 .init = rcu_sync_torture_init,
398 .cleanup = NULL,
399 .readlock = rcu_torture_read_lock,
400 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
401 .readunlock = rcu_torture_read_unlock,
402 .completed = rcu_no_completed,
403 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL,
406 .stats = NULL,
407 .irq_capable = 1,
408 .name = "rcu_expedited"
409};
410
391/* 411/*
392 * Definitions for rcu_bh torture testing. 412 * Definitions for rcu_bh torture testing.
393 */ 413 */
@@ -547,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = {
547 .name = "srcu" 567 .name = "srcu"
548}; 568};
549 569
570static void srcu_torture_synchronize_expedited(void)
571{
572 synchronize_srcu_expedited(&srcu_ctl);
573}
574
575static struct rcu_torture_ops srcu_expedited_ops = {
576 .init = srcu_torture_init,
577 .cleanup = srcu_torture_cleanup,
578 .readlock = srcu_torture_read_lock,
579 .read_delay = srcu_read_delay,
580 .readunlock = srcu_torture_read_unlock,
581 .completed = srcu_torture_completed,
582 .deferred_free = rcu_sync_torture_deferred_free,
583 .sync = srcu_torture_synchronize_expedited,
584 .cb_barrier = NULL,
585 .stats = srcu_torture_stats,
586 .name = "srcu_expedited"
587};
588
550/* 589/*
551 * Definitions for sched torture testing. 590 * Definitions for sched torture testing.
552 */ 591 */
@@ -562,11 +601,6 @@ static void sched_torture_read_unlock(int idx)
562 preempt_enable(); 601 preempt_enable();
563} 602}
564 603
565static int sched_torture_completed(void)
566{
567 return 0;
568}
569
570static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 604static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
571{ 605{
572 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 606 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -583,7 +617,7 @@ static struct rcu_torture_ops sched_ops = {
583 .readlock = sched_torture_read_lock, 617 .readlock = sched_torture_read_lock,
584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 618 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
585 .readunlock = sched_torture_read_unlock, 619 .readunlock = sched_torture_read_unlock,
586 .completed = sched_torture_completed, 620 .completed = rcu_no_completed,
587 .deferred_free = rcu_sched_torture_deferred_free, 621 .deferred_free = rcu_sched_torture_deferred_free,
588 .sync = sched_torture_synchronize, 622 .sync = sched_torture_synchronize,
589 .cb_barrier = rcu_barrier_sched, 623 .cb_barrier = rcu_barrier_sched,
@@ -592,13 +626,13 @@ static struct rcu_torture_ops sched_ops = {
592 .name = "sched" 626 .name = "sched"
593}; 627};
594 628
595static struct rcu_torture_ops sched_ops_sync = { 629static struct rcu_torture_ops sched_sync_ops = {
596 .init = rcu_sync_torture_init, 630 .init = rcu_sync_torture_init,
597 .cleanup = NULL, 631 .cleanup = NULL,
598 .readlock = sched_torture_read_lock, 632 .readlock = sched_torture_read_lock,
599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 633 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
600 .readunlock = sched_torture_read_unlock, 634 .readunlock = sched_torture_read_unlock,
601 .completed = sched_torture_completed, 635 .completed = rcu_no_completed,
602 .deferred_free = rcu_sync_torture_deferred_free, 636 .deferred_free = rcu_sync_torture_deferred_free,
603 .sync = sched_torture_synchronize, 637 .sync = sched_torture_synchronize,
604 .cb_barrier = NULL, 638 .cb_barrier = NULL,
@@ -612,7 +646,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
612 .readlock = sched_torture_read_lock, 646 .readlock = sched_torture_read_lock,
613 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 647 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
614 .readunlock = sched_torture_read_unlock, 648 .readunlock = sched_torture_read_unlock,
615 .completed = sched_torture_completed, 649 .completed = rcu_no_completed,
616 .deferred_free = rcu_sync_torture_deferred_free, 650 .deferred_free = rcu_sync_torture_deferred_free,
617 .sync = synchronize_sched_expedited, 651 .sync = synchronize_sched_expedited,
618 .cb_barrier = NULL, 652 .cb_barrier = NULL,
@@ -729,13 +763,13 @@ static void rcu_torture_timer(unsigned long unused)
729 /* Should not happen, but... */ 763 /* Should not happen, but... */
730 pipe_count = RCU_TORTURE_PIPE_LEN; 764 pipe_count = RCU_TORTURE_PIPE_LEN;
731 } 765 }
732 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
733 completed = cur_ops->completed() - completed; 767 completed = cur_ops->completed() - completed;
734 if (completed > RCU_TORTURE_PIPE_LEN) { 768 if (completed > RCU_TORTURE_PIPE_LEN) {
735 /* Should not happen, but... */ 769 /* Should not happen, but... */
736 completed = RCU_TORTURE_PIPE_LEN; 770 completed = RCU_TORTURE_PIPE_LEN;
737 } 771 }
738 ++__get_cpu_var(rcu_torture_batch)[completed]; 772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
739 preempt_enable(); 773 preempt_enable();
740 cur_ops->readunlock(idx); 774 cur_ops->readunlock(idx);
741} 775}
@@ -784,13 +818,13 @@ rcu_torture_reader(void *arg)
784 /* Should not happen, but... */ 818 /* Should not happen, but... */
785 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
786 } 820 }
787 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
788 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
789 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
790 /* Should not happen, but... */ 824 /* Should not happen, but... */
791 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
792 } 826 }
793 ++__get_cpu_var(rcu_torture_batch)[completed]; 827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
794 preempt_enable(); 828 preempt_enable();
795 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
796 schedule(); 830 schedule();
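The counter updates in the two hunks above switch from taking this CPU's address with __get_cpu_var() to the __this_cpu_inc() helper, which lets an architecture use a single per-CPU increment instruction. On a throwaway counter (hypothetical name, and assuming preemption is already disabled, as it is in the surrounding code) the two forms compare as:

#include <linux/percpu.h>

static DEFINE_PER_CPU(long, my_ctr);

static void bump_both_ways(void)
{
	++__get_cpu_var(my_ctr);		/* old form: take the address, then increment */
	__this_cpu_inc(per_cpu_var(my_ctr));	/* new form used in this patch */
}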
@@ -1097,9 +1131,10 @@ rcu_torture_init(void)
1097 int cpu; 1131 int cpu;
1098 int firsterr = 0; 1132 int firsterr = 0;
1099 static struct rcu_torture_ops *torture_ops[] = 1133 static struct rcu_torture_ops *torture_ops[] =
1100 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1134 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1101 &sched_expedited_ops, 1135 &rcu_bh_ops, &rcu_bh_sync_ops,
1102 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1136 &srcu_ops, &srcu_expedited_ops,
1137 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1103 1138
1104 mutex_lock(&fullstop_mutex); 1139 mutex_lock(&fullstop_mutex);
1105 1140
@@ -1110,8 +1145,12 @@ rcu_torture_init(void)
1110 break; 1145 break;
1111 } 1146 }
1112 if (i == ARRAY_SIZE(torture_ops)) { 1147 if (i == ARRAY_SIZE(torture_ops)) {
1113 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1148 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1114 torture_type); 1149 torture_type);
1150 printk(KERN_ALERT "rcu-torture types:");
1151 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1152 printk(KERN_ALERT " %s", torture_ops[i]->name);
1153 printk(KERN_ALERT "\n");
1115 mutex_unlock(&fullstop_mutex); 1154 mutex_unlock(&fullstop_mutex);
1116 return -EINVAL; 1155 return -EINVAL;
1117 } 1156 }
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f3077c0ab181..53ae9598f798 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,18 +46,22 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
52/* Data structures. */ 53/* Data structures. */
53 54
55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
56
54#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(name) { \
55 .level = { &name.node[0] }, \ 58 .level = { &name.node[0] }, \
56 .levelcnt = { \ 59 .levelcnt = { \
57 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
58 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
59 NUM_RCU_LVL_2, \ 62 NUM_RCU_LVL_2, \
60 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ 63 NUM_RCU_LVL_3, \
64 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
61 }, \ 65 }, \
62 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
63 .gpnum = -300, \ 67 .gpnum = -300, \
@@ -77,6 +81,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
78DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79 83
84static int rcu_scheduler_active __read_mostly;
85
80 86
81/* 87/*
82 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
@@ -98,7 +104,7 @@ void rcu_sched_qs(int cpu)
98 struct rcu_data *rdp; 104 struct rcu_data *rdp;
99 105
100 rdp = &per_cpu(rcu_sched_data, cpu); 106 rdp = &per_cpu(rcu_sched_data, cpu);
101 rdp->passed_quiesc_completed = rdp->completed; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
102 barrier(); 108 barrier();
103 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
104 rcu_preempt_note_context_switch(cpu); 110 rcu_preempt_note_context_switch(cpu);
@@ -109,7 +115,7 @@ void rcu_bh_qs(int cpu)
109 struct rcu_data *rdp; 115 struct rcu_data *rdp;
110 116
111 rdp = &per_cpu(rcu_bh_data, cpu); 117 rdp = &per_cpu(rcu_bh_data, cpu);
112 rdp->passed_quiesc_completed = rdp->completed; 118 rdp->passed_quiesc_completed = rdp->gpnum - 1;
113 barrier(); 119 barrier();
114 rdp->passed_quiesc = 1; 120 rdp->passed_quiesc = 1;
115} 121}
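The switch from rdp->completed to rdp->gpnum - 1 is easier to see with concrete numbers, assuming the usual rcutree convention that ->gpnum counts grace periods that have started and ->completed counts those that have finished:

	rsp->gpnum     == 5	/* grace period 5 is in progress   */
	rsp->completed == 4	/* grace period 4 already finished */
	rdp->gpnum - 1 == 4	/* the value recorded here         */

Recording gpnum - 1 ties the quiescent state to the grace period this CPU has actually noticed, rather than to whatever ->completed value the CPU last happened to copy, which can lag now that ->completed is also tracked per rcu_node.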
@@ -335,28 +341,9 @@ void rcu_irq_exit(void)
335 set_need_resched(); 341 set_need_resched();
336} 342}
337 343
338/*
339 * Record the specified "completed" value, which is later used to validate
340 * dynticks counter manipulations. Specify "rsp->completed - 1" to
341 * unconditionally invalidate any future dynticks manipulations (which is
342 * useful at the beginning of a grace period).
343 */
344static void dyntick_record_completed(struct rcu_state *rsp, long comp)
345{
346 rsp->dynticks_completed = comp;
347}
348
349#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
350 345
351/* 346/*
352 * Recall the previously recorded value of the completion for dynticks.
353 */
354static long dyntick_recall_completed(struct rcu_state *rsp)
355{
356 return rsp->dynticks_completed;
357}
358
359/*
360 * Snapshot the specified CPU's dynticks counter so that we can later 347 * Snapshot the specified CPU's dynticks counter so that we can later
361 * credit them with an implicit quiescent state. Return 1 if this CPU 348 * credit them with an implicit quiescent state. Return 1 if this CPU
362 * is in dynticks idle mode, which is an extended quiescent state. 349 * is in dynticks idle mode, which is an extended quiescent state.
@@ -419,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
419 406
420#else /* #ifdef CONFIG_NO_HZ */ 407#else /* #ifdef CONFIG_NO_HZ */
421 408
422static void dyntick_record_completed(struct rcu_state *rsp, long comp)
423{
424}
425
426#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
427 410
428/*
429 * If there are no dynticks, then the only way that a CPU can passively
430 * be in a quiescent state is to be offline. Unlike dynticks idle, which
431 * is a point in time during the prior (already finished) grace period,
432 * an offline CPU is always in a quiescent state, and thus can be
433 * unconditionally applied. So just return the current value of completed.
434 */
435static long dyntick_recall_completed(struct rcu_state *rsp)
436{
437 return rsp->completed;
438}
439
440static int dyntick_save_progress_counter(struct rcu_data *rdp) 411static int dyntick_save_progress_counter(struct rcu_data *rdp)
441{ 412{
442 return 0; 413 return 0;
@@ -553,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
553/* 524/*
554 * Update CPU-local rcu_data state to record the newly noticed grace period. 525 * Update CPU-local rcu_data state to record the newly noticed grace period.
555 * This is used both when we started the grace period and when we notice 526 * This is used both when we started the grace period and when we notice
556 * that someone else started the grace period. 527 * that someone else started the grace period. The caller must hold the
528 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
529 * and must have irqs disabled.
557 */ 530 */
531static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
532{
533 if (rdp->gpnum != rnp->gpnum) {
534 rdp->qs_pending = 1;
535 rdp->passed_quiesc = 0;
536 rdp->gpnum = rnp->gpnum;
537 }
538}
539
558static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) 540static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
559{ 541{
560 rdp->qs_pending = 1; 542 unsigned long flags;
561 rdp->passed_quiesc = 0; 543 struct rcu_node *rnp;
562 rdp->gpnum = rsp->gpnum; 544
545 local_irq_save(flags);
546 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
549 local_irq_restore(flags);
550 return;
551 }
552 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags);
563} 554}
564 555
565/* 556/*
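note_new_gpnum() above (and rcu_process_gp_end() in the next hunk) share one locking shape: check the interesting condition racily outside the lock, take the lock only with spin_trylock(), and bail out if either step fails, relying on a later call to retry. Distilled into a generic sketch (names are placeholders):

#include <linux/spinlock.h>
#include <linux/types.h>

static void maybe_update(spinlock_t *lock, bool (*already_done)(void))
{
	unsigned long flags;

	local_irq_save(flags);
	if (already_done() ||		/* racy check outside the lock        */
	    !spin_trylock(lock)) {	/* or someone else holds it right now */
		local_irq_restore(flags);
		return;			/* harmless: we will be invoked again */
	}
	/* ... update the shared state under the lock ... */
	spin_unlock_irqrestore(lock, flags);
}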
@@ -583,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
583} 574}
584 575
585/* 576/*
577 * Advance this CPU's callbacks, but only if the current grace period
578 * has ended. This may be called only from the CPU to whom the rdp
579 * belongs. In addition, the corresponding leaf rcu_node structure's
580 * ->lock must be held by the caller, with irqs disabled.
581 */
582static void
583__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
584{
585 /* Did another grace period end? */
586 if (rdp->completed != rnp->completed) {
587
588 /* Advance callbacks. No harm if list empty. */
589 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
590 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
591 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
592
593 /* Remember that we saw this grace-period completion. */
594 rdp->completed = rnp->completed;
595 }
596}
597
598/*
599 * Advance this CPU's callbacks, but only if the current grace period
600 * has ended. This may be called only from the CPU to whom the rdp
601 * belongs.
602 */
603static void
604rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
605{
606 unsigned long flags;
607 struct rcu_node *rnp;
608
609 local_irq_save(flags);
610 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
613 local_irq_restore(flags);
614 return;
615 }
616 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags);
618}
619
620/*
621 * Do per-CPU grace-period initialization for running CPU. The caller
622 * must hold the lock of the leaf rcu_node structure corresponding to
623 * this CPU.
624 */
625static void
626rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
627{
628 /* Prior grace period ended, so advance callbacks for current CPU. */
629 __rcu_process_gp_end(rsp, rnp, rdp);
630
631 /*
632 * Because this CPU just now started the new grace period, we know
633 * that all of its callbacks will be covered by this upcoming grace
634 * period, even the ones that were registered arbitrarily recently.
635 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
636 *
637 * Other CPUs cannot be sure exactly when the grace period started.
638 * Therefore, their recently registered callbacks must pass through
639 * an additional RCU_NEXT_READY stage, so that they will be handled
640 * by the next RCU grace period.
641 */
642 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
643 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
644
645 /* Set state so that this CPU will detect the next quiescent state. */
646 __note_new_gpnum(rsp, rnp, rdp);
647}
648
649/*
586 * Start a new RCU grace period if warranted, re-initializing the hierarchy 650 * Start a new RCU grace period if warranted, re-initializing the hierarchy
587 * in preparation for detecting the next grace period. The caller must hold 651 * in preparation for detecting the next grace period. The caller must hold
588 * the root node's ->lock, which is released before return. Hard irqs must 652 * the root node's ->lock, which is released before return. Hard irqs must
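The tail-pointer shuffling in __rcu_process_gp_end() and rcu_start_gp_per_cpu() is easier to follow with the per-CPU callback list drawn out. The sketch below reflects my reading of the code; the ->nxtlist head and the RCU_*_TAIL indices live in rcutree.h, which is not part of this hunk.

/*
 * rdp->nxtlist -> [DONE cbs] -> [WAIT cbs] -> [NEXT_READY cbs] -> [NEXT cbs] -> NULL
 *                            ^             ^                   ^              ^
 *        nxttail[RCU_DONE_TAIL]   [RCU_WAIT_TAIL]   [RCU_NEXT_READY_TAIL]   [RCU_NEXT_TAIL]
 *
 * When rnp->completed moves past rdp->completed, DONE absorbs WAIT and WAIT
 * absorbs NEXT_READY purely by copying tail pointers; no callback is moved.
 * When this CPU itself starts a grace period, everything up to NEXT can be
 * promoted into WAIT, because no callback can be newer than the grace period
 * being started.
 */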
@@ -596,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
596 struct rcu_node *rnp = rcu_get_root(rsp); 660 struct rcu_node *rnp = rcu_get_root(rsp);
597 661
598 if (!cpu_needs_another_gp(rsp, rdp)) { 662 if (!cpu_needs_another_gp(rsp, rdp)) {
599 spin_unlock_irqrestore(&rnp->lock, flags); 663 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags);
665 return;
666 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */
668
669 /*
670 * Propagate new ->completed value to rcu_node structures
671 * so that other CPUs don't have to wait until the start
672 * of the next grace period to process their callbacks.
673 */
674 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 }
679 local_irq_restore(flags);
600 return; 680 return;
601 } 681 }
602 682
@@ -606,29 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
606 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 686 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
607 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 687 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
608 record_gp_stall_check_time(rsp); 688 record_gp_stall_check_time(rsp);
609 dyntick_record_completed(rsp, rsp->completed - 1);
610 note_new_gpnum(rsp, rdp);
611
612 /*
613 * Because this CPU just now started the new grace period, we know
614 * that all of its callbacks will be covered by this upcoming grace
615 * period, even the ones that were registered arbitrarily recently.
616 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
617 *
618 * Other CPUs cannot be sure exactly when the grace period started.
619 * Therefore, their recently registered callbacks must pass through
620 * an additional RCU_NEXT_READY stage, so that they will be handled
621 * by the next RCU grace period.
622 */
623 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
624 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
625 689
626 /* Special-case the common single-level case. */ 690 /* Special-case the common single-level case. */
627 if (NUM_RCU_NODES == 1) { 691 if (NUM_RCU_NODES == 1) {
628 rcu_preempt_check_blocked_tasks(rnp); 692 rcu_preempt_check_blocked_tasks(rnp);
629 rnp->qsmask = rnp->qsmaskinit; 693 rnp->qsmask = rnp->qsmaskinit;
630 rnp->gpnum = rsp->gpnum; 694 rnp->gpnum = rsp->gpnum;
695 rnp->completed = rsp->completed;
631 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp);
632 spin_unlock_irqrestore(&rnp->lock, flags); 698 spin_unlock_irqrestore(&rnp->lock, flags);
633 return; 699 return;
634 } 700 }
@@ -661,6 +727,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
661 rcu_preempt_check_blocked_tasks(rnp); 727 rcu_preempt_check_blocked_tasks(rnp);
662 rnp->qsmask = rnp->qsmaskinit; 728 rnp->qsmask = rnp->qsmaskinit;
663 rnp->gpnum = rsp->gpnum; 729 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp);
664 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 733 spin_unlock(&rnp->lock); /* irqs remain disabled. */
665 } 734 }
666 735
@@ -672,58 +741,32 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672} 741}
673 742
674/* 743/*
675 * Advance this CPU's callbacks, but only if the current grace period 744 * Report a full set of quiescent states to the specified rcu_state
676 * has ended. This may be called only from the CPU to whom the rdp 745 * data structure. This involves cleaning up after the prior grace
677 * belongs. 746 * period and letting rcu_start_gp() start up the next grace period
747 * if one is needed. Note that the caller must hold rnp->lock, as
748 * required by rcu_start_gp(), which will release it.
678 */ 749 */
679static void 750static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
680rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
681{
682 long completed_snap;
683 unsigned long flags;
684
685 local_irq_save(flags);
686 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
687
688 /* Did another grace period end? */
689 if (rdp->completed != completed_snap) {
690
691 /* Advance callbacks. No harm if list empty. */
692 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
693 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
694 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
695
696 /* Remember that we saw this grace-period completion. */
697 rdp->completed = completed_snap;
698 }
699 local_irq_restore(flags);
700}
701
702/*
703 * Clean up after the prior grace period and let rcu_start_gp() start up
704 * the next grace period if one is needed. Note that the caller must
705 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
706 */
707static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
708 __releases(rcu_get_root(rsp)->lock) 751 __releases(rcu_get_root(rsp)->lock)
709{ 752{
710 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 753 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
711 rsp->completed = rsp->gpnum; 754 rsp->completed = rsp->gpnum;
712 rsp->signaled = RCU_GP_IDLE; 755 rsp->signaled = RCU_GP_IDLE;
713 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
714 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 756 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
715} 757}
716 758
717/* 759/*
718 * Similar to cpu_quiet(), for which it is a helper function. Allows 760 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
719 * a group of CPUs to be quieted at one go, though all the CPUs in the 761 * Allows quiescent states for a group of CPUs to be reported at one go
720 * group must be represented by the same leaf rcu_node structure. 762 * to the specified rcu_node structure, though all the CPUs in the group
721 * That structure's lock must be held upon entry, and it is released 763 * must be represented by the same rcu_node structure (which need not be
722 * before return. 764 * a leaf rcu_node structure, though it often will be). That structure's
765 * lock must be held upon entry, and it is released before return.
723 */ 766 */
724static void 767static void
725cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, 768rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
726 unsigned long flags) 769 struct rcu_node *rnp, unsigned long flags)
727 __releases(rnp->lock) 770 __releases(rnp->lock)
728{ 771{
729 struct rcu_node *rnp_c; 772 struct rcu_node *rnp_c;
@@ -759,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
759 802
760 /* 803 /*
761 * Get here if we are the last CPU to pass through a quiescent 804 * Get here if we are the last CPU to pass through a quiescent
762 * state for this grace period. Invoke cpu_quiet_msk_finish() 805 * state for this grace period. Invoke rcu_report_qs_rsp()
763 * to clean up and start the next grace period if one is needed. 806 * to clean up and start the next grace period if one is needed.
764 */ 807 */
765 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ 808 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
766} 809}
767 810
768/* 811/*
769 * Record a quiescent state for the specified CPU, which must either be 812 * Record a quiescent state for the specified CPU to that CPU's rcu_data
770 * the current CPU. The lastcomp argument is used to make sure we are 813 * structure. This must be either called from the specified CPU, or
771 * still in the grace period of interest. We don't want to end the current 814 * called when the specified CPU is known to be offline (and when it is
772 * grace period based on quiescent states detected in an earlier grace 815 * also known that no other CPU is concurrently trying to help the offline
773 * period! 816 * CPU). The lastcomp argument is used to make sure we are still in the
817 * grace period of interest. We don't want to end the current grace period
818 * based on quiescent states detected in an earlier grace period!
774 */ 819 */
775static void 820static void
776cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 821rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
777{ 822{
778 unsigned long flags; 823 unsigned long flags;
779 unsigned long mask; 824 unsigned long mask;
@@ -781,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
781 826
782 rnp = rdp->mynode; 827 rnp = rdp->mynode;
783 spin_lock_irqsave(&rnp->lock, flags); 828 spin_lock_irqsave(&rnp->lock, flags);
784 if (lastcomp != ACCESS_ONCE(rsp->completed)) { 829 if (lastcomp != rnp->completed) {
785 830
786 /* 831 /*
787 * Someone beat us to it for this grace period, so leave. 832 * Someone beat us to it for this grace period, so leave.
788 * The race with GP start is resolved by the fact that we 833 * The race with GP start is resolved by the fact that we
789 * hold the leaf rcu_node lock, so that the per-CPU bits 834 * hold the leaf rcu_node lock, so that the per-CPU bits
790 * cannot yet be initialized -- so we would simply find our 835 * cannot yet be initialized -- so we would simply find our
791 * CPU's bit already cleared in cpu_quiet_msk() if this race 836 * CPU's bit already cleared in rcu_report_qs_rnp() if this
792 * occurred. 837 * race occurred.
793 */ 838 */
794 rdp->passed_quiesc = 0; /* try again later! */ 839 rdp->passed_quiesc = 0; /* try again later! */
795 spin_unlock_irqrestore(&rnp->lock, flags); 840 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -807,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
807 */ 852 */
808 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 853 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
809 854
810	cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 855	rcu_report_qs_rnp(mask, rsp, rnp, flags); /* releases rnp->lock */
811 } 856 }
812} 857}
813 858
@@ -838,8 +883,11 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
838 if (!rdp->passed_quiesc) 883 if (!rdp->passed_quiesc)
839 return; 884 return;
840 885
841 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ 886 /*
842 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 887 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
888 * judge of that).
889 */
890 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
843} 891}
844 892
845#ifdef CONFIG_HOTPLUG_CPU 893#ifdef CONFIG_HOTPLUG_CPU
@@ -899,8 +947,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
899static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 947static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
900{ 948{
901 unsigned long flags; 949 unsigned long flags;
902 long lastcomp;
903 unsigned long mask; 950 unsigned long mask;
951 int need_report = 0;
904 struct rcu_data *rdp = rsp->rda[cpu]; 952 struct rcu_data *rdp = rsp->rda[cpu];
905 struct rcu_node *rnp; 953 struct rcu_node *rnp;
906 954
@@ -914,30 +962,32 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
914 spin_lock(&rnp->lock); /* irqs already disabled. */ 962 spin_lock(&rnp->lock); /* irqs already disabled. */
915 rnp->qsmaskinit &= ~mask; 963 rnp->qsmaskinit &= ~mask;
916 if (rnp->qsmaskinit != 0) { 964 if (rnp->qsmaskinit != 0) {
917 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 965 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */
918 break; 967 break;
919 } 968 }
920 969 if (rnp == rdp->mynode)
921 /* 970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
922 * If there was a task blocking the current grace period, 971 else
923 * and if all CPUs have checked in, we need to propagate 972 spin_unlock(&rnp->lock); /* irqs remain disabled. */
924 * the quiescent state up the rcu_node hierarchy. But that
925 * is inconvenient at the moment due to deadlock issues if
926 * this should end the current grace period. So set the
927 * offlined CPU's bit in ->qsmask in order to force the
928 * next force_quiescent_state() invocation to clean up this
929 * mess in a deadlock-free manner.
930 */
931 if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
932 rnp->qsmask |= mask;
933
934 mask = rnp->grpmask; 973 mask = rnp->grpmask;
935 spin_unlock(&rnp->lock); /* irqs remain disabled. */
936 rnp = rnp->parent; 974 rnp = rnp->parent;
937 } while (rnp != NULL); 975 } while (rnp != NULL);
938 lastcomp = rsp->completed;
939 976
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 977 /*
978 * We still hold the leaf rcu_node structure lock here, and
979 * irqs are still disabled. The reason for this subterfuge is
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock.
982 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags);
987 else
988 spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp);
941 991
942 rcu_adopt_orphan_cbs(rsp); 992 rcu_adopt_orphan_cbs(rsp);
943} 993}
@@ -1109,7 +1159,7 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1109 rcu_for_each_leaf_node(rsp, rnp) { 1159 rcu_for_each_leaf_node(rsp, rnp) {
1110 mask = 0; 1160 mask = 0;
1111 spin_lock_irqsave(&rnp->lock, flags); 1161 spin_lock_irqsave(&rnp->lock, flags);
1112 if (rsp->completed != lastcomp) { 1162 if (rnp->completed != lastcomp) {
1113 spin_unlock_irqrestore(&rnp->lock, flags); 1163 spin_unlock_irqrestore(&rnp->lock, flags);
1114 return 1; 1164 return 1;
1115 } 1165 }
@@ -1123,10 +1173,10 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1123 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1124 mask |= bit; 1174 mask |= bit;
1125 } 1175 }
1126 if (mask != 0 && rsp->completed == lastcomp) { 1176 if (mask != 0 && rnp->completed == lastcomp) {
1127 1177
1128 /* cpu_quiet_msk() releases rnp->lock. */ 1178 /* rcu_report_qs_rnp() releases rnp->lock. */
1129 cpu_quiet_msk(mask, rsp, rnp, flags); 1179 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1130 continue; 1180 continue;
1131 } 1181 }
1132 spin_unlock_irqrestore(&rnp->lock, flags); 1182 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1144,6 +1194,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1144 long lastcomp; 1194 long lastcomp;
1145 struct rcu_node *rnp = rcu_get_root(rsp); 1195 struct rcu_node *rnp = rcu_get_root(rsp);
1146 u8 signaled; 1196 u8 signaled;
1197 u8 forcenow;
1147 1198
1148 if (!rcu_gp_in_progress(rsp)) 1199 if (!rcu_gp_in_progress(rsp))
1149 return; /* No grace period in progress, nothing to force. */ 1200 return; /* No grace period in progress, nothing to force. */
@@ -1156,10 +1207,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1156 goto unlock_ret; /* no emergency and done recently. */ 1207 goto unlock_ret; /* no emergency and done recently. */
1157 rsp->n_force_qs++; 1208 rsp->n_force_qs++;
1158 spin_lock(&rnp->lock); 1209 spin_lock(&rnp->lock);
1159 lastcomp = rsp->completed; 1210 lastcomp = rsp->gpnum - 1;
1160 signaled = rsp->signaled; 1211 signaled = rsp->signaled;
1161 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1162 if (lastcomp == rsp->gpnum) { 1213 if (!rcu_gp_in_progress(rsp)) {
1163 rsp->n_force_qs_ngp++; 1214 rsp->n_force_qs_ngp++;
1164 spin_unlock(&rnp->lock); 1215 spin_unlock(&rnp->lock);
1165 goto unlock_ret; /* no GP in progress, time updated. */ 1216 goto unlock_ret; /* no GP in progress, time updated. */
@@ -1180,21 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1180 if (rcu_process_dyntick(rsp, lastcomp, 1231 if (rcu_process_dyntick(rsp, lastcomp,
1181 dyntick_save_progress_counter)) 1232 dyntick_save_progress_counter))
1182 goto unlock_ret; 1233 goto unlock_ret;
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1183 1237
1184 /* Update state, record completion counter. */ 1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1185 spin_lock(&rnp->lock); 1240 spin_lock(&rnp->lock);
1186 if (lastcomp == rsp->completed && 1241 if (lastcomp + 1 == rsp->gpnum &&
1187 rsp->signaled == RCU_SAVE_DYNTICK) { 1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1188 rsp->signaled = RCU_FORCE_QS; 1244 rsp->signaled = RCU_FORCE_QS;
1189 dyntick_record_completed(rsp, lastcomp); 1245 rsp->completed_fqs = lastcomp;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1190 } 1247 }
1191 spin_unlock(&rnp->lock); 1248 spin_unlock(&rnp->lock);
1192 break; 1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1193 1252
1194 case RCU_FORCE_QS: 1253 case RCU_FORCE_QS:
1195 1254
1196 /* Check dyntick-idle state, send IPI to laggards. */ 1255 /* Check dyntick-idle state, send IPI to laggards. */
1197 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), 1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs,
1198 rcu_implicit_dynticks_qs)) 1257 rcu_implicit_dynticks_qs))
1199 goto unlock_ret; 1258 goto unlock_ret;
1200 1259
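
The rewritten switch above adds an RCU_SAVE_COMPLETED stage between RCU_SAVE_DYNTICK and RCU_FORCE_QS, with deliberate fall-through when the force can happen immediately. A much-simplified, compilable sketch of that progression (the early exits, the locking, and the CONFIG_NO_HZ distinction are omitted, and the state names are shortened):

#include <stdio.h>

enum fqs_state { SAVE_DYNTICK, SAVE_COMPLETED, FORCE_QS };

/* One pass of the simplified state machine; returns the next state. */
static enum fqs_state fqs_step(enum fqs_state state)
{
        int forcenow = 0;

        switch (state) {
        case SAVE_DYNTICK:
                printf("snapshot per-CPU dyntick counters\n");
                /* fall into next case, as in the patch */
        case SAVE_COMPLETED:
                printf("record the ->completed snapshot\n");
                forcenow = (state == SAVE_COMPLETED);
                if (!forcenow)
                        return FORCE_QS;        /* force on the next invocation */
                /* fall into next case */
        case FORCE_QS:
                printf("re-check dynticks, IPI the holdout CPUs\n");
                break;
        }
        return FORCE_QS;
}

int main(void)
{
        enum fqs_state s = SAVE_DYNTICK;

        s = fqs_step(s);        /* first pass: snapshot, defer the force */
        fqs_step(s);            /* second pass: actually force quiescent states */
        return 0;
}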
@@ -1351,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1351} 1410}
1352EXPORT_SYMBOL_GPL(call_rcu_bh); 1411EXPORT_SYMBOL_GPL(call_rcu_bh);
1353 1412
1413/**
1414 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1415 *
1416 * Control will return to the caller some time after a full rcu-sched
1417 * grace period has elapsed, in other words after all currently executing
1418 * rcu-sched read-side critical sections have completed. These read-side
1419 * critical sections are delimited by rcu_read_lock_sched() and
1420 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
1421 * local_irq_disable(), and so on may be used in place of
1422 * rcu_read_lock_sched().
1423 *
1424 * This means that all preempt_disable code sequences, including NMI and
1425 * hardware-interrupt handlers, in progress on entry will have completed
1426 * before this primitive returns. However, this does not guarantee that
1427 * softirq handlers will have completed, since in some kernels, these
1428 * handlers can run in process context, and can block.
1429 *
1430 * This primitive provides the guarantees made by the (now removed)
1431 * synchronize_kernel() API. In contrast, synchronize_rcu() only
1432 * guarantees that rcu_read_lock() sections will have completed.
1433 * In "classic RCU", these two guarantees happen to be one and
1434 * the same, but can differ in realtime RCU implementations.
1435 */
1436void synchronize_sched(void)
1437{
1438 struct rcu_synchronize rcu;
1439
1440 if (rcu_blocking_is_gp())
1441 return;
1442
1443 init_completion(&rcu.completion);
1444 /* Will wake me after RCU finished. */
1445 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1446 /* Wait for it. */
1447 wait_for_completion(&rcu.completion);
1448}
1449EXPORT_SYMBOL_GPL(synchronize_sched);
1450
1451/**
1452 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
1453 *
1454 * Control will return to the caller some time after a full rcu_bh grace
1455 * period has elapsed, in other words after all currently executing rcu_bh
1456 * read-side critical sections have completed. RCU read-side critical
1457 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
1458 * and may be nested.
1459 */
1460void synchronize_rcu_bh(void)
1461{
1462 struct rcu_synchronize rcu;
1463
1464 if (rcu_blocking_is_gp())
1465 return;
1466
1467 init_completion(&rcu.completion);
1468 /* Will wake me after RCU finished. */
1469 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1470 /* Wait for it. */
1471 wait_for_completion(&rcu.completion);
1472}
1473EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1474
1354/* 1475/*
1355 * Check to see if there is any immediate RCU-related work to be done 1476 * Check to see if there is any immediate RCU-related work to be done
1356 * by the current CPU, for the specified type of RCU, returning 1 if so. 1477 * by the current CPU, for the specified type of RCU, returning 1 if so.
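
The kernel-doc above spells out when synchronize_sched() is the right tool. A hedged usage sketch follows; struct foo, global_foo, update_foo() and read_foo() are invented for illustration, while rcu_assign_pointer(), rcu_dereference(), preempt_disable()/preempt_enable(), synchronize_sched() and kfree() are the real interfaces the comment refers to.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {                            /* hypothetical example structure */
        int data;
};
static struct foo *global_foo;

/* Updater: publish the new version, wait for readers, free the old one.
 * The caller is assumed to hold whatever update-side lock serializes writers. */
static void update_foo(struct foo *newp)
{
        struct foo *oldp = global_foo;

        rcu_assign_pointer(global_foo, newp);
        synchronize_sched();            /* all preempt_disable() readers have finished */
        kfree(oldp);
}

/* Reader: preempt_disable() marks an rcu-sched read-side critical section. */
static int read_foo(void)
{
        struct foo *p;
        int val = -1;

        preempt_disable();
        p = rcu_dereference(global_foo);
        if (p)
                val = p->data;
        preempt_enable();
        return val;
}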
@@ -1360,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1360 */ 1481 */
1361static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 1482static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1362{ 1483{
1484 struct rcu_node *rnp = rdp->mynode;
1485
1363 rdp->n_rcu_pending++; 1486 rdp->n_rcu_pending++;
1364 1487
1365 /* Check for CPU stalls, if enabled. */ 1488 /* Check for CPU stalls, if enabled. */
@@ -1384,13 +1507,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1384 } 1507 }
1385 1508
1386 /* Has another RCU grace period completed? */ 1509 /* Has another RCU grace period completed? */
1387 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ 1510 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
1388 rdp->n_rp_gp_completed++; 1511 rdp->n_rp_gp_completed++;
1389 return 1; 1512 return 1;
1390 } 1513 }
1391 1514
1392 /* Has a new RCU grace period started? */ 1515 /* Has a new RCU grace period started? */
1393 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ 1516 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
1394 rdp->n_rp_gp_started++; 1517 rdp->n_rp_gp_started++;
1395 return 1; 1518 return 1;
1396 } 1519 }
@@ -1433,6 +1556,21 @@ int rcu_needs_cpu(int cpu)
1433 rcu_preempt_needs_cpu(cpu); 1556 rcu_preempt_needs_cpu(cpu);
1434} 1557}
1435 1558
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1436static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1437static atomic_t rcu_barrier_cpu_count; 1575static atomic_t rcu_barrier_cpu_count;
1438static DEFINE_MUTEX(rcu_barrier_mutex); 1576static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1544,21 +1682,16 @@ static void __cpuinit
1544rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1682rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1545{ 1683{
1546 unsigned long flags; 1684 unsigned long flags;
1547 long lastcomp;
1548 unsigned long mask; 1685 unsigned long mask;
1549 struct rcu_data *rdp = rsp->rda[cpu]; 1686 struct rcu_data *rdp = rsp->rda[cpu];
1550 struct rcu_node *rnp = rcu_get_root(rsp); 1687 struct rcu_node *rnp = rcu_get_root(rsp);
1551 1688
1552 /* Set up local state, ensuring consistent view of global state. */ 1689 /* Set up local state, ensuring consistent view of global state. */
1553 spin_lock_irqsave(&rnp->lock, flags); 1690 spin_lock_irqsave(&rnp->lock, flags);
1554 lastcomp = rsp->completed;
1555 rdp->completed = lastcomp;
1556 rdp->gpnum = lastcomp;
1557 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1558 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1559 rdp->beenonline = 1; /* We have now been online. */ 1693 rdp->beenonline = 1; /* We have now been online. */
1560 rdp->preemptable = preemptable; 1694 rdp->preemptable = preemptable;
1561 rdp->passed_quiesc_completed = lastcomp - 1;
1562 rdp->qlen_last_fqs_check = 0; 1695 rdp->qlen_last_fqs_check = 0;
1563 rdp->n_force_qs_snap = rsp->n_force_qs; 1696 rdp->n_force_qs_snap = rsp->n_force_qs;
1564 rdp->blimit = blimit; 1697 rdp->blimit = blimit;
@@ -1580,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1580 spin_lock(&rnp->lock); /* irqs already disabled. */ 1713 spin_lock(&rnp->lock); /* irqs already disabled. */
1581 rnp->qsmaskinit |= mask; 1714 rnp->qsmaskinit |= mask;
1582 mask = rnp->grpmask; 1715 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) {
1717 rdp->gpnum = rnp->completed; /* if GP in progress... */
1718 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 }
1583 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1721 spin_unlock(&rnp->lock); /* irqs already disabled. */
1584 rnp = rnp->parent; 1722 rnp = rnp->parent;
1585 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
@@ -1597,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
1597/* 1735/*
1598 * Handle CPU online/offline notification events. 1736 * Handle CPU online/offline notification events.
1599 */ 1737 */
1600int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1738static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1601 unsigned long action, void *hcpu) 1739 unsigned long action, void *hcpu)
1602{ 1740{
1603 long cpu = (long)hcpu; 1741 long cpu = (long)hcpu;
1604 1742
@@ -1685,8 +1823,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1685 cpustride *= rsp->levelspread[i]; 1823 cpustride *= rsp->levelspread[i];
1686 rnp = rsp->level[i]; 1824 rnp = rsp->level[i];
1687 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1688 if (rnp != rcu_get_root(rsp)) 1826 spin_lock_init(&rnp->lock);
1689 spin_lock_init(&rnp->lock); 1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1690 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1691 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1692 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1707,9 +1845,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1707 rnp->level = i; 1845 rnp->level = i;
1708 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 1846 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1709 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1847 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1848 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1710 } 1850 }
1711 } 1851 }
1712 spin_lock_init(&rcu_get_root(rsp)->lock);
1713} 1852}
1714 1853
1715/* 1854/*
@@ -1735,16 +1874,30 @@ do { \
1735 } \ 1874 } \
1736} while (0) 1875} while (0)
1737 1876
1738void __init __rcu_init(void) 1877void __init rcu_init(void)
1739{ 1878{
1879 int i;
1880
1740 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1741#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1742 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1743#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1744 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1745 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1746 __rcu_init_preempt(); 1890 __rcu_init_preempt();
1747 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1891 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1892
1893 /*
1894 * We don't need protection against CPU-hotplug here because
1895 * this is called early in boot, before either interrupts
1896 * or the scheduler are operational.
1897 */
1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);
1748} 1901}
1749 1902
1750#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1899023b0962..d2a0046f63b2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere. 35 * bug somewhere.
36 */ 36 */
37#define MAX_RCU_LVLS 3 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
41 42
42#if NR_CPUS <= RCU_FANOUT 43#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1 44# define NUM_RCU_LVLS 1
@@ -45,23 +46,33 @@
45# define NUM_RCU_LVL_1 (NR_CPUS) 46# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0 47# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0 48# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0
48#elif NR_CPUS <= RCU_FANOUT_SQ 50#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 51# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 52# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 54# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 55# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 57#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 58# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 59# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 62# define NUM_RCU_LVL_3 NR_CPUS
63# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH
65# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70# define NUM_RCU_LVL_4 NR_CPUS
60#else 71#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 73#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63 74
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) 75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66 77
67/* 78/*
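
The new RCU_FANOUT_FOURTH branch makes the level sizes easy to mis-count, so a worked example may help. The sketch below assumes CONFIG_RCU_FANOUT=16 and NR_CPUS=65536 purely for illustration and reproduces the DIV_ROUND_UP arithmetic of the macros above:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        /* Assumed values for illustration only. */
        const long fanout = 16, nr_cpus = 65536;
        long lvl[5], sum = 0;
        int i;

        lvl[0] = 1;                                             /* root */
        lvl[1] = DIV_ROUND_UP(nr_cpus, fanout * fanout * fanout);
        lvl[2] = DIV_ROUND_UP(nr_cpus, fanout * fanout);
        lvl[3] = DIV_ROUND_UP(nr_cpus, fanout);                 /* leaf rcu_node level */
        lvl[4] = nr_cpus;                                       /* per-CPU rcu_data, not rcu_node */

        for (i = 0; i <= 4; i++) {
                sum += lvl[i];
                printf("NUM_RCU_LVL_%d = %ld\n", i, lvl[i]);
        }
        printf("NUM_RCU_NODES = %ld\n", sum - nr_cpus);         /* 1 + 16 + 256 + 4096 = 4369 */
        return 0;
}

The final subtraction reflects the fact that the last NUM_RCU_LVL_* value counts per-CPU rcu_data structures rather than rcu_node structures, which is why NUM_RCU_NODES is RCU_SUM minus NR_CPUS.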
@@ -84,14 +95,21 @@ struct rcu_node {
84 long gpnum; /* Current grace period for this node. */ 95 long gpnum; /* Current grace period for this node. */
85 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
86 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */
99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */
87 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
88 /* order for current grace period to proceed.*/ 102 /* order for current grace period to proceed.*/
89 /* In leaf rcu_node, each bit corresponds to */ 103 /* In leaf rcu_node, each bit corresponds to */
90 /* an rcu_data structure, otherwise, each */ 104 /* an rcu_data structure, otherwise, each */
91 /* bit corresponds to a child rcu_node */ 105 /* bit corresponds to a child rcu_node */
92 /* structure. */ 106 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */
108 /* elements that need to drain to allow the */
109 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */
93 unsigned long qsmaskinit; 111 unsigned long qsmaskinit;
94 /* Per-GP initialization for qsmask. */ 112 /* Per-GP initial value for qsmask & expmask. */
95 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 113 unsigned long grpmask; /* Mask to apply to parent qsmask. */
96 /* Only one bit will be set in this mask. */ 114 /* Only one bit will be set in this mask. */
97 int grplo; /* lowest-numbered CPU or group here. */ 115 int grplo; /* lowest-numbered CPU or group here. */
@@ -99,7 +117,7 @@ struct rcu_node {
99 u8 grpnum; /* CPU/group number for next level up. */ 117 u8 grpnum; /* CPU/group number for next level up. */
100 u8 level; /* root is at level 0. */ 118 u8 level; /* root is at level 0. */
101 struct rcu_node *parent; 119 struct rcu_node *parent;
102 struct list_head blocked_tasks[2]; 120 struct list_head blocked_tasks[4];
103 /* Tasks blocked in RCU read-side critsect. */ 121 /* Tasks blocked in RCU read-side critsect. */
104 /* Grace period number (->gpnum) x blocked */ 122 /* Grace period number (->gpnum) x blocked */
105 /* by tasks on the (x & 0x1) element of the */ 123 /* by tasks on the (x & 0x1) element of the */
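
Doubling blocked_tasks[] from two to four lists gives one normal-GP and one expedited-GP list per grace-period phase. The helper below is not part of the patch; it only spells out the indexing convention used by rcu_preempted_readers() and rcu_preempted_readers_exp() later in this series:

#include <stdio.h>

/* Illustrative only: how the four lists are indexed by the new code. */
static int blocked_tasks_index(long gpnum, int expedited)
{
        return (int)(gpnum & 0x1) + (expedited ? 2 : 0);
}

int main(void)
{
        long gpnum = 5;         /* an odd-numbered grace period, for example */

        printf("normal:    blocked_tasks[%d]\n", blocked_tasks_index(gpnum, 0));  /* 1 */
        printf("expedited: blocked_tasks[%d]\n", blocked_tasks_index(gpnum, 1));  /* 3 */
        return 0;
}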
@@ -114,6 +132,21 @@ struct rcu_node {
114 for ((rnp) = &(rsp)->node[0]; \ 132 for ((rnp) = &(rsp)->node[0]; \
115 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 133 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
116 134
135/*
136 * Do a breadth-first scan of the non-leaf rcu_node structures for the
137 * specified rcu_state structure. Note that if there is a singleton
138 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
139 */
140#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
141 for ((rnp) = &(rsp)->node[0]; \
142 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
143
144/*
145 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
146 * structure. Note that if there is a singleton rcu_node tree with but
147 * one rcu_node structure, this loop -will- visit the rcu_node structure.
148 * It is still a leaf node, even if it is also the root node.
149 */
117#define rcu_for_each_leaf_node(rsp, rnp) \ 150#define rcu_for_each_leaf_node(rsp, rnp) \
118 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 151 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
119 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 152 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
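
Taken together, the two iterators split rsp->node[] at rsp->level[NUM_RCU_LVLS - 1], the first leaf. A hypothetical debug walk (rcu_dump_node_spans() is invented; the macros and the grplo/grphi/level/qsmaskinit fields are from this patch) might look like:

/* Sketch only: dump which CPU spans each rcu_node covers. */
static void rcu_dump_node_spans(struct rcu_state *rsp)
{
        struct rcu_node *rnp;

        rcu_for_each_nonleaf_node_breadth_first(rsp, rnp)
                printk(KERN_DEBUG "inner %d:%d level %d\n",
                       rnp->grplo, rnp->grphi, rnp->level);
        rcu_for_each_leaf_node(rsp, rnp)
                printk(KERN_DEBUG "leaf  %d:%d qsmaskinit %lx\n",
                       rnp->grplo, rnp->grphi, rnp->qsmaskinit);
}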
@@ -204,11 +237,12 @@ struct rcu_data {
204#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
205#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
206#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
207#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
208#ifdef CONFIG_NO_HZ 242#ifdef CONFIG_NO_HZ
209#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
210#else /* #ifdef CONFIG_NO_HZ */ 244#else /* #ifdef CONFIG_NO_HZ */
211#define RCU_SIGNAL_INIT RCU_FORCE_QS 245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED
212#endif /* #else #ifdef CONFIG_NO_HZ */ 246#endif /* #else #ifdef CONFIG_NO_HZ */
213 247
214#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -246,7 +280,7 @@ struct rcu_state {
246 long gpnum; /* Current gp number. */ 280 long gpnum; /* Current gp number. */
247 long completed; /* # of last completed gp. */ 281 long completed; /* # of last completed gp. */
248 282
249 /* End of fields guarded by root rcu_node's lock. */ 283 /* End of fields guarded by root rcu_node's lock. */
250 284
251 spinlock_t onofflock; /* exclude on/offline and */ 285 spinlock_t onofflock; /* exclude on/offline and */
252 /* starting new GP. Also */ 286 /* starting new GP. Also */
@@ -260,6 +294,8 @@ struct rcu_state {
260 long orphan_qlen; /* Number of orphaned cbs. */ 294 long orphan_qlen; /* Number of orphaned cbs. */
261 spinlock_t fqslock; /* Only one task forcing */ 295 spinlock_t fqslock; /* Only one task forcing */
262 /* quiescent states. */ 296 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
263 unsigned long jiffies_force_qs; /* Time at which to invoke */ 299 unsigned long jiffies_force_qs; /* Time at which to invoke */
264 /* force_quiescent_state(). */ 300 /* force_quiescent_state(). */
265 unsigned long n_force_qs; /* Number of calls to */ 301 unsigned long n_force_qs; /* Number of calls to */
@@ -274,11 +310,15 @@ struct rcu_state {
274 unsigned long jiffies_stall; /* Time at which to check */ 310 unsigned long jiffies_stall; /* Time at which to check */
275 /* for CPU stalls. */ 311 /* for CPU stalls. */
276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 312#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
277#ifdef CONFIG_NO_HZ
278 long dynticks_completed; /* Value of completed @ snap. */
279#endif /* #ifdef CONFIG_NO_HZ */
280}; 313};
281 314
315/* Return values for rcu_preempt_offline_tasks(). */
316
317#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
318 /* GP were moved to root. */
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */
321
282#ifdef RCU_TREE_NONCORE 322#ifdef RCU_TREE_NONCORE
283 323
284/* 324/*
@@ -298,10 +338,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
298#else /* #ifdef RCU_TREE_NONCORE */ 338#else /* #ifdef RCU_TREE_NONCORE */
299 339
300/* Forward declarations for rcutree_plugin.h */ 340/* Forward declarations for rcutree_plugin.h */
301static inline void rcu_bootup_announce(void); 341static void rcu_bootup_announce(void);
302long rcu_batches_completed(void); 342long rcu_batches_completed(void);
303static void rcu_preempt_note_context_switch(int cpu); 343static void rcu_preempt_note_context_switch(int cpu);
304static int rcu_preempted_readers(struct rcu_node *rnp); 344static int rcu_preempted_readers(struct rcu_node *rnp);
345#ifdef CONFIG_HOTPLUG_CPU
346static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */
305#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
306static void rcu_print_task_stall(struct rcu_node *rnp); 350static void rcu_print_task_stall(struct rcu_node *rnp);
307#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -315,6 +359,9 @@ static void rcu_preempt_offline_cpu(int cpu);
315static void rcu_preempt_check_callbacks(int cpu); 359static void rcu_preempt_check_callbacks(int cpu);
316static void rcu_preempt_process_callbacks(void); 360static void rcu_preempt_process_callbacks(void);
317void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 361void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
362#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
363static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
364#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
318static int rcu_preempt_pending(int cpu); 365static int rcu_preempt_pending(int cpu);
319static int rcu_preempt_needs_cpu(int cpu); 366static int rcu_preempt_needs_cpu(int cpu);
320static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 367static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ef2a58c2b9d5..37fbccdf41d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,16 +24,19 @@
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */ 25 */
26 26
27#include <linux/delay.h>
27 28
28#ifdef CONFIG_TREE_PREEMPT_RCU 29#ifdef CONFIG_TREE_PREEMPT_RCU
29 30
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 32DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32 33
34static int rcu_preempted_readers_exp(struct rcu_node *rnp);
35
33/* 36/*
34 * Tell them what RCU they are running. 37 * Tell them what RCU they are running.
35 */ 38 */
36static inline void rcu_bootup_announce(void) 39static void __init rcu_bootup_announce(void)
37{ 40{
38 printk(KERN_INFO 41 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n"); 42 "Experimental preemptable hierarchical RCU implementation.\n");
@@ -67,7 +70,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
67static void rcu_preempt_qs(int cpu) 70static void rcu_preempt_qs(int cpu)
68{ 71{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 72 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed; 73 rdp->passed_quiesc_completed = rdp->gpnum - 1;
71 barrier(); 74 barrier();
72 rdp->passed_quiesc = 1; 75 rdp->passed_quiesc = 1;
73} 76}
@@ -157,14 +160,58 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
157 */ 160 */
158static int rcu_preempted_readers(struct rcu_node *rnp) 161static int rcu_preempted_readers(struct rcu_node *rnp)
159{ 162{
160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 163 int phase = rnp->gpnum & 0x1;
164
165 return !list_empty(&rnp->blocked_tasks[phase]) ||
166 !list_empty(&rnp->blocked_tasks[phase + 2]);
167}
168
169/*
170 * Record a quiescent state for all tasks that were previously queued
171 * on the specified rcu_node structure and that were blocking the current
172 * RCU grace period. The caller must hold the specified rnp->lock with
173 * irqs disabled, and this lock is released upon return, but irqs remain
174 * disabled.
175 */
176static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
177 __releases(rnp->lock)
178{
179 unsigned long mask;
180 struct rcu_node *rnp_p;
181
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */
185 }
186
187 rnp_p = rnp->parent;
188 if (rnp_p == NULL) {
189 /*
190 * Either there is only one rcu_node in the tree,
191 * or tasks were kicked up to root rcu_node due to
192 * CPUs going offline.
193 */
194 rcu_report_qs_rsp(&rcu_preempt_state, flags);
195 return;
196 }
197
198 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
161} 203}
162 204
205/*
206 * Handle special cases during rcu_read_unlock(), such as needing to
207 * notify RCU core processing or task having blocked during the RCU
208 * read-side critical section.
209 */
163static void rcu_read_unlock_special(struct task_struct *t) 210static void rcu_read_unlock_special(struct task_struct *t)
164{ 211{
165 int empty; 212 int empty;
213 int empty_exp;
166 unsigned long flags; 214 unsigned long flags;
167 unsigned long mask;
168 struct rcu_node *rnp; 215 struct rcu_node *rnp;
169 int special; 216 int special;
170 217
@@ -207,36 +254,30 @@ static void rcu_read_unlock_special(struct task_struct *t)
207 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 254 spin_unlock(&rnp->lock); /* irqs remain disabled. */
208 } 255 }
209 empty = !rcu_preempted_readers(rnp); 256 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp);
258 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
210 list_del_init(&t->rcu_node_entry); 259 list_del_init(&t->rcu_node_entry);
211 t->rcu_blocked_node = NULL; 260 t->rcu_blocked_node = NULL;
212 261
213 /* 262 /*
214 * If this was the last task on the current list, and if 263 * If this was the last task on the current list, and if
215 * we aren't waiting on any CPUs, report the quiescent state. 264 * we aren't waiting on any CPUs, report the quiescent state.
216 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() 265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
217 * drop rnp->lock and restore irq.
218 */ 266 */
219 if (!empty && rnp->qsmask == 0 && 267 if (empty)
220 !rcu_preempted_readers(rnp)) {
221 struct rcu_node *rnp_p;
222
223 if (rnp->parent == NULL) {
224 /* Only one rcu_node in the tree. */
225 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
226 return;
227 }
228 /* Report up the rest of the hierarchy. */
229 mask = rnp->grpmask;
230 spin_unlock_irqrestore(&rnp->lock, flags); 268 spin_unlock_irqrestore(&rnp->lock, flags);
231 rnp_p = rnp->parent; 269 else
232 spin_lock_irqsave(&rnp_p->lock, flags); 270 rcu_report_unblock_qs_rnp(rnp, flags);
233 WARN_ON_ONCE(rnp->qsmask); 271
234 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); 272 /*
235 return; 273 * If this was the last task on the expedited lists,
236 } 274 * then we need to report up the rcu_node hierarchy.
237 spin_unlock(&rnp->lock); 275 */
276 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
277 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
278 } else {
279 local_irq_restore(flags);
238 } 280 }
239 local_irq_restore(flags);
240} 281}
241 282
242/* 283/*
@@ -303,6 +344,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
303 * rcu_node. The reason for not just moving them to the immediate 344 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to 345 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock. 346 * make more than two attempts to acquire the target rcu_node's lock.
 347 * The return value additionally has RCU_OFL_TASKS_EXP_GP set if tasks
 348 * were blocking the current expedited grace period.
306 * 349 *
307 * Returns 1 if there was previously a task blocking the current grace 350 * Returns 1 if there was previously a task blocking the current grace
308 * period on the specified rcu_node structure. 351 * period on the specified rcu_node structure.
@@ -316,7 +359,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
316 int i; 359 int i;
317 struct list_head *lp; 360 struct list_head *lp;
318 struct list_head *lp_root; 361 struct list_head *lp_root;
319 int retval = rcu_preempted_readers(rnp); 362 int retval = 0;
320 struct rcu_node *rnp_root = rcu_get_root(rsp); 363 struct rcu_node *rnp_root = rcu_get_root(rsp);
321 struct task_struct *tp; 364 struct task_struct *tp;
322 365
@@ -326,7 +369,9 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
326 } 369 }
327 WARN_ON_ONCE(rnp != rdp->mynode && 370 WARN_ON_ONCE(rnp != rdp->mynode &&
328 (!list_empty(&rnp->blocked_tasks[0]) || 371 (!list_empty(&rnp->blocked_tasks[0]) ||
329 !list_empty(&rnp->blocked_tasks[1]))); 372 !list_empty(&rnp->blocked_tasks[1]) ||
373 !list_empty(&rnp->blocked_tasks[2]) ||
374 !list_empty(&rnp->blocked_tasks[3])));
330 375
331 /* 376 /*
332 * Move tasks up to root rcu_node. Rely on the fact that the 377 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -334,7 +379,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
334 * rcu_nodes in terms of gp_num value. This fact allows us to 379 * rcu_nodes in terms of gp_num value. This fact allows us to
335 * move the blocked_tasks[] array directly, element by element. 380 * move the blocked_tasks[] array directly, element by element.
336 */ 381 */
337 for (i = 0; i < 2; i++) { 382 if (rcu_preempted_readers(rnp))
383 retval |= RCU_OFL_TASKS_NORM_GP;
384 if (rcu_preempted_readers_exp(rnp))
385 retval |= RCU_OFL_TASKS_EXP_GP;
386 for (i = 0; i < 4; i++) {
338 lp = &rnp->blocked_tasks[i]; 387 lp = &rnp->blocked_tasks[i];
339 lp_root = &rnp_root->blocked_tasks[i]; 388 lp_root = &rnp_root->blocked_tasks[i];
340 while (!list_empty(lp)) { 389 while (!list_empty(lp)) {
@@ -346,7 +395,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
346 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
347 } 396 }
348 } 397 }
349
350 return retval; 398 return retval;
351} 399}
352 400
@@ -398,14 +446,183 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
398} 446}
399EXPORT_SYMBOL_GPL(call_rcu); 447EXPORT_SYMBOL_GPL(call_rcu);
400 448
449/**
450 * synchronize_rcu - wait until a grace period has elapsed.
451 *
452 * Control will return to the caller some time after a full grace
453 * period has elapsed, in other words after all currently executing RCU
454 * read-side critical sections have completed. RCU read-side critical
455 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
456 * and may be nested.
457 */
458void synchronize_rcu(void)
459{
460 struct rcu_synchronize rcu;
461
462 if (!rcu_scheduler_active)
463 return;
464
465 init_completion(&rcu.completion);
466 /* Will wake me after RCU finished. */
467 call_rcu(&rcu.head, wakeme_after_rcu);
468 /* Wait for it. */
469 wait_for_completion(&rcu.completion);
470}
471EXPORT_SYMBOL_GPL(synchronize_rcu);
472
473static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
474static long sync_rcu_preempt_exp_count;
475static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
476
401/* 477/*
402 * Wait for an rcu-preempt grace period. We are supposed to expedite the 478 * Return non-zero if there are any tasks in RCU read-side critical
403 * grace period, but this is the crude slow compatability hack, so just 479 * sections blocking the current preemptible-RCU expedited grace period.
404 * invoke synchronize_rcu(). 480 * If there is no preemptible-RCU expedited grace period currently in
481 * progress, returns zero unconditionally.
482 */
483static int rcu_preempted_readers_exp(struct rcu_node *rnp)
484{
485 return !list_empty(&rnp->blocked_tasks[2]) ||
486 !list_empty(&rnp->blocked_tasks[3]);
487}
488
489/*
 490 * Return non-zero if there is no RCU expedited grace period in progress
491 * for the specified rcu_node structure, in other words, if all CPUs and
492 * tasks covered by the specified rcu_node structure have done their bit
493 * for the current expedited grace period. Works only for preemptible
 494 * RCU -- other RCU implementations use other means.
495 *
496 * Caller must hold sync_rcu_preempt_exp_mutex.
497 */
498static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
499{
500 return !rcu_preempted_readers_exp(rnp) &&
501 ACCESS_ONCE(rnp->expmask) == 0;
502}
503
504/*
505 * Report the exit from RCU read-side critical section for the last task
506 * that queued itself during or before the current expedited preemptible-RCU
507 * grace period. This event is reported either to the rcu_node structure on
508 * which the task was queued or to one of that rcu_node structure's ancestors,
509 * recursively up the tree. (Calm down, calm down, we do the recursion
510 * iteratively!)
511 *
512 * Caller must hold sync_rcu_preempt_exp_mutex.
513 */
514static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
515{
516 unsigned long flags;
517 unsigned long mask;
518
519 spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp))
522 break;
523 if (rnp->parent == NULL) {
524 wake_up(&sync_rcu_preempt_exp_wq);
525 break;
526 }
527 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask;
532 }
533 spin_unlock_irqrestore(&rnp->lock, flags);
534}
535
536/*
537 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
538 * grace period for the specified rcu_node structure. If there are no such
539 * tasks, report it up the rcu_node hierarchy.
540 *
541 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
542 */
543static void
544sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{
546 int must_wait;
547
548 spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp);
555}
556
557/*
558 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
559 * is to invoke synchronize_sched_expedited() to push all the tasks to
560 * the ->blocked_tasks[] lists, move all entries from the first set of
561 * ->blocked_tasks[] lists to the second set, and finally wait for this
562 * second set to drain.
405 */ 563 */
406void synchronize_rcu_expedited(void) 564void synchronize_rcu_expedited(void)
407{ 565{
408 synchronize_rcu(); 566 unsigned long flags;
567 struct rcu_node *rnp;
568 struct rcu_state *rsp = &rcu_preempt_state;
569 long snap;
570 int trycount = 0;
571
572 smp_mb(); /* Caller's modifications seen first by other CPUs. */
573 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
574 smp_mb(); /* Above access cannot bleed into critical section. */
575
576 /*
577 * Acquire lock, falling back to synchronize_rcu() if too many
578 * lock-acquisition failures. Of course, if someone does the
579 * expedited grace period for us, just leave.
580 */
581 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
582 if (trycount++ < 10)
583 udelay(trycount * num_online_cpus());
584 else {
585 synchronize_rcu();
586 return;
587 }
588 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
589 goto mb_ret; /* Others did our work for us. */
590 }
591 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
592 goto unlock_mb_ret; /* Others did our work for us. */
593
 594 /* Force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited();
596
597 spin_lock_irqsave(&rsp->onofflock, flags);
598
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 }
605
606 /* Snapshot current state of ->blocked_tasks[] lists. */
607 rcu_for_each_leaf_node(rsp, rnp)
608 sync_rcu_preempt_exp_init(rsp, rnp);
609 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611
612 spin_unlock_irqrestore(&rsp->onofflock, flags);
613
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp);
616 wait_event(sync_rcu_preempt_exp_wq,
617 sync_rcu_preempt_exp_done(rnp));
618
619 /* Clean up and exit. */
620 smp_mb(); /* ensure expedited GP seen before counter increment. */
621 ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
622unlock_mb_ret:
623 mutex_unlock(&sync_rcu_preempt_exp_mutex);
624mb_ret:
625 smp_mb(); /* ensure subsequent action seen after grace period. */
409} 626}
410EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 627EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
411 628
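
The sync_rcu_preempt_exp_count handling above lets late arrivals piggy-back on an expedited grace period that a concurrent caller is already driving. A standalone sketch of just that counter idiom (single-threaded here, so the memory barriers and the mutex are omitted):

#include <stdio.h>

static long exp_count;          /* bumped once per completed expedited GP */

/* Non-zero if enough grace periods completed since the snapshot was taken. */
static int others_did_our_work(long snap)
{
        return (exp_count - snap) > 0;  /* difference, not direct compare, as in the patch */
}

int main(void)
{
        long snap = exp_count + 1;      /* mirrors snap = ...exp_count + 1 above */

        if (others_did_our_work(snap)) {
                printf("concurrent caller already covered us\n");
                return 0;
        }
        /* ...run the expedited grace period ourselves... */
        exp_count++;                    /* publish completion for later callers */
        printf("count is now %ld\n", exp_count);
        return 0;
}

Because snap is the pre-existing count plus one and the test requires the counter to move strictly past it, the fast path fires only after at least one full expedited grace period that began after the snapshot has completed; a grace period already in flight at snapshot time is not enough on its own.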
@@ -481,7 +698,7 @@ void exit_rcu(void)
481/* 698/*
482 * Tell them what RCU they are running. 699 * Tell them what RCU they are running.
483 */ 700 */
484static inline void rcu_bootup_announce(void) 701static void __init rcu_bootup_announce(void)
485{ 702{
486 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 703 printk(KERN_INFO "Hierarchical RCU implementation.\n");
487} 704}
@@ -512,6 +729,16 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
512 return 0; 729 return 0;
513} 730}
514 731
732#ifdef CONFIG_HOTPLUG_CPU
733
734/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{
737 spin_unlock_irqrestore(&rnp->lock, flags);
738}
739
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */
741
515#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 742#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
516 743
517/* 744/*
@@ -594,6 +821,20 @@ void synchronize_rcu_expedited(void)
594} 821}
595EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 822EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
596 823
824#ifdef CONFIG_HOTPLUG_CPU
825
826/*
827 * Because preemptable RCU does not exist, there is never any need to
828 * report on tasks preempted in RCU read-side critical sections during
829 * expedited RCU grace periods.
830 */
831static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
832{
833 return;
834}
835
836#endif /* #ifdef CONFIG_HOTPLUG_CPU */
837
597/* 838/*
598 * Because preemptable RCU does not exist, it never has any work to do. 839 * Because preemptable RCU does not exist, it never has any work to do.
599 */ 840 */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b31c779e62e..9d2c88423b31 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -155,12 +155,15 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum;
158 int level = 0; 159 int level = 0;
160 int phase;
159 struct rcu_node *rnp; 161 struct rcu_node *rnp;
160 162
163 gpnum = rsp->gpnum;
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 168 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 169 rsp->n_force_qs, rsp->n_force_qs_ngp,
@@ -171,8 +174,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 seq_puts(m, "\n"); 174 seq_puts(m, "\n");
172 level = rnp->level; 175 level = rnp->level;
173 } 176 }
174 seq_printf(m, "%lx/%lx %d:%d ^%d ", 177 phase = gpnum & 0x1;
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
175 rnp->qsmask, rnp->qsmaskinit, 179 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
176 rnp->grplo, rnp->grphi, rnp->grpnum); 184 rnp->grplo, rnp->grphi, rnp->grpnum);
177 } 185 }
178 seq_puts(m, "\n"); 186 seq_puts(m, "\n");
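
The new per-node trace output relies on the "T."[x] idiom: indexing a two-character string literal with the 0-or-1 result of list_empty(), so a non-empty list prints 'T' (or 'E' for the expedited lists) and an empty one prints '.'. A two-line demonstration:

#include <stdio.h>

int main(void)
{
        int empty = 1;  /* what list_empty() returns for an empty list */

        printf("%c\n", "T."[empty]);    /* prints '.', the list is empty */
        printf("%c\n", "T."[!empty]);   /* prints 'T', tasks are queued */
        return 0;
}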
diff --git a/kernel/relay.c b/kernel/relay.c
index 760c26209a3c..c705a41b4ba3 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1198 relay_consume_bytes(rbuf, buf->private); 1198 relay_consume_bytes(rbuf, buf->private);
1199} 1199}
1200 1200
1201static struct pipe_buf_operations relay_pipe_buf_ops = { 1201static const struct pipe_buf_operations relay_pipe_buf_ops = {
1202 .can_merge = 0, 1202 .can_merge = 0,
1203 .map = generic_pipe_buf_map, 1203 .map = generic_pipe_buf_map,
1204 .unmap = generic_pipe_buf_unmap, 1204 .unmap = generic_pipe_buf_unmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..dc15686b7a77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
308 void *alignf_data) 308 void *alignf_data)
309{ 309{
310 struct resource *this = root->child; 310 struct resource *this = root->child;
311 resource_size_t start, end;
311 312
312 new->start = root->start; 313 start = root->start;
313 /* 314 /*
314 * Skip past an allocated resource that starts at 0, since the assignment 315 * Skip past an allocated resource that starts at 0, since the assignment
315 * of this->start - 1 to new->end below would cause an underflow. 316 * of this->start - 1 to new->end below would cause an underflow.
316 */ 317 */
317 if (this && this->start == 0) { 318 if (this && this->start == 0) {
318 new->start = this->end + 1; 319 start = this->end + 1;
319 this = this->sibling; 320 this = this->sibling;
320 } 321 }
321 for(;;) { 322 for(;;) {
322 if (this) 323 if (this)
323 new->end = this->start - 1; 324 end = this->start - 1;
324 else 325 else
325 new->end = root->end; 326 end = root->end;
326 if (new->start < min) 327 if (start < min)
327 new->start = min; 328 start = min;
328 if (new->end > max) 329 if (end > max)
329 new->end = max; 330 end = max;
330 new->start = ALIGN(new->start, align); 331 start = ALIGN(start, align);
331 if (alignf) 332 if (alignf)
332 alignf(alignf_data, new, size, align); 333 alignf(alignf_data, new, size, align);
333 if (new->start < new->end && new->end - new->start >= size - 1) { 334 if (start < end && end - start >= size - 1) {
334 new->end = new->start + size - 1; 335 new->start = start;
336 new->end = start + size - 1;
335 return 0; 337 return 0;
336 } 338 }
337 if (!this) 339 if (!this)
338 break; 340 break;
339 new->start = this->end + 1; 341 start = this->end + 1;
340 this = this->sibling; 342 this = this->sibling;
341 } 343 }
342 return -EBUSY; 344 return -EBUSY;
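
The rewrite above keeps the candidate window in local start/end variables and only writes new->start/new->end once a gap fits, so a failed search no longer clobbers the caller's resource. A simplified userspace analogue (the min/max clamping and the alignf callback are dropped; struct res, find_gap() and the sample ranges are invented for this sketch):

#include <stdio.h>
#include <stddef.h>

#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

struct res {
        unsigned long start, end;       /* inclusive range, as in struct resource */
        struct res *sibling;
};

/* Walk the sorted sibling list, trying each gap between allocated ranges. */
static int find_gap(struct res *child, unsigned long root_start,
                    unsigned long root_end, unsigned long size,
                    unsigned long align, unsigned long *out)
{
        struct res *this = child;
        unsigned long start = root_start, end;

        for (;;) {
                end = this ? this->start - 1 : root_end;
                start = ALIGN(start, align);
                if (start < end && end - start >= size - 1) {
                        *out = start;           /* commit only on success */
                        return 0;
                }
                if (!this)
                        break;
                start = this->end + 1;
                this = this->sibling;
        }
        return -1;
}

int main(void)
{
        struct res b = { 0x3000, 0x3fff, NULL };
        struct res a = { 0x1000, 0x1fff, &b };
        unsigned long where;

        if (!find_gap(&a, 0x1000, 0xffff, 0x800, 0x1000, &where))
                printf("gap found at %#lx\n", where);   /* expect 0x2000 */
        return 0;
}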
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 5fcb4fe645e2..ddabb54bb5c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -37,8 +37,8 @@ do { \
37 if (rt_trace_on) { \ 37 if (rt_trace_on) { \
38 rt_trace_on = 0; \ 38 rt_trace_on = 0; \
39 console_verbose(); \ 39 console_verbose(); \
40 if (spin_is_locked(&current->pi_lock)) \ 40 if (raw_spin_is_locked(&current->pi_lock)) \
41 spin_unlock(&current->pi_lock); \ 41 raw_spin_unlock(&current->pi_lock); \
42 } \ 42 } \
43} while (0) 43} while (0)
44 44
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 29bd4baf9e75..a9604815786a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
138{ 138{
139 unsigned long flags; 139 unsigned long flags;
140 140
141 spin_lock_irqsave(&task->pi_lock, flags); 141 raw_spin_lock_irqsave(&task->pi_lock, flags);
142 __rt_mutex_adjust_prio(task); 142 __rt_mutex_adjust_prio(task);
143 spin_unlock_irqrestore(&task->pi_lock, flags); 143 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
144} 144}
145 145
146/* 146/*
@@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
195 /* 195 /*
196 * Task can not go away as we did a get_task() before ! 196 * Task can not go away as we did a get_task() before !
197 */ 197 */
198 spin_lock_irqsave(&task->pi_lock, flags); 198 raw_spin_lock_irqsave(&task->pi_lock, flags);
199 199
200 waiter = task->pi_blocked_on; 200 waiter = task->pi_blocked_on;
201 /* 201 /*
@@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
231 goto out_unlock_pi; 231 goto out_unlock_pi;
232 232
233 lock = waiter->lock; 233 lock = waiter->lock;
234 if (!spin_trylock(&lock->wait_lock)) { 234 if (!raw_spin_trylock(&lock->wait_lock)) {
235 spin_unlock_irqrestore(&task->pi_lock, flags); 235 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
236 cpu_relax(); 236 cpu_relax();
237 goto retry; 237 goto retry;
238 } 238 }
@@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
240 /* Deadlock detection */ 240 /* Deadlock detection */
241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
243 spin_unlock(&lock->wait_lock); 243 raw_spin_unlock(&lock->wait_lock);
244 ret = deadlock_detect ? -EDEADLK : 0; 244 ret = deadlock_detect ? -EDEADLK : 0;
245 goto out_unlock_pi; 245 goto out_unlock_pi;
246 } 246 }
@@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
253 plist_add(&waiter->list_entry, &lock->wait_list); 253 plist_add(&waiter->list_entry, &lock->wait_list);
254 254
255 /* Release the task */ 255 /* Release the task */
256 spin_unlock_irqrestore(&task->pi_lock, flags); 256 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
257 put_task_struct(task); 257 put_task_struct(task);
258 258
259 /* Grab the next task */ 259 /* Grab the next task */
260 task = rt_mutex_owner(lock); 260 task = rt_mutex_owner(lock);
261 get_task_struct(task); 261 get_task_struct(task);
262 spin_lock_irqsave(&task->pi_lock, flags); 262 raw_spin_lock_irqsave(&task->pi_lock, flags);
263 263
264 if (waiter == rt_mutex_top_waiter(lock)) { 264 if (waiter == rt_mutex_top_waiter(lock)) {
265 /* Boost the owner */ 265 /* Boost the owner */
@@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
277 __rt_mutex_adjust_prio(task); 277 __rt_mutex_adjust_prio(task);
278 } 278 }
279 279
280 spin_unlock_irqrestore(&task->pi_lock, flags); 280 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
281 281
282 top_waiter = rt_mutex_top_waiter(lock); 282 top_waiter = rt_mutex_top_waiter(lock);
283 spin_unlock(&lock->wait_lock); 283 raw_spin_unlock(&lock->wait_lock);
284 284
285 if (!detect_deadlock && waiter != top_waiter) 285 if (!detect_deadlock && waiter != top_waiter)
286 goto out_put_task; 286 goto out_put_task;
@@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
288 goto again; 288 goto again;
289 289
290 out_unlock_pi: 290 out_unlock_pi:
291 spin_unlock_irqrestore(&task->pi_lock, flags); 291 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
292 out_put_task: 292 out_put_task:
293 put_task_struct(task); 293 put_task_struct(task);
294 294
@@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
313 if (pendowner == task) 313 if (pendowner == task)
314 return 1; 314 return 1;
315 315
316 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0; 319 return 0;
320 } 320 }
321 321
@@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
325 * priority. 325 * priority.
326 */ 326 */
327 if (likely(!rt_mutex_has_waiters(lock))) { 327 if (likely(!rt_mutex_has_waiters(lock))) {
328 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1; 329 return 1;
330 } 330 }
331 331
@@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
333 next = rt_mutex_top_waiter(lock); 333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters); 334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner); 335 __rt_mutex_adjust_prio(pendowner);
336 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337 337
338 /* 338 /*
339 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
@@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
350 * might be task: 350 * might be task:
351 */ 351 */
352 if (likely(next->task != task)) { 352 if (likely(next->task != task)) {
353 spin_lock_irqsave(&task->pi_lock, flags); 353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task); 355 __rt_mutex_adjust_prio(task);
356 spin_unlock_irqrestore(&task->pi_lock, flags); 356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 } 357 }
358 return 1; 358 return 1;
359} 359}
@@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
420 unsigned long flags; 420 unsigned long flags;
421 int chain_walk = 0, res; 421 int chain_walk = 0, res;
422 422
423 spin_lock_irqsave(&task->pi_lock, flags); 423 raw_spin_lock_irqsave(&task->pi_lock, flags);
424 __rt_mutex_adjust_prio(task); 424 __rt_mutex_adjust_prio(task);
425 waiter->task = task; 425 waiter->task = task;
426 waiter->lock = lock; 426 waiter->lock = lock;
@@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
434 434
435 task->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
436 436
437 spin_unlock_irqrestore(&task->pi_lock, flags); 437 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 438
439 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
440 spin_lock_irqsave(&owner->pi_lock, flags); 440 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
443 443
444 __rt_mutex_adjust_prio(owner); 444 __rt_mutex_adjust_prio(owner);
445 if (owner->pi_blocked_on) 445 if (owner->pi_blocked_on)
446 chain_walk = 1; 446 chain_walk = 1;
447 spin_unlock_irqrestore(&owner->pi_lock, flags); 447 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
448 } 448 }
449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) 449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
450 chain_walk = 1; 450 chain_walk = 1;
@@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
459 */ 459 */
460 get_task_struct(owner); 460 get_task_struct(owner);
461 461
462 spin_unlock(&lock->wait_lock); 462 raw_spin_unlock(&lock->wait_lock);
463 463
464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
465 task); 465 task);
466 466
467 spin_lock(&lock->wait_lock); 467 raw_spin_lock(&lock->wait_lock);
468 468
469 return res; 469 return res;
470} 470}
@@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
483 struct task_struct *pendowner; 483 struct task_struct *pendowner;
484 unsigned long flags; 484 unsigned long flags;
485 485
486 spin_lock_irqsave(&current->pi_lock, flags); 486 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 487
488 waiter = rt_mutex_top_waiter(lock); 488 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list); 489 plist_del(&waiter->list_entry, &lock->wait_list);
@@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
500 500
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
502 502
503 spin_unlock_irqrestore(&current->pi_lock, flags); 503 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 504
505 /* 505 /*
506 * Clear the pi_blocked_on variable and enqueue a possible 506 * Clear the pi_blocked_on variable and enqueue a possible
@@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
509 * waiter with higher priority than pending-owner->normal_prio 509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner. 510 * is blocked on the unboosted (pending) owner.
511 */ 511 */
512 spin_lock_irqsave(&pendowner->pi_lock, flags); 512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513 513
514 WARN_ON(!pendowner->pi_blocked_on); 514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter); 515 WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
523 next = rt_mutex_top_waiter(lock); 523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters); 524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 } 525 }
526 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527 527
528 wake_up_process(pendowner); 528 wake_up_process(pendowner);
529} 529}
@@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock,
541 unsigned long flags; 541 unsigned long flags;
542 int chain_walk = 0; 542 int chain_walk = 0;
543 543
544 spin_lock_irqsave(&current->pi_lock, flags); 544 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 545 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL; 546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 547 current->pi_blocked_on = NULL;
548 spin_unlock_irqrestore(&current->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 549
550 if (first && owner != current) { 550 if (first && owner != current) {
551 551
552 spin_lock_irqsave(&owner->pi_lock, flags); 552 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 553
554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
555 555
@@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock,
564 if (owner->pi_blocked_on) 564 if (owner->pi_blocked_on)
565 chain_walk = 1; 565 chain_walk = 1;
566 566
567 spin_unlock_irqrestore(&owner->pi_lock, flags); 567 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
568 } 568 }
569 569
570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock,
575 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 575 /* gets dropped in rt_mutex_adjust_prio_chain()! */
576 get_task_struct(owner); 576 get_task_struct(owner);
577 577
578 spin_unlock(&lock->wait_lock); 578 raw_spin_unlock(&lock->wait_lock);
579 579
580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
581 581
582 spin_lock(&lock->wait_lock); 582 raw_spin_lock(&lock->wait_lock);
583} 583}
584 584
585/* 585/*
@@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
592 struct rt_mutex_waiter *waiter; 592 struct rt_mutex_waiter *waiter;
593 unsigned long flags; 593 unsigned long flags;
594 594
595 spin_lock_irqsave(&task->pi_lock, flags); 595 raw_spin_lock_irqsave(&task->pi_lock, flags);
596 596
597 waiter = task->pi_blocked_on; 597 waiter = task->pi_blocked_on;
598 if (!waiter || waiter->list_entry.prio == task->prio) { 598 if (!waiter || waiter->list_entry.prio == task->prio) {
599 spin_unlock_irqrestore(&task->pi_lock, flags); 599 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
600 return; 600 return;
601 } 601 }
602 602
603 spin_unlock_irqrestore(&task->pi_lock, flags); 603 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
604 604
605 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 605 /* gets dropped in rt_mutex_adjust_prio_chain()! */
606 get_task_struct(task); 606 get_task_struct(task);
@@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
672 break; 672 break;
673 } 673 }
674 674
675 spin_unlock(&lock->wait_lock); 675 raw_spin_unlock(&lock->wait_lock);
676 676
677 debug_rt_mutex_print_deadlock(waiter); 677 debug_rt_mutex_print_deadlock(waiter);
678 678
679 if (waiter->task) 679 if (waiter->task)
680 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
681 681
682 spin_lock(&lock->wait_lock); 682 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 683 set_current_state(state);
684 } 684 }
685 685
@@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
700 debug_rt_mutex_init_waiter(&waiter); 700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL; 701 waiter.task = NULL;
702 702
703 spin_lock(&lock->wait_lock); 703 raw_spin_lock(&lock->wait_lock);
704 704
705 /* Try to acquire the lock again: */ 705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock); 707 raw_spin_unlock(&lock->wait_lock);
708 return 0; 708 return 0;
709 } 709 }
710 710
@@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
731 */ 731 */
732 fixup_rt_mutex_waiters(lock); 732 fixup_rt_mutex_waiters(lock);
733 733
734 spin_unlock(&lock->wait_lock); 734 raw_spin_unlock(&lock->wait_lock);
735 735
736 /* Remove pending timer: */ 736 /* Remove pending timer: */
737 if (unlikely(timeout)) 737 if (unlikely(timeout))
@@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
758{ 758{
759 int ret = 0; 759 int ret = 0;
760 760
761 spin_lock(&lock->wait_lock); 761 raw_spin_lock(&lock->wait_lock);
762 762
763 if (likely(rt_mutex_owner(lock) != current)) { 763 if (likely(rt_mutex_owner(lock) != current)) {
764 764
@@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
770 fixup_rt_mutex_waiters(lock); 770 fixup_rt_mutex_waiters(lock);
771 } 771 }
772 772
773 spin_unlock(&lock->wait_lock); 773 raw_spin_unlock(&lock->wait_lock);
774 774
775 return ret; 775 return ret;
776} 776}
@@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
781static void __sched 781static void __sched
782rt_mutex_slowunlock(struct rt_mutex *lock) 782rt_mutex_slowunlock(struct rt_mutex *lock)
783{ 783{
784 spin_lock(&lock->wait_lock); 784 raw_spin_lock(&lock->wait_lock);
785 785
786 debug_rt_mutex_unlock(lock); 786 debug_rt_mutex_unlock(lock);
787 787
@@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
789 789
790 if (!rt_mutex_has_waiters(lock)) { 790 if (!rt_mutex_has_waiters(lock)) {
791 lock->owner = NULL; 791 lock->owner = NULL;
792 spin_unlock(&lock->wait_lock); 792 raw_spin_unlock(&lock->wait_lock);
793 return; 793 return;
794 } 794 }
795 795
796 wakeup_next_waiter(lock); 796 wakeup_next_waiter(lock);
797 797
798 spin_unlock(&lock->wait_lock); 798 raw_spin_unlock(&lock->wait_lock);
799 799
800 /* Undo pi boosting if necessary: */ 800 /* Undo pi boosting if necessary: */
801 rt_mutex_adjust_prio(current); 801 rt_mutex_adjust_prio(current);
@@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
970void __rt_mutex_init(struct rt_mutex *lock, const char *name) 970void __rt_mutex_init(struct rt_mutex *lock, const char *name)
971{ 971{
972 lock->owner = NULL; 972 lock->owner = NULL;
973 spin_lock_init(&lock->wait_lock); 973 raw_spin_lock_init(&lock->wait_lock);
974 plist_head_init(&lock->wait_list, &lock->wait_lock); 974 plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
975 975
976 debug_rt_mutex_init(lock, name); 976 debug_rt_mutex_init(lock, name);
977} 977}
@@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1032{ 1032{
1033 int ret; 1033 int ret;
1034 1034
1035 spin_lock(&lock->wait_lock); 1035 raw_spin_lock(&lock->wait_lock);
1036 1036
1037 mark_rt_mutex_waiters(lock); 1037 mark_rt_mutex_waiters(lock);
1038 1038
@@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1043 spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 1045 return 1;
1046 } 1046 }
@@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 1056 */
1057 ret = 0; 1057 ret = 0;
1058 } 1058 }
1059 spin_unlock(&lock->wait_lock); 1059 raw_spin_unlock(&lock->wait_lock);
1060 1060
1061 debug_rt_mutex_print_deadlock(waiter); 1061 debug_rt_mutex_print_deadlock(waiter);
1062 1062
@@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1106{ 1106{
1107 int ret; 1107 int ret;
1108 1108
1109 spin_lock(&lock->wait_lock); 1109 raw_spin_lock(&lock->wait_lock);
1110 1110
1111 set_current_state(TASK_INTERRUPTIBLE); 1111 set_current_state(TASK_INTERRUPTIBLE);
1112 1112
@@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1124 */ 1124 */
1125 fixup_rt_mutex_waiters(lock); 1125 fixup_rt_mutex_waiters(lock);
1126 1126
1127 spin_unlock(&lock->wait_lock); 1127 raw_spin_unlock(&lock->wait_lock);
1128 1128
1129 /* 1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been 1130 * Readjust priority, when we did not get the lock. We might have been
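The rtmutex.c hunks above are a mechanical conversion of pi_lock and wait_lock from spinlock_t to raw_spinlock_t, so these locks keep busy-wait semantics even where the ordinary spinlock type can become a sleeping lock (groundwork for preempt-rt). A minimal sketch of the conversion pattern, using an illustrative struct that is not part of this commit:

/* Illustrative only: the spinlock_t -> raw_spinlock_t pattern applied
 * throughout the hunks above. Only APIs visible in this diff are used. */
#include <linux/spinlock.h>

struct pi_state_sketch {
	raw_spinlock_t	lock;		/* was: spinlock_t */
	int		boosted;
};

static void pi_state_sketch_init(struct pi_state_sketch *s)
{
	raw_spin_lock_init(&s->lock);	/* was: spin_lock_init() */
	s->boosted = 0;
}

static void pi_state_sketch_set(struct pi_state_sketch *s, int val)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&s->lock, flags);	/* was: spin_lock_irqsave() */
	s->boosted = val;
	raw_spin_unlock_irqrestore(&s->lock, flags);
}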
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c11ae0a948d..18cceeecce35 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -141,7 +141,7 @@ struct rt_prio_array {
141 141
142struct rt_bandwidth { 142struct rt_bandwidth {
143 /* nests inside the rq lock: */ 143 /* nests inside the rq lock: */
144 spinlock_t rt_runtime_lock; 144 raw_spinlock_t rt_runtime_lock;
145 ktime_t rt_period; 145 ktime_t rt_period;
146 u64 rt_runtime; 146 u64 rt_runtime;
147 struct hrtimer rt_period_timer; 147 struct hrtimer rt_period_timer;
@@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
178 rt_b->rt_period = ns_to_ktime(period); 178 rt_b->rt_period = ns_to_ktime(period);
179 rt_b->rt_runtime = runtime; 179 rt_b->rt_runtime = runtime;
180 180
181 spin_lock_init(&rt_b->rt_runtime_lock); 181 raw_spin_lock_init(&rt_b->rt_runtime_lock);
182 182
183 hrtimer_init(&rt_b->rt_period_timer, 183 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 184 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
200 if (hrtimer_active(&rt_b->rt_period_timer)) 200 if (hrtimer_active(&rt_b->rt_period_timer))
201 return; 201 return;
202 202
203 spin_lock(&rt_b->rt_runtime_lock); 203 raw_spin_lock(&rt_b->rt_runtime_lock);
204 for (;;) { 204 for (;;) {
205 unsigned long delta; 205 unsigned long delta;
206 ktime_t soft, hard; 206 ktime_t soft, hard;
@@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
218 HRTIMER_MODE_ABS_PINNED, 0); 218 HRTIMER_MODE_ABS_PINNED, 0);
219 } 219 }
220 spin_unlock(&rt_b->rt_runtime_lock); 220 raw_spin_unlock(&rt_b->rt_runtime_lock);
221} 221}
222 222
223#ifdef CONFIG_RT_GROUP_SCHED 223#ifdef CONFIG_RT_GROUP_SCHED
@@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
298 298
299#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -470,7 +470,7 @@ struct rt_rq {
470 u64 rt_time; 470 u64 rt_time;
471 u64 rt_runtime; 471 u64 rt_runtime;
472 /* Nests inside the rq lock: */ 472 /* Nests inside the rq lock: */
473 spinlock_t rt_runtime_lock; 473 raw_spinlock_t rt_runtime_lock;
474 474
475#ifdef CONFIG_RT_GROUP_SCHED 475#ifdef CONFIG_RT_GROUP_SCHED
476 unsigned long rt_nr_boosted; 476 unsigned long rt_nr_boosted;
@@ -525,7 +525,7 @@ static struct root_domain def_root_domain;
525 */ 525 */
526struct rq { 526struct rq {
527 /* runqueue lock: */ 527 /* runqueue lock: */
528 spinlock_t lock; 528 raw_spinlock_t lock;
529 529
530 /* 530 /*
531 * nr_running and cpu_load should be in the same cacheline because 531 * nr_running and cpu_load should be in the same cacheline because
@@ -535,14 +535,12 @@ struct rq {
535 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
536 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
537#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
538 unsigned long last_tick_seen;
539 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
540#endif 539#endif
541 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
542 struct load_weight load; 541 struct load_weight load;
543 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
544 u64 nr_switches; 543 u64 nr_switches;
545 u64 nr_migrations_in;
546 544
547 struct cfs_rq cfs; 545 struct cfs_rq cfs;
548 struct rt_rq rt; 546 struct rt_rq rt;
@@ -591,6 +589,8 @@ struct rq {
591 589
592 u64 rt_avg; 590 u64 rt_avg;
593 u64 age_stamp; 591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
594#endif 594#endif
595 595
596 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -685,7 +685,7 @@ inline void update_rq_clock(struct rq *rq)
685 */ 685 */
686int runqueue_is_locked(int cpu) 686int runqueue_is_locked(int cpu)
687{ 687{
688 return spin_is_locked(&cpu_rq(cpu)->lock); 688 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
689} 689}
690 690
691/* 691/*
@@ -772,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
772 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
773 return -EINVAL; 773 return -EINVAL;
774 774
775 filp->f_pos += cnt; 775 *ppos += cnt;
776 776
777 return cnt; 777 return cnt;
778} 778}
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
814 * default: 0.25ms 814 * default: 0.25ms
815 */ 815 */
816unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
817 818
818/* 819/*
819 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -892,7 +893,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
892 */ 893 */
893 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 894 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
894 895
895 spin_unlock_irq(&rq->lock); 896 raw_spin_unlock_irq(&rq->lock);
896} 897}
897 898
898#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 899#else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -916,9 +917,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
916 next->oncpu = 1; 917 next->oncpu = 1;
917#endif 918#endif
918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 919#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
919 spin_unlock_irq(&rq->lock); 920 raw_spin_unlock_irq(&rq->lock);
920#else 921#else
921 spin_unlock(&rq->lock); 922 raw_spin_unlock(&rq->lock);
922#endif 923#endif
923} 924}
924 925
@@ -948,10 +949,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
948{ 949{
949 for (;;) { 950 for (;;) {
950 struct rq *rq = task_rq(p); 951 struct rq *rq = task_rq(p);
951 spin_lock(&rq->lock); 952 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 953 if (likely(rq == task_rq(p)))
953 return rq; 954 return rq;
954 spin_unlock(&rq->lock); 955 raw_spin_unlock(&rq->lock);
955 } 956 }
956} 957}
957 958
@@ -968,10 +969,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
968 for (;;) { 969 for (;;) {
969 local_irq_save(*flags); 970 local_irq_save(*flags);
970 rq = task_rq(p); 971 rq = task_rq(p);
971 spin_lock(&rq->lock); 972 raw_spin_lock(&rq->lock);
972 if (likely(rq == task_rq(p))) 973 if (likely(rq == task_rq(p)))
973 return rq; 974 return rq;
974 spin_unlock_irqrestore(&rq->lock, *flags); 975 raw_spin_unlock_irqrestore(&rq->lock, *flags);
975 } 976 }
976} 977}
977 978
@@ -980,19 +981,19 @@ void task_rq_unlock_wait(struct task_struct *p)
980 struct rq *rq = task_rq(p); 981 struct rq *rq = task_rq(p);
981 982
982 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 983 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
983 spin_unlock_wait(&rq->lock); 984 raw_spin_unlock_wait(&rq->lock);
984} 985}
985 986
986static void __task_rq_unlock(struct rq *rq) 987static void __task_rq_unlock(struct rq *rq)
987 __releases(rq->lock) 988 __releases(rq->lock)
988{ 989{
989 spin_unlock(&rq->lock); 990 raw_spin_unlock(&rq->lock);
990} 991}
991 992
992static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 993static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
993 __releases(rq->lock) 994 __releases(rq->lock)
994{ 995{
995 spin_unlock_irqrestore(&rq->lock, *flags); 996 raw_spin_unlock_irqrestore(&rq->lock, *flags);
996} 997}
997 998
998/* 999/*
@@ -1005,7 +1006,7 @@ static struct rq *this_rq_lock(void)
1005 1006
1006 local_irq_disable(); 1007 local_irq_disable();
1007 rq = this_rq(); 1008 rq = this_rq();
1008 spin_lock(&rq->lock); 1009 raw_spin_lock(&rq->lock);
1009 1010
1010 return rq; 1011 return rq;
1011} 1012}
@@ -1052,10 +1053,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1052 1053
1053 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1054 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1054 1055
1055 spin_lock(&rq->lock); 1056 raw_spin_lock(&rq->lock);
1056 update_rq_clock(rq); 1057 update_rq_clock(rq);
1057 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1058 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1058 spin_unlock(&rq->lock); 1059 raw_spin_unlock(&rq->lock);
1059 1060
1060 return HRTIMER_NORESTART; 1061 return HRTIMER_NORESTART;
1061} 1062}
@@ -1068,10 +1069,10 @@ static void __hrtick_start(void *arg)
1068{ 1069{
1069 struct rq *rq = arg; 1070 struct rq *rq = arg;
1070 1071
1071 spin_lock(&rq->lock); 1072 raw_spin_lock(&rq->lock);
1072 hrtimer_restart(&rq->hrtick_timer); 1073 hrtimer_restart(&rq->hrtick_timer);
1073 rq->hrtick_csd_pending = 0; 1074 rq->hrtick_csd_pending = 0;
1074 spin_unlock(&rq->lock); 1075 raw_spin_unlock(&rq->lock);
1075} 1076}
1076 1077
1077/* 1078/*
@@ -1178,7 +1179,7 @@ static void resched_task(struct task_struct *p)
1178{ 1179{
1179 int cpu; 1180 int cpu;
1180 1181
1181 assert_spin_locked(&task_rq(p)->lock); 1182 assert_raw_spin_locked(&task_rq(p)->lock);
1182 1183
1183 if (test_tsk_need_resched(p)) 1184 if (test_tsk_need_resched(p))
1184 return; 1185 return;
@@ -1200,10 +1201,10 @@ static void resched_cpu(int cpu)
1200 struct rq *rq = cpu_rq(cpu); 1201 struct rq *rq = cpu_rq(cpu);
1201 unsigned long flags; 1202 unsigned long flags;
1202 1203
1203 if (!spin_trylock_irqsave(&rq->lock, flags)) 1204 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1204 return; 1205 return;
1205 resched_task(cpu_curr(cpu)); 1206 resched_task(cpu_curr(cpu));
1206 spin_unlock_irqrestore(&rq->lock, flags); 1207 raw_spin_unlock_irqrestore(&rq->lock, flags);
1207} 1208}
1208 1209
1209#ifdef CONFIG_NO_HZ 1210#ifdef CONFIG_NO_HZ
@@ -1272,7 +1273,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1272#else /* !CONFIG_SMP */ 1273#else /* !CONFIG_SMP */
1273static void resched_task(struct task_struct *p) 1274static void resched_task(struct task_struct *p)
1274{ 1275{
1275 assert_spin_locked(&task_rq(p)->lock); 1276 assert_raw_spin_locked(&task_rq(p)->lock);
1276 set_tsk_need_resched(p); 1277 set_tsk_need_resched(p);
1277} 1278}
1278 1279
@@ -1599,11 +1600,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1599 struct rq *rq = cpu_rq(cpu); 1600 struct rq *rq = cpu_rq(cpu);
1600 unsigned long flags; 1601 unsigned long flags;
1601 1602
1602 spin_lock_irqsave(&rq->lock, flags); 1603 raw_spin_lock_irqsave(&rq->lock, flags);
1603 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1604 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1604 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1605 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1605 __set_se_shares(tg->se[cpu], shares); 1606 __set_se_shares(tg->se[cpu], shares);
1606 spin_unlock_irqrestore(&rq->lock, flags); 1607 raw_spin_unlock_irqrestore(&rq->lock, flags);
1607 } 1608 }
1608} 1609}
1609 1610
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1614 */ 1615 */
1615static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1616{ 1617{
1617 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1618 unsigned long *usd_rq_weight; 1619 unsigned long *usd_rq_weight;
1619 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1620 unsigned long flags; 1621 unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1630 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1631 usd_rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1632 1633
1634 rq_weight += weight;
1633 /* 1635 /*
1634 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1635 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1638 if (!weight) 1640 if (!weight)
1639 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1640 1642
1641 rq_weight += weight; 1643 sum_weight += weight;
1642 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1643 } 1645 }
1644 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1645 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1646 shares = tg->shares; 1651 shares = tg->shares;
1647 1652
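The tg_shares_up() change above keeps two sums: rq_weight accumulates the real per-cpu loads, while sum_weight accumulates the NICE_0_LOAD-padded values that were previously used directly; the padded sum is only used as a fallback when every cpu in the domain is idle. A standalone sketch of that accumulation, with weights modelled as plain u64 and a hypothetical two-CPU domain:

/* Simplified model of the rq_weight/sum_weight split added above. */
#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD 1024ULL

static uint64_t domain_rq_weight(const uint64_t *load, int cpus)
{
	uint64_t rq_weight = 0, sum_weight = 0;
	int i;

	for (i = 0; i < cpus; i++) {
		uint64_t weight = load[i];

		rq_weight += weight;		/* true load */
		if (!weight)
			weight = NICE_0_LOAD;	/* pretend one average task */
		sum_weight += weight;		/* padded load */
	}

	if (!rq_weight)				/* whole domain idle */
		rq_weight = sum_weight;

	return rq_weight;
}

int main(void)
{
	uint64_t busy[2] = { 2048, 0 }, idle[2] = { 0, 0 };

	printf("busy domain weight: %llu\n",
	       (unsigned long long)domain_rq_weight(busy, 2));
	printf("idle domain weight: %llu\n",
	       (unsigned long long)domain_rq_weight(idle, 2));
	return 0;
}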
@@ -1701,9 +1706,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1701 if (root_task_group_empty()) 1706 if (root_task_group_empty())
1702 return; 1707 return;
1703 1708
1704 spin_unlock(&rq->lock); 1709 raw_spin_unlock(&rq->lock);
1705 update_shares(sd); 1710 update_shares(sd);
1706 spin_lock(&rq->lock); 1711 raw_spin_lock(&rq->lock);
1707} 1712}
1708 1713
1709static void update_h_load(long cpu) 1714static void update_h_load(long cpu)
@@ -1743,7 +1748,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1743 __acquires(busiest->lock) 1748 __acquires(busiest->lock)
1744 __acquires(this_rq->lock) 1749 __acquires(this_rq->lock)
1745{ 1750{
1746 spin_unlock(&this_rq->lock); 1751 raw_spin_unlock(&this_rq->lock);
1747 double_rq_lock(this_rq, busiest); 1752 double_rq_lock(this_rq, busiest);
1748 1753
1749 return 1; 1754 return 1;
@@ -1764,14 +1769,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1764{ 1769{
1765 int ret = 0; 1770 int ret = 0;
1766 1771
1767 if (unlikely(!spin_trylock(&busiest->lock))) { 1772 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1768 if (busiest < this_rq) { 1773 if (busiest < this_rq) {
1769 spin_unlock(&this_rq->lock); 1774 raw_spin_unlock(&this_rq->lock);
1770 spin_lock(&busiest->lock); 1775 raw_spin_lock(&busiest->lock);
1771 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1776 raw_spin_lock_nested(&this_rq->lock,
1777 SINGLE_DEPTH_NESTING);
1772 ret = 1; 1778 ret = 1;
1773 } else 1779 } else
1774 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1780 raw_spin_lock_nested(&busiest->lock,
1781 SINGLE_DEPTH_NESTING);
1775 } 1782 }
1776 return ret; 1783 return ret;
1777} 1784}
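_double_lock_balance() keeps its ABBA-avoidance rule across the raw_spinlock conversion: if the trylock on the second runqueue lock fails, the lock with the lower address is always taken first, and the re-acquired lock carries the SINGLE_DEPTH_NESTING annotation for lockdep. A simplified sketch of that ordering rule (not the sched.c function itself; the caller is assumed to already hold *held):

/* Address-ordered double locking, as used in the hunk above. */
#include <linux/spinlock.h>
#include <linux/lockdep.h>

static int double_lock_sketch(raw_spinlock_t *held, raw_spinlock_t *other)
{
	int released_held = 0;

	if (!raw_spin_trylock(other)) {
		if (other < held) {
			/* wrong order: drop, then retake lowest-address first */
			raw_spin_unlock(held);
			raw_spin_lock(other);
			raw_spin_lock_nested(held, SINGLE_DEPTH_NESTING);
			released_held = 1;	/* caller may need to revalidate */
		} else {
			raw_spin_lock_nested(other, SINGLE_DEPTH_NESTING);
		}
	}
	return released_held;
}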
@@ -1785,7 +1792,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1785{ 1792{
1786 if (unlikely(!irqs_disabled())) { 1793 if (unlikely(!irqs_disabled())) {
1787 /* printk() doesn't work good under rq->lock */ 1794 /* printk() doesn't work good under rq->lock */
1788 spin_unlock(&this_rq->lock); 1795 raw_spin_unlock(&this_rq->lock);
1789 BUG_ON(1); 1796 BUG_ON(1);
1790 } 1797 }
1791 1798
@@ -1795,7 +1802,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1795static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1802static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1796 __releases(busiest->lock) 1803 __releases(busiest->lock)
1797{ 1804{
1798 spin_unlock(&busiest->lock); 1805 raw_spin_unlock(&busiest->lock);
1799 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1800} 1807}
1801#endif 1808#endif
@@ -1810,6 +1817,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1810#endif 1817#endif
1811 1818
1812static void calc_load_account_active(struct rq *this_rq); 1819static void calc_load_account_active(struct rq *this_rq);
1820static void update_sysctl(void);
1821static int get_update_sysctl_factor(void);
1822
1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1824{
1825 set_task_rq(p, cpu);
1826#ifdef CONFIG_SMP
1827 /*
1828 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1829 * successfuly executed on another CPU. We must ensure that updates of
1830 * per-task data have been completed by this moment.
1831 */
1832 smp_wmb();
1833 task_thread_info(p)->cpu = cpu;
1834#endif
1835}
1813 1836
1814#include "sched_stats.h" 1837#include "sched_stats.h"
1815#include "sched_idletask.c" 1838#include "sched_idletask.c"
@@ -1967,20 +1990,6 @@ inline int task_curr(const struct task_struct *p)
1967 return cpu_curr(task_cpu(p)) == p; 1990 return cpu_curr(task_cpu(p)) == p;
1968} 1991}
1969 1992
1970static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1971{
1972 set_task_rq(p, cpu);
1973#ifdef CONFIG_SMP
1974 /*
1975 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1976 * successfuly executed on another CPU. We must ensure that updates of
1977 * per-task data have been completed by this moment.
1978 */
1979 smp_wmb();
1980 task_thread_info(p)->cpu = cpu;
1981#endif
1982}
1983
1984static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1993static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1985 const struct sched_class *prev_class, 1994 const struct sched_class *prev_class,
1986 int oldprio, int running) 1995 int oldprio, int running)
@@ -2016,12 +2025,13 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
2016 return; 2025 return;
2017 } 2026 }
2018 2027
2019 spin_lock_irqsave(&rq->lock, flags); 2028 raw_spin_lock_irqsave(&rq->lock, flags);
2029 update_rq_clock(rq);
2020 set_task_cpu(p, cpu); 2030 set_task_cpu(p, cpu);
2021 p->cpus_allowed = cpumask_of_cpu(cpu); 2031 p->cpus_allowed = cpumask_of_cpu(cpu);
2022 p->rt.nr_cpus_allowed = 1; 2032 p->rt.nr_cpus_allowed = 1;
2023 p->flags |= PF_THREAD_BOUND; 2033 p->flags |= PF_THREAD_BOUND;
2024 spin_unlock_irqrestore(&rq->lock, flags); 2034 raw_spin_unlock_irqrestore(&rq->lock, flags);
2025} 2035}
2026EXPORT_SYMBOL(kthread_bind); 2036EXPORT_SYMBOL(kthread_bind);
2027 2037
@@ -2059,30 +2069,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2059void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2069void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2060{ 2070{
2061 int old_cpu = task_cpu(p); 2071 int old_cpu = task_cpu(p);
2062 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2063 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2072 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2064 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2073 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2065 u64 clock_offset;
2066
2067 clock_offset = old_rq->clock - new_rq->clock;
2068 2074
2069 trace_sched_migrate_task(p, new_cpu); 2075 trace_sched_migrate_task(p, new_cpu);
2070 2076
2071#ifdef CONFIG_SCHEDSTATS
2072 if (p->se.wait_start)
2073 p->se.wait_start -= clock_offset;
2074 if (p->se.sleep_start)
2075 p->se.sleep_start -= clock_offset;
2076 if (p->se.block_start)
2077 p->se.block_start -= clock_offset;
2078#endif
2079 if (old_cpu != new_cpu) { 2077 if (old_cpu != new_cpu) {
2080 p->se.nr_migrations++; 2078 p->se.nr_migrations++;
2081 new_rq->nr_migrations_in++;
2082#ifdef CONFIG_SCHEDSTATS
2083 if (task_hot(p, old_rq->clock, NULL))
2084 schedstat_inc(p, se.nr_forced2_migrations);
2085#endif
2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2079 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2087 1, 1, NULL, 0); 2080 1, 1, NULL, 0);
2088 } 2081 }
@@ -2115,6 +2108,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2115 * it is sufficient to simply update the task's cpu field. 2108 * it is sufficient to simply update the task's cpu field.
2116 */ 2109 */
2117 if (!p->se.on_rq && !task_running(rq, p)) { 2110 if (!p->se.on_rq && !task_running(rq, p)) {
2111 update_rq_clock(rq);
2118 set_task_cpu(p, dest_cpu); 2112 set_task_cpu(p, dest_cpu);
2119 return 0; 2113 return 0;
2120 } 2114 }
@@ -2322,6 +2316,14 @@ void task_oncpu_function_call(struct task_struct *p,
2322 preempt_enable(); 2316 preempt_enable();
2323} 2317}
2324 2318
2319#ifdef CONFIG_SMP
2320static inline
2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2322{
2323 return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2324}
2325#endif
2326
2325/*** 2327/***
2326 * try_to_wake_up - wake up a thread 2328 * try_to_wake_up - wake up a thread
2327 * @p: the to-be-woken-up thread 2329 * @p: the to-be-woken-up thread
@@ -2373,16 +2375,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2373 if (task_contributes_to_load(p)) 2375 if (task_contributes_to_load(p))
2374 rq->nr_uninterruptible--; 2376 rq->nr_uninterruptible--;
2375 p->state = TASK_WAKING; 2377 p->state = TASK_WAKING;
2376 task_rq_unlock(rq, &flags); 2378 __task_rq_unlock(rq);
2377 2379
2378 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2380 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2379 if (cpu != orig_cpu) 2381 if (cpu != orig_cpu)
2380 set_task_cpu(p, cpu); 2382 set_task_cpu(p, cpu);
2381 2383
2382 rq = task_rq_lock(p, &flags); 2384 rq = __task_rq_lock(p);
2383 2385 update_rq_clock(rq);
2384 if (rq != orig_rq)
2385 update_rq_clock(rq);
2386 2386
2387 WARN_ON(p->state != TASK_WAKING); 2387 WARN_ON(p->state != TASK_WAKING);
2388 cpu = task_cpu(p); 2388 cpu = task_cpu(p);
@@ -2440,6 +2440,17 @@ out_running:
2440#ifdef CONFIG_SMP 2440#ifdef CONFIG_SMP
2441 if (p->sched_class->task_wake_up) 2441 if (p->sched_class->task_wake_up)
2442 p->sched_class->task_wake_up(rq, p); 2442 p->sched_class->task_wake_up(rq, p);
2443
2444 if (unlikely(rq->idle_stamp)) {
2445 u64 delta = rq->clock - rq->idle_stamp;
2446 u64 max = 2*sysctl_sched_migration_cost;
2447
2448 if (delta > max)
2449 rq->avg_idle = max;
2450 else
2451 update_avg(&rq->avg_idle, delta);
2452 rq->idle_stamp = 0;
2453 }
2443#endif 2454#endif
2444out: 2455out:
2445 task_rq_unlock(rq, &flags); 2456 task_rq_unlock(rq, &flags);
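The idle_stamp/avg_idle additions work as a pair: idle_balance() records the clock when a CPU starts going idle, and the next wakeup on that CPU turns the elapsed time into a sample, clamped to twice sysctl_sched_migration_cost and folded into a running average; idle_balance() then skips newidle balancing entirely when avg_idle is below the migration cost. A standalone sketch, assuming update_avg() is the usual 1/8 weighted-average helper from sched.c:

/* Standalone sketch of the avg_idle bookkeeping added above. */
#include <stdint.h>
#include <stdio.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff / 8;	/* kernel uses diff >> 3; same intent */
}

/* called from the wakeup path when the CPU had been idle */
static void note_wakeup(uint64_t *avg_idle, uint64_t *idle_stamp,
			uint64_t now, uint64_t migration_cost)
{
	if (*idle_stamp) {
		uint64_t delta = now - *idle_stamp;
		uint64_t max = 2 * migration_cost;

		if (delta > max)
			*avg_idle = max;	/* long idle: saturate */
		else
			update_avg(avg_idle, delta);
		*idle_stamp = 0;
	}
}

int main(void)
{
	uint64_t avg_idle = 2 * 500000, idle_stamp = 1000000;	/* ns */

	note_wakeup(&avg_idle, &idle_stamp, 1200000, 500000);
	printf("avg_idle after a 200us idle period: %llu ns\n",
	       (unsigned long long)avg_idle);
	return 0;
}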
@@ -2486,7 +2497,6 @@ static void __sched_fork(struct task_struct *p)
2486 p->se.avg_overlap = 0; 2497 p->se.avg_overlap = 0;
2487 p->se.start_runtime = 0; 2498 p->se.start_runtime = 0;
2488 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2499 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2489 p->se.avg_running = 0;
2490 2500
2491#ifdef CONFIG_SCHEDSTATS 2501#ifdef CONFIG_SCHEDSTATS
2492 p->se.wait_start = 0; 2502 p->se.wait_start = 0;
@@ -2508,7 +2518,6 @@ static void __sched_fork(struct task_struct *p)
2508 p->se.nr_failed_migrations_running = 0; 2518 p->se.nr_failed_migrations_running = 0;
2509 p->se.nr_failed_migrations_hot = 0; 2519 p->se.nr_failed_migrations_hot = 0;
2510 p->se.nr_forced_migrations = 0; 2520 p->se.nr_forced_migrations = 0;
2511 p->se.nr_forced2_migrations = 0;
2512 2521
2513 p->se.nr_wakeups = 0; 2522 p->se.nr_wakeups = 0;
2514 p->se.nr_wakeups_sync = 0; 2523 p->se.nr_wakeups_sync = 0;
@@ -2578,8 +2587,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2578 if (!rt_prio(p->prio)) 2587 if (!rt_prio(p->prio))
2579 p->sched_class = &fair_sched_class; 2588 p->sched_class = &fair_sched_class;
2580 2589
2590 if (p->sched_class->task_fork)
2591 p->sched_class->task_fork(p);
2592
2581#ifdef CONFIG_SMP 2593#ifdef CONFIG_SMP
2582 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2594 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2583#endif 2595#endif
2584 set_task_cpu(p, cpu); 2596 set_task_cpu(p, cpu);
2585 2597
@@ -2614,17 +2626,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2614 rq = task_rq_lock(p, &flags); 2626 rq = task_rq_lock(p, &flags);
2615 BUG_ON(p->state != TASK_RUNNING); 2627 BUG_ON(p->state != TASK_RUNNING);
2616 update_rq_clock(rq); 2628 update_rq_clock(rq);
2617 2629 activate_task(rq, p, 0);
2618 if (!p->sched_class->task_new || !current->se.on_rq) {
2619 activate_task(rq, p, 0);
2620 } else {
2621 /*
2622 * Let the scheduling class do new task startup
2623 * management (if any):
2624 */
2625 p->sched_class->task_new(rq, p);
2626 inc_nr_running(rq);
2627 }
2628 trace_sched_wakeup_new(rq, p, 1); 2630 trace_sched_wakeup_new(rq, p, 1);
2629 check_preempt_curr(rq, p, WF_FORK); 2631 check_preempt_curr(rq, p, WF_FORK);
2630#ifdef CONFIG_SMP 2632#ifdef CONFIG_SMP
@@ -2781,10 +2783,10 @@ static inline void post_schedule(struct rq *rq)
2781 if (rq->post_schedule) { 2783 if (rq->post_schedule) {
2782 unsigned long flags; 2784 unsigned long flags;
2783 2785
2784 spin_lock_irqsave(&rq->lock, flags); 2786 raw_spin_lock_irqsave(&rq->lock, flags);
2785 if (rq->curr->sched_class->post_schedule) 2787 if (rq->curr->sched_class->post_schedule)
2786 rq->curr->sched_class->post_schedule(rq); 2788 rq->curr->sched_class->post_schedule(rq);
2787 spin_unlock_irqrestore(&rq->lock, flags); 2789 raw_spin_unlock_irqrestore(&rq->lock, flags);
2788 2790
2789 rq->post_schedule = 0; 2791 rq->post_schedule = 0;
2790 } 2792 }
@@ -2848,14 +2850,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2848 */ 2850 */
2849 arch_start_context_switch(prev); 2851 arch_start_context_switch(prev);
2850 2852
2851 if (unlikely(!mm)) { 2853 if (likely(!mm)) {
2852 next->active_mm = oldmm; 2854 next->active_mm = oldmm;
2853 atomic_inc(&oldmm->mm_count); 2855 atomic_inc(&oldmm->mm_count);
2854 enter_lazy_tlb(oldmm, next); 2856 enter_lazy_tlb(oldmm, next);
2855 } else 2857 } else
2856 switch_mm(oldmm, mm, next); 2858 switch_mm(oldmm, mm, next);
2857 2859
2858 if (unlikely(!prev->mm)) { 2860 if (likely(!prev->mm)) {
2859 prev->active_mm = NULL; 2861 prev->active_mm = NULL;
2860 rq->prev_mm = oldmm; 2862 rq->prev_mm = oldmm;
2861 } 2863 }
@@ -3018,15 +3020,6 @@ static void calc_load_account_active(struct rq *this_rq)
3018} 3020}
3019 3021
3020/* 3022/*
3021 * Externally visible per-cpu scheduler statistics:
3022 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3023 */
3024u64 cpu_nr_migrations(int cpu)
3025{
3026 return cpu_rq(cpu)->nr_migrations_in;
3027}
3028
3029/*
3030 * Update rq->cpu_load[] statistics. This function is usually called every 3023 * Update rq->cpu_load[] statistics. This function is usually called every
3031 * scheduler tick (TICK_NSEC). 3024 * scheduler tick (TICK_NSEC).
3032 */ 3025 */
@@ -3075,15 +3068,15 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3075{ 3068{
3076 BUG_ON(!irqs_disabled()); 3069 BUG_ON(!irqs_disabled());
3077 if (rq1 == rq2) { 3070 if (rq1 == rq2) {
3078 spin_lock(&rq1->lock); 3071 raw_spin_lock(&rq1->lock);
3079 __acquire(rq2->lock); /* Fake it out ;) */ 3072 __acquire(rq2->lock); /* Fake it out ;) */
3080 } else { 3073 } else {
3081 if (rq1 < rq2) { 3074 if (rq1 < rq2) {
3082 spin_lock(&rq1->lock); 3075 raw_spin_lock(&rq1->lock);
3083 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 3076 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3084 } else { 3077 } else {
3085 spin_lock(&rq2->lock); 3078 raw_spin_lock(&rq2->lock);
3086 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 3079 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3087 } 3080 }
3088 } 3081 }
3089 update_rq_clock(rq1); 3082 update_rq_clock(rq1);
@@ -3100,9 +3093,9 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3100 __releases(rq1->lock) 3093 __releases(rq1->lock)
3101 __releases(rq2->lock) 3094 __releases(rq2->lock)
3102{ 3095{
3103 spin_unlock(&rq1->lock); 3096 raw_spin_unlock(&rq1->lock);
3104 if (rq1 != rq2) 3097 if (rq1 != rq2)
3105 spin_unlock(&rq2->lock); 3098 raw_spin_unlock(&rq2->lock);
3106 else 3099 else
3107 __release(rq2->lock); 3100 __release(rq2->lock);
3108} 3101}
@@ -3148,7 +3141,7 @@ out:
3148void sched_exec(void) 3141void sched_exec(void)
3149{ 3142{
3150 int new_cpu, this_cpu = get_cpu(); 3143 int new_cpu, this_cpu = get_cpu();
3151 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); 3144 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3152 put_cpu(); 3145 put_cpu();
3153 if (new_cpu != this_cpu) 3146 if (new_cpu != this_cpu)
3154 sched_migrate_task(current, new_cpu); 3147 sched_migrate_task(current, new_cpu);
@@ -3164,10 +3157,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3164 deactivate_task(src_rq, p, 0); 3157 deactivate_task(src_rq, p, 0);
3165 set_task_cpu(p, this_cpu); 3158 set_task_cpu(p, this_cpu);
3166 activate_task(this_rq, p, 0); 3159 activate_task(this_rq, p, 0);
3167 /*
3168 * Note that idle threads have a prio of MAX_PRIO, for this test
3169 * to be always true for them.
3170 */
3171 check_preempt_curr(this_rq, p, 0); 3160 check_preempt_curr(this_rq, p, 0);
3172} 3161}
3173 3162
@@ -4126,7 +4115,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4126 unsigned long flags; 4115 unsigned long flags;
4127 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4116 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4128 4117
4129 cpumask_setall(cpus); 4118 cpumask_copy(cpus, cpu_active_mask);
4130 4119
4131 /* 4120 /*
4132 * When power savings policy is enabled for the parent domain, idle 4121 * When power savings policy is enabled for the parent domain, idle
@@ -4199,14 +4188,15 @@ redo:
4199 4188
4200 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 4189 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4201 4190
4202 spin_lock_irqsave(&busiest->lock, flags); 4191 raw_spin_lock_irqsave(&busiest->lock, flags);
4203 4192
4204 /* don't kick the migration_thread, if the curr 4193 /* don't kick the migration_thread, if the curr
4205 * task on busiest cpu can't be moved to this_cpu 4194 * task on busiest cpu can't be moved to this_cpu
4206 */ 4195 */
4207 if (!cpumask_test_cpu(this_cpu, 4196 if (!cpumask_test_cpu(this_cpu,
4208 &busiest->curr->cpus_allowed)) { 4197 &busiest->curr->cpus_allowed)) {
4209 spin_unlock_irqrestore(&busiest->lock, flags); 4198 raw_spin_unlock_irqrestore(&busiest->lock,
4199 flags);
4210 all_pinned = 1; 4200 all_pinned = 1;
4211 goto out_one_pinned; 4201 goto out_one_pinned;
4212 } 4202 }
@@ -4216,7 +4206,7 @@ redo:
4216 busiest->push_cpu = this_cpu; 4206 busiest->push_cpu = this_cpu;
4217 active_balance = 1; 4207 active_balance = 1;
4218 } 4208 }
4219 spin_unlock_irqrestore(&busiest->lock, flags); 4209 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4220 if (active_balance) 4210 if (active_balance)
4221 wake_up_process(busiest->migration_thread); 4211 wake_up_process(busiest->migration_thread);
4222 4212
@@ -4289,7 +4279,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4289 int all_pinned = 0; 4279 int all_pinned = 0;
4290 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4280 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4291 4281
4292 cpumask_setall(cpus); 4282 cpumask_copy(cpus, cpu_active_mask);
4293 4283
4294 /* 4284 /*
4295 * When power savings policy is enabled for the parent domain, idle 4285 * When power savings policy is enabled for the parent domain, idle
@@ -4398,10 +4388,10 @@ redo:
4398 /* 4388 /*
4399 * Should not call ttwu while holding a rq->lock 4389 * Should not call ttwu while holding a rq->lock
4400 */ 4390 */
4401 spin_unlock(&this_rq->lock); 4391 raw_spin_unlock(&this_rq->lock);
4402 if (active_balance) 4392 if (active_balance)
4403 wake_up_process(busiest->migration_thread); 4393 wake_up_process(busiest->migration_thread);
4404 spin_lock(&this_rq->lock); 4394 raw_spin_lock(&this_rq->lock);
4405 4395
4406 } else 4396 } else
4407 sd->nr_balance_failed = 0; 4397 sd->nr_balance_failed = 0;
@@ -4429,6 +4419,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4429 int pulled_task = 0; 4419 int pulled_task = 0;
4430 unsigned long next_balance = jiffies + HZ; 4420 unsigned long next_balance = jiffies + HZ;
4431 4421
4422 this_rq->idle_stamp = this_rq->clock;
4423
4424 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4425 return;
4426
4432 for_each_domain(this_cpu, sd) { 4427 for_each_domain(this_cpu, sd) {
4433 unsigned long interval; 4428 unsigned long interval;
4434 4429
@@ -4443,8 +4438,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4443 interval = msecs_to_jiffies(sd->balance_interval); 4438 interval = msecs_to_jiffies(sd->balance_interval);
4444 if (time_after(next_balance, sd->last_balance + interval)) 4439 if (time_after(next_balance, sd->last_balance + interval))
4445 next_balance = sd->last_balance + interval; 4440 next_balance = sd->last_balance + interval;
4446 if (pulled_task) 4441 if (pulled_task) {
4442 this_rq->idle_stamp = 0;
4447 break; 4443 break;
4444 }
4448 } 4445 }
4449 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4446 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4450 /* 4447 /*
@@ -4679,7 +4676,7 @@ int select_nohz_load_balancer(int stop_tick)
4679 cpumask_set_cpu(cpu, nohz.cpu_mask); 4676 cpumask_set_cpu(cpu, nohz.cpu_mask);
4680 4677
4681 /* time for ilb owner also to sleep */ 4678 /* time for ilb owner also to sleep */
4682 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4679 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4683 if (atomic_read(&nohz.load_balancer) == cpu) 4680 if (atomic_read(&nohz.load_balancer) == cpu)
4684 atomic_set(&nohz.load_balancer, -1); 4681 atomic_set(&nohz.load_balancer, -1);
4685 return 0; 4682 return 0;
@@ -5046,8 +5043,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5046 p->gtime = cputime_add(p->gtime, cputime); 5043 p->gtime = cputime_add(p->gtime, cputime);
5047 5044
5048 /* Add guest time to cpustat. */ 5045 /* Add guest time to cpustat. */
5049 cpustat->user = cputime64_add(cpustat->user, tmp); 5046 if (TASK_NICE(p) > 0) {
5050 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5047 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5048 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5049 } else {
5050 cpustat->user = cputime64_add(cpustat->user, tmp);
5051 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5052 }
5051} 5053}
5052 5054
5053/* 5055/*
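account_guest_time() now routes guest time by the vcpu thread's nice level: positive nice charges the nice and guest_nice buckets, otherwise user and guest as before. A minimal sketch with the cpustat fields modelled as plain u64 counters:

/* Minimal model of the guest-time accounting split introduced above. */
#include <stdint.h>

struct cpustat_sketch {
	uint64_t user, nice, guest, guest_nice;
};

void account_guest_sketch(struct cpustat_sketch *cs,
			  int task_nice, uint64_t delta)
{
	if (task_nice > 0) {
		cs->nice       += delta;
		cs->guest_nice += delta;
	} else {
		cs->user  += delta;
		cs->guest += delta;
	}
}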
@@ -5162,60 +5164,86 @@ void account_idle_ticks(unsigned long ticks)
5162 * Use precise platform statistics if available: 5164 * Use precise platform statistics if available:
5163 */ 5165 */
5164#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5166#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5165cputime_t task_utime(struct task_struct *p) 5167void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5166{ 5168{
5167 return p->utime; 5169 *ut = p->utime;
5170 *st = p->stime;
5168} 5171}
5169 5172
5170cputime_t task_stime(struct task_struct *p) 5173void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5171{ 5174{
5172 return p->stime; 5175 struct task_cputime cputime;
5176
5177 thread_group_cputime(p, &cputime);
5178
5179 *ut = cputime.utime;
5180 *st = cputime.stime;
5173} 5181}
5174#else 5182#else
5175cputime_t task_utime(struct task_struct *p) 5183
5184#ifndef nsecs_to_cputime
5185# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5186#endif
5187
5188void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5176{ 5189{
5177 clock_t utime = cputime_to_clock_t(p->utime), 5190 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5178 total = utime + cputime_to_clock_t(p->stime);
5179 u64 temp;
5180 5191
5181 /* 5192 /*
5182 * Use CFS's precise accounting: 5193 * Use CFS's precise accounting:
5183 */ 5194 */
5184 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5195 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5185 5196
5186 if (total) { 5197 if (total) {
5187 temp *= utime; 5198 u64 temp;
5199
5200 temp = (u64)(rtime * utime);
5188 do_div(temp, total); 5201 do_div(temp, total);
5189 } 5202 utime = (cputime_t)temp;
5190 utime = (clock_t)temp; 5203 } else
5204 utime = rtime;
5191 5205
5192 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5206 /*
5193 return p->prev_utime; 5207 * Compare with previous values, to keep monotonicity:
5208 */
5209 p->prev_utime = max(p->prev_utime, utime);
5210 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5211
5212 *ut = p->prev_utime;
5213 *st = p->prev_stime;
5194} 5214}
5195 5215
5196cputime_t task_stime(struct task_struct *p) 5216/*
5217 * Must be called with siglock held.
5218 */
5219void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5197{ 5220{
5198 clock_t stime; 5221 struct signal_struct *sig = p->signal;
5222 struct task_cputime cputime;
5223 cputime_t rtime, utime, total;
5199 5224
5200 /* 5225 thread_group_cputime(p, &cputime);
5201 * Use CFS's precise accounting. (we subtract utime from
5202 * the total, to make sure the total observed by userspace
5203 * grows monotonically - apps rely on that):
5204 */
5205 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5206 cputime_to_clock_t(task_utime(p));
5207 5226
5208 if (stime >= 0) 5227 total = cputime_add(cputime.utime, cputime.stime);
5209 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5228 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5210 5229
5211 return p->prev_stime; 5230 if (total) {
5212} 5231 u64 temp;
5213#endif
5214 5232
5215inline cputime_t task_gtime(struct task_struct *p) 5233 temp = (u64)(rtime * cputime.utime);
5216{ 5234 do_div(temp, total);
5217 return p->gtime; 5235 utime = (cputime_t)temp;
5236 } else
5237 utime = rtime;
5238
5239 sig->prev_utime = max(sig->prev_utime, utime);
5240 sig->prev_stime = max(sig->prev_stime,
5241 cputime_sub(rtime, sig->prev_utime));
5242
5243 *ut = sig->prev_utime;
5244 *st = sig->prev_stime;
5218} 5245}
5246#endif
5219 5247
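The rewritten task_times() scales the tick-sampled utime onto the precise sum_exec_runtime (rtime * utime / total) and then clamps both results against the values reported last time, so the utime and stime seen by userspace never decrease; thread_group_times() applies the same idea to the per-signal totals. A standalone sketch with cputime_t modelled as u64 and rtime assumed monotonic:

/* Standalone sketch of the monotonic utime/stime split above. */
#include <stdint.h>
#include <stdio.h>

struct times_sketch {
	uint64_t prev_utime, prev_stime;	/* last values handed out */
};

static void task_times_sketch(struct times_sketch *t,
			      uint64_t utime, uint64_t stime,	/* tick based */
			      uint64_t rtime,			/* precise runtime */
			      uint64_t *ut, uint64_t *st)
{
	uint64_t total = utime + stime;

	if (total)
		utime = rtime * utime / total;	/* rescale user share */
	else
		utime = rtime;

	/* monotonicity: never report less than before.
	 * rtime only grows, so prev_utime <= rtime always holds. */
	if (utime > t->prev_utime)
		t->prev_utime = utime;
	if (rtime - t->prev_utime > t->prev_stime)
		t->prev_stime = rtime - t->prev_utime;

	*ut = t->prev_utime;
	*st = t->prev_stime;
}

int main(void)
{
	struct times_sketch t = { 0, 0 };
	uint64_t ut, st;

	task_times_sketch(&t, 30, 10, 100, &ut, &st);	/* 75/25 split */
	printf("utime=%llu stime=%llu\n",
	       (unsigned long long)ut, (unsigned long long)st);
	return 0;
}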
5220/* 5248/*
5221 * This function gets called by the timer code, with HZ frequency. 5249 * This function gets called by the timer code, with HZ frequency.
@@ -5232,11 +5260,11 @@ void scheduler_tick(void)
5232 5260
5233 sched_clock_tick(); 5261 sched_clock_tick();
5234 5262
5235 spin_lock(&rq->lock); 5263 raw_spin_lock(&rq->lock);
5236 update_rq_clock(rq); 5264 update_rq_clock(rq);
5237 update_cpu_load(rq); 5265 update_cpu_load(rq);
5238 curr->sched_class->task_tick(rq, curr, 0); 5266 curr->sched_class->task_tick(rq, curr, 0);
5239 spin_unlock(&rq->lock); 5267 raw_spin_unlock(&rq->lock);
5240 5268
5241 perf_event_task_tick(curr, cpu); 5269 perf_event_task_tick(curr, cpu);
5242 5270
@@ -5350,13 +5378,14 @@ static inline void schedule_debug(struct task_struct *prev)
5350#endif 5378#endif
5351} 5379}
5352 5380
5353static void put_prev_task(struct rq *rq, struct task_struct *p) 5381static void put_prev_task(struct rq *rq, struct task_struct *prev)
5354{ 5382{
5355 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5383 if (prev->state == TASK_RUNNING) {
5384 u64 runtime = prev->se.sum_exec_runtime;
5356 5385
5357 update_avg(&p->se.avg_running, runtime); 5386 runtime -= prev->se.prev_sum_exec_runtime;
5387 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5358 5388
5359 if (p->state == TASK_RUNNING) {
5360 /* 5389 /*
5361 * In order to avoid avg_overlap growing stale when we are 5390 * In order to avoid avg_overlap growing stale when we are
5362 * indeed overlapping and hence not getting put to sleep, grow 5391 * indeed overlapping and hence not getting put to sleep, grow
@@ -5366,12 +5395,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5366 * correlates to the amount of cache footprint a task can 5395 * correlates to the amount of cache footprint a task can
5367 * build up. 5396 * build up.
5368 */ 5397 */
5369 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5398 update_avg(&prev->se.avg_overlap, runtime);
5370 update_avg(&p->se.avg_overlap, runtime);
5371 } else {
5372 update_avg(&p->se.avg_running, 0);
5373 } 5399 }
5374 p->sched_class->put_prev_task(rq, p); 5400 prev->sched_class->put_prev_task(rq, prev);
5375} 5401}
5376 5402
5377/* 5403/*
@@ -5432,7 +5458,7 @@ need_resched_nonpreemptible:
5432 if (sched_feat(HRTICK)) 5458 if (sched_feat(HRTICK))
5433 hrtick_clear(rq); 5459 hrtick_clear(rq);
5434 5460
5435 spin_lock_irq(&rq->lock); 5461 raw_spin_lock_irq(&rq->lock);
5436 update_rq_clock(rq); 5462 update_rq_clock(rq);
5437 clear_tsk_need_resched(prev); 5463 clear_tsk_need_resched(prev);
5438 5464
@@ -5468,7 +5494,7 @@ need_resched_nonpreemptible:
5468 cpu = smp_processor_id(); 5494 cpu = smp_processor_id();
5469 rq = cpu_rq(cpu); 5495 rq = cpu_rq(cpu);
5470 } else 5496 } else
5471 spin_unlock_irq(&rq->lock); 5497 raw_spin_unlock_irq(&rq->lock);
5472 5498
5473 post_schedule(rq); 5499 post_schedule(rq);
5474 5500
@@ -5481,7 +5507,7 @@ need_resched_nonpreemptible:
5481} 5507}
5482EXPORT_SYMBOL(schedule); 5508EXPORT_SYMBOL(schedule);
5483 5509
5484#ifdef CONFIG_SMP 5510#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5485/* 5511/*
5486 * Look out! "owner" is an entirely speculative pointer 5512 * Look out! "owner" is an entirely speculative pointer
5487 * access and not reliable. 5513 * access and not reliable.
@@ -6175,22 +6201,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6175 BUG_ON(p->se.on_rq); 6201 BUG_ON(p->se.on_rq);
6176 6202
6177 p->policy = policy; 6203 p->policy = policy;
6178 switch (p->policy) {
6179 case SCHED_NORMAL:
6180 case SCHED_BATCH:
6181 case SCHED_IDLE:
6182 p->sched_class = &fair_sched_class;
6183 break;
6184 case SCHED_FIFO:
6185 case SCHED_RR:
6186 p->sched_class = &rt_sched_class;
6187 break;
6188 }
6189
6190 p->rt_priority = prio; 6204 p->rt_priority = prio;
6191 p->normal_prio = normal_prio(p); 6205 p->normal_prio = normal_prio(p);
6192 /* we are holding p->pi_lock already */ 6206 /* we are holding p->pi_lock already */
6193 p->prio = rt_mutex_getprio(p); 6207 p->prio = rt_mutex_getprio(p);
6208 if (rt_prio(p->prio))
6209 p->sched_class = &rt_sched_class;
6210 else
6211 p->sched_class = &fair_sched_class;
6194 set_load_weight(p); 6212 set_load_weight(p);
6195} 6213}
6196 6214
@@ -6305,7 +6323,7 @@ recheck:
6305 * make sure no PI-waiters arrive (or leave) while we are 6323 * make sure no PI-waiters arrive (or leave) while we are
6306 * changing the priority of the task: 6324 * changing the priority of the task:
6307 */ 6325 */
6308 spin_lock_irqsave(&p->pi_lock, flags); 6326 raw_spin_lock_irqsave(&p->pi_lock, flags);
6309 /* 6327 /*
6310 * To be able to change p->policy safely, the apropriate 6328 * To be able to change p->policy safely, the apropriate
6311 * runqueue lock must be held. 6329 * runqueue lock must be held.
@@ -6315,7 +6333,7 @@ recheck:
6315 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 6333 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6316 policy = oldpolicy = -1; 6334 policy = oldpolicy = -1;
6317 __task_rq_unlock(rq); 6335 __task_rq_unlock(rq);
6318 spin_unlock_irqrestore(&p->pi_lock, flags); 6336 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6319 goto recheck; 6337 goto recheck;
6320 } 6338 }
6321 update_rq_clock(rq); 6339 update_rq_clock(rq);
@@ -6339,7 +6357,7 @@ recheck:
6339 check_class_changed(rq, p, prev_class, oldprio, running); 6357 check_class_changed(rq, p, prev_class, oldprio, running);
6340 } 6358 }
6341 __task_rq_unlock(rq); 6359 __task_rq_unlock(rq);
6342 spin_unlock_irqrestore(&p->pi_lock, flags); 6360 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6343 6361
6344 rt_mutex_adjust_pi(p); 6362 rt_mutex_adjust_pi(p);
6345 6363
@@ -6593,6 +6611,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6593long sched_getaffinity(pid_t pid, struct cpumask *mask) 6611long sched_getaffinity(pid_t pid, struct cpumask *mask)
6594{ 6612{
6595 struct task_struct *p; 6613 struct task_struct *p;
6614 unsigned long flags;
6615 struct rq *rq;
6596 int retval; 6616 int retval;
6597 6617
6598 get_online_cpus(); 6618 get_online_cpus();
@@ -6607,7 +6627,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6607 if (retval) 6627 if (retval)
6608 goto out_unlock; 6628 goto out_unlock;
6609 6629
6630 rq = task_rq_lock(p, &flags);
6610 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6631 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6632 task_rq_unlock(rq, &flags);
6611 6633
6612out_unlock: 6634out_unlock:
6613 read_unlock(&tasklist_lock); 6635 read_unlock(&tasklist_lock);
@@ -6665,7 +6687,7 @@ SYSCALL_DEFINE0(sched_yield)
6665 */ 6687 */
6666 __release(rq->lock); 6688 __release(rq->lock);
6667 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 6689 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6668 _raw_spin_unlock(&rq->lock); 6690 do_raw_spin_unlock(&rq->lock);
6669 preempt_enable_no_resched(); 6691 preempt_enable_no_resched();
6670 6692
6671 schedule(); 6693 schedule();
@@ -6845,6 +6867,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6845{ 6867{
6846 struct task_struct *p; 6868 struct task_struct *p;
6847 unsigned int time_slice; 6869 unsigned int time_slice;
6870 unsigned long flags;
6871 struct rq *rq;
6848 int retval; 6872 int retval;
6849 struct timespec t; 6873 struct timespec t;
6850 6874
@@ -6861,7 +6885,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6861 if (retval) 6885 if (retval)
6862 goto out_unlock; 6886 goto out_unlock;
6863 6887
6864 time_slice = p->sched_class->get_rr_interval(p); 6888 rq = task_rq_lock(p, &flags);
6889 time_slice = p->sched_class->get_rr_interval(rq, p);
6890 task_rq_unlock(rq, &flags);
6865 6891
6866 read_unlock(&tasklist_lock); 6892 read_unlock(&tasklist_lock);
6867 jiffies_to_timespec(time_slice, &t); 6893 jiffies_to_timespec(time_slice, &t);
@@ -6935,7 +6961,7 @@ void show_state_filter(unsigned long state_filter)
6935 /* 6961 /*
6936 * Only show locks if all tasks are dumped: 6962 * Only show locks if all tasks are dumped:
6937 */ 6963 */
6938 if (state_filter == -1) 6964 if (!state_filter)
6939 debug_show_all_locks(); 6965 debug_show_all_locks();
6940} 6966}
6941 6967
@@ -6957,12 +6983,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6957 struct rq *rq = cpu_rq(cpu); 6983 struct rq *rq = cpu_rq(cpu);
6958 unsigned long flags; 6984 unsigned long flags;
6959 6985
6960 spin_lock_irqsave(&rq->lock, flags); 6986 raw_spin_lock_irqsave(&rq->lock, flags);
6961 6987
6962 __sched_fork(idle); 6988 __sched_fork(idle);
6963 idle->se.exec_start = sched_clock(); 6989 idle->se.exec_start = sched_clock();
6964 6990
6965 idle->prio = idle->normal_prio = MAX_PRIO;
6966 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6991 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6967 __set_task_cpu(idle, cpu); 6992 __set_task_cpu(idle, cpu);
6968 6993
@@ -6970,7 +6995,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6970#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 6995#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6971 idle->oncpu = 1; 6996 idle->oncpu = 1;
6972#endif 6997#endif
6973 spin_unlock_irqrestore(&rq->lock, flags); 6998 raw_spin_unlock_irqrestore(&rq->lock, flags);
6974 6999
6975 /* Set the preempt count _outside_ the spinlocks! */ 7000 /* Set the preempt count _outside_ the spinlocks! */
6976#if defined(CONFIG_PREEMPT) 7001#if defined(CONFIG_PREEMPT)
@@ -7003,22 +7028,43 @@ cpumask_var_t nohz_cpu_mask;
7003 * 7028 *
7004 * This idea comes from the SD scheduler of Con Kolivas: 7029 * This idea comes from the SD scheduler of Con Kolivas:
7005 */ 7030 */
7006static inline void sched_init_granularity(void) 7031static int get_update_sysctl_factor(void)
7007{ 7032{
7008 unsigned int factor = 1 + ilog2(num_online_cpus()); 7033 unsigned int cpus = min_t(int, num_online_cpus(), 8);
7009 const unsigned long limit = 200000000; 7034 unsigned int factor;
7010 7035
7011 sysctl_sched_min_granularity *= factor; 7036 switch (sysctl_sched_tunable_scaling) {
7012 if (sysctl_sched_min_granularity > limit) 7037 case SCHED_TUNABLESCALING_NONE:
7013 sysctl_sched_min_granularity = limit; 7038 factor = 1;
7039 break;
7040 case SCHED_TUNABLESCALING_LINEAR:
7041 factor = cpus;
7042 break;
7043 case SCHED_TUNABLESCALING_LOG:
7044 default:
7045 factor = 1 + ilog2(cpus);
7046 break;
7047 }
7014 7048
7015 sysctl_sched_latency *= factor; 7049 return factor;
7016 if (sysctl_sched_latency > limit) 7050}
7017 sysctl_sched_latency = limit;
7018 7051
7019 sysctl_sched_wakeup_granularity *= factor; 7052static void update_sysctl(void)
7053{
7054 unsigned int factor = get_update_sysctl_factor();
7055
7056#define SET_SYSCTL(name) \
7057 (sysctl_##name = (factor) * normalized_sysctl_##name)
7058 SET_SYSCTL(sched_min_granularity);
7059 SET_SYSCTL(sched_latency);
7060 SET_SYSCTL(sched_wakeup_granularity);
7061 SET_SYSCTL(sched_shares_ratelimit);
7062#undef SET_SYSCTL
7063}
7020 7064
7021 sysctl_sched_shares_ratelimit *= factor; 7065static inline void sched_init_granularity(void)
7066{
7067 update_sysctl();
7022} 7068}
7023 7069
7024#ifdef CONFIG_SMP 7070#ifdef CONFIG_SMP
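As a concrete illustration of the new scaling (not code from the patch), take the default SCHED_TUNABLESCALING_LOG mode on an 8-CPU machine, together with the 5 ms normalized_sysctl_sched_latency defined in the sched_fair.c hunk further down:

	/*
	 * cpus   = min(num_online_cpus(), 8) = 8
	 * factor = NONE:   1
	 *          LINEAR: 8
	 *          LOG:    1 + ilog2(8) = 4      (default)
	 *
	 * SET_SYSCTL(sched_latency) with LOG scaling:
	 *	sysctl_sched_latency = 4 * 5000000 ns = 20 ms
	 */
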
@@ -7055,7 +7101,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7055 int ret = 0; 7101 int ret = 0;
7056 7102
7057 rq = task_rq_lock(p, &flags); 7103 rq = task_rq_lock(p, &flags);
7058 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7104 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7059 ret = -EINVAL; 7105 ret = -EINVAL;
7060 goto out; 7106 goto out;
7061 } 7107 }
@@ -7077,7 +7123,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7077 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7123 if (cpumask_test_cpu(task_cpu(p), new_mask))
7078 goto out; 7124 goto out;
7079 7125
7080 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7126 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7081 /* Need help from migration thread: drop lock and wait. */ 7127 /* Need help from migration thread: drop lock and wait. */
7082 struct task_struct *mt = rq->migration_thread; 7128 struct task_struct *mt = rq->migration_thread;
7083 7129
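Here and in several hunks below, cpu_online_mask gives way to cpu_active_mask, so a CPU that is still online but already marked inactive early in its hotplug-down path is no longer offered as a migration target. The pattern reduces to something like the following sketch, where pick_dest_cpu() is an invented helper name:

	/* sketch: pick any allowed destination CPU that is not on its way down */
	static int pick_dest_cpu(struct task_struct *p)
	{
		return cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
	}
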
@@ -7166,10 +7212,10 @@ static int migration_thread(void *data)
7166 struct migration_req *req; 7212 struct migration_req *req;
7167 struct list_head *head; 7213 struct list_head *head;
7168 7214
7169 spin_lock_irq(&rq->lock); 7215 raw_spin_lock_irq(&rq->lock);
7170 7216
7171 if (cpu_is_offline(cpu)) { 7217 if (cpu_is_offline(cpu)) {
7172 spin_unlock_irq(&rq->lock); 7218 raw_spin_unlock_irq(&rq->lock);
7173 break; 7219 break;
7174 } 7220 }
7175 7221
@@ -7181,7 +7227,7 @@ static int migration_thread(void *data)
7181 head = &rq->migration_queue; 7227 head = &rq->migration_queue;
7182 7228
7183 if (list_empty(head)) { 7229 if (list_empty(head)) {
7184 spin_unlock_irq(&rq->lock); 7230 raw_spin_unlock_irq(&rq->lock);
7185 schedule(); 7231 schedule();
7186 set_current_state(TASK_INTERRUPTIBLE); 7232 set_current_state(TASK_INTERRUPTIBLE);
7187 continue; 7233 continue;
@@ -7190,14 +7236,14 @@ static int migration_thread(void *data)
7190 list_del_init(head->next); 7236 list_del_init(head->next);
7191 7237
7192 if (req->task != NULL) { 7238 if (req->task != NULL) {
7193 spin_unlock(&rq->lock); 7239 raw_spin_unlock(&rq->lock);
7194 __migrate_task(req->task, cpu, req->dest_cpu); 7240 __migrate_task(req->task, cpu, req->dest_cpu);
7195 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7241 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7196 req->dest_cpu = RCU_MIGRATION_GOT_QS; 7242 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7197 spin_unlock(&rq->lock); 7243 raw_spin_unlock(&rq->lock);
7198 } else { 7244 } else {
7199 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7245 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7200 spin_unlock(&rq->lock); 7246 raw_spin_unlock(&rq->lock);
7201 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7247 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7202 } 7248 }
7203 local_irq_enable(); 7249 local_irq_enable();
@@ -7231,19 +7277,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7231 7277
7232again: 7278again:
7233 /* Look for allowed, online CPU in same node. */ 7279 /* Look for allowed, online CPU in same node. */
7234 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) 7280 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7235 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7281 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7236 goto move; 7282 goto move;
7237 7283
7238 /* Any allowed, online CPU? */ 7284 /* Any allowed, online CPU? */
7239 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); 7285 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7240 if (dest_cpu < nr_cpu_ids) 7286 if (dest_cpu < nr_cpu_ids)
7241 goto move; 7287 goto move;
7242 7288
7243 /* No more Mr. Nice Guy. */ 7289 /* No more Mr. Nice Guy. */
7244 if (dest_cpu >= nr_cpu_ids) { 7290 if (dest_cpu >= nr_cpu_ids) {
7245 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 7291 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7246 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); 7292 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7247 7293
7248 /* 7294 /*
7249 * Don't tell them about moving exiting tasks or 7295 * Don't tell them about moving exiting tasks or
@@ -7272,7 +7318,7 @@ move:
7272 */ 7318 */
7273static void migrate_nr_uninterruptible(struct rq *rq_src) 7319static void migrate_nr_uninterruptible(struct rq *rq_src)
7274{ 7320{
7275 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7321 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7276 unsigned long flags; 7322 unsigned long flags;
7277 7323
7278 local_irq_save(flags); 7324 local_irq_save(flags);
@@ -7320,14 +7366,14 @@ void sched_idle_next(void)
7320 * Strictly not necessary since rest of the CPUs are stopped by now 7366 * Strictly not necessary since rest of the CPUs are stopped by now
7321 * and interrupts disabled on the current cpu. 7367 * and interrupts disabled on the current cpu.
7322 */ 7368 */
7323 spin_lock_irqsave(&rq->lock, flags); 7369 raw_spin_lock_irqsave(&rq->lock, flags);
7324 7370
7325 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7371 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7326 7372
7327 update_rq_clock(rq); 7373 update_rq_clock(rq);
7328 activate_task(rq, p, 0); 7374 activate_task(rq, p, 0);
7329 7375
7330 spin_unlock_irqrestore(&rq->lock, flags); 7376 raw_spin_unlock_irqrestore(&rq->lock, flags);
7331} 7377}
7332 7378
7333/* 7379/*
@@ -7363,9 +7409,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7363 * that's OK. No task can be added to this CPU, so iteration is 7409 * that's OK. No task can be added to this CPU, so iteration is
7364 * fine. 7410 * fine.
7365 */ 7411 */
7366 spin_unlock_irq(&rq->lock); 7412 raw_spin_unlock_irq(&rq->lock);
7367 move_task_off_dead_cpu(dead_cpu, p); 7413 move_task_off_dead_cpu(dead_cpu, p);
7368 spin_lock_irq(&rq->lock); 7414 raw_spin_lock_irq(&rq->lock);
7369 7415
7370 put_task_struct(p); 7416 put_task_struct(p);
7371} 7417}
@@ -7406,17 +7452,16 @@ static struct ctl_table sd_ctl_dir[] = {
7406 .procname = "sched_domain", 7452 .procname = "sched_domain",
7407 .mode = 0555, 7453 .mode = 0555,
7408 }, 7454 },
7409 {0, }, 7455 {}
7410}; 7456};
7411 7457
7412static struct ctl_table sd_ctl_root[] = { 7458static struct ctl_table sd_ctl_root[] = {
7413 { 7459 {
7414 .ctl_name = CTL_KERN,
7415 .procname = "kernel", 7460 .procname = "kernel",
7416 .mode = 0555, 7461 .mode = 0555,
7417 .child = sd_ctl_dir, 7462 .child = sd_ctl_dir,
7418 }, 7463 },
7419 {0, }, 7464 {}
7420}; 7465};
7421 7466
7422static struct ctl_table *sd_alloc_ctl_entry(int n) 7467static struct ctl_table *sd_alloc_ctl_entry(int n)
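Both tables above lose their .ctl_name initializer and switch the terminator from {0, } to an empty-braces sentinel, in line with the tree-wide move away from binary sysctl numbers. A hedged sketch of a minimal table under the new convention, with example_value as a placeholder name; registration still goes through register_sysctl_table():

	static int example_value;

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_value",
			.data		= &example_value,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{}	/* empty sentinel, no .ctl_name anywhere */
	};
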
@@ -7526,7 +7571,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7526static struct ctl_table_header *sd_sysctl_header; 7571static struct ctl_table_header *sd_sysctl_header;
7527static void register_sched_domain_sysctl(void) 7572static void register_sched_domain_sysctl(void)
7528{ 7573{
7529 int i, cpu_num = num_online_cpus(); 7574 int i, cpu_num = num_possible_cpus();
7530 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7575 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7531 char buf[32]; 7576 char buf[32];
7532 7577
@@ -7536,7 +7581,7 @@ static void register_sched_domain_sysctl(void)
7536 if (entry == NULL) 7581 if (entry == NULL)
7537 return; 7582 return;
7538 7583
7539 for_each_online_cpu(i) { 7584 for_each_possible_cpu(i) {
7540 snprintf(buf, 32, "cpu%d", i); 7585 snprintf(buf, 32, "cpu%d", i);
7541 entry->procname = kstrdup(buf, GFP_KERNEL); 7586 entry->procname = kstrdup(buf, GFP_KERNEL);
7542 entry->mode = 0555; 7587 entry->mode = 0555;
@@ -7632,13 +7677,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7632 7677
7633 /* Update our root-domain */ 7678 /* Update our root-domain */
7634 rq = cpu_rq(cpu); 7679 rq = cpu_rq(cpu);
7635 spin_lock_irqsave(&rq->lock, flags); 7680 raw_spin_lock_irqsave(&rq->lock, flags);
7636 if (rq->rd) { 7681 if (rq->rd) {
7637 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7682 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7638 7683
7639 set_rq_online(rq); 7684 set_rq_online(rq);
7640 } 7685 }
7641 spin_unlock_irqrestore(&rq->lock, flags); 7686 raw_spin_unlock_irqrestore(&rq->lock, flags);
7642 break; 7687 break;
7643 7688
7644#ifdef CONFIG_HOTPLUG_CPU 7689#ifdef CONFIG_HOTPLUG_CPU
@@ -7663,14 +7708,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7663 put_task_struct(rq->migration_thread); 7708 put_task_struct(rq->migration_thread);
7664 rq->migration_thread = NULL; 7709 rq->migration_thread = NULL;
7665 /* Idle task back to normal (off runqueue, low prio) */ 7710 /* Idle task back to normal (off runqueue, low prio) */
7666 spin_lock_irq(&rq->lock); 7711 raw_spin_lock_irq(&rq->lock);
7667 update_rq_clock(rq); 7712 update_rq_clock(rq);
7668 deactivate_task(rq, rq->idle, 0); 7713 deactivate_task(rq, rq->idle, 0);
7669 rq->idle->static_prio = MAX_PRIO;
7670 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7714 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7671 rq->idle->sched_class = &idle_sched_class; 7715 rq->idle->sched_class = &idle_sched_class;
7672 migrate_dead_tasks(cpu); 7716 migrate_dead_tasks(cpu);
7673 spin_unlock_irq(&rq->lock); 7717 raw_spin_unlock_irq(&rq->lock);
7674 cpuset_unlock(); 7718 cpuset_unlock();
7675 migrate_nr_uninterruptible(rq); 7719 migrate_nr_uninterruptible(rq);
7676 BUG_ON(rq->nr_running != 0); 7720 BUG_ON(rq->nr_running != 0);
@@ -7680,30 +7724,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7680 * they didn't take sched_hotcpu_mutex. Just wake up 7724 * they didn't take sched_hotcpu_mutex. Just wake up
7681 * the requestors. 7725 * the requestors.
7682 */ 7726 */
7683 spin_lock_irq(&rq->lock); 7727 raw_spin_lock_irq(&rq->lock);
7684 while (!list_empty(&rq->migration_queue)) { 7728 while (!list_empty(&rq->migration_queue)) {
7685 struct migration_req *req; 7729 struct migration_req *req;
7686 7730
7687 req = list_entry(rq->migration_queue.next, 7731 req = list_entry(rq->migration_queue.next,
7688 struct migration_req, list); 7732 struct migration_req, list);
7689 list_del_init(&req->list); 7733 list_del_init(&req->list);
7690 spin_unlock_irq(&rq->lock); 7734 raw_spin_unlock_irq(&rq->lock);
7691 complete(&req->done); 7735 complete(&req->done);
7692 spin_lock_irq(&rq->lock); 7736 raw_spin_lock_irq(&rq->lock);
7693 } 7737 }
7694 spin_unlock_irq(&rq->lock); 7738 raw_spin_unlock_irq(&rq->lock);
7695 break; 7739 break;
7696 7740
7697 case CPU_DYING: 7741 case CPU_DYING:
7698 case CPU_DYING_FROZEN: 7742 case CPU_DYING_FROZEN:
7699 /* Update our root-domain */ 7743 /* Update our root-domain */
7700 rq = cpu_rq(cpu); 7744 rq = cpu_rq(cpu);
7701 spin_lock_irqsave(&rq->lock, flags); 7745 raw_spin_lock_irqsave(&rq->lock, flags);
7702 if (rq->rd) { 7746 if (rq->rd) {
7703 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7747 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7704 set_rq_offline(rq); 7748 set_rq_offline(rq);
7705 } 7749 }
7706 spin_unlock_irqrestore(&rq->lock, flags); 7750 raw_spin_unlock_irqrestore(&rq->lock, flags);
7707 break; 7751 break;
7708#endif 7752#endif
7709 } 7753 }
@@ -7740,6 +7784,16 @@ early_initcall(migration_init);
7740 7784
7741#ifdef CONFIG_SCHED_DEBUG 7785#ifdef CONFIG_SCHED_DEBUG
7742 7786
7787static __read_mostly int sched_domain_debug_enabled;
7788
7789static int __init sched_domain_debug_setup(char *str)
7790{
7791 sched_domain_debug_enabled = 1;
7792
7793 return 0;
7794}
7795early_param("sched_debug", sched_domain_debug_setup);
7796
7743static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7797static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7744 struct cpumask *groupmask) 7798 struct cpumask *groupmask)
7745{ 7799{
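With sched_domain_debug_enabled defaulting to off, the per-CPU domain dump that used to be printed unconditionally under CONFIG_SCHED_DEBUG becomes opt-in; it is requested by passing the new early parameter on the kernel command line, i.e. appending the bare token:

	sched_debug
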
@@ -7826,6 +7880,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7826 cpumask_var_t groupmask; 7880 cpumask_var_t groupmask;
7827 int level = 0; 7881 int level = 0;
7828 7882
7883 if (!sched_domain_debug_enabled)
7884 return;
7885
7829 if (!sd) { 7886 if (!sd) {
7830 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7887 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7831 return; 7888 return;
@@ -7905,6 +7962,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7905 7962
7906static void free_rootdomain(struct root_domain *rd) 7963static void free_rootdomain(struct root_domain *rd)
7907{ 7964{
7965 synchronize_sched();
7966
7908 cpupri_cleanup(&rd->cpupri); 7967 cpupri_cleanup(&rd->cpupri);
7909 7968
7910 free_cpumask_var(rd->rto_mask); 7969 free_cpumask_var(rd->rto_mask);
@@ -7918,7 +7977,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7918 struct root_domain *old_rd = NULL; 7977 struct root_domain *old_rd = NULL;
7919 unsigned long flags; 7978 unsigned long flags;
7920 7979
7921 spin_lock_irqsave(&rq->lock, flags); 7980 raw_spin_lock_irqsave(&rq->lock, flags);
7922 7981
7923 if (rq->rd) { 7982 if (rq->rd) {
7924 old_rd = rq->rd; 7983 old_rd = rq->rd;
@@ -7944,7 +8003,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7944 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 8003 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7945 set_rq_online(rq); 8004 set_rq_online(rq);
7946 8005
7947 spin_unlock_irqrestore(&rq->lock, flags); 8006 raw_spin_unlock_irqrestore(&rq->lock, flags);
7948 8007
7949 if (old_rd) 8008 if (old_rd)
7950 free_rootdomain(old_rd); 8009 free_rootdomain(old_rd);
@@ -8045,6 +8104,7 @@ static cpumask_var_t cpu_isolated_map;
8045/* Setup the mask of cpus configured for isolated domains */ 8104/* Setup the mask of cpus configured for isolated domains */
8046static int __init isolated_cpu_setup(char *str) 8105static int __init isolated_cpu_setup(char *str)
8047{ 8106{
8107 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8048 cpulist_parse(str, cpu_isolated_map); 8108 cpulist_parse(str, cpu_isolated_map);
8049 return 1; 8109 return 1;
8050} 8110}
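Because isolcpus= is parsed well before sched_init(), where cpu_isolated_map used to be allocated, the parser now grabs the mask from bootmem itself (needed with CONFIG_CPUMASK_OFFSTACK); the matching sched_init() hunk below only falls back to zalloc_cpumask_var() when nothing was allocated at parse time. Usage is unchanged, for example to keep CPUs 2 and 3 out of the scheduler domains:

	isolcpus=2,3
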
@@ -8229,14 +8289,14 @@ enum s_alloc {
8229 */ 8289 */
8230#ifdef CONFIG_SCHED_SMT 8290#ifdef CONFIG_SCHED_SMT
8231static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 8291static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8232static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 8292static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
8233 8293
8234static int 8294static int
8235cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 8295cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8236 struct sched_group **sg, struct cpumask *unused) 8296 struct sched_group **sg, struct cpumask *unused)
8237{ 8297{
8238 if (sg) 8298 if (sg)
8239 *sg = &per_cpu(sched_group_cpus, cpu).sg; 8299 *sg = &per_cpu(sched_groups, cpu).sg;
8240 return cpu; 8300 return cpu;
8241} 8301}
8242#endif /* CONFIG_SCHED_SMT */ 8302#endif /* CONFIG_SCHED_SMT */
@@ -8881,7 +8941,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8881 return __build_sched_domains(cpu_map, NULL); 8941 return __build_sched_domains(cpu_map, NULL);
8882} 8942}
8883 8943
8884static struct cpumask *doms_cur; /* current sched domains */ 8944static cpumask_var_t *doms_cur; /* current sched domains */
8885static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8945static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8886static struct sched_domain_attr *dattr_cur; 8946static struct sched_domain_attr *dattr_cur;
8887 /* attributes of custom domains in 'doms_cur' */ 8947 /* attributes of custom domains in 'doms_cur' */
@@ -8903,6 +8963,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8903 return 0; 8963 return 0;
8904} 8964}
8905 8965
8966cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
8967{
8968 int i;
8969 cpumask_var_t *doms;
8970
8971 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
8972 if (!doms)
8973 return NULL;
8974 for (i = 0; i < ndoms; i++) {
8975 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
8976 free_sched_domains(doms, i);
8977 return NULL;
8978 }
8979 }
8980 return doms;
8981}
8982
8983void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
8984{
8985 unsigned int i;
8986 for (i = 0; i < ndoms; i++)
8987 free_cpumask_var(doms[i]);
8988 kfree(doms);
8989}
8990
8906/* 8991/*
8907 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8992 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8908 * For now this just excludes isolated cpus, but could be used to 8993 * For now this just excludes isolated cpus, but could be used to
@@ -8914,12 +8999,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
8914 8999
8915 arch_update_cpu_topology(); 9000 arch_update_cpu_topology();
8916 ndoms_cur = 1; 9001 ndoms_cur = 1;
8917 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 9002 doms_cur = alloc_sched_domains(ndoms_cur);
8918 if (!doms_cur) 9003 if (!doms_cur)
8919 doms_cur = fallback_doms; 9004 doms_cur = &fallback_doms;
8920 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 9005 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8921 dattr_cur = NULL; 9006 dattr_cur = NULL;
8922 err = build_sched_domains(doms_cur); 9007 err = build_sched_domains(doms_cur[0]);
8923 register_sched_domain_sysctl(); 9008 register_sched_domain_sysctl();
8924 9009
8925 return err; 9010 return err;
@@ -8969,19 +9054,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8969 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9054 * doms_new[] to the current sched domain partitioning, doms_cur[].
8970 * It destroys each deleted domain and builds each new domain. 9055 * It destroys each deleted domain and builds each new domain.
8971 * 9056 *
8972 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9057 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
8973 * The masks don't intersect (don't overlap.) We should setup one 9058 * The masks don't intersect (don't overlap.) We should setup one
8974 * sched domain for each mask. CPUs not in any of the cpumasks will 9059 * sched domain for each mask. CPUs not in any of the cpumasks will
8975 * not be load balanced. If the same cpumask appears both in the 9060 * not be load balanced. If the same cpumask appears both in the
8976 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9061 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8977 * it as it is. 9062 * it as it is.
8978 * 9063 *
8979 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9064 * The passed in 'doms_new' should be allocated using
8980 * ownership of it and will kfree it when done with it. If the caller 9065 * alloc_sched_domains. This routine takes ownership of it and will
8981 * failed the kmalloc call, then it can pass in doms_new == NULL && 9066 * free_sched_domains it when done with it. If the caller failed the
8982 * ndoms_new == 1, and partition_sched_domains() will fallback to 9067 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
8983 * the single partition 'fallback_doms', it also forces the domains 9068 * and partition_sched_domains() will fallback to the single partition
8984 * to be rebuilt. 9069 * 'fallback_doms', it also forces the domains to be rebuilt.
8985 * 9070 *
8986 * If doms_new == NULL it will be replaced with cpu_online_mask. 9071 * If doms_new == NULL it will be replaced with cpu_online_mask.
8987 * ndoms_new == 0 is a special case for destroying existing domains, 9072 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8989,8 +9074,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8989 * 9074 *
8990 * Call with hotplug lock held 9075 * Call with hotplug lock held
8991 */ 9076 */
8992/* FIXME: Change to struct cpumask *doms_new[] */ 9077void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8993void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8994 struct sched_domain_attr *dattr_new) 9078 struct sched_domain_attr *dattr_new)
8995{ 9079{
8996 int i, j, n; 9080 int i, j, n;
@@ -9009,40 +9093,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
9009 /* Destroy deleted domains */ 9093 /* Destroy deleted domains */
9010 for (i = 0; i < ndoms_cur; i++) { 9094 for (i = 0; i < ndoms_cur; i++) {
9011 for (j = 0; j < n && !new_topology; j++) { 9095 for (j = 0; j < n && !new_topology; j++) {
9012 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9096 if (cpumask_equal(doms_cur[i], doms_new[j])
9013 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9097 && dattrs_equal(dattr_cur, i, dattr_new, j))
9014 goto match1; 9098 goto match1;
9015 } 9099 }
9016 /* no match - a current sched domain not in new doms_new[] */ 9100 /* no match - a current sched domain not in new doms_new[] */
9017 detach_destroy_domains(doms_cur + i); 9101 detach_destroy_domains(doms_cur[i]);
9018match1: 9102match1:
9019 ; 9103 ;
9020 } 9104 }
9021 9105
9022 if (doms_new == NULL) { 9106 if (doms_new == NULL) {
9023 ndoms_cur = 0; 9107 ndoms_cur = 0;
9024 doms_new = fallback_doms; 9108 doms_new = &fallback_doms;
9025 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9109 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
9026 WARN_ON_ONCE(dattr_new); 9110 WARN_ON_ONCE(dattr_new);
9027 } 9111 }
9028 9112
9029 /* Build new domains */ 9113 /* Build new domains */
9030 for (i = 0; i < ndoms_new; i++) { 9114 for (i = 0; i < ndoms_new; i++) {
9031 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9115 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9032 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9116 if (cpumask_equal(doms_new[i], doms_cur[j])
9033 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9117 && dattrs_equal(dattr_new, i, dattr_cur, j))
9034 goto match2; 9118 goto match2;
9035 } 9119 }
9036 /* no match - add a new doms_new */ 9120 /* no match - add a new doms_new */
9037 __build_sched_domains(doms_new + i, 9121 __build_sched_domains(doms_new[i],
9038 dattr_new ? dattr_new + i : NULL); 9122 dattr_new ? dattr_new + i : NULL);
9039match2: 9123match2:
9040 ; 9124 ;
9041 } 9125 }
9042 9126
9043 /* Remember the new sched domains */ 9127 /* Remember the new sched domains */
9044 if (doms_cur != fallback_doms) 9128 if (doms_cur != &fallback_doms)
9045 kfree(doms_cur); 9129 free_sched_domains(doms_cur, ndoms_cur);
9046 kfree(dattr_cur); /* kfree(NULL) is safe */ 9130 kfree(dattr_cur); /* kfree(NULL) is safe */
9047 doms_cur = doms_new; 9131 doms_cur = doms_new;
9048 dattr_cur = dattr_new; 9132 dattr_cur = dattr_new;
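Combined with alloc_sched_domains()/free_sched_domains() introduced above, a caller such as the cpuset code now hands the scheduler an array of cpumask_var_t rather than a flat cpumask array. A rough sketch of the calling convention follows; build_two_domains() and its arguments are invented for illustration, the hotplug lock must be held around the call, and ownership of the array passes to the scheduler:

	static int build_two_domains(const struct cpumask *a, const struct cpumask *b)
	{
		cpumask_var_t *doms = alloc_sched_domains(2);

		if (!doms)
			return -ENOMEM;

		cpumask_copy(doms[0], a);
		cpumask_copy(doms[1], b);

		/* scheduler now owns doms; do not free it here on success */
		partition_sched_domains(2, doms, NULL);
		return 0;
	}
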
@@ -9153,8 +9237,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9153 switch (action) { 9237 switch (action) {
9154 case CPU_ONLINE: 9238 case CPU_ONLINE:
9155 case CPU_ONLINE_FROZEN: 9239 case CPU_ONLINE_FROZEN:
9156 case CPU_DEAD: 9240 case CPU_DOWN_PREPARE:
9157 case CPU_DEAD_FROZEN: 9241 case CPU_DOWN_PREPARE_FROZEN:
9242 case CPU_DOWN_FAILED:
9243 case CPU_DOWN_FAILED_FROZEN:
9158 partition_sched_domains(1, NULL, NULL); 9244 partition_sched_domains(1, NULL, NULL);
9159 return NOTIFY_OK; 9245 return NOTIFY_OK;
9160 9246
@@ -9201,7 +9287,7 @@ void __init sched_init_smp(void)
9201#endif 9287#endif
9202 get_online_cpus(); 9288 get_online_cpus();
9203 mutex_lock(&sched_domains_mutex); 9289 mutex_lock(&sched_domains_mutex);
9204 arch_init_sched_domains(cpu_online_mask); 9290 arch_init_sched_domains(cpu_active_mask);
9205 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9291 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9206 if (cpumask_empty(non_isolated_cpus)) 9292 if (cpumask_empty(non_isolated_cpus))
9207 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9293 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9274,13 +9360,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9274#ifdef CONFIG_SMP 9360#ifdef CONFIG_SMP
9275 rt_rq->rt_nr_migratory = 0; 9361 rt_rq->rt_nr_migratory = 0;
9276 rt_rq->overloaded = 0; 9362 rt_rq->overloaded = 0;
9277 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 9363 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
9278#endif 9364#endif
9279 9365
9280 rt_rq->rt_time = 0; 9366 rt_rq->rt_time = 0;
9281 rt_rq->rt_throttled = 0; 9367 rt_rq->rt_throttled = 0;
9282 rt_rq->rt_runtime = 0; 9368 rt_rq->rt_runtime = 0;
9283 spin_lock_init(&rt_rq->rt_runtime_lock); 9369 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
9284 9370
9285#ifdef CONFIG_RT_GROUP_SCHED 9371#ifdef CONFIG_RT_GROUP_SCHED
9286 rt_rq->rt_nr_boosted = 0; 9372 rt_rq->rt_nr_boosted = 0;
@@ -9364,10 +9450,6 @@ void __init sched_init(void)
9364#ifdef CONFIG_CPUMASK_OFFSTACK 9450#ifdef CONFIG_CPUMASK_OFFSTACK
9365 alloc_size += num_possible_cpus() * cpumask_size(); 9451 alloc_size += num_possible_cpus() * cpumask_size();
9366#endif 9452#endif
9367 /*
9368 * As sched_init() is called before page_alloc is setup,
9369 * we use alloc_bootmem().
9370 */
9371 if (alloc_size) { 9453 if (alloc_size) {
9372 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9454 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9373 9455
@@ -9444,7 +9526,7 @@ void __init sched_init(void)
9444 struct rq *rq; 9526 struct rq *rq;
9445 9527
9446 rq = cpu_rq(i); 9528 rq = cpu_rq(i);
9447 spin_lock_init(&rq->lock); 9529 raw_spin_lock_init(&rq->lock);
9448 rq->nr_running = 0; 9530 rq->nr_running = 0;
9449 rq->calc_load_active = 0; 9531 rq->calc_load_active = 0;
9450 rq->calc_load_update = jiffies + LOAD_FREQ; 9532 rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9504,7 +9586,7 @@ void __init sched_init(void)
9504#elif defined CONFIG_USER_SCHED 9586#elif defined CONFIG_USER_SCHED
9505 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); 9587 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9506 init_tg_rt_entry(&init_task_group, 9588 init_tg_rt_entry(&init_task_group,
9507 &per_cpu(init_rt_rq, i), 9589 &per_cpu(init_rt_rq_var, i),
9508 &per_cpu(init_sched_rt_entity, i), i, 1, 9590 &per_cpu(init_sched_rt_entity, i), i, 1,
9509 root_task_group.rt_se[i]); 9591 root_task_group.rt_se[i]);
9510#endif 9592#endif
@@ -9522,6 +9604,8 @@ void __init sched_init(void)
9522 rq->cpu = i; 9604 rq->cpu = i;
9523 rq->online = 0; 9605 rq->online = 0;
9524 rq->migration_thread = NULL; 9606 rq->migration_thread = NULL;
9607 rq->idle_stamp = 0;
9608 rq->avg_idle = 2*sysctl_sched_migration_cost;
9525 INIT_LIST_HEAD(&rq->migration_queue); 9609 INIT_LIST_HEAD(&rq->migration_queue);
9526 rq_attach_root(rq, &def_root_domain); 9610 rq_attach_root(rq, &def_root_domain);
9527#endif 9611#endif
@@ -9540,7 +9624,7 @@ void __init sched_init(void)
9540#endif 9624#endif
9541 9625
9542#ifdef CONFIG_RT_MUTEXES 9626#ifdef CONFIG_RT_MUTEXES
9543 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 9627 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
9544#endif 9628#endif
9545 9629
9546 /* 9630 /*
@@ -9571,7 +9655,9 @@ void __init sched_init(void)
9571 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9655 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9572 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9656 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9573#endif 9657#endif
9574 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9658 /* May be allocated at isolcpus cmdline parse time */
9659 if (cpu_isolated_map == NULL)
9660 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9575#endif /* SMP */ 9661#endif /* SMP */
9576 9662
9577 perf_event_init(); 9663 perf_event_init();
@@ -9663,13 +9749,13 @@ void normalize_rt_tasks(void)
9663 continue; 9749 continue;
9664 } 9750 }
9665 9751
9666 spin_lock(&p->pi_lock); 9752 raw_spin_lock(&p->pi_lock);
9667 rq = __task_rq_lock(p); 9753 rq = __task_rq_lock(p);
9668 9754
9669 normalize_task(rq, p); 9755 normalize_task(rq, p);
9670 9756
9671 __task_rq_unlock(rq); 9757 __task_rq_unlock(rq);
9672 spin_unlock(&p->pi_lock); 9758 raw_spin_unlock(&p->pi_lock);
9673 } while_each_thread(g, p); 9759 } while_each_thread(g, p);
9674 9760
9675 read_unlock_irqrestore(&tasklist_lock, flags); 9761 read_unlock_irqrestore(&tasklist_lock, flags);
@@ -9765,13 +9851,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9765 se = kzalloc_node(sizeof(struct sched_entity), 9851 se = kzalloc_node(sizeof(struct sched_entity),
9766 GFP_KERNEL, cpu_to_node(i)); 9852 GFP_KERNEL, cpu_to_node(i));
9767 if (!se) 9853 if (!se)
9768 goto err; 9854 goto err_free_rq;
9769 9855
9770 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9856 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9771 } 9857 }
9772 9858
9773 return 1; 9859 return 1;
9774 9860
9861 err_free_rq:
9862 kfree(cfs_rq);
9775 err: 9863 err:
9776 return 0; 9864 return 0;
9777} 9865}
@@ -9853,13 +9941,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9853 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9941 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9854 GFP_KERNEL, cpu_to_node(i)); 9942 GFP_KERNEL, cpu_to_node(i));
9855 if (!rt_se) 9943 if (!rt_se)
9856 goto err; 9944 goto err_free_rq;
9857 9945
9858 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9946 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9859 } 9947 }
9860 9948
9861 return 1; 9949 return 1;
9862 9950
9951 err_free_rq:
9952 kfree(rt_rq);
9863 err: 9953 err:
9864 return 0; 9954 return 0;
9865} 9955}
@@ -10028,9 +10118,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
10028 struct rq *rq = cfs_rq->rq; 10118 struct rq *rq = cfs_rq->rq;
10029 unsigned long flags; 10119 unsigned long flags;
10030 10120
10031 spin_lock_irqsave(&rq->lock, flags); 10121 raw_spin_lock_irqsave(&rq->lock, flags);
10032 __set_se_shares(se, shares); 10122 __set_se_shares(se, shares);
10033 spin_unlock_irqrestore(&rq->lock, flags); 10123 raw_spin_unlock_irqrestore(&rq->lock, flags);
10034} 10124}
10035 10125
10036static DEFINE_MUTEX(shares_mutex); 10126static DEFINE_MUTEX(shares_mutex);
@@ -10215,18 +10305,18 @@ static int tg_set_bandwidth(struct task_group *tg,
10215 if (err) 10305 if (err)
10216 goto unlock; 10306 goto unlock;
10217 10307
10218 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10308 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10219 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 10309 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10220 tg->rt_bandwidth.rt_runtime = rt_runtime; 10310 tg->rt_bandwidth.rt_runtime = rt_runtime;
10221 10311
10222 for_each_possible_cpu(i) { 10312 for_each_possible_cpu(i) {
10223 struct rt_rq *rt_rq = tg->rt_rq[i]; 10313 struct rt_rq *rt_rq = tg->rt_rq[i];
10224 10314
10225 spin_lock(&rt_rq->rt_runtime_lock); 10315 raw_spin_lock(&rt_rq->rt_runtime_lock);
10226 rt_rq->rt_runtime = rt_runtime; 10316 rt_rq->rt_runtime = rt_runtime;
10227 spin_unlock(&rt_rq->rt_runtime_lock); 10317 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10228 } 10318 }
10229 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10319 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10230 unlock: 10320 unlock:
10231 read_unlock(&tasklist_lock); 10321 read_unlock(&tasklist_lock);
10232 mutex_unlock(&rt_constraints_mutex); 10322 mutex_unlock(&rt_constraints_mutex);
@@ -10331,15 +10421,15 @@ static int sched_rt_global_constraints(void)
10331 if (sysctl_sched_rt_runtime == 0) 10421 if (sysctl_sched_rt_runtime == 0)
10332 return -EBUSY; 10422 return -EBUSY;
10333 10423
10334 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10424 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10335 for_each_possible_cpu(i) { 10425 for_each_possible_cpu(i) {
10336 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10426 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10337 10427
10338 spin_lock(&rt_rq->rt_runtime_lock); 10428 raw_spin_lock(&rt_rq->rt_runtime_lock);
10339 rt_rq->rt_runtime = global_rt_runtime(); 10429 rt_rq->rt_runtime = global_rt_runtime();
10340 spin_unlock(&rt_rq->rt_runtime_lock); 10430 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10341 } 10431 }
10342 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 10432 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10343 10433
10344 return 0; 10434 return 0;
10345} 10435}
@@ -10630,9 +10720,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10630 /* 10720 /*
10631 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 10721 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10632 */ 10722 */
10633 spin_lock_irq(&cpu_rq(cpu)->lock); 10723 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10634 data = *cpuusage; 10724 data = *cpuusage;
10635 spin_unlock_irq(&cpu_rq(cpu)->lock); 10725 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10636#else 10726#else
10637 data = *cpuusage; 10727 data = *cpuusage;
10638#endif 10728#endif
@@ -10648,9 +10738,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10648 /* 10738 /*
10649 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 10739 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10650 */ 10740 */
10651 spin_lock_irq(&cpu_rq(cpu)->lock); 10741 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10652 *cpuusage = val; 10742 *cpuusage = val;
10653 spin_unlock_irq(&cpu_rq(cpu)->lock); 10743 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10654#else 10744#else
10655 *cpuusage = val; 10745 *cpuusage = val;
10656#endif 10746#endif
@@ -10884,9 +10974,9 @@ void synchronize_sched_expedited(void)
10884 init_completion(&req->done); 10974 init_completion(&req->done);
10885 req->task = NULL; 10975 req->task = NULL;
10886 req->dest_cpu = RCU_MIGRATION_NEED_QS; 10976 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10887 spin_lock_irqsave(&rq->lock, flags); 10977 raw_spin_lock_irqsave(&rq->lock, flags);
10888 list_add(&req->list, &rq->migration_queue); 10978 list_add(&req->list, &rq->migration_queue);
10889 spin_unlock_irqrestore(&rq->lock, flags); 10979 raw_spin_unlock_irqrestore(&rq->lock, flags);
10890 wake_up_process(rq->migration_thread); 10980 wake_up_process(rq->migration_thread);
10891 } 10981 }
10892 for_each_online_cpu(cpu) { 10982 for_each_online_cpu(cpu) {
@@ -10894,13 +10984,14 @@ void synchronize_sched_expedited(void)
10894 req = &per_cpu(rcu_migration_req, cpu); 10984 req = &per_cpu(rcu_migration_req, cpu);
10895 rq = cpu_rq(cpu); 10985 rq = cpu_rq(cpu);
10896 wait_for_completion(&req->done); 10986 wait_for_completion(&req->done);
10897 spin_lock_irqsave(&rq->lock, flags); 10987 raw_spin_lock_irqsave(&rq->lock, flags);
10898 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 10988 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10899 need_full_sync = 1; 10989 need_full_sync = 1;
10900 req->dest_cpu = RCU_MIGRATION_IDLE; 10990 req->dest_cpu = RCU_MIGRATION_IDLE;
10901 spin_unlock_irqrestore(&rq->lock, flags); 10991 raw_spin_unlock_irqrestore(&rq->lock, flags);
10902 } 10992 }
10903 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10993 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10994 synchronize_sched_expedited_count++;
10904 mutex_unlock(&rcu_sched_expedited_mutex); 10995 mutex_unlock(&rcu_sched_expedited_mutex);
10905 put_online_cpus(); 10996 put_online_cpus();
10906 if (need_full_sync) 10997 if (need_full_sync)
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 0f052fc674d5..597b33099dfa 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -135,26 +135,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
135 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
137 137
138 spin_lock_irqsave(&vec->lock, flags); 138 raw_spin_lock_irqsave(&vec->lock, flags);
139 139
140 cpumask_set_cpu(cpu, vec->mask); 140 cpumask_set_cpu(cpu, vec->mask);
141 vec->count++; 141 vec->count++;
142 if (vec->count == 1) 142 if (vec->count == 1)
143 set_bit(newpri, cp->pri_active); 143 set_bit(newpri, cp->pri_active);
144 144
145 spin_unlock_irqrestore(&vec->lock, flags); 145 raw_spin_unlock_irqrestore(&vec->lock, flags);
146 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) { 147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149 149
150 spin_lock_irqsave(&vec->lock, flags); 150 raw_spin_lock_irqsave(&vec->lock, flags);
151 151
152 vec->count--; 152 vec->count--;
153 if (!vec->count) 153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active); 154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask); 155 cpumask_clear_cpu(cpu, vec->mask);
156 156
157 spin_unlock_irqrestore(&vec->lock, flags); 157 raw_spin_unlock_irqrestore(&vec->lock, flags);
158 } 158 }
159 159
160 *currpri = newpri; 160 *currpri = newpri;
@@ -180,7 +180,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
181 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 181 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
182 182
183 spin_lock_init(&vec->lock); 183 raw_spin_lock_init(&vec->lock);
184 vec->count = 0; 184 vec->count = 0;
185 if (!zalloc_cpumask_var(&vec->mask, gfp)) 185 if (!zalloc_cpumask_var(&vec->mask, gfp))
186 goto cleanup; 186 goto cleanup;
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9a7e859b8fbf..7cb5bb6b95be 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -12,7 +12,7 @@
12/* values 2-101 are RT priorities 0-99 */ 12/* values 2-101 are RT priorities 0-99 */
13 13
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 raw_spinlock_t lock;
16 int count; 16 int count;
17 cpumask_var_t mask; 17 cpumask_var_t mask;
18}; 18};
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc43..67f95aada4b9 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
185 SPLIT_NS(cfs_rq->exec_clock)); 185 SPLIT_NS(cfs_rq->exec_clock));
186 186
187 spin_lock_irqsave(&rq->lock, flags); 187 raw_spin_lock_irqsave(&rq->lock, flags);
188 if (cfs_rq->rb_leftmost) 188 if (cfs_rq->rb_leftmost)
189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
190 last = __pick_last_entity(cfs_rq); 190 last = __pick_last_entity(cfs_rq);
@@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 raw_spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", 198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
285 285
286#ifdef CONFIG_SCHEDSTATS 286#ifdef CONFIG_SCHEDSTATS
287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
288#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
288 289
289 P(yld_count); 290 P(yld_count);
290 291
291 P(sched_switch); 292 P(sched_switch);
292 P(sched_count); 293 P(sched_count);
293 P(sched_goidle); 294 P(sched_goidle);
295#ifdef CONFIG_SMP
296 P64(avg_idle);
297#endif
294 298
295 P(ttwu_count); 299 P(ttwu_count);
296 P(ttwu_local); 300 P(ttwu_local);
@@ -305,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
305 print_rq(m, rq, cpu); 309 print_rq(m, rq, cpu);
306} 310}
307 311
312static const char *sched_tunable_scaling_names[] = {
313 "none",
314 "logaritmic",
315 "linear"
316};
317
308static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
309{ 319{
310 u64 now = ktime_to_ns(ktime_get()); 320 u64 now = ktime_to_ns(ktime_get());
@@ -330,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
330#undef PN 340#undef PN
331#undef P 341#undef P
332 342
343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
344 sysctl_sched_tunable_scaling,
345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
346
333 for_each_online_cpu(cpu) 347 for_each_online_cpu(cpu)
334 print_cpu(m, cpu); 348 print_cpu(m, cpu);
335 349
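The new line lands in the /proc/sched_debug output, so the active scaling mode can be checked at run time; with the defaults it prints roughly the following (illustrative output, padding trimmed), and the mode is expected to be writable through /proc/sys/kernel/sched_tunable_scaling via the sysctl.c part of this series, which is not shown here:

	.sysctl_sched_tunable_scaling            : 1 (logarithmic)
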
@@ -395,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 409 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 410 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 411 PN(se.avg_wakeup);
398 PN(se.avg_running);
399 412
400 nr_switches = p->nvcsw + p->nivcsw; 413 nr_switches = p->nvcsw + p->nivcsw;
401 414
@@ -419,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
419 P(se.nr_failed_migrations_running); 432 P(se.nr_failed_migrations_running);
420 P(se.nr_failed_migrations_hot); 433 P(se.nr_failed_migrations_hot);
421 P(se.nr_forced_migrations); 434 P(se.nr_forced_migrations);
422 P(se.nr_forced2_migrations);
423 P(se.nr_wakeups); 435 P(se.nr_wakeups);
424 P(se.nr_wakeups_sync); 436 P(se.nr_wakeups_sync);
425 P(se.nr_wakeups_migrate); 437 P(se.nr_wakeups_migrate);
@@ -495,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
495 p->se.nr_failed_migrations_running = 0; 507 p->se.nr_failed_migrations_running = 0;
496 p->se.nr_failed_migrations_hot = 0; 508 p->se.nr_failed_migrations_hot = 0;
497 p->se.nr_forced_migrations = 0; 509 p->se.nr_forced_migrations = 0;
498 p->se.nr_forced2_migrations = 0;
499 p->se.nr_wakeups = 0; 510 p->se.nr_wakeups = 0;
500 p->se.nr_wakeups_sync = 0; 511 p->se.nr_wakeups_sync = 0;
501 p->se.nr_wakeups_migrate = 0; 512 p->se.nr_wakeups_migrate = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37087a7fac22..5bedf6e3ebf3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h>
24 25
25/* 26/*
26 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
36 */ 37 */
37unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 5000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL;
40
41/*
42 * The initial- and re-scaling of tunables is configurable
43 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
44 *
45 * Options are:
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
48 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
49 */
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
38 52
39/* 53/*
40 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 56 */
43unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 1000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44 59
45/* 60/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70 * have immediate wakeup/sleep latencies. 85 * have immediate wakeup/sleep latencies.
71 */ 86 */
72unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73 89
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 91
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
383 */ 399 */
384 400
385#ifdef CONFIG_SCHED_DEBUG 401#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 402int sched_proc_update_handler(struct ctl_table *table, int write,
387 void __user *buffer, size_t *lenp, 403 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 404 loff_t *ppos)
389{ 405{
390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 406 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
407 int factor = get_update_sysctl_factor();
391 408
392 if (ret || !write) 409 if (ret || !write)
393 return ret; 410 return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 412 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396 sysctl_sched_min_granularity); 413 sysctl_sched_min_granularity);
397 414
415#define WRT_SYSCTL(name) \
416 (normalized_sysctl_##name = sysctl_##name / (factor))
417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL
422
398 return 0; 423 return 0;
399} 424}
400#endif 425#endif
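WRT_SYSCTL() is the inverse of the SET_SYSCTL() macro in sched.c: a value written through the proc handler is divided by the current factor and stored as the normalized baseline, so a later update_sysctl() (triggered from rq_online_fair()/rq_offline_fair() below) rescales from that baseline instead of compounding on every call. A small worked sketch with the default LOG scaling on 8 CPUs (factor 4), assuming the usual /proc/sys/kernel/sched_latency_ns knob:

	/*
	 * echo 40000000 > /proc/sys/kernel/sched_latency_ns
	 *	sysctl_sched_latency            = 40 ms  (as written)
	 *	normalized_sysctl_sched_latency = 40 / 4 = 10 ms
	 *
	 * a later update_sysctl() with the same factor:
	 *	sysctl_sched_latency            = 4 * 10 = 40 ms  (stable, not 160 ms)
	 */
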
@@ -1345,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1345} 1370}
1346 1371
1347/* 1372/*
1373 * Try and locate an idle CPU in the sched_domain.
1374 */
1375static int
1376select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1377{
1378 int cpu = smp_processor_id();
1379 int prev_cpu = task_cpu(p);
1380 int i;
1381
1382 /*
1383 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
1384 * test in select_task_rq_fair) and the prev_cpu is idle then that's
1385 * always a better target than the current cpu.
1386 */
1387 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
1388 return prev_cpu;
1389
1390 /*
1391 * Otherwise, iterate the domain and find an eligible idle cpu.
1392 */
1393 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1394 if (!cpu_rq(i)->cfs.nr_running) {
1395 target = i;
1396 break;
1397 }
1398 }
1399
1400 return target;
1401}
1402
1403/*
1348 * sched_balance_self: balance the current task (running on cpu) in domains 1404 * sched_balance_self: balance the current task (running on cpu) in domains
1349 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1405 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1350 * SD_BALANCE_EXEC. 1406 * SD_BALANCE_EXEC.
@@ -1372,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1372 new_cpu = prev_cpu; 1428 new_cpu = prev_cpu;
1373 } 1429 }
1374 1430
1375 rcu_read_lock();
1376 for_each_domain(cpu, tmp) { 1431 for_each_domain(cpu, tmp) {
1377 /* 1432 /*
1378 * If power savings logic is enabled for a domain, see if we 1433 * If power savings logic is enabled for a domain, see if we
@@ -1398,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1398 want_sd = 0; 1453 want_sd = 0;
1399 } 1454 }
1400 1455
1401 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 1456 /*
1402 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 1457 * While iterating the domains looking for a spanning
1458 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
1459 * in cache sharing domains along the way.
1460 */
1461 if (want_affine) {
1462 int target = -1;
1463
1464 /*
1465 * If both cpu and prev_cpu are part of this domain,
1466 * cpu is a valid SD_WAKE_AFFINE target.
1467 */
1468 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1469 target = cpu;
1403 1470
1404 affine_sd = tmp; 1471 /*
1405 want_affine = 0; 1472 * If there's an idle sibling in this domain, make that
1473 * the wake_affine target instead of the current cpu.
1474 */
1475 if (tmp->flags & SD_PREFER_SIBLING)
1476 target = select_idle_sibling(p, tmp, target);
1477
1478 if (target >= 0) {
1479 if (tmp->flags & SD_WAKE_AFFINE) {
1480 affine_sd = tmp;
1481 want_affine = 0;
1482 }
1483 cpu = target;
1484 }
1406 } 1485 }
1407 1486
1408 if (!want_sd && !want_affine) 1487 if (!want_sd && !want_affine)
@@ -1429,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1429 update_shares(tmp); 1508 update_shares(tmp);
1430 } 1509 }
1431 1510
1432 if (affine_sd && wake_affine(affine_sd, p, sync)) { 1511 if (affine_sd && wake_affine(affine_sd, p, sync))
1433 new_cpu = cpu; 1512 return cpu;
1434 goto out;
1435 }
1436 1513
1437 while (sd) { 1514 while (sd) {
1438 int load_idx = sd->forkexec_idx; 1515 int load_idx = sd->forkexec_idx;
@@ -1473,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1473 /* while loop will break here if sd == NULL */ 1550 /* while loop will break here if sd == NULL */
1474 } 1551 }
1475 1552
1476out:
1477 rcu_read_unlock();
1478 return new_cpu; 1553 return new_cpu;
1479} 1554}
1480#endif /* CONFIG_SMP */ 1555#endif /* CONFIG_SMP */
@@ -1596,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1596 int sync = wake_flags & WF_SYNC; 1671 int sync = wake_flags & WF_SYNC;
1597 int scale = cfs_rq->nr_running >= sched_nr_latency; 1672 int scale = cfs_rq->nr_running >= sched_nr_latency;
1598 1673
1599 update_curr(cfs_rq); 1674 if (unlikely(rt_prio(p->prio)))
1600 1675 goto preempt;
1601 if (unlikely(rt_prio(p->prio))) {
1602 resched_task(curr);
1603 return;
1604 }
1605 1676
1606 if (unlikely(p->sched_class != &fair_sched_class)) 1677 if (unlikely(p->sched_class != &fair_sched_class))
1607 return; 1678 return;
@@ -1627,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1627 return; 1698 return;
1628 1699
1629 /* Idle tasks are by definition preempted by everybody. */ 1700 /* Idle tasks are by definition preempted by everybody. */
1630 if (unlikely(curr->policy == SCHED_IDLE)) { 1701 if (unlikely(curr->policy == SCHED_IDLE))
1631 resched_task(curr); 1702 goto preempt;
1632 return;
1633 }
1634 1703
1635 if ((sched_feat(WAKEUP_SYNC) && sync) || 1704 if (sched_feat(WAKEUP_SYNC) && sync)
1636 (sched_feat(WAKEUP_OVERLAP) && 1705 goto preempt;
1637 (se->avg_overlap < sysctl_sched_migration_cost &&
1638 pse->avg_overlap < sysctl_sched_migration_cost))) {
1639 resched_task(curr);
1640 return;
1641 }
1642 1706
1643 if (sched_feat(WAKEUP_RUNNING)) { 1707 if (sched_feat(WAKEUP_OVERLAP) &&
1644 if (pse->avg_running < se->avg_running) { 1708 se->avg_overlap < sysctl_sched_migration_cost &&
1645 set_next_buddy(pse); 1709 pse->avg_overlap < sysctl_sched_migration_cost)
1646 resched_task(curr); 1710 goto preempt;
1647 return;
1648 }
1649 }
1650 1711
1651 if (!sched_feat(WAKEUP_PREEMPT)) 1712 if (!sched_feat(WAKEUP_PREEMPT))
1652 return; 1713 return;
1653 1714
1715 update_curr(cfs_rq);
1654 find_matching_se(&se, &pse); 1716 find_matching_se(&se, &pse);
1655
1656 BUG_ON(!pse); 1717 BUG_ON(!pse);
1718 if (wakeup_preempt_entity(se, pse) == 1)
1719 goto preempt;
1657 1720
1658 if (wakeup_preempt_entity(se, pse) == 1) { 1721 return;
1659 resched_task(curr); 1722
1660 /* 1723preempt:
1661 * Only set the backward buddy when the current task is still 1724 resched_task(curr);
1662 * on the rq. This can happen when a wakeup gets interleaved 1725 /*
1663 * with schedule on the ->pre_schedule() or idle_balance() 1726 * Only set the backward buddy when the current task is still
1664 * point, either of which can drop the rq lock. 1728 * with schedule on the ->pre_schedule() or idle_balance()
1665 * 1729 * point, either of which can drop the rq lock.
1666 * Also, during early boot the idle thread is in the fair class, 1729 * point, either of which can * drop the rq lock.
1667 * for obvious reasons its a bad idea to schedule back to it. 1730 *
1668 */ 1731 * Also, during early boot the idle thread is in the fair class,
1669 if (unlikely(!se->on_rq || curr == rq->idle)) 1732 * for obvious reasons its a bad idea to schedule back to it.
1670 return; 1733 */
1671 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) 1734 if (unlikely(!se->on_rq || curr == rq->idle))
1672 set_last_buddy(se); 1735 return;
1673 } 1736
1737 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1738 set_last_buddy(se);
1674} 1739}
1675 1740
1676static struct task_struct *pick_next_task_fair(struct rq *rq) 1741static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1679,7 +1744,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1679 struct cfs_rq *cfs_rq = &rq->cfs; 1744 struct cfs_rq *cfs_rq = &rq->cfs;
1680 struct sched_entity *se; 1745 struct sched_entity *se;
1681 1746
1682 if (unlikely(!cfs_rq->nr_running)) 1747 if (!cfs_rq->nr_running)
1683 return NULL; 1748 return NULL;
1684 1749
1685 do { 1750 do {
@@ -1850,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1850 1915
1851 return 0; 1916 return 0;
1852} 1917}
1918
1919static void rq_online_fair(struct rq *rq)
1920{
1921 update_sysctl();
1922}
1923
1924static void rq_offline_fair(struct rq *rq)
1925{
1926 update_sysctl();
1927}
1928
1853#endif /* CONFIG_SMP */ 1929#endif /* CONFIG_SMP */
1854 1930
1855/* 1931/*
@@ -1867,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1867} 1943}
1868 1944
1869/* 1945/*
1870 * Share the fairness runtime between parent and child, thus the 1946 * called on fork with the child task as argument from the parent's context
1871 * total amount of pressure for CPU stays equal - new tasks 1947 * - child not yet on the tasklist
1872 * get a chance to run but frequent forkers are not allowed to 1948 * - preemption disabled
1873 * monopolize the CPU. Note: the parent runqueue is locked,
1874 * the child is not running yet.
1875 */ 1949 */
1876static void task_new_fair(struct rq *rq, struct task_struct *p) 1950static void task_fork_fair(struct task_struct *p)
1877{ 1951{
1878 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1952 struct cfs_rq *cfs_rq = task_cfs_rq(current);
1879 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 1953 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1880 int this_cpu = smp_processor_id(); 1954 int this_cpu = smp_processor_id();
1955 struct rq *rq = this_rq();
1956 unsigned long flags;
1957
1958 raw_spin_lock_irqsave(&rq->lock, flags);
1881 1959
1882 sched_info_queued(p); 1960 if (unlikely(task_cpu(p) != this_cpu))
1961 __set_task_cpu(p, this_cpu);
1883 1962
1884 update_curr(cfs_rq); 1963 update_curr(cfs_rq);
1964
1885 if (curr) 1965 if (curr)
1886 se->vruntime = curr->vruntime; 1966 se->vruntime = curr->vruntime;
1887 place_entity(cfs_rq, se, 1); 1967 place_entity(cfs_rq, se, 1);
1888 1968
1889 /* 'curr' will be NULL if the child belongs to a different group */ 1969 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1890 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1891 curr && entity_before(curr, se)) {
1892 /* 1970 /*
1893 * Upon rescheduling, sched_class::put_prev_task() will place 1971 * Upon rescheduling, sched_class::put_prev_task() will place
1894 * 'current' within the tree based on its new key value. 1972 * 'current' within the tree based on its new key value.
@@ -1897,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1897 resched_task(rq->curr); 1975 resched_task(rq->curr);
1898 } 1976 }
1899 1977
1900 enqueue_task_fair(rq, p, 0); 1978 raw_spin_unlock_irqrestore(&rq->lock, flags);
1901} 1979}
1902 1980
1903/* 1981/*
@@ -1959,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
1959} 2037}
1960#endif 2038#endif
1961 2039
1962unsigned int get_rr_interval_fair(struct task_struct *task) 2040unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
1963{ 2041{
1964 struct sched_entity *se = &task->se; 2042 struct sched_entity *se = &task->se;
1965 unsigned long flags;
1966 struct rq *rq;
1967 unsigned int rr_interval = 0; 2043 unsigned int rr_interval = 0;
1968 2044
1969 /* 2045 /*
1970 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 2046 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1971 * idle runqueue: 2047 * idle runqueue:
1972 */ 2048 */
1973 rq = task_rq_lock(task, &flags);
1974 if (rq->cfs.load.weight) 2049 if (rq->cfs.load.weight)
1975 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 2050 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1976 task_rq_unlock(rq, &flags);
1977 2051
1978 return rr_interval; 2052 return rr_interval;
1979} 2053}
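
The reworked get_rr_interval_fair() above takes the runqueue from its caller (so the task_rq_lock/unlock pair could go) and just converts a fair-class slice from nanoseconds to jiffies. A minimal user-space sketch of that conversion, assuming the usual NS_TO_JIFFIES(ns) = ns / (NSEC_PER_SEC / HZ) definition and HZ=250; both are assumptions made for the demo, not taken from this diff:

#include <stdio.h>

#define HZ           250UL
#define NSEC_PER_SEC 1000000000UL

/* assumed shape of NS_TO_JIFFIES(): integer division by the tick length */
static unsigned long ns_to_jiffies(unsigned long long ns)
{
    return ns / (NSEC_PER_SEC / HZ);
}

int main(void)
{
    /* at HZ=250 a jiffy is 4 ms, so a 6 ms slice rounds down to 1 jiffy */
    printf("6000000 ns  -> %lu jiffies\n", ns_to_jiffies(6000000ULL));
    printf("12000000 ns -> %lu jiffies\n", ns_to_jiffies(12000000ULL));
    return 0;
}
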
@@ -1997,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
1997 2071
1998 .load_balance = load_balance_fair, 2072 .load_balance = load_balance_fair,
1999 .move_one_task = move_one_task_fair, 2073 .move_one_task = move_one_task_fair,
2074 .rq_online = rq_online_fair,
2075 .rq_offline = rq_offline_fair,
2000#endif 2076#endif
2001 2077
2002 .set_curr_task = set_curr_task_fair, 2078 .set_curr_task = set_curr_task_fair,
2003 .task_tick = task_tick_fair, 2079 .task_tick = task_tick_fair,
2004 .task_new = task_new_fair, 2080 .task_fork = task_fork_fair,
2005 2081
2006 .prio_changed = prio_changed_fair, 2082 .prio_changed = prio_changed_fair,
2007 .switched_to = switched_to_fair, 2083 .switched_to = switched_to_fair,
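
The new task_fork_fair() above decides whether the freshly forked child should run first by comparing parent and child with entity_before(), which elsewhere in sched_fair.c is a signed-difference comparison of vruntimes so the ordering survives a 64-bit wrap. A small user-space model of that comparison; the struct below is a stand-in, not the kernel's sched_entity, and the values are made up:

#include <stdint.h>
#include <stdio.h>

struct entity {
    uint64_t vruntime;    /* monotonically increasing, may eventually wrap */
};

/* wrap-safe "a runs before b": the signed difference of two nearby
 * unsigned counters stays small even across the wrap point */
static int entity_before(const struct entity *a, const struct entity *b)
{
    return (int64_t)(a->vruntime - b->vruntime) < 0;
}

int main(void)
{
    struct entity parent = { .vruntime = UINT64_MAX - 5 };  /* about to wrap */
    struct entity child  = { .vruntime = 10 };              /* already wrapped */

    /* a plain '<' would claim the child is far behind the parent; the
     * signed difference correctly keeps the parent first in line */
    printf("child before parent: %d\n", entity_before(&child, &parent));
    printf("parent before child: %d\n", entity_before(&parent, &child));
    return 0;
}
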
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c7..d5059fd761d9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
54SCHED_FEAT(WAKEUP_OVERLAP, 0) 54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55 55
56/* 56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate 57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and 58 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see 59 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde3..5f93b570d383 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -34,10 +34,10 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
36{ 36{
37 spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
39 dump_stack(); 39 dump_stack();
40 spin_lock_irq(&rq->lock); 40 raw_spin_lock_irq(&rq->lock);
41} 41}
42 42
43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task) 100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 101{
102 return 0; 102 return 0;
103} 103}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790cddb19..d2ea2828164e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -327,7 +327,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
327 327
328 weight = cpumask_weight(rd->span); 328 weight = cpumask_weight(rd->span);
329 329
330 spin_lock(&rt_b->rt_runtime_lock); 330 raw_spin_lock(&rt_b->rt_runtime_lock);
331 rt_period = ktime_to_ns(rt_b->rt_period); 331 rt_period = ktime_to_ns(rt_b->rt_period);
332 for_each_cpu(i, rd->span) { 332 for_each_cpu(i, rd->span) {
333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
@@ -336,7 +336,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
336 if (iter == rt_rq) 336 if (iter == rt_rq)
337 continue; 337 continue;
338 338
339 spin_lock(&iter->rt_runtime_lock); 339 raw_spin_lock(&iter->rt_runtime_lock);
340 /* 340 /*
341 * Either all rqs have inf runtime and there's nothing to steal 341 * Either all rqs have inf runtime and there's nothing to steal
342 * or __disable_runtime() below sets a specific rq to inf to 342 * or __disable_runtime() below sets a specific rq to inf to
@@ -358,14 +358,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
358 rt_rq->rt_runtime += diff; 358 rt_rq->rt_runtime += diff;
359 more = 1; 359 more = 1;
360 if (rt_rq->rt_runtime == rt_period) { 360 if (rt_rq->rt_runtime == rt_period) {
361 spin_unlock(&iter->rt_runtime_lock); 361 raw_spin_unlock(&iter->rt_runtime_lock);
362 break; 362 break;
363 } 363 }
364 } 364 }
365next: 365next:
366 spin_unlock(&iter->rt_runtime_lock); 366 raw_spin_unlock(&iter->rt_runtime_lock);
367 } 367 }
368 spin_unlock(&rt_b->rt_runtime_lock); 368 raw_spin_unlock(&rt_b->rt_runtime_lock);
369 369
370 return more; 370 return more;
371} 371}
@@ -386,8 +386,8 @@ static void __disable_runtime(struct rq *rq)
386 s64 want; 386 s64 want;
387 int i; 387 int i;
388 388
389 spin_lock(&rt_b->rt_runtime_lock); 389 raw_spin_lock(&rt_b->rt_runtime_lock);
390 spin_lock(&rt_rq->rt_runtime_lock); 390 raw_spin_lock(&rt_rq->rt_runtime_lock);
391 /* 391 /*
392 * Either we're all inf and nobody needs to borrow, or we're 392 * Either we're all inf and nobody needs to borrow, or we're
393 * already disabled and thus have nothing to do, or we have 393 * already disabled and thus have nothing to do, or we have
@@ -396,7 +396,7 @@ static void __disable_runtime(struct rq *rq)
396 if (rt_rq->rt_runtime == RUNTIME_INF || 396 if (rt_rq->rt_runtime == RUNTIME_INF ||
397 rt_rq->rt_runtime == rt_b->rt_runtime) 397 rt_rq->rt_runtime == rt_b->rt_runtime)
398 goto balanced; 398 goto balanced;
399 spin_unlock(&rt_rq->rt_runtime_lock); 399 raw_spin_unlock(&rt_rq->rt_runtime_lock);
400 400
401 /* 401 /*
402 * Calculate the difference between what we started out with 402 * Calculate the difference between what we started out with
@@ -418,7 +418,7 @@ static void __disable_runtime(struct rq *rq)
418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
419 continue; 419 continue;
420 420
421 spin_lock(&iter->rt_runtime_lock); 421 raw_spin_lock(&iter->rt_runtime_lock);
422 if (want > 0) { 422 if (want > 0) {
423 diff = min_t(s64, iter->rt_runtime, want); 423 diff = min_t(s64, iter->rt_runtime, want);
424 iter->rt_runtime -= diff; 424 iter->rt_runtime -= diff;
@@ -427,13 +427,13 @@ static void __disable_runtime(struct rq *rq)
427 iter->rt_runtime -= want; 427 iter->rt_runtime -= want;
428 want -= want; 428 want -= want;
429 } 429 }
430 spin_unlock(&iter->rt_runtime_lock); 430 raw_spin_unlock(&iter->rt_runtime_lock);
431 431
432 if (!want) 432 if (!want)
433 break; 433 break;
434 } 434 }
435 435
436 spin_lock(&rt_rq->rt_runtime_lock); 436 raw_spin_lock(&rt_rq->rt_runtime_lock);
437 /* 437 /*
438 * We cannot be left wanting - that would mean some runtime 438 * We cannot be left wanting - that would mean some runtime
439 * leaked out of the system. 439 * leaked out of the system.
@@ -445,8 +445,8 @@ balanced:
445 * runtime - in which case borrowing doesn't make sense. 445 * runtime - in which case borrowing doesn't make sense.
446 */ 446 */
447 rt_rq->rt_runtime = RUNTIME_INF; 447 rt_rq->rt_runtime = RUNTIME_INF;
448 spin_unlock(&rt_rq->rt_runtime_lock); 448 raw_spin_unlock(&rt_rq->rt_runtime_lock);
449 spin_unlock(&rt_b->rt_runtime_lock); 449 raw_spin_unlock(&rt_b->rt_runtime_lock);
450 } 450 }
451} 451}
452 452
@@ -454,9 +454,9 @@ static void disable_runtime(struct rq *rq)
454{ 454{
455 unsigned long flags; 455 unsigned long flags;
456 456
457 spin_lock_irqsave(&rq->lock, flags); 457 raw_spin_lock_irqsave(&rq->lock, flags);
458 __disable_runtime(rq); 458 __disable_runtime(rq);
459 spin_unlock_irqrestore(&rq->lock, flags); 459 raw_spin_unlock_irqrestore(&rq->lock, flags);
460} 460}
461 461
462static void __enable_runtime(struct rq *rq) 462static void __enable_runtime(struct rq *rq)
@@ -472,13 +472,13 @@ static void __enable_runtime(struct rq *rq)
472 for_each_leaf_rt_rq(rt_rq, rq) { 472 for_each_leaf_rt_rq(rt_rq, rq) {
473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
474 474
475 spin_lock(&rt_b->rt_runtime_lock); 475 raw_spin_lock(&rt_b->rt_runtime_lock);
476 spin_lock(&rt_rq->rt_runtime_lock); 476 raw_spin_lock(&rt_rq->rt_runtime_lock);
477 rt_rq->rt_runtime = rt_b->rt_runtime; 477 rt_rq->rt_runtime = rt_b->rt_runtime;
478 rt_rq->rt_time = 0; 478 rt_rq->rt_time = 0;
479 rt_rq->rt_throttled = 0; 479 rt_rq->rt_throttled = 0;
480 spin_unlock(&rt_rq->rt_runtime_lock); 480 raw_spin_unlock(&rt_rq->rt_runtime_lock);
481 spin_unlock(&rt_b->rt_runtime_lock); 481 raw_spin_unlock(&rt_b->rt_runtime_lock);
482 } 482 }
483} 483}
484 484
@@ -486,9 +486,9 @@ static void enable_runtime(struct rq *rq)
486{ 486{
487 unsigned long flags; 487 unsigned long flags;
488 488
489 spin_lock_irqsave(&rq->lock, flags); 489 raw_spin_lock_irqsave(&rq->lock, flags);
490 __enable_runtime(rq); 490 __enable_runtime(rq);
491 spin_unlock_irqrestore(&rq->lock, flags); 491 raw_spin_unlock_irqrestore(&rq->lock, flags);
492} 492}
493 493
494static int balance_runtime(struct rt_rq *rt_rq) 494static int balance_runtime(struct rt_rq *rt_rq)
@@ -496,9 +496,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
496 int more = 0; 496 int more = 0;
497 497
498 if (rt_rq->rt_time > rt_rq->rt_runtime) { 498 if (rt_rq->rt_time > rt_rq->rt_runtime) {
499 spin_unlock(&rt_rq->rt_runtime_lock); 499 raw_spin_unlock(&rt_rq->rt_runtime_lock);
500 more = do_balance_runtime(rt_rq); 500 more = do_balance_runtime(rt_rq);
501 spin_lock(&rt_rq->rt_runtime_lock); 501 raw_spin_lock(&rt_rq->rt_runtime_lock);
502 } 502 }
503 503
504 return more; 504 return more;
@@ -524,11 +524,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
525 struct rq *rq = rq_of_rt_rq(rt_rq); 525 struct rq *rq = rq_of_rt_rq(rt_rq);
526 526
527 spin_lock(&rq->lock); 527 raw_spin_lock(&rq->lock);
528 if (rt_rq->rt_time) { 528 if (rt_rq->rt_time) {
529 u64 runtime; 529 u64 runtime;
530 530
531 spin_lock(&rt_rq->rt_runtime_lock); 531 raw_spin_lock(&rt_rq->rt_runtime_lock);
532 if (rt_rq->rt_throttled) 532 if (rt_rq->rt_throttled)
533 balance_runtime(rt_rq); 533 balance_runtime(rt_rq);
534 runtime = rt_rq->rt_runtime; 534 runtime = rt_rq->rt_runtime;
@@ -539,13 +539,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
539 } 539 }
540 if (rt_rq->rt_time || rt_rq->rt_nr_running) 540 if (rt_rq->rt_time || rt_rq->rt_nr_running)
541 idle = 0; 541 idle = 0;
542 spin_unlock(&rt_rq->rt_runtime_lock); 542 raw_spin_unlock(&rt_rq->rt_runtime_lock);
543 } else if (rt_rq->rt_nr_running) 543 } else if (rt_rq->rt_nr_running)
544 idle = 0; 544 idle = 0;
545 545
546 if (enqueue) 546 if (enqueue)
547 sched_rt_rq_enqueue(rt_rq); 547 sched_rt_rq_enqueue(rt_rq);
548 spin_unlock(&rq->lock); 548 raw_spin_unlock(&rq->lock);
549 } 549 }
550 550
551 return idle; 551 return idle;
@@ -624,11 +624,11 @@ static void update_curr_rt(struct rq *rq)
624 rt_rq = rt_rq_of_se(rt_se); 624 rt_rq = rt_rq_of_se(rt_se);
625 625
626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
627 spin_lock(&rt_rq->rt_runtime_lock); 627 raw_spin_lock(&rt_rq->rt_runtime_lock);
628 rt_rq->rt_time += delta_exec; 628 rt_rq->rt_time += delta_exec;
629 if (sched_rt_runtime_exceeded(rt_rq)) 629 if (sched_rt_runtime_exceeded(rt_rq))
630 resched_task(curr); 630 resched_task(curr);
631 spin_unlock(&rt_rq->rt_runtime_lock); 631 raw_spin_unlock(&rt_rq->rt_runtime_lock);
632 } 632 }
633 } 633 }
634} 634}
@@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1153 1153
1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1155 1155
1156static inline int pick_optimal_cpu(int this_cpu,
1157 const struct cpumask *mask)
1158{
1159 int first;
1160
1161 /* "this_cpu" is cheaper to preempt than a remote processor */
1162 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1163 return this_cpu;
1164
1165 first = cpumask_first(mask);
1166 if (first < nr_cpu_ids)
1167 return first;
1168
1169 return -1;
1170}
1171
1172static int find_lowest_rq(struct task_struct *task) 1156static int find_lowest_rq(struct task_struct *task)
1173{ 1157{
1174 struct sched_domain *sd; 1158 struct sched_domain *sd;
1175 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1159 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1176 int this_cpu = smp_processor_id(); 1160 int this_cpu = smp_processor_id();
1177 int cpu = task_cpu(task); 1161 int cpu = task_cpu(task);
1178 cpumask_var_t domain_mask;
1179 1162
1180 if (task->rt.nr_cpus_allowed == 1) 1163 if (task->rt.nr_cpus_allowed == 1)
1181 return -1; /* No other targets possible */ 1164 return -1; /* No other targets possible */
@@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
1198 * Otherwise, we consult the sched_domains span maps to figure 1181 * Otherwise, we consult the sched_domains span maps to figure
1199 * out which cpu is logically closest to our hot cache data. 1182 * out which cpu is logically closest to our hot cache data.
1200 */ 1183 */
1201 if (this_cpu == cpu) 1184 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1202 this_cpu = -1; /* Skip this_cpu opt if the same */ 1185 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1203
1204 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1205 for_each_domain(cpu, sd) {
1206 if (sd->flags & SD_WAKE_AFFINE) {
1207 int best_cpu;
1208 1186
1209 cpumask_and(domain_mask, 1187 for_each_domain(cpu, sd) {
1210 sched_domain_span(sd), 1188 if (sd->flags & SD_WAKE_AFFINE) {
1211 lowest_mask); 1189 int best_cpu;
1212 1190
1213 best_cpu = pick_optimal_cpu(this_cpu, 1191 /*
1214 domain_mask); 1192 * "this_cpu" is cheaper to preempt than a
1215 1193 * remote processor.
1216 if (best_cpu != -1) { 1194 */
1217 free_cpumask_var(domain_mask); 1195 if (this_cpu != -1 &&
1218 return best_cpu; 1196 cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
1219 } 1197 return this_cpu;
1220 } 1198
1199 best_cpu = cpumask_first_and(lowest_mask,
1200 sched_domain_span(sd));
1201 if (best_cpu < nr_cpu_ids)
1202 return best_cpu;
1221 } 1203 }
1222 free_cpumask_var(domain_mask);
1223 } 1204 }
1224 1205
1225 /* 1206 /*
@@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
1227 * just give the caller *something* to work with from the compatible 1208 * just give the caller *something* to work with from the compatible
1228 * locations. 1209 * locations.
1229 */ 1210 */
1230 return pick_optimal_cpu(this_cpu, lowest_mask); 1211 if (this_cpu != -1)
1212 return this_cpu;
1213
1214 cpu = cpumask_any(lowest_mask);
1215 if (cpu < nr_cpu_ids)
1216 return cpu;
1217 return -1;
1231} 1218}
1232 1219
1233/* Will lock the rq it finds */ 1220/* Will lock the rq it finds */
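
The reworked find_lowest_rq() above no longer allocates a temporary cpumask; it prefers this_cpu when it is both in the lowest-priority mask and in the domain span, and otherwise takes cpumask_first_and() of the two masks. A rough user-space model of that selection order, with plain 64-bit words standing in for struct cpumask and made-up CPU numbers (__builtin_ctzll() is a GCC/Clang builtin used here in place of cpumask_first_and()):

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 64

/* first CPU set in both masks, or NR_CPUS when the intersection is empty */
static int first_and(uint64_t a, uint64_t b)
{
    uint64_t both = a & b;
    return both ? __builtin_ctzll(both) : NR_CPUS;
}

static int pick_cpu(int this_cpu, uint64_t lowest_mask, uint64_t domain_span)
{
    /* "this_cpu" is cheaper to preempt than a remote processor */
    if (this_cpu != -1 && (domain_span & (1ULL << this_cpu)))
        return this_cpu;

    int best = first_and(lowest_mask, domain_span);
    return best < NR_CPUS ? best : -1;
}

int main(void)
{
    uint64_t lowest = (1ULL << 2) | (1ULL << 5) | (1ULL << 9);
    uint64_t span   = (1ULL << 4) | (1ULL << 5) | (1ULL << 6);
    int this_cpu = 5;

    /* mirror the new code: this_cpu only counts if it is in the lowest mask */
    if (!(lowest & (1ULL << this_cpu)))
        this_cpu = -1;

    printf("chosen cpu: %d\n", pick_cpu(this_cpu, lowest, span));
    return 0;
}
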
@@ -1259,7 +1246,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1259 task_running(rq, task) || 1246 task_running(rq, task) ||
1260 !task->se.on_rq)) { 1247 !task->se.on_rq)) {
1261 1248
1262 spin_unlock(&lowest_rq->lock); 1249 raw_spin_unlock(&lowest_rq->lock);
1263 lowest_rq = NULL; 1250 lowest_rq = NULL;
1264 break; 1251 break;
1265 } 1252 }
@@ -1734,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
1734 dequeue_pushable_task(rq, p); 1721 dequeue_pushable_task(rq, p);
1735} 1722}
1736 1723
1737unsigned int get_rr_interval_rt(struct task_struct *task) 1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1738{ 1725{
1739 /* 1726 /*
1740 * Time slice is 0 for SCHED_FIFO tasks 1727 * Time slice is 0 for SCHED_FIFO tasks
diff --git a/kernel/signal.c b/kernel/signal.c
index 93e72e5feae6..1814e68e4de3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/ratelimit.h>
25#include <linux/tracehook.h> 26#include <linux/tracehook.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
27#include <linux/freezer.h> 28#include <linux/freezer.h>
@@ -42,6 +43,8 @@
42 43
43static struct kmem_cache *sigqueue_cachep; 44static struct kmem_cache *sigqueue_cachep;
44 45
46int print_fatal_signals __read_mostly;
47
45static void __user *sig_handler(struct task_struct *t, int sig) 48static void __user *sig_handler(struct task_struct *t, int sig)
46{ 49{
47 return t->sighand->action[sig - 1].sa.sa_handler; 50 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -160,7 +163,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
160{ 163{
161 unsigned long i, *s, *m, x; 164 unsigned long i, *s, *m, x;
162 int sig = 0; 165 int sig = 0;
163 166
164 s = pending->signal.sig; 167 s = pending->signal.sig;
165 m = mask->sig; 168 m = mask->sig;
166 switch (_NSIG_WORDS) { 169 switch (_NSIG_WORDS) {
@@ -185,17 +188,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
185 sig = ffz(~x) + 1; 188 sig = ffz(~x) + 1;
186 break; 189 break;
187 } 190 }
188 191
189 return sig; 192 return sig;
190} 193}
191 194
195static inline void print_dropped_signal(int sig)
196{
197 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
198
199 if (!print_fatal_signals)
200 return;
201
202 if (!__ratelimit(&ratelimit_state))
203 return;
204
205 printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
206 current->comm, current->pid, sig);
207}
208
192/* 209/*
193 * allocate a new signal queue record 210 * allocate a new signal queue record
194 * - this may be called without locks if and only if t == current, otherwise an 211 * - this may be called without locks if and only if t == current, otherwise an
195 * appropriate lock must be held to stop the target task from exiting 212 * appropriate lock must be held to stop the target task from exiting
196 */ 213 */
197static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 214static struct sigqueue *
198 int override_rlimit) 215__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
199{ 216{
200 struct sigqueue *q = NULL; 217 struct sigqueue *q = NULL;
201 struct user_struct *user; 218 struct user_struct *user;
@@ -208,10 +225,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
208 */ 225 */
209 user = get_uid(__task_cred(t)->user); 226 user = get_uid(__task_cred(t)->user);
210 atomic_inc(&user->sigpending); 227 atomic_inc(&user->sigpending);
228
211 if (override_rlimit || 229 if (override_rlimit ||
212 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
213 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
214 q = kmem_cache_alloc(sigqueue_cachep, flags); 232 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else {
234 print_dropped_signal(sig);
235 }
236
215 if (unlikely(q == NULL)) { 237 if (unlikely(q == NULL)) {
216 atomic_dec(&user->sigpending); 238 atomic_dec(&user->sigpending);
217 free_uid(user); 239 free_uid(user);
@@ -401,7 +423,7 @@ still_pending:
401 */ 423 */
402 info->si_signo = sig; 424 info->si_signo = sig;
403 info->si_errno = 0; 425 info->si_errno = 0;
404 info->si_code = 0; 426 info->si_code = SI_USER;
405 info->si_pid = 0; 427 info->si_pid = 0;
406 info->si_uid = 0; 428 info->si_uid = 0;
407 } 429 }
@@ -585,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
585 return 1; 607 return 1;
586} 608}
587 609
610static inline int is_si_special(const struct siginfo *info)
611{
612 return info <= SEND_SIG_FORCED;
613}
614
615static inline bool si_fromuser(const struct siginfo *info)
616{
617 return info == SEND_SIG_NOINFO ||
618 (!is_si_special(info) && SI_FROMUSER(info));
619}
620
588/* 621/*
589 * Bad permissions for sending the signal 622 * Bad permissions for sending the signal
590 * - the caller must hold at least the RCU read lock 623 * - the caller must hold at least the RCU read lock
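
The new si_fromuser() helper treats SEND_SIG_NOINFO and any non-special siginfo that passes SI_FROMUSER() as user-originated; SI_FROMUSER() itself (asm-generic/siginfo.h) boils down to si_code <= 0. A tiny user-space check of that si_code convention using the constants exported in <signal.h>; the origin() helper is ours, not a kernel API:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>

/* codes <= 0 mark signals raised from user space, positive codes mark
 * kernel-generated ones */
static const char *origin(int si_code)
{
    return si_code <= 0 ? "user space" : "kernel";
}

int main(void)
{
    printf("SI_USER   (%d): %s\n", SI_USER,   origin(SI_USER));
    printf("SI_QUEUE  (%d): %s\n", SI_QUEUE,  origin(SI_QUEUE));
    printf("SI_TIMER  (%d): %s\n", SI_TIMER,  origin(SI_TIMER));
    printf("SI_KERNEL (%d): %s\n", SI_KERNEL, origin(SI_KERNEL));
    return 0;
}
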
@@ -599,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
599 if (!valid_signal(sig)) 632 if (!valid_signal(sig))
600 return -EINVAL; 633 return -EINVAL;
601 634
602 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) 635 if (!si_fromuser(info))
603 return 0; 636 return 0;
604 637
605 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 638 error = audit_signal_info(sig, t); /* Let audit system see the signal */
@@ -870,7 +903,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
870 else 903 else
871 override_rlimit = 0; 904 override_rlimit = 0;
872 905
873 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, 906 q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
874 override_rlimit); 907 override_rlimit);
875 if (q) { 908 if (q) {
876 list_add_tail(&q->list, &pending->list); 909 list_add_tail(&q->list, &pending->list);
@@ -927,16 +960,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
927 int from_ancestor_ns = 0; 960 int from_ancestor_ns = 0;
928 961
929#ifdef CONFIG_PID_NS 962#ifdef CONFIG_PID_NS
930 if (!is_si_special(info) && SI_FROMUSER(info) && 963 from_ancestor_ns = si_fromuser(info) &&
931 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) 964 !task_pid_nr_ns(current, task_active_pid_ns(t));
932 from_ancestor_ns = 1;
933#endif 965#endif
934 966
935 return __send_signal(sig, info, t, group, from_ancestor_ns); 967 return __send_signal(sig, info, t, group, from_ancestor_ns);
936} 968}
937 969
938int print_fatal_signals;
939
940static void print_fatal_signal(struct pt_regs *regs, int signr) 970static void print_fatal_signal(struct pt_regs *regs, int signr)
941{ 971{
942 printk("%s/%d: potentially unexpected fatal signal %d.\n", 972 printk("%s/%d: potentially unexpected fatal signal %d.\n",
@@ -1032,12 +1062,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1032 return ret; 1062 return ret;
1033} 1063}
1034 1064
1035void
1036force_sig_specific(int sig, struct task_struct *t)
1037{
1038 force_sig_info(sig, SEND_SIG_FORCED, t);
1039}
1040
1041/* 1065/*
1042 * Nuke all other threads in the group. 1066 * Nuke all other threads in the group.
1043 */ 1067 */
@@ -1166,8 +1190,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1166 goto out_unlock; 1190 goto out_unlock;
1167 } 1191 }
1168 pcred = __task_cred(p); 1192 pcred = __task_cred(p);
1169 if ((info == SEND_SIG_NOINFO || 1193 if (si_fromuser(info) &&
1170 (!is_si_special(info) && SI_FROMUSER(info))) &&
1171 euid != pcred->suid && euid != pcred->uid && 1194 euid != pcred->suid && euid != pcred->uid &&
1172 uid != pcred->suid && uid != pcred->uid) { 1195 uid != pcred->suid && uid != pcred->uid) {
1173 ret = -EPERM; 1196 ret = -EPERM;
@@ -1303,19 +1326,19 @@ EXPORT_SYMBOL(kill_pid);
1303 * These functions support sending signals using preallocated sigqueue 1326 * These functions support sending signals using preallocated sigqueue
1304 * structures. This is needed "because realtime applications cannot 1327 * structures. This is needed "because realtime applications cannot
1305 * afford to lose notifications of asynchronous events, like timer 1328 * afford to lose notifications of asynchronous events, like timer
1306 * expirations or I/O completions". In the case of Posix Timers 1329 * expirations or I/O completions". In the case of Posix Timers
1307 * we allocate the sigqueue structure from the timer_create. If this 1330 * we allocate the sigqueue structure from the timer_create. If this
1308 * allocation fails we are able to report the failure to the application 1331 * allocation fails we are able to report the failure to the application
1309 * with an EAGAIN error. 1332 * with an EAGAIN error.
1310 */ 1333 */
1311
1312struct sigqueue *sigqueue_alloc(void) 1334struct sigqueue *sigqueue_alloc(void)
1313{ 1335{
1314 struct sigqueue *q; 1336 struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
1315 1337
1316 if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) 1338 if (q)
1317 q->flags |= SIGQUEUE_PREALLOC; 1339 q->flags |= SIGQUEUE_PREALLOC;
1318 return(q); 1340
1341 return q;
1319} 1342}
1320 1343
1321void sigqueue_free(struct sigqueue *q) 1344void sigqueue_free(struct sigqueue *q)
@@ -1817,11 +1840,6 @@ relock:
1817 1840
1818 for (;;) { 1841 for (;;) {
1819 struct k_sigaction *ka; 1842 struct k_sigaction *ka;
1820
1821 if (unlikely(signal->group_stop_count > 0) &&
1822 do_signal_stop(0))
1823 goto relock;
1824
1825 /* 1843 /*
1826 * Tracing can induce an artifical signal and choose sigaction. 1844 * Tracing can induce an artifical signal and choose sigaction.
1827 * The return value in @signr determines the default action, 1845 * The return value in @signr determines the default action,
@@ -1833,6 +1851,10 @@ relock:
1833 if (unlikely(signr != 0)) 1851 if (unlikely(signr != 0))
1834 ka = return_ka; 1852 ka = return_ka;
1835 else { 1853 else {
1854 if (unlikely(signal->group_stop_count > 0) &&
1855 do_signal_stop(0))
1856 goto relock;
1857
1836 signr = dequeue_signal(current, &current->blocked, 1858 signr = dequeue_signal(current, &current->blocked,
1837 info); 1859 info);
1838 1860
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
new file mode 100644
index 000000000000..e45c43645298
--- /dev/null
+++ b/kernel/slow-work-debugfs.c
@@ -0,0 +1,227 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
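
The runqueue iterator in this new file packs a four-bit section selector into the top of the seq_file position (ITERATOR_SHIFT is BITS_PER_LONG - 4) and keeps a per-section counter in the remaining bits, which is how the position survives seq_file restarting the walk. A quick user-space demo of that encoding, hard-coding a 64-bit position for simplicity:

#include <stdio.h>

#define ITERATOR_SHIFT    (64 - 4)                    /* demo assumes 64-bit */
#define ITERATOR_SELECTOR (0xfULL << ITERATOR_SHIFT)
#define ITERATOR_COUNTER  (~ITERATOR_SELECTOR)

static unsigned long long make_pos(unsigned sel, unsigned long long count)
{
    return ((unsigned long long)sel << ITERATOR_SHIFT) |
           (count & ITERATOR_COUNTER);
}

int main(void)
{
    /* e.g. entry 42 of section 2 (the very-slow queue in the code above) */
    unsigned long long pos = make_pos(2, 42);

    printf("pos      = %#llx\n", pos);
    printf("selector = %llu\n", pos >> ITERATOR_SHIFT);
    printf("counter  = %llu\n", pos & ITERATOR_COUNTER);
    return 0;
}
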
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 0d31135efbf4..7494bbf5a270 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,11 +16,8 @@
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/wait.h> 18#include <linux/wait.h>
19 19#include <linux/debugfs.h>
20#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of 20#include "slow-work.h"
21 * things to do */
22#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */
24 21
25static void slow_work_cull_timeout(unsigned long); 22static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long); 23static void slow_work_oom_timeout(unsigned long);
@@ -46,13 +43,12 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
46 43
47#ifdef CONFIG_SYSCTL 44#ifdef CONFIG_SYSCTL
48static const int slow_work_min_min_threads = 2; 45static const int slow_work_min_min_threads = 2;
49static int slow_work_max_max_threads = 255; 46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
50static const int slow_work_min_vslow = 1; 47static const int slow_work_min_vslow = 1;
51static const int slow_work_max_vslow = 99; 48static const int slow_work_max_vslow = 99;
52 49
53ctl_table slow_work_sysctls[] = { 50ctl_table slow_work_sysctls[] = {
54 { 51 {
55 .ctl_name = CTL_UNNUMBERED,
56 .procname = "min-threads", 52 .procname = "min-threads",
57 .data = &slow_work_min_threads, 53 .data = &slow_work_min_threads,
58 .maxlen = sizeof(unsigned), 54 .maxlen = sizeof(unsigned),
@@ -62,7 +58,6 @@ ctl_table slow_work_sysctls[] = {
62 .extra2 = &slow_work_max_threads, 58 .extra2 = &slow_work_max_threads,
63 }, 59 },
64 { 60 {
65 .ctl_name = CTL_UNNUMBERED,
66 .procname = "max-threads", 61 .procname = "max-threads",
67 .data = &slow_work_max_threads, 62 .data = &slow_work_max_threads,
68 .maxlen = sizeof(unsigned), 63 .maxlen = sizeof(unsigned),
@@ -72,16 +67,15 @@ ctl_table slow_work_sysctls[] = {
72 .extra2 = (void *) &slow_work_max_max_threads, 67 .extra2 = (void *) &slow_work_max_max_threads,
73 }, 68 },
74 { 69 {
75 .ctl_name = CTL_UNNUMBERED,
76 .procname = "vslow-percentage", 70 .procname = "vslow-percentage",
77 .data = &vslow_work_proportion, 71 .data = &vslow_work_proportion,
78 .maxlen = sizeof(unsigned), 72 .maxlen = sizeof(unsigned),
79 .mode = 0644, 73 .mode = 0644,
80 .proc_handler = &proc_dointvec_minmax, 74 .proc_handler = proc_dointvec_minmax,
81 .extra1 = (void *) &slow_work_min_vslow, 75 .extra1 = (void *) &slow_work_min_vslow,
82 .extra2 = (void *) &slow_work_max_vslow, 76 .extra2 = (void *) &slow_work_max_vslow,
83 }, 77 },
84 { .ctl_name = 0 } 78 {}
85}; 79};
86#endif 80#endif
87 81
@@ -98,6 +92,56 @@ static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98static struct slow_work slow_work_new_thread; /* new thread starter */ 92static struct slow_work slow_work_new_thread; /* new thread starter */
99 93
100/* 94/*
95 * slow work ID allocation (use slow_work_queue_lock)
96 */
97static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
98
99/*
100 * Unregistration tracking to prevent put_ref() from disappearing during module
101 * unload
102 */
103#ifdef CONFIG_MODULES
104static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
105static struct module *slow_work_unreg_module;
106static struct slow_work *slow_work_unreg_work_item;
107static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
108static DEFINE_MUTEX(slow_work_unreg_sync_lock);
109
110static void slow_work_set_thread_processing(int id, struct slow_work *work)
111{
112 if (work)
113 slow_work_thread_processing[id] = work->owner;
114}
115static void slow_work_done_thread_processing(int id, struct slow_work *work)
116{
117 struct module *module = slow_work_thread_processing[id];
118
119 slow_work_thread_processing[id] = NULL;
120 smp_mb();
121 if (slow_work_unreg_work_item == work ||
122 slow_work_unreg_module == module)
123 wake_up_all(&slow_work_unreg_wq);
124}
125static void slow_work_clear_thread_processing(int id)
126{
127 slow_work_thread_processing[id] = NULL;
128}
129#else
130static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
131static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
132static void slow_work_clear_thread_processing(int id) {}
133#endif
134
135/*
136 * Data for tracking currently executing items for indication through /proc
137 */
138#ifdef CONFIG_SLOW_WORK_DEBUG
139struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
140pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
141DEFINE_RWLOCK(slow_work_execs_lock);
142#endif
143
144/*
101 * The queues of work items and the lock governing access to them. These are 145 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues 146 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs. 147 * as the number of threads bears no relation to the number of CPUs.
@@ -105,9 +149,18 @@ static struct slow_work slow_work_new_thread; /* new thread starter */
105 * There are two queues of work items: one for slow work items, and one for 149 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items. 150 * very slow work items.
107 */ 151 */
108static LIST_HEAD(slow_work_queue); 152LIST_HEAD(slow_work_queue);
109static LIST_HEAD(vslow_work_queue); 153LIST_HEAD(vslow_work_queue);
110static DEFINE_SPINLOCK(slow_work_queue_lock); 154DEFINE_SPINLOCK(slow_work_queue_lock);
155
156/*
157 * The following are two wait queues that get pinged when a work item is placed
158 * on an empty queue. These allow work items that are hogging a thread by
159 * sleeping in a way that could be deferred to yield their thread and enqueue
160 * themselves.
161 */
162static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
163static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
111 164
112/* 165/*
113 * The thread controls. A variable used to signal to the threads that they 166 * The thread controls. A variable used to signal to the threads that they
@@ -126,6 +179,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited);
126static int slow_work_user_count; 179static int slow_work_user_count;
127static DEFINE_MUTEX(slow_work_user_lock); 180static DEFINE_MUTEX(slow_work_user_lock);
128 181
182static inline int slow_work_get_ref(struct slow_work *work)
183{
184 if (work->ops->get_ref)
185 return work->ops->get_ref(work);
186
187 return 0;
188}
189
190static inline void slow_work_put_ref(struct slow_work *work)
191{
192 if (work->ops->put_ref)
193 work->ops->put_ref(work);
194}
195
129/* 196/*
130 * Calculate the maximum number of active threads in the pool that are 197 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items. 198 * permitted to process very slow work items.
@@ -149,7 +216,7 @@ static unsigned slow_work_calc_vsmax(void)
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed 216 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do. 217 * it, false if there was nothing to do.
151 */ 218 */
152static bool slow_work_execute(void) 219static noinline bool slow_work_execute(int id)
153{ 220{
154 struct slow_work *work = NULL; 221 struct slow_work *work = NULL;
155 unsigned vsmax; 222 unsigned vsmax;
@@ -186,6 +253,13 @@ static bool slow_work_execute(void)
186 } else { 253 } else {
187 very_slow = false; /* avoid the compiler warning */ 254 very_slow = false; /* avoid the compiler warning */
188 } 255 }
256
257 slow_work_set_thread_processing(id, work);
258 if (work) {
259 slow_work_mark_time(work);
260 slow_work_begin_exec(id, work);
261 }
262
189 spin_unlock_irq(&slow_work_queue_lock); 263 spin_unlock_irq(&slow_work_queue_lock);
190 264
191 if (!work) 265 if (!work)
@@ -194,12 +268,19 @@ static bool slow_work_execute(void)
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) 268 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG(); 269 BUG();
196 270
197 work->ops->execute(work); 271 /* don't execute if the work is in the process of being cancelled */
272 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
273 work->ops->execute(work);
198 274
199 if (very_slow) 275 if (very_slow)
200 atomic_dec(&vslow_work_executing_count); 276 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); 277 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202 278
279 /* wake up anyone waiting for this work to be complete */
280 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
281
282 slow_work_end_exec(id, work);
283
203 /* if someone tried to enqueue the item whilst we were executing it, 284 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to 285 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously 286 * execute it simultaneously
@@ -219,7 +300,10 @@ static bool slow_work_execute(void)
219 spin_unlock_irq(&slow_work_queue_lock); 300 spin_unlock_irq(&slow_work_queue_lock);
220 } 301 }
221 302
222 work->ops->put_ref(work); 303 /* sort out the race between module unloading and put_ref() */
304 slow_work_put_ref(work);
305 slow_work_done_thread_processing(id, work);
306
223 return true; 307 return true;
224 308
225auto_requeue: 309auto_requeue:
@@ -227,15 +311,61 @@ auto_requeue:
227 * - we transfer our ref on the item back to the appropriate queue 311 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already 312 * - don't wake another thread up as we're awake already
229 */ 313 */
314 slow_work_mark_time(work);
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 315 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue); 316 list_add_tail(&work->link, &vslow_work_queue);
232 else 317 else
233 list_add_tail(&work->link, &slow_work_queue); 318 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock); 319 spin_unlock_irq(&slow_work_queue_lock);
320 slow_work_clear_thread_processing(id);
235 return true; 321 return true;
236} 322}
237 323
238/** 324/**
325 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
326 * work: The work item under execution that wants to sleep
327 * _timeout: Scheduler sleep timeout
328 *
329 * Allow a requeueable work item to sleep on a slow-work processor thread until
330 * that thread is needed to do some other work or the sleep is interrupted by
331 * some other event.
332 *
333 * The caller must set up a wake up event before calling this and must have set
334 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
335 * condition before calling this function as no test is made here.
336 *
337 * False is returned if there is nothing on the queue; true is returned if the
338 * work item should be requeued
339 */
340bool slow_work_sleep_till_thread_needed(struct slow_work *work,
341 signed long *_timeout)
342{
343 wait_queue_head_t *wfo_wq;
344 struct list_head *queue;
345
346 DEFINE_WAIT(wait);
347
348 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
349 wfo_wq = &vslow_work_queue_waits_for_occupation;
350 queue = &vslow_work_queue;
351 } else {
352 wfo_wq = &slow_work_queue_waits_for_occupation;
353 queue = &slow_work_queue;
354 }
355
356 if (!list_empty(queue))
357 return true;
358
359 add_wait_queue_exclusive(wfo_wq, &wait);
360 if (list_empty(queue))
361 *_timeout = schedule_timeout(*_timeout);
362 finish_wait(wfo_wq, &wait);
363
364 return !list_empty(queue);
365}
366EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
367
368/**
239 * slow_work_enqueue - Schedule a slow work item for processing 369 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue 370 * @work: The work item to queue
241 * 371 *
@@ -260,16 +390,22 @@ auto_requeue:
260 * allowed to pick items to execute. This ensures that very slow items won't 390 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow. 391 * overly block ones that are just ordinarily slow.
262 * 392 *
263 * Returns 0 if successful, -EAGAIN if not. 393 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
394 * attempted queued)
264 */ 395 */
265int slow_work_enqueue(struct slow_work *work) 396int slow_work_enqueue(struct slow_work *work)
266{ 397{
398 wait_queue_head_t *wfo_wq;
399 struct list_head *queue;
267 unsigned long flags; 400 unsigned long flags;
401 int ret;
402
403 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
404 return -ECANCELED;
268 405
269 BUG_ON(slow_work_user_count <= 0); 406 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work); 407 BUG_ON(!work);
271 BUG_ON(!work->ops); 408 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref);
273 409
274 /* when honouring an enqueue request, we only promise that we will run 410 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once 411 * the work function in the future; we do not promise to run it once
@@ -280,8 +416,19 @@ int slow_work_enqueue(struct slow_work *work)
280 * maintaining our promise 416 * maintaining our promise
281 */ 417 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { 418 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
419 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
420 wfo_wq = &vslow_work_queue_waits_for_occupation;
421 queue = &vslow_work_queue;
422 } else {
423 wfo_wq = &slow_work_queue_waits_for_occupation;
424 queue = &slow_work_queue;
425 }
426
283 spin_lock_irqsave(&slow_work_queue_lock, flags); 427 spin_lock_irqsave(&slow_work_queue_lock, flags);
284 428
429 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
430 goto cancelled;
431
285 /* we promise that we will not attempt to execute the work 432 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously 433 * function in more than one thread simultaneously
287 * 434 *
@@ -299,25 +446,221 @@ int slow_work_enqueue(struct slow_work *work)
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { 446 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); 447 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else { 448 } else {
302 if (work->ops->get_ref(work) < 0) 449 ret = slow_work_get_ref(work);
303 goto cant_get_ref; 450 if (ret < 0)
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 451 goto failed;
305 list_add_tail(&work->link, &vslow_work_queue); 452 slow_work_mark_time(work);
306 else 453 list_add_tail(&work->link, queue);
307 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq); 454 wake_up(&slow_work_thread_wq);
455
456 /* if someone who could be requeued is sleeping on a
457 * thread, then ask them to yield their thread */
458 if (work->link.prev == queue)
459 wake_up(wfo_wq);
309 } 460 }
310 461
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 462 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 } 463 }
313 return 0; 464 return 0;
314 465
315cant_get_ref: 466cancelled:
467 ret = -ECANCELED;
468failed:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 469 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN; 470 return ret;
318} 471}
319EXPORT_SYMBOL(slow_work_enqueue); 472EXPORT_SYMBOL(slow_work_enqueue);
320 473
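
slow_work_enqueue() above relies on test_and_set_bit_lock(SLOW_WORK_PENDING, ...) so that, out of any number of racing callers, exactly one wins the right to put the item on a queue. A user-space sketch of that at-most-once latch using C11 atomics; the eight racing threads and the counter are purely illustrative:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag pending = ATOMIC_FLAG_INIT;
static atomic_int  enqueues;

static void *try_enqueue(void *arg)
{
    (void)arg;
    /* test-and-set returns the old value: false means we set it first */
    if (!atomic_flag_test_and_set(&pending))
        atomic_fetch_add(&enqueues, 1);  /* only the winner queues the item */
    return NULL;
}

int main(void)
{
    pthread_t threads[8];

    for (int i = 0; i < 8; i++)
        pthread_create(&threads[i], NULL, try_enqueue, NULL);
    for (int i = 0; i < 8; i++)
        pthread_join(threads[i], NULL);

    printf("item enqueued %d time(s) despite 8 racing callers\n",
           atomic_load(&enqueues));
    return 0;
}
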
474static int slow_work_wait(void *word)
475{
476 schedule();
477 return 0;
478}
479
480/**
481 * slow_work_cancel - Cancel a slow work item
482 * @work: The work item to cancel
483 *
484 * This function will cancel a previously enqueued work item. If we cannot
485 * cancel the work item, it is guaranteed to have run when this function
486 * returns.
487 */
488void slow_work_cancel(struct slow_work *work)
489{
490 bool wait = true, put = false;
491
492 set_bit(SLOW_WORK_CANCELLING, &work->flags);
493 smp_mb();
494
495 /* if the work item is a delayed work item with an active timer, we
496 * need to wait for the timer to finish _before_ getting the spinlock,
497 * lest we deadlock against the timer routine
498 *
499 * the timer routine will leave DELAYED set if it notices the
500 * CANCELLING flag in time
501 */
502 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
503 struct delayed_slow_work *dwork =
504 container_of(work, struct delayed_slow_work, work);
505 del_timer_sync(&dwork->timer);
506 }
507
508 spin_lock_irq(&slow_work_queue_lock);
509
510 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
511 /* the timer routine aborted or never happened, so we are left
512 * holding the timer's reference on the item and should just
513 * drop the pending flag and wait for any ongoing execution to
514 * finish */
515 struct delayed_slow_work *dwork =
516 container_of(work, struct delayed_slow_work, work);
517
518 BUG_ON(timer_pending(&dwork->timer));
519 BUG_ON(!list_empty(&work->link));
520
521 clear_bit(SLOW_WORK_DELAYED, &work->flags);
522 put = true;
523 clear_bit(SLOW_WORK_PENDING, &work->flags);
524
525 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
526 !list_empty(&work->link)) {
527 /* the link in the pending queue holds a reference on the item
528 * that we will need to release */
529 list_del_init(&work->link);
530 wait = false;
531 put = true;
532 clear_bit(SLOW_WORK_PENDING, &work->flags);
533
534 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
535 /* the executor is holding our only reference on the item, so
536 * we merely need to wait for it to finish executing */
537 clear_bit(SLOW_WORK_PENDING, &work->flags);
538 }
539
540 spin_unlock_irq(&slow_work_queue_lock);
541
542 /* the EXECUTING flag is set by the executor whilst the spinlock is set
543 * and before the item is dequeued - so assuming the above doesn't
544 * actually dequeue it, simply waiting for the EXECUTING flag to be
545 * released here should be sufficient */
546 if (wait)
547 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
548 TASK_UNINTERRUPTIBLE);
549
550 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
551 if (put)
552 slow_work_put_ref(work);
553}
554EXPORT_SYMBOL(slow_work_cancel);
555
556/*
557 * Handle expiry of the delay timer, indicating that a delayed slow work item
558 * should now be queued if not cancelled
559 */
560static void delayed_slow_work_timer(unsigned long data)
561{
562 wait_queue_head_t *wfo_wq;
563 struct list_head *queue;
564 struct slow_work *work = (struct slow_work *) data;
565 unsigned long flags;
566 bool queued = false, put = false, first = false;
567
568 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
569 wfo_wq = &vslow_work_queue_waits_for_occupation;
570 queue = &vslow_work_queue;
571 } else {
572 wfo_wq = &slow_work_queue_waits_for_occupation;
573 queue = &slow_work_queue;
574 }
575
576 spin_lock_irqsave(&slow_work_queue_lock, flags);
577 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
578 clear_bit(SLOW_WORK_DELAYED, &work->flags);
579
580 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
581 /* we discard the reference the timer was holding in
582 * favour of the one the executor holds */
583 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
584 put = true;
585 } else {
586 slow_work_mark_time(work);
587 list_add_tail(&work->link, queue);
588 queued = true;
589 if (work->link.prev == queue)
590 first = true;
591 }
592 }
593
594 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
595 if (put)
596 slow_work_put_ref(work);
597 if (first)
598 wake_up(wfo_wq);
599 if (queued)
600 wake_up(&slow_work_thread_wq);
601}
602
603/**
604 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
605 * @dwork: The delayed work item to queue
606 * @delay: When to start executing the work, in jiffies from now
607 *
608 * This is similar to slow_work_enqueue(), but it adds a delay before the work
609 * is actually queued for processing.
610 *
611 * The item can have delayed processing requested on it whilst it is being
612 * executed. The delay will begin immediately, and if it expires before the
613 * item finishes executing, the item will be placed back on the queue when it
614 * has done executing.
615 */
616int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
617 unsigned long delay)
618{
619 struct slow_work *work = &dwork->work;
620 unsigned long flags;
621 int ret;
622
623 if (delay == 0)
624 return slow_work_enqueue(&dwork->work);
625
626 BUG_ON(slow_work_user_count <= 0);
627 BUG_ON(!work);
628 BUG_ON(!work->ops);
629
630 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
631 return -ECANCELED;
632
633 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
634 spin_lock_irqsave(&slow_work_queue_lock, flags);
635
636 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
637 goto cancelled;
638
639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work);
641 if (ret < 0)
642 goto cant_get_ref;
643
644 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
645 BUG();
646 dwork->timer.expires = jiffies + delay;
647 dwork->timer.data = (unsigned long) work;
648 dwork->timer.function = delayed_slow_work_timer;
649 add_timer(&dwork->timer);
650
651 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
652 }
653
654 return 0;
655
656cancelled:
657 ret = -ECANCELED;
658cant_get_ref:
659 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
660 return ret;
661}
662EXPORT_SYMBOL(delayed_slow_work_enqueue);
663
321/* 664/*
322 * Schedule a cull of the thread pool at some time in the near future 665 * Schedule a cull of the thread pool at some time in the near future
323 */ 666 */
@@ -368,13 +711,23 @@ static inline bool slow_work_available(int vsmax)
368 */ 711 */
369static int slow_work_thread(void *_data) 712static int slow_work_thread(void *_data)
370{ 713{
371 int vsmax; 714 int vsmax, id;
372 715
373 DEFINE_WAIT(wait); 716 DEFINE_WAIT(wait);
374 717
375 set_freezable(); 718 set_freezable();
376 set_user_nice(current, -5); 719 set_user_nice(current, -5);
377 720
721 /* allocate ourselves an ID */
722 spin_lock_irq(&slow_work_queue_lock);
723 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
724 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
725 __set_bit(id, slow_work_ids);
726 slow_work_set_thread_pid(id, current->pid);
727 spin_unlock_irq(&slow_work_queue_lock);
728
729 sprintf(current->comm, "kslowd%03u", id);
730
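
The thread function above claims a per-thread ID by finding and setting the first clear bit in slow_work_ids under slow_work_queue_lock, and clears that bit again on exit so the ID can be reused. A single-threaded user-space model of that bitmap ID allocator, with one 64-bit word standing in for the kernel bitmap and no locking:

#include <stdint.h>
#include <stdio.h>

#define ID_LIMIT 64        /* demo limit; the real one is SLOW_WORK_THREAD_LIMIT */

static uint64_t ids;       /* bit n set means ID n is in use */

static int alloc_id(void)
{
    for (int id = 0; id < ID_LIMIT; id++) {
        if (!(ids & (1ULL << id))) {
            ids |= 1ULL << id;  /* claim the first free slot */
            return id;
        }
    }
    return -1;                  /* pool exhausted */
}

static void free_id(int id)
{
    ids &= ~(1ULL << id);
}

int main(void)
{
    int a = alloc_id();         /* 0 */
    int b = alloc_id();         /* 1 */
    int c = alloc_id();         /* 2 */

    printf("allocated %d %d %d\n", a, b, c);
    free_id(b);
    printf("reused after free: %d\n", alloc_id());  /* 1 again */
    return 0;
}
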
378 for (;;) { 731 for (;;) {
379 vsmax = vslow_work_proportion; 732 vsmax = vslow_work_proportion;
380 vsmax *= atomic_read(&slow_work_thread_count); 733 vsmax *= atomic_read(&slow_work_thread_count);
@@ -395,7 +748,7 @@ static int slow_work_thread(void *_data)
395 vsmax *= atomic_read(&slow_work_thread_count); 748 vsmax *= atomic_read(&slow_work_thread_count);
396 vsmax /= 100; 749 vsmax /= 100;
397 750
398 if (slow_work_available(vsmax) && slow_work_execute()) { 751 if (slow_work_available(vsmax) && slow_work_execute(id)) {
399 cond_resched(); 752 cond_resched();
400 if (list_empty(&slow_work_queue) && 753 if (list_empty(&slow_work_queue) &&
401 list_empty(&vslow_work_queue) && 754 list_empty(&vslow_work_queue) &&
@@ -412,6 +765,11 @@ static int slow_work_thread(void *_data)
412 break; 765 break;
413 } 766 }
414 767
768 spin_lock_irq(&slow_work_queue_lock);
769 slow_work_set_thread_pid(id, 0);
770 __clear_bit(id, slow_work_ids);
771 spin_unlock_irq(&slow_work_queue_lock);
772
415 if (atomic_dec_and_test(&slow_work_thread_count)) 773 if (atomic_dec_and_test(&slow_work_thread_count))
416 complete_and_exit(&slow_work_last_thread_exited, 0); 774 complete_and_exit(&slow_work_last_thread_exited, 0);
417 return 0; 775 return 0;
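The thread ID picked up above is simply a free index in a small bitmap, allocated and released under the existing queue lock; slow_work_thread() can BUG_ON() exhaustion because the thread count is capped well below SLOW_WORK_THREAD_LIMIT. The same allocate/release pattern in isolation (all my_ names are illustrative, not from the patch):

	#include <linux/bitops.h>
	#include <linux/spinlock.h>
	#include <linux/types.h>

	#define MY_ID_LIMIT 255

	static DECLARE_BITMAP(my_ids, MY_ID_LIMIT);
	static DEFINE_SPINLOCK(my_ids_lock);

	static int my_id_alloc(void)
	{
		int id;

		spin_lock_irq(&my_ids_lock);
		id = find_first_zero_bit(my_ids, MY_ID_LIMIT);
		if (id < MY_ID_LIMIT)
			__set_bit(id, my_ids);	/* non-atomic is fine: we hold the lock */
		else
			id = -1;		/* pool exhausted */
		spin_unlock_irq(&my_ids_lock);
		return id;
	}

	static void my_id_free(int id)
	{
		spin_lock_irq(&my_ids_lock);
		__clear_bit(id, my_ids);
		spin_unlock_irq(&my_ids_lock);
	}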
@@ -427,21 +785,6 @@ static void slow_work_cull_timeout(unsigned long data)
427} 785}
428 786
429/* 787/*
430 * Get a reference on slow work thread starter
431 */
432static int slow_work_new_thread_get_ref(struct slow_work *work)
433{
434 return 0;
435}
436
437/*
438 * Drop a reference on slow work thread starter
439 */
440static void slow_work_new_thread_put_ref(struct slow_work *work)
441{
442}
443
444/*
445 * Start a new slow work thread 788 * Start a new slow work thread
446 */ 789 */
447static void slow_work_new_thread_execute(struct slow_work *work) 790static void slow_work_new_thread_execute(struct slow_work *work)
@@ -475,9 +818,11 @@ static void slow_work_new_thread_execute(struct slow_work *work)
475} 818}
476 819
477static const struct slow_work_ops slow_work_new_thread_ops = { 820static const struct slow_work_ops slow_work_new_thread_ops = {
478 .get_ref = slow_work_new_thread_get_ref, 821 .owner = THIS_MODULE,
479 .put_ref = slow_work_new_thread_put_ref,
480 .execute = slow_work_new_thread_execute, 822 .execute = slow_work_new_thread_execute,
823#ifdef CONFIG_SLOW_WORK_DEBUG
824 .desc = slow_work_new_thread_desc,
825#endif
481}; 826};
482 827
483/* 828/*
@@ -546,12 +891,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
546 891
547/** 892/**
548 * slow_work_register_user - Register a user of the facility 893 * slow_work_register_user - Register a user of the facility
894 * @module: The module about to make use of the facility
549 * 895 *
550 * Register a user of the facility, starting up the initial threads if there 896 * Register a user of the facility, starting up the initial threads if there
551 * aren't any other users at this point. This will return 0 if successful, or 897 * aren't any other users at this point. This will return 0 if successful, or
552 * an error if not. 898 * an error if not.
553 */ 899 */
554int slow_work_register_user(void) 900int slow_work_register_user(struct module *module)
555{ 901{
556 struct task_struct *p; 902 struct task_struct *p;
557 int loop; 903 int loop;
@@ -598,14 +944,81 @@ error:
598} 944}
599EXPORT_SYMBOL(slow_work_register_user); 945EXPORT_SYMBOL(slow_work_register_user);
600 946
947/*
948 * wait for all outstanding items from the calling module to complete
949 * - note that more items may be queued whilst we're waiting
950 */
951static void slow_work_wait_for_items(struct module *module)
952{
953#ifdef CONFIG_MODULES
954 DECLARE_WAITQUEUE(myself, current);
955 struct slow_work *work;
956 int loop;
957
958 mutex_lock(&slow_work_unreg_sync_lock);
959 add_wait_queue(&slow_work_unreg_wq, &myself);
960
961 for (;;) {
962 spin_lock_irq(&slow_work_queue_lock);
963
964 /* first of all, we wait for the last queued item in each list
965 * to be processed */
966 list_for_each_entry_reverse(work, &vslow_work_queue, link) {
967 if (work->owner == module) {
968 set_current_state(TASK_UNINTERRUPTIBLE);
969 slow_work_unreg_work_item = work;
970 goto do_wait;
971 }
972 }
973 list_for_each_entry_reverse(work, &slow_work_queue, link) {
974 if (work->owner == module) {
975 set_current_state(TASK_UNINTERRUPTIBLE);
976 slow_work_unreg_work_item = work;
977 goto do_wait;
978 }
979 }
980
981 /* then we wait for the items being processed to finish */
982 slow_work_unreg_module = module;
983 smp_mb();
984 for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
985 if (slow_work_thread_processing[loop] == module)
986 goto do_wait;
987 }
988 spin_unlock_irq(&slow_work_queue_lock);
989 break; /* okay, we're done */
990
991 do_wait:
992 spin_unlock_irq(&slow_work_queue_lock);
993 schedule();
994 slow_work_unreg_work_item = NULL;
995 slow_work_unreg_module = NULL;
996 }
997
998 remove_wait_queue(&slow_work_unreg_wq, &myself);
999 mutex_unlock(&slow_work_unreg_sync_lock);
1000#endif /* CONFIG_MODULES */
1001}
1002
601/** 1003/**
602 * slow_work_unregister_user - Unregister a user of the facility 1004 * slow_work_unregister_user - Unregister a user of the facility
1005 * @module: The module whose items should be cleared
603 * 1006 *
604 * Unregister a user of the facility, killing all the threads if this was the 1007 * Unregister a user of the facility, killing all the threads if this was the
605 * last one. 1008 * last one.
1009 *
1010 * This waits for all the work items belonging to the nominated module to go
1011 * away before proceeding.
606 */ 1012 */
607void slow_work_unregister_user(void) 1013void slow_work_unregister_user(struct module *module)
608{ 1014{
1015 /* first of all, wait for all outstanding items from the calling module
1016 * to complete */
1017 if (module)
1018 slow_work_wait_for_items(module);
1019
1020 /* then we can actually go about shutting down the facility if need
1021 * be */
609 mutex_lock(&slow_work_user_lock); 1022 mutex_lock(&slow_work_user_lock);
610 1023
611 BUG_ON(slow_work_user_count <= 0); 1024 BUG_ON(slow_work_user_count <= 0);
@@ -639,6 +1052,16 @@ static int __init init_slow_work(void)
639 if (slow_work_max_max_threads < nr_cpus * 2) 1052 if (slow_work_max_max_threads < nr_cpus * 2)
640 slow_work_max_max_threads = nr_cpus * 2; 1053 slow_work_max_max_threads = nr_cpus * 2;
641#endif 1054#endif
1055#ifdef CONFIG_SLOW_WORK_DEBUG
1056 {
1057 struct dentry *dbdir;
1058
1059 dbdir = debugfs_create_dir("slow_work", NULL);
1060 if (dbdir && !IS_ERR(dbdir))
1061 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1062 NULL, &slow_work_runqueue_fops);
1063 }
1064#endif
642 return 0; 1065 return 0;
643} 1066}
644 1067
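With ->owner recorded in every ops table, the register/unregister pair is now module-aware: unregistering waits in slow_work_wait_for_items() until none of the caller's items are queued or executing before the thread pool can be torn down. A usage sketch built only from the interfaces in this diff (the module body is illustrative):

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/slow-work.h>

	static int __init my_mod_init(void)
	{
		/* starts the kslowd pool if we are the first user */
		return slow_work_register_user(THIS_MODULE);
	}

	static void __exit my_mod_exit(void)
	{
		/* blocks until all of this module's items have drained,
		 * then stops the pool if we were the last user */
		slow_work_unregister_user(THIS_MODULE);
	}

	module_init(my_mod_init);
	module_exit(my_mod_exit);
	MODULE_LICENSE("GPL");

Passing NULL to slow_work_unregister_user() skips the per-module wait, as the "if (module)" test in the new code shows.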
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
new file mode 100644
index 000000000000..321f3c59d732
--- /dev/null
+++ b/kernel/slow-work.h
@@ -0,0 +1,72 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_PROC
47 slow_work_pids[id] = pid;
48#endif
49}
50
51static inline void slow_work_mark_time(struct slow_work *work)
52{
53#ifdef CONFIG_SLOW_WORK_PROC
54 work->mark = CURRENT_TIME;
55#endif
56}
57
58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{
60#ifdef CONFIG_SLOW_WORK_PROC
61 slow_work_execs[id] = work;
62#endif
63}
64
65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{
67#ifdef CONFIG_SLOW_WORK_PROC
68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock);
71#endif
72}
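These helpers intentionally compile to empty inlines when the tracking option is off, so the call sites added in slow-work.c need no conditional compilation of their own. The pattern in isolation (the option name and identifiers are placeholders, not from the patch):

	#ifdef CONFIG_MY_TRACKING
	extern pid_t my_thread_pids[];
	#endif

	static inline void my_set_thread_pid(int id, pid_t pid)
	{
	#ifdef CONFIG_MY_TRACKING
		my_thread_pids[id] = pid;	/* only does work when the option is on */
	#endif
	}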
diff --git a/kernel/smp.c b/kernel/smp.c
index c9d1c7835c2f..de735a6637d0 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,11 +16,11 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16 16
17static struct { 17static struct {
18 struct list_head queue; 18 struct list_head queue;
19 spinlock_t lock; 19 raw_spinlock_t lock;
20} call_function __cacheline_aligned_in_smp = 20} call_function __cacheline_aligned_in_smp =
21 { 21 {
22 .queue = LIST_HEAD_INIT(call_function.queue), 22 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), 23 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
24 }; 24 };
25 25
26enum { 26enum {
@@ -35,7 +35,7 @@ struct call_function_data {
35 35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
@@ -80,7 +80,7 @@ static int __cpuinit init_call_single_data(void)
80 for_each_possible_cpu(i) { 80 for_each_possible_cpu(i) {
81 struct call_single_queue *q = &per_cpu(call_single_queue, i); 81 struct call_single_queue *q = &per_cpu(call_single_queue, i);
82 82
83 spin_lock_init(&q->lock); 83 raw_spin_lock_init(&q->lock);
84 INIT_LIST_HEAD(&q->list); 84 INIT_LIST_HEAD(&q->list);
85 } 85 }
86 86
@@ -141,10 +141,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
141 unsigned long flags; 141 unsigned long flags;
142 int ipi; 142 int ipi;
143 143
144 spin_lock_irqsave(&dst->lock, flags); 144 raw_spin_lock_irqsave(&dst->lock, flags);
145 ipi = list_empty(&dst->list); 145 ipi = list_empty(&dst->list);
146 list_add_tail(&data->list, &dst->list); 146 list_add_tail(&data->list, &dst->list);
147 spin_unlock_irqrestore(&dst->lock, flags); 147 raw_spin_unlock_irqrestore(&dst->lock, flags);
148 148
149 /* 149 /*
150 * The list addition should be visible before sending the IPI 150 * The list addition should be visible before sending the IPI
@@ -171,7 +171,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171void generic_smp_call_function_interrupt(void) 171void generic_smp_call_function_interrupt(void)
172{ 172{
173 struct call_function_data *data; 173 struct call_function_data *data;
174 int cpu = get_cpu(); 174 int cpu = smp_processor_id();
175 175
176 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online. 177 * Shouldn't receive this interrupt on a cpu that is not yet online.
@@ -201,9 +201,9 @@ void generic_smp_call_function_interrupt(void)
201 refs = atomic_dec_return(&data->refs); 201 refs = atomic_dec_return(&data->refs);
202 WARN_ON(refs < 0); 202 WARN_ON(refs < 0);
203 if (!refs) { 203 if (!refs) {
204 spin_lock(&call_function.lock); 204 raw_spin_lock(&call_function.lock);
205 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
206 spin_unlock(&call_function.lock); 206 raw_spin_unlock(&call_function.lock);
207 } 207 }
208 208
209 if (refs) 209 if (refs)
@@ -212,7 +212,6 @@ void generic_smp_call_function_interrupt(void)
212 csd_unlock(&data->csd); 212 csd_unlock(&data->csd);
213 } 213 }
214 214
215 put_cpu();
216} 215}
217 216
218/* 217/*
@@ -230,9 +229,9 @@ void generic_smp_call_function_single_interrupt(void)
230 */ 229 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 230 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232 231
233 spin_lock(&q->lock); 232 raw_spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 233 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 234 raw_spin_unlock(&q->lock);
236 235
237 while (!list_empty(&list)) { 236 while (!list_empty(&list)) {
238 struct call_single_data *data; 237 struct call_single_data *data;
@@ -265,9 +264,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
265 * @info: An arbitrary pointer to pass to the function. 264 * @info: An arbitrary pointer to pass to the function.
266 * @wait: If true, wait until function has completed on other CPUs. 265 * @wait: If true, wait until function has completed on other CPUs.
267 * 266 *
268 * Returns 0 on success, else a negative status code. Note that @wait 267 * Returns 0 on success, else a negative status code.
269 * will be implicitly turned on in case of allocation failures, since
270 * we fall back to on-stack allocation.
271 */ 268 */
272int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 269int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
273 int wait) 270 int wait)
@@ -321,6 +318,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
321} 318}
322EXPORT_SYMBOL(smp_call_function_single); 319EXPORT_SYMBOL(smp_call_function_single);
323 320
321/*
322 * smp_call_function_any - Run a function on any of the given cpus
323 * @mask: The mask of cpus it can run on.
324 * @func: The function to run. This must be fast and non-blocking.
325 * @info: An arbitrary pointer to pass to the function.
326 * @wait: If true, wait until function has completed.
327 *
328 * Returns 0 on success, else a negative status code (if no cpus were online).
329 * Note that @wait will be implicitly turned on in case of allocation failures,
330 * since we fall back to on-stack allocation.
331 *
332 * Selection preference:
333 * 1) current cpu if in @mask
334 * 2) any cpu of current node if in @mask
335 * 3) any other online cpu in @mask
336 */
337int smp_call_function_any(const struct cpumask *mask,
338 void (*func)(void *info), void *info, int wait)
339{
340 unsigned int cpu;
341 const struct cpumask *nodemask;
342 int ret;
343
344 /* Try for same CPU (cheapest) */
345 cpu = get_cpu();
346 if (cpumask_test_cpu(cpu, mask))
347 goto call;
348
349 /* Try for same node. */
350 nodemask = cpumask_of_node(cpu);
351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
352 cpu = cpumask_next_and(cpu, nodemask, mask)) {
353 if (cpu_online(cpu))
354 goto call;
355 }
356
357 /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
358 cpu = cpumask_any_and(mask, cpu_online_mask);
359call:
360 ret = smp_call_function_single(cpu, func, info, wait);
361 put_cpu();
362 return ret;
363}
364EXPORT_SYMBOL_GPL(smp_call_function_any);
365
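smp_call_function_any() is new here; its selection order (current CPU, then a CPU on the same node, then any online CPU in the mask) is spelled out in the comment above. A usage sketch, with the callback and mask purely illustrative:

	#include <linux/smp.h>
	#include <linux/cpumask.h>

	static void my_read_counter(void *info)
	{
		/* must be fast and non-blocking: may run from IPI context */
		*(unsigned int *)info = 42;	/* placeholder for e.g. an MSR read */
	}

	static int my_sample(const struct cpumask *allowed)
	{
		unsigned int value = 0;

		/* wait == 1, so passing a stack variable as @info is safe */
		return smp_call_function_any(allowed, my_read_counter, &value, 1);
	}

The point of the helper is that callers no longer have to open-code the cheapest-CPU heuristic before calling smp_call_function_single().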
324/** 366/**
325 * __smp_call_function_single(): Run a function on another CPU 367 * __smp_call_function_single(): Run a function on another CPU
326 * @cpu: The CPU to run on. 368 * @cpu: The CPU to run on.
@@ -355,9 +397,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
355 * @wait: If true, wait (atomically) until function has completed 397 * @wait: If true, wait (atomically) until function has completed
356 * on other CPUs. 398 * on other CPUs.
357 * 399 *
358 * If @wait is true, then returns once @func has returned. Note that @wait 400 * If @wait is true, then returns once @func has returned.
359 * will be implicitly turned on in case of allocation failures, since
360 * we fall back to on-stack allocation.
361 * 401 *
362 * You must not call this function with disabled interrupts or from a 402 * You must not call this function with disabled interrupts or from a
363 * hardware interrupt handler or from a bottom half handler. Preemption 403 * hardware interrupt handler or from a bottom half handler. Preemption
@@ -408,14 +448,14 @@ void smp_call_function_many(const struct cpumask *mask,
408 cpumask_clear_cpu(this_cpu, data->cpumask); 448 cpumask_clear_cpu(this_cpu, data->cpumask);
409 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 449 atomic_set(&data->refs, cpumask_weight(data->cpumask));
410 450
411 spin_lock_irqsave(&call_function.lock, flags); 451 raw_spin_lock_irqsave(&call_function.lock, flags);
412 /* 452 /*
413 * Place entry at the _HEAD_ of the list, so that any cpu still 453 * Place entry at the _HEAD_ of the list, so that any cpu still
414 * observing the entry in generic_smp_call_function_interrupt() 454 * observing the entry in generic_smp_call_function_interrupt()
415 * will not miss any other list entries: 455 * will not miss any other list entries:
416 */ 456 */
417 list_add_rcu(&data->csd.list, &call_function.queue); 457 list_add_rcu(&data->csd.list, &call_function.queue);
418 spin_unlock_irqrestore(&call_function.lock, flags); 458 raw_spin_unlock_irqrestore(&call_function.lock, flags);
419 459
420 /* 460 /*
421 * Make the list addition visible before sending the ipi. 461 * Make the list addition visible before sending the ipi.
@@ -443,8 +483,7 @@ EXPORT_SYMBOL(smp_call_function_many);
443 * Returns 0. 483 * Returns 0.
444 * 484 *
445 * If @wait is true, then returns once @func has returned; otherwise 485 * If @wait is true, then returns once @func has returned; otherwise
446 * it returns just before the target cpu calls @func. In case of allocation 486 * it returns just before the target cpu calls @func.
447 * failure, @wait will be implicitly turned on.
448 * 487 *
449 * You must not call this function with disabled interrupts or from a 488 * You must not call this function with disabled interrupts or from a
450 * hardware interrupt handler or from a bottom half handler. 489 * hardware interrupt handler or from a bottom half handler.
@@ -461,20 +500,20 @@ EXPORT_SYMBOL(smp_call_function);
461 500
462void ipi_call_lock(void) 501void ipi_call_lock(void)
463{ 502{
464 spin_lock(&call_function.lock); 503 raw_spin_lock(&call_function.lock);
465} 504}
466 505
467void ipi_call_unlock(void) 506void ipi_call_unlock(void)
468{ 507{
469 spin_unlock(&call_function.lock); 508 raw_spin_unlock(&call_function.lock);
470} 509}
471 510
472void ipi_call_lock_irq(void) 511void ipi_call_lock_irq(void)
473{ 512{
474 spin_lock_irq(&call_function.lock); 513 raw_spin_lock_irq(&call_function.lock);
475} 514}
476 515
477void ipi_call_unlock_irq(void) 516void ipi_call_unlock_irq(void)
478{ 517{
479 spin_unlock_irq(&call_function.lock); 518 raw_spin_unlock_irq(&call_function.lock);
480} 519}
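The call_function and per-CPU queue locks switch from spinlock_t to raw_spinlock_t, the new name for a lock that always spins; this is groundwork for configurations in which the ordinary spinlock type may become preemptible. The conversion pattern, using only the initialisers and accessors visible in this hunk (my_ names are illustrative):

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct my_queue {
		struct list_head list;
		raw_spinlock_t	 lock;			/* was spinlock_t */
	};

	static struct my_queue my_q = {
		.list = LIST_HEAD_INIT(my_q.list),
		.lock = __RAW_SPIN_LOCK_UNLOCKED(my_q.lock),	/* was __SPIN_LOCK_UNLOCKED() */
	};

	static void my_push(struct list_head *entry)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&my_q.lock, flags);	/* was spin_lock_irqsave() */
		list_add_tail(entry, &my_q.list);
		raw_spin_unlock_irqrestore(&my_q.lock, flags);
	}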
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5216e0..a09502e2ef75 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -302,9 +302,9 @@ void irq_exit(void)
302 if (!in_interrupt() && local_softirq_pending()) 302 if (!in_interrupt() && local_softirq_pending())
303 invoke_softirq(); 303 invoke_softirq();
304 304
305 rcu_irq_exit();
305#ifdef CONFIG_NO_HZ 306#ifdef CONFIG_NO_HZ
306 /* Make sure that timer wheel updates are propagated */ 307 /* Make sure that timer wheel updates are propagated */
307 rcu_irq_exit();
308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
309 tick_nohz_stop_sched_tick(0); 309 tick_nohz_stop_sched_tick(0);
310#endif 310#endif
@@ -697,7 +697,7 @@ void __init softirq_init(void)
697 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 697 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
698} 698}
699 699
700static int ksoftirqd(void * __bind_cpu) 700static int run_ksoftirqd(void * __bind_cpu)
701{ 701{
702 set_current_state(TASK_INTERRUPTIBLE); 702 set_current_state(TASK_INTERRUPTIBLE);
703 703
@@ -810,7 +810,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
810 switch (action) { 810 switch (action) {
811 case CPU_UP_PREPARE: 811 case CPU_UP_PREPARE:
812 case CPU_UP_PREPARE_FROZEN: 812 case CPU_UP_PREPARE_FROZEN:
813 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 813 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
814 if (IS_ERR(p)) { 814 if (IS_ERR(p)) {
815 printk("ksoftirqd for %i failed\n", hotcpu); 815 printk("ksoftirqd for %i failed\n", hotcpu);
816 return NOTIFY_BAD; 816 return NOTIFY_BAD;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 81324d12eb35..d22579087e27 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -22,9 +22,9 @@
22 22
23static DEFINE_SPINLOCK(print_lock); 23static DEFINE_SPINLOCK(print_lock);
24 24
25static DEFINE_PER_CPU(unsigned long, touch_timestamp); 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, print_timestamp); 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28 28
29static int __read_mostly did_panic; 29static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 30int __read_mostly softlockup_thresh = 60;
@@ -70,12 +70,12 @@ static void __touch_softlockup_watchdog(void)
70{ 70{
71 int this_cpu = raw_smp_processor_id(); 71 int this_cpu = raw_smp_processor_id();
72 72
73 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); 73 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
74} 74}
75 75
76void touch_softlockup_watchdog(void) 76void touch_softlockup_watchdog(void)
77{ 77{
78 __raw_get_cpu_var(touch_timestamp) = 0; 78 __raw_get_cpu_var(softlockup_touch_ts) = 0;
79} 79}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 80EXPORT_SYMBOL(touch_softlockup_watchdog);
81 81
@@ -85,7 +85,7 @@ void touch_all_softlockup_watchdogs(void)
85 85
86 /* Cause each CPU to re-update its timestamp rather than complain */ 86 /* Cause each CPU to re-update its timestamp rather than complain */
87 for_each_online_cpu(cpu) 87 for_each_online_cpu(cpu)
88 per_cpu(touch_timestamp, cpu) = 0; 88 per_cpu(softlockup_touch_ts, cpu) = 0;
89} 89}
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 91
@@ -104,28 +104,28 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
104void softlockup_tick(void) 104void softlockup_tick(void)
105{ 105{
106 int this_cpu = smp_processor_id(); 106 int this_cpu = smp_processor_id();
107 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 107 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
108 unsigned long print_timestamp; 108 unsigned long print_ts;
109 struct pt_regs *regs = get_irq_regs(); 109 struct pt_regs *regs = get_irq_regs();
110 unsigned long now; 110 unsigned long now;
111 111
112 /* Is detection switched off? */ 112 /* Is detection switched off? */
113 if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { 113 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
114 /* Be sure we don't false trigger if switched back on */ 114 /* Be sure we don't false trigger if switched back on */
115 if (touch_timestamp) 115 if (touch_ts)
116 per_cpu(touch_timestamp, this_cpu) = 0; 116 per_cpu(softlockup_touch_ts, this_cpu) = 0;
117 return; 117 return;
118 } 118 }
119 119
120 if (touch_timestamp == 0) { 120 if (touch_ts == 0) {
121 __touch_softlockup_watchdog(); 121 __touch_softlockup_watchdog();
122 return; 122 return;
123 } 123 }
124 124
125 print_timestamp = per_cpu(print_timestamp, this_cpu); 125 print_ts = per_cpu(softlockup_print_ts, this_cpu);
126 126
127 /* report at most once a second */ 127 /* report at most once a second */
128 if (print_timestamp == touch_timestamp || did_panic) 128 if (print_ts == touch_ts || did_panic)
129 return; 129 return;
130 130
131 /* do not print during early bootup: */ 131 /* do not print during early bootup: */
@@ -140,18 +140,18 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 140 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 141 * threshold timespan.
142 */ 142 */
143 if (now > touch_timestamp + softlockup_thresh/2) 143 if (now > touch_ts + softlockup_thresh/2)
144 wake_up_process(per_cpu(watchdog_task, this_cpu)); 144 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 145
146 /* Warn about unreasonable delays: */ 146 /* Warn about unreasonable delays: */
147 if (now <= (touch_timestamp + softlockup_thresh)) 147 if (now <= (touch_ts + softlockup_thresh))
148 return; 148 return;
149 149
150 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 150 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
151 151
152 spin_lock(&print_lock); 152 spin_lock(&print_lock);
153 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 153 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
154 this_cpu, now - touch_timestamp, 154 this_cpu, now - touch_ts,
155 current->comm, task_pid_nr(current)); 155 current->comm, task_pid_nr(current));
156 print_modules(); 156 print_modules();
157 print_irqtrace_events(current); 157 print_irqtrace_events(current);
@@ -209,32 +209,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
209 switch (action) { 209 switch (action) {
210 case CPU_UP_PREPARE: 210 case CPU_UP_PREPARE:
211 case CPU_UP_PREPARE_FROZEN: 211 case CPU_UP_PREPARE_FROZEN:
212 BUG_ON(per_cpu(watchdog_task, hotcpu)); 212 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
213 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 213 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
214 if (IS_ERR(p)) { 214 if (IS_ERR(p)) {
215 printk(KERN_ERR "watchdog for %i failed\n", hotcpu); 215 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
216 return NOTIFY_BAD; 216 return NOTIFY_BAD;
217 } 217 }
218 per_cpu(touch_timestamp, hotcpu) = 0; 218 per_cpu(softlockup_touch_ts, hotcpu) = 0;
219 per_cpu(watchdog_task, hotcpu) = p; 219 per_cpu(softlockup_watchdog, hotcpu) = p;
220 kthread_bind(p, hotcpu); 220 kthread_bind(p, hotcpu);
221 break; 221 break;
222 case CPU_ONLINE: 222 case CPU_ONLINE:
223 case CPU_ONLINE_FROZEN: 223 case CPU_ONLINE_FROZEN:
224 wake_up_process(per_cpu(watchdog_task, hotcpu)); 224 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
225 break; 225 break;
226#ifdef CONFIG_HOTPLUG_CPU 226#ifdef CONFIG_HOTPLUG_CPU
227 case CPU_UP_CANCELED: 227 case CPU_UP_CANCELED:
228 case CPU_UP_CANCELED_FROZEN: 228 case CPU_UP_CANCELED_FROZEN:
229 if (!per_cpu(watchdog_task, hotcpu)) 229 if (!per_cpu(softlockup_watchdog, hotcpu))
230 break; 230 break;
231 /* Unbind so it can run. Fall thru. */ 231 /* Unbind so it can run. Fall thru. */
232 kthread_bind(per_cpu(watchdog_task, hotcpu), 232 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
233 cpumask_any(cpu_online_mask)); 233 cpumask_any(cpu_online_mask));
234 case CPU_DEAD: 234 case CPU_DEAD:
235 case CPU_DEAD_FROZEN: 235 case CPU_DEAD_FROZEN:
236 p = per_cpu(watchdog_task, hotcpu); 236 p = per_cpu(softlockup_watchdog, hotcpu);
237 per_cpu(watchdog_task, hotcpu) = NULL; 237 per_cpu(softlockup_watchdog, hotcpu) = NULL;
238 kthread_stop(p); 238 kthread_stop(p);
239 break; 239 break;
240#endif /* CONFIG_HOTPLUG_CPU */ 240#endif /* CONFIG_HOTPLUG_CPU */
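softlockup.c only renames its per-CPU variables here (touch_timestamp, print_timestamp and watchdog_task gain a softlockup_ prefix); behaviour is unchanged. For reference, the exported hook that zeroes the touch timestamp is used like this when code legitimately monopolises a CPU (the polling loop is illustrative):

	#include <linux/io.h>
	#include <linux/sched.h>	/* touch_softlockup_watchdog() */

	static void my_long_poll(void __iomem *status_reg)
	{
		/* busy-wait that may exceed softlockup_thresh seconds */
		while (!(readl(status_reg) & 0x1)) {
			cpu_relax();
			touch_softlockup_watchdog();	/* avoid a false "soft lockup" report */
		}
	}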
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2f..be6517fb9c14 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,193 +21,72 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
25int __lockfunc _spin_trylock(spinlock_t *lock)
26{
27 return __spin_trylock(lock);
28}
29EXPORT_SYMBOL(_spin_trylock);
30#endif
31
32#ifndef _read_trylock
33int __lockfunc _read_trylock(rwlock_t *lock)
34{
35 return __read_trylock(lock);
36}
37EXPORT_SYMBOL(_read_trylock);
38#endif
39
40#ifndef _write_trylock
41int __lockfunc _write_trylock(rwlock_t *lock)
42{
43 return __write_trylock(lock);
44}
45EXPORT_SYMBOL(_write_trylock);
46#endif
47
48/* 24/*
49 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
50 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 26 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
51 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 27 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
52 */ 28 */
53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 29#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
54
55#ifndef _read_lock
56void __lockfunc _read_lock(rwlock_t *lock)
57{
58 __read_lock(lock);
59}
60EXPORT_SYMBOL(_read_lock);
61#endif
62
63#ifndef _spin_lock_irqsave
64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
65{
66 return __spin_lock_irqsave(lock);
67}
68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
70
71#ifndef _spin_lock_irq
72void __lockfunc _spin_lock_irq(spinlock_t *lock)
73{
74 __spin_lock_irq(lock);
75}
76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
78
79#ifndef _spin_lock_bh
80void __lockfunc _spin_lock_bh(spinlock_t *lock)
81{
82 __spin_lock_bh(lock);
83}
84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
86
87#ifndef _read_lock_irqsave
88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
89{
90 return __read_lock_irqsave(lock);
91}
92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
94
95#ifndef _read_lock_irq
96void __lockfunc _read_lock_irq(rwlock_t *lock)
97{
98 __read_lock_irq(lock);
99}
100EXPORT_SYMBOL(_read_lock_irq);
101#endif
102
103#ifndef _read_lock_bh
104void __lockfunc _read_lock_bh(rwlock_t *lock)
105{
106 __read_lock_bh(lock);
107}
108EXPORT_SYMBOL(_read_lock_bh);
109#endif
110
111#ifndef _write_lock_irqsave
112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
113{
114 return __write_lock_irqsave(lock);
115}
116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
118
119#ifndef _write_lock_irq
120void __lockfunc _write_lock_irq(rwlock_t *lock)
121{
122 __write_lock_irq(lock);
123}
124EXPORT_SYMBOL(_write_lock_irq);
125#endif
126
127#ifndef _write_lock_bh
128void __lockfunc _write_lock_bh(rwlock_t *lock)
129{
130 __write_lock_bh(lock);
131}
132EXPORT_SYMBOL(_write_lock_bh);
133#endif
134
135#ifndef _spin_lock
136void __lockfunc _spin_lock(spinlock_t *lock)
137{
138 __spin_lock(lock);
139}
140EXPORT_SYMBOL(_spin_lock);
141#endif
142
143#ifndef _write_lock
144void __lockfunc _write_lock(rwlock_t *lock)
145{
146 __write_lock(lock);
147}
148EXPORT_SYMBOL(_write_lock);
149#endif
150
151#else /* CONFIG_PREEMPT: */
152
153/* 30/*
31 * The __lock_function inlines are taken from
32 * include/linux/spinlock_api_smp.h
33 */
34#else
35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l)
37/*
38 * We build the __lock_function inlines here. They are too large for
 39 * inlining all over the place, but here there is only one user per function
 40 * which embeds them into the calling _lock_function below.
41 *
154 * This could be a long-held lock. We both prepare to spin for a long 42 * This could be a long-held lock. We both prepare to spin for a long
155 * time (making _this_ CPU preemptable if possible), and we also signal 43 * time (making _this_ CPU preemptable if possible), and we also signal
156 * towards that other CPU that it should break the lock ASAP. 44 * towards that other CPU that it should break the lock ASAP.
157 *
158 * (We do this in a function because inlining it would be excessive.)
159 */ 45 */
160
161#define BUILD_LOCK_OPS(op, locktype) \ 46#define BUILD_LOCK_OPS(op, locktype) \
162void __lockfunc _##op##_lock(locktype##_t *lock) \ 47void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
163{ \ 48{ \
164 for (;;) { \ 49 for (;;) { \
165 preempt_disable(); \ 50 preempt_disable(); \
166 if (likely(_raw_##op##_trylock(lock))) \ 51 if (likely(do_raw_##op##_trylock(lock))) \
167 break; \ 52 break; \
168 preempt_enable(); \ 53 preempt_enable(); \
169 \ 54 \
170 if (!(lock)->break_lock) \ 55 if (!(lock)->break_lock) \
171 (lock)->break_lock = 1; \ 56 (lock)->break_lock = 1; \
172 while (!op##_can_lock(lock) && (lock)->break_lock) \ 57 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
173 _raw_##op##_relax(&lock->raw_lock); \ 58 arch_##op##_relax(&lock->raw_lock); \
174 } \ 59 } \
175 (lock)->break_lock = 0; \ 60 (lock)->break_lock = 0; \
176} \ 61} \
177 \ 62 \
178EXPORT_SYMBOL(_##op##_lock); \ 63unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
179 \
180unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
181{ \ 64{ \
182 unsigned long flags; \ 65 unsigned long flags; \
183 \ 66 \
184 for (;;) { \ 67 for (;;) { \
185 preempt_disable(); \ 68 preempt_disable(); \
186 local_irq_save(flags); \ 69 local_irq_save(flags); \
187 if (likely(_raw_##op##_trylock(lock))) \ 70 if (likely(do_raw_##op##_trylock(lock))) \
188 break; \ 71 break; \
189 local_irq_restore(flags); \ 72 local_irq_restore(flags); \
190 preempt_enable(); \ 73 preempt_enable(); \
191 \ 74 \
192 if (!(lock)->break_lock) \ 75 if (!(lock)->break_lock) \
193 (lock)->break_lock = 1; \ 76 (lock)->break_lock = 1; \
194 while (!op##_can_lock(lock) && (lock)->break_lock) \ 77 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
195 _raw_##op##_relax(&lock->raw_lock); \ 78 arch_##op##_relax(&lock->raw_lock); \
196 } \ 79 } \
197 (lock)->break_lock = 0; \ 80 (lock)->break_lock = 0; \
198 return flags; \ 81 return flags; \
199} \ 82} \
200 \ 83 \
201EXPORT_SYMBOL(_##op##_lock_irqsave); \ 84void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
202 \
203void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
204{ \ 85{ \
205 _##op##_lock_irqsave(lock); \ 86 _raw_##op##_lock_irqsave(lock); \
206} \ 87} \
207 \ 88 \
208EXPORT_SYMBOL(_##op##_lock_irq); \ 89void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
209 \
210void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
211{ \ 90{ \
212 unsigned long flags; \ 91 unsigned long flags; \
213 \ 92 \
@@ -216,164 +95,283 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
216 /* irq-disabling. We use the generic preemption-aware */ \ 95 /* irq-disabling. We use the generic preemption-aware */ \
217 /* function: */ \ 96 /* function: */ \
218 /**/ \ 97 /**/ \
219 flags = _##op##_lock_irqsave(lock); \ 98 flags = _raw_##op##_lock_irqsave(lock); \
220 local_bh_disable(); \ 99 local_bh_disable(); \
221 local_irq_restore(flags); \ 100 local_irq_restore(flags); \
222} \ 101} \
223 \
224EXPORT_SYMBOL(_##op##_lock_bh)
225 102
226/* 103/*
227 * Build preemption-friendly versions of the following 104 * Build preemption-friendly versions of the following
228 * lock-spinning functions: 105 * lock-spinning functions:
229 * 106 *
230 * _[spin|read|write]_lock() 107 * __[spin|read|write]_lock()
231 * _[spin|read|write]_lock_irq() 108 * __[spin|read|write]_lock_irq()
232 * _[spin|read|write]_lock_irqsave() 109 * __[spin|read|write]_lock_irqsave()
233 * _[spin|read|write]_lock_bh() 110 * __[spin|read|write]_lock_bh()
234 */ 111 */
235BUILD_LOCK_OPS(spin, spinlock); 112BUILD_LOCK_OPS(spin, raw_spinlock);
236BUILD_LOCK_OPS(read, rwlock); 113BUILD_LOCK_OPS(read, rwlock);
237BUILD_LOCK_OPS(write, rwlock); 114BUILD_LOCK_OPS(write, rwlock);
238 115
239#endif /* CONFIG_PREEMPT */ 116#endif
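BUILD_LOCK_OPS() now takes the lock type as well as the operation, and the generated functions carry a __raw_ prefix instead of being exported here directly. Hand-expanding the first invocation, BUILD_LOCK_OPS(spin, raw_spinlock), gives roughly this (whitespace tidied, for illustration only):

	void __lockfunc __raw_spin_lock(raw_spinlock_t *lock)
	{
		for (;;) {
			preempt_disable();
			if (likely(do_raw_spin_trylock(lock)))
				break;
			preempt_enable();

			if (!(lock)->break_lock)
				(lock)->break_lock = 1;
			while (!raw_spin_can_lock(lock) && (lock)->break_lock)
				arch_spin_relax(&lock->raw_lock);
		}
		(lock)->break_lock = 0;
	}

The EXPORT_SYMBOL()s that used to live inside the macro move to the per-operation _raw_* wrappers added below, where each can be compiled out individually by the new CONFIG_INLINE_* options.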
240 117
241#ifdef CONFIG_DEBUG_LOCK_ALLOC 118#ifndef CONFIG_INLINE_SPIN_TRYLOCK
119int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
120{
121 return __raw_spin_trylock(lock);
122}
123EXPORT_SYMBOL(_raw_spin_trylock);
124#endif
242 125
243void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) 126#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
127int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
244{ 128{
245 preempt_disable(); 129 return __raw_spin_trylock_bh(lock);
246 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
247 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
248} 130}
249EXPORT_SYMBOL(_spin_lock_nested); 131EXPORT_SYMBOL(_raw_spin_trylock_bh);
132#endif
250 133
251unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 134#ifndef CONFIG_INLINE_SPIN_LOCK
135void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
252{ 136{
253 unsigned long flags; 137 __raw_spin_lock(lock);
138}
139EXPORT_SYMBOL(_raw_spin_lock);
140#endif
254 141
255 local_irq_save(flags); 142#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
256 preempt_disable(); 143unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
257 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 144{
258 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, 145 return __raw_spin_lock_irqsave(lock);
259 _raw_spin_lock_flags, &flags);
260 return flags;
261} 146}
262EXPORT_SYMBOL(_spin_lock_irqsave_nested); 147EXPORT_SYMBOL(_raw_spin_lock_irqsave);
148#endif
263 149
264void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, 150#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
265 struct lockdep_map *nest_lock) 151void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
266{ 152{
267 preempt_disable(); 153 __raw_spin_lock_irq(lock);
268 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
269 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
270} 154}
271EXPORT_SYMBOL(_spin_lock_nest_lock); 155EXPORT_SYMBOL(_raw_spin_lock_irq);
156#endif
272 157
158#ifndef CONFIG_INLINE_SPIN_LOCK_BH
159void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
160{
161 __raw_spin_lock_bh(lock);
162}
163EXPORT_SYMBOL(_raw_spin_lock_bh);
273#endif 164#endif
274 165
275#ifndef _spin_unlock 166#ifndef CONFIG_INLINE_SPIN_UNLOCK
276void __lockfunc _spin_unlock(spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
277{ 168{
278 __spin_unlock(lock); 169 __raw_spin_unlock(lock);
279} 170}
280EXPORT_SYMBOL(_spin_unlock); 171EXPORT_SYMBOL(_raw_spin_unlock);
281#endif 172#endif
282 173
283#ifndef _write_unlock 174#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
284void __lockfunc _write_unlock(rwlock_t *lock) 175void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
285{ 176{
286 __write_unlock(lock); 177 __raw_spin_unlock_irqrestore(lock, flags);
287} 178}
288EXPORT_SYMBOL(_write_unlock); 179EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
289#endif 180#endif
290 181
291#ifndef _read_unlock 182#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
292void __lockfunc _read_unlock(rwlock_t *lock) 183void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
293{ 184{
294 __read_unlock(lock); 185 __raw_spin_unlock_irq(lock);
295} 186}
296EXPORT_SYMBOL(_read_unlock); 187EXPORT_SYMBOL(_raw_spin_unlock_irq);
297#endif 188#endif
298 189
299#ifndef _spin_unlock_irqrestore 190#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 191void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
301{ 192{
302 __spin_unlock_irqrestore(lock, flags); 193 __raw_spin_unlock_bh(lock);
303} 194}
304EXPORT_SYMBOL(_spin_unlock_irqrestore); 195EXPORT_SYMBOL(_raw_spin_unlock_bh);
305#endif 196#endif
306 197
307#ifndef _spin_unlock_irq 198#ifndef CONFIG_INLINE_READ_TRYLOCK
308void __lockfunc _spin_unlock_irq(spinlock_t *lock) 199int __lockfunc _raw_read_trylock(rwlock_t *lock)
309{ 200{
310 __spin_unlock_irq(lock); 201 return __raw_read_trylock(lock);
311} 202}
312EXPORT_SYMBOL(_spin_unlock_irq); 203EXPORT_SYMBOL(_raw_read_trylock);
313#endif 204#endif
314 205
315#ifndef _spin_unlock_bh 206#ifndef CONFIG_INLINE_READ_LOCK
316void __lockfunc _spin_unlock_bh(spinlock_t *lock) 207void __lockfunc _raw_read_lock(rwlock_t *lock)
317{ 208{
318 __spin_unlock_bh(lock); 209 __raw_read_lock(lock);
319} 210}
320EXPORT_SYMBOL(_spin_unlock_bh); 211EXPORT_SYMBOL(_raw_read_lock);
321#endif 212#endif
322 213
323#ifndef _read_unlock_irqrestore 214#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 215unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
325{ 216{
326 __read_unlock_irqrestore(lock, flags); 217 return __raw_read_lock_irqsave(lock);
327} 218}
328EXPORT_SYMBOL(_read_unlock_irqrestore); 219EXPORT_SYMBOL(_raw_read_lock_irqsave);
329#endif 220#endif
330 221
331#ifndef _read_unlock_irq 222#ifndef CONFIG_INLINE_READ_LOCK_IRQ
332void __lockfunc _read_unlock_irq(rwlock_t *lock) 223void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
333{ 224{
334 __read_unlock_irq(lock); 225 __raw_read_lock_irq(lock);
335} 226}
336EXPORT_SYMBOL(_read_unlock_irq); 227EXPORT_SYMBOL(_raw_read_lock_irq);
337#endif 228#endif
338 229
339#ifndef _read_unlock_bh 230#ifndef CONFIG_INLINE_READ_LOCK_BH
340void __lockfunc _read_unlock_bh(rwlock_t *lock) 231void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
341{ 232{
342 __read_unlock_bh(lock); 233 __raw_read_lock_bh(lock);
343} 234}
344EXPORT_SYMBOL(_read_unlock_bh); 235EXPORT_SYMBOL(_raw_read_lock_bh);
345#endif 236#endif
346 237
347#ifndef _write_unlock_irqrestore 238#ifndef CONFIG_INLINE_READ_UNLOCK
348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 239void __lockfunc _raw_read_unlock(rwlock_t *lock)
349{ 240{
350 __write_unlock_irqrestore(lock, flags); 241 __raw_read_unlock(lock);
351} 242}
352EXPORT_SYMBOL(_write_unlock_irqrestore); 243EXPORT_SYMBOL(_raw_read_unlock);
353#endif 244#endif
354 245
355#ifndef _write_unlock_irq 246#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
356void __lockfunc _write_unlock_irq(rwlock_t *lock) 247void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
357{ 248{
358 __write_unlock_irq(lock); 249 __raw_read_unlock_irqrestore(lock, flags);
359} 250}
360EXPORT_SYMBOL(_write_unlock_irq); 251EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
361#endif 252#endif
362 253
363#ifndef _write_unlock_bh 254#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
364void __lockfunc _write_unlock_bh(rwlock_t *lock) 255void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
365{ 256{
366 __write_unlock_bh(lock); 257 __raw_read_unlock_irq(lock);
367} 258}
368EXPORT_SYMBOL(_write_unlock_bh); 259EXPORT_SYMBOL(_raw_read_unlock_irq);
369#endif 260#endif
370 261
371#ifndef _spin_trylock_bh 262#ifndef CONFIG_INLINE_READ_UNLOCK_BH
372int __lockfunc _spin_trylock_bh(spinlock_t *lock) 263void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
373{ 264{
374 return __spin_trylock_bh(lock); 265 __raw_read_unlock_bh(lock);
375} 266}
376EXPORT_SYMBOL(_spin_trylock_bh); 267EXPORT_SYMBOL(_raw_read_unlock_bh);
268#endif
269
270#ifndef CONFIG_INLINE_WRITE_TRYLOCK
271int __lockfunc _raw_write_trylock(rwlock_t *lock)
272{
273 return __raw_write_trylock(lock);
274}
275EXPORT_SYMBOL(_raw_write_trylock);
276#endif
277
278#ifndef CONFIG_INLINE_WRITE_LOCK
279void __lockfunc _raw_write_lock(rwlock_t *lock)
280{
281 __raw_write_lock(lock);
282}
283EXPORT_SYMBOL(_raw_write_lock);
284#endif
285
286#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
287unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
288{
289 return __raw_write_lock_irqsave(lock);
290}
291EXPORT_SYMBOL(_raw_write_lock_irqsave);
292#endif
293
294#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
295void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
296{
297 __raw_write_lock_irq(lock);
298}
299EXPORT_SYMBOL(_raw_write_lock_irq);
300#endif
301
302#ifndef CONFIG_INLINE_WRITE_LOCK_BH
303void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
304{
305 __raw_write_lock_bh(lock);
306}
307EXPORT_SYMBOL(_raw_write_lock_bh);
308#endif
309
310#ifndef CONFIG_INLINE_WRITE_UNLOCK
311void __lockfunc _raw_write_unlock(rwlock_t *lock)
312{
313 __raw_write_unlock(lock);
314}
315EXPORT_SYMBOL(_raw_write_unlock);
316#endif
317
318#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
319void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
320{
321 __raw_write_unlock_irqrestore(lock, flags);
322}
323EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
324#endif
325
326#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
327void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
328{
329 __raw_write_unlock_irq(lock);
330}
331EXPORT_SYMBOL(_raw_write_unlock_irq);
332#endif
333
334#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
335void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
336{
337 __raw_write_unlock_bh(lock);
338}
339EXPORT_SYMBOL(_raw_write_unlock_bh);
340#endif
341
342#ifdef CONFIG_DEBUG_LOCK_ALLOC
343
344void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
345{
346 preempt_disable();
347 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
348 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
349}
350EXPORT_SYMBOL(_raw_spin_lock_nested);
351
352unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
353 int subclass)
354{
355 unsigned long flags;
356
357 local_irq_save(flags);
358 preempt_disable();
359 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
360 LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
361 do_raw_spin_lock_flags, &flags);
362 return flags;
363}
364EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
365
366void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
367 struct lockdep_map *nest_lock)
368{
369 preempt_disable();
370 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
371 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
372}
373EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
374
377#endif 375#endif
378 376
379notrace int in_lock_functions(unsigned long addr) 377notrace int in_lock_functions(unsigned long addr)
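Every out-of-line definition above is now guarded by #ifndef CONFIG_INLINE_<OP>; those symbols are what the new kernel/Kconfig.locks options control. When an architecture selects inlining for a given operation, the body here is compiled out and callers pick up the __raw_* inline instead. A sketch of the header-side counterpart (this is an assumption about include/linux/spinlock_api_smp.h, not text from this diff):

	/* include/linux/spinlock_api_smp.h, sketch only */
	void __lockfunc _raw_spin_lock(raw_spinlock_t *lock);

	#ifdef CONFIG_INLINE_SPIN_LOCK
	#define _raw_spin_lock(lock) __raw_spin_lock(lock)
	#endif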
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..818d7d9aa03c 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp)
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM); 50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 51}
52EXPORT_SYMBOL_GPL(init_srcu_struct);
52 53
53/* 54/*
54 * srcu_readers_active_idx -- returns approximate number of readers 55 * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
97 free_percpu(sp->per_cpu_ref); 98 free_percpu(sp->per_cpu_ref);
98 sp->per_cpu_ref = NULL; 99 sp->per_cpu_ref = NULL;
99} 100}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
100 102
101/** 103/**
102 * srcu_read_lock - register a new reader for an SRCU-protected structure. 104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
@@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp)
118 preempt_enable(); 120 preempt_enable();
119 return idx; 121 return idx;
120} 122}
123EXPORT_SYMBOL_GPL(srcu_read_lock);
121 124
122/** 125/**
123 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. 126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
@@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx)
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
137 preempt_enable(); 140 preempt_enable();
138} 141}
142EXPORT_SYMBOL_GPL(srcu_read_unlock);
139 143
140/** 144/*
141 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
142 * @sp: srcu_struct with which to synchronize.
143 *
144 * Flip the completed counter, and wait for the old count to drain to zero.
145 * As with classic RCU, the updater must use some separate means of
146 * synchronizing concurrent updates. Can block; must be called from
147 * process context.
148 *
149 * Note that it is illegal to call synchornize_srcu() from the corresponding
150 * SRCU read-side critical section; doing so will result in deadlock.
151 * However, it is perfectly legal to call synchronize_srcu() on one
152 * srcu_struct from some other srcu_struct's read-side critical section.
153 */ 146 */
154void synchronize_srcu(struct srcu_struct *sp) 147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
155{ 148{
156 int idx; 149 int idx;
157 150
@@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp)
173 return; 166 return;
174 } 167 }
175 168
176 synchronize_sched(); /* Force memory barrier on all CPUs. */ 169 sync_func(); /* Force memory barrier on all CPUs. */
177 170
178 /* 171 /*
179 * The preceding synchronize_sched() ensures that any CPU that 172 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp)
190 idx = sp->completed & 0x1; 183 idx = sp->completed & 0x1;
191 sp->completed++; 184 sp->completed++;
192 185
193 synchronize_sched(); /* Force memory barrier on all CPUs. */ 186 sync_func(); /* Force memory barrier on all CPUs. */
194 187
195 /* 188 /*
196 * At this point, because of the preceding synchronize_sched(), 189 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp)
203 while (srcu_readers_active_idx(sp, idx)) 196 while (srcu_readers_active_idx(sp, idx))
204 schedule_timeout_interruptible(1); 197 schedule_timeout_interruptible(1);
205 198
206 synchronize_sched(); /* Force memory barrier on all CPUs. */ 199 sync_func(); /* Force memory barrier on all CPUs. */
207 200
208 /* 201 /*
209 * The preceding synchronize_sched() forces all srcu_read_unlock() 202 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp)
237} 230}
238 231
239/** 232/**
233 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
234 * @sp: srcu_struct with which to synchronize.
235 *
236 * Flip the completed counter, and wait for the old count to drain to zero.
237 * As with classic RCU, the updater must use some separate means of
238 * synchronizing concurrent updates. Can block; must be called from
239 * process context.
240 *
241 * Note that it is illegal to call synchronize_srcu() from the corresponding
242 * SRCU read-side critical section; doing so will result in deadlock.
243 * However, it is perfectly legal to call synchronize_srcu() on one
244 * srcu_struct from some other srcu_struct's read-side critical section.
245 */
246void synchronize_srcu(struct srcu_struct *sp)
247{
248 __synchronize_srcu(sp, synchronize_sched);
249}
250EXPORT_SYMBOL_GPL(synchronize_srcu);
251
252/**
253 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
254 * @sp: srcu_struct with which to synchronize.
255 *
256 * Flip the completed counter, and wait for the old count to drain to zero.
257 * As with classic RCU, the updater must use some separate means of
258 * synchronizing concurrent updates. Can block; must be called from
259 * process context.
260 *
261 * Note that it is illegal to call synchronize_srcu_expedited()
262 * from the corresponding SRCU read-side critical section; doing so
263 * will result in deadlock. However, it is perfectly legal to call
264 * synchronize_srcu_expedited() on one srcu_struct from some other
265 * srcu_struct's read-side critical section.
266 */
267void synchronize_srcu_expedited(struct srcu_struct *sp)
268{
269 __synchronize_srcu(sp, synchronize_sched_expedited);
270}
271EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
272
273/**
240 * srcu_batches_completed - return batches completed. 274 * srcu_batches_completed - return batches completed.
241 * @sp: srcu_struct on which to report batch completion. 275 * @sp: srcu_struct on which to report batch completion.
242 * 276 *
@@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
248{ 282{
249 return sp->completed; 283 return sp->completed;
250} 284}
251
252EXPORT_SYMBOL_GPL(init_srcu_struct);
253EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
254EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 285EXPORT_SYMBOL_GPL(srcu_batches_completed);
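synchronize_srcu() and the new synchronize_srcu_expedited() now share __synchronize_srcu() and differ only in which scheduler-side barrier they pass in (synchronize_sched() versus synchronize_sched_expedited()). A small updater/reader sketch using only functions visible in this diff plus the standard RCU pointer accessors; my_ names are illustrative and my_read() assumes the pointer has been published:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/srcu.h>

	struct my_data {
		int value;
	};

	static struct srcu_struct my_srcu;
	static struct my_data *my_ptr;

	static int my_setup(void)
	{
		return init_srcu_struct(&my_srcu);
	}

	static int my_read(void)
	{
		int idx, val;

		idx = srcu_read_lock(&my_srcu);
		val = rcu_dereference(my_ptr)->value;	/* SRCU readers may sleep here */
		srcu_read_unlock(&my_srcu, idx);
		return val;
	}

	static void my_update(struct my_data *new)
	{
		struct my_data *old = my_ptr;

		rcu_assign_pointer(my_ptr, new);
		/* expedited grace period: costlier system-wide, returns much sooner */
		synchronize_srcu_expedited(&my_srcu);
		kfree(old);
	}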
diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760d9c51..20ccfb5da6af 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/smp_lock.h>
12#include <linux/notifier.h> 11#include <linux/notifier.h>
13#include <linux/reboot.h> 12#include <linux/reboot.h>
14#include <linux/prctl.h> 13#include <linux/prctl.h>
@@ -190,10 +189,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
190 !(user = find_user(who))) 189 !(user = find_user(who)))
191 goto out_unlock; /* No processes for this user */ 190 goto out_unlock; /* No processes for this user */
192 191
193 do_each_thread(g, p) 192 do_each_thread(g, p) {
194 if (__task_cred(p)->uid == who) 193 if (__task_cred(p)->uid == who)
195 error = set_one_prio(p, niceval, error); 194 error = set_one_prio(p, niceval, error);
196 while_each_thread(g, p); 195 } while_each_thread(g, p);
197 if (who != cred->uid) 196 if (who != cred->uid)
198 free_uid(user); /* For find_user() */ 197 free_uid(user); /* For find_user() */
199 break; 198 break;
@@ -253,13 +252,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
253 !(user = find_user(who))) 252 !(user = find_user(who)))
254 goto out_unlock; /* No processes for this user */ 253 goto out_unlock; /* No processes for this user */
255 254
256 do_each_thread(g, p) 255 do_each_thread(g, p) {
257 if (__task_cred(p)->uid == who) { 256 if (__task_cred(p)->uid == who) {
258 niceval = 20 - task_nice(p); 257 niceval = 20 - task_nice(p);
259 if (niceval > retval) 258 if (niceval > retval)
260 retval = niceval; 259 retval = niceval;
261 } 260 }
262 while_each_thread(g, p); 261 } while_each_thread(g, p);
263 if (who != cred->uid) 262 if (who != cred->uid)
264 free_uid(user); /* for find_user() */ 263 free_uid(user); /* for find_user() */
265 break; 264 break;
@@ -349,6 +348,9 @@ void kernel_power_off(void)
349 machine_power_off(); 348 machine_power_off();
350} 349}
351EXPORT_SYMBOL_GPL(kernel_power_off); 350EXPORT_SYMBOL_GPL(kernel_power_off);
351
352static DEFINE_MUTEX(reboot_mutex);
353
352/* 354/*
353 * Reboot system call: for obvious reasons only root may call it, 355 * Reboot system call: for obvious reasons only root may call it,
354 * and even root needs to set up some magic numbers in the registers 356 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
381 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 383 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
382 cmd = LINUX_REBOOT_CMD_HALT; 384 cmd = LINUX_REBOOT_CMD_HALT;
383 385
384 lock_kernel(); 386 mutex_lock(&reboot_mutex);
385 switch (cmd) { 387 switch (cmd) {
386 case LINUX_REBOOT_CMD_RESTART: 388 case LINUX_REBOOT_CMD_RESTART:
387 kernel_restart(NULL); 389 kernel_restart(NULL);
@@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 399
398 case LINUX_REBOOT_CMD_HALT: 400 case LINUX_REBOOT_CMD_HALT:
399 kernel_halt(); 401 kernel_halt();
400 unlock_kernel();
401 do_exit(0); 402 do_exit(0);
402 panic("cannot halt"); 403 panic("cannot halt");
403 404
404 case LINUX_REBOOT_CMD_POWER_OFF: 405 case LINUX_REBOOT_CMD_POWER_OFF:
405 kernel_power_off(); 406 kernel_power_off();
406 unlock_kernel();
407 do_exit(0); 407 do_exit(0);
408 break; 408 break;
409 409
410 case LINUX_REBOOT_CMD_RESTART2: 410 case LINUX_REBOOT_CMD_RESTART2:
411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
412 unlock_kernel(); 412 ret = -EFAULT;
413 return -EFAULT; 413 break;
414 } 414 }
415 buffer[sizeof(buffer) - 1] = '\0'; 415 buffer[sizeof(buffer) - 1] = '\0';
416 416
@@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
433 ret = -EINVAL; 433 ret = -EINVAL;
434 break; 434 break;
435 } 435 }
436 unlock_kernel(); 436 mutex_unlock(&reboot_mutex);
437 return ret; 437 return ret;
438} 438}
439 439
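sys_reboot() drops the big kernel lock in favour of a private reboot_mutex, which also lets the RESTART2 copy failure set ret and break out to the common exit instead of unlocking and returning early. The shape of that conversion in isolation (my_ names are illustrative):

	#include <linux/errno.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(my_cmd_mutex);

	static long my_do_cmd(unsigned int cmd)
	{
		long ret = 0;

		mutex_lock(&my_cmd_mutex);
		switch (cmd) {
		case 0:
			/* do the work */
			break;
		default:
			ret = -EINVAL;
			break;		/* no early unlock + return */
		}
		mutex_unlock(&my_cmd_mutex);
		return ret;
	}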
@@ -911,16 +911,15 @@ change_okay:
911 911
912void do_sys_times(struct tms *tms) 912void do_sys_times(struct tms *tms)
913{ 913{
914 struct task_cputime cputime; 914 cputime_t tgutime, tgstime, cutime, cstime;
915 cputime_t cutime, cstime;
916 915
917 thread_group_cputime(current, &cputime);
918 spin_lock_irq(&current->sighand->siglock); 916 spin_lock_irq(&current->sighand->siglock);
917 thread_group_times(current, &tgutime, &tgstime);
919 cutime = current->signal->cutime; 918 cutime = current->signal->cutime;
920 cstime = current->signal->cstime; 919 cstime = current->signal->cstime;
921 spin_unlock_irq(&current->sighand->siglock); 920 spin_unlock_irq(&current->sighand->siglock);
922 tms->tms_utime = cputime_to_clock_t(cputime.utime); 921 tms->tms_utime = cputime_to_clock_t(tgutime);
923 tms->tms_stime = cputime_to_clock_t(cputime.stime); 922 tms->tms_stime = cputime_to_clock_t(tgstime);
924 tms->tms_cutime = cputime_to_clock_t(cutime); 923 tms->tms_cutime = cputime_to_clock_t(cutime);
925 tms->tms_cstime = cputime_to_clock_t(cstime); 924 tms->tms_cstime = cputime_to_clock_t(cstime);
926} 925}
@@ -1338,16 +1337,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
-	cputime_t utime, stime;
-	struct task_cputime cputime;
+	cputime_t tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
 
 	memset((char *) r, 0, sizeof *r);
 	utime = stime = cputime_zero;
 
 	if (who == RUSAGE_THREAD) {
-		utime = task_utime(current);
-		stime = task_stime(current);
+		task_times(current, &utime, &stime);
 		accumulate_thread_rusage(p, r);
 		maxrss = p->signal->maxrss;
 		goto out;
@@ -1373,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 		break;
 
 	case RUSAGE_SELF:
-		thread_group_cputime(p, &cputime);
-		utime = cputime_add(utime, cputime.utime);
-		stime = cputime_add(stime, cputime.stime);
+		thread_group_times(p, &tgutime, &tgstime);
+		utime = cputime_add(utime, tgutime);
+		stime = cputime_add(stime, tgstime);
 		r->ru_nvcsw += p->signal->nvcsw;
 		r->ru_nivcsw += p->signal->nivcsw;
 		r->ru_minflt += p->signal->min_flt;
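From userspace the effect of the k_getrusage() changes is only visible through getrusage(2): RUSAGE_THREAD (now filled via task_times()) reports the calling thread, RUSAGE_SELF (filled via thread_group_times()) the whole thread group. A small standalone program exercising both, using only the standard glibc API:

#define _GNU_SOURCE		/* for RUSAGE_THREAD */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rusage self, thread;

	getrusage(RUSAGE_SELF, &self);		/* whole thread group */
	getrusage(RUSAGE_THREAD, &thread);	/* calling thread only */

	printf("group  utime: %ld.%06ld s\n",
	       (long)self.ru_utime.tv_sec, (long)self.ru_utime.tv_usec);
	printf("thread utime: %ld.%06ld s\n",
	       (long)thread.ru_utime.tv_sec, (long)thread.ru_utime.tv_usec);
	return 0;
}
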
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e06d0b8d1951..695384f12a7d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,8 +48,10 @@ cond_syscall(sys_shutdown);
 cond_syscall(sys_sendmsg);
 cond_syscall(compat_sys_sendmsg);
 cond_syscall(sys_recvmsg);
+cond_syscall(sys_recvmmsg);
 cond_syscall(compat_sys_recvmsg);
 cond_syscall(compat_sys_recvfrom);
+cond_syscall(compat_sys_recvmmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
@@ -139,7 +141,6 @@ cond_syscall(sys_pciconfig_read);
 cond_syscall(sys_pciconfig_write);
 cond_syscall(sys_pciconfig_iobase);
 cond_syscall(sys32_ipc);
-cond_syscall(sys32_sysctl);
 cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
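kernel/sys_ni.c gains cond_syscall() stubs for the new recvmmsg entry points and drops the one for the removed sys32_sysctl. cond_syscall() turns a syscall symbol into a weak reference to sys_ni_syscall, so configurations that compile the real implementation out (here, CONFIG_NET=n) still link and simply return -ENOSYS. The userspace sketch below models the idea with a GCC weak alias; the kernel's actual macro is spelled differently:

#include <errno.h>
#include <stdio.h>

/* Fallback that plays the role of sys_ni_syscall(). */
static long nosys_stub(void)
{
	return -ENOSYS;
}

/* Weak alias: if no strong maybe_feature() is linked in, calls land in
 * the stub above, much as a compiled-out sys_recvmmsg resolves to
 * sys_ni_syscall in the kernel. */
long maybe_feature(void) __attribute__((weak, alias("nosys_stub")));

int main(void)
{
	printf("maybe_feature() = %ld\n", maybe_feature());	/* -38 (ENOSYS) here */
	return 0;
}
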
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c517412..45e4bef0012a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,7 +27,6 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/kmemcheck.h> 29#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h>
31#include <linux/fs.h> 30#include <linux/fs.h>
32#include <linux/init.h> 31#include <linux/init.h>
33#include <linux/kernel.h> 32#include <linux/kernel.h>
@@ -36,6 +35,7 @@
36#include <linux/sysrq.h> 35#include <linux/sysrq.h>
37#include <linux/highuid.h> 36#include <linux/highuid.h>
38#include <linux/writeback.h> 37#include <linux/writeback.h>
38#include <linux/ratelimit.h>
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/initrd.h> 40#include <linux/initrd.h>
41#include <linux/key.h> 41#include <linux/key.h>
@@ -60,7 +60,6 @@
60#include <asm/io.h> 60#include <asm/io.h>
61#endif 61#endif
62 62
63static int deprecated_sysctl_warning(struct __sysctl_args *args);
64 63
65#if defined(CONFIG_SYSCTL) 64#if defined(CONFIG_SYSCTL)
66 65
@@ -158,6 +157,8 @@ extern int no_unaligned_warning;
158extern int unaligned_dump_stack; 157extern int unaligned_dump_stack;
159#endif 158#endif
160 159
160extern struct ratelimit_state printk_ratelimit_state;
161
161#ifdef CONFIG_RT_MUTEXES 162#ifdef CONFIG_RT_MUTEXES
162extern int max_lock_depth; 163extern int max_lock_depth;
163#endif 164#endif
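kernel/sysctl.c now includes <linux/ratelimit.h> and declares printk_ratelimit_state directly, because the printk_ratelimit and printk_ratelimit_burst entries point straight at the interval and burst fields of that structure. A short sketch of how a ratelimit state is normally declared and consumed (demo_rs and the message are illustrative only, not part of the patch):

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ratelimit.h>

/* At most 10 messages every 5 seconds; interval and burst are plain
 * ints inside struct ratelimit_state, which is exactly what the
 * printk_ratelimit{,_burst} sysctl entries poke at. */
static DEFINE_RATELIMIT_STATE(demo_rs, 5 * HZ, 10);

static void demo_warn(void)
{
	if (__ratelimit(&demo_rs))
		printk(KERN_WARNING "demo: something noisy happened\n");
}
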
@@ -207,31 +208,26 @@ extern int lock_stat;
207 208
208static struct ctl_table root_table[] = { 209static struct ctl_table root_table[] = {
209 { 210 {
210 .ctl_name = CTL_KERN,
211 .procname = "kernel", 211 .procname = "kernel",
212 .mode = 0555, 212 .mode = 0555,
213 .child = kern_table, 213 .child = kern_table,
214 }, 214 },
215 { 215 {
216 .ctl_name = CTL_VM,
217 .procname = "vm", 216 .procname = "vm",
218 .mode = 0555, 217 .mode = 0555,
219 .child = vm_table, 218 .child = vm_table,
220 }, 219 },
221 { 220 {
222 .ctl_name = CTL_FS,
223 .procname = "fs", 221 .procname = "fs",
224 .mode = 0555, 222 .mode = 0555,
225 .child = fs_table, 223 .child = fs_table,
226 }, 224 },
227 { 225 {
228 .ctl_name = CTL_DEBUG,
229 .procname = "debug", 226 .procname = "debug",
230 .mode = 0555, 227 .mode = 0555,
231 .child = debug_table, 228 .child = debug_table,
232 }, 229 },
233 { 230 {
234 .ctl_name = CTL_DEV,
235 .procname = "dev", 231 .procname = "dev",
236 .mode = 0555, 232 .mode = 0555,
237 .child = dev_table, 233 .child = dev_table,
@@ -240,7 +236,7 @@ static struct ctl_table root_table[] = {
240 * NOTE: do not add new entries to this table unless you have read 236 * NOTE: do not add new entries to this table unless you have read
241 * Documentation/sysctl/ctl_unnumbered.txt 237 * Documentation/sysctl/ctl_unnumbered.txt
242 */ 238 */
243 { .ctl_name = 0 } 239 { }
244}; 240};
245 241
246#ifdef CONFIG_SCHED_DEBUG 242#ifdef CONFIG_SCHED_DEBUG
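With binary sysctl support gone, the tables lose their .ctl_name and .strategy members, handlers are referenced by plain function name instead of through the address-of operator, and the terminating entry shrinks to { }. A minimal example of registering a table in this new style (all demo_* names are hypothetical, not part of the patch):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/sysctl.h>

static int demo_value;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_value",	/* no .ctl_name, no .strategy */
		.data		= &demo_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,	/* plain name, no '&' */
	},
	{ }	/* empty terminator instead of { .ctl_name = 0 } */
};

static struct ctl_table demo_dir[] = {
	{
		.procname	= "demo",
		.mode		= 0555,
		.child		= demo_table,	/* becomes /proc/sys/demo/demo_value */
	},
	{ }
};

static struct ctl_table_header *demo_hdr;

static int __init demo_sysctl_init(void)
{
	demo_hdr = register_sysctl_table(demo_dir);
	return demo_hdr ? 0 : -ENOMEM;
}
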
@@ -248,196 +244,178 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
248static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
249static int min_wakeup_granularity_ns; /* 0 usecs */ 245static int min_wakeup_granularity_ns; /* 0 usecs */
250static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
247static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
248static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
249static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
251#endif 251#endif
252 252
253static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
254 { 254 {
255 .ctl_name = CTL_UNNUMBERED,
256 .procname = "sched_child_runs_first", 255 .procname = "sched_child_runs_first",
257 .data = &sysctl_sched_child_runs_first, 256 .data = &sysctl_sched_child_runs_first,
258 .maxlen = sizeof(unsigned int), 257 .maxlen = sizeof(unsigned int),
259 .mode = 0644, 258 .mode = 0644,
260 .proc_handler = &proc_dointvec, 259 .proc_handler = proc_dointvec,
261 }, 260 },
262#ifdef CONFIG_SCHED_DEBUG 261#ifdef CONFIG_SCHED_DEBUG
263 { 262 {
264 .ctl_name = CTL_UNNUMBERED,
265 .procname = "sched_min_granularity_ns", 263 .procname = "sched_min_granularity_ns",
266 .data = &sysctl_sched_min_granularity, 264 .data = &sysctl_sched_min_granularity,
267 .maxlen = sizeof(unsigned int), 265 .maxlen = sizeof(unsigned int),
268 .mode = 0644, 266 .mode = 0644,
269 .proc_handler = &sched_nr_latency_handler, 267 .proc_handler = sched_proc_update_handler,
270 .strategy = &sysctl_intvec,
271 .extra1 = &min_sched_granularity_ns, 268 .extra1 = &min_sched_granularity_ns,
272 .extra2 = &max_sched_granularity_ns, 269 .extra2 = &max_sched_granularity_ns,
273 }, 270 },
274 { 271 {
275 .ctl_name = CTL_UNNUMBERED,
276 .procname = "sched_latency_ns", 272 .procname = "sched_latency_ns",
277 .data = &sysctl_sched_latency, 273 .data = &sysctl_sched_latency,
278 .maxlen = sizeof(unsigned int), 274 .maxlen = sizeof(unsigned int),
279 .mode = 0644, 275 .mode = 0644,
280 .proc_handler = &sched_nr_latency_handler, 276 .proc_handler = sched_proc_update_handler,
281 .strategy = &sysctl_intvec,
282 .extra1 = &min_sched_granularity_ns, 277 .extra1 = &min_sched_granularity_ns,
283 .extra2 = &max_sched_granularity_ns, 278 .extra2 = &max_sched_granularity_ns,
284 }, 279 },
285 { 280 {
286 .ctl_name = CTL_UNNUMBERED,
287 .procname = "sched_wakeup_granularity_ns", 281 .procname = "sched_wakeup_granularity_ns",
288 .data = &sysctl_sched_wakeup_granularity, 282 .data = &sysctl_sched_wakeup_granularity,
289 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
290 .mode = 0644, 284 .mode = 0644,
291 .proc_handler = &proc_dointvec_minmax, 285 .proc_handler = sched_proc_update_handler,
292 .strategy = &sysctl_intvec,
293 .extra1 = &min_wakeup_granularity_ns, 286 .extra1 = &min_wakeup_granularity_ns,
294 .extra2 = &max_wakeup_granularity_ns, 287 .extra2 = &max_wakeup_granularity_ns,
295 }, 288 },
296 { 289 {
297 .ctl_name = CTL_UNNUMBERED,
298 .procname = "sched_shares_ratelimit", 290 .procname = "sched_shares_ratelimit",
299 .data = &sysctl_sched_shares_ratelimit, 291 .data = &sysctl_sched_shares_ratelimit,
300 .maxlen = sizeof(unsigned int), 292 .maxlen = sizeof(unsigned int),
301 .mode = 0644, 293 .mode = 0644,
302 .proc_handler = &proc_dointvec, 294 .proc_handler = sched_proc_update_handler,
295 .extra1 = &min_sched_shares_ratelimit,
296 .extra2 = &max_sched_shares_ratelimit,
303 }, 297 },
304 { 298 {
305 .ctl_name = CTL_UNNUMBERED, 299 .procname = "sched_tunable_scaling",
306 .procname = "sched_shares_thresh", 300 .data = &sysctl_sched_tunable_scaling,
307 .data = &sysctl_sched_shares_thresh, 301 .maxlen = sizeof(enum sched_tunable_scaling),
308 .maxlen = sizeof(unsigned int),
309 .mode = 0644, 302 .mode = 0644,
310 .proc_handler = &proc_dointvec_minmax, 303 .proc_handler = sched_proc_update_handler,
311 .strategy = &sysctl_intvec, 304 .extra1 = &min_sched_tunable_scaling,
312 .extra1 = &zero, 305 .extra2 = &max_sched_tunable_scaling,
313 }, 306 },
314 { 307 {
315 .ctl_name = CTL_UNNUMBERED, 308 .procname = "sched_shares_thresh",
316 .procname = "sched_features", 309 .data = &sysctl_sched_shares_thresh,
317 .data = &sysctl_sched_features,
318 .maxlen = sizeof(unsigned int), 310 .maxlen = sizeof(unsigned int),
319 .mode = 0644, 311 .mode = 0644,
320 .proc_handler = &proc_dointvec, 312 .proc_handler = proc_dointvec_minmax,
313 .extra1 = &zero,
321 }, 314 },
322 { 315 {
323 .ctl_name = CTL_UNNUMBERED,
324 .procname = "sched_migration_cost", 316 .procname = "sched_migration_cost",
325 .data = &sysctl_sched_migration_cost, 317 .data = &sysctl_sched_migration_cost,
326 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
327 .mode = 0644, 319 .mode = 0644,
328 .proc_handler = &proc_dointvec, 320 .proc_handler = proc_dointvec,
329 }, 321 },
330 { 322 {
331 .ctl_name = CTL_UNNUMBERED,
332 .procname = "sched_nr_migrate", 323 .procname = "sched_nr_migrate",
333 .data = &sysctl_sched_nr_migrate, 324 .data = &sysctl_sched_nr_migrate,
334 .maxlen = sizeof(unsigned int), 325 .maxlen = sizeof(unsigned int),
335 .mode = 0644, 326 .mode = 0644,
336 .proc_handler = &proc_dointvec, 327 .proc_handler = proc_dointvec,
337 }, 328 },
338 { 329 {
339 .ctl_name = CTL_UNNUMBERED,
340 .procname = "sched_time_avg", 330 .procname = "sched_time_avg",
341 .data = &sysctl_sched_time_avg, 331 .data = &sysctl_sched_time_avg,
342 .maxlen = sizeof(unsigned int), 332 .maxlen = sizeof(unsigned int),
343 .mode = 0644, 333 .mode = 0644,
344 .proc_handler = &proc_dointvec, 334 .proc_handler = proc_dointvec,
345 }, 335 },
346 { 336 {
347 .ctl_name = CTL_UNNUMBERED,
348 .procname = "timer_migration", 337 .procname = "timer_migration",
349 .data = &sysctl_timer_migration, 338 .data = &sysctl_timer_migration,
350 .maxlen = sizeof(unsigned int), 339 .maxlen = sizeof(unsigned int),
351 .mode = 0644, 340 .mode = 0644,
352 .proc_handler = &proc_dointvec_minmax, 341 .proc_handler = proc_dointvec_minmax,
353 .strategy = &sysctl_intvec,
354 .extra1 = &zero, 342 .extra1 = &zero,
355 .extra2 = &one, 343 .extra2 = &one,
356 }, 344 },
357#endif 345#endif
358 { 346 {
359 .ctl_name = CTL_UNNUMBERED,
360 .procname = "sched_rt_period_us", 347 .procname = "sched_rt_period_us",
361 .data = &sysctl_sched_rt_period, 348 .data = &sysctl_sched_rt_period,
362 .maxlen = sizeof(unsigned int), 349 .maxlen = sizeof(unsigned int),
363 .mode = 0644, 350 .mode = 0644,
364 .proc_handler = &sched_rt_handler, 351 .proc_handler = sched_rt_handler,
365 }, 352 },
366 { 353 {
367 .ctl_name = CTL_UNNUMBERED,
368 .procname = "sched_rt_runtime_us", 354 .procname = "sched_rt_runtime_us",
369 .data = &sysctl_sched_rt_runtime, 355 .data = &sysctl_sched_rt_runtime,
370 .maxlen = sizeof(int), 356 .maxlen = sizeof(int),
371 .mode = 0644, 357 .mode = 0644,
372 .proc_handler = &sched_rt_handler, 358 .proc_handler = sched_rt_handler,
373 }, 359 },
374 { 360 {
375 .ctl_name = CTL_UNNUMBERED,
376 .procname = "sched_compat_yield", 361 .procname = "sched_compat_yield",
377 .data = &sysctl_sched_compat_yield, 362 .data = &sysctl_sched_compat_yield,
378 .maxlen = sizeof(unsigned int), 363 .maxlen = sizeof(unsigned int),
379 .mode = 0644, 364 .mode = 0644,
380 .proc_handler = &proc_dointvec, 365 .proc_handler = proc_dointvec,
381 }, 366 },
382#ifdef CONFIG_PROVE_LOCKING 367#ifdef CONFIG_PROVE_LOCKING
383 { 368 {
384 .ctl_name = CTL_UNNUMBERED,
385 .procname = "prove_locking", 369 .procname = "prove_locking",
386 .data = &prove_locking, 370 .data = &prove_locking,
387 .maxlen = sizeof(int), 371 .maxlen = sizeof(int),
388 .mode = 0644, 372 .mode = 0644,
389 .proc_handler = &proc_dointvec, 373 .proc_handler = proc_dointvec,
390 }, 374 },
391#endif 375#endif
392#ifdef CONFIG_LOCK_STAT 376#ifdef CONFIG_LOCK_STAT
393 { 377 {
394 .ctl_name = CTL_UNNUMBERED,
395 .procname = "lock_stat", 378 .procname = "lock_stat",
396 .data = &lock_stat, 379 .data = &lock_stat,
397 .maxlen = sizeof(int), 380 .maxlen = sizeof(int),
398 .mode = 0644, 381 .mode = 0644,
399 .proc_handler = &proc_dointvec, 382 .proc_handler = proc_dointvec,
400 }, 383 },
401#endif 384#endif
402 { 385 {
403 .ctl_name = KERN_PANIC,
404 .procname = "panic", 386 .procname = "panic",
405 .data = &panic_timeout, 387 .data = &panic_timeout,
406 .maxlen = sizeof(int), 388 .maxlen = sizeof(int),
407 .mode = 0644, 389 .mode = 0644,
408 .proc_handler = &proc_dointvec, 390 .proc_handler = proc_dointvec,
409 }, 391 },
410 { 392 {
411 .ctl_name = KERN_CORE_USES_PID,
412 .procname = "core_uses_pid", 393 .procname = "core_uses_pid",
413 .data = &core_uses_pid, 394 .data = &core_uses_pid,
414 .maxlen = sizeof(int), 395 .maxlen = sizeof(int),
415 .mode = 0644, 396 .mode = 0644,
416 .proc_handler = &proc_dointvec, 397 .proc_handler = proc_dointvec,
417 }, 398 },
418 { 399 {
419 .ctl_name = KERN_CORE_PATTERN,
420 .procname = "core_pattern", 400 .procname = "core_pattern",
421 .data = core_pattern, 401 .data = core_pattern,
422 .maxlen = CORENAME_MAX_SIZE, 402 .maxlen = CORENAME_MAX_SIZE,
423 .mode = 0644, 403 .mode = 0644,
424 .proc_handler = &proc_dostring, 404 .proc_handler = proc_dostring,
425 .strategy = &sysctl_string,
426 }, 405 },
427 { 406 {
428 .ctl_name = CTL_UNNUMBERED,
429 .procname = "core_pipe_limit", 407 .procname = "core_pipe_limit",
430 .data = &core_pipe_limit, 408 .data = &core_pipe_limit,
431 .maxlen = sizeof(unsigned int), 409 .maxlen = sizeof(unsigned int),
432 .mode = 0644, 410 .mode = 0644,
433 .proc_handler = &proc_dointvec, 411 .proc_handler = proc_dointvec,
434 }, 412 },
435#ifdef CONFIG_PROC_SYSCTL 413#ifdef CONFIG_PROC_SYSCTL
436 { 414 {
437 .procname = "tainted", 415 .procname = "tainted",
438 .maxlen = sizeof(long), 416 .maxlen = sizeof(long),
439 .mode = 0644, 417 .mode = 0644,
440 .proc_handler = &proc_taint, 418 .proc_handler = proc_taint,
441 }, 419 },
442#endif 420#endif
443#ifdef CONFIG_LATENCYTOP 421#ifdef CONFIG_LATENCYTOP
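The scheduler entries in the hunk above converge on a single sched_proc_update_handler, and the new sched_tunable_scaling knob is clamped between min_ and max_sched_tunable_scaling via extra1/extra2. The sketch below shows the general pattern such a handler is assumed to follow, validate with proc_dointvec_minmax() and recompute derived state only on a successful write; it is not the scheduler's actual handler, and every demo_* name is made up:

#include <linux/sysctl.h>

static int demo_knob;
static int demo_min = 0;
static int demo_max = 10;

static void demo_recompute(void)
{
	/* placeholder: recompute whatever is derived from demo_knob */
}

/* Validate the write with the stock minmax helper, then react to it. */
static int demo_update_handler(struct ctl_table *table, int write,
			       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (!ret && write)
		demo_recompute();
	return ret;
}

static struct ctl_table demo_sched_table[] = {
	{
		.procname	= "demo_tunable",
		.data		= &demo_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= demo_update_handler,
		.extra1		= &demo_min,	/* lower bound, like min_sched_tunable_scaling */
		.extra2		= &demo_max,	/* upper bound, like max_sched_tunable_scaling */
	},
	{ }
};
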
@@ -446,181 +424,160 @@ static struct ctl_table kern_table[] = {
446 .data = &latencytop_enabled, 424 .data = &latencytop_enabled,
447 .maxlen = sizeof(int), 425 .maxlen = sizeof(int),
448 .mode = 0644, 426 .mode = 0644,
449 .proc_handler = &proc_dointvec, 427 .proc_handler = proc_dointvec,
450 }, 428 },
451#endif 429#endif
452#ifdef CONFIG_BLK_DEV_INITRD 430#ifdef CONFIG_BLK_DEV_INITRD
453 { 431 {
454 .ctl_name = KERN_REALROOTDEV,
455 .procname = "real-root-dev", 432 .procname = "real-root-dev",
456 .data = &real_root_dev, 433 .data = &real_root_dev,
457 .maxlen = sizeof(int), 434 .maxlen = sizeof(int),
458 .mode = 0644, 435 .mode = 0644,
459 .proc_handler = &proc_dointvec, 436 .proc_handler = proc_dointvec,
460 }, 437 },
461#endif 438#endif
462 { 439 {
463 .ctl_name = CTL_UNNUMBERED,
464 .procname = "print-fatal-signals", 440 .procname = "print-fatal-signals",
465 .data = &print_fatal_signals, 441 .data = &print_fatal_signals,
466 .maxlen = sizeof(int), 442 .maxlen = sizeof(int),
467 .mode = 0644, 443 .mode = 0644,
468 .proc_handler = &proc_dointvec, 444 .proc_handler = proc_dointvec,
469 }, 445 },
470#ifdef CONFIG_SPARC 446#ifdef CONFIG_SPARC
471 { 447 {
472 .ctl_name = KERN_SPARC_REBOOT,
473 .procname = "reboot-cmd", 448 .procname = "reboot-cmd",
474 .data = reboot_command, 449 .data = reboot_command,
475 .maxlen = 256, 450 .maxlen = 256,
476 .mode = 0644, 451 .mode = 0644,
477 .proc_handler = &proc_dostring, 452 .proc_handler = proc_dostring,
478 .strategy = &sysctl_string,
479 }, 453 },
480 { 454 {
481 .ctl_name = KERN_SPARC_STOP_A,
482 .procname = "stop-a", 455 .procname = "stop-a",
483 .data = &stop_a_enabled, 456 .data = &stop_a_enabled,
484 .maxlen = sizeof (int), 457 .maxlen = sizeof (int),
485 .mode = 0644, 458 .mode = 0644,
486 .proc_handler = &proc_dointvec, 459 .proc_handler = proc_dointvec,
487 }, 460 },
488 { 461 {
489 .ctl_name = KERN_SPARC_SCONS_PWROFF,
490 .procname = "scons-poweroff", 462 .procname = "scons-poweroff",
491 .data = &scons_pwroff, 463 .data = &scons_pwroff,
492 .maxlen = sizeof (int), 464 .maxlen = sizeof (int),
493 .mode = 0644, 465 .mode = 0644,
494 .proc_handler = &proc_dointvec, 466 .proc_handler = proc_dointvec,
495 }, 467 },
496#endif 468#endif
497#ifdef CONFIG_SPARC64 469#ifdef CONFIG_SPARC64
498 { 470 {
499 .ctl_name = CTL_UNNUMBERED,
500 .procname = "tsb-ratio", 471 .procname = "tsb-ratio",
501 .data = &sysctl_tsb_ratio, 472 .data = &sysctl_tsb_ratio,
502 .maxlen = sizeof (int), 473 .maxlen = sizeof (int),
503 .mode = 0644, 474 .mode = 0644,
504 .proc_handler = &proc_dointvec, 475 .proc_handler = proc_dointvec,
505 }, 476 },
506#endif 477#endif
507#ifdef __hppa__ 478#ifdef __hppa__
508 { 479 {
509 .ctl_name = KERN_HPPA_PWRSW,
510 .procname = "soft-power", 480 .procname = "soft-power",
511 .data = &pwrsw_enabled, 481 .data = &pwrsw_enabled,
512 .maxlen = sizeof (int), 482 .maxlen = sizeof (int),
513 .mode = 0644, 483 .mode = 0644,
514 .proc_handler = &proc_dointvec, 484 .proc_handler = proc_dointvec,
515 }, 485 },
516 { 486 {
517 .ctl_name = KERN_HPPA_UNALIGNED,
518 .procname = "unaligned-trap", 487 .procname = "unaligned-trap",
519 .data = &unaligned_enabled, 488 .data = &unaligned_enabled,
520 .maxlen = sizeof (int), 489 .maxlen = sizeof (int),
521 .mode = 0644, 490 .mode = 0644,
522 .proc_handler = &proc_dointvec, 491 .proc_handler = proc_dointvec,
523 }, 492 },
524#endif 493#endif
525 { 494 {
526 .ctl_name = KERN_CTLALTDEL,
527 .procname = "ctrl-alt-del", 495 .procname = "ctrl-alt-del",
528 .data = &C_A_D, 496 .data = &C_A_D,
529 .maxlen = sizeof(int), 497 .maxlen = sizeof(int),
530 .mode = 0644, 498 .mode = 0644,
531 .proc_handler = &proc_dointvec, 499 .proc_handler = proc_dointvec,
532 }, 500 },
533#ifdef CONFIG_FUNCTION_TRACER 501#ifdef CONFIG_FUNCTION_TRACER
534 { 502 {
535 .ctl_name = CTL_UNNUMBERED,
536 .procname = "ftrace_enabled", 503 .procname = "ftrace_enabled",
537 .data = &ftrace_enabled, 504 .data = &ftrace_enabled,
538 .maxlen = sizeof(int), 505 .maxlen = sizeof(int),
539 .mode = 0644, 506 .mode = 0644,
540 .proc_handler = &ftrace_enable_sysctl, 507 .proc_handler = ftrace_enable_sysctl,
541 }, 508 },
542#endif 509#endif
543#ifdef CONFIG_STACK_TRACER 510#ifdef CONFIG_STACK_TRACER
544 { 511 {
545 .ctl_name = CTL_UNNUMBERED,
546 .procname = "stack_tracer_enabled", 512 .procname = "stack_tracer_enabled",
547 .data = &stack_tracer_enabled, 513 .data = &stack_tracer_enabled,
548 .maxlen = sizeof(int), 514 .maxlen = sizeof(int),
549 .mode = 0644, 515 .mode = 0644,
550 .proc_handler = &stack_trace_sysctl, 516 .proc_handler = stack_trace_sysctl,
551 }, 517 },
552#endif 518#endif
553#ifdef CONFIG_TRACING 519#ifdef CONFIG_TRACING
554 { 520 {
555 .ctl_name = CTL_UNNUMBERED,
556 .procname = "ftrace_dump_on_oops", 521 .procname = "ftrace_dump_on_oops",
557 .data = &ftrace_dump_on_oops, 522 .data = &ftrace_dump_on_oops,
558 .maxlen = sizeof(int), 523 .maxlen = sizeof(int),
559 .mode = 0644, 524 .mode = 0644,
560 .proc_handler = &proc_dointvec, 525 .proc_handler = proc_dointvec,
561 }, 526 },
562#endif 527#endif
563#ifdef CONFIG_MODULES 528#ifdef CONFIG_MODULES
564 { 529 {
565 .ctl_name = KERN_MODPROBE,
566 .procname = "modprobe", 530 .procname = "modprobe",
567 .data = &modprobe_path, 531 .data = &modprobe_path,
568 .maxlen = KMOD_PATH_LEN, 532 .maxlen = KMOD_PATH_LEN,
569 .mode = 0644, 533 .mode = 0644,
570 .proc_handler = &proc_dostring, 534 .proc_handler = proc_dostring,
571 .strategy = &sysctl_string,
572 }, 535 },
573 { 536 {
574 .ctl_name = CTL_UNNUMBERED,
575 .procname = "modules_disabled", 537 .procname = "modules_disabled",
576 .data = &modules_disabled, 538 .data = &modules_disabled,
577 .maxlen = sizeof(int), 539 .maxlen = sizeof(int),
578 .mode = 0644, 540 .mode = 0644,
579 /* only handle a transition from default "0" to "1" */ 541 /* only handle a transition from default "0" to "1" */
580 .proc_handler = &proc_dointvec_minmax, 542 .proc_handler = proc_dointvec_minmax,
581 .extra1 = &one, 543 .extra1 = &one,
582 .extra2 = &one, 544 .extra2 = &one,
583 }, 545 },
584#endif 546#endif
585#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 547#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
586 { 548 {
587 .ctl_name = KERN_HOTPLUG,
588 .procname = "hotplug", 549 .procname = "hotplug",
589 .data = &uevent_helper, 550 .data = &uevent_helper,
590 .maxlen = UEVENT_HELPER_PATH_LEN, 551 .maxlen = UEVENT_HELPER_PATH_LEN,
591 .mode = 0644, 552 .mode = 0644,
592 .proc_handler = &proc_dostring, 553 .proc_handler = proc_dostring,
593 .strategy = &sysctl_string,
594 }, 554 },
595#endif 555#endif
596#ifdef CONFIG_CHR_DEV_SG 556#ifdef CONFIG_CHR_DEV_SG
597 { 557 {
598 .ctl_name = KERN_SG_BIG_BUFF,
599 .procname = "sg-big-buff", 558 .procname = "sg-big-buff",
600 .data = &sg_big_buff, 559 .data = &sg_big_buff,
601 .maxlen = sizeof (int), 560 .maxlen = sizeof (int),
602 .mode = 0444, 561 .mode = 0444,
603 .proc_handler = &proc_dointvec, 562 .proc_handler = proc_dointvec,
604 }, 563 },
605#endif 564#endif
606#ifdef CONFIG_BSD_PROCESS_ACCT 565#ifdef CONFIG_BSD_PROCESS_ACCT
607 { 566 {
608 .ctl_name = KERN_ACCT,
609 .procname = "acct", 567 .procname = "acct",
610 .data = &acct_parm, 568 .data = &acct_parm,
611 .maxlen = 3*sizeof(int), 569 .maxlen = 3*sizeof(int),
612 .mode = 0644, 570 .mode = 0644,
613 .proc_handler = &proc_dointvec, 571 .proc_handler = proc_dointvec,
614 }, 572 },
615#endif 573#endif
616#ifdef CONFIG_MAGIC_SYSRQ 574#ifdef CONFIG_MAGIC_SYSRQ
617 { 575 {
618 .ctl_name = KERN_SYSRQ,
619 .procname = "sysrq", 576 .procname = "sysrq",
620 .data = &__sysrq_enabled, 577 .data = &__sysrq_enabled,
621 .maxlen = sizeof (int), 578 .maxlen = sizeof (int),
622 .mode = 0644, 579 .mode = 0644,
623 .proc_handler = &proc_dointvec, 580 .proc_handler = proc_dointvec,
624 }, 581 },
625#endif 582#endif
626#ifdef CONFIG_PROC_SYSCTL 583#ifdef CONFIG_PROC_SYSCTL
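The modules_disabled entry in the hunk above is a one-way switch: with extra1 and extra2 both pointing at &one, proc_dointvec_minmax() only ever accepts the value 1, so the flag can leave its default 0 but can never be cleared again from /proc. The same trick in isolation (demo names are illustrative):

#include <linux/sysctl.h>

static int demo_disabled;	/* defaults to 0 */
static int one = 1;

static struct ctl_table demo_latch_table[] = {
	{
		.procname	= "demo_disabled",
		.data		= &demo_disabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		/* only a transition from the default 0 to 1 is accepted */
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &one,		/* minimum accepted value */
		.extra2		= &one,		/* maximum accepted value */
	},
	{ }
};
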
@@ -629,215 +586,188 @@ static struct ctl_table kern_table[] = {
629 .data = NULL, 586 .data = NULL,
630 .maxlen = sizeof (int), 587 .maxlen = sizeof (int),
631 .mode = 0600, 588 .mode = 0600,
632 .proc_handler = &proc_do_cad_pid, 589 .proc_handler = proc_do_cad_pid,
633 }, 590 },
634#endif 591#endif
635 { 592 {
636 .ctl_name = KERN_MAX_THREADS,
637 .procname = "threads-max", 593 .procname = "threads-max",
638 .data = &max_threads, 594 .data = &max_threads,
639 .maxlen = sizeof(int), 595 .maxlen = sizeof(int),
640 .mode = 0644, 596 .mode = 0644,
641 .proc_handler = &proc_dointvec, 597 .proc_handler = proc_dointvec,
642 }, 598 },
643 { 599 {
644 .ctl_name = KERN_RANDOM,
645 .procname = "random", 600 .procname = "random",
646 .mode = 0555, 601 .mode = 0555,
647 .child = random_table, 602 .child = random_table,
648 }, 603 },
649 { 604 {
650 .ctl_name = KERN_OVERFLOWUID,
651 .procname = "overflowuid", 605 .procname = "overflowuid",
652 .data = &overflowuid, 606 .data = &overflowuid,
653 .maxlen = sizeof(int), 607 .maxlen = sizeof(int),
654 .mode = 0644, 608 .mode = 0644,
655 .proc_handler = &proc_dointvec_minmax, 609 .proc_handler = proc_dointvec_minmax,
656 .strategy = &sysctl_intvec,
657 .extra1 = &minolduid, 610 .extra1 = &minolduid,
658 .extra2 = &maxolduid, 611 .extra2 = &maxolduid,
659 }, 612 },
660 { 613 {
661 .ctl_name = KERN_OVERFLOWGID,
662 .procname = "overflowgid", 614 .procname = "overflowgid",
663 .data = &overflowgid, 615 .data = &overflowgid,
664 .maxlen = sizeof(int), 616 .maxlen = sizeof(int),
665 .mode = 0644, 617 .mode = 0644,
666 .proc_handler = &proc_dointvec_minmax, 618 .proc_handler = proc_dointvec_minmax,
667 .strategy = &sysctl_intvec,
668 .extra1 = &minolduid, 619 .extra1 = &minolduid,
669 .extra2 = &maxolduid, 620 .extra2 = &maxolduid,
670 }, 621 },
671#ifdef CONFIG_S390 622#ifdef CONFIG_S390
672#ifdef CONFIG_MATHEMU 623#ifdef CONFIG_MATHEMU
673 { 624 {
674 .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
675 .procname = "ieee_emulation_warnings", 625 .procname = "ieee_emulation_warnings",
676 .data = &sysctl_ieee_emulation_warnings, 626 .data = &sysctl_ieee_emulation_warnings,
677 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
678 .mode = 0644, 628 .mode = 0644,
679 .proc_handler = &proc_dointvec, 629 .proc_handler = proc_dointvec,
680 }, 630 },
681#endif 631#endif
682 { 632 {
683 .ctl_name = KERN_S390_USER_DEBUG_LOGGING,
684 .procname = "userprocess_debug", 633 .procname = "userprocess_debug",
685 .data = &sysctl_userprocess_debug, 634 .data = &sysctl_userprocess_debug,
686 .maxlen = sizeof(int), 635 .maxlen = sizeof(int),
687 .mode = 0644, 636 .mode = 0644,
688 .proc_handler = &proc_dointvec, 637 .proc_handler = proc_dointvec,
689 }, 638 },
690#endif 639#endif
691 { 640 {
692 .ctl_name = KERN_PIDMAX,
693 .procname = "pid_max", 641 .procname = "pid_max",
694 .data = &pid_max, 642 .data = &pid_max,
695 .maxlen = sizeof (int), 643 .maxlen = sizeof (int),
696 .mode = 0644, 644 .mode = 0644,
697 .proc_handler = &proc_dointvec_minmax, 645 .proc_handler = proc_dointvec_minmax,
698 .strategy = sysctl_intvec,
699 .extra1 = &pid_max_min, 646 .extra1 = &pid_max_min,
700 .extra2 = &pid_max_max, 647 .extra2 = &pid_max_max,
701 }, 648 },
702 { 649 {
703 .ctl_name = KERN_PANIC_ON_OOPS,
704 .procname = "panic_on_oops", 650 .procname = "panic_on_oops",
705 .data = &panic_on_oops, 651 .data = &panic_on_oops,
706 .maxlen = sizeof(int), 652 .maxlen = sizeof(int),
707 .mode = 0644, 653 .mode = 0644,
708 .proc_handler = &proc_dointvec, 654 .proc_handler = proc_dointvec,
709 }, 655 },
710#if defined CONFIG_PRINTK 656#if defined CONFIG_PRINTK
711 { 657 {
712 .ctl_name = KERN_PRINTK,
713 .procname = "printk", 658 .procname = "printk",
714 .data = &console_loglevel, 659 .data = &console_loglevel,
715 .maxlen = 4*sizeof(int), 660 .maxlen = 4*sizeof(int),
716 .mode = 0644, 661 .mode = 0644,
717 .proc_handler = &proc_dointvec, 662 .proc_handler = proc_dointvec,
718 }, 663 },
719 { 664 {
720 .ctl_name = KERN_PRINTK_RATELIMIT,
721 .procname = "printk_ratelimit", 665 .procname = "printk_ratelimit",
722 .data = &printk_ratelimit_state.interval, 666 .data = &printk_ratelimit_state.interval,
723 .maxlen = sizeof(int), 667 .maxlen = sizeof(int),
724 .mode = 0644, 668 .mode = 0644,
725 .proc_handler = &proc_dointvec_jiffies, 669 .proc_handler = proc_dointvec_jiffies,
726 .strategy = &sysctl_jiffies,
727 }, 670 },
728 { 671 {
729 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
730 .procname = "printk_ratelimit_burst", 672 .procname = "printk_ratelimit_burst",
731 .data = &printk_ratelimit_state.burst, 673 .data = &printk_ratelimit_state.burst,
732 .maxlen = sizeof(int), 674 .maxlen = sizeof(int),
733 .mode = 0644, 675 .mode = 0644,
734 .proc_handler = &proc_dointvec, 676 .proc_handler = proc_dointvec,
735 }, 677 },
736 { 678 {
737 .ctl_name = CTL_UNNUMBERED,
738 .procname = "printk_delay", 679 .procname = "printk_delay",
739 .data = &printk_delay_msec, 680 .data = &printk_delay_msec,
740 .maxlen = sizeof(int), 681 .maxlen = sizeof(int),
741 .mode = 0644, 682 .mode = 0644,
742 .proc_handler = &proc_dointvec_minmax, 683 .proc_handler = proc_dointvec_minmax,
743 .strategy = &sysctl_intvec,
744 .extra1 = &zero, 684 .extra1 = &zero,
745 .extra2 = &ten_thousand, 685 .extra2 = &ten_thousand,
746 }, 686 },
747#endif 687#endif
748 { 688 {
749 .ctl_name = KERN_NGROUPS_MAX,
750 .procname = "ngroups_max", 689 .procname = "ngroups_max",
751 .data = &ngroups_max, 690 .data = &ngroups_max,
752 .maxlen = sizeof (int), 691 .maxlen = sizeof (int),
753 .mode = 0444, 692 .mode = 0444,
754 .proc_handler = &proc_dointvec, 693 .proc_handler = proc_dointvec,
755 }, 694 },
756#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 695#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
757 { 696 {
758 .ctl_name = KERN_UNKNOWN_NMI_PANIC,
759 .procname = "unknown_nmi_panic", 697 .procname = "unknown_nmi_panic",
760 .data = &unknown_nmi_panic, 698 .data = &unknown_nmi_panic,
761 .maxlen = sizeof (int), 699 .maxlen = sizeof (int),
762 .mode = 0644, 700 .mode = 0644,
763 .proc_handler = &proc_dointvec, 701 .proc_handler = proc_dointvec,
764 }, 702 },
765 { 703 {
766 .procname = "nmi_watchdog", 704 .procname = "nmi_watchdog",
767 .data = &nmi_watchdog_enabled, 705 .data = &nmi_watchdog_enabled,
768 .maxlen = sizeof (int), 706 .maxlen = sizeof (int),
769 .mode = 0644, 707 .mode = 0644,
770 .proc_handler = &proc_nmi_enabled, 708 .proc_handler = proc_nmi_enabled,
771 }, 709 },
772#endif 710#endif
773#if defined(CONFIG_X86) 711#if defined(CONFIG_X86)
774 { 712 {
775 .ctl_name = KERN_PANIC_ON_NMI,
776 .procname = "panic_on_unrecovered_nmi", 713 .procname = "panic_on_unrecovered_nmi",
777 .data = &panic_on_unrecovered_nmi, 714 .data = &panic_on_unrecovered_nmi,
778 .maxlen = sizeof(int), 715 .maxlen = sizeof(int),
779 .mode = 0644, 716 .mode = 0644,
780 .proc_handler = &proc_dointvec, 717 .proc_handler = proc_dointvec,
781 }, 718 },
782 { 719 {
783 .ctl_name = CTL_UNNUMBERED,
784 .procname = "panic_on_io_nmi", 720 .procname = "panic_on_io_nmi",
785 .data = &panic_on_io_nmi, 721 .data = &panic_on_io_nmi,
786 .maxlen = sizeof(int), 722 .maxlen = sizeof(int),
787 .mode = 0644, 723 .mode = 0644,
788 .proc_handler = &proc_dointvec, 724 .proc_handler = proc_dointvec,
789 }, 725 },
790 { 726 {
791 .ctl_name = KERN_BOOTLOADER_TYPE,
792 .procname = "bootloader_type", 727 .procname = "bootloader_type",
793 .data = &bootloader_type, 728 .data = &bootloader_type,
794 .maxlen = sizeof (int), 729 .maxlen = sizeof (int),
795 .mode = 0444, 730 .mode = 0444,
796 .proc_handler = &proc_dointvec, 731 .proc_handler = proc_dointvec,
797 }, 732 },
798 { 733 {
799 .ctl_name = CTL_UNNUMBERED,
800 .procname = "bootloader_version", 734 .procname = "bootloader_version",
801 .data = &bootloader_version, 735 .data = &bootloader_version,
802 .maxlen = sizeof (int), 736 .maxlen = sizeof (int),
803 .mode = 0444, 737 .mode = 0444,
804 .proc_handler = &proc_dointvec, 738 .proc_handler = proc_dointvec,
805 }, 739 },
806 { 740 {
807 .ctl_name = CTL_UNNUMBERED,
808 .procname = "kstack_depth_to_print", 741 .procname = "kstack_depth_to_print",
809 .data = &kstack_depth_to_print, 742 .data = &kstack_depth_to_print,
810 .maxlen = sizeof(int), 743 .maxlen = sizeof(int),
811 .mode = 0644, 744 .mode = 0644,
812 .proc_handler = &proc_dointvec, 745 .proc_handler = proc_dointvec,
813 }, 746 },
814 { 747 {
815 .ctl_name = CTL_UNNUMBERED,
816 .procname = "io_delay_type", 748 .procname = "io_delay_type",
817 .data = &io_delay_type, 749 .data = &io_delay_type,
818 .maxlen = sizeof(int), 750 .maxlen = sizeof(int),
819 .mode = 0644, 751 .mode = 0644,
820 .proc_handler = &proc_dointvec, 752 .proc_handler = proc_dointvec,
821 }, 753 },
822#endif 754#endif
823#if defined(CONFIG_MMU) 755#if defined(CONFIG_MMU)
824 { 756 {
825 .ctl_name = KERN_RANDOMIZE,
826 .procname = "randomize_va_space", 757 .procname = "randomize_va_space",
827 .data = &randomize_va_space, 758 .data = &randomize_va_space,
828 .maxlen = sizeof(int), 759 .maxlen = sizeof(int),
829 .mode = 0644, 760 .mode = 0644,
830 .proc_handler = &proc_dointvec, 761 .proc_handler = proc_dointvec,
831 }, 762 },
832#endif 763#endif
833#if defined(CONFIG_S390) && defined(CONFIG_SMP) 764#if defined(CONFIG_S390) && defined(CONFIG_SMP)
834 { 765 {
835 .ctl_name = KERN_SPIN_RETRY,
836 .procname = "spin_retry", 766 .procname = "spin_retry",
837 .data = &spin_retry, 767 .data = &spin_retry,
838 .maxlen = sizeof (int), 768 .maxlen = sizeof (int),
839 .mode = 0644, 769 .mode = 0644,
840 .proc_handler = &proc_dointvec, 770 .proc_handler = proc_dointvec,
841 }, 771 },
842#endif 772#endif
843#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) 773#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
@@ -846,123 +776,104 @@ static struct ctl_table kern_table[] = {
846 .data = &acpi_realmode_flags, 776 .data = &acpi_realmode_flags,
847 .maxlen = sizeof (unsigned long), 777 .maxlen = sizeof (unsigned long),
848 .mode = 0644, 778 .mode = 0644,
849 .proc_handler = &proc_doulongvec_minmax, 779 .proc_handler = proc_doulongvec_minmax,
850 }, 780 },
851#endif 781#endif
852#ifdef CONFIG_IA64 782#ifdef CONFIG_IA64
853 { 783 {
854 .ctl_name = KERN_IA64_UNALIGNED,
855 .procname = "ignore-unaligned-usertrap", 784 .procname = "ignore-unaligned-usertrap",
856 .data = &no_unaligned_warning, 785 .data = &no_unaligned_warning,
857 .maxlen = sizeof (int), 786 .maxlen = sizeof (int),
858 .mode = 0644, 787 .mode = 0644,
859 .proc_handler = &proc_dointvec, 788 .proc_handler = proc_dointvec,
860 }, 789 },
861 { 790 {
862 .ctl_name = CTL_UNNUMBERED,
863 .procname = "unaligned-dump-stack", 791 .procname = "unaligned-dump-stack",
864 .data = &unaligned_dump_stack, 792 .data = &unaligned_dump_stack,
865 .maxlen = sizeof (int), 793 .maxlen = sizeof (int),
866 .mode = 0644, 794 .mode = 0644,
867 .proc_handler = &proc_dointvec, 795 .proc_handler = proc_dointvec,
868 }, 796 },
869#endif 797#endif
870#ifdef CONFIG_DETECT_SOFTLOCKUP 798#ifdef CONFIG_DETECT_SOFTLOCKUP
871 { 799 {
872 .ctl_name = CTL_UNNUMBERED,
873 .procname = "softlockup_panic", 800 .procname = "softlockup_panic",
874 .data = &softlockup_panic, 801 .data = &softlockup_panic,
875 .maxlen = sizeof(int), 802 .maxlen = sizeof(int),
876 .mode = 0644, 803 .mode = 0644,
877 .proc_handler = &proc_dointvec_minmax, 804 .proc_handler = proc_dointvec_minmax,
878 .strategy = &sysctl_intvec,
879 .extra1 = &zero, 805 .extra1 = &zero,
880 .extra2 = &one, 806 .extra2 = &one,
881 }, 807 },
882 { 808 {
883 .ctl_name = CTL_UNNUMBERED,
884 .procname = "softlockup_thresh", 809 .procname = "softlockup_thresh",
885 .data = &softlockup_thresh, 810 .data = &softlockup_thresh,
886 .maxlen = sizeof(int), 811 .maxlen = sizeof(int),
887 .mode = 0644, 812 .mode = 0644,
888 .proc_handler = &proc_dosoftlockup_thresh, 813 .proc_handler = proc_dosoftlockup_thresh,
889 .strategy = &sysctl_intvec,
890 .extra1 = &neg_one, 814 .extra1 = &neg_one,
891 .extra2 = &sixty, 815 .extra2 = &sixty,
892 }, 816 },
893#endif 817#endif
894#ifdef CONFIG_DETECT_HUNG_TASK 818#ifdef CONFIG_DETECT_HUNG_TASK
895 { 819 {
896 .ctl_name = CTL_UNNUMBERED,
897 .procname = "hung_task_panic", 820 .procname = "hung_task_panic",
898 .data = &sysctl_hung_task_panic, 821 .data = &sysctl_hung_task_panic,
899 .maxlen = sizeof(int), 822 .maxlen = sizeof(int),
900 .mode = 0644, 823 .mode = 0644,
901 .proc_handler = &proc_dointvec_minmax, 824 .proc_handler = proc_dointvec_minmax,
902 .strategy = &sysctl_intvec,
903 .extra1 = &zero, 825 .extra1 = &zero,
904 .extra2 = &one, 826 .extra2 = &one,
905 }, 827 },
906 { 828 {
907 .ctl_name = CTL_UNNUMBERED,
908 .procname = "hung_task_check_count", 829 .procname = "hung_task_check_count",
909 .data = &sysctl_hung_task_check_count, 830 .data = &sysctl_hung_task_check_count,
910 .maxlen = sizeof(unsigned long), 831 .maxlen = sizeof(unsigned long),
911 .mode = 0644, 832 .mode = 0644,
912 .proc_handler = &proc_doulongvec_minmax, 833 .proc_handler = proc_doulongvec_minmax,
913 .strategy = &sysctl_intvec,
914 }, 834 },
915 { 835 {
916 .ctl_name = CTL_UNNUMBERED,
917 .procname = "hung_task_timeout_secs", 836 .procname = "hung_task_timeout_secs",
918 .data = &sysctl_hung_task_timeout_secs, 837 .data = &sysctl_hung_task_timeout_secs,
919 .maxlen = sizeof(unsigned long), 838 .maxlen = sizeof(unsigned long),
920 .mode = 0644, 839 .mode = 0644,
921 .proc_handler = &proc_dohung_task_timeout_secs, 840 .proc_handler = proc_dohung_task_timeout_secs,
922 .strategy = &sysctl_intvec,
923 }, 841 },
924 { 842 {
925 .ctl_name = CTL_UNNUMBERED,
926 .procname = "hung_task_warnings", 843 .procname = "hung_task_warnings",
927 .data = &sysctl_hung_task_warnings, 844 .data = &sysctl_hung_task_warnings,
928 .maxlen = sizeof(unsigned long), 845 .maxlen = sizeof(unsigned long),
929 .mode = 0644, 846 .mode = 0644,
930 .proc_handler = &proc_doulongvec_minmax, 847 .proc_handler = proc_doulongvec_minmax,
931 .strategy = &sysctl_intvec,
932 }, 848 },
933#endif 849#endif
934#ifdef CONFIG_COMPAT 850#ifdef CONFIG_COMPAT
935 { 851 {
936 .ctl_name = KERN_COMPAT_LOG,
937 .procname = "compat-log", 852 .procname = "compat-log",
938 .data = &compat_log, 853 .data = &compat_log,
939 .maxlen = sizeof (int), 854 .maxlen = sizeof (int),
940 .mode = 0644, 855 .mode = 0644,
941 .proc_handler = &proc_dointvec, 856 .proc_handler = proc_dointvec,
942 }, 857 },
943#endif 858#endif
944#ifdef CONFIG_RT_MUTEXES 859#ifdef CONFIG_RT_MUTEXES
945 { 860 {
946 .ctl_name = KERN_MAX_LOCK_DEPTH,
947 .procname = "max_lock_depth", 861 .procname = "max_lock_depth",
948 .data = &max_lock_depth, 862 .data = &max_lock_depth,
949 .maxlen = sizeof(int), 863 .maxlen = sizeof(int),
950 .mode = 0644, 864 .mode = 0644,
951 .proc_handler = &proc_dointvec, 865 .proc_handler = proc_dointvec,
952 }, 866 },
953#endif 867#endif
954 { 868 {
955 .ctl_name = CTL_UNNUMBERED,
956 .procname = "poweroff_cmd", 869 .procname = "poweroff_cmd",
957 .data = &poweroff_cmd, 870 .data = &poweroff_cmd,
958 .maxlen = POWEROFF_CMD_PATH_LEN, 871 .maxlen = POWEROFF_CMD_PATH_LEN,
959 .mode = 0644, 872 .mode = 0644,
960 .proc_handler = &proc_dostring, 873 .proc_handler = proc_dostring,
961 .strategy = &sysctl_string,
962 }, 874 },
963#ifdef CONFIG_KEYS 875#ifdef CONFIG_KEYS
964 { 876 {
965 .ctl_name = CTL_UNNUMBERED,
966 .procname = "keys", 877 .procname = "keys",
967 .mode = 0555, 878 .mode = 0555,
968 .child = key_sysctls, 879 .child = key_sysctls,
@@ -970,17 +881,15 @@ static struct ctl_table kern_table[] = {
970#endif 881#endif
971#ifdef CONFIG_RCU_TORTURE_TEST 882#ifdef CONFIG_RCU_TORTURE_TEST
972 { 883 {
973 .ctl_name = CTL_UNNUMBERED,
974 .procname = "rcutorture_runnable", 884 .procname = "rcutorture_runnable",
975 .data = &rcutorture_runnable, 885 .data = &rcutorture_runnable,
976 .maxlen = sizeof(int), 886 .maxlen = sizeof(int),
977 .mode = 0644, 887 .mode = 0644,
978 .proc_handler = &proc_dointvec, 888 .proc_handler = proc_dointvec,
979 }, 889 },
980#endif 890#endif
981#ifdef CONFIG_SLOW_WORK 891#ifdef CONFIG_SLOW_WORK
982 { 892 {
983 .ctl_name = CTL_UNNUMBERED,
984 .procname = "slow-work", 893 .procname = "slow-work",
985 .mode = 0555, 894 .mode = 0555,
986 .child = slow_work_sysctls, 895 .child = slow_work_sysctls,
@@ -988,146 +897,127 @@ static struct ctl_table kern_table[] = {
988#endif 897#endif
989#ifdef CONFIG_PERF_EVENTS 898#ifdef CONFIG_PERF_EVENTS
990 { 899 {
991 .ctl_name = CTL_UNNUMBERED,
992 .procname = "perf_event_paranoid", 900 .procname = "perf_event_paranoid",
993 .data = &sysctl_perf_event_paranoid, 901 .data = &sysctl_perf_event_paranoid,
994 .maxlen = sizeof(sysctl_perf_event_paranoid), 902 .maxlen = sizeof(sysctl_perf_event_paranoid),
995 .mode = 0644, 903 .mode = 0644,
996 .proc_handler = &proc_dointvec, 904 .proc_handler = proc_dointvec,
997 }, 905 },
998 { 906 {
999 .ctl_name = CTL_UNNUMBERED,
1000 .procname = "perf_event_mlock_kb", 907 .procname = "perf_event_mlock_kb",
1001 .data = &sysctl_perf_event_mlock, 908 .data = &sysctl_perf_event_mlock,
1002 .maxlen = sizeof(sysctl_perf_event_mlock), 909 .maxlen = sizeof(sysctl_perf_event_mlock),
1003 .mode = 0644, 910 .mode = 0644,
1004 .proc_handler = &proc_dointvec, 911 .proc_handler = proc_dointvec,
1005 }, 912 },
1006 { 913 {
1007 .ctl_name = CTL_UNNUMBERED,
1008 .procname = "perf_event_max_sample_rate", 914 .procname = "perf_event_max_sample_rate",
1009 .data = &sysctl_perf_event_sample_rate, 915 .data = &sysctl_perf_event_sample_rate,
1010 .maxlen = sizeof(sysctl_perf_event_sample_rate), 916 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1011 .mode = 0644, 917 .mode = 0644,
1012 .proc_handler = &proc_dointvec, 918 .proc_handler = proc_dointvec,
1013 }, 919 },
1014#endif 920#endif
1015#ifdef CONFIG_KMEMCHECK 921#ifdef CONFIG_KMEMCHECK
1016 { 922 {
1017 .ctl_name = CTL_UNNUMBERED,
1018 .procname = "kmemcheck", 923 .procname = "kmemcheck",
1019 .data = &kmemcheck_enabled, 924 .data = &kmemcheck_enabled,
1020 .maxlen = sizeof(int), 925 .maxlen = sizeof(int),
1021 .mode = 0644, 926 .mode = 0644,
1022 .proc_handler = &proc_dointvec, 927 .proc_handler = proc_dointvec,
1023 }, 928 },
1024#endif 929#endif
1025#ifdef CONFIG_BLOCK 930#ifdef CONFIG_BLOCK
1026 { 931 {
1027 .ctl_name = CTL_UNNUMBERED,
1028 .procname = "blk_iopoll", 932 .procname = "blk_iopoll",
1029 .data = &blk_iopoll_enabled, 933 .data = &blk_iopoll_enabled,
1030 .maxlen = sizeof(int), 934 .maxlen = sizeof(int),
1031 .mode = 0644, 935 .mode = 0644,
1032 .proc_handler = &proc_dointvec, 936 .proc_handler = proc_dointvec,
1033 }, 937 },
1034#endif 938#endif
1035/* 939/*
1036 * NOTE: do not add new entries to this table unless you have read 940 * NOTE: do not add new entries to this table unless you have read
1037 * Documentation/sysctl/ctl_unnumbered.txt 941 * Documentation/sysctl/ctl_unnumbered.txt
1038 */ 942 */
1039 { .ctl_name = 0 } 943 { }
1040}; 944};
1041 945
1042static struct ctl_table vm_table[] = { 946static struct ctl_table vm_table[] = {
1043 { 947 {
1044 .ctl_name = VM_OVERCOMMIT_MEMORY,
1045 .procname = "overcommit_memory", 948 .procname = "overcommit_memory",
1046 .data = &sysctl_overcommit_memory, 949 .data = &sysctl_overcommit_memory,
1047 .maxlen = sizeof(sysctl_overcommit_memory), 950 .maxlen = sizeof(sysctl_overcommit_memory),
1048 .mode = 0644, 951 .mode = 0644,
1049 .proc_handler = &proc_dointvec, 952 .proc_handler = proc_dointvec,
1050 }, 953 },
1051 { 954 {
1052 .ctl_name = VM_PANIC_ON_OOM,
1053 .procname = "panic_on_oom", 955 .procname = "panic_on_oom",
1054 .data = &sysctl_panic_on_oom, 956 .data = &sysctl_panic_on_oom,
1055 .maxlen = sizeof(sysctl_panic_on_oom), 957 .maxlen = sizeof(sysctl_panic_on_oom),
1056 .mode = 0644, 958 .mode = 0644,
1057 .proc_handler = &proc_dointvec, 959 .proc_handler = proc_dointvec,
1058 }, 960 },
1059 { 961 {
1060 .ctl_name = CTL_UNNUMBERED,
1061 .procname = "oom_kill_allocating_task", 962 .procname = "oom_kill_allocating_task",
1062 .data = &sysctl_oom_kill_allocating_task, 963 .data = &sysctl_oom_kill_allocating_task,
1063 .maxlen = sizeof(sysctl_oom_kill_allocating_task), 964 .maxlen = sizeof(sysctl_oom_kill_allocating_task),
1064 .mode = 0644, 965 .mode = 0644,
1065 .proc_handler = &proc_dointvec, 966 .proc_handler = proc_dointvec,
1066 }, 967 },
1067 { 968 {
1068 .ctl_name = CTL_UNNUMBERED,
1069 .procname = "oom_dump_tasks", 969 .procname = "oom_dump_tasks",
1070 .data = &sysctl_oom_dump_tasks, 970 .data = &sysctl_oom_dump_tasks,
1071 .maxlen = sizeof(sysctl_oom_dump_tasks), 971 .maxlen = sizeof(sysctl_oom_dump_tasks),
1072 .mode = 0644, 972 .mode = 0644,
1073 .proc_handler = &proc_dointvec, 973 .proc_handler = proc_dointvec,
1074 }, 974 },
1075 { 975 {
1076 .ctl_name = VM_OVERCOMMIT_RATIO,
1077 .procname = "overcommit_ratio", 976 .procname = "overcommit_ratio",
1078 .data = &sysctl_overcommit_ratio, 977 .data = &sysctl_overcommit_ratio,
1079 .maxlen = sizeof(sysctl_overcommit_ratio), 978 .maxlen = sizeof(sysctl_overcommit_ratio),
1080 .mode = 0644, 979 .mode = 0644,
1081 .proc_handler = &proc_dointvec, 980 .proc_handler = proc_dointvec,
1082 }, 981 },
1083 { 982 {
1084 .ctl_name = VM_PAGE_CLUSTER,
1085 .procname = "page-cluster", 983 .procname = "page-cluster",
1086 .data = &page_cluster, 984 .data = &page_cluster,
1087 .maxlen = sizeof(int), 985 .maxlen = sizeof(int),
1088 .mode = 0644, 986 .mode = 0644,
1089 .proc_handler = &proc_dointvec, 987 .proc_handler = proc_dointvec,
1090 }, 988 },
1091 { 989 {
1092 .ctl_name = VM_DIRTY_BACKGROUND,
1093 .procname = "dirty_background_ratio", 990 .procname = "dirty_background_ratio",
1094 .data = &dirty_background_ratio, 991 .data = &dirty_background_ratio,
1095 .maxlen = sizeof(dirty_background_ratio), 992 .maxlen = sizeof(dirty_background_ratio),
1096 .mode = 0644, 993 .mode = 0644,
1097 .proc_handler = &dirty_background_ratio_handler, 994 .proc_handler = dirty_background_ratio_handler,
1098 .strategy = &sysctl_intvec,
1099 .extra1 = &zero, 995 .extra1 = &zero,
1100 .extra2 = &one_hundred, 996 .extra2 = &one_hundred,
1101 }, 997 },
1102 { 998 {
1103 .ctl_name = CTL_UNNUMBERED,
1104 .procname = "dirty_background_bytes", 999 .procname = "dirty_background_bytes",
1105 .data = &dirty_background_bytes, 1000 .data = &dirty_background_bytes,
1106 .maxlen = sizeof(dirty_background_bytes), 1001 .maxlen = sizeof(dirty_background_bytes),
1107 .mode = 0644, 1002 .mode = 0644,
1108 .proc_handler = &dirty_background_bytes_handler, 1003 .proc_handler = dirty_background_bytes_handler,
1109 .strategy = &sysctl_intvec,
1110 .extra1 = &one_ul, 1004 .extra1 = &one_ul,
1111 }, 1005 },
1112 { 1006 {
1113 .ctl_name = VM_DIRTY_RATIO,
1114 .procname = "dirty_ratio", 1007 .procname = "dirty_ratio",
1115 .data = &vm_dirty_ratio, 1008 .data = &vm_dirty_ratio,
1116 .maxlen = sizeof(vm_dirty_ratio), 1009 .maxlen = sizeof(vm_dirty_ratio),
1117 .mode = 0644, 1010 .mode = 0644,
1118 .proc_handler = &dirty_ratio_handler, 1011 .proc_handler = dirty_ratio_handler,
1119 .strategy = &sysctl_intvec,
1120 .extra1 = &zero, 1012 .extra1 = &zero,
1121 .extra2 = &one_hundred, 1013 .extra2 = &one_hundred,
1122 }, 1014 },
1123 { 1015 {
1124 .ctl_name = CTL_UNNUMBERED,
1125 .procname = "dirty_bytes", 1016 .procname = "dirty_bytes",
1126 .data = &vm_dirty_bytes, 1017 .data = &vm_dirty_bytes,
1127 .maxlen = sizeof(vm_dirty_bytes), 1018 .maxlen = sizeof(vm_dirty_bytes),
1128 .mode = 0644, 1019 .mode = 0644,
1129 .proc_handler = &dirty_bytes_handler, 1020 .proc_handler = dirty_bytes_handler,
1130 .strategy = &sysctl_intvec,
1131 .extra1 = &dirty_bytes_min, 1021 .extra1 = &dirty_bytes_min,
1132 }, 1022 },
1133 { 1023 {
@@ -1135,289 +1025,256 @@ static struct ctl_table vm_table[] = {
1135 .data = &dirty_writeback_interval, 1025 .data = &dirty_writeback_interval,
1136 .maxlen = sizeof(dirty_writeback_interval), 1026 .maxlen = sizeof(dirty_writeback_interval),
1137 .mode = 0644, 1027 .mode = 0644,
1138 .proc_handler = &dirty_writeback_centisecs_handler, 1028 .proc_handler = dirty_writeback_centisecs_handler,
1139 }, 1029 },
1140 { 1030 {
1141 .procname = "dirty_expire_centisecs", 1031 .procname = "dirty_expire_centisecs",
1142 .data = &dirty_expire_interval, 1032 .data = &dirty_expire_interval,
1143 .maxlen = sizeof(dirty_expire_interval), 1033 .maxlen = sizeof(dirty_expire_interval),
1144 .mode = 0644, 1034 .mode = 0644,
1145 .proc_handler = &proc_dointvec, 1035 .proc_handler = proc_dointvec,
1146 }, 1036 },
1147 { 1037 {
1148 .ctl_name = VM_NR_PDFLUSH_THREADS,
1149 .procname = "nr_pdflush_threads", 1038 .procname = "nr_pdflush_threads",
1150 .data = &nr_pdflush_threads, 1039 .data = &nr_pdflush_threads,
1151 .maxlen = sizeof nr_pdflush_threads, 1040 .maxlen = sizeof nr_pdflush_threads,
1152 .mode = 0444 /* read-only*/, 1041 .mode = 0444 /* read-only*/,
1153 .proc_handler = &proc_dointvec, 1042 .proc_handler = proc_dointvec,
1154 }, 1043 },
1155 { 1044 {
1156 .ctl_name = VM_SWAPPINESS,
1157 .procname = "swappiness", 1045 .procname = "swappiness",
1158 .data = &vm_swappiness, 1046 .data = &vm_swappiness,
1159 .maxlen = sizeof(vm_swappiness), 1047 .maxlen = sizeof(vm_swappiness),
1160 .mode = 0644, 1048 .mode = 0644,
1161 .proc_handler = &proc_dointvec_minmax, 1049 .proc_handler = proc_dointvec_minmax,
1162 .strategy = &sysctl_intvec,
1163 .extra1 = &zero, 1050 .extra1 = &zero,
1164 .extra2 = &one_hundred, 1051 .extra2 = &one_hundred,
1165 }, 1052 },
1166#ifdef CONFIG_HUGETLB_PAGE 1053#ifdef CONFIG_HUGETLB_PAGE
1167 { 1054 {
1168 .procname = "nr_hugepages", 1055 .procname = "nr_hugepages",
1169 .data = NULL, 1056 .data = NULL,
1170 .maxlen = sizeof(unsigned long), 1057 .maxlen = sizeof(unsigned long),
1171 .mode = 0644, 1058 .mode = 0644,
1172 .proc_handler = &hugetlb_sysctl_handler, 1059 .proc_handler = hugetlb_sysctl_handler,
1173 .extra1 = (void *)&hugetlb_zero, 1060 .extra1 = (void *)&hugetlb_zero,
1174 .extra2 = (void *)&hugetlb_infinity, 1061 .extra2 = (void *)&hugetlb_infinity,
1175 }, 1062 },
1063#ifdef CONFIG_NUMA
1064 {
1065 .procname = "nr_hugepages_mempolicy",
1066 .data = NULL,
1067 .maxlen = sizeof(unsigned long),
1068 .mode = 0644,
1069 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1070 .extra1 = (void *)&hugetlb_zero,
1071 .extra2 = (void *)&hugetlb_infinity,
1072 },
1073#endif
1176 { 1074 {
1177 .ctl_name = VM_HUGETLB_GROUP,
1178 .procname = "hugetlb_shm_group", 1075 .procname = "hugetlb_shm_group",
1179 .data = &sysctl_hugetlb_shm_group, 1076 .data = &sysctl_hugetlb_shm_group,
1180 .maxlen = sizeof(gid_t), 1077 .maxlen = sizeof(gid_t),
1181 .mode = 0644, 1078 .mode = 0644,
1182 .proc_handler = &proc_dointvec, 1079 .proc_handler = proc_dointvec,
1183 }, 1080 },
1184 { 1081 {
1185 .ctl_name = CTL_UNNUMBERED,
1186 .procname = "hugepages_treat_as_movable", 1082 .procname = "hugepages_treat_as_movable",
1187 .data = &hugepages_treat_as_movable, 1083 .data = &hugepages_treat_as_movable,
1188 .maxlen = sizeof(int), 1084 .maxlen = sizeof(int),
1189 .mode = 0644, 1085 .mode = 0644,
1190 .proc_handler = &hugetlb_treat_movable_handler, 1086 .proc_handler = hugetlb_treat_movable_handler,
1191 }, 1087 },
1192 { 1088 {
1193 .ctl_name = CTL_UNNUMBERED,
1194 .procname = "nr_overcommit_hugepages", 1089 .procname = "nr_overcommit_hugepages",
1195 .data = NULL, 1090 .data = NULL,
1196 .maxlen = sizeof(unsigned long), 1091 .maxlen = sizeof(unsigned long),
1197 .mode = 0644, 1092 .mode = 0644,
1198 .proc_handler = &hugetlb_overcommit_handler, 1093 .proc_handler = hugetlb_overcommit_handler,
1199 .extra1 = (void *)&hugetlb_zero, 1094 .extra1 = (void *)&hugetlb_zero,
1200 .extra2 = (void *)&hugetlb_infinity, 1095 .extra2 = (void *)&hugetlb_infinity,
1201 }, 1096 },
1202#endif 1097#endif
1203 { 1098 {
1204 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
1205 .procname = "lowmem_reserve_ratio", 1099 .procname = "lowmem_reserve_ratio",
1206 .data = &sysctl_lowmem_reserve_ratio, 1100 .data = &sysctl_lowmem_reserve_ratio,
1207 .maxlen = sizeof(sysctl_lowmem_reserve_ratio), 1101 .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
1208 .mode = 0644, 1102 .mode = 0644,
1209 .proc_handler = &lowmem_reserve_ratio_sysctl_handler, 1103 .proc_handler = lowmem_reserve_ratio_sysctl_handler,
1210 .strategy = &sysctl_intvec,
1211 }, 1104 },
1212 { 1105 {
1213 .ctl_name = VM_DROP_PAGECACHE,
1214 .procname = "drop_caches", 1106 .procname = "drop_caches",
1215 .data = &sysctl_drop_caches, 1107 .data = &sysctl_drop_caches,
1216 .maxlen = sizeof(int), 1108 .maxlen = sizeof(int),
1217 .mode = 0644, 1109 .mode = 0644,
1218 .proc_handler = drop_caches_sysctl_handler, 1110 .proc_handler = drop_caches_sysctl_handler,
1219 .strategy = &sysctl_intvec,
1220 }, 1111 },
1221 { 1112 {
1222 .ctl_name = VM_MIN_FREE_KBYTES,
1223 .procname = "min_free_kbytes", 1113 .procname = "min_free_kbytes",
1224 .data = &min_free_kbytes, 1114 .data = &min_free_kbytes,
1225 .maxlen = sizeof(min_free_kbytes), 1115 .maxlen = sizeof(min_free_kbytes),
1226 .mode = 0644, 1116 .mode = 0644,
1227 .proc_handler = &min_free_kbytes_sysctl_handler, 1117 .proc_handler = min_free_kbytes_sysctl_handler,
1228 .strategy = &sysctl_intvec,
1229 .extra1 = &zero, 1118 .extra1 = &zero,
1230 }, 1119 },
1231 { 1120 {
1232 .ctl_name = VM_PERCPU_PAGELIST_FRACTION,
1233 .procname = "percpu_pagelist_fraction", 1121 .procname = "percpu_pagelist_fraction",
1234 .data = &percpu_pagelist_fraction, 1122 .data = &percpu_pagelist_fraction,
1235 .maxlen = sizeof(percpu_pagelist_fraction), 1123 .maxlen = sizeof(percpu_pagelist_fraction),
1236 .mode = 0644, 1124 .mode = 0644,
1237 .proc_handler = &percpu_pagelist_fraction_sysctl_handler, 1125 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
1238 .strategy = &sysctl_intvec,
1239 .extra1 = &min_percpu_pagelist_fract, 1126 .extra1 = &min_percpu_pagelist_fract,
1240 }, 1127 },
1241#ifdef CONFIG_MMU 1128#ifdef CONFIG_MMU
1242 { 1129 {
1243 .ctl_name = VM_MAX_MAP_COUNT,
1244 .procname = "max_map_count", 1130 .procname = "max_map_count",
1245 .data = &sysctl_max_map_count, 1131 .data = &sysctl_max_map_count,
1246 .maxlen = sizeof(sysctl_max_map_count), 1132 .maxlen = sizeof(sysctl_max_map_count),
1247 .mode = 0644, 1133 .mode = 0644,
1248 .proc_handler = &proc_dointvec 1134 .proc_handler = proc_dointvec,
1135 .extra1 = &zero,
1249 }, 1136 },
1250#else 1137#else
1251 { 1138 {
1252 .ctl_name = CTL_UNNUMBERED,
1253 .procname = "nr_trim_pages", 1139 .procname = "nr_trim_pages",
1254 .data = &sysctl_nr_trim_pages, 1140 .data = &sysctl_nr_trim_pages,
1255 .maxlen = sizeof(sysctl_nr_trim_pages), 1141 .maxlen = sizeof(sysctl_nr_trim_pages),
1256 .mode = 0644, 1142 .mode = 0644,
1257 .proc_handler = &proc_dointvec_minmax, 1143 .proc_handler = proc_dointvec_minmax,
1258 .strategy = &sysctl_intvec,
1259 .extra1 = &zero, 1144 .extra1 = &zero,
1260 }, 1145 },
1261#endif 1146#endif
1262 { 1147 {
1263 .ctl_name = VM_LAPTOP_MODE,
1264 .procname = "laptop_mode", 1148 .procname = "laptop_mode",
1265 .data = &laptop_mode, 1149 .data = &laptop_mode,
1266 .maxlen = sizeof(laptop_mode), 1150 .maxlen = sizeof(laptop_mode),
1267 .mode = 0644, 1151 .mode = 0644,
1268 .proc_handler = &proc_dointvec_jiffies, 1152 .proc_handler = proc_dointvec_jiffies,
1269 .strategy = &sysctl_jiffies,
1270 }, 1153 },
1271 { 1154 {
1272 .ctl_name = VM_BLOCK_DUMP,
1273 .procname = "block_dump", 1155 .procname = "block_dump",
1274 .data = &block_dump, 1156 .data = &block_dump,
1275 .maxlen = sizeof(block_dump), 1157 .maxlen = sizeof(block_dump),
1276 .mode = 0644, 1158 .mode = 0644,
1277 .proc_handler = &proc_dointvec, 1159 .proc_handler = proc_dointvec,
1278 .strategy = &sysctl_intvec,
1279 .extra1 = &zero, 1160 .extra1 = &zero,
1280 }, 1161 },
1281 { 1162 {
1282 .ctl_name = VM_VFS_CACHE_PRESSURE,
1283 .procname = "vfs_cache_pressure", 1163 .procname = "vfs_cache_pressure",
1284 .data = &sysctl_vfs_cache_pressure, 1164 .data = &sysctl_vfs_cache_pressure,
1285 .maxlen = sizeof(sysctl_vfs_cache_pressure), 1165 .maxlen = sizeof(sysctl_vfs_cache_pressure),
1286 .mode = 0644, 1166 .mode = 0644,
1287 .proc_handler = &proc_dointvec, 1167 .proc_handler = proc_dointvec,
1288 .strategy = &sysctl_intvec,
1289 .extra1 = &zero, 1168 .extra1 = &zero,
1290 }, 1169 },
1291#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1170#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
1292 { 1171 {
1293 .ctl_name = VM_LEGACY_VA_LAYOUT,
1294 .procname = "legacy_va_layout", 1172 .procname = "legacy_va_layout",
1295 .data = &sysctl_legacy_va_layout, 1173 .data = &sysctl_legacy_va_layout,
1296 .maxlen = sizeof(sysctl_legacy_va_layout), 1174 .maxlen = sizeof(sysctl_legacy_va_layout),
1297 .mode = 0644, 1175 .mode = 0644,
1298 .proc_handler = &proc_dointvec, 1176 .proc_handler = proc_dointvec,
1299 .strategy = &sysctl_intvec,
1300 .extra1 = &zero, 1177 .extra1 = &zero,
1301 }, 1178 },
1302#endif 1179#endif
1303#ifdef CONFIG_NUMA 1180#ifdef CONFIG_NUMA
1304 { 1181 {
1305 .ctl_name = VM_ZONE_RECLAIM_MODE,
1306 .procname = "zone_reclaim_mode", 1182 .procname = "zone_reclaim_mode",
1307 .data = &zone_reclaim_mode, 1183 .data = &zone_reclaim_mode,
1308 .maxlen = sizeof(zone_reclaim_mode), 1184 .maxlen = sizeof(zone_reclaim_mode),
1309 .mode = 0644, 1185 .mode = 0644,
1310 .proc_handler = &proc_dointvec, 1186 .proc_handler = proc_dointvec,
1311 .strategy = &sysctl_intvec,
1312 .extra1 = &zero, 1187 .extra1 = &zero,
1313 }, 1188 },
1314 { 1189 {
1315 .ctl_name = VM_MIN_UNMAPPED,
1316 .procname = "min_unmapped_ratio", 1190 .procname = "min_unmapped_ratio",
1317 .data = &sysctl_min_unmapped_ratio, 1191 .data = &sysctl_min_unmapped_ratio,
1318 .maxlen = sizeof(sysctl_min_unmapped_ratio), 1192 .maxlen = sizeof(sysctl_min_unmapped_ratio),
1319 .mode = 0644, 1193 .mode = 0644,
1320 .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, 1194 .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
1321 .strategy = &sysctl_intvec,
1322 .extra1 = &zero, 1195 .extra1 = &zero,
1323 .extra2 = &one_hundred, 1196 .extra2 = &one_hundred,
1324 }, 1197 },
1325 { 1198 {
1326 .ctl_name = VM_MIN_SLAB,
1327 .procname = "min_slab_ratio", 1199 .procname = "min_slab_ratio",
1328 .data = &sysctl_min_slab_ratio, 1200 .data = &sysctl_min_slab_ratio,
1329 .maxlen = sizeof(sysctl_min_slab_ratio), 1201 .maxlen = sizeof(sysctl_min_slab_ratio),
1330 .mode = 0644, 1202 .mode = 0644,
1331 .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, 1203 .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
1332 .strategy = &sysctl_intvec,
1333 .extra1 = &zero, 1204 .extra1 = &zero,
1334 .extra2 = &one_hundred, 1205 .extra2 = &one_hundred,
1335 }, 1206 },
1336#endif 1207#endif
1337#ifdef CONFIG_SMP 1208#ifdef CONFIG_SMP
1338 { 1209 {
1339 .ctl_name = CTL_UNNUMBERED,
1340 .procname = "stat_interval", 1210 .procname = "stat_interval",
1341 .data = &sysctl_stat_interval, 1211 .data = &sysctl_stat_interval,
1342 .maxlen = sizeof(sysctl_stat_interval), 1212 .maxlen = sizeof(sysctl_stat_interval),
1343 .mode = 0644, 1213 .mode = 0644,
1344 .proc_handler = &proc_dointvec_jiffies, 1214 .proc_handler = proc_dointvec_jiffies,
1345 .strategy = &sysctl_jiffies,
1346 }, 1215 },
1347#endif 1216#endif
1348 { 1217 {
1349 .ctl_name = CTL_UNNUMBERED,
1350 .procname = "mmap_min_addr", 1218 .procname = "mmap_min_addr",
1351 .data = &dac_mmap_min_addr, 1219 .data = &dac_mmap_min_addr,
1352 .maxlen = sizeof(unsigned long), 1220 .maxlen = sizeof(unsigned long),
1353 .mode = 0644, 1221 .mode = 0644,
1354 .proc_handler = &mmap_min_addr_handler, 1222 .proc_handler = mmap_min_addr_handler,
1355 }, 1223 },
1356#ifdef CONFIG_NUMA 1224#ifdef CONFIG_NUMA
1357 { 1225 {
1358 .ctl_name = CTL_UNNUMBERED,
1359 .procname = "numa_zonelist_order", 1226 .procname = "numa_zonelist_order",
1360 .data = &numa_zonelist_order, 1227 .data = &numa_zonelist_order,
1361 .maxlen = NUMA_ZONELIST_ORDER_LEN, 1228 .maxlen = NUMA_ZONELIST_ORDER_LEN,
1362 .mode = 0644, 1229 .mode = 0644,
1363 .proc_handler = &numa_zonelist_order_handler, 1230 .proc_handler = numa_zonelist_order_handler,
1364 .strategy = &sysctl_string,
1365 }, 1231 },
1366#endif 1232#endif
1367#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ 1233#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
1368 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1234 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1369 { 1235 {
1370 .ctl_name = VM_VDSO_ENABLED,
1371 .procname = "vdso_enabled", 1236 .procname = "vdso_enabled",
1372 .data = &vdso_enabled, 1237 .data = &vdso_enabled,
1373 .maxlen = sizeof(vdso_enabled), 1238 .maxlen = sizeof(vdso_enabled),
1374 .mode = 0644, 1239 .mode = 0644,
1375 .proc_handler = &proc_dointvec, 1240 .proc_handler = proc_dointvec,
1376 .strategy = &sysctl_intvec,
1377 .extra1 = &zero, 1241 .extra1 = &zero,
1378 }, 1242 },
1379#endif 1243#endif
1380#ifdef CONFIG_HIGHMEM 1244#ifdef CONFIG_HIGHMEM
1381 { 1245 {
1382 .ctl_name = CTL_UNNUMBERED,
1383 .procname = "highmem_is_dirtyable", 1246 .procname = "highmem_is_dirtyable",
1384 .data = &vm_highmem_is_dirtyable, 1247 .data = &vm_highmem_is_dirtyable,
1385 .maxlen = sizeof(vm_highmem_is_dirtyable), 1248 .maxlen = sizeof(vm_highmem_is_dirtyable),
1386 .mode = 0644, 1249 .mode = 0644,
1387 .proc_handler = &proc_dointvec_minmax, 1250 .proc_handler = proc_dointvec_minmax,
1388 .strategy = &sysctl_intvec,
1389 .extra1 = &zero, 1251 .extra1 = &zero,
1390 .extra2 = &one, 1252 .extra2 = &one,
1391 }, 1253 },
1392#endif 1254#endif
1393 { 1255 {
1394 .ctl_name = CTL_UNNUMBERED,
1395 .procname = "scan_unevictable_pages", 1256 .procname = "scan_unevictable_pages",
1396 .data = &scan_unevictable_pages, 1257 .data = &scan_unevictable_pages,
1397 .maxlen = sizeof(scan_unevictable_pages), 1258 .maxlen = sizeof(scan_unevictable_pages),
1398 .mode = 0644, 1259 .mode = 0644,
1399 .proc_handler = &scan_unevictable_handler, 1260 .proc_handler = scan_unevictable_handler,
1400 }, 1261 },
1401#ifdef CONFIG_MEMORY_FAILURE 1262#ifdef CONFIG_MEMORY_FAILURE
1402 { 1263 {
1403 .ctl_name = CTL_UNNUMBERED,
1404 .procname = "memory_failure_early_kill", 1264 .procname = "memory_failure_early_kill",
1405 .data = &sysctl_memory_failure_early_kill, 1265 .data = &sysctl_memory_failure_early_kill,
1406 .maxlen = sizeof(sysctl_memory_failure_early_kill), 1266 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1407 .mode = 0644, 1267 .mode = 0644,
1408 .proc_handler = &proc_dointvec_minmax, 1268 .proc_handler = proc_dointvec_minmax,
1409 .strategy = &sysctl_intvec,
1410 .extra1 = &zero, 1269 .extra1 = &zero,
1411 .extra2 = &one, 1270 .extra2 = &one,
1412 }, 1271 },
1413 { 1272 {
1414 .ctl_name = CTL_UNNUMBERED,
1415 .procname = "memory_failure_recovery", 1273 .procname = "memory_failure_recovery",
1416 .data = &sysctl_memory_failure_recovery, 1274 .data = &sysctl_memory_failure_recovery,
1417 .maxlen = sizeof(sysctl_memory_failure_recovery), 1275 .maxlen = sizeof(sysctl_memory_failure_recovery),
1418 .mode = 0644, 1276 .mode = 0644,
1419 .proc_handler = &proc_dointvec_minmax, 1277 .proc_handler = proc_dointvec_minmax,
1420 .strategy = &sysctl_intvec,
1421 .extra1 = &zero, 1278 .extra1 = &zero,
1422 .extra2 = &one, 1279 .extra2 = &one,
1423 }, 1280 },
@@ -1427,116 +1284,104 @@ static struct ctl_table vm_table[] = {
1427 * NOTE: do not add new entries to this table unless you have read 1284 * NOTE: do not add new entries to this table unless you have read
1428 * Documentation/sysctl/ctl_unnumbered.txt 1285 * Documentation/sysctl/ctl_unnumbered.txt
1429 */ 1286 */
1430 { .ctl_name = 0 } 1287 { }
1431}; 1288};
1432 1289
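The conversion applied to every entry in this table follows one pattern: the binary .ctl_name identifier and the .strategy callback are dropped, and .proc_handler is assigned by function name rather than with an explicit address-of operator (equivalent for a function pointer). Using the laptop_mode entry above as the example (the wrapper array names below are illustrative only, not part of the kernel source):

	/* before */
	static struct ctl_table vm_entry_before[] = {
		{
			.ctl_name	= VM_LAPTOP_MODE,
			.procname	= "laptop_mode",
			.data		= &laptop_mode,
			.maxlen		= sizeof(laptop_mode),
			.mode		= 0644,
			.proc_handler	= &proc_dointvec_jiffies,
			.strategy	= &sysctl_jiffies,
		},
		{ .ctl_name = 0 }	/* old-style sentinel */
	};

	/* after */
	static struct ctl_table vm_entry_after[] = {
		{
			.procname	= "laptop_mode",
			.data		= &laptop_mode,
			.maxlen		= sizeof(laptop_mode),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_jiffies,
		},
		{ }			/* empty sentinel: procname == NULL */
	};
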
1433#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1290#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1434static struct ctl_table binfmt_misc_table[] = { 1291static struct ctl_table binfmt_misc_table[] = {
1435 { .ctl_name = 0 } 1292 { }
1436}; 1293};
1437#endif 1294#endif
1438 1295
1439static struct ctl_table fs_table[] = { 1296static struct ctl_table fs_table[] = {
1440 { 1297 {
1441 .ctl_name = FS_NRINODE,
1442 .procname = "inode-nr", 1298 .procname = "inode-nr",
1443 .data = &inodes_stat, 1299 .data = &inodes_stat,
1444 .maxlen = 2*sizeof(int), 1300 .maxlen = 2*sizeof(int),
1445 .mode = 0444, 1301 .mode = 0444,
1446 .proc_handler = &proc_dointvec, 1302 .proc_handler = proc_dointvec,
1447 }, 1303 },
1448 { 1304 {
1449 .ctl_name = FS_STATINODE,
1450 .procname = "inode-state", 1305 .procname = "inode-state",
1451 .data = &inodes_stat, 1306 .data = &inodes_stat,
1452 .maxlen = 7*sizeof(int), 1307 .maxlen = 7*sizeof(int),
1453 .mode = 0444, 1308 .mode = 0444,
1454 .proc_handler = &proc_dointvec, 1309 .proc_handler = proc_dointvec,
1455 }, 1310 },
1456 { 1311 {
1457 .procname = "file-nr", 1312 .procname = "file-nr",
1458 .data = &files_stat, 1313 .data = &files_stat,
1459 .maxlen = 3*sizeof(int), 1314 .maxlen = 3*sizeof(int),
1460 .mode = 0444, 1315 .mode = 0444,
1461 .proc_handler = &proc_nr_files, 1316 .proc_handler = proc_nr_files,
1462 }, 1317 },
1463 { 1318 {
1464 .ctl_name = FS_MAXFILE,
1465 .procname = "file-max", 1319 .procname = "file-max",
1466 .data = &files_stat.max_files, 1320 .data = &files_stat.max_files,
1467 .maxlen = sizeof(int), 1321 .maxlen = sizeof(int),
1468 .mode = 0644, 1322 .mode = 0644,
1469 .proc_handler = &proc_dointvec, 1323 .proc_handler = proc_dointvec,
1470 }, 1324 },
1471 { 1325 {
1472 .ctl_name = CTL_UNNUMBERED,
1473 .procname = "nr_open", 1326 .procname = "nr_open",
1474 .data = &sysctl_nr_open, 1327 .data = &sysctl_nr_open,
1475 .maxlen = sizeof(int), 1328 .maxlen = sizeof(int),
1476 .mode = 0644, 1329 .mode = 0644,
1477 .proc_handler = &proc_dointvec_minmax, 1330 .proc_handler = proc_dointvec_minmax,
1478 .extra1 = &sysctl_nr_open_min, 1331 .extra1 = &sysctl_nr_open_min,
1479 .extra2 = &sysctl_nr_open_max, 1332 .extra2 = &sysctl_nr_open_max,
1480 }, 1333 },
1481 { 1334 {
1482 .ctl_name = FS_DENTRY,
1483 .procname = "dentry-state", 1335 .procname = "dentry-state",
1484 .data = &dentry_stat, 1336 .data = &dentry_stat,
1485 .maxlen = 6*sizeof(int), 1337 .maxlen = 6*sizeof(int),
1486 .mode = 0444, 1338 .mode = 0444,
1487 .proc_handler = &proc_dointvec, 1339 .proc_handler = proc_dointvec,
1488 }, 1340 },
1489 { 1341 {
1490 .ctl_name = FS_OVERFLOWUID,
1491 .procname = "overflowuid", 1342 .procname = "overflowuid",
1492 .data = &fs_overflowuid, 1343 .data = &fs_overflowuid,
1493 .maxlen = sizeof(int), 1344 .maxlen = sizeof(int),
1494 .mode = 0644, 1345 .mode = 0644,
1495 .proc_handler = &proc_dointvec_minmax, 1346 .proc_handler = proc_dointvec_minmax,
1496 .strategy = &sysctl_intvec,
1497 .extra1 = &minolduid, 1347 .extra1 = &minolduid,
1498 .extra2 = &maxolduid, 1348 .extra2 = &maxolduid,
1499 }, 1349 },
1500 { 1350 {
1501 .ctl_name = FS_OVERFLOWGID,
1502 .procname = "overflowgid", 1351 .procname = "overflowgid",
1503 .data = &fs_overflowgid, 1352 .data = &fs_overflowgid,
1504 .maxlen = sizeof(int), 1353 .maxlen = sizeof(int),
1505 .mode = 0644, 1354 .mode = 0644,
1506 .proc_handler = &proc_dointvec_minmax, 1355 .proc_handler = proc_dointvec_minmax,
1507 .strategy = &sysctl_intvec,
1508 .extra1 = &minolduid, 1356 .extra1 = &minolduid,
1509 .extra2 = &maxolduid, 1357 .extra2 = &maxolduid,
1510 }, 1358 },
1511#ifdef CONFIG_FILE_LOCKING 1359#ifdef CONFIG_FILE_LOCKING
1512 { 1360 {
1513 .ctl_name = FS_LEASES,
1514 .procname = "leases-enable", 1361 .procname = "leases-enable",
1515 .data = &leases_enable, 1362 .data = &leases_enable,
1516 .maxlen = sizeof(int), 1363 .maxlen = sizeof(int),
1517 .mode = 0644, 1364 .mode = 0644,
1518 .proc_handler = &proc_dointvec, 1365 .proc_handler = proc_dointvec,
1519 }, 1366 },
1520#endif 1367#endif
1521#ifdef CONFIG_DNOTIFY 1368#ifdef CONFIG_DNOTIFY
1522 { 1369 {
1523 .ctl_name = FS_DIR_NOTIFY,
1524 .procname = "dir-notify-enable", 1370 .procname = "dir-notify-enable",
1525 .data = &dir_notify_enable, 1371 .data = &dir_notify_enable,
1526 .maxlen = sizeof(int), 1372 .maxlen = sizeof(int),
1527 .mode = 0644, 1373 .mode = 0644,
1528 .proc_handler = &proc_dointvec, 1374 .proc_handler = proc_dointvec,
1529 }, 1375 },
1530#endif 1376#endif
1531#ifdef CONFIG_MMU 1377#ifdef CONFIG_MMU
1532#ifdef CONFIG_FILE_LOCKING 1378#ifdef CONFIG_FILE_LOCKING
1533 { 1379 {
1534 .ctl_name = FS_LEASE_TIME,
1535 .procname = "lease-break-time", 1380 .procname = "lease-break-time",
1536 .data = &lease_break_time, 1381 .data = &lease_break_time,
1537 .maxlen = sizeof(int), 1382 .maxlen = sizeof(int),
1538 .mode = 0644, 1383 .mode = 0644,
1539 .proc_handler = &proc_dointvec, 1384 .proc_handler = proc_dointvec,
1540 }, 1385 },
1541#endif 1386#endif
1542#ifdef CONFIG_AIO 1387#ifdef CONFIG_AIO
@@ -1545,19 +1390,18 @@ static struct ctl_table fs_table[] = {
1545 .data = &aio_nr, 1390 .data = &aio_nr,
1546 .maxlen = sizeof(aio_nr), 1391 .maxlen = sizeof(aio_nr),
1547 .mode = 0444, 1392 .mode = 0444,
1548 .proc_handler = &proc_doulongvec_minmax, 1393 .proc_handler = proc_doulongvec_minmax,
1549 }, 1394 },
1550 { 1395 {
1551 .procname = "aio-max-nr", 1396 .procname = "aio-max-nr",
1552 .data = &aio_max_nr, 1397 .data = &aio_max_nr,
1553 .maxlen = sizeof(aio_max_nr), 1398 .maxlen = sizeof(aio_max_nr),
1554 .mode = 0644, 1399 .mode = 0644,
1555 .proc_handler = &proc_doulongvec_minmax, 1400 .proc_handler = proc_doulongvec_minmax,
1556 }, 1401 },
1557#endif /* CONFIG_AIO */ 1402#endif /* CONFIG_AIO */
1558#ifdef CONFIG_INOTIFY_USER 1403#ifdef CONFIG_INOTIFY_USER
1559 { 1404 {
1560 .ctl_name = FS_INOTIFY,
1561 .procname = "inotify", 1405 .procname = "inotify",
1562 .mode = 0555, 1406 .mode = 0555,
1563 .child = inotify_table, 1407 .child = inotify_table,
@@ -1572,19 +1416,16 @@ static struct ctl_table fs_table[] = {
1572#endif 1416#endif
1573#endif 1417#endif
1574 { 1418 {
1575 .ctl_name = KERN_SETUID_DUMPABLE,
1576 .procname = "suid_dumpable", 1419 .procname = "suid_dumpable",
1577 .data = &suid_dumpable, 1420 .data = &suid_dumpable,
1578 .maxlen = sizeof(int), 1421 .maxlen = sizeof(int),
1579 .mode = 0644, 1422 .mode = 0644,
1580 .proc_handler = &proc_dointvec_minmax, 1423 .proc_handler = proc_dointvec_minmax,
1581 .strategy = &sysctl_intvec,
1582 .extra1 = &zero, 1424 .extra1 = &zero,
1583 .extra2 = &two, 1425 .extra2 = &two,
1584 }, 1426 },
1585#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1427#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1586 { 1428 {
1587 .ctl_name = CTL_UNNUMBERED,
1588 .procname = "binfmt_misc", 1429 .procname = "binfmt_misc",
1589 .mode = 0555, 1430 .mode = 0555,
1590 .child = binfmt_misc_table, 1431 .child = binfmt_misc_table,
@@ -1594,13 +1435,12 @@ static struct ctl_table fs_table[] = {
1594 * NOTE: do not add new entries to this table unless you have read 1435 * NOTE: do not add new entries to this table unless you have read
1595 * Documentation/sysctl/ctl_unnumbered.txt 1436 * Documentation/sysctl/ctl_unnumbered.txt
1596 */ 1437 */
1597 { .ctl_name = 0 } 1438 { }
1598}; 1439};
1599 1440
1600static struct ctl_table debug_table[] = { 1441static struct ctl_table debug_table[] = {
1601#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1442#if defined(CONFIG_X86) || defined(CONFIG_PPC)
1602 { 1443 {
1603 .ctl_name = CTL_UNNUMBERED,
1604 .procname = "exception-trace", 1444 .procname = "exception-trace",
1605 .data = &show_unhandled_signals, 1445 .data = &show_unhandled_signals,
1606 .maxlen = sizeof(int), 1446 .maxlen = sizeof(int),
@@ -1608,11 +1448,11 @@ static struct ctl_table debug_table[] = {
1608 .proc_handler = proc_dointvec 1448 .proc_handler = proc_dointvec
1609 }, 1449 },
1610#endif 1450#endif
1611 { .ctl_name = 0 } 1451 { }
1612}; 1452};
1613 1453
1614static struct ctl_table dev_table[] = { 1454static struct ctl_table dev_table[] = {
1615 { .ctl_name = 0 } 1455 { }
1616}; 1456};
1617 1457
1618static DEFINE_SPINLOCK(sysctl_lock); 1458static DEFINE_SPINLOCK(sysctl_lock);
@@ -1766,122 +1606,6 @@ void register_sysctl_root(struct ctl_table_root *root)
1766 spin_unlock(&sysctl_lock); 1606 spin_unlock(&sysctl_lock);
1767} 1607}
1768 1608
1769#ifdef CONFIG_SYSCTL_SYSCALL
1770/* Perform the actual read/write of a sysctl table entry. */
1771static int do_sysctl_strategy(struct ctl_table_root *root,
1772 struct ctl_table *table,
1773 void __user *oldval, size_t __user *oldlenp,
1774 void __user *newval, size_t newlen)
1775{
1776 int op = 0, rc;
1777
1778 if (oldval)
1779 op |= MAY_READ;
1780 if (newval)
1781 op |= MAY_WRITE;
1782 if (sysctl_perm(root, table, op))
1783 return -EPERM;
1784
1785 if (table->strategy) {
1786 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1787 if (rc < 0)
1788 return rc;
1789 if (rc > 0)
1790 return 0;
1791 }
1792
1793 /* If there is no strategy routine, or if the strategy returns
1794 * zero, proceed with automatic r/w */
1795 if (table->data && table->maxlen) {
1796 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1797 if (rc < 0)
1798 return rc;
1799 }
1800 return 0;
1801}
1802
1803static int parse_table(int __user *name, int nlen,
1804 void __user *oldval, size_t __user *oldlenp,
1805 void __user *newval, size_t newlen,
1806 struct ctl_table_root *root,
1807 struct ctl_table *table)
1808{
1809 int n;
1810repeat:
1811 if (!nlen)
1812 return -ENOTDIR;
1813 if (get_user(n, name))
1814 return -EFAULT;
1815 for ( ; table->ctl_name || table->procname; table++) {
1816 if (!table->ctl_name)
1817 continue;
1818 if (n == table->ctl_name) {
1819 int error;
1820 if (table->child) {
1821 if (sysctl_perm(root, table, MAY_EXEC))
1822 return -EPERM;
1823 name++;
1824 nlen--;
1825 table = table->child;
1826 goto repeat;
1827 }
1828 error = do_sysctl_strategy(root, table,
1829 oldval, oldlenp,
1830 newval, newlen);
1831 return error;
1832 }
1833 }
1834 return -ENOTDIR;
1835}
1836
1837int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1838 void __user *newval, size_t newlen)
1839{
1840 struct ctl_table_header *head;
1841 int error = -ENOTDIR;
1842
1843 if (nlen <= 0 || nlen >= CTL_MAXNAME)
1844 return -ENOTDIR;
1845 if (oldval) {
1846 int old_len;
1847 if (!oldlenp || get_user(old_len, oldlenp))
1848 return -EFAULT;
1849 }
1850
1851 for (head = sysctl_head_next(NULL); head;
1852 head = sysctl_head_next(head)) {
1853 error = parse_table(name, nlen, oldval, oldlenp,
1854 newval, newlen,
1855 head->root, head->ctl_table);
1856 if (error != -ENOTDIR) {
1857 sysctl_head_finish(head);
1858 break;
1859 }
1860 }
1861 return error;
1862}
1863
1864SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1865{
1866 struct __sysctl_args tmp;
1867 int error;
1868
1869 if (copy_from_user(&tmp, args, sizeof(tmp)))
1870 return -EFAULT;
1871
1872 error = deprecated_sysctl_warning(&tmp);
1873 if (error)
1874 goto out;
1875
1876 lock_kernel();
1877 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
1878 tmp.newval, tmp.newlen);
1879 unlock_kernel();
1880out:
1881 return error;
1882}
1883#endif /* CONFIG_SYSCTL_SYSCALL */
1884
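The block removed above is the in-kernel side of the binary sysctl(2) interface: sys_sysctl() copied the argument block from userspace, do_sysctl() walked the registered table headers, parse_table() matched the integer name vector against ctl_name fields, and do_sysctl_strategy() performed the actual read/write. That interface is reimplemented by the new kernel/sysctl_binary.c added later in this diff. For orientation, a userspace call through it looks roughly like the sketch below; read_ostype() is a hypothetical helper, and the sketch assumes an architecture and libc that still define SYS__sysctl and struct __sysctl_args.

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/sysctl.h>

	/* Read kernel.ostype via its binary name { CTL_KERN, KERN_OSTYPE }.
	 * The raw syscall returns 0 on success, -1 with errno set on failure. */
	static int read_ostype(char *buf, size_t buflen)
	{
		int name[] = { CTL_KERN, KERN_OSTYPE };
		struct __sysctl_args args;
		size_t len = buflen;

		memset(&args, 0, sizeof(args));
		args.name = name;
		args.nlen = 2;
		args.oldval = buf;
		args.oldlenp = &len;

		return syscall(SYS__sysctl, &args);
	}
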
1885/* 1609/*
1886 * sysctl_perm does NOT grant the superuser all rights automatically, because 1610 * sysctl_perm does NOT grant the superuser all rights automatically, because
1887 * some sysctl variables are readonly even to root. 1611 * some sysctl variables are readonly even to root.
@@ -1917,7 +1641,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1917 1641
1918static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) 1642static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1919{ 1643{
1920 for (; table->ctl_name || table->procname; table++) { 1644 for (; table->procname; table++) {
1921 table->parent = parent; 1645 table->parent = parent;
1922 if (table->child) 1646 if (table->child)
1923 sysctl_set_parent(table, table->child); 1647 sysctl_set_parent(table, table->child);
@@ -1949,11 +1673,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch,
1949 return NULL; 1673 return NULL;
1950 1674
1951 /* ... and nothing else */ 1675 /* ... and nothing else */
1952 if (branch[1].procname || branch[1].ctl_name) 1676 if (branch[1].procname)
1953 return NULL; 1677 return NULL;
1954 1678
1955 /* table should contain subdirectory with the same name */ 1679 /* table should contain subdirectory with the same name */
1956 for (p = table; p->procname || p->ctl_name; p++) { 1680 for (p = table; p->procname; p++) {
1957 if (!p->child) 1681 if (!p->child)
1958 continue; 1682 continue;
1959 if (p->procname && strcmp(p->procname, s) == 0) 1683 if (p->procname && strcmp(p->procname, s) == 0)
@@ -1998,9 +1722,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1998 * 1722 *
1999 * The members of the &struct ctl_table structure are used as follows: 1723 * The members of the &struct ctl_table structure are used as follows:
2000 * 1724 *
2001 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
2002 * must be unique within that level of sysctl
2003 *
2004 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not 1725 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
2005 * enter a sysctl file 1726 * enter a sysctl file
2006 * 1727 *
@@ -2015,8 +1736,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
2015 * 1736 *
2016 * proc_handler - the text handler routine (described below) 1737 * proc_handler - the text handler routine (described below)
2017 * 1738 *
2018 * strategy - the strategy routine (described below)
2019 *
2020 * de - for internal use by the sysctl routines 1739 * de - for internal use by the sysctl routines
2021 * 1740 *
2022 * extra1, extra2 - extra pointers usable by the proc handler routines 1741 * extra1, extra2 - extra pointers usable by the proc handler routines
@@ -2029,19 +1748,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
2029 * struct enable minimal validation of the values being written to be 1748 * struct enable minimal validation of the values being written to be
2030 * performed, and the mode field allows minimal authentication. 1749 * performed, and the mode field allows minimal authentication.
2031 * 1750 *
2032 * More sophisticated management can be enabled by the provision of a
2033 * strategy routine with the table entry. This will be called before
2034 * any automatic read or write of the data is performed.
2035 *
2036 * The strategy routine may return
2037 *
2038 * < 0 - Error occurred (error is passed to user process)
2039 *
2040 * 0 - OK - proceed with automatic read or write.
2041 *
2042 * > 0 - OK - read or write has been done by the strategy routine, so
2043 * return immediately.
2044 *
2045 * There must be a proc_handler routine for any terminal nodes 1751 * There must be a proc_handler routine for any terminal nodes
2046 * mirrored under /proc/sys (non-terminals are handled by a built-in 1752 * mirrored under /proc/sys (non-terminals are handled by a built-in
2047 * directory handler). Several default handlers are available to 1753 * directory handler). Several default handlers are available to
@@ -2068,13 +1774,13 @@ struct ctl_table_header *__register_sysctl_paths(
2068 struct ctl_table_set *set; 1774 struct ctl_table_set *set;
2069 1775
2070 /* Count the path components */ 1776 /* Count the path components */
2071 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) 1777 for (npath = 0; path[npath].procname; ++npath)
2072 ; 1778 ;
2073 1779
2074 /* 1780 /*
2075 * For each path component, allocate a 2-element ctl_table array. 1781 * For each path component, allocate a 2-element ctl_table array.
2076 * The first array element will be filled with the sysctl entry 1782 * The first array element will be filled with the sysctl entry
2077 * for this, the second will be the sentinel (ctl_name == 0). 1783 * for this, the second will be the sentinel (procname == 0).
2078 * 1784 *
2079 * We allocate everything in one go so that we don't have to 1785 * We allocate everything in one go so that we don't have to
2080 * worry about freeing additional memory in unregister_sysctl_table. 1786 * worry about freeing additional memory in unregister_sysctl_table.
@@ -2091,7 +1797,6 @@ struct ctl_table_header *__register_sysctl_paths(
2091 for (n = 0; n < npath; ++n, ++path) { 1797 for (n = 0; n < npath; ++n, ++path) {
2092 /* Copy the procname */ 1798 /* Copy the procname */
2093 new->procname = path->procname; 1799 new->procname = path->procname;
2094 new->ctl_name = path->ctl_name;
2095 new->mode = 0555; 1800 new->mode = 0555;
2096 1801
2097 *prevp = new; 1802 *prevp = new;
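With ctl_name gone, __register_sysctl_paths() copies only the procname of each path component into the directory entries it allocates. A minimal usage sketch under the post-conversion convention (hypothetical names, not taken from this patch): the caller supplies a ctl_path array and a leaf table, each terminated by an empty sentinel.

	#include <linux/errno.h>
	#include <linux/init.h>
	#include <linux/sysctl.h>

	static int example_value;		/* hypothetical knob */

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_value",
			.data		= &example_value,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ }
	};

	static const struct ctl_path example_path[] = {
		{ .procname = "kernel" },
		{ .procname = "example" },	/* /proc/sys/kernel/example/ */
		{ }
	};

	static struct ctl_table_header *example_header;

	static int __init example_sysctl_init(void)
	{
		example_header = register_sysctl_paths(example_path, example_table);
		return example_header ? 0 : -ENOMEM;
	}

Teardown uses the unchanged unregister_sysctl_table(example_header).
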
@@ -2953,286 +2658,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2953 2658
2954#endif /* CONFIG_PROC_FS */ 2659#endif /* CONFIG_PROC_FS */
2955 2660
2956
2957#ifdef CONFIG_SYSCTL_SYSCALL
2958/*
2959 * General sysctl support routines
2960 */
2961
2962/* The generic sysctl data routine (used if no strategy routine supplied) */
2963int sysctl_data(struct ctl_table *table,
2964 void __user *oldval, size_t __user *oldlenp,
2965 void __user *newval, size_t newlen)
2966{
2967 size_t len;
2968
2969 /* Get out of I don't have a variable */
2970 if (!table->data || !table->maxlen)
2971 return -ENOTDIR;
2972
2973 if (oldval && oldlenp) {
2974 if (get_user(len, oldlenp))
2975 return -EFAULT;
2976 if (len) {
2977 if (len > table->maxlen)
2978 len = table->maxlen;
2979 if (copy_to_user(oldval, table->data, len))
2980 return -EFAULT;
2981 if (put_user(len, oldlenp))
2982 return -EFAULT;
2983 }
2984 }
2985
2986 if (newval && newlen) {
2987 if (newlen > table->maxlen)
2988 newlen = table->maxlen;
2989
2990 if (copy_from_user(table->data, newval, newlen))
2991 return -EFAULT;
2992 }
2993 return 1;
2994}
2995
2996/* The generic string strategy routine: */
2997int sysctl_string(struct ctl_table *table,
2998 void __user *oldval, size_t __user *oldlenp,
2999 void __user *newval, size_t newlen)
3000{
3001 if (!table->data || !table->maxlen)
3002 return -ENOTDIR;
3003
3004 if (oldval && oldlenp) {
3005 size_t bufsize;
3006 if (get_user(bufsize, oldlenp))
3007 return -EFAULT;
3008 if (bufsize) {
3009 size_t len = strlen(table->data), copied;
3010
3011 /* This shouldn't trigger for a well-formed sysctl */
3012 if (len > table->maxlen)
3013 len = table->maxlen;
3014
3015 /* Copy up to a max of bufsize-1 bytes of the string */
3016 copied = (len >= bufsize) ? bufsize - 1 : len;
3017
3018 if (copy_to_user(oldval, table->data, copied) ||
3019 put_user(0, (char __user *)(oldval + copied)))
3020 return -EFAULT;
3021 if (put_user(len, oldlenp))
3022 return -EFAULT;
3023 }
3024 }
3025 if (newval && newlen) {
3026 size_t len = newlen;
3027 if (len > table->maxlen)
3028 len = table->maxlen;
3029 if(copy_from_user(table->data, newval, len))
3030 return -EFAULT;
3031 if (len == table->maxlen)
3032 len--;
3033 ((char *) table->data)[len] = 0;
3034 }
3035 return 1;
3036}
3037
3038/*
3039 * This function makes sure that all of the integers in the vector
3040 * are between the minimum and maximum values given in the arrays
3041 * table->extra1 and table->extra2, respectively.
3042 */
3043int sysctl_intvec(struct ctl_table *table,
3044 void __user *oldval, size_t __user *oldlenp,
3045 void __user *newval, size_t newlen)
3046{
3047
3048 if (newval && newlen) {
3049 int __user *vec = (int __user *) newval;
3050 int *min = (int *) table->extra1;
3051 int *max = (int *) table->extra2;
3052 size_t length;
3053 int i;
3054
3055 if (newlen % sizeof(int) != 0)
3056 return -EINVAL;
3057
3058 if (!table->extra1 && !table->extra2)
3059 return 0;
3060
3061 if (newlen > table->maxlen)
3062 newlen = table->maxlen;
3063 length = newlen / sizeof(int);
3064
3065 for (i = 0; i < length; i++) {
3066 int value;
3067 if (get_user(value, vec + i))
3068 return -EFAULT;
3069 if (min && value < min[i])
3070 return -EINVAL;
3071 if (max && value > max[i])
3072 return -EINVAL;
3073 }
3074 }
3075 return 0;
3076}
3077
3078/* Strategy function to convert jiffies to seconds */
3079int sysctl_jiffies(struct ctl_table *table,
3080 void __user *oldval, size_t __user *oldlenp,
3081 void __user *newval, size_t newlen)
3082{
3083 if (oldval && oldlenp) {
3084 size_t olen;
3085
3086 if (get_user(olen, oldlenp))
3087 return -EFAULT;
3088 if (olen) {
3089 int val;
3090
3091 if (olen < sizeof(int))
3092 return -EINVAL;
3093
3094 val = *(int *)(table->data) / HZ;
3095 if (put_user(val, (int __user *)oldval))
3096 return -EFAULT;
3097 if (put_user(sizeof(int), oldlenp))
3098 return -EFAULT;
3099 }
3100 }
3101 if (newval && newlen) {
3102 int new;
3103 if (newlen != sizeof(int))
3104 return -EINVAL;
3105 if (get_user(new, (int __user *)newval))
3106 return -EFAULT;
3107 *(int *)(table->data) = new*HZ;
3108 }
3109 return 1;
3110}
3111
3112/* Strategy function to convert jiffies to seconds */
3113int sysctl_ms_jiffies(struct ctl_table *table,
3114 void __user *oldval, size_t __user *oldlenp,
3115 void __user *newval, size_t newlen)
3116{
3117 if (oldval && oldlenp) {
3118 size_t olen;
3119
3120 if (get_user(olen, oldlenp))
3121 return -EFAULT;
3122 if (olen) {
3123 int val;
3124
3125 if (olen < sizeof(int))
3126 return -EINVAL;
3127
3128 val = jiffies_to_msecs(*(int *)(table->data));
3129 if (put_user(val, (int __user *)oldval))
3130 return -EFAULT;
3131 if (put_user(sizeof(int), oldlenp))
3132 return -EFAULT;
3133 }
3134 }
3135 if (newval && newlen) {
3136 int new;
3137 if (newlen != sizeof(int))
3138 return -EINVAL;
3139 if (get_user(new, (int __user *)newval))
3140 return -EFAULT;
3141 *(int *)(table->data) = msecs_to_jiffies(new);
3142 }
3143 return 1;
3144}
3145
3146
3147
3148#else /* CONFIG_SYSCTL_SYSCALL */
3149
3150
3151SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
3152{
3153 struct __sysctl_args tmp;
3154 int error;
3155
3156 if (copy_from_user(&tmp, args, sizeof(tmp)))
3157 return -EFAULT;
3158
3159 error = deprecated_sysctl_warning(&tmp);
3160
3161 /* If no error reading the parameters then just -ENOSYS ... */
3162 if (!error)
3163 error = -ENOSYS;
3164
3165 return error;
3166}
3167
3168int sysctl_data(struct ctl_table *table,
3169 void __user *oldval, size_t __user *oldlenp,
3170 void __user *newval, size_t newlen)
3171{
3172 return -ENOSYS;
3173}
3174
3175int sysctl_string(struct ctl_table *table,
3176 void __user *oldval, size_t __user *oldlenp,
3177 void __user *newval, size_t newlen)
3178{
3179 return -ENOSYS;
3180}
3181
3182int sysctl_intvec(struct ctl_table *table,
3183 void __user *oldval, size_t __user *oldlenp,
3184 void __user *newval, size_t newlen)
3185{
3186 return -ENOSYS;
3187}
3188
3189int sysctl_jiffies(struct ctl_table *table,
3190 void __user *oldval, size_t __user *oldlenp,
3191 void __user *newval, size_t newlen)
3192{
3193 return -ENOSYS;
3194}
3195
3196int sysctl_ms_jiffies(struct ctl_table *table,
3197 void __user *oldval, size_t __user *oldlenp,
3198 void __user *newval, size_t newlen)
3199{
3200 return -ENOSYS;
3201}
3202
3203#endif /* CONFIG_SYSCTL_SYSCALL */
3204
3205static int deprecated_sysctl_warning(struct __sysctl_args *args)
3206{
3207 static int msg_count;
3208 int name[CTL_MAXNAME];
3209 int i;
3210
3211 /* Check args->nlen. */
3212 if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
3213 return -ENOTDIR;
3214
3215 /* Read in the sysctl name for better debug message logging */
3216 for (i = 0; i < args->nlen; i++)
3217 if (get_user(name[i], args->name + i))
3218 return -EFAULT;
3219
3220 /* Ignore accesses to kernel.version */
3221 if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
3222 return 0;
3223
3224 if (msg_count < 5) {
3225 msg_count++;
3226 printk(KERN_INFO
3227 "warning: process `%s' used the deprecated sysctl "
3228 "system call with ", current->comm);
3229 for (i = 0; i < args->nlen; i++)
3230 printk("%d.", name[i]);
3231 printk("\n");
3232 }
3233 return 0;
3234}
3235
3236/* 2661/*
3237 * No sense putting this after each symbol definition, twice, 2662 * No sense putting this after each symbol definition, twice,
3238 * exception granted :-) 2663 * exception granted :-)
@@ -3247,9 +2672,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax);
3247EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2672EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3248EXPORT_SYMBOL(register_sysctl_table); 2673EXPORT_SYMBOL(register_sysctl_table);
3249EXPORT_SYMBOL(register_sysctl_paths); 2674EXPORT_SYMBOL(register_sysctl_paths);
3250EXPORT_SYMBOL(sysctl_intvec);
3251EXPORT_SYMBOL(sysctl_jiffies);
3252EXPORT_SYMBOL(sysctl_ms_jiffies);
3253EXPORT_SYMBOL(sysctl_string);
3254EXPORT_SYMBOL(sysctl_data);
3255EXPORT_SYMBOL(unregister_sysctl_table); 2675EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
new file mode 100644
index 000000000000..b75dbf40f573
--- /dev/null
+++ b/kernel/sysctl_binary.c
@@ -0,0 +1,1507 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7#include <linux/syscalls.h>
8#include <linux/namei.h>
9#include <linux/mount.h>
10#include <linux/fs.h>
11#include <linux/nsproxy.h>
12#include <linux/pid_namespace.h>
13#include <linux/file.h>
14#include <linux/ctype.h>
15#include <linux/netdevice.h>
16
17#ifdef CONFIG_SYSCTL_SYSCALL
18
19struct bin_table;
20typedef ssize_t bin_convert_t(struct file *file,
21 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
22
23static bin_convert_t bin_dir;
24static bin_convert_t bin_string;
25static bin_convert_t bin_intvec;
26static bin_convert_t bin_ulongvec;
27static bin_convert_t bin_uuid;
28static bin_convert_t bin_dn_node_address;
29
30#define CTL_DIR bin_dir
31#define CTL_STR bin_string
32#define CTL_INT bin_intvec
33#define CTL_ULONG bin_ulongvec
34#define CTL_UUID bin_uuid
35#define CTL_DNADR bin_dn_node_address
36
37#define BUFSZ 256
38
39struct bin_table {
40 bin_convert_t *convert;
41 int ctl_name;
42 const char *procname;
43 const struct bin_table *child;
44};
45
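Each entry in these tables pairs a binary ctl_name with the procname of the corresponding /proc/sys file plus a conversion routine for the payload format; CTL_DIR entries carry a child table, so a name vector such as { CTL_KERN, KERN_RANDOM, RANDOM_UUID } descends through bin_kern_table and bin_random_table to the path "kernel/random/uuid", with bin_uuid as the converter. A minimal lookup sketch over tables of this shape (bin_lookup() is a hypothetical helper, not the routine this file actually uses, and it ignores the wildcard { CTL_DIR, 0, NULL, ... } entries that stand in for any per-device index):

	static const struct bin_table *bin_lookup(const struct bin_table *table,
						  const int *name, int nlen)
	{
		const struct bin_table *p;

		if (nlen <= 0)
			return NULL;

		for (p = table; p->convert; p++) {	/* the {} sentinel has no converter */
			if (p->ctl_name != *name)
				continue;
			if (nlen == 1)
				return p;		/* leaf entry */
			if (!p->child)
				return NULL;		/* more components, but not a directory */
			return bin_lookup(p->child, name + 1, nlen - 1);
		}
		return NULL;
	}
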
46static const struct bin_table bin_random_table[] = {
47 { CTL_INT, RANDOM_POOLSIZE, "poolsize" },
48 { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" },
49 { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" },
50 { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
51 { CTL_UUID, RANDOM_BOOT_ID, "boot_id" },
52 { CTL_UUID, RANDOM_UUID, "uuid" },
53 {}
54};
55
56static const struct bin_table bin_pty_table[] = {
57 { CTL_INT, PTY_MAX, "max" },
58 { CTL_INT, PTY_NR, "nr" },
59 {}
60};
61
62static const struct bin_table bin_kern_table[] = {
63 { CTL_STR, KERN_OSTYPE, "ostype" },
64 { CTL_STR, KERN_OSRELEASE, "osrelease" },
65 /* KERN_OSREV not used */
66 { CTL_STR, KERN_VERSION, "version" },
67 /* KERN_SECUREMASK not used */
68 /* KERN_PROF not used */
69 { CTL_STR, KERN_NODENAME, "hostname" },
70 { CTL_STR, KERN_DOMAINNAME, "domainname" },
71
72 { CTL_INT, KERN_PANIC, "panic" },
73 { CTL_INT, KERN_REALROOTDEV, "real-root-dev" },
74
75 { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" },
76 { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" },
77 { CTL_INT, KERN_PRINTK, "printk" },
78
79 /* KERN_NAMETRANS not used */
80 /* KERN_PPC_HTABRECLAIM not used */
81 /* KERN_PPC_ZEROPAGED not used */
82 { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
83
84 { CTL_STR, KERN_MODPROBE, "modprobe" },
85 { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" },
86 { CTL_INT, KERN_ACCT, "acct" },
87 /* KERN_PPC_L2CR "l2cr" no longer used */
88
89 /* KERN_RTSIGNR not used */
90 /* KERN_RTSIGMAX not used */
91
92 { CTL_ULONG, KERN_SHMMAX, "shmmax" },
93 { CTL_INT, KERN_MSGMAX, "msgmax" },
94 { CTL_INT, KERN_MSGMNB, "msgmnb" },
95 /* KERN_MSGPOOL not used*/
96 { CTL_INT, KERN_SYSRQ, "sysrq" },
97 { CTL_INT, KERN_MAX_THREADS, "threads-max" },
98 { CTL_DIR, KERN_RANDOM, "random", bin_random_table },
99 { CTL_ULONG, KERN_SHMALL, "shmall" },
100 { CTL_INT, KERN_MSGMNI, "msgmni" },
101 { CTL_INT, KERN_SEM, "sem" },
102 { CTL_INT, KERN_SPARC_STOP_A, "stop-a" },
103 { CTL_INT, KERN_SHMMNI, "shmmni" },
104
105 { CTL_INT, KERN_OVERFLOWUID, "overflowuid" },
106 { CTL_INT, KERN_OVERFLOWGID, "overflowgid" },
107
108 { CTL_STR, KERN_HOTPLUG, "hotplug", },
109 { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
110
111 { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
112 { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" },
113 /* KERN_TAINTED "tainted" no longer used */
114 { CTL_INT, KERN_CADPID, "cad_pid" },
115 { CTL_INT, KERN_PIDMAX, "pid_max" },
116 { CTL_STR, KERN_CORE_PATTERN, "core_pattern" },
117 { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" },
118 { CTL_INT, KERN_HPPA_PWRSW, "soft-power" },
119 { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" },
120
121 { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
122 { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
123
124 { CTL_DIR, KERN_PTY, "pty", bin_pty_table },
125 { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" },
126 { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
127 /* KERN_HZ_TIMER "hz_timer" no longer used */
128 { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
129 { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" },
130 { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" },
131
132 { CTL_INT, KERN_SPIN_RETRY, "spin_retry" },
133 /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
134 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
135 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
136 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
137 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
138 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
139 {}
140};
141
142static const struct bin_table bin_vm_table[] = {
143 { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
144 { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" },
145 { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
146 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
147 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
148 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
149 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
150 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
151 /* VM_PAGEBUF unused */
152 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
153 { CTL_INT, VM_SWAPPINESS, "swappiness" },
154 { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
155 { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" },
156 { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" },
157 { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" },
158 { CTL_INT, VM_BLOCK_DUMP, "block_dump" },
159 { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" },
160 { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
161 { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
162 /* VM_SWAP_TOKEN_TIMEOUT unused */
163 { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" },
164 { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
165 { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
166 { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" },
167 { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" },
168 { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" },
169 { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" },
170
171 {}
172};
173
174static const struct bin_table bin_net_core_table[] = {
175 { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" },
176 { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" },
177 { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" },
178 { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" },
179 /* NET_CORE_DESTROY_DELAY unused */
180 { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
181 /* NET_CORE_FASTROUTE unused */
182 { CTL_INT, NET_CORE_MSG_COST, "message_cost" },
183 { CTL_INT, NET_CORE_MSG_BURST, "message_burst" },
184 { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" },
185 /* NET_CORE_HOT_LIST_LENGTH unused */
186 /* NET_CORE_DIVERT_VERSION unused */
187 /* NET_CORE_NO_CONG_THRESH unused */
188 /* NET_CORE_NO_CONG unused */
189 /* NET_CORE_LO_CONG unused */
190 /* NET_CORE_MOD_CONG unused */
191 { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" },
192 { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" },
193 { CTL_INT, NET_CORE_BUDGET, "netdev_budget" },
194 { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
195 { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
196 { CTL_INT, NET_CORE_WARNINGS, "warnings" },
197 {},
198};
199
200static const struct bin_table bin_net_unix_table[] = {
201 /* NET_UNIX_DESTROY_DELAY unused */
202 /* NET_UNIX_DELETE_DELAY unused */
203 { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
204 {}
205};
206
207static const struct bin_table bin_net_ipv4_route_table[] = {
208 { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" },
209 /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
210 /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
211 { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
212 { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
213 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
217 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
220 { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
221 { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
222 { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
223 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
224 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
226 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
227 {}
228};
229
230static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
231 { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" },
232 { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
233
234 { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
235 { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
236 { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
237 { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
238 { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" },
239 { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
240 { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
241 { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
242 { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
243 { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
244 { CTL_INT, NET_IPV4_CONF_TAG, "tag" },
245 { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" },
246 { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
247 { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
248 { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
249 { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
250
251 { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
252 { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
253 { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
254 { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
255 {}
256};
257
258static const struct bin_table bin_net_ipv4_conf_table[] = {
259 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table },
260 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table },
261 { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table },
262 {}
263};
264
265static const struct bin_table bin_net_neigh_vars_table[] = {
266 { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
267 { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
268 { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" },
269 /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
270 { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
271 { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
272 { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
273 { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" },
274 { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
275 /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
276 /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
277 /* NET_NEIGH_LOCKTIME "locktime" no longer used */
278 { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" },
279 { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" },
280 { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" },
281 { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" },
282 { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
283 { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
284 {}
285};
286
287static const struct bin_table bin_net_neigh_table[] = {
288 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
289 { CTL_DIR, 0, NULL, bin_net_neigh_vars_table },
290 {}
291};
292
293static const struct bin_table bin_net_ipv4_netfilter_table[] = {
294 { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
295
296 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
297 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
298 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
299 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
300 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */
301 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
302 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
303 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
304
305 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
306 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
307 /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
308 /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
309
310 { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
311 { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
312 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
313 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
314 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
315 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
316
317 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
318 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
319 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
320 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
321 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
322 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
323 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
324
325 { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
326 { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
327 {}
328};
329
330static const struct bin_table bin_net_ipv4_table[] = {
331 {CTL_INT, NET_IPV4_FORWARD, "ip_forward" },
332
333 { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table },
334 { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table },
335 { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table },
336 /* NET_IPV4_FIB_HASH unused */
337 { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table },
338
339 { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
340 { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
341 { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" },
342 { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
343 { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
344 /* NET_IPV4_AUTOCONFIG unused */
345 { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
346 { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
347 { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
348 { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
349 { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
350 { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
351 { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" },
352 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
353 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
354 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
355 { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
356 { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
357 { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
358 { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" },
359 { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
360 { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
361 { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" },
362 { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" },
363 { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
364 { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
365 { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
366 { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
367 { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
368 { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
369 { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
370 { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
371 { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
372 { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
373 { CTL_INT, NET_TCP_FACK, "tcp_fack" },
374 { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
375 { CTL_INT, NET_TCP_ECN, "tcp_ecn" },
376 { CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
377 { CTL_INT, NET_TCP_MEM, "tcp_mem" },
378 { CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
379 { CTL_INT, NET_TCP_RMEM, "tcp_rmem" },
380 { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" },
381 { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
382 { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" },
383 { CTL_INT, NET_TCP_FRTO, "tcp_frto" },
384 { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
385 { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" },
386 { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
394 { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
395 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
396 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
397 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
398 { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
399 { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
400 /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
401 { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
402 { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
403
404 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
405 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
406 { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
407 { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
408 { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
409 { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
410
411 { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
412 { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
413 { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
414
415 { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
416 /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
417
418 { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
419
420 /* NET_TCP_DEFAULT_WIN_SCALE unused */
421 /* NET_TCP_BIC_BETA unused */
422 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
423 /* NET_IPV4_IP_MASQ_DEBUG unused */
424 /* NET_TCP_SYN_TAILDROP unused */
425 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
426 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
427 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
428 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
429 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
430 /* NET_IPV4_ALWAYS_DEFRAG unused */
431 {}
432};
433
434static const struct bin_table bin_net_ipx_table[] = {
435 { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
436 /* NET_IPX_FORWARDING unused */
437 {}
438};
439
440static const struct bin_table bin_net_atalk_table[] = {
441 { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
442 { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
443 { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
444 { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
445 {},
446};
447
448static const struct bin_table bin_net_netrom_table[] = {
449 { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
450 { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
451 { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
452 { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
453 { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
454 { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
455 { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
456 { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
457 { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
458 { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" },
459 { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
460 { CTL_INT, NET_NETROM_RESET, "reset" },
461 {}
462};
463
464static const struct bin_table bin_net_ax25_param_table[] = {
465 { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
466 { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
467 { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" },
468 { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" },
469 { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" },
470 { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
471 { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" },
472 { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" },
473 { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" },
474 { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
475 { CTL_INT, NET_AX25_N2, "maximum_retry_count" },
476 { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" },
477 { CTL_INT, NET_AX25_PROTOCOL, "protocol" },
478 { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
479 {}
480};
481
482static const struct bin_table bin_net_ax25_table[] = {
483 { CTL_DIR, 0, NULL, bin_net_ax25_param_table },
484 {}
485};
486
487static const struct bin_table bin_net_rose_table[] = {
488 { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
489 { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
490 { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
491 { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
492 { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
493 { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" },
494 { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
495 { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
496 { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" },
497 { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
498 {}
499};
500
501static const struct bin_table bin_net_ipv6_conf_var_table[] = {
502 { CTL_INT, NET_IPV6_FORWARDING, "forwarding" },
503 { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" },
504 { CTL_INT, NET_IPV6_MTU, "mtu" },
505 { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" },
506 { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
507 { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" },
508 { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
509 { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" },
510 { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
511 { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
512 { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
513 { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
514 { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
515 { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
516 { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
517 { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" },
518 { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
519 { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
520 { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
521 { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
522 { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
523 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
524 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
525 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
526 {}
527};
528
529static const struct bin_table bin_net_ipv6_conf_table[] = {
530 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table },
531 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table },
532 { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table },
533 {}
534};
535
536static const struct bin_table bin_net_ipv6_route_table[] = {
537 /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */
538 { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
539 { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
540 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
541 { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
542 { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
543 { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
544 { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
545 { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
546 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
547 {}
548};
549
550static const struct bin_table bin_net_ipv6_icmp_table[] = {
551 { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
552 {}
553};
554
555static const struct bin_table bin_net_ipv6_table[] = {
556 { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
557 { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
558 { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
559 { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
560 { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
561 { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
562 { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
563 { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
564 { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
565 { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
566 { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
567 {}
568};
569
570static const struct bin_table bin_net_x25_table[] = {
571 { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
572 { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
573 { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
574 { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
575 { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
576 { CTL_INT, NET_X25_FORWARD, "x25_forward" },
577 {}
578};
579
580static const struct bin_table bin_net_tr_table[] = {
581 { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
582 {}
583};
584
585
586static const struct bin_table bin_net_decnet_conf_vars[] = {
587 { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
588 { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
589 { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
590 { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
591 {}
592};
593
594static const struct bin_table bin_net_decnet_conf[] = {
595 { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
596 { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
597 { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
598 { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
599 { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
600 { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
601 { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
602 {}
603};
604
605static const struct bin_table bin_net_decnet_table[] = {
606 { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
607 { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
608 { CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
609 { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
610 { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
611 { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
612 { CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
613 { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
614 { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
615 { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
616 { CTL_INT, NET_DECNET_MEM, "decnet_mem" },
617 { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
618 { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
619 { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
620 {}
621};
622
623static const struct bin_table bin_net_sctp_table[] = {
624 { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
625 { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
626 { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
627 { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
628 { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
629 { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
630 { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
631 { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
632 { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
633 { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
634 { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
635 { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
636 { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
637 { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
638 { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
639 { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
640 { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
641 {}
642};
643
644static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
645 { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
646 { CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
647 { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
648 { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
649 {}
650};
651
652static const struct bin_table bin_net_llc_station_table[] = {
653 { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
654 {}
655};
656
657static const struct bin_table bin_net_llc_llc2_table[] = {
658 { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
659 {}
660};
661
662static const struct bin_table bin_net_llc_table[] = {
663 { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
664 { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
665 {}
666};
667
668static const struct bin_table bin_net_netfilter_table[] = {
669 { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
670 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
671 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
672 /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
673 /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
674 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
675 /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
676 /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
677 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
678 /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
679 /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
680 /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
681 /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
682 { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
683 { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
684 /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
685 { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
686 { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
687 { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
688 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
689 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
690 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
691 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
692 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
693 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
694 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
695 { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
696 /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
697 /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
698 { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
699 { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
700 { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
701
702 {}
703};
704
705static const struct bin_table bin_net_irda_table[] = {
706 { CTL_INT, NET_IRDA_DISCOVERY, "discovery" },
707 { CTL_STR, NET_IRDA_DEVNAME, "devname" },
708 { CTL_INT, NET_IRDA_DEBUG, "debug" },
709 { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" },
710 { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
711 { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
712 { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
713 { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
714 { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
715 { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
716 { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
717 { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
718 { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
719 { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
720 {}
721};
722
723static const struct bin_table bin_net_table[] = {
724 { CTL_DIR, NET_CORE, "core", bin_net_core_table },
725 /* NET_ETHER not used */
726 /* NET_802 not used */
727 { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
728 { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
729 { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
730 { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
731 { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
732 { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
733 /* NET_BRIDGE "bridge" no longer used */
734 { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
735 { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
736 { CTL_DIR, NET_X25, "x25", bin_net_x25_table },
737 { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
738 { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
739 /* NET_ECONET not used */
740 { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
741 { CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
742 { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
743 /* NET_DCCP "dccp" no longer used */
744 { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table },
745 { CTL_INT, 2089, "nf_conntrack_max" },
746 {}
747};
748
749static const struct bin_table bin_fs_quota_table[] = {
750 { CTL_INT, FS_DQ_LOOKUPS, "lookups" },
751 { CTL_INT, FS_DQ_DROPS, "drops" },
752 { CTL_INT, FS_DQ_READS, "reads" },
753 { CTL_INT, FS_DQ_WRITES, "writes" },
754 { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
755 { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
756 { CTL_INT, FS_DQ_FREE, "free_dquots" },
757 { CTL_INT, FS_DQ_SYNCS, "syncs" },
758 { CTL_INT, FS_DQ_WARNINGS, "warnings" },
759 {}
760};
761
762static const struct bin_table bin_fs_xfs_table[] = {
763 { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
764 { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
765 { CTL_INT, XFS_PANIC_MASK, "panic_mask" },
766
767 { CTL_INT, XFS_ERRLEVEL, "error_level" },
768 { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
769 { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
770 { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
771 { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
772 { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
773 { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
774 { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
775 { CTL_INT, XFS_ROTORSTEP, "rotorstep" },
776 { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
777 { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
778 { CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
779 {}
780};
781
782static const struct bin_table bin_fs_ocfs2_nm_table[] = {
783 { CTL_STR, 1, "hb_ctl_path" },
784 {}
785};
786
787static const struct bin_table bin_fs_ocfs2_table[] = {
788 { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
789 {}
790};
791
792static const struct bin_table bin_inotify_table[] = {
793 { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
794 { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
795 { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
796 {}
797};
798
799static const struct bin_table bin_fs_table[] = {
800 { CTL_INT, FS_NRINODE, "inode-nr" },
801 { CTL_INT, FS_STATINODE, "inode-state" },
802 /* FS_MAXINODE unused */
803 /* FS_NRDQUOT unused */
804 /* FS_MAXDQUOT unused */
805 /* FS_NRFILE "file-nr" no longer used */
806 { CTL_INT, FS_MAXFILE, "file-max" },
807 { CTL_INT, FS_DENTRY, "dentry-state" },
808 /* FS_NRSUPER unused */
809 /* FS_MAXSUPER unused */
810 { CTL_INT, FS_OVERFLOWUID, "overflowuid" },
811 { CTL_INT, FS_OVERFLOWGID, "overflowgid" },
812 { CTL_INT, FS_LEASES, "leases-enable" },
813 { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
814 { CTL_INT, FS_LEASE_TIME, "lease-break-time" },
815 { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
816 { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
817 { CTL_ULONG, FS_AIO_NR, "aio-nr" },
818 { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
819 { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
820 { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
821 { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
822 {}
823};
824
825static const struct bin_table bin_ipmi_table[] = {
826 { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
827 {}
828};
829
830static const struct bin_table bin_mac_hid_files[] = {
831 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
832 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
833 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
834 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
835 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
836 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
837 {}
838};
839
840static const struct bin_table bin_raid_table[] = {
841 { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
842 { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
843 {}
844};
845
846static const struct bin_table bin_scsi_table[] = {
847 { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
848 {}
849};
850
851static const struct bin_table bin_dev_table[] = {
852 /* DEV_CDROM "cdrom" no longer used */
853 /* DEV_HWMON unused */
854 /* DEV_PARPORT "parport" no longer used */
855 { CTL_DIR, DEV_RAID, "raid", bin_raid_table },
856 { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
857 { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
858 { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
859 {}
860};
861
862static const struct bin_table bin_bus_isa_table[] = {
863 { CTL_INT, BUS_ISA_MEM_BASE, "membase" },
864 { CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
865 { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
866 {}
867};
868
869static const struct bin_table bin_bus_table[] = {
870 { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
871 {}
872};
873
874
875static const struct bin_table bin_s390dbf_table[] = {
876 { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
877 { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
878 {}
879};
880
881static const struct bin_table bin_sunrpc_table[] = {
882 /* CTL_RPCDEBUG "rpc_debug" no longer used */
883 /* CTL_NFSDEBUG "nfs_debug" no longer used */
884 /* CTL_NFSDDEBUG "nfsd_debug" no longer used */
885 /* CTL_NLMDEBUG "nlm_debug" no longer used */
886
887 { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
888 { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
889 { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
890 { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
891 {}
892};
893
894static const struct bin_table bin_pm_table[] = {
895 /* frv specific */
896 /* 1 == CTL_PM_SUSPEND "suspend" no longer used */
897 { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
898 { CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
899 { CTL_INT, 4 /* CTL_PM_CM */, "cm" },
900 {}
901};
902
903static const struct bin_table bin_root_table[] = {
904 { CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
905 { CTL_DIR, CTL_VM, "vm", bin_vm_table },
906 { CTL_DIR, CTL_NET, "net", bin_net_table },
907 /* CTL_PROC not used */
908 { CTL_DIR, CTL_FS, "fs", bin_fs_table },
909 /* CTL_DEBUG "debug" no longer used */
910 { CTL_DIR, CTL_DEV, "dev", bin_dev_table },
911 { CTL_DIR, CTL_BUS, "bus", bin_bus_table },
912 { CTL_DIR, CTL_ABI, "abi" },
913 /* CTL_CPU not used */
914 /* CTL_ARLAN "arlan" no longer used */
915 { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
916 { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
917 { CTL_DIR, CTL_PM, "pm", bin_pm_table },
918 {}
919};
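/*
 * Illustrative note (not part of the original patch): the nested tables
 * above translate a numeric binary-sysctl name vector into a path under
 * /proc/sys.  For example, a caller naming
 *
 *	int name[] = { CTL_KERN, KERN_OSTYPE };
 *
 * is walked from bin_root_table into bin_kern_table and resolved by
 * get_sysctl() below to the string "sys/kernel/ostype", which is then
 * looked up relative to the proc mount.
 */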
920
921static ssize_t bin_dir(struct file *file,
922 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
923{
924 return -ENOTDIR;
925}
926
927
928static ssize_t bin_string(struct file *file,
929 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
930{
931 ssize_t result, copied = 0;
932
933 if (oldval && oldlen) {
934 char __user *lastp;
935 loff_t pos = 0;
936 int ch;
937
938 result = vfs_read(file, oldval, oldlen, &pos);
939 if (result < 0)
940 goto out;
941
942 copied = result;
943 lastp = oldval + copied - 1;
944
945 result = -EFAULT;
946 if (get_user(ch, lastp))
947 goto out;
948
949 /* Trim off the trailing newline */
950 if (ch == '\n') {
951 result = -EFAULT;
952 if (put_user('\0', lastp))
953 goto out;
954 copied -= 1;
955 }
956 }
957
958 if (newval && newlen) {
959 loff_t pos = 0;
960
961 result = vfs_write(file, newval, newlen, &pos);
962 if (result < 0)
963 goto out;
964 }
965
966 result = copied;
967out:
968 return result;
969}
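/*
 * Illustrative note (not part of the original patch): on a read,
 * bin_string() hands back the proc file's text unchanged except that a
 * trailing newline is overwritten with a NUL and not counted.  If the
 * proc file yields "Linux\n" (6 bytes), the user buffer ends up holding
 * "Linux\0" and the reported length is 5.
 */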
970
971static ssize_t bin_intvec(struct file *file,
972 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
973{
974 mm_segment_t old_fs = get_fs();
975 ssize_t copied = 0;
976 char *buffer;
977 ssize_t result;
978
979 result = -ENOMEM;
980 buffer = kmalloc(BUFSZ, GFP_KERNEL);
981 if (!buffer)
982 goto out;
983
984 if (oldval && oldlen) {
985 unsigned __user *vec = oldval;
986 size_t length = oldlen / sizeof(*vec);
987 loff_t pos = 0;
988 char *str, *end;
989 int i;
990
991 set_fs(KERNEL_DS);
992 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
993 set_fs(old_fs);
994 if (result < 0)
995 goto out_kfree;
996
997 str = buffer;
998 end = str + result;
999 *end++ = '\0';
1000 for (i = 0; i < length; i++) {
1001 unsigned long value;
1002
1003 value = simple_strtoul(str, &str, 10);
1004 while (isspace(*str))
1005 str++;
1006
1007 result = -EFAULT;
1008 if (put_user(value, vec + i))
1009 goto out_kfree;
1010
1011 copied += sizeof(*vec);
1012 if (!isdigit(*str))
1013 break;
1014 }
1015 }
1016
1017 if (newval && newlen) {
1018 unsigned __user *vec = newval;
1019 size_t length = newlen / sizeof(*vec);
1020 loff_t pos = 0;
1021 char *str, *end;
1022 int i;
1023
1024 str = buffer;
1025 end = str + BUFSZ;
1026 for (i = 0; i < length; i++) {
1027 unsigned long value;
1028
1029 result = -EFAULT;
1030 if (get_user(value, vec + i))
1031 goto out_kfree;
1032
1033 str += snprintf(str, end - str, "%lu\t", value);
1034 }
1035
1036 set_fs(KERNEL_DS);
1037 result = vfs_write(file, buffer, str - buffer, &pos);
1038 set_fs(old_fs);
1039 if (result < 0)
1040 goto out_kfree;
1041 }
1042 result = copied;
1043out_kfree:
1044 kfree(buffer);
1045out:
1046 return result;
1047}
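/*
 * Illustrative note (not part of the original patch): the binary
 * interface exchanges vectors of integers while the proc files speak
 * whitespace-separated decimal text.  Reading, say, ip_local_port_range
 * through this helper turns the text "32768\t61000\n" into the two
 * unsigned ints 32768 and 61000 in the caller's vector; writing goes
 * the other way, each element formatted as "%lu\t" before the result is
 * written to the proc file.
 */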
1048
1049static ssize_t bin_ulongvec(struct file *file,
1050 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1051{
1052 mm_segment_t old_fs = get_fs();
1053 ssize_t copied = 0;
1054 char *buffer;
1055 ssize_t result;
1056
1057 result = -ENOMEM;
1058 buffer = kmalloc(BUFSZ, GFP_KERNEL);
1059 if (!buffer)
1060 goto out;
1061
1062 if (oldval && oldlen) {
1063 unsigned long __user *vec = oldval;
1064 size_t length = oldlen / sizeof(*vec);
1065 loff_t pos = 0;
1066 char *str, *end;
1067 int i;
1068
1069 set_fs(KERNEL_DS);
1070 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1071 set_fs(old_fs);
1072 if (result < 0)
1073 goto out_kfree;
1074
1075 str = buffer;
1076 end = str + result;
1077 *end++ = '\0';
1078 for (i = 0; i < length; i++) {
1079 unsigned long value;
1080
1081 value = simple_strtoul(str, &str, 10);
1082 while (isspace(*str))
1083 str++;
1084
1085 result = -EFAULT;
1086 if (put_user(value, vec + i))
1087 goto out_kfree;
1088
1089 copied += sizeof(*vec);
1090 if (!isdigit(*str))
1091 break;
1092 }
1093 }
1094
1095 if (newval && newlen) {
1096 unsigned long __user *vec = newval;
1097 size_t length = newlen / sizeof(*vec);
1098 loff_t pos = 0;
1099 char *str, *end;
1100 int i;
1101
1102 str = buffer;
1103 end = str + BUFSZ;
1104 for (i = 0; i < length; i++) {
1105 unsigned long value;
1106
1107 result = -EFAULT;
1108 if (get_user(value, vec + i))
1109 goto out_kfree;
1110
1111 str += snprintf(str, end - str, "%lu\t", value);
1112 }
1113
1114 set_fs(KERNEL_DS);
1115 result = vfs_write(file, buffer, str - buffer, &pos);
1116 set_fs(old_fs);
1117 if (result < 0)
1118 goto out_kfree;
1119 }
1120 result = copied;
1121out_kfree:
1122 kfree(buffer);
1123out:
1124 return result;
1125}
1126
1127static unsigned hex_value(int ch)
1128{
1129 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1130}
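/*
 * Illustrative note (not part of the original patch): hex_value()
 * accepts either case because OR-ing with 0x20 folds 'A'-'F' onto
 * 'a'-'f', so hex_value('7') == 7 and hex_value('B') == 11.  It does no
 * validation of its own; bin_uuid() below checks isxdigit() first.
 */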
1131
1132static ssize_t bin_uuid(struct file *file,
1133 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1134{
1135 mm_segment_t old_fs = get_fs();
1136 ssize_t result, copied = 0;
1137
1138 /* Only supports reads */
1139 if (oldval && oldlen) {
1140 loff_t pos = 0;
1141 char buf[40], *str = buf;
1142 unsigned char uuid[16];
1143 int i;
1144
1145 set_fs(KERNEL_DS);
1146 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1147 set_fs(old_fs);
1148 if (result < 0)
1149 goto out;
1150
1151 buf[result] = '\0';
1152
1153 /* Convert the uuid from a string to binary */
1154 for (i = 0; i < 16; i++) {
1155 result = -EIO;
1156 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1157 goto out;
1158
1159 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]);
1160 str += 2;
1161 if (*str == '-')
1162 str++;
1163 }
1164
1165 if (oldlen > 16)
1166 oldlen = 16;
1167
1168 result = -EFAULT;
1169 if (copy_to_user(oldval, uuid, oldlen))
1170 goto out;
1171
1172 copied = oldlen;
1173 }
1174 result = copied;
1175out:
1176 return result;
1177}
1178
1179static ssize_t bin_dn_node_address(struct file *file,
1180 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1181{
1182 mm_segment_t old_fs = get_fs();
1183 ssize_t result, copied = 0;
1184
1185 if (oldval && oldlen) {
1186 loff_t pos = 0;
1187 char buf[15], *nodep;
1188 unsigned long area, node;
1189 __le16 dnaddr;
1190
1191 set_fs(KERNEL_DS);
1192 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1193 set_fs(old_fs);
1194 if (result < 0)
1195 goto out;
1196
1197 buf[result] = '\0';
1198
1199 /* Convert the decnet address to binary */
1200 result = -EIO;
1201 nodep = strchr(buf, '.');
1202 if (!nodep)
1203 goto out;
1204
1205 area = simple_strtoul(buf, NULL, 10);
1206 node = simple_strtoul(nodep + 1, NULL, 10);
1207
1208 result = -EIO;
1209 if ((area > 63) || (node > 1023))
1210 goto out;
1211
1212 dnaddr = cpu_to_le16((area << 10) | node);
1213
1214 result = -EFAULT;
1215 if (put_user(dnaddr, (__le16 __user *)oldval))
1216 goto out;
1217
1218 copied = sizeof(dnaddr);
1219 }
1220
1221 if (newval && newlen) {
1222 loff_t pos = 0;
1223 __le16 dnaddr;
1224 char buf[15];
1225 int len;
1226
1227 result = -EINVAL;
1228 if (newlen != sizeof(dnaddr))
1229 goto out;
1230
1231 result = -EFAULT;
1232 if (get_user(dnaddr, (__le16 __user *)newval))
1233 goto out;
1234
1235 len = snprintf(buf, sizeof(buf), "%hu.%hu",
1236 le16_to_cpu(dnaddr) >> 10,
1237 le16_to_cpu(dnaddr) & 0x3ff);
1238
1239 set_fs(KERNEL_DS);
1240 result = vfs_write(file, buf, len, &pos);
1241 set_fs(old_fs);
1242 if (result < 0)
1243 goto out;
1244 }
1245
1246 result = copied;
1247out:
1248 return result;
1249}
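/*
 * Illustrative note (not part of the original patch): a DECnet address
 * "area.node" is packed as (area << 10) | node.  For example "1.2"
 * becomes (1 << 10) | 2 = 0x0402 and is handed to the caller as a
 * two-byte __le16; the write path formats the same value back into
 * "1.2" before passing it to the proc file.
 */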
1250
1251static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
1252{
1253 const struct bin_table *table = &bin_root_table[0];
1254 int ctl_name;
1255
1256 /* The binary sysctl tables have a small maximum depth, so
1257 * there is no danger of overflowing our path as it is PATH_MAX
1258 * bytes long.
1259 */
1260 memcpy(path, "sys/", 4);
1261 path += 4;
1262
1263repeat:
1264 if (!nlen)
1265 return ERR_PTR(-ENOTDIR);
1266 ctl_name = *name;
1267 name++;
1268 nlen--;
1269 for ( ; table->convert; table++) {
1270 int len = 0;
1271
1272 /*
1273 * For a wildcard entry, map from the ifindex to the
1274 * network device name.
1275 */
1276 if (!table->ctl_name) {
1277#ifdef CONFIG_NET
1278 struct net *net = current->nsproxy->net_ns;
1279 struct net_device *dev;
1280 dev = dev_get_by_index(net, ctl_name);
1281 if (dev) {
1282 len = strlen(dev->name);
1283 memcpy(path, dev->name, len);
1284 dev_put(dev);
1285 }
1286#endif
1287 /* Use the well known sysctl number to proc name mapping */
1288 } else if (ctl_name == table->ctl_name) {
1289 len = strlen(table->procname);
1290 memcpy(path, table->procname, len);
1291 }
1292 if (len) {
1293 path += len;
1294 if (table->child) {
1295 *path++ = '/';
1296 table = table->child;
1297 goto repeat;
1298 }
1299 *path = '\0';
1300 return table;
1301 }
1302 }
1303 return ERR_PTR(-ENOTDIR);
1304}
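/*
 * Illustrative note (not part of the original patch): a table slot with
 * ctl_name == 0 acts as a wildcard for the per-interface directories
 * (conf/<device>, neigh/<device>).  The numeric component supplied by
 * the caller is treated as an ifindex and converted to a device name
 * with dev_get_by_index(), so an ifindex of, say, 2 resolves to
 * whichever interface currently owns that index (e.g. "eth0").
 */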
1305
1306static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
1307{
1308 char *tmp, *result;
1309
1310 result = ERR_PTR(-ENOMEM);
1311 tmp = __getname();
1312 if (tmp) {
1313 const struct bin_table *table = get_sysctl(name, nlen, tmp);
1314 result = tmp;
1315 *tablep = table;
1316 if (IS_ERR(table)) {
1317 __putname(tmp);
1318 result = ERR_CAST(table);
1319 }
1320 }
1321 return result;
1322}
1323
1324static ssize_t binary_sysctl(const int *name, int nlen,
1325 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1326{
1327 const struct bin_table *table = NULL;
1328 struct nameidata nd;
1329 struct vfsmount *mnt;
1330 struct file *file;
1331 ssize_t result;
1332 char *pathname;
1333 int flags;
1334 int acc_mode, fmode;
1335
1336 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname);
1338 if (IS_ERR(pathname))
1339 goto out;
1340
1341 /* How should the sysctl be accessed? */
1342 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) {
1347 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) {
1351 flags = O_RDONLY;
1352 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else {
1355 result = 0;
1356 goto out_putname;
1357 }
1358
1359 mnt = current->nsproxy->pid_ns->proc_mnt;
1360 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
1361 if (result)
1362 goto out_putname;
1363
1364 result = may_open(&nd.path, acc_mode, fmode);
1365 if (result)
1366 goto out_putpath;
1367
1368 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1369 result = PTR_ERR(file);
1370 if (IS_ERR(file))
1371 goto out_putname;
1372
1373 result = table->convert(file, oldval, oldlen, newval, newlen);
1374
1375 fput(file);
1376out_putname:
1377 putname(pathname);
1378out:
1379 return result;
1380
1381out_putpath:
1382 path_put(&nd.path);
1383 goto out_putname;
1384}
1385
1386
1387#else /* CONFIG_SYSCTL_SYSCALL */
1388
1389static ssize_t binary_sysctl(const int *name, int nlen,
1390 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1391{
1392 return -ENOSYS;
1393}
1394
1395#endif /* CONFIG_SYSCTL_SYSCALL */
1396
1397
1398static void deprecated_sysctl_warning(const int *name, int nlen)
1399{
1400 int i;
1401
1402 if (printk_ratelimit()) {
1403 printk(KERN_INFO
1404 "warning: process `%s' used the deprecated sysctl "
1405 "system call with ", current->comm);
1406 for (i = 0; i < nlen; i++)
1407 printk("%d.", name[i]);
1408 printk("\n");
1409 }
1410 return;
1411}
1412
1413static ssize_t do_sysctl(int __user *args_name, int nlen,
1414 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1415{
1416 int name[CTL_MAXNAME];
1417 int i;
1418
1419 /* Sanity check the name length. */
1420 if (nlen < 0 || nlen > CTL_MAXNAME)
1421 return -ENOTDIR;
1422 /* Read in the sysctl name for simplicity */
1423 for (i = 0; i < nlen; i++)
1424 if (get_user(name[i], args_name + i))
1425 return -EFAULT;
1426
1427 deprecated_sysctl_warning(name, nlen);
1428
1429 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1430}
1431
1432SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1433{
1434 struct __sysctl_args tmp;
1435 size_t oldlen = 0;
1436 ssize_t result;
1437
1438 if (copy_from_user(&tmp, args, sizeof(tmp)))
1439 return -EFAULT;
1440
1441 if (tmp.oldval && !tmp.oldlenp)
1442 return -EFAULT;
1443
1444 if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
1445 return -EFAULT;
1446
1447 result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
1448 tmp.newval, tmp.newlen);
1449
1450 if (result >= 0) {
1451 oldlen = result;
1452 result = 0;
1453 }
1454
1455 if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
1456 return -EFAULT;
1457
1458 return result;
1459}
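
A minimal user-space sketch (not part of this patch) of the kind of caller this syscall still services; it assumes the toolchain exposes SYS__sysctl and the legacy constants in <linux/sysctl.h>, and it reads kernel.ostype through the binary interface exactly as the tables above translate it:

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/sysctl.h>

	int main(void)
	{
		int name[] = { CTL_KERN, KERN_OSTYPE };	/* -> /proc/sys/kernel/ostype */
		char ostype[64];
		size_t len = sizeof(ostype);
		struct __sysctl_args args;

		memset(&args, 0, sizeof(args));
		args.name = name;
		args.nlen = 2;
		args.oldval = ostype;
		args.oldlenp = &len;

		/* The kernel logs the "deprecated sysctl" warning seen above. */
		if (syscall(SYS__sysctl, &args) < 0) {
			perror("_sysctl");
			return 1;
		}
		printf("%.*s\n", (int)len, ostype);	/* typically prints "Linux" */
		return 0;
	}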
1460
1461
1462#ifdef CONFIG_COMPAT
1463#include <asm/compat.h>
1464
1465struct compat_sysctl_args {
1466 compat_uptr_t name;
1467 int nlen;
1468 compat_uptr_t oldval;
1469 compat_uptr_t oldlenp;
1470 compat_uptr_t newval;
1471 compat_size_t newlen;
1472 compat_ulong_t __unused[4];
1473};
1474
1475asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
1476{
1477 struct compat_sysctl_args tmp;
1478 compat_size_t __user *compat_oldlenp;
1479 size_t oldlen = 0;
1480 ssize_t result;
1481
1482 if (copy_from_user(&tmp, args, sizeof(tmp)))
1483 return -EFAULT;
1484
1485 if (tmp.oldval && !tmp.oldlenp)
1486 return -EFAULT;
1487
1488 compat_oldlenp = compat_ptr(tmp.oldlenp);
1489 if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
1490 return -EFAULT;
1491
1492 result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
1493 compat_ptr(tmp.oldval), oldlen,
1494 compat_ptr(tmp.newval), tmp.newlen);
1495
1496 if (result >= 0) {
1497 oldlen = result;
1498 result = 0;
1499 }
1500
1501 if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
1502 return -EFAULT;
1503
1504 return result;
1505}
1506
1507#endif /* CONFIG_COMPAT */
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b6e7aaea4604..04cdcf72c827 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -5,1239 +5,6 @@
5#include <linux/string.h>
6#include <net/ip_vs.h>
7
8struct trans_ctl_table {
9 int ctl_name;
10 const char *procname;
11 const struct trans_ctl_table *child;
12};
13
14static const struct trans_ctl_table trans_random_table[] = {
15 { RANDOM_POOLSIZE, "poolsize" },
16 { RANDOM_ENTROPY_COUNT, "entropy_avail" },
17 { RANDOM_READ_THRESH, "read_wakeup_threshold" },
18 { RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
19 { RANDOM_BOOT_ID, "boot_id" },
20 { RANDOM_UUID, "uuid" },
21 {}
22};
23
24static const struct trans_ctl_table trans_pty_table[] = {
25 { PTY_MAX, "max" },
26 { PTY_NR, "nr" },
27 {}
28};
29
30static const struct trans_ctl_table trans_kern_table[] = {
31 { KERN_OSTYPE, "ostype" },
32 { KERN_OSRELEASE, "osrelease" },
33 /* KERN_OSREV not used */
34 { KERN_VERSION, "version" },
35 /* KERN_SECUREMASK not used */
36 /* KERN_PROF not used */
37 { KERN_NODENAME, "hostname" },
38 { KERN_DOMAINNAME, "domainname" },
39
40 { KERN_PANIC, "panic" },
41 { KERN_REALROOTDEV, "real-root-dev" },
42
43 { KERN_SPARC_REBOOT, "reboot-cmd" },
44 { KERN_CTLALTDEL, "ctrl-alt-del" },
45 { KERN_PRINTK, "printk" },
46
47 /* KERN_NAMETRANS not used */
48 /* KERN_PPC_HTABRECLAIM not used */
49 /* KERN_PPC_ZEROPAGED not used */
50 { KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
51
52 { KERN_MODPROBE, "modprobe" },
53 { KERN_SG_BIG_BUFF, "sg-big-buff" },
54 { KERN_ACCT, "acct" },
55 { KERN_PPC_L2CR, "l2cr" },
56
57 /* KERN_RTSIGNR not used */
58 /* KERN_RTSIGMAX not used */
59
60 { KERN_SHMMAX, "shmmax" },
61 { KERN_MSGMAX, "msgmax" },
62 { KERN_MSGMNB, "msgmnb" },
63 /* KERN_MSGPOOL not used*/
64 { KERN_SYSRQ, "sysrq" },
65 { KERN_MAX_THREADS, "threads-max" },
66 { KERN_RANDOM, "random", trans_random_table },
67 { KERN_SHMALL, "shmall" },
68 { KERN_MSGMNI, "msgmni" },
69 { KERN_SEM, "sem" },
70 { KERN_SPARC_STOP_A, "stop-a" },
71 { KERN_SHMMNI, "shmmni" },
72
73 { KERN_OVERFLOWUID, "overflowuid" },
74 { KERN_OVERFLOWGID, "overflowgid" },
75
76 { KERN_HOTPLUG, "hotplug", },
77 { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
78
79 { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
80 { KERN_CORE_USES_PID, "core_uses_pid" },
81 { KERN_TAINTED, "tainted" },
82 { KERN_CADPID, "cad_pid" },
83 { KERN_PIDMAX, "pid_max" },
84 { KERN_CORE_PATTERN, "core_pattern" },
85 { KERN_PANIC_ON_OOPS, "panic_on_oops" },
86 { KERN_HPPA_PWRSW, "soft-power" },
87 { KERN_HPPA_UNALIGNED, "unaligned-trap" },
88
89 { KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
90 { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
91
92 { KERN_PTY, "pty", trans_pty_table },
93 { KERN_NGROUPS_MAX, "ngroups_max" },
94 { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
95 { KERN_HZ_TIMER, "hz_timer" },
96 { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
97 { KERN_BOOTLOADER_TYPE, "bootloader_type" },
98 { KERN_RANDOMIZE, "randomize_va_space" },
99
100 { KERN_SPIN_RETRY, "spin_retry" },
101 { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" },
102 { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
103 { KERN_COMPAT_LOG, "compat-log" },
104 { KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
105 { KERN_NMI_WATCHDOG, "nmi_watchdog" },
106 { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
107 {}
108};
109
110static const struct trans_ctl_table trans_vm_table[] = {
111 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
112 { VM_PAGE_CLUSTER, "page-cluster" },
113 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
114 { VM_DIRTY_RATIO, "dirty_ratio" },
115 { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
116 { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" },
117 { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
118 { VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
119 /* VM_PAGEBUF unused */
120 { VM_HUGETLB_PAGES, "nr_hugepages" },
121 { VM_SWAPPINESS, "swappiness" },
122 { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
123 { VM_MIN_FREE_KBYTES, "min_free_kbytes" },
124 { VM_MAX_MAP_COUNT, "max_map_count" },
125 { VM_LAPTOP_MODE, "laptop_mode" },
126 { VM_BLOCK_DUMP, "block_dump" },
127 { VM_HUGETLB_GROUP, "hugetlb_shm_group" },
128 { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
129 { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
130 /* VM_SWAP_TOKEN_TIMEOUT unused */
131 { VM_DROP_PAGECACHE, "drop_caches" },
132 { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
133 { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
134 { VM_MIN_UNMAPPED, "min_unmapped_ratio" },
135 { VM_PANIC_ON_OOM, "panic_on_oom" },
136 { VM_VDSO_ENABLED, "vdso_enabled" },
137 { VM_MIN_SLAB, "min_slab_ratio" },
138
139 {}
140};
141
142static const struct trans_ctl_table trans_net_core_table[] = {
143 { NET_CORE_WMEM_MAX, "wmem_max" },
144 { NET_CORE_RMEM_MAX, "rmem_max" },
145 { NET_CORE_WMEM_DEFAULT, "wmem_default" },
146 { NET_CORE_RMEM_DEFAULT, "rmem_default" },
147 /* NET_CORE_DESTROY_DELAY unused */
148 { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
149 /* NET_CORE_FASTROUTE unused */
150 { NET_CORE_MSG_COST, "message_cost" },
151 { NET_CORE_MSG_BURST, "message_burst" },
152 { NET_CORE_OPTMEM_MAX, "optmem_max" },
153 /* NET_CORE_HOT_LIST_LENGTH unused */
154 /* NET_CORE_DIVERT_VERSION unused */
155 /* NET_CORE_NO_CONG_THRESH unused */
156 /* NET_CORE_NO_CONG unused */
157 /* NET_CORE_LO_CONG unused */
158 /* NET_CORE_MOD_CONG unused */
159 { NET_CORE_DEV_WEIGHT, "dev_weight" },
160 { NET_CORE_SOMAXCONN, "somaxconn" },
161 { NET_CORE_BUDGET, "netdev_budget" },
162 { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
163 { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
164 { NET_CORE_WARNINGS, "warnings" },
165 {},
166};
167
168static const struct trans_ctl_table trans_net_unix_table[] = {
169 /* NET_UNIX_DESTROY_DELAY unused */
170 /* NET_UNIX_DELETE_DELAY unused */
171 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
172 {}
173};
174
175static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
176 { NET_IPV4_ROUTE_FLUSH, "flush" },
177 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
178 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
179 { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
180 { NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
181 { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
182 { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
183 { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
184 { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
185 { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
186 { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
187 { NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
188 { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
189 { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
190 { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
191 { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
192 { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
193 { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
194 { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
195 {}
196};
197
198static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
199 { NET_IPV4_CONF_FORWARDING, "forwarding" },
200 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
201
202 { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
203 { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
204 { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
205 { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
206 { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
207 { NET_IPV4_CONF_RP_FILTER, "rp_filter" },
208 { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
209 { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
210 { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
211 { NET_IPV4_CONF_TAG, "tag" },
212 { NET_IPV4_CONF_ARPFILTER, "arp_filter" },
213 { NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
214 { NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
215 { NET_IPV4_CONF_NOPOLICY, "disable_policy" },
216 { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
217
218 { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
222 { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
223 {}
224};
225
226static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
227 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
228 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
229 { 0, NULL, trans_net_ipv4_conf_vars_table },
230 {}
231};
232
233static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
234 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
235 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
236 { NET_NEIGH_APP_SOLICIT, "app_solicit" },
237 { NET_NEIGH_RETRANS_TIME, "retrans_time" },
238 { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
239 { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
240 { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
241 { NET_NEIGH_UNRES_QLEN, "unres_qlen" },
242 { NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
243 { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" },
244 { NET_NEIGH_PROXY_DELAY, "proxy_delay" },
245 { NET_NEIGH_LOCKTIME, "locktime" },
246 { NET_NEIGH_GC_INTERVAL, "gc_interval" },
247 { NET_NEIGH_GC_THRESH1, "gc_thresh1" },
248 { NET_NEIGH_GC_THRESH2, "gc_thresh2" },
249 { NET_NEIGH_GC_THRESH3, "gc_thresh3" },
250 { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
251 { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
252 {}
253};
254
255static const struct trans_ctl_table trans_net_neigh_table[] = {
256 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
257 { 0, NULL, trans_net_neigh_vars_table },
258 {}
259};
260
261static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
262 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
263
264 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
265 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" },
266 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" },
267 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" },
268 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" },
269 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" },
270 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" },
271 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" },
272
273 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" },
274 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" },
275 { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" },
276 { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" },
277
278 { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
279 { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
280 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" },
281 { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
282 { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
283 { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
284
285 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" },
286 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" },
287 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" },
288 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" },
289 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" },
290 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" },
291 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" },
292
293 { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
294 { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
295 {}
296};
297
298static const struct trans_ctl_table trans_net_ipv4_table[] = {
299 { NET_IPV4_FORWARD, "ip_forward" },
300 { NET_IPV4_DYNADDR, "ip_dynaddr" },
301
302 { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table },
303 { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table },
304 { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table },
305 /* NET_IPV4_FIB_HASH unused */
306 { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table },
307
308 { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
309 { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
310 { NET_IPV4_TCP_SACK, "tcp_sack" },
311 { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
312 { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
313 /* NET_IPV4_AUTOCONFIG unused */
314 { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
315 { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
316 { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
317 { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
318 { NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
319 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
320 { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
321 { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
322 { NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
323 { NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
324 { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
325 /* NET_IPV4_IP_MASQ_DEBUG unused */
326 { NET_TCP_SYNCOOKIES, "tcp_syncookies" },
327 { NET_TCP_STDURG, "tcp_stdurg" },
328 { NET_TCP_RFC1337, "tcp_rfc1337" },
329 /* NET_TCP_SYN_TAILDROP unused */
330 { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
331 { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
332 { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
333 { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
334 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
335 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
336 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
337 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
338 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
339 { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
340 { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
341 { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
342 /* NET_IPV4_ALWAYS_DEFRAG unused */
343 { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
344 { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
345 { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
346 { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
347 { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
348 { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
349 { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
350 { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
351 { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
352 { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
353 { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
354 { NET_TCP_FACK, "tcp_fack" },
355 { NET_TCP_REORDERING, "tcp_reordering" },
356 { NET_TCP_ECN, "tcp_ecn" },
357 { NET_TCP_DSACK, "tcp_dsack" },
358 { NET_TCP_MEM, "tcp_mem" },
359 { NET_TCP_WMEM, "tcp_wmem" },
360 { NET_TCP_RMEM, "tcp_rmem" },
361 { NET_TCP_APP_WIN, "tcp_app_win" },
362 { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
363 { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
364 { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
365 { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
366 { NET_TCP_TW_REUSE, "tcp_tw_reuse" },
367 { NET_TCP_FRTO, "tcp_frto" },
368 { NET_TCP_LOW_LATENCY, "tcp_low_latency" },
369 { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
370 { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
371 { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
372 /* NET_TCP_DEFAULT_WIN_SCALE unused */
373 { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
374 { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
375 /* NET_TCP_BIC_BETA unused */
376 { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
377 { NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
378 { NET_TCP_ABC, "tcp_abc" },
379 { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" },
380 { NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
381 { NET_TCP_BASE_MSS, "tcp_base_mss" },
382 { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
383 { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
384 { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
385 { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
386 { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
387 { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
388 { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
389 { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" },
390 { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
391 { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
392 { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
393 { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
394 {}
395};
396
397static const struct trans_ctl_table trans_net_ipx_table[] = {
398 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
399 /* NET_IPX_FORWARDING unused */
400 {}
401};
402
403static const struct trans_ctl_table trans_net_atalk_table[] = {
404 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
405 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
406 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
407 { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
408 {},
409};
410
411static const struct trans_ctl_table trans_net_netrom_table[] = {
412 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
413 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
414 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
415 { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
416 { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
417 { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
418 { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
419 { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
420 { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
421 { NET_NETROM_ROUTING_CONTROL, "routing_control" },
422 { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
423 { NET_NETROM_RESET, "reset" },
424 {}
425};
426
427static const struct trans_ctl_table trans_net_ax25_param_table[] = {
428 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
429 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
430 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
431 { NET_AX25_CONNECT_MODE, "connect_mode" },
432 { NET_AX25_STANDARD_WINDOW, "standard_window_size" },
433 { NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
434 { NET_AX25_T1_TIMEOUT, "t1_timeout" },
435 { NET_AX25_T2_TIMEOUT, "t2_timeout" },
436 { NET_AX25_T3_TIMEOUT, "t3_timeout" },
437 { NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
438 { NET_AX25_N2, "maximum_retry_count" },
439 { NET_AX25_PACLEN, "maximum_packet_length" },
440 { NET_AX25_PROTOCOL, "protocol" },
441 { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
442 {}
443};
444
445static const struct trans_ctl_table trans_net_ax25_table[] = {
446 { 0, NULL, trans_net_ax25_param_table },
447 {}
448};
449
450static const struct trans_ctl_table trans_net_bridge_table[] = {
451 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
452 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
453 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
454 { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" },
455 { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" },
456 {}
457};
458
459static const struct trans_ctl_table trans_net_rose_table[] = {
460 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
461 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
462 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
463 { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
464 { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
465 { NET_ROSE_ROUTING_CONTROL, "routing_control" },
466 { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
467 { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
468 { NET_ROSE_WINDOW_SIZE, "window_size" },
469 { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
470 {}
471};
472
473static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
474 { NET_IPV6_FORWARDING, "forwarding" },
475 { NET_IPV6_HOP_LIMIT, "hop_limit" },
476 { NET_IPV6_MTU, "mtu" },
477 { NET_IPV6_ACCEPT_RA, "accept_ra" },
478 { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
479 { NET_IPV6_AUTOCONF, "autoconf" },
480 { NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
481 { NET_IPV6_RTR_SOLICITS, "router_solicitations" },
482 { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
483 { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
484 { NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
485 { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
486 { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
487 { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
488 { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
489 { NET_IPV6_MAX_ADDRESSES, "max_addresses" },
490 { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
491 { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
492 { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
493 { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
494 { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
495 { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
496 { NET_IPV6_PROXY_NDP, "proxy_ndp" },
497 { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
498 {}
499};
500
501static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
502 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
503 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
504 { 0, NULL, trans_net_ipv6_conf_var_table },
505 {}
506};
507
508static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
509 { NET_IPV6_ROUTE_FLUSH, "flush" },
510 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
511 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
512 { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
513 { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
514 { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
515 { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
516 { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
517 { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
518 { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
519 {}
520};
521
522static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
523 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
524 {}
525};
526
527static const struct trans_ctl_table trans_net_ipv6_table[] = {
528 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
529 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
530 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
531 { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table },
532 { NET_IPV6_BINDV6ONLY, "bindv6only" },
533 { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
534 { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
535 { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
536 { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
537 { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
538 { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
539 {}
540};
541
542static const struct trans_ctl_table trans_net_x25_table[] = {
543 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
544 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
545 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
546 { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
547 { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
548 { NET_X25_FORWARD, "x25_forward" },
549 {}
550};
551
552static const struct trans_ctl_table trans_net_tr_table[] = {
553 { NET_TR_RIF_TIMEOUT, "rif_timeout" },
554 {}
555};
556
557
558static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
559 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
560 { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
561 { NET_DECNET_CONF_DEV_T2, "t2" },
562 { NET_DECNET_CONF_DEV_T3, "t3" },
563 {}
564};
565
566static const struct trans_ctl_table trans_net_decnet_conf[] = {
567 { 0, NULL, trans_net_decnet_conf_vars },
568 {}
569};
570
571static const struct trans_ctl_table trans_net_decnet_table[] = {
572 { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
573 { NET_DECNET_NODE_ADDRESS, "node_address" },
574 { NET_DECNET_NODE_NAME, "node_name" },
575 { NET_DECNET_DEFAULT_DEVICE, "default_device" },
576 { NET_DECNET_TIME_WAIT, "time_wait" },
577 { NET_DECNET_DN_COUNT, "dn_count" },
578 { NET_DECNET_DI_COUNT, "di_count" },
579 { NET_DECNET_DR_COUNT, "dr_count" },
580 { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
581 { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
582 { NET_DECNET_MEM, "decnet_mem" },
583 { NET_DECNET_RMEM, "decnet_rmem" },
584 { NET_DECNET_WMEM, "decnet_wmem" },
585 { NET_DECNET_DEBUG_LEVEL, "debug" },
586 {}
587};
588
589static const struct trans_ctl_table trans_net_sctp_table[] = {
590 { NET_SCTP_RTO_INITIAL, "rto_initial" },
591 { NET_SCTP_RTO_MIN, "rto_min" },
592 { NET_SCTP_RTO_MAX, "rto_max" },
593 { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
594 { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
595 { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
596 { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
597 { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
598 { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
599 { NET_SCTP_HB_INTERVAL, "hb_interval" },
600 { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
601 { NET_SCTP_MAX_BURST, "max_burst" },
602 { NET_SCTP_ADDIP_ENABLE, "addip_enable" },
603 { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
604 { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
605 { NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
606 { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
607 {}
608};
609
610static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
611 { NET_LLC2_ACK_TIMEOUT, "ack" },
612 { NET_LLC2_P_TIMEOUT, "p" },
613 { NET_LLC2_REJ_TIMEOUT, "rej" },
614 { NET_LLC2_BUSY_TIMEOUT, "busy" },
615 {}
616};
617
618static const struct trans_ctl_table trans_net_llc_station_table[] = {
619 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
620 {}
621};
622
623static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
624 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
625 {}
626};
627
628static const struct trans_ctl_table trans_net_llc_table[] = {
629 { NET_LLC2, "llc2", trans_net_llc_llc2_table },
630 { NET_LLC_STATION, "station", trans_net_llc_station_table },
631 {}
632};
633
634static const struct trans_ctl_table trans_net_netfilter_table[] = {
635 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
636 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
637 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
638 { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" },
639 { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" },
640 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" },
641 { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" },
642 { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" },
643 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" },
644 { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" },
645 { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" },
646 { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" },
647 { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" },
648 { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
649 { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
650 { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" },
651 { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
652 { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
653 { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
654 { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" },
655 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" },
656 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" },
657 { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" },
658 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" },
659 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" },
660 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" },
661 { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
662 { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" },
663 { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" },
664 { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
665 { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
666 { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
667
668 {}
669};
670
671static const struct trans_ctl_table trans_net_dccp_table[] = {
672 { NET_DCCP_DEFAULT, "default" },
673 {}
674};
675
676static const struct trans_ctl_table trans_net_irda_table[] = {
677 { NET_IRDA_DISCOVERY, "discovery" },
678 { NET_IRDA_DEVNAME, "devname" },
679 { NET_IRDA_DEBUG, "debug" },
680 { NET_IRDA_FAST_POLL, "fast_poll_increase" },
681 { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
682 { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
683 { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
684 { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
685 { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
686 { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
687 { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
688 { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
689 { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
690 { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
691 {}
692};
693
694static const struct trans_ctl_table trans_net_table[] = {
695 { NET_CORE, "core", trans_net_core_table },
696 /* NET_ETHER not used */
697 /* NET_802 not used */
698 { NET_UNIX, "unix", trans_net_unix_table },
699 { NET_IPV4, "ipv4", trans_net_ipv4_table },
700 { NET_IPX, "ipx", trans_net_ipx_table },
701 { NET_ATALK, "appletalk", trans_net_atalk_table },
702 { NET_NETROM, "netrom", trans_net_netrom_table },
703 { NET_AX25, "ax25", trans_net_ax25_table },
704 { NET_BRIDGE, "bridge", trans_net_bridge_table },
705 { NET_ROSE, "rose", trans_net_rose_table },
706 { NET_IPV6, "ipv6", trans_net_ipv6_table },
707 { NET_X25, "x25", trans_net_x25_table },
708 { NET_TR, "token-ring", trans_net_tr_table },
709 { NET_DECNET, "decnet", trans_net_decnet_table },
710 /* NET_ECONET not used */
711 { NET_SCTP, "sctp", trans_net_sctp_table },
712 { NET_LLC, "llc", trans_net_llc_table },
713 { NET_NETFILTER, "netfilter", trans_net_netfilter_table },
714 { NET_DCCP, "dccp", trans_net_dccp_table },
715 { NET_IRDA, "irda", trans_net_irda_table },
716 { 2089, "nf_conntrack_max" },
717 {}
718};
719
720static const struct trans_ctl_table trans_fs_quota_table[] = {
721 { FS_DQ_LOOKUPS, "lookups" },
722 { FS_DQ_DROPS, "drops" },
723 { FS_DQ_READS, "reads" },
724 { FS_DQ_WRITES, "writes" },
725 { FS_DQ_CACHE_HITS, "cache_hits" },
726 { FS_DQ_ALLOCATED, "allocated_dquots" },
727 { FS_DQ_FREE, "free_dquots" },
728 { FS_DQ_SYNCS, "syncs" },
729 { FS_DQ_WARNINGS, "warnings" },
730 {}
731};
732
733static const struct trans_ctl_table trans_fs_xfs_table[] = {
734 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
735 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
736 { XFS_PANIC_MASK, "panic_mask" },
737
738 { XFS_ERRLEVEL, "error_level" },
739 { XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
740 { XFS_INHERIT_SYNC, "inherit_sync" },
741 { XFS_INHERIT_NODUMP, "inherit_nodump" },
742 { XFS_INHERIT_NOATIME, "inherit_noatime" },
743 { XFS_BUF_TIMER, "xfsbufd_centisecs" },
744 { XFS_BUF_AGE, "age_buffer_centisecs" },
745 { XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
746 { XFS_ROTORSTEP, "rotorstep" },
747 { XFS_INHERIT_NODFRG, "inherit_nodefrag" },
748 { XFS_FILESTREAM_TIMER, "filestream_centisecs" },
749 { XFS_STATS_CLEAR, "stats_clear" },
750 {}
751};
752
753static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
754 { 1, "hb_ctl_path" },
755 {}
756};
757
758static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
759 { 1, "nm", trans_fs_ocfs2_nm_table },
760 {}
761};
762
763static const struct trans_ctl_table trans_inotify_table[] = {
764 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
765 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
766 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
767 {}
768};
769
770static const struct trans_ctl_table trans_fs_table[] = {
771 { FS_NRINODE, "inode-nr" },
772 { FS_STATINODE, "inode-state" },
773 /* FS_MAXINODE unused */
774 /* FS_NRDQUOT unused */
775 /* FS_MAXDQUOT unused */
776 { FS_NRFILE, "file-nr" },
777 { FS_MAXFILE, "file-max" },
778 { FS_DENTRY, "dentry-state" },
779 /* FS_NRSUPER unused */
780 /* FS_MAXUPSER unused */
781 { FS_OVERFLOWUID, "overflowuid" },
782 { FS_OVERFLOWGID, "overflowgid" },
783 { FS_LEASES, "leases-enable" },
784 { FS_DIR_NOTIFY, "dir-notify-enable" },
785 { FS_LEASE_TIME, "lease-break-time" },
786 { FS_DQSTATS, "quota", trans_fs_quota_table },
787 { FS_XFS, "xfs", trans_fs_xfs_table },
788 { FS_AIO_NR, "aio-nr" },
789 { FS_AIO_MAX_NR, "aio-max-nr" },
790 { FS_INOTIFY, "inotify", trans_inotify_table },
791 { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table },
792 { KERN_SETUID_DUMPABLE, "suid_dumpable" },
793 {}
794};
795
796static const struct trans_ctl_table trans_debug_table[] = {
797 {}
798};
799
800static const struct trans_ctl_table trans_cdrom_table[] = {
801 { DEV_CDROM_INFO, "info" },
802 { DEV_CDROM_AUTOCLOSE, "autoclose" },
803 { DEV_CDROM_AUTOEJECT, "autoeject" },
804 { DEV_CDROM_DEBUG, "debug" },
805 { DEV_CDROM_LOCK, "lock" },
806 { DEV_CDROM_CHECK_MEDIA, "check_media" },
807 {}
808};
809
810static const struct trans_ctl_table trans_ipmi_table[] = {
811 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
812 {}
813};
814
815static const struct trans_ctl_table trans_mac_hid_files[] = {
816 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
817 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
818 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
819 { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
820 { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
821 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
822 {}
823};
824
825static const struct trans_ctl_table trans_raid_table[] = {
826 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
827 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
828 {}
829};
830
831static const struct trans_ctl_table trans_scsi_table[] = {
832 { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
833 {}
834};
835
836static const struct trans_ctl_table trans_parport_default_table[] = {
837 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
838 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
839 {}
840};
841
842static const struct trans_ctl_table trans_parport_device_table[] = {
843 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
844 {}
845};
846
847static const struct trans_ctl_table trans_parport_devices_table[] = {
848 { DEV_PARPORT_DEVICES_ACTIVE, "active" },
849 { 0, NULL, trans_parport_device_table },
850 {}
851};
852
853static const struct trans_ctl_table trans_parport_parport_table[] = {
854 { DEV_PARPORT_SPINTIME, "spintime" },
855 { DEV_PARPORT_BASE_ADDR, "base-addr" },
856 { DEV_PARPORT_IRQ, "irq" },
857 { DEV_PARPORT_DMA, "dma" },
858 { DEV_PARPORT_MODES, "modes" },
859 { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table },
860 { DEV_PARPORT_AUTOPROBE, "autoprobe" },
861 { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" },
862 { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" },
863 { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" },
864 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
865 {}
866};
867static const struct trans_ctl_table trans_parport_table[] = {
868 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
869 { 0, NULL, trans_parport_parport_table },
870 {}
871};
872
873static const struct trans_ctl_table trans_dev_table[] = {
874 { DEV_CDROM, "cdrom", trans_cdrom_table },
875 /* DEV_HWMON unused */
876 { DEV_PARPORT, "parport", trans_parport_table },
877 { DEV_RAID, "raid", trans_raid_table },
878 { DEV_MAC_HID, "mac_hid", trans_mac_hid_files },
879 { DEV_SCSI, "scsi", trans_scsi_table },
880 { DEV_IPMI, "ipmi", trans_ipmi_table },
881 {}
882};
883
884static const struct trans_ctl_table trans_bus_isa_table[] = {
885 { BUS_ISA_MEM_BASE, "membase" },
886 { BUS_ISA_PORT_BASE, "portbase" },
887 { BUS_ISA_PORT_SHIFT, "portshift" },
888 {}
889};
890
891static const struct trans_ctl_table trans_bus_table[] = {
892 { CTL_BUS_ISA, "isa", trans_bus_isa_table },
893 {}
894};
895
896static const struct trans_ctl_table trans_arlan_conf_table0[] = {
897 { 1, "spreadingCode" },
898 { 2, "channelNumber" },
899 { 3, "scramblingDisable" },
900 { 4, "txAttenuation" },
901 { 5, "systemId" },
902 { 6, "maxDatagramSize" },
903 { 7, "maxFrameSize" },
904 { 8, "maxRetries" },
905 { 9, "receiveMode" },
906 { 10, "priority" },
907 { 11, "rootOrRepeater" },
908 { 12, "SID" },
909 { 13, "registrationMode" },
910 { 14, "registrationFill" },
911 { 15, "localTalkAddress" },
912 { 16, "codeFormat" },
913 { 17, "numChannels" },
914 { 18, "channel1" },
915 { 19, "channel2" },
916 { 20, "channel3" },
917 { 21, "channel4" },
918 { 22, "txClear" },
919 { 23, "txRetries" },
920 { 24, "txRouting" },
921 { 25, "txScrambled" },
922 { 26, "rxParameter" },
923 { 27, "txTimeoutMs" },
924 { 28, "waitCardTimeout" },
925 { 29, "channelSet" },
926 { 30, "name" },
927 { 31, "waitTime" },
928 { 32, "lParameter" },
929 { 33, "_15" },
930 { 34, "headerSize" },
931 { 36, "tx_delay_ms" },
932 { 37, "retries" },
933 { 38, "ReTransmitPacketMaxSize" },
934 { 39, "waitReTransmitPacketMaxSize" },
935 { 40, "fastReTransCount" },
936 { 41, "driverRetransmissions" },
937 { 42, "txAckTimeoutMs" },
938 { 43, "registrationInterrupts" },
939 { 44, "hardwareType" },
940 { 45, "radioType" },
941 { 46, "writeEEPROM" },
942 { 47, "writeRadioType" },
943 { 48, "entry_exit_debug" },
944 { 49, "debug" },
945 { 50, "in_speed" },
946 { 51, "out_speed" },
947 { 52, "in_speed10" },
948 { 53, "out_speed10" },
949 { 54, "in_speed_max" },
950 { 55, "out_speed_max" },
951 { 56, "measure_rate" },
952 { 57, "pre_Command_Wait" },
953 { 58, "rx_tweak1" },
954 { 59, "rx_tweak2" },
955 { 60, "tx_queue_len" },
956
957 { 150, "arlan0-txRing" },
958 { 151, "arlan0-rxRing" },
959 { 152, "arlan0-18" },
960 { 153, "arlan0-ring" },
961 { 154, "arlan0-shm-cpy" },
962 { 155, "config0" },
963 { 156, "reset0" },
964 {}
965};
966
967static const struct trans_ctl_table trans_arlan_conf_table1[] = {
968 { 1, "spreadingCode" },
969 { 2, "channelNumber" },
970 { 3, "scramblingDisable" },
971 { 4, "txAttenuation" },
972 { 5, "systemId" },
973 { 6, "maxDatagramSize" },
974 { 7, "maxFrameSize" },
975 { 8, "maxRetries" },
976 { 9, "receiveMode" },
977 { 10, "priority" },
978 { 11, "rootOrRepeater" },
979 { 12, "SID" },
980 { 13, "registrationMode" },
981 { 14, "registrationFill" },
982 { 15, "localTalkAddress" },
983 { 16, "codeFormat" },
984 { 17, "numChannels" },
985 { 18, "channel1" },
986 { 19, "channel2" },
987 { 20, "channel3" },
988 { 21, "channel4" },
989 { 22, "txClear" },
990 { 23, "txRetries" },
991 { 24, "txRouting" },
992 { 25, "txScrambled" },
993 { 26, "rxParameter" },
994 { 27, "txTimeoutMs" },
995 { 28, "waitCardTimeout" },
996 { 29, "channelSet" },
997 { 30, "name" },
998 { 31, "waitTime" },
999 { 32, "lParameter" },
1000 { 33, "_15" },
1001 { 34, "headerSize" },
1002 { 36, "tx_delay_ms" },
1003 { 37, "retries" },
1004 { 38, "ReTransmitPacketMaxSize" },
1005 { 39, "waitReTransmitPacketMaxSize" },
1006 { 40, "fastReTransCount" },
1007 { 41, "driverRetransmissions" },
1008 { 42, "txAckTimeoutMs" },
1009 { 43, "registrationInterrupts" },
1010 { 44, "hardwareType" },
1011 { 45, "radioType" },
1012 { 46, "writeEEPROM" },
1013 { 47, "writeRadioType" },
1014 { 48, "entry_exit_debug" },
1015 { 49, "debug" },
1016 { 50, "in_speed" },
1017 { 51, "out_speed" },
1018 { 52, "in_speed10" },
1019 { 53, "out_speed10" },
1020 { 54, "in_speed_max" },
1021 { 55, "out_speed_max" },
1022 { 56, "measure_rate" },
1023 { 57, "pre_Command_Wait" },
1024 { 58, "rx_tweak1" },
1025 { 59, "rx_tweak2" },
1026 { 60, "tx_queue_len" },
1027
1028 { 150, "arlan1-txRing" },
1029 { 151, "arlan1-rxRing" },
1030 { 152, "arlan1-18" },
1031 { 153, "arlan1-ring" },
1032 { 154, "arlan1-shm-cpy" },
1033 { 155, "config1" },
1034 { 156, "reset1" },
1035 {}
1036};
1037
1038static const struct trans_ctl_table trans_arlan_conf_table2[] = {
1039 { 1, "spreadingCode" },
1040 { 2, "channelNumber" },
1041 { 3, "scramblingDisable" },
1042 { 4, "txAttenuation" },
1043 { 5, "systemId" },
1044 { 6, "maxDatagramSize" },
1045 { 7, "maxFrameSize" },
1046 { 8, "maxRetries" },
1047 { 9, "receiveMode" },
1048 { 10, "priority" },
1049 { 11, "rootOrRepeater" },
1050 { 12, "SID" },
1051 { 13, "registrationMode" },
1052 { 14, "registrationFill" },
1053 { 15, "localTalkAddress" },
1054 { 16, "codeFormat" },
1055 { 17, "numChannels" },
1056 { 18, "channel1" },
1057 { 19, "channel2" },
1058 { 20, "channel3" },
1059 { 21, "channel4" },
1060 { 22, "txClear" },
1061 { 23, "txRetries" },
1062 { 24, "txRouting" },
1063 { 25, "txScrambled" },
1064 { 26, "rxParameter" },
1065 { 27, "txTimeoutMs" },
1066 { 28, "waitCardTimeout" },
1067 { 29, "channelSet" },
1068 { 30, "name" },
1069 { 31, "waitTime" },
1070 { 32, "lParameter" },
1071 { 33, "_15" },
1072 { 34, "headerSize" },
1073 { 36, "tx_delay_ms" },
1074 { 37, "retries" },
1075 { 38, "ReTransmitPacketMaxSize" },
1076 { 39, "waitReTransmitPacketMaxSize" },
1077 { 40, "fastReTransCount" },
1078 { 41, "driverRetransmissions" },
1079 { 42, "txAckTimeoutMs" },
1080 { 43, "registrationInterrupts" },
1081 { 44, "hardwareType" },
1082 { 45, "radioType" },
1083 { 46, "writeEEPROM" },
1084 { 47, "writeRadioType" },
1085 { 48, "entry_exit_debug" },
1086 { 49, "debug" },
1087 { 50, "in_speed" },
1088 { 51, "out_speed" },
1089 { 52, "in_speed10" },
1090 { 53, "out_speed10" },
1091 { 54, "in_speed_max" },
1092 { 55, "out_speed_max" },
1093 { 56, "measure_rate" },
1094 { 57, "pre_Command_Wait" },
1095 { 58, "rx_tweak1" },
1096 { 59, "rx_tweak2" },
1097 { 60, "tx_queue_len" },
1098
1099 { 150, "arlan2-txRing" },
1100 { 151, "arlan2-rxRing" },
1101 { 152, "arlan2-18" },
1102 { 153, "arlan2-ring" },
1103 { 154, "arlan2-shm-cpy" },
1104 { 155, "config2" },
1105 { 156, "reset2" },
1106 {}
1107};
1108
1109static const struct trans_ctl_table trans_arlan_conf_table3[] = {
1110 { 1, "spreadingCode" },
1111 { 2, "channelNumber" },
1112 { 3, "scramblingDisable" },
1113 { 4, "txAttenuation" },
1114 { 5, "systemId" },
1115 { 6, "maxDatagramSize" },
1116 { 7, "maxFrameSize" },
1117 { 8, "maxRetries" },
1118 { 9, "receiveMode" },
1119 { 10, "priority" },
1120 { 11, "rootOrRepeater" },
1121 { 12, "SID" },
1122 { 13, "registrationMode" },
1123 { 14, "registrationFill" },
1124 { 15, "localTalkAddress" },
1125 { 16, "codeFormat" },
1126 { 17, "numChannels" },
1127 { 18, "channel1" },
1128 { 19, "channel2" },
1129 { 20, "channel3" },
1130 { 21, "channel4" },
1131 { 22, "txClear" },
1132 { 23, "txRetries" },
1133 { 24, "txRouting" },
1134 { 25, "txScrambled" },
1135 { 26, "rxParameter" },
1136 { 27, "txTimeoutMs" },
1137 { 28, "waitCardTimeout" },
1138 { 29, "channelSet" },
1139 { 30, "name" },
1140 { 31, "waitTime" },
1141 { 32, "lParameter" },
1142 { 33, "_15" },
1143 { 34, "headerSize" },
1144 { 36, "tx_delay_ms" },
1145 { 37, "retries" },
1146 { 38, "ReTransmitPacketMaxSize" },
1147 { 39, "waitReTransmitPacketMaxSize" },
1148 { 40, "fastReTransCount" },
1149 { 41, "driverRetransmissions" },
1150 { 42, "txAckTimeoutMs" },
1151 { 43, "registrationInterrupts" },
1152 { 44, "hardwareType" },
1153 { 45, "radioType" },
1154 { 46, "writeEEPROM" },
1155 { 47, "writeRadioType" },
1156 { 48, "entry_exit_debug" },
1157 { 49, "debug" },
1158 { 50, "in_speed" },
1159 { 51, "out_speed" },
1160 { 52, "in_speed10" },
1161 { 53, "out_speed10" },
1162 { 54, "in_speed_max" },
1163 { 55, "out_speed_max" },
1164 { 56, "measure_rate" },
1165 { 57, "pre_Command_Wait" },
1166 { 58, "rx_tweak1" },
1167 { 59, "rx_tweak2" },
1168 { 60, "tx_queue_len" },
1169
1170 { 150, "arlan3-txRing" },
1171 { 151, "arlan3-rxRing" },
1172 { 152, "arlan3-18" },
1173 { 153, "arlan3-ring" },
1174 { 154, "arlan3-shm-cpy" },
1175 { 155, "config3" },
1176 { 156, "reset3" },
1177 {}
1178};
1179
1180static const struct trans_ctl_table trans_arlan_table[] = {
1181 { 1, "arlan0", trans_arlan_conf_table0 },
1182 { 2, "arlan1", trans_arlan_conf_table1 },
1183 { 3, "arlan2", trans_arlan_conf_table2 },
1184 { 4, "arlan3", trans_arlan_conf_table3 },
1185 {}
1186};
1187
1188static const struct trans_ctl_table trans_s390dbf_table[] = {
1189 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1190 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
1191 {}
1192};
1193
1194static const struct trans_ctl_table trans_sunrpc_table[] = {
1195 { CTL_RPCDEBUG, "rpc_debug" },
1196 { CTL_NFSDEBUG, "nfs_debug" },
1197 { CTL_NFSDDEBUG, "nfsd_debug" },
1198 { CTL_NLMDEBUG, "nlm_debug" },
1199 { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
1200 { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
1201 { CTL_MIN_RESVPORT, "min_resvport" },
1202 { CTL_MAX_RESVPORT, "max_resvport" },
1203 {}
1204};
1205
1206static const struct trans_ctl_table trans_pm_table[] = {
1207 { 1 /* CTL_PM_SUSPEND */, "suspend" },
1208 { 2 /* CTL_PM_CMODE */, "cmode" },
1209 { 3 /* CTL_PM_P0 */, "p0" },
1210 { 4 /* CTL_PM_CM */, "cm" },
1211 {}
1212};
1213
1214static const struct trans_ctl_table trans_frv_table[] = {
1215 { 1, "cache-mode" },
1216 { 2, "pin-cxnr" },
1217 {}
1218};
1219
1220static const struct trans_ctl_table trans_root_table[] = {
1221 { CTL_KERN, "kernel", trans_kern_table },
1222 { CTL_VM, "vm", trans_vm_table },
1223 { CTL_NET, "net", trans_net_table },
1224 /* CTL_PROC not used */
1225 { CTL_FS, "fs", trans_fs_table },
1226 { CTL_DEBUG, "debug", trans_debug_table },
1227 { CTL_DEV, "dev", trans_dev_table },
1228 { CTL_BUS, "bus", trans_bus_table },
1229 { CTL_ABI, "abi" },
1230 /* CTL_CPU not used */
1231 { CTL_ARLAN, "arlan", trans_arlan_table },
1232 { CTL_S390DBF, "s390dbf", trans_s390dbf_table },
1233 { CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
1234 { CTL_PM, "pm", trans_pm_table },
1235 { CTL_FRV, "frv", trans_frv_table },
1236 {}
1237};
1238
1239
1240
1241 8
1242static int sysctl_depth(struct ctl_table *table) 9static int sysctl_depth(struct ctl_table *table)
1243{ 10{
@@ -1261,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
1261 return table; 28 return table;
1262} 29}
1263 30
1264static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
1265{
1266 struct ctl_table *test;
1267 const struct trans_ctl_table *ref;
1268 int cur_depth;
1269
1270 cur_depth = sysctl_depth(table);
1271
1272 ref = trans_root_table;
1273repeat:
1274 test = sysctl_parent(table, cur_depth);
1275 for (; ref->ctl_name || ref->procname || ref->child; ref++) {
1276 int match = 0;
1277
1278 if (cur_depth && !ref->child)
1279 continue;
1280
1281 if (test->procname && ref->procname &&
1282 (strcmp(test->procname, ref->procname) == 0))
1283 match++;
1284
1285 if (test->ctl_name && ref->ctl_name &&
1286 (test->ctl_name == ref->ctl_name))
1287 match++;
1288
1289 if (!ref->ctl_name && !ref->procname)
1290 match++;
1291
1292 if (match) {
1293 if (cur_depth != 0) {
1294 cur_depth--;
1295 ref = ref->child;
1296 goto repeat;
1297 }
1298 goto out;
1299 }
1300 }
1301 ref = NULL;
1302out:
1303 return ref;
1304}
1305 31
1306static void sysctl_print_path(struct ctl_table *table) 32static void sysctl_print_path(struct ctl_table *table)
1307{ 33{
@@ -1315,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table)
1315 } 41 }
1316 } 42 }
1317 printk(" "); 43 printk(" ");
1318 if (table->ctl_name) {
1319 for (i = depth; i >= 0; i--) {
1320 tmp = sysctl_parent(table, i);
1321 printk(".%d", tmp->ctl_name);
1322 }
1323 }
1324}
1325
1326static void sysctl_repair_table(struct ctl_table *table)
1327{
1328 /* Don't complain about the classic default
1329 * sysctl strategy routine. Maybe later we
1330 * can get the tables fixed and complain about
1331 * this.
1332 */
1333 if (table->ctl_name && table->procname &&
1334 (table->proc_handler == proc_dointvec) &&
1335 (!table->strategy)) {
1336 table->strategy = sysctl_data;
1337 }
1338} 44}
1339 45
1340static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, 46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
@@ -1352,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
1352 ref = head->ctl_table; 58 ref = head->ctl_table;
1353repeat: 59repeat:
1354 test = sysctl_parent(table, cur_depth); 60 test = sysctl_parent(table, cur_depth);
1355 for (; ref->ctl_name || ref->procname; ref++) { 61 for (; ref->procname; ref++) {
1356 int match = 0; 62 int match = 0;
1357 if (cur_depth && !ref->child) 63 if (cur_depth && !ref->child)
1358 continue; 64 continue;
@@ -1361,10 +67,6 @@ repeat:
1361 (strcmp(test->procname, ref->procname) == 0)) 67 (strcmp(test->procname, ref->procname) == 0))
1362 match++; 68 match++;
1363 69
1364 if (test->ctl_name && ref->ctl_name &&
1365 (test->ctl_name == ref->ctl_name))
1366 match++;
1367
1368 if (match) { 70 if (match) {
1369 if (cur_depth != 0) { 71 if (cur_depth != 0) {
1370 cur_depth--; 72 cur_depth--;
@@ -1392,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
1392 *fail = str; 94 *fail = str;
1393} 95}
1394 96
1395static int sysctl_check_dir(struct nsproxy *namespaces,
1396 struct ctl_table *table)
1397{
1398 struct ctl_table *ref;
1399 int error;
1400
1401 error = 0;
1402 ref = sysctl_check_lookup(namespaces, table);
1403 if (ref) {
1404 int match = 0;
1405 if ((!table->procname && !ref->procname) ||
1406 (table->procname && ref->procname &&
1407 (strcmp(table->procname, ref->procname) == 0)))
1408 match++;
1409
1410 if ((!table->ctl_name && !ref->ctl_name) ||
1411 (table->ctl_name && ref->ctl_name &&
1412 (table->ctl_name == ref->ctl_name)))
1413 match++;
1414
1415 if (match != 2) {
1416 printk(KERN_ERR "%s: failed: ", __func__);
1417 sysctl_print_path(table);
1418 printk(" ref: ");
1419 sysctl_print_path(ref);
1420 printk("\n");
1421 error = -EINVAL;
1422 }
1423 }
1424 return error;
1425}
1426
1427static void sysctl_check_leaf(struct nsproxy *namespaces, 97static void sysctl_check_leaf(struct nsproxy *namespaces,
1428 struct ctl_table *table, const char **fail) 98 struct ctl_table *table, const char **fail)
1429{ 99{
@@ -1434,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
1434 set_fail(fail, table, "Sysctl already exists"); 104 set_fail(fail, table, "Sysctl already exists");
1435} 105}
1436 106
1437static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1438{
1439 const struct trans_ctl_table *ref;
1440
1441 ref = sysctl_binary_lookup(table);
1442 if (table->ctl_name && !ref)
1443 set_fail(fail, table, "Unknown sysctl binary path");
1444 if (ref) {
1445 if (ref->procname &&
1446 (!table->procname ||
1447 (strcmp(table->procname, ref->procname) != 0)))
1448 set_fail(fail, table, "procname does not match binary path procname");
1449
1450 if (ref->ctl_name && table->ctl_name &&
1451 (table->ctl_name != ref->ctl_name))
1452 set_fail(fail, table, "ctl_name does not match binary path ctl_name");
1453 }
1454}
1455
1456int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) 107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1457{ 108{
1458 int error = 0; 109 int error = 0;
1459 for (; table->ctl_name || table->procname; table++) { 110 for (; table->procname; table++) {
1460 const char *fail = NULL; 111 const char *fail = NULL;
1461 112
1462 sysctl_repair_table(table);
1463 if (table->parent) { 113 if (table->parent) {
1464 if (table->procname && !table->parent->procname) 114 if (table->procname && !table->parent->procname)
1465 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
1466 if (table->ctl_name && !table->parent->ctl_name)
1467 set_fail(&fail, table, "Parent without ctl_name");
1468 } 116 }
1469 if (!table->procname) 117 if (!table->procname)
1470 set_fail(&fail, table, "No procname"); 118 set_fail(&fail, table, "No procname");
@@ -1477,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1477 set_fail(&fail, table, "Writable sysctl directory"); 125 set_fail(&fail, table, "Writable sysctl directory");
1478 if (table->proc_handler) 126 if (table->proc_handler)
1479 set_fail(&fail, table, "Directory with proc_handler"); 127 set_fail(&fail, table, "Directory with proc_handler");
1480 if (table->strategy)
1481 set_fail(&fail, table, "Directory with strategy");
1482 if (table->extra1) 128 if (table->extra1)
1483 set_fail(&fail, table, "Directory with extra1"); 129 set_fail(&fail, table, "Directory with extra1");
1484 if (table->extra2) 130 if (table->extra2)
1485 set_fail(&fail, table, "Directory with extra2"); 131 set_fail(&fail, table, "Directory with extra2");
1486 if (sysctl_check_dir(namespaces, table))
1487 set_fail(&fail, table, "Inconsistent directory names");
1488 } else { 132 } else {
1489 if ((table->strategy == sysctl_data) || 133 if ((table->proc_handler == proc_dostring) ||
1490 (table->strategy == sysctl_string) ||
1491 (table->strategy == sysctl_intvec) ||
1492 (table->strategy == sysctl_jiffies) ||
1493 (table->strategy == sysctl_ms_jiffies) ||
1494 (table->proc_handler == proc_dostring) ||
1495 (table->proc_handler == proc_dointvec) || 134 (table->proc_handler == proc_dointvec) ||
1496 (table->proc_handler == proc_dointvec_minmax) || 135 (table->proc_handler == proc_dointvec_minmax) ||
1497 (table->proc_handler == proc_dointvec_jiffies) || 136 (table->proc_handler == proc_dointvec_jiffies) ||
@@ -1513,14 +152,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1513 set_fail(&fail, table, "No max"); 152 set_fail(&fail, table, "No max");
1514 } 153 }
1515 } 154 }
1516#ifdef CONFIG_SYSCTL_SYSCALL
1517 if (table->ctl_name && !table->strategy)
1518 set_fail(&fail, table, "Missing strategy");
1519#endif
1520#if 0
1521 if (!table->ctl_name && table->strategy)
1522 set_fail(&fail, table, "Strategy without ctl_name");
1523#endif
1524#ifdef CONFIG_PROC_SYSCTL 155#ifdef CONFIG_PROC_SYSCTL
1525 if (table->procname && !table->proc_handler) 156 if (table->procname && !table->proc_handler)
1526 set_fail(&fail, table, "No proc_handler"); 157 set_fail(&fail, table, "No proc_handler");
@@ -1531,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1531#endif 162#endif
1532 sysctl_check_leaf(namespaces, table, &fail); 163 sysctl_check_leaf(namespaces, table, &fail);
1533 } 164 }
1534 sysctl_check_bin_path(table, &fail);
1535 if (table->mode > 0777) 165 if (table->mode > 0777)
1536 set_fail(&fail, table, "bogus .mode"); 166 set_fail(&fail, table, "bogus .mode");
1537 if (fail) { 167 if (fail) {
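
With the binary translation tables, the ->strategy checks and sysctl_check_bin_path() removed, sysctl_check_table() now validates entries purely by procname and proc_handler. A minimal sketch of a table that satisfies the remaining checks (the variable and table names here are illustrative, not part of this patch):

/* Illustrative only: a procname-only sysctl entry as the reduced checker expects. */
static int example_value;			/* assumed example variable */

static struct ctl_table example_table[] = {
	{
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,	/* a handler the checker recognizes */
	},
	{ }	/* procname == NULL terminates the for (; table->procname; table++) loop */
};
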
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fec..c6324d96009e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
136 write_seqlock_irq(&xtime_lock); 136 write_seqlock_irq(&xtime_lock);
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 138 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
139 update_xtime_cache(0);
140 write_sequnlock_irq(&xtime_lock); 139 write_sequnlock_irq(&xtime_lock);
141 clock_was_set(); 140 clock_was_set();
142} 141}
@@ -662,6 +661,36 @@ u64 nsec_to_clock_t(u64 x)
662#endif 661#endif
663} 662}
664 663
664/**
665 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
666 *
667 * @n: nsecs in u64
668 *
669 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
670 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
671 * for scheduler, not for use in device drivers to calculate timeout value.
672 *
673 * note:
674 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
675 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
676 */
677unsigned long nsecs_to_jiffies(u64 n)
678{
679#if (NSEC_PER_SEC % HZ) == 0
680 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
681 return div_u64(n, NSEC_PER_SEC / HZ);
682#elif (HZ % 512) == 0
683 /* overflow after 292 years if HZ = 1024 */
684 return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
685#else
686 /*
687 * Generic case - optimized for cases where HZ is a multiple of 3.
688 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
689 */
690 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
691#endif
692}
693
665#if (BITS_PER_LONG < 64) 694#if (BITS_PER_LONG < 64)
666u64 get_jiffies_64(void) 695u64 get_jiffies_64(void)
667{ 696{
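
As a quick check of the arithmetic in the common (NSEC_PER_SEC % HZ) == 0 branch of nsecs_to_jiffies(), here is a userspace sketch assuming HZ = 1000 (the concrete values are illustrative, not taken from the patch):

/* Illustrative only: the (NSEC_PER_SEC % HZ) == 0 branch for HZ = 1000. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		1000

static unsigned long nsecs_to_jiffies_example(uint64_t n)
{
	return (unsigned long)(n / (NSEC_PER_SEC / HZ));	/* 1 jiffy == 1,000,000 ns */
}

int main(void)
{
	printf("%lu\n", nsecs_to_jiffies_example(2500000ULL));	/* prints 2: 2.5 ms rounds down */
	return 0;
}
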
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc32..3d5fc0fd1cca 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h> 21#include <linux/tick.h>
22 22
23#include "tick-internal.h"
24
23/* The registered clock event devices */ 25/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 26static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 27static LIST_HEAD(clockevents_released);
@@ -28,7 +30,7 @@ static LIST_HEAD(clockevents_released);
28static RAW_NOTIFIER_HEAD(clockevents_chain); 30static RAW_NOTIFIER_HEAD(clockevents_chain);
29 31
30/* Protection for the above */ 32/* Protection for the above */
31static DEFINE_SPINLOCK(clockevents_lock); 33static DEFINE_RAW_SPINLOCK(clockevents_lock);
32 34
33/** 35/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 36 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
37 * 39 *
38 * Math helper, returns latch value converted to nanoseconds (bound checked) 40 * Math helper, returns latch value converted to nanoseconds (bound checked)
39 */ 41 */
40unsigned long clockevent_delta2ns(unsigned long latch, 42u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
41 struct clock_event_device *evt)
42{ 43{
43 u64 clc = ((u64) latch << evt->shift); 44 u64 clc = (u64) latch << evt->shift;
44 45
45 if (unlikely(!evt->mult)) { 46 if (unlikely(!evt->mult)) {
46 evt->mult = 1; 47 evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
50 do_div(clc, evt->mult); 51 do_div(clc, evt->mult);
51 if (clc < 1000) 52 if (clc < 1000)
52 clc = 1000; 53 clc = 1000;
53 if (clc > LONG_MAX) 54 if (clc > KTIME_MAX)
54 clc = LONG_MAX; 55 clc = KTIME_MAX;
55 56
56 return (unsigned long) clc; 57 return clc;
57} 58}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns); 59EXPORT_SYMBOL_GPL(clockevent_delta2ns);
59 60
@@ -140,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb)
140 unsigned long flags; 141 unsigned long flags;
141 int ret; 142 int ret;
142 143
143 spin_lock_irqsave(&clockevents_lock, flags); 144 raw_spin_lock_irqsave(&clockevents_lock, flags);
144 ret = raw_notifier_chain_register(&clockevents_chain, nb); 145 ret = raw_notifier_chain_register(&clockevents_chain, nb);
145 spin_unlock_irqrestore(&clockevents_lock, flags); 146 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
146 147
147 return ret; 148 return ret;
148} 149}
@@ -184,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev)
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 186 BUG_ON(!dev->cpumask);
186 187
187 spin_lock_irqsave(&clockevents_lock, flags); 188 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 189
189 list_add(&dev->list, &clockevent_devices); 190 list_add(&dev->list, &clockevent_devices);
190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 191 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
191 clockevents_notify_released(); 192 clockevents_notify_released();
192 193
193 spin_unlock_irqrestore(&clockevents_lock, flags); 194 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
194} 195}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 196EXPORT_SYMBOL_GPL(clockevents_register_device);
196 197
@@ -240,7 +241,7 @@ void clockevents_notify(unsigned long reason, void *arg)
240 struct list_head *node, *tmp; 241 struct list_head *node, *tmp;
241 unsigned long flags; 242 unsigned long flags;
242 243
243 spin_lock_irqsave(&clockevents_lock, flags); 244 raw_spin_lock_irqsave(&clockevents_lock, flags);
244 clockevents_do_notify(reason, arg); 245 clockevents_do_notify(reason, arg);
245 246
246 switch (reason) { 247 switch (reason) {
@@ -255,7 +256,7 @@ void clockevents_notify(unsigned long reason, void *arg)
255 default: 256 default:
256 break; 257 break;
257 } 258 }
258 spin_unlock_irqrestore(&clockevents_lock, flags); 259 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
259} 260}
260EXPORT_SYMBOL_GPL(clockevents_notify); 261EXPORT_SYMBOL_GPL(clockevents_notify);
261#endif 262#endif
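
The clockevent_delta2ns() change widens the return type to u64 and clamps to KTIME_MAX instead of LONG_MAX. A userspace sketch of the same math with made-up mult/shift values (KTIME_MAX is assumed to equal INT64_MAX, as in the kernel):

/* Illustrative only: latch-to-nanoseconds conversion with assumed factors. */
#include <stdio.h>
#include <stdint.h>

static uint64_t delta2ns_example(unsigned long latch, uint32_t mult, uint32_t shift)
{
	uint64_t clc = (uint64_t)latch << shift;

	clc /= mult;			/* the kernel uses do_div() here */
	if (clc < 1000)
		clc = 1000;		/* lower bound, as in the patch */
	if (clc > (uint64_t)INT64_MAX)
		clc = INT64_MAX;	/* clamp to KTIME_MAX rather than LONG_MAX */
	return clc;
}

int main(void)
{
	/* assumed device: mult = 1 << 10, shift = 20, i.e. ~1024 ns per device tick */
	printf("%llu ns\n", (unsigned long long)delta2ns_example(5000, 1u << 10, 20));
	return 0;
}
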
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5e18c6ab2c6a..e85c23404d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -39,7 +39,7 @@ void timecounter_init(struct timecounter *tc,
39 tc->cycle_last = cc->read(cc); 39 tc->cycle_last = cc->read(cc);
40 tc->nsec = start_tstamp; 40 tc->nsec = start_tstamp;
41} 41}
42EXPORT_SYMBOL(timecounter_init); 42EXPORT_SYMBOL_GPL(timecounter_init);
43 43
44/** 44/**
45 * timecounter_read_delta - get nanoseconds since last call of this function 45 * timecounter_read_delta - get nanoseconds since last call of this function
@@ -83,7 +83,7 @@ u64 timecounter_read(struct timecounter *tc)
83 83
84 return nsec; 84 return nsec;
85} 85}
86EXPORT_SYMBOL(timecounter_read); 86EXPORT_SYMBOL_GPL(timecounter_read);
87 87
88u64 timecounter_cyc2time(struct timecounter *tc, 88u64 timecounter_cyc2time(struct timecounter *tc,
89 cycle_t cycle_tstamp) 89 cycle_t cycle_tstamp)
@@ -105,7 +105,60 @@ u64 timecounter_cyc2time(struct timecounter *tc,
105 105
106 return nsec; 106 return nsec;
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL_GPL(timecounter_cyc2time);
109
110/**
111 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
112 * @mult: pointer to mult variable
113 * @shift: pointer to shift variable
114 * @from: frequency to convert from
115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds
117 *
118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents.
120 *
121 * @to and @from are frequency values in HZ. For clock sources @to is
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 *
125 * The @minsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is
129 * multiplied with the calculated mult factor. Larger ranges may
 130 * multiplied with the calculated mult factor. Larger ranges may
 131 * reduce the conversion accuracy by choosing smaller mult and shift
131 * factors.
132 */
133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
135{
136 u64 tmp;
137 u32 sft, sftacc= 32;
138
139 /*
140 * Calculate the shift factor which is limiting the conversion
141 * range:
142 */
143 tmp = ((u64)minsec * from) >> 32;
144 while (tmp) {
145 tmp >>=1;
146 sftacc--;
147 }
148
149 /*
150 * Find the conversion shift/mult pair which has the best
 151 * accuracy and fits the minsec conversion range:
152 */
153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft;
155 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0)
157 break;
158 }
159 *mult = tmp;
160 *shift = sft;
161}
109 162
110/*[Clocksource internal variables]--------- 163/*[Clocksource internal variables]---------
111 * curr_clocksource: 164 * curr_clocksource:
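
To see what the new helper produces, the sketch below re-derives mult/shift in userspace for an assumed 1 MHz counter converted to nanoseconds over a 600-second range (all concrete numbers are assumptions for illustration):

/* Illustrative re-derivation of the mult/shift search in clocks_calc_mult_shift(). */
#include <stdio.h>
#include <stdint.h>

static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint32_t from, uint32_t to, uint32_t minsec)
{
	uint64_t tmp;
	uint32_t sft, sftacc = 32;

	/* limit the shift so (minsec * from) << shift cannot overflow 64 bits */
	tmp = ((uint64_t)minsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}
	/* pick the largest shift whose mult still fits the guaranteed range */
	for (sft = 32; sft > 0; sft--) {
		tmp = ((uint64_t)to << sft) / from;	/* kernel uses do_div() */
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = (uint32_t)tmp;
	*shift = sft;
}

int main(void)
{
	uint32_t mult, shift;

	calc_mult_shift(&mult, &shift, 1000000, 1000000000, 600);
	/* ns = (cycles * mult) >> shift; for 1 MHz this comes out to 1000 ns/cycle */
	printf("mult=%u shift=%u -> %llu ns per cycle\n", mult, shift,
	       (unsigned long long)(((uint64_t)1 * mult) >> shift));
	return 0;
}
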
@@ -413,6 +466,47 @@ void clocksource_touch_watchdog(void)
413 clocksource_resume_watchdog(); 466 clocksource_resume_watchdog();
414} 467}
415 468
469/**
470 * clocksource_max_deferment - Returns max time the clocksource can be deferred
471 * @cs: Pointer to clocksource
472 *
473 */
474static u64 clocksource_max_deferment(struct clocksource *cs)
475{
476 u64 max_nsecs, max_cycles;
477
478 /*
479 * Calculate the maximum number of cycles that we can pass to the
480 * cyc2ns function without overflowing a 64-bit signed result. The
481 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
482 * is equivalent to the below.
483 * max_cycles < (2^63)/cs->mult
484 * max_cycles < 2^(log2((2^63)/cs->mult))
485 * max_cycles < 2^(log2(2^63) - log2(cs->mult))
486 * max_cycles < 2^(63 - log2(cs->mult))
487 * max_cycles < 1 << (63 - log2(cs->mult))
488 * Please note that we add 1 to the result of the log2 to account for
489 * any rounding errors, ensure the above inequality is satisfied and
490 * no overflow will occur.
491 */
492 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
493
494 /*
495 * The actual maximum number of cycles we can defer the clocksource is
496 * determined by the minimum of max_cycles and cs->mask.
497 */
498 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
499 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
500
501 /*
502 * To ensure that the clocksource does not wrap whilst we are idle,
503 * limit the time the clocksource can be deferred by 12.5%. Please
504 * note a margin of 12.5% is used because this can be computed with
505 * a shift, versus say 10% which would require division.
506 */
507 return max_nsecs - (max_nsecs >> 5);
508}
509
416#ifdef CONFIG_GENERIC_TIME 510#ifdef CONFIG_GENERIC_TIME
417 511
418/** 512/**
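
For a feel for the resulting max_idle_ns, the sketch below plugs in an assumed 32-bit clocksource whose mult/shift make cyc2ns the identity (mult = 1 << 22, shift = 22); the margin subtracted at the end mirrors the patch's max_nsecs >> 5:

/* Illustrative only: max deferment for an assumed 32-bit clocksource. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t mult = 1u << 22, shift = 22;		/* assumed clocksource factors */
	uint64_t mask = 0xffffffffULL;			/* assumed 32-bit counter */
	unsigned int log2_mult = 22;			/* ilog2(mult) */

	uint64_t max_cycles = 1ULL << (63 - (log2_mult + 1));	/* 2^40: no 64-bit overflow */
	if (max_cycles > mask)
		max_cycles = mask;			/* the counter wraps first */

	uint64_t max_nsecs = (max_cycles * mult) >> shift;	/* cyc2ns: ~4.29e9 ns */
	max_nsecs -= max_nsecs >> 5;			/* safety margin, as in the patch */

	printf("max idle ~ %.2f s\n", max_nsecs / 1e9);	/* about 4.16 s */
	return 0;
}
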
@@ -511,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs)
511 */ 605 */
512int clocksource_register(struct clocksource *cs) 606int clocksource_register(struct clocksource *cs)
513{ 607{
608 /* calculate max idle time permitted for this clocksource */
609 cs->max_idle_ns = clocksource_max_deferment(cs);
610
514 mutex_lock(&clocksource_mutex); 611 mutex_lock(&clocksource_mutex);
515 clocksource_enqueue(cs); 612 clocksource_enqueue(cs);
516 clocksource_select(); 613 clocksource_select();
@@ -580,7 +677,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
580 * @count: length of buffer 677 * @count: length of buffer
581 * 678 *
582 * Takes input from sysfs interface for manually overriding the default 679 * Takes input from sysfs interface for manually overriding the default
583 * clocksource selction. 680 * clocksource selection.
584 */ 681 */
585static ssize_t sysfs_override_clocksource(struct sys_device *dev, 682static ssize_t sysfs_override_clocksource(struct sys_device *dev,
586 struct sysdev_attribute *attr, 683 struct sysdev_attribute *attr,
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c2ec25087a35..b3bafd5fc66d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 35static int tick_broadcast_force;
36 36
37#ifdef CONFIG_TICK_ONESHOT 37#ifdef CONFIG_TICK_ONESHOT
@@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
96 unsigned long flags; 96 unsigned long flags;
97 int ret = 0; 97 int ret = 0;
98 98
99 spin_lock_irqsave(&tick_broadcast_lock, flags); 99 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
100 100
101 /* 101 /*
102 * Devices might be registered with both periodic and oneshot 102 * Devices might be registered with both periodic and oneshot
@@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
122 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
123 } 123 }
124 } 124 }
125 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 125 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
126 return ret; 126 return ret;
127} 127}
128 128
@@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask)
161 */ 161 */
162static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
163{ 163{
164 spin_lock(&tick_broadcast_lock); 164 raw_spin_lock(&tick_broadcast_lock);
165 165
166 cpumask_and(to_cpumask(tmpmask), 166 cpumask_and(to_cpumask(tmpmask),
167 cpu_online_mask, tick_get_broadcast_mask()); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask)); 168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 raw_spin_unlock(&tick_broadcast_lock);
171} 171}
172 172
173/* 173/*
@@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
212 unsigned long flags; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
216 216
217 cpu = smp_processor_id(); 217 cpu = smp_processor_id();
218 td = &per_cpu(tick_cpu_device, cpu); 218 td = &per_cpu(tick_cpu_device, cpu);
@@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 tick_broadcast_setup_oneshot(bc); 263 tick_broadcast_setup_oneshot(bc);
264 } 264 }
265out: 265out:
266 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 266 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
267} 267}
268 268
269/* 269/*
@@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
299 unsigned long flags; 299 unsigned long flags;
300 unsigned int cpu = *cpup; 300 unsigned int cpu = *cpup;
301 301
302 spin_lock_irqsave(&tick_broadcast_lock, flags); 302 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
303 303
304 bc = tick_broadcast_device.evtdev; 304 bc = tick_broadcast_device.evtdev;
305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
@@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
309 clockevents_shutdown(bc); 309 clockevents_shutdown(bc);
310 } 310 }
311 311
312 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 312 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
313} 313}
314 314
315void tick_suspend_broadcast(void) 315void tick_suspend_broadcast(void)
@@ -317,13 +317,13 @@ void tick_suspend_broadcast(void)
317 struct clock_event_device *bc; 317 struct clock_event_device *bc;
318 unsigned long flags; 318 unsigned long flags;
319 319
320 spin_lock_irqsave(&tick_broadcast_lock, flags); 320 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
321 321
322 bc = tick_broadcast_device.evtdev; 322 bc = tick_broadcast_device.evtdev;
323 if (bc) 323 if (bc)
324 clockevents_shutdown(bc); 324 clockevents_shutdown(bc);
325 325
326 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 326 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
327} 327}
328 328
329int tick_resume_broadcast(void) 329int tick_resume_broadcast(void)
@@ -332,7 +332,7 @@ int tick_resume_broadcast(void)
332 unsigned long flags; 332 unsigned long flags;
333 int broadcast = 0; 333 int broadcast = 0;
334 334
335 spin_lock_irqsave(&tick_broadcast_lock, flags); 335 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
336 336
337 bc = tick_broadcast_device.evtdev; 337 bc = tick_broadcast_device.evtdev;
338 338
@@ -351,7 +351,7 @@ int tick_resume_broadcast(void)
351 break; 351 break;
352 } 352 }
353 } 353 }
354 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 354 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
355 355
356 return broadcast; 356 return broadcast;
357} 357}
@@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
405 ktime_t now, next_event; 405 ktime_t now, next_event;
406 int cpu; 406 int cpu;
407 407
408 spin_lock(&tick_broadcast_lock); 408 raw_spin_lock(&tick_broadcast_lock);
409again: 409again:
410 dev->next_event.tv64 = KTIME_MAX; 410 dev->next_event.tv64 = KTIME_MAX;
411 next_event.tv64 = KTIME_MAX; 411 next_event.tv64 = KTIME_MAX;
@@ -443,7 +443,7 @@ again:
443 if (tick_broadcast_set_event(next_event, 0)) 443 if (tick_broadcast_set_event(next_event, 0))
444 goto again; 444 goto again;
445 } 445 }
446 spin_unlock(&tick_broadcast_lock); 446 raw_spin_unlock(&tick_broadcast_lock);
447} 447}
448 448
449/* 449/*
@@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 457 unsigned long flags;
458 int cpu; 458 int cpu;
459 459
460 spin_lock_irqsave(&tick_broadcast_lock, flags); 460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461 461
462 /* 462 /*
463 * Periodic mode does not care about the enter/exit of power 463 * Periodic mode does not care about the enter/exit of power
@@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
492 } 492 }
493 493
494out: 494out:
495 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 496}
497 497
498/* 498/*
@@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void)
563 struct clock_event_device *bc; 563 struct clock_event_device *bc;
564 unsigned long flags; 564 unsigned long flags;
565 565
566 spin_lock_irqsave(&tick_broadcast_lock, flags); 566 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
567 567
568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
569 bc = tick_broadcast_device.evtdev; 569 bc = tick_broadcast_device.evtdev;
570 if (bc) 570 if (bc)
571 tick_broadcast_setup_oneshot(bc); 571 tick_broadcast_setup_oneshot(bc);
572 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 572 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
573} 573}
574 574
575 575
@@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
581 unsigned long flags; 581 unsigned long flags;
582 unsigned int cpu = *cpup; 582 unsigned int cpu = *cpup;
583 583
584 spin_lock_irqsave(&tick_broadcast_lock, flags); 584 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
585 585
586 /* 586 /*
587 * Clear the broadcast mask flag for the dead cpu, but do not 587 * Clear the broadcast mask flag for the dead cpu, but do not
@@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
589 */ 589 */
590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
591 591
592 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 592 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
593} 593}
594 594
595/* 595/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 83c4417b6a3c..b6b898d2eeef 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37static DEFINE_RAW_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
40 * Debugging: see timer_list.c 40 * Debugging: see timer_list.c
@@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
209 int cpu, ret = NOTIFY_OK; 209 int cpu, ret = NOTIFY_OK;
210 unsigned long flags; 210 unsigned long flags;
211 211
212 spin_lock_irqsave(&tick_device_lock, flags); 212 raw_spin_lock_irqsave(&tick_device_lock, flags);
213 213
214 cpu = smp_processor_id(); 214 cpu = smp_processor_id();
215 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 215 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
269 tick_oneshot_notify(); 269 tick_oneshot_notify();
270 270
271 spin_unlock_irqrestore(&tick_device_lock, flags); 271 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
272 return NOTIFY_STOP; 272 return NOTIFY_STOP;
273 273
274out_bc: 274out_bc:
@@ -278,7 +278,7 @@ out_bc:
278 if (tick_check_broadcast_device(newdev)) 278 if (tick_check_broadcast_device(newdev))
279 ret = NOTIFY_STOP; 279 ret = NOTIFY_STOP;
280 280
281 spin_unlock_irqrestore(&tick_device_lock, flags); 281 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
282 282
283 return ret; 283 return ret;
284} 284}
@@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup)
311 struct clock_event_device *dev = td->evtdev; 311 struct clock_event_device *dev = td->evtdev;
312 unsigned long flags; 312 unsigned long flags;
313 313
314 spin_lock_irqsave(&tick_device_lock, flags); 314 raw_spin_lock_irqsave(&tick_device_lock, flags);
315 td->mode = TICKDEV_MODE_PERIODIC; 315 td->mode = TICKDEV_MODE_PERIODIC;
316 if (dev) { 316 if (dev) {
317 /* 317 /*
@@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup)
322 clockevents_exchange_device(dev, NULL); 322 clockevents_exchange_device(dev, NULL);
323 td->evtdev = NULL; 323 td->evtdev = NULL;
324 } 324 }
325 spin_unlock_irqrestore(&tick_device_lock, flags); 325 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
326} 326}
327 327
328static void tick_suspend(void) 328static void tick_suspend(void)
@@ -330,9 +330,9 @@ static void tick_suspend(void)
330 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 330 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
331 unsigned long flags; 331 unsigned long flags;
332 332
333 spin_lock_irqsave(&tick_device_lock, flags); 333 raw_spin_lock_irqsave(&tick_device_lock, flags);
334 clockevents_shutdown(td->evtdev); 334 clockevents_shutdown(td->evtdev);
335 spin_unlock_irqrestore(&tick_device_lock, flags); 335 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
336} 336}
337 337
338static void tick_resume(void) 338static void tick_resume(void)
@@ -341,7 +341,7 @@ static void tick_resume(void)
341 unsigned long flags; 341 unsigned long flags;
342 int broadcast = tick_resume_broadcast(); 342 int broadcast = tick_resume_broadcast();
343 343
344 spin_lock_irqsave(&tick_device_lock, flags); 344 raw_spin_lock_irqsave(&tick_device_lock, flags);
345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
346 346
347 if (!broadcast) { 347 if (!broadcast) {
@@ -350,7 +350,7 @@ static void tick_resume(void)
350 else 350 else
351 tick_resume_oneshot(); 351 tick_resume_oneshot();
352 } 352 }
353 spin_unlock_irqrestore(&tick_device_lock, flags); 353 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
354} 354}
355 355
356/* 356/*
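
The locking changes in clockevents.c, tick-broadcast.c and tick-common.c all follow the same mechanical pattern of moving the core tick locks to the raw spinlock API; summarized below as an illustration of the pattern (the lock name is hypothetical, this is not an additional hunk):

/* Pattern of the conversion (illustrative summary):
 *
 *	-static DEFINE_SPINLOCK(some_tick_lock);
 *	+static DEFINE_RAW_SPINLOCK(some_tick_lock);
 *
 *	-	spin_lock_irqsave(&some_tick_lock, flags);
 *	+	raw_spin_lock_irqsave(&some_tick_lock, flags);
 *		...
 *	-	spin_unlock_irqrestore(&some_tick_lock, flags);
 *	+	raw_spin_unlock_irqrestore(&some_tick_lock, flags);
 */
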
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b1c05bf75ee0..290eefbc1f60 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,7 +6,6 @@
6#define TICK_DO_TIMER_BOOT -2 6#define TICK_DO_TIMER_BOOT -2
7 7
8DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
9extern spinlock_t tick_device_lock;
10extern ktime_t tick_next_period; 9extern ktime_t tick_next_period;
11extern ktime_t tick_period; 10extern ktime_t tick_period;
12extern int tick_do_timer_cpu __read_mostly; 11extern int tick_do_timer_cpu __read_mostly;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89cf..0a8a213016f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51 51
52 printk(KERN_WARNING 52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n", 53 "CE: %s increasing min_delta_ns to %llu nsec\n",
54 dev->name ? dev->name : "?", 54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1); 55 (unsigned long long) dev->min_delta_ns << 1);
56 56
57 i = 0; 57 i = 0;
58 } 58 }
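
The printk change goes with min_delta_ns becoming u64: depending on the architecture, u64 may be unsigned long or unsigned long long, so the portable idiom is to cast to unsigned long long and print with %llu. A userspace illustration with an assumed value:

/* Illustrative only: printing a u64 portably. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t min_delta_ns = 20000;	/* assumed value */

	/* the cast keeps one format string working on both 32- and 64-bit builds */
	printf("increasing min_delta_ns to %llu nsec\n",
	       (unsigned long long)(min_delta_ns << 1));
	return 0;
}
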
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 89aed5933ed4..f992762d7f51 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137static void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(ktime_t now)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
141 unsigned long flags; 141 unsigned long flags;
142 ktime_t now;
143
144 if (!ts->tick_stopped)
145 return;
146 142
147 cpumask_clear_cpu(cpu, nohz_cpu_mask); 143 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get();
149 ts->idle_waketime = now; 144 ts->idle_waketime = now;
150 145
151 local_irq_save(flags); 146 local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
155 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
156} 151}
157 152
158static void tick_nohz_stop_idle(int cpu) 153static void tick_nohz_stop_idle(int cpu, ktime_t now)
159{ 154{
160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
161 157
162 if (ts->idle_active) { 158 delta = ktime_sub(now, ts->idle_entrytime);
163 ktime_t now, delta; 159 ts->idle_lastupdate = now;
164 now = ktime_get(); 160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 delta = ktime_sub(now, ts->idle_entrytime); 161 ts->idle_active = 0;
166 ts->idle_lastupdate = now;
167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
168 ts->idle_active = 0;
169 162
170 sched_clock_idle_wakeup_event(0); 163 sched_clock_idle_wakeup_event(0);
171 }
172} 164}
173 165
174static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 166static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
216 struct tick_sched *ts; 208 struct tick_sched *ts;
217 ktime_t last_update, expires, now; 209 ktime_t last_update, expires, now;
218 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 210 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
211 u64 time_delta;
219 int cpu; 212 int cpu;
220 213
221 local_irq_save(flags); 214 local_irq_save(flags);
@@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
263 256
264 if (ratelimit < 10) { 257 if (ratelimit < 10) {
265 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 258 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
266 local_softirq_pending()); 259 (unsigned int) local_softirq_pending());
267 ratelimit++; 260 ratelimit++;
268 } 261 }
269 goto end; 262 goto end;
@@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
275 seq = read_seqbegin(&xtime_lock); 268 seq = read_seqbegin(&xtime_lock);
276 last_update = last_jiffies_update; 269 last_update = last_jiffies_update;
277 last_jiffies = jiffies; 270 last_jiffies = jiffies;
271 time_delta = timekeeping_max_deferment();
278 } while (read_seqretry(&xtime_lock, seq)); 272 } while (read_seqretry(&xtime_lock, seq));
279 273
280 /* Get the next timer wheel timer */ 274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
281 next_jiffies = get_next_timer_interrupt(last_jiffies); 275 arch_needs_cpu(cpu)) {
282 delta_jiffies = next_jiffies - last_jiffies; 276 next_jiffies = last_jiffies + 1;
283
284 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
285 delta_jiffies = 1; 277 delta_jiffies = 1;
278 } else {
279 /* Get the next timer wheel timer */
280 next_jiffies = get_next_timer_interrupt(last_jiffies);
281 delta_jiffies = next_jiffies - last_jiffies;
282 }
286 /* 283 /*
287 * Do not stop the tick, if we are only one off 284 * Do not stop the tick, if we are only one off
288 * or if the cpu is required for rcu 285 * or if the cpu is required for rcu
@@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
294 if ((long)delta_jiffies >= 1) { 291 if ((long)delta_jiffies >= 1) {
295 292
296 /* 293 /*
297 * calculate the expiry time for the next timer wheel
298 * timer
299 */
300 expires = ktime_add_ns(last_update, tick_period.tv64 *
301 delta_jiffies);
302
303 /*
304 * If this cpu is the one which updates jiffies, then 294 * If this cpu is the one which updates jiffies, then
305 * give up the assignment and let it be taken by the 295 * give up the assignment and let it be taken by the
306 * cpu which runs the tick timer next, which might be 296 * cpu which runs the tick timer next, which might be
307 * this cpu as well. If we don't drop this here the 297 * this cpu as well. If we don't drop this here the
308 * jiffies might be stale and do_timer() never 298 * jiffies might be stale and do_timer() never
309 * invoked. 299 * invoked. Keep track of the fact that it was the one
300 * which had the do_timer() duty last. If this cpu is
301 * the one which had the do_timer() duty last, we
302 * limit the sleep time to the timekeeping
303 * max_deferment value which we retrieved
304 * above. Otherwise we can sleep as long as we want.
310 */ 305 */
311 if (cpu == tick_do_timer_cpu) 306 if (cpu == tick_do_timer_cpu) {
312 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 307 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
308 ts->do_timer_last = 1;
309 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
310 time_delta = KTIME_MAX;
311 ts->do_timer_last = 0;
312 } else if (!ts->do_timer_last) {
313 time_delta = KTIME_MAX;
314 }
315
316 /*
317 * calculate the expiry time for the next timer wheel
318 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
319 * that there is no timer pending or at least extremely
320 * far into the future (12 days for HZ=1000). In this
321 * case we set the expiry to the end of time.
322 */
323 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
324 /*
325 * Calculate the time delta for the next timer event.
326 * If the time delta exceeds the maximum time delta
327 * permitted by the current clocksource then adjust
328 * the time delta accordingly to ensure the
329 * clocksource does not wrap.
330 */
331 time_delta = min_t(u64, time_delta,
332 tick_period.tv64 * delta_jiffies);
333 }
334
335 if (time_delta < KTIME_MAX)
336 expires = ktime_add_ns(last_update, time_delta);
337 else
338 expires.tv64 = KTIME_MAX;
313 339
314 if (delta_jiffies > 1) 340 if (delta_jiffies > 1)
315 cpumask_set_cpu(cpu, nohz_cpu_mask); 341 cpumask_set_cpu(cpu, nohz_cpu_mask);
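The rewritten expiry calculation above amounts to: take the timer-wheel target, but never sleep longer than the clocksource's maximum deferment, and fall back to KTIME_MAX (tick fully stopped) only when nothing bounds the sleep. A compressed userspace sketch of that decision; all constants (HZ, the deferment limit, the jiffies delta) are invented for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define KTIME_MAX    INT64_MAX
    #define NSEC_PER_SEC 1000000000LL

    int main(void)
    {
        int64_t tick_period   = NSEC_PER_SEC / 1000;   /* HZ=1000 */
        int64_t last_update   = 5 * NSEC_PER_SEC;      /* arbitrary "last jiffies update" */
        int64_t delta_jiffies = 250;                   /* next timer-wheel timer, in ticks */
        int64_t time_delta    = 100 * tick_period;     /* pretend clocksource max deferment */
        int64_t expires;

        /* The timer-wheel target caps the sleep further, when one exists. */
        int64_t wheel_delta = tick_period * delta_jiffies;
        if (wheel_delta < time_delta)
            time_delta = wheel_delta;

        if (time_delta < KTIME_MAX)
            expires = last_update + time_delta;        /* program the next wakeup */
        else
            expires = KTIME_MAX;                       /* nothing pending: stop the tick */

        printf("sleep bounded to %lld ns, expires at %lld ns\n",
               (long long)time_delta, (long long)expires);
        return 0;
    }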
@@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
342 368
343 ts->idle_sleeps++; 369 ts->idle_sleeps++;
344 370
371 /* Mark expires */
372 ts->idle_expires = expires;
373
345 /* 374 /*
346 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 375 * If the expiration time == KTIME_MAX, then
347 * there is no timer pending or at least extremly far 376 * in this case we simply stop the tick timer.
348 * into the future (12 days for HZ=1000). In this case
349 * we simply stop the tick timer:
350 */ 377 */
351 if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { 378 if (unlikely(expires.tv64 == KTIME_MAX)) {
352 ts->idle_expires.tv64 = KTIME_MAX;
353 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 379 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
354 hrtimer_cancel(&ts->sched_timer); 380 hrtimer_cancel(&ts->sched_timer);
355 goto out; 381 goto out;
356 } 382 }
357 383
358 /* Mark expiries */
359 ts->idle_expires = expires;
360
361 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 384 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
362 hrtimer_start(&ts->sched_timer, expires, 385 hrtimer_start(&ts->sched_timer, expires,
363 HRTIMER_MODE_ABS_PINNED); 386 HRTIMER_MODE_ABS_PINNED);
@@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
436 ktime_t now; 459 ktime_t now;
437 460
438 local_irq_disable(); 461 local_irq_disable();
439 tick_nohz_stop_idle(cpu); 462 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
463 now = ktime_get();
464
465 if (ts->idle_active)
466 tick_nohz_stop_idle(cpu, now);
440 467
441 if (!ts->inidle || !ts->tick_stopped) { 468 if (!ts->inidle || !ts->tick_stopped) {
442 ts->inidle = 0; 469 ts->inidle = 0;
@@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
450 477
451 /* Update jiffies first */ 478 /* Update jiffies first */
452 select_nohz_load_balancer(0); 479 select_nohz_load_balancer(0);
453 now = ktime_get();
454 tick_do_update_jiffies64(now); 480 tick_do_update_jiffies64(now);
455 cpumask_clear_cpu(cpu, nohz_cpu_mask); 481 cpumask_clear_cpu(cpu, nohz_cpu_mask);
456 482
@@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
584 * timer and do not touch the other magic bits which need to be done 610 * timer and do not touch the other magic bits which need to be done
585 * when idle is left. 611 * when idle is left.
586 */ 612 */
587static void tick_nohz_kick_tick(int cpu) 613static void tick_nohz_kick_tick(int cpu, ktime_t now)
588{ 614{
589#if 0 615#if 0
590 /* Switch back to 2.6.27 behaviour */ 616 /* Switch back to 2.6.27 behaviour */
591 617
592 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 618 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
593 ktime_t delta, now; 619 ktime_t delta;
594
595 if (!ts->tick_stopped)
596 return;
597 620
598 /* 621 /*
599 * Do not touch the tick device, when the next expiry is either 622 * Do not touch the tick device, when the next expiry is either
600 * already reached or less/equal than the tick period. 623 * already reached or less/equal than the tick period.
601 */ 624 */
602 now = ktime_get();
603 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 625 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
604 if (delta.tv64 <= tick_period.tv64) 626 if (delta.tv64 <= tick_period.tv64)
605 return; 627 return;
@@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
608#endif 630#endif
609} 631}
610 632
633static inline void tick_check_nohz(int cpu)
634{
635 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
636 ktime_t now;
637
638 if (!ts->idle_active && !ts->tick_stopped)
639 return;
640 now = ktime_get();
641 if (ts->idle_active)
642 tick_nohz_stop_idle(cpu, now);
643 if (ts->tick_stopped) {
644 tick_nohz_update_jiffies(now);
645 tick_nohz_kick_tick(cpu, now);
646 }
647}
648
611#else 649#else
612 650
613static inline void tick_nohz_switch_to_nohz(void) { } 651static inline void tick_nohz_switch_to_nohz(void) { }
652static inline void tick_check_nohz(int cpu) { }
614 653
615#endif /* NO_HZ */ 654#endif /* NO_HZ */
616 655
@@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
620void tick_check_idle(int cpu) 659void tick_check_idle(int cpu)
621{ 660{
622 tick_check_oneshot_broadcast(cpu); 661 tick_check_oneshot_broadcast(cpu);
623#ifdef CONFIG_NO_HZ 662 tick_check_nohz(cpu);
624 tick_nohz_stop_idle(cpu);
625 tick_nohz_update_jiffies();
626 tick_nohz_kick_tick(cpu);
627#endif
628} 663}
629 664
630/* 665/*
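Taken together, the tick-sched hunks replace several independent ktime_get() calls with a single timestamp that is read once and handed to the idle-accounting, jiffies-update and kick-tick helpers. A userspace sketch of that shape; the helper names and the idle/stopped flags are hypothetical, and the point is only the single clock read up front:

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    /* Hypothetical helpers that now receive the timestamp instead of reading it. */
    static void stop_idle_accounting(int cpu, int64_t now)
    {
        printf("cpu%d: idle period ended at %lld ns\n", cpu, (long long)now);
    }

    static void update_jiffies(int64_t now)
    {
        printf("jiffies updated for %lld ns\n", (long long)now);
    }

    static int64_t read_clock_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    int main(void)
    {
        int cpu = 0, idle_active = 1, tick_stopped = 1;

        /* Bail out early when nothing needs the timestamp at all. */
        if (!idle_active && !tick_stopped)
            return 0;

        /* Read the clock exactly once, then pass the value to both consumers. */
        int64_t now = read_clock_ns();
        if (idle_active)
            stop_idle_accounting(cpu, now);
        if (tick_stopped)
            update_jiffies(now);
        return 0;
    }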
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 71e7f1a19156..12f5c55090be 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -40,7 +40,7 @@ ktime_t timecompare_transform(struct timecompare *sync,
40 40
41 return ns_to_ktime(nsec); 41 return ns_to_ktime(nsec);
42} 42}
43EXPORT_SYMBOL(timecompare_transform); 43EXPORT_SYMBOL_GPL(timecompare_transform);
44 44
45int timecompare_offset(struct timecompare *sync, 45int timecompare_offset(struct timecompare *sync,
46 s64 *offset, 46 s64 *offset,
@@ -89,7 +89,7 @@ int timecompare_offset(struct timecompare *sync,
89 * source time 89 * source time
90 */ 90 */
91 sample.offset = 91 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 - 92 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
93 ts; 93 ts;
94 94
95 /* simple insertion sort based on duration */ 95 /* simple insertion sort based on duration */
@@ -131,7 +131,7 @@ int timecompare_offset(struct timecompare *sync,
131 131
132 return used; 132 return used;
133} 133}
134EXPORT_SYMBOL(timecompare_offset); 134EXPORT_SYMBOL_GPL(timecompare_offset);
135 135
136void __timecompare_update(struct timecompare *sync, 136void __timecompare_update(struct timecompare *sync,
137 u64 source_tstamp) 137 u64 source_tstamp)
@@ -188,4 +188,4 @@ void __timecompare_update(struct timecompare *sync,
188 } 188 }
189 } 189 }
190} 190}
191EXPORT_SYMBOL(__timecompare_update); 191EXPORT_SYMBOL_GPL(__timecompare_update);
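Besides tightening the exports to GPL-only, the timecompare hunk computes the midpoint of two timestamps from their raw nanosecond values instead of adding two ktime_t values, which can overflow. A tiny sketch of the even safer textbook form, start + (end - start) / 2, which never forms the large sum at all; this illustrates the overflow concern rather than copying the patch:

    #include <stdio.h>
    #include <stdint.h>

    /* Overflow-safe midpoint of two signed 64-bit nanosecond timestamps. */
    static int64_t midpoint_ns(int64_t start, int64_t end)
    {
        /* start + (end - start) / 2 avoids ever computing end + start. */
        return start + (end - start) / 2;
    }

    int main(void)
    {
        int64_t start = INT64_MAX - 1000;   /* deliberately near the top of the range */
        int64_t end   = INT64_MAX - 10;

        printf("midpoint = %lld\n", (long long)midpoint_ns(start, end));
        return 0;
    }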
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907eaa..af4135f05825 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,19 +165,12 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
178 xtime.tv_sec += leapsecond; 171 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond; 172 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock); 173 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
181} 174}
182 175
183#ifdef CONFIG_GENERIC_TIME 176#ifdef CONFIG_GENERIC_TIME
@@ -332,12 +325,10 @@ int do_settimeofday(struct timespec *tv)
332 325
333 xtime = *tv; 326 xtime = *tv;
334 327
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
338 ntp_clear(); 329 ntp_clear();
339 330
340 update_vsyscall(&xtime, timekeeper.clock); 331 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
341 332
342 write_sequnlock_irqrestore(&xtime_lock, flags); 333 write_sequnlock_irqrestore(&xtime_lock, flags);
343 334
@@ -488,6 +479,17 @@ int timekeeping_valid_for_hres(void)
488} 479}
489 480
490/** 481/**
482 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
483 *
484 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
485 * ensure that the clocksource does not change!
486 */
487u64 timekeeping_max_deferment(void)
488{
489 return timekeeper.clock->max_idle_ns;
490}
491
492/**
491 * read_persistent_clock - Return time from the persistent clock. 493 * read_persistent_clock - Return time from the persistent clock.
492 * 494 *
493 * Weak dummy function for arches that do not yet support it. 495 * Weak dummy function for arches that do not yet support it.
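timekeeping_max_deferment() above simply reports the clocksource's max_idle_ns, that is, how long reading the hardware counter can be put off before it wraps or the cycles-to-nanoseconds conversion overflows. A back-of-the-envelope sketch of the wrap-around part of that bound, using a made-up 32-bit counter at 24 MHz; the real max_idle_ns is computed by the clocksource code and is usually somewhat smaller:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Hypothetical free-running counter: 32 bits wide, ticking at 24 MHz. */
        uint64_t counter_bits = 32;
        uint64_t freq_hz      = 24000000;

        /* Cycles until the counter wraps, and that span expressed in ns. */
        uint64_t wrap_cycles = 1ULL << counter_bits;
        uint64_t wrap_ns     = (wrap_cycles * 1000000000ULL) / freq_hz;

        printf("counter wraps after ~%llu ns (~%.1f s), so idle must be shorter\n",
               (unsigned long long)wrap_ns, wrap_ns / 1e9);
        return 0;
    }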
@@ -548,7 +550,6 @@ void __init timekeeping_init(void)
548 } 550 }
549 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
550 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
551 update_xtime_cache(0);
552 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
553 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
554 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -582,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
582 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
583 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
584 } 585 }
585 update_xtime_cache(0);
586 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
588 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -723,6 +723,49 @@ static void timekeeping_adjust(s64 offset)
723} 723}
724 724
725/** 725/**
726 * logarithmic_accumulation - shifted accumulation of cycles
727 *
728 * This function accumulates a shifted interval of cycles
729 * into a shifted interval of nanoseconds. Allows for O(log) accumulation
730 * loop.
731 *
732 * Returns the unconsumed cycles.
733 */
734static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
735{
736 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
737
738 /* If the offset is smaller than a shifted interval, do nothing */
739 if (offset < timekeeper.cycle_interval<<shift)
740 return offset;
741
742 /* Accumulate one shifted interval */
743 offset -= timekeeper.cycle_interval << shift;
744 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
745
746 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
747 while (timekeeper.xtime_nsec >= nsecps) {
748 timekeeper.xtime_nsec -= nsecps;
749 xtime.tv_sec++;
750 second_overflow();
751 }
752
753 /* Accumulate into raw time */
754 raw_time.tv_nsec += timekeeper.raw_interval << shift;
755 while (raw_time.tv_nsec >= NSEC_PER_SEC) {
756 raw_time.tv_nsec -= NSEC_PER_SEC;
757 raw_time.tv_sec++;
758 }
759
760 /* Accumulate error between NTP and clock interval */
761 timekeeper.ntp_error += tick_length << shift;
762 timekeeper.ntp_error -= timekeeper.xtime_interval <<
763 (timekeeper.ntp_error_shift + shift);
764
765 return offset;
766}
767
768/**
726 * update_wall_time - Uses the current clocksource to increment the wall time 769 * update_wall_time - Uses the current clocksource to increment the wall time
727 * 770 *
728 * Called from the timer interrupt, must hold a write on xtime_lock. 771 * Called from the timer interrupt, must hold a write on xtime_lock.
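logarithmic_accumulation() drains the accumulated cycle backlog in power-of-two multiples of the base interval, so a long NO_HZ sleep costs O(log n) passes instead of one pass per tick. A standalone sketch of the same idea with made-up numbers (interval, offset); it only mirrors the chunking, not the xtime/NTP bookkeeping:

    #include <stdio.h>
    #include <stdint.h>

    /* Consume one chunk of 'interval << shift' cycles, if that much is pending. */
    static uint64_t accumulate_chunk(uint64_t offset, uint64_t interval, int shift,
                                     uint64_t *accumulated)
    {
        if (offset < (interval << shift))
            return offset;
        offset -= interval << shift;
        *accumulated += interval << shift;
        return offset;
    }

    int main(void)
    {
        uint64_t interval    = 1000;        /* cycles per tick, invented */
        uint64_t offset      = 123456789;   /* cycles that piled up while idle */
        uint64_t accumulated = 0;
        int passes = 0;

        /* Start with the largest power-of-two multiple of 'interval' that fits. */
        int shift = 0;
        while ((interval << (shift + 1)) <= offset)
            shift++;

        /* Walk the shift back down; each pass consumes at most one chunk, so the
         * whole backlog drains in roughly log2(offset / interval) passes. */
        while (offset >= interval) {
            offset = accumulate_chunk(offset, interval, shift, &accumulated);
            if (shift > 0)
                shift--;
            passes++;
        }

        printf("accumulated %llu cycles in %d passes, %llu cycles left over\n",
               (unsigned long long)accumulated, passes, (unsigned long long)offset);
        return 0;
    }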
@@ -731,7 +774,7 @@ void update_wall_time(void)
731{ 774{
732 struct clocksource *clock; 775 struct clocksource *clock;
733 cycle_t offset; 776 cycle_t offset;
734 u64 nsecs; 777 int shift = 0, maxshift;
735 778
736 /* Make sure we're fully resumed: */ 779 /* Make sure we're fully resumed: */
737 if (unlikely(timekeeping_suspended)) 780 if (unlikely(timekeeping_suspended))
@@ -745,33 +788,22 @@ void update_wall_time(void)
745#endif 788#endif
746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 789 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
747 790
748 /* normally this loop will run just once, however in the 791 /*
749 * case of lost or late ticks, it will accumulate correctly. 792 * With NO_HZ we may have to accumulate many cycle_intervals
793 * (think "ticks") worth of time at once. To do this efficiently,
794 * we calculate the largest doubling multiple of cycle_intervals
795 * that is smaller than the offset. We then accumulate that
796 * chunk in one go, and then try to consume the next smaller
797 * doubled multiple.
750 */ 798 */
799 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
800 shift = max(0, shift);
801 /* Bound shift to one less than what overflows tick_length */
802 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
803 shift = min(shift, maxshift);
751 while (offset >= timekeeper.cycle_interval) { 804 while (offset >= timekeeper.cycle_interval) {
752 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 805 offset = logarithmic_accumulation(offset, shift);
753 806 shift--;
754 /* accumulate one interval */
755 offset -= timekeeper.cycle_interval;
756 clock->cycle_last += timekeeper.cycle_interval;
757
758 timekeeper.xtime_nsec += timekeeper.xtime_interval;
759 if (timekeeper.xtime_nsec >= nsecps) {
760 timekeeper.xtime_nsec -= nsecps;
761 xtime.tv_sec++;
762 second_overflow();
763 }
764
765 raw_time.tv_nsec += timekeeper.raw_interval;
766 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
767 raw_time.tv_nsec -= NSEC_PER_SEC;
768 raw_time.tv_sec++;
769 }
770
771 /* accumulate error between NTP and clock interval */
772 timekeeper.ntp_error += tick_length;
773 timekeeper.ntp_error -= timekeeper.xtime_interval <<
774 timekeeper.ntp_error_shift;
775 } 807 }
776 808
777 /* correct the clock when NTP error is too big */ 809 /* correct the clock when NTP error is too big */
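The shift chosen in update_wall_time() above is bounded twice: it starts from the size of the backlog relative to one cycle_interval, and it is then clamped so that tick_length << shift cannot overflow 64 bits. A small sketch of that clamping with invented magnitudes; ilog2_u64() stands in for the kernel's ilog2():

    #include <stdio.h>
    #include <stdint.h>

    /* Position of the highest set bit, i.e. floor(log2(x)), for non-zero x. */
    static int ilog2_u64(uint64_t x)
    {
        int bit = -1;
        while (x) {
            x >>= 1;
            bit++;
        }
        return bit;
    }

    int main(void)
    {
        uint64_t tick_length    = 4000000ULL << 20;   /* invented shifted NTP tick length */
        uint64_t offset         = 1ULL << 30;         /* invented cycle backlog */
        uint64_t cycle_interval = 1000;

        int shift = ilog2_u64(offset) - ilog2_u64(cycle_interval);
        if (shift < 0)
            shift = 0;

        /* Keep tick_length << shift one bit short of overflowing 64 bits. */
        int maxshift = (int)(8 * sizeof(tick_length)) - (ilog2_u64(tick_length) + 1) - 1;
        if (shift > maxshift)
            shift = maxshift;

        printf("chosen shift = %d (maxshift = %d)\n", shift, maxshift);
        return 0;
    }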
@@ -807,11 +839,8 @@ void update_wall_time(void)
807 timekeeper.ntp_error += timekeeper.xtime_nsec << 839 timekeeper.ntp_error += timekeeper.xtime_nsec <<
808 timekeeper.ntp_error_shift; 840 timekeeper.ntp_error_shift;
809 841
810 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
811 update_xtime_cache(nsecs);
812
813 /* check to see if there is a new clocksource to use */ 842 /* check to see if there is a new clocksource to use */
814 update_vsyscall(&xtime, timekeeper.clock); 843 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
815} 844}
816 845
817/** 846/**
@@ -846,13 +875,13 @@ void monotonic_to_bootbased(struct timespec *ts)
846 875
847unsigned long get_seconds(void) 876unsigned long get_seconds(void)
848{ 877{
849 return xtime_cache.tv_sec; 878 return xtime.tv_sec;
850} 879}
851EXPORT_SYMBOL(get_seconds); 880EXPORT_SYMBOL(get_seconds);
852 881
853struct timespec __current_kernel_time(void) 882struct timespec __current_kernel_time(void)
854{ 883{
855 return xtime_cache; 884 return xtime;
856} 885}
857 886
858struct timespec current_kernel_time(void) 887struct timespec current_kernel_time(void)
@@ -862,8 +891,7 @@ struct timespec current_kernel_time(void)
862 891
863 do { 892 do {
864 seq = read_seqbegin(&xtime_lock); 893 seq = read_seqbegin(&xtime_lock);
865 894 now = xtime;
866 now = xtime_cache;
867 } while (read_seqretry(&xtime_lock, seq)); 895 } while (read_seqretry(&xtime_lock, seq));
868 896
869 return now; 897 return now;
@@ -877,8 +905,7 @@ struct timespec get_monotonic_coarse(void)
877 905
878 do { 906 do {
879 seq = read_seqbegin(&xtime_lock); 907 seq = read_seqbegin(&xtime_lock);
880 908 now = xtime;
881 now = xtime_cache;
882 mono = wall_to_monotonic; 909 mono = wall_to_monotonic;
883 } while (read_seqretry(&xtime_lock, seq)); 910 } while (read_seqretry(&xtime_lock, seq));
884 911
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdfd..28265636b6c2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = base->first;
90 /* 90 /*
@@ -100,13 +100,13 @@ next_one:
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = rb_entry(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
105 print_timer(m, timer, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
106 next++; 106 next++;
107 goto next_one; 107 goto next_one;
108 } 108 }
109 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 109 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
110} 110}
111 111
112static void 112static void
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
204 return; 207 return;
205 } 208 }
206 SEQ_printf(m, "%s\n", dev->name); 209 SEQ_printf(m, "%s\n", dev->name);
207 SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); 210 SEQ_printf(m, " max_delta_ns: %llu\n",
208 SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); 211 (unsigned long long) dev->max_delta_ns);
209 SEQ_printf(m, " mult: %lu\n", dev->mult); 212 SEQ_printf(m, " min_delta_ns: %llu\n",
210 SEQ_printf(m, " shift: %d\n", dev->shift); 213 (unsigned long long) dev->min_delta_ns);
214 SEQ_printf(m, " mult: %u\n", dev->mult);
215 SEQ_printf(m, " shift: %u\n", dev->shift);
211 SEQ_printf(m, " mode: %d\n", dev->mode); 216 SEQ_printf(m, " mode: %d\n", dev->mode);
212 SEQ_printf(m, " next_event: %Ld nsecs\n", 217 SEQ_printf(m, " next_event: %Ld nsecs\n",
213 (unsigned long long) ktime_to_ns(dev->next_event)); 218 (unsigned long long) ktime_to_ns(dev->next_event));
@@ -252,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
252 u64 now = ktime_to_ns(ktime_get()); 257 u64 now = ktime_to_ns(ktime_get());
253 int cpu; 258 int cpu;
254 259
255 SEQ_printf(m, "Timer List Version: v0.4\n"); 260 SEQ_printf(m, "Timer List Version: v0.5\n");
256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
258 263
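Beyond the u64 format fixes, print_active_timers() in this file shows a pattern worth noting: the timer is copied into a local struct under the per-CPU base lock, the lock is dropped, and only then is the copy formatted, so the slow printing never runs with the lock held. A minimal pthread sketch of that snapshot-then-print shape (the record type and field names are illustrative):

    #include <stdio.h>
    #include <pthread.h>

    /* A record that other threads may update concurrently. */
    struct record {
        long long expires_ns;
        int active;
    };

    static struct record shared = { 1234567, 1 };
    static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

    int main(void)
    {
        struct record snapshot;

        /* Copy under the lock, then do the slow formatting without it. */
        pthread_mutex_lock(&shared_lock);
        snapshot = shared;
        pthread_mutex_unlock(&shared_lock);

        printf("expires: %lld ns, active: %d\n", snapshot.expires_ns, snapshot.active);
        return 0;
    }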
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index ee5681f8d7ec..2f3b585b8d7d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock);
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
88 */ 88 */
89static DEFINE_PER_CPU(spinlock_t, lookup_lock); 89static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
90 90
91/* 91/*
92 * Mutex to serialize state changes with show-stats activities: 92 * Mutex to serialize state changes with show-stats activities:
@@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
238 /* 238 /*
239 * It doesn't matter which lock we take: 239 * It doesn't matter which lock we take:
240 */ 240 */
241 spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!timer_stats_active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
249 249
250 input.timer = timer; 250 input.timer = timer;
251 input.start_func = startf; 251 input.start_func = startf;
@@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
253 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 raw_spin_lock_irqsave(lock, flags);
257 if (!timer_stats_active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
@@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
264 atomic_inc(&overflow_count); 264 atomic_inc(&overflow_count);
265 265
266 out_unlock: 266 out_unlock:
267 spin_unlock_irqrestore(lock, flags); 267 raw_spin_unlock_irqrestore(lock, flags);
268} 268}
269 269
270static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
@@ -348,9 +348,11 @@ static void sync_access(void)
348 int cpu; 348 int cpu;
349 349
350 for_each_online_cpu(cpu) { 350 for_each_online_cpu(cpu) {
351 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); 351 raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
352
353 raw_spin_lock_irqsave(lock, flags);
352 /* nothing */ 354 /* nothing */
353 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); 355 raw_spin_unlock_irqrestore(lock, flags);
354 } 356 }
355} 357}
356 358
@@ -408,7 +410,7 @@ void __init init_timer_stats(void)
408 int cpu; 410 int cpu;
409 411
410 for_each_possible_cpu(cpu) 412 for_each_possible_cpu(cpu)
411 spin_lock_init(&per_cpu(lookup_lock, cpu)); 413 raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
412} 414}
413 415
414static int __init init_tstats_procfs(void) 416static int __init init_tstats_procfs(void)
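The timer_stats conversion keeps the design where every CPU updates the hash under its own lookup lock, and a state change flushes all in-flight updaters simply by taking and releasing each per-CPU lock once (the empty critical section in sync_access()). A userspace sketch of that drain pattern, with an array of mutexes standing in for the per-CPU raw spinlocks:

    #include <stdio.h>
    #include <pthread.h>

    #define NR_CPUS 4

    /* One lookup lock per CPU; an updater only ever takes its own CPU's lock. */
    static pthread_mutex_t lookup_lock[NR_CPUS];

    /* Taking and releasing every per-CPU lock in turn guarantees that any
     * updater which started before the state change has finished. */
    static void sync_access(void)
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            pthread_mutex_lock(&lookup_lock[cpu]);
            /* nothing: the lock/unlock pair is the synchronization */
            pthread_mutex_unlock(&lookup_lock[cpu]);
        }
    }

    int main(void)
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            pthread_mutex_init(&lookup_lock[cpu], NULL);

        sync_access();
        printf("all per-CPU updaters drained\n");
        return 0;
    }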
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7cb6f1922598..7968762c8167 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1724,7 +1724,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
1724 return ftrace_match(str, regex, len, type); 1724 return ftrace_match(str, regex, len, type);
1725} 1725}
1726 1726
1727static void ftrace_match_records(char *buff, int len, int enable) 1727static int ftrace_match_records(char *buff, int len, int enable)
1728{ 1728{
1729 unsigned int search_len; 1729 unsigned int search_len;
1730 struct ftrace_page *pg; 1730 struct ftrace_page *pg;
@@ -1733,6 +1733,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1733 char *search; 1733 char *search;
1734 int type; 1734 int type;
1735 int not; 1735 int not;
1736 int found = 0;
1736 1737
1737 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1738 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1738 type = filter_parse_regex(buff, len, &search, &not); 1739 type = filter_parse_regex(buff, len, &search, &not);
@@ -1750,6 +1751,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1750 rec->flags &= ~flag; 1751 rec->flags &= ~flag;
1751 else 1752 else
1752 rec->flags |= flag; 1753 rec->flags |= flag;
1754 found = 1;
1753 } 1755 }
1754 /* 1756 /*
1755 * Only enable filtering if we have a function that 1757 * Only enable filtering if we have a function that
@@ -1759,6 +1761,8 @@ static void ftrace_match_records(char *buff, int len, int enable)
1759 ftrace_filtered = 1; 1761 ftrace_filtered = 1;
1760 } while_for_each_ftrace_rec(); 1762 } while_for_each_ftrace_rec();
1761 mutex_unlock(&ftrace_lock); 1763 mutex_unlock(&ftrace_lock);
1764
1765 return found;
1762} 1766}
1763 1767
1764static int 1768static int
@@ -1780,7 +1784,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
1780 return 1; 1784 return 1;
1781} 1785}
1782 1786
1783static void ftrace_match_module_records(char *buff, char *mod, int enable) 1787static int ftrace_match_module_records(char *buff, char *mod, int enable)
1784{ 1788{
1785 unsigned search_len = 0; 1789 unsigned search_len = 0;
1786 struct ftrace_page *pg; 1790 struct ftrace_page *pg;
@@ -1789,6 +1793,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1789 char *search = buff; 1793 char *search = buff;
1790 unsigned long flag; 1794 unsigned long flag;
1791 int not = 0; 1795 int not = 0;
1796 int found = 0;
1792 1797
1793 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1798 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1794 1799
@@ -1819,12 +1824,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1819 rec->flags &= ~flag; 1824 rec->flags &= ~flag;
1820 else 1825 else
1821 rec->flags |= flag; 1826 rec->flags |= flag;
1827 found = 1;
1822 } 1828 }
1823 if (enable && (rec->flags & FTRACE_FL_FILTER)) 1829 if (enable && (rec->flags & FTRACE_FL_FILTER))
1824 ftrace_filtered = 1; 1830 ftrace_filtered = 1;
1825 1831
1826 } while_for_each_ftrace_rec(); 1832 } while_for_each_ftrace_rec();
1827 mutex_unlock(&ftrace_lock); 1833 mutex_unlock(&ftrace_lock);
1834
1835 return found;
1828} 1836}
1829 1837
1830/* 1838/*
@@ -1853,8 +1861,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1853 if (!strlen(mod)) 1861 if (!strlen(mod))
1854 return -EINVAL; 1862 return -EINVAL;
1855 1863
1856 ftrace_match_module_records(func, mod, enable); 1864 if (ftrace_match_module_records(func, mod, enable))
1857 return 0; 1865 return 0;
1866 return -EINVAL;
1858} 1867}
1859 1868
1860static struct ftrace_func_command ftrace_mod_cmd = { 1869static struct ftrace_func_command ftrace_mod_cmd = {
@@ -2151,8 +2160,9 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2151 func = strsep(&next, ":"); 2160 func = strsep(&next, ":");
2152 2161
2153 if (!next) { 2162 if (!next) {
2154 ftrace_match_records(func, len, enable); 2163 if (ftrace_match_records(func, len, enable))
2155 return 0; 2164 return 0;
2165 return ret;
2156 } 2166 }
2157 2167
2158 /* command found */ 2168 /* command found */
@@ -2198,10 +2208,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2198 !trace_parser_cont(parser)) { 2208 !trace_parser_cont(parser)) {
2199 ret = ftrace_process_regex(parser->buffer, 2209 ret = ftrace_process_regex(parser->buffer,
2200 parser->idx, enable); 2210 parser->idx, enable);
2211 trace_parser_clear(parser);
2201 if (ret) 2212 if (ret)
2202 goto out_unlock; 2213 goto out_unlock;
2203
2204 trace_parser_clear(parser);
2205 } 2214 }
2206 2215
2207 ret = read; 2216 ret = read;
@@ -2274,7 +2283,6 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
2274#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE 2283#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
2275static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 2284static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
2276static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; 2285static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
2277static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2278 2286
2279static int __init set_ftrace_notrace(char *str) 2287static int __init set_ftrace_notrace(char *str)
2280{ 2288{
@@ -2291,6 +2299,7 @@ static int __init set_ftrace_filter(char *str)
2291__setup("ftrace_filter=", set_ftrace_filter); 2299__setup("ftrace_filter=", set_ftrace_filter);
2292 2300
2293#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2301#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2302static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2294static int __init set_graph_function(char *str) 2303static int __init set_graph_function(char *str)
2295{ 2304{
2296 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 2305 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -2543,10 +2552,9 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2543 exists = true; 2552 exists = true;
2544 break; 2553 break;
2545 } 2554 }
2546 if (!exists) { 2555 if (!exists)
2547 array[(*idx)++] = rec->ip; 2556 array[(*idx)++] = rec->ip;
2548 found = 1; 2557 found = 1;
2549 }
2550 } 2558 }
2551 } while_for_each_ftrace_rec(); 2559 } while_for_each_ftrace_rec();
2552 2560
@@ -2985,7 +2993,7 @@ static ssize_t
2985ftrace_pid_write(struct file *filp, const char __user *ubuf, 2993ftrace_pid_write(struct file *filp, const char __user *ubuf,
2986 size_t cnt, loff_t *ppos) 2994 size_t cnt, loff_t *ppos)
2987{ 2995{
2988 char buf[64]; 2996 char buf[64], *tmp;
2989 long val; 2997 long val;
2990 int ret; 2998 int ret;
2991 2999
@@ -3001,11 +3009,11 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
3001 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" 3009 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
3002 * to clean the filter quietly. 3010 * to clean the filter quietly.
3003 */ 3011 */
3004 strstrip(buf); 3012 tmp = strstrip(buf);
3005 if (strlen(buf) == 0) 3013 if (strlen(tmp) == 0)
3006 return 1; 3014 return 1;
3007 3015
3008 ret = strict_strtol(buf, 10, &val); 3016 ret = strict_strtol(tmp, 10, &val);
3009 if (ret < 0) 3017 if (ret < 0)
3010 return ret; 3018 return ret;
3011 3019
@@ -3391,4 +3399,3 @@ void ftrace_graph_stop(void)
3391 ftrace_stop(); 3399 ftrace_stop();
3392} 3400}
3393#endif 3401#endif
3394
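The ftrace_pid_write() change above matters because strstrip() returns a pointer past any leading whitespace; stripping in place and then parsing the original buffer would still see the leading spaces. A userspace sketch of a strip helper with the same contract, and of why the return value has to be used (strip() here is a local illustration, not the kernel function):

    #include <stdio.h>
    #include <string.h>
    #include <ctype.h>

    /* Trim trailing whitespace in place and return a pointer past any leading
     * whitespace; the caller must use the returned pointer from then on. */
    static char *strip(char *s)
    {
        size_t len = strlen(s);

        while (len && isspace((unsigned char)s[len - 1]))
            s[--len] = '\0';
        while (isspace((unsigned char)*s))
            s++;
        return s;
    }

    int main(void)
    {
        char buf[64] = "   42\n";
        char *tmp = strip(buf);

        /* Parsing buf here would still trip over the leading spaces;
         * tmp is the cleaned-up view the length check and parse should use. */
        printf("stripped: \"%s\" (len %zu)\n", tmp, strlen(tmp));
        return 0;
    }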
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index e06c6e3d56a3..9f4f565b01e6 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -14,7 +14,5 @@
14#define CREATE_TRACE_POINTS 14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16 16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 17EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20 18
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index db223fe8887f..2326b04c95c4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -423,7 +423,7 @@ struct ring_buffer_per_cpu {
423 int cpu; 423 int cpu;
424 struct ring_buffer *buffer; 424 struct ring_buffer *buffer;
425 spinlock_t reader_lock; /* serialize readers */ 425 spinlock_t reader_lock; /* serialize readers */
426 raw_spinlock_t lock; 426 arch_spinlock_t lock;
427 struct lock_class_key lock_key; 427 struct lock_class_key lock_key;
428 struct list_head *pages; 428 struct list_head *pages;
429 struct buffer_page *head_page; /* read from head */ 429 struct buffer_page *head_page; /* read from head */
@@ -998,7 +998,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
998 cpu_buffer->buffer = buffer; 998 cpu_buffer->buffer = buffer;
999 spin_lock_init(&cpu_buffer->reader_lock); 999 spin_lock_init(&cpu_buffer->reader_lock);
1000 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1000 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1001 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1001 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1002 1002
1003 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1003 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1004 GFP_KERNEL, cpu_to_node(cpu)); 1004 GFP_KERNEL, cpu_to_node(cpu));
@@ -1193,9 +1193,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1193 struct list_head *p; 1193 struct list_head *p;
1194 unsigned i; 1194 unsigned i;
1195 1195
1196 atomic_inc(&cpu_buffer->record_disabled);
1197 synchronize_sched();
1198
1199 spin_lock_irq(&cpu_buffer->reader_lock); 1196 spin_lock_irq(&cpu_buffer->reader_lock);
1200 rb_head_page_deactivate(cpu_buffer); 1197 rb_head_page_deactivate(cpu_buffer);
1201 1198
@@ -1211,12 +1208,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1211 return; 1208 return;
1212 1209
1213 rb_reset_cpu(cpu_buffer); 1210 rb_reset_cpu(cpu_buffer);
1214 spin_unlock_irq(&cpu_buffer->reader_lock);
1215
1216 rb_check_pages(cpu_buffer); 1211 rb_check_pages(cpu_buffer);
1217 1212
1218 atomic_dec(&cpu_buffer->record_disabled); 1213 spin_unlock_irq(&cpu_buffer->reader_lock);
1219
1220} 1214}
1221 1215
1222static void 1216static void
@@ -1227,9 +1221,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1227 struct list_head *p; 1221 struct list_head *p;
1228 unsigned i; 1222 unsigned i;
1229 1223
1230 atomic_inc(&cpu_buffer->record_disabled);
1231 synchronize_sched();
1232
1233 spin_lock_irq(&cpu_buffer->reader_lock); 1224 spin_lock_irq(&cpu_buffer->reader_lock);
1234 rb_head_page_deactivate(cpu_buffer); 1225 rb_head_page_deactivate(cpu_buffer);
1235 1226
@@ -1242,11 +1233,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1242 list_add_tail(&bpage->list, cpu_buffer->pages); 1233 list_add_tail(&bpage->list, cpu_buffer->pages);
1243 } 1234 }
1244 rb_reset_cpu(cpu_buffer); 1235 rb_reset_cpu(cpu_buffer);
1245 spin_unlock_irq(&cpu_buffer->reader_lock);
1246
1247 rb_check_pages(cpu_buffer); 1236 rb_check_pages(cpu_buffer);
1248 1237
1249 atomic_dec(&cpu_buffer->record_disabled); 1238 spin_unlock_irq(&cpu_buffer->reader_lock);
1250} 1239}
1251 1240
1252/** 1241/**
@@ -1254,11 +1243,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1254 * @buffer: the buffer to resize. 1243 * @buffer: the buffer to resize.
1255 * @size: the new size. 1244 * @size: the new size.
1256 * 1245 *
1257 * The tracer is responsible for making sure that the buffer is
1258 * not being used while changing the size.
1259 * Note: We may be able to change the above requirement by using
1260 * RCU synchronizations.
1261 *
1262 * Minimum size is 2 * BUF_PAGE_SIZE. 1246 * Minimum size is 2 * BUF_PAGE_SIZE.
1263 * 1247 *
1264 * Returns -1 on failure. 1248 * Returns -1 on failure.
@@ -1290,6 +1274,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1290 if (size == buffer_size) 1274 if (size == buffer_size)
1291 return size; 1275 return size;
1292 1276
1277 atomic_inc(&buffer->record_disabled);
1278
1279 /* Make sure all writers are done with this buffer. */
1280 synchronize_sched();
1281
1293 mutex_lock(&buffer->mutex); 1282 mutex_lock(&buffer->mutex);
1294 get_online_cpus(); 1283 get_online_cpus();
1295 1284
@@ -1352,6 +1341,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1352 put_online_cpus(); 1341 put_online_cpus();
1353 mutex_unlock(&buffer->mutex); 1342 mutex_unlock(&buffer->mutex);
1354 1343
1344 atomic_dec(&buffer->record_disabled);
1345
1355 return size; 1346 return size;
1356 1347
1357 free_pages: 1348 free_pages:
@@ -1361,6 +1352,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1361 } 1352 }
1362 put_online_cpus(); 1353 put_online_cpus();
1363 mutex_unlock(&buffer->mutex); 1354 mutex_unlock(&buffer->mutex);
1355 atomic_dec(&buffer->record_disabled);
1364 return -ENOMEM; 1356 return -ENOMEM;
1365 1357
1366 /* 1358 /*
@@ -1370,6 +1362,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1370 out_fail: 1362 out_fail:
1371 put_online_cpus(); 1363 put_online_cpus();
1372 mutex_unlock(&buffer->mutex); 1364 mutex_unlock(&buffer->mutex);
1365 atomic_dec(&buffer->record_disabled);
1373 return -1; 1366 return -1;
1374} 1367}
1375EXPORT_SYMBOL_GPL(ring_buffer_resize); 1368EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -1790,9 +1783,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1790static struct ring_buffer_event * 1783static struct ring_buffer_event *
1791rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1784rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1792 unsigned long length, unsigned long tail, 1785 unsigned long length, unsigned long tail,
1793 struct buffer_page *commit_page,
1794 struct buffer_page *tail_page, u64 *ts) 1786 struct buffer_page *tail_page, u64 *ts)
1795{ 1787{
1788 struct buffer_page *commit_page = cpu_buffer->commit_page;
1796 struct ring_buffer *buffer = cpu_buffer->buffer; 1789 struct ring_buffer *buffer = cpu_buffer->buffer;
1797 struct buffer_page *next_page; 1790 struct buffer_page *next_page;
1798 int ret; 1791 int ret;
@@ -1895,13 +1888,10 @@ static struct ring_buffer_event *
1895__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1888__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1896 unsigned type, unsigned long length, u64 *ts) 1889 unsigned type, unsigned long length, u64 *ts)
1897{ 1890{
1898 struct buffer_page *tail_page, *commit_page; 1891 struct buffer_page *tail_page;
1899 struct ring_buffer_event *event; 1892 struct ring_buffer_event *event;
1900 unsigned long tail, write; 1893 unsigned long tail, write;
1901 1894
1902 commit_page = cpu_buffer->commit_page;
1903 /* we just need to protect against interrupts */
1904 barrier();
1905 tail_page = cpu_buffer->tail_page; 1895 tail_page = cpu_buffer->tail_page;
1906 write = local_add_return(length, &tail_page->write); 1896 write = local_add_return(length, &tail_page->write);
1907 1897
@@ -1912,7 +1902,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1912 /* See if we shot past the end of this buffer page */ 1902 /* See if we shot past the end of this buffer page */
1913 if (write > BUF_PAGE_SIZE) 1903 if (write > BUF_PAGE_SIZE)
1914 return rb_move_tail(cpu_buffer, length, tail, 1904 return rb_move_tail(cpu_buffer, length, tail,
1915 commit_page, tail_page, ts); 1905 tail_page, ts);
1916 1906
1917 /* We reserved something on the buffer */ 1907 /* We reserved something on the buffer */
1918 1908
@@ -2837,7 +2827,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2837 int ret; 2827 int ret;
2838 2828
2839 local_irq_save(flags); 2829 local_irq_save(flags);
2840 __raw_spin_lock(&cpu_buffer->lock); 2830 arch_spin_lock(&cpu_buffer->lock);
2841 2831
2842 again: 2832 again:
2843 /* 2833 /*
@@ -2926,7 +2916,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2926 goto again; 2916 goto again;
2927 2917
2928 out: 2918 out:
2929 __raw_spin_unlock(&cpu_buffer->lock); 2919 arch_spin_unlock(&cpu_buffer->lock);
2930 local_irq_restore(flags); 2920 local_irq_restore(flags);
2931 2921
2932 return reader; 2922 return reader;
@@ -3289,9 +3279,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3289 synchronize_sched(); 3279 synchronize_sched();
3290 3280
3291 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3281 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3292 __raw_spin_lock(&cpu_buffer->lock); 3282 arch_spin_lock(&cpu_buffer->lock);
3293 rb_iter_reset(iter); 3283 rb_iter_reset(iter);
3294 __raw_spin_unlock(&cpu_buffer->lock); 3284 arch_spin_unlock(&cpu_buffer->lock);
3295 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3285 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3296 3286
3297 return iter; 3287 return iter;
@@ -3411,11 +3401,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3411 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3401 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3412 goto out; 3402 goto out;
3413 3403
3414 __raw_spin_lock(&cpu_buffer->lock); 3404 arch_spin_lock(&cpu_buffer->lock);
3415 3405
3416 rb_reset_cpu(cpu_buffer); 3406 rb_reset_cpu(cpu_buffer);
3417 3407
3418 __raw_spin_unlock(&cpu_buffer->lock); 3408 arch_spin_unlock(&cpu_buffer->lock);
3419 3409
3420 out: 3410 out:
3421 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3411 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
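The resize rework above moves the writer quiescing out of the per-CPU page shuffling and into ring_buffer_resize() itself: bump record_disabled, wait for every writer that might still see the old state, reshuffle pages, then re-enable. A stripped-down C11 sketch of that disable/drain/modify/enable shape; the atomic counter and the comments stand in for the kernel's record_disabled and synchronize_sched():

    #include <stdio.h>
    #include <stdatomic.h>

    /* Writers check this before touching the buffer; resize bumps it first. */
    static atomic_int record_disabled;

    static int try_write(void)
    {
        if (atomic_load(&record_disabled))
            return 0;          /* writer backs off while a resize is in progress */
        /* ... reserve and commit an event here ... */
        return 1;
    }

    static void resize_buffer(void)
    {
        atomic_fetch_add(&record_disabled, 1);
        /* The kernel calls synchronize_sched() here so that every writer which
         * sampled the old value has finished before any pages are moved. */
        /* ... add or remove pages under the reader lock ... */
        atomic_fetch_sub(&record_disabled, 1);
    }

    int main(void)
    {
        resize_buffer();
        printf("writes allowed again afterwards: %d\n", try_write());
        return 0;
    }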
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c3..b2477caf09c2 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -35,6 +35,28 @@ static int disable_reader;
35module_param(disable_reader, uint, 0644); 35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer"); 36MODULE_PARM_DESC(disable_reader, "only run producer");
37 37
38static int write_iteration = 50;
39module_param(write_iteration, uint, 0644);
40MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
41
42static int producer_nice = 19;
43static int consumer_nice = 19;
44
45static int producer_fifo = -1;
46static int consumer_fifo = -1;
47
48module_param(producer_nice, uint, 0644);
49MODULE_PARM_DESC(producer_nice, "nice prio for producer");
50
51module_param(consumer_nice, uint, 0644);
52MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
53
54module_param(producer_fifo, uint, 0644);
55MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
56
57module_param(consumer_fifo, uint, 0644);
58MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
59
38static int read_events; 60static int read_events;
39 61
40static int kill_test; 62static int kill_test;
@@ -208,15 +230,18 @@ static void ring_buffer_producer(void)
208 do { 230 do {
209 struct ring_buffer_event *event; 231 struct ring_buffer_event *event;
210 int *entry; 232 int *entry;
211 233 int i;
212 event = ring_buffer_lock_reserve(buffer, 10); 234
213 if (!event) { 235 for (i = 0; i < write_iteration; i++) {
214 missed++; 236 event = ring_buffer_lock_reserve(buffer, 10);
215 } else { 237 if (!event) {
216 hit++; 238 missed++;
217 entry = ring_buffer_event_data(event); 239 } else {
218 *entry = smp_processor_id(); 240 hit++;
219 ring_buffer_unlock_commit(buffer, event); 241 entry = ring_buffer_event_data(event);
242 *entry = smp_processor_id();
243 ring_buffer_unlock_commit(buffer, event);
244 }
220 } 245 }
221 do_gettimeofday(&end_tv); 246 do_gettimeofday(&end_tv);
222 247
@@ -263,6 +288,27 @@ static void ring_buffer_producer(void)
263 288
264 if (kill_test) 289 if (kill_test)
265 trace_printk("ERROR!\n"); 290 trace_printk("ERROR!\n");
291
292 if (!disable_reader) {
293 if (consumer_fifo < 0)
294 trace_printk("Running Consumer at nice: %d\n",
295 consumer_nice);
296 else
297 trace_printk("Running Consumer at SCHED_FIFO %d\n",
298 consumer_fifo);
299 }
300 if (producer_fifo < 0)
301 trace_printk("Running Producer at nice: %d\n",
302 producer_nice);
303 else
304 trace_printk("Running Producer at SCHED_FIFO %d\n",
305 producer_fifo);
306
307 /* Let the user know that the test is running at low priority */
308 if (producer_fifo < 0 && consumer_fifo < 0 &&
309 producer_nice == 19 && consumer_nice == 19)
310 trace_printk("WARNING!!! This test is running at lowest priority.\n");
311
266 trace_printk("Time: %lld (usecs)\n", time); 312 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns); 313 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader) 314 if (disable_reader)
@@ -392,6 +438,27 @@ static int __init ring_buffer_benchmark_init(void)
392 if (IS_ERR(producer)) 438 if (IS_ERR(producer))
393 goto out_kill; 439 goto out_kill;
394 440
441 /*
442 * Run them as low-prio background tasks by default:
443 */
444 if (!disable_reader) {
445 if (consumer_fifo >= 0) {
446 struct sched_param param = {
447 .sched_priority = consumer_fifo
448 };
449 sched_setscheduler(consumer, SCHED_FIFO, &param);
450 } else
451 set_user_nice(consumer, consumer_nice);
452 }
453
454 if (producer_fifo >= 0) {
455 struct sched_param param = {
456 .sched_priority = producer_fifo
457 };
458 sched_setscheduler(producer, SCHED_FIFO, &param);
459 } else
460 set_user_nice(producer, producer_nice);
461
395 return 0; 462 return 0;
396 463
397 out_kill: 464 out_kill:
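The benchmark now pins its threads to either a SCHED_FIFO priority or a nice level, defaulting to nice 19 so it stays out of the way. A userspace sketch of the same either/or choice for the current task; error handling is shown because SCHED_FIFO normally needs elevated privileges, and the -1 sentinel mirrors the module parameters above:

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <sched.h>
    #include <sys/resource.h>

    int main(void)
    {
        int fifo_prio  = -1;   /* < 0 means "use a nice level instead" */
        int nice_level = 19;

        if (fifo_prio >= 0) {
            struct sched_param param = { .sched_priority = fifo_prio };

            /* Real-time scheduling usually requires CAP_SYS_NICE or root. */
            if (sched_setscheduler(0, SCHED_FIFO, &param) < 0)
                fprintf(stderr, "SCHED_FIFO failed: %s\n", strerror(errno));
        } else {
            /* Drop to the lowest priority so the load stays in the background. */
            if (setpriority(PRIO_PROCESS, 0, nice_level) < 0)
                fprintf(stderr, "setpriority failed: %s\n", strerror(errno));
        }

        printf("running with %s\n", fifo_prio >= 0 ? "SCHED_FIFO" : "nice 19");
        return 0;
    }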
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d3067a62d43..06ba26747d7e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -86,17 +86,17 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
86 */ 86 */
87static int tracing_disabled = 1; 87static int tracing_disabled = 1;
88 88
89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(int, ftrace_cpu_disabled);
90 90
91static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
92{ 92{
93 preempt_disable(); 93 preempt_disable();
94 local_inc(&__get_cpu_var(ftrace_cpu_disabled)); 94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled));
95} 95}
96 96
97static inline void ftrace_enable_cpu(void) 97static inline void ftrace_enable_cpu(void)
98{ 98{
99 local_dec(&__get_cpu_var(ftrace_cpu_disabled)); 99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled));
100 preempt_enable(); 100 preempt_enable();
101} 101}
102 102
@@ -203,7 +203,7 @@ cycle_t ftrace_now(int cpu)
203 */ 203 */
204static struct trace_array max_tr; 204static struct trace_array max_tr;
205 205
206static DEFINE_PER_CPU(struct trace_array_cpu, max_data); 206static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
207 207
208/* tracer_enabled is used to toggle activation of a tracer */ 208/* tracer_enabled is used to toggle activation of a tracer */
209static int tracer_enabled = 1; 209static int tracer_enabled = 1;
@@ -313,7 +313,6 @@ static const char *trace_options[] = {
313 "bin", 313 "bin",
314 "block", 314 "block",
315 "stacktrace", 315 "stacktrace",
316 "sched-tree",
317 "trace_printk", 316 "trace_printk",
318 "ftrace_preempt", 317 "ftrace_preempt",
319 "branch", 318 "branch",
@@ -493,15 +492,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
493 * protected by per_cpu spinlocks. But the action of the swap 492 * protected by per_cpu spinlocks. But the action of the swap
494 * needs its own lock. 493 * needs its own lock.
495 * 494 *
496 * This is defined as a raw_spinlock_t in order to help 495 * This is defined as an arch_spinlock_t in order to help
497 * with performance when lockdep debugging is enabled. 496 * with performance when lockdep debugging is enabled.
498 * 497 *
499 * It is also used in other places outside the update_max_tr 498 * It is also used in other places outside the update_max_tr
500 * so it needs to be defined outside of the 499 * so it needs to be defined outside of the
501 * CONFIG_TRACER_MAX_TRACE. 500 * CONFIG_TRACER_MAX_TRACE.
502 */ 501 */
503static raw_spinlock_t ftrace_max_lock = 502static arch_spinlock_t ftrace_max_lock =
504 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 503 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
505 504
506#ifdef CONFIG_TRACER_MAX_TRACE 505#ifdef CONFIG_TRACER_MAX_TRACE
507unsigned long __read_mostly tracing_max_latency; 506unsigned long __read_mostly tracing_max_latency;
@@ -555,13 +554,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
555 return; 554 return;
556 555
557 WARN_ON_ONCE(!irqs_disabled()); 556 WARN_ON_ONCE(!irqs_disabled());
558 __raw_spin_lock(&ftrace_max_lock); 557 arch_spin_lock(&ftrace_max_lock);
559 558
560 tr->buffer = max_tr.buffer; 559 tr->buffer = max_tr.buffer;
561 max_tr.buffer = buf; 560 max_tr.buffer = buf;
562 561
563 __update_max_tr(tr, tsk, cpu); 562 __update_max_tr(tr, tsk, cpu);
564 __raw_spin_unlock(&ftrace_max_lock); 563 arch_spin_unlock(&ftrace_max_lock);
565} 564}
566 565
567/** 566/**
@@ -581,7 +580,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
581 return; 580 return;
582 581
583 WARN_ON_ONCE(!irqs_disabled()); 582 WARN_ON_ONCE(!irqs_disabled());
584 __raw_spin_lock(&ftrace_max_lock); 583 arch_spin_lock(&ftrace_max_lock);
585 584
586 ftrace_disable_cpu(); 585 ftrace_disable_cpu();
587 586
@@ -603,7 +602,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 602 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
604 603
605 __update_max_tr(tr, tsk, cpu); 604 __update_max_tr(tr, tsk, cpu);
606 __raw_spin_unlock(&ftrace_max_lock); 605 arch_spin_unlock(&ftrace_max_lock);
607} 606}
608#endif /* CONFIG_TRACER_MAX_TRACE */ 607#endif /* CONFIG_TRACER_MAX_TRACE */
609 608
@@ -802,7 +801,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
802static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 801static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
803static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 802static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
804static int cmdline_idx; 803static int cmdline_idx;
805static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; 804static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
806 805
807/* temporary disable recording */ 806/* temporary disable recording */
808static atomic_t trace_record_cmdline_disabled __read_mostly; 807static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -915,7 +914,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
915 * nor do we want to disable interrupts, 914 * nor do we want to disable interrupts,
916 * so if we miss here, then better luck next time. 915 * so if we miss here, then better luck next time.
917 */ 916 */
918 if (!__raw_spin_trylock(&trace_cmdline_lock)) 917 if (!arch_spin_trylock(&trace_cmdline_lock))
919 return; 918 return;
920 919
921 idx = map_pid_to_cmdline[tsk->pid]; 920 idx = map_pid_to_cmdline[tsk->pid];
@@ -940,7 +939,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
940 939
941 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 940 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
942 941
943 __raw_spin_unlock(&trace_cmdline_lock); 942 arch_spin_unlock(&trace_cmdline_lock);
944} 943}
945 944
946void trace_find_cmdline(int pid, char comm[]) 945void trace_find_cmdline(int pid, char comm[])
@@ -958,14 +957,14 @@ void trace_find_cmdline(int pid, char comm[])
958 } 957 }
959 958
960 preempt_disable(); 959 preempt_disable();
961 __raw_spin_lock(&trace_cmdline_lock); 960 arch_spin_lock(&trace_cmdline_lock);
962 map = map_pid_to_cmdline[pid]; 961 map = map_pid_to_cmdline[pid];
963 if (map != NO_CMDLINE_MAP) 962 if (map != NO_CMDLINE_MAP)
964 strcpy(comm, saved_cmdlines[map]); 963 strcpy(comm, saved_cmdlines[map]);
965 else 964 else
966 strcpy(comm, "<...>"); 965 strcpy(comm, "<...>");
967 966
968 __raw_spin_unlock(&trace_cmdline_lock); 967 arch_spin_unlock(&trace_cmdline_lock);
969 preempt_enable(); 968 preempt_enable();
970} 969}
971 970
@@ -1085,7 +1084,7 @@ trace_function(struct trace_array *tr,
1085 struct ftrace_entry *entry; 1084 struct ftrace_entry *entry;
1086 1085
1087 /* If we are reading the ring buffer, don't trace */ 1086 /* If we are reading the ring buffer, don't trace */
1088 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1087 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
1089 return; 1088 return;
1090 1089
1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1090 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1151,6 +1150,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1151 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1150 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1152} 1151}
1153 1152
1153/**
1154 * trace_dump_stack - record a stack back trace in the trace buffer
1155 */
1156void trace_dump_stack(void)
1157{
1158 unsigned long flags;
1159
1160 if (tracing_disabled || tracing_selftest_running)
1161 return;
1162
1163 local_save_flags(flags);
1164
1165 /* skipping 3 frames seems to get us to the caller of this function */
1166 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1167}
1168
1154void 1169void
1155ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1170ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1156{ 1171{
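trace_dump_stack() added above records the current call chain into the trace buffer, skipping a few innermost frames so the output starts at the interesting caller; the skip count of 3 was evidently chosen empirically for this call path. A userspace analogue using glibc's backtrace(); the skip value and function names here are only illustrative, and inlining can change how many frames need skipping:

    #include <stdio.h>
    #include <execinfo.h>

    /* Dump the current call chain to stderr, hiding the innermost 'skip' frames. */
    static void dump_stack_skip(int skip)
    {
        void *frames[32];
        int n = backtrace(frames, 32);

        if (n > skip)
            backtrace_symbols_fd(frames + skip, n - skip, 2 /* stderr */);
    }

    static void some_function(void)
    {
        dump_stack_skip(1);   /* hide dump_stack_skip() itself */
    }

    int main(void)
    {
        some_function();
        return 0;
    }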
@@ -1251,8 +1266,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1251 */ 1266 */
1252int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1267int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1253{ 1268{
1254 static raw_spinlock_t trace_buf_lock = 1269 static arch_spinlock_t trace_buf_lock =
1255 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1270 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1256 static u32 trace_buf[TRACE_BUF_SIZE]; 1271 static u32 trace_buf[TRACE_BUF_SIZE];
1257 1272
1258 struct ftrace_event_call *call = &event_bprint; 1273 struct ftrace_event_call *call = &event_bprint;
@@ -1283,7 +1298,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1283 1298
1284 /* Lockdep uses trace_printk for lock tracing */ 1299 /* Lockdep uses trace_printk for lock tracing */
1285 local_irq_save(flags); 1300 local_irq_save(flags);
1286 __raw_spin_lock(&trace_buf_lock); 1301 arch_spin_lock(&trace_buf_lock);
1287 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1302 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1288 1303
1289 if (len > TRACE_BUF_SIZE || len < 0) 1304 if (len > TRACE_BUF_SIZE || len < 0)
@@ -1304,7 +1319,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1304 ring_buffer_unlock_commit(buffer, event); 1319 ring_buffer_unlock_commit(buffer, event);
1305 1320
1306out_unlock: 1321out_unlock:
1307 __raw_spin_unlock(&trace_buf_lock); 1322 arch_spin_unlock(&trace_buf_lock);
1308 local_irq_restore(flags); 1323 local_irq_restore(flags);
1309 1324
1310out: 1325out:
@@ -1334,7 +1349,7 @@ int trace_array_printk(struct trace_array *tr,
1334int trace_array_vprintk(struct trace_array *tr, 1349int trace_array_vprintk(struct trace_array *tr,
1335 unsigned long ip, const char *fmt, va_list args) 1350 unsigned long ip, const char *fmt, va_list args)
1336{ 1351{
1337 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1352 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1338 static char trace_buf[TRACE_BUF_SIZE]; 1353 static char trace_buf[TRACE_BUF_SIZE];
1339 1354
1340 struct ftrace_event_call *call = &event_print; 1355 struct ftrace_event_call *call = &event_print;
@@ -1360,12 +1375,9 @@ int trace_array_vprintk(struct trace_array *tr,
1360 1375
1361 pause_graph_tracing(); 1376 pause_graph_tracing();
1362 raw_local_irq_save(irq_flags); 1377 raw_local_irq_save(irq_flags);
1363 __raw_spin_lock(&trace_buf_lock); 1378 arch_spin_lock(&trace_buf_lock);
1364 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1379 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1365 1380
1366 len = min(len, TRACE_BUF_SIZE-1);
1367 trace_buf[len] = 0;
1368
1369 size = sizeof(*entry) + len + 1; 1381 size = sizeof(*entry) + len + 1;
1370 buffer = tr->buffer; 1382 buffer = tr->buffer;
1371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1383 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
@@ -1373,15 +1385,15 @@ int trace_array_vprintk(struct trace_array *tr,
1373 if (!event) 1385 if (!event)
1374 goto out_unlock; 1386 goto out_unlock;
1375 entry = ring_buffer_event_data(event); 1387 entry = ring_buffer_event_data(event);
1376 entry->ip = ip; 1388 entry->ip = ip;
1377 1389
1378 memcpy(&entry->buf, trace_buf, len); 1390 memcpy(&entry->buf, trace_buf, len);
1379 entry->buf[len] = 0; 1391 entry->buf[len] = '\0';
1380 if (!filter_check_discard(call, entry, buffer, event)) 1392 if (!filter_check_discard(call, entry, buffer, event))
1381 ring_buffer_unlock_commit(buffer, event); 1393 ring_buffer_unlock_commit(buffer, event);
1382 1394
1383 out_unlock: 1395 out_unlock:
1384 __raw_spin_unlock(&trace_buf_lock); 1396 arch_spin_unlock(&trace_buf_lock);
1385 raw_local_irq_restore(irq_flags); 1397 raw_local_irq_restore(irq_flags);
1386 unpause_graph_tracing(); 1398 unpause_graph_tracing();
1387 out: 1399 out:
@@ -1515,6 +1527,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1515 int i = (int)*pos; 1527 int i = (int)*pos;
1516 void *ent; 1528 void *ent;
1517 1529
1530 WARN_ON_ONCE(iter->leftover);
1531
1518 (*pos)++; 1532 (*pos)++;
1519 1533
1520 /* can't go backwards */ 1534 /* can't go backwards */
@@ -1613,8 +1627,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1613 ; 1627 ;
1614 1628
1615 } else { 1629 } else {
1616 l = *pos - 1; 1630 /*
1617 p = s_next(m, p, &l); 1631 * If we overflowed the seq_file before, then we want
1632 * to just reuse the trace_seq buffer again.
1633 */
1634 if (iter->leftover)
1635 p = iter;
1636 else {
1637 l = *pos - 1;
1638 p = s_next(m, p, &l);
1639 }
1618 } 1640 }
1619 1641
1620 trace_event_read_lock(); 1642 trace_event_read_lock();
@@ -1922,6 +1944,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1922static int s_show(struct seq_file *m, void *v) 1944static int s_show(struct seq_file *m, void *v)
1923{ 1945{
1924 struct trace_iterator *iter = v; 1946 struct trace_iterator *iter = v;
1947 int ret;
1925 1948
1926 if (iter->ent == NULL) { 1949 if (iter->ent == NULL) {
1927 if (iter->tr) { 1950 if (iter->tr) {
@@ -1941,9 +1964,27 @@ static int s_show(struct seq_file *m, void *v)
1941 if (!(trace_flags & TRACE_ITER_VERBOSE)) 1964 if (!(trace_flags & TRACE_ITER_VERBOSE))
1942 print_func_help_header(m); 1965 print_func_help_header(m);
1943 } 1966 }
1967 } else if (iter->leftover) {
1968 /*
1969 * If we filled the seq_file buffer earlier, we
1970 * want to just show it now.
1971 */
1972 ret = trace_print_seq(m, &iter->seq);
1973
1974 /* ret should this time be zero, but you never know */
1975 iter->leftover = ret;
1976
1944 } else { 1977 } else {
1945 print_trace_line(iter); 1978 print_trace_line(iter);
1946 trace_print_seq(m, &iter->seq); 1979 ret = trace_print_seq(m, &iter->seq);
1980 /*
1981 * If we overflow the seq_file buffer, then it will
1982 * ask us for this data again at start up.
1983 * Use that instead.
1984 * ret is 0 if seq_file write succeeded.
1985 * -1 otherwise.
1986 */
1987 iter->leftover = ret;
1947 } 1988 }
1948 1989
1949 return 0; 1990 return 0;
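
The leftover handling above covers the case where a fully formatted trace line does not fit in the seq_file buffer: s_show() remembers the failure in iter->leftover, and the next s_start()/s_show() pass replays the buffered line instead of consuming a fresh ring-buffer entry, so nothing is lost. A plain userspace sketch of the same replay idea, with illustrative names and sizes:

        #include <stdio.h>
        #include <string.h>

        struct out_buf { char buf[32]; size_t len; };
        struct iter    { char line[64]; int leftover; int next_rec; };

        /* Returns 0 if the whole line fit, -1 if it must be retried later. */
        static int emit(struct out_buf *out, const char *line)
        {
                size_t n = strlen(line);

                if (out->len + n > sizeof(out->buf))
                        return -1;
                memcpy(out->buf + out->len, line, n);
                out->len += n;
                return 0;
        }

        static void show(struct iter *it, struct out_buf *out)
        {
                /* Only format a fresh record if the last one was flushed. */
                if (!it->leftover)
                        snprintf(it->line, sizeof(it->line),
                                 "record %d\n", it->next_rec++);

                /* 0 on success, -1 means replay the same line next time. */
                it->leftover = emit(out, it->line);
        }
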
@@ -2253,7 +2294,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2253 mutex_lock(&tracing_cpumask_update_lock); 2294 mutex_lock(&tracing_cpumask_update_lock);
2254 2295
2255 local_irq_disable(); 2296 local_irq_disable();
2256 __raw_spin_lock(&ftrace_max_lock); 2297 arch_spin_lock(&ftrace_max_lock);
2257 for_each_tracing_cpu(cpu) { 2298 for_each_tracing_cpu(cpu) {
2258 /* 2299 /*
2259 * Increase/decrease the disabled counter if we are 2300 * Increase/decrease the disabled counter if we are
@@ -2268,7 +2309,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2268 atomic_dec(&global_trace.data[cpu]->disabled); 2309 atomic_dec(&global_trace.data[cpu]->disabled);
2269 } 2310 }
2270 } 2311 }
2271 __raw_spin_unlock(&ftrace_max_lock); 2312 arch_spin_unlock(&ftrace_max_lock);
2272 local_irq_enable(); 2313 local_irq_enable();
2273 2314
2274 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 2315 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@ -2290,67 +2331,49 @@ static const struct file_operations tracing_cpumask_fops = {
2290 .write = tracing_cpumask_write, 2331 .write = tracing_cpumask_write,
2291}; 2332};
2292 2333
2293static ssize_t 2334static int tracing_trace_options_show(struct seq_file *m, void *v)
2294tracing_trace_options_read(struct file *filp, char __user *ubuf,
2295 size_t cnt, loff_t *ppos)
2296{ 2335{
2297 struct tracer_opt *trace_opts; 2336 struct tracer_opt *trace_opts;
2298 u32 tracer_flags; 2337 u32 tracer_flags;
2299 int len = 0;
2300 char *buf;
2301 int r = 0;
2302 int i; 2338 int i;
2303 2339
2304
2305 /* calculate max size */
2306 for (i = 0; trace_options[i]; i++) {
2307 len += strlen(trace_options[i]);
2308 len += 3; /* "no" and newline */
2309 }
2310
2311 mutex_lock(&trace_types_lock); 2340 mutex_lock(&trace_types_lock);
2312 tracer_flags = current_trace->flags->val; 2341 tracer_flags = current_trace->flags->val;
2313 trace_opts = current_trace->flags->opts; 2342 trace_opts = current_trace->flags->opts;
2314 2343
2315 /*
2316 * Increase the size with names of options specific
2317 * of the current tracer.
2318 */
2319 for (i = 0; trace_opts[i].name; i++) {
2320 len += strlen(trace_opts[i].name);
2321 len += 3; /* "no" and newline */
2322 }
2323
2324 /* +1 for \0 */
2325 buf = kmalloc(len + 1, GFP_KERNEL);
2326 if (!buf) {
2327 mutex_unlock(&trace_types_lock);
2328 return -ENOMEM;
2329 }
2330
2331 for (i = 0; trace_options[i]; i++) { 2344 for (i = 0; trace_options[i]; i++) {
2332 if (trace_flags & (1 << i)) 2345 if (trace_flags & (1 << i))
2333 r += sprintf(buf + r, "%s\n", trace_options[i]); 2346 seq_printf(m, "%s\n", trace_options[i]);
2334 else 2347 else
2335 r += sprintf(buf + r, "no%s\n", trace_options[i]); 2348 seq_printf(m, "no%s\n", trace_options[i]);
2336 } 2349 }
2337 2350
2338 for (i = 0; trace_opts[i].name; i++) { 2351 for (i = 0; trace_opts[i].name; i++) {
2339 if (tracer_flags & trace_opts[i].bit) 2352 if (tracer_flags & trace_opts[i].bit)
2340 r += sprintf(buf + r, "%s\n", 2353 seq_printf(m, "%s\n", trace_opts[i].name);
2341 trace_opts[i].name);
2342 else 2354 else
2343 r += sprintf(buf + r, "no%s\n", 2355 seq_printf(m, "no%s\n", trace_opts[i].name);
2344 trace_opts[i].name);
2345 } 2356 }
2346 mutex_unlock(&trace_types_lock); 2357 mutex_unlock(&trace_types_lock);
2347 2358
2348 WARN_ON(r >= len + 1); 2359 return 0;
2360}
2349 2361
2350 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2362static int __set_tracer_option(struct tracer *trace,
2363 struct tracer_flags *tracer_flags,
2364 struct tracer_opt *opts, int neg)
2365{
2366 int ret;
2351 2367
2352 kfree(buf); 2368 ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
2353 return r; 2369 if (ret)
2370 return ret;
2371
2372 if (neg)
2373 tracer_flags->val &= ~opts->bit;
2374 else
2375 tracer_flags->val |= opts->bit;
2376 return 0;
2354} 2377}
2355 2378
2356/* Try to assign a tracer specific option */ 2379/* Try to assign a tracer specific option */
@@ -2358,33 +2381,17 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2358{ 2381{
2359 struct tracer_flags *tracer_flags = trace->flags; 2382 struct tracer_flags *tracer_flags = trace->flags;
2360 struct tracer_opt *opts = NULL; 2383 struct tracer_opt *opts = NULL;
2361 int ret = 0, i = 0; 2384 int i;
2362 int len;
2363 2385
2364 for (i = 0; tracer_flags->opts[i].name; i++) { 2386 for (i = 0; tracer_flags->opts[i].name; i++) {
2365 opts = &tracer_flags->opts[i]; 2387 opts = &tracer_flags->opts[i];
2366 len = strlen(opts->name);
2367 2388
2368 if (strncmp(cmp, opts->name, len) == 0) { 2389 if (strcmp(cmp, opts->name) == 0)
2369 ret = trace->set_flag(tracer_flags->val, 2390 return __set_tracer_option(trace, trace->flags,
2370 opts->bit, !neg); 2391 opts, neg);
2371 break;
2372 }
2373 } 2392 }
2374 /* Not found */
2375 if (!tracer_flags->opts[i].name)
2376 return -EINVAL;
2377
2378 /* Refused to handle */
2379 if (ret)
2380 return ret;
2381
2382 if (neg)
2383 tracer_flags->val &= ~opts->bit;
2384 else
2385 tracer_flags->val |= opts->bit;
2386 2393
2387 return 0; 2394 return -EINVAL;
2388} 2395}
2389 2396
2390static void set_tracer_flags(unsigned int mask, int enabled) 2397static void set_tracer_flags(unsigned int mask, int enabled)
@@ -2404,7 +2411,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2404 size_t cnt, loff_t *ppos) 2411 size_t cnt, loff_t *ppos)
2405{ 2412{
2406 char buf[64]; 2413 char buf[64];
2407 char *cmp = buf; 2414 char *cmp;
2408 int neg = 0; 2415 int neg = 0;
2409 int ret; 2416 int ret;
2410 int i; 2417 int i;
@@ -2416,16 +2423,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2416 return -EFAULT; 2423 return -EFAULT;
2417 2424
2418 buf[cnt] = 0; 2425 buf[cnt] = 0;
2426 cmp = strstrip(buf);
2419 2427
2420 if (strncmp(buf, "no", 2) == 0) { 2428 if (strncmp(cmp, "no", 2) == 0) {
2421 neg = 1; 2429 neg = 1;
2422 cmp += 2; 2430 cmp += 2;
2423 } 2431 }
2424 2432
2425 for (i = 0; trace_options[i]; i++) { 2433 for (i = 0; trace_options[i]; i++) {
2426 int len = strlen(trace_options[i]); 2434 if (strcmp(cmp, trace_options[i]) == 0) {
2427
2428 if (strncmp(cmp, trace_options[i], len) == 0) {
2429 set_tracer_flags(1 << i, !neg); 2435 set_tracer_flags(1 << i, !neg);
2430 break; 2436 break;
2431 } 2437 }
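
Moving from length-based strncmp() to strcmp() on a strstrip()'d buffer makes option matching exact: a name that is merely a prefix of another option no longer matches it, and the trailing newline from echo(1) no longer defeats the comparison. A small userspace sketch of that parse, with an invented option table:

        #include <string.h>

        static const char *options[] = { "print-parent", "sym-offset", "printk", NULL };

        /* Strip surrounding blanks, honour an optional "no" prefix, then
         * require an exact match.  Returns the option index or -1. */
        static int parse_option(char *buf, int *neg)
        {
                char *cmp = buf + strspn(buf, " \t");
                char *end = cmp + strlen(cmp);
                int i;

                while (end > cmp && (end[-1] == '\n' || end[-1] == ' '))
                        *--end = '\0';

                *neg = 0;
                if (strncmp(cmp, "no", 2) == 0) {
                        *neg = 1;
                        cmp += 2;
                }
                for (i = 0; options[i]; i++)
                        if (strcmp(cmp, options[i]) == 0)
                                return i;
                return -1;
        }
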
@@ -2445,9 +2451,18 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2445 return cnt; 2451 return cnt;
2446} 2452}
2447 2453
2454static int tracing_trace_options_open(struct inode *inode, struct file *file)
2455{
2456 if (tracing_disabled)
2457 return -ENODEV;
2458 return single_open(file, tracing_trace_options_show, NULL);
2459}
2460
2448static const struct file_operations tracing_iter_fops = { 2461static const struct file_operations tracing_iter_fops = {
2449 .open = tracing_open_generic, 2462 .open = tracing_trace_options_open,
2450 .read = tracing_trace_options_read, 2463 .read = seq_read,
2464 .llseek = seq_lseek,
2465 .release = single_release,
2451 .write = tracing_trace_options_write, 2466 .write = tracing_trace_options_write,
2452}; 2467};
2453 2468
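
The read side of trace_options is now a one-shot seq_file: single_open() binds the file to a show() callback, and seq_read()/seq_lseek()/single_release() handle all buffering, so the handler no longer has to size and kmalloc() its own output buffer. A minimal sketch of the same fops wiring for a hypothetical read-only tracing/debugfs file (names invented):

        /* Sketch: read-only file built on the single_open() helpers. */
        #include <linux/seq_file.h>
        #include <linux/fs.h>

        static int example_show(struct seq_file *m, void *v)
        {
                seq_printf(m, "state: %d\n", 42);
                return 0;
        }

        static int example_open(struct inode *inode, struct file *file)
        {
                return single_open(file, example_show, inode->i_private);
        }

        static const struct file_operations example_fops = {
                .open    = example_open,
                .read    = seq_read,
                .llseek  = seq_lseek,
                .release = single_release,
        };
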
@@ -2897,6 +2912,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2897 else 2912 else
2898 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); 2913 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2899 2914
2915
2916 if (iter->trace->pipe_close)
2917 iter->trace->pipe_close(iter);
2918
2900 mutex_unlock(&trace_types_lock); 2919 mutex_unlock(&trace_types_lock);
2901 2920
2902 free_cpumask_var(iter->started); 2921 free_cpumask_var(iter->started);
@@ -3103,7 +3122,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
3103 __free_page(spd->pages[idx]); 3122 __free_page(spd->pages[idx]);
3104} 3123}
3105 3124
3106static struct pipe_buf_operations tracing_pipe_buf_ops = { 3125static const struct pipe_buf_operations tracing_pipe_buf_ops = {
3107 .can_merge = 0, 3126 .can_merge = 0,
3108 .map = generic_pipe_buf_map, 3127 .map = generic_pipe_buf_map,
3109 .unmap = generic_pipe_buf_unmap, 3128 .unmap = generic_pipe_buf_unmap,
@@ -3334,7 +3353,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3334 size_t cnt, loff_t *fpos) 3353 size_t cnt, loff_t *fpos)
3335{ 3354{
3336 char *buf; 3355 char *buf;
3337 char *end;
3338 3356
3339 if (tracing_disabled) 3357 if (tracing_disabled)
3340 return -EINVAL; 3358 return -EINVAL;
@@ -3342,7 +3360,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3342 if (cnt > TRACE_BUF_SIZE) 3360 if (cnt > TRACE_BUF_SIZE)
3343 cnt = TRACE_BUF_SIZE; 3361 cnt = TRACE_BUF_SIZE;
3344 3362
3345 buf = kmalloc(cnt + 1, GFP_KERNEL); 3363 buf = kmalloc(cnt + 2, GFP_KERNEL);
3346 if (buf == NULL) 3364 if (buf == NULL)
3347 return -ENOMEM; 3365 return -ENOMEM;
3348 3366
@@ -3350,35 +3368,31 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3350 kfree(buf); 3368 kfree(buf);
3351 return -EFAULT; 3369 return -EFAULT;
3352 } 3370 }
3371 if (buf[cnt-1] != '\n') {
3372 buf[cnt] = '\n';
3373 buf[cnt+1] = '\0';
3374 } else
3375 buf[cnt] = '\0';
3353 3376
3354 /* Cut from the first nil or newline. */ 3377 cnt = mark_printk("%s", buf);
3355 buf[cnt] = '\0';
3356 end = strchr(buf, '\n');
3357 if (end)
3358 *end = '\0';
3359
3360 cnt = mark_printk("%s\n", buf);
3361 kfree(buf); 3378 kfree(buf);
3362 *fpos += cnt; 3379 *fpos += cnt;
3363 3380
3364 return cnt; 3381 return cnt;
3365} 3382}
3366 3383
3367static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, 3384static int tracing_clock_show(struct seq_file *m, void *v)
3368 size_t cnt, loff_t *ppos)
3369{ 3385{
3370 char buf[64];
3371 int bufiter = 0;
3372 int i; 3386 int i;
3373 3387
3374 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 3388 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3375 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, 3389 seq_printf(m,
3376 "%s%s%s%s", i ? " " : "", 3390 "%s%s%s%s", i ? " " : "",
3377 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 3391 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3378 i == trace_clock_id ? "]" : ""); 3392 i == trace_clock_id ? "]" : "");
3379 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); 3393 seq_putc(m, '\n');
3380 3394
3381 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); 3395 return 0;
3382} 3396}
3383 3397
3384static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 3398static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
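
tracing_mark_write() now preserves embedded newlines and only guarantees that the record ends with one, rather than truncating at the first '\n'; the kmalloc(cnt + 2) above leaves room for that appended terminator plus the NUL. A userspace sketch of just the buffer handling (illustrative only, not the kernel code):

        #include <stdlib.h>
        #include <string.h>

        /* Copy 'cnt' bytes and make sure the result is newline-terminated. */
        static char *copy_mark(const char *src, size_t cnt)
        {
                char *buf = malloc(cnt + 2);    /* +1 for '\n', +1 for '\0' */

                if (!buf)
                        return NULL;
                memcpy(buf, src, cnt);
                if (cnt == 0 || buf[cnt - 1] != '\n') {
                        buf[cnt] = '\n';
                        buf[cnt + 1] = '\0';
                } else {
                        buf[cnt] = '\0';
                }
                return buf;
        }
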
@@ -3420,6 +3434,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3420 return cnt; 3434 return cnt;
3421} 3435}
3422 3436
3437static int tracing_clock_open(struct inode *inode, struct file *file)
3438{
3439 if (tracing_disabled)
3440 return -ENODEV;
3441 return single_open(file, tracing_clock_show, NULL);
3442}
3443
3423static const struct file_operations tracing_max_lat_fops = { 3444static const struct file_operations tracing_max_lat_fops = {
3424 .open = tracing_open_generic, 3445 .open = tracing_open_generic,
3425 .read = tracing_max_lat_read, 3446 .read = tracing_max_lat_read,
@@ -3458,8 +3479,10 @@ static const struct file_operations tracing_mark_fops = {
3458}; 3479};
3459 3480
3460static const struct file_operations trace_clock_fops = { 3481static const struct file_operations trace_clock_fops = {
3461 .open = tracing_open_generic, 3482 .open = tracing_clock_open,
3462 .read = tracing_clock_read, 3483 .read = seq_read,
3484 .llseek = seq_lseek,
3485 .release = single_release,
3463 .write = tracing_clock_write, 3486 .write = tracing_clock_write,
3464}; 3487};
3465 3488
@@ -3589,7 +3612,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3589} 3612}
3590 3613
3591/* Pipe buffer operations for a buffer. */ 3614/* Pipe buffer operations for a buffer. */
3592static struct pipe_buf_operations buffer_pipe_buf_ops = { 3615static const struct pipe_buf_operations buffer_pipe_buf_ops = {
3593 .can_merge = 0, 3616 .can_merge = 0,
3594 .map = generic_pipe_buf_map, 3617 .map = generic_pipe_buf_map,
3595 .unmap = generic_pipe_buf_unmap, 3618 .unmap = generic_pipe_buf_unmap,
@@ -3730,7 +3753,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3730 3753
3731 s = kmalloc(sizeof(*s), GFP_KERNEL); 3754 s = kmalloc(sizeof(*s), GFP_KERNEL);
3732 if (!s) 3755 if (!s)
3733 return ENOMEM; 3756 return -ENOMEM;
3734 3757
3735 trace_seq_init(s); 3758 trace_seq_init(s);
3736 3759
@@ -3920,39 +3943,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3920 if (ret < 0) 3943 if (ret < 0)
3921 return ret; 3944 return ret;
3922 3945
3923 ret = 0; 3946 if (val != 0 && val != 1)
3924 switch (val) { 3947 return -EINVAL;
3925 case 0:
3926 /* do nothing if already cleared */
3927 if (!(topt->flags->val & topt->opt->bit))
3928 break;
3929
3930 mutex_lock(&trace_types_lock);
3931 if (current_trace->set_flag)
3932 ret = current_trace->set_flag(topt->flags->val,
3933 topt->opt->bit, 0);
3934 mutex_unlock(&trace_types_lock);
3935 if (ret)
3936 return ret;
3937 topt->flags->val &= ~topt->opt->bit;
3938 break;
3939 case 1:
3940 /* do nothing if already set */
3941 if (topt->flags->val & topt->opt->bit)
3942 break;
3943 3948
3949 if (!!(topt->flags->val & topt->opt->bit) != val) {
3944 mutex_lock(&trace_types_lock); 3950 mutex_lock(&trace_types_lock);
3945 if (current_trace->set_flag) 3951 ret = __set_tracer_option(current_trace, topt->flags,
3946 ret = current_trace->set_flag(topt->flags->val, 3952 topt->opt, val);
3947 topt->opt->bit, 1);
3948 mutex_unlock(&trace_types_lock); 3953 mutex_unlock(&trace_types_lock);
3949 if (ret) 3954 if (ret)
3950 return ret; 3955 return ret;
3951 topt->flags->val |= topt->opt->bit;
3952 break;
3953
3954 default:
3955 return -EINVAL;
3956 } 3956 }
3957 3957
3958 *ppos += cnt; 3958 *ppos += cnt;
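
The rewrite above collapses the 0/1 switch into one test: reject anything but 0 or 1, then call __set_tracer_option() only when the requested value differs from the current state of the bit. The double negation normalizes the masked bit to 0 or 1 so it compares directly against the user's value. A tiny standalone illustration of that idiom (macro and values invented):

        #define EXAMPLE_BIT 0x4

        /* Returns non-zero when the flag word must change to reach 'val'. */
        static int needs_update(unsigned int flags, unsigned long val)
        {
                return !!(flags & EXAMPLE_BIT) != val;
        }
        /* needs_update(0x4, 1) == 0, needs_update(0x4, 0) == 1 */
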
@@ -4279,8 +4279,8 @@ trace_printk_seq(struct trace_seq *s)
4279 4279
4280static void __ftrace_dump(bool disable_tracing) 4280static void __ftrace_dump(bool disable_tracing)
4281{ 4281{
4282 static raw_spinlock_t ftrace_dump_lock = 4282 static arch_spinlock_t ftrace_dump_lock =
4283 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 4283 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
4284 /* use static because iter can be a bit big for the stack */ 4284 /* use static because iter can be a bit big for the stack */
4285 static struct trace_iterator iter; 4285 static struct trace_iterator iter;
4286 unsigned int old_userobj; 4286 unsigned int old_userobj;
@@ -4290,7 +4290,7 @@ static void __ftrace_dump(bool disable_tracing)
4290 4290
4291 /* only one dump */ 4291 /* only one dump */
4292 local_irq_save(flags); 4292 local_irq_save(flags);
4293 __raw_spin_lock(&ftrace_dump_lock); 4293 arch_spin_lock(&ftrace_dump_lock);
4294 if (dump_ran) 4294 if (dump_ran)
4295 goto out; 4295 goto out;
4296 4296
@@ -4365,7 +4365,7 @@ static void __ftrace_dump(bool disable_tracing)
4365 } 4365 }
4366 4366
4367 out: 4367 out:
4368 __raw_spin_unlock(&ftrace_dump_lock); 4368 arch_spin_unlock(&ftrace_dump_lock);
4369 local_irq_restore(flags); 4369 local_irq_restore(flags);
4370} 4370}
4371 4371
@@ -4426,7 +4426,7 @@ __init static int tracer_alloc_buffers(void)
4426 /* Allocate the first page for all buffers */ 4426 /* Allocate the first page for all buffers */
4427 for_each_tracing_cpu(i) { 4427 for_each_tracing_cpu(i) {
4428 global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4428 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4429 max_tr.data[i] = &per_cpu(max_data, i); 4429 max_tr.data[i] = &per_cpu(max_tr_data, i);
4430 } 4430 }
4431 4431
4432 trace_init_cmdlines(); 4432 trace_init_cmdlines();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1d7f4830a80d..4df6a77eb196 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,6 +272,7 @@ struct tracer_flags {
272 * @pipe_open: called when the trace_pipe file is opened 272 * @pipe_open: called when the trace_pipe file is opened
273 * @wait_pipe: override how the user waits for traces on trace_pipe 273 * @wait_pipe: override how the user waits for traces on trace_pipe
274 * @close: called when the trace file is released 274 * @close: called when the trace file is released
275 * @pipe_close: called when the trace_pipe file is released
275 * @read: override the default read callback on trace_pipe 276 * @read: override the default read callback on trace_pipe
276 * @splice_read: override the default splice_read callback on trace_pipe 277 * @splice_read: override the default splice_read callback on trace_pipe
277 * @selftest: selftest to run on boot (see trace_selftest.c) 278 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -290,6 +291,7 @@ struct tracer {
290 void (*pipe_open)(struct trace_iterator *iter); 291 void (*pipe_open)(struct trace_iterator *iter);
291 void (*wait_pipe)(struct trace_iterator *iter); 292 void (*wait_pipe)(struct trace_iterator *iter);
292 void (*close)(struct trace_iterator *iter); 293 void (*close)(struct trace_iterator *iter);
294 void (*pipe_close)(struct trace_iterator *iter);
293 ssize_t (*read)(struct trace_iterator *iter, 295 ssize_t (*read)(struct trace_iterator *iter,
294 struct file *filp, char __user *ubuf, 296 struct file *filp, char __user *ubuf,
295 size_t cnt, loff_t *ppos); 297 size_t cnt, loff_t *ppos);
@@ -441,7 +443,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
441 443
442extern int ring_buffer_expanded; 444extern int ring_buffer_expanded;
443extern bool tracing_selftest_disabled; 445extern bool tracing_selftest_disabled;
444DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); 446DECLARE_PER_CPU(int, ftrace_cpu_disabled);
445 447
446#ifdef CONFIG_FTRACE_STARTUP_TEST 448#ifdef CONFIG_FTRACE_STARTUP_TEST
447extern int trace_selftest_startup_function(struct tracer *trace, 449extern int trace_selftest_startup_function(struct tracer *trace,
@@ -595,18 +597,17 @@ enum trace_iterator_flags {
595 TRACE_ITER_BIN = 0x40, 597 TRACE_ITER_BIN = 0x40,
596 TRACE_ITER_BLOCK = 0x80, 598 TRACE_ITER_BLOCK = 0x80,
597 TRACE_ITER_STACKTRACE = 0x100, 599 TRACE_ITER_STACKTRACE = 0x100,
598 TRACE_ITER_SCHED_TREE = 0x200, 600 TRACE_ITER_PRINTK = 0x200,
599 TRACE_ITER_PRINTK = 0x400, 601 TRACE_ITER_PREEMPTONLY = 0x400,
600 TRACE_ITER_PREEMPTONLY = 0x800, 602 TRACE_ITER_BRANCH = 0x800,
601 TRACE_ITER_BRANCH = 0x1000, 603 TRACE_ITER_ANNOTATE = 0x1000,
602 TRACE_ITER_ANNOTATE = 0x2000, 604 TRACE_ITER_USERSTACKTRACE = 0x2000,
603 TRACE_ITER_USERSTACKTRACE = 0x4000, 605 TRACE_ITER_SYM_USEROBJ = 0x4000,
604 TRACE_ITER_SYM_USEROBJ = 0x8000, 606 TRACE_ITER_PRINTK_MSGONLY = 0x8000,
605 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 607 TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
606 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 608 TRACE_ITER_LATENCY_FMT = 0x20000,
607 TRACE_ITER_LATENCY_FMT = 0x40000, 609 TRACE_ITER_SLEEP_TIME = 0x40000,
608 TRACE_ITER_SLEEP_TIME = 0x80000, 610 TRACE_ITER_GRAPH_TIME = 0x80000,
609 TRACE_ITER_GRAPH_TIME = 0x100000,
610}; 611};
611 612
612/* 613/*
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 20c5f92e28a8..84a3a7ba072a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -20,6 +20,8 @@
20#include <linux/ktime.h> 20#include <linux/ktime.h>
21#include <linux/trace_clock.h> 21#include <linux/trace_clock.h>
22 22
23#include "trace.h"
24
23/* 25/*
24 * trace_clock_local(): the simplest and least coherent tracing clock. 26 * trace_clock_local(): the simplest and least coherent tracing clock.
25 * 27 *
@@ -28,17 +30,17 @@
28 */ 30 */
29u64 notrace trace_clock_local(void) 31u64 notrace trace_clock_local(void)
30{ 32{
31 unsigned long flags;
32 u64 clock; 33 u64 clock;
34 int resched;
33 35
34 /* 36 /*
35 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
36 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
37 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
38 */ 40 */
39 raw_local_irq_save(flags); 41 resched = ftrace_preempt_disable();
40 clock = sched_clock(); 42 clock = sched_clock();
41 raw_local_irq_restore(flags); 43 ftrace_preempt_enable(resched);
42 44
43 return clock; 45 return clock;
44} 46}
@@ -69,10 +71,10 @@ u64 notrace trace_clock(void)
69/* keep prev_time and lock in the same cacheline. */ 71/* keep prev_time and lock in the same cacheline. */
70static struct { 72static struct {
71 u64 prev_time; 73 u64 prev_time;
72 raw_spinlock_t lock; 74 arch_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp = 75} trace_clock_struct ____cacheline_aligned_in_smp =
74 { 76 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, 77 .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
76 }; 78 };
77 79
78u64 notrace trace_clock_global(void) 80u64 notrace trace_clock_global(void)
@@ -92,7 +94,7 @@ u64 notrace trace_clock_global(void)
92 if (unlikely(in_nmi())) 94 if (unlikely(in_nmi()))
93 goto out; 95 goto out;
94 96
95 __raw_spin_lock(&trace_clock_struct.lock); 97 arch_spin_lock(&trace_clock_struct.lock);
96 98
97 /* 99 /*
98 * TODO: if this happens often then maybe we should reset 100 * TODO: if this happens often then maybe we should reset
@@ -104,7 +106,7 @@ u64 notrace trace_clock_global(void)
104 106
105 trace_clock_struct.prev_time = now; 107 trace_clock_struct.prev_time = now;
106 108
107 __raw_spin_unlock(&trace_clock_struct.lock); 109 arch_spin_unlock(&trace_clock_struct.lock);
108 110
109 out: 111 out:
110 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index d9c60f80aa0d..9e25573242cf 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -25,7 +25,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
25 char *buf; 25 char *buf;
26 int ret = -ENOMEM; 26 int ret = -ENOMEM;
27 27
28 if (atomic_inc_return(&event->profile_count)) 28 if (event->profile_count++ > 0)
29 return 0; 29 return 0;
30 30
31 if (!total_profile_count) { 31 if (!total_profile_count) {
@@ -56,7 +56,7 @@ fail_buf_nmi:
56 perf_trace_buf = NULL; 56 perf_trace_buf = NULL;
57 } 57 }
58fail_buf: 58fail_buf:
59 atomic_dec(&event->profile_count); 59 event->profile_count--;
60 60
61 return ret; 61 return ret;
62} 62}
@@ -83,7 +83,7 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
83{ 83{
84 char *buf, *nmi_buf; 84 char *buf, *nmi_buf;
85 85
86 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (--event->profile_count > 0)
87 return; 87 return;
88 88
89 event->profile_disable(event); 89 event->profile_disable(event);
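
The profile enable/disable counters drop from atomics to a plain int here; that only works because, as far as the surrounding code shows, both paths already run under a mutex that serializes enable and disable, so the counter never races. A generic sketch of that mutex-guarded first-user/last-user pattern, with hypothetical names:

        /* Sketch: reference count that is only ever touched under a mutex. */
        #include <linux/mutex.h>

        static DEFINE_MUTEX(example_mutex);
        static int example_users;

        static void example_get(void)
        {
                mutex_lock(&example_mutex);
                if (example_users++ == 0) {
                        /* first user: set up the shared buffers here */
                }
                mutex_unlock(&example_mutex);
        }

        static void example_put(void)
        {
                mutex_lock(&example_mutex);
                if (--example_users == 0) {
                        /* last user: tear the shared buffers down */
                }
                mutex_unlock(&example_mutex);
        }
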
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1d18315dc836..189b09baf4fb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
78 if (ret) \ 78 if (ret) \
79 return ret; 79 return ret;
80 80
81int trace_define_common_fields(struct ftrace_event_call *call) 81static int trace_define_common_fields(struct ftrace_event_call *call)
82{ 82{
83 int ret; 83 int ret;
84 struct trace_entry ent; 84 struct trace_entry ent;
@@ -91,7 +91,6 @@ int trace_define_common_fields(struct ftrace_event_call *call)
91 91
92 return ret; 92 return ret;
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 94
96void trace_destroy_fields(struct ftrace_event_call *call) 95void trace_destroy_fields(struct ftrace_event_call *call)
97{ 96{
@@ -105,9 +104,25 @@ void trace_destroy_fields(struct ftrace_event_call *call)
105 } 104 }
106} 105}
107 106
108static void ftrace_event_enable_disable(struct ftrace_event_call *call, 107int trace_event_raw_init(struct ftrace_event_call *call)
108{
109 int id;
110
111 id = register_ftrace_event(call->event);
112 if (!id)
113 return -ENODEV;
114 call->id = id;
115 INIT_LIST_HEAD(&call->fields);
116
117 return 0;
118}
119EXPORT_SYMBOL_GPL(trace_event_raw_init);
120
121static int ftrace_event_enable_disable(struct ftrace_event_call *call,
109 int enable) 122 int enable)
110{ 123{
124 int ret = 0;
125
111 switch (enable) { 126 switch (enable) {
112 case 0: 127 case 0:
113 if (call->enabled) { 128 if (call->enabled) {
@@ -118,12 +133,20 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
118 break; 133 break;
119 case 1: 134 case 1:
120 if (!call->enabled) { 135 if (!call->enabled) {
121 call->enabled = 1;
122 tracing_start_cmdline_record(); 136 tracing_start_cmdline_record();
123 call->regfunc(call); 137 ret = call->regfunc(call);
138 if (ret) {
139 tracing_stop_cmdline_record();
140 pr_info("event trace: Could not enable event "
141 "%s\n", call->name);
142 break;
143 }
144 call->enabled = 1;
124 } 145 }
125 break; 146 break;
126 } 147 }
148
149 return ret;
127} 150}
128 151
129static void ftrace_clear_events(void) 152static void ftrace_clear_events(void)
@@ -402,7 +425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
402 case 0: 425 case 0:
403 case 1: 426 case 1:
404 mutex_lock(&event_mutex); 427 mutex_lock(&event_mutex);
405 ftrace_event_enable_disable(call, val); 428 ret = ftrace_event_enable_disable(call, val);
406 mutex_unlock(&event_mutex); 429 mutex_unlock(&event_mutex);
407 break; 430 break;
408 431
@@ -412,7 +435,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
412 435
413 *ppos += cnt; 436 *ppos += cnt;
414 437
415 return cnt; 438 return ret ? ret : cnt;
416} 439}
417 440
418static ssize_t 441static ssize_t
@@ -913,7 +936,9 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
913 id); 936 id);
914 937
915 if (call->define_fields) { 938 if (call->define_fields) {
916 ret = call->define_fields(call); 939 ret = trace_define_common_fields(call);
940 if (!ret)
941 ret = call->define_fields(call);
917 if (ret < 0) { 942 if (ret < 0) {
918 pr_warning("Could not initialize trace point" 943 pr_warning("Could not initialize trace point"
919 " events/%s\n", call->name); 944 " events/%s\n", call->name);
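
Two related changes run through this file: trace_event_raw_init() gives events a common registration step (register_ftrace_event() returns a non-zero type id, with 0 meaning failure), and ftrace_event_enable_disable() now checks the return value of regfunc() and rolls back its earlier side effect when the tracepoint cannot be hooked up. A condensed sketch of that rollback shape with invented types and stand-in side-effect helpers (not the kernel's own API):

        /* Sketch: enable a hypothetical event, undoing side effects on failure. */
        #include <linux/kernel.h>

        struct example_event {
                const char *name;
                int enabled;
                int (*regfunc)(struct example_event *ev);
        };

        static void example_record_start(void) { /* e.g. start cmdline recording */ }
        static void example_record_stop(void)  { /* matching teardown */ }

        static int example_enable(struct example_event *ev)
        {
                int ret;

                if (ev->enabled)
                        return 0;

                example_record_start();                 /* side effect first ... */
                ret = ev->regfunc(ev);
                if (ret) {
                        example_record_stop();          /* ... undone on failure */
                        pr_info("could not enable event %s\n", ev->name);
                        return ret;
                }
                ev->enabled = 1;
                return 0;
        }
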
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 934d81fb4ca4..458e5bfe26d0 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -48,11 +48,11 @@
48struct ____ftrace_##name { \ 48struct ____ftrace_##name { \
49 tstruct \ 49 tstruct \
50}; \ 50}; \
51static void __used ____ftrace_check_##name(void) \ 51static void __always_unused ____ftrace_check_##name(void) \
52{ \ 52{ \
53 struct ____ftrace_##name *__entry = NULL; \ 53 struct ____ftrace_##name *__entry = NULL; \
54 \ 54 \
55 /* force cmpile-time check on F_printk() */ \ 55 /* force compile-time check on F_printk() */ \
56 printk(print); \ 56 printk(print); \
57} 57}
58 58
@@ -184,10 +184,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
184 struct struct_name field; \ 184 struct struct_name field; \
185 int ret; \ 185 int ret; \
186 \ 186 \
187 ret = trace_define_common_fields(event_call); \
188 if (ret) \
189 return ret; \
190 \
191 tstruct; \ 187 tstruct; \
192 \ 188 \
193 return ret; \ 189 return ret; \
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4d..b1342c5d37cf 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,9 +14,20 @@
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h" 15#include "trace_output.h"
16 16
17struct fgraph_data { 17struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore;
21};
22
23struct fgraph_data {
24 struct fgraph_cpu_data *cpu_data;
25
26 /* Place to preserve last processed entry. */
27 struct ftrace_graph_ent_entry ent;
28 struct ftrace_graph_ret_entry ret;
29 int failed;
30 int cpu;
20}; 31};
21 32
22#define TRACE_GRAPH_INDENT 2 33#define TRACE_GRAPH_INDENT 2
@@ -176,7 +187,7 @@ static int __trace_graph_entry(struct trace_array *tr,
176 struct ring_buffer *buffer = tr->buffer; 187 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry; 188 struct ftrace_graph_ent_entry *entry;
178 189
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
180 return 0; 191 return 0;
181 192
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -240,7 +251,7 @@ static void __trace_graph_return(struct trace_array *tr,
240 struct ring_buffer *buffer = tr->buffer; 251 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry; 252 struct ftrace_graph_ret_entry *entry;
242 253
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
244 return; 255 return;
245 256
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
384 if (!data) 395 if (!data)
385 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
386 397
387 last_pid = &(per_cpu_ptr(data, cpu)->last_pid); 398 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
388 399
389 if (*last_pid == pid) 400 if (*last_pid == pid)
390 return TRACE_TYPE_HANDLED; 401 return TRACE_TYPE_HANDLED;
@@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry *
435get_return_for_leaf(struct trace_iterator *iter, 446get_return_for_leaf(struct trace_iterator *iter,
436 struct ftrace_graph_ent_entry *curr) 447 struct ftrace_graph_ent_entry *curr)
437{ 448{
438 struct ring_buffer_iter *ring_iter; 449 struct fgraph_data *data = iter->private;
450 struct ring_buffer_iter *ring_iter = NULL;
439 struct ring_buffer_event *event; 451 struct ring_buffer_event *event;
440 struct ftrace_graph_ret_entry *next; 452 struct ftrace_graph_ret_entry *next;
441 453
442 ring_iter = iter->buffer_iter[iter->cpu]; 454 /*
455 * If the previous output failed to write to the seq buffer,
456 * then we just reuse the data from before.
457 */
458 if (data && data->failed) {
459 curr = &data->ent;
460 next = &data->ret;
461 } else {
443 462
444 /* First peek to compare current entry and the next one */ 463 ring_iter = iter->buffer_iter[iter->cpu];
445 if (ring_iter) 464
446 event = ring_buffer_iter_peek(ring_iter, NULL); 465 /* First peek to compare current entry and the next one */
447 else { 466 if (ring_iter)
448 /* We need to consume the current entry to see the next one */ 467 event = ring_buffer_iter_peek(ring_iter, NULL);
449 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 468 else {
450 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 469 /*
451 NULL); 470 * We need to consume the current entry to see
452 } 471 * the next one.
472 */
473 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
474 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
475 NULL);
476 }
453 477
454 if (!event) 478 if (!event)
455 return NULL; 479 return NULL;
480
481 next = ring_buffer_event_data(event);
456 482
457 next = ring_buffer_event_data(event); 483 if (data) {
484 /*
485 * Save current and next entries for later reference
486 * if the output fails.
487 */
488 data->ent = *curr;
489 data->ret = *next;
490 }
491 }
458 492
459 if (next->ent.type != TRACE_GRAPH_RET) 493 if (next->ent.type != TRACE_GRAPH_RET)
460 return NULL; 494 return NULL;
@@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
640 674
641 if (data) { 675 if (data) {
642 int cpu = iter->cpu; 676 int cpu = iter->cpu;
643 int *depth = &(per_cpu_ptr(data, cpu)->depth); 677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
644 678
645 /* 679 /*
646 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
@@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
688 722
689 if (data) { 723 if (data) {
690 int cpu = iter->cpu; 724 int cpu = iter->cpu;
691 int *depth = &(per_cpu_ptr(data, cpu)->depth); 725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
692 726
693 *depth = call->depth; 727 *depth = call->depth;
694 } 728 }
@@ -782,19 +816,34 @@ static enum print_line_t
782print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 816print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
783 struct trace_iterator *iter) 817 struct trace_iterator *iter)
784{ 818{
785 int cpu = iter->cpu; 819 struct fgraph_data *data = iter->private;
786 struct ftrace_graph_ent *call = &field->graph_ent; 820 struct ftrace_graph_ent *call = &field->graph_ent;
787 struct ftrace_graph_ret_entry *leaf_ret; 821 struct ftrace_graph_ret_entry *leaf_ret;
822 static enum print_line_t ret;
823 int cpu = iter->cpu;
788 824
789 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 825 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
790 return TRACE_TYPE_PARTIAL_LINE; 826 return TRACE_TYPE_PARTIAL_LINE;
791 827
792 leaf_ret = get_return_for_leaf(iter, field); 828 leaf_ret = get_return_for_leaf(iter, field);
793 if (leaf_ret) 829 if (leaf_ret)
794 return print_graph_entry_leaf(iter, field, leaf_ret, s); 830 ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
795 else 831 else
796 return print_graph_entry_nested(iter, field, s, cpu); 832 ret = print_graph_entry_nested(iter, field, s, cpu);
797 833
834 if (data) {
835 /*
836 * If we failed to write our output, then we need to make
837 * note of it. Because we already consumed our entry.
838 */
839 if (s->full) {
840 data->failed = 1;
841 data->cpu = cpu;
842 } else
843 data->failed = 0;
844 }
845
846 return ret;
798} 847}
799 848
800static enum print_line_t 849static enum print_line_t
@@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
810 859
811 if (data) { 860 if (data) {
812 int cpu = iter->cpu; 861 int cpu = iter->cpu;
813 int *depth = &(per_cpu_ptr(data, cpu)->depth); 862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
814 863
815 /* 864 /*
816 * Comments display at + 1 to depth. This is the 865 * Comments display at + 1 to depth. This is the
@@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
873 int i; 922 int i;
874 923
875 if (data) 924 if (data)
876 depth = per_cpu_ptr(data, iter->cpu)->depth; 925 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
877 926
878 if (print_graph_prologue(iter, s, 0, 0)) 927 if (print_graph_prologue(iter, s, 0, 0))
879 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
941enum print_line_t 990enum print_line_t
942print_graph_function(struct trace_iterator *iter) 991print_graph_function(struct trace_iterator *iter)
943{ 992{
993 struct ftrace_graph_ent_entry *field;
994 struct fgraph_data *data = iter->private;
944 struct trace_entry *entry = iter->ent; 995 struct trace_entry *entry = iter->ent;
945 struct trace_seq *s = &iter->seq; 996 struct trace_seq *s = &iter->seq;
997 int cpu = iter->cpu;
998 int ret;
999
1000 if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
1001 per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
1002 return TRACE_TYPE_HANDLED;
1003 }
1004
1005 /*
1006 * If the last output failed, there's a possibility we need
1007 * to print out the missing entry which would never go out.
1008 */
1009 if (data && data->failed) {
1010 field = &data->ent;
1011 iter->cpu = data->cpu;
1012 ret = print_graph_entry(field, s, iter);
1013 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1014 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1015 ret = TRACE_TYPE_NO_CONSUME;
1016 }
1017 iter->cpu = cpu;
1018 return ret;
1019 }
946 1020
947 switch (entry->type) { 1021 switch (entry->type) {
948 case TRACE_GRAPH_ENT: { 1022 case TRACE_GRAPH_ENT: {
@@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter)
952 * sizeof(struct ftrace_graph_ent_entry) is very small, 1026 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * it can be safely saved at the stack. 1027 * it can be safely saved at the stack.
954 */ 1028 */
955 struct ftrace_graph_ent_entry *field, saved; 1029 struct ftrace_graph_ent_entry saved;
956 trace_assign_type(field, entry); 1030 trace_assign_type(field, entry);
957 saved = *field; 1031 saved = *field;
958 return print_graph_entry(&saved, s, iter); 1032 return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s)
1030static void graph_trace_open(struct trace_iterator *iter) 1104static void graph_trace_open(struct trace_iterator *iter)
1031{ 1105{
1032 /* pid and depth on the last trace processed */ 1106 /* pid and depth on the last trace processed */
1033 struct fgraph_data *data = alloc_percpu(struct fgraph_data); 1107 struct fgraph_data *data;
1034 int cpu; 1108 int cpu;
1035 1109
1110 iter->private = NULL;
1111
1112 data = kzalloc(sizeof(*data), GFP_KERNEL);
1036 if (!data) 1113 if (!data)
1037 pr_warning("function graph tracer: not enough memory\n"); 1114 goto out_err;
1038 else 1115
1039 for_each_possible_cpu(cpu) { 1116 data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
1040 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); 1117 if (!data->cpu_data)
1041 int *depth = &(per_cpu_ptr(data, cpu)->depth); 1118 goto out_err_free;
1042 *pid = -1; 1119
1043 *depth = 0; 1120 for_each_possible_cpu(cpu) {
1044 } 1121 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1122 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1123 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1124 *pid = -1;
1125 *depth = 0;
1126 *ignore = 0;
1127 }
1045 1128
1046 iter->private = data; 1129 iter->private = data;
1130
1131 return;
1132
1133 out_err_free:
1134 kfree(data);
1135 out_err:
1136 pr_warning("function graph tracer: not enough memory\n");
1047} 1137}
1048 1138
1049static void graph_trace_close(struct trace_iterator *iter) 1139static void graph_trace_close(struct trace_iterator *iter)
1050{ 1140{
1051 free_percpu(iter->private); 1141 struct fgraph_data *data = iter->private;
1142
1143 if (data) {
1144 free_percpu(data->cpu_data);
1145 kfree(data);
1146 }
1052} 1147}
1053 1148
1054static struct tracer graph_trace __read_mostly = { 1149static struct tracer graph_trace __read_mostly = {
1055 .name = "function_graph", 1150 .name = "function_graph",
1056 .open = graph_trace_open, 1151 .open = graph_trace_open,
1152 .pipe_open = graph_trace_open,
1057 .close = graph_trace_close, 1153 .close = graph_trace_close,
1154 .pipe_close = graph_trace_close,
1058 .wait_pipe = poll_wait_pipe, 1155 .wait_pipe = poll_wait_pipe,
1059 .init = graph_trace_init, 1156 .init = graph_trace_init,
1060 .reset = graph_trace_reset, 1157 .reset = graph_trace_reset,
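
graph_trace_open() now nests the per-CPU state inside a normally allocated wrapper: kzalloc() the container, alloc_percpu() the per-CPU part, and unwind in reverse order on failure or close. A stripped-down sketch of that allocation pattern with hypothetical types:

        /* Sketch: per-CPU state hanging off a heap-allocated private struct. */
        #include <linux/percpu.h>
        #include <linux/slab.h>

        struct example_cpu_data { int depth; };

        struct example_data {
                struct example_cpu_data *cpu_data;      /* per-CPU allocation */
        };

        static struct example_data *example_open(void)
        {
                struct example_data *data;
                int cpu;

                data = kzalloc(sizeof(*data), GFP_KERNEL);
                if (!data)
                        return NULL;

                data->cpu_data = alloc_percpu(struct example_cpu_data);
                if (!data->cpu_data) {
                        kfree(data);
                        return NULL;
                }

                for_each_possible_cpu(cpu)
                        per_cpu_ptr(data->cpu_data, cpu)->depth = 0;

                return data;
        }

        static void example_close(struct example_data *data)
        {
                if (!data)
                        return;
                free_percpu(data->cpu_data);
                kfree(data);
        }
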
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 69543a905cd5..7b97000745f5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -20,10 +20,10 @@
20 20
21#define BTS_BUFFER_SIZE (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
22 22
23static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25 25
26#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27 27
28static int trace_hw_branches_enabled __read_mostly; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly; 29static int trace_hw_branches_suspended __read_mostly;
@@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly;
32 32
33static void bts_trace_init_cpu(int cpu) 33static void bts_trace_init_cpu(int cpu)
34{ 34{
35 per_cpu(tracer, cpu) = 35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, 36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 NULL, (size_t)-1, BTS_KERNEL); 37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
38 39
39 if (IS_ERR(per_cpu(tracer, cpu))) 40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
40 per_cpu(tracer, cpu) = NULL; 41 per_cpu(hwb_tracer, cpu) = NULL;
41} 42}
42 43
43static int bts_trace_init(struct trace_array *tr) 44static int bts_trace_init(struct trace_array *tr)
@@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr)
51 for_each_online_cpu(cpu) { 52 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu); 53 bts_trace_init_cpu(cpu);
53 54
54 if (likely(per_cpu(tracer, cpu))) 55 if (likely(per_cpu(hwb_tracer, cpu)))
55 trace_hw_branches_enabled = 1; 56 trace_hw_branches_enabled = 1;
56 } 57 }
57 trace_hw_branches_suspended = 0; 58 trace_hw_branches_suspended = 0;
@@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr)
67 68
68 get_online_cpus(); 69 get_online_cpus();
69 for_each_online_cpu(cpu) { 70 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) { 71 if (likely(per_cpu(hwb_tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu)); 72 ds_release_bts(per_cpu(hwb_tracer, cpu));
72 per_cpu(tracer, cpu) = NULL; 73 per_cpu(hwb_tracer, cpu) = NULL;
73 } 74 }
74 } 75 }
75 trace_hw_branches_enabled = 0; 76 trace_hw_branches_enabled = 0;
@@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr)
83 84
84 get_online_cpus(); 85 get_online_cpus();
85 for_each_online_cpu(cpu) 86 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu))) 87 if (likely(per_cpu(hwb_tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu)); 88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
88 trace_hw_branches_suspended = 0; 89 trace_hw_branches_suspended = 0;
89 put_online_cpus(); 90 put_online_cpus();
90} 91}
@@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr)
95 96
96 get_online_cpus(); 97 get_online_cpus();
97 for_each_online_cpu(cpu) 98 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu))) 99 if (likely(per_cpu(hwb_tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu)); 100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
100 trace_hw_branches_suspended = 1; 101 trace_hw_branches_suspended = 1;
101 put_online_cpus(); 102 put_online_cpus();
102} 103}
@@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
114 bts_trace_init_cpu(cpu); 115 bts_trace_init_cpu(cpu);
115 116
116 if (trace_hw_branches_suspended && 117 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu))) 118 likely(per_cpu(hwb_tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu)); 119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
119 } 120 }
120 break; 121 break;
121 122
122 case CPU_DOWN_PREPARE: 123 case CPU_DOWN_PREPARE:
123 /* The notification is sent with interrupts enabled. */ 124 /* The notification is sent with interrupts enabled. */
124 if (likely(per_cpu(tracer, cpu))) { 125 if (likely(per_cpu(hwb_tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu)); 126 ds_release_bts(per_cpu(hwb_tracer, cpu));
126 per_cpu(tracer, cpu) = NULL; 127 per_cpu(hwb_tracer, cpu) = NULL;
127 } 128 }
128 } 129 }
129 130
@@ -258,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
258 259
259 get_online_cpus(); 260 get_online_cpus();
260 for_each_online_cpu(cpu) 261 for_each_online_cpu(cpu)
261 if (likely(per_cpu(tracer, cpu))) 262 if (likely(per_cpu(hwb_tracer, cpu)))
262 ds_suspend_bts(per_cpu(tracer, cpu)); 263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
263 /* 264 /*
264 * We need to collect the trace on the respective cpu since ftrace 265 * We need to collect the trace on the respective cpu since ftrace
265 * implicitly adds the record for the current cpu. 266 * implicitly adds the record for the current cpu.
@@ -268,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
268 on_each_cpu(trace_bts_cpu, iter->tr, 1); 269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
269 270
270 for_each_online_cpu(cpu) 271 for_each_online_cpu(cpu)
271 if (likely(per_cpu(tracer, cpu))) 272 if (likely(per_cpu(hwb_tracer, cpu)))
272 ds_resume_bts(per_cpu(tracer, cpu)); 273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
273 put_online_cpus(); 274 put_online_cpus();
274} 275}
275 276
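
The per-CPU variables in this tracer pick up an hwb_ prefix so their symbol names stay unique; as I read the wider per-CPU rework in this series, the variable name itself feeds the accessor macros, so generic names like "tracer" or "buffer" risk clashing with other identifiers. A tiny sketch of declaring and using a prefixed per-CPU variable (names invented):

        /* Sketch: driver-prefixed per-CPU variable to keep its symbol unique. */
        #include <linux/percpu.h>

        struct example_tracer { int active; };

        static DEFINE_PER_CPU(struct example_tracer *, exmpl_tracer);

        static void example_poke(int cpu)
        {
                struct example_tracer *t = per_cpu(exmpl_tracer, cpu);

                if (t)
                        t->active = 1;
        }
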
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 3aa7eaa2114c..2974bc7538c7 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr,
151 goto out_unlock; 151 goto out_unlock;
152 152
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc);
154 156
155 if (data->critical_sequence != max_sequence) 157 if (data->critical_sequence != max_sequence)
156 goto out_unlock; 158 goto out_unlock;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b52d397e57eb..7ecab06547a5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1132,10 +1132,6 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1132 struct kprobe_trace_entry field; 1132 struct kprobe_trace_entry field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1133 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134 1134
1135 ret = trace_define_common_fields(event_call);
1136 if (ret)
1137 return ret;
1138
1139 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1135 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); 1136 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1141 /* Set argument names as fields */ 1137 /* Set argument names as fields */
@@ -1150,10 +1146,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1150 struct kretprobe_trace_entry field; 1146 struct kretprobe_trace_entry field;
1151 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1147 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1152 1148
1153 ret = trace_define_common_fields(event_call);
1154 if (ret)
1155 return ret;
1156
1157 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1149 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1158 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1150 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1159 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); 1151 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
@@ -1453,7 +1445,6 @@ static int register_probe_event(struct trace_probe *tp)
1453 call->unregfunc = probe_event_disable; 1445 call->unregfunc = probe_event_disable;
1454 1446
1455#ifdef CONFIG_EVENT_PROFILE 1447#ifdef CONFIG_EVENT_PROFILE
1456 atomic_set(&call->profile_count, -1);
1457 call->profile_enable = probe_profile_enable; 1448 call->profile_enable = probe_profile_enable;
1458 call->profile_disable = probe_profile_disable; 1449 call->profile_disable = probe_profile_disable;
1459#endif 1450#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index acb87d4a4ac1..faf37fa4408c 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -236,7 +236,8 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
236 mutex_lock(&ksym_tracer_mutex); 236 mutex_lock(&ksym_tracer_mutex);
237 237
238 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { 238 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
239 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr); 239 ret = trace_seq_printf(s, "%pS:",
240 (void *)(unsigned long)entry->attr.bp_addr);
240 if (entry->attr.bp_type == HW_BREAKPOINT_R) 241 if (entry->attr.bp_type == HW_BREAKPOINT_R)
241 ret = trace_seq_puts(s, "r--\n"); 242 ret = trace_seq_puts(s, "r--\n");
242 else if (entry->attr.bp_type == HW_BREAKPOINT_W) 243 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
@@ -278,21 +279,20 @@ static ssize_t ksym_trace_filter_write(struct file *file,
278{ 279{
279 struct trace_ksym *entry; 280 struct trace_ksym *entry;
280 struct hlist_node *node; 281 struct hlist_node *node;
281 char *input_string, *ksymname = NULL; 282 char *buf, *input_string, *ksymname = NULL;
282 unsigned long ksym_addr = 0; 283 unsigned long ksym_addr = 0;
283 int ret, op, changed = 0; 284 int ret, op, changed = 0;
284 285
285 input_string = kzalloc(count + 1, GFP_KERNEL); 286 buf = kzalloc(count + 1, GFP_KERNEL);
286 if (!input_string) 287 if (!buf)
287 return -ENOMEM; 288 return -ENOMEM;
288 289
289 if (copy_from_user(input_string, buffer, count)) { 290 ret = -EFAULT;
290 kfree(input_string); 291 if (copy_from_user(buf, buffer, count))
291 return -EFAULT; 292 goto out;
292 }
293 input_string[count] = '\0';
294 293
295 strstrip(input_string); 294 buf[count] = '\0';
295 input_string = strstrip(buf);
296 296
297 /* 297 /*
298 * Clear all breakpoints if: 298 * Clear all breakpoints if:
@@ -300,18 +300,16 @@ static ssize_t ksym_trace_filter_write(struct file *file,
300 * 2: echo 0 > ksym_trace_filter 300 * 2: echo 0 > ksym_trace_filter
301 * 3: echo "*:---" > ksym_trace_filter 301 * 3: echo "*:---" > ksym_trace_filter
302 */ 302 */
303 if (!input_string[0] || !strcmp(input_string, "0") || 303 if (!buf[0] || !strcmp(buf, "0") ||
304 !strcmp(input_string, "*:---")) { 304 !strcmp(buf, "*:---")) {
305 __ksym_trace_reset(); 305 __ksym_trace_reset();
306 kfree(input_string); 306 ret = 0;
307 return count; 307 goto out;
308 } 308 }
309 309
310 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); 310 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
311 if (ret < 0) { 311 if (ret < 0)
312 kfree(input_string); 312 goto out;
313 return ret;
314 }
315 313
316 mutex_lock(&ksym_tracer_mutex); 314 mutex_lock(&ksym_tracer_mutex);
317 315
@@ -322,7 +320,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
322 if (entry->attr.bp_type != op) 320 if (entry->attr.bp_type != op)
323 changed = 1; 321 changed = 1;
324 else 322 else
325 goto out; 323 goto out_unlock;
326 break; 324 break;
327 } 325 }
328 } 326 }
@@ -337,28 +335,24 @@ static ssize_t ksym_trace_filter_write(struct file *file,
337 if (IS_ERR(entry->ksym_hbp)) 335 if (IS_ERR(entry->ksym_hbp))
338 ret = PTR_ERR(entry->ksym_hbp); 336 ret = PTR_ERR(entry->ksym_hbp);
339 else 337 else
340 goto out; 338 goto out_unlock;
341 } 339 }
342 /* Error or "symbol:---" case: drop it */ 340 /* Error or "symbol:---" case: drop it */
343 ksym_filter_entry_count--; 341 ksym_filter_entry_count--;
344 hlist_del_rcu(&(entry->ksym_hlist)); 342 hlist_del_rcu(&(entry->ksym_hlist));
345 synchronize_rcu(); 343 synchronize_rcu();
346 kfree(entry); 344 kfree(entry);
347 goto out; 345 goto out_unlock;
348 } else { 346 } else {
349 /* Check for malformed request: (4) */ 347 /* Check for malformed request: (4) */
350 if (op == 0) 348 if (op)
351 goto out; 349 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
352 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
353 } 350 }
354out: 351out_unlock:
355 mutex_unlock(&ksym_tracer_mutex); 352 mutex_unlock(&ksym_tracer_mutex);
356 353out:
357 kfree(input_string); 354 kfree(buf);
358 355 return !ret ? count : ret;
359 if (!ret)
360 ret = count;
361 return ret;
362} 356}
363 357
364static const struct file_operations ksym_tracing_fops = { 358static const struct file_operations ksym_tracing_fops = {
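
The ksym_trace_filter_write() rewrite keeps the raw allocated buffer separate from the strstrip()'d pointer (strstrip() may return a pointer past leading whitespace, so freeing its return value would be a bug) and routes every exit through a single label that frees the original buffer. A condensed userspace sketch of that error-handling shape, with an invented parser standing in for the real one:

        #include <ctype.h>
        #include <errno.h>
        #include <stdlib.h>
        #include <string.h>

        /* Stand-in for the real parser; accepts any non-empty command. */
        static int parse_command(const char *cmd)
        {
                return cmd[0] ? 0 : -EINVAL;
        }

        static int handle_write(const char *user_data, size_t count)
        {
                char *buf, *input, *end;
                int ret;

                buf = calloc(1, count + 1);
                if (!buf)
                        return -ENOMEM;

                memcpy(buf, user_data, count);  /* stands in for copy_from_user() */

                input = buf;                    /* strip surrounding blanks */
                while (isspace((unsigned char)*input))
                        input++;
                end = input + strlen(input);
                while (end > input && isspace((unsigned char)end[-1]))
                        *--end = '\0';

                ret = parse_command(input);
                free(buf);                      /* always free the original pointer */
                return ret < 0 ? ret : (int)count;
        }
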
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b6c12c6a1bcd..8e46b3323cdc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
 
-void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
 	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+	int ret;
+
+	ret = seq_write(m, s->buffer, len);
 
-	seq_write(m, s->buffer, len);
+	/*
+	 * Only reset this buffer if we successfully wrote to the
+	 * seq_file buffer.
+	 */
+	if (!ret)
+		trace_seq_init(s);
 
-	trace_seq_init(s);
+	return ret;
 }
 
 enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	va_list ap;
 	int ret;
 
-	if (!len)
+	if (s->full || !len)
 		return 0;
 
 	va_start(ap, fmt);
@@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	va_end(ap);
 
 	/* If we can't write it all, don't bother writing anything */
-	if (ret >= len)
+	if (ret >= len) {
+		s->full = 1;
 		return 0;
+	}
 
 	s->len += ret;
 
@@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
 	int len = (PAGE_SIZE - 1) - s->len;
 	int ret;
 
-	if (!len)
+	if (s->full || !len)
 		return 0;
 
 	ret = vsnprintf(s->buffer + s->len, len, fmt, args);
 
 	/* If we can't write it all, don't bother writing anything */
-	if (ret >= len)
+	if (ret >= len) {
+		s->full = 1;
 		return 0;
+	}
 
 	s->len += ret;
 
@@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 	int len = (PAGE_SIZE - 1) - s->len;
 	int ret;
 
-	if (!len)
+	if (s->full || !len)
 		return 0;
 
 	ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
 
 	/* If we can't write it all, don't bother writing anything */
-	if (ret >= len)
+	if (ret >= len) {
+		s->full = 1;
 		return 0;
+	}
 
 	s->len += ret;
 
@@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
 {
 	int len = strlen(str);
 
-	if (len > ((PAGE_SIZE - 1) - s->len))
+	if (s->full)
+		return 0;
+
+	if (len > ((PAGE_SIZE - 1) - s->len)) {
+		s->full = 1;
 		return 0;
+	}
 
 	memcpy(s->buffer + s->len, str, len);
 	s->len += len;
@@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
 
 int trace_seq_putc(struct trace_seq *s, unsigned char c)
 {
-	if (s->len >= (PAGE_SIZE - 1))
+	if (s->full)
 		return 0;
 
+	if (s->len >= (PAGE_SIZE - 1)) {
+		s->full = 1;
+		return 0;
+	}
+
 	s->buffer[s->len++] = c;
 
 	return 1;
@@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
 
 int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
 {
-	if (len > ((PAGE_SIZE - 1) - s->len))
+	if (s->full)
 		return 0;
 
+	if (len > ((PAGE_SIZE - 1) - s->len)) {
+		s->full = 1;
+		return 0;
+	}
+
 	memcpy(s->buffer + s->len, mem, len);
 	s->len += len;
 
@@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
 	const unsigned char *data = mem;
 	int i, j;
 
+	if (s->full)
+		return 0;
+
 #ifdef __BIG_ENDIAN
 	for (i = 0, j = 0; i < len; i++) {
 #else
@@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
 {
 	void *ret;
 
-	if (len > ((PAGE_SIZE - 1) - s->len))
+	if (s->full)
+		return 0;
+
+	if (len > ((PAGE_SIZE - 1) - s->len)) {
+		s->full = 1;
 		return NULL;
+	}
 
 	ret = s->buffer + s->len;
 	s->len += len;
@@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
 {
 	unsigned char *p;
 
-	if (s->len >= (PAGE_SIZE - 1))
+	if (s->full)
+		return 0;
+
+	if (s->len >= (PAGE_SIZE - 1)) {
+		s->full = 1;
 		return 0;
+	}
+
 	p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
 	if (!IS_ERR(p)) {
 		p = mangle_path(s->buffer + s->len, p, "\n");
@@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
 		return 1;
 	}
 
+	s->full = 1;
 	return 0;
 }
 
@@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 	unsigned long vmstart = 0;
 	int ret = 1;
 
+	if (s->full)
+		return 0;
+
 	if (mm) {
 		const struct vm_area_struct *vma;
 
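
All of the trace_output.c hunks add the same guard: a new full flag in the sequence buffer that is set the first time a write does not fit in the page-sized buffer, after which later writes return immediately instead of emitting a truncated record. A generic sketch of that latch, using an illustrative bounded_seq type rather than the kernel's struct trace_seq:

#include <linux/kernel.h>
#include <stdarg.h>

#define BOUNDED_SEQ_SIZE	4096

struct bounded_seq {
	char		buffer[BOUNDED_SEQ_SIZE];
	unsigned int	len;
	int		full;
};

static int bounded_seq_printf(struct bounded_seq *s, const char *fmt, ...)
{
	int left = (BOUNDED_SEQ_SIZE - 1) - s->len;
	va_list ap;
	int ret;

	if (s->full || !left)
		return 0;

	va_start(ap, fmt);
	ret = vsnprintf(s->buffer + s->len, left, fmt, ap);
	va_end(ap);

	/* Did not fit: latch the overflow and report nothing written. */
	if (ret >= left) {
		s->full = 1;
		return 0;
	}

	s->len += ret;
	return ret;
}
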
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 26185d727676..0271742abb8d 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -28,8 +28,8 @@ static int wakeup_current_cpu;
 static unsigned wakeup_prio = -1;
 static int wakeup_rt;
 
-static raw_spinlock_t wakeup_lock =
-	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t wakeup_lock =
+	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 
 static void __wakeup_reset(struct trace_array *tr);
 
@@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 		goto out;
 
 	local_irq_save(flags);
-	__raw_spin_lock(&wakeup_lock);
+	arch_spin_lock(&wakeup_lock);
 
 	/* We could race with grabbing wakeup_lock */
 	if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 
 out_unlock:
 	__wakeup_reset(wakeup_trace);
-	__raw_spin_unlock(&wakeup_lock);
+	arch_spin_unlock(&wakeup_lock);
 	local_irq_restore(flags);
 out:
 	atomic_dec(&wakeup_trace->data[cpu]->disabled);
@@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr)
 	tracing_reset_online_cpus(tr);
 
 	local_irq_save(flags);
-	__raw_spin_lock(&wakeup_lock);
+	arch_spin_lock(&wakeup_lock);
 	__wakeup_reset(tr);
-	__raw_spin_unlock(&wakeup_lock);
+	arch_spin_unlock(&wakeup_lock);
 	local_irq_restore(flags);
 }
 
@@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 		goto out;
 
 	/* interrupts should be off from try_to_wake_up */
-	__raw_spin_lock(&wakeup_lock);
+	arch_spin_lock(&wakeup_lock);
 
 	/* check for races. */
 	if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 
 out_locked:
-	__raw_spin_unlock(&wakeup_lock);
+	arch_spin_unlock(&wakeup_lock);
 out:
 	atomic_dec(&wakeup_trace->data[cpu]->disabled);
 }
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index dc98309e839a..280fea470d67 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 
 	/* Don't allow flipping of max traces now */
 	local_irq_save(flags);
-	__raw_spin_lock(&ftrace_max_lock);
+	arch_spin_lock(&ftrace_max_lock);
 
 	cnt = ring_buffer_entries(tr->buffer);
 
@@ -85,7 +85,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 		break;
 	}
 	tracing_on();
-	__raw_spin_unlock(&ftrace_max_lock);
+	arch_spin_unlock(&ftrace_max_lock);
 	local_irq_restore(flags);
 
 	if (count)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8504ac71e4e8..678a5120ee30 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = {
 };
 
 static unsigned long max_stack_size;
-static raw_spinlock_t max_stack_lock =
-	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t max_stack_lock =
+	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 
 static int stack_trace_disabled __read_mostly;
 static DEFINE_PER_CPU(int, trace_active);
@@ -54,7 +54,7 @@ static inline void check_stack(void)
 		return;
 
 	local_irq_save(flags);
-	__raw_spin_lock(&max_stack_lock);
+	arch_spin_lock(&max_stack_lock);
 
 	/* a race could have already updated it */
 	if (this_size <= max_stack_size)
@@ -103,7 +103,7 @@ static inline void check_stack(void)
 	}
 
 out:
-	__raw_spin_unlock(&max_stack_lock);
+	arch_spin_unlock(&max_stack_lock);
 	local_irq_restore(flags);
 }
 
@@ -171,9 +171,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
 		return ret;
 
 	local_irq_save(flags);
-	__raw_spin_lock(&max_stack_lock);
+	arch_spin_lock(&max_stack_lock);
 	*ptr = val;
-	__raw_spin_unlock(&max_stack_lock);
+	arch_spin_unlock(&max_stack_lock);
 	local_irq_restore(flags);
 
 	return count;
@@ -207,7 +207,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
 	local_irq_disable();
-	__raw_spin_lock(&max_stack_lock);
+	arch_spin_lock(&max_stack_lock);
 
 	if (*pos == 0)
 		return SEQ_START_TOKEN;
@@ -217,7 +217,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 
 static void t_stop(struct seq_file *m, void *p)
 {
-	__raw_spin_unlock(&max_stack_lock);
+	arch_spin_unlock(&max_stack_lock);
 	local_irq_enable();
 }
 
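
The trace_sched_wakeup.c, trace_selftest.c and trace_stack.c hunks are one mechanical rename: the old raw_spinlock_t / __raw_spin_lock() low-level lock becomes arch_spinlock_t / arch_spin_lock(), freeing the raw_* names for other use. The usage pattern itself is unchanged; a small sketch under that assumption (example_lock and example_record are made-up names):

#include <linux/irqflags.h>
#include <linux/spinlock.h>

static arch_spinlock_t example_lock =
	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;

static unsigned long example_max;

static void example_record(unsigned long val)
{
	unsigned long flags;

	/* arch_spin_lock() does no irq or preemption handling of its own. */
	local_irq_save(flags);
	arch_spin_lock(&example_lock);

	if (val > example_max)
		example_max = val;

	arch_spin_unlock(&example_lock);
	local_irq_restore(flags);
}
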
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 57501d90096a..75289f372dd2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -217,10 +217,6 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
 	int i;
 	int offset = offsetof(typeof(trace), args);
 
-	ret = trace_define_common_fields(call);
-	if (ret)
-		return ret;
-
 	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
 	if (ret)
 		return ret;
@@ -241,10 +237,6 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
 	struct syscall_trace_exit trace;
 	int ret;
 
-	ret = trace_define_common_fields(call);
-	if (ret)
-		return ret;
-
 	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
 	if (ret)
 		return ret;
@@ -333,10 +325,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_refcount_enter)
 		ret = register_trace_sys_enter(ftrace_syscall_enter);
-	if (ret) {
-		pr_info("event trace: Could not activate"
-				"syscall entry trace point");
-	} else {
+	if (!ret) {
 		set_bit(num, enabled_enter_syscalls);
 		sys_refcount_enter++;
 	}
@@ -370,10 +359,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_refcount_exit)
 		ret = register_trace_sys_exit(ftrace_syscall_exit);
-	if (ret) {
-		pr_info("event trace: Could not activate"
-				"syscall exit trace point");
-	} else {
+	if (!ret) {
 		set_bit(num, enabled_exit_syscalls);
 		sys_refcount_exit++;
 	}
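
The trace_syscalls.c hunks drop the now-redundant trace_define_common_fields() calls and keep only the success path of tracepoint registration: the underlying tracepoint is registered once, and a per-syscall bit plus a refcount track which events are enabled. A hedged sketch of that bookkeeping with illustrative names (example_register_backend is a stub standing in for a call such as register_trace_sys_enter()):

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/mutex.h>

#define NR_EXAMPLE_EVENTS	64

static DEFINE_MUTEX(example_lock);
static DECLARE_BITMAP(enabled_example_events, NR_EXAMPLE_EVENTS);
static int example_refcount;

/* Stub for the shared backend registration. */
static int example_register_backend(void)
{
	return 0;
}

static int example_enable_event(int num)
{
	int ret = 0;

	if (num < 0 || num >= NR_EXAMPLE_EVENTS)
		return -ENOSYS;

	mutex_lock(&example_lock);
	/* Only the first enabled event registers the shared backend. */
	if (!example_refcount)
		ret = example_register_backend();
	if (!ret) {
		set_bit(num, enabled_example_events);
		example_refcount++;
	}
	mutex_unlock(&example_lock);
	return ret;
}
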
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 000000000000..eb27fd3430a2
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,44 @@
+
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+
+/*
+ * Request a notification when the current cpu returns to userspace. Must be
+ * called in atomic context. The notifier will also be called in atomic
+ * context.
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+	set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+	hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+
+/*
+ * Removes a registered user return notifier. Must be called from atomic
+ * context, and from the same cpu registration occured in.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+	hlist_del(&urn->link);
+	if (hlist_empty(&__get_cpu_var(return_notifier_list)))
+		clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+
+/* Calls registered user return notifiers */
+void fire_user_return_notifiers(void)
+{
+	struct user_return_notifier *urn;
+	struct hlist_node *tmp1, *tmp2;
+	struct hlist_head *head;
+
+	head = &get_cpu_var(return_notifier_list);
+	hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+		urn->on_user_return(urn);
+	put_cpu_var(return_notifier_list);
+}
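
kernel/user-return-notifier.c above is a complete new facility, so a hypothetical caller is easy to sketch from it: register from atomic context on the cpu you care about, and the on_user_return hook runs (also in atomic context) when that cpu heads back to userspace, until the notifier is unregistered. It was added for KVM's MSR switching; the names below are made up:

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/user-return-notifier.h>

static void example_on_user_return(struct user_return_notifier *urn)
{
	/* Runs in atomic context just before this cpu re-enters userspace. */
	pr_debug("returning to userspace on cpu %d\n", raw_smp_processor_id());
}

static struct user_return_notifier example_urn = {
	.on_user_return	= example_on_user_return,
};

static void example_arm_notifier(void)
{
	/*
	 * Must be called in atomic context (e.g. with preemption disabled) so
	 * the registration and the eventual user return stay on the same cpu.
	 */
	user_return_notifier_register(&example_urn);
}
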
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 69eae358a726..a2cd77e70d4d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write,
 #define proc_do_uts_string NULL
 #endif
 
-
-#ifdef CONFIG_SYSCTL_SYSCALL
-/* The generic string strategy routine: */
-static int sysctl_uts_string(ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	struct ctl_table uts_table;
-	int r, write;
-	write = newval && newlen;
-	memcpy(&uts_table, table, sizeof(uts_table));
-	uts_table.data = get_uts(table, write);
-	r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
-	put_uts(table, write, uts_table.data);
-	return r;
-}
-#else
-#define sysctl_uts_string NULL
-#endif
-
 static struct ctl_table uts_kern_table[] = {
 	{
-		.ctl_name = KERN_OSTYPE,
 		.procname = "ostype",
 		.data = init_uts_ns.name.sysname,
 		.maxlen = sizeof(init_uts_ns.name.sysname),
 		.mode = 0444,
 		.proc_handler = proc_do_uts_string,
-		.strategy = sysctl_uts_string,
 	},
 	{
-		.ctl_name = KERN_OSRELEASE,
 		.procname = "osrelease",
 		.data = init_uts_ns.name.release,
 		.maxlen = sizeof(init_uts_ns.name.release),
 		.mode = 0444,
 		.proc_handler = proc_do_uts_string,
-		.strategy = sysctl_uts_string,
 	},
 	{
-		.ctl_name = KERN_VERSION,
 		.procname = "version",
 		.data = init_uts_ns.name.version,
 		.maxlen = sizeof(init_uts_ns.name.version),
 		.mode = 0444,
 		.proc_handler = proc_do_uts_string,
-		.strategy = sysctl_uts_string,
 	},
 	{
-		.ctl_name = KERN_NODENAME,
 		.procname = "hostname",
 		.data = init_uts_ns.name.nodename,
 		.maxlen = sizeof(init_uts_ns.name.nodename),
 		.mode = 0644,
 		.proc_handler = proc_do_uts_string,
-		.strategy = sysctl_uts_string,
 	},
 	{
-		.ctl_name = KERN_DOMAINNAME,
 		.procname = "domainname",
 		.data = init_uts_ns.name.domainname,
 		.maxlen = sizeof(init_uts_ns.name.domainname),
 		.mode = 0644,
 		.proc_handler = proc_do_uts_string,
-		.strategy = sysctl_uts_string,
 	},
 	{}
 };
 
 static struct ctl_table uts_root_table[] = {
 	{
-		.ctl_name = CTL_KERN,
 		.procname = "kernel",
 		.mode = 0555,
 		.child = uts_kern_table,
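
The utsname_sysctl.c hunk is part of the removal of the binary sysctl paths: .ctl_name and .strategy disappear and only the procfs-facing fields remain. A minimal sketch of a table written in this trimmed style and registered with the existing register_sysctl_table()/proc_dostring() interfaces; the example_* names are made up:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/sysctl.h>

static char example_string[64] = "hello";

static struct ctl_table example_table[] = {
	{
		.procname	= "example_string",
		.data		= example_string,
		.maxlen		= sizeof(example_string),
		.mode		= 0644,
		.proc_handler	= proc_dostring,
	},
	{}
};

static struct ctl_table example_root[] = {
	{
		.procname	= "example",
		.mode		= 0555,
		.child		= example_table,
	},
	{}
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	example_header = register_sysctl_table(example_root);
	return example_header ? 0 : -ENOMEM;
}
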
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 12328147132c..dee48658805c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
 #endif
 };
 
+#ifdef CONFIG_DEBUG_OBJECTS_WORK
+
+static struct debug_obj_descr work_debug_descr;
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int work_fixup_init(void *addr, enum debug_obj_state state)
+{
+	struct work_struct *work = addr;
+
+	switch (state) {
+	case ODEBUG_STATE_ACTIVE:
+		cancel_work_sync(work);
+		debug_object_init(work, &work_debug_descr);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int work_fixup_activate(void *addr, enum debug_obj_state state)
+{
+	struct work_struct *work = addr;
+
+	switch (state) {
+
+	case ODEBUG_STATE_NOTAVAILABLE:
+		/*
+		 * This is not really a fixup. The work struct was
+		 * statically initialized. We just make sure that it
+		 * is tracked in the object tracker.
+		 */
+		if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
+			debug_object_init(work, &work_debug_descr);
+			debug_object_activate(work, &work_debug_descr);
+			return 0;
+		}
+		WARN_ON_ONCE(1);
+		return 0;
+
+	case ODEBUG_STATE_ACTIVE:
+		WARN_ON(1);
+
+	default:
+		return 0;
+	}
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int work_fixup_free(void *addr, enum debug_obj_state state)
+{
+	struct work_struct *work = addr;
+
+	switch (state) {
+	case ODEBUG_STATE_ACTIVE:
+		cancel_work_sync(work);
+		debug_object_free(work, &work_debug_descr);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static struct debug_obj_descr work_debug_descr = {
+	.name = "work_struct",
+	.fixup_init = work_fixup_init,
+	.fixup_activate = work_fixup_activate,
+	.fixup_free = work_fixup_free,
+};
+
+static inline void debug_work_activate(struct work_struct *work)
+{
+	debug_object_activate(work, &work_debug_descr);
+}
+
+static inline void debug_work_deactivate(struct work_struct *work)
+{
+	debug_object_deactivate(work, &work_debug_descr);
+}
+
+void __init_work(struct work_struct *work, int onstack)
+{
+	if (onstack)
+		debug_object_init_on_stack(work, &work_debug_descr);
+	else
+		debug_object_init(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(__init_work);
+
+void destroy_work_on_stack(struct work_struct *work)
+{
+	debug_object_free(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_work_on_stack);
+
+#else
+static inline void debug_work_activate(struct work_struct *work) { }
+static inline void debug_work_deactivate(struct work_struct *work) { }
+#endif
+
 /* Serializes the accesses to the list of workqueues. */
 static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 {
 	unsigned long flags;
 
+	debug_work_activate(work);
 	spin_lock_irqsave(&cwq->lock, flags);
 	insert_work(cwq, work, &cwq->worklist);
 	spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		struct lockdep_map lockdep_map = work->lockdep_map;
 #endif
 		trace_workqueue_execution(cwq->thread, work);
+		debug_work_deactivate(work);
 		cwq->current_work = work;
 		list_del_init(cwq->worklist.next);
 		spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 			struct wq_barrier *barr, struct list_head *head)
 {
-	INIT_WORK(&barr->work, wq_barrier_func);
+	/*
+	 * debugobject calls are safe here even with cwq->lock locked
+	 * as we know for sure that this will not trigger any of the
+	 * checks and call back into the fixup functions where we
+	 * might deadlock.
+	 */
+	INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
 
 	init_completion(&barr->done);
 
+	debug_work_activate(&barr->work);
 	insert_work(cwq, &barr->work, head);
 }
 
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 	}
 	spin_unlock_irq(&cwq->lock);
 
-	if (active)
+	if (active) {
 		wait_for_completion(&barr.done);
+		destroy_work_on_stack(&barr.work);
+	}
 
 	return active;
 }
@@ -451,6 +572,7 @@ out:
 		return 0;
 
 	wait_for_completion(&barr.done);
+	destroy_work_on_stack(&barr.work);
 	return 1;
 }
 EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
 	 */
 	smp_rmb();
 	if (cwq == get_wq_data(work)) {
+		debug_work_deactivate(work);
 		list_del_init(&work->entry);
 		ret = 1;
 	}
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
 	}
 	spin_unlock_irq(&cwq->lock);
 
-	if (unlikely(running))
+	if (unlikely(running)) {
 		wait_for_completion(&barr.done);
+		destroy_work_on_stack(&barr.work);
+	}
 }
 
 static void wait_on_work(struct work_struct *work)
@@ -692,31 +817,29 @@ int schedule_on_each_cpu(work_func_t func)
 	if (!works)
 		return -ENOMEM;
 
+	get_online_cpus();
+
 	/*
-	 * when running in keventd don't schedule a work item on itself.
-	 * Can just call directly because the work queue is already bound.
-	 * This also is faster.
-	 * Make this a generic parameter for other workqueues?
+	 * When running in keventd don't schedule a work item on
+	 * itself. Can just call directly because the work queue is
+	 * already bound. This also is faster.
 	 */
-	if (current_is_keventd()) {
+	if (current_is_keventd())
 		orig = raw_smp_processor_id();
-		INIT_WORK(per_cpu_ptr(works, orig), func);
-		func(per_cpu_ptr(works, orig));
-	}
 
-	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
-		if (cpu == orig)
-			continue;
 		INIT_WORK(work, func);
-		schedule_work_on(cpu, work);
-	}
-	for_each_online_cpu(cpu) {
 		if (cpu != orig)
-			flush_work(per_cpu_ptr(works, cpu));
+			schedule_work_on(cpu, work);
 	}
+	if (orig >= 0)
+		func(per_cpu_ptr(works, orig));
+
+	for_each_online_cpu(cpu)
+		flush_work(per_cpu_ptr(works, cpu));
+
 	put_online_cpus();
 	free_percpu(works);
 	return 0;
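
The workqueue.c changes wire struct work_struct into the debugobjects tracker, which is why on-stack work items now need the INIT_WORK_ON_STACK()/destroy_work_on_stack() pair that insert_wq_barrier() and its callers use above. A sketch of that pattern for a made-up barrier-style wait (the example_* names are illustrative):

#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct example_barrier {
	struct work_struct	work;
	struct completion	done;
};

static void example_barrier_func(struct work_struct *work)
{
	struct example_barrier *b =
		container_of(work, struct example_barrier, work);

	complete(&b->done);
}

static void example_wait_for_keventd(void)
{
	struct example_barrier b;

	/* On-stack work must use the _ON_STACK initializer ... */
	INIT_WORK_ON_STACK(&b.work, example_barrier_func);
	init_completion(&b.done);

	schedule_work(&b.work);
	wait_for_completion(&b.done);

	/* ... and must be torn down once it is known to be idle. */
	destroy_work_on_stack(&b.work);
}
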