aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks202
-rw-r--r--kernel/Makefile6
-rw-r--r--kernel/capability.c15
-rw-r--r--kernel/cpu.c23
-rw-r--r--kernel/cpuset.c45
-rw-r--r--kernel/exit.c31
-rw-r--r--kernel/fork.c10
-rw-r--r--kernel/futex.c10
-rw-r--r--kernel/hrtimer.c120
-rw-r--r--kernel/hung_task.c2
-rw-r--r--kernel/hw_breakpoint.c453
-rw-r--r--kernel/irq/chip.c6
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/irq/proc.c40
-rw-r--r--kernel/irq/spurious.c16
-rw-r--r--kernel/itimer.c7
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kgdb.c58
-rw-r--r--kernel/kmod.c8
-rw-r--r--kernel/kprobes.c72
-rw-r--r--kernel/lockdep.c18
-rw-r--r--kernel/mutex.c4
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/perf_event.c676
-rw-r--r--kernel/pm_qos_params.c20
-rw-r--r--kernel/posix-cpu-timers.c5
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/hibernate.c30
-rw-r--r--kernel/power/main.c1
-rw-r--r--kernel/power/process.c14
-rw-r--r--kernel/power/swap.c107
-rw-r--r--kernel/power/swsusp.c130
-rw-r--r--kernel/printk.c7
-rw-r--r--kernel/rcupdate.c122
-rw-r--r--kernel/rcutiny.c282
-rw-r--r--kernel/rcutorture.c65
-rw-r--r--kernel/rcutree.c465
-rw-r--r--kernel/rcutree.h69
-rw-r--r--kernel/rcutree_plugin.h309
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/resource.c26
-rw-r--r--kernel/sched.c466
-rw-r--r--kernel/sched_debug.c17
-rw-r--r--kernel/sched_fair.c220
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_idletask.c2
-rw-r--r--kernel/sched_rt.c63
-rw-r--r--kernel/signal.c73
-rw-r--r--kernel/slow-work.c7
-rw-r--r--kernel/smp.c56
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/spinlock.c310
-rw-r--r--kernel/srcu.c74
-rw-r--r--kernel/sys.c35
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c916
-rw-r--r--kernel/sysctl_binary.c1507
-rw-r--r--kernel/sysctl_check.c1376
-rw-r--r--kernel/time.c31
-rw-r--r--kernel/time/clockevents.c13
-rw-r--r--kernel/time/clocksource.c105
-rw-r--r--kernel/time/tick-oneshot.c4
-rw-r--r--kernel/time/tick-sched.c141
-rw-r--r--kernel/time/timecompare.c6
-rw-r--r--kernel/time/timekeeping.c125
-rw-r--r--kernel/time/timer_list.c15
-rw-r--r--kernel/trace/Kconfig38
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/ftrace.c375
-rw-r--r--kernel/trace/ring_buffer.c24
-rw-r--r--kernel/trace/ring_buffer_benchmark.c85
-rw-r--r--kernel/trace/trace.c68
-rw-r--r--kernel/trace/trace.h82
-rw-r--r--kernel/trace/trace_clock.c8
-rw-r--r--kernel/trace/trace_entries.h16
-rw-r--r--kernel/trace/trace_event_profile.c43
-rw-r--r--kernel/trace/trace_events.c191
-rw-r--r--kernel/trace/trace_events_filter.c423
-rw-r--r--kernel/trace/trace_export.c43
-rw-r--r--kernel/trace/trace_functions_graph.c165
-rw-r--r--kernel/trace/trace_kprobe.c1542
-rw-r--r--kernel/trace/trace_ksym.c551
-rw-r--r--kernel/trace/trace_output.c75
-rw-r--r--kernel/trace/trace_selftest.c55
-rw-r--r--kernel/trace/trace_syscalls.c229
-rw-r--r--kernel/user-return-notifier.c44
-rw-r--r--kernel/utsname_sysctl.c31
-rw-r--r--kernel/workqueue.c131
88 files changed, 8974 insertions, 4312 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000000..88c92fb44618
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,202 @@
1#
2# The ARCH_INLINE foo is necessary because select ignores "depends on"
3#
4config ARCH_INLINE_SPIN_TRYLOCK
5 bool
6
7config ARCH_INLINE_SPIN_TRYLOCK_BH
8 bool
9
10config ARCH_INLINE_SPIN_LOCK
11 bool
12
13config ARCH_INLINE_SPIN_LOCK_BH
14 bool
15
16config ARCH_INLINE_SPIN_LOCK_IRQ
17 bool
18
19config ARCH_INLINE_SPIN_LOCK_IRQSAVE
20 bool
21
22config ARCH_INLINE_SPIN_UNLOCK
23 bool
24
25config ARCH_INLINE_SPIN_UNLOCK_BH
26 bool
27
28config ARCH_INLINE_SPIN_UNLOCK_IRQ
29 bool
30
31config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
32 bool
33
34
35config ARCH_INLINE_READ_TRYLOCK
36 bool
37
38config ARCH_INLINE_READ_LOCK
39 bool
40
41config ARCH_INLINE_READ_LOCK_BH
42 bool
43
44config ARCH_INLINE_READ_LOCK_IRQ
45 bool
46
47config ARCH_INLINE_READ_LOCK_IRQSAVE
48 bool
49
50config ARCH_INLINE_READ_UNLOCK
51 bool
52
53config ARCH_INLINE_READ_UNLOCK_BH
54 bool
55
56config ARCH_INLINE_READ_UNLOCK_IRQ
57 bool
58
59config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
60 bool
61
62
63config ARCH_INLINE_WRITE_TRYLOCK
64 bool
65
66config ARCH_INLINE_WRITE_LOCK
67 bool
68
69config ARCH_INLINE_WRITE_LOCK_BH
70 bool
71
72config ARCH_INLINE_WRITE_LOCK_IRQ
73 bool
74
75config ARCH_INLINE_WRITE_LOCK_IRQSAVE
76 bool
77
78config ARCH_INLINE_WRITE_UNLOCK
79 bool
80
81config ARCH_INLINE_WRITE_UNLOCK_BH
82 bool
83
84config ARCH_INLINE_WRITE_UNLOCK_IRQ
85 bool
86
87config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
88 bool
89
90#
91# lock_* functions are inlined when:
92# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
93#
94# trylock_* functions are inlined when:
95# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
96#
97# unlock and unlock_irq functions are inlined when:
98# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
99# or
100# - DEBUG_SPINLOCK=n and PREEMPT=n
101#
102# unlock_bh and unlock_irqrestore functions are inlined when:
103# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
104#
105
106config INLINE_SPIN_TRYLOCK
107 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
108
109config INLINE_SPIN_TRYLOCK_BH
110 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
111
112config INLINE_SPIN_LOCK
113 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
114
115config INLINE_SPIN_LOCK_BH
116 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
117 ARCH_INLINE_SPIN_LOCK_BH
118
119config INLINE_SPIN_LOCK_IRQ
120 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
121 ARCH_INLINE_SPIN_LOCK_IRQ
122
123config INLINE_SPIN_LOCK_IRQSAVE
124 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
125 ARCH_INLINE_SPIN_LOCK_IRQSAVE
126
127config INLINE_SPIN_UNLOCK
128 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
129
130config INLINE_SPIN_UNLOCK_BH
131 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
132
133config INLINE_SPIN_UNLOCK_IRQ
134 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
135
136config INLINE_SPIN_UNLOCK_IRQRESTORE
137 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
138
139
140config INLINE_READ_TRYLOCK
141 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
142
143config INLINE_READ_LOCK
144 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
145
146config INLINE_READ_LOCK_BH
147 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
148 ARCH_INLINE_READ_LOCK_BH
149
150config INLINE_READ_LOCK_IRQ
151 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
152 ARCH_INLINE_READ_LOCK_IRQ
153
154config INLINE_READ_LOCK_IRQSAVE
155 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
156 ARCH_INLINE_READ_LOCK_IRQSAVE
157
158config INLINE_READ_UNLOCK
159 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
160
161config INLINE_READ_UNLOCK_BH
162 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
163
164config INLINE_READ_UNLOCK_IRQ
165 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
166
167config INLINE_READ_UNLOCK_IRQRESTORE
168 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
169
170
171config INLINE_WRITE_TRYLOCK
172 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
173
174config INLINE_WRITE_LOCK
175 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
176
177config INLINE_WRITE_LOCK_BH
178 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
179 ARCH_INLINE_WRITE_LOCK_BH
180
181config INLINE_WRITE_LOCK_IRQ
182 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
183 ARCH_INLINE_WRITE_LOCK_IRQ
184
185config INLINE_WRITE_LOCK_IRQSAVE
186 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
187 ARCH_INLINE_WRITE_LOCK_IRQSAVE
188
189config INLINE_WRITE_UNLOCK
190 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
191
192config INLINE_WRITE_UNLOCK_BH
193 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
194
195config INLINE_WRITE_UNLOCK_IRQ
196 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
197
198config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200
201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index d7c13d249b2d..864ff75d65f2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
24endif 25endif
25 26
26obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
@@ -82,6 +83,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
82obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
83obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
84obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
86obj-$(CONFIG_TINY_RCU) += rcutiny.o
85obj-$(CONFIG_RELAY) += relay.o 87obj-$(CONFIG_RELAY) += relay.o
86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 88obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 89obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -96,6 +98,8 @@ obj-$(CONFIG_SMP) += sched_cpupri.o
96obj-$(CONFIG_SLOW_WORK) += slow-work.o 98obj-$(CONFIG_SLOW_WORK) += slow-work.o
97obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o 99obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
98obj-$(CONFIG_PERF_EVENTS) += perf_event.o 100obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
99 103
100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f5..7f876e60521f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
29EXPORT_SYMBOL(__cap_full_set); 29EXPORT_SYMBOL(__cap_full_set);
30EXPORT_SYMBOL(__cap_init_eff_set); 30EXPORT_SYMBOL(__cap_init_eff_set);
31 31
32#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
33int file_caps_enabled = 1; 32int file_caps_enabled = 1;
34 33
35static int __init file_caps_disable(char *str) 34static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
38 return 1; 37 return 1;
39} 38}
40__setup("no_file_caps", file_caps_disable); 39__setup("no_file_caps", file_caps_disable);
41#endif
42 40
43/* 41/*
44 * More recent versions of libcap are available from: 42 * More recent versions of libcap are available from:
@@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
169 kernel_cap_t pE, pI, pP; 167 kernel_cap_t pE, pI, pP;
170 168
171 ret = cap_validate_magic(header, &tocopy); 169 ret = cap_validate_magic(header, &tocopy);
172 if (ret != 0) 170 if ((dataptr == NULL) || (ret != 0))
173 return ret; 171 return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
174 172
175 if (get_user(pid, &header->pid)) 173 if (get_user(pid, &header->pid))
176 return -EFAULT; 174 return -EFAULT;
@@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
238SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) 236SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
239{ 237{
240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 238 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
241 unsigned i, tocopy; 239 unsigned i, tocopy, copybytes;
242 kernel_cap_t inheritable, permitted, effective; 240 kernel_cap_t inheritable, permitted, effective;
243 struct cred *new; 241 struct cred *new;
244 int ret; 242 int ret;
@@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
255 if (pid != 0 && pid != task_pid_vnr(current)) 253 if (pid != 0 && pid != task_pid_vnr(current))
256 return -EPERM; 254 return -EPERM;
257 255
258 if (copy_from_user(&kdata, data, 256 copybytes = tocopy * sizeof(struct __user_cap_data_struct);
259 tocopy * sizeof(struct __user_cap_data_struct))) 257 if (copybytes > sizeof(kdata))
258 return -EFAULT;
259
260 if (copy_from_user(&kdata, data, copybytes))
260 return -EFAULT; 261 return -EFAULT;
261 262
262 for (i = 0; i < tocopy; i++) { 263 for (i = 0; i < tocopy; i++) {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb212..291ac586f37f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 213 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 214 if (err == NOTIFY_BAD) {
215 set_cpu_active(cpu, true);
216
215 nr_calls--; 217 nr_calls--;
216 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 218 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
217 hcpu, nr_calls, NULL); 219 hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 225
224 /* Ensure that we are not runnable on dying cpu */ 226 /* Ensure that we are not runnable on dying cpu */
225 cpumask_copy(old_allowed, &current->cpus_allowed); 227 cpumask_copy(old_allowed, &current->cpus_allowed);
226 set_cpus_allowed_ptr(current, 228 set_cpus_allowed_ptr(current, cpu_active_mask);
227 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
228 229
229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 230 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
230 if (err) { 231 if (err) {
232 set_cpu_active(cpu, true);
231 /* CPU didn't die: tell everyone. Can't complain. */ 233 /* CPU didn't die: tell everyone. Can't complain. */
232 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 234 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
233 hcpu) == NOTIFY_BAD) 235 hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)
292 294
293 err = _cpu_down(cpu, 0); 295 err = _cpu_down(cpu, 0);
294 296
295 if (cpu_online(cpu))
296 set_cpu_active(cpu, true);
297
298out: 297out:
299 cpu_maps_update_done(); 298 cpu_maps_update_done();
300 stop_machine_destroy(); 299 stop_machine_destroy();
@@ -387,15 +386,23 @@ int disable_nonboot_cpus(void)
387 * with the userspace trying to use the CPU hotplug at the same time 386 * with the userspace trying to use the CPU hotplug at the same time
388 */ 387 */
389 cpumask_clear(frozen_cpus); 388 cpumask_clear(frozen_cpus);
389
390 for_each_online_cpu(cpu) {
391 if (cpu == first_cpu)
392 continue;
393 set_cpu_active(cpu, false);
394 }
395
396 synchronize_sched();
397
390 printk("Disabling non-boot CPUs ...\n"); 398 printk("Disabling non-boot CPUs ...\n");
391 for_each_online_cpu(cpu) { 399 for_each_online_cpu(cpu) {
392 if (cpu == first_cpu) 400 if (cpu == first_cpu)
393 continue; 401 continue;
394 error = _cpu_down(cpu, 1); 402 error = _cpu_down(cpu, 1);
395 if (!error) { 403 if (!error)
396 cpumask_set_cpu(cpu, frozen_cpus); 404 cpumask_set_cpu(cpu, frozen_cpus);
397 printk("CPU%d is down\n", cpu); 405 else {
398 } else {
399 printk(KERN_ERR "Error taking CPU%d down: %d\n", 406 printk(KERN_ERR "Error taking CPU%d down: %d\n",
400 cpu, error); 407 cpu, error);
401 break; 408 break;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b5cb469d2545..ba401fab459f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
537 * element of the partition (one sched domain) to be passed to 537 * element of the partition (one sched domain) to be passed to
538 * partition_sched_domains(). 538 * partition_sched_domains().
539 */ 539 */
540/* FIXME: see the FIXME in partition_sched_domains() */ 540static int generate_sched_domains(cpumask_var_t **domains,
541static int generate_sched_domains(struct cpumask **domains,
542 struct sched_domain_attr **attributes) 541 struct sched_domain_attr **attributes)
543{ 542{
544 LIST_HEAD(q); /* queue of cpusets to be scanned */ 543 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
546 struct cpuset **csa; /* array of all cpuset ptrs */ 545 struct cpuset **csa; /* array of all cpuset ptrs */
547 int csn; /* how many cpuset ptrs in csa so far */ 546 int csn; /* how many cpuset ptrs in csa so far */
548 int i, j, k; /* indices for partition finding loops */ 547 int i, j, k; /* indices for partition finding loops */
549 struct cpumask *doms; /* resulting partition; i.e. sched domains */ 548 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
550 struct sched_domain_attr *dattr; /* attributes for custom domains */ 549 struct sched_domain_attr *dattr; /* attributes for custom domains */
551 int ndoms = 0; /* number of sched domains in result */ 550 int ndoms = 0; /* number of sched domains in result */
552 int nslot; /* next empty doms[] struct cpumask slot */ 551 int nslot; /* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
557 556
558 /* Special case for the 99% of systems with one, full, sched domain */ 557 /* Special case for the 99% of systems with one, full, sched domain */
559 if (is_sched_load_balance(&top_cpuset)) { 558 if (is_sched_load_balance(&top_cpuset)) {
560 doms = kmalloc(cpumask_size(), GFP_KERNEL); 559 ndoms = 1;
560 doms = alloc_sched_domains(ndoms);
561 if (!doms) 561 if (!doms)
562 goto done; 562 goto done;
563 563
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
566 *dattr = SD_ATTR_INIT; 566 *dattr = SD_ATTR_INIT;
567 update_domain_attr_tree(dattr, &top_cpuset); 567 update_domain_attr_tree(dattr, &top_cpuset);
568 } 568 }
569 cpumask_copy(doms, top_cpuset.cpus_allowed); 569 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
570 570
571 ndoms = 1;
572 goto done; 571 goto done;
573 } 572 }
574 573
@@ -636,7 +635,7 @@ restart:
636 * Now we know how many domains to create. 635 * Now we know how many domains to create.
637 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 636 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
638 */ 637 */
639 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); 638 doms = alloc_sched_domains(ndoms);
640 if (!doms) 639 if (!doms)
641 goto done; 640 goto done;
642 641
@@ -656,7 +655,7 @@ restart:
656 continue; 655 continue;
657 } 656 }
658 657
659 dp = doms + nslot; 658 dp = doms[nslot];
660 659
661 if (nslot == ndoms) { 660 if (nslot == ndoms) {
662 static int warnings = 10; 661 static int warnings = 10;
@@ -718,7 +717,7 @@ done:
718static void do_rebuild_sched_domains(struct work_struct *unused) 717static void do_rebuild_sched_domains(struct work_struct *unused)
719{ 718{
720 struct sched_domain_attr *attr; 719 struct sched_domain_attr *attr;
721 struct cpumask *doms; 720 cpumask_var_t *doms;
722 int ndoms; 721 int ndoms;
723 722
724 get_online_cpus(); 723 get_online_cpus();
@@ -738,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
738{ 737{
739} 738}
740 739
741static int generate_sched_domains(struct cpumask **domains, 740static int generate_sched_domains(cpumask_var_t **domains,
742 struct sched_domain_attr **attributes) 741 struct sched_domain_attr **attributes)
743{ 742{
744 *domains = NULL; 743 *domains = NULL;
@@ -873,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
873 if (retval < 0) 872 if (retval < 0)
874 return retval; 873 return retval;
875 874
876 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) 875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
877 return -EINVAL; 876 return -EINVAL;
878 } 877 }
879 retval = validate_change(cs, trialcs); 878 retval = validate_change(cs, trialcs);
@@ -2011,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2011 } 2010 }
2012 2011
2013 /* Continue past cpusets with all cpus, mems online */ 2012 /* Continue past cpusets with all cpus, mems online */
2014 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && 2013 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2015 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2016 continue; 2015 continue;
2017 2016
@@ -2020,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2020 /* Remove offline cpus and mems from this cpuset. */ 2019 /* Remove offline cpus and mems from this cpuset. */
2021 mutex_lock(&callback_mutex); 2020 mutex_lock(&callback_mutex);
2022 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2023 cpu_online_mask); 2022 cpu_active_mask);
2024 nodes_and(cp->mems_allowed, cp->mems_allowed, 2023 nodes_and(cp->mems_allowed, cp->mems_allowed,
2025 node_states[N_HIGH_MEMORY]); 2024 node_states[N_HIGH_MEMORY]);
2026 mutex_unlock(&callback_mutex); 2025 mutex_unlock(&callback_mutex);
@@ -2052,14 +2051,16 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2052 unsigned long phase, void *unused_cpu) 2051 unsigned long phase, void *unused_cpu)
2053{ 2052{
2054 struct sched_domain_attr *attr; 2053 struct sched_domain_attr *attr;
2055 struct cpumask *doms; 2054 cpumask_var_t *doms;
2056 int ndoms; 2055 int ndoms;
2057 2056
2058 switch (phase) { 2057 switch (phase) {
2059 case CPU_ONLINE: 2058 case CPU_ONLINE:
2060 case CPU_ONLINE_FROZEN: 2059 case CPU_ONLINE_FROZEN:
2061 case CPU_DEAD: 2060 case CPU_DOWN_PREPARE:
2062 case CPU_DEAD_FROZEN: 2061 case CPU_DOWN_PREPARE_FROZEN:
2062 case CPU_DOWN_FAILED:
2063 case CPU_DOWN_FAILED_FROZEN:
2063 break; 2064 break;
2064 2065
2065 default: 2066 default:
@@ -2068,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2068 2069
2069 cgroup_lock(); 2070 cgroup_lock();
2070 mutex_lock(&callback_mutex); 2071 mutex_lock(&callback_mutex);
2071 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2072 mutex_unlock(&callback_mutex); 2073 mutex_unlock(&callback_mutex);
2073 scan_for_empty_cpusets(&top_cpuset); 2074 scan_for_empty_cpusets(&top_cpuset);
2074 ndoms = generate_sched_domains(&doms, &attr); 2075 ndoms = generate_sched_domains(&doms, &attr);
@@ -2115,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2115 2116
2116void __init cpuset_init_smp(void) 2117void __init cpuset_init_smp(void)
2117{ 2118{
2118 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2119 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2119 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2120 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2120 2121
2121 hotcpu_notifier(cpuset_track_online_cpus, 0); 2122 hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2537,15 +2538,9 @@ const struct file_operations proc_cpuset_operations = {
2537}; 2538};
2538#endif /* CONFIG_PROC_PID_CPUSET */ 2539#endif /* CONFIG_PROC_PID_CPUSET */
2539 2540
2540/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2541/* Display task mems_allowed in /proc/<pid>/status file. */
2541void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2542void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2542{ 2543{
2543 seq_printf(m, "Cpus_allowed:\t");
2544 seq_cpumask(m, &task->cpus_allowed);
2545 seq_printf(m, "\n");
2546 seq_printf(m, "Cpus_allowed_list:\t");
2547 seq_cpumask_list(m, &task->cpus_allowed);
2548 seq_printf(m, "\n");
2549 seq_printf(m, "Mems_allowed:\t"); 2544 seq_printf(m, "Mems_allowed:\t");
2550 seq_nodemask(m, &task->mems_allowed); 2545 seq_nodemask(m, &task->mems_allowed);
2551 seq_printf(m, "\n"); 2546 seq_printf(m, "\n");
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac2ecc1..6f50ef55a6f3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -110,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk)
110 * We won't ever get here for the group leader, since it 111 * We won't ever get here for the group leader, since it
111 * will have been the last reference on the signal_struct. 112 * will have been the last reference on the signal_struct.
112 */ 113 */
113 sig->utime = cputime_add(sig->utime, task_utime(tsk)); 114 sig->utime = cputime_add(sig->utime, tsk->utime);
114 sig->stime = cputime_add(sig->stime, task_stime(tsk)); 115 sig->stime = cputime_add(sig->stime, tsk->stime);
115 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 116 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
116 sig->min_flt += tsk->min_flt; 117 sig->min_flt += tsk->min_flt;
117 sig->maj_flt += tsk->maj_flt; 118 sig->maj_flt += tsk->maj_flt;
118 sig->nvcsw += tsk->nvcsw; 119 sig->nvcsw += tsk->nvcsw;
@@ -970,7 +971,7 @@ NORET_TYPE void do_exit(long code)
970 exit_thread(); 971 exit_thread();
971 cgroup_exit(tsk, 1); 972 cgroup_exit(tsk, 1);
972 973
973 if (group_dead && tsk->signal->leader) 974 if (group_dead)
974 disassociate_ctty(1); 975 disassociate_ctty(1);
975 976
976 module_put(task_thread_info(tsk)->exec_domain->module); 977 module_put(task_thread_info(tsk)->exec_domain->module);
@@ -978,6 +979,10 @@ NORET_TYPE void do_exit(long code)
978 proc_exit_connector(tsk); 979 proc_exit_connector(tsk);
979 980
980 /* 981 /*
982 * FIXME: do that only when needed, using sched_exit tracepoint
983 */
984 flush_ptrace_hw_breakpoint(tsk);
985 /*
981 * Flush inherited counters to the parent - before the parent 986 * Flush inherited counters to the parent - before the parent
982 * gets woken up by child-exit notifications. 987 * gets woken up by child-exit notifications.
983 */ 988 */
@@ -1004,7 +1009,7 @@ NORET_TYPE void do_exit(long code)
1004 tsk->flags |= PF_EXITPIDONE; 1009 tsk->flags |= PF_EXITPIDONE;
1005 1010
1006 if (tsk->io_context) 1011 if (tsk->io_context)
1007 exit_io_context(); 1012 exit_io_context(tsk);
1008 1013
1009 if (tsk->splice_pipe) 1014 if (tsk->splice_pipe)
1010 __free_pipe_info(tsk->splice_pipe); 1015 __free_pipe_info(tsk->splice_pipe);
@@ -1205,6 +1210,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1205 struct signal_struct *psig; 1210 struct signal_struct *psig;
1206 struct signal_struct *sig; 1211 struct signal_struct *sig;
1207 unsigned long maxrss; 1212 unsigned long maxrss;
1213 cputime_t tgutime, tgstime;
1208 1214
1209 /* 1215 /*
1210 * The resource counters for the group leader are in its 1216 * The resource counters for the group leader are in its
@@ -1220,20 +1226,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1220 * need to protect the access to parent->signal fields, 1226 * need to protect the access to parent->signal fields,
1221 * as other threads in the parent group can be right 1227 * as other threads in the parent group can be right
1222 * here reaping other children at the same time. 1228 * here reaping other children at the same time.
1229 *
1230 * We use thread_group_times() to get times for the thread
1231 * group, which consolidates times for all threads in the
1232 * group including the group leader.
1223 */ 1233 */
1234 thread_group_times(p, &tgutime, &tgstime);
1224 spin_lock_irq(&p->real_parent->sighand->siglock); 1235 spin_lock_irq(&p->real_parent->sighand->siglock);
1225 psig = p->real_parent->signal; 1236 psig = p->real_parent->signal;
1226 sig = p->signal; 1237 sig = p->signal;
1227 psig->cutime = 1238 psig->cutime =
1228 cputime_add(psig->cutime, 1239 cputime_add(psig->cutime,
1229 cputime_add(p->utime, 1240 cputime_add(tgutime,
1230 cputime_add(sig->utime, 1241 sig->cutime));
1231 sig->cutime)));
1232 psig->cstime = 1242 psig->cstime =
1233 cputime_add(psig->cstime, 1243 cputime_add(psig->cstime,
1234 cputime_add(p->stime, 1244 cputime_add(tgstime,
1235 cputime_add(sig->stime, 1245 sig->cstime));
1236 sig->cstime)));
1237 psig->cgtime = 1246 psig->cgtime =
1238 cputime_add(psig->cgtime, 1247 cputime_add(psig->cgtime,
1239 cputime_add(p->gtime, 1248 cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c49257c..1415dc4598ae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,7 @@
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h>
67 68
68#include <asm/pgtable.h> 69#include <asm/pgtable.h>
69#include <asm/pgalloc.h> 70#include <asm/pgalloc.h>
@@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
249 goto out; 250 goto out;
250 251
251 setup_thread_stack(tsk, orig); 252 setup_thread_stack(tsk, orig);
253 clear_user_return_notifier(tsk);
252 stackend = end_of_stack(tsk); 254 stackend = end_of_stack(tsk);
253 *stackend = STACK_END_MAGIC; /* for overflow detection */ 255 *stackend = STACK_END_MAGIC; /* for overflow detection */
254 256
@@ -884,6 +886,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
884 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 886 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
885 sig->gtime = cputime_zero; 887 sig->gtime = cputime_zero;
886 sig->cgtime = cputime_zero; 888 sig->cgtime = cputime_zero;
889#ifndef CONFIG_VIRT_CPU_ACCOUNTING
890 sig->prev_utime = sig->prev_stime = cputime_zero;
891#endif
887 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 892 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
888 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 893 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
889 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 894 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -1066,8 +1071,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1066 p->gtime = cputime_zero; 1071 p->gtime = cputime_zero;
1067 p->utimescaled = cputime_zero; 1072 p->utimescaled = cputime_zero;
1068 p->stimescaled = cputime_zero; 1073 p->stimescaled = cputime_zero;
1074#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1069 p->prev_utime = cputime_zero; 1075 p->prev_utime = cputime_zero;
1070 p->prev_stime = cputime_zero; 1076 p->prev_stime = cputime_zero;
1077#endif
1071 1078
1072 p->default_timer_slack_ns = current->timer_slack_ns; 1079 p->default_timer_slack_ns = current->timer_slack_ns;
1073 1080
@@ -1310,7 +1317,8 @@ bad_fork_free_pid:
1310 if (pid != &init_struct_pid) 1317 if (pid != &init_struct_pid)
1311 free_pid(pid); 1318 free_pid(pid);
1312bad_fork_cleanup_io: 1319bad_fork_cleanup_io:
1313 put_io_context(p->io_context); 1320 if (p->io_context)
1321 exit_io_context(p);
1314bad_fork_cleanup_namespaces: 1322bad_fork_cleanup_namespaces:
1315 exit_task_namespaces(p); 1323 exit_task_namespaces(p);
1316bad_fork_cleanup_mm: 1324bad_fork_cleanup_mm:
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc41..d73ef1f3e55d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -304,8 +304,14 @@ void put_futex_key(int fshared, union futex_key *key)
304 */ 304 */
305static int fault_in_user_writeable(u32 __user *uaddr) 305static int fault_in_user_writeable(u32 __user *uaddr)
306{ 306{
307 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, 307 struct mm_struct *mm = current->mm;
308 1, 1, 0, NULL, NULL); 308 int ret;
309
310 down_read(&mm->mmap_sem);
311 ret = get_user_pages(current, mm, (unsigned long)uaddr,
312 1, 1, 0, NULL, NULL);
313 up_read(&mm->mmap_sem);
314
309 return ret < 0 ? ret : 0; 315 return ret < 0 ? ret : 0;
310} 316}
311 317
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3e1c36e7998f..d2f9239dc6ba 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
557static int hrtimer_reprogram(struct hrtimer *timer, 557static int hrtimer_reprogram(struct hrtimer *timer,
558 struct hrtimer_clock_base *base) 558 struct hrtimer_clock_base *base)
559{ 559{
560 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 560 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
562 int res; 562 int res;
563 563
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
582 if (expires.tv64 < 0) 582 if (expires.tv64 < 0)
583 return -ETIME; 583 return -ETIME;
584 584
585 if (expires.tv64 >= expires_next->tv64) 585 if (expires.tv64 >= cpu_base->expires_next.tv64)
586 return 0;
587
588 /*
589 * If a hang was detected in the last timer interrupt then we
590 * do not schedule a timer which is earlier than the expiry
591 * which we enforced in the hang detection. We want the system
592 * to make progress.
593 */
594 if (cpu_base->hang_detected)
586 return 0; 595 return 0;
587 596
588 /* 597 /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
590 */ 599 */
591 res = tick_program_event(expires, 0); 600 res = tick_program_event(expires, 0);
592 if (!IS_ERR_VALUE(res)) 601 if (!IS_ERR_VALUE(res))
593 *expires_next = expires; 602 cpu_base->expires_next = expires;
594 return res; 603 return res;
595} 604}
596 605
@@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
747 756
748#endif /* CONFIG_HIGH_RES_TIMERS */ 757#endif /* CONFIG_HIGH_RES_TIMERS */
749 758
750#ifdef CONFIG_TIMER_STATS 759static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
751void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
752{ 760{
761#ifdef CONFIG_TIMER_STATS
753 if (timer->start_site) 762 if (timer->start_site)
754 return; 763 return;
755 764 timer->start_site = __builtin_return_address(0);
756 timer->start_site = addr;
757 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 765 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
758 timer->start_pid = current->pid; 766 timer->start_pid = current->pid;
767#endif
759} 768}
769
770static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
771{
772#ifdef CONFIG_TIMER_STATS
773 timer->start_site = NULL;
774#endif
775}
776
777static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
778{
779#ifdef CONFIG_TIMER_STATS
780 if (likely(!timer_stats_active))
781 return;
782 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
783 timer->function, timer->start_comm, 0);
760#endif 784#endif
785}
761 786
762/* 787/*
763 * Counterpart to lock_hrtimer_base above: 788 * Counterpart to lock_hrtimer_base above:
@@ -1217,29 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1217 1242
1218#ifdef CONFIG_HIGH_RES_TIMERS 1243#ifdef CONFIG_HIGH_RES_TIMERS
1219 1244
1220static int force_clock_reprogram;
1221
1222/*
1223 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1224 * is hanging, which could happen with something that slows the interrupt
1225 * such as the tracing. Then we force the clock reprogramming for each future
1226 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1227 * threshold that we will overwrite.
1228 * The next tick event will be scheduled to 3 times we currently spend on
1229 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1230 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1231 * let it running without serious starvation.
1232 */
1233
1234static inline void
1235hrtimer_interrupt_hanging(struct clock_event_device *dev,
1236 ktime_t try_time)
1237{
1238 force_clock_reprogram = 1;
1239 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1240 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1241 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1242}
1243/* 1245/*
1244 * High resolution timer interrupt 1246 * High resolution timer interrupt
1245 * Called with interrupts disabled 1247 * Called with interrupts disabled
@@ -1248,21 +1250,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1248{ 1250{
1249 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1251 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1250 struct hrtimer_clock_base *base; 1252 struct hrtimer_clock_base *base;
1251 ktime_t expires_next, now; 1253 ktime_t expires_next, now, entry_time, delta;
1252 int nr_retries = 0; 1254 int i, retries = 0;
1253 int i;
1254 1255
1255 BUG_ON(!cpu_base->hres_active); 1256 BUG_ON(!cpu_base->hres_active);
1256 cpu_base->nr_events++; 1257 cpu_base->nr_events++;
1257 dev->next_event.tv64 = KTIME_MAX; 1258 dev->next_event.tv64 = KTIME_MAX;
1258 1259
1259 retry: 1260 entry_time = now = ktime_get();
1260 /* 5 retries is enough to notice a hang */ 1261retry:
1261 if (!(++nr_retries % 5))
1262 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1263
1264 now = ktime_get();
1265
1266 expires_next.tv64 = KTIME_MAX; 1262 expires_next.tv64 = KTIME_MAX;
1267 1263
1268 spin_lock(&cpu_base->lock); 1264 spin_lock(&cpu_base->lock);
@@ -1324,10 +1320,48 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1324 spin_unlock(&cpu_base->lock); 1320 spin_unlock(&cpu_base->lock);
1325 1321
1326 /* Reprogramming necessary ? */ 1322 /* Reprogramming necessary ? */
1327 if (expires_next.tv64 != KTIME_MAX) { 1323 if (expires_next.tv64 == KTIME_MAX ||
1328 if (tick_program_event(expires_next, force_clock_reprogram)) 1324 !tick_program_event(expires_next, 0)) {
1329 goto retry; 1325 cpu_base->hang_detected = 0;
1326 return;
1330 } 1327 }
1328
1329 /*
1330 * The next timer was already expired due to:
1331 * - tracing
1332 * - long lasting callbacks
1333 * - being scheduled away when running in a VM
1334 *
1335 * We need to prevent that we loop forever in the hrtimer
1336 * interrupt routine. We give it 3 attempts to avoid
1337 * overreacting on some spurious event.
1338 */
1339 now = ktime_get();
1340 cpu_base->nr_retries++;
1341 if (++retries < 3)
1342 goto retry;
1343 /*
1344 * Give the system a chance to do something else than looping
1345 * here. We stored the entry time, so we know exactly how long
1346 * we spent here. We schedule the next event this amount of
1347 * time away.
1348 */
1349 cpu_base->nr_hangs++;
1350 cpu_base->hang_detected = 1;
1351 delta = ktime_sub(now, entry_time);
1352 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1353 cpu_base->max_hang_time = delta;
1354 /*
1355 * Limit it to a sensible value as we enforce a longer
1356 * delay. Give the CPU at least 100ms to catch up.
1357 */
1358 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1359 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1360 else
1361 expires_next = ktime_add(now, delta);
1362 tick_program_event(expires_next, 1);
1363 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1364 ktime_to_ns(delta));
1331} 1365}
1332 1366
1333/* 1367/*
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d4e841747400..0c642d51aac2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
144 144
145 rcu_read_lock(); 145 rcu_read_lock();
146 do_each_thread(g, t) { 146 do_each_thread(g, t) {
147 if (!--max_count) 147 if (!max_count--)
148 goto unlock; 148 goto unlock;
149 if (!--batch_count) { 149 if (!--batch_count) {
150 batch_count = HUNG_TASK_BATCHING; 150 batch_count = HUNG_TASK_BATCHING;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..366eedf949c0
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,453 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) IBM Corporation, 2009
18 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Thanks to Ingo Molnar for his many suggestions.
21 *
22 * Authors: Alan Stern <stern@rowland.harvard.edu>
23 * K.Prasad <prasad@linux.vnet.ibm.com>
24 * Frederic Weisbecker <fweisbec@gmail.com>
25 */
26
27/*
28 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
29 * using the CPU's debug registers.
30 * This file contains the arch-independent routines.
31 */
32
33#include <linux/irqflags.h>
34#include <linux/kallsyms.h>
35#include <linux/notifier.h>
36#include <linux/kprobes.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/percpu.h>
41#include <linux/sched.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44
45#include <linux/hw_breakpoint.h>
46
47/*
48 * Constraints data
49 */
50
51/* Number of pinned cpu breakpoints in a cpu */
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53
54/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
56
57/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
59
60/* Gather the number of total pinned and un-pinned bp in a cpuset */
61struct bp_busy_slots {
62 unsigned int pinned;
63 unsigned int flexible;
64};
65
66/* Serialize accesses to the above constraints */
67static DEFINE_MUTEX(nr_bp_mutex);
68
69/*
70 * Report the maximum number of pinned breakpoints a task
71 * have in this cpu
72 */
73static unsigned int max_task_bp_pinned(int cpu)
74{
75 int i;
76 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
77
78 for (i = HBP_NUM -1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0)
80 return i + 1;
81 }
82
83 return 0;
84}
85
86static int task_bp_pinned(struct task_struct *tsk)
87{
88 struct perf_event_context *ctx = tsk->perf_event_ctxp;
89 struct list_head *list;
90 struct perf_event *bp;
91 unsigned long flags;
92 int count = 0;
93
94 if (WARN_ONCE(!ctx, "No perf context for this task"))
95 return 0;
96
97 list = &ctx->event_list;
98
99 spin_lock_irqsave(&ctx->lock, flags);
100
101 /*
102 * The current breakpoint counter is not included in the list
103 * at the open() callback time
104 */
105 list_for_each_entry(bp, list, event_entry) {
106 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
107 count++;
108 }
109
110 spin_unlock_irqrestore(&ctx->lock, flags);
111
112 return count;
113}
114
115/*
116 * Report the number of pinned/un-pinned breakpoints we have in
117 * a given cpu (cpu > -1) or in all of them (cpu = -1).
118 */
119static void
120fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
121{
122 int cpu = bp->cpu;
123 struct task_struct *tsk = bp->ctx->task;
124
125 if (cpu >= 0) {
126 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
127 if (!tsk)
128 slots->pinned += max_task_bp_pinned(cpu);
129 else
130 slots->pinned += task_bp_pinned(tsk);
131 slots->flexible = per_cpu(nr_bp_flexible, cpu);
132
133 return;
134 }
135
136 for_each_online_cpu(cpu) {
137 unsigned int nr;
138
139 nr = per_cpu(nr_cpu_bp_pinned, cpu);
140 if (!tsk)
141 nr += max_task_bp_pinned(cpu);
142 else
143 nr += task_bp_pinned(tsk);
144
145 if (nr > slots->pinned)
146 slots->pinned = nr;
147
148 nr = per_cpu(nr_bp_flexible, cpu);
149
150 if (nr > slots->flexible)
151 slots->flexible = nr;
152 }
153}
154
155/*
156 * Add a pinned breakpoint for the given task in our constraint table
157 */
158static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
159{
160 unsigned int *tsk_pinned;
161 int count = 0;
162
163 count = task_bp_pinned(tsk);
164
165 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
166 if (enable) {
167 tsk_pinned[count]++;
168 if (count > 0)
169 tsk_pinned[count-1]--;
170 } else {
171 tsk_pinned[count]--;
172 if (count > 0)
173 tsk_pinned[count-1]++;
174 }
175}
176
177/*
178 * Add/remove the given breakpoint in our constraint table
179 */
180static void toggle_bp_slot(struct perf_event *bp, bool enable)
181{
182 int cpu = bp->cpu;
183 struct task_struct *tsk = bp->ctx->task;
184
185 /* Pinned counter task profiling */
186 if (tsk) {
187 if (cpu >= 0) {
188 toggle_bp_task_slot(tsk, cpu, enable);
189 return;
190 }
191
192 for_each_online_cpu(cpu)
193 toggle_bp_task_slot(tsk, cpu, enable);
194 return;
195 }
196
197 /* Pinned counter cpu profiling */
198 if (enable)
199 per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
200 else
201 per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
202}
203
204/*
205 * Contraints to check before allowing this new breakpoint counter:
206 *
207 * == Non-pinned counter == (Considered as pinned for now)
208 *
209 * - If attached to a single cpu, check:
210 *
211 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
212 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
213 *
214 * -> If there are already non-pinned counters in this cpu, it means
215 * there is already a free slot for them.
216 * Otherwise, we check that the maximum number of per task
217 * breakpoints (for this cpu) plus the number of per cpu breakpoint
218 * (for this cpu) doesn't cover every registers.
219 *
220 * - If attached to every cpus, check:
221 *
222 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
223 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
224 *
225 * -> This is roughly the same, except we check the number of per cpu
226 * bp for every cpu and we keep the max one. Same for the per tasks
227 * breakpoints.
228 *
229 *
230 * == Pinned counter ==
231 *
232 * - If attached to a single cpu, check:
233 *
234 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
235 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
236 *
237 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
238 * one register at least (or they will never be fed).
239 *
240 * - If attached to every cpus, check:
241 *
242 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
243 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
244 */
245int reserve_bp_slot(struct perf_event *bp)
246{
247 struct bp_busy_slots slots = {0};
248 int ret = 0;
249
250 mutex_lock(&nr_bp_mutex);
251
252 fetch_bp_busy_slots(&slots, bp);
253
254 /* Flexible counters need to keep at least one slot */
255 if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
256 ret = -ENOSPC;
257 goto end;
258 }
259
260 toggle_bp_slot(bp, true);
261
262end:
263 mutex_unlock(&nr_bp_mutex);
264
265 return ret;
266}
267
268void release_bp_slot(struct perf_event *bp)
269{
270 mutex_lock(&nr_bp_mutex);
271
272 toggle_bp_slot(bp, false);
273
274 mutex_unlock(&nr_bp_mutex);
275}
276
277
278int register_perf_hw_breakpoint(struct perf_event *bp)
279{
280 int ret;
281
282 ret = reserve_bp_slot(bp);
283 if (ret)
284 return ret;
285
286 /*
287 * Ptrace breakpoints can be temporary perf events only
288 * meant to reserve a slot. In this case, it is created disabled and
289 * we don't want to check the params right now (as we put a null addr)
290 * But perf tools create events as disabled and we want to check
291 * the params for them.
292 * This is a quick hack that will be removed soon, once we remove
293 * the tmp breakpoints from ptrace
294 */
295 if (!bp->attr.disabled || !bp->overflow_handler)
296 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
297
298 return ret;
299}
300
301/**
302 * register_user_hw_breakpoint - register a hardware breakpoint for user space
303 * @attr: breakpoint attributes
304 * @triggered: callback to trigger when we hit the breakpoint
305 * @tsk: pointer to 'task_struct' of the process to which the address belongs
306 */
307struct perf_event *
308register_user_hw_breakpoint(struct perf_event_attr *attr,
309 perf_overflow_handler_t triggered,
310 struct task_struct *tsk)
311{
312 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
313}
314EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
315
316/**
317 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
318 * @bp: the breakpoint structure to modify
319 * @attr: new breakpoint attributes
320 * @triggered: callback to trigger when we hit the breakpoint
321 * @tsk: pointer to 'task_struct' of the process to which the address belongs
322 */
323int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
324{
325 u64 old_addr = bp->attr.bp_addr;
326 int old_type = bp->attr.bp_type;
327 int old_len = bp->attr.bp_len;
328 int err = 0;
329
330 perf_event_disable(bp);
331
332 bp->attr.bp_addr = attr->bp_addr;
333 bp->attr.bp_type = attr->bp_type;
334 bp->attr.bp_len = attr->bp_len;
335
336 if (attr->disabled)
337 goto end;
338
339 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
340 if (!err)
341 perf_event_enable(bp);
342
343 if (err) {
344 bp->attr.bp_addr = old_addr;
345 bp->attr.bp_type = old_type;
346 bp->attr.bp_len = old_len;
347 if (!bp->attr.disabled)
348 perf_event_enable(bp);
349
350 return err;
351 }
352
353end:
354 bp->attr.disabled = attr->disabled;
355
356 return 0;
357}
358EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
359
360/**
361 * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
362 * @bp: the breakpoint structure to unregister
363 */
364void unregister_hw_breakpoint(struct perf_event *bp)
365{
366 if (!bp)
367 return;
368 perf_event_release_kernel(bp);
369}
370EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
371
372/**
373 * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
374 * @attr: breakpoint attributes
375 * @triggered: callback to trigger when we hit the breakpoint
376 *
377 * @return a set of per_cpu pointers to perf events
378 */
379struct perf_event **
380register_wide_hw_breakpoint(struct perf_event_attr *attr,
381 perf_overflow_handler_t triggered)
382{
383 struct perf_event **cpu_events, **pevent, *bp;
384 long err;
385 int cpu;
386
387 cpu_events = alloc_percpu(typeof(*cpu_events));
388 if (!cpu_events)
389 return ERR_PTR(-ENOMEM);
390
391 for_each_possible_cpu(cpu) {
392 pevent = per_cpu_ptr(cpu_events, cpu);
393 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
394
395 *pevent = bp;
396
397 if (IS_ERR(bp)) {
398 err = PTR_ERR(bp);
399 goto fail;
400 }
401 }
402
403 return cpu_events;
404
405fail:
406 for_each_possible_cpu(cpu) {
407 pevent = per_cpu_ptr(cpu_events, cpu);
408 if (IS_ERR(*pevent))
409 break;
410 unregister_hw_breakpoint(*pevent);
411 }
412 free_percpu(cpu_events);
413 /* return the error if any */
414 return ERR_PTR(err);
415}
416EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
417
418/**
419 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
420 * @cpu_events: the per cpu set of events to unregister
421 */
422void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
423{
424 int cpu;
425 struct perf_event **pevent;
426
427 for_each_possible_cpu(cpu) {
428 pevent = per_cpu_ptr(cpu_events, cpu);
429 unregister_hw_breakpoint(*pevent);
430 }
431 free_percpu(cpu_events);
432}
433EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
434
435static struct notifier_block hw_breakpoint_exceptions_nb = {
436 .notifier_call = hw_breakpoint_exceptions_notify,
437 /* we need to be notified first */
438 .priority = 0x7fffffff
439};
440
441static int __init init_hw_breakpoint(void)
442{
443 return register_die_notifier(&hw_breakpoint_exceptions_nb);
444}
445core_initcall(init_hw_breakpoint);
446
447
448struct pmu perf_ops_bp = {
449 .enable = arch_install_hw_breakpoint,
450 .disable = arch_uninstall_hw_breakpoint,
451 .read = hw_breakpoint_pmu_read,
452 .unthrottle = hw_breakpoint_pmu_unthrottle
453};
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d115..ba566c261adc 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -166,11 +166,11 @@ int set_irq_data(unsigned int irq, void *data)
166EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
167 167
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_msi - set MSI descriptor data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @entry: Pointer to MSI descriptor data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the MSI descriptor entry for an irq
174 */ 174 */
175int set_irq_msi(unsigned int irq, struct msi_desc *entry) 175int set_irq_msi(unsigned int irq, struct msi_desc *entry)
176{ 176{
@@ -590,7 +590,7 @@ out_unlock:
590} 590}
591 591
592/** 592/**
593 * handle_percpu_IRQ - Per CPU local irq handler 593 * handle_percpu_irq - Per CPU local irq handler
594 * @irq: the interrupt number 594 * @irq: the interrupt number
595 * @desc: the interrupt description structure for this irq 595 * @desc: the interrupt description structure for this irq
596 * 596 *
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24d..7305b297d1eb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1067 kfree(action); 1067 kfree(action);
1068 1068
1069#ifdef CONFIG_DEBUG_SHIRQ 1069#ifdef CONFIG_DEBUG_SHIRQ
1070 if (irqflags & IRQF_SHARED) { 1070 if (!retval && (irqflags & IRQF_SHARED)) {
1071 /* 1071 /*
1072 * It's a shared IRQ -- the driver ought to be prepared for it 1072 * It's a shared IRQ -- the driver ought to be prepared for it
1073 * to happen immediately, so let's make sure.... 1073 * to happen immediately, so let's make sure....
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 692363dd591f..0832145fea97 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -136,7 +136,7 @@ out:
136 136
137static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
138{ 138{
139 return single_open(file, default_affinity_show, NULL); 139 return single_open(file, default_affinity_show, PDE(inode)->data);
140} 140}
141 141
142static const struct file_operations default_affinity_proc_fops = { 142static const struct file_operations default_affinity_proc_fops = {
@@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = {
148}; 148};
149#endif 149#endif
150 150
151static int irq_spurious_read(char *page, char **start, off_t off, 151static int irq_spurious_proc_show(struct seq_file *m, void *v)
152 int count, int *eof, void *data)
153{ 152{
154 struct irq_desc *desc = irq_to_desc((long) data); 153 struct irq_desc *desc = irq_to_desc((long) m->private);
155 return sprintf(page, "count %u\n" 154
156 "unhandled %u\n" 155 seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
157 "last_unhandled %u ms\n", 156 desc->irq_count, desc->irqs_unhandled,
158 desc->irq_count, 157 jiffies_to_msecs(desc->last_unhandled));
159 desc->irqs_unhandled, 158 return 0;
160 jiffies_to_msecs(desc->last_unhandled)); 159}
160
161static int irq_spurious_proc_open(struct inode *inode, struct file *file)
162{
163 return single_open(file, irq_spurious_proc_show, NULL);
161} 164}
162 165
166static const struct file_operations irq_spurious_proc_fops = {
167 .open = irq_spurious_proc_open,
168 .read = seq_read,
169 .llseek = seq_lseek,
170 .release = single_release,
171};
172
163#define MAX_NAMELEN 128 173#define MAX_NAMELEN 128
164 174
165static int name_unique(unsigned int irq, struct irqaction *new_action) 175static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
204void register_irq_proc(unsigned int irq, struct irq_desc *desc) 214void register_irq_proc(unsigned int irq, struct irq_desc *desc)
205{ 215{
206 char name [MAX_NAMELEN]; 216 char name [MAX_NAMELEN];
207 struct proc_dir_entry *entry;
208 217
209 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 218 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
210 return; 219 return;
@@ -214,6 +223,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
214 223
215 /* create /proc/irq/1234 */ 224 /* create /proc/irq/1234 */
216 desc->dir = proc_mkdir(name, root_irq_dir); 225 desc->dir = proc_mkdir(name, root_irq_dir);
226 if (!desc->dir)
227 return;
217 228
218#ifdef CONFIG_SMP 229#ifdef CONFIG_SMP
219 /* create /proc/irq/<irq>/smp_affinity */ 230 /* create /proc/irq/<irq>/smp_affinity */
@@ -221,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
221 &irq_affinity_proc_fops, (void *)(long)irq); 232 &irq_affinity_proc_fops, (void *)(long)irq);
222#endif 233#endif
223 234
224 entry = create_proc_entry("spurious", 0444, desc->dir); 235 proc_create_data("spurious", 0444, desc->dir,
225 if (entry) { 236 &irq_spurious_proc_fops, (void *)(long)irq);
226 entry->data = (void *)(long)irq;
227 entry->read_proc = irq_spurious_read;
228 }
229} 237}
230 238
231#undef MAX_NAMELEN 239#undef MAX_NAMELEN
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd7273e6282e..e49ea1c5232d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_all_shared_irqs(void) 107static void poll_spurious_irqs(unsigned long dummy)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -125,23 +125,11 @@ static void poll_all_shared_irqs(void)
125 try_one_irq(i, desc); 125 try_one_irq(i, desc);
126 local_irq_enable(); 126 local_irq_enable();
127 } 127 }
128}
129
130static void poll_spurious_irqs(unsigned long dummy)
131{
132 poll_all_shared_irqs();
133 128
134 mod_timer(&poll_spurious_irq_timer, 129 mod_timer(&poll_spurious_irq_timer,
135 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 130 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
136} 131}
137 132
138#ifdef CONFIG_DEBUG_SHIRQ
139void debug_poll_all_shared_irqs(void)
140{
141 poll_all_shared_irqs();
142}
143#endif
144
145/* 133/*
146 * If 99,900 of the previous 100,000 interrupts have not been handled 134 * If 99,900 of the previous 100,000 interrupts have not been handled
147 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 135 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -232,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
232 /* 220 /*
233 * If we are seeing only the odd spurious IRQ caused by 221 * If we are seeing only the odd spurious IRQ caused by
234 * bus asynchronicity then don't eventually trigger an error, 222 * bus asynchronicity then don't eventually trigger an error,
235 * otherwise the couter becomes a doomsday timer for otherwise 223 * otherwise the counter becomes a doomsday timer for otherwise
236 * working systems 224 * working systems
237 */ 225 */
238 if (time_after(jiffies, desc->last_unhandled + HZ/10)) 226 if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede528..d802883153da 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
146{ 146{
147 cputime_t cval, nval, cinterval, ninterval; 147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval; 148 s64 ns_ninterval, ns_nval;
149 u32 error, incr_error;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id]; 150 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150 151
151 nval = timeval_to_cputime(&value->it_value); 152 nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
153 ninterval = timeval_to_cputime(&value->it_interval); 154 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval); 155 ns_ninterval = timeval_to_ns(&value->it_interval);
155 156
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); 157 error = cputime_sub_ns(nval, ns_nval);
157 it->error = cputime_sub_ns(nval, ns_nval); 158 incr_error = cputime_sub_ns(ninterval, ns_ninterval);
158 159
159 spin_lock_irq(&tsk->sighand->siglock); 160 spin_lock_irq(&tsk->sighand->siglock);
160 161
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
168 } 169 }
169 it->expires = nval; 170 it->expires = nval;
170 it->incr = ninterval; 171 it->incr = ninterval;
172 it->error = error;
173 it->incr_error = incr_error;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ? 174 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval); 175 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173 176
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..8e5288a8a355 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
181 } 181 }
182 return module_kallsyms_lookup_name(name); 182 return module_kallsyms_lookup_name(name);
183} 183}
184EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
184 185
185int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, 186int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
186 unsigned long), 187 unsigned long),
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..2eb517e23514 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread; 129struct task_struct *kgdb_contthread;
130 130
131int kgdb_single_step; 131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
132 133
133/* Our I/O buffers. */ 134/* Our I/O buffers. */
134static char remcom_in_buffer[BUFMAX]; 135static char remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
541 */ 542 */
542 if (tid == 0 || tid == -1) 543 if (tid == 0 || tid == -1)
543 tid = -atomic_read(&kgdb_active) - 2; 544 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) { 545 if (tid < -1 && tid > -NR_CPUS - 2) {
545 if (kgdb_info[-tid - 2].task) 546 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task; 547 return kgdb_info[-tid - 2].task;
547 else 548 else
548 return idle_task(-tid - 2); 549 return idle_task(-tid - 2);
549 } 550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
550 556
551 /* 557 /*
552 * find_task_by_pid_ns() does not take the tasklist lock anymore 558 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -619,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
619static int kgdb_activate_sw_breakpoints(void) 625static int kgdb_activate_sw_breakpoints(void)
620{ 626{
621 unsigned long addr; 627 unsigned long addr;
622 int error = 0; 628 int error;
629 int ret = 0;
623 int i; 630 int i;
624 631
625 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 632 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void)
629 addr = kgdb_break[i].bpt_addr; 636 addr = kgdb_break[i].bpt_addr;
630 error = kgdb_arch_set_breakpoint(addr, 637 error = kgdb_arch_set_breakpoint(addr,
631 kgdb_break[i].saved_instr); 638 kgdb_break[i].saved_instr);
632 if (error) 639 if (error) {
633 return error; 640 ret = error;
641 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
642 continue;
643 }
634 644
635 kgdb_flush_swbreak_addr(addr); 645 kgdb_flush_swbreak_addr(addr);
636 kgdb_break[i].state = BP_ACTIVE; 646 kgdb_break[i].state = BP_ACTIVE;
637 } 647 }
638 return 0; 648 return ret;
639} 649}
640 650
641static int kgdb_set_sw_break(unsigned long addr) 651static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr)
682static int kgdb_deactivate_sw_breakpoints(void) 692static int kgdb_deactivate_sw_breakpoints(void)
683{ 693{
684 unsigned long addr; 694 unsigned long addr;
685 int error = 0; 695 int error;
696 int ret = 0;
686 int i; 697 int i;
687 698
688 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 699 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
691 addr = kgdb_break[i].bpt_addr; 702 addr = kgdb_break[i].bpt_addr;
692 error = kgdb_arch_remove_breakpoint(addr, 703 error = kgdb_arch_remove_breakpoint(addr,
693 kgdb_break[i].saved_instr); 704 kgdb_break[i].saved_instr);
694 if (error) 705 if (error) {
695 return error; 706 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
707 ret = error;
708 }
696 709
697 kgdb_flush_swbreak_addr(addr); 710 kgdb_flush_swbreak_addr(addr);
698 kgdb_break[i].state = BP_SET; 711 kgdb_break[i].state = BP_SET;
699 } 712 }
700 return 0; 713 return ret;
701} 714}
702 715
703static int kgdb_remove_sw_break(unsigned long addr) 716static int kgdb_remove_sw_break(unsigned long addr)
@@ -870,7 +883,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
870 883
871 /* 884 /*
872 * All threads that don't have debuggerinfo should be 885 * All threads that don't have debuggerinfo should be
873 * in __schedule() sleeping, since all other CPUs 886 * in schedule() sleeping, since all other CPUs
874 * are in kgdb_wait, and thus have debuggerinfo. 887 * are in kgdb_wait, and thus have debuggerinfo.
875 */ 888 */
876 if (local_debuggerinfo) { 889 if (local_debuggerinfo) {
@@ -1204,8 +1217,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1204 return 1; 1217 return 1;
1205 1218
1206 } else { 1219 } else {
1207 error_packet(remcom_out_buffer, -EINVAL); 1220 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1208 return 0; 1221 " and 15 (pass and disconnect)\n"
1222 "Executing a continue without signal passing\n", 0);
1223 remcom_in_buffer[0] = 'c';
1209 } 1224 }
1210 1225
1211 /* Indicate fall through */ 1226 /* Indicate fall through */
@@ -1395,6 +1410,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1395 struct kgdb_state kgdb_var; 1410 struct kgdb_state kgdb_var;
1396 struct kgdb_state *ks = &kgdb_var; 1411 struct kgdb_state *ks = &kgdb_var;
1397 unsigned long flags; 1412 unsigned long flags;
1413 int sstep_tries = 100;
1398 int error = 0; 1414 int error = 0;
1399 int i, cpu; 1415 int i, cpu;
1400 1416
@@ -1425,13 +1441,14 @@ acquirelock:
1425 cpu_relax(); 1441 cpu_relax();
1426 1442
1427 /* 1443 /*
1428 * Do not start the debugger connection on this CPU if the last 1444 * For single stepping, try to only enter on the processor
1429 * instance of the exception handler wanted to come into the 1445 * that was single stepping. To gaurd against a deadlock, the
1430 * debugger on a different CPU via a single step 1446 * kernel will only try for the value of sstep_tries before
1447 * giving up and continuing on.
1431 */ 1448 */
1432 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && 1449 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1450 (kgdb_info[cpu].task &&
1434 1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1435 atomic_set(&kgdb_active, -1); 1452 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog(); 1453 touch_softlockup_watchdog();
1437 clocksource_touch_watchdog(); 1454 clocksource_touch_watchdog();
@@ -1524,6 +1541,13 @@ acquirelock:
1524 } 1541 }
1525 1542
1526kgdb_restore: 1543kgdb_restore:
1544 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1545 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1546 if (kgdb_info[sstep_cpu].task)
1547 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1548 else
1549 kgdb_sstep_pid = 0;
1550 }
1527 /* Free kgdb_active */ 1551 /* Free kgdb_active */
1528 atomic_set(&kgdb_active, -1); 1552 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog(); 1553 touch_softlockup_watchdog();
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9fcb53a11f87..25b103190364 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
81 static int kmod_loop_msg; 81 static int kmod_loop_msg;
82 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
87 va_start(args, fmt); 83 va_start(args, fmt);
88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 84 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
89 va_end(args); 85 va_end(args);
90 if (ret >= MODULE_NAME_LEN) 86 if (ret >= MODULE_NAME_LEN)
91 return -ENAMETOOLONG; 87 return -ENAMETOOLONG;
92 88
89 ret = security_kernel_module_request(module_name);
90 if (ret)
91 return ret;
92
93 /* If modprobe needs a service that is in a module, we get a recursive 93 /* If modprobe needs a service that is in a module, we get a recursive
94 * loop. Limit the number of running kmod threads to max_threads/2 or 94 * loop. Limit the number of running kmod threads to max_threads/2 or
95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5240d75f4c60..e5342a344c43 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
90 */ 90 */
91static struct kprobe_blackpoint kprobe_blacklist[] = { 91static struct kprobe_blackpoint kprobe_blacklist[] = {
92 {"preempt_schedule",}, 92 {"preempt_schedule",},
93 {"native_get_debugreg",},
94 {"irq_entries_start",},
95 {"common_interrupt",},
93 {NULL} /* Terminator */ 96 {NULL} /* Terminator */
94}; 97};
95 98
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
673 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 676 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
674} 677}
675 678
679/* Check passed kprobe is valid and return kprobe in kprobe_table. */
680static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
681{
682 struct kprobe *old_p, *list_p;
683
684 old_p = get_kprobe(p->addr);
685 if (unlikely(!old_p))
686 return NULL;
687
688 if (p != old_p) {
689 list_for_each_entry_rcu(list_p, &old_p->list, list)
690 if (list_p == p)
691 /* kprobe p is a valid probe */
692 goto valid;
693 return NULL;
694 }
695valid:
696 return old_p;
697}
698
699/* Return error if the kprobe is being re-registered */
700static inline int check_kprobe_rereg(struct kprobe *p)
701{
702 int ret = 0;
703 struct kprobe *old_p;
704
705 mutex_lock(&kprobe_mutex);
706 old_p = __get_valid_kprobe(p);
707 if (old_p)
708 ret = -EINVAL;
709 mutex_unlock(&kprobe_mutex);
710 return ret;
711}
712
676int __kprobes register_kprobe(struct kprobe *p) 713int __kprobes register_kprobe(struct kprobe *p)
677{ 714{
678 int ret = 0; 715 int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
685 return -EINVAL; 722 return -EINVAL;
686 p->addr = addr; 723 p->addr = addr;
687 724
725 ret = check_kprobe_rereg(p);
726 if (ret)
727 return ret;
728
688 preempt_disable(); 729 preempt_disable();
689 if (!kernel_text_address((unsigned long) p->addr) || 730 if (!kernel_text_address((unsigned long) p->addr) ||
690 in_kprobes_functions((unsigned long) p->addr)) { 731 in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
754} 795}
755EXPORT_SYMBOL_GPL(register_kprobe); 796EXPORT_SYMBOL_GPL(register_kprobe);
756 797
757/* Check passed kprobe is valid and return kprobe in kprobe_table. */
758static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
759{
760 struct kprobe *old_p, *list_p;
761
762 old_p = get_kprobe(p->addr);
763 if (unlikely(!old_p))
764 return NULL;
765
766 if (p != old_p) {
767 list_for_each_entry_rcu(list_p, &old_p->list, list)
768 if (list_p == p)
769 /* kprobe p is a valid probe */
770 goto valid;
771 return NULL;
772 }
773valid:
774 return old_p;
775}
776
777/* 798/*
778 * Unregister a kprobe without a scheduler synchronization. 799 * Unregister a kprobe without a scheduler synchronization.
779 */ 800 */
@@ -1014,9 +1035,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1014 /* Pre-allocate memory for max kretprobe instances */ 1035 /* Pre-allocate memory for max kretprobe instances */
1015 if (rp->maxactive <= 0) { 1036 if (rp->maxactive <= 0) {
1016#ifdef CONFIG_PREEMPT 1037#ifdef CONFIG_PREEMPT
1017 rp->maxactive = max(10, 2 * NR_CPUS); 1038 rp->maxactive = max(10, 2 * num_possible_cpus());
1018#else 1039#else
1019 rp->maxactive = NR_CPUS; 1040 rp->maxactive = num_possible_cpus();
1020#endif 1041#endif
1021 } 1042 }
1022 spin_lock_init(&rp->lock); 1043 spin_lock_init(&rp->lock);
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1141 arch_remove_kprobe(p); 1162 arch_remove_kprobe(p);
1142} 1163}
1143 1164
1165void __kprobes dump_kprobe(struct kprobe *kp)
1166{
1167 printk(KERN_WARNING "Dumping kprobe:\n");
1168 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
1169 kp->symbol_name, kp->addr, kp->offset);
1170}
1171
1144/* Module notifier call back, checking kprobes on the module */ 1172/* Module notifier call back, checking kprobes on the module */
1145static int __kprobes kprobes_module_callback(struct notifier_block *nb, 1173static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1146 unsigned long val, void *data) 1174 unsigned long val, void *data)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af56723c096..4f8df01dbe51 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
49#include "lockdep_internals.h" 49#include "lockdep_internals.h"
50 50
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/lockdep.h> 52#include <trace/events/lock.h>
53 53
54#ifdef CONFIG_PROVE_LOCKING 54#ifdef CONFIG_PROVE_LOCKING
55int prove_locking = 1; 55int prove_locking = 1;
@@ -168,7 +168,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
168 if (time > lt->max) 168 if (time > lt->max)
169 lt->max = time; 169 lt->max = time;
170 170
171 if (time < lt->min || !lt->min) 171 if (time < lt->min || !lt->nr)
172 lt->min = time; 172 lt->min = time;
173 173
174 lt->total += time; 174 lt->total += time;
@@ -177,8 +177,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
177 177
178static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) 178static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
179{ 179{
180 dst->min += src->min; 180 if (!src->nr)
181 dst->max += src->max; 181 return;
182
183 if (src->max > dst->max)
184 dst->max = src->max;
185
186 if (src->min < dst->min || !dst->nr)
187 dst->min = src->min;
188
182 dst->total += src->total; 189 dst->total += src->total;
183 dst->nr += src->nr; 190 dst->nr += src->nr;
184} 191}
@@ -379,7 +386,8 @@ static int save_trace(struct stack_trace *trace)
379 * complete trace that maxes out the entries provided will be reported 386 * complete trace that maxes out the entries provided will be reported
380 * as incomplete, friggin useless </rant> 387 * as incomplete, friggin useless </rant>
381 */ 388 */
382 if (trace->entries[trace->nr_entries-1] == ULONG_MAX) 389 if (trace->nr_entries != 0 &&
390 trace->entries[trace->nr_entries-1] == ULONG_MAX)
383 trace->nr_entries--; 391 trace->nr_entries--;
384 392
385 trace->max_entries = trace->nr_entries; 393 trace->max_entries = trace->nr_entries;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 947b3ad551f8..632f04c57d82 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
148 148
149 preempt_disable(); 149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip); 150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ 151
152 !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) 152#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
153 /* 153 /*
154 * Optimistic spinning. 154 * Optimistic spinning.
155 * 155 *
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..acd24e7643eb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
558 558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 559static ATOMIC_NOTIFIER_HEAD(die_chain);
560 560
561int notrace notify_die(enum die_val val, const char *str, 561int notrace __kprobes notify_die(enum die_val val, const char *str,
562 struct pt_regs *regs, long err, int trap, int sig) 562 struct pt_regs *regs, long err, int trap, int sig)
563{ 563{
564 struct die_args args = { 564 struct die_args args = {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7f29643c8985..e73e53c7582f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -28,13 +28,15 @@
28#include <linux/anon_inodes.h> 28#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h> 29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h>
32#include <linux/hw_breakpoint.h>
31 33
32#include <asm/irq_regs.h> 34#include <asm/irq_regs.h>
33 35
34/* 36/*
35 * Each CPU has a list of per CPU events: 37 * Each CPU has a list of per CPU events:
36 */ 38 */
37DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 39static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
38 40
39int perf_max_events __read_mostly = 1; 41int perf_max_events __read_mostly = 1;
40static int perf_reserved_percpu __read_mostly; 42static int perf_reserved_percpu __read_mostly;
@@ -244,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx)
244 put_ctx(ctx); 246 put_ctx(ctx);
245} 247}
246 248
249static inline u64 perf_clock(void)
250{
251 return cpu_clock(smp_processor_id());
252}
253
254/*
255 * Update the record of the current time in a context.
256 */
257static void update_context_time(struct perf_event_context *ctx)
258{
259 u64 now = perf_clock();
260
261 ctx->time += now - ctx->timestamp;
262 ctx->timestamp = now;
263}
264
265/*
266 * Update the total_time_enabled and total_time_running fields for a event.
267 */
268static void update_event_times(struct perf_event *event)
269{
270 struct perf_event_context *ctx = event->ctx;
271 u64 run_end;
272
273 if (event->state < PERF_EVENT_STATE_INACTIVE ||
274 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275 return;
276
277 if (ctx->is_active)
278 run_end = ctx->time;
279 else
280 run_end = event->tstamp_stopped;
281
282 event->total_time_enabled = run_end - event->tstamp_enabled;
283
284 if (event->state == PERF_EVENT_STATE_INACTIVE)
285 run_end = event->tstamp_stopped;
286 else
287 run_end = ctx->time;
288
289 event->total_time_running = run_end - event->tstamp_running;
290}
291
247/* 292/*
248 * Add a event from the lists for its context. 293 * Add a event from the lists for its context.
249 * Must be called with ctx->mutex and ctx->lock held. 294 * Must be called with ctx->mutex and ctx->lock held.
@@ -292,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
292 if (event->group_leader != event) 337 if (event->group_leader != event)
293 event->group_leader->nr_siblings--; 338 event->group_leader->nr_siblings--;
294 339
340 update_event_times(event);
341
342 /*
343 * If event was in error state, then keep it
344 * that way, otherwise bogus counts will be
345 * returned on read(). The only way to get out
346 * of error state is by explicit re-enabling
347 * of the event
348 */
349 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF;
351
295 /* 352 /*
296 * If this was a group event with sibling events then 353 * If this was a group event with sibling events then
297 * upgrade the siblings to singleton events by adding them 354 * upgrade the siblings to singleton events by adding them
@@ -419,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
419 if (!task) { 476 if (!task) {
420 /* 477 /*
421 * Per cpu events are removed via an smp call and 478 * Per cpu events are removed via an smp call and
422 * the removal is always sucessful. 479 * the removal is always successful.
423 */ 480 */
424 smp_call_function_single(event->cpu, 481 smp_call_function_single(event->cpu,
425 __perf_event_remove_from_context, 482 __perf_event_remove_from_context,
@@ -445,50 +502,11 @@ retry:
445 * can remove the event safely, if the call above did not 502 * can remove the event safely, if the call above did not
446 * succeed. 503 * succeed.
447 */ 504 */
448 if (!list_empty(&event->group_entry)) { 505 if (!list_empty(&event->group_entry))
449 list_del_event(event, ctx); 506 list_del_event(event, ctx);
450 }
451 spin_unlock_irq(&ctx->lock); 507 spin_unlock_irq(&ctx->lock);
452} 508}
453 509
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_event_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a event.
472 */
473static void update_event_times(struct perf_event *event)
474{
475 struct perf_event_context *ctx = event->ctx;
476 u64 run_end;
477
478 if (event->state < PERF_EVENT_STATE_INACTIVE ||
479 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
480 return;
481
482 event->total_time_enabled = ctx->time - event->tstamp_enabled;
483
484 if (event->state == PERF_EVENT_STATE_INACTIVE)
485 run_end = event->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 event->total_time_running = run_end - event->tstamp_running;
490}
491
492/* 510/*
493 * Update total_time_enabled and total_time_running for all events in a group. 511 * Update total_time_enabled and total_time_running for all events in a group.
494 */ 512 */
@@ -549,7 +567,7 @@ static void __perf_event_disable(void *info)
549 * is the current context on this CPU and preemption is disabled, 567 * is the current context on this CPU and preemption is disabled,
550 * hence we can't get into perf_event_task_sched_out for this context. 568 * hence we can't get into perf_event_task_sched_out for this context.
551 */ 569 */
552static void perf_event_disable(struct perf_event *event) 570void perf_event_disable(struct perf_event *event)
553{ 571{
554 struct perf_event_context *ctx = event->ctx; 572 struct perf_event_context *ctx = event->ctx;
555 struct task_struct *task = ctx->task; 573 struct task_struct *task = ctx->task;
@@ -827,7 +845,7 @@ perf_install_in_context(struct perf_event_context *ctx,
827 if (!task) { 845 if (!task) {
828 /* 846 /*
829 * Per cpu events are installed via an smp call and 847 * Per cpu events are installed via an smp call and
830 * the install is always sucessful. 848 * the install is always successful.
831 */ 849 */
832 smp_call_function_single(cpu, __perf_install_in_context, 850 smp_call_function_single(cpu, __perf_install_in_context,
833 event, 1); 851 event, 1);
@@ -953,7 +971,7 @@ static void __perf_event_enable(void *info)
953 * perf_event_for_each_child or perf_event_for_each as described 971 * perf_event_for_each_child or perf_event_for_each as described
954 * for perf_event_disable. 972 * for perf_event_disable.
955 */ 973 */
956static void perf_event_enable(struct perf_event *event) 974void perf_event_enable(struct perf_event *event)
957{ 975{
958 struct perf_event_context *ctx = event->ctx; 976 struct perf_event_context *ctx = event->ctx;
959 struct task_struct *task = ctx->task; 977 struct task_struct *task = ctx->task;
@@ -1031,10 +1049,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1031 update_context_time(ctx); 1049 update_context_time(ctx);
1032 1050
1033 perf_disable(); 1051 perf_disable();
1034 if (ctx->nr_active) 1052 if (ctx->nr_active) {
1035 list_for_each_entry(event, &ctx->group_list, group_entry) 1053 list_for_each_entry(event, &ctx->group_list, group_entry)
1036 group_sched_out(event, cpuctx, ctx); 1054 group_sched_out(event, cpuctx, ctx);
1037 1055 }
1038 perf_enable(); 1056 perf_enable();
1039 out: 1057 out:
1040 spin_unlock(&ctx->lock); 1058 spin_unlock(&ctx->lock);
@@ -1059,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1059 && !ctx1->pin_count && !ctx2->pin_count; 1077 && !ctx1->pin_count && !ctx2->pin_count;
1060} 1078}
1061 1079
1062static void __perf_event_read(void *event);
1063
1064static void __perf_event_sync_stat(struct perf_event *event, 1080static void __perf_event_sync_stat(struct perf_event *event,
1065 struct perf_event *next_event) 1081 struct perf_event *next_event)
1066{ 1082{
@@ -1078,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1078 */ 1094 */
1079 switch (event->state) { 1095 switch (event->state) {
1080 case PERF_EVENT_STATE_ACTIVE: 1096 case PERF_EVENT_STATE_ACTIVE:
1081 __perf_event_read(event); 1097 event->pmu->read(event);
1082 break; 1098 /* fall-through */
1083 1099
1084 case PERF_EVENT_STATE_INACTIVE: 1100 case PERF_EVENT_STATE_INACTIVE:
1085 update_event_times(event); 1101 update_event_times(event);
@@ -1118,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1118 if (!ctx->nr_stat) 1134 if (!ctx->nr_stat)
1119 return; 1135 return;
1120 1136
1137 update_context_time(ctx);
1138
1121 event = list_first_entry(&ctx->event_list, 1139 event = list_first_entry(&ctx->event_list,
1122 struct perf_event, event_entry); 1140 struct perf_event, event_entry);
1123 1141
@@ -1161,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task,
1161 if (likely(!ctx || !cpuctx->task_ctx)) 1179 if (likely(!ctx || !cpuctx->task_ctx))
1162 return; 1180 return;
1163 1181
1164 update_context_time(ctx);
1165
1166 rcu_read_lock(); 1182 rcu_read_lock();
1167 parent = rcu_dereference(ctx->parent_ctx); 1183 parent = rcu_dereference(ctx->parent_ctx);
1168 next_ctx = next->perf_event_ctxp; 1184 next_ctx = next->perf_event_ctxp;
@@ -1515,7 +1531,6 @@ static void __perf_event_read(void *info)
1515 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1531 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1516 struct perf_event *event = info; 1532 struct perf_event *event = info;
1517 struct perf_event_context *ctx = event->ctx; 1533 struct perf_event_context *ctx = event->ctx;
1518 unsigned long flags;
1519 1534
1520 /* 1535 /*
1521 * If this is a task context, we need to check whether it is 1536 * If this is a task context, we need to check whether it is
@@ -1527,12 +1542,12 @@ static void __perf_event_read(void *info)
1527 if (ctx->task && cpuctx->task_ctx != ctx) 1542 if (ctx->task && cpuctx->task_ctx != ctx)
1528 return; 1543 return;
1529 1544
1530 local_irq_save(flags); 1545 spin_lock(&ctx->lock);
1531 if (ctx->is_active) 1546 update_context_time(ctx);
1532 update_context_time(ctx);
1533 event->pmu->read(event);
1534 update_event_times(event); 1547 update_event_times(event);
1535 local_irq_restore(flags); 1548 spin_unlock(&ctx->lock);
1549
1550 event->pmu->read(event);
1536} 1551}
1537 1552
1538static u64 perf_event_read(struct perf_event *event) 1553static u64 perf_event_read(struct perf_event *event)
@@ -1545,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event)
1545 smp_call_function_single(event->oncpu, 1560 smp_call_function_single(event->oncpu,
1546 __perf_event_read, event, 1); 1561 __perf_event_read, event, 1);
1547 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1562 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1563 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags;
1565
1566 spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx);
1548 update_event_times(event); 1568 update_event_times(event);
1569 spin_unlock_irqrestore(&ctx->lock, flags);
1549 } 1570 }
1550 1571
1551 return atomic64_read(&event->count); 1572 return atomic64_read(&event->count);
@@ -1558,7 +1579,6 @@ static void
1558__perf_event_init_context(struct perf_event_context *ctx, 1579__perf_event_init_context(struct perf_event_context *ctx,
1559 struct task_struct *task) 1580 struct task_struct *task)
1560{ 1581{
1561 memset(ctx, 0, sizeof(*ctx));
1562 spin_lock_init(&ctx->lock); 1582 spin_lock_init(&ctx->lock);
1563 mutex_init(&ctx->mutex); 1583 mutex_init(&ctx->mutex);
1564 INIT_LIST_HEAD(&ctx->group_list); 1584 INIT_LIST_HEAD(&ctx->group_list);
@@ -1633,7 +1653,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1633 } 1653 }
1634 1654
1635 if (!ctx) { 1655 if (!ctx) {
1636 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1656 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1637 err = -ENOMEM; 1657 err = -ENOMEM;
1638 if (!ctx) 1658 if (!ctx)
1639 goto errout; 1659 goto errout;
@@ -1658,6 +1678,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1658 return ERR_PTR(err); 1678 return ERR_PTR(err);
1659} 1679}
1660 1680
1681static void perf_event_free_filter(struct perf_event *event);
1682
1661static void free_event_rcu(struct rcu_head *head) 1683static void free_event_rcu(struct rcu_head *head)
1662{ 1684{
1663 struct perf_event *event; 1685 struct perf_event *event;
@@ -1665,6 +1687,7 @@ static void free_event_rcu(struct rcu_head *head)
1665 event = container_of(head, struct perf_event, rcu_head); 1687 event = container_of(head, struct perf_event, rcu_head);
1666 if (event->ns) 1688 if (event->ns)
1667 put_pid_ns(event->ns); 1689 put_pid_ns(event->ns);
1690 perf_event_free_filter(event);
1668 kfree(event); 1691 kfree(event);
1669} 1692}
1670 1693
@@ -1696,16 +1719,10 @@ static void free_event(struct perf_event *event)
1696 call_rcu(&event->rcu_head, free_event_rcu); 1719 call_rcu(&event->rcu_head, free_event_rcu);
1697} 1720}
1698 1721
1699/* 1722int perf_event_release_kernel(struct perf_event *event)
1700 * Called when the last reference to the file is gone.
1701 */
1702static int perf_release(struct inode *inode, struct file *file)
1703{ 1723{
1704 struct perf_event *event = file->private_data;
1705 struct perf_event_context *ctx = event->ctx; 1724 struct perf_event_context *ctx = event->ctx;
1706 1725
1707 file->private_data = NULL;
1708
1709 WARN_ON_ONCE(ctx->parent_ctx); 1726 WARN_ON_ONCE(ctx->parent_ctx);
1710 mutex_lock(&ctx->mutex); 1727 mutex_lock(&ctx->mutex);
1711 perf_event_remove_from_context(event); 1728 perf_event_remove_from_context(event);
@@ -1720,6 +1737,19 @@ static int perf_release(struct inode *inode, struct file *file)
1720 1737
1721 return 0; 1738 return 0;
1722} 1739}
1740EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1741
1742/*
1743 * Called when the last reference to the file is gone.
1744 */
1745static int perf_release(struct inode *inode, struct file *file)
1746{
1747 struct perf_event *event = file->private_data;
1748
1749 file->private_data = NULL;
1750
1751 return perf_event_release_kernel(event);
1752}
1723 1753
1724static int perf_event_read_size(struct perf_event *event) 1754static int perf_event_read_size(struct perf_event *event)
1725{ 1755{
@@ -1746,91 +1776,94 @@ static int perf_event_read_size(struct perf_event *event)
1746 return size; 1776 return size;
1747} 1777}
1748 1778
1749static u64 perf_event_read_value(struct perf_event *event) 1779u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1750{ 1780{
1751 struct perf_event *child; 1781 struct perf_event *child;
1752 u64 total = 0; 1782 u64 total = 0;
1753 1783
1784 *enabled = 0;
1785 *running = 0;
1786
1787 mutex_lock(&event->child_mutex);
1754 total += perf_event_read(event); 1788 total += perf_event_read(event);
1755 list_for_each_entry(child, &event->child_list, child_list) 1789 *enabled += event->total_time_enabled +
1790 atomic64_read(&event->child_total_time_enabled);
1791 *running += event->total_time_running +
1792 atomic64_read(&event->child_total_time_running);
1793
1794 list_for_each_entry(child, &event->child_list, child_list) {
1756 total += perf_event_read(child); 1795 total += perf_event_read(child);
1796 *enabled += child->total_time_enabled;
1797 *running += child->total_time_running;
1798 }
1799 mutex_unlock(&event->child_mutex);
1757 1800
1758 return total; 1801 return total;
1759} 1802}
1760 1803EXPORT_SYMBOL_GPL(perf_event_read_value);
1761static int perf_event_read_entry(struct perf_event *event,
1762 u64 read_format, char __user *buf)
1763{
1764 int n = 0, count = 0;
1765 u64 values[2];
1766
1767 values[n++] = perf_event_read_value(event);
1768 if (read_format & PERF_FORMAT_ID)
1769 values[n++] = primary_event_id(event);
1770
1771 count = n * sizeof(u64);
1772
1773 if (copy_to_user(buf, values, count))
1774 return -EFAULT;
1775
1776 return count;
1777}
1778 1804
1779static int perf_event_read_group(struct perf_event *event, 1805static int perf_event_read_group(struct perf_event *event,
1780 u64 read_format, char __user *buf) 1806 u64 read_format, char __user *buf)
1781{ 1807{
1782 struct perf_event *leader = event->group_leader, *sub; 1808 struct perf_event *leader = event->group_leader, *sub;
1783 int n = 0, size = 0, err = -EFAULT; 1809 int n = 0, size = 0, ret = -EFAULT;
1784 u64 values[3]; 1810 struct perf_event_context *ctx = leader->ctx;
1811 u64 values[5];
1812 u64 count, enabled, running;
1813
1814 mutex_lock(&ctx->mutex);
1815 count = perf_event_read_value(leader, &enabled, &running);
1785 1816
1786 values[n++] = 1 + leader->nr_siblings; 1817 values[n++] = 1 + leader->nr_siblings;
1787 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1818 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1788 values[n++] = leader->total_time_enabled + 1819 values[n++] = enabled;
1789 atomic64_read(&leader->child_total_time_enabled); 1820 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1790 } 1821 values[n++] = running;
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1822 values[n++] = count;
1792 values[n++] = leader->total_time_running + 1823 if (read_format & PERF_FORMAT_ID)
1793 atomic64_read(&leader->child_total_time_running); 1824 values[n++] = primary_event_id(leader);
1794 }
1795 1825
1796 size = n * sizeof(u64); 1826 size = n * sizeof(u64);
1797 1827
1798 if (copy_to_user(buf, values, size)) 1828 if (copy_to_user(buf, values, size))
1799 return -EFAULT; 1829 goto unlock;
1800
1801 err = perf_event_read_entry(leader, read_format, buf + size);
1802 if (err < 0)
1803 return err;
1804 1830
1805 size += err; 1831 ret = size;
1806 1832
1807 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1833 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1808 err = perf_event_read_entry(sub, read_format, 1834 n = 0;
1809 buf + size); 1835
1810 if (err < 0) 1836 values[n++] = perf_event_read_value(sub, &enabled, &running);
1811 return err; 1837 if (read_format & PERF_FORMAT_ID)
1838 values[n++] = primary_event_id(sub);
1812 1839
1813 size += err; 1840 size = n * sizeof(u64);
1841
1842 if (copy_to_user(buf + ret, values, size)) {
1843 ret = -EFAULT;
1844 goto unlock;
1845 }
1846
1847 ret += size;
1814 } 1848 }
1849unlock:
1850 mutex_unlock(&ctx->mutex);
1815 1851
1816 return size; 1852 return ret;
1817} 1853}
1818 1854
1819static int perf_event_read_one(struct perf_event *event, 1855static int perf_event_read_one(struct perf_event *event,
1820 u64 read_format, char __user *buf) 1856 u64 read_format, char __user *buf)
1821{ 1857{
1858 u64 enabled, running;
1822 u64 values[4]; 1859 u64 values[4];
1823 int n = 0; 1860 int n = 0;
1824 1861
1825 values[n++] = perf_event_read_value(event); 1862 values[n++] = perf_event_read_value(event, &enabled, &running);
1826 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1863 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1827 values[n++] = event->total_time_enabled + 1864 values[n++] = enabled;
1828 atomic64_read(&event->child_total_time_enabled); 1865 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1829 } 1866 values[n++] = running;
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1831 values[n++] = event->total_time_running +
1832 atomic64_read(&event->child_total_time_running);
1833 }
1834 if (read_format & PERF_FORMAT_ID) 1867 if (read_format & PERF_FORMAT_ID)
1835 values[n++] = primary_event_id(event); 1868 values[n++] = primary_event_id(event);
1836 1869
@@ -1861,12 +1894,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861 return -ENOSPC; 1894 return -ENOSPC;
1862 1895
1863 WARN_ON_ONCE(event->ctx->parent_ctx); 1896 WARN_ON_ONCE(event->ctx->parent_ctx);
1864 mutex_lock(&event->child_mutex);
1865 if (read_format & PERF_FORMAT_GROUP) 1897 if (read_format & PERF_FORMAT_GROUP)
1866 ret = perf_event_read_group(event, read_format, buf); 1898 ret = perf_event_read_group(event, read_format, buf);
1867 else 1899 else
1868 ret = perf_event_read_one(event, read_format, buf); 1900 ret = perf_event_read_one(event, read_format, buf);
1869 mutex_unlock(&event->child_mutex);
1870 1901
1871 return ret; 1902 return ret;
1872} 1903}
@@ -1974,7 +2005,8 @@ unlock:
1974 return ret; 2005 return ret;
1975} 2006}
1976 2007
1977int perf_event_set_output(struct perf_event *event, int output_fd); 2008static int perf_event_set_output(struct perf_event *event, int output_fd);
2009static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1978 2010
1979static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2011static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1980{ 2012{
@@ -2002,6 +2034,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2002 case PERF_EVENT_IOC_SET_OUTPUT: 2034 case PERF_EVENT_IOC_SET_OUTPUT:
2003 return perf_event_set_output(event, arg); 2035 return perf_event_set_output(event, arg);
2004 2036
2037 case PERF_EVENT_IOC_SET_FILTER:
2038 return perf_event_set_filter(event, (void __user *)arg);
2039
2005 default: 2040 default:
2006 return -ENOTTY; 2041 return -ENOTTY;
2007 } 2042 }
@@ -2174,6 +2209,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2174 perf_mmap_free_page((unsigned long)data->user_page); 2209 perf_mmap_free_page((unsigned long)data->user_page);
2175 for (i = 0; i < data->nr_pages; i++) 2210 for (i = 0; i < data->nr_pages; i++)
2176 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2211 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2212 kfree(data);
2177} 2213}
2178 2214
2179#else 2215#else
@@ -2214,6 +2250,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2214 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2250 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2215 2251
2216 vfree(base); 2252 vfree(base);
2253 kfree(data);
2217} 2254}
2218 2255
2219static void perf_mmap_data_free(struct perf_mmap_data *data) 2256static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2307,7 +2344,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2307 } 2344 }
2308 2345
2309 if (!data->watermark) 2346 if (!data->watermark)
2310 data->watermark = max_t(long, PAGE_SIZE, max_size / 2); 2347 data->watermark = max_size / 2;
2311 2348
2312 2349
2313 rcu_assign_pointer(event->data, data); 2350 rcu_assign_pointer(event->data, data);
@@ -2319,7 +2356,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2319 2356
2320 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2357 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2321 perf_mmap_data_free(data); 2358 perf_mmap_data_free(data);
2322 kfree(data);
2323} 2359}
2324 2360
2325static void perf_mmap_data_release(struct perf_event *event) 2361static void perf_mmap_data_release(struct perf_event *event)
@@ -2666,20 +2702,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2666static void perf_output_lock(struct perf_output_handle *handle) 2702static void perf_output_lock(struct perf_output_handle *handle)
2667{ 2703{
2668 struct perf_mmap_data *data = handle->data; 2704 struct perf_mmap_data *data = handle->data;
2669 int cpu; 2705 int cur, cpu = get_cpu();
2670 2706
2671 handle->locked = 0; 2707 handle->locked = 0;
2672 2708
2673 local_irq_save(handle->flags); 2709 for (;;) {
2674 cpu = smp_processor_id(); 2710 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2675 2711 if (cur == -1) {
2676 if (in_nmi() && atomic_read(&data->lock) == cpu) 2712 handle->locked = 1;
2677 return; 2713 break;
2714 }
2715 if (cur == cpu)
2716 break;
2678 2717
2679 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2680 cpu_relax(); 2718 cpu_relax();
2681 2719 }
2682 handle->locked = 1;
2683} 2720}
2684 2721
2685static void perf_output_unlock(struct perf_output_handle *handle) 2722static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2725,7 +2762,7 @@ again:
2725 if (atomic_xchg(&data->wakeup, 0)) 2762 if (atomic_xchg(&data->wakeup, 0))
2726 perf_output_wakeup(handle); 2763 perf_output_wakeup(handle);
2727out: 2764out:
2728 local_irq_restore(handle->flags); 2765 put_cpu();
2729} 2766}
2730 2767
2731void perf_output_copy(struct perf_output_handle *handle, 2768void perf_output_copy(struct perf_output_handle *handle,
@@ -3236,15 +3273,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3236{ 3273{
3237 struct perf_event *event; 3274 struct perf_event *event;
3238 3275
3239 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3240 return;
3241
3242 rcu_read_lock();
3243 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3276 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3244 if (perf_event_task_match(event)) 3277 if (perf_event_task_match(event))
3245 perf_event_task_output(event, task_event); 3278 perf_event_task_output(event, task_event);
3246 } 3279 }
3247 rcu_read_unlock();
3248} 3280}
3249 3281
3250static void perf_event_task_event(struct perf_task_event *task_event) 3282static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3252,11 +3284,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3252 struct perf_cpu_context *cpuctx; 3284 struct perf_cpu_context *cpuctx;
3253 struct perf_event_context *ctx = task_event->task_ctx; 3285 struct perf_event_context *ctx = task_event->task_ctx;
3254 3286
3287 rcu_read_lock();
3255 cpuctx = &get_cpu_var(perf_cpu_context); 3288 cpuctx = &get_cpu_var(perf_cpu_context);
3256 perf_event_task_ctx(&cpuctx->ctx, task_event); 3289 perf_event_task_ctx(&cpuctx->ctx, task_event);
3257 put_cpu_var(perf_cpu_context); 3290 put_cpu_var(perf_cpu_context);
3258 3291
3259 rcu_read_lock();
3260 if (!ctx) 3292 if (!ctx)
3261 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3293 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3262 if (ctx) 3294 if (ctx)
@@ -3348,15 +3380,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3348{ 3380{
3349 struct perf_event *event; 3381 struct perf_event *event;
3350 3382
3351 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3352 return;
3353
3354 rcu_read_lock();
3355 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3383 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3356 if (perf_event_comm_match(event)) 3384 if (perf_event_comm_match(event))
3357 perf_event_comm_output(event, comm_event); 3385 perf_event_comm_output(event, comm_event);
3358 } 3386 }
3359 rcu_read_unlock();
3360} 3387}
3361 3388
3362static void perf_event_comm_event(struct perf_comm_event *comm_event) 3389static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3367,7 +3394,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3367 char comm[TASK_COMM_LEN]; 3394 char comm[TASK_COMM_LEN];
3368 3395
3369 memset(comm, 0, sizeof(comm)); 3396 memset(comm, 0, sizeof(comm));
3370 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3397 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3371 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3398 size = ALIGN(strlen(comm)+1, sizeof(u64));
3372 3399
3373 comm_event->comm = comm; 3400 comm_event->comm = comm;
@@ -3375,11 +3402,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3375 3402
3376 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3403 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3377 3404
3405 rcu_read_lock();
3378 cpuctx = &get_cpu_var(perf_cpu_context); 3406 cpuctx = &get_cpu_var(perf_cpu_context);
3379 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3407 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3380 put_cpu_var(perf_cpu_context); 3408 put_cpu_var(perf_cpu_context);
3381 3409
3382 rcu_read_lock();
3383 /* 3410 /*
3384 * doesn't really matter which of the child contexts the 3411 * doesn't really matter which of the child contexts the
3385 * events ends up in. 3412 * events ends up in.
@@ -3472,15 +3499,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3472{ 3499{
3473 struct perf_event *event; 3500 struct perf_event *event;
3474 3501
3475 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3476 return;
3477
3478 rcu_read_lock();
3479 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3502 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3480 if (perf_event_mmap_match(event, mmap_event)) 3503 if (perf_event_mmap_match(event, mmap_event))
3481 perf_event_mmap_output(event, mmap_event); 3504 perf_event_mmap_output(event, mmap_event);
3482 } 3505 }
3483 rcu_read_unlock();
3484} 3506}
3485 3507
3486static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3508static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3536,11 +3558,11 @@ got_name:
3536 3558
3537 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3559 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3538 3560
3561 rcu_read_lock();
3539 cpuctx = &get_cpu_var(perf_cpu_context); 3562 cpuctx = &get_cpu_var(perf_cpu_context);
3540 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3563 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3541 put_cpu_var(perf_cpu_context); 3564 put_cpu_var(perf_cpu_context);
3542 3565
3543 rcu_read_lock();
3544 /* 3566 /*
3545 * doesn't really matter which of the child contexts the 3567 * doesn't really matter which of the child contexts the
3546 * events ends up in. 3568 * events ends up in.
@@ -3679,7 +3701,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3679 perf_event_disable(event); 3701 perf_event_disable(event);
3680 } 3702 }
3681 3703
3682 perf_event_output(event, nmi, data, regs); 3704 if (event->overflow_handler)
3705 event->overflow_handler(event, nmi, data, regs);
3706 else
3707 perf_event_output(event, nmi, data, regs);
3708
3683 return ret; 3709 return ret;
3684} 3710}
3685 3711
@@ -3724,16 +3750,16 @@ again:
3724 return nr; 3750 return nr;
3725} 3751}
3726 3752
3727static void perf_swevent_overflow(struct perf_event *event, 3753static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3728 int nmi, struct perf_sample_data *data, 3754 int nmi, struct perf_sample_data *data,
3729 struct pt_regs *regs) 3755 struct pt_regs *regs)
3730{ 3756{
3731 struct hw_perf_event *hwc = &event->hw; 3757 struct hw_perf_event *hwc = &event->hw;
3732 int throttle = 0; 3758 int throttle = 0;
3733 u64 overflow;
3734 3759
3735 data->period = event->hw.last_period; 3760 data->period = event->hw.last_period;
3736 overflow = perf_swevent_set_period(event); 3761 if (!overflow)
3762 overflow = perf_swevent_set_period(event);
3737 3763
3738 if (hwc->interrupts == MAX_INTERRUPTS) 3764 if (hwc->interrupts == MAX_INTERRUPTS)
3739 return; 3765 return;
@@ -3766,14 +3792,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3766 3792
3767 atomic64_add(nr, &event->count); 3793 atomic64_add(nr, &event->count);
3768 3794
3795 if (!regs)
3796 return;
3797
3769 if (!hwc->sample_period) 3798 if (!hwc->sample_period)
3770 return; 3799 return;
3771 3800
3772 if (!regs) 3801 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3802 return perf_swevent_overflow(event, 1, nmi, data, regs);
3803
3804 if (atomic64_add_negative(nr, &hwc->period_left))
3773 return; 3805 return;
3774 3806
3775 if (!atomic64_add_negative(nr, &hwc->period_left)) 3807 perf_swevent_overflow(event, 0, nmi, data, regs);
3776 perf_swevent_overflow(event, nmi, data, regs);
3777} 3808}
3778 3809
3779static int perf_swevent_is_counting(struct perf_event *event) 3810static int perf_swevent_is_counting(struct perf_event *event)
@@ -3806,25 +3837,44 @@ static int perf_swevent_is_counting(struct perf_event *event)
3806 return 1; 3837 return 1;
3807} 3838}
3808 3839
3840static int perf_tp_event_match(struct perf_event *event,
3841 struct perf_sample_data *data);
3842
3843static int perf_exclude_event(struct perf_event *event,
3844 struct pt_regs *regs)
3845{
3846 if (regs) {
3847 if (event->attr.exclude_user && user_mode(regs))
3848 return 1;
3849
3850 if (event->attr.exclude_kernel && !user_mode(regs))
3851 return 1;
3852 }
3853
3854 return 0;
3855}
3856
3809static int perf_swevent_match(struct perf_event *event, 3857static int perf_swevent_match(struct perf_event *event,
3810 enum perf_type_id type, 3858 enum perf_type_id type,
3811 u32 event_id, struct pt_regs *regs) 3859 u32 event_id,
3860 struct perf_sample_data *data,
3861 struct pt_regs *regs)
3812{ 3862{
3813 if (!perf_swevent_is_counting(event)) 3863 if (!perf_swevent_is_counting(event))
3814 return 0; 3864 return 0;
3815 3865
3816 if (event->attr.type != type) 3866 if (event->attr.type != type)
3817 return 0; 3867 return 0;
3868
3818 if (event->attr.config != event_id) 3869 if (event->attr.config != event_id)
3819 return 0; 3870 return 0;
3820 3871
3821 if (regs) { 3872 if (perf_exclude_event(event, regs))
3822 if (event->attr.exclude_user && user_mode(regs)) 3873 return 0;
3823 return 0;
3824 3874
3825 if (event->attr.exclude_kernel && !user_mode(regs)) 3875 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3826 return 0; 3876 !perf_tp_event_match(event, data))
3827 } 3877 return 0;
3828 3878
3829 return 1; 3879 return 1;
3830} 3880}
@@ -3837,49 +3887,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3837{ 3887{
3838 struct perf_event *event; 3888 struct perf_event *event;
3839 3889
3840 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3841 return;
3842
3843 rcu_read_lock();
3844 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3890 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3845 if (perf_swevent_match(event, type, event_id, regs)) 3891 if (perf_swevent_match(event, type, event_id, data, regs))
3846 perf_swevent_add(event, nr, nmi, data, regs); 3892 perf_swevent_add(event, nr, nmi, data, regs);
3847 } 3893 }
3848 rcu_read_unlock();
3849} 3894}
3850 3895
3851static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 3896int perf_swevent_get_recursion_context(void)
3852{ 3897{
3898 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3899 int rctx;
3900
3853 if (in_nmi()) 3901 if (in_nmi())
3854 return &cpuctx->recursion[3]; 3902 rctx = 3;
3903 else if (in_irq())
3904 rctx = 2;
3905 else if (in_softirq())
3906 rctx = 1;
3907 else
3908 rctx = 0;
3855 3909
3856 if (in_irq()) 3910 if (cpuctx->recursion[rctx]) {
3857 return &cpuctx->recursion[2]; 3911 put_cpu_var(perf_cpu_context);
3912 return -1;
3913 }
3858 3914
3859 if (in_softirq()) 3915 cpuctx->recursion[rctx]++;
3860 return &cpuctx->recursion[1]; 3916 barrier();
3861 3917
3862 return &cpuctx->recursion[0]; 3918 return rctx;
3863} 3919}
3920EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3921
3922void perf_swevent_put_recursion_context(int rctx)
3923{
3924 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3925 barrier();
3926 cpuctx->recursion[rctx]--;
3927 put_cpu_var(perf_cpu_context);
3928}
3929EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3864 3930
3865static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 3931static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3866 u64 nr, int nmi, 3932 u64 nr, int nmi,
3867 struct perf_sample_data *data, 3933 struct perf_sample_data *data,
3868 struct pt_regs *regs) 3934 struct pt_regs *regs)
3869{ 3935{
3870 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3936 struct perf_cpu_context *cpuctx;
3871 int *recursion = perf_swevent_recursion_context(cpuctx);
3872 struct perf_event_context *ctx; 3937 struct perf_event_context *ctx;
3873 3938
3874 if (*recursion) 3939 cpuctx = &__get_cpu_var(perf_cpu_context);
3875 goto out; 3940 rcu_read_lock();
3876
3877 (*recursion)++;
3878 barrier();
3879
3880 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 3941 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3881 nr, nmi, data, regs); 3942 nr, nmi, data, regs);
3882 rcu_read_lock();
3883 /* 3943 /*
3884 * doesn't really matter which of the child contexts the 3944 * doesn't really matter which of the child contexts the
3885 * events ends up in. 3945 * events ends up in.
@@ -3888,23 +3948,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3888 if (ctx) 3948 if (ctx)
3889 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 3949 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3890 rcu_read_unlock(); 3950 rcu_read_unlock();
3891
3892 barrier();
3893 (*recursion)--;
3894
3895out:
3896 put_cpu_var(perf_cpu_context);
3897} 3951}
3898 3952
3899void __perf_sw_event(u32 event_id, u64 nr, int nmi, 3953void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3900 struct pt_regs *regs, u64 addr) 3954 struct pt_regs *regs, u64 addr)
3901{ 3955{
3902 struct perf_sample_data data = { 3956 struct perf_sample_data data;
3903 .addr = addr, 3957 int rctx;
3904 };
3905 3958
3906 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 3959 rctx = perf_swevent_get_recursion_context();
3907 &data, regs); 3960 if (rctx < 0)
3961 return;
3962
3963 data.addr = addr;
3964 data.raw = NULL;
3965
3966 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3967
3968 perf_swevent_put_recursion_context(rctx);
3908} 3969}
3909 3970
3910static void perf_swevent_read(struct perf_event *event) 3971static void perf_swevent_read(struct perf_event *event)
@@ -3949,6 +4010,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3949 event->pmu->read(event); 4010 event->pmu->read(event);
3950 4011
3951 data.addr = 0; 4012 data.addr = 0;
4013 data.raw = NULL;
4014 data.period = event->hw.last_period;
3952 regs = get_irq_regs(); 4015 regs = get_irq_regs();
3953 /* 4016 /*
3954 * In case we exclude kernel IPs or are somehow not in interrupt 4017 * In case we exclude kernel IPs or are somehow not in interrupt
@@ -4017,8 +4080,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4017 u64 now; 4080 u64 now;
4018 4081
4019 now = cpu_clock(cpu); 4082 now = cpu_clock(cpu);
4020 prev = atomic64_read(&event->hw.prev_count); 4083 prev = atomic64_xchg(&event->hw.prev_count, now);
4021 atomic64_set(&event->hw.prev_count, now);
4022 atomic64_add(now - prev, &event->count); 4084 atomic64_add(now - prev, &event->count);
4023} 4085}
4024 4086
@@ -4108,6 +4170,7 @@ static const struct pmu perf_ops_task_clock = {
4108}; 4170};
4109 4171
4110#ifdef CONFIG_EVENT_PROFILE 4172#ifdef CONFIG_EVENT_PROFILE
4173
4111void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4174void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4112 int entry_size) 4175 int entry_size)
4113{ 4176{
@@ -4126,13 +4189,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4126 if (!regs) 4189 if (!regs)
4127 regs = task_pt_regs(current); 4190 regs = task_pt_regs(current);
4128 4191
4192 /* Trace events already protected against recursion */
4129 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4193 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4130 &data, regs); 4194 &data, regs);
4131} 4195}
4132EXPORT_SYMBOL_GPL(perf_tp_event); 4196EXPORT_SYMBOL_GPL(perf_tp_event);
4133 4197
4134extern int ftrace_profile_enable(int); 4198static int perf_tp_event_match(struct perf_event *event,
4135extern void ftrace_profile_disable(int); 4199 struct perf_sample_data *data)
4200{
4201 void *record = data->raw->data;
4202
4203 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4204 return 1;
4205 return 0;
4206}
4136 4207
4137static void tp_perf_event_destroy(struct perf_event *event) 4208static void tp_perf_event_destroy(struct perf_event *event)
4138{ 4209{
@@ -4157,11 +4228,93 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4157 4228
4158 return &perf_ops_generic; 4229 return &perf_ops_generic;
4159} 4230}
4231
4232static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4233{
4234 char *filter_str;
4235 int ret;
4236
4237 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4238 return -EINVAL;
4239
4240 filter_str = strndup_user(arg, PAGE_SIZE);
4241 if (IS_ERR(filter_str))
4242 return PTR_ERR(filter_str);
4243
4244 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4245
4246 kfree(filter_str);
4247 return ret;
4248}
4249
4250static void perf_event_free_filter(struct perf_event *event)
4251{
4252 ftrace_profile_free_filter(event);
4253}
4254
4160#else 4255#else
4256
4257static int perf_tp_event_match(struct perf_event *event,
4258 struct perf_sample_data *data)
4259{
4260 return 1;
4261}
4262
4161static const struct pmu *tp_perf_event_init(struct perf_event *event) 4263static const struct pmu *tp_perf_event_init(struct perf_event *event)
4162{ 4264{
4163 return NULL; 4265 return NULL;
4164} 4266}
4267
4268static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4269{
4270 return -ENOENT;
4271}
4272
4273static void perf_event_free_filter(struct perf_event *event)
4274{
4275}
4276
4277#endif /* CONFIG_EVENT_PROFILE */
4278
4279#ifdef CONFIG_HAVE_HW_BREAKPOINT
4280static void bp_perf_event_destroy(struct perf_event *event)
4281{
4282 release_bp_slot(event);
4283}
4284
4285static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4286{
4287 int err;
4288
4289 err = register_perf_hw_breakpoint(bp);
4290 if (err)
4291 return ERR_PTR(err);
4292
4293 bp->destroy = bp_perf_event_destroy;
4294
4295 return &perf_ops_bp;
4296}
4297
4298void perf_bp_event(struct perf_event *bp, void *data)
4299{
4300 struct perf_sample_data sample;
4301 struct pt_regs *regs = data;
4302
4303 sample.raw = NULL;
4304 sample.addr = bp->attr.bp_addr;
4305
4306 if (!perf_exclude_event(bp, regs))
4307 perf_swevent_add(bp, 1, 1, &sample, regs);
4308}
4309#else
4310static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4311{
4312 return NULL;
4313}
4314
4315void perf_bp_event(struct perf_event *bp, void *regs)
4316{
4317}
4165#endif 4318#endif
4166 4319
4167atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4320atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4208,6 +4361,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4208 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4361 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4209 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4362 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4210 case PERF_COUNT_SW_CPU_MIGRATIONS: 4363 case PERF_COUNT_SW_CPU_MIGRATIONS:
4364 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4365 case PERF_COUNT_SW_EMULATION_FAULTS:
4211 if (!event->parent) { 4366 if (!event->parent) {
4212 atomic_inc(&perf_swevent_enabled[event_id]); 4367 atomic_inc(&perf_swevent_enabled[event_id]);
4213 event->destroy = sw_perf_event_destroy; 4368 event->destroy = sw_perf_event_destroy;
@@ -4228,6 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4228 struct perf_event_context *ctx, 4383 struct perf_event_context *ctx,
4229 struct perf_event *group_leader, 4384 struct perf_event *group_leader,
4230 struct perf_event *parent_event, 4385 struct perf_event *parent_event,
4386 perf_overflow_handler_t overflow_handler,
4231 gfp_t gfpflags) 4387 gfp_t gfpflags)
4232{ 4388{
4233 const struct pmu *pmu; 4389 const struct pmu *pmu;
@@ -4270,6 +4426,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4270 4426
4271 event->state = PERF_EVENT_STATE_INACTIVE; 4427 event->state = PERF_EVENT_STATE_INACTIVE;
4272 4428
4429 if (!overflow_handler && parent_event)
4430 overflow_handler = parent_event->overflow_handler;
4431
4432 event->overflow_handler = overflow_handler;
4433
4273 if (attr->disabled) 4434 if (attr->disabled)
4274 event->state = PERF_EVENT_STATE_OFF; 4435 event->state = PERF_EVENT_STATE_OFF;
4275 4436
@@ -4304,6 +4465,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4304 pmu = tp_perf_event_init(event); 4465 pmu = tp_perf_event_init(event);
4305 break; 4466 break;
4306 4467
4468 case PERF_TYPE_BREAKPOINT:
4469 pmu = bp_perf_event_init(event);
4470 break;
4471
4472
4307 default: 4473 default:
4308 break; 4474 break;
4309 } 4475 }
@@ -4416,7 +4582,7 @@ err_size:
4416 goto out; 4582 goto out;
4417} 4583}
4418 4584
4419int perf_event_set_output(struct perf_event *event, int output_fd) 4585static int perf_event_set_output(struct perf_event *event, int output_fd)
4420{ 4586{
4421 struct perf_event *output_event = NULL; 4587 struct perf_event *output_event = NULL;
4422 struct file *output_file = NULL; 4588 struct file *output_file = NULL;
@@ -4546,7 +4712,7 @@ SYSCALL_DEFINE5(perf_event_open,
4546 } 4712 }
4547 4713
4548 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4714 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4549 NULL, GFP_KERNEL); 4715 NULL, NULL, GFP_KERNEL);
4550 err = PTR_ERR(event); 4716 err = PTR_ERR(event);
4551 if (IS_ERR(event)) 4717 if (IS_ERR(event))
4552 goto err_put_context; 4718 goto err_put_context;
@@ -4594,6 +4760,61 @@ err_put_context:
4594 return err; 4760 return err;
4595} 4761}
4596 4762
4763/**
4764 * perf_event_create_kernel_counter
4765 *
4766 * @attr: attributes of the counter to create
4767 * @cpu: cpu in which the counter is bound
4768 * @pid: task to profile
4769 */
4770struct perf_event *
4771perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4772 pid_t pid,
4773 perf_overflow_handler_t overflow_handler)
4774{
4775 struct perf_event *event;
4776 struct perf_event_context *ctx;
4777 int err;
4778
4779 /*
4780 * Get the target context (task or percpu):
4781 */
4782
4783 ctx = find_get_context(pid, cpu);
4784 if (IS_ERR(ctx)) {
4785 err = PTR_ERR(ctx);
4786 goto err_exit;
4787 }
4788
4789 event = perf_event_alloc(attr, cpu, ctx, NULL,
4790 NULL, overflow_handler, GFP_KERNEL);
4791 if (IS_ERR(event)) {
4792 err = PTR_ERR(event);
4793 goto err_put_context;
4794 }
4795
4796 event->filp = NULL;
4797 WARN_ON_ONCE(ctx->parent_ctx);
4798 mutex_lock(&ctx->mutex);
4799 perf_install_in_context(ctx, event, cpu);
4800 ++ctx->generation;
4801 mutex_unlock(&ctx->mutex);
4802
4803 event->owner = current;
4804 get_task_struct(current);
4805 mutex_lock(&current->perf_event_mutex);
4806 list_add_tail(&event->owner_entry, &current->perf_event_list);
4807 mutex_unlock(&current->perf_event_mutex);
4808
4809 return event;
4810
4811 err_put_context:
4812 put_ctx(ctx);
4813 err_exit:
4814 return ERR_PTR(err);
4815}
4816EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4817
4597/* 4818/*
4598 * inherit a event from parent task to child task: 4819 * inherit a event from parent task to child task:
4599 */ 4820 */
@@ -4619,7 +4840,7 @@ inherit_event(struct perf_event *parent_event,
4619 child_event = perf_event_alloc(&parent_event->attr, 4840 child_event = perf_event_alloc(&parent_event->attr,
4620 parent_event->cpu, child_ctx, 4841 parent_event->cpu, child_ctx,
4621 group_leader, parent_event, 4842 group_leader, parent_event,
4622 GFP_KERNEL); 4843 NULL, GFP_KERNEL);
4623 if (IS_ERR(child_event)) 4844 if (IS_ERR(child_event))
4624 return child_event; 4845 return child_event;
4625 get_ctx(child_ctx); 4846 get_ctx(child_ctx);
@@ -4637,6 +4858,8 @@ inherit_event(struct perf_event *parent_event,
4637 if (parent_event->attr.freq) 4858 if (parent_event->attr.freq)
4638 child_event->hw.sample_period = parent_event->hw.sample_period; 4859 child_event->hw.sample_period = parent_event->hw.sample_period;
4639 4860
4861 child_event->overflow_handler = parent_event->overflow_handler;
4862
4640 /* 4863 /*
4641 * Link it up in the child's context: 4864 * Link it up in the child's context:
4642 */ 4865 */
@@ -4726,7 +4949,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4726{ 4949{
4727 struct perf_event *parent_event; 4950 struct perf_event *parent_event;
4728 4951
4729 update_event_times(child_event);
4730 perf_event_remove_from_context(child_event); 4952 perf_event_remove_from_context(child_event);
4731 4953
4732 parent_event = child_event->parent; 4954 parent_event = child_event->parent;
@@ -4778,6 +5000,7 @@ void perf_event_exit_task(struct task_struct *child)
4778 * the events from it. 5000 * the events from it.
4779 */ 5001 */
4780 unclone_ctx(child_ctx); 5002 unclone_ctx(child_ctx);
5003 update_context_time(child_ctx);
4781 spin_unlock_irqrestore(&child_ctx->lock, flags); 5004 spin_unlock_irqrestore(&child_ctx->lock, flags);
4782 5005
4783 /* 5006 /*
@@ -4861,7 +5084,7 @@ again:
4861 */ 5084 */
4862int perf_event_init_task(struct task_struct *child) 5085int perf_event_init_task(struct task_struct *child)
4863{ 5086{
4864 struct perf_event_context *child_ctx, *parent_ctx; 5087 struct perf_event_context *child_ctx = NULL, *parent_ctx;
4865 struct perf_event_context *cloned_ctx; 5088 struct perf_event_context *cloned_ctx;
4866 struct perf_event *event; 5089 struct perf_event *event;
4867 struct task_struct *parent = current; 5090 struct task_struct *parent = current;
@@ -4877,20 +5100,6 @@ int perf_event_init_task(struct task_struct *child)
4877 return 0; 5100 return 0;
4878 5101
4879 /* 5102 /*
4880 * This is executed from the parent task context, so inherit
4881 * events that have been marked for cloning.
4882 * First allocate and initialize a context for the child.
4883 */
4884
4885 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4886 if (!child_ctx)
4887 return -ENOMEM;
4888
4889 __perf_event_init_context(child_ctx, child);
4890 child->perf_event_ctxp = child_ctx;
4891 get_task_struct(child);
4892
4893 /*
4894 * If the parent's context is a clone, pin it so it won't get 5103 * If the parent's context is a clone, pin it so it won't get
4895 * swapped under us. 5104 * swapped under us.
4896 */ 5105 */
@@ -4920,6 +5129,26 @@ int perf_event_init_task(struct task_struct *child)
4920 continue; 5129 continue;
4921 } 5130 }
4922 5131
5132 if (!child->perf_event_ctxp) {
5133 /*
5134 * This is executed from the parent task context, so
5135 * inherit events that have been marked for cloning.
5136 * First allocate and initialize a context for the
5137 * child.
5138 */
5139
5140 child_ctx = kzalloc(sizeof(struct perf_event_context),
5141 GFP_KERNEL);
5142 if (!child_ctx) {
5143 ret = -ENOMEM;
5144 goto exit;
5145 }
5146
5147 __perf_event_init_context(child_ctx, child);
5148 child->perf_event_ctxp = child_ctx;
5149 get_task_struct(child);
5150 }
5151
4923 ret = inherit_group(event, parent, parent_ctx, 5152 ret = inherit_group(event, parent, parent_ctx,
4924 child, child_ctx); 5153 child, child_ctx);
4925 if (ret) { 5154 if (ret) {
@@ -4948,6 +5177,7 @@ int perf_event_init_task(struct task_struct *child)
4948 get_ctx(child_ctx->parent_ctx); 5177 get_ctx(child_ctx->parent_ctx);
4949 } 5178 }
4950 5179
5180exit:
4951 mutex_unlock(&parent_ctx->mutex); 5181 mutex_unlock(&parent_ctx->mutex);
4952 5182
4953 perf_unpin_context(parent_ctx); 5183 perf_unpin_context(parent_ctx);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/time.h> 34#include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344} 343}
345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
346 345
347#define PID_NAME_LEN sizeof("process_1234567890") 346#define PID_NAME_LEN 32
348static char name[PID_NAME_LEN];
349 347
350static int pm_qos_power_open(struct inode *inode, struct file *filp) 348static int pm_qos_power_open(struct inode *inode, struct file *filp)
351{ 349{
352 int ret; 350 int ret;
353 long pm_qos_class; 351 long pm_qos_class;
352 char name[PID_NAME_LEN];
354 353
355 lock_kernel();
356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
357 if (pm_qos_class >= 0) { 355 if (pm_qos_class >= 0) {
358 filp->private_data = (void *)pm_qos_class; 356 filp->private_data = (void *)pm_qos_class;
359 sprintf(name, "process_%d", current->pid); 357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
360 ret = pm_qos_add_requirement(pm_qos_class, name, 358 ret = pm_qos_add_requirement(pm_qos_class, name,
361 PM_QOS_DEFAULT_VALUE); 359 PM_QOS_DEFAULT_VALUE);
362 if (ret >= 0) { 360 if (ret >= 0)
363 unlock_kernel();
364 return 0; 361 return 0;
365 }
366 } 362 }
367 unlock_kernel();
368
369 return -EPERM; 363 return -EPERM;
370} 364}
371 365
372static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
373{ 367{
374 int pm_qos_class; 368 int pm_qos_class;
369 char name[PID_NAME_LEN];
375 370
376 pm_qos_class = (long)filp->private_data; 371 pm_qos_class = (long)filp->private_data;
377 sprintf(name, "process_%d", current->pid); 372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
378 pm_qos_remove_requirement(pm_qos_class, name); 373 pm_qos_remove_requirement(pm_qos_class, name);
379 374
380 return 0; 375 return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
385{ 380{
386 s32 value; 381 s32 value;
387 int pm_qos_class; 382 int pm_qos_class;
383 char name[PID_NAME_LEN];
388 384
389 pm_qos_class = (long)filp->private_data; 385 pm_qos_class = (long)filp->private_data;
390 if (count != sizeof(s32)) 386 if (count != sizeof(s32))
391 return -EINVAL; 387 return -EINVAL;
392 if (copy_from_user(&value, buf, sizeof(s32))) 388 if (copy_from_user(&value, buf, sizeof(s32)))
393 return -EFAULT; 389 return -EFAULT;
394 sprintf(name, "process_%d", current->pid); 390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
395 pm_qos_update_requirement(pm_qos_class, name, value); 391 pm_qos_update_requirement(pm_qos_class, name, value);
396 392
397 return sizeof(s32); 393 return sizeof(s32);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747b..438ff4523513 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
384 384
385/* 385/*
386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. 386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
387 * This is called from sys_timer_create with the new timer already locked. 387 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
388 * new timer already all-zeros initialized.
388 */ 389 */
389int posix_cpu_timer_create(struct k_itimer *new_timer) 390int posix_cpu_timer_create(struct k_itimer *new_timer)
390{ 391{
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
396 return -EINVAL; 397 return -EINVAL;
397 398
398 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 399 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
399 new_timer->it.cpu.incr.sched = 0;
400 new_timer->it.cpu.expires.sched = 0;
401 400
402 read_lock(&tasklist_lock); 401 read_lock(&tasklist_lock);
403 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 402 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c3b81c30e5d5..43191815f874 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 13
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04a9e90d248f..bbfe472d7524 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -32,6 +32,7 @@ static int noresume = 0;
32static char resume_file[256] = CONFIG_PM_STD_PARTITION; 32static char resume_file[256] = CONFIG_PM_STD_PARTITION;
33dev_t swsusp_resume_device; 33dev_t swsusp_resume_device;
34sector_t swsusp_resume_block; 34sector_t swsusp_resume_block;
35int in_suspend __nosavedata = 0;
35 36
36enum { 37enum {
37 HIBERNATION_INVALID, 38 HIBERNATION_INVALID,
@@ -202,6 +203,35 @@ static void platform_recover(int platform_mode)
202} 203}
203 204
204/** 205/**
206 * swsusp_show_speed - print the time elapsed between two events.
207 * @start: Starting event.
208 * @stop: Final event.
209 * @nr_pages - number of pages processed between @start and @stop
210 * @msg - introductory message to print
211 */
212
213void swsusp_show_speed(struct timeval *start, struct timeval *stop,
214 unsigned nr_pages, char *msg)
215{
216 s64 elapsed_centisecs64;
217 int centisecs;
218 int k;
219 int kps;
220
221 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
222 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
223 centisecs = elapsed_centisecs64;
224 if (centisecs == 0)
225 centisecs = 1; /* avoid div-by-zero */
226 k = nr_pages * (PAGE_SIZE / 1024);
227 kps = (k * 100) / centisecs;
228 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
229 msg, k,
230 centisecs / 100, centisecs % 100,
231 kps / 1000, (kps % 1000) / 10);
232}
233
234/**
205 * create_image - freeze devices that need to be frozen with interrupts 235 * create_image - freeze devices that need to be frozen with interrupts
206 * off, create the hibernation image and thaw those devices. Control 236 * off, create the hibernation image and thaw those devices. Control
207 * reappears in this routine after a restore. 237 * reappears in this routine after a restore.
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 347d2cc88cd0..0998c7139053 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -220,6 +220,7 @@ static struct attribute_group attr_group = {
220 220
221#ifdef CONFIG_PM_RUNTIME 221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq; 222struct workqueue_struct *pm_wq;
223EXPORT_SYMBOL_GPL(pm_wq);
223 224
224static int __init pm_start_workqueue(void) 225static int __init pm_start_workqueue(void)
225{ 226{
diff --git a/kernel/power/process.c b/kernel/power/process.c
index cc2e55373b68..5ade1bdcf366 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h>
17 18
18/* 19/*
19 * Timeout for stopping processes 20 * Timeout for stopping processes
@@ -41,7 +42,7 @@ static int try_to_freeze_tasks(bool sig_only)
41 do_gettimeofday(&start); 42 do_gettimeofday(&start);
42 43
43 end_time = jiffies + TIMEOUT; 44 end_time = jiffies + TIMEOUT;
44 do { 45 while (true) {
45 todo = 0; 46 todo = 0;
46 read_lock(&tasklist_lock); 47 read_lock(&tasklist_lock);
47 do_each_thread(g, p) { 48 do_each_thread(g, p) {
@@ -62,10 +63,15 @@ static int try_to_freeze_tasks(bool sig_only)
62 todo++; 63 todo++;
63 } while_each_thread(g, p); 64 } while_each_thread(g, p);
64 read_unlock(&tasklist_lock); 65 read_unlock(&tasklist_lock);
65 yield(); /* Yield is okay here */ 66 if (!todo || time_after(jiffies, end_time))
66 if (time_after(jiffies, end_time))
67 break; 67 break;
68 } while (todo); 68
69 /*
70 * We need to retry, but first give the freezing tasks some
71 * time to enter the regrigerator.
72 */
73 msleep(10);
74 }
69 75
70 do_gettimeofday(&end); 76 do_gettimeofday(&end);
71 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); 77 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 890f6b11b1d3..09b2b0ae9e9d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -38,6 +38,107 @@ struct swsusp_header {
38 38
39static struct swsusp_header *swsusp_header; 39static struct swsusp_header *swsusp_header;
40 40
41/**
42 * The following functions are used for tracing the allocated
43 * swap pages, so that they can be freed in case of an error.
44 */
45
46struct swsusp_extent {
47 struct rb_node node;
48 unsigned long start;
49 unsigned long end;
50};
51
52static struct rb_root swsusp_extents = RB_ROOT;
53
54static int swsusp_extents_insert(unsigned long swap_offset)
55{
56 struct rb_node **new = &(swsusp_extents.rb_node);
57 struct rb_node *parent = NULL;
58 struct swsusp_extent *ext;
59
60 /* Figure out where to put the new node */
61 while (*new) {
62 ext = container_of(*new, struct swsusp_extent, node);
63 parent = *new;
64 if (swap_offset < ext->start) {
65 /* Try to merge */
66 if (swap_offset == ext->start - 1) {
67 ext->start--;
68 return 0;
69 }
70 new = &((*new)->rb_left);
71 } else if (swap_offset > ext->end) {
72 /* Try to merge */
73 if (swap_offset == ext->end + 1) {
74 ext->end++;
75 return 0;
76 }
77 new = &((*new)->rb_right);
78 } else {
79 /* It already is in the tree */
80 return -EINVAL;
81 }
82 }
83 /* Add the new node and rebalance the tree. */
84 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
85 if (!ext)
86 return -ENOMEM;
87
88 ext->start = swap_offset;
89 ext->end = swap_offset;
90 rb_link_node(&ext->node, parent, new);
91 rb_insert_color(&ext->node, &swsusp_extents);
92 return 0;
93}
94
95/**
96 * alloc_swapdev_block - allocate a swap page and register that it has
97 * been allocated, so that it can be freed in case of an error.
98 */
99
100sector_t alloc_swapdev_block(int swap)
101{
102 unsigned long offset;
103
104 offset = swp_offset(get_swap_page_of_type(swap));
105 if (offset) {
106 if (swsusp_extents_insert(offset))
107 swap_free(swp_entry(swap, offset));
108 else
109 return swapdev_block(swap, offset);
110 }
111 return 0;
112}
113
114/**
115 * free_all_swap_pages - free swap pages allocated for saving image data.
116 * It also frees the extents used to register which swap entres had been
117 * allocated.
118 */
119
120void free_all_swap_pages(int swap)
121{
122 struct rb_node *node;
123
124 while ((node = swsusp_extents.rb_node)) {
125 struct swsusp_extent *ext;
126 unsigned long offset;
127
128 ext = container_of(node, struct swsusp_extent, node);
129 rb_erase(node, &swsusp_extents);
130 for (offset = ext->start; offset <= ext->end; offset++)
131 swap_free(swp_entry(swap, offset));
132
133 kfree(ext);
134 }
135}
136
137int swsusp_swap_in_use(void)
138{
139 return (swsusp_extents.rb_node != NULL);
140}
141
41/* 142/*
42 * General things 143 * General things
43 */ 144 */
@@ -336,7 +437,7 @@ static int save_image(struct swap_map_handle *handle,
336 if (ret) 437 if (ret)
337 break; 438 break;
338 if (!(nr_pages % m)) 439 if (!(nr_pages % m))
339 printk("\b\b\b\b%3d%%", nr_pages / m); 440 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
340 nr_pages++; 441 nr_pages++;
341 } 442 }
342 err2 = wait_on_bio_chain(&bio); 443 err2 = wait_on_bio_chain(&bio);
@@ -344,9 +445,9 @@ static int save_image(struct swap_map_handle *handle,
344 if (!ret) 445 if (!ret)
345 ret = err2; 446 ret = err2;
346 if (!ret) 447 if (!ret)
347 printk("\b\b\b\bdone\n"); 448 printk(KERN_CONT "\b\b\b\bdone\n");
348 else 449 else
349 printk("\n"); 450 printk(KERN_CONT "\n");
350 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 451 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
351 return ret; 452 return ret;
352} 453}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 6a07f4dbf2f8..5b3601bd1893 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -56,133 +56,3 @@
56#include "power.h" 56#include "power.h"
57 57
58int in_suspend __nosavedata = 0; 58int in_suspend __nosavedata = 0;
59
60/**
61 * The following functions are used for tracing the allocated
62 * swap pages, so that they can be freed in case of an error.
63 */
64
65struct swsusp_extent {
66 struct rb_node node;
67 unsigned long start;
68 unsigned long end;
69};
70
71static struct rb_root swsusp_extents = RB_ROOT;
72
73static int swsusp_extents_insert(unsigned long swap_offset)
74{
75 struct rb_node **new = &(swsusp_extents.rb_node);
76 struct rb_node *parent = NULL;
77 struct swsusp_extent *ext;
78
79 /* Figure out where to put the new node */
80 while (*new) {
81 ext = container_of(*new, struct swsusp_extent, node);
82 parent = *new;
83 if (swap_offset < ext->start) {
84 /* Try to merge */
85 if (swap_offset == ext->start - 1) {
86 ext->start--;
87 return 0;
88 }
89 new = &((*new)->rb_left);
90 } else if (swap_offset > ext->end) {
91 /* Try to merge */
92 if (swap_offset == ext->end + 1) {
93 ext->end++;
94 return 0;
95 }
96 new = &((*new)->rb_right);
97 } else {
98 /* It already is in the tree */
99 return -EINVAL;
100 }
101 }
102 /* Add the new node and rebalance the tree. */
103 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
104 if (!ext)
105 return -ENOMEM;
106
107 ext->start = swap_offset;
108 ext->end = swap_offset;
109 rb_link_node(&ext->node, parent, new);
110 rb_insert_color(&ext->node, &swsusp_extents);
111 return 0;
112}
113
114/**
115 * alloc_swapdev_block - allocate a swap page and register that it has
116 * been allocated, so that it can be freed in case of an error.
117 */
118
119sector_t alloc_swapdev_block(int swap)
120{
121 unsigned long offset;
122
123 offset = swp_offset(get_swap_page_of_type(swap));
124 if (offset) {
125 if (swsusp_extents_insert(offset))
126 swap_free(swp_entry(swap, offset));
127 else
128 return swapdev_block(swap, offset);
129 }
130 return 0;
131}
132
133/**
134 * free_all_swap_pages - free swap pages allocated for saving image data.
135 * It also frees the extents used to register which swap entres had been
136 * allocated.
137 */
138
139void free_all_swap_pages(int swap)
140{
141 struct rb_node *node;
142
143 while ((node = swsusp_extents.rb_node)) {
144 struct swsusp_extent *ext;
145 unsigned long offset;
146
147 ext = container_of(node, struct swsusp_extent, node);
148 rb_erase(node, &swsusp_extents);
149 for (offset = ext->start; offset <= ext->end; offset++)
150 swap_free(swp_entry(swap, offset));
151
152 kfree(ext);
153 }
154}
155
156int swsusp_swap_in_use(void)
157{
158 return (swsusp_extents.rb_node != NULL);
159}
160
161/**
162 * swsusp_show_speed - print the time elapsed between two events represented by
163 * @start and @stop
164 *
165 * @nr_pages - number of pages processed between @start and @stop
166 * @msg - introductory message to print
167 */
168
169void swsusp_show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
185 msg, k,
186 centisecs / 100, centisecs % 100,
187 kps / 1000, (kps % 1000) / 10);
188}
diff --git a/kernel/printk.c b/kernel/printk.c
index f38b07f78a4e..b5ac4d99c667 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h>
36 37
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38 39
@@ -1376,11 +1377,11 @@ late_initcall(disable_boot_consoles);
1376 */ 1377 */
1377DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); 1378DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1378 1379
1379int printk_ratelimit(void) 1380int __printk_ratelimit(const char *func)
1380{ 1381{
1381 return __ratelimit(&printk_ratelimit_state); 1382 return ___ratelimit(&printk_ratelimit_state, func);
1382} 1383}
1383EXPORT_SYMBOL(printk_ratelimit); 1384EXPORT_SYMBOL(__printk_ratelimit);
1384 1385
1385/** 1386/**
1386 * printk_timed_ratelimit - caller-controlled printk ratelimiting 1387 * printk_timed_ratelimit - caller-controlled printk ratelimiting
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 400183346ad2..9b7fd4723878 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48 47
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 48#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 49static struct lock_class_key rcu_lock_key;
@@ -53,8 +52,6 @@ struct lockdep_map rcu_lock_map =
53EXPORT_SYMBOL_GPL(rcu_lock_map); 52EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif 53#endif
55 54
56int rcu_scheduler_active __read_mostly;
57
58/* 55/*
59 * Awaken the corresponding synchronize_rcu() instance now that a 56 * Awaken the corresponding synchronize_rcu() instance now that a
60 * grace period has elapsed. 57 * grace period has elapsed.
@@ -66,122 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head)
66 rcu = container_of(head, struct rcu_synchronize, head); 63 rcu = container_of(head, struct rcu_synchronize, head);
67 complete(&rcu->completion); 64 complete(&rcu->completion);
68} 65}
69
70#ifdef CONFIG_TREE_PREEMPT_RCU
71
72/**
73 * synchronize_rcu - wait until a grace period has elapsed.
74 *
75 * Control will return to the caller some time after a full grace
76 * period has elapsed, in other words after all currently executing RCU
77 * read-side critical sections have completed. RCU read-side critical
78 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
79 * and may be nested.
80 */
81void synchronize_rcu(void)
82{
83 struct rcu_synchronize rcu;
84
85 if (!rcu_scheduler_active)
86 return;
87
88 init_completion(&rcu.completion);
89 /* Will wake me after RCU finished. */
90 call_rcu(&rcu.head, wakeme_after_rcu);
91 /* Wait for it. */
92 wait_for_completion(&rcu.completion);
93}
94EXPORT_SYMBOL_GPL(synchronize_rcu);
95
96#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
97
98/**
99 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
100 *
101 * Control will return to the caller some time after a full rcu-sched
102 * grace period has elapsed, in other words after all currently executing
103 * rcu-sched read-side critical sections have completed. These read-side
104 * critical sections are delimited by rcu_read_lock_sched() and
105 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
106 * local_irq_disable(), and so on may be used in place of
107 * rcu_read_lock_sched().
108 *
109 * This means that all preempt_disable code sequences, including NMI and
110 * hardware-interrupt handlers, in progress on entry will have completed
111 * before this primitive returns. However, this does not guarantee that
112 * softirq handlers will have completed, since in some kernels, these
113 * handlers can run in process context, and can block.
114 *
115 * This primitive provides the guarantees made by the (now removed)
116 * synchronize_kernel() API. In contrast, synchronize_rcu() only
117 * guarantees that rcu_read_lock() sections will have completed.
118 * In "classic RCU", these two guarantees happen to be one and
119 * the same, but can differ in realtime RCU implementations.
120 */
121void synchronize_sched(void)
122{
123 struct rcu_synchronize rcu;
124
125 if (rcu_blocking_is_gp())
126 return;
127
128 init_completion(&rcu.completion);
129 /* Will wake me after RCU finished. */
130 call_rcu_sched(&rcu.head, wakeme_after_rcu);
131 /* Wait for it. */
132 wait_for_completion(&rcu.completion);
133}
134EXPORT_SYMBOL_GPL(synchronize_sched);
135
136/**
137 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
138 *
139 * Control will return to the caller some time after a full rcu_bh grace
140 * period has elapsed, in other words after all currently executing rcu_bh
141 * read-side critical sections have completed. RCU read-side critical
142 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
143 * and may be nested.
144 */
145void synchronize_rcu_bh(void)
146{
147 struct rcu_synchronize rcu;
148
149 if (rcu_blocking_is_gp())
150 return;
151
152 init_completion(&rcu.completion);
153 /* Will wake me after RCU finished. */
154 call_rcu_bh(&rcu.head, wakeme_after_rcu);
155 /* Wait for it. */
156 wait_for_completion(&rcu.completion);
157}
158EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
159
160static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
161 unsigned long action, void *hcpu)
162{
163 return rcu_cpu_notify(self, action, hcpu);
164}
165
166void __init rcu_init(void)
167{
168 int i;
169
170 __rcu_init();
171 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
172
173 /*
174 * We don't need protection against CPU-hotplug here because
175 * this is called early in boot, before either interrupts
176 * or the scheduler are operational.
177 */
178 for_each_online_cpu(i)
179 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
180}
181
182void rcu_scheduler_starting(void)
183{
184 WARN_ON(num_online_cpus() != 1);
185 WARN_ON(nr_context_switches() > 0);
186 rcu_scheduler_active = 1;
187}
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..9f6d9ff2572c
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,282 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h>
27#include <linux/interrupt.h>
28#include <linux/notifier.h>
29#include <linux/rcupdate.h>
30#include <linux/kernel.h>
31#include <linux/module.h>
32#include <linux/mutex.h>
33#include <linux/sched.h>
34#include <linux/types.h>
35#include <linux/init.h>
36#include <linux/time.h>
37#include <linux/cpu.h>
38
39/* Global control variables for rcupdate callback mechanism. */
40struct rcu_ctrlblk {
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43 struct rcu_head **curtail; /* ->next pointer of last CB. */
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_NO_HZ
58
59static long rcu_dynticks_nesting = 1;
60
61/*
62 * Enter dynticks-idle mode, which is an extended quiescent state
63 * if we have fully entered that mode (i.e., if the new value of
64 * dynticks_nesting is zero).
65 */
66void rcu_enter_nohz(void)
67{
68 if (--rcu_dynticks_nesting == 0)
69 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
70}
71
72/*
73 * Exit dynticks-idle mode, so that we are no longer in an extended
74 * quiescent state.
75 */
76void rcu_exit_nohz(void)
77{
78 rcu_dynticks_nesting++;
79}
80
81#endif /* #ifdef CONFIG_NO_HZ */
82
83/*
84 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
85 * Also disable irqs to avoid confusion due to interrupt handlers
86 * invoking call_rcu().
87 */
88static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
89{
90 unsigned long flags;
91
92 local_irq_save(flags);
93 if (rcp->rcucblist != NULL &&
94 rcp->donetail != rcp->curtail) {
95 rcp->donetail = rcp->curtail;
96 local_irq_restore(flags);
97 return 1;
98 }
99 local_irq_restore(flags);
100
101 return 0;
102}
103
104/*
105 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
106 * are at it, given that any rcu quiescent state is also an rcu_bh
107 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
108 */
109void rcu_sched_qs(int cpu)
110{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ);
113}
114
115/*
116 * Record an rcu_bh quiescent state.
117 */
118void rcu_bh_qs(int cpu)
119{
120 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
121 raise_softirq(RCU_SOFTIRQ);
122}
123
124/*
125 * Check to see if the scheduling-clock interrupt came from an extended
126 * quiescent state, and, if so, tell RCU about it.
127 */
128void rcu_check_callbacks(int cpu, int user)
129{
130 if (user ||
131 (idle_cpu(cpu) &&
132 !in_softirq() &&
133 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
134 rcu_sched_qs(cpu);
135 else if (!in_softirq())
136 rcu_bh_qs(cpu);
137}
138
139/*
140 * Helper function for rcu_process_callbacks() that operates on the
141 * specified rcu_ctrlkblk structure.
142 */
143static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
144{
145 struct rcu_head *next, *list;
146 unsigned long flags;
147
148 /* If no RCU callbacks ready to invoke, just return. */
149 if (&rcp->rcucblist == rcp->donetail)
150 return;
151
152 /* Move the ready-to-invoke callbacks to a local list. */
153 local_irq_save(flags);
154 list = rcp->rcucblist;
155 rcp->rcucblist = *rcp->donetail;
156 *rcp->donetail = NULL;
157 if (rcp->curtail == rcp->donetail)
158 rcp->curtail = &rcp->rcucblist;
159 rcp->donetail = &rcp->rcucblist;
160 local_irq_restore(flags);
161
162 /* Invoke the callbacks on the local list. */
163 while (list) {
164 next = list->next;
165 prefetch(next);
166 list->func(list);
167 list = next;
168 }
169}
170
171/*
172 * Invoke any callbacks whose grace period has completed.
173 */
174static void rcu_process_callbacks(struct softirq_action *unused)
175{
176 __rcu_process_callbacks(&rcu_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178}
179
180/*
181 * Wait for a grace period to elapse. But it is illegal to invoke
182 * synchronize_sched() from within an RCU read-side critical section.
183 * Therefore, any legal call to synchronize_sched() is a quiescent
184 * state, and so on a UP system, synchronize_sched() need do nothing.
185 * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the
186 * benefits of doing might_sleep() to reduce latency.)
187 *
188 * Cool, huh? (Due to Josh Triplett.)
189 *
190 * But we want to make this a static inline later.
191 */
192void synchronize_sched(void)
193{
194 cond_resched();
195}
196EXPORT_SYMBOL_GPL(synchronize_sched);
197
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/*
205 * Helper function for call_rcu() and call_rcu_bh().
206 */
207static void __call_rcu(struct rcu_head *head,
208 void (*func)(struct rcu_head *rcu),
209 struct rcu_ctrlblk *rcp)
210{
211 unsigned long flags;
212
213 head->func = func;
214 head->next = NULL;
215
216 local_irq_save(flags);
217 *rcp->curtail = head;
218 rcp->curtail = &head->next;
219 local_irq_restore(flags);
220}
221
222/*
223 * Post an RCU callback to be invoked after the end of an RCU grace
224 * period. But since we have but one CPU, that would be after any
225 * quiescent state.
226 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{
229 __call_rcu(head, func, &rcu_ctrlblk);
230}
231EXPORT_SYMBOL_GPL(call_rcu);
232
233/*
234 * Post an RCU bottom-half callback to be invoked after any subsequent
235 * quiescent state.
236 */
237void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
238{
239 __call_rcu(head, func, &rcu_bh_ctrlblk);
240}
241EXPORT_SYMBOL_GPL(call_rcu_bh);
242
243void rcu_barrier(void)
244{
245 struct rcu_synchronize rcu;
246
247 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */
251 wait_for_completion(&rcu.completion);
252}
253EXPORT_SYMBOL_GPL(rcu_barrier);
254
255void rcu_barrier_bh(void)
256{
257 struct rcu_synchronize rcu;
258
259 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */
263 wait_for_completion(&rcu.completion);
264}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266
267void rcu_barrier_sched(void)
268{
269 struct rcu_synchronize rcu;
270
271 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */
275 wait_for_completion(&rcu.completion);
276}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278
279void __init rcu_init(void)
280{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282}
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 697c0a0229d4..a621a67ef4e3 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p)
327 cur_ops->deferred_free(rp); 327 cur_ops->deferred_free(rp);
328} 328}
329 329
330static int rcu_no_completed(void)
331{
332 return 0;
333}
334
330static void rcu_torture_deferred_free(struct rcu_torture *p) 335static void rcu_torture_deferred_free(struct rcu_torture *p)
331{ 336{
332 call_rcu(&p->rtort_rcu, rcu_torture_cb); 337 call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .name = "rcu_sync" 393 .name = "rcu_sync"
389}; 394};
390 395
396static struct rcu_torture_ops rcu_expedited_ops = {
397 .init = rcu_sync_torture_init,
398 .cleanup = NULL,
399 .readlock = rcu_torture_read_lock,
400 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
401 .readunlock = rcu_torture_read_unlock,
402 .completed = rcu_no_completed,
403 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL,
406 .stats = NULL,
407 .irq_capable = 1,
408 .name = "rcu_expedited"
409};
410
391/* 411/*
392 * Definitions for rcu_bh torture testing. 412 * Definitions for rcu_bh torture testing.
393 */ 413 */
@@ -547,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = {
547 .name = "srcu" 567 .name = "srcu"
548}; 568};
549 569
570static void srcu_torture_synchronize_expedited(void)
571{
572 synchronize_srcu_expedited(&srcu_ctl);
573}
574
575static struct rcu_torture_ops srcu_expedited_ops = {
576 .init = srcu_torture_init,
577 .cleanup = srcu_torture_cleanup,
578 .readlock = srcu_torture_read_lock,
579 .read_delay = srcu_read_delay,
580 .readunlock = srcu_torture_read_unlock,
581 .completed = srcu_torture_completed,
582 .deferred_free = rcu_sync_torture_deferred_free,
583 .sync = srcu_torture_synchronize_expedited,
584 .cb_barrier = NULL,
585 .stats = srcu_torture_stats,
586 .name = "srcu_expedited"
587};
588
550/* 589/*
551 * Definitions for sched torture testing. 590 * Definitions for sched torture testing.
552 */ 591 */
@@ -562,11 +601,6 @@ static void sched_torture_read_unlock(int idx)
562 preempt_enable(); 601 preempt_enable();
563} 602}
564 603
565static int sched_torture_completed(void)
566{
567 return 0;
568}
569
570static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 604static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
571{ 605{
572 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 606 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -583,7 +617,7 @@ static struct rcu_torture_ops sched_ops = {
583 .readlock = sched_torture_read_lock, 617 .readlock = sched_torture_read_lock,
584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 618 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
585 .readunlock = sched_torture_read_unlock, 619 .readunlock = sched_torture_read_unlock,
586 .completed = sched_torture_completed, 620 .completed = rcu_no_completed,
587 .deferred_free = rcu_sched_torture_deferred_free, 621 .deferred_free = rcu_sched_torture_deferred_free,
588 .sync = sched_torture_synchronize, 622 .sync = sched_torture_synchronize,
589 .cb_barrier = rcu_barrier_sched, 623 .cb_barrier = rcu_barrier_sched,
@@ -592,13 +626,13 @@ static struct rcu_torture_ops sched_ops = {
592 .name = "sched" 626 .name = "sched"
593}; 627};
594 628
595static struct rcu_torture_ops sched_ops_sync = { 629static struct rcu_torture_ops sched_sync_ops = {
596 .init = rcu_sync_torture_init, 630 .init = rcu_sync_torture_init,
597 .cleanup = NULL, 631 .cleanup = NULL,
598 .readlock = sched_torture_read_lock, 632 .readlock = sched_torture_read_lock,
599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 633 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
600 .readunlock = sched_torture_read_unlock, 634 .readunlock = sched_torture_read_unlock,
601 .completed = sched_torture_completed, 635 .completed = rcu_no_completed,
602 .deferred_free = rcu_sync_torture_deferred_free, 636 .deferred_free = rcu_sync_torture_deferred_free,
603 .sync = sched_torture_synchronize, 637 .sync = sched_torture_synchronize,
604 .cb_barrier = NULL, 638 .cb_barrier = NULL,
@@ -612,7 +646,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
612 .readlock = sched_torture_read_lock, 646 .readlock = sched_torture_read_lock,
613 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 647 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
614 .readunlock = sched_torture_read_unlock, 648 .readunlock = sched_torture_read_unlock,
615 .completed = sched_torture_completed, 649 .completed = rcu_no_completed,
616 .deferred_free = rcu_sync_torture_deferred_free, 650 .deferred_free = rcu_sync_torture_deferred_free,
617 .sync = synchronize_sched_expedited, 651 .sync = synchronize_sched_expedited,
618 .cb_barrier = NULL, 652 .cb_barrier = NULL,
@@ -1097,9 +1131,10 @@ rcu_torture_init(void)
1097 int cpu; 1131 int cpu;
1098 int firsterr = 0; 1132 int firsterr = 0;
1099 static struct rcu_torture_ops *torture_ops[] = 1133 static struct rcu_torture_ops *torture_ops[] =
1100 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1134 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1101 &sched_expedited_ops, 1135 &rcu_bh_ops, &rcu_bh_sync_ops,
1102 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1136 &srcu_ops, &srcu_expedited_ops,
1137 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1103 1138
1104 mutex_lock(&fullstop_mutex); 1139 mutex_lock(&fullstop_mutex);
1105 1140
@@ -1110,8 +1145,12 @@ rcu_torture_init(void)
1110 break; 1145 break;
1111 } 1146 }
1112 if (i == ARRAY_SIZE(torture_ops)) { 1147 if (i == ARRAY_SIZE(torture_ops)) {
1113 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1148 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1114 torture_type); 1149 torture_type);
1150 printk(KERN_ALERT "rcu-torture types:");
1151 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1152 printk(KERN_ALERT " %s", torture_ops[i]->name);
1153 printk(KERN_ALERT "\n");
1115 mutex_unlock(&fullstop_mutex); 1154 mutex_unlock(&fullstop_mutex);
1116 return -EINVAL; 1155 return -EINVAL;
1117 } 1156 }
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f3077c0ab181..53ae9598f798 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,18 +46,22 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
52/* Data structures. */ 53/* Data structures. */
53 54
55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
56
54#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(name) { \
55 .level = { &name.node[0] }, \ 58 .level = { &name.node[0] }, \
56 .levelcnt = { \ 59 .levelcnt = { \
57 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
58 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
59 NUM_RCU_LVL_2, \ 62 NUM_RCU_LVL_2, \
60 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ 63 NUM_RCU_LVL_3, \
64 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
61 }, \ 65 }, \
62 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
63 .gpnum = -300, \ 67 .gpnum = -300, \
@@ -77,6 +81,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
78DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79 83
84static int rcu_scheduler_active __read_mostly;
85
80 86
81/* 87/*
82 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
@@ -98,7 +104,7 @@ void rcu_sched_qs(int cpu)
98 struct rcu_data *rdp; 104 struct rcu_data *rdp;
99 105
100 rdp = &per_cpu(rcu_sched_data, cpu); 106 rdp = &per_cpu(rcu_sched_data, cpu);
101 rdp->passed_quiesc_completed = rdp->completed; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
102 barrier(); 108 barrier();
103 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
104 rcu_preempt_note_context_switch(cpu); 110 rcu_preempt_note_context_switch(cpu);
@@ -109,7 +115,7 @@ void rcu_bh_qs(int cpu)
109 struct rcu_data *rdp; 115 struct rcu_data *rdp;
110 116
111 rdp = &per_cpu(rcu_bh_data, cpu); 117 rdp = &per_cpu(rcu_bh_data, cpu);
112 rdp->passed_quiesc_completed = rdp->completed; 118 rdp->passed_quiesc_completed = rdp->gpnum - 1;
113 barrier(); 119 barrier();
114 rdp->passed_quiesc = 1; 120 rdp->passed_quiesc = 1;
115} 121}
@@ -335,28 +341,9 @@ void rcu_irq_exit(void)
335 set_need_resched(); 341 set_need_resched();
336} 342}
337 343
338/*
339 * Record the specified "completed" value, which is later used to validate
340 * dynticks counter manipulations. Specify "rsp->completed - 1" to
341 * unconditionally invalidate any future dynticks manipulations (which is
342 * useful at the beginning of a grace period).
343 */
344static void dyntick_record_completed(struct rcu_state *rsp, long comp)
345{
346 rsp->dynticks_completed = comp;
347}
348
349#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
350 345
351/* 346/*
352 * Recall the previously recorded value of the completion for dynticks.
353 */
354static long dyntick_recall_completed(struct rcu_state *rsp)
355{
356 return rsp->dynticks_completed;
357}
358
359/*
360 * Snapshot the specified CPU's dynticks counter so that we can later 347 * Snapshot the specified CPU's dynticks counter so that we can later
361 * credit them with an implicit quiescent state. Return 1 if this CPU 348 * credit them with an implicit quiescent state. Return 1 if this CPU
362 * is in dynticks idle mode, which is an extended quiescent state. 349 * is in dynticks idle mode, which is an extended quiescent state.
@@ -419,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
419 406
420#else /* #ifdef CONFIG_NO_HZ */ 407#else /* #ifdef CONFIG_NO_HZ */
421 408
422static void dyntick_record_completed(struct rcu_state *rsp, long comp)
423{
424}
425
426#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
427 410
428/*
429 * If there are no dynticks, then the only way that a CPU can passively
430 * be in a quiescent state is to be offline. Unlike dynticks idle, which
431 * is a point in time during the prior (already finished) grace period,
432 * an offline CPU is always in a quiescent state, and thus can be
433 * unconditionally applied. So just return the current value of completed.
434 */
435static long dyntick_recall_completed(struct rcu_state *rsp)
436{
437 return rsp->completed;
438}
439
440static int dyntick_save_progress_counter(struct rcu_data *rdp) 411static int dyntick_save_progress_counter(struct rcu_data *rdp)
441{ 412{
442 return 0; 413 return 0;
@@ -553,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
553/* 524/*
554 * Update CPU-local rcu_data state to record the newly noticed grace period. 525 * Update CPU-local rcu_data state to record the newly noticed grace period.
555 * This is used both when we started the grace period and when we notice 526 * This is used both when we started the grace period and when we notice
556 * that someone else started the grace period. 527 * that someone else started the grace period. The caller must hold the
528 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
529 * and must have irqs disabled.
557 */ 530 */
531static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
532{
533 if (rdp->gpnum != rnp->gpnum) {
534 rdp->qs_pending = 1;
535 rdp->passed_quiesc = 0;
536 rdp->gpnum = rnp->gpnum;
537 }
538}
539
558static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) 540static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
559{ 541{
560 rdp->qs_pending = 1; 542 unsigned long flags;
561 rdp->passed_quiesc = 0; 543 struct rcu_node *rnp;
562 rdp->gpnum = rsp->gpnum; 544
545 local_irq_save(flags);
546 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
549 local_irq_restore(flags);
550 return;
551 }
552 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags);
563} 554}
564 555
565/* 556/*
@@ -583,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
583} 574}
584 575
585/* 576/*
577 * Advance this CPU's callbacks, but only if the current grace period
578 * has ended. This may be called only from the CPU to whom the rdp
579 * belongs. In addition, the corresponding leaf rcu_node structure's
580 * ->lock must be held by the caller, with irqs disabled.
581 */
582static void
583__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
584{
585 /* Did another grace period end? */
586 if (rdp->completed != rnp->completed) {
587
588 /* Advance callbacks. No harm if list empty. */
589 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
590 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
591 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
592
593 /* Remember that we saw this grace-period completion. */
594 rdp->completed = rnp->completed;
595 }
596}
597
598/*
599 * Advance this CPU's callbacks, but only if the current grace period
600 * has ended. This may be called only from the CPU to whom the rdp
601 * belongs.
602 */
603static void
604rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
605{
606 unsigned long flags;
607 struct rcu_node *rnp;
608
609 local_irq_save(flags);
610 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
613 local_irq_restore(flags);
614 return;
615 }
616 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags);
618}
619
620/*
621 * Do per-CPU grace-period initialization for running CPU. The caller
622 * must hold the lock of the leaf rcu_node structure corresponding to
623 * this CPU.
624 */
625static void
626rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
627{
628 /* Prior grace period ended, so advance callbacks for current CPU. */
629 __rcu_process_gp_end(rsp, rnp, rdp);
630
631 /*
632 * Because this CPU just now started the new grace period, we know
633 * that all of its callbacks will be covered by this upcoming grace
634 * period, even the ones that were registered arbitrarily recently.
635 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
636 *
637 * Other CPUs cannot be sure exactly when the grace period started.
638 * Therefore, their recently registered callbacks must pass through
639 * an additional RCU_NEXT_READY stage, so that they will be handled
640 * by the next RCU grace period.
641 */
642 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
643 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
644
645 /* Set state so that this CPU will detect the next quiescent state. */
646 __note_new_gpnum(rsp, rnp, rdp);
647}
648
649/*
586 * Start a new RCU grace period if warranted, re-initializing the hierarchy 650 * Start a new RCU grace period if warranted, re-initializing the hierarchy
587 * in preparation for detecting the next grace period. The caller must hold 651 * in preparation for detecting the next grace period. The caller must hold
588 * the root node's ->lock, which is released before return. Hard irqs must 652 * the root node's ->lock, which is released before return. Hard irqs must
@@ -596,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
596 struct rcu_node *rnp = rcu_get_root(rsp); 660 struct rcu_node *rnp = rcu_get_root(rsp);
597 661
598 if (!cpu_needs_another_gp(rsp, rdp)) { 662 if (!cpu_needs_another_gp(rsp, rdp)) {
599 spin_unlock_irqrestore(&rnp->lock, flags); 663 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags);
665 return;
666 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */
668
669 /*
670 * Propagate new ->completed value to rcu_node structures
671 * so that other CPUs don't have to wait until the start
672 * of the next grace period to process their callbacks.
673 */
674 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 }
679 local_irq_restore(flags);
600 return; 680 return;
601 } 681 }
602 682
@@ -606,29 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
606 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 686 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
607 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 687 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
608 record_gp_stall_check_time(rsp); 688 record_gp_stall_check_time(rsp);
609 dyntick_record_completed(rsp, rsp->completed - 1);
610 note_new_gpnum(rsp, rdp);
611
612 /*
613 * Because this CPU just now started the new grace period, we know
614 * that all of its callbacks will be covered by this upcoming grace
615 * period, even the ones that were registered arbitrarily recently.
616 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
617 *
618 * Other CPUs cannot be sure exactly when the grace period started.
619 * Therefore, their recently registered callbacks must pass through
620 * an additional RCU_NEXT_READY stage, so that they will be handled
621 * by the next RCU grace period.
622 */
623 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
624 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
625 689
626 /* Special-case the common single-level case. */ 690 /* Special-case the common single-level case. */
627 if (NUM_RCU_NODES == 1) { 691 if (NUM_RCU_NODES == 1) {
628 rcu_preempt_check_blocked_tasks(rnp); 692 rcu_preempt_check_blocked_tasks(rnp);
629 rnp->qsmask = rnp->qsmaskinit; 693 rnp->qsmask = rnp->qsmaskinit;
630 rnp->gpnum = rsp->gpnum; 694 rnp->gpnum = rsp->gpnum;
695 rnp->completed = rsp->completed;
631 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp);
632 spin_unlock_irqrestore(&rnp->lock, flags); 698 spin_unlock_irqrestore(&rnp->lock, flags);
633 return; 699 return;
634 } 700 }
@@ -661,6 +727,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
661 rcu_preempt_check_blocked_tasks(rnp); 727 rcu_preempt_check_blocked_tasks(rnp);
662 rnp->qsmask = rnp->qsmaskinit; 728 rnp->qsmask = rnp->qsmaskinit;
663 rnp->gpnum = rsp->gpnum; 729 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp);
664 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 733 spin_unlock(&rnp->lock); /* irqs remain disabled. */
665 } 734 }
666 735
@@ -672,58 +741,32 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672} 741}
673 742
674/* 743/*
675 * Advance this CPU's callbacks, but only if the current grace period 744 * Report a full set of quiescent states to the specified rcu_state
676 * has ended. This may be called only from the CPU to whom the rdp 745 * data structure. This involves cleaning up after the prior grace
677 * belongs. 746 * period and letting rcu_start_gp() start up the next grace period
747 * if one is needed. Note that the caller must hold rnp->lock, as
748 * required by rcu_start_gp(), which will release it.
678 */ 749 */
679static void 750static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
680rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
681{
682 long completed_snap;
683 unsigned long flags;
684
685 local_irq_save(flags);
686 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
687
688 /* Did another grace period end? */
689 if (rdp->completed != completed_snap) {
690
691 /* Advance callbacks. No harm if list empty. */
692 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
693 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
694 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
695
696 /* Remember that we saw this grace-period completion. */
697 rdp->completed = completed_snap;
698 }
699 local_irq_restore(flags);
700}
701
702/*
703 * Clean up after the prior grace period and let rcu_start_gp() start up
704 * the next grace period if one is needed. Note that the caller must
705 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
706 */
707static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
708 __releases(rcu_get_root(rsp)->lock) 751 __releases(rcu_get_root(rsp)->lock)
709{ 752{
710 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 753 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
711 rsp->completed = rsp->gpnum; 754 rsp->completed = rsp->gpnum;
712 rsp->signaled = RCU_GP_IDLE; 755 rsp->signaled = RCU_GP_IDLE;
713 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
714 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 756 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
715} 757}
716 758
717/* 759/*
718 * Similar to cpu_quiet(), for which it is a helper function. Allows 760 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
719 * a group of CPUs to be quieted at one go, though all the CPUs in the 761 * Allows quiescent states for a group of CPUs to be reported at one go
720 * group must be represented by the same leaf rcu_node structure. 762 * to the specified rcu_node structure, though all the CPUs in the group
721 * That structure's lock must be held upon entry, and it is released 763 * must be represented by the same rcu_node structure (which need not be
722 * before return. 764 * a leaf rcu_node structure, though it often will be). That structure's
765 * lock must be held upon entry, and it is released before return.
723 */ 766 */
724static void 767static void
725cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, 768rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
726 unsigned long flags) 769 struct rcu_node *rnp, unsigned long flags)
727 __releases(rnp->lock) 770 __releases(rnp->lock)
728{ 771{
729 struct rcu_node *rnp_c; 772 struct rcu_node *rnp_c;
@@ -759,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
759 802
760 /* 803 /*
761 * Get here if we are the last CPU to pass through a quiescent 804 * Get here if we are the last CPU to pass through a quiescent
762 * state for this grace period. Invoke cpu_quiet_msk_finish() 805 * state for this grace period. Invoke rcu_report_qs_rsp()
763 * to clean up and start the next grace period if one is needed. 806 * to clean up and start the next grace period if one is needed.
764 */ 807 */
765 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ 808 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
766} 809}
767 810
768/* 811/*
769 * Record a quiescent state for the specified CPU, which must either be 812 * Record a quiescent state for the specified CPU to that CPU's rcu_data
770 * the current CPU. The lastcomp argument is used to make sure we are 813 * structure. This must be either called from the specified CPU, or
771 * still in the grace period of interest. We don't want to end the current 814 * called when the specified CPU is known to be offline (and when it is
772 * grace period based on quiescent states detected in an earlier grace 815 * also known that no other CPU is concurrently trying to help the offline
773 * period! 816 * CPU). The lastcomp argument is used to make sure we are still in the
817 * grace period of interest. We don't want to end the current grace period
818 * based on quiescent states detected in an earlier grace period!
774 */ 819 */
775static void 820static void
776cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 821rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
777{ 822{
778 unsigned long flags; 823 unsigned long flags;
779 unsigned long mask; 824 unsigned long mask;
@@ -781,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
781 826
782 rnp = rdp->mynode; 827 rnp = rdp->mynode;
783 spin_lock_irqsave(&rnp->lock, flags); 828 spin_lock_irqsave(&rnp->lock, flags);
784 if (lastcomp != ACCESS_ONCE(rsp->completed)) { 829 if (lastcomp != rnp->completed) {
785 830
786 /* 831 /*
787 * Someone beat us to it for this grace period, so leave. 832 * Someone beat us to it for this grace period, so leave.
788 * The race with GP start is resolved by the fact that we 833 * The race with GP start is resolved by the fact that we
789 * hold the leaf rcu_node lock, so that the per-CPU bits 834 * hold the leaf rcu_node lock, so that the per-CPU bits
790 * cannot yet be initialized -- so we would simply find our 835 * cannot yet be initialized -- so we would simply find our
791 * CPU's bit already cleared in cpu_quiet_msk() if this race 836 * CPU's bit already cleared in rcu_report_qs_rnp() if this
792 * occurred. 837 * race occurred.
793 */ 838 */
794 rdp->passed_quiesc = 0; /* try again later! */ 839 rdp->passed_quiesc = 0; /* try again later! */
795 spin_unlock_irqrestore(&rnp->lock, flags); 840 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -807,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
807 */ 852 */
808 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 853 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
809 854
810 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 855 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
811 } 856 }
812} 857}
813 858
@@ -838,8 +883,11 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
838 if (!rdp->passed_quiesc) 883 if (!rdp->passed_quiesc)
839 return; 884 return;
840 885
841 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ 886 /*
842 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 887 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
888 * judge of that).
889 */
890 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
843} 891}
844 892
845#ifdef CONFIG_HOTPLUG_CPU 893#ifdef CONFIG_HOTPLUG_CPU
@@ -899,8 +947,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
899static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 947static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
900{ 948{
901 unsigned long flags; 949 unsigned long flags;
902 long lastcomp;
903 unsigned long mask; 950 unsigned long mask;
951 int need_report = 0;
904 struct rcu_data *rdp = rsp->rda[cpu]; 952 struct rcu_data *rdp = rsp->rda[cpu];
905 struct rcu_node *rnp; 953 struct rcu_node *rnp;
906 954
@@ -914,30 +962,32 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
914 spin_lock(&rnp->lock); /* irqs already disabled. */ 962 spin_lock(&rnp->lock); /* irqs already disabled. */
915 rnp->qsmaskinit &= ~mask; 963 rnp->qsmaskinit &= ~mask;
916 if (rnp->qsmaskinit != 0) { 964 if (rnp->qsmaskinit != 0) {
917 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 965 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */
918 break; 967 break;
919 } 968 }
920 969 if (rnp == rdp->mynode)
921 /* 970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
922 * If there was a task blocking the current grace period, 971 else
923 * and if all CPUs have checked in, we need to propagate 972 spin_unlock(&rnp->lock); /* irqs remain disabled. */
924 * the quiescent state up the rcu_node hierarchy. But that
925 * is inconvenient at the moment due to deadlock issues if
926 * this should end the current grace period. So set the
927 * offlined CPU's bit in ->qsmask in order to force the
928 * next force_quiescent_state() invocation to clean up this
929 * mess in a deadlock-free manner.
930 */
931 if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
932 rnp->qsmask |= mask;
933
934 mask = rnp->grpmask; 973 mask = rnp->grpmask;
935 spin_unlock(&rnp->lock); /* irqs remain disabled. */
936 rnp = rnp->parent; 974 rnp = rnp->parent;
937 } while (rnp != NULL); 975 } while (rnp != NULL);
938 lastcomp = rsp->completed;
939 976
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 977 /*
978 * We still hold the leaf rcu_node structure lock here, and
979 * irqs are still disabled. The reason for this subterfuge is
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock.
982 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags);
987 else
988 spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp);
941 991
942 rcu_adopt_orphan_cbs(rsp); 992 rcu_adopt_orphan_cbs(rsp);
943} 993}
@@ -1109,7 +1159,7 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1109 rcu_for_each_leaf_node(rsp, rnp) { 1159 rcu_for_each_leaf_node(rsp, rnp) {
1110 mask = 0; 1160 mask = 0;
1111 spin_lock_irqsave(&rnp->lock, flags); 1161 spin_lock_irqsave(&rnp->lock, flags);
1112 if (rsp->completed != lastcomp) { 1162 if (rnp->completed != lastcomp) {
1113 spin_unlock_irqrestore(&rnp->lock, flags); 1163 spin_unlock_irqrestore(&rnp->lock, flags);
1114 return 1; 1164 return 1;
1115 } 1165 }
@@ -1123,10 +1173,10 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1123 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1124 mask |= bit; 1174 mask |= bit;
1125 } 1175 }
1126 if (mask != 0 && rsp->completed == lastcomp) { 1176 if (mask != 0 && rnp->completed == lastcomp) {
1127 1177
1128 /* cpu_quiet_msk() releases rnp->lock. */ 1178 /* rcu_report_qs_rnp() releases rnp->lock. */
1129 cpu_quiet_msk(mask, rsp, rnp, flags); 1179 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1130 continue; 1180 continue;
1131 } 1181 }
1132 spin_unlock_irqrestore(&rnp->lock, flags); 1182 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1144,6 +1194,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1144 long lastcomp; 1194 long lastcomp;
1145 struct rcu_node *rnp = rcu_get_root(rsp); 1195 struct rcu_node *rnp = rcu_get_root(rsp);
1146 u8 signaled; 1196 u8 signaled;
1197 u8 forcenow;
1147 1198
1148 if (!rcu_gp_in_progress(rsp)) 1199 if (!rcu_gp_in_progress(rsp))
1149 return; /* No grace period in progress, nothing to force. */ 1200 return; /* No grace period in progress, nothing to force. */
@@ -1156,10 +1207,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1156 goto unlock_ret; /* no emergency and done recently. */ 1207 goto unlock_ret; /* no emergency and done recently. */
1157 rsp->n_force_qs++; 1208 rsp->n_force_qs++;
1158 spin_lock(&rnp->lock); 1209 spin_lock(&rnp->lock);
1159 lastcomp = rsp->completed; 1210 lastcomp = rsp->gpnum - 1;
1160 signaled = rsp->signaled; 1211 signaled = rsp->signaled;
1161 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1162 if (lastcomp == rsp->gpnum) { 1213 if(!rcu_gp_in_progress(rsp)) {
1163 rsp->n_force_qs_ngp++; 1214 rsp->n_force_qs_ngp++;
1164 spin_unlock(&rnp->lock); 1215 spin_unlock(&rnp->lock);
1165 goto unlock_ret; /* no GP in progress, time updated. */ 1216 goto unlock_ret; /* no GP in progress, time updated. */
@@ -1180,21 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1180 if (rcu_process_dyntick(rsp, lastcomp, 1231 if (rcu_process_dyntick(rsp, lastcomp,
1181 dyntick_save_progress_counter)) 1232 dyntick_save_progress_counter))
1182 goto unlock_ret; 1233 goto unlock_ret;
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1183 1237
1184 /* Update state, record completion counter. */ 1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1185 spin_lock(&rnp->lock); 1240 spin_lock(&rnp->lock);
1186 if (lastcomp == rsp->completed && 1241 if (lastcomp + 1 == rsp->gpnum &&
1187 rsp->signaled == RCU_SAVE_DYNTICK) { 1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1188 rsp->signaled = RCU_FORCE_QS; 1244 rsp->signaled = RCU_FORCE_QS;
1189 dyntick_record_completed(rsp, lastcomp); 1245 rsp->completed_fqs = lastcomp;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1190 } 1247 }
1191 spin_unlock(&rnp->lock); 1248 spin_unlock(&rnp->lock);
1192 break; 1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1193 1252
1194 case RCU_FORCE_QS: 1253 case RCU_FORCE_QS:
1195 1254
1196 /* Check dyntick-idle state, send IPI to laggarts. */ 1255 /* Check dyntick-idle state, send IPI to laggarts. */
1197 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), 1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs,
1198 rcu_implicit_dynticks_qs)) 1257 rcu_implicit_dynticks_qs))
1199 goto unlock_ret; 1258 goto unlock_ret;
1200 1259
@@ -1351,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1351} 1410}
1352EXPORT_SYMBOL_GPL(call_rcu_bh); 1411EXPORT_SYMBOL_GPL(call_rcu_bh);
1353 1412
1413/**
1414 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1415 *
1416 * Control will return to the caller some time after a full rcu-sched
1417 * grace period has elapsed, in other words after all currently executing
1418 * rcu-sched read-side critical sections have completed. These read-side
1419 * critical sections are delimited by rcu_read_lock_sched() and
1420 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
1421 * local_irq_disable(), and so on may be used in place of
1422 * rcu_read_lock_sched().
1423 *
1424 * This means that all preempt_disable code sequences, including NMI and
1425 * hardware-interrupt handlers, in progress on entry will have completed
1426 * before this primitive returns. However, this does not guarantee that
1427 * softirq handlers will have completed, since in some kernels, these
1428 * handlers can run in process context, and can block.
1429 *
1430 * This primitive provides the guarantees made by the (now removed)
1431 * synchronize_kernel() API. In contrast, synchronize_rcu() only
1432 * guarantees that rcu_read_lock() sections will have completed.
1433 * In "classic RCU", these two guarantees happen to be one and
1434 * the same, but can differ in realtime RCU implementations.
1435 */
1436void synchronize_sched(void)
1437{
1438 struct rcu_synchronize rcu;
1439
1440 if (rcu_blocking_is_gp())
1441 return;
1442
1443 init_completion(&rcu.completion);
1444 /* Will wake me after RCU finished. */
1445 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1446 /* Wait for it. */
1447 wait_for_completion(&rcu.completion);
1448}
1449EXPORT_SYMBOL_GPL(synchronize_sched);
1450
1451/**
1452 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
1453 *
1454 * Control will return to the caller some time after a full rcu_bh grace
1455 * period has elapsed, in other words after all currently executing rcu_bh
1456 * read-side critical sections have completed. RCU read-side critical
1457 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
1458 * and may be nested.
1459 */
1460void synchronize_rcu_bh(void)
1461{
1462 struct rcu_synchronize rcu;
1463
1464 if (rcu_blocking_is_gp())
1465 return;
1466
1467 init_completion(&rcu.completion);
1468 /* Will wake me after RCU finished. */
1469 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1470 /* Wait for it. */
1471 wait_for_completion(&rcu.completion);
1472}
1473EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1474
1354/* 1475/*
1355 * Check to see if there is any immediate RCU-related work to be done 1476 * Check to see if there is any immediate RCU-related work to be done
1356 * by the current CPU, for the specified type of RCU, returning 1 if so. 1477 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1360,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1360 */ 1481 */
1361static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 1482static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1362{ 1483{
1484 struct rcu_node *rnp = rdp->mynode;
1485
1363 rdp->n_rcu_pending++; 1486 rdp->n_rcu_pending++;
1364 1487
1365 /* Check for CPU stalls, if enabled. */ 1488 /* Check for CPU stalls, if enabled. */
@@ -1384,13 +1507,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1384 } 1507 }
1385 1508
1386 /* Has another RCU grace period completed? */ 1509 /* Has another RCU grace period completed? */
1387 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ 1510 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
1388 rdp->n_rp_gp_completed++; 1511 rdp->n_rp_gp_completed++;
1389 return 1; 1512 return 1;
1390 } 1513 }
1391 1514
1392 /* Has a new RCU grace period started? */ 1515 /* Has a new RCU grace period started? */
1393 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ 1516 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
1394 rdp->n_rp_gp_started++; 1517 rdp->n_rp_gp_started++;
1395 return 1; 1518 return 1;
1396 } 1519 }
@@ -1433,6 +1556,21 @@ int rcu_needs_cpu(int cpu)
1433 rcu_preempt_needs_cpu(cpu); 1556 rcu_preempt_needs_cpu(cpu);
1434} 1557}
1435 1558
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1436static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1437static atomic_t rcu_barrier_cpu_count; 1575static atomic_t rcu_barrier_cpu_count;
1438static DEFINE_MUTEX(rcu_barrier_mutex); 1576static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1544,21 +1682,16 @@ static void __cpuinit
1544rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1682rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1545{ 1683{
1546 unsigned long flags; 1684 unsigned long flags;
1547 long lastcomp;
1548 unsigned long mask; 1685 unsigned long mask;
1549 struct rcu_data *rdp = rsp->rda[cpu]; 1686 struct rcu_data *rdp = rsp->rda[cpu];
1550 struct rcu_node *rnp = rcu_get_root(rsp); 1687 struct rcu_node *rnp = rcu_get_root(rsp);
1551 1688
1552 /* Set up local state, ensuring consistent view of global state. */ 1689 /* Set up local state, ensuring consistent view of global state. */
1553 spin_lock_irqsave(&rnp->lock, flags); 1690 spin_lock_irqsave(&rnp->lock, flags);
1554 lastcomp = rsp->completed;
1555 rdp->completed = lastcomp;
1556 rdp->gpnum = lastcomp;
1557 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1558 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1559 rdp->beenonline = 1; /* We have now been online. */ 1693 rdp->beenonline = 1; /* We have now been online. */
1560 rdp->preemptable = preemptable; 1694 rdp->preemptable = preemptable;
1561 rdp->passed_quiesc_completed = lastcomp - 1;
1562 rdp->qlen_last_fqs_check = 0; 1695 rdp->qlen_last_fqs_check = 0;
1563 rdp->n_force_qs_snap = rsp->n_force_qs; 1696 rdp->n_force_qs_snap = rsp->n_force_qs;
1564 rdp->blimit = blimit; 1697 rdp->blimit = blimit;
@@ -1580,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1580 spin_lock(&rnp->lock); /* irqs already disabled. */ 1713 spin_lock(&rnp->lock); /* irqs already disabled. */
1581 rnp->qsmaskinit |= mask; 1714 rnp->qsmaskinit |= mask;
1582 mask = rnp->grpmask; 1715 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) {
1717 rdp->gpnum = rnp->completed; /* if GP in progress... */
1718 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 }
1583 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1721 spin_unlock(&rnp->lock); /* irqs already disabled. */
1584 rnp = rnp->parent; 1722 rnp = rnp->parent;
1585 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
@@ -1597,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
1597/* 1735/*
1598 * Handle CPU online/offline notification events. 1736 * Handle CPU online/offline notification events.
1599 */ 1737 */
1600int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1738static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1601 unsigned long action, void *hcpu) 1739 unsigned long action, void *hcpu)
1602{ 1740{
1603 long cpu = (long)hcpu; 1741 long cpu = (long)hcpu;
1604 1742
@@ -1685,8 +1823,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1685 cpustride *= rsp->levelspread[i]; 1823 cpustride *= rsp->levelspread[i];
1686 rnp = rsp->level[i]; 1824 rnp = rsp->level[i];
1687 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1688 if (rnp != rcu_get_root(rsp)) 1826 spin_lock_init(&rnp->lock);
1689 spin_lock_init(&rnp->lock); 1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1690 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1691 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1692 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1707,9 +1845,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1707 rnp->level = i; 1845 rnp->level = i;
1708 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 1846 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1709 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1847 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1848 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1710 } 1850 }
1711 } 1851 }
1712 spin_lock_init(&rcu_get_root(rsp)->lock);
1713} 1852}
1714 1853
1715/* 1854/*
@@ -1735,16 +1874,30 @@ do { \
1735 } \ 1874 } \
1736} while (0) 1875} while (0)
1737 1876
1738void __init __rcu_init(void) 1877void __init rcu_init(void)
1739{ 1878{
1879 int i;
1880
1740 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1741#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1742 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1743#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1744 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1745 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1746 __rcu_init_preempt(); 1890 __rcu_init_preempt();
1747 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1891 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1892
1893 /*
1894 * We don't need protection against CPU-hotplug here because
1895 * this is called early in boot, before either interrupts
1896 * or the scheduler are operational.
1897 */
1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);
1748} 1901}
1749 1902
1750#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1899023b0962..d2a0046f63b2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere. 35 * bug somewhere.
36 */ 36 */
37#define MAX_RCU_LVLS 3 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
41 42
42#if NR_CPUS <= RCU_FANOUT 43#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1 44# define NUM_RCU_LVLS 1
@@ -45,23 +46,33 @@
45# define NUM_RCU_LVL_1 (NR_CPUS) 46# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0 47# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0 48# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0
48#elif NR_CPUS <= RCU_FANOUT_SQ 50#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 51# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 52# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 54# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 55# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 57#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 58# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 59# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 62# define NUM_RCU_LVL_3 NR_CPUS
63# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH
65# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70# define NUM_RCU_LVL_4 NR_CPUS
60#else 71#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 73#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63 74
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) 75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66 77
67/* 78/*
@@ -84,14 +95,21 @@ struct rcu_node {
84 long gpnum; /* Current grace period for this node. */ 95 long gpnum; /* Current grace period for this node. */
85 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
86 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */
99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */
87 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
88 /* order for current grace period to proceed.*/ 102 /* order for current grace period to proceed.*/
89 /* In leaf rcu_node, each bit corresponds to */ 103 /* In leaf rcu_node, each bit corresponds to */
90 /* an rcu_data structure, otherwise, each */ 104 /* an rcu_data structure, otherwise, each */
91 /* bit corresponds to a child rcu_node */ 105 /* bit corresponds to a child rcu_node */
92 /* structure. */ 106 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */
108 /* elements that need to drain to allow the */
109 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */
93 unsigned long qsmaskinit; 111 unsigned long qsmaskinit;
94 /* Per-GP initialization for qsmask. */ 112 /* Per-GP initial value for qsmask & expmask. */
95 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 113 unsigned long grpmask; /* Mask to apply to parent qsmask. */
96 /* Only one bit will be set in this mask. */ 114 /* Only one bit will be set in this mask. */
97 int grplo; /* lowest-numbered CPU or group here. */ 115 int grplo; /* lowest-numbered CPU or group here. */
@@ -99,7 +117,7 @@ struct rcu_node {
99 u8 grpnum; /* CPU/group number for next level up. */ 117 u8 grpnum; /* CPU/group number for next level up. */
100 u8 level; /* root is at level 0. */ 118 u8 level; /* root is at level 0. */
101 struct rcu_node *parent; 119 struct rcu_node *parent;
102 struct list_head blocked_tasks[2]; 120 struct list_head blocked_tasks[4];
103 /* Tasks blocked in RCU read-side critsect. */ 121 /* Tasks blocked in RCU read-side critsect. */
104 /* Grace period number (->gpnum) x blocked */ 122 /* Grace period number (->gpnum) x blocked */
105 /* by tasks on the (x & 0x1) element of the */ 123 /* by tasks on the (x & 0x1) element of the */
@@ -114,6 +132,21 @@ struct rcu_node {
114 for ((rnp) = &(rsp)->node[0]; \ 132 for ((rnp) = &(rsp)->node[0]; \
115 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 133 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
116 134
135/*
136 * Do a breadth-first scan of the non-leaf rcu_node structures for the
137 * specified rcu_state structure. Note that if there is a singleton
138 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
139 */
140#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
141 for ((rnp) = &(rsp)->node[0]; \
142 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
143
144/*
145 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
146 * structure. Note that if there is a singleton rcu_node tree with but
147 * one rcu_node structure, this loop -will- visit the rcu_node structure.
148 * It is still a leaf node, even if it is also the root node.
149 */
117#define rcu_for_each_leaf_node(rsp, rnp) \ 150#define rcu_for_each_leaf_node(rsp, rnp) \
118 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 151 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
119 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 152 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
@@ -204,11 +237,12 @@ struct rcu_data {
204#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
205#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
206#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
207#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
208#ifdef CONFIG_NO_HZ 242#ifdef CONFIG_NO_HZ
209#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
210#else /* #ifdef CONFIG_NO_HZ */ 244#else /* #ifdef CONFIG_NO_HZ */
211#define RCU_SIGNAL_INIT RCU_FORCE_QS 245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED
212#endif /* #else #ifdef CONFIG_NO_HZ */ 246#endif /* #else #ifdef CONFIG_NO_HZ */
213 247
214#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -246,7 +280,7 @@ struct rcu_state {
246 long gpnum; /* Current gp number. */ 280 long gpnum; /* Current gp number. */
247 long completed; /* # of last completed gp. */ 281 long completed; /* # of last completed gp. */
248 282
249 /* End of fields guarded by root rcu_node's lock. */ 283 /* End of fields guarded by root rcu_node's lock. */
250 284
251 spinlock_t onofflock; /* exclude on/offline and */ 285 spinlock_t onofflock; /* exclude on/offline and */
252 /* starting new GP. Also */ 286 /* starting new GP. Also */
@@ -260,6 +294,8 @@ struct rcu_state {
260 long orphan_qlen; /* Number of orphaned cbs. */ 294 long orphan_qlen; /* Number of orphaned cbs. */
261 spinlock_t fqslock; /* Only one task forcing */ 295 spinlock_t fqslock; /* Only one task forcing */
262 /* quiescent states. */ 296 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
263 unsigned long jiffies_force_qs; /* Time at which to invoke */ 299 unsigned long jiffies_force_qs; /* Time at which to invoke */
264 /* force_quiescent_state(). */ 300 /* force_quiescent_state(). */
265 unsigned long n_force_qs; /* Number of calls to */ 301 unsigned long n_force_qs; /* Number of calls to */
@@ -274,11 +310,15 @@ struct rcu_state {
274 unsigned long jiffies_stall; /* Time at which to check */ 310 unsigned long jiffies_stall; /* Time at which to check */
275 /* for CPU stalls. */ 311 /* for CPU stalls. */
276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 312#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
277#ifdef CONFIG_NO_HZ
278 long dynticks_completed; /* Value of completed @ snap. */
279#endif /* #ifdef CONFIG_NO_HZ */
280}; 313};
281 314
315/* Return values for rcu_preempt_offline_tasks(). */
316
317#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
318 /* GP were moved to root. */
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */
321
282#ifdef RCU_TREE_NONCORE 322#ifdef RCU_TREE_NONCORE
283 323
284/* 324/*
@@ -298,10 +338,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
298#else /* #ifdef RCU_TREE_NONCORE */ 338#else /* #ifdef RCU_TREE_NONCORE */
299 339
300/* Forward declarations for rcutree_plugin.h */ 340/* Forward declarations for rcutree_plugin.h */
301static inline void rcu_bootup_announce(void); 341static void rcu_bootup_announce(void);
302long rcu_batches_completed(void); 342long rcu_batches_completed(void);
303static void rcu_preempt_note_context_switch(int cpu); 343static void rcu_preempt_note_context_switch(int cpu);
304static int rcu_preempted_readers(struct rcu_node *rnp); 344static int rcu_preempted_readers(struct rcu_node *rnp);
345#ifdef CONFIG_HOTPLUG_CPU
346static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */
305#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
306static void rcu_print_task_stall(struct rcu_node *rnp); 350static void rcu_print_task_stall(struct rcu_node *rnp);
307#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -315,6 +359,9 @@ static void rcu_preempt_offline_cpu(int cpu);
315static void rcu_preempt_check_callbacks(int cpu); 359static void rcu_preempt_check_callbacks(int cpu);
316static void rcu_preempt_process_callbacks(void); 360static void rcu_preempt_process_callbacks(void);
317void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 361void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
362#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
363static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
364#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
318static int rcu_preempt_pending(int cpu); 365static int rcu_preempt_pending(int cpu);
319static int rcu_preempt_needs_cpu(int cpu); 366static int rcu_preempt_needs_cpu(int cpu);
320static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 367static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ef2a58c2b9d5..37fbccdf41d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,16 +24,19 @@
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */ 25 */
26 26
27#include <linux/delay.h>
27 28
28#ifdef CONFIG_TREE_PREEMPT_RCU 29#ifdef CONFIG_TREE_PREEMPT_RCU
29 30
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 32DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32 33
34static int rcu_preempted_readers_exp(struct rcu_node *rnp);
35
33/* 36/*
34 * Tell them what RCU they are running. 37 * Tell them what RCU they are running.
35 */ 38 */
36static inline void rcu_bootup_announce(void) 39static void __init rcu_bootup_announce(void)
37{ 40{
38 printk(KERN_INFO 41 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n"); 42 "Experimental preemptable hierarchical RCU implementation.\n");
@@ -67,7 +70,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
67static void rcu_preempt_qs(int cpu) 70static void rcu_preempt_qs(int cpu)
68{ 71{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 72 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed; 73 rdp->passed_quiesc_completed = rdp->gpnum - 1;
71 barrier(); 74 barrier();
72 rdp->passed_quiesc = 1; 75 rdp->passed_quiesc = 1;
73} 76}
@@ -157,14 +160,58 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
157 */ 160 */
158static int rcu_preempted_readers(struct rcu_node *rnp) 161static int rcu_preempted_readers(struct rcu_node *rnp)
159{ 162{
160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 163 int phase = rnp->gpnum & 0x1;
164
165 return !list_empty(&rnp->blocked_tasks[phase]) ||
166 !list_empty(&rnp->blocked_tasks[phase + 2]);
167}
168
169/*
170 * Record a quiescent state for all tasks that were previously queued
171 * on the specified rcu_node structure and that were blocking the current
172 * RCU grace period. The caller must hold the specified rnp->lock with
173 * irqs disabled, and this lock is released upon return, but irqs remain
174 * disabled.
175 */
176static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
177 __releases(rnp->lock)
178{
179 unsigned long mask;
180 struct rcu_node *rnp_p;
181
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */
185 }
186
187 rnp_p = rnp->parent;
188 if (rnp_p == NULL) {
189 /*
190 * Either there is only one rcu_node in the tree,
191 * or tasks were kicked up to root rcu_node due to
192 * CPUs going offline.
193 */
194 rcu_report_qs_rsp(&rcu_preempt_state, flags);
195 return;
196 }
197
198 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
161} 203}
162 204
205/*
206 * Handle special cases during rcu_read_unlock(), such as needing to
207 * notify RCU core processing or task having blocked during the RCU
208 * read-side critical section.
209 */
163static void rcu_read_unlock_special(struct task_struct *t) 210static void rcu_read_unlock_special(struct task_struct *t)
164{ 211{
165 int empty; 212 int empty;
213 int empty_exp;
166 unsigned long flags; 214 unsigned long flags;
167 unsigned long mask;
168 struct rcu_node *rnp; 215 struct rcu_node *rnp;
169 int special; 216 int special;
170 217
@@ -207,36 +254,30 @@ static void rcu_read_unlock_special(struct task_struct *t)
207 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 254 spin_unlock(&rnp->lock); /* irqs remain disabled. */
208 } 255 }
209 empty = !rcu_preempted_readers(rnp); 256 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp);
258 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
210 list_del_init(&t->rcu_node_entry); 259 list_del_init(&t->rcu_node_entry);
211 t->rcu_blocked_node = NULL; 260 t->rcu_blocked_node = NULL;
212 261
213 /* 262 /*
214 * If this was the last task on the current list, and if 263 * If this was the last task on the current list, and if
215 * we aren't waiting on any CPUs, report the quiescent state. 264 * we aren't waiting on any CPUs, report the quiescent state.
216 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() 265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
217 * drop rnp->lock and restore irq.
218 */ 266 */
219 if (!empty && rnp->qsmask == 0 && 267 if (empty)
220 !rcu_preempted_readers(rnp)) {
221 struct rcu_node *rnp_p;
222
223 if (rnp->parent == NULL) {
224 /* Only one rcu_node in the tree. */
225 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
226 return;
227 }
228 /* Report up the rest of the hierarchy. */
229 mask = rnp->grpmask;
230 spin_unlock_irqrestore(&rnp->lock, flags); 268 spin_unlock_irqrestore(&rnp->lock, flags);
231 rnp_p = rnp->parent; 269 else
232 spin_lock_irqsave(&rnp_p->lock, flags); 270 rcu_report_unblock_qs_rnp(rnp, flags);
233 WARN_ON_ONCE(rnp->qsmask); 271
234 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); 272 /*
235 return; 273 * If this was the last task on the expedited lists,
236 } 274 * then we need to report up the rcu_node hierarchy.
237 spin_unlock(&rnp->lock); 275 */
276 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
277 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
278 } else {
279 local_irq_restore(flags);
238 } 280 }
239 local_irq_restore(flags);
240} 281}
241 282
242/* 283/*
@@ -303,6 +344,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
303 * rcu_node. The reason for not just moving them to the immediate 344 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to 345 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock. 346 * make more than two attempts to acquire the target rcu_node's lock.
347 * Returns true if there were tasks blocking the current RCU grace
348 * period.
306 * 349 *
307 * Returns 1 if there was previously a task blocking the current grace 350 * Returns 1 if there was previously a task blocking the current grace
308 * period on the specified rcu_node structure. 351 * period on the specified rcu_node structure.
@@ -316,7 +359,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
316 int i; 359 int i;
317 struct list_head *lp; 360 struct list_head *lp;
318 struct list_head *lp_root; 361 struct list_head *lp_root;
319 int retval = rcu_preempted_readers(rnp); 362 int retval = 0;
320 struct rcu_node *rnp_root = rcu_get_root(rsp); 363 struct rcu_node *rnp_root = rcu_get_root(rsp);
321 struct task_struct *tp; 364 struct task_struct *tp;
322 365
@@ -326,7 +369,9 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
326 } 369 }
327 WARN_ON_ONCE(rnp != rdp->mynode && 370 WARN_ON_ONCE(rnp != rdp->mynode &&
328 (!list_empty(&rnp->blocked_tasks[0]) || 371 (!list_empty(&rnp->blocked_tasks[0]) ||
329 !list_empty(&rnp->blocked_tasks[1]))); 372 !list_empty(&rnp->blocked_tasks[1]) ||
373 !list_empty(&rnp->blocked_tasks[2]) ||
374 !list_empty(&rnp->blocked_tasks[3])));
330 375
331 /* 376 /*
332 * Move tasks up to root rcu_node. Rely on the fact that the 377 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -334,7 +379,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
334 * rcu_nodes in terms of gp_num value. This fact allows us to 379 * rcu_nodes in terms of gp_num value. This fact allows us to
335 * move the blocked_tasks[] array directly, element by element. 380 * move the blocked_tasks[] array directly, element by element.
336 */ 381 */
337 for (i = 0; i < 2; i++) { 382 if (rcu_preempted_readers(rnp))
383 retval |= RCU_OFL_TASKS_NORM_GP;
384 if (rcu_preempted_readers_exp(rnp))
385 retval |= RCU_OFL_TASKS_EXP_GP;
386 for (i = 0; i < 4; i++) {
338 lp = &rnp->blocked_tasks[i]; 387 lp = &rnp->blocked_tasks[i];
339 lp_root = &rnp_root->blocked_tasks[i]; 388 lp_root = &rnp_root->blocked_tasks[i];
340 while (!list_empty(lp)) { 389 while (!list_empty(lp)) {
@@ -346,7 +395,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
346 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
347 } 396 }
348 } 397 }
349
350 return retval; 398 return retval;
351} 399}
352 400
@@ -398,14 +446,183 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
398} 446}
399EXPORT_SYMBOL_GPL(call_rcu); 447EXPORT_SYMBOL_GPL(call_rcu);
400 448
449/**
450 * synchronize_rcu - wait until a grace period has elapsed.
451 *
452 * Control will return to the caller some time after a full grace
453 * period has elapsed, in other words after all currently executing RCU
454 * read-side critical sections have completed. RCU read-side critical
455 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
456 * and may be nested.
457 */
458void synchronize_rcu(void)
459{
460 struct rcu_synchronize rcu;
461
462 if (!rcu_scheduler_active)
463 return;
464
465 init_completion(&rcu.completion);
466 /* Will wake me after RCU finished. */
467 call_rcu(&rcu.head, wakeme_after_rcu);
468 /* Wait for it. */
469 wait_for_completion(&rcu.completion);
470}
471EXPORT_SYMBOL_GPL(synchronize_rcu);
472
473static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
474static long sync_rcu_preempt_exp_count;
475static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
476
401/* 477/*
402 * Wait for an rcu-preempt grace period. We are supposed to expedite the 478 * Return non-zero if there are any tasks in RCU read-side critical
403 * grace period, but this is the crude slow compatability hack, so just 479 * sections blocking the current preemptible-RCU expedited grace period.
404 * invoke synchronize_rcu(). 480 * If there is no preemptible-RCU expedited grace period currently in
481 * progress, returns zero unconditionally.
482 */
483static int rcu_preempted_readers_exp(struct rcu_node *rnp)
484{
485 return !list_empty(&rnp->blocked_tasks[2]) ||
486 !list_empty(&rnp->blocked_tasks[3]);
487}
488
489/*
490 * return non-zero if there is no RCU expedited grace period in progress
491 * for the specified rcu_node structure, in other words, if all CPUs and
492 * tasks covered by the specified rcu_node structure have done their bit
493 * for the current expedited grace period. Works only for preemptible
494 * RCU -- other RCU implementation use other means.
495 *
496 * Caller must hold sync_rcu_preempt_exp_mutex.
497 */
498static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
499{
500 return !rcu_preempted_readers_exp(rnp) &&
501 ACCESS_ONCE(rnp->expmask) == 0;
502}
503
504/*
505 * Report the exit from RCU read-side critical section for the last task
506 * that queued itself during or before the current expedited preemptible-RCU
507 * grace period. This event is reported either to the rcu_node structure on
508 * which the task was queued or to one of that rcu_node structure's ancestors,
509 * recursively up the tree. (Calm down, calm down, we do the recursion
510 * iteratively!)
511 *
512 * Caller must hold sync_rcu_preempt_exp_mutex.
513 */
514static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
515{
516 unsigned long flags;
517 unsigned long mask;
518
519 spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp))
522 break;
523 if (rnp->parent == NULL) {
524 wake_up(&sync_rcu_preempt_exp_wq);
525 break;
526 }
527 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask;
532 }
533 spin_unlock_irqrestore(&rnp->lock, flags);
534}
535
536/*
537 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
538 * grace period for the specified rcu_node structure. If there are no such
539 * tasks, report it up the rcu_node hierarchy.
540 *
541 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
542 */
543static void
544sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{
546 int must_wait;
547
548 spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp);
555}
556
557/*
558 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
559 * is to invoke synchronize_sched_expedited() to push all the tasks to
560 * the ->blocked_tasks[] lists, move all entries from the first set of
561 * ->blocked_tasks[] lists to the second set, and finally wait for this
562 * second set to drain.
405 */ 563 */
406void synchronize_rcu_expedited(void) 564void synchronize_rcu_expedited(void)
407{ 565{
408 synchronize_rcu(); 566 unsigned long flags;
567 struct rcu_node *rnp;
568 struct rcu_state *rsp = &rcu_preempt_state;
569 long snap;
570 int trycount = 0;
571
572 smp_mb(); /* Caller's modifications seen first by other CPUs. */
573 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
574 smp_mb(); /* Above access cannot bleed into critical section. */
575
576 /*
577 * Acquire lock, falling back to synchronize_rcu() if too many
578 * lock-acquisition failures. Of course, if someone does the
579 * expedited grace period for us, just leave.
580 */
581 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
582 if (trycount++ < 10)
583 udelay(trycount * num_online_cpus());
584 else {
585 synchronize_rcu();
586 return;
587 }
588 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
589 goto mb_ret; /* Others did our work for us. */
590 }
591 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
592 goto unlock_mb_ret; /* Others did our work for us. */
593
594 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited();
596
597 spin_lock_irqsave(&rsp->onofflock, flags);
598
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 }
605
606 /* Snapshot current state of ->blocked_tasks[] lists. */
607 rcu_for_each_leaf_node(rsp, rnp)
608 sync_rcu_preempt_exp_init(rsp, rnp);
609 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611
612 spin_unlock_irqrestore(&rsp->onofflock, flags);
613
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp);
616 wait_event(sync_rcu_preempt_exp_wq,
617 sync_rcu_preempt_exp_done(rnp));
618
619 /* Clean up and exit. */
620 smp_mb(); /* ensure expedited GP seen before counter increment. */
621 ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
622unlock_mb_ret:
623 mutex_unlock(&sync_rcu_preempt_exp_mutex);
624mb_ret:
625 smp_mb(); /* ensure subsequent action seen after grace period. */
409} 626}
410EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 627EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
411 628
@@ -481,7 +698,7 @@ void exit_rcu(void)
481/* 698/*
482 * Tell them what RCU they are running. 699 * Tell them what RCU they are running.
483 */ 700 */
484static inline void rcu_bootup_announce(void) 701static void __init rcu_bootup_announce(void)
485{ 702{
486 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 703 printk(KERN_INFO "Hierarchical RCU implementation.\n");
487} 704}
@@ -512,6 +729,16 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
512 return 0; 729 return 0;
513} 730}
514 731
732#ifdef CONFIG_HOTPLUG_CPU
733
734/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{
737 spin_unlock_irqrestore(&rnp->lock, flags);
738}
739
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */
741
515#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 742#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
516 743
517/* 744/*
@@ -594,6 +821,20 @@ void synchronize_rcu_expedited(void)
594} 821}
595EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 822EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
596 823
824#ifdef CONFIG_HOTPLUG_CPU
825
826/*
827 * Because preemptable RCU does not exist, there is never any need to
828 * report on tasks preempted in RCU read-side critical sections during
829 * expedited RCU grace periods.
830 */
831static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
832{
833 return;
834}
835
836#endif /* #ifdef CONFIG_HOTPLUG_CPU */
837
597/* 838/*
598 * Because preemptable RCU does not exist, it never has any work to do. 839 * Because preemptable RCU does not exist, it never has any work to do.
599 */ 840 */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b31c779e62e..9d2c88423b31 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -155,12 +155,15 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum;
158 int level = 0; 159 int level = 0;
160 int phase;
159 struct rcu_node *rnp; 161 struct rcu_node *rnp;
160 162
163 gpnum = rsp->gpnum;
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 168 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 169 rsp->n_force_qs, rsp->n_force_qs_ngp,
@@ -171,8 +174,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 seq_puts(m, "\n"); 174 seq_puts(m, "\n");
172 level = rnp->level; 175 level = rnp->level;
173 } 176 }
174 seq_printf(m, "%lx/%lx %d:%d ^%d ", 177 phase = gpnum & 0x1;
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
175 rnp->qsmask, rnp->qsmaskinit, 179 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
176 rnp->grplo, rnp->grphi, rnp->grpnum); 184 rnp->grplo, rnp->grphi, rnp->grpnum);
177 } 185 }
178 seq_puts(m, "\n"); 186 seq_puts(m, "\n");
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..dc15686b7a77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
308 void *alignf_data) 308 void *alignf_data)
309{ 309{
310 struct resource *this = root->child; 310 struct resource *this = root->child;
311 resource_size_t start, end;
311 312
312 new->start = root->start; 313 start = root->start;
313 /* 314 /*
314 * Skip past an allocated resource that starts at 0, since the assignment 315 * Skip past an allocated resource that starts at 0, since the assignment
315 * of this->start - 1 to new->end below would cause an underflow. 316 * of this->start - 1 to new->end below would cause an underflow.
316 */ 317 */
317 if (this && this->start == 0) { 318 if (this && this->start == 0) {
318 new->start = this->end + 1; 319 start = this->end + 1;
319 this = this->sibling; 320 this = this->sibling;
320 } 321 }
321 for(;;) { 322 for(;;) {
322 if (this) 323 if (this)
323 new->end = this->start - 1; 324 end = this->start - 1;
324 else 325 else
325 new->end = root->end; 326 end = root->end;
326 if (new->start < min) 327 if (start < min)
327 new->start = min; 328 start = min;
328 if (new->end > max) 329 if (end > max)
329 new->end = max; 330 end = max;
330 new->start = ALIGN(new->start, align); 331 start = ALIGN(start, align);
331 if (alignf) 332 if (alignf)
332 alignf(alignf_data, new, size, align); 333 alignf(alignf_data, new, size, align);
333 if (new->start < new->end && new->end - new->start >= size - 1) { 334 if (start < end && end - start >= size - 1) {
334 new->end = new->start + size - 1; 335 new->start = start;
336 new->end = start + size - 1;
335 return 0; 337 return 0;
336 } 338 }
337 if (!this) 339 if (!this)
338 break; 340 break;
339 new->start = this->end + 1; 341 start = this->end + 1;
340 this = this->sibling; 342 this = this->sibling;
341 } 343 }
342 return -EBUSY; 344 return -EBUSY;
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c11ae0a948d..ff39cadf621e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -535,14 +535,12 @@ struct rq {
535 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
536 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
537#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
538 unsigned long last_tick_seen;
539 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
540#endif 539#endif
541 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
542 struct load_weight load; 541 struct load_weight load;
543 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
544 u64 nr_switches; 543 u64 nr_switches;
545 u64 nr_migrations_in;
546 544
547 struct cfs_rq cfs; 545 struct cfs_rq cfs;
548 struct rt_rq rt; 546 struct rt_rq rt;
@@ -591,6 +589,8 @@ struct rq {
591 589
592 u64 rt_avg; 590 u64 rt_avg;
593 u64 age_stamp; 591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
594#endif 594#endif
595 595
596 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -772,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
772 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
773 return -EINVAL; 773 return -EINVAL;
774 774
775 filp->f_pos += cnt; 775 *ppos += cnt;
776 776
777 return cnt; 777 return cnt;
778} 778}
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
814 * default: 0.25ms 814 * default: 0.25ms
815 */ 815 */
816unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
817 818
818/* 819/*
819 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1614 */ 1615 */
1615static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1616{ 1617{
1617 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1618 unsigned long *usd_rq_weight; 1619 unsigned long *usd_rq_weight;
1619 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1620 unsigned long flags; 1621 unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1630 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1631 usd_rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1632 1633
1634 rq_weight += weight;
1633 /* 1635 /*
1634 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1635 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1638 if (!weight) 1640 if (!weight)
1639 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1640 1642
1641 rq_weight += weight; 1643 sum_weight += weight;
1642 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1643 } 1645 }
1644 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1645 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1646 shares = tg->shares; 1651 shares = tg->shares;
1647 1652
@@ -1810,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1810#endif 1815#endif
1811 1816
1812static void calc_load_account_active(struct rq *this_rq); 1817static void calc_load_account_active(struct rq *this_rq);
1818static void update_sysctl(void);
1819static int get_update_sysctl_factor(void);
1820
1821static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1822{
1823 set_task_rq(p, cpu);
1824#ifdef CONFIG_SMP
1825 /*
1826 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1827 * successfuly executed on another CPU. We must ensure that updates of
1828 * per-task data have been completed by this moment.
1829 */
1830 smp_wmb();
1831 task_thread_info(p)->cpu = cpu;
1832#endif
1833}
1813 1834
1814#include "sched_stats.h" 1835#include "sched_stats.h"
1815#include "sched_idletask.c" 1836#include "sched_idletask.c"
@@ -1967,20 +1988,6 @@ inline int task_curr(const struct task_struct *p)
1967 return cpu_curr(task_cpu(p)) == p; 1988 return cpu_curr(task_cpu(p)) == p;
1968} 1989}
1969 1990
1970static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1971{
1972 set_task_rq(p, cpu);
1973#ifdef CONFIG_SMP
1974 /*
1975 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1976 * successfuly executed on another CPU. We must ensure that updates of
1977 * per-task data have been completed by this moment.
1978 */
1979 smp_wmb();
1980 task_thread_info(p)->cpu = cpu;
1981#endif
1982}
1983
1984static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1991static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1985 const struct sched_class *prev_class, 1992 const struct sched_class *prev_class,
1986 int oldprio, int running) 1993 int oldprio, int running)
@@ -2017,6 +2024,7 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
2017 } 2024 }
2018 2025
2019 spin_lock_irqsave(&rq->lock, flags); 2026 spin_lock_irqsave(&rq->lock, flags);
2027 update_rq_clock(rq);
2020 set_task_cpu(p, cpu); 2028 set_task_cpu(p, cpu);
2021 p->cpus_allowed = cpumask_of_cpu(cpu); 2029 p->cpus_allowed = cpumask_of_cpu(cpu);
2022 p->rt.nr_cpus_allowed = 1; 2030 p->rt.nr_cpus_allowed = 1;
@@ -2059,30 +2067,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2059void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2067void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2060{ 2068{
2061 int old_cpu = task_cpu(p); 2069 int old_cpu = task_cpu(p);
2062 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2063 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2070 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2064 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2071 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2065 u64 clock_offset;
2066
2067 clock_offset = old_rq->clock - new_rq->clock;
2068 2072
2069 trace_sched_migrate_task(p, new_cpu); 2073 trace_sched_migrate_task(p, new_cpu);
2070 2074
2071#ifdef CONFIG_SCHEDSTATS
2072 if (p->se.wait_start)
2073 p->se.wait_start -= clock_offset;
2074 if (p->se.sleep_start)
2075 p->se.sleep_start -= clock_offset;
2076 if (p->se.block_start)
2077 p->se.block_start -= clock_offset;
2078#endif
2079 if (old_cpu != new_cpu) { 2075 if (old_cpu != new_cpu) {
2080 p->se.nr_migrations++; 2076 p->se.nr_migrations++;
2081 new_rq->nr_migrations_in++;
2082#ifdef CONFIG_SCHEDSTATS
2083 if (task_hot(p, old_rq->clock, NULL))
2084 schedstat_inc(p, se.nr_forced2_migrations);
2085#endif
2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2077 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2087 1, 1, NULL, 0); 2078 1, 1, NULL, 0);
2088 } 2079 }
@@ -2115,6 +2106,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2115 * it is sufficient to simply update the task's cpu field. 2106 * it is sufficient to simply update the task's cpu field.
2116 */ 2107 */
2117 if (!p->se.on_rq && !task_running(rq, p)) { 2108 if (!p->se.on_rq && !task_running(rq, p)) {
2109 update_rq_clock(rq);
2118 set_task_cpu(p, dest_cpu); 2110 set_task_cpu(p, dest_cpu);
2119 return 0; 2111 return 0;
2120 } 2112 }
@@ -2322,6 +2314,14 @@ void task_oncpu_function_call(struct task_struct *p,
2322 preempt_enable(); 2314 preempt_enable();
2323} 2315}
2324 2316
2317#ifdef CONFIG_SMP
2318static inline
2319int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2320{
2321 return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2322}
2323#endif
2324
2325/*** 2325/***
2326 * try_to_wake_up - wake up a thread 2326 * try_to_wake_up - wake up a thread
2327 * @p: the to-be-woken-up thread 2327 * @p: the to-be-woken-up thread
@@ -2373,16 +2373,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2373 if (task_contributes_to_load(p)) 2373 if (task_contributes_to_load(p))
2374 rq->nr_uninterruptible--; 2374 rq->nr_uninterruptible--;
2375 p->state = TASK_WAKING; 2375 p->state = TASK_WAKING;
2376 task_rq_unlock(rq, &flags); 2376 __task_rq_unlock(rq);
2377 2377
2378 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2378 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2379 if (cpu != orig_cpu) 2379 if (cpu != orig_cpu)
2380 set_task_cpu(p, cpu); 2380 set_task_cpu(p, cpu);
2381 2381
2382 rq = task_rq_lock(p, &flags); 2382 rq = __task_rq_lock(p);
2383 2383 update_rq_clock(rq);
2384 if (rq != orig_rq)
2385 update_rq_clock(rq);
2386 2384
2387 WARN_ON(p->state != TASK_WAKING); 2385 WARN_ON(p->state != TASK_WAKING);
2388 cpu = task_cpu(p); 2386 cpu = task_cpu(p);
@@ -2440,6 +2438,17 @@ out_running:
2440#ifdef CONFIG_SMP 2438#ifdef CONFIG_SMP
2441 if (p->sched_class->task_wake_up) 2439 if (p->sched_class->task_wake_up)
2442 p->sched_class->task_wake_up(rq, p); 2440 p->sched_class->task_wake_up(rq, p);
2441
2442 if (unlikely(rq->idle_stamp)) {
2443 u64 delta = rq->clock - rq->idle_stamp;
2444 u64 max = 2*sysctl_sched_migration_cost;
2445
2446 if (delta > max)
2447 rq->avg_idle = max;
2448 else
2449 update_avg(&rq->avg_idle, delta);
2450 rq->idle_stamp = 0;
2451 }
2443#endif 2452#endif
2444out: 2453out:
2445 task_rq_unlock(rq, &flags); 2454 task_rq_unlock(rq, &flags);
@@ -2486,7 +2495,6 @@ static void __sched_fork(struct task_struct *p)
2486 p->se.avg_overlap = 0; 2495 p->se.avg_overlap = 0;
2487 p->se.start_runtime = 0; 2496 p->se.start_runtime = 0;
2488 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2497 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2489 p->se.avg_running = 0;
2490 2498
2491#ifdef CONFIG_SCHEDSTATS 2499#ifdef CONFIG_SCHEDSTATS
2492 p->se.wait_start = 0; 2500 p->se.wait_start = 0;
@@ -2508,7 +2516,6 @@ static void __sched_fork(struct task_struct *p)
2508 p->se.nr_failed_migrations_running = 0; 2516 p->se.nr_failed_migrations_running = 0;
2509 p->se.nr_failed_migrations_hot = 0; 2517 p->se.nr_failed_migrations_hot = 0;
2510 p->se.nr_forced_migrations = 0; 2518 p->se.nr_forced_migrations = 0;
2511 p->se.nr_forced2_migrations = 0;
2512 2519
2513 p->se.nr_wakeups = 0; 2520 p->se.nr_wakeups = 0;
2514 p->se.nr_wakeups_sync = 0; 2521 p->se.nr_wakeups_sync = 0;
@@ -2578,8 +2585,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2578 if (!rt_prio(p->prio)) 2585 if (!rt_prio(p->prio))
2579 p->sched_class = &fair_sched_class; 2586 p->sched_class = &fair_sched_class;
2580 2587
2588 if (p->sched_class->task_fork)
2589 p->sched_class->task_fork(p);
2590
2581#ifdef CONFIG_SMP 2591#ifdef CONFIG_SMP
2582 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2592 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2583#endif 2593#endif
2584 set_task_cpu(p, cpu); 2594 set_task_cpu(p, cpu);
2585 2595
@@ -2614,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2614 rq = task_rq_lock(p, &flags); 2624 rq = task_rq_lock(p, &flags);
2615 BUG_ON(p->state != TASK_RUNNING); 2625 BUG_ON(p->state != TASK_RUNNING);
2616 update_rq_clock(rq); 2626 update_rq_clock(rq);
2617 2627 activate_task(rq, p, 0);
2618 if (!p->sched_class->task_new || !current->se.on_rq) {
2619 activate_task(rq, p, 0);
2620 } else {
2621 /*
2622 * Let the scheduling class do new task startup
2623 * management (if any):
2624 */
2625 p->sched_class->task_new(rq, p);
2626 inc_nr_running(rq);
2627 }
2628 trace_sched_wakeup_new(rq, p, 1); 2628 trace_sched_wakeup_new(rq, p, 1);
2629 check_preempt_curr(rq, p, WF_FORK); 2629 check_preempt_curr(rq, p, WF_FORK);
2630#ifdef CONFIG_SMP 2630#ifdef CONFIG_SMP
@@ -2848,14 +2848,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2848 */ 2848 */
2849 arch_start_context_switch(prev); 2849 arch_start_context_switch(prev);
2850 2850
2851 if (unlikely(!mm)) { 2851 if (likely(!mm)) {
2852 next->active_mm = oldmm; 2852 next->active_mm = oldmm;
2853 atomic_inc(&oldmm->mm_count); 2853 atomic_inc(&oldmm->mm_count);
2854 enter_lazy_tlb(oldmm, next); 2854 enter_lazy_tlb(oldmm, next);
2855 } else 2855 } else
2856 switch_mm(oldmm, mm, next); 2856 switch_mm(oldmm, mm, next);
2857 2857
2858 if (unlikely(!prev->mm)) { 2858 if (likely(!prev->mm)) {
2859 prev->active_mm = NULL; 2859 prev->active_mm = NULL;
2860 rq->prev_mm = oldmm; 2860 rq->prev_mm = oldmm;
2861 } 2861 }
@@ -3018,15 +3018,6 @@ static void calc_load_account_active(struct rq *this_rq)
3018} 3018}
3019 3019
3020/* 3020/*
3021 * Externally visible per-cpu scheduler statistics:
3022 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3023 */
3024u64 cpu_nr_migrations(int cpu)
3025{
3026 return cpu_rq(cpu)->nr_migrations_in;
3027}
3028
3029/*
3030 * Update rq->cpu_load[] statistics. This function is usually called every 3021 * Update rq->cpu_load[] statistics. This function is usually called every
3031 * scheduler tick (TICK_NSEC). 3022 * scheduler tick (TICK_NSEC).
3032 */ 3023 */
@@ -3148,7 +3139,7 @@ out:
3148void sched_exec(void) 3139void sched_exec(void)
3149{ 3140{
3150 int new_cpu, this_cpu = get_cpu(); 3141 int new_cpu, this_cpu = get_cpu();
3151 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); 3142 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3152 put_cpu(); 3143 put_cpu();
3153 if (new_cpu != this_cpu) 3144 if (new_cpu != this_cpu)
3154 sched_migrate_task(current, new_cpu); 3145 sched_migrate_task(current, new_cpu);
@@ -3164,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3164 deactivate_task(src_rq, p, 0); 3155 deactivate_task(src_rq, p, 0);
3165 set_task_cpu(p, this_cpu); 3156 set_task_cpu(p, this_cpu);
3166 activate_task(this_rq, p, 0); 3157 activate_task(this_rq, p, 0);
3167 /*
3168 * Note that idle threads have a prio of MAX_PRIO, for this test
3169 * to be always true for them.
3170 */
3171 check_preempt_curr(this_rq, p, 0); 3158 check_preempt_curr(this_rq, p, 0);
3172} 3159}
3173 3160
@@ -4126,7 +4113,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4126 unsigned long flags; 4113 unsigned long flags;
4127 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4114 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4128 4115
4129 cpumask_setall(cpus); 4116 cpumask_copy(cpus, cpu_active_mask);
4130 4117
4131 /* 4118 /*
4132 * When power savings policy is enabled for the parent domain, idle 4119 * When power savings policy is enabled for the parent domain, idle
@@ -4289,7 +4276,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4289 int all_pinned = 0; 4276 int all_pinned = 0;
4290 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4277 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4291 4278
4292 cpumask_setall(cpus); 4279 cpumask_copy(cpus, cpu_active_mask);
4293 4280
4294 /* 4281 /*
4295 * When power savings policy is enabled for the parent domain, idle 4282 * When power savings policy is enabled for the parent domain, idle
@@ -4429,6 +4416,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4429 int pulled_task = 0; 4416 int pulled_task = 0;
4430 unsigned long next_balance = jiffies + HZ; 4417 unsigned long next_balance = jiffies + HZ;
4431 4418
4419 this_rq->idle_stamp = this_rq->clock;
4420
4421 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4422 return;
4423
4432 for_each_domain(this_cpu, sd) { 4424 for_each_domain(this_cpu, sd) {
4433 unsigned long interval; 4425 unsigned long interval;
4434 4426
@@ -4443,8 +4435,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4443 interval = msecs_to_jiffies(sd->balance_interval); 4435 interval = msecs_to_jiffies(sd->balance_interval);
4444 if (time_after(next_balance, sd->last_balance + interval)) 4436 if (time_after(next_balance, sd->last_balance + interval))
4445 next_balance = sd->last_balance + interval; 4437 next_balance = sd->last_balance + interval;
4446 if (pulled_task) 4438 if (pulled_task) {
4439 this_rq->idle_stamp = 0;
4447 break; 4440 break;
4441 }
4448 } 4442 }
4449 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4443 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4450 /* 4444 /*
@@ -4679,7 +4673,7 @@ int select_nohz_load_balancer(int stop_tick)
4679 cpumask_set_cpu(cpu, nohz.cpu_mask); 4673 cpumask_set_cpu(cpu, nohz.cpu_mask);
4680 4674
4681 /* time for ilb owner also to sleep */ 4675 /* time for ilb owner also to sleep */
4682 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4676 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4683 if (atomic_read(&nohz.load_balancer) == cpu) 4677 if (atomic_read(&nohz.load_balancer) == cpu)
4684 atomic_set(&nohz.load_balancer, -1); 4678 atomic_set(&nohz.load_balancer, -1);
4685 return 0; 4679 return 0;
@@ -5046,8 +5040,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5046 p->gtime = cputime_add(p->gtime, cputime); 5040 p->gtime = cputime_add(p->gtime, cputime);
5047 5041
5048 /* Add guest time to cpustat. */ 5042 /* Add guest time to cpustat. */
5049 cpustat->user = cputime64_add(cpustat->user, tmp); 5043 if (TASK_NICE(p) > 0) {
5050 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5044 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5045 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5046 } else {
5047 cpustat->user = cputime64_add(cpustat->user, tmp);
5048 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5049 }
5051} 5050}
5052 5051
5053/* 5052/*
@@ -5162,60 +5161,86 @@ void account_idle_ticks(unsigned long ticks)
5162 * Use precise platform statistics if available: 5161 * Use precise platform statistics if available:
5163 */ 5162 */
5164#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5163#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5165cputime_t task_utime(struct task_struct *p) 5164void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5166{ 5165{
5167 return p->utime; 5166 *ut = p->utime;
5167 *st = p->stime;
5168} 5168}
5169 5169
5170cputime_t task_stime(struct task_struct *p) 5170void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5171{ 5171{
5172 return p->stime; 5172 struct task_cputime cputime;
5173
5174 thread_group_cputime(p, &cputime);
5175
5176 *ut = cputime.utime;
5177 *st = cputime.stime;
5173} 5178}
5174#else 5179#else
5175cputime_t task_utime(struct task_struct *p) 5180
5181#ifndef nsecs_to_cputime
5182# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5183#endif
5184
5185void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5176{ 5186{
5177 clock_t utime = cputime_to_clock_t(p->utime), 5187 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5178 total = utime + cputime_to_clock_t(p->stime);
5179 u64 temp;
5180 5188
5181 /* 5189 /*
5182 * Use CFS's precise accounting: 5190 * Use CFS's precise accounting:
5183 */ 5191 */
5184 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5192 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5185 5193
5186 if (total) { 5194 if (total) {
5187 temp *= utime; 5195 u64 temp;
5196
5197 temp = (u64)(rtime * utime);
5188 do_div(temp, total); 5198 do_div(temp, total);
5189 } 5199 utime = (cputime_t)temp;
5190 utime = (clock_t)temp; 5200 } else
5201 utime = rtime;
5191 5202
5192 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5203 /*
5193 return p->prev_utime; 5204 * Compare with previous values, to keep monotonicity:
5205 */
5206 p->prev_utime = max(p->prev_utime, utime);
5207 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5208
5209 *ut = p->prev_utime;
5210 *st = p->prev_stime;
5194} 5211}
5195 5212
5196cputime_t task_stime(struct task_struct *p) 5213/*
5214 * Must be called with siglock held.
5215 */
5216void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5197{ 5217{
5198 clock_t stime; 5218 struct signal_struct *sig = p->signal;
5219 struct task_cputime cputime;
5220 cputime_t rtime, utime, total;
5199 5221
5200 /* 5222 thread_group_cputime(p, &cputime);
5201 * Use CFS's precise accounting. (we subtract utime from
5202 * the total, to make sure the total observed by userspace
5203 * grows monotonically - apps rely on that):
5204 */
5205 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5206 cputime_to_clock_t(task_utime(p));
5207 5223
5208 if (stime >= 0) 5224 total = cputime_add(cputime.utime, cputime.stime);
5209 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5225 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5210 5226
5211 return p->prev_stime; 5227 if (total) {
5212} 5228 u64 temp;
5213#endif
5214 5229
5215inline cputime_t task_gtime(struct task_struct *p) 5230 temp = (u64)(rtime * cputime.utime);
5216{ 5231 do_div(temp, total);
5217 return p->gtime; 5232 utime = (cputime_t)temp;
5233 } else
5234 utime = rtime;
5235
5236 sig->prev_utime = max(sig->prev_utime, utime);
5237 sig->prev_stime = max(sig->prev_stime,
5238 cputime_sub(rtime, sig->prev_utime));
5239
5240 *ut = sig->prev_utime;
5241 *st = sig->prev_stime;
5218} 5242}
5243#endif
5219 5244
5220/* 5245/*
5221 * This function gets called by the timer code, with HZ frequency. 5246 * This function gets called by the timer code, with HZ frequency.
@@ -5350,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev)
5350#endif 5375#endif
5351} 5376}
5352 5377
5353static void put_prev_task(struct rq *rq, struct task_struct *p) 5378static void put_prev_task(struct rq *rq, struct task_struct *prev)
5354{ 5379{
5355 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5380 if (prev->state == TASK_RUNNING) {
5381 u64 runtime = prev->se.sum_exec_runtime;
5356 5382
5357 update_avg(&p->se.avg_running, runtime); 5383 runtime -= prev->se.prev_sum_exec_runtime;
5384 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5358 5385
5359 if (p->state == TASK_RUNNING) {
5360 /* 5386 /*
5361 * In order to avoid avg_overlap growing stale when we are 5387 * In order to avoid avg_overlap growing stale when we are
5362 * indeed overlapping and hence not getting put to sleep, grow 5388 * indeed overlapping and hence not getting put to sleep, grow
@@ -5366,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5366 * correlates to the amount of cache footprint a task can 5392 * correlates to the amount of cache footprint a task can
5367 * build up. 5393 * build up.
5368 */ 5394 */
5369 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5395 update_avg(&prev->se.avg_overlap, runtime);
5370 update_avg(&p->se.avg_overlap, runtime);
5371 } else {
5372 update_avg(&p->se.avg_running, 0);
5373 } 5396 }
5374 p->sched_class->put_prev_task(rq, p); 5397 prev->sched_class->put_prev_task(rq, prev);
5375} 5398}
5376 5399
5377/* 5400/*
@@ -5481,7 +5504,7 @@ need_resched_nonpreemptible:
5481} 5504}
5482EXPORT_SYMBOL(schedule); 5505EXPORT_SYMBOL(schedule);
5483 5506
5484#ifdef CONFIG_SMP 5507#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5485/* 5508/*
5486 * Look out! "owner" is an entirely speculative pointer 5509 * Look out! "owner" is an entirely speculative pointer
5487 * access and not reliable. 5510 * access and not reliable.
@@ -6175,22 +6198,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6175 BUG_ON(p->se.on_rq); 6198 BUG_ON(p->se.on_rq);
6176 6199
6177 p->policy = policy; 6200 p->policy = policy;
6178 switch (p->policy) {
6179 case SCHED_NORMAL:
6180 case SCHED_BATCH:
6181 case SCHED_IDLE:
6182 p->sched_class = &fair_sched_class;
6183 break;
6184 case SCHED_FIFO:
6185 case SCHED_RR:
6186 p->sched_class = &rt_sched_class;
6187 break;
6188 }
6189
6190 p->rt_priority = prio; 6201 p->rt_priority = prio;
6191 p->normal_prio = normal_prio(p); 6202 p->normal_prio = normal_prio(p);
6192 /* we are holding p->pi_lock already */ 6203 /* we are holding p->pi_lock already */
6193 p->prio = rt_mutex_getprio(p); 6204 p->prio = rt_mutex_getprio(p);
6205 if (rt_prio(p->prio))
6206 p->sched_class = &rt_sched_class;
6207 else
6208 p->sched_class = &fair_sched_class;
6194 set_load_weight(p); 6209 set_load_weight(p);
6195} 6210}
6196 6211
@@ -6593,6 +6608,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6593long sched_getaffinity(pid_t pid, struct cpumask *mask) 6608long sched_getaffinity(pid_t pid, struct cpumask *mask)
6594{ 6609{
6595 struct task_struct *p; 6610 struct task_struct *p;
6611 unsigned long flags;
6612 struct rq *rq;
6596 int retval; 6613 int retval;
6597 6614
6598 get_online_cpus(); 6615 get_online_cpus();
@@ -6607,7 +6624,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6607 if (retval) 6624 if (retval)
6608 goto out_unlock; 6625 goto out_unlock;
6609 6626
6627 rq = task_rq_lock(p, &flags);
6610 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6628 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6629 task_rq_unlock(rq, &flags);
6611 6630
6612out_unlock: 6631out_unlock:
6613 read_unlock(&tasklist_lock); 6632 read_unlock(&tasklist_lock);
@@ -6845,6 +6864,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6845{ 6864{
6846 struct task_struct *p; 6865 struct task_struct *p;
6847 unsigned int time_slice; 6866 unsigned int time_slice;
6867 unsigned long flags;
6868 struct rq *rq;
6848 int retval; 6869 int retval;
6849 struct timespec t; 6870 struct timespec t;
6850 6871
@@ -6861,7 +6882,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6861 if (retval) 6882 if (retval)
6862 goto out_unlock; 6883 goto out_unlock;
6863 6884
6864 time_slice = p->sched_class->get_rr_interval(p); 6885 rq = task_rq_lock(p, &flags);
6886 time_slice = p->sched_class->get_rr_interval(rq, p);
6887 task_rq_unlock(rq, &flags);
6865 6888
6866 read_unlock(&tasklist_lock); 6889 read_unlock(&tasklist_lock);
6867 jiffies_to_timespec(time_slice, &t); 6890 jiffies_to_timespec(time_slice, &t);
@@ -6935,7 +6958,7 @@ void show_state_filter(unsigned long state_filter)
6935 /* 6958 /*
6936 * Only show locks if all tasks are dumped: 6959 * Only show locks if all tasks are dumped:
6937 */ 6960 */
6938 if (state_filter == -1) 6961 if (!state_filter)
6939 debug_show_all_locks(); 6962 debug_show_all_locks();
6940} 6963}
6941 6964
@@ -6962,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6962 __sched_fork(idle); 6985 __sched_fork(idle);
6963 idle->se.exec_start = sched_clock(); 6986 idle->se.exec_start = sched_clock();
6964 6987
6965 idle->prio = idle->normal_prio = MAX_PRIO;
6966 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6988 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6967 __set_task_cpu(idle, cpu); 6989 __set_task_cpu(idle, cpu);
6968 6990
@@ -7003,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask;
7003 * 7025 *
7004 * This idea comes from the SD scheduler of Con Kolivas: 7026 * This idea comes from the SD scheduler of Con Kolivas:
7005 */ 7027 */
7006static inline void sched_init_granularity(void) 7028static int get_update_sysctl_factor(void)
7007{ 7029{
7008 unsigned int factor = 1 + ilog2(num_online_cpus()); 7030 unsigned int cpus = min_t(int, num_online_cpus(), 8);
7009 const unsigned long limit = 200000000; 7031 unsigned int factor;
7010 7032
7011 sysctl_sched_min_granularity *= factor; 7033 switch (sysctl_sched_tunable_scaling) {
7012 if (sysctl_sched_min_granularity > limit) 7034 case SCHED_TUNABLESCALING_NONE:
7013 sysctl_sched_min_granularity = limit; 7035 factor = 1;
7036 break;
7037 case SCHED_TUNABLESCALING_LINEAR:
7038 factor = cpus;
7039 break;
7040 case SCHED_TUNABLESCALING_LOG:
7041 default:
7042 factor = 1 + ilog2(cpus);
7043 break;
7044 }
7014 7045
7015 sysctl_sched_latency *= factor; 7046 return factor;
7016 if (sysctl_sched_latency > limit) 7047}
7017 sysctl_sched_latency = limit;
7018 7048
7019 sysctl_sched_wakeup_granularity *= factor; 7049static void update_sysctl(void)
7050{
7051 unsigned int factor = get_update_sysctl_factor();
7052
7053#define SET_SYSCTL(name) \
7054 (sysctl_##name = (factor) * normalized_sysctl_##name)
7055 SET_SYSCTL(sched_min_granularity);
7056 SET_SYSCTL(sched_latency);
7057 SET_SYSCTL(sched_wakeup_granularity);
7058 SET_SYSCTL(sched_shares_ratelimit);
7059#undef SET_SYSCTL
7060}
7020 7061
7021 sysctl_sched_shares_ratelimit *= factor; 7062static inline void sched_init_granularity(void)
7063{
7064 update_sysctl();
7022} 7065}
7023 7066
7024#ifdef CONFIG_SMP 7067#ifdef CONFIG_SMP
@@ -7055,7 +7098,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7055 int ret = 0; 7098 int ret = 0;
7056 7099
7057 rq = task_rq_lock(p, &flags); 7100 rq = task_rq_lock(p, &flags);
7058 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7101 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7059 ret = -EINVAL; 7102 ret = -EINVAL;
7060 goto out; 7103 goto out;
7061 } 7104 }
@@ -7077,7 +7120,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7077 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7120 if (cpumask_test_cpu(task_cpu(p), new_mask))
7078 goto out; 7121 goto out;
7079 7122
7080 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7123 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7081 /* Need help from migration thread: drop lock and wait. */ 7124 /* Need help from migration thread: drop lock and wait. */
7082 struct task_struct *mt = rq->migration_thread; 7125 struct task_struct *mt = rq->migration_thread;
7083 7126
@@ -7231,19 +7274,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7231 7274
7232again: 7275again:
7233 /* Look for allowed, online CPU in same node. */ 7276 /* Look for allowed, online CPU in same node. */
7234 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) 7277 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7235 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7278 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7236 goto move; 7279 goto move;
7237 7280
7238 /* Any allowed, online CPU? */ 7281 /* Any allowed, online CPU? */
7239 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); 7282 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7240 if (dest_cpu < nr_cpu_ids) 7283 if (dest_cpu < nr_cpu_ids)
7241 goto move; 7284 goto move;
7242 7285
7243 /* No more Mr. Nice Guy. */ 7286 /* No more Mr. Nice Guy. */
7244 if (dest_cpu >= nr_cpu_ids) { 7287 if (dest_cpu >= nr_cpu_ids) {
7245 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 7288 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7246 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); 7289 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7247 7290
7248 /* 7291 /*
7249 * Don't tell them about moving exiting tasks or 7292 * Don't tell them about moving exiting tasks or
@@ -7272,7 +7315,7 @@ move:
7272 */ 7315 */
7273static void migrate_nr_uninterruptible(struct rq *rq_src) 7316static void migrate_nr_uninterruptible(struct rq *rq_src)
7274{ 7317{
7275 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7318 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7276 unsigned long flags; 7319 unsigned long flags;
7277 7320
7278 local_irq_save(flags); 7321 local_irq_save(flags);
@@ -7406,17 +7449,16 @@ static struct ctl_table sd_ctl_dir[] = {
7406 .procname = "sched_domain", 7449 .procname = "sched_domain",
7407 .mode = 0555, 7450 .mode = 0555,
7408 }, 7451 },
7409 {0, }, 7452 {}
7410}; 7453};
7411 7454
7412static struct ctl_table sd_ctl_root[] = { 7455static struct ctl_table sd_ctl_root[] = {
7413 { 7456 {
7414 .ctl_name = CTL_KERN,
7415 .procname = "kernel", 7457 .procname = "kernel",
7416 .mode = 0555, 7458 .mode = 0555,
7417 .child = sd_ctl_dir, 7459 .child = sd_ctl_dir,
7418 }, 7460 },
7419 {0, }, 7461 {}
7420}; 7462};
7421 7463
7422static struct ctl_table *sd_alloc_ctl_entry(int n) 7464static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7526,7 +7568,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7526static struct ctl_table_header *sd_sysctl_header; 7568static struct ctl_table_header *sd_sysctl_header;
7527static void register_sched_domain_sysctl(void) 7569static void register_sched_domain_sysctl(void)
7528{ 7570{
7529 int i, cpu_num = num_online_cpus(); 7571 int i, cpu_num = num_possible_cpus();
7530 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7572 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7531 char buf[32]; 7573 char buf[32];
7532 7574
@@ -7536,7 +7578,7 @@ static void register_sched_domain_sysctl(void)
7536 if (entry == NULL) 7578 if (entry == NULL)
7537 return; 7579 return;
7538 7580
7539 for_each_online_cpu(i) { 7581 for_each_possible_cpu(i) {
7540 snprintf(buf, 32, "cpu%d", i); 7582 snprintf(buf, 32, "cpu%d", i);
7541 entry->procname = kstrdup(buf, GFP_KERNEL); 7583 entry->procname = kstrdup(buf, GFP_KERNEL);
7542 entry->mode = 0555; 7584 entry->mode = 0555;
@@ -7666,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7666 spin_lock_irq(&rq->lock); 7708 spin_lock_irq(&rq->lock);
7667 update_rq_clock(rq); 7709 update_rq_clock(rq);
7668 deactivate_task(rq, rq->idle, 0); 7710 deactivate_task(rq, rq->idle, 0);
7669 rq->idle->static_prio = MAX_PRIO;
7670 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7711 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7671 rq->idle->sched_class = &idle_sched_class; 7712 rq->idle->sched_class = &idle_sched_class;
7672 migrate_dead_tasks(cpu); 7713 migrate_dead_tasks(cpu);
@@ -7740,6 +7781,16 @@ early_initcall(migration_init);
7740 7781
7741#ifdef CONFIG_SCHED_DEBUG 7782#ifdef CONFIG_SCHED_DEBUG
7742 7783
7784static __read_mostly int sched_domain_debug_enabled;
7785
7786static int __init sched_domain_debug_setup(char *str)
7787{
7788 sched_domain_debug_enabled = 1;
7789
7790 return 0;
7791}
7792early_param("sched_debug", sched_domain_debug_setup);
7793
7743static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7794static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7744 struct cpumask *groupmask) 7795 struct cpumask *groupmask)
7745{ 7796{
@@ -7826,6 +7877,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7826 cpumask_var_t groupmask; 7877 cpumask_var_t groupmask;
7827 int level = 0; 7878 int level = 0;
7828 7879
7880 if (!sched_domain_debug_enabled)
7881 return;
7882
7829 if (!sd) { 7883 if (!sd) {
7830 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7884 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7831 return; 7885 return;
@@ -7905,6 +7959,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7905 7959
7906static void free_rootdomain(struct root_domain *rd) 7960static void free_rootdomain(struct root_domain *rd)
7907{ 7961{
7962 synchronize_sched();
7963
7908 cpupri_cleanup(&rd->cpupri); 7964 cpupri_cleanup(&rd->cpupri);
7909 7965
7910 free_cpumask_var(rd->rto_mask); 7966 free_cpumask_var(rd->rto_mask);
@@ -8045,6 +8101,7 @@ static cpumask_var_t cpu_isolated_map;
8045/* Setup the mask of cpus configured for isolated domains */ 8101/* Setup the mask of cpus configured for isolated domains */
8046static int __init isolated_cpu_setup(char *str) 8102static int __init isolated_cpu_setup(char *str)
8047{ 8103{
8104 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8048 cpulist_parse(str, cpu_isolated_map); 8105 cpulist_parse(str, cpu_isolated_map);
8049 return 1; 8106 return 1;
8050} 8107}
@@ -8881,7 +8938,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8881 return __build_sched_domains(cpu_map, NULL); 8938 return __build_sched_domains(cpu_map, NULL);
8882} 8939}
8883 8940
8884static struct cpumask *doms_cur; /* current sched domains */ 8941static cpumask_var_t *doms_cur; /* current sched domains */
8885static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8942static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8886static struct sched_domain_attr *dattr_cur; 8943static struct sched_domain_attr *dattr_cur;
8887 /* attribues of custom domains in 'doms_cur' */ 8944 /* attribues of custom domains in 'doms_cur' */
@@ -8903,6 +8960,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8903 return 0; 8960 return 0;
8904} 8961}
8905 8962
8963cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
8964{
8965 int i;
8966 cpumask_var_t *doms;
8967
8968 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
8969 if (!doms)
8970 return NULL;
8971 for (i = 0; i < ndoms; i++) {
8972 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
8973 free_sched_domains(doms, i);
8974 return NULL;
8975 }
8976 }
8977 return doms;
8978}
8979
8980void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
8981{
8982 unsigned int i;
8983 for (i = 0; i < ndoms; i++)
8984 free_cpumask_var(doms[i]);
8985 kfree(doms);
8986}
8987
8906/* 8988/*
8907 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8989 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8908 * For now this just excludes isolated cpus, but could be used to 8990 * For now this just excludes isolated cpus, but could be used to
@@ -8914,12 +8996,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
8914 8996
8915 arch_update_cpu_topology(); 8997 arch_update_cpu_topology();
8916 ndoms_cur = 1; 8998 ndoms_cur = 1;
8917 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 8999 doms_cur = alloc_sched_domains(ndoms_cur);
8918 if (!doms_cur) 9000 if (!doms_cur)
8919 doms_cur = fallback_doms; 9001 doms_cur = &fallback_doms;
8920 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 9002 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8921 dattr_cur = NULL; 9003 dattr_cur = NULL;
8922 err = build_sched_domains(doms_cur); 9004 err = build_sched_domains(doms_cur[0]);
8923 register_sched_domain_sysctl(); 9005 register_sched_domain_sysctl();
8924 9006
8925 return err; 9007 return err;
@@ -8969,19 +9051,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8969 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9051 * doms_new[] to the current sched domain partitioning, doms_cur[].
8970 * It destroys each deleted domain and builds each new domain. 9052 * It destroys each deleted domain and builds each new domain.
8971 * 9053 *
8972 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9054 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
8973 * The masks don't intersect (don't overlap.) We should setup one 9055 * The masks don't intersect (don't overlap.) We should setup one
8974 * sched domain for each mask. CPUs not in any of the cpumasks will 9056 * sched domain for each mask. CPUs not in any of the cpumasks will
8975 * not be load balanced. If the same cpumask appears both in the 9057 * not be load balanced. If the same cpumask appears both in the
8976 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9058 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8977 * it as it is. 9059 * it as it is.
8978 * 9060 *
8979 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9061 * The passed in 'doms_new' should be allocated using
8980 * ownership of it and will kfree it when done with it. If the caller 9062 * alloc_sched_domains. This routine takes ownership of it and will
8981 * failed the kmalloc call, then it can pass in doms_new == NULL && 9063 * free_sched_domains it when done with it. If the caller failed the
8982 * ndoms_new == 1, and partition_sched_domains() will fallback to 9064 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
8983 * the single partition 'fallback_doms', it also forces the domains 9065 * and partition_sched_domains() will fallback to the single partition
8984 * to be rebuilt. 9066 * 'fallback_doms', it also forces the domains to be rebuilt.
8985 * 9067 *
8986 * If doms_new == NULL it will be replaced with cpu_online_mask. 9068 * If doms_new == NULL it will be replaced with cpu_online_mask.
8987 * ndoms_new == 0 is a special case for destroying existing domains, 9069 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8989,8 +9071,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8989 * 9071 *
8990 * Call with hotplug lock held 9072 * Call with hotplug lock held
8991 */ 9073 */
8992/* FIXME: Change to struct cpumask *doms_new[] */ 9074void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8993void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8994 struct sched_domain_attr *dattr_new) 9075 struct sched_domain_attr *dattr_new)
8995{ 9076{
8996 int i, j, n; 9077 int i, j, n;
@@ -9009,40 +9090,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
9009 /* Destroy deleted domains */ 9090 /* Destroy deleted domains */
9010 for (i = 0; i < ndoms_cur; i++) { 9091 for (i = 0; i < ndoms_cur; i++) {
9011 for (j = 0; j < n && !new_topology; j++) { 9092 for (j = 0; j < n && !new_topology; j++) {
9012 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9093 if (cpumask_equal(doms_cur[i], doms_new[j])
9013 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9094 && dattrs_equal(dattr_cur, i, dattr_new, j))
9014 goto match1; 9095 goto match1;
9015 } 9096 }
9016 /* no match - a current sched domain not in new doms_new[] */ 9097 /* no match - a current sched domain not in new doms_new[] */
9017 detach_destroy_domains(doms_cur + i); 9098 detach_destroy_domains(doms_cur[i]);
9018match1: 9099match1:
9019 ; 9100 ;
9020 } 9101 }
9021 9102
9022 if (doms_new == NULL) { 9103 if (doms_new == NULL) {
9023 ndoms_cur = 0; 9104 ndoms_cur = 0;
9024 doms_new = fallback_doms; 9105 doms_new = &fallback_doms;
9025 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9106 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
9026 WARN_ON_ONCE(dattr_new); 9107 WARN_ON_ONCE(dattr_new);
9027 } 9108 }
9028 9109
9029 /* Build new domains */ 9110 /* Build new domains */
9030 for (i = 0; i < ndoms_new; i++) { 9111 for (i = 0; i < ndoms_new; i++) {
9031 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9112 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9032 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9113 if (cpumask_equal(doms_new[i], doms_cur[j])
9033 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9114 && dattrs_equal(dattr_new, i, dattr_cur, j))
9034 goto match2; 9115 goto match2;
9035 } 9116 }
9036 /* no match - add a new doms_new */ 9117 /* no match - add a new doms_new */
9037 __build_sched_domains(doms_new + i, 9118 __build_sched_domains(doms_new[i],
9038 dattr_new ? dattr_new + i : NULL); 9119 dattr_new ? dattr_new + i : NULL);
9039match2: 9120match2:
9040 ; 9121 ;
9041 } 9122 }
9042 9123
9043 /* Remember the new sched domains */ 9124 /* Remember the new sched domains */
9044 if (doms_cur != fallback_doms) 9125 if (doms_cur != &fallback_doms)
9045 kfree(doms_cur); 9126 free_sched_domains(doms_cur, ndoms_cur);
9046 kfree(dattr_cur); /* kfree(NULL) is safe */ 9127 kfree(dattr_cur); /* kfree(NULL) is safe */
9047 doms_cur = doms_new; 9128 doms_cur = doms_new;
9048 dattr_cur = dattr_new; 9129 dattr_cur = dattr_new;
@@ -9153,8 +9234,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9153 switch (action) { 9234 switch (action) {
9154 case CPU_ONLINE: 9235 case CPU_ONLINE:
9155 case CPU_ONLINE_FROZEN: 9236 case CPU_ONLINE_FROZEN:
9156 case CPU_DEAD: 9237 case CPU_DOWN_PREPARE:
9157 case CPU_DEAD_FROZEN: 9238 case CPU_DOWN_PREPARE_FROZEN:
9239 case CPU_DOWN_FAILED:
9240 case CPU_DOWN_FAILED_FROZEN:
9158 partition_sched_domains(1, NULL, NULL); 9241 partition_sched_domains(1, NULL, NULL);
9159 return NOTIFY_OK; 9242 return NOTIFY_OK;
9160 9243
@@ -9201,7 +9284,7 @@ void __init sched_init_smp(void)
9201#endif 9284#endif
9202 get_online_cpus(); 9285 get_online_cpus();
9203 mutex_lock(&sched_domains_mutex); 9286 mutex_lock(&sched_domains_mutex);
9204 arch_init_sched_domains(cpu_online_mask); 9287 arch_init_sched_domains(cpu_active_mask);
9205 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9288 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9206 if (cpumask_empty(non_isolated_cpus)) 9289 if (cpumask_empty(non_isolated_cpus))
9207 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9290 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9364,10 +9447,6 @@ void __init sched_init(void)
9364#ifdef CONFIG_CPUMASK_OFFSTACK 9447#ifdef CONFIG_CPUMASK_OFFSTACK
9365 alloc_size += num_possible_cpus() * cpumask_size(); 9448 alloc_size += num_possible_cpus() * cpumask_size();
9366#endif 9449#endif
9367 /*
9368 * As sched_init() is called before page_alloc is setup,
9369 * we use alloc_bootmem().
9370 */
9371 if (alloc_size) { 9450 if (alloc_size) {
9372 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9451 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9373 9452
@@ -9522,6 +9601,8 @@ void __init sched_init(void)
9522 rq->cpu = i; 9601 rq->cpu = i;
9523 rq->online = 0; 9602 rq->online = 0;
9524 rq->migration_thread = NULL; 9603 rq->migration_thread = NULL;
9604 rq->idle_stamp = 0;
9605 rq->avg_idle = 2*sysctl_sched_migration_cost;
9525 INIT_LIST_HEAD(&rq->migration_queue); 9606 INIT_LIST_HEAD(&rq->migration_queue);
9526 rq_attach_root(rq, &def_root_domain); 9607 rq_attach_root(rq, &def_root_domain);
9527#endif 9608#endif
@@ -9571,7 +9652,9 @@ void __init sched_init(void)
9571 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9652 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9572 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9653 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9573#endif 9654#endif
9574 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9655 /* May be allocated at isolcpus cmdline parse time */
9656 if (cpu_isolated_map == NULL)
9657 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9575#endif /* SMP */ 9658#endif /* SMP */
9576 9659
9577 perf_event_init(); 9660 perf_event_init();
@@ -9765,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9765 se = kzalloc_node(sizeof(struct sched_entity), 9848 se = kzalloc_node(sizeof(struct sched_entity),
9766 GFP_KERNEL, cpu_to_node(i)); 9849 GFP_KERNEL, cpu_to_node(i));
9767 if (!se) 9850 if (!se)
9768 goto err; 9851 goto err_free_rq;
9769 9852
9770 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9853 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9771 } 9854 }
9772 9855
9773 return 1; 9856 return 1;
9774 9857
9858 err_free_rq:
9859 kfree(cfs_rq);
9775 err: 9860 err:
9776 return 0; 9861 return 0;
9777} 9862}
@@ -9853,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9853 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9938 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9854 GFP_KERNEL, cpu_to_node(i)); 9939 GFP_KERNEL, cpu_to_node(i));
9855 if (!rt_se) 9940 if (!rt_se)
9856 goto err; 9941 goto err_free_rq;
9857 9942
9858 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9943 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9859 } 9944 }
9860 9945
9861 return 1; 9946 return 1;
9862 9947
9948 err_free_rq:
9949 kfree(rt_rq);
9863 err: 9950 err:
9864 return 0; 9951 return 0;
9865} 9952}
@@ -10901,6 +10988,7 @@ void synchronize_sched_expedited(void)
10901 spin_unlock_irqrestore(&rq->lock, flags); 10988 spin_unlock_irqrestore(&rq->lock, flags);
10902 } 10989 }
10903 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10990 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10991 synchronize_sched_expedited_count++;
10904 mutex_unlock(&rcu_sched_expedited_mutex); 10992 mutex_unlock(&rcu_sched_expedited_mutex);
10905 put_online_cpus(); 10993 put_online_cpus();
10906 if (need_full_sync) 10994 if (need_full_sync)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc43..5ae24fc65d75 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
285 285
286#ifdef CONFIG_SCHEDSTATS 286#ifdef CONFIG_SCHEDSTATS
287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
288#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
288 289
289 P(yld_count); 290 P(yld_count);
290 291
291 P(sched_switch); 292 P(sched_switch);
292 P(sched_count); 293 P(sched_count);
293 P(sched_goidle); 294 P(sched_goidle);
295#ifdef CONFIG_SMP
296 P64(avg_idle);
297#endif
294 298
295 P(ttwu_count); 299 P(ttwu_count);
296 P(ttwu_local); 300 P(ttwu_local);
@@ -305,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
305 print_rq(m, rq, cpu); 309 print_rq(m, rq, cpu);
306} 310}
307 311
312static const char *sched_tunable_scaling_names[] = {
313 "none",
314 "logaritmic",
315 "linear"
316};
317
308static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
309{ 319{
310 u64 now = ktime_to_ns(ktime_get()); 320 u64 now = ktime_to_ns(ktime_get());
@@ -330,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
330#undef PN 340#undef PN
331#undef P 341#undef P
332 342
343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
344 sysctl_sched_tunable_scaling,
345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
346
333 for_each_online_cpu(cpu) 347 for_each_online_cpu(cpu)
334 print_cpu(m, cpu); 348 print_cpu(m, cpu);
335 349
@@ -395,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 409 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 410 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 411 PN(se.avg_wakeup);
398 PN(se.avg_running);
399 412
400 nr_switches = p->nvcsw + p->nivcsw; 413 nr_switches = p->nvcsw + p->nivcsw;
401 414
@@ -419,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
419 P(se.nr_failed_migrations_running); 432 P(se.nr_failed_migrations_running);
420 P(se.nr_failed_migrations_hot); 433 P(se.nr_failed_migrations_hot);
421 P(se.nr_forced_migrations); 434 P(se.nr_forced_migrations);
422 P(se.nr_forced2_migrations);
423 P(se.nr_wakeups); 435 P(se.nr_wakeups);
424 P(se.nr_wakeups_sync); 436 P(se.nr_wakeups_sync);
425 P(se.nr_wakeups_migrate); 437 P(se.nr_wakeups_migrate);
@@ -495,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
495 p->se.nr_failed_migrations_running = 0; 507 p->se.nr_failed_migrations_running = 0;
496 p->se.nr_failed_migrations_hot = 0; 508 p->se.nr_failed_migrations_hot = 0;
497 p->se.nr_forced_migrations = 0; 509 p->se.nr_forced_migrations = 0;
498 p->se.nr_forced2_migrations = 0;
499 p->se.nr_wakeups = 0; 510 p->se.nr_wakeups = 0;
500 p->se.nr_wakeups_sync = 0; 511 p->se.nr_wakeups_sync = 0;
501 p->se.nr_wakeups_migrate = 0; 512 p->se.nr_wakeups_migrate = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37087a7fac22..804a411838f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h>
24 25
25/* 26/*
26 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
36 */ 37 */
37unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 5000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL;
40
41/*
42 * The initial- and re-scaling of tunables is configurable
43 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
44 *
45 * Options are:
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
48 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
49 */
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
38 52
39/* 53/*
40 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 56 */
43unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 1000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44 59
45/* 60/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70 * have immediate wakeup/sleep latencies. 85 * have immediate wakeup/sleep latencies.
71 */ 86 */
72unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73 89
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 91
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
383 */ 399 */
384 400
385#ifdef CONFIG_SCHED_DEBUG 401#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 402int sched_proc_update_handler(struct ctl_table *table, int write,
387 void __user *buffer, size_t *lenp, 403 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 404 loff_t *ppos)
389{ 405{
390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 406 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
407 int factor = get_update_sysctl_factor();
391 408
392 if (ret || !write) 409 if (ret || !write)
393 return ret; 410 return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 412 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396 sysctl_sched_min_granularity); 413 sysctl_sched_min_granularity);
397 414
415#define WRT_SYSCTL(name) \
416 (normalized_sysctl_##name = sysctl_##name / (factor))
417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL
422
398 return 0; 423 return 0;
399} 424}
400#endif 425#endif
@@ -1345,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1345} 1370}
1346 1371
1347/* 1372/*
1373 * Try and locate an idle CPU in the sched_domain.
1374 */
1375static int
1376select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1377{
1378 int cpu = smp_processor_id();
1379 int prev_cpu = task_cpu(p);
1380 int i;
1381
1382 /*
1383 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
1384 * test in select_task_rq_fair) and the prev_cpu is idle then that's
1385 * always a better target than the current cpu.
1386 */
1387 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
1388 return prev_cpu;
1389
1390 /*
1391 * Otherwise, iterate the domain and find an elegible idle cpu.
1392 */
1393 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1394 if (!cpu_rq(i)->cfs.nr_running) {
1395 target = i;
1396 break;
1397 }
1398 }
1399
1400 return target;
1401}
1402
1403/*
1348 * sched_balance_self: balance the current task (running on cpu) in domains 1404 * sched_balance_self: balance the current task (running on cpu) in domains
1349 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1405 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1350 * SD_BALANCE_EXEC. 1406 * SD_BALANCE_EXEC.
@@ -1372,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1372 new_cpu = prev_cpu; 1428 new_cpu = prev_cpu;
1373 } 1429 }
1374 1430
1375 rcu_read_lock();
1376 for_each_domain(cpu, tmp) { 1431 for_each_domain(cpu, tmp) {
1377 /* 1432 /*
1378 * If power savings logic is enabled for a domain, see if we 1433 * If power savings logic is enabled for a domain, see if we
@@ -1398,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1398 want_sd = 0; 1453 want_sd = 0;
1399 } 1454 }
1400 1455
1401 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 1456 /*
1402 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 1457 * While iterating the domains looking for a spanning
1458 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
1459 * in cache sharing domains along the way.
1460 */
1461 if (want_affine) {
1462 int target = -1;
1463
1464 /*
1465 * If both cpu and prev_cpu are part of this domain,
1466 * cpu is a valid SD_WAKE_AFFINE target.
1467 */
1468 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1469 target = cpu;
1403 1470
1404 affine_sd = tmp; 1471 /*
1405 want_affine = 0; 1472 * If there's an idle sibling in this domain, make that
1473 * the wake_affine target instead of the current cpu.
1474 */
1475 if (tmp->flags & SD_PREFER_SIBLING)
1476 target = select_idle_sibling(p, tmp, target);
1477
1478 if (target >= 0) {
1479 if (tmp->flags & SD_WAKE_AFFINE) {
1480 affine_sd = tmp;
1481 want_affine = 0;
1482 }
1483 cpu = target;
1484 }
1406 } 1485 }
1407 1486
1408 if (!want_sd && !want_affine) 1487 if (!want_sd && !want_affine)
@@ -1429,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1429 update_shares(tmp); 1508 update_shares(tmp);
1430 } 1509 }
1431 1510
1432 if (affine_sd && wake_affine(affine_sd, p, sync)) { 1511 if (affine_sd && wake_affine(affine_sd, p, sync))
1433 new_cpu = cpu; 1512 return cpu;
1434 goto out;
1435 }
1436 1513
1437 while (sd) { 1514 while (sd) {
1438 int load_idx = sd->forkexec_idx; 1515 int load_idx = sd->forkexec_idx;
@@ -1473,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1473 /* while loop will break here if sd == NULL */ 1550 /* while loop will break here if sd == NULL */
1474 } 1551 }
1475 1552
1476out:
1477 rcu_read_unlock();
1478 return new_cpu; 1553 return new_cpu;
1479} 1554}
1480#endif /* CONFIG_SMP */ 1555#endif /* CONFIG_SMP */
@@ -1596,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1596 int sync = wake_flags & WF_SYNC; 1671 int sync = wake_flags & WF_SYNC;
1597 int scale = cfs_rq->nr_running >= sched_nr_latency; 1672 int scale = cfs_rq->nr_running >= sched_nr_latency;
1598 1673
1599 update_curr(cfs_rq); 1674 if (unlikely(rt_prio(p->prio)))
1600 1675 goto preempt;
1601 if (unlikely(rt_prio(p->prio))) {
1602 resched_task(curr);
1603 return;
1604 }
1605 1676
1606 if (unlikely(p->sched_class != &fair_sched_class)) 1677 if (unlikely(p->sched_class != &fair_sched_class))
1607 return; 1678 return;
@@ -1627,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1627 return; 1698 return;
1628 1699
1629 /* Idle tasks are by definition preempted by everybody. */ 1700 /* Idle tasks are by definition preempted by everybody. */
1630 if (unlikely(curr->policy == SCHED_IDLE)) { 1701 if (unlikely(curr->policy == SCHED_IDLE))
1631 resched_task(curr); 1702 goto preempt;
1632 return;
1633 }
1634 1703
1635 if ((sched_feat(WAKEUP_SYNC) && sync) || 1704 if (sched_feat(WAKEUP_SYNC) && sync)
1636 (sched_feat(WAKEUP_OVERLAP) && 1705 goto preempt;
1637 (se->avg_overlap < sysctl_sched_migration_cost &&
1638 pse->avg_overlap < sysctl_sched_migration_cost))) {
1639 resched_task(curr);
1640 return;
1641 }
1642 1706
1643 if (sched_feat(WAKEUP_RUNNING)) { 1707 if (sched_feat(WAKEUP_OVERLAP) &&
1644 if (pse->avg_running < se->avg_running) { 1708 se->avg_overlap < sysctl_sched_migration_cost &&
1645 set_next_buddy(pse); 1709 pse->avg_overlap < sysctl_sched_migration_cost)
1646 resched_task(curr); 1710 goto preempt;
1647 return;
1648 }
1649 }
1650 1711
1651 if (!sched_feat(WAKEUP_PREEMPT)) 1712 if (!sched_feat(WAKEUP_PREEMPT))
1652 return; 1713 return;
1653 1714
1715 update_curr(cfs_rq);
1654 find_matching_se(&se, &pse); 1716 find_matching_se(&se, &pse);
1655
1656 BUG_ON(!pse); 1717 BUG_ON(!pse);
1718 if (wakeup_preempt_entity(se, pse) == 1)
1719 goto preempt;
1657 1720
1658 if (wakeup_preempt_entity(se, pse) == 1) { 1721 return;
1659 resched_task(curr); 1722
1660 /* 1723preempt:
1661 * Only set the backward buddy when the current task is still 1724 resched_task(curr);
1662 * on the rq. This can happen when a wakeup gets interleaved 1725 /*
1663 * with schedule on the ->pre_schedule() or idle_balance() 1726 * Only set the backward buddy when the current task is still
1664 * point, either of which can * drop the rq lock. 1727 * on the rq. This can happen when a wakeup gets interleaved
1665 * 1728 * with schedule on the ->pre_schedule() or idle_balance()
1666 * Also, during early boot the idle thread is in the fair class, 1729 * point, either of which can * drop the rq lock.
1667 * for obvious reasons its a bad idea to schedule back to it. 1730 *
1668 */ 1731 * Also, during early boot the idle thread is in the fair class,
1669 if (unlikely(!se->on_rq || curr == rq->idle)) 1732 * for obvious reasons its a bad idea to schedule back to it.
1670 return; 1733 */
1671 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) 1734 if (unlikely(!se->on_rq || curr == rq->idle))
1672 set_last_buddy(se); 1735 return;
1673 } 1736
1737 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1738 set_last_buddy(se);
1674} 1739}
1675 1740
1676static struct task_struct *pick_next_task_fair(struct rq *rq) 1741static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1679,7 +1744,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1679 struct cfs_rq *cfs_rq = &rq->cfs; 1744 struct cfs_rq *cfs_rq = &rq->cfs;
1680 struct sched_entity *se; 1745 struct sched_entity *se;
1681 1746
1682 if (unlikely(!cfs_rq->nr_running)) 1747 if (!cfs_rq->nr_running)
1683 return NULL; 1748 return NULL;
1684 1749
1685 do { 1750 do {
@@ -1850,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1850 1915
1851 return 0; 1916 return 0;
1852} 1917}
1918
1919static void rq_online_fair(struct rq *rq)
1920{
1921 update_sysctl();
1922}
1923
1924static void rq_offline_fair(struct rq *rq)
1925{
1926 update_sysctl();
1927}
1928
1853#endif /* CONFIG_SMP */ 1929#endif /* CONFIG_SMP */
1854 1930
1855/* 1931/*
@@ -1867,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1867} 1943}
1868 1944
1869/* 1945/*
1870 * Share the fairness runtime between parent and child, thus the 1946 * called on fork with the child task as argument from the parent's context
1871 * total amount of pressure for CPU stays equal - new tasks 1947 * - child not yet on the tasklist
1872 * get a chance to run but frequent forkers are not allowed to 1948 * - preemption disabled
1873 * monopolize the CPU. Note: the parent runqueue is locked,
1874 * the child is not running yet.
1875 */ 1949 */
1876static void task_new_fair(struct rq *rq, struct task_struct *p) 1950static void task_fork_fair(struct task_struct *p)
1877{ 1951{
1878 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1952 struct cfs_rq *cfs_rq = task_cfs_rq(current);
1879 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 1953 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1880 int this_cpu = smp_processor_id(); 1954 int this_cpu = smp_processor_id();
1955 struct rq *rq = this_rq();
1956 unsigned long flags;
1957
1958 spin_lock_irqsave(&rq->lock, flags);
1881 1959
1882 sched_info_queued(p); 1960 if (unlikely(task_cpu(p) != this_cpu))
1961 __set_task_cpu(p, this_cpu);
1883 1962
1884 update_curr(cfs_rq); 1963 update_curr(cfs_rq);
1964
1885 if (curr) 1965 if (curr)
1886 se->vruntime = curr->vruntime; 1966 se->vruntime = curr->vruntime;
1887 place_entity(cfs_rq, se, 1); 1967 place_entity(cfs_rq, se, 1);
1888 1968
1889 /* 'curr' will be NULL if the child belongs to a different group */ 1969 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1890 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1891 curr && entity_before(curr, se)) {
1892 /* 1970 /*
1893 * Upon rescheduling, sched_class::put_prev_task() will place 1971 * Upon rescheduling, sched_class::put_prev_task() will place
1894 * 'current' within the tree based on its new key value. 1972 * 'current' within the tree based on its new key value.
@@ -1897,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1897 resched_task(rq->curr); 1975 resched_task(rq->curr);
1898 } 1976 }
1899 1977
1900 enqueue_task_fair(rq, p, 0); 1978 spin_unlock_irqrestore(&rq->lock, flags);
1901} 1979}
1902 1980
1903/* 1981/*
@@ -1959,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
1959} 2037}
1960#endif 2038#endif
1961 2039
1962unsigned int get_rr_interval_fair(struct task_struct *task) 2040unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
1963{ 2041{
1964 struct sched_entity *se = &task->se; 2042 struct sched_entity *se = &task->se;
1965 unsigned long flags;
1966 struct rq *rq;
1967 unsigned int rr_interval = 0; 2043 unsigned int rr_interval = 0;
1968 2044
1969 /* 2045 /*
1970 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 2046 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1971 * idle runqueue: 2047 * idle runqueue:
1972 */ 2048 */
1973 rq = task_rq_lock(task, &flags);
1974 if (rq->cfs.load.weight) 2049 if (rq->cfs.load.weight)
1975 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 2050 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1976 task_rq_unlock(rq, &flags);
1977 2051
1978 return rr_interval; 2052 return rr_interval;
1979} 2053}
@@ -1997,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
1997 2071
1998 .load_balance = load_balance_fair, 2072 .load_balance = load_balance_fair,
1999 .move_one_task = move_one_task_fair, 2073 .move_one_task = move_one_task_fair,
2074 .rq_online = rq_online_fair,
2075 .rq_offline = rq_offline_fair,
2000#endif 2076#endif
2001 2077
2002 .set_curr_task = set_curr_task_fair, 2078 .set_curr_task = set_curr_task_fair,
2003 .task_tick = task_tick_fair, 2079 .task_tick = task_tick_fair,
2004 .task_new = task_new_fair, 2080 .task_fork = task_fork_fair,
2005 2081
2006 .prio_changed = prio_changed_fair, 2082 .prio_changed = prio_changed_fair,
2007 .switched_to = switched_to_fair, 2083 .switched_to = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c7..d5059fd761d9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
54SCHED_FEAT(WAKEUP_OVERLAP, 0) 54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55 55
56/* 56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate 57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and 58 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see 59 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde3..33d5384a73a8 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task) 100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 101{
102 return 0; 102 return 0;
103} 103}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790cddb19..aecbd9c6b20c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1153 1153
1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1155 1155
1156static inline int pick_optimal_cpu(int this_cpu,
1157 const struct cpumask *mask)
1158{
1159 int first;
1160
1161 /* "this_cpu" is cheaper to preempt than a remote processor */
1162 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1163 return this_cpu;
1164
1165 first = cpumask_first(mask);
1166 if (first < nr_cpu_ids)
1167 return first;
1168
1169 return -1;
1170}
1171
1172static int find_lowest_rq(struct task_struct *task) 1156static int find_lowest_rq(struct task_struct *task)
1173{ 1157{
1174 struct sched_domain *sd; 1158 struct sched_domain *sd;
1175 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1159 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1176 int this_cpu = smp_processor_id(); 1160 int this_cpu = smp_processor_id();
1177 int cpu = task_cpu(task); 1161 int cpu = task_cpu(task);
1178 cpumask_var_t domain_mask;
1179 1162
1180 if (task->rt.nr_cpus_allowed == 1) 1163 if (task->rt.nr_cpus_allowed == 1)
1181 return -1; /* No other targets possible */ 1164 return -1; /* No other targets possible */
@@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
1198 * Otherwise, we consult the sched_domains span maps to figure 1181 * Otherwise, we consult the sched_domains span maps to figure
1199 * out which cpu is logically closest to our hot cache data. 1182 * out which cpu is logically closest to our hot cache data.
1200 */ 1183 */
1201 if (this_cpu == cpu) 1184 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1202 this_cpu = -1; /* Skip this_cpu opt if the same */ 1185 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1203
1204 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1205 for_each_domain(cpu, sd) {
1206 if (sd->flags & SD_WAKE_AFFINE) {
1207 int best_cpu;
1208 1186
1209 cpumask_and(domain_mask, 1187 for_each_domain(cpu, sd) {
1210 sched_domain_span(sd), 1188 if (sd->flags & SD_WAKE_AFFINE) {
1211 lowest_mask); 1189 int best_cpu;
1212 1190
1213 best_cpu = pick_optimal_cpu(this_cpu, 1191 /*
1214 domain_mask); 1192 * "this_cpu" is cheaper to preempt than a
1215 1193 * remote processor.
1216 if (best_cpu != -1) { 1194 */
1217 free_cpumask_var(domain_mask); 1195 if (this_cpu != -1 &&
1218 return best_cpu; 1196 cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
1219 } 1197 return this_cpu;
1220 } 1198
1199 best_cpu = cpumask_first_and(lowest_mask,
1200 sched_domain_span(sd));
1201 if (best_cpu < nr_cpu_ids)
1202 return best_cpu;
1221 } 1203 }
1222 free_cpumask_var(domain_mask);
1223 } 1204 }
1224 1205
1225 /* 1206 /*
@@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
1227 * just give the caller *something* to work with from the compatible 1208 * just give the caller *something* to work with from the compatible
1228 * locations. 1209 * locations.
1229 */ 1210 */
1230 return pick_optimal_cpu(this_cpu, lowest_mask); 1211 if (this_cpu != -1)
1212 return this_cpu;
1213
1214 cpu = cpumask_any(lowest_mask);
1215 if (cpu < nr_cpu_ids)
1216 return cpu;
1217 return -1;
1231} 1218}
1232 1219
1233/* Will lock the rq it finds */ 1220/* Will lock the rq it finds */
@@ -1734,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
1734 dequeue_pushable_task(rq, p); 1721 dequeue_pushable_task(rq, p);
1735} 1722}
1736 1723
1737unsigned int get_rr_interval_rt(struct task_struct *task) 1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1738{ 1725{
1739 /* 1726 /*
1740 * Time slice is 0 for SCHED_FIFO tasks 1727 * Time slice is 0 for SCHED_FIFO tasks
diff --git a/kernel/signal.c b/kernel/signal.c
index 6705320784fd..6b982f2cf524 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,12 +22,14 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/ratelimit.h>
25#include <linux/tracehook.h> 26#include <linux/tracehook.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
27#include <linux/freezer.h> 28#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
30#include <trace/events/sched.h> 31#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h>
31 33
32#include <asm/param.h> 34#include <asm/param.h>
33#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -41,6 +43,8 @@
41 43
42static struct kmem_cache *sigqueue_cachep; 44static struct kmem_cache *sigqueue_cachep;
43 45
46int print_fatal_signals __read_mostly;
47
44static void __user *sig_handler(struct task_struct *t, int sig) 48static void __user *sig_handler(struct task_struct *t, int sig)
45{ 49{
46 return t->sighand->action[sig - 1].sa.sa_handler; 50 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -159,7 +163,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
159{ 163{
160 unsigned long i, *s, *m, x; 164 unsigned long i, *s, *m, x;
161 int sig = 0; 165 int sig = 0;
162 166
163 s = pending->signal.sig; 167 s = pending->signal.sig;
164 m = mask->sig; 168 m = mask->sig;
165 switch (_NSIG_WORDS) { 169 switch (_NSIG_WORDS) {
@@ -184,17 +188,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
184 sig = ffz(~x) + 1; 188 sig = ffz(~x) + 1;
185 break; 189 break;
186 } 190 }
187 191
188 return sig; 192 return sig;
189} 193}
190 194
195static inline void print_dropped_signal(int sig)
196{
197 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
198
199 if (!print_fatal_signals)
200 return;
201
202 if (!__ratelimit(&ratelimit_state))
203 return;
204
205 printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
206 current->comm, current->pid, sig);
207}
208
191/* 209/*
192 * allocate a new signal queue record 210 * allocate a new signal queue record
193 * - this may be called without locks if and only if t == current, otherwise an 211 * - this may be called without locks if and only if t == current, otherwise an
194 * appopriate lock must be held to stop the target task from exiting 212 * appopriate lock must be held to stop the target task from exiting
195 */ 213 */
196static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 214static struct sigqueue *
197 int override_rlimit) 215__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
198{ 216{
199 struct sigqueue *q = NULL; 217 struct sigqueue *q = NULL;
200 struct user_struct *user; 218 struct user_struct *user;
@@ -207,10 +225,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
207 */ 225 */
208 user = get_uid(__task_cred(t)->user); 226 user = get_uid(__task_cred(t)->user);
209 atomic_inc(&user->sigpending); 227 atomic_inc(&user->sigpending);
228
210 if (override_rlimit || 229 if (override_rlimit ||
211 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
212 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
213 q = kmem_cache_alloc(sigqueue_cachep, flags); 232 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else {
234 print_dropped_signal(sig);
235 }
236
214 if (unlikely(q == NULL)) { 237 if (unlikely(q == NULL)) {
215 atomic_dec(&user->sigpending); 238 atomic_dec(&user->sigpending);
216 free_uid(user); 239 free_uid(user);
@@ -834,7 +857,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
834 struct sigqueue *q; 857 struct sigqueue *q;
835 int override_rlimit; 858 int override_rlimit;
836 859
837 trace_sched_signal_send(sig, t); 860 trace_signal_generate(sig, info, t);
838 861
839 assert_spin_locked(&t->sighand->siglock); 862 assert_spin_locked(&t->sighand->siglock);
840 863
@@ -869,7 +892,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
869 else 892 else
870 override_rlimit = 0; 893 override_rlimit = 0;
871 894
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, 895 q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit); 896 override_rlimit);
874 if (q) { 897 if (q) {
875 list_add_tail(&q->list, &pending->list); 898 list_add_tail(&q->list, &pending->list);
@@ -896,12 +919,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
896 break; 919 break;
897 } 920 }
898 } else if (!is_si_special(info)) { 921 } else if (!is_si_special(info)) {
899 if (sig >= SIGRTMIN && info->si_code != SI_USER) 922 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
900 /* 923 /*
901 * Queue overflow, abort. We may abort if the signal was rt 924 * Queue overflow, abort. We may abort if the
902 * and sent by user using something other than kill(). 925 * signal was rt and sent by user using something
903 */ 926 * other than kill().
927 */
928 trace_signal_overflow_fail(sig, group, info);
904 return -EAGAIN; 929 return -EAGAIN;
930 } else {
931 /*
932 * This is a silent loss of information. We still
933 * send the signal, but the *info bits are lost.
934 */
935 trace_signal_lose_info(sig, group, info);
936 }
905 } 937 }
906 938
907out_set: 939out_set:
@@ -925,8 +957,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
925 return __send_signal(sig, info, t, group, from_ancestor_ns); 957 return __send_signal(sig, info, t, group, from_ancestor_ns);
926} 958}
927 959
928int print_fatal_signals;
929
930static void print_fatal_signal(struct pt_regs *regs, int signr) 960static void print_fatal_signal(struct pt_regs *regs, int signr)
931{ 961{
932 printk("%s/%d: potentially unexpected fatal signal %d.\n", 962 printk("%s/%d: potentially unexpected fatal signal %d.\n",
@@ -1293,19 +1323,19 @@ EXPORT_SYMBOL(kill_pid);
1293 * These functions support sending signals using preallocated sigqueue 1323 * These functions support sending signals using preallocated sigqueue
1294 * structures. This is needed "because realtime applications cannot 1324 * structures. This is needed "because realtime applications cannot
1295 * afford to lose notifications of asynchronous events, like timer 1325 * afford to lose notifications of asynchronous events, like timer
1296 * expirations or I/O completions". In the case of Posix Timers 1326 * expirations or I/O completions". In the case of Posix Timers
1297 * we allocate the sigqueue structure from the timer_create. If this 1327 * we allocate the sigqueue structure from the timer_create. If this
1298 * allocation fails we are able to report the failure to the application 1328 * allocation fails we are able to report the failure to the application
1299 * with an EAGAIN error. 1329 * with an EAGAIN error.
1300 */ 1330 */
1301
1302struct sigqueue *sigqueue_alloc(void) 1331struct sigqueue *sigqueue_alloc(void)
1303{ 1332{
1304 struct sigqueue *q; 1333 struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
1305 1334
1306 if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) 1335 if (q)
1307 q->flags |= SIGQUEUE_PREALLOC; 1336 q->flags |= SIGQUEUE_PREALLOC;
1308 return(q); 1337
1338 return q;
1309} 1339}
1310 1340
1311void sigqueue_free(struct sigqueue *q) 1341void sigqueue_free(struct sigqueue *q)
@@ -1839,6 +1869,9 @@ relock:
1839 ka = &sighand->action[signr-1]; 1869 ka = &sighand->action[signr-1];
1840 } 1870 }
1841 1871
1872 /* Trace actually delivered signals. */
1873 trace_signal_deliver(signr, info, ka);
1874
1842 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1875 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1843 continue; 1876 continue;
1844 if (ka->sa.sa_handler != SIG_DFL) { 1877 if (ka->sa.sa_handler != SIG_DFL) {
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 00889bd3c590..7494bbf5a270 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -49,7 +49,6 @@ static const int slow_work_max_vslow = 99;
49 49
50ctl_table slow_work_sysctls[] = { 50ctl_table slow_work_sysctls[] = {
51 { 51 {
52 .ctl_name = CTL_UNNUMBERED,
53 .procname = "min-threads", 52 .procname = "min-threads",
54 .data = &slow_work_min_threads, 53 .data = &slow_work_min_threads,
55 .maxlen = sizeof(unsigned), 54 .maxlen = sizeof(unsigned),
@@ -59,7 +58,6 @@ ctl_table slow_work_sysctls[] = {
59 .extra2 = &slow_work_max_threads, 58 .extra2 = &slow_work_max_threads,
60 }, 59 },
61 { 60 {
62 .ctl_name = CTL_UNNUMBERED,
63 .procname = "max-threads", 61 .procname = "max-threads",
64 .data = &slow_work_max_threads, 62 .data = &slow_work_max_threads,
65 .maxlen = sizeof(unsigned), 63 .maxlen = sizeof(unsigned),
@@ -69,16 +67,15 @@ ctl_table slow_work_sysctls[] = {
69 .extra2 = (void *) &slow_work_max_max_threads, 67 .extra2 = (void *) &slow_work_max_max_threads,
70 }, 68 },
71 { 69 {
72 .ctl_name = CTL_UNNUMBERED,
73 .procname = "vslow-percentage", 70 .procname = "vslow-percentage",
74 .data = &vslow_work_proportion, 71 .data = &vslow_work_proportion,
75 .maxlen = sizeof(unsigned), 72 .maxlen = sizeof(unsigned),
76 .mode = 0644, 73 .mode = 0644,
77 .proc_handler = &proc_dointvec_minmax, 74 .proc_handler = proc_dointvec_minmax,
78 .extra1 = (void *) &slow_work_min_vslow, 75 .extra1 = (void *) &slow_work_min_vslow,
79 .extra2 = (void *) &slow_work_max_vslow, 76 .extra2 = (void *) &slow_work_max_vslow,
80 }, 77 },
81 { .ctl_name = 0 } 78 {}
82}; 79};
83#endif 80#endif
84 81
diff --git a/kernel/smp.c b/kernel/smp.c
index c9d1c7835c2f..a8c76069cf50 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -265,9 +265,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
265 * @info: An arbitrary pointer to pass to the function. 265 * @info: An arbitrary pointer to pass to the function.
266 * @wait: If true, wait until function has completed on other CPUs. 266 * @wait: If true, wait until function has completed on other CPUs.
267 * 267 *
268 * Returns 0 on success, else a negative status code. Note that @wait 268 * Returns 0 on success, else a negative status code.
269 * will be implicitly turned on in case of allocation failures, since
270 * we fall back to on-stack allocation.
271 */ 269 */
272int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
273 int wait) 271 int wait)
@@ -321,6 +319,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
321} 319}
322EXPORT_SYMBOL(smp_call_function_single); 320EXPORT_SYMBOL(smp_call_function_single);
323 321
322/*
323 * smp_call_function_any - Run a function on any of the given cpus
324 * @mask: The mask of cpus it can run on.
325 * @func: The function to run. This must be fast and non-blocking.
326 * @info: An arbitrary pointer to pass to the function.
327 * @wait: If true, wait until function has completed.
328 *
329 * Returns 0 on success, else a negative status code (if no cpus were online).
330 * Note that @wait will be implicitly turned on in case of allocation failures,
331 * since we fall back to on-stack allocation.
332 *
333 * Selection preference:
334 * 1) current cpu if in @mask
335 * 2) any cpu of current node if in @mask
336 * 3) any other online cpu in @mask
337 */
338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait)
340{
341 unsigned int cpu;
342 const struct cpumask *nodemask;
343 int ret;
344
345 /* Try for same CPU (cheapest) */
346 cpu = get_cpu();
347 if (cpumask_test_cpu(cpu, mask))
348 goto call;
349
350 /* Try for same node. */
351 nodemask = cpumask_of_node(cpu);
352 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
353 cpu = cpumask_next_and(cpu, nodemask, mask)) {
354 if (cpu_online(cpu))
355 goto call;
356 }
357
358 /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
359 cpu = cpumask_any_and(mask, cpu_online_mask);
360call:
361 ret = smp_call_function_single(cpu, func, info, wait);
362 put_cpu();
363 return ret;
364}
365EXPORT_SYMBOL_GPL(smp_call_function_any);
366
324/** 367/**
325 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on another CPU
326 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
@@ -355,9 +398,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
355 * @wait: If true, wait (atomically) until function has completed 398 * @wait: If true, wait (atomically) until function has completed
356 * on other CPUs. 399 * on other CPUs.
357 * 400 *
358 * If @wait is true, then returns once @func has returned. Note that @wait 401 * If @wait is true, then returns once @func has returned.
359 * will be implicitly turned on in case of allocation failures, since
360 * we fall back to on-stack allocation.
361 * 402 *
362 * You must not call this function with disabled interrupts or from a 403 * You must not call this function with disabled interrupts or from a
363 * hardware interrupt handler or from a bottom half handler. Preemption 404 * hardware interrupt handler or from a bottom half handler. Preemption
@@ -443,8 +484,7 @@ EXPORT_SYMBOL(smp_call_function_many);
443 * Returns 0. 484 * Returns 0.
444 * 485 *
445 * If @wait is true, then returns once @func has returned; otherwise 486 * If @wait is true, then returns once @func has returned; otherwise
446 * it returns just before the target cpu calls @func. In case of allocation 487 * it returns just before the target cpu calls @func.
447 * failure, @wait will be implicitly turned on.
448 * 488 *
449 * You must not call this function with disabled interrupts or from a 489 * You must not call this function with disabled interrupts or from a
450 * hardware interrupt handler or from a bottom half handler. 490 * hardware interrupt handler or from a bottom half handler.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5216e0..21939d9e830e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -302,9 +302,9 @@ void irq_exit(void)
302 if (!in_interrupt() && local_softirq_pending()) 302 if (!in_interrupt() && local_softirq_pending())
303 invoke_softirq(); 303 invoke_softirq();
304 304
305 rcu_irq_exit();
305#ifdef CONFIG_NO_HZ 306#ifdef CONFIG_NO_HZ
306 /* Make sure that timer wheel updates are propagated */ 307 /* Make sure that timer wheel updates are propagated */
307 rcu_irq_exit();
308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
309 tick_nohz_stop_sched_tick(0); 309 tick_nohz_stop_sched_tick(0);
310#endif 310#endif
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2f..41e042219ff6 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,145 +21,28 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
25int __lockfunc _spin_trylock(spinlock_t *lock)
26{
27 return __spin_trylock(lock);
28}
29EXPORT_SYMBOL(_spin_trylock);
30#endif
31
32#ifndef _read_trylock
33int __lockfunc _read_trylock(rwlock_t *lock)
34{
35 return __read_trylock(lock);
36}
37EXPORT_SYMBOL(_read_trylock);
38#endif
39
40#ifndef _write_trylock
41int __lockfunc _write_trylock(rwlock_t *lock)
42{
43 return __write_trylock(lock);
44}
45EXPORT_SYMBOL(_write_trylock);
46#endif
47
48/* 24/*
49 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
50 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 26 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
51 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 27 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
52 */ 28 */
53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 29#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
54
55#ifndef _read_lock
56void __lockfunc _read_lock(rwlock_t *lock)
57{
58 __read_lock(lock);
59}
60EXPORT_SYMBOL(_read_lock);
61#endif
62
63#ifndef _spin_lock_irqsave
64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
65{
66 return __spin_lock_irqsave(lock);
67}
68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
70
71#ifndef _spin_lock_irq
72void __lockfunc _spin_lock_irq(spinlock_t *lock)
73{
74 __spin_lock_irq(lock);
75}
76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
78
79#ifndef _spin_lock_bh
80void __lockfunc _spin_lock_bh(spinlock_t *lock)
81{
82 __spin_lock_bh(lock);
83}
84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
86
87#ifndef _read_lock_irqsave
88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
89{
90 return __read_lock_irqsave(lock);
91}
92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
94
95#ifndef _read_lock_irq
96void __lockfunc _read_lock_irq(rwlock_t *lock)
97{
98 __read_lock_irq(lock);
99}
100EXPORT_SYMBOL(_read_lock_irq);
101#endif
102
103#ifndef _read_lock_bh
104void __lockfunc _read_lock_bh(rwlock_t *lock)
105{
106 __read_lock_bh(lock);
107}
108EXPORT_SYMBOL(_read_lock_bh);
109#endif
110
111#ifndef _write_lock_irqsave
112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
113{
114 return __write_lock_irqsave(lock);
115}
116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
118
119#ifndef _write_lock_irq
120void __lockfunc _write_lock_irq(rwlock_t *lock)
121{
122 __write_lock_irq(lock);
123}
124EXPORT_SYMBOL(_write_lock_irq);
125#endif
126
127#ifndef _write_lock_bh
128void __lockfunc _write_lock_bh(rwlock_t *lock)
129{
130 __write_lock_bh(lock);
131}
132EXPORT_SYMBOL(_write_lock_bh);
133#endif
134
135#ifndef _spin_lock
136void __lockfunc _spin_lock(spinlock_t *lock)
137{
138 __spin_lock(lock);
139}
140EXPORT_SYMBOL(_spin_lock);
141#endif
142
143#ifndef _write_lock
144void __lockfunc _write_lock(rwlock_t *lock)
145{
146 __write_lock(lock);
147}
148EXPORT_SYMBOL(_write_lock);
149#endif
150
151#else /* CONFIG_PREEMPT: */
152
153/* 30/*
31 * The __lock_function inlines are taken from
32 * include/linux/spinlock_api_smp.h
33 */
34#else
35/*
36 * We build the __lock_function inlines here. They are too large for
37 * inlining all over the place, but here is only one user per function
38 * which embedds them into the calling _lock_function below.
39 *
154 * This could be a long-held lock. We both prepare to spin for a long 40 * This could be a long-held lock. We both prepare to spin for a long
155 * time (making _this_ CPU preemptable if possible), and we also signal 41 * time (making _this_ CPU preemptable if possible), and we also signal
156 * towards that other CPU that it should break the lock ASAP. 42 * towards that other CPU that it should break the lock ASAP.
157 *
158 * (We do this in a function because inlining it would be excessive.)
159 */ 43 */
160
161#define BUILD_LOCK_OPS(op, locktype) \ 44#define BUILD_LOCK_OPS(op, locktype) \
162void __lockfunc _##op##_lock(locktype##_t *lock) \ 45void __lockfunc __##op##_lock(locktype##_t *lock) \
163{ \ 46{ \
164 for (;;) { \ 47 for (;;) { \
165 preempt_disable(); \ 48 preempt_disable(); \
@@ -175,9 +58,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \
175 (lock)->break_lock = 0; \ 58 (lock)->break_lock = 0; \
176} \ 59} \
177 \ 60 \
178EXPORT_SYMBOL(_##op##_lock); \ 61unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \
179 \
180unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
181{ \ 62{ \
182 unsigned long flags; \ 63 unsigned long flags; \
183 \ 64 \
@@ -198,16 +79,12 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
198 return flags; \ 79 return flags; \
199} \ 80} \
200 \ 81 \
201EXPORT_SYMBOL(_##op##_lock_irqsave); \ 82void __lockfunc __##op##_lock_irq(locktype##_t *lock) \
202 \
203void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
204{ \ 83{ \
205 _##op##_lock_irqsave(lock); \ 84 _##op##_lock_irqsave(lock); \
206} \ 85} \
207 \ 86 \
208EXPORT_SYMBOL(_##op##_lock_irq); \ 87void __lockfunc __##op##_lock_bh(locktype##_t *lock) \
209 \
210void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
211{ \ 88{ \
212 unsigned long flags; \ 89 unsigned long flags; \
213 \ 90 \
@@ -220,23 +97,21 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
220 local_bh_disable(); \ 97 local_bh_disable(); \
221 local_irq_restore(flags); \ 98 local_irq_restore(flags); \
222} \ 99} \
223 \
224EXPORT_SYMBOL(_##op##_lock_bh)
225 100
226/* 101/*
227 * Build preemption-friendly versions of the following 102 * Build preemption-friendly versions of the following
228 * lock-spinning functions: 103 * lock-spinning functions:
229 * 104 *
230 * _[spin|read|write]_lock() 105 * __[spin|read|write]_lock()
231 * _[spin|read|write]_lock_irq() 106 * __[spin|read|write]_lock_irq()
232 * _[spin|read|write]_lock_irqsave() 107 * __[spin|read|write]_lock_irqsave()
233 * _[spin|read|write]_lock_bh() 108 * __[spin|read|write]_lock_bh()
234 */ 109 */
235BUILD_LOCK_OPS(spin, spinlock); 110BUILD_LOCK_OPS(spin, spinlock);
236BUILD_LOCK_OPS(read, rwlock); 111BUILD_LOCK_OPS(read, rwlock);
237BUILD_LOCK_OPS(write, rwlock); 112BUILD_LOCK_OPS(write, rwlock);
238 113
239#endif /* CONFIG_PREEMPT */ 114#endif
240 115
241#ifdef CONFIG_DEBUG_LOCK_ALLOC 116#ifdef CONFIG_DEBUG_LOCK_ALLOC
242 117
@@ -248,7 +123,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
248} 123}
249EXPORT_SYMBOL(_spin_lock_nested); 124EXPORT_SYMBOL(_spin_lock_nested);
250 125
251unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 126unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock,
127 int subclass)
252{ 128{
253 unsigned long flags; 129 unsigned long flags;
254 130
@@ -272,7 +148,127 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
272 148
273#endif 149#endif
274 150
275#ifndef _spin_unlock 151#ifndef CONFIG_INLINE_SPIN_TRYLOCK
152int __lockfunc _spin_trylock(spinlock_t *lock)
153{
154 return __spin_trylock(lock);
155}
156EXPORT_SYMBOL(_spin_trylock);
157#endif
158
159#ifndef CONFIG_INLINE_READ_TRYLOCK
160int __lockfunc _read_trylock(rwlock_t *lock)
161{
162 return __read_trylock(lock);
163}
164EXPORT_SYMBOL(_read_trylock);
165#endif
166
167#ifndef CONFIG_INLINE_WRITE_TRYLOCK
168int __lockfunc _write_trylock(rwlock_t *lock)
169{
170 return __write_trylock(lock);
171}
172EXPORT_SYMBOL(_write_trylock);
173#endif
174
175#ifndef CONFIG_INLINE_READ_LOCK
176void __lockfunc _read_lock(rwlock_t *lock)
177{
178 __read_lock(lock);
179}
180EXPORT_SYMBOL(_read_lock);
181#endif
182
183#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
184unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
185{
186 return __spin_lock_irqsave(lock);
187}
188EXPORT_SYMBOL(_spin_lock_irqsave);
189#endif
190
191#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
192void __lockfunc _spin_lock_irq(spinlock_t *lock)
193{
194 __spin_lock_irq(lock);
195}
196EXPORT_SYMBOL(_spin_lock_irq);
197#endif
198
199#ifndef CONFIG_INLINE_SPIN_LOCK_BH
200void __lockfunc _spin_lock_bh(spinlock_t *lock)
201{
202 __spin_lock_bh(lock);
203}
204EXPORT_SYMBOL(_spin_lock_bh);
205#endif
206
207#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
208unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
209{
210 return __read_lock_irqsave(lock);
211}
212EXPORT_SYMBOL(_read_lock_irqsave);
213#endif
214
215#ifndef CONFIG_INLINE_READ_LOCK_IRQ
216void __lockfunc _read_lock_irq(rwlock_t *lock)
217{
218 __read_lock_irq(lock);
219}
220EXPORT_SYMBOL(_read_lock_irq);
221#endif
222
223#ifndef CONFIG_INLINE_READ_LOCK_BH
224void __lockfunc _read_lock_bh(rwlock_t *lock)
225{
226 __read_lock_bh(lock);
227}
228EXPORT_SYMBOL(_read_lock_bh);
229#endif
230
231#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
232unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
233{
234 return __write_lock_irqsave(lock);
235}
236EXPORT_SYMBOL(_write_lock_irqsave);
237#endif
238
239#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
240void __lockfunc _write_lock_irq(rwlock_t *lock)
241{
242 __write_lock_irq(lock);
243}
244EXPORT_SYMBOL(_write_lock_irq);
245#endif
246
247#ifndef CONFIG_INLINE_WRITE_LOCK_BH
248void __lockfunc _write_lock_bh(rwlock_t *lock)
249{
250 __write_lock_bh(lock);
251}
252EXPORT_SYMBOL(_write_lock_bh);
253#endif
254
255#ifndef CONFIG_INLINE_SPIN_LOCK
256void __lockfunc _spin_lock(spinlock_t *lock)
257{
258 __spin_lock(lock);
259}
260EXPORT_SYMBOL(_spin_lock);
261#endif
262
263#ifndef CONFIG_INLINE_WRITE_LOCK
264void __lockfunc _write_lock(rwlock_t *lock)
265{
266 __write_lock(lock);
267}
268EXPORT_SYMBOL(_write_lock);
269#endif
270
271#ifndef CONFIG_INLINE_SPIN_UNLOCK
276void __lockfunc _spin_unlock(spinlock_t *lock) 272void __lockfunc _spin_unlock(spinlock_t *lock)
277{ 273{
278 __spin_unlock(lock); 274 __spin_unlock(lock);
@@ -280,7 +276,7 @@ void __lockfunc _spin_unlock(spinlock_t *lock)
280EXPORT_SYMBOL(_spin_unlock); 276EXPORT_SYMBOL(_spin_unlock);
281#endif 277#endif
282 278
283#ifndef _write_unlock 279#ifndef CONFIG_INLINE_WRITE_UNLOCK
284void __lockfunc _write_unlock(rwlock_t *lock) 280void __lockfunc _write_unlock(rwlock_t *lock)
285{ 281{
286 __write_unlock(lock); 282 __write_unlock(lock);
@@ -288,7 +284,7 @@ void __lockfunc _write_unlock(rwlock_t *lock)
288EXPORT_SYMBOL(_write_unlock); 284EXPORT_SYMBOL(_write_unlock);
289#endif 285#endif
290 286
291#ifndef _read_unlock 287#ifndef CONFIG_INLINE_READ_UNLOCK
292void __lockfunc _read_unlock(rwlock_t *lock) 288void __lockfunc _read_unlock(rwlock_t *lock)
293{ 289{
294 __read_unlock(lock); 290 __read_unlock(lock);
@@ -296,7 +292,7 @@ void __lockfunc _read_unlock(rwlock_t *lock)
296EXPORT_SYMBOL(_read_unlock); 292EXPORT_SYMBOL(_read_unlock);
297#endif 293#endif
298 294
299#ifndef _spin_unlock_irqrestore 295#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 296void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
301{ 297{
302 __spin_unlock_irqrestore(lock, flags); 298 __spin_unlock_irqrestore(lock, flags);
@@ -304,7 +300,7 @@ void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
304EXPORT_SYMBOL(_spin_unlock_irqrestore); 300EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif 301#endif
306 302
307#ifndef _spin_unlock_irq 303#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
308void __lockfunc _spin_unlock_irq(spinlock_t *lock) 304void __lockfunc _spin_unlock_irq(spinlock_t *lock)
309{ 305{
310 __spin_unlock_irq(lock); 306 __spin_unlock_irq(lock);
@@ -312,7 +308,7 @@ void __lockfunc _spin_unlock_irq(spinlock_t *lock)
312EXPORT_SYMBOL(_spin_unlock_irq); 308EXPORT_SYMBOL(_spin_unlock_irq);
313#endif 309#endif
314 310
315#ifndef _spin_unlock_bh 311#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
316void __lockfunc _spin_unlock_bh(spinlock_t *lock) 312void __lockfunc _spin_unlock_bh(spinlock_t *lock)
317{ 313{
318 __spin_unlock_bh(lock); 314 __spin_unlock_bh(lock);
@@ -320,7 +316,7 @@ void __lockfunc _spin_unlock_bh(spinlock_t *lock)
320EXPORT_SYMBOL(_spin_unlock_bh); 316EXPORT_SYMBOL(_spin_unlock_bh);
321#endif 317#endif
322 318
323#ifndef _read_unlock_irqrestore 319#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 320void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
325{ 321{
326 __read_unlock_irqrestore(lock, flags); 322 __read_unlock_irqrestore(lock, flags);
@@ -328,7 +324,7 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
328EXPORT_SYMBOL(_read_unlock_irqrestore); 324EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif 325#endif
330 326
331#ifndef _read_unlock_irq 327#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
332void __lockfunc _read_unlock_irq(rwlock_t *lock) 328void __lockfunc _read_unlock_irq(rwlock_t *lock)
333{ 329{
334 __read_unlock_irq(lock); 330 __read_unlock_irq(lock);
@@ -336,7 +332,7 @@ void __lockfunc _read_unlock_irq(rwlock_t *lock)
336EXPORT_SYMBOL(_read_unlock_irq); 332EXPORT_SYMBOL(_read_unlock_irq);
337#endif 333#endif
338 334
339#ifndef _read_unlock_bh 335#ifndef CONFIG_INLINE_READ_UNLOCK_BH
340void __lockfunc _read_unlock_bh(rwlock_t *lock) 336void __lockfunc _read_unlock_bh(rwlock_t *lock)
341{ 337{
342 __read_unlock_bh(lock); 338 __read_unlock_bh(lock);
@@ -344,7 +340,7 @@ void __lockfunc _read_unlock_bh(rwlock_t *lock)
344EXPORT_SYMBOL(_read_unlock_bh); 340EXPORT_SYMBOL(_read_unlock_bh);
345#endif 341#endif
346 342
347#ifndef _write_unlock_irqrestore 343#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 344void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
349{ 345{
350 __write_unlock_irqrestore(lock, flags); 346 __write_unlock_irqrestore(lock, flags);
@@ -352,7 +348,7 @@ void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
352EXPORT_SYMBOL(_write_unlock_irqrestore); 348EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif 349#endif
354 350
355#ifndef _write_unlock_irq 351#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
356void __lockfunc _write_unlock_irq(rwlock_t *lock) 352void __lockfunc _write_unlock_irq(rwlock_t *lock)
357{ 353{
358 __write_unlock_irq(lock); 354 __write_unlock_irq(lock);
@@ -360,7 +356,7 @@ void __lockfunc _write_unlock_irq(rwlock_t *lock)
360EXPORT_SYMBOL(_write_unlock_irq); 356EXPORT_SYMBOL(_write_unlock_irq);
361#endif 357#endif
362 358
363#ifndef _write_unlock_bh 359#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
364void __lockfunc _write_unlock_bh(rwlock_t *lock) 360void __lockfunc _write_unlock_bh(rwlock_t *lock)
365{ 361{
366 __write_unlock_bh(lock); 362 __write_unlock_bh(lock);
@@ -368,7 +364,7 @@ void __lockfunc _write_unlock_bh(rwlock_t *lock)
368EXPORT_SYMBOL(_write_unlock_bh); 364EXPORT_SYMBOL(_write_unlock_bh);
369#endif 365#endif
370 366
371#ifndef _spin_trylock_bh 367#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
372int __lockfunc _spin_trylock_bh(spinlock_t *lock) 368int __lockfunc _spin_trylock_bh(spinlock_t *lock)
373{ 369{
374 return __spin_trylock_bh(lock); 370 return __spin_trylock_bh(lock);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..818d7d9aa03c 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp)
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM); 50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 51}
52EXPORT_SYMBOL_GPL(init_srcu_struct);
52 53
53/* 54/*
54 * srcu_readers_active_idx -- returns approximate number of readers 55 * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
97 free_percpu(sp->per_cpu_ref); 98 free_percpu(sp->per_cpu_ref);
98 sp->per_cpu_ref = NULL; 99 sp->per_cpu_ref = NULL;
99} 100}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
100 102
101/** 103/**
102 * srcu_read_lock - register a new reader for an SRCU-protected structure. 104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
@@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp)
118 preempt_enable(); 120 preempt_enable();
119 return idx; 121 return idx;
120} 122}
123EXPORT_SYMBOL_GPL(srcu_read_lock);
121 124
122/** 125/**
123 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. 126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
@@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx)
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
137 preempt_enable(); 140 preempt_enable();
138} 141}
142EXPORT_SYMBOL_GPL(srcu_read_unlock);
139 143
140/** 144/*
141 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
142 * @sp: srcu_struct with which to synchronize.
143 *
144 * Flip the completed counter, and wait for the old count to drain to zero.
145 * As with classic RCU, the updater must use some separate means of
146 * synchronizing concurrent updates. Can block; must be called from
147 * process context.
148 *
149 * Note that it is illegal to call synchornize_srcu() from the corresponding
150 * SRCU read-side critical section; doing so will result in deadlock.
151 * However, it is perfectly legal to call synchronize_srcu() on one
152 * srcu_struct from some other srcu_struct's read-side critical section.
153 */ 146 */
154void synchronize_srcu(struct srcu_struct *sp) 147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
155{ 148{
156 int idx; 149 int idx;
157 150
@@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp)
173 return; 166 return;
174 } 167 }
175 168
176 synchronize_sched(); /* Force memory barrier on all CPUs. */ 169 sync_func(); /* Force memory barrier on all CPUs. */
177 170
178 /* 171 /*
179 * The preceding synchronize_sched() ensures that any CPU that 172 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp)
190 idx = sp->completed & 0x1; 183 idx = sp->completed & 0x1;
191 sp->completed++; 184 sp->completed++;
192 185
193 synchronize_sched(); /* Force memory barrier on all CPUs. */ 186 sync_func(); /* Force memory barrier on all CPUs. */
194 187
195 /* 188 /*
196 * At this point, because of the preceding synchronize_sched(), 189 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp)
203 while (srcu_readers_active_idx(sp, idx)) 196 while (srcu_readers_active_idx(sp, idx))
204 schedule_timeout_interruptible(1); 197 schedule_timeout_interruptible(1);
205 198
206 synchronize_sched(); /* Force memory barrier on all CPUs. */ 199 sync_func(); /* Force memory barrier on all CPUs. */
207 200
208 /* 201 /*
209 * The preceding synchronize_sched() forces all srcu_read_unlock() 202 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp)
237} 230}
238 231
239/** 232/**
233 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
234 * @sp: srcu_struct with which to synchronize.
235 *
236 * Flip the completed counter, and wait for the old count to drain to zero.
237 * As with classic RCU, the updater must use some separate means of
238 * synchronizing concurrent updates. Can block; must be called from
239 * process context.
240 *
241 * Note that it is illegal to call synchronize_srcu() from the corresponding
242 * SRCU read-side critical section; doing so will result in deadlock.
243 * However, it is perfectly legal to call synchronize_srcu() on one
244 * srcu_struct from some other srcu_struct's read-side critical section.
245 */
246void synchronize_srcu(struct srcu_struct *sp)
247{
248 __synchronize_srcu(sp, synchronize_sched);
249}
250EXPORT_SYMBOL_GPL(synchronize_srcu);
251
252/**
253 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
254 * @sp: srcu_struct with which to synchronize.
255 *
256 * Flip the completed counter, and wait for the old count to drain to zero.
257 * As with classic RCU, the updater must use some separate means of
258 * synchronizing concurrent updates. Can block; must be called from
259 * process context.
260 *
261 * Note that it is illegal to call synchronize_srcu_expedited()
262 * from the corresponding SRCU read-side critical section; doing so
263 * will result in deadlock. However, it is perfectly legal to call
264 * synchronize_srcu_expedited() on one srcu_struct from some other
265 * srcu_struct's read-side critical section.
266 */
267void synchronize_srcu_expedited(struct srcu_struct *sp)
268{
269 __synchronize_srcu(sp, synchronize_sched_expedited);
270}
271EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
272
273/**
240 * srcu_batches_completed - return batches completed. 274 * srcu_batches_completed - return batches completed.
241 * @sp: srcu_struct on which to report batch completion. 275 * @sp: srcu_struct on which to report batch completion.
242 * 276 *
@@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
248{ 282{
249 return sp->completed; 283 return sp->completed;
250} 284}
251
252EXPORT_SYMBOL_GPL(init_srcu_struct);
253EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
254EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 285EXPORT_SYMBOL_GPL(srcu_batches_completed);
diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760d9c51..585d6cd10040 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/smp_lock.h>
12#include <linux/notifier.h> 11#include <linux/notifier.h>
13#include <linux/reboot.h> 12#include <linux/reboot.h>
14#include <linux/prctl.h> 13#include <linux/prctl.h>
@@ -349,6 +348,9 @@ void kernel_power_off(void)
349 machine_power_off(); 348 machine_power_off();
350} 349}
351EXPORT_SYMBOL_GPL(kernel_power_off); 350EXPORT_SYMBOL_GPL(kernel_power_off);
351
352static DEFINE_MUTEX(reboot_mutex);
353
352/* 354/*
353 * Reboot system call: for obvious reasons only root may call it, 355 * Reboot system call: for obvious reasons only root may call it,
354 * and even root needs to set up some magic numbers in the registers 356 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
381 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 383 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
382 cmd = LINUX_REBOOT_CMD_HALT; 384 cmd = LINUX_REBOOT_CMD_HALT;
383 385
384 lock_kernel(); 386 mutex_lock(&reboot_mutex);
385 switch (cmd) { 387 switch (cmd) {
386 case LINUX_REBOOT_CMD_RESTART: 388 case LINUX_REBOOT_CMD_RESTART:
387 kernel_restart(NULL); 389 kernel_restart(NULL);
@@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 399
398 case LINUX_REBOOT_CMD_HALT: 400 case LINUX_REBOOT_CMD_HALT:
399 kernel_halt(); 401 kernel_halt();
400 unlock_kernel();
401 do_exit(0); 402 do_exit(0);
402 panic("cannot halt"); 403 panic("cannot halt");
403 404
404 case LINUX_REBOOT_CMD_POWER_OFF: 405 case LINUX_REBOOT_CMD_POWER_OFF:
405 kernel_power_off(); 406 kernel_power_off();
406 unlock_kernel();
407 do_exit(0); 407 do_exit(0);
408 break; 408 break;
409 409
410 case LINUX_REBOOT_CMD_RESTART2: 410 case LINUX_REBOOT_CMD_RESTART2:
411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
412 unlock_kernel(); 412 ret = -EFAULT;
413 return -EFAULT; 413 break;
414 } 414 }
415 buffer[sizeof(buffer) - 1] = '\0'; 415 buffer[sizeof(buffer) - 1] = '\0';
416 416
@@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
433 ret = -EINVAL; 433 ret = -EINVAL;
434 break; 434 break;
435 } 435 }
436 unlock_kernel(); 436 mutex_unlock(&reboot_mutex);
437 return ret; 437 return ret;
438} 438}
439 439
@@ -911,16 +911,15 @@ change_okay:
911 911
912void do_sys_times(struct tms *tms) 912void do_sys_times(struct tms *tms)
913{ 913{
914 struct task_cputime cputime; 914 cputime_t tgutime, tgstime, cutime, cstime;
915 cputime_t cutime, cstime;
916 915
917 thread_group_cputime(current, &cputime);
918 spin_lock_irq(&current->sighand->siglock); 916 spin_lock_irq(&current->sighand->siglock);
917 thread_group_times(current, &tgutime, &tgstime);
919 cutime = current->signal->cutime; 918 cutime = current->signal->cutime;
920 cstime = current->signal->cstime; 919 cstime = current->signal->cstime;
921 spin_unlock_irq(&current->sighand->siglock); 920 spin_unlock_irq(&current->sighand->siglock);
922 tms->tms_utime = cputime_to_clock_t(cputime.utime); 921 tms->tms_utime = cputime_to_clock_t(tgutime);
923 tms->tms_stime = cputime_to_clock_t(cputime.stime); 922 tms->tms_stime = cputime_to_clock_t(tgstime);
924 tms->tms_cutime = cputime_to_clock_t(cutime); 923 tms->tms_cutime = cputime_to_clock_t(cutime);
925 tms->tms_cstime = cputime_to_clock_t(cstime); 924 tms->tms_cstime = cputime_to_clock_t(cstime);
926} 925}
@@ -1338,16 +1337,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1338{ 1337{
1339 struct task_struct *t; 1338 struct task_struct *t;
1340 unsigned long flags; 1339 unsigned long flags;
1341 cputime_t utime, stime; 1340 cputime_t tgutime, tgstime, utime, stime;
1342 struct task_cputime cputime;
1343 unsigned long maxrss = 0; 1341 unsigned long maxrss = 0;
1344 1342
1345 memset((char *) r, 0, sizeof *r); 1343 memset((char *) r, 0, sizeof *r);
1346 utime = stime = cputime_zero; 1344 utime = stime = cputime_zero;
1347 1345
1348 if (who == RUSAGE_THREAD) { 1346 if (who == RUSAGE_THREAD) {
1349 utime = task_utime(current); 1347 task_times(current, &utime, &stime);
1350 stime = task_stime(current);
1351 accumulate_thread_rusage(p, r); 1348 accumulate_thread_rusage(p, r);
1352 maxrss = p->signal->maxrss; 1349 maxrss = p->signal->maxrss;
1353 goto out; 1350 goto out;
@@ -1373,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1373 break; 1370 break;
1374 1371
1375 case RUSAGE_SELF: 1372 case RUSAGE_SELF:
1376 thread_group_cputime(p, &cputime); 1373 thread_group_times(p, &tgutime, &tgstime);
1377 utime = cputime_add(utime, cputime.utime); 1374 utime = cputime_add(utime, tgutime);
1378 stime = cputime_add(stime, cputime.stime); 1375 stime = cputime_add(stime, tgstime);
1379 r->ru_nvcsw += p->signal->nvcsw; 1376 r->ru_nvcsw += p->signal->nvcsw;
1380 r->ru_nivcsw += p->signal->nivcsw; 1377 r->ru_nivcsw += p->signal->nivcsw;
1381 r->ru_minflt += p->signal->min_flt; 1378 r->ru_minflt += p->signal->min_flt;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e06d0b8d1951..695384f12a7d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,8 +48,10 @@ cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(compat_sys_sendmsg); 49cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg);
51cond_syscall(compat_sys_recvmsg); 52cond_syscall(compat_sys_recvmsg);
52cond_syscall(compat_sys_recvfrom); 53cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg);
53cond_syscall(sys_socketcall); 55cond_syscall(sys_socketcall);
54cond_syscall(sys_futex); 56cond_syscall(sys_futex);
55cond_syscall(compat_sys_futex); 57cond_syscall(compat_sys_futex);
@@ -139,7 +141,6 @@ cond_syscall(sys_pciconfig_read);
139cond_syscall(sys_pciconfig_write); 141cond_syscall(sys_pciconfig_write);
140cond_syscall(sys_pciconfig_iobase); 142cond_syscall(sys_pciconfig_iobase);
141cond_syscall(sys32_ipc); 143cond_syscall(sys32_ipc);
142cond_syscall(sys32_sysctl);
143cond_syscall(ppc_rtas); 144cond_syscall(ppc_rtas);
144cond_syscall(sys_spu_run); 145cond_syscall(sys_spu_run);
145cond_syscall(sys_spu_create); 146cond_syscall(sys_spu_create);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c517412..554ac4894f0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,7 +27,6 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/kmemcheck.h> 29#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h>
31#include <linux/fs.h> 30#include <linux/fs.h>
32#include <linux/init.h> 31#include <linux/init.h>
33#include <linux/kernel.h> 32#include <linux/kernel.h>
@@ -36,6 +35,7 @@
36#include <linux/sysrq.h> 35#include <linux/sysrq.h>
37#include <linux/highuid.h> 36#include <linux/highuid.h>
38#include <linux/writeback.h> 37#include <linux/writeback.h>
38#include <linux/ratelimit.h>
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/initrd.h> 40#include <linux/initrd.h>
41#include <linux/key.h> 41#include <linux/key.h>
@@ -60,7 +60,6 @@
60#include <asm/io.h> 60#include <asm/io.h>
61#endif 61#endif
62 62
63static int deprecated_sysctl_warning(struct __sysctl_args *args);
64 63
65#if defined(CONFIG_SYSCTL) 64#if defined(CONFIG_SYSCTL)
66 65
@@ -158,6 +157,8 @@ extern int no_unaligned_warning;
158extern int unaligned_dump_stack; 157extern int unaligned_dump_stack;
159#endif 158#endif
160 159
160extern struct ratelimit_state printk_ratelimit_state;
161
161#ifdef CONFIG_RT_MUTEXES 162#ifdef CONFIG_RT_MUTEXES
162extern int max_lock_depth; 163extern int max_lock_depth;
163#endif 164#endif
@@ -207,31 +208,26 @@ extern int lock_stat;
207 208
208static struct ctl_table root_table[] = { 209static struct ctl_table root_table[] = {
209 { 210 {
210 .ctl_name = CTL_KERN,
211 .procname = "kernel", 211 .procname = "kernel",
212 .mode = 0555, 212 .mode = 0555,
213 .child = kern_table, 213 .child = kern_table,
214 }, 214 },
215 { 215 {
216 .ctl_name = CTL_VM,
217 .procname = "vm", 216 .procname = "vm",
218 .mode = 0555, 217 .mode = 0555,
219 .child = vm_table, 218 .child = vm_table,
220 }, 219 },
221 { 220 {
222 .ctl_name = CTL_FS,
223 .procname = "fs", 221 .procname = "fs",
224 .mode = 0555, 222 .mode = 0555,
225 .child = fs_table, 223 .child = fs_table,
226 }, 224 },
227 { 225 {
228 .ctl_name = CTL_DEBUG,
229 .procname = "debug", 226 .procname = "debug",
230 .mode = 0555, 227 .mode = 0555,
231 .child = debug_table, 228 .child = debug_table,
232 }, 229 },
233 { 230 {
234 .ctl_name = CTL_DEV,
235 .procname = "dev", 231 .procname = "dev",
236 .mode = 0555, 232 .mode = 0555,
237 .child = dev_table, 233 .child = dev_table,
@@ -240,7 +236,7 @@ static struct ctl_table root_table[] = {
240 * NOTE: do not add new entries to this table unless you have read 236 * NOTE: do not add new entries to this table unless you have read
241 * Documentation/sysctl/ctl_unnumbered.txt 237 * Documentation/sysctl/ctl_unnumbered.txt
242 */ 238 */
243 { .ctl_name = 0 } 239 { }
244}; 240};
245 241
246#ifdef CONFIG_SCHED_DEBUG 242#ifdef CONFIG_SCHED_DEBUG
@@ -248,196 +244,178 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
248static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
249static int min_wakeup_granularity_ns; /* 0 usecs */ 245static int min_wakeup_granularity_ns; /* 0 usecs */
250static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
247static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
248static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
249static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
251#endif 251#endif
252 252
253static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
254 { 254 {
255 .ctl_name = CTL_UNNUMBERED,
256 .procname = "sched_child_runs_first", 255 .procname = "sched_child_runs_first",
257 .data = &sysctl_sched_child_runs_first, 256 .data = &sysctl_sched_child_runs_first,
258 .maxlen = sizeof(unsigned int), 257 .maxlen = sizeof(unsigned int),
259 .mode = 0644, 258 .mode = 0644,
260 .proc_handler = &proc_dointvec, 259 .proc_handler = proc_dointvec,
261 }, 260 },
262#ifdef CONFIG_SCHED_DEBUG 261#ifdef CONFIG_SCHED_DEBUG
263 { 262 {
264 .ctl_name = CTL_UNNUMBERED,
265 .procname = "sched_min_granularity_ns", 263 .procname = "sched_min_granularity_ns",
266 .data = &sysctl_sched_min_granularity, 264 .data = &sysctl_sched_min_granularity,
267 .maxlen = sizeof(unsigned int), 265 .maxlen = sizeof(unsigned int),
268 .mode = 0644, 266 .mode = 0644,
269 .proc_handler = &sched_nr_latency_handler, 267 .proc_handler = sched_proc_update_handler,
270 .strategy = &sysctl_intvec,
271 .extra1 = &min_sched_granularity_ns, 268 .extra1 = &min_sched_granularity_ns,
272 .extra2 = &max_sched_granularity_ns, 269 .extra2 = &max_sched_granularity_ns,
273 }, 270 },
274 { 271 {
275 .ctl_name = CTL_UNNUMBERED,
276 .procname = "sched_latency_ns", 272 .procname = "sched_latency_ns",
277 .data = &sysctl_sched_latency, 273 .data = &sysctl_sched_latency,
278 .maxlen = sizeof(unsigned int), 274 .maxlen = sizeof(unsigned int),
279 .mode = 0644, 275 .mode = 0644,
280 .proc_handler = &sched_nr_latency_handler, 276 .proc_handler = sched_proc_update_handler,
281 .strategy = &sysctl_intvec,
282 .extra1 = &min_sched_granularity_ns, 277 .extra1 = &min_sched_granularity_ns,
283 .extra2 = &max_sched_granularity_ns, 278 .extra2 = &max_sched_granularity_ns,
284 }, 279 },
285 { 280 {
286 .ctl_name = CTL_UNNUMBERED,
287 .procname = "sched_wakeup_granularity_ns", 281 .procname = "sched_wakeup_granularity_ns",
288 .data = &sysctl_sched_wakeup_granularity, 282 .data = &sysctl_sched_wakeup_granularity,
289 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
290 .mode = 0644, 284 .mode = 0644,
291 .proc_handler = &proc_dointvec_minmax, 285 .proc_handler = sched_proc_update_handler,
292 .strategy = &sysctl_intvec,
293 .extra1 = &min_wakeup_granularity_ns, 286 .extra1 = &min_wakeup_granularity_ns,
294 .extra2 = &max_wakeup_granularity_ns, 287 .extra2 = &max_wakeup_granularity_ns,
295 }, 288 },
296 { 289 {
297 .ctl_name = CTL_UNNUMBERED,
298 .procname = "sched_shares_ratelimit", 290 .procname = "sched_shares_ratelimit",
299 .data = &sysctl_sched_shares_ratelimit, 291 .data = &sysctl_sched_shares_ratelimit,
300 .maxlen = sizeof(unsigned int), 292 .maxlen = sizeof(unsigned int),
301 .mode = 0644, 293 .mode = 0644,
302 .proc_handler = &proc_dointvec, 294 .proc_handler = sched_proc_update_handler,
295 .extra1 = &min_sched_shares_ratelimit,
296 .extra2 = &max_sched_shares_ratelimit,
303 }, 297 },
304 { 298 {
305 .ctl_name = CTL_UNNUMBERED, 299 .procname = "sched_tunable_scaling",
306 .procname = "sched_shares_thresh", 300 .data = &sysctl_sched_tunable_scaling,
307 .data = &sysctl_sched_shares_thresh, 301 .maxlen = sizeof(enum sched_tunable_scaling),
308 .maxlen = sizeof(unsigned int),
309 .mode = 0644, 302 .mode = 0644,
310 .proc_handler = &proc_dointvec_minmax, 303 .proc_handler = sched_proc_update_handler,
311 .strategy = &sysctl_intvec, 304 .extra1 = &min_sched_tunable_scaling,
312 .extra1 = &zero, 305 .extra2 = &max_sched_tunable_scaling,
313 }, 306 },
314 { 307 {
315 .ctl_name = CTL_UNNUMBERED, 308 .procname = "sched_shares_thresh",
316 .procname = "sched_features", 309 .data = &sysctl_sched_shares_thresh,
317 .data = &sysctl_sched_features,
318 .maxlen = sizeof(unsigned int), 310 .maxlen = sizeof(unsigned int),
319 .mode = 0644, 311 .mode = 0644,
320 .proc_handler = &proc_dointvec, 312 .proc_handler = proc_dointvec_minmax,
313 .extra1 = &zero,
321 }, 314 },
322 { 315 {
323 .ctl_name = CTL_UNNUMBERED,
324 .procname = "sched_migration_cost", 316 .procname = "sched_migration_cost",
325 .data = &sysctl_sched_migration_cost, 317 .data = &sysctl_sched_migration_cost,
326 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
327 .mode = 0644, 319 .mode = 0644,
328 .proc_handler = &proc_dointvec, 320 .proc_handler = proc_dointvec,
329 }, 321 },
330 { 322 {
331 .ctl_name = CTL_UNNUMBERED,
332 .procname = "sched_nr_migrate", 323 .procname = "sched_nr_migrate",
333 .data = &sysctl_sched_nr_migrate, 324 .data = &sysctl_sched_nr_migrate,
334 .maxlen = sizeof(unsigned int), 325 .maxlen = sizeof(unsigned int),
335 .mode = 0644, 326 .mode = 0644,
336 .proc_handler = &proc_dointvec, 327 .proc_handler = proc_dointvec,
337 }, 328 },
338 { 329 {
339 .ctl_name = CTL_UNNUMBERED,
340 .procname = "sched_time_avg", 330 .procname = "sched_time_avg",
341 .data = &sysctl_sched_time_avg, 331 .data = &sysctl_sched_time_avg,
342 .maxlen = sizeof(unsigned int), 332 .maxlen = sizeof(unsigned int),
343 .mode = 0644, 333 .mode = 0644,
344 .proc_handler = &proc_dointvec, 334 .proc_handler = proc_dointvec,
345 }, 335 },
346 { 336 {
347 .ctl_name = CTL_UNNUMBERED,
348 .procname = "timer_migration", 337 .procname = "timer_migration",
349 .data = &sysctl_timer_migration, 338 .data = &sysctl_timer_migration,
350 .maxlen = sizeof(unsigned int), 339 .maxlen = sizeof(unsigned int),
351 .mode = 0644, 340 .mode = 0644,
352 .proc_handler = &proc_dointvec_minmax, 341 .proc_handler = proc_dointvec_minmax,
353 .strategy = &sysctl_intvec,
354 .extra1 = &zero, 342 .extra1 = &zero,
355 .extra2 = &one, 343 .extra2 = &one,
356 }, 344 },
357#endif 345#endif
358 { 346 {
359 .ctl_name = CTL_UNNUMBERED,
360 .procname = "sched_rt_period_us", 347 .procname = "sched_rt_period_us",
361 .data = &sysctl_sched_rt_period, 348 .data = &sysctl_sched_rt_period,
362 .maxlen = sizeof(unsigned int), 349 .maxlen = sizeof(unsigned int),
363 .mode = 0644, 350 .mode = 0644,
364 .proc_handler = &sched_rt_handler, 351 .proc_handler = sched_rt_handler,
365 }, 352 },
366 { 353 {
367 .ctl_name = CTL_UNNUMBERED,
368 .procname = "sched_rt_runtime_us", 354 .procname = "sched_rt_runtime_us",
369 .data = &sysctl_sched_rt_runtime, 355 .data = &sysctl_sched_rt_runtime,
370 .maxlen = sizeof(int), 356 .maxlen = sizeof(int),
371 .mode = 0644, 357 .mode = 0644,
372 .proc_handler = &sched_rt_handler, 358 .proc_handler = sched_rt_handler,
373 }, 359 },
374 { 360 {
375 .ctl_name = CTL_UNNUMBERED,
376 .procname = "sched_compat_yield", 361 .procname = "sched_compat_yield",
377 .data = &sysctl_sched_compat_yield, 362 .data = &sysctl_sched_compat_yield,
378 .maxlen = sizeof(unsigned int), 363 .maxlen = sizeof(unsigned int),
379 .mode = 0644, 364 .mode = 0644,
380 .proc_handler = &proc_dointvec, 365 .proc_handler = proc_dointvec,
381 }, 366 },
382#ifdef CONFIG_PROVE_LOCKING 367#ifdef CONFIG_PROVE_LOCKING
383 { 368 {
384 .ctl_name = CTL_UNNUMBERED,
385 .procname = "prove_locking", 369 .procname = "prove_locking",
386 .data = &prove_locking, 370 .data = &prove_locking,
387 .maxlen = sizeof(int), 371 .maxlen = sizeof(int),
388 .mode = 0644, 372 .mode = 0644,
389 .proc_handler = &proc_dointvec, 373 .proc_handler = proc_dointvec,
390 }, 374 },
391#endif 375#endif
392#ifdef CONFIG_LOCK_STAT 376#ifdef CONFIG_LOCK_STAT
393 { 377 {
394 .ctl_name = CTL_UNNUMBERED,
395 .procname = "lock_stat", 378 .procname = "lock_stat",
396 .data = &lock_stat, 379 .data = &lock_stat,
397 .maxlen = sizeof(int), 380 .maxlen = sizeof(int),
398 .mode = 0644, 381 .mode = 0644,
399 .proc_handler = &proc_dointvec, 382 .proc_handler = proc_dointvec,
400 }, 383 },
401#endif 384#endif
402 { 385 {
403 .ctl_name = KERN_PANIC,
404 .procname = "panic", 386 .procname = "panic",
405 .data = &panic_timeout, 387 .data = &panic_timeout,
406 .maxlen = sizeof(int), 388 .maxlen = sizeof(int),
407 .mode = 0644, 389 .mode = 0644,
408 .proc_handler = &proc_dointvec, 390 .proc_handler = proc_dointvec,
409 }, 391 },
410 { 392 {
411 .ctl_name = KERN_CORE_USES_PID,
412 .procname = "core_uses_pid", 393 .procname = "core_uses_pid",
413 .data = &core_uses_pid, 394 .data = &core_uses_pid,
414 .maxlen = sizeof(int), 395 .maxlen = sizeof(int),
415 .mode = 0644, 396 .mode = 0644,
416 .proc_handler = &proc_dointvec, 397 .proc_handler = proc_dointvec,
417 }, 398 },
418 { 399 {
419 .ctl_name = KERN_CORE_PATTERN,
420 .procname = "core_pattern", 400 .procname = "core_pattern",
421 .data = core_pattern, 401 .data = core_pattern,
422 .maxlen = CORENAME_MAX_SIZE, 402 .maxlen = CORENAME_MAX_SIZE,
423 .mode = 0644, 403 .mode = 0644,
424 .proc_handler = &proc_dostring, 404 .proc_handler = proc_dostring,
425 .strategy = &sysctl_string,
426 }, 405 },
427 { 406 {
428 .ctl_name = CTL_UNNUMBERED,
429 .procname = "core_pipe_limit", 407 .procname = "core_pipe_limit",
430 .data = &core_pipe_limit, 408 .data = &core_pipe_limit,
431 .maxlen = sizeof(unsigned int), 409 .maxlen = sizeof(unsigned int),
432 .mode = 0644, 410 .mode = 0644,
433 .proc_handler = &proc_dointvec, 411 .proc_handler = proc_dointvec,
434 }, 412 },
435#ifdef CONFIG_PROC_SYSCTL 413#ifdef CONFIG_PROC_SYSCTL
436 { 414 {
437 .procname = "tainted", 415 .procname = "tainted",
438 .maxlen = sizeof(long), 416 .maxlen = sizeof(long),
439 .mode = 0644, 417 .mode = 0644,
440 .proc_handler = &proc_taint, 418 .proc_handler = proc_taint,
441 }, 419 },
442#endif 420#endif
443#ifdef CONFIG_LATENCYTOP 421#ifdef CONFIG_LATENCYTOP
@@ -446,181 +424,160 @@ static struct ctl_table kern_table[] = {
446 .data = &latencytop_enabled, 424 .data = &latencytop_enabled,
447 .maxlen = sizeof(int), 425 .maxlen = sizeof(int),
448 .mode = 0644, 426 .mode = 0644,
449 .proc_handler = &proc_dointvec, 427 .proc_handler = proc_dointvec,
450 }, 428 },
451#endif 429#endif
452#ifdef CONFIG_BLK_DEV_INITRD 430#ifdef CONFIG_BLK_DEV_INITRD
453 { 431 {
454 .ctl_name = KERN_REALROOTDEV,
455 .procname = "real-root-dev", 432 .procname = "real-root-dev",
456 .data = &real_root_dev, 433 .data = &real_root_dev,
457 .maxlen = sizeof(int), 434 .maxlen = sizeof(int),
458 .mode = 0644, 435 .mode = 0644,
459 .proc_handler = &proc_dointvec, 436 .proc_handler = proc_dointvec,
460 }, 437 },
461#endif 438#endif
462 { 439 {
463 .ctl_name = CTL_UNNUMBERED,
464 .procname = "print-fatal-signals", 440 .procname = "print-fatal-signals",
465 .data = &print_fatal_signals, 441 .data = &print_fatal_signals,
466 .maxlen = sizeof(int), 442 .maxlen = sizeof(int),
467 .mode = 0644, 443 .mode = 0644,
468 .proc_handler = &proc_dointvec, 444 .proc_handler = proc_dointvec,
469 }, 445 },
470#ifdef CONFIG_SPARC 446#ifdef CONFIG_SPARC
471 { 447 {
472 .ctl_name = KERN_SPARC_REBOOT,
473 .procname = "reboot-cmd", 448 .procname = "reboot-cmd",
474 .data = reboot_command, 449 .data = reboot_command,
475 .maxlen = 256, 450 .maxlen = 256,
476 .mode = 0644, 451 .mode = 0644,
477 .proc_handler = &proc_dostring, 452 .proc_handler = proc_dostring,
478 .strategy = &sysctl_string,
479 }, 453 },
480 { 454 {
481 .ctl_name = KERN_SPARC_STOP_A,
482 .procname = "stop-a", 455 .procname = "stop-a",
483 .data = &stop_a_enabled, 456 .data = &stop_a_enabled,
484 .maxlen = sizeof (int), 457 .maxlen = sizeof (int),
485 .mode = 0644, 458 .mode = 0644,
486 .proc_handler = &proc_dointvec, 459 .proc_handler = proc_dointvec,
487 }, 460 },
488 { 461 {
489 .ctl_name = KERN_SPARC_SCONS_PWROFF,
490 .procname = "scons-poweroff", 462 .procname = "scons-poweroff",
491 .data = &scons_pwroff, 463 .data = &scons_pwroff,
492 .maxlen = sizeof (int), 464 .maxlen = sizeof (int),
493 .mode = 0644, 465 .mode = 0644,
494 .proc_handler = &proc_dointvec, 466 .proc_handler = proc_dointvec,
495 }, 467 },
496#endif 468#endif
497#ifdef CONFIG_SPARC64 469#ifdef CONFIG_SPARC64
498 { 470 {
499 .ctl_name = CTL_UNNUMBERED,
500 .procname = "tsb-ratio", 471 .procname = "tsb-ratio",
501 .data = &sysctl_tsb_ratio, 472 .data = &sysctl_tsb_ratio,
502 .maxlen = sizeof (int), 473 .maxlen = sizeof (int),
503 .mode = 0644, 474 .mode = 0644,
504 .proc_handler = &proc_dointvec, 475 .proc_handler = proc_dointvec,
505 }, 476 },
506#endif 477#endif
507#ifdef __hppa__ 478#ifdef __hppa__
508 { 479 {
509 .ctl_name = KERN_HPPA_PWRSW,
510 .procname = "soft-power", 480 .procname = "soft-power",
511 .data = &pwrsw_enabled, 481 .data = &pwrsw_enabled,
512 .maxlen = sizeof (int), 482 .maxlen = sizeof (int),
513 .mode = 0644, 483 .mode = 0644,
514 .proc_handler = &proc_dointvec, 484 .proc_handler = proc_dointvec,
515 }, 485 },
516 { 486 {
517 .ctl_name = KERN_HPPA_UNALIGNED,
518 .procname = "unaligned-trap", 487 .procname = "unaligned-trap",
519 .data = &unaligned_enabled, 488 .data = &unaligned_enabled,
520 .maxlen = sizeof (int), 489 .maxlen = sizeof (int),
521 .mode = 0644, 490 .mode = 0644,
522 .proc_handler = &proc_dointvec, 491 .proc_handler = proc_dointvec,
523 }, 492 },
524#endif 493#endif
525 { 494 {
526 .ctl_name = KERN_CTLALTDEL,
527 .procname = "ctrl-alt-del", 495 .procname = "ctrl-alt-del",
528 .data = &C_A_D, 496 .data = &C_A_D,
529 .maxlen = sizeof(int), 497 .maxlen = sizeof(int),
530 .mode = 0644, 498 .mode = 0644,
531 .proc_handler = &proc_dointvec, 499 .proc_handler = proc_dointvec,
532 }, 500 },
533#ifdef CONFIG_FUNCTION_TRACER 501#ifdef CONFIG_FUNCTION_TRACER
534 { 502 {
535 .ctl_name = CTL_UNNUMBERED,
536 .procname = "ftrace_enabled", 503 .procname = "ftrace_enabled",
537 .data = &ftrace_enabled, 504 .data = &ftrace_enabled,
538 .maxlen = sizeof(int), 505 .maxlen = sizeof(int),
539 .mode = 0644, 506 .mode = 0644,
540 .proc_handler = &ftrace_enable_sysctl, 507 .proc_handler = ftrace_enable_sysctl,
541 }, 508 },
542#endif 509#endif
543#ifdef CONFIG_STACK_TRACER 510#ifdef CONFIG_STACK_TRACER
544 { 511 {
545 .ctl_name = CTL_UNNUMBERED,
546 .procname = "stack_tracer_enabled", 512 .procname = "stack_tracer_enabled",
547 .data = &stack_tracer_enabled, 513 .data = &stack_tracer_enabled,
548 .maxlen = sizeof(int), 514 .maxlen = sizeof(int),
549 .mode = 0644, 515 .mode = 0644,
550 .proc_handler = &stack_trace_sysctl, 516 .proc_handler = stack_trace_sysctl,
551 }, 517 },
552#endif 518#endif
553#ifdef CONFIG_TRACING 519#ifdef CONFIG_TRACING
554 { 520 {
555 .ctl_name = CTL_UNNUMBERED,
556 .procname = "ftrace_dump_on_oops", 521 .procname = "ftrace_dump_on_oops",
557 .data = &ftrace_dump_on_oops, 522 .data = &ftrace_dump_on_oops,
558 .maxlen = sizeof(int), 523 .maxlen = sizeof(int),
559 .mode = 0644, 524 .mode = 0644,
560 .proc_handler = &proc_dointvec, 525 .proc_handler = proc_dointvec,
561 }, 526 },
562#endif 527#endif
563#ifdef CONFIG_MODULES 528#ifdef CONFIG_MODULES
564 { 529 {
565 .ctl_name = KERN_MODPROBE,
566 .procname = "modprobe", 530 .procname = "modprobe",
567 .data = &modprobe_path, 531 .data = &modprobe_path,
568 .maxlen = KMOD_PATH_LEN, 532 .maxlen = KMOD_PATH_LEN,
569 .mode = 0644, 533 .mode = 0644,
570 .proc_handler = &proc_dostring, 534 .proc_handler = proc_dostring,
571 .strategy = &sysctl_string,
572 }, 535 },
573 { 536 {
574 .ctl_name = CTL_UNNUMBERED,
575 .procname = "modules_disabled", 537 .procname = "modules_disabled",
576 .data = &modules_disabled, 538 .data = &modules_disabled,
577 .maxlen = sizeof(int), 539 .maxlen = sizeof(int),
578 .mode = 0644, 540 .mode = 0644,
579 /* only handle a transition from default "0" to "1" */ 541 /* only handle a transition from default "0" to "1" */
580 .proc_handler = &proc_dointvec_minmax, 542 .proc_handler = proc_dointvec_minmax,
581 .extra1 = &one, 543 .extra1 = &one,
582 .extra2 = &one, 544 .extra2 = &one,
583 }, 545 },
584#endif 546#endif
585#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 547#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
586 { 548 {
587 .ctl_name = KERN_HOTPLUG,
588 .procname = "hotplug", 549 .procname = "hotplug",
589 .data = &uevent_helper, 550 .data = &uevent_helper,
590 .maxlen = UEVENT_HELPER_PATH_LEN, 551 .maxlen = UEVENT_HELPER_PATH_LEN,
591 .mode = 0644, 552 .mode = 0644,
592 .proc_handler = &proc_dostring, 553 .proc_handler = proc_dostring,
593 .strategy = &sysctl_string,
594 }, 554 },
595#endif 555#endif
596#ifdef CONFIG_CHR_DEV_SG 556#ifdef CONFIG_CHR_DEV_SG
597 { 557 {
598 .ctl_name = KERN_SG_BIG_BUFF,
599 .procname = "sg-big-buff", 558 .procname = "sg-big-buff",
600 .data = &sg_big_buff, 559 .data = &sg_big_buff,
601 .maxlen = sizeof (int), 560 .maxlen = sizeof (int),
602 .mode = 0444, 561 .mode = 0444,
603 .proc_handler = &proc_dointvec, 562 .proc_handler = proc_dointvec,
604 }, 563 },
605#endif 564#endif
606#ifdef CONFIG_BSD_PROCESS_ACCT 565#ifdef CONFIG_BSD_PROCESS_ACCT
607 { 566 {
608 .ctl_name = KERN_ACCT,
609 .procname = "acct", 567 .procname = "acct",
610 .data = &acct_parm, 568 .data = &acct_parm,
611 .maxlen = 3*sizeof(int), 569 .maxlen = 3*sizeof(int),
612 .mode = 0644, 570 .mode = 0644,
613 .proc_handler = &proc_dointvec, 571 .proc_handler = proc_dointvec,
614 }, 572 },
615#endif 573#endif
616#ifdef CONFIG_MAGIC_SYSRQ 574#ifdef CONFIG_MAGIC_SYSRQ
617 { 575 {
618 .ctl_name = KERN_SYSRQ,
619 .procname = "sysrq", 576 .procname = "sysrq",
620 .data = &__sysrq_enabled, 577 .data = &__sysrq_enabled,
621 .maxlen = sizeof (int), 578 .maxlen = sizeof (int),
622 .mode = 0644, 579 .mode = 0644,
623 .proc_handler = &proc_dointvec, 580 .proc_handler = proc_dointvec,
624 }, 581 },
625#endif 582#endif
626#ifdef CONFIG_PROC_SYSCTL 583#ifdef CONFIG_PROC_SYSCTL
@@ -629,215 +586,188 @@ static struct ctl_table kern_table[] = {
629 .data = NULL, 586 .data = NULL,
630 .maxlen = sizeof (int), 587 .maxlen = sizeof (int),
631 .mode = 0600, 588 .mode = 0600,
632 .proc_handler = &proc_do_cad_pid, 589 .proc_handler = proc_do_cad_pid,
633 }, 590 },
634#endif 591#endif
635 { 592 {
636 .ctl_name = KERN_MAX_THREADS,
637 .procname = "threads-max", 593 .procname = "threads-max",
638 .data = &max_threads, 594 .data = &max_threads,
639 .maxlen = sizeof(int), 595 .maxlen = sizeof(int),
640 .mode = 0644, 596 .mode = 0644,
641 .proc_handler = &proc_dointvec, 597 .proc_handler = proc_dointvec,
642 }, 598 },
643 { 599 {
644 .ctl_name = KERN_RANDOM,
645 .procname = "random", 600 .procname = "random",
646 .mode = 0555, 601 .mode = 0555,
647 .child = random_table, 602 .child = random_table,
648 }, 603 },
649 { 604 {
650 .ctl_name = KERN_OVERFLOWUID,
651 .procname = "overflowuid", 605 .procname = "overflowuid",
652 .data = &overflowuid, 606 .data = &overflowuid,
653 .maxlen = sizeof(int), 607 .maxlen = sizeof(int),
654 .mode = 0644, 608 .mode = 0644,
655 .proc_handler = &proc_dointvec_minmax, 609 .proc_handler = proc_dointvec_minmax,
656 .strategy = &sysctl_intvec,
657 .extra1 = &minolduid, 610 .extra1 = &minolduid,
658 .extra2 = &maxolduid, 611 .extra2 = &maxolduid,
659 }, 612 },
660 { 613 {
661 .ctl_name = KERN_OVERFLOWGID,
662 .procname = "overflowgid", 614 .procname = "overflowgid",
663 .data = &overflowgid, 615 .data = &overflowgid,
664 .maxlen = sizeof(int), 616 .maxlen = sizeof(int),
665 .mode = 0644, 617 .mode = 0644,
666 .proc_handler = &proc_dointvec_minmax, 618 .proc_handler = proc_dointvec_minmax,
667 .strategy = &sysctl_intvec,
668 .extra1 = &minolduid, 619 .extra1 = &minolduid,
669 .extra2 = &maxolduid, 620 .extra2 = &maxolduid,
670 }, 621 },
671#ifdef CONFIG_S390 622#ifdef CONFIG_S390
672#ifdef CONFIG_MATHEMU 623#ifdef CONFIG_MATHEMU
673 { 624 {
674 .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
675 .procname = "ieee_emulation_warnings", 625 .procname = "ieee_emulation_warnings",
676 .data = &sysctl_ieee_emulation_warnings, 626 .data = &sysctl_ieee_emulation_warnings,
677 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
678 .mode = 0644, 628 .mode = 0644,
679 .proc_handler = &proc_dointvec, 629 .proc_handler = proc_dointvec,
680 }, 630 },
681#endif 631#endif
682 { 632 {
683 .ctl_name = KERN_S390_USER_DEBUG_LOGGING,
684 .procname = "userprocess_debug", 633 .procname = "userprocess_debug",
685 .data = &sysctl_userprocess_debug, 634 .data = &sysctl_userprocess_debug,
686 .maxlen = sizeof(int), 635 .maxlen = sizeof(int),
687 .mode = 0644, 636 .mode = 0644,
688 .proc_handler = &proc_dointvec, 637 .proc_handler = proc_dointvec,
689 }, 638 },
690#endif 639#endif
691 { 640 {
692 .ctl_name = KERN_PIDMAX,
693 .procname = "pid_max", 641 .procname = "pid_max",
694 .data = &pid_max, 642 .data = &pid_max,
695 .maxlen = sizeof (int), 643 .maxlen = sizeof (int),
696 .mode = 0644, 644 .mode = 0644,
697 .proc_handler = &proc_dointvec_minmax, 645 .proc_handler = proc_dointvec_minmax,
698 .strategy = sysctl_intvec,
699 .extra1 = &pid_max_min, 646 .extra1 = &pid_max_min,
700 .extra2 = &pid_max_max, 647 .extra2 = &pid_max_max,
701 }, 648 },
702 { 649 {
703 .ctl_name = KERN_PANIC_ON_OOPS,
704 .procname = "panic_on_oops", 650 .procname = "panic_on_oops",
705 .data = &panic_on_oops, 651 .data = &panic_on_oops,
706 .maxlen = sizeof(int), 652 .maxlen = sizeof(int),
707 .mode = 0644, 653 .mode = 0644,
708 .proc_handler = &proc_dointvec, 654 .proc_handler = proc_dointvec,
709 }, 655 },
710#if defined CONFIG_PRINTK 656#if defined CONFIG_PRINTK
711 { 657 {
712 .ctl_name = KERN_PRINTK,
713 .procname = "printk", 658 .procname = "printk",
714 .data = &console_loglevel, 659 .data = &console_loglevel,
715 .maxlen = 4*sizeof(int), 660 .maxlen = 4*sizeof(int),
716 .mode = 0644, 661 .mode = 0644,
717 .proc_handler = &proc_dointvec, 662 .proc_handler = proc_dointvec,
718 }, 663 },
719 { 664 {
720 .ctl_name = KERN_PRINTK_RATELIMIT,
721 .procname = "printk_ratelimit", 665 .procname = "printk_ratelimit",
722 .data = &printk_ratelimit_state.interval, 666 .data = &printk_ratelimit_state.interval,
723 .maxlen = sizeof(int), 667 .maxlen = sizeof(int),
724 .mode = 0644, 668 .mode = 0644,
725 .proc_handler = &proc_dointvec_jiffies, 669 .proc_handler = proc_dointvec_jiffies,
726 .strategy = &sysctl_jiffies,
727 }, 670 },
728 { 671 {
729 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
730 .procname = "printk_ratelimit_burst", 672 .procname = "printk_ratelimit_burst",
731 .data = &printk_ratelimit_state.burst, 673 .data = &printk_ratelimit_state.burst,
732 .maxlen = sizeof(int), 674 .maxlen = sizeof(int),
733 .mode = 0644, 675 .mode = 0644,
734 .proc_handler = &proc_dointvec, 676 .proc_handler = proc_dointvec,
735 }, 677 },
736 { 678 {
737 .ctl_name = CTL_UNNUMBERED,
738 .procname = "printk_delay", 679 .procname = "printk_delay",
739 .data = &printk_delay_msec, 680 .data = &printk_delay_msec,
740 .maxlen = sizeof(int), 681 .maxlen = sizeof(int),
741 .mode = 0644, 682 .mode = 0644,
742 .proc_handler = &proc_dointvec_minmax, 683 .proc_handler = proc_dointvec_minmax,
743 .strategy = &sysctl_intvec,
744 .extra1 = &zero, 684 .extra1 = &zero,
745 .extra2 = &ten_thousand, 685 .extra2 = &ten_thousand,
746 }, 686 },
747#endif 687#endif
748 { 688 {
749 .ctl_name = KERN_NGROUPS_MAX,
750 .procname = "ngroups_max", 689 .procname = "ngroups_max",
751 .data = &ngroups_max, 690 .data = &ngroups_max,
752 .maxlen = sizeof (int), 691 .maxlen = sizeof (int),
753 .mode = 0444, 692 .mode = 0444,
754 .proc_handler = &proc_dointvec, 693 .proc_handler = proc_dointvec,
755 }, 694 },
756#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 695#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
757 { 696 {
758 .ctl_name = KERN_UNKNOWN_NMI_PANIC,
759 .procname = "unknown_nmi_panic", 697 .procname = "unknown_nmi_panic",
760 .data = &unknown_nmi_panic, 698 .data = &unknown_nmi_panic,
761 .maxlen = sizeof (int), 699 .maxlen = sizeof (int),
762 .mode = 0644, 700 .mode = 0644,
763 .proc_handler = &proc_dointvec, 701 .proc_handler = proc_dointvec,
764 }, 702 },
765 { 703 {
766 .procname = "nmi_watchdog", 704 .procname = "nmi_watchdog",
767 .data = &nmi_watchdog_enabled, 705 .data = &nmi_watchdog_enabled,
768 .maxlen = sizeof (int), 706 .maxlen = sizeof (int),
769 .mode = 0644, 707 .mode = 0644,
770 .proc_handler = &proc_nmi_enabled, 708 .proc_handler = proc_nmi_enabled,
771 }, 709 },
772#endif 710#endif
773#if defined(CONFIG_X86) 711#if defined(CONFIG_X86)
774 { 712 {
775 .ctl_name = KERN_PANIC_ON_NMI,
776 .procname = "panic_on_unrecovered_nmi", 713 .procname = "panic_on_unrecovered_nmi",
777 .data = &panic_on_unrecovered_nmi, 714 .data = &panic_on_unrecovered_nmi,
778 .maxlen = sizeof(int), 715 .maxlen = sizeof(int),
779 .mode = 0644, 716 .mode = 0644,
780 .proc_handler = &proc_dointvec, 717 .proc_handler = proc_dointvec,
781 }, 718 },
782 { 719 {
783 .ctl_name = CTL_UNNUMBERED,
784 .procname = "panic_on_io_nmi", 720 .procname = "panic_on_io_nmi",
785 .data = &panic_on_io_nmi, 721 .data = &panic_on_io_nmi,
786 .maxlen = sizeof(int), 722 .maxlen = sizeof(int),
787 .mode = 0644, 723 .mode = 0644,
788 .proc_handler = &proc_dointvec, 724 .proc_handler = proc_dointvec,
789 }, 725 },
790 { 726 {
791 .ctl_name = KERN_BOOTLOADER_TYPE,
792 .procname = "bootloader_type", 727 .procname = "bootloader_type",
793 .data = &bootloader_type, 728 .data = &bootloader_type,
794 .maxlen = sizeof (int), 729 .maxlen = sizeof (int),
795 .mode = 0444, 730 .mode = 0444,
796 .proc_handler = &proc_dointvec, 731 .proc_handler = proc_dointvec,
797 }, 732 },
798 { 733 {
799 .ctl_name = CTL_UNNUMBERED,
800 .procname = "bootloader_version", 734 .procname = "bootloader_version",
801 .data = &bootloader_version, 735 .data = &bootloader_version,
802 .maxlen = sizeof (int), 736 .maxlen = sizeof (int),
803 .mode = 0444, 737 .mode = 0444,
804 .proc_handler = &proc_dointvec, 738 .proc_handler = proc_dointvec,
805 }, 739 },
806 { 740 {
807 .ctl_name = CTL_UNNUMBERED,
808 .procname = "kstack_depth_to_print", 741 .procname = "kstack_depth_to_print",
809 .data = &kstack_depth_to_print, 742 .data = &kstack_depth_to_print,
810 .maxlen = sizeof(int), 743 .maxlen = sizeof(int),
811 .mode = 0644, 744 .mode = 0644,
812 .proc_handler = &proc_dointvec, 745 .proc_handler = proc_dointvec,
813 }, 746 },
814 { 747 {
815 .ctl_name = CTL_UNNUMBERED,
816 .procname = "io_delay_type", 748 .procname = "io_delay_type",
817 .data = &io_delay_type, 749 .data = &io_delay_type,
818 .maxlen = sizeof(int), 750 .maxlen = sizeof(int),
819 .mode = 0644, 751 .mode = 0644,
820 .proc_handler = &proc_dointvec, 752 .proc_handler = proc_dointvec,
821 }, 753 },
822#endif 754#endif
823#if defined(CONFIG_MMU) 755#if defined(CONFIG_MMU)
824 { 756 {
825 .ctl_name = KERN_RANDOMIZE,
826 .procname = "randomize_va_space", 757 .procname = "randomize_va_space",
827 .data = &randomize_va_space, 758 .data = &randomize_va_space,
828 .maxlen = sizeof(int), 759 .maxlen = sizeof(int),
829 .mode = 0644, 760 .mode = 0644,
830 .proc_handler = &proc_dointvec, 761 .proc_handler = proc_dointvec,
831 }, 762 },
832#endif 763#endif
833#if defined(CONFIG_S390) && defined(CONFIG_SMP) 764#if defined(CONFIG_S390) && defined(CONFIG_SMP)
834 { 765 {
835 .ctl_name = KERN_SPIN_RETRY,
836 .procname = "spin_retry", 766 .procname = "spin_retry",
837 .data = &spin_retry, 767 .data = &spin_retry,
838 .maxlen = sizeof (int), 768 .maxlen = sizeof (int),
839 .mode = 0644, 769 .mode = 0644,
840 .proc_handler = &proc_dointvec, 770 .proc_handler = proc_dointvec,
841 }, 771 },
842#endif 772#endif
843#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) 773#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
@@ -846,123 +776,104 @@ static struct ctl_table kern_table[] = {
846 .data = &acpi_realmode_flags, 776 .data = &acpi_realmode_flags,
847 .maxlen = sizeof (unsigned long), 777 .maxlen = sizeof (unsigned long),
848 .mode = 0644, 778 .mode = 0644,
849 .proc_handler = &proc_doulongvec_minmax, 779 .proc_handler = proc_doulongvec_minmax,
850 }, 780 },
851#endif 781#endif
852#ifdef CONFIG_IA64 782#ifdef CONFIG_IA64
853 { 783 {
854 .ctl_name = KERN_IA64_UNALIGNED,
855 .procname = "ignore-unaligned-usertrap", 784 .procname = "ignore-unaligned-usertrap",
856 .data = &no_unaligned_warning, 785 .data = &no_unaligned_warning,
857 .maxlen = sizeof (int), 786 .maxlen = sizeof (int),
858 .mode = 0644, 787 .mode = 0644,
859 .proc_handler = &proc_dointvec, 788 .proc_handler = proc_dointvec,
860 }, 789 },
861 { 790 {
862 .ctl_name = CTL_UNNUMBERED,
863 .procname = "unaligned-dump-stack", 791 .procname = "unaligned-dump-stack",
864 .data = &unaligned_dump_stack, 792 .data = &unaligned_dump_stack,
865 .maxlen = sizeof (int), 793 .maxlen = sizeof (int),
866 .mode = 0644, 794 .mode = 0644,
867 .proc_handler = &proc_dointvec, 795 .proc_handler = proc_dointvec,
868 }, 796 },
869#endif 797#endif
870#ifdef CONFIG_DETECT_SOFTLOCKUP 798#ifdef CONFIG_DETECT_SOFTLOCKUP
871 { 799 {
872 .ctl_name = CTL_UNNUMBERED,
873 .procname = "softlockup_panic", 800 .procname = "softlockup_panic",
874 .data = &softlockup_panic, 801 .data = &softlockup_panic,
875 .maxlen = sizeof(int), 802 .maxlen = sizeof(int),
876 .mode = 0644, 803 .mode = 0644,
877 .proc_handler = &proc_dointvec_minmax, 804 .proc_handler = proc_dointvec_minmax,
878 .strategy = &sysctl_intvec,
879 .extra1 = &zero, 805 .extra1 = &zero,
880 .extra2 = &one, 806 .extra2 = &one,
881 }, 807 },
882 { 808 {
883 .ctl_name = CTL_UNNUMBERED,
884 .procname = "softlockup_thresh", 809 .procname = "softlockup_thresh",
885 .data = &softlockup_thresh, 810 .data = &softlockup_thresh,
886 .maxlen = sizeof(int), 811 .maxlen = sizeof(int),
887 .mode = 0644, 812 .mode = 0644,
888 .proc_handler = &proc_dosoftlockup_thresh, 813 .proc_handler = proc_dosoftlockup_thresh,
889 .strategy = &sysctl_intvec,
890 .extra1 = &neg_one, 814 .extra1 = &neg_one,
891 .extra2 = &sixty, 815 .extra2 = &sixty,
892 }, 816 },
893#endif 817#endif
894#ifdef CONFIG_DETECT_HUNG_TASK 818#ifdef CONFIG_DETECT_HUNG_TASK
895 { 819 {
896 .ctl_name = CTL_UNNUMBERED,
897 .procname = "hung_task_panic", 820 .procname = "hung_task_panic",
898 .data = &sysctl_hung_task_panic, 821 .data = &sysctl_hung_task_panic,
899 .maxlen = sizeof(int), 822 .maxlen = sizeof(int),
900 .mode = 0644, 823 .mode = 0644,
901 .proc_handler = &proc_dointvec_minmax, 824 .proc_handler = proc_dointvec_minmax,
902 .strategy = &sysctl_intvec,
903 .extra1 = &zero, 825 .extra1 = &zero,
904 .extra2 = &one, 826 .extra2 = &one,
905 }, 827 },
906 { 828 {
907 .ctl_name = CTL_UNNUMBERED,
908 .procname = "hung_task_check_count", 829 .procname = "hung_task_check_count",
909 .data = &sysctl_hung_task_check_count, 830 .data = &sysctl_hung_task_check_count,
910 .maxlen = sizeof(unsigned long), 831 .maxlen = sizeof(unsigned long),
911 .mode = 0644, 832 .mode = 0644,
912 .proc_handler = &proc_doulongvec_minmax, 833 .proc_handler = proc_doulongvec_minmax,
913 .strategy = &sysctl_intvec,
914 }, 834 },
915 { 835 {
916 .ctl_name = CTL_UNNUMBERED,
917 .procname = "hung_task_timeout_secs", 836 .procname = "hung_task_timeout_secs",
918 .data = &sysctl_hung_task_timeout_secs, 837 .data = &sysctl_hung_task_timeout_secs,
919 .maxlen = sizeof(unsigned long), 838 .maxlen = sizeof(unsigned long),
920 .mode = 0644, 839 .mode = 0644,
921 .proc_handler = &proc_dohung_task_timeout_secs, 840 .proc_handler = proc_dohung_task_timeout_secs,
922 .strategy = &sysctl_intvec,
923 }, 841 },
924 { 842 {
925 .ctl_name = CTL_UNNUMBERED,
926 .procname = "hung_task_warnings", 843 .procname = "hung_task_warnings",
927 .data = &sysctl_hung_task_warnings, 844 .data = &sysctl_hung_task_warnings,
928 .maxlen = sizeof(unsigned long), 845 .maxlen = sizeof(unsigned long),
929 .mode = 0644, 846 .mode = 0644,
930 .proc_handler = &proc_doulongvec_minmax, 847 .proc_handler = proc_doulongvec_minmax,
931 .strategy = &sysctl_intvec,
932 }, 848 },
933#endif 849#endif
934#ifdef CONFIG_COMPAT 850#ifdef CONFIG_COMPAT
935 { 851 {
936 .ctl_name = KERN_COMPAT_LOG,
937 .procname = "compat-log", 852 .procname = "compat-log",
938 .data = &compat_log, 853 .data = &compat_log,
939 .maxlen = sizeof (int), 854 .maxlen = sizeof (int),
940 .mode = 0644, 855 .mode = 0644,
941 .proc_handler = &proc_dointvec, 856 .proc_handler = proc_dointvec,
942 }, 857 },
943#endif 858#endif
944#ifdef CONFIG_RT_MUTEXES 859#ifdef CONFIG_RT_MUTEXES
945 { 860 {
946 .ctl_name = KERN_MAX_LOCK_DEPTH,
947 .procname = "max_lock_depth", 861 .procname = "max_lock_depth",
948 .data = &max_lock_depth, 862 .data = &max_lock_depth,
949 .maxlen = sizeof(int), 863 .maxlen = sizeof(int),
950 .mode = 0644, 864 .mode = 0644,
951 .proc_handler = &proc_dointvec, 865 .proc_handler = proc_dointvec,
952 }, 866 },
953#endif 867#endif
954 { 868 {
955 .ctl_name = CTL_UNNUMBERED,
956 .procname = "poweroff_cmd", 869 .procname = "poweroff_cmd",
957 .data = &poweroff_cmd, 870 .data = &poweroff_cmd,
958 .maxlen = POWEROFF_CMD_PATH_LEN, 871 .maxlen = POWEROFF_CMD_PATH_LEN,
959 .mode = 0644, 872 .mode = 0644,
960 .proc_handler = &proc_dostring, 873 .proc_handler = proc_dostring,
961 .strategy = &sysctl_string,
962 }, 874 },
963#ifdef CONFIG_KEYS 875#ifdef CONFIG_KEYS
964 { 876 {
965 .ctl_name = CTL_UNNUMBERED,
966 .procname = "keys", 877 .procname = "keys",
967 .mode = 0555, 878 .mode = 0555,
968 .child = key_sysctls, 879 .child = key_sysctls,
@@ -970,17 +881,15 @@ static struct ctl_table kern_table[] = {
970#endif 881#endif
971#ifdef CONFIG_RCU_TORTURE_TEST 882#ifdef CONFIG_RCU_TORTURE_TEST
972 { 883 {
973 .ctl_name = CTL_UNNUMBERED,
974 .procname = "rcutorture_runnable", 884 .procname = "rcutorture_runnable",
975 .data = &rcutorture_runnable, 885 .data = &rcutorture_runnable,
976 .maxlen = sizeof(int), 886 .maxlen = sizeof(int),
977 .mode = 0644, 887 .mode = 0644,
978 .proc_handler = &proc_dointvec, 888 .proc_handler = proc_dointvec,
979 }, 889 },
980#endif 890#endif
981#ifdef CONFIG_SLOW_WORK 891#ifdef CONFIG_SLOW_WORK
982 { 892 {
983 .ctl_name = CTL_UNNUMBERED,
984 .procname = "slow-work", 893 .procname = "slow-work",
985 .mode = 0555, 894 .mode = 0555,
986 .child = slow_work_sysctls, 895 .child = slow_work_sysctls,
@@ -988,146 +897,127 @@ static struct ctl_table kern_table[] = {
988#endif 897#endif
989#ifdef CONFIG_PERF_EVENTS 898#ifdef CONFIG_PERF_EVENTS
990 { 899 {
991 .ctl_name = CTL_UNNUMBERED,
992 .procname = "perf_event_paranoid", 900 .procname = "perf_event_paranoid",
993 .data = &sysctl_perf_event_paranoid, 901 .data = &sysctl_perf_event_paranoid,
994 .maxlen = sizeof(sysctl_perf_event_paranoid), 902 .maxlen = sizeof(sysctl_perf_event_paranoid),
995 .mode = 0644, 903 .mode = 0644,
996 .proc_handler = &proc_dointvec, 904 .proc_handler = proc_dointvec,
997 }, 905 },
998 { 906 {
999 .ctl_name = CTL_UNNUMBERED,
1000 .procname = "perf_event_mlock_kb", 907 .procname = "perf_event_mlock_kb",
1001 .data = &sysctl_perf_event_mlock, 908 .data = &sysctl_perf_event_mlock,
1002 .maxlen = sizeof(sysctl_perf_event_mlock), 909 .maxlen = sizeof(sysctl_perf_event_mlock),
1003 .mode = 0644, 910 .mode = 0644,
1004 .proc_handler = &proc_dointvec, 911 .proc_handler = proc_dointvec,
1005 }, 912 },
1006 { 913 {
1007 .ctl_name = CTL_UNNUMBERED,
1008 .procname = "perf_event_max_sample_rate", 914 .procname = "perf_event_max_sample_rate",
1009 .data = &sysctl_perf_event_sample_rate, 915 .data = &sysctl_perf_event_sample_rate,
1010 .maxlen = sizeof(sysctl_perf_event_sample_rate), 916 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1011 .mode = 0644, 917 .mode = 0644,
1012 .proc_handler = &proc_dointvec, 918 .proc_handler = proc_dointvec,
1013 }, 919 },
1014#endif 920#endif
1015#ifdef CONFIG_KMEMCHECK 921#ifdef CONFIG_KMEMCHECK
1016 { 922 {
1017 .ctl_name = CTL_UNNUMBERED,
1018 .procname = "kmemcheck", 923 .procname = "kmemcheck",
1019 .data = &kmemcheck_enabled, 924 .data = &kmemcheck_enabled,
1020 .maxlen = sizeof(int), 925 .maxlen = sizeof(int),
1021 .mode = 0644, 926 .mode = 0644,
1022 .proc_handler = &proc_dointvec, 927 .proc_handler = proc_dointvec,
1023 }, 928 },
1024#endif 929#endif
1025#ifdef CONFIG_BLOCK 930#ifdef CONFIG_BLOCK
1026 { 931 {
1027 .ctl_name = CTL_UNNUMBERED,
1028 .procname = "blk_iopoll", 932 .procname = "blk_iopoll",
1029 .data = &blk_iopoll_enabled, 933 .data = &blk_iopoll_enabled,
1030 .maxlen = sizeof(int), 934 .maxlen = sizeof(int),
1031 .mode = 0644, 935 .mode = 0644,
1032 .proc_handler = &proc_dointvec, 936 .proc_handler = proc_dointvec,
1033 }, 937 },
1034#endif 938#endif
1035/* 939/*
1036 * NOTE: do not add new entries to this table unless you have read 940 * NOTE: do not add new entries to this table unless you have read
1037 * Documentation/sysctl/ctl_unnumbered.txt 941 * Documentation/sysctl/ctl_unnumbered.txt
1038 */ 942 */
1039 { .ctl_name = 0 } 943 { }
1040}; 944};
1041 945
1042static struct ctl_table vm_table[] = { 946static struct ctl_table vm_table[] = {
1043 { 947 {
1044 .ctl_name = VM_OVERCOMMIT_MEMORY,
1045 .procname = "overcommit_memory", 948 .procname = "overcommit_memory",
1046 .data = &sysctl_overcommit_memory, 949 .data = &sysctl_overcommit_memory,
1047 .maxlen = sizeof(sysctl_overcommit_memory), 950 .maxlen = sizeof(sysctl_overcommit_memory),
1048 .mode = 0644, 951 .mode = 0644,
1049 .proc_handler = &proc_dointvec, 952 .proc_handler = proc_dointvec,
1050 }, 953 },
1051 { 954 {
1052 .ctl_name = VM_PANIC_ON_OOM,
1053 .procname = "panic_on_oom", 955 .procname = "panic_on_oom",
1054 .data = &sysctl_panic_on_oom, 956 .data = &sysctl_panic_on_oom,
1055 .maxlen = sizeof(sysctl_panic_on_oom), 957 .maxlen = sizeof(sysctl_panic_on_oom),
1056 .mode = 0644, 958 .mode = 0644,
1057 .proc_handler = &proc_dointvec, 959 .proc_handler = proc_dointvec,
1058 }, 960 },
1059 { 961 {
1060 .ctl_name = CTL_UNNUMBERED,
1061 .procname = "oom_kill_allocating_task", 962 .procname = "oom_kill_allocating_task",
1062 .data = &sysctl_oom_kill_allocating_task, 963 .data = &sysctl_oom_kill_allocating_task,
1063 .maxlen = sizeof(sysctl_oom_kill_allocating_task), 964 .maxlen = sizeof(sysctl_oom_kill_allocating_task),
1064 .mode = 0644, 965 .mode = 0644,
1065 .proc_handler = &proc_dointvec, 966 .proc_handler = proc_dointvec,
1066 }, 967 },
1067 { 968 {
1068 .ctl_name = CTL_UNNUMBERED,
1069 .procname = "oom_dump_tasks", 969 .procname = "oom_dump_tasks",
1070 .data = &sysctl_oom_dump_tasks, 970 .data = &sysctl_oom_dump_tasks,
1071 .maxlen = sizeof(sysctl_oom_dump_tasks), 971 .maxlen = sizeof(sysctl_oom_dump_tasks),
1072 .mode = 0644, 972 .mode = 0644,
1073 .proc_handler = &proc_dointvec, 973 .proc_handler = proc_dointvec,
1074 }, 974 },
1075 { 975 {
1076 .ctl_name = VM_OVERCOMMIT_RATIO,
1077 .procname = "overcommit_ratio", 976 .procname = "overcommit_ratio",
1078 .data = &sysctl_overcommit_ratio, 977 .data = &sysctl_overcommit_ratio,
1079 .maxlen = sizeof(sysctl_overcommit_ratio), 978 .maxlen = sizeof(sysctl_overcommit_ratio),
1080 .mode = 0644, 979 .mode = 0644,
1081 .proc_handler = &proc_dointvec, 980 .proc_handler = proc_dointvec,
1082 }, 981 },
1083 { 982 {
1084 .ctl_name = VM_PAGE_CLUSTER,
1085 .procname = "page-cluster", 983 .procname = "page-cluster",
1086 .data = &page_cluster, 984 .data = &page_cluster,
1087 .maxlen = sizeof(int), 985 .maxlen = sizeof(int),
1088 .mode = 0644, 986 .mode = 0644,
1089 .proc_handler = &proc_dointvec, 987 .proc_handler = proc_dointvec,
1090 }, 988 },
1091 { 989 {
1092 .ctl_name = VM_DIRTY_BACKGROUND,
1093 .procname = "dirty_background_ratio", 990 .procname = "dirty_background_ratio",
1094 .data = &dirty_background_ratio, 991 .data = &dirty_background_ratio,
1095 .maxlen = sizeof(dirty_background_ratio), 992 .maxlen = sizeof(dirty_background_ratio),
1096 .mode = 0644, 993 .mode = 0644,
1097 .proc_handler = &dirty_background_ratio_handler, 994 .proc_handler = dirty_background_ratio_handler,
1098 .strategy = &sysctl_intvec,
1099 .extra1 = &zero, 995 .extra1 = &zero,
1100 .extra2 = &one_hundred, 996 .extra2 = &one_hundred,
1101 }, 997 },
1102 { 998 {
1103 .ctl_name = CTL_UNNUMBERED,
1104 .procname = "dirty_background_bytes", 999 .procname = "dirty_background_bytes",
1105 .data = &dirty_background_bytes, 1000 .data = &dirty_background_bytes,
1106 .maxlen = sizeof(dirty_background_bytes), 1001 .maxlen = sizeof(dirty_background_bytes),
1107 .mode = 0644, 1002 .mode = 0644,
1108 .proc_handler = &dirty_background_bytes_handler, 1003 .proc_handler = dirty_background_bytes_handler,
1109 .strategy = &sysctl_intvec,
1110 .extra1 = &one_ul, 1004 .extra1 = &one_ul,
1111 }, 1005 },
1112 { 1006 {
1113 .ctl_name = VM_DIRTY_RATIO,
1114 .procname = "dirty_ratio", 1007 .procname = "dirty_ratio",
1115 .data = &vm_dirty_ratio, 1008 .data = &vm_dirty_ratio,
1116 .maxlen = sizeof(vm_dirty_ratio), 1009 .maxlen = sizeof(vm_dirty_ratio),
1117 .mode = 0644, 1010 .mode = 0644,
1118 .proc_handler = &dirty_ratio_handler, 1011 .proc_handler = dirty_ratio_handler,
1119 .strategy = &sysctl_intvec,
1120 .extra1 = &zero, 1012 .extra1 = &zero,
1121 .extra2 = &one_hundred, 1013 .extra2 = &one_hundred,
1122 }, 1014 },
1123 { 1015 {
1124 .ctl_name = CTL_UNNUMBERED,
1125 .procname = "dirty_bytes", 1016 .procname = "dirty_bytes",
1126 .data = &vm_dirty_bytes, 1017 .data = &vm_dirty_bytes,
1127 .maxlen = sizeof(vm_dirty_bytes), 1018 .maxlen = sizeof(vm_dirty_bytes),
1128 .mode = 0644, 1019 .mode = 0644,
1129 .proc_handler = &dirty_bytes_handler, 1020 .proc_handler = dirty_bytes_handler,
1130 .strategy = &sysctl_intvec,
1131 .extra1 = &dirty_bytes_min, 1021 .extra1 = &dirty_bytes_min,
1132 }, 1022 },
1133 { 1023 {
@@ -1135,31 +1025,28 @@ static struct ctl_table vm_table[] = {
1135 .data = &dirty_writeback_interval, 1025 .data = &dirty_writeback_interval,
1136 .maxlen = sizeof(dirty_writeback_interval), 1026 .maxlen = sizeof(dirty_writeback_interval),
1137 .mode = 0644, 1027 .mode = 0644,
1138 .proc_handler = &dirty_writeback_centisecs_handler, 1028 .proc_handler = dirty_writeback_centisecs_handler,
1139 }, 1029 },
1140 { 1030 {
1141 .procname = "dirty_expire_centisecs", 1031 .procname = "dirty_expire_centisecs",
1142 .data = &dirty_expire_interval, 1032 .data = &dirty_expire_interval,
1143 .maxlen = sizeof(dirty_expire_interval), 1033 .maxlen = sizeof(dirty_expire_interval),
1144 .mode = 0644, 1034 .mode = 0644,
1145 .proc_handler = &proc_dointvec, 1035 .proc_handler = proc_dointvec,
1146 }, 1036 },
1147 { 1037 {
1148 .ctl_name = VM_NR_PDFLUSH_THREADS,
1149 .procname = "nr_pdflush_threads", 1038 .procname = "nr_pdflush_threads",
1150 .data = &nr_pdflush_threads, 1039 .data = &nr_pdflush_threads,
1151 .maxlen = sizeof nr_pdflush_threads, 1040 .maxlen = sizeof nr_pdflush_threads,
1152 .mode = 0444 /* read-only*/, 1041 .mode = 0444 /* read-only*/,
1153 .proc_handler = &proc_dointvec, 1042 .proc_handler = proc_dointvec,
1154 }, 1043 },
1155 { 1044 {
1156 .ctl_name = VM_SWAPPINESS,
1157 .procname = "swappiness", 1045 .procname = "swappiness",
1158 .data = &vm_swappiness, 1046 .data = &vm_swappiness,
1159 .maxlen = sizeof(vm_swappiness), 1047 .maxlen = sizeof(vm_swappiness),
1160 .mode = 0644, 1048 .mode = 0644,
1161 .proc_handler = &proc_dointvec_minmax, 1049 .proc_handler = proc_dointvec_minmax,
1162 .strategy = &sysctl_intvec,
1163 .extra1 = &zero, 1050 .extra1 = &zero,
1164 .extra2 = &one_hundred, 1051 .extra2 = &one_hundred,
1165 }, 1052 },
@@ -1169,255 +1056,213 @@ static struct ctl_table vm_table[] = {
1169 .data = NULL, 1056 .data = NULL,
1170 .maxlen = sizeof(unsigned long), 1057 .maxlen = sizeof(unsigned long),
1171 .mode = 0644, 1058 .mode = 0644,
1172 .proc_handler = &hugetlb_sysctl_handler, 1059 .proc_handler = hugetlb_sysctl_handler,
1173 .extra1 = (void *)&hugetlb_zero, 1060 .extra1 = (void *)&hugetlb_zero,
1174 .extra2 = (void *)&hugetlb_infinity, 1061 .extra2 = (void *)&hugetlb_infinity,
1175 }, 1062 },
1176 { 1063 {
1177 .ctl_name = VM_HUGETLB_GROUP,
1178 .procname = "hugetlb_shm_group", 1064 .procname = "hugetlb_shm_group",
1179 .data = &sysctl_hugetlb_shm_group, 1065 .data = &sysctl_hugetlb_shm_group,
1180 .maxlen = sizeof(gid_t), 1066 .maxlen = sizeof(gid_t),
1181 .mode = 0644, 1067 .mode = 0644,
1182 .proc_handler = &proc_dointvec, 1068 .proc_handler = proc_dointvec,
1183 }, 1069 },
1184 { 1070 {
1185 .ctl_name = CTL_UNNUMBERED,
1186 .procname = "hugepages_treat_as_movable", 1071 .procname = "hugepages_treat_as_movable",
1187 .data = &hugepages_treat_as_movable, 1072 .data = &hugepages_treat_as_movable,
1188 .maxlen = sizeof(int), 1073 .maxlen = sizeof(int),
1189 .mode = 0644, 1074 .mode = 0644,
1190 .proc_handler = &hugetlb_treat_movable_handler, 1075 .proc_handler = hugetlb_treat_movable_handler,
1191 }, 1076 },
1192 { 1077 {
1193 .ctl_name = CTL_UNNUMBERED,
1194 .procname = "nr_overcommit_hugepages", 1078 .procname = "nr_overcommit_hugepages",
1195 .data = NULL, 1079 .data = NULL,
1196 .maxlen = sizeof(unsigned long), 1080 .maxlen = sizeof(unsigned long),
1197 .mode = 0644, 1081 .mode = 0644,
1198 .proc_handler = &hugetlb_overcommit_handler, 1082 .proc_handler = hugetlb_overcommit_handler,
1199 .extra1 = (void *)&hugetlb_zero, 1083 .extra1 = (void *)&hugetlb_zero,
1200 .extra2 = (void *)&hugetlb_infinity, 1084 .extra2 = (void *)&hugetlb_infinity,
1201 }, 1085 },
1202#endif 1086#endif
1203 { 1087 {
1204 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
1205 .procname = "lowmem_reserve_ratio", 1088 .procname = "lowmem_reserve_ratio",
1206 .data = &sysctl_lowmem_reserve_ratio, 1089 .data = &sysctl_lowmem_reserve_ratio,
1207 .maxlen = sizeof(sysctl_lowmem_reserve_ratio), 1090 .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
1208 .mode = 0644, 1091 .mode = 0644,
1209 .proc_handler = &lowmem_reserve_ratio_sysctl_handler, 1092 .proc_handler = lowmem_reserve_ratio_sysctl_handler,
1210 .strategy = &sysctl_intvec,
1211 }, 1093 },
1212 { 1094 {
1213 .ctl_name = VM_DROP_PAGECACHE,
1214 .procname = "drop_caches", 1095 .procname = "drop_caches",
1215 .data = &sysctl_drop_caches, 1096 .data = &sysctl_drop_caches,
1216 .maxlen = sizeof(int), 1097 .maxlen = sizeof(int),
1217 .mode = 0644, 1098 .mode = 0644,
1218 .proc_handler = drop_caches_sysctl_handler, 1099 .proc_handler = drop_caches_sysctl_handler,
1219 .strategy = &sysctl_intvec,
1220 }, 1100 },
1221 { 1101 {
1222 .ctl_name = VM_MIN_FREE_KBYTES,
1223 .procname = "min_free_kbytes", 1102 .procname = "min_free_kbytes",
1224 .data = &min_free_kbytes, 1103 .data = &min_free_kbytes,
1225 .maxlen = sizeof(min_free_kbytes), 1104 .maxlen = sizeof(min_free_kbytes),
1226 .mode = 0644, 1105 .mode = 0644,
1227 .proc_handler = &min_free_kbytes_sysctl_handler, 1106 .proc_handler = min_free_kbytes_sysctl_handler,
1228 .strategy = &sysctl_intvec,
1229 .extra1 = &zero, 1107 .extra1 = &zero,
1230 }, 1108 },
1231 { 1109 {
1232 .ctl_name = VM_PERCPU_PAGELIST_FRACTION,
1233 .procname = "percpu_pagelist_fraction", 1110 .procname = "percpu_pagelist_fraction",
1234 .data = &percpu_pagelist_fraction, 1111 .data = &percpu_pagelist_fraction,
1235 .maxlen = sizeof(percpu_pagelist_fraction), 1112 .maxlen = sizeof(percpu_pagelist_fraction),
1236 .mode = 0644, 1113 .mode = 0644,
1237 .proc_handler = &percpu_pagelist_fraction_sysctl_handler, 1114 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
1238 .strategy = &sysctl_intvec,
1239 .extra1 = &min_percpu_pagelist_fract, 1115 .extra1 = &min_percpu_pagelist_fract,
1240 }, 1116 },
1241#ifdef CONFIG_MMU 1117#ifdef CONFIG_MMU
1242 { 1118 {
1243 .ctl_name = VM_MAX_MAP_COUNT,
1244 .procname = "max_map_count", 1119 .procname = "max_map_count",
1245 .data = &sysctl_max_map_count, 1120 .data = &sysctl_max_map_count,
1246 .maxlen = sizeof(sysctl_max_map_count), 1121 .maxlen = sizeof(sysctl_max_map_count),
1247 .mode = 0644, 1122 .mode = 0644,
1248 .proc_handler = &proc_dointvec 1123 .proc_handler = proc_dointvec
1249 }, 1124 },
1250#else 1125#else
1251 { 1126 {
1252 .ctl_name = CTL_UNNUMBERED,
1253 .procname = "nr_trim_pages", 1127 .procname = "nr_trim_pages",
1254 .data = &sysctl_nr_trim_pages, 1128 .data = &sysctl_nr_trim_pages,
1255 .maxlen = sizeof(sysctl_nr_trim_pages), 1129 .maxlen = sizeof(sysctl_nr_trim_pages),
1256 .mode = 0644, 1130 .mode = 0644,
1257 .proc_handler = &proc_dointvec_minmax, 1131 .proc_handler = proc_dointvec_minmax,
1258 .strategy = &sysctl_intvec,
1259 .extra1 = &zero, 1132 .extra1 = &zero,
1260 }, 1133 },
1261#endif 1134#endif
1262 { 1135 {
1263 .ctl_name = VM_LAPTOP_MODE,
1264 .procname = "laptop_mode", 1136 .procname = "laptop_mode",
1265 .data = &laptop_mode, 1137 .data = &laptop_mode,
1266 .maxlen = sizeof(laptop_mode), 1138 .maxlen = sizeof(laptop_mode),
1267 .mode = 0644, 1139 .mode = 0644,
1268 .proc_handler = &proc_dointvec_jiffies, 1140 .proc_handler = proc_dointvec_jiffies,
1269 .strategy = &sysctl_jiffies,
1270 }, 1141 },
1271 { 1142 {
1272 .ctl_name = VM_BLOCK_DUMP,
1273 .procname = "block_dump", 1143 .procname = "block_dump",
1274 .data = &block_dump, 1144 .data = &block_dump,
1275 .maxlen = sizeof(block_dump), 1145 .maxlen = sizeof(block_dump),
1276 .mode = 0644, 1146 .mode = 0644,
1277 .proc_handler = &proc_dointvec, 1147 .proc_handler = proc_dointvec,
1278 .strategy = &sysctl_intvec,
1279 .extra1 = &zero, 1148 .extra1 = &zero,
1280 }, 1149 },
1281 { 1150 {
1282 .ctl_name = VM_VFS_CACHE_PRESSURE,
1283 .procname = "vfs_cache_pressure", 1151 .procname = "vfs_cache_pressure",
1284 .data = &sysctl_vfs_cache_pressure, 1152 .data = &sysctl_vfs_cache_pressure,
1285 .maxlen = sizeof(sysctl_vfs_cache_pressure), 1153 .maxlen = sizeof(sysctl_vfs_cache_pressure),
1286 .mode = 0644, 1154 .mode = 0644,
1287 .proc_handler = &proc_dointvec, 1155 .proc_handler = proc_dointvec,
1288 .strategy = &sysctl_intvec,
1289 .extra1 = &zero, 1156 .extra1 = &zero,
1290 }, 1157 },
1291#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1158#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
1292 { 1159 {
1293 .ctl_name = VM_LEGACY_VA_LAYOUT,
1294 .procname = "legacy_va_layout", 1160 .procname = "legacy_va_layout",
1295 .data = &sysctl_legacy_va_layout, 1161 .data = &sysctl_legacy_va_layout,
1296 .maxlen = sizeof(sysctl_legacy_va_layout), 1162 .maxlen = sizeof(sysctl_legacy_va_layout),
1297 .mode = 0644, 1163 .mode = 0644,
1298 .proc_handler = &proc_dointvec, 1164 .proc_handler = proc_dointvec,
1299 .strategy = &sysctl_intvec,
1300 .extra1 = &zero, 1165 .extra1 = &zero,
1301 }, 1166 },
1302#endif 1167#endif
1303#ifdef CONFIG_NUMA 1168#ifdef CONFIG_NUMA
1304 { 1169 {
1305 .ctl_name = VM_ZONE_RECLAIM_MODE,
1306 .procname = "zone_reclaim_mode", 1170 .procname = "zone_reclaim_mode",
1307 .data = &zone_reclaim_mode, 1171 .data = &zone_reclaim_mode,
1308 .maxlen = sizeof(zone_reclaim_mode), 1172 .maxlen = sizeof(zone_reclaim_mode),
1309 .mode = 0644, 1173 .mode = 0644,
1310 .proc_handler = &proc_dointvec, 1174 .proc_handler = proc_dointvec,
1311 .strategy = &sysctl_intvec,
1312 .extra1 = &zero, 1175 .extra1 = &zero,
1313 }, 1176 },
1314 { 1177 {
1315 .ctl_name = VM_MIN_UNMAPPED,
1316 .procname = "min_unmapped_ratio", 1178 .procname = "min_unmapped_ratio",
1317 .data = &sysctl_min_unmapped_ratio, 1179 .data = &sysctl_min_unmapped_ratio,
1318 .maxlen = sizeof(sysctl_min_unmapped_ratio), 1180 .maxlen = sizeof(sysctl_min_unmapped_ratio),
1319 .mode = 0644, 1181 .mode = 0644,
1320 .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, 1182 .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
1321 .strategy = &sysctl_intvec,
1322 .extra1 = &zero, 1183 .extra1 = &zero,
1323 .extra2 = &one_hundred, 1184 .extra2 = &one_hundred,
1324 }, 1185 },
1325 { 1186 {
1326 .ctl_name = VM_MIN_SLAB,
1327 .procname = "min_slab_ratio", 1187 .procname = "min_slab_ratio",
1328 .data = &sysctl_min_slab_ratio, 1188 .data = &sysctl_min_slab_ratio,
1329 .maxlen = sizeof(sysctl_min_slab_ratio), 1189 .maxlen = sizeof(sysctl_min_slab_ratio),
1330 .mode = 0644, 1190 .mode = 0644,
1331 .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, 1191 .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
1332 .strategy = &sysctl_intvec,
1333 .extra1 = &zero, 1192 .extra1 = &zero,
1334 .extra2 = &one_hundred, 1193 .extra2 = &one_hundred,
1335 }, 1194 },
1336#endif 1195#endif
1337#ifdef CONFIG_SMP 1196#ifdef CONFIG_SMP
1338 { 1197 {
1339 .ctl_name = CTL_UNNUMBERED,
1340 .procname = "stat_interval", 1198 .procname = "stat_interval",
1341 .data = &sysctl_stat_interval, 1199 .data = &sysctl_stat_interval,
1342 .maxlen = sizeof(sysctl_stat_interval), 1200 .maxlen = sizeof(sysctl_stat_interval),
1343 .mode = 0644, 1201 .mode = 0644,
1344 .proc_handler = &proc_dointvec_jiffies, 1202 .proc_handler = proc_dointvec_jiffies,
1345 .strategy = &sysctl_jiffies,
1346 }, 1203 },
1347#endif 1204#endif
1348 { 1205 {
1349 .ctl_name = CTL_UNNUMBERED,
1350 .procname = "mmap_min_addr", 1206 .procname = "mmap_min_addr",
1351 .data = &dac_mmap_min_addr, 1207 .data = &dac_mmap_min_addr,
1352 .maxlen = sizeof(unsigned long), 1208 .maxlen = sizeof(unsigned long),
1353 .mode = 0644, 1209 .mode = 0644,
1354 .proc_handler = &mmap_min_addr_handler, 1210 .proc_handler = mmap_min_addr_handler,
1355 }, 1211 },
1356#ifdef CONFIG_NUMA 1212#ifdef CONFIG_NUMA
1357 { 1213 {
1358 .ctl_name = CTL_UNNUMBERED,
1359 .procname = "numa_zonelist_order", 1214 .procname = "numa_zonelist_order",
1360 .data = &numa_zonelist_order, 1215 .data = &numa_zonelist_order,
1361 .maxlen = NUMA_ZONELIST_ORDER_LEN, 1216 .maxlen = NUMA_ZONELIST_ORDER_LEN,
1362 .mode = 0644, 1217 .mode = 0644,
1363 .proc_handler = &numa_zonelist_order_handler, 1218 .proc_handler = numa_zonelist_order_handler,
1364 .strategy = &sysctl_string,
1365 }, 1219 },
1366#endif 1220#endif
1367#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ 1221#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
1368 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1222 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1369 { 1223 {
1370 .ctl_name = VM_VDSO_ENABLED,
1371 .procname = "vdso_enabled", 1224 .procname = "vdso_enabled",
1372 .data = &vdso_enabled, 1225 .data = &vdso_enabled,
1373 .maxlen = sizeof(vdso_enabled), 1226 .maxlen = sizeof(vdso_enabled),
1374 .mode = 0644, 1227 .mode = 0644,
1375 .proc_handler = &proc_dointvec, 1228 .proc_handler = proc_dointvec,
1376 .strategy = &sysctl_intvec,
1377 .extra1 = &zero, 1229 .extra1 = &zero,
1378 }, 1230 },
1379#endif 1231#endif
1380#ifdef CONFIG_HIGHMEM 1232#ifdef CONFIG_HIGHMEM
1381 { 1233 {
1382 .ctl_name = CTL_UNNUMBERED,
1383 .procname = "highmem_is_dirtyable", 1234 .procname = "highmem_is_dirtyable",
1384 .data = &vm_highmem_is_dirtyable, 1235 .data = &vm_highmem_is_dirtyable,
1385 .maxlen = sizeof(vm_highmem_is_dirtyable), 1236 .maxlen = sizeof(vm_highmem_is_dirtyable),
1386 .mode = 0644, 1237 .mode = 0644,
1387 .proc_handler = &proc_dointvec_minmax, 1238 .proc_handler = proc_dointvec_minmax,
1388 .strategy = &sysctl_intvec,
1389 .extra1 = &zero, 1239 .extra1 = &zero,
1390 .extra2 = &one, 1240 .extra2 = &one,
1391 }, 1241 },
1392#endif 1242#endif
1393 { 1243 {
1394 .ctl_name = CTL_UNNUMBERED,
1395 .procname = "scan_unevictable_pages", 1244 .procname = "scan_unevictable_pages",
1396 .data = &scan_unevictable_pages, 1245 .data = &scan_unevictable_pages,
1397 .maxlen = sizeof(scan_unevictable_pages), 1246 .maxlen = sizeof(scan_unevictable_pages),
1398 .mode = 0644, 1247 .mode = 0644,
1399 .proc_handler = &scan_unevictable_handler, 1248 .proc_handler = scan_unevictable_handler,
1400 }, 1249 },
1401#ifdef CONFIG_MEMORY_FAILURE 1250#ifdef CONFIG_MEMORY_FAILURE
1402 { 1251 {
1403 .ctl_name = CTL_UNNUMBERED,
1404 .procname = "memory_failure_early_kill", 1252 .procname = "memory_failure_early_kill",
1405 .data = &sysctl_memory_failure_early_kill, 1253 .data = &sysctl_memory_failure_early_kill,
1406 .maxlen = sizeof(sysctl_memory_failure_early_kill), 1254 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1407 .mode = 0644, 1255 .mode = 0644,
1408 .proc_handler = &proc_dointvec_minmax, 1256 .proc_handler = proc_dointvec_minmax,
1409 .strategy = &sysctl_intvec,
1410 .extra1 = &zero, 1257 .extra1 = &zero,
1411 .extra2 = &one, 1258 .extra2 = &one,
1412 }, 1259 },
1413 { 1260 {
1414 .ctl_name = CTL_UNNUMBERED,
1415 .procname = "memory_failure_recovery", 1261 .procname = "memory_failure_recovery",
1416 .data = &sysctl_memory_failure_recovery, 1262 .data = &sysctl_memory_failure_recovery,
1417 .maxlen = sizeof(sysctl_memory_failure_recovery), 1263 .maxlen = sizeof(sysctl_memory_failure_recovery),
1418 .mode = 0644, 1264 .mode = 0644,
1419 .proc_handler = &proc_dointvec_minmax, 1265 .proc_handler = proc_dointvec_minmax,
1420 .strategy = &sysctl_intvec,
1421 .extra1 = &zero, 1266 .extra1 = &zero,
1422 .extra2 = &one, 1267 .extra2 = &one,
1423 }, 1268 },
@@ -1427,116 +1272,104 @@ static struct ctl_table vm_table[] = {
1427 * NOTE: do not add new entries to this table unless you have read 1272 * NOTE: do not add new entries to this table unless you have read
1428 * Documentation/sysctl/ctl_unnumbered.txt 1273 * Documentation/sysctl/ctl_unnumbered.txt
1429 */ 1274 */
1430 { .ctl_name = 0 } 1275 { }
1431}; 1276};
1432 1277
1433#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1278#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1434static struct ctl_table binfmt_misc_table[] = { 1279static struct ctl_table binfmt_misc_table[] = {
1435 { .ctl_name = 0 } 1280 { }
1436}; 1281};
1437#endif 1282#endif
1438 1283
1439static struct ctl_table fs_table[] = { 1284static struct ctl_table fs_table[] = {
1440 { 1285 {
1441 .ctl_name = FS_NRINODE,
1442 .procname = "inode-nr", 1286 .procname = "inode-nr",
1443 .data = &inodes_stat, 1287 .data = &inodes_stat,
1444 .maxlen = 2*sizeof(int), 1288 .maxlen = 2*sizeof(int),
1445 .mode = 0444, 1289 .mode = 0444,
1446 .proc_handler = &proc_dointvec, 1290 .proc_handler = proc_dointvec,
1447 }, 1291 },
1448 { 1292 {
1449 .ctl_name = FS_STATINODE,
1450 .procname = "inode-state", 1293 .procname = "inode-state",
1451 .data = &inodes_stat, 1294 .data = &inodes_stat,
1452 .maxlen = 7*sizeof(int), 1295 .maxlen = 7*sizeof(int),
1453 .mode = 0444, 1296 .mode = 0444,
1454 .proc_handler = &proc_dointvec, 1297 .proc_handler = proc_dointvec,
1455 }, 1298 },
1456 { 1299 {
1457 .procname = "file-nr", 1300 .procname = "file-nr",
1458 .data = &files_stat, 1301 .data = &files_stat,
1459 .maxlen = 3*sizeof(int), 1302 .maxlen = 3*sizeof(int),
1460 .mode = 0444, 1303 .mode = 0444,
1461 .proc_handler = &proc_nr_files, 1304 .proc_handler = proc_nr_files,
1462 }, 1305 },
1463 { 1306 {
1464 .ctl_name = FS_MAXFILE,
1465 .procname = "file-max", 1307 .procname = "file-max",
1466 .data = &files_stat.max_files, 1308 .data = &files_stat.max_files,
1467 .maxlen = sizeof(int), 1309 .maxlen = sizeof(int),
1468 .mode = 0644, 1310 .mode = 0644,
1469 .proc_handler = &proc_dointvec, 1311 .proc_handler = proc_dointvec,
1470 }, 1312 },
1471 { 1313 {
1472 .ctl_name = CTL_UNNUMBERED,
1473 .procname = "nr_open", 1314 .procname = "nr_open",
1474 .data = &sysctl_nr_open, 1315 .data = &sysctl_nr_open,
1475 .maxlen = sizeof(int), 1316 .maxlen = sizeof(int),
1476 .mode = 0644, 1317 .mode = 0644,
1477 .proc_handler = &proc_dointvec_minmax, 1318 .proc_handler = proc_dointvec_minmax,
1478 .extra1 = &sysctl_nr_open_min, 1319 .extra1 = &sysctl_nr_open_min,
1479 .extra2 = &sysctl_nr_open_max, 1320 .extra2 = &sysctl_nr_open_max,
1480 }, 1321 },
1481 { 1322 {
1482 .ctl_name = FS_DENTRY,
1483 .procname = "dentry-state", 1323 .procname = "dentry-state",
1484 .data = &dentry_stat, 1324 .data = &dentry_stat,
1485 .maxlen = 6*sizeof(int), 1325 .maxlen = 6*sizeof(int),
1486 .mode = 0444, 1326 .mode = 0444,
1487 .proc_handler = &proc_dointvec, 1327 .proc_handler = proc_dointvec,
1488 }, 1328 },
1489 { 1329 {
1490 .ctl_name = FS_OVERFLOWUID,
1491 .procname = "overflowuid", 1330 .procname = "overflowuid",
1492 .data = &fs_overflowuid, 1331 .data = &fs_overflowuid,
1493 .maxlen = sizeof(int), 1332 .maxlen = sizeof(int),
1494 .mode = 0644, 1333 .mode = 0644,
1495 .proc_handler = &proc_dointvec_minmax, 1334 .proc_handler = proc_dointvec_minmax,
1496 .strategy = &sysctl_intvec,
1497 .extra1 = &minolduid, 1335 .extra1 = &minolduid,
1498 .extra2 = &maxolduid, 1336 .extra2 = &maxolduid,
1499 }, 1337 },
1500 { 1338 {
1501 .ctl_name = FS_OVERFLOWGID,
1502 .procname = "overflowgid", 1339 .procname = "overflowgid",
1503 .data = &fs_overflowgid, 1340 .data = &fs_overflowgid,
1504 .maxlen = sizeof(int), 1341 .maxlen = sizeof(int),
1505 .mode = 0644, 1342 .mode = 0644,
1506 .proc_handler = &proc_dointvec_minmax, 1343 .proc_handler = proc_dointvec_minmax,
1507 .strategy = &sysctl_intvec,
1508 .extra1 = &minolduid, 1344 .extra1 = &minolduid,
1509 .extra2 = &maxolduid, 1345 .extra2 = &maxolduid,
1510 }, 1346 },
1511#ifdef CONFIG_FILE_LOCKING 1347#ifdef CONFIG_FILE_LOCKING
1512 { 1348 {
1513 .ctl_name = FS_LEASES,
1514 .procname = "leases-enable", 1349 .procname = "leases-enable",
1515 .data = &leases_enable, 1350 .data = &leases_enable,
1516 .maxlen = sizeof(int), 1351 .maxlen = sizeof(int),
1517 .mode = 0644, 1352 .mode = 0644,
1518 .proc_handler = &proc_dointvec, 1353 .proc_handler = proc_dointvec,
1519 }, 1354 },
1520#endif 1355#endif
1521#ifdef CONFIG_DNOTIFY 1356#ifdef CONFIG_DNOTIFY
1522 { 1357 {
1523 .ctl_name = FS_DIR_NOTIFY,
1524 .procname = "dir-notify-enable", 1358 .procname = "dir-notify-enable",
1525 .data = &dir_notify_enable, 1359 .data = &dir_notify_enable,
1526 .maxlen = sizeof(int), 1360 .maxlen = sizeof(int),
1527 .mode = 0644, 1361 .mode = 0644,
1528 .proc_handler = &proc_dointvec, 1362 .proc_handler = proc_dointvec,
1529 }, 1363 },
1530#endif 1364#endif
1531#ifdef CONFIG_MMU 1365#ifdef CONFIG_MMU
1532#ifdef CONFIG_FILE_LOCKING 1366#ifdef CONFIG_FILE_LOCKING
1533 { 1367 {
1534 .ctl_name = FS_LEASE_TIME,
1535 .procname = "lease-break-time", 1368 .procname = "lease-break-time",
1536 .data = &lease_break_time, 1369 .data = &lease_break_time,
1537 .maxlen = sizeof(int), 1370 .maxlen = sizeof(int),
1538 .mode = 0644, 1371 .mode = 0644,
1539 .proc_handler = &proc_dointvec, 1372 .proc_handler = proc_dointvec,
1540 }, 1373 },
1541#endif 1374#endif
1542#ifdef CONFIG_AIO 1375#ifdef CONFIG_AIO
@@ -1545,19 +1378,18 @@ static struct ctl_table fs_table[] = {
1545 .data = &aio_nr, 1378 .data = &aio_nr,
1546 .maxlen = sizeof(aio_nr), 1379 .maxlen = sizeof(aio_nr),
1547 .mode = 0444, 1380 .mode = 0444,
1548 .proc_handler = &proc_doulongvec_minmax, 1381 .proc_handler = proc_doulongvec_minmax,
1549 }, 1382 },
1550 { 1383 {
1551 .procname = "aio-max-nr", 1384 .procname = "aio-max-nr",
1552 .data = &aio_max_nr, 1385 .data = &aio_max_nr,
1553 .maxlen = sizeof(aio_max_nr), 1386 .maxlen = sizeof(aio_max_nr),
1554 .mode = 0644, 1387 .mode = 0644,
1555 .proc_handler = &proc_doulongvec_minmax, 1388 .proc_handler = proc_doulongvec_minmax,
1556 }, 1389 },
1557#endif /* CONFIG_AIO */ 1390#endif /* CONFIG_AIO */
1558#ifdef CONFIG_INOTIFY_USER 1391#ifdef CONFIG_INOTIFY_USER
1559 { 1392 {
1560 .ctl_name = FS_INOTIFY,
1561 .procname = "inotify", 1393 .procname = "inotify",
1562 .mode = 0555, 1394 .mode = 0555,
1563 .child = inotify_table, 1395 .child = inotify_table,
@@ -1572,19 +1404,16 @@ static struct ctl_table fs_table[] = {
1572#endif 1404#endif
1573#endif 1405#endif
1574 { 1406 {
1575 .ctl_name = KERN_SETUID_DUMPABLE,
1576 .procname = "suid_dumpable", 1407 .procname = "suid_dumpable",
1577 .data = &suid_dumpable, 1408 .data = &suid_dumpable,
1578 .maxlen = sizeof(int), 1409 .maxlen = sizeof(int),
1579 .mode = 0644, 1410 .mode = 0644,
1580 .proc_handler = &proc_dointvec_minmax, 1411 .proc_handler = proc_dointvec_minmax,
1581 .strategy = &sysctl_intvec,
1582 .extra1 = &zero, 1412 .extra1 = &zero,
1583 .extra2 = &two, 1413 .extra2 = &two,
1584 }, 1414 },
1585#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1415#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1586 { 1416 {
1587 .ctl_name = CTL_UNNUMBERED,
1588 .procname = "binfmt_misc", 1417 .procname = "binfmt_misc",
1589 .mode = 0555, 1418 .mode = 0555,
1590 .child = binfmt_misc_table, 1419 .child = binfmt_misc_table,
@@ -1594,13 +1423,12 @@ static struct ctl_table fs_table[] = {
1594 * NOTE: do not add new entries to this table unless you have read 1423 * NOTE: do not add new entries to this table unless you have read
1595 * Documentation/sysctl/ctl_unnumbered.txt 1424 * Documentation/sysctl/ctl_unnumbered.txt
1596 */ 1425 */
1597 { .ctl_name = 0 } 1426 { }
1598}; 1427};
1599 1428
1600static struct ctl_table debug_table[] = { 1429static struct ctl_table debug_table[] = {
1601#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1430#if defined(CONFIG_X86) || defined(CONFIG_PPC)
1602 { 1431 {
1603 .ctl_name = CTL_UNNUMBERED,
1604 .procname = "exception-trace", 1432 .procname = "exception-trace",
1605 .data = &show_unhandled_signals, 1433 .data = &show_unhandled_signals,
1606 .maxlen = sizeof(int), 1434 .maxlen = sizeof(int),
@@ -1608,11 +1436,11 @@ static struct ctl_table debug_table[] = {
1608 .proc_handler = proc_dointvec 1436 .proc_handler = proc_dointvec
1609 }, 1437 },
1610#endif 1438#endif
1611 { .ctl_name = 0 } 1439 { }
1612}; 1440};
1613 1441
1614static struct ctl_table dev_table[] = { 1442static struct ctl_table dev_table[] = {
1615 { .ctl_name = 0 } 1443 { }
1616}; 1444};
1617 1445
1618static DEFINE_SPINLOCK(sysctl_lock); 1446static DEFINE_SPINLOCK(sysctl_lock);
@@ -1766,122 +1594,6 @@ void register_sysctl_root(struct ctl_table_root *root)
1766 spin_unlock(&sysctl_lock); 1594 spin_unlock(&sysctl_lock);
1767} 1595}
1768 1596
1769#ifdef CONFIG_SYSCTL_SYSCALL
1770/* Perform the actual read/write of a sysctl table entry. */
1771static int do_sysctl_strategy(struct ctl_table_root *root,
1772 struct ctl_table *table,
1773 void __user *oldval, size_t __user *oldlenp,
1774 void __user *newval, size_t newlen)
1775{
1776 int op = 0, rc;
1777
1778 if (oldval)
1779 op |= MAY_READ;
1780 if (newval)
1781 op |= MAY_WRITE;
1782 if (sysctl_perm(root, table, op))
1783 return -EPERM;
1784
1785 if (table->strategy) {
1786 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1787 if (rc < 0)
1788 return rc;
1789 if (rc > 0)
1790 return 0;
1791 }
1792
1793 /* If there is no strategy routine, or if the strategy returns
1794 * zero, proceed with automatic r/w */
1795 if (table->data && table->maxlen) {
1796 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1797 if (rc < 0)
1798 return rc;
1799 }
1800 return 0;
1801}
1802
1803static int parse_table(int __user *name, int nlen,
1804 void __user *oldval, size_t __user *oldlenp,
1805 void __user *newval, size_t newlen,
1806 struct ctl_table_root *root,
1807 struct ctl_table *table)
1808{
1809 int n;
1810repeat:
1811 if (!nlen)
1812 return -ENOTDIR;
1813 if (get_user(n, name))
1814 return -EFAULT;
1815 for ( ; table->ctl_name || table->procname; table++) {
1816 if (!table->ctl_name)
1817 continue;
1818 if (n == table->ctl_name) {
1819 int error;
1820 if (table->child) {
1821 if (sysctl_perm(root, table, MAY_EXEC))
1822 return -EPERM;
1823 name++;
1824 nlen--;
1825 table = table->child;
1826 goto repeat;
1827 }
1828 error = do_sysctl_strategy(root, table,
1829 oldval, oldlenp,
1830 newval, newlen);
1831 return error;
1832 }
1833 }
1834 return -ENOTDIR;
1835}
1836
1837int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1838 void __user *newval, size_t newlen)
1839{
1840 struct ctl_table_header *head;
1841 int error = -ENOTDIR;
1842
1843 if (nlen <= 0 || nlen >= CTL_MAXNAME)
1844 return -ENOTDIR;
1845 if (oldval) {
1846 int old_len;
1847 if (!oldlenp || get_user(old_len, oldlenp))
1848 return -EFAULT;
1849 }
1850
1851 for (head = sysctl_head_next(NULL); head;
1852 head = sysctl_head_next(head)) {
1853 error = parse_table(name, nlen, oldval, oldlenp,
1854 newval, newlen,
1855 head->root, head->ctl_table);
1856 if (error != -ENOTDIR) {
1857 sysctl_head_finish(head);
1858 break;
1859 }
1860 }
1861 return error;
1862}
1863
1864SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1865{
1866 struct __sysctl_args tmp;
1867 int error;
1868
1869 if (copy_from_user(&tmp, args, sizeof(tmp)))
1870 return -EFAULT;
1871
1872 error = deprecated_sysctl_warning(&tmp);
1873 if (error)
1874 goto out;
1875
1876 lock_kernel();
1877 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
1878 tmp.newval, tmp.newlen);
1879 unlock_kernel();
1880out:
1881 return error;
1882}
1883#endif /* CONFIG_SYSCTL_SYSCALL */
1884
1885/* 1597/*
1886 * sysctl_perm does NOT grant the superuser all rights automatically, because 1598 * sysctl_perm does NOT grant the superuser all rights automatically, because
1887 * some sysctl variables are readonly even to root. 1599 * some sysctl variables are readonly even to root.
@@ -1917,7 +1629,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1917 1629
1918static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) 1630static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1919{ 1631{
1920 for (; table->ctl_name || table->procname; table++) { 1632 for (; table->procname; table++) {
1921 table->parent = parent; 1633 table->parent = parent;
1922 if (table->child) 1634 if (table->child)
1923 sysctl_set_parent(table, table->child); 1635 sysctl_set_parent(table, table->child);
@@ -1949,11 +1661,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch,
1949 return NULL; 1661 return NULL;
1950 1662
1951 /* ... and nothing else */ 1663 /* ... and nothing else */
1952 if (branch[1].procname || branch[1].ctl_name) 1664 if (branch[1].procname)
1953 return NULL; 1665 return NULL;
1954 1666
1955 /* table should contain subdirectory with the same name */ 1667 /* table should contain subdirectory with the same name */
1956 for (p = table; p->procname || p->ctl_name; p++) { 1668 for (p = table; p->procname; p++) {
1957 if (!p->child) 1669 if (!p->child)
1958 continue; 1670 continue;
1959 if (p->procname && strcmp(p->procname, s) == 0) 1671 if (p->procname && strcmp(p->procname, s) == 0)
@@ -1998,9 +1710,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1998 * 1710 *
1999 * The members of the &struct ctl_table structure are used as follows: 1711 * The members of the &struct ctl_table structure are used as follows:
2000 * 1712 *
2001 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
2002 * must be unique within that level of sysctl
2003 *
2004 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not 1713 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
2005 * enter a sysctl file 1714 * enter a sysctl file
2006 * 1715 *
@@ -2015,8 +1724,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
2015 * 1724 *
2016 * proc_handler - the text handler routine (described below) 1725 * proc_handler - the text handler routine (described below)
2017 * 1726 *
2018 * strategy - the strategy routine (described below)
2019 *
2020 * de - for internal use by the sysctl routines 1727 * de - for internal use by the sysctl routines
2021 * 1728 *
2022 * extra1, extra2 - extra pointers usable by the proc handler routines 1729 * extra1, extra2 - extra pointers usable by the proc handler routines
@@ -2029,19 +1736,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
2029 * struct enable minimal validation of the values being written to be 1736 * struct enable minimal validation of the values being written to be
2030 * performed, and the mode field allows minimal authentication. 1737 * performed, and the mode field allows minimal authentication.
2031 * 1738 *
2032 * More sophisticated management can be enabled by the provision of a
2033 * strategy routine with the table entry. This will be called before
2034 * any automatic read or write of the data is performed.
2035 *
2036 * The strategy routine may return
2037 *
2038 * < 0 - Error occurred (error is passed to user process)
2039 *
2040 * 0 - OK - proceed with automatic read or write.
2041 *
2042 * > 0 - OK - read or write has been done by the strategy routine, so
2043 * return immediately.
2044 *
2045 * There must be a proc_handler routine for any terminal nodes 1739 * There must be a proc_handler routine for any terminal nodes
2046 * mirrored under /proc/sys (non-terminals are handled by a built-in 1740 * mirrored under /proc/sys (non-terminals are handled by a built-in
2047 * directory handler). Several default handlers are available to 1741 * directory handler). Several default handlers are available to
@@ -2068,13 +1762,13 @@ struct ctl_table_header *__register_sysctl_paths(
2068 struct ctl_table_set *set; 1762 struct ctl_table_set *set;
2069 1763
2070 /* Count the path components */ 1764 /* Count the path components */
2071 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) 1765 for (npath = 0; path[npath].procname; ++npath)
2072 ; 1766 ;
2073 1767
2074 /* 1768 /*
2075 * For each path component, allocate a 2-element ctl_table array. 1769 * For each path component, allocate a 2-element ctl_table array.
2076 * The first array element will be filled with the sysctl entry 1770 * The first array element will be filled with the sysctl entry
2077 * for this, the second will be the sentinel (ctl_name == 0). 1771 * for this, the second will be the sentinel (procname == 0).
2078 * 1772 *
2079 * We allocate everything in one go so that we don't have to 1773 * We allocate everything in one go so that we don't have to
2080 * worry about freeing additional memory in unregister_sysctl_table. 1774 * worry about freeing additional memory in unregister_sysctl_table.
@@ -2091,7 +1785,6 @@ struct ctl_table_header *__register_sysctl_paths(
2091 for (n = 0; n < npath; ++n, ++path) { 1785 for (n = 0; n < npath; ++n, ++path) {
2092 /* Copy the procname */ 1786 /* Copy the procname */
2093 new->procname = path->procname; 1787 new->procname = path->procname;
2094 new->ctl_name = path->ctl_name;
2095 new->mode = 0555; 1788 new->mode = 0555;
2096 1789
2097 *prevp = new; 1790 *prevp = new;
@@ -2953,286 +2646,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2953 2646
2954#endif /* CONFIG_PROC_FS */ 2647#endif /* CONFIG_PROC_FS */
2955 2648
2956
2957#ifdef CONFIG_SYSCTL_SYSCALL
2958/*
2959 * General sysctl support routines
2960 */
2961
2962/* The generic sysctl data routine (used if no strategy routine supplied) */
2963int sysctl_data(struct ctl_table *table,
2964 void __user *oldval, size_t __user *oldlenp,
2965 void __user *newval, size_t newlen)
2966{
2967 size_t len;
2968
2969 /* Get out of I don't have a variable */
2970 if (!table->data || !table->maxlen)
2971 return -ENOTDIR;
2972
2973 if (oldval && oldlenp) {
2974 if (get_user(len, oldlenp))
2975 return -EFAULT;
2976 if (len) {
2977 if (len > table->maxlen)
2978 len = table->maxlen;
2979 if (copy_to_user(oldval, table->data, len))
2980 return -EFAULT;
2981 if (put_user(len, oldlenp))
2982 return -EFAULT;
2983 }
2984 }
2985
2986 if (newval && newlen) {
2987 if (newlen > table->maxlen)
2988 newlen = table->maxlen;
2989
2990 if (copy_from_user(table->data, newval, newlen))
2991 return -EFAULT;
2992 }
2993 return 1;
2994}
2995
2996/* The generic string strategy routine: */
2997int sysctl_string(struct ctl_table *table,
2998 void __user *oldval, size_t __user *oldlenp,
2999 void __user *newval, size_t newlen)
3000{
3001 if (!table->data || !table->maxlen)
3002 return -ENOTDIR;
3003
3004 if (oldval && oldlenp) {
3005 size_t bufsize;
3006 if (get_user(bufsize, oldlenp))
3007 return -EFAULT;
3008 if (bufsize) {
3009 size_t len = strlen(table->data), copied;
3010
3011 /* This shouldn't trigger for a well-formed sysctl */
3012 if (len > table->maxlen)
3013 len = table->maxlen;
3014
3015 /* Copy up to a max of bufsize-1 bytes of the string */
3016 copied = (len >= bufsize) ? bufsize - 1 : len;
3017
3018 if (copy_to_user(oldval, table->data, copied) ||
3019 put_user(0, (char __user *)(oldval + copied)))
3020 return -EFAULT;
3021 if (put_user(len, oldlenp))
3022 return -EFAULT;
3023 }
3024 }
3025 if (newval && newlen) {
3026 size_t len = newlen;
3027 if (len > table->maxlen)
3028 len = table->maxlen;
3029 if(copy_from_user(table->data, newval, len))
3030 return -EFAULT;
3031 if (len == table->maxlen)
3032 len--;
3033 ((char *) table->data)[len] = 0;
3034 }
3035 return 1;
3036}
3037
3038/*
3039 * This function makes sure that all of the integers in the vector
3040 * are between the minimum and maximum values given in the arrays
3041 * table->extra1 and table->extra2, respectively.
3042 */
3043int sysctl_intvec(struct ctl_table *table,
3044 void __user *oldval, size_t __user *oldlenp,
3045 void __user *newval, size_t newlen)
3046{
3047
3048 if (newval && newlen) {
3049 int __user *vec = (int __user *) newval;
3050 int *min = (int *) table->extra1;
3051 int *max = (int *) table->extra2;
3052 size_t length;
3053 int i;
3054
3055 if (newlen % sizeof(int) != 0)
3056 return -EINVAL;
3057
3058 if (!table->extra1 && !table->extra2)
3059 return 0;
3060
3061 if (newlen > table->maxlen)
3062 newlen = table->maxlen;
3063 length = newlen / sizeof(int);
3064
3065 for (i = 0; i < length; i++) {
3066 int value;
3067 if (get_user(value, vec + i))
3068 return -EFAULT;
3069 if (min && value < min[i])
3070 return -EINVAL;
3071 if (max && value > max[i])
3072 return -EINVAL;
3073 }
3074 }
3075 return 0;
3076}
3077
3078/* Strategy function to convert jiffies to seconds */
3079int sysctl_jiffies(struct ctl_table *table,
3080 void __user *oldval, size_t __user *oldlenp,
3081 void __user *newval, size_t newlen)
3082{
3083 if (oldval && oldlenp) {
3084 size_t olen;
3085
3086 if (get_user(olen, oldlenp))
3087 return -EFAULT;
3088 if (olen) {
3089 int val;
3090
3091 if (olen < sizeof(int))
3092 return -EINVAL;
3093
3094 val = *(int *)(table->data) / HZ;
3095 if (put_user(val, (int __user *)oldval))
3096 return -EFAULT;
3097 if (put_user(sizeof(int), oldlenp))
3098 return -EFAULT;
3099 }
3100 }
3101 if (newval && newlen) {
3102 int new;
3103 if (newlen != sizeof(int))
3104 return -EINVAL;
3105 if (get_user(new, (int __user *)newval))
3106 return -EFAULT;
3107 *(int *)(table->data) = new*HZ;
3108 }
3109 return 1;
3110}
3111
3112/* Strategy function to convert jiffies to seconds */
3113int sysctl_ms_jiffies(struct ctl_table *table,
3114 void __user *oldval, size_t __user *oldlenp,
3115 void __user *newval, size_t newlen)
3116{
3117 if (oldval && oldlenp) {
3118 size_t olen;
3119
3120 if (get_user(olen, oldlenp))
3121 return -EFAULT;
3122 if (olen) {
3123 int val;
3124
3125 if (olen < sizeof(int))
3126 return -EINVAL;
3127
3128 val = jiffies_to_msecs(*(int *)(table->data));
3129 if (put_user(val, (int __user *)oldval))
3130 return -EFAULT;
3131 if (put_user(sizeof(int), oldlenp))
3132 return -EFAULT;
3133 }
3134 }
3135 if (newval && newlen) {
3136 int new;
3137 if (newlen != sizeof(int))
3138 return -EINVAL;
3139 if (get_user(new, (int __user *)newval))
3140 return -EFAULT;
3141 *(int *)(table->data) = msecs_to_jiffies(new);
3142 }
3143 return 1;
3144}
3145
3146
3147
3148#else /* CONFIG_SYSCTL_SYSCALL */
3149
3150
3151SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
3152{
3153 struct __sysctl_args tmp;
3154 int error;
3155
3156 if (copy_from_user(&tmp, args, sizeof(tmp)))
3157 return -EFAULT;
3158
3159 error = deprecated_sysctl_warning(&tmp);
3160
3161 /* If no error reading the parameters then just -ENOSYS ... */
3162 if (!error)
3163 error = -ENOSYS;
3164
3165 return error;
3166}
3167
3168int sysctl_data(struct ctl_table *table,
3169 void __user *oldval, size_t __user *oldlenp,
3170 void __user *newval, size_t newlen)
3171{
3172 return -ENOSYS;
3173}
3174
3175int sysctl_string(struct ctl_table *table,
3176 void __user *oldval, size_t __user *oldlenp,
3177 void __user *newval, size_t newlen)
3178{
3179 return -ENOSYS;
3180}
3181
3182int sysctl_intvec(struct ctl_table *table,
3183 void __user *oldval, size_t __user *oldlenp,
3184 void __user *newval, size_t newlen)
3185{
3186 return -ENOSYS;
3187}
3188
3189int sysctl_jiffies(struct ctl_table *table,
3190 void __user *oldval, size_t __user *oldlenp,
3191 void __user *newval, size_t newlen)
3192{
3193 return -ENOSYS;
3194}
3195
3196int sysctl_ms_jiffies(struct ctl_table *table,
3197 void __user *oldval, size_t __user *oldlenp,
3198 void __user *newval, size_t newlen)
3199{
3200 return -ENOSYS;
3201}
3202
3203#endif /* CONFIG_SYSCTL_SYSCALL */
3204
3205static int deprecated_sysctl_warning(struct __sysctl_args *args)
3206{
3207 static int msg_count;
3208 int name[CTL_MAXNAME];
3209 int i;
3210
3211 /* Check args->nlen. */
3212 if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
3213 return -ENOTDIR;
3214
3215 /* Read in the sysctl name for better debug message logging */
3216 for (i = 0; i < args->nlen; i++)
3217 if (get_user(name[i], args->name + i))
3218 return -EFAULT;
3219
3220 /* Ignore accesses to kernel.version */
3221 if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
3222 return 0;
3223
3224 if (msg_count < 5) {
3225 msg_count++;
3226 printk(KERN_INFO
3227 "warning: process `%s' used the deprecated sysctl "
3228 "system call with ", current->comm);
3229 for (i = 0; i < args->nlen; i++)
3230 printk("%d.", name[i]);
3231 printk("\n");
3232 }
3233 return 0;
3234}
3235
3236/* 2649/*
3237 * No sense putting this after each symbol definition, twice, 2650 * No sense putting this after each symbol definition, twice,
3238 * exception granted :-) 2651 * exception granted :-)
@@ -3247,9 +2660,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax);
3247EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2660EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3248EXPORT_SYMBOL(register_sysctl_table); 2661EXPORT_SYMBOL(register_sysctl_table);
3249EXPORT_SYMBOL(register_sysctl_paths); 2662EXPORT_SYMBOL(register_sysctl_paths);
3250EXPORT_SYMBOL(sysctl_intvec);
3251EXPORT_SYMBOL(sysctl_jiffies);
3252EXPORT_SYMBOL(sysctl_ms_jiffies);
3253EXPORT_SYMBOL(sysctl_string);
3254EXPORT_SYMBOL(sysctl_data);
3255EXPORT_SYMBOL(unregister_sysctl_table); 2663EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
new file mode 100644
index 000000000000..b75dbf40f573
--- /dev/null
+++ b/kernel/sysctl_binary.c
@@ -0,0 +1,1507 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7#include <linux/syscalls.h>
8#include <linux/namei.h>
9#include <linux/mount.h>
10#include <linux/fs.h>
11#include <linux/nsproxy.h>
12#include <linux/pid_namespace.h>
13#include <linux/file.h>
14#include <linux/ctype.h>
15#include <linux/netdevice.h>
16
17#ifdef CONFIG_SYSCTL_SYSCALL
18
19struct bin_table;
20typedef ssize_t bin_convert_t(struct file *file,
21 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
22
23static bin_convert_t bin_dir;
24static bin_convert_t bin_string;
25static bin_convert_t bin_intvec;
26static bin_convert_t bin_ulongvec;
27static bin_convert_t bin_uuid;
28static bin_convert_t bin_dn_node_address;
29
30#define CTL_DIR bin_dir
31#define CTL_STR bin_string
32#define CTL_INT bin_intvec
33#define CTL_ULONG bin_ulongvec
34#define CTL_UUID bin_uuid
35#define CTL_DNADR bin_dn_node_address
36
37#define BUFSZ 256
38
39struct bin_table {
40 bin_convert_t *convert;
41 int ctl_name;
42 const char *procname;
43 const struct bin_table *child;
44};
45
46static const struct bin_table bin_random_table[] = {
47 { CTL_INT, RANDOM_POOLSIZE, "poolsize" },
48 { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" },
49 { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" },
50 { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
51 { CTL_UUID, RANDOM_BOOT_ID, "boot_id" },
52 { CTL_UUID, RANDOM_UUID, "uuid" },
53 {}
54};
55
56static const struct bin_table bin_pty_table[] = {
57 { CTL_INT, PTY_MAX, "max" },
58 { CTL_INT, PTY_NR, "nr" },
59 {}
60};
61
62static const struct bin_table bin_kern_table[] = {
63 { CTL_STR, KERN_OSTYPE, "ostype" },
64 { CTL_STR, KERN_OSRELEASE, "osrelease" },
65 /* KERN_OSREV not used */
66 { CTL_STR, KERN_VERSION, "version" },
67 /* KERN_SECUREMASK not used */
68 /* KERN_PROF not used */
69 { CTL_STR, KERN_NODENAME, "hostname" },
70 { CTL_STR, KERN_DOMAINNAME, "domainname" },
71
72 { CTL_INT, KERN_PANIC, "panic" },
73 { CTL_INT, KERN_REALROOTDEV, "real-root-dev" },
74
75 { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" },
76 { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" },
77 { CTL_INT, KERN_PRINTK, "printk" },
78
79 /* KERN_NAMETRANS not used */
80 /* KERN_PPC_HTABRECLAIM not used */
81 /* KERN_PPC_ZEROPAGED not used */
82 { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
83
84 { CTL_STR, KERN_MODPROBE, "modprobe" },
85 { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" },
86 { CTL_INT, KERN_ACCT, "acct" },
87 /* KERN_PPC_L2CR "l2cr" no longer used */
88
89 /* KERN_RTSIGNR not used */
90 /* KERN_RTSIGMAX not used */
91
92 { CTL_ULONG, KERN_SHMMAX, "shmmax" },
93 { CTL_INT, KERN_MSGMAX, "msgmax" },
94 { CTL_INT, KERN_MSGMNB, "msgmnb" },
95 /* KERN_MSGPOOL not used*/
96 { CTL_INT, KERN_SYSRQ, "sysrq" },
97 { CTL_INT, KERN_MAX_THREADS, "threads-max" },
98 { CTL_DIR, KERN_RANDOM, "random", bin_random_table },
99 { CTL_ULONG, KERN_SHMALL, "shmall" },
100 { CTL_INT, KERN_MSGMNI, "msgmni" },
101 { CTL_INT, KERN_SEM, "sem" },
102 { CTL_INT, KERN_SPARC_STOP_A, "stop-a" },
103 { CTL_INT, KERN_SHMMNI, "shmmni" },
104
105 { CTL_INT, KERN_OVERFLOWUID, "overflowuid" },
106 { CTL_INT, KERN_OVERFLOWGID, "overflowgid" },
107
108 { CTL_STR, KERN_HOTPLUG, "hotplug", },
109 { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
110
111 { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
112 { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" },
113 /* KERN_TAINTED "tainted" no longer used */
114 { CTL_INT, KERN_CADPID, "cad_pid" },
115 { CTL_INT, KERN_PIDMAX, "pid_max" },
116 { CTL_STR, KERN_CORE_PATTERN, "core_pattern" },
117 { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" },
118 { CTL_INT, KERN_HPPA_PWRSW, "soft-power" },
119 { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" },
120
121 { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
122 { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
123
124 { CTL_DIR, KERN_PTY, "pty", bin_pty_table },
125 { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" },
126 { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
127 /* KERN_HZ_TIMER "hz_timer" no longer used */
128 { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
129 { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" },
130 { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" },
131
132 { CTL_INT, KERN_SPIN_RETRY, "spin_retry" },
133 /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
134 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
135 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
136 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
137 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
138 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
139 {}
140};
141
142static const struct bin_table bin_vm_table[] = {
143 { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
144 { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" },
145 { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
146 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
147 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
148 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
149 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
150 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
151 /* VM_PAGEBUF unused */
152 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
153 { CTL_INT, VM_SWAPPINESS, "swappiness" },
154 { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
155 { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" },
156 { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" },
157 { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" },
158 { CTL_INT, VM_BLOCK_DUMP, "block_dump" },
159 { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" },
160 { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
161 { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
162 /* VM_SWAP_TOKEN_TIMEOUT unused */
163 { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" },
164 { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
165 { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
166 { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" },
167 { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" },
168 { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" },
169 { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" },
170
171 {}
172};
173
174static const struct bin_table bin_net_core_table[] = {
175 { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" },
176 { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" },
177 { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" },
178 { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" },
179 /* NET_CORE_DESTROY_DELAY unused */
180 { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
181 /* NET_CORE_FASTROUTE unused */
182 { CTL_INT, NET_CORE_MSG_COST, "message_cost" },
183 { CTL_INT, NET_CORE_MSG_BURST, "message_burst" },
184 { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" },
185 /* NET_CORE_HOT_LIST_LENGTH unused */
186 /* NET_CORE_DIVERT_VERSION unused */
187 /* NET_CORE_NO_CONG_THRESH unused */
188 /* NET_CORE_NO_CONG unused */
189 /* NET_CORE_LO_CONG unused */
190 /* NET_CORE_MOD_CONG unused */
191 { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" },
192 { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" },
193 { CTL_INT, NET_CORE_BUDGET, "netdev_budget" },
194 { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
195 { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
196 { CTL_INT, NET_CORE_WARNINGS, "warnings" },
197 {},
198};
199
200static const struct bin_table bin_net_unix_table[] = {
201 /* NET_UNIX_DESTROY_DELAY unused */
202 /* NET_UNIX_DELETE_DELAY unused */
203 { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
204 {}
205};
206
207static const struct bin_table bin_net_ipv4_route_table[] = {
208 { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" },
209 /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
210 /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
211 { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
212 { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
213 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
217 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
220 { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
221 { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
222 { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
223 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
224 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
226 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
227 {}
228};
229
230static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
231 { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" },
232 { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
233
234 { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
235 { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
236 { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
237 { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
238 { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" },
239 { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
240 { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
241 { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
242 { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
243 { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
244 { CTL_INT, NET_IPV4_CONF_TAG, "tag" },
245 { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" },
246 { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
247 { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
248 { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
249 { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
250
251 { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
252 { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
253 { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
254 { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
255 {}
256};
257
258static const struct bin_table bin_net_ipv4_conf_table[] = {
259 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table },
260 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table },
261 { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table },
262 {}
263};
264
265static const struct bin_table bin_net_neigh_vars_table[] = {
266 { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
267 { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
268 { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" },
269 /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
270 { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
271 { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
272 { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
273 { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" },
274 { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
275 /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
276 /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
277 /* NET_NEIGH_LOCKTIME "locktime" no longer used */
278 { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" },
279 { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" },
280 { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" },
281 { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" },
282 { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
283 { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
284 {}
285};
286
287static const struct bin_table bin_net_neigh_table[] = {
288 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
289 { CTL_DIR, 0, NULL, bin_net_neigh_vars_table },
290 {}
291};
292
293static const struct bin_table bin_net_ipv4_netfilter_table[] = {
294 { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
295
296 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
297 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
298 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
299 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
300 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */
301 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
302 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
303 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
304
305 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
306 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
307 /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
308 /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
309
310 { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
311 { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
312 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
313 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
314 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
315 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
316
317 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
318 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
319 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
320 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
321 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
322 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
323 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
324
325 { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
326 { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
327 {}
328};
329
330static const struct bin_table bin_net_ipv4_table[] = {
331 {CTL_INT, NET_IPV4_FORWARD, "ip_forward" },
332
333 { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table },
334 { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table },
335 { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table },
336 /* NET_IPV4_FIB_HASH unused */
337 { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table },
338
339 { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
340 { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
341 { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" },
342 { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
343 { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
344 /* NET_IPV4_AUTOCONFIG unused */
345 { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
346 { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
347 { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
348 { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
349 { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
350 { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
351 { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" },
352 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
353 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
354 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
355 { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
356 { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
357 { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
358 { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" },
359 { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
360 { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
361 { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" },
362 { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" },
363 { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
364 { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
365 { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
366 { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
367 { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
368 { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
369 { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
370 { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
371 { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
372 { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
373 { CTL_INT, NET_TCP_FACK, "tcp_fack" },
374 { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
375 { CTL_INT, NET_TCP_ECN, "tcp_ecn" },
376 { CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
377 { CTL_INT, NET_TCP_MEM, "tcp_mem" },
378 { CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
379 { CTL_INT, NET_TCP_RMEM, "tcp_rmem" },
380 { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" },
381 { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
382 { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" },
383 { CTL_INT, NET_TCP_FRTO, "tcp_frto" },
384 { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
385 { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" },
386 { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
394 { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
395 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
396 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
397 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
398 { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
399 { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
400 /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
401 { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
402 { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
403
404 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
405 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
406 { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
407 { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
408 { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
409 { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
410
411 { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
412 { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
413 { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
414
415 { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
416 /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
417
418 { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
419
420 /* NET_TCP_DEFAULT_WIN_SCALE unused */
421 /* NET_TCP_BIC_BETA unused */
422 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
423 /* NET_IPV4_IP_MASQ_DEBUG unused */
424 /* NET_TCP_SYN_TAILDROP unused */
425 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
426 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
427 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
428 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
429 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
430 /* NET_IPV4_ALWAYS_DEFRAG unused */
431 {}
432};
433
434static const struct bin_table bin_net_ipx_table[] = {
435 { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
436 /* NET_IPX_FORWARDING unused */
437 {}
438};
439
440static const struct bin_table bin_net_atalk_table[] = {
441 { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
442 { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
443 { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
444 { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
445 {},
446};
447
448static const struct bin_table bin_net_netrom_table[] = {
449 { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
450 { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
451 { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
452 { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
453 { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
454 { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
455 { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
456 { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
457 { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
458 { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" },
459 { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
460 { CTL_INT, NET_NETROM_RESET, "reset" },
461 {}
462};
463
464static const struct bin_table bin_net_ax25_param_table[] = {
465 { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
466 { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
467 { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" },
468 { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" },
469 { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" },
470 { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
471 { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" },
472 { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" },
473 { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" },
474 { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
475 { CTL_INT, NET_AX25_N2, "maximum_retry_count" },
476 { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" },
477 { CTL_INT, NET_AX25_PROTOCOL, "protocol" },
478 { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
479 {}
480};
481
482static const struct bin_table bin_net_ax25_table[] = {
483 { CTL_DIR, 0, NULL, bin_net_ax25_param_table },
484 {}
485};
486
487static const struct bin_table bin_net_rose_table[] = {
488 { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
489 { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
490 { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
491 { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
492 { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
493 { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" },
494 { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
495 { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
496 { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" },
497 { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
498 {}
499};
500
501static const struct bin_table bin_net_ipv6_conf_var_table[] = {
502 { CTL_INT, NET_IPV6_FORWARDING, "forwarding" },
503 { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" },
504 { CTL_INT, NET_IPV6_MTU, "mtu" },
505 { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" },
506 { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
507 { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" },
508 { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
509 { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" },
510 { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
511 { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
512 { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
513 { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
514 { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
515 { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
516 { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
517 { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" },
518 { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
519 { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
520 { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
521 { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
522 { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
523 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
524 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
525 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
526 {}
527};
528
529static const struct bin_table bin_net_ipv6_conf_table[] = {
530 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table },
531 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table },
532 { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table },
533 {}
534};
535
536static const struct bin_table bin_net_ipv6_route_table[] = {
537 /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */
538 { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
539 { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
540 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
541 { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
542 { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
543 { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
544 { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
545 { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
546 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
547 {}
548};
549
550static const struct bin_table bin_net_ipv6_icmp_table[] = {
551 { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
552 {}
553};
554
555static const struct bin_table bin_net_ipv6_table[] = {
556 { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
557 { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
558 { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
559 { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
560 { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
561 { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
562 { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
563 { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
564 { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
565 { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
566 { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
567 {}
568};
569
570static const struct bin_table bin_net_x25_table[] = {
571 { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
572 { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
573 { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
574 { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
575 { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
576 { CTL_INT, NET_X25_FORWARD, "x25_forward" },
577 {}
578};
579
580static const struct bin_table bin_net_tr_table[] = {
581 { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
582 {}
583};
584
585
586static const struct bin_table bin_net_decnet_conf_vars[] = {
587 { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
588 { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
589 { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
590 { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
591 {}
592};
593
594static const struct bin_table bin_net_decnet_conf[] = {
595 { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
596 { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
597 { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
598 { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
599 { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
600 { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
601 { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
602 {}
603};
604
605static const struct bin_table bin_net_decnet_table[] = {
606 { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
607 { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
608 { CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
609 { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
610 { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
611 { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
612 { CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
613 { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
614 { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
615 { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
616 { CTL_INT, NET_DECNET_MEM, "decnet_mem" },
617 { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
618 { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
619 { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
620 {}
621};
622
623static const struct bin_table bin_net_sctp_table[] = {
624 { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
625 { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
626 { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
627 { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
628 { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
629 { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
630 { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
631 { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
632 { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
633 { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
634 { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
635 { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
636 { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
637 { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
638 { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
639 { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
640 { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
641 {}
642};
643
644static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
645 { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
646 { CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
647 { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
648 { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
649 {}
650};
651
652static const struct bin_table bin_net_llc_station_table[] = {
653 { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
654 {}
655};
656
657static const struct bin_table bin_net_llc_llc2_table[] = {
658 { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
659 {}
660};
661
662static const struct bin_table bin_net_llc_table[] = {
663 { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
664 { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
665 {}
666};
667
668static const struct bin_table bin_net_netfilter_table[] = {
669 { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
670 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
671 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
672 /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
673 /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
674 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
675 /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
676 /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
677 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
678 /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
679 /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
680 /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
681 /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
682 { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
683 { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
684 /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
685 { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
686 { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
687 { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
688 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
689 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
690 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
691 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
692 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
693 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
694 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
695 { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
696 /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
697 /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
698 { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
699 { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
700 { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
701
702 {}
703};
704
705static const struct bin_table bin_net_irda_table[] = {
706 { CTL_INT, NET_IRDA_DISCOVERY, "discovery" },
707 { CTL_STR, NET_IRDA_DEVNAME, "devname" },
708 { CTL_INT, NET_IRDA_DEBUG, "debug" },
709 { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" },
710 { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
711 { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
712 { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
713 { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
714 { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
715 { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
716 { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
717 { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
718 { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
719 { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
720 {}
721};
722
723static const struct bin_table bin_net_table[] = {
724 { CTL_DIR, NET_CORE, "core", bin_net_core_table },
725 /* NET_ETHER not used */
726 /* NET_802 not used */
727 { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
728 { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
729 { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
730 { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
731 { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
732 { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
733 /* NET_BRIDGE "bridge" no longer used */
734 { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
735 { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
736 { CTL_DIR, NET_X25, "x25", bin_net_x25_table },
737 { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
738 { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
739 /* NET_ECONET not used */
740 { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
741 { CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
742 { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
743 /* NET_DCCP "dccp" no longer used */
744 { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table },
745 { CTL_INT, 2089, "nf_conntrack_max" },
746 {}
747};
748
749static const struct bin_table bin_fs_quota_table[] = {
750 { CTL_INT, FS_DQ_LOOKUPS, "lookups" },
751 { CTL_INT, FS_DQ_DROPS, "drops" },
752 { CTL_INT, FS_DQ_READS, "reads" },
753 { CTL_INT, FS_DQ_WRITES, "writes" },
754 { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
755 { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
756 { CTL_INT, FS_DQ_FREE, "free_dquots" },
757 { CTL_INT, FS_DQ_SYNCS, "syncs" },
758 { CTL_INT, FS_DQ_WARNINGS, "warnings" },
759 {}
760};
761
762static const struct bin_table bin_fs_xfs_table[] = {
763 { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
764 { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
765 { CTL_INT, XFS_PANIC_MASK, "panic_mask" },
766
767 { CTL_INT, XFS_ERRLEVEL, "error_level" },
768 { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
769 { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
770 { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
771 { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
772 { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
773 { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
774 { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
775 { CTL_INT, XFS_ROTORSTEP, "rotorstep" },
776 { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
777 { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
778 { CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
779 {}
780};
781
782static const struct bin_table bin_fs_ocfs2_nm_table[] = {
783 { CTL_STR, 1, "hb_ctl_path" },
784 {}
785};
786
787static const struct bin_table bin_fs_ocfs2_table[] = {
788 { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
789 {}
790};
791
792static const struct bin_table bin_inotify_table[] = {
793 { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
794 { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
795 { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
796 {}
797};
798
799static const struct bin_table bin_fs_table[] = {
800 { CTL_INT, FS_NRINODE, "inode-nr" },
801 { CTL_INT, FS_STATINODE, "inode-state" },
802 /* FS_MAXINODE unused */
803 /* FS_NRDQUOT unused */
804 /* FS_MAXDQUOT unused */
805 /* FS_NRFILE "file-nr" no longer used */
806 { CTL_INT, FS_MAXFILE, "file-max" },
807 { CTL_INT, FS_DENTRY, "dentry-state" },
808 /* FS_NRSUPER unused */
809 /* FS_MAXUPSER unused */
810 { CTL_INT, FS_OVERFLOWUID, "overflowuid" },
811 { CTL_INT, FS_OVERFLOWGID, "overflowgid" },
812 { CTL_INT, FS_LEASES, "leases-enable" },
813 { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
814 { CTL_INT, FS_LEASE_TIME, "lease-break-time" },
815 { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
816 { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
817 { CTL_ULONG, FS_AIO_NR, "aio-nr" },
818 { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
819 { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
820 { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
821 { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
822 {}
823};
824
825static const struct bin_table bin_ipmi_table[] = {
826 { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
827 {}
828};
829
830static const struct bin_table bin_mac_hid_files[] = {
831 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
832 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
833 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
834 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
835 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
836 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
837 {}
838};
839
840static const struct bin_table bin_raid_table[] = {
841 { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
842 { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
843 {}
844};
845
846static const struct bin_table bin_scsi_table[] = {
847 { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
848 {}
849};
850
851static const struct bin_table bin_dev_table[] = {
852 /* DEV_CDROM "cdrom" no longer used */
853 /* DEV_HWMON unused */
854 /* DEV_PARPORT "parport" no longer used */
855 { CTL_DIR, DEV_RAID, "raid", bin_raid_table },
856 { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
857 { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
858 { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
859 {}
860};
861
862static const struct bin_table bin_bus_isa_table[] = {
863 { CTL_INT, BUS_ISA_MEM_BASE, "membase" },
864 { CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
865 { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
866 {}
867};
868
869static const struct bin_table bin_bus_table[] = {
870 { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
871 {}
872};
873
874
875static const struct bin_table bin_s390dbf_table[] = {
876 { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
877 { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
878 {}
879};
880
881static const struct bin_table bin_sunrpc_table[] = {
882 /* CTL_RPCDEBUG "rpc_debug" no longer used */
883 /* CTL_NFSDEBUG "nfs_debug" no longer used */
884 /* CTL_NFSDDEBUG "nfsd_debug" no longer used */
885 /* CTL_NLMDEBUG "nlm_debug" no longer used */
886
887 { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
888 { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
889 { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
890 { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
891 {}
892};
893
894static const struct bin_table bin_pm_table[] = {
895 /* frv specific */
896 /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */
897 { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
898 { CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
899 { CTL_INT, 4 /* CTL_PM_CM */, "cm" },
900 {}
901};
902
903static const struct bin_table bin_root_table[] = {
904 { CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
905 { CTL_DIR, CTL_VM, "vm", bin_vm_table },
906 { CTL_DIR, CTL_NET, "net", bin_net_table },
907 /* CTL_PROC not used */
908 { CTL_DIR, CTL_FS, "fs", bin_fs_table },
909 /* CTL_DEBUG "debug" no longer used */
910 { CTL_DIR, CTL_DEV, "dev", bin_dev_table },
911 { CTL_DIR, CTL_BUS, "bus", bin_bus_table },
912 { CTL_DIR, CTL_ABI, "abi" },
913 /* CTL_CPU not used */
914 /* CTL_ARLAN "arlan" no longer used */
915 { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
916 { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
917 { CTL_DIR, CTL_PM, "pm", bin_pm_table },
918 {}
919};
920
921static ssize_t bin_dir(struct file *file,
922 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
923{
924 return -ENOTDIR;
925}
926
927
928static ssize_t bin_string(struct file *file,
929 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
930{
931 ssize_t result, copied = 0;
932
933 if (oldval && oldlen) {
934 char __user *lastp;
935 loff_t pos = 0;
936 int ch;
937
938 result = vfs_read(file, oldval, oldlen, &pos);
939 if (result < 0)
940 goto out;
941
942 copied = result;
943 lastp = oldval + copied - 1;
944
945 result = -EFAULT;
946 if (get_user(ch, lastp))
947 goto out;
948
949 /* Trim off the trailing newline */
950 if (ch == '\n') {
951 result = -EFAULT;
952 if (put_user('\0', lastp))
953 goto out;
954 copied -= 1;
955 }
956 }
957
958 if (newval && newlen) {
959 loff_t pos = 0;
960
961 result = vfs_write(file, newval, newlen, &pos);
962 if (result < 0)
963 goto out;
964 }
965
966 result = copied;
967out:
968 return result;
969}
970
971static ssize_t bin_intvec(struct file *file,
972 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
973{
974 mm_segment_t old_fs = get_fs();
975 ssize_t copied = 0;
976 char *buffer;
977 ssize_t result;
978
979 result = -ENOMEM;
980 buffer = kmalloc(BUFSZ, GFP_KERNEL);
981 if (!buffer)
982 goto out;
983
984 if (oldval && oldlen) {
985 unsigned __user *vec = oldval;
986 size_t length = oldlen / sizeof(*vec);
987 loff_t pos = 0;
988 char *str, *end;
989 int i;
990
991 set_fs(KERNEL_DS);
992 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
993 set_fs(old_fs);
994 if (result < 0)
995 goto out_kfree;
996
997 str = buffer;
998 end = str + result;
999 *end++ = '\0';
1000 for (i = 0; i < length; i++) {
1001 unsigned long value;
1002
1003 value = simple_strtoul(str, &str, 10);
1004 while (isspace(*str))
1005 str++;
1006
1007 result = -EFAULT;
1008 if (put_user(value, vec + i))
1009 goto out_kfree;
1010
1011 copied += sizeof(*vec);
1012 if (!isdigit(*str))
1013 break;
1014 }
1015 }
1016
1017 if (newval && newlen) {
1018 unsigned __user *vec = newval;
1019 size_t length = newlen / sizeof(*vec);
1020 loff_t pos = 0;
1021 char *str, *end;
1022 int i;
1023
1024 str = buffer;
1025 end = str + BUFSZ;
1026 for (i = 0; i < length; i++) {
1027 unsigned long value;
1028
1029 result = -EFAULT;
1030 if (get_user(value, vec + i))
1031 goto out_kfree;
1032
1033 str += snprintf(str, end - str, "%lu\t", value);
1034 }
1035
1036 set_fs(KERNEL_DS);
1037 result = vfs_write(file, buffer, str - buffer, &pos);
1038 set_fs(old_fs);
1039 if (result < 0)
1040 goto out_kfree;
1041 }
1042 result = copied;
1043out_kfree:
1044 kfree(buffer);
1045out:
1046 return result;
1047}
1048
1049static ssize_t bin_ulongvec(struct file *file,
1050 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1051{
1052 mm_segment_t old_fs = get_fs();
1053 ssize_t copied = 0;
1054 char *buffer;
1055 ssize_t result;
1056
1057 result = -ENOMEM;
1058 buffer = kmalloc(BUFSZ, GFP_KERNEL);
1059 if (!buffer)
1060 goto out;
1061
1062 if (oldval && oldlen) {
1063 unsigned long __user *vec = oldval;
1064 size_t length = oldlen / sizeof(*vec);
1065 loff_t pos = 0;
1066 char *str, *end;
1067 int i;
1068
1069 set_fs(KERNEL_DS);
1070 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1071 set_fs(old_fs);
1072 if (result < 0)
1073 goto out_kfree;
1074
1075 str = buffer;
1076 end = str + result;
1077 *end++ = '\0';
1078 for (i = 0; i < length; i++) {
1079 unsigned long value;
1080
1081 value = simple_strtoul(str, &str, 10);
1082 while (isspace(*str))
1083 str++;
1084
1085 result = -EFAULT;
1086 if (put_user(value, vec + i))
1087 goto out_kfree;
1088
1089 copied += sizeof(*vec);
1090 if (!isdigit(*str))
1091 break;
1092 }
1093 }
1094
1095 if (newval && newlen) {
1096 unsigned long __user *vec = newval;
1097 size_t length = newlen / sizeof(*vec);
1098 loff_t pos = 0;
1099 char *str, *end;
1100 int i;
1101
1102 str = buffer;
1103 end = str + BUFSZ;
1104 for (i = 0; i < length; i++) {
1105 unsigned long value;
1106
1107 result = -EFAULT;
1108 if (get_user(value, vec + i))
1109 goto out_kfree;
1110
1111 str += snprintf(str, end - str, "%lu\t", value);
1112 }
1113
1114 set_fs(KERNEL_DS);
1115 result = vfs_write(file, buffer, str - buffer, &pos);
1116 set_fs(old_fs);
1117 if (result < 0)
1118 goto out_kfree;
1119 }
1120 result = copied;
1121out_kfree:
1122 kfree(buffer);
1123out:
1124 return result;
1125}
1126
1127static unsigned hex_value(int ch)
1128{
1129 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1130}
1131
1132static ssize_t bin_uuid(struct file *file,
1133 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1134{
1135 mm_segment_t old_fs = get_fs();
1136 ssize_t result, copied = 0;
1137
1138 /* Only supports reads */
1139 if (oldval && oldlen) {
1140 loff_t pos = 0;
1141 char buf[40], *str = buf;
1142 unsigned char uuid[16];
1143 int i;
1144
1145 set_fs(KERNEL_DS);
1146 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1147 set_fs(old_fs);
1148 if (result < 0)
1149 goto out;
1150
1151 buf[result] = '\0';
1152
1153 /* Convert the uuid to from a string to binary */
1154 for (i = 0; i < 16; i++) {
1155 result = -EIO;
1156 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1157 goto out;
1158
1159 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]);
1160 str += 2;
1161 if (*str == '-')
1162 str++;
1163 }
1164
1165 if (oldlen > 16)
1166 oldlen = 16;
1167
1168 result = -EFAULT;
1169 if (copy_to_user(oldval, uuid, oldlen))
1170 goto out;
1171
1172 copied = oldlen;
1173 }
1174 result = copied;
1175out:
1176 return result;
1177}
1178
1179static ssize_t bin_dn_node_address(struct file *file,
1180 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1181{
1182 mm_segment_t old_fs = get_fs();
1183 ssize_t result, copied = 0;
1184
1185 if (oldval && oldlen) {
1186 loff_t pos = 0;
1187 char buf[15], *nodep;
1188 unsigned long area, node;
1189 __le16 dnaddr;
1190
1191 set_fs(KERNEL_DS);
1192 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1193 set_fs(old_fs);
1194 if (result < 0)
1195 goto out;
1196
1197 buf[result] = '\0';
1198
1199 /* Convert the decnet addresss to binary */
1200 result = -EIO;
1201 nodep = strchr(buf, '.') + 1;
1202 if (!nodep)
1203 goto out;
1204
1205 area = simple_strtoul(buf, NULL, 10);
1206 node = simple_strtoul(nodep, NULL, 10);
1207
1208 result = -EIO;
1209 if ((area > 63)||(node > 1023))
1210 goto out;
1211
1212 dnaddr = cpu_to_le16((area << 10) | node);
1213
1214 result = -EFAULT;
1215 if (put_user(dnaddr, (__le16 __user *)oldval))
1216 goto out;
1217
1218 copied = sizeof(dnaddr);
1219 }
1220
1221 if (newval && newlen) {
1222 loff_t pos = 0;
1223 __le16 dnaddr;
1224 char buf[15];
1225 int len;
1226
1227 result = -EINVAL;
1228 if (newlen != sizeof(dnaddr))
1229 goto out;
1230
1231 result = -EFAULT;
1232 if (get_user(dnaddr, (__le16 __user *)newval))
1233 goto out;
1234
1235 len = snprintf(buf, sizeof(buf), "%hu.%hu",
1236 le16_to_cpu(dnaddr) >> 10,
1237 le16_to_cpu(dnaddr) & 0x3ff);
1238
1239 set_fs(KERNEL_DS);
1240 result = vfs_write(file, buf, len, &pos);
1241 set_fs(old_fs);
1242 if (result < 0)
1243 goto out;
1244 }
1245
1246 result = copied;
1247out:
1248 return result;
1249}
1250
1251static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
1252{
1253 const struct bin_table *table = &bin_root_table[0];
1254 int ctl_name;
1255
1256 /* The binary sysctl tables have a small maximum depth so
1257 * there is no danger of overflowing our path as it PATH_MAX
1258 * bytes long.
1259 */
1260 memcpy(path, "sys/", 4);
1261 path += 4;
1262
1263repeat:
1264 if (!nlen)
1265 return ERR_PTR(-ENOTDIR);
1266 ctl_name = *name;
1267 name++;
1268 nlen--;
1269 for ( ; table->convert; table++) {
1270 int len = 0;
1271
1272 /*
1273 * For a wild card entry map from ifindex to network
1274 * device name.
1275 */
1276 if (!table->ctl_name) {
1277#ifdef CONFIG_NET
1278 struct net *net = current->nsproxy->net_ns;
1279 struct net_device *dev;
1280 dev = dev_get_by_index(net, ctl_name);
1281 if (dev) {
1282 len = strlen(dev->name);
1283 memcpy(path, dev->name, len);
1284 dev_put(dev);
1285 }
1286#endif
1287 /* Use the well known sysctl number to proc name mapping */
1288 } else if (ctl_name == table->ctl_name) {
1289 len = strlen(table->procname);
1290 memcpy(path, table->procname, len);
1291 }
1292 if (len) {
1293 path += len;
1294 if (table->child) {
1295 *path++ = '/';
1296 table = table->child;
1297 goto repeat;
1298 }
1299 *path = '\0';
1300 return table;
1301 }
1302 }
1303 return ERR_PTR(-ENOTDIR);
1304}
1305
1306static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
1307{
1308 char *tmp, *result;
1309
1310 result = ERR_PTR(-ENOMEM);
1311 tmp = __getname();
1312 if (tmp) {
1313 const struct bin_table *table = get_sysctl(name, nlen, tmp);
1314 result = tmp;
1315 *tablep = table;
1316 if (IS_ERR(table)) {
1317 __putname(tmp);
1318 result = ERR_CAST(table);
1319 }
1320 }
1321 return result;
1322}
1323
1324static ssize_t binary_sysctl(const int *name, int nlen,
1325 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1326{
1327 const struct bin_table *table = NULL;
1328 struct nameidata nd;
1329 struct vfsmount *mnt;
1330 struct file *file;
1331 ssize_t result;
1332 char *pathname;
1333 int flags;
1334 int acc_mode, fmode;
1335
1336 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname);
1338 if (IS_ERR(pathname))
1339 goto out;
1340
1341 /* How should the sysctl be accessed? */
1342 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) {
1347 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) {
1351 flags = O_RDONLY;
1352 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else {
1355 result = 0;
1356 goto out_putname;
1357 }
1358
1359 mnt = current->nsproxy->pid_ns->proc_mnt;
1360 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
1361 if (result)
1362 goto out_putname;
1363
1364 result = may_open(&nd.path, acc_mode, fmode);
1365 if (result)
1366 goto out_putpath;
1367
1368 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1369 result = PTR_ERR(file);
1370 if (IS_ERR(file))
1371 goto out_putname;
1372
1373 result = table->convert(file, oldval, oldlen, newval, newlen);
1374
1375 fput(file);
1376out_putname:
1377 putname(pathname);
1378out:
1379 return result;
1380
1381out_putpath:
1382 path_put(&nd.path);
1383 goto out_putname;
1384}
1385
1386
1387#else /* CONFIG_SYSCTL_SYSCALL */
1388
1389static ssize_t binary_sysctl(const int *name, int nlen,
1390 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1391{
1392 return -ENOSYS;
1393}
1394
1395#endif /* CONFIG_SYSCTL_SYSCALL */
1396
1397
1398static void deprecated_sysctl_warning(const int *name, int nlen)
1399{
1400 int i;
1401
1402 if (printk_ratelimit()) {
1403 printk(KERN_INFO
1404 "warning: process `%s' used the deprecated sysctl "
1405 "system call with ", current->comm);
1406 for (i = 0; i < nlen; i++)
1407 printk("%d.", name[i]);
1408 printk("\n");
1409 }
1410 return;
1411}
1412
1413static ssize_t do_sysctl(int __user *args_name, int nlen,
1414 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1415{
1416 int name[CTL_MAXNAME];
1417 int i;
1418
1419 /* Check args->nlen. */
1420 if (nlen < 0 || nlen > CTL_MAXNAME)
1421 return -ENOTDIR;
1422 /* Read in the sysctl name for simplicity */
1423 for (i = 0; i < nlen; i++)
1424 if (get_user(name[i], args_name + i))
1425 return -EFAULT;
1426
1427 deprecated_sysctl_warning(name, nlen);
1428
1429 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1430}
1431
1432SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1433{
1434 struct __sysctl_args tmp;
1435 size_t oldlen = 0;
1436 ssize_t result;
1437
1438 if (copy_from_user(&tmp, args, sizeof(tmp)))
1439 return -EFAULT;
1440
1441 if (tmp.oldval && !tmp.oldlenp)
1442 return -EFAULT;
1443
1444 if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
1445 return -EFAULT;
1446
1447 result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
1448 tmp.newval, tmp.newlen);
1449
1450 if (result >= 0) {
1451 oldlen = result;
1452 result = 0;
1453 }
1454
1455 if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
1456 return -EFAULT;
1457
1458 return result;
1459}
1460
1461
1462#ifdef CONFIG_COMPAT
1463#include <asm/compat.h>
1464
1465struct compat_sysctl_args {
1466 compat_uptr_t name;
1467 int nlen;
1468 compat_uptr_t oldval;
1469 compat_uptr_t oldlenp;
1470 compat_uptr_t newval;
1471 compat_size_t newlen;
1472 compat_ulong_t __unused[4];
1473};
1474
1475asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
1476{
1477 struct compat_sysctl_args tmp;
1478 compat_size_t __user *compat_oldlenp;
1479 size_t oldlen = 0;
1480 ssize_t result;
1481
1482 if (copy_from_user(&tmp, args, sizeof(tmp)))
1483 return -EFAULT;
1484
1485 if (tmp.oldval && !tmp.oldlenp)
1486 return -EFAULT;
1487
1488 compat_oldlenp = compat_ptr(tmp.oldlenp);
1489 if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
1490 return -EFAULT;
1491
1492 result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
1493 compat_ptr(tmp.oldval), oldlen,
1494 compat_ptr(tmp.newval), tmp.newlen);
1495
1496 if (result >= 0) {
1497 oldlen = result;
1498 result = 0;
1499 }
1500
1501 if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
1502 return -EFAULT;
1503
1504 return result;
1505}
1506
1507#endif /* CONFIG_COMPAT */
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b6e7aaea4604..04cdcf72c827 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -5,1239 +5,6 @@
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
7 7
8struct trans_ctl_table {
9 int ctl_name;
10 const char *procname;
11 const struct trans_ctl_table *child;
12};
13
14static const struct trans_ctl_table trans_random_table[] = {
15 { RANDOM_POOLSIZE, "poolsize" },
16 { RANDOM_ENTROPY_COUNT, "entropy_avail" },
17 { RANDOM_READ_THRESH, "read_wakeup_threshold" },
18 { RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
19 { RANDOM_BOOT_ID, "boot_id" },
20 { RANDOM_UUID, "uuid" },
21 {}
22};
23
24static const struct trans_ctl_table trans_pty_table[] = {
25 { PTY_MAX, "max" },
26 { PTY_NR, "nr" },
27 {}
28};
29
30static const struct trans_ctl_table trans_kern_table[] = {
31 { KERN_OSTYPE, "ostype" },
32 { KERN_OSRELEASE, "osrelease" },
33 /* KERN_OSREV not used */
34 { KERN_VERSION, "version" },
35 /* KERN_SECUREMASK not used */
36 /* KERN_PROF not used */
37 { KERN_NODENAME, "hostname" },
38 { KERN_DOMAINNAME, "domainname" },
39
40 { KERN_PANIC, "panic" },
41 { KERN_REALROOTDEV, "real-root-dev" },
42
43 { KERN_SPARC_REBOOT, "reboot-cmd" },
44 { KERN_CTLALTDEL, "ctrl-alt-del" },
45 { KERN_PRINTK, "printk" },
46
47 /* KERN_NAMETRANS not used */
48 /* KERN_PPC_HTABRECLAIM not used */
49 /* KERN_PPC_ZEROPAGED not used */
50 { KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
51
52 { KERN_MODPROBE, "modprobe" },
53 { KERN_SG_BIG_BUFF, "sg-big-buff" },
54 { KERN_ACCT, "acct" },
55 { KERN_PPC_L2CR, "l2cr" },
56
57 /* KERN_RTSIGNR not used */
58 /* KERN_RTSIGMAX not used */
59
60 { KERN_SHMMAX, "shmmax" },
61 { KERN_MSGMAX, "msgmax" },
62 { KERN_MSGMNB, "msgmnb" },
63 /* KERN_MSGPOOL not used*/
64 { KERN_SYSRQ, "sysrq" },
65 { KERN_MAX_THREADS, "threads-max" },
66 { KERN_RANDOM, "random", trans_random_table },
67 { KERN_SHMALL, "shmall" },
68 { KERN_MSGMNI, "msgmni" },
69 { KERN_SEM, "sem" },
70 { KERN_SPARC_STOP_A, "stop-a" },
71 { KERN_SHMMNI, "shmmni" },
72
73 { KERN_OVERFLOWUID, "overflowuid" },
74 { KERN_OVERFLOWGID, "overflowgid" },
75
76 { KERN_HOTPLUG, "hotplug", },
77 { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
78
79 { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
80 { KERN_CORE_USES_PID, "core_uses_pid" },
81 { KERN_TAINTED, "tainted" },
82 { KERN_CADPID, "cad_pid" },
83 { KERN_PIDMAX, "pid_max" },
84 { KERN_CORE_PATTERN, "core_pattern" },
85 { KERN_PANIC_ON_OOPS, "panic_on_oops" },
86 { KERN_HPPA_PWRSW, "soft-power" },
87 { KERN_HPPA_UNALIGNED, "unaligned-trap" },
88
89 { KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
90 { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
91
92 { KERN_PTY, "pty", trans_pty_table },
93 { KERN_NGROUPS_MAX, "ngroups_max" },
94 { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
95 { KERN_HZ_TIMER, "hz_timer" },
96 { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
97 { KERN_BOOTLOADER_TYPE, "bootloader_type" },
98 { KERN_RANDOMIZE, "randomize_va_space" },
99
100 { KERN_SPIN_RETRY, "spin_retry" },
101 { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" },
102 { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
103 { KERN_COMPAT_LOG, "compat-log" },
104 { KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
105 { KERN_NMI_WATCHDOG, "nmi_watchdog" },
106 { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
107 {}
108};
109
110static const struct trans_ctl_table trans_vm_table[] = {
111 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
112 { VM_PAGE_CLUSTER, "page-cluster" },
113 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
114 { VM_DIRTY_RATIO, "dirty_ratio" },
115 { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
116 { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" },
117 { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
118 { VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
119 /* VM_PAGEBUF unused */
120 { VM_HUGETLB_PAGES, "nr_hugepages" },
121 { VM_SWAPPINESS, "swappiness" },
122 { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
123 { VM_MIN_FREE_KBYTES, "min_free_kbytes" },
124 { VM_MAX_MAP_COUNT, "max_map_count" },
125 { VM_LAPTOP_MODE, "laptop_mode" },
126 { VM_BLOCK_DUMP, "block_dump" },
127 { VM_HUGETLB_GROUP, "hugetlb_shm_group" },
128 { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
129 { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
130 /* VM_SWAP_TOKEN_TIMEOUT unused */
131 { VM_DROP_PAGECACHE, "drop_caches" },
132 { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
133 { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
134 { VM_MIN_UNMAPPED, "min_unmapped_ratio" },
135 { VM_PANIC_ON_OOM, "panic_on_oom" },
136 { VM_VDSO_ENABLED, "vdso_enabled" },
137 { VM_MIN_SLAB, "min_slab_ratio" },
138
139 {}
140};
141
142static const struct trans_ctl_table trans_net_core_table[] = {
143 { NET_CORE_WMEM_MAX, "wmem_max" },
144 { NET_CORE_RMEM_MAX, "rmem_max" },
145 { NET_CORE_WMEM_DEFAULT, "wmem_default" },
146 { NET_CORE_RMEM_DEFAULT, "rmem_default" },
147 /* NET_CORE_DESTROY_DELAY unused */
148 { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
149 /* NET_CORE_FASTROUTE unused */
150 { NET_CORE_MSG_COST, "message_cost" },
151 { NET_CORE_MSG_BURST, "message_burst" },
152 { NET_CORE_OPTMEM_MAX, "optmem_max" },
153 /* NET_CORE_HOT_LIST_LENGTH unused */
154 /* NET_CORE_DIVERT_VERSION unused */
155 /* NET_CORE_NO_CONG_THRESH unused */
156 /* NET_CORE_NO_CONG unused */
157 /* NET_CORE_LO_CONG unused */
158 /* NET_CORE_MOD_CONG unused */
159 { NET_CORE_DEV_WEIGHT, "dev_weight" },
160 { NET_CORE_SOMAXCONN, "somaxconn" },
161 { NET_CORE_BUDGET, "netdev_budget" },
162 { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
163 { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
164 { NET_CORE_WARNINGS, "warnings" },
165 {},
166};
167
168static const struct trans_ctl_table trans_net_unix_table[] = {
169 /* NET_UNIX_DESTROY_DELAY unused */
170 /* NET_UNIX_DELETE_DELAY unused */
171 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
172 {}
173};
174
175static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
176 { NET_IPV4_ROUTE_FLUSH, "flush" },
177 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
178 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
179 { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
180 { NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
181 { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
182 { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
183 { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
184 { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
185 { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
186 { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
187 { NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
188 { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
189 { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
190 { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
191 { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
192 { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
193 { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
194 { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
195 {}
196};
197
198static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
199 { NET_IPV4_CONF_FORWARDING, "forwarding" },
200 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
201
202 { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
203 { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
204 { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
205 { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
206 { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
207 { NET_IPV4_CONF_RP_FILTER, "rp_filter" },
208 { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
209 { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
210 { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
211 { NET_IPV4_CONF_TAG, "tag" },
212 { NET_IPV4_CONF_ARPFILTER, "arp_filter" },
213 { NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
214 { NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
215 { NET_IPV4_CONF_NOPOLICY, "disable_policy" },
216 { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
217
218 { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
222 { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
223 {}
224};
225
226static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
227 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
228 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
229 { 0, NULL, trans_net_ipv4_conf_vars_table },
230 {}
231};
232
233static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
234 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
235 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
236 { NET_NEIGH_APP_SOLICIT, "app_solicit" },
237 { NET_NEIGH_RETRANS_TIME, "retrans_time" },
238 { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
239 { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
240 { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
241 { NET_NEIGH_UNRES_QLEN, "unres_qlen" },
242 { NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
243 { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" },
244 { NET_NEIGH_PROXY_DELAY, "proxy_delay" },
245 { NET_NEIGH_LOCKTIME, "locktime" },
246 { NET_NEIGH_GC_INTERVAL, "gc_interval" },
247 { NET_NEIGH_GC_THRESH1, "gc_thresh1" },
248 { NET_NEIGH_GC_THRESH2, "gc_thresh2" },
249 { NET_NEIGH_GC_THRESH3, "gc_thresh3" },
250 { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
251 { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
252 {}
253};
254
255static const struct trans_ctl_table trans_net_neigh_table[] = {
256 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
257 { 0, NULL, trans_net_neigh_vars_table },
258 {}
259};
260
261static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
262 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
263
264 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
265 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" },
266 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" },
267 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" },
268 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" },
269 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" },
270 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" },
271 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" },
272
273 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" },
274 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" },
275 { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" },
276 { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" },
277
278 { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
279 { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
280 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" },
281 { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
282 { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
283 { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
284
285 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" },
286 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" },
287 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" },
288 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" },
289 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" },
290 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" },
291 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" },
292
293 { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
294 { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
295 {}
296};
297
298static const struct trans_ctl_table trans_net_ipv4_table[] = {
299 { NET_IPV4_FORWARD, "ip_forward" },
300 { NET_IPV4_DYNADDR, "ip_dynaddr" },
301
302 { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table },
303 { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table },
304 { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table },
305 /* NET_IPV4_FIB_HASH unused */
306 { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table },
307
308 { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
309 { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
310 { NET_IPV4_TCP_SACK, "tcp_sack" },
311 { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
312 { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
313 /* NET_IPV4_AUTOCONFIG unused */
314 { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
315 { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
316 { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
317 { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
318 { NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
319 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
320 { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
321 { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
322 { NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
323 { NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
324 { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
325 /* NET_IPV4_IP_MASQ_DEBUG unused */
326 { NET_TCP_SYNCOOKIES, "tcp_syncookies" },
327 { NET_TCP_STDURG, "tcp_stdurg" },
328 { NET_TCP_RFC1337, "tcp_rfc1337" },
329 /* NET_TCP_SYN_TAILDROP unused */
330 { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
331 { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
332 { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
333 { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
334 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
335 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
336 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
337 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
338 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
339 { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
340 { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
341 { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
342 /* NET_IPV4_ALWAYS_DEFRAG unused */
343 { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
344 { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
345 { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
346 { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
347 { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
348 { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
349 { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
350 { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
351 { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
352 { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
353 { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
354 { NET_TCP_FACK, "tcp_fack" },
355 { NET_TCP_REORDERING, "tcp_reordering" },
356 { NET_TCP_ECN, "tcp_ecn" },
357 { NET_TCP_DSACK, "tcp_dsack" },
358 { NET_TCP_MEM, "tcp_mem" },
359 { NET_TCP_WMEM, "tcp_wmem" },
360 { NET_TCP_RMEM, "tcp_rmem" },
361 { NET_TCP_APP_WIN, "tcp_app_win" },
362 { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
363 { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
364 { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
365 { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
366 { NET_TCP_TW_REUSE, "tcp_tw_reuse" },
367 { NET_TCP_FRTO, "tcp_frto" },
368 { NET_TCP_LOW_LATENCY, "tcp_low_latency" },
369 { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
370 { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
371 { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
372 /* NET_TCP_DEFAULT_WIN_SCALE unused */
373 { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
374 { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
375 /* NET_TCP_BIC_BETA unused */
376 { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
377 { NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
378 { NET_TCP_ABC, "tcp_abc" },
379 { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" },
380 { NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
381 { NET_TCP_BASE_MSS, "tcp_base_mss" },
382 { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
383 { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
384 { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
385 { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
386 { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
387 { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
388 { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
389 { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" },
390 { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
391 { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
392 { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
393 { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
394 {}
395};
396
397static const struct trans_ctl_table trans_net_ipx_table[] = {
398 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
399 /* NET_IPX_FORWARDING unused */
400 {}
401};
402
403static const struct trans_ctl_table trans_net_atalk_table[] = {
404 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
405 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
406 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
407 { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
408 {},
409};
410
411static const struct trans_ctl_table trans_net_netrom_table[] = {
412 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
413 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
414 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
415 { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
416 { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
417 { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
418 { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
419 { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
420 { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
421 { NET_NETROM_ROUTING_CONTROL, "routing_control" },
422 { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
423 { NET_NETROM_RESET, "reset" },
424 {}
425};
426
427static const struct trans_ctl_table trans_net_ax25_param_table[] = {
428 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
429 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
430 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
431 { NET_AX25_CONNECT_MODE, "connect_mode" },
432 { NET_AX25_STANDARD_WINDOW, "standard_window_size" },
433 { NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
434 { NET_AX25_T1_TIMEOUT, "t1_timeout" },
435 { NET_AX25_T2_TIMEOUT, "t2_timeout" },
436 { NET_AX25_T3_TIMEOUT, "t3_timeout" },
437 { NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
438 { NET_AX25_N2, "maximum_retry_count" },
439 { NET_AX25_PACLEN, "maximum_packet_length" },
440 { NET_AX25_PROTOCOL, "protocol" },
441 { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
442 {}
443};
444
445static const struct trans_ctl_table trans_net_ax25_table[] = {
446 { 0, NULL, trans_net_ax25_param_table },
447 {}
448};
449
450static const struct trans_ctl_table trans_net_bridge_table[] = {
451 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
452 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
453 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
454 { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" },
455 { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" },
456 {}
457};
458
459static const struct trans_ctl_table trans_net_rose_table[] = {
460 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
461 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
462 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
463 { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
464 { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
465 { NET_ROSE_ROUTING_CONTROL, "routing_control" },
466 { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
467 { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
468 { NET_ROSE_WINDOW_SIZE, "window_size" },
469 { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
470 {}
471};
472
473static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
474 { NET_IPV6_FORWARDING, "forwarding" },
475 { NET_IPV6_HOP_LIMIT, "hop_limit" },
476 { NET_IPV6_MTU, "mtu" },
477 { NET_IPV6_ACCEPT_RA, "accept_ra" },
478 { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
479 { NET_IPV6_AUTOCONF, "autoconf" },
480 { NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
481 { NET_IPV6_RTR_SOLICITS, "router_solicitations" },
482 { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
483 { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
484 { NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
485 { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
486 { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
487 { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
488 { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
489 { NET_IPV6_MAX_ADDRESSES, "max_addresses" },
490 { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
491 { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
492 { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
493 { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
494 { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
495 { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
496 { NET_IPV6_PROXY_NDP, "proxy_ndp" },
497 { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
498 {}
499};
500
501static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
502 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
503 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
504 { 0, NULL, trans_net_ipv6_conf_var_table },
505 {}
506};
507
508static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
509 { NET_IPV6_ROUTE_FLUSH, "flush" },
510 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
511 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
512 { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
513 { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
514 { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
515 { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
516 { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
517 { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
518 { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
519 {}
520};
521
522static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
523 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
524 {}
525};
526
527static const struct trans_ctl_table trans_net_ipv6_table[] = {
528 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
529 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
530 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
531 { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table },
532 { NET_IPV6_BINDV6ONLY, "bindv6only" },
533 { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
534 { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
535 { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
536 { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
537 { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
538 { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
539 {}
540};
541
542static const struct trans_ctl_table trans_net_x25_table[] = {
543 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
544 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
545 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
546 { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
547 { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
548 { NET_X25_FORWARD, "x25_forward" },
549 {}
550};
551
552static const struct trans_ctl_table trans_net_tr_table[] = {
553 { NET_TR_RIF_TIMEOUT, "rif_timeout" },
554 {}
555};
556
557
558static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
559 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
560 { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
561 { NET_DECNET_CONF_DEV_T2, "t2" },
562 { NET_DECNET_CONF_DEV_T3, "t3" },
563 {}
564};
565
566static const struct trans_ctl_table trans_net_decnet_conf[] = {
567 { 0, NULL, trans_net_decnet_conf_vars },
568 {}
569};
570
571static const struct trans_ctl_table trans_net_decnet_table[] = {
572 { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
573 { NET_DECNET_NODE_ADDRESS, "node_address" },
574 { NET_DECNET_NODE_NAME, "node_name" },
575 { NET_DECNET_DEFAULT_DEVICE, "default_device" },
576 { NET_DECNET_TIME_WAIT, "time_wait" },
577 { NET_DECNET_DN_COUNT, "dn_count" },
578 { NET_DECNET_DI_COUNT, "di_count" },
579 { NET_DECNET_DR_COUNT, "dr_count" },
580 { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
581 { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
582 { NET_DECNET_MEM, "decnet_mem" },
583 { NET_DECNET_RMEM, "decnet_rmem" },
584 { NET_DECNET_WMEM, "decnet_wmem" },
585 { NET_DECNET_DEBUG_LEVEL, "debug" },
586 {}
587};
588
589static const struct trans_ctl_table trans_net_sctp_table[] = {
590 { NET_SCTP_RTO_INITIAL, "rto_initial" },
591 { NET_SCTP_RTO_MIN, "rto_min" },
592 { NET_SCTP_RTO_MAX, "rto_max" },
593 { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
594 { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
595 { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
596 { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
597 { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
598 { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
599 { NET_SCTP_HB_INTERVAL, "hb_interval" },
600 { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
601 { NET_SCTP_MAX_BURST, "max_burst" },
602 { NET_SCTP_ADDIP_ENABLE, "addip_enable" },
603 { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
604 { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
605 { NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
606 { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
607 {}
608};
609
610static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
611 { NET_LLC2_ACK_TIMEOUT, "ack" },
612 { NET_LLC2_P_TIMEOUT, "p" },
613 { NET_LLC2_REJ_TIMEOUT, "rej" },
614 { NET_LLC2_BUSY_TIMEOUT, "busy" },
615 {}
616};
617
618static const struct trans_ctl_table trans_net_llc_station_table[] = {
619 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
620 {}
621};
622
623static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
624 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
625 {}
626};
627
628static const struct trans_ctl_table trans_net_llc_table[] = {
629 { NET_LLC2, "llc2", trans_net_llc_llc2_table },
630 { NET_LLC_STATION, "station", trans_net_llc_station_table },
631 {}
632};
633
634static const struct trans_ctl_table trans_net_netfilter_table[] = {
635 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
636 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
637 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
638 { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" },
639 { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" },
640 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" },
641 { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" },
642 { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" },
643 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" },
644 { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" },
645 { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" },
646 { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" },
647 { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" },
648 { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
649 { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
650 { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" },
651 { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
652 { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
653 { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
654 { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" },
655 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" },
656 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" },
657 { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" },
658 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" },
659 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" },
660 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" },
661 { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
662 { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" },
663 { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" },
664 { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
665 { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
666 { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
667
668 {}
669};
670
671static const struct trans_ctl_table trans_net_dccp_table[] = {
672 { NET_DCCP_DEFAULT, "default" },
673 {}
674};
675
676static const struct trans_ctl_table trans_net_irda_table[] = {
677 { NET_IRDA_DISCOVERY, "discovery" },
678 { NET_IRDA_DEVNAME, "devname" },
679 { NET_IRDA_DEBUG, "debug" },
680 { NET_IRDA_FAST_POLL, "fast_poll_increase" },
681 { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
682 { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
683 { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
684 { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
685 { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
686 { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
687 { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
688 { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
689 { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
690 { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
691 {}
692};
693
694static const struct trans_ctl_table trans_net_table[] = {
695 { NET_CORE, "core", trans_net_core_table },
696 /* NET_ETHER not used */
697 /* NET_802 not used */
698 { NET_UNIX, "unix", trans_net_unix_table },
699 { NET_IPV4, "ipv4", trans_net_ipv4_table },
700 { NET_IPX, "ipx", trans_net_ipx_table },
701 { NET_ATALK, "appletalk", trans_net_atalk_table },
702 { NET_NETROM, "netrom", trans_net_netrom_table },
703 { NET_AX25, "ax25", trans_net_ax25_table },
704 { NET_BRIDGE, "bridge", trans_net_bridge_table },
705 { NET_ROSE, "rose", trans_net_rose_table },
706 { NET_IPV6, "ipv6", trans_net_ipv6_table },
707 { NET_X25, "x25", trans_net_x25_table },
708 { NET_TR, "token-ring", trans_net_tr_table },
709 { NET_DECNET, "decnet", trans_net_decnet_table },
710 /* NET_ECONET not used */
711 { NET_SCTP, "sctp", trans_net_sctp_table },
712 { NET_LLC, "llc", trans_net_llc_table },
713 { NET_NETFILTER, "netfilter", trans_net_netfilter_table },
714 { NET_DCCP, "dccp", trans_net_dccp_table },
715 { NET_IRDA, "irda", trans_net_irda_table },
716 { 2089, "nf_conntrack_max" },
717 {}
718};
719
720static const struct trans_ctl_table trans_fs_quota_table[] = {
721 { FS_DQ_LOOKUPS, "lookups" },
722 { FS_DQ_DROPS, "drops" },
723 { FS_DQ_READS, "reads" },
724 { FS_DQ_WRITES, "writes" },
725 { FS_DQ_CACHE_HITS, "cache_hits" },
726 { FS_DQ_ALLOCATED, "allocated_dquots" },
727 { FS_DQ_FREE, "free_dquots" },
728 { FS_DQ_SYNCS, "syncs" },
729 { FS_DQ_WARNINGS, "warnings" },
730 {}
731};
732
733static const struct trans_ctl_table trans_fs_xfs_table[] = {
734 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
735 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
736 { XFS_PANIC_MASK, "panic_mask" },
737
738 { XFS_ERRLEVEL, "error_level" },
739 { XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
740 { XFS_INHERIT_SYNC, "inherit_sync" },
741 { XFS_INHERIT_NODUMP, "inherit_nodump" },
742 { XFS_INHERIT_NOATIME, "inherit_noatime" },
743 { XFS_BUF_TIMER, "xfsbufd_centisecs" },
744 { XFS_BUF_AGE, "age_buffer_centisecs" },
745 { XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
746 { XFS_ROTORSTEP, "rotorstep" },
747 { XFS_INHERIT_NODFRG, "inherit_nodefrag" },
748 { XFS_FILESTREAM_TIMER, "filestream_centisecs" },
749 { XFS_STATS_CLEAR, "stats_clear" },
750 {}
751};
752
753static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
754 { 1, "hb_ctl_path" },
755 {}
756};
757
758static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
759 { 1, "nm", trans_fs_ocfs2_nm_table },
760 {}
761};
762
763static const struct trans_ctl_table trans_inotify_table[] = {
764 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
765 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
766 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
767 {}
768};
769
770static const struct trans_ctl_table trans_fs_table[] = {
771 { FS_NRINODE, "inode-nr" },
772 { FS_STATINODE, "inode-state" },
773 /* FS_MAXINODE unused */
774 /* FS_NRDQUOT unused */
775 /* FS_MAXDQUOT unused */
776 { FS_NRFILE, "file-nr" },
777 { FS_MAXFILE, "file-max" },
778 { FS_DENTRY, "dentry-state" },
779 /* FS_NRSUPER unused */
780 /* FS_MAXUPSER unused */
781 { FS_OVERFLOWUID, "overflowuid" },
782 { FS_OVERFLOWGID, "overflowgid" },
783 { FS_LEASES, "leases-enable" },
784 { FS_DIR_NOTIFY, "dir-notify-enable" },
785 { FS_LEASE_TIME, "lease-break-time" },
786 { FS_DQSTATS, "quota", trans_fs_quota_table },
787 { FS_XFS, "xfs", trans_fs_xfs_table },
788 { FS_AIO_NR, "aio-nr" },
789 { FS_AIO_MAX_NR, "aio-max-nr" },
790 { FS_INOTIFY, "inotify", trans_inotify_table },
791 { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table },
792 { KERN_SETUID_DUMPABLE, "suid_dumpable" },
793 {}
794};
795
796static const struct trans_ctl_table trans_debug_table[] = {
797 {}
798};
799
800static const struct trans_ctl_table trans_cdrom_table[] = {
801 { DEV_CDROM_INFO, "info" },
802 { DEV_CDROM_AUTOCLOSE, "autoclose" },
803 { DEV_CDROM_AUTOEJECT, "autoeject" },
804 { DEV_CDROM_DEBUG, "debug" },
805 { DEV_CDROM_LOCK, "lock" },
806 { DEV_CDROM_CHECK_MEDIA, "check_media" },
807 {}
808};
809
810static const struct trans_ctl_table trans_ipmi_table[] = {
811 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
812 {}
813};
814
815static const struct trans_ctl_table trans_mac_hid_files[] = {
816 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
817 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
818 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
819 { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
820 { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
821 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
822 {}
823};
824
825static const struct trans_ctl_table trans_raid_table[] = {
826 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
827 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
828 {}
829};
830
831static const struct trans_ctl_table trans_scsi_table[] = {
832 { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
833 {}
834};
835
836static const struct trans_ctl_table trans_parport_default_table[] = {
837 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
838 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
839 {}
840};
841
842static const struct trans_ctl_table trans_parport_device_table[] = {
843 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
844 {}
845};
846
847static const struct trans_ctl_table trans_parport_devices_table[] = {
848 { DEV_PARPORT_DEVICES_ACTIVE, "active" },
849 { 0, NULL, trans_parport_device_table },
850 {}
851};
852
853static const struct trans_ctl_table trans_parport_parport_table[] = {
854 { DEV_PARPORT_SPINTIME, "spintime" },
855 { DEV_PARPORT_BASE_ADDR, "base-addr" },
856 { DEV_PARPORT_IRQ, "irq" },
857 { DEV_PARPORT_DMA, "dma" },
858 { DEV_PARPORT_MODES, "modes" },
859 { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table },
860 { DEV_PARPORT_AUTOPROBE, "autoprobe" },
861 { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" },
862 { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" },
863 { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" },
864 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
865 {}
866};
867static const struct trans_ctl_table trans_parport_table[] = {
868 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
869 { 0, NULL, trans_parport_parport_table },
870 {}
871};
872
873static const struct trans_ctl_table trans_dev_table[] = {
874 { DEV_CDROM, "cdrom", trans_cdrom_table },
875 /* DEV_HWMON unused */
876 { DEV_PARPORT, "parport", trans_parport_table },
877 { DEV_RAID, "raid", trans_raid_table },
878 { DEV_MAC_HID, "mac_hid", trans_mac_hid_files },
879 { DEV_SCSI, "scsi", trans_scsi_table },
880 { DEV_IPMI, "ipmi", trans_ipmi_table },
881 {}
882};
883
884static const struct trans_ctl_table trans_bus_isa_table[] = {
885 { BUS_ISA_MEM_BASE, "membase" },
886 { BUS_ISA_PORT_BASE, "portbase" },
887 { BUS_ISA_PORT_SHIFT, "portshift" },
888 {}
889};
890
891static const struct trans_ctl_table trans_bus_table[] = {
892 { CTL_BUS_ISA, "isa", trans_bus_isa_table },
893 {}
894};
895
896static const struct trans_ctl_table trans_arlan_conf_table0[] = {
897 { 1, "spreadingCode" },
898 { 2, "channelNumber" },
899 { 3, "scramblingDisable" },
900 { 4, "txAttenuation" },
901 { 5, "systemId" },
902 { 6, "maxDatagramSize" },
903 { 7, "maxFrameSize" },
904 { 8, "maxRetries" },
905 { 9, "receiveMode" },
906 { 10, "priority" },
907 { 11, "rootOrRepeater" },
908 { 12, "SID" },
909 { 13, "registrationMode" },
910 { 14, "registrationFill" },
911 { 15, "localTalkAddress" },
912 { 16, "codeFormat" },
913 { 17, "numChannels" },
914 { 18, "channel1" },
915 { 19, "channel2" },
916 { 20, "channel3" },
917 { 21, "channel4" },
918 { 22, "txClear" },
919 { 23, "txRetries" },
920 { 24, "txRouting" },
921 { 25, "txScrambled" },
922 { 26, "rxParameter" },
923 { 27, "txTimeoutMs" },
924 { 28, "waitCardTimeout" },
925 { 29, "channelSet" },
926 { 30, "name" },
927 { 31, "waitTime" },
928 { 32, "lParameter" },
929 { 33, "_15" },
930 { 34, "headerSize" },
931 { 36, "tx_delay_ms" },
932 { 37, "retries" },
933 { 38, "ReTransmitPacketMaxSize" },
934 { 39, "waitReTransmitPacketMaxSize" },
935 { 40, "fastReTransCount" },
936 { 41, "driverRetransmissions" },
937 { 42, "txAckTimeoutMs" },
938 { 43, "registrationInterrupts" },
939 { 44, "hardwareType" },
940 { 45, "radioType" },
941 { 46, "writeEEPROM" },
942 { 47, "writeRadioType" },
943 { 48, "entry_exit_debug" },
944 { 49, "debug" },
945 { 50, "in_speed" },
946 { 51, "out_speed" },
947 { 52, "in_speed10" },
948 { 53, "out_speed10" },
949 { 54, "in_speed_max" },
950 { 55, "out_speed_max" },
951 { 56, "measure_rate" },
952 { 57, "pre_Command_Wait" },
953 { 58, "rx_tweak1" },
954 { 59, "rx_tweak2" },
955 { 60, "tx_queue_len" },
956
957 { 150, "arlan0-txRing" },
958 { 151, "arlan0-rxRing" },
959 { 152, "arlan0-18" },
960 { 153, "arlan0-ring" },
961 { 154, "arlan0-shm-cpy" },
962 { 155, "config0" },
963 { 156, "reset0" },
964 {}
965};
966
967static const struct trans_ctl_table trans_arlan_conf_table1[] = {
968 { 1, "spreadingCode" },
969 { 2, "channelNumber" },
970 { 3, "scramblingDisable" },
971 { 4, "txAttenuation" },
972 { 5, "systemId" },
973 { 6, "maxDatagramSize" },
974 { 7, "maxFrameSize" },
975 { 8, "maxRetries" },
976 { 9, "receiveMode" },
977 { 10, "priority" },
978 { 11, "rootOrRepeater" },
979 { 12, "SID" },
980 { 13, "registrationMode" },
981 { 14, "registrationFill" },
982 { 15, "localTalkAddress" },
983 { 16, "codeFormat" },
984 { 17, "numChannels" },
985 { 18, "channel1" },
986 { 19, "channel2" },
987 { 20, "channel3" },
988 { 21, "channel4" },
989 { 22, "txClear" },
990 { 23, "txRetries" },
991 { 24, "txRouting" },
992 { 25, "txScrambled" },
993 { 26, "rxParameter" },
994 { 27, "txTimeoutMs" },
995 { 28, "waitCardTimeout" },
996 { 29, "channelSet" },
997 { 30, "name" },
998 { 31, "waitTime" },
999 { 32, "lParameter" },
1000 { 33, "_15" },
1001 { 34, "headerSize" },
1002 { 36, "tx_delay_ms" },
1003 { 37, "retries" },
1004 { 38, "ReTransmitPacketMaxSize" },
1005 { 39, "waitReTransmitPacketMaxSize" },
1006 { 40, "fastReTransCount" },
1007 { 41, "driverRetransmissions" },
1008 { 42, "txAckTimeoutMs" },
1009 { 43, "registrationInterrupts" },
1010 { 44, "hardwareType" },
1011 { 45, "radioType" },
1012 { 46, "writeEEPROM" },
1013 { 47, "writeRadioType" },
1014 { 48, "entry_exit_debug" },
1015 { 49, "debug" },
1016 { 50, "in_speed" },
1017 { 51, "out_speed" },
1018 { 52, "in_speed10" },
1019 { 53, "out_speed10" },
1020 { 54, "in_speed_max" },
1021 { 55, "out_speed_max" },
1022 { 56, "measure_rate" },
1023 { 57, "pre_Command_Wait" },
1024 { 58, "rx_tweak1" },
1025 { 59, "rx_tweak2" },
1026 { 60, "tx_queue_len" },
1027
1028 { 150, "arlan1-txRing" },
1029 { 151, "arlan1-rxRing" },
1030 { 152, "arlan1-18" },
1031 { 153, "arlan1-ring" },
1032 { 154, "arlan1-shm-cpy" },
1033 { 155, "config1" },
1034 { 156, "reset1" },
1035 {}
1036};
1037
1038static const struct trans_ctl_table trans_arlan_conf_table2[] = {
1039 { 1, "spreadingCode" },
1040 { 2, "channelNumber" },
1041 { 3, "scramblingDisable" },
1042 { 4, "txAttenuation" },
1043 { 5, "systemId" },
1044 { 6, "maxDatagramSize" },
1045 { 7, "maxFrameSize" },
1046 { 8, "maxRetries" },
1047 { 9, "receiveMode" },
1048 { 10, "priority" },
1049 { 11, "rootOrRepeater" },
1050 { 12, "SID" },
1051 { 13, "registrationMode" },
1052 { 14, "registrationFill" },
1053 { 15, "localTalkAddress" },
1054 { 16, "codeFormat" },
1055 { 17, "numChannels" },
1056 { 18, "channel1" },
1057 { 19, "channel2" },
1058 { 20, "channel3" },
1059 { 21, "channel4" },
1060 { 22, "txClear" },
1061 { 23, "txRetries" },
1062 { 24, "txRouting" },
1063 { 25, "txScrambled" },
1064 { 26, "rxParameter" },
1065 { 27, "txTimeoutMs" },
1066 { 28, "waitCardTimeout" },
1067 { 29, "channelSet" },
1068 { 30, "name" },
1069 { 31, "waitTime" },
1070 { 32, "lParameter" },
1071 { 33, "_15" },
1072 { 34, "headerSize" },
1073 { 36, "tx_delay_ms" },
1074 { 37, "retries" },
1075 { 38, "ReTransmitPacketMaxSize" },
1076 { 39, "waitReTransmitPacketMaxSize" },
1077 { 40, "fastReTransCount" },
1078 { 41, "driverRetransmissions" },
1079 { 42, "txAckTimeoutMs" },
1080 { 43, "registrationInterrupts" },
1081 { 44, "hardwareType" },
1082 { 45, "radioType" },
1083 { 46, "writeEEPROM" },
1084 { 47, "writeRadioType" },
1085 { 48, "entry_exit_debug" },
1086 { 49, "debug" },
1087 { 50, "in_speed" },
1088 { 51, "out_speed" },
1089 { 52, "in_speed10" },
1090 { 53, "out_speed10" },
1091 { 54, "in_speed_max" },
1092 { 55, "out_speed_max" },
1093 { 56, "measure_rate" },
1094 { 57, "pre_Command_Wait" },
1095 { 58, "rx_tweak1" },
1096 { 59, "rx_tweak2" },
1097 { 60, "tx_queue_len" },
1098
1099 { 150, "arlan2-txRing" },
1100 { 151, "arlan2-rxRing" },
1101 { 152, "arlan2-18" },
1102 { 153, "arlan2-ring" },
1103 { 154, "arlan2-shm-cpy" },
1104 { 155, "config2" },
1105 { 156, "reset2" },
1106 {}
1107};
1108
1109static const struct trans_ctl_table trans_arlan_conf_table3[] = {
1110 { 1, "spreadingCode" },
1111 { 2, "channelNumber" },
1112 { 3, "scramblingDisable" },
1113 { 4, "txAttenuation" },
1114 { 5, "systemId" },
1115 { 6, "maxDatagramSize" },
1116 { 7, "maxFrameSize" },
1117 { 8, "maxRetries" },
1118 { 9, "receiveMode" },
1119 { 10, "priority" },
1120 { 11, "rootOrRepeater" },
1121 { 12, "SID" },
1122 { 13, "registrationMode" },
1123 { 14, "registrationFill" },
1124 { 15, "localTalkAddress" },
1125 { 16, "codeFormat" },
1126 { 17, "numChannels" },
1127 { 18, "channel1" },
1128 { 19, "channel2" },
1129 { 20, "channel3" },
1130 { 21, "channel4" },
1131 { 22, "txClear" },
1132 { 23, "txRetries" },
1133 { 24, "txRouting" },
1134 { 25, "txScrambled" },
1135 { 26, "rxParameter" },
1136 { 27, "txTimeoutMs" },
1137 { 28, "waitCardTimeout" },
1138 { 29, "channelSet" },
1139 { 30, "name" },
1140 { 31, "waitTime" },
1141 { 32, "lParameter" },
1142 { 33, "_15" },
1143 { 34, "headerSize" },
1144 { 36, "tx_delay_ms" },
1145 { 37, "retries" },
1146 { 38, "ReTransmitPacketMaxSize" },
1147 { 39, "waitReTransmitPacketMaxSize" },
1148 { 40, "fastReTransCount" },
1149 { 41, "driverRetransmissions" },
1150 { 42, "txAckTimeoutMs" },
1151 { 43, "registrationInterrupts" },
1152 { 44, "hardwareType" },
1153 { 45, "radioType" },
1154 { 46, "writeEEPROM" },
1155 { 47, "writeRadioType" },
1156 { 48, "entry_exit_debug" },
1157 { 49, "debug" },
1158 { 50, "in_speed" },
1159 { 51, "out_speed" },
1160 { 52, "in_speed10" },
1161 { 53, "out_speed10" },
1162 { 54, "in_speed_max" },
1163 { 55, "out_speed_max" },
1164 { 56, "measure_rate" },
1165 { 57, "pre_Command_Wait" },
1166 { 58, "rx_tweak1" },
1167 { 59, "rx_tweak2" },
1168 { 60, "tx_queue_len" },
1169
1170 { 150, "arlan3-txRing" },
1171 { 151, "arlan3-rxRing" },
1172 { 152, "arlan3-18" },
1173 { 153, "arlan3-ring" },
1174 { 154, "arlan3-shm-cpy" },
1175 { 155, "config3" },
1176 { 156, "reset3" },
1177 {}
1178};
1179
1180static const struct trans_ctl_table trans_arlan_table[] = {
1181 { 1, "arlan0", trans_arlan_conf_table0 },
1182 { 2, "arlan1", trans_arlan_conf_table1 },
1183 { 3, "arlan2", trans_arlan_conf_table2 },
1184 { 4, "arlan3", trans_arlan_conf_table3 },
1185 {}
1186};
1187
1188static const struct trans_ctl_table trans_s390dbf_table[] = {
1189 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1190 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
1191 {}
1192};
1193
1194static const struct trans_ctl_table trans_sunrpc_table[] = {
1195 { CTL_RPCDEBUG, "rpc_debug" },
1196 { CTL_NFSDEBUG, "nfs_debug" },
1197 { CTL_NFSDDEBUG, "nfsd_debug" },
1198 { CTL_NLMDEBUG, "nlm_debug" },
1199 { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
1200 { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
1201 { CTL_MIN_RESVPORT, "min_resvport" },
1202 { CTL_MAX_RESVPORT, "max_resvport" },
1203 {}
1204};
1205
1206static const struct trans_ctl_table trans_pm_table[] = {
1207 { 1 /* CTL_PM_SUSPEND */, "suspend" },
1208 { 2 /* CTL_PM_CMODE */, "cmode" },
1209 { 3 /* CTL_PM_P0 */, "p0" },
1210 { 4 /* CTL_PM_CM */, "cm" },
1211 {}
1212};
1213
1214static const struct trans_ctl_table trans_frv_table[] = {
1215 { 1, "cache-mode" },
1216 { 2, "pin-cxnr" },
1217 {}
1218};
1219
1220static const struct trans_ctl_table trans_root_table[] = {
1221 { CTL_KERN, "kernel", trans_kern_table },
1222 { CTL_VM, "vm", trans_vm_table },
1223 { CTL_NET, "net", trans_net_table },
1224 /* CTL_PROC not used */
1225 { CTL_FS, "fs", trans_fs_table },
1226 { CTL_DEBUG, "debug", trans_debug_table },
1227 { CTL_DEV, "dev", trans_dev_table },
1228 { CTL_BUS, "bus", trans_bus_table },
1229 { CTL_ABI, "abi" },
1230 /* CTL_CPU not used */
1231 { CTL_ARLAN, "arlan", trans_arlan_table },
1232 { CTL_S390DBF, "s390dbf", trans_s390dbf_table },
1233 { CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
1234 { CTL_PM, "pm", trans_pm_table },
1235 { CTL_FRV, "frv", trans_frv_table },
1236 {}
1237};
1238
1239
1240
1241 8
1242static int sysctl_depth(struct ctl_table *table) 9static int sysctl_depth(struct ctl_table *table)
1243{ 10{
@@ -1261,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
1261 return table; 28 return table;
1262} 29}
1263 30
1264static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
1265{
1266 struct ctl_table *test;
1267 const struct trans_ctl_table *ref;
1268 int cur_depth;
1269
1270 cur_depth = sysctl_depth(table);
1271
1272 ref = trans_root_table;
1273repeat:
1274 test = sysctl_parent(table, cur_depth);
1275 for (; ref->ctl_name || ref->procname || ref->child; ref++) {
1276 int match = 0;
1277
1278 if (cur_depth && !ref->child)
1279 continue;
1280
1281 if (test->procname && ref->procname &&
1282 (strcmp(test->procname, ref->procname) == 0))
1283 match++;
1284
1285 if (test->ctl_name && ref->ctl_name &&
1286 (test->ctl_name == ref->ctl_name))
1287 match++;
1288
1289 if (!ref->ctl_name && !ref->procname)
1290 match++;
1291
1292 if (match) {
1293 if (cur_depth != 0) {
1294 cur_depth--;
1295 ref = ref->child;
1296 goto repeat;
1297 }
1298 goto out;
1299 }
1300 }
1301 ref = NULL;
1302out:
1303 return ref;
1304}
1305 31
1306static void sysctl_print_path(struct ctl_table *table) 32static void sysctl_print_path(struct ctl_table *table)
1307{ 33{
@@ -1315,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table)
1315 } 41 }
1316 } 42 }
1317 printk(" "); 43 printk(" ");
1318 if (table->ctl_name) {
1319 for (i = depth; i >= 0; i--) {
1320 tmp = sysctl_parent(table, i);
1321 printk(".%d", tmp->ctl_name);
1322 }
1323 }
1324}
1325
1326static void sysctl_repair_table(struct ctl_table *table)
1327{
1328 /* Don't complain about the classic default
1329 * sysctl strategy routine. Maybe later we
1330 * can get the tables fixed and complain about
1331 * this.
1332 */
1333 if (table->ctl_name && table->procname &&
1334 (table->proc_handler == proc_dointvec) &&
1335 (!table->strategy)) {
1336 table->strategy = sysctl_data;
1337 }
1338} 44}
1339 45
1340static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, 46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
@@ -1352,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
1352 ref = head->ctl_table; 58 ref = head->ctl_table;
1353repeat: 59repeat:
1354 test = sysctl_parent(table, cur_depth); 60 test = sysctl_parent(table, cur_depth);
1355 for (; ref->ctl_name || ref->procname; ref++) { 61 for (; ref->procname; ref++) {
1356 int match = 0; 62 int match = 0;
1357 if (cur_depth && !ref->child) 63 if (cur_depth && !ref->child)
1358 continue; 64 continue;
@@ -1361,10 +67,6 @@ repeat:
1361 (strcmp(test->procname, ref->procname) == 0)) 67 (strcmp(test->procname, ref->procname) == 0))
1362 match++; 68 match++;
1363 69
1364 if (test->ctl_name && ref->ctl_name &&
1365 (test->ctl_name == ref->ctl_name))
1366 match++;
1367
1368 if (match) { 70 if (match) {
1369 if (cur_depth != 0) { 71 if (cur_depth != 0) {
1370 cur_depth--; 72 cur_depth--;
@@ -1392,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
1392 *fail = str; 94 *fail = str;
1393} 95}
1394 96
1395static int sysctl_check_dir(struct nsproxy *namespaces,
1396 struct ctl_table *table)
1397{
1398 struct ctl_table *ref;
1399 int error;
1400
1401 error = 0;
1402 ref = sysctl_check_lookup(namespaces, table);
1403 if (ref) {
1404 int match = 0;
1405 if ((!table->procname && !ref->procname) ||
1406 (table->procname && ref->procname &&
1407 (strcmp(table->procname, ref->procname) == 0)))
1408 match++;
1409
1410 if ((!table->ctl_name && !ref->ctl_name) ||
1411 (table->ctl_name && ref->ctl_name &&
1412 (table->ctl_name == ref->ctl_name)))
1413 match++;
1414
1415 if (match != 2) {
1416 printk(KERN_ERR "%s: failed: ", __func__);
1417 sysctl_print_path(table);
1418 printk(" ref: ");
1419 sysctl_print_path(ref);
1420 printk("\n");
1421 error = -EINVAL;
1422 }
1423 }
1424 return error;
1425}
1426
1427static void sysctl_check_leaf(struct nsproxy *namespaces, 97static void sysctl_check_leaf(struct nsproxy *namespaces,
1428 struct ctl_table *table, const char **fail) 98 struct ctl_table *table, const char **fail)
1429{ 99{
@@ -1434,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
1434 set_fail(fail, table, "Sysctl already exists"); 104 set_fail(fail, table, "Sysctl already exists");
1435} 105}
1436 106
1437static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1438{
1439 const struct trans_ctl_table *ref;
1440
1441 ref = sysctl_binary_lookup(table);
1442 if (table->ctl_name && !ref)
1443 set_fail(fail, table, "Unknown sysctl binary path");
1444 if (ref) {
1445 if (ref->procname &&
1446 (!table->procname ||
1447 (strcmp(table->procname, ref->procname) != 0)))
1448 set_fail(fail, table, "procname does not match binary path procname");
1449
1450 if (ref->ctl_name && table->ctl_name &&
1451 (table->ctl_name != ref->ctl_name))
1452 set_fail(fail, table, "ctl_name does not match binary path ctl_name");
1453 }
1454}
1455
1456int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) 107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1457{ 108{
1458 int error = 0; 109 int error = 0;
1459 for (; table->ctl_name || table->procname; table++) { 110 for (; table->procname; table++) {
1460 const char *fail = NULL; 111 const char *fail = NULL;
1461 112
1462 sysctl_repair_table(table);
1463 if (table->parent) { 113 if (table->parent) {
1464 if (table->procname && !table->parent->procname) 114 if (table->procname && !table->parent->procname)
1465 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
1466 if (table->ctl_name && !table->parent->ctl_name)
1467 set_fail(&fail, table, "Parent without ctl_name");
1468 } 116 }
1469 if (!table->procname) 117 if (!table->procname)
1470 set_fail(&fail, table, "No procname"); 118 set_fail(&fail, table, "No procname");
@@ -1477,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1477 set_fail(&fail, table, "Writable sysctl directory"); 125 set_fail(&fail, table, "Writable sysctl directory");
1478 if (table->proc_handler) 126 if (table->proc_handler)
1479 set_fail(&fail, table, "Directory with proc_handler"); 127 set_fail(&fail, table, "Directory with proc_handler");
1480 if (table->strategy)
1481 set_fail(&fail, table, "Directory with strategy");
1482 if (table->extra1) 128 if (table->extra1)
1483 set_fail(&fail, table, "Directory with extra1"); 129 set_fail(&fail, table, "Directory with extra1");
1484 if (table->extra2) 130 if (table->extra2)
1485 set_fail(&fail, table, "Directory with extra2"); 131 set_fail(&fail, table, "Directory with extra2");
1486 if (sysctl_check_dir(namespaces, table))
1487 set_fail(&fail, table, "Inconsistent directory names");
1488 } else { 132 } else {
1489 if ((table->strategy == sysctl_data) || 133 if ((table->proc_handler == proc_dostring) ||
1490 (table->strategy == sysctl_string) ||
1491 (table->strategy == sysctl_intvec) ||
1492 (table->strategy == sysctl_jiffies) ||
1493 (table->strategy == sysctl_ms_jiffies) ||
1494 (table->proc_handler == proc_dostring) ||
1495 (table->proc_handler == proc_dointvec) || 134 (table->proc_handler == proc_dointvec) ||
1496 (table->proc_handler == proc_dointvec_minmax) || 135 (table->proc_handler == proc_dointvec_minmax) ||
1497 (table->proc_handler == proc_dointvec_jiffies) || 136 (table->proc_handler == proc_dointvec_jiffies) ||
@@ -1513,14 +152,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1513 set_fail(&fail, table, "No max"); 152 set_fail(&fail, table, "No max");
1514 } 153 }
1515 } 154 }
1516#ifdef CONFIG_SYSCTL_SYSCALL
1517 if (table->ctl_name && !table->strategy)
1518 set_fail(&fail, table, "Missing strategy");
1519#endif
1520#if 0
1521 if (!table->ctl_name && table->strategy)
1522 set_fail(&fail, table, "Strategy without ctl_name");
1523#endif
1524#ifdef CONFIG_PROC_SYSCTL 155#ifdef CONFIG_PROC_SYSCTL
1525 if (table->procname && !table->proc_handler) 156 if (table->procname && !table->proc_handler)
1526 set_fail(&fail, table, "No proc_handler"); 157 set_fail(&fail, table, "No proc_handler");
@@ -1531,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1531#endif 162#endif
1532 sysctl_check_leaf(namespaces, table, &fail); 163 sysctl_check_leaf(namespaces, table, &fail);
1533 } 164 }
1534 sysctl_check_bin_path(table, &fail);
1535 if (table->mode > 0777) 165 if (table->mode > 0777)
1536 set_fail(&fail, table, "bogus .mode"); 166 set_fail(&fail, table, "bogus .mode");
1537 if (fail) { 167 if (fail) {
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fec..c6324d96009e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
136 write_seqlock_irq(&xtime_lock); 136 write_seqlock_irq(&xtime_lock);
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 138 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
139 update_xtime_cache(0);
140 write_sequnlock_irq(&xtime_lock); 139 write_sequnlock_irq(&xtime_lock);
141 clock_was_set(); 140 clock_was_set();
142} 141}
@@ -662,6 +661,36 @@ u64 nsec_to_clock_t(u64 x)
662#endif 661#endif
663} 662}
664 663
664/**
665 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
666 *
667 * @n: nsecs in u64
668 *
669 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
670 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
671 * for scheduler, not for use in device drivers to calculate timeout value.
672 *
673 * note:
674 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
675 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
676 */
677unsigned long nsecs_to_jiffies(u64 n)
678{
679#if (NSEC_PER_SEC % HZ) == 0
680 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
681 return div_u64(n, NSEC_PER_SEC / HZ);
682#elif (HZ % 512) == 0
683 /* overflow after 292 years if HZ = 1024 */
684 return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
685#else
686 /*
687 * Generic case - optimized for cases where HZ is a multiple of 3.
688 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
689 */
690 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
691#endif
692}
693
665#if (BITS_PER_LONG < 64) 694#if (BITS_PER_LONG < 64)
666u64 get_jiffies_64(void) 695u64 get_jiffies_64(void)
667{ 696{
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc32..20a8920029ee 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h> 21#include <linux/tick.h>
22 22
23#include "tick-internal.h"
24
23/* The registered clock event devices */ 25/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 26static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 27static LIST_HEAD(clockevents_released);
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
37 * 39 *
38 * Math helper, returns latch value converted to nanoseconds (bound checked) 40 * Math helper, returns latch value converted to nanoseconds (bound checked)
39 */ 41 */
40unsigned long clockevent_delta2ns(unsigned long latch, 42u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
41 struct clock_event_device *evt)
42{ 43{
43 u64 clc = ((u64) latch << evt->shift); 44 u64 clc = (u64) latch << evt->shift;
44 45
45 if (unlikely(!evt->mult)) { 46 if (unlikely(!evt->mult)) {
46 evt->mult = 1; 47 evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
50 do_div(clc, evt->mult); 51 do_div(clc, evt->mult);
51 if (clc < 1000) 52 if (clc < 1000)
52 clc = 1000; 53 clc = 1000;
53 if (clc > LONG_MAX) 54 if (clc > KTIME_MAX)
54 clc = LONG_MAX; 55 clc = KTIME_MAX;
55 56
56 return (unsigned long) clc; 57 return clc;
57} 58}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns); 59EXPORT_SYMBOL_GPL(clockevent_delta2ns);
59 60
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5e18c6ab2c6a..e85c23404d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -39,7 +39,7 @@ void timecounter_init(struct timecounter *tc,
39 tc->cycle_last = cc->read(cc); 39 tc->cycle_last = cc->read(cc);
40 tc->nsec = start_tstamp; 40 tc->nsec = start_tstamp;
41} 41}
42EXPORT_SYMBOL(timecounter_init); 42EXPORT_SYMBOL_GPL(timecounter_init);
43 43
44/** 44/**
45 * timecounter_read_delta - get nanoseconds since last call of this function 45 * timecounter_read_delta - get nanoseconds since last call of this function
@@ -83,7 +83,7 @@ u64 timecounter_read(struct timecounter *tc)
83 83
84 return nsec; 84 return nsec;
85} 85}
86EXPORT_SYMBOL(timecounter_read); 86EXPORT_SYMBOL_GPL(timecounter_read);
87 87
88u64 timecounter_cyc2time(struct timecounter *tc, 88u64 timecounter_cyc2time(struct timecounter *tc,
89 cycle_t cycle_tstamp) 89 cycle_t cycle_tstamp)
@@ -105,7 +105,60 @@ u64 timecounter_cyc2time(struct timecounter *tc,
105 105
106 return nsec; 106 return nsec;
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL_GPL(timecounter_cyc2time);
109
110/**
111 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
112 * @mult: pointer to mult variable
113 * @shift: pointer to shift variable
114 * @from: frequency to convert from
115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds
117 *
118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents.
120 *
121 * @to and @from are frequency values in HZ. For clock sources @to is
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 *
125 * The @minsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is
129 * multiplied with the calculated mult factor. Larger ranges may
130 * reduce the conversion accuracy by chosing smaller mult and shift
131 * factors.
132 */
133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
135{
136 u64 tmp;
137 u32 sft, sftacc= 32;
138
139 /*
140 * Calculate the shift factor which is limiting the conversion
141 * range:
142 */
143 tmp = ((u64)minsec * from) >> 32;
144 while (tmp) {
145 tmp >>=1;
146 sftacc--;
147 }
148
149 /*
150 * Find the conversion shift/mult pair which has the best
151 * accuracy and fits the maxsec conversion range:
152 */
153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft;
155 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0)
157 break;
158 }
159 *mult = tmp;
160 *shift = sft;
161}
109 162
110/*[Clocksource internal variables]--------- 163/*[Clocksource internal variables]---------
111 * curr_clocksource: 164 * curr_clocksource:
@@ -413,6 +466,47 @@ void clocksource_touch_watchdog(void)
413 clocksource_resume_watchdog(); 466 clocksource_resume_watchdog();
414} 467}
415 468
469/**
470 * clocksource_max_deferment - Returns max time the clocksource can be deferred
471 * @cs: Pointer to clocksource
472 *
473 */
474static u64 clocksource_max_deferment(struct clocksource *cs)
475{
476 u64 max_nsecs, max_cycles;
477
478 /*
479 * Calculate the maximum number of cycles that we can pass to the
480 * cyc2ns function without overflowing a 64-bit signed result. The
481 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
482 * is equivalent to the below.
483 * max_cycles < (2^63)/cs->mult
484 * max_cycles < 2^(log2((2^63)/cs->mult))
485 * max_cycles < 2^(log2(2^63) - log2(cs->mult))
486 * max_cycles < 2^(63 - log2(cs->mult))
487 * max_cycles < 1 << (63 - log2(cs->mult))
488 * Please note that we add 1 to the result of the log2 to account for
489 * any rounding errors, ensure the above inequality is satisfied and
490 * no overflow will occur.
491 */
492 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
493
494 /*
495 * The actual maximum number of cycles we can defer the clocksource is
496 * determined by the minimum of max_cycles and cs->mask.
497 */
498 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
499 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
500
501 /*
502 * To ensure that the clocksource does not wrap whilst we are idle,
503 * limit the time the clocksource can be deferred by 12.5%. Please
504 * note a margin of 12.5% is used because this can be computed with
505 * a shift, versus say 10% which would require division.
506 */
507 return max_nsecs - (max_nsecs >> 5);
508}
509
416#ifdef CONFIG_GENERIC_TIME 510#ifdef CONFIG_GENERIC_TIME
417 511
418/** 512/**
@@ -511,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs)
511 */ 605 */
512int clocksource_register(struct clocksource *cs) 606int clocksource_register(struct clocksource *cs)
513{ 607{
608 /* calculate max idle time permitted for this clocksource */
609 cs->max_idle_ns = clocksource_max_deferment(cs);
610
514 mutex_lock(&clocksource_mutex); 611 mutex_lock(&clocksource_mutex);
515 clocksource_enqueue(cs); 612 clocksource_enqueue(cs);
516 clocksource_select(); 613 clocksource_select();
@@ -580,7 +677,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
580 * @count: length of buffer 677 * @count: length of buffer
581 * 678 *
582 * Takes input from sysfs interface for manually overriding the default 679 * Takes input from sysfs interface for manually overriding the default
583 * clocksource selction. 680 * clocksource selection.
584 */ 681 */
585static ssize_t sysfs_override_clocksource(struct sys_device *dev, 682static ssize_t sysfs_override_clocksource(struct sys_device *dev,
586 struct sysdev_attribute *attr, 683 struct sysdev_attribute *attr,
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89cf..0a8a213016f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51 51
52 printk(KERN_WARNING 52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n", 53 "CE: %s increasing min_delta_ns to %llu nsec\n",
54 dev->name ? dev->name : "?", 54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1); 55 (unsigned long long) dev->min_delta_ns << 1);
56 56
57 i = 0; 57 i = 0;
58 } 58 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 89aed5933ed4..f992762d7f51 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137static void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(ktime_t now)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
141 unsigned long flags; 141 unsigned long flags;
142 ktime_t now;
143
144 if (!ts->tick_stopped)
145 return;
146 142
147 cpumask_clear_cpu(cpu, nohz_cpu_mask); 143 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get();
149 ts->idle_waketime = now; 144 ts->idle_waketime = now;
150 145
151 local_irq_save(flags); 146 local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
155 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
156} 151}
157 152
158static void tick_nohz_stop_idle(int cpu) 153static void tick_nohz_stop_idle(int cpu, ktime_t now)
159{ 154{
160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
161 157
162 if (ts->idle_active) { 158 delta = ktime_sub(now, ts->idle_entrytime);
163 ktime_t now, delta; 159 ts->idle_lastupdate = now;
164 now = ktime_get(); 160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 delta = ktime_sub(now, ts->idle_entrytime); 161 ts->idle_active = 0;
166 ts->idle_lastupdate = now;
167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
168 ts->idle_active = 0;
169 162
170 sched_clock_idle_wakeup_event(0); 163 sched_clock_idle_wakeup_event(0);
171 }
172} 164}
173 165
174static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 166static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
216 struct tick_sched *ts; 208 struct tick_sched *ts;
217 ktime_t last_update, expires, now; 209 ktime_t last_update, expires, now;
218 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 210 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
211 u64 time_delta;
219 int cpu; 212 int cpu;
220 213
221 local_irq_save(flags); 214 local_irq_save(flags);
@@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
263 256
264 if (ratelimit < 10) { 257 if (ratelimit < 10) {
265 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 258 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
266 local_softirq_pending()); 259 (unsigned int) local_softirq_pending());
267 ratelimit++; 260 ratelimit++;
268 } 261 }
269 goto end; 262 goto end;
@@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
275 seq = read_seqbegin(&xtime_lock); 268 seq = read_seqbegin(&xtime_lock);
276 last_update = last_jiffies_update; 269 last_update = last_jiffies_update;
277 last_jiffies = jiffies; 270 last_jiffies = jiffies;
271 time_delta = timekeeping_max_deferment();
278 } while (read_seqretry(&xtime_lock, seq)); 272 } while (read_seqretry(&xtime_lock, seq));
279 273
280 /* Get the next timer wheel timer */ 274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
281 next_jiffies = get_next_timer_interrupt(last_jiffies); 275 arch_needs_cpu(cpu)) {
282 delta_jiffies = next_jiffies - last_jiffies; 276 next_jiffies = last_jiffies + 1;
283
284 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
285 delta_jiffies = 1; 277 delta_jiffies = 1;
278 } else {
279 /* Get the next timer wheel timer */
280 next_jiffies = get_next_timer_interrupt(last_jiffies);
281 delta_jiffies = next_jiffies - last_jiffies;
282 }
286 /* 283 /*
287 * Do not stop the tick, if we are only one off 284 * Do not stop the tick, if we are only one off
288 * or if the cpu is required for rcu 285 * or if the cpu is required for rcu
@@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
294 if ((long)delta_jiffies >= 1) { 291 if ((long)delta_jiffies >= 1) {
295 292
296 /* 293 /*
297 * calculate the expiry time for the next timer wheel
298 * timer
299 */
300 expires = ktime_add_ns(last_update, tick_period.tv64 *
301 delta_jiffies);
302
303 /*
304 * If this cpu is the one which updates jiffies, then 294 * If this cpu is the one which updates jiffies, then
305 * give up the assignment and let it be taken by the 295 * give up the assignment and let it be taken by the
306 * cpu which runs the tick timer next, which might be 296 * cpu which runs the tick timer next, which might be
307 * this cpu as well. If we don't drop this here the 297 * this cpu as well. If we don't drop this here the
308 * jiffies might be stale and do_timer() never 298 * jiffies might be stale and do_timer() never
309 * invoked. 299 * invoked. Keep track of the fact that it was the one
300 * which had the do_timer() duty last. If this cpu is
301 * the one which had the do_timer() duty last, we
302 * limit the sleep time to the timekeeping
303 * max_deferement value which we retrieved
304 * above. Otherwise we can sleep as long as we want.
310 */ 305 */
311 if (cpu == tick_do_timer_cpu) 306 if (cpu == tick_do_timer_cpu) {
312 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 307 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
308 ts->do_timer_last = 1;
309 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
310 time_delta = KTIME_MAX;
311 ts->do_timer_last = 0;
312 } else if (!ts->do_timer_last) {
313 time_delta = KTIME_MAX;
314 }
315
316 /*
317 * calculate the expiry time for the next timer wheel
318 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
319 * that there is no timer pending or at least extremely
320 * far into the future (12 days for HZ=1000). In this
321 * case we set the expiry to the end of time.
322 */
323 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
324 /*
325 * Calculate the time delta for the next timer event.
326 * If the time delta exceeds the maximum time delta
327 * permitted by the current clocksource then adjust
328 * the time delta accordingly to ensure the
329 * clocksource does not wrap.
330 */
331 time_delta = min_t(u64, time_delta,
332 tick_period.tv64 * delta_jiffies);
333 }
334
335 if (time_delta < KTIME_MAX)
336 expires = ktime_add_ns(last_update, time_delta);
337 else
338 expires.tv64 = KTIME_MAX;
313 339
314 if (delta_jiffies > 1) 340 if (delta_jiffies > 1)
315 cpumask_set_cpu(cpu, nohz_cpu_mask); 341 cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
342 368
343 ts->idle_sleeps++; 369 ts->idle_sleeps++;
344 370
371 /* Mark expires */
372 ts->idle_expires = expires;
373
345 /* 374 /*
346 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 375 * If the expiration time == KTIME_MAX, then
347 * there is no timer pending or at least extremly far 376 * in this case we simply stop the tick timer.
348 * into the future (12 days for HZ=1000). In this case
349 * we simply stop the tick timer:
350 */ 377 */
351 if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { 378 if (unlikely(expires.tv64 == KTIME_MAX)) {
352 ts->idle_expires.tv64 = KTIME_MAX;
353 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 379 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
354 hrtimer_cancel(&ts->sched_timer); 380 hrtimer_cancel(&ts->sched_timer);
355 goto out; 381 goto out;
356 } 382 }
357 383
358 /* Mark expiries */
359 ts->idle_expires = expires;
360
361 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 384 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
362 hrtimer_start(&ts->sched_timer, expires, 385 hrtimer_start(&ts->sched_timer, expires,
363 HRTIMER_MODE_ABS_PINNED); 386 HRTIMER_MODE_ABS_PINNED);
@@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
436 ktime_t now; 459 ktime_t now;
437 460
438 local_irq_disable(); 461 local_irq_disable();
439 tick_nohz_stop_idle(cpu); 462 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
463 now = ktime_get();
464
465 if (ts->idle_active)
466 tick_nohz_stop_idle(cpu, now);
440 467
441 if (!ts->inidle || !ts->tick_stopped) { 468 if (!ts->inidle || !ts->tick_stopped) {
442 ts->inidle = 0; 469 ts->inidle = 0;
@@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
450 477
451 /* Update jiffies first */ 478 /* Update jiffies first */
452 select_nohz_load_balancer(0); 479 select_nohz_load_balancer(0);
453 now = ktime_get();
454 tick_do_update_jiffies64(now); 480 tick_do_update_jiffies64(now);
455 cpumask_clear_cpu(cpu, nohz_cpu_mask); 481 cpumask_clear_cpu(cpu, nohz_cpu_mask);
456 482
@@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
584 * timer and do not touch the other magic bits which need to be done 610 * timer and do not touch the other magic bits which need to be done
585 * when idle is left. 611 * when idle is left.
586 */ 612 */
587static void tick_nohz_kick_tick(int cpu) 613static void tick_nohz_kick_tick(int cpu, ktime_t now)
588{ 614{
589#if 0 615#if 0
590 /* Switch back to 2.6.27 behaviour */ 616 /* Switch back to 2.6.27 behaviour */
591 617
592 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 618 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
593 ktime_t delta, now; 619 ktime_t delta;
594
595 if (!ts->tick_stopped)
596 return;
597 620
598 /* 621 /*
599 * Do not touch the tick device, when the next expiry is either 622 * Do not touch the tick device, when the next expiry is either
600 * already reached or less/equal than the tick period. 623 * already reached or less/equal than the tick period.
601 */ 624 */
602 now = ktime_get();
603 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 625 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
604 if (delta.tv64 <= tick_period.tv64) 626 if (delta.tv64 <= tick_period.tv64)
605 return; 627 return;
@@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
608#endif 630#endif
609} 631}
610 632
633static inline void tick_check_nohz(int cpu)
634{
635 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
636 ktime_t now;
637
638 if (!ts->idle_active && !ts->tick_stopped)
639 return;
640 now = ktime_get();
641 if (ts->idle_active)
642 tick_nohz_stop_idle(cpu, now);
643 if (ts->tick_stopped) {
644 tick_nohz_update_jiffies(now);
645 tick_nohz_kick_tick(cpu, now);
646 }
647}
648
611#else 649#else
612 650
613static inline void tick_nohz_switch_to_nohz(void) { } 651static inline void tick_nohz_switch_to_nohz(void) { }
652static inline void tick_check_nohz(int cpu) { }
614 653
615#endif /* NO_HZ */ 654#endif /* NO_HZ */
616 655
@@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
620void tick_check_idle(int cpu) 659void tick_check_idle(int cpu)
621{ 660{
622 tick_check_oneshot_broadcast(cpu); 661 tick_check_oneshot_broadcast(cpu);
623#ifdef CONFIG_NO_HZ 662 tick_check_nohz(cpu);
624 tick_nohz_stop_idle(cpu);
625 tick_nohz_update_jiffies();
626 tick_nohz_kick_tick(cpu);
627#endif
628} 663}
629 664
630/* 665/*
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 71e7f1a19156..96ff643a5a59 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -40,7 +40,7 @@ ktime_t timecompare_transform(struct timecompare *sync,
40 40
41 return ns_to_ktime(nsec); 41 return ns_to_ktime(nsec);
42} 42}
43EXPORT_SYMBOL(timecompare_transform); 43EXPORT_SYMBOL_GPL(timecompare_transform);
44 44
45int timecompare_offset(struct timecompare *sync, 45int timecompare_offset(struct timecompare *sync,
46 s64 *offset, 46 s64 *offset,
@@ -131,7 +131,7 @@ int timecompare_offset(struct timecompare *sync,
131 131
132 return used; 132 return used;
133} 133}
134EXPORT_SYMBOL(timecompare_offset); 134EXPORT_SYMBOL_GPL(timecompare_offset);
135 135
136void __timecompare_update(struct timecompare *sync, 136void __timecompare_update(struct timecompare *sync,
137 u64 source_tstamp) 137 u64 source_tstamp)
@@ -188,4 +188,4 @@ void __timecompare_update(struct timecompare *sync,
188 } 188 }
189 } 189 }
190} 190}
191EXPORT_SYMBOL(__timecompare_update); 191EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907eaa..af4135f05825 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,19 +165,12 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
178 xtime.tv_sec += leapsecond; 171 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond; 172 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock); 173 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
181} 174}
182 175
183#ifdef CONFIG_GENERIC_TIME 176#ifdef CONFIG_GENERIC_TIME
@@ -332,12 +325,10 @@ int do_settimeofday(struct timespec *tv)
332 325
333 xtime = *tv; 326 xtime = *tv;
334 327
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
338 ntp_clear(); 329 ntp_clear();
339 330
340 update_vsyscall(&xtime, timekeeper.clock); 331 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
341 332
342 write_sequnlock_irqrestore(&xtime_lock, flags); 333 write_sequnlock_irqrestore(&xtime_lock, flags);
343 334
@@ -488,6 +479,17 @@ int timekeeping_valid_for_hres(void)
488} 479}
489 480
490/** 481/**
482 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
483 *
484 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
485 * ensure that the clocksource does not change!
486 */
487u64 timekeeping_max_deferment(void)
488{
489 return timekeeper.clock->max_idle_ns;
490}
491
492/**
491 * read_persistent_clock - Return time from the persistent clock. 493 * read_persistent_clock - Return time from the persistent clock.
492 * 494 *
493 * Weak dummy function for arches that do not yet support it. 495 * Weak dummy function for arches that do not yet support it.
@@ -548,7 +550,6 @@ void __init timekeeping_init(void)
548 } 550 }
549 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
550 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
551 update_xtime_cache(0);
552 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
553 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
554 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -582,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
582 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
583 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
584 } 585 }
585 update_xtime_cache(0);
586 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
588 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -723,6 +723,49 @@ static void timekeeping_adjust(s64 offset)
723} 723}
724 724
725/** 725/**
726 * logarithmic_accumulation - shifted accumulation of cycles
727 *
728 * This functions accumulates a shifted interval of cycles into
729 * into a shifted interval nanoseconds. Allows for O(log) accumulation
730 * loop.
731 *
732 * Returns the unconsumed cycles.
733 */
734static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
735{
736 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
737
738 /* If the offset is smaller then a shifted interval, do nothing */
739 if (offset < timekeeper.cycle_interval<<shift)
740 return offset;
741
742 /* Accumulate one shifted interval */
743 offset -= timekeeper.cycle_interval << shift;
744 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
745
746 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
747 while (timekeeper.xtime_nsec >= nsecps) {
748 timekeeper.xtime_nsec -= nsecps;
749 xtime.tv_sec++;
750 second_overflow();
751 }
752
753 /* Accumulate into raw time */
754 raw_time.tv_nsec += timekeeper.raw_interval << shift;;
755 while (raw_time.tv_nsec >= NSEC_PER_SEC) {
756 raw_time.tv_nsec -= NSEC_PER_SEC;
757 raw_time.tv_sec++;
758 }
759
760 /* Accumulate error between NTP and clock interval */
761 timekeeper.ntp_error += tick_length << shift;
762 timekeeper.ntp_error -= timekeeper.xtime_interval <<
763 (timekeeper.ntp_error_shift + shift);
764
765 return offset;
766}
767
768/**
726 * update_wall_time - Uses the current clocksource to increment the wall time 769 * update_wall_time - Uses the current clocksource to increment the wall time
727 * 770 *
728 * Called from the timer interrupt, must hold a write on xtime_lock. 771 * Called from the timer interrupt, must hold a write on xtime_lock.
@@ -731,7 +774,7 @@ void update_wall_time(void)
731{ 774{
732 struct clocksource *clock; 775 struct clocksource *clock;
733 cycle_t offset; 776 cycle_t offset;
734 u64 nsecs; 777 int shift = 0, maxshift;
735 778
736 /* Make sure we're fully resumed: */ 779 /* Make sure we're fully resumed: */
737 if (unlikely(timekeeping_suspended)) 780 if (unlikely(timekeeping_suspended))
@@ -745,33 +788,22 @@ void update_wall_time(void)
745#endif 788#endif
746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 789 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
747 790
748 /* normally this loop will run just once, however in the 791 /*
749 * case of lost or late ticks, it will accumulate correctly. 792 * With NO_HZ we may have to accumulate many cycle_intervals
793 * (think "ticks") worth of time at once. To do this efficiently,
794 * we calculate the largest doubling multiple of cycle_intervals
795 * that is smaller then the offset. We then accumulate that
796 * chunk in one go, and then try to consume the next smaller
797 * doubled multiple.
750 */ 798 */
799 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
800 shift = max(0, shift);
801 /* Bound shift to one less then what overflows tick_length */
802 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
803 shift = min(shift, maxshift);
751 while (offset >= timekeeper.cycle_interval) { 804 while (offset >= timekeeper.cycle_interval) {
752 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 805 offset = logarithmic_accumulation(offset, shift);
753 806 shift--;
754 /* accumulate one interval */
755 offset -= timekeeper.cycle_interval;
756 clock->cycle_last += timekeeper.cycle_interval;
757
758 timekeeper.xtime_nsec += timekeeper.xtime_interval;
759 if (timekeeper.xtime_nsec >= nsecps) {
760 timekeeper.xtime_nsec -= nsecps;
761 xtime.tv_sec++;
762 second_overflow();
763 }
764
765 raw_time.tv_nsec += timekeeper.raw_interval;
766 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
767 raw_time.tv_nsec -= NSEC_PER_SEC;
768 raw_time.tv_sec++;
769 }
770
771 /* accumulate error between NTP and clock interval */
772 timekeeper.ntp_error += tick_length;
773 timekeeper.ntp_error -= timekeeper.xtime_interval <<
774 timekeeper.ntp_error_shift;
775 } 807 }
776 808
777 /* correct the clock when NTP error is too big */ 809 /* correct the clock when NTP error is too big */
@@ -807,11 +839,8 @@ void update_wall_time(void)
807 timekeeper.ntp_error += timekeeper.xtime_nsec << 839 timekeeper.ntp_error += timekeeper.xtime_nsec <<
808 timekeeper.ntp_error_shift; 840 timekeeper.ntp_error_shift;
809 841
810 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
811 update_xtime_cache(nsecs);
812
813 /* check to see if there is a new clocksource to use */ 842 /* check to see if there is a new clocksource to use */
814 update_vsyscall(&xtime, timekeeper.clock); 843 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
815} 844}
816 845
817/** 846/**
@@ -846,13 +875,13 @@ void monotonic_to_bootbased(struct timespec *ts)
846 875
847unsigned long get_seconds(void) 876unsigned long get_seconds(void)
848{ 877{
849 return xtime_cache.tv_sec; 878 return xtime.tv_sec;
850} 879}
851EXPORT_SYMBOL(get_seconds); 880EXPORT_SYMBOL(get_seconds);
852 881
853struct timespec __current_kernel_time(void) 882struct timespec __current_kernel_time(void)
854{ 883{
855 return xtime_cache; 884 return xtime;
856} 885}
857 886
858struct timespec current_kernel_time(void) 887struct timespec current_kernel_time(void)
@@ -862,8 +891,7 @@ struct timespec current_kernel_time(void)
862 891
863 do { 892 do {
864 seq = read_seqbegin(&xtime_lock); 893 seq = read_seqbegin(&xtime_lock);
865 894 now = xtime;
866 now = xtime_cache;
867 } while (read_seqretry(&xtime_lock, seq)); 895 } while (read_seqretry(&xtime_lock, seq));
868 896
869 return now; 897 return now;
@@ -877,8 +905,7 @@ struct timespec get_monotonic_coarse(void)
877 905
878 do { 906 do {
879 seq = read_seqbegin(&xtime_lock); 907 seq = read_seqbegin(&xtime_lock);
880 908 now = xtime;
881 now = xtime_cache;
882 mono = wall_to_monotonic; 909 mono = wall_to_monotonic;
883 } while (read_seqretry(&xtime_lock, seq)); 910 } while (read_seqretry(&xtime_lock, seq));
884 911
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdfd..9d80db4747d4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
204 return; 207 return;
205 } 208 }
206 SEQ_printf(m, "%s\n", dev->name); 209 SEQ_printf(m, "%s\n", dev->name);
207 SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); 210 SEQ_printf(m, " max_delta_ns: %llu\n",
208 SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); 211 (unsigned long long) dev->max_delta_ns);
209 SEQ_printf(m, " mult: %lu\n", dev->mult); 212 SEQ_printf(m, " min_delta_ns: %llu\n",
210 SEQ_printf(m, " shift: %d\n", dev->shift); 213 (unsigned long long) dev->min_delta_ns);
214 SEQ_printf(m, " mult: %u\n", dev->mult);
215 SEQ_printf(m, " shift: %u\n", dev->shift);
211 SEQ_printf(m, " mode: %d\n", dev->mode); 216 SEQ_printf(m, " mode: %d\n", dev->mode);
212 SEQ_printf(m, " next_event: %Ld nsecs\n", 217 SEQ_printf(m, " next_event: %Ld nsecs\n",
213 (unsigned long long) ktime_to_ns(dev->next_event)); 218 (unsigned long long) ktime_to_ns(dev->next_event));
@@ -252,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
252 u64 now = ktime_to_ns(ktime_get()); 257 u64 now = ktime_to_ns(ktime_get());
253 int cpu; 258 int cpu;
254 259
255 SEQ_printf(m, "Timer List Version: v0.4\n"); 260 SEQ_printf(m, "Timer List Version: v0.5\n");
256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
258 263
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b416512ad17f..d006554888dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -339,6 +339,27 @@ config POWER_TRACER
339 power management decisions, specifically the C-state and P-state 339 power management decisions, specifically the C-state and P-state
340 behavior. 340 behavior.
341 341
342config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT
345 select TRACING
346 help
347 This tracer helps find read and write operations on any given kernel
348 symbol i.e. /proc/kallsyms.
349
350config PROFILE_KSYM_TRACER
351 bool "Profile all kernel memory accesses on 'watched' variables"
352 depends on KSYM_TRACER
353 help
354 This tracer profiles kernel accesses on variables watched through the
355 ksym tracer ftrace plugin. Depending upon the hardware, all read
356 and write operations on kernel variables can be monitored for
357 accesses.
358
359 The results will be displayed in:
360 /debugfs/tracing/profile_ksym
361
362 Say N if unsure.
342 363
343config STACK_TRACER 364config STACK_TRACER
344 bool "Trace max stack" 365 bool "Trace max stack"
@@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE
428 449
429 If unsure, say N. 450 If unsure, say N.
430 451
452config KPROBE_EVENT
453 depends on KPROBES
454 depends on X86
455 bool "Enable kprobes-based dynamic events"
456 select TRACING
457 default y
458 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt
461 for more details.
462
463 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values.
465
466 This option is also required by perf-probe subcommand of perf tools. If
467 you want to use perf tools, this option is strongly recommended.
468
431config DYNAMIC_FTRACE 469config DYNAMIC_FTRACE
432 bool "enable/disable ftrace tracepoints dynamically" 470 bool "enable/disable ftrace tracepoints dynamically"
433 depends on FUNCTION_TRACER 471 depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 26f03ac07c2b..cd9ecd89ec77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o 58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 59
58libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6dc4e5ef7a01..e51a1bcb7bed 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -60,6 +60,13 @@ static int last_ftrace_enabled;
60/* Quick disabling of function tracer. */ 60/* Quick disabling of function tracer. */
61int function_trace_stop; 61int function_trace_stop;
62 62
63/* List for set_ftrace_pid's pids. */
64LIST_HEAD(ftrace_pids);
65struct ftrace_pid {
66 struct list_head list;
67 struct pid *pid;
68};
69
63/* 70/*
64 * ftrace_disabled is set when an anomaly is discovered. 71 * ftrace_disabled is set when an anomaly is discovered.
65 * ftrace_disabled is much stronger than ftrace_enabled. 72 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
78ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
79ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
80 87
88#ifdef CONFIG_FUNCTION_GRAPH_TRACER
89static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
90#endif
91
81static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 92static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
82{ 93{
83 struct ftrace_ops *op = ftrace_list; 94 struct ftrace_ops *op = ftrace_list;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 else 166 else
156 func = ftrace_list_func; 167 func = ftrace_list_func;
157 168
158 if (ftrace_pid_trace) { 169 if (!list_empty(&ftrace_pids)) {
159 set_ftrace_pid_function(func); 170 set_ftrace_pid_function(func);
160 func = ftrace_pid_func; 171 func = ftrace_pid_func;
161 } 172 }
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
203 if (ftrace_list->next == &ftrace_list_end) { 214 if (ftrace_list->next == &ftrace_list_end) {
204 ftrace_func_t func = ftrace_list->func; 215 ftrace_func_t func = ftrace_list->func;
205 216
206 if (ftrace_pid_trace) { 217 if (!list_empty(&ftrace_pids)) {
207 set_ftrace_pid_function(func); 218 set_ftrace_pid_function(func);
208 func = ftrace_pid_func; 219 func = ftrace_pid_func;
209 } 220 }
@@ -231,7 +242,7 @@ static void ftrace_update_pid_func(void)
231 func = __ftrace_trace_function; 242 func = __ftrace_trace_function;
232#endif 243#endif
233 244
234 if (ftrace_pid_trace) { 245 if (!list_empty(&ftrace_pids)) {
235 set_ftrace_pid_function(func); 246 set_ftrace_pid_function(func);
236 func = ftrace_pid_func; 247 func = ftrace_pid_func;
237 } else { 248 } else {
@@ -821,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
821} 832}
822#endif /* CONFIG_FUNCTION_PROFILER */ 833#endif /* CONFIG_FUNCTION_PROFILER */
823 834
824/* set when tracing only a pid */
825struct pid *ftrace_pid_trace;
826static struct pid * const ftrace_swapper_pid = &init_struct_pid; 835static struct pid * const ftrace_swapper_pid = &init_struct_pid;
827 836
828#ifdef CONFIG_DYNAMIC_FTRACE 837#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1261,12 +1270,34 @@ static int ftrace_update_code(struct module *mod)
1261 ftrace_new_addrs = p->newlist; 1270 ftrace_new_addrs = p->newlist;
1262 p->flags = 0L; 1271 p->flags = 0L;
1263 1272
1264 /* convert record (i.e, patch mcount-call with NOP) */ 1273 /*
1265 if (ftrace_code_disable(mod, p)) { 1274 * Do the initial record convertion from mcount jump
1266 p->flags |= FTRACE_FL_CONVERTED; 1275 * to the NOP instructions.
1267 ftrace_update_cnt++; 1276 */
1268 } else 1277 if (!ftrace_code_disable(mod, p)) {
1269 ftrace_free_rec(p); 1278 ftrace_free_rec(p);
1279 continue;
1280 }
1281
1282 p->flags |= FTRACE_FL_CONVERTED;
1283 ftrace_update_cnt++;
1284
1285 /*
1286 * If the tracing is enabled, go ahead and enable the record.
1287 *
1288 * The reason not to enable the record immediatelly is the
1289 * inherent check of ftrace_make_nop/ftrace_make_call for
1290 * correct previous instructions. Making first the NOP
1291 * conversion puts the module to the correct state, thus
1292 * passing the ftrace_make_call check.
1293 */
1294 if (ftrace_start_up) {
1295 int failed = __ftrace_replace_code(p, 1);
1296 if (failed) {
1297 ftrace_bug(failed, p->ip);
1298 ftrace_free_rec(p);
1299 }
1300 }
1270 } 1301 }
1271 1302
1272 stop = ftrace_now(raw_smp_processor_id()); 1303 stop = ftrace_now(raw_smp_processor_id());
@@ -1656,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1656 return ret; 1687 return ret;
1657} 1688}
1658 1689
1659enum {
1660 MATCH_FULL,
1661 MATCH_FRONT_ONLY,
1662 MATCH_MIDDLE_ONLY,
1663 MATCH_END_ONLY,
1664};
1665
1666/*
1667 * (static function - no need for kernel doc)
1668 *
1669 * Pass in a buffer containing a glob and this function will
1670 * set search to point to the search part of the buffer and
1671 * return the type of search it is (see enum above).
1672 * This does modify buff.
1673 *
1674 * Returns enum type.
1675 * search returns the pointer to use for comparison.
1676 * not returns 1 if buff started with a '!'
1677 * 0 otherwise.
1678 */
1679static int
1680ftrace_setup_glob(char *buff, int len, char **search, int *not)
1681{
1682 int type = MATCH_FULL;
1683 int i;
1684
1685 if (buff[0] == '!') {
1686 *not = 1;
1687 buff++;
1688 len--;
1689 } else
1690 *not = 0;
1691
1692 *search = buff;
1693
1694 for (i = 0; i < len; i++) {
1695 if (buff[i] == '*') {
1696 if (!i) {
1697 *search = buff + 1;
1698 type = MATCH_END_ONLY;
1699 } else {
1700 if (type == MATCH_END_ONLY)
1701 type = MATCH_MIDDLE_ONLY;
1702 else
1703 type = MATCH_FRONT_ONLY;
1704 buff[i] = 0;
1705 break;
1706 }
1707 }
1708 }
1709
1710 return type;
1711}
1712
1713static int ftrace_match(char *str, char *regex, int len, int type) 1690static int ftrace_match(char *str, char *regex, int len, int type)
1714{ 1691{
1715 int matched = 0; 1692 int matched = 0;
@@ -1758,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1758 int not; 1735 int not;
1759 1736
1760 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1737 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1761 type = ftrace_setup_glob(buff, len, &search, &not); 1738 type = filter_parse_regex(buff, len, &search, &not);
1762 1739
1763 search_len = strlen(search); 1740 search_len = strlen(search);
1764 1741
@@ -1826,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1826 } 1803 }
1827 1804
1828 if (strlen(buff)) { 1805 if (strlen(buff)) {
1829 type = ftrace_setup_glob(buff, strlen(buff), &search, &not); 1806 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1830 search_len = strlen(search); 1807 search_len = strlen(search);
1831 } 1808 }
1832 1809
@@ -1991,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1991 int count = 0; 1968 int count = 0;
1992 char *search; 1969 char *search;
1993 1970
1994 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 1971 type = filter_parse_regex(glob, strlen(glob), &search, &not);
1995 len = strlen(search); 1972 len = strlen(search);
1996 1973
1997 /* we do not support '!' for function probes */ 1974 /* we do not support '!' for function probes */
@@ -2068,7 +2045,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2068 else if (glob) { 2045 else if (glob) {
2069 int not; 2046 int not;
2070 2047
2071 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2048 type = filter_parse_regex(glob, strlen(glob), &search, &not);
2072 len = strlen(search); 2049 len = strlen(search);
2073 2050
2074 /* we do not support '!' for function probes */ 2051 /* we do not support '!' for function probes */
@@ -2312,6 +2289,32 @@ static int __init set_ftrace_filter(char *str)
2312} 2289}
2313__setup("ftrace_filter=", set_ftrace_filter); 2290__setup("ftrace_filter=", set_ftrace_filter);
2314 2291
2292#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2293static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2294static int __init set_graph_function(char *str)
2295{
2296 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
2297 return 1;
2298}
2299__setup("ftrace_graph_filter=", set_graph_function);
2300
2301static void __init set_ftrace_early_graph(char *buf)
2302{
2303 int ret;
2304 char *func;
2305
2306 while (buf) {
2307 func = strsep(&buf, ",");
2308 /* we allow only one expression at a time */
2309 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2310 func);
2311 if (ret)
2312 printk(KERN_DEBUG "ftrace: function %s not "
2313 "traceable\n", func);
2314 }
2315}
2316#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2317
2315static void __init set_ftrace_early_filter(char *buf, int enable) 2318static void __init set_ftrace_early_filter(char *buf, int enable)
2316{ 2319{
2317 char *func; 2320 char *func;
@@ -2328,6 +2331,10 @@ static void __init set_ftrace_early_filters(void)
2328 set_ftrace_early_filter(ftrace_filter_buf, 1); 2331 set_ftrace_early_filter(ftrace_filter_buf, 1);
2329 if (ftrace_notrace_buf[0]) 2332 if (ftrace_notrace_buf[0])
2330 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2333 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2334#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2335 if (ftrace_graph_buf[0])
2336 set_ftrace_early_graph(ftrace_graph_buf);
2337#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2331} 2338}
2332 2339
2333static int 2340static int
@@ -2513,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2513 return -ENODEV; 2520 return -ENODEV;
2514 2521
2515 /* decode regex */ 2522 /* decode regex */
2516 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not); 2523 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2517 if (not) 2524 if (not)
2518 return -EINVAL; 2525 return -EINVAL;
2519 2526
@@ -2624,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2624 return 0; 2631 return 0;
2625} 2632}
2626 2633
2627static int ftrace_convert_nops(struct module *mod, 2634static int ftrace_process_locs(struct module *mod,
2628 unsigned long *start, 2635 unsigned long *start,
2629 unsigned long *end) 2636 unsigned long *end)
2630{ 2637{
@@ -2684,7 +2691,7 @@ static void ftrace_init_module(struct module *mod,
2684{ 2691{
2685 if (ftrace_disabled || start == end) 2692 if (ftrace_disabled || start == end)
2686 return; 2693 return;
2687 ftrace_convert_nops(mod, start, end); 2694 ftrace_process_locs(mod, start, end);
2688} 2695}
2689 2696
2690static int ftrace_module_notify(struct notifier_block *self, 2697static int ftrace_module_notify(struct notifier_block *self,
@@ -2745,7 +2752,7 @@ void __init ftrace_init(void)
2745 2752
2746 last_ftrace_enabled = ftrace_enabled = 1; 2753 last_ftrace_enabled = ftrace_enabled = 1;
2747 2754
2748 ret = ftrace_convert_nops(NULL, 2755 ret = ftrace_process_locs(NULL,
2749 __start_mcount_loc, 2756 __start_mcount_loc,
2750 __stop_mcount_loc); 2757 __stop_mcount_loc);
2751 2758
@@ -2778,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { }
2778# define ftrace_shutdown_sysctl() do { } while (0) 2785# define ftrace_shutdown_sysctl() do { } while (0)
2779#endif /* CONFIG_DYNAMIC_FTRACE */ 2786#endif /* CONFIG_DYNAMIC_FTRACE */
2780 2787
2781static ssize_t
2782ftrace_pid_read(struct file *file, char __user *ubuf,
2783 size_t cnt, loff_t *ppos)
2784{
2785 char buf[64];
2786 int r;
2787
2788 if (ftrace_pid_trace == ftrace_swapper_pid)
2789 r = sprintf(buf, "swapper tasks\n");
2790 else if (ftrace_pid_trace)
2791 r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
2792 else
2793 r = sprintf(buf, "no pid\n");
2794
2795 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2796}
2797
2798static void clear_ftrace_swapper(void) 2788static void clear_ftrace_swapper(void)
2799{ 2789{
2800 struct task_struct *p; 2790 struct task_struct *p;
@@ -2845,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid)
2845 rcu_read_unlock(); 2835 rcu_read_unlock();
2846} 2836}
2847 2837
2848static void clear_ftrace_pid_task(struct pid **pid) 2838static void clear_ftrace_pid_task(struct pid *pid)
2849{ 2839{
2850 if (*pid == ftrace_swapper_pid) 2840 if (pid == ftrace_swapper_pid)
2851 clear_ftrace_swapper(); 2841 clear_ftrace_swapper();
2852 else 2842 else
2853 clear_ftrace_pid(*pid); 2843 clear_ftrace_pid(pid);
2854
2855 *pid = NULL;
2856} 2844}
2857 2845
2858static void set_ftrace_pid_task(struct pid *pid) 2846static void set_ftrace_pid_task(struct pid *pid)
@@ -2863,74 +2851,184 @@ static void set_ftrace_pid_task(struct pid *pid)
2863 set_ftrace_pid(pid); 2851 set_ftrace_pid(pid);
2864} 2852}
2865 2853
2866static ssize_t 2854static int ftrace_pid_add(int p)
2867ftrace_pid_write(struct file *filp, const char __user *ubuf,
2868 size_t cnt, loff_t *ppos)
2869{ 2855{
2870 struct pid *pid; 2856 struct pid *pid;
2871 char buf[64]; 2857 struct ftrace_pid *fpid;
2872 long val; 2858 int ret = -EINVAL;
2873 int ret;
2874 2859
2875 if (cnt >= sizeof(buf)) 2860 mutex_lock(&ftrace_lock);
2876 return -EINVAL;
2877 2861
2878 if (copy_from_user(&buf, ubuf, cnt)) 2862 if (!p)
2879 return -EFAULT; 2863 pid = ftrace_swapper_pid;
2864 else
2865 pid = find_get_pid(p);
2880 2866
2881 buf[cnt] = 0; 2867 if (!pid)
2868 goto out;
2882 2869
2883 ret = strict_strtol(buf, 10, &val); 2870 ret = 0;
2884 if (ret < 0)
2885 return ret;
2886 2871
2887 mutex_lock(&ftrace_lock); 2872 list_for_each_entry(fpid, &ftrace_pids, list)
2888 if (val < 0) { 2873 if (fpid->pid == pid)
2889 /* disable pid tracing */ 2874 goto out_put;
2890 if (!ftrace_pid_trace)
2891 goto out;
2892 2875
2893 clear_ftrace_pid_task(&ftrace_pid_trace); 2876 ret = -ENOMEM;
2894 2877
2895 } else { 2878 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
2896 /* swapper task is special */ 2879 if (!fpid)
2897 if (!val) { 2880 goto out_put;
2898 pid = ftrace_swapper_pid;
2899 if (pid == ftrace_pid_trace)
2900 goto out;
2901 } else {
2902 pid = find_get_pid(val);
2903 2881
2904 if (pid == ftrace_pid_trace) { 2882 list_add(&fpid->list, &ftrace_pids);
2905 put_pid(pid); 2883 fpid->pid = pid;
2906 goto out;
2907 }
2908 }
2909 2884
2910 if (ftrace_pid_trace) 2885 set_ftrace_pid_task(pid);
2911 clear_ftrace_pid_task(&ftrace_pid_trace);
2912 2886
2913 if (!pid) 2887 ftrace_update_pid_func();
2914 goto out; 2888 ftrace_startup_enable(0);
2889
2890 mutex_unlock(&ftrace_lock);
2891 return 0;
2892
2893out_put:
2894 if (pid != ftrace_swapper_pid)
2895 put_pid(pid);
2915 2896
2916 ftrace_pid_trace = pid; 2897out:
2898 mutex_unlock(&ftrace_lock);
2899 return ret;
2900}
2901
2902static void ftrace_pid_reset(void)
2903{
2904 struct ftrace_pid *fpid, *safe;
2917 2905
2918 set_ftrace_pid_task(ftrace_pid_trace); 2906 mutex_lock(&ftrace_lock);
2907 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
2908 struct pid *pid = fpid->pid;
2909
2910 clear_ftrace_pid_task(pid);
2911
2912 list_del(&fpid->list);
2913 kfree(fpid);
2919 } 2914 }
2920 2915
2921 /* update the function call */
2922 ftrace_update_pid_func(); 2916 ftrace_update_pid_func();
2923 ftrace_startup_enable(0); 2917 ftrace_startup_enable(0);
2924 2918
2925 out:
2926 mutex_unlock(&ftrace_lock); 2919 mutex_unlock(&ftrace_lock);
2920}
2927 2921
2928 return cnt; 2922static void *fpid_start(struct seq_file *m, loff_t *pos)
2923{
2924 mutex_lock(&ftrace_lock);
2925
2926 if (list_empty(&ftrace_pids) && (!*pos))
2927 return (void *) 1;
2928
2929 return seq_list_start(&ftrace_pids, *pos);
2930}
2931
2932static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
2933{
2934 if (v == (void *)1)
2935 return NULL;
2936
2937 return seq_list_next(v, &ftrace_pids, pos);
2938}
2939
2940static void fpid_stop(struct seq_file *m, void *p)
2941{
2942 mutex_unlock(&ftrace_lock);
2943}
2944
2945static int fpid_show(struct seq_file *m, void *v)
2946{
2947 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
2948
2949 if (v == (void *)1) {
2950 seq_printf(m, "no pid\n");
2951 return 0;
2952 }
2953
2954 if (fpid->pid == ftrace_swapper_pid)
2955 seq_printf(m, "swapper tasks\n");
2956 else
2957 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
2958
2959 return 0;
2960}
2961
2962static const struct seq_operations ftrace_pid_sops = {
2963 .start = fpid_start,
2964 .next = fpid_next,
2965 .stop = fpid_stop,
2966 .show = fpid_show,
2967};
2968
2969static int
2970ftrace_pid_open(struct inode *inode, struct file *file)
2971{
2972 int ret = 0;
2973
2974 if ((file->f_mode & FMODE_WRITE) &&
2975 (file->f_flags & O_TRUNC))
2976 ftrace_pid_reset();
2977
2978 if (file->f_mode & FMODE_READ)
2979 ret = seq_open(file, &ftrace_pid_sops);
2980
2981 return ret;
2982}
2983
2984static ssize_t
2985ftrace_pid_write(struct file *filp, const char __user *ubuf,
2986 size_t cnt, loff_t *ppos)
2987{
2988 char buf[64], *tmp;
2989 long val;
2990 int ret;
2991
2992 if (cnt >= sizeof(buf))
2993 return -EINVAL;
2994
2995 if (copy_from_user(&buf, ubuf, cnt))
2996 return -EFAULT;
2997
2998 buf[cnt] = 0;
2999
3000 /*
3001 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
3002 * to clean the filter quietly.
3003 */
3004 tmp = strstrip(buf);
3005 if (strlen(tmp) == 0)
3006 return 1;
3007
3008 ret = strict_strtol(tmp, 10, &val);
3009 if (ret < 0)
3010 return ret;
3011
3012 ret = ftrace_pid_add(val);
3013
3014 return ret ? ret : cnt;
3015}
3016
3017static int
3018ftrace_pid_release(struct inode *inode, struct file *file)
3019{
3020 if (file->f_mode & FMODE_READ)
3021 seq_release(inode, file);
3022
3023 return 0;
2929} 3024}
2930 3025
2931static const struct file_operations ftrace_pid_fops = { 3026static const struct file_operations ftrace_pid_fops = {
2932 .read = ftrace_pid_read, 3027 .open = ftrace_pid_open,
2933 .write = ftrace_pid_write, 3028 .write = ftrace_pid_write,
3029 .read = seq_read,
3030 .llseek = seq_lseek,
3031 .release = ftrace_pid_release,
2934}; 3032};
2935 3033
2936static __init int ftrace_init_debugfs(void) 3034static __init int ftrace_init_debugfs(void)
@@ -3293,4 +3391,3 @@ void ftrace_graph_stop(void)
3293 ftrace_stop(); 3391 ftrace_stop();
3294} 3392}
3295#endif 3393#endif
3296
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5dd017fea6f5..a1ca4956ab5e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
397 int ret; 397 int ret;
398 398
399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
400 "offset:0;\tsize:%u;\n", 400 "offset:0;\tsize:%u;\tsigned:%u;\n",
401 (unsigned int)sizeof(field.time_stamp)); 401 (unsigned int)sizeof(field.time_stamp),
402 (unsigned int)is_signed_type(u64));
402 403
403 ret = trace_seq_printf(s, "\tfield: local_t commit;\t" 404 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
404 "offset:%u;\tsize:%u;\n", 405 "offset:%u;\tsize:%u;\tsigned:%u;\n",
405 (unsigned int)offsetof(typeof(field), commit), 406 (unsigned int)offsetof(typeof(field), commit),
406 (unsigned int)sizeof(field.commit)); 407 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long));
407 409
408 ret = trace_seq_printf(s, "\tfield: char data;\t" 410 ret = trace_seq_printf(s, "\tfield: char data;\t"
409 "offset:%u;\tsize:%u;\n", 411 "offset:%u;\tsize:%u;\tsigned:%u;\n",
410 (unsigned int)offsetof(typeof(field), data), 412 (unsigned int)offsetof(typeof(field), data),
411 (unsigned int)BUF_PAGE_SIZE); 413 (unsigned int)BUF_PAGE_SIZE,
414 (unsigned int)is_signed_type(char));
412 415
413 return ret; 416 return ret;
414} 417}
@@ -1787,9 +1790,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1787static struct ring_buffer_event * 1790static struct ring_buffer_event *
1788rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1791rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1789 unsigned long length, unsigned long tail, 1792 unsigned long length, unsigned long tail,
1790 struct buffer_page *commit_page,
1791 struct buffer_page *tail_page, u64 *ts) 1793 struct buffer_page *tail_page, u64 *ts)
1792{ 1794{
1795 struct buffer_page *commit_page = cpu_buffer->commit_page;
1793 struct ring_buffer *buffer = cpu_buffer->buffer; 1796 struct ring_buffer *buffer = cpu_buffer->buffer;
1794 struct buffer_page *next_page; 1797 struct buffer_page *next_page;
1795 int ret; 1798 int ret;
@@ -1892,13 +1895,10 @@ static struct ring_buffer_event *
1892__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1895__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1893 unsigned type, unsigned long length, u64 *ts) 1896 unsigned type, unsigned long length, u64 *ts)
1894{ 1897{
1895 struct buffer_page *tail_page, *commit_page; 1898 struct buffer_page *tail_page;
1896 struct ring_buffer_event *event; 1899 struct ring_buffer_event *event;
1897 unsigned long tail, write; 1900 unsigned long tail, write;
1898 1901
1899 commit_page = cpu_buffer->commit_page;
1900 /* we just need to protect against interrupts */
1901 barrier();
1902 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1903 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1904 1904
@@ -1909,7 +1909,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1909 /* See if we shot pass the end of this buffer page */ 1909 /* See if we shot pass the end of this buffer page */
1910 if (write > BUF_PAGE_SIZE) 1910 if (write > BUF_PAGE_SIZE)
1911 return rb_move_tail(cpu_buffer, length, tail, 1911 return rb_move_tail(cpu_buffer, length, tail,
1912 commit_page, tail_page, ts); 1912 tail_page, ts);
1913 1913
1914 /* We reserved something on the buffer */ 1914 /* We reserved something on the buffer */
1915 1915
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c3..b2477caf09c2 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -35,6 +35,28 @@ static int disable_reader;
35module_param(disable_reader, uint, 0644); 35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer"); 36MODULE_PARM_DESC(disable_reader, "only run producer");
37 37
38static int write_iteration = 50;
39module_param(write_iteration, uint, 0644);
40MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
41
42static int producer_nice = 19;
43static int consumer_nice = 19;
44
45static int producer_fifo = -1;
46static int consumer_fifo = -1;
47
48module_param(producer_nice, uint, 0644);
49MODULE_PARM_DESC(producer_nice, "nice prio for producer");
50
51module_param(consumer_nice, uint, 0644);
52MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
53
54module_param(producer_fifo, uint, 0644);
55MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
56
57module_param(consumer_fifo, uint, 0644);
58MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
59
38static int read_events; 60static int read_events;
39 61
40static int kill_test; 62static int kill_test;
@@ -208,15 +230,18 @@ static void ring_buffer_producer(void)
208 do { 230 do {
209 struct ring_buffer_event *event; 231 struct ring_buffer_event *event;
210 int *entry; 232 int *entry;
211 233 int i;
212 event = ring_buffer_lock_reserve(buffer, 10); 234
213 if (!event) { 235 for (i = 0; i < write_iteration; i++) {
214 missed++; 236 event = ring_buffer_lock_reserve(buffer, 10);
215 } else { 237 if (!event) {
216 hit++; 238 missed++;
217 entry = ring_buffer_event_data(event); 239 } else {
218 *entry = smp_processor_id(); 240 hit++;
219 ring_buffer_unlock_commit(buffer, event); 241 entry = ring_buffer_event_data(event);
242 *entry = smp_processor_id();
243 ring_buffer_unlock_commit(buffer, event);
244 }
220 } 245 }
221 do_gettimeofday(&end_tv); 246 do_gettimeofday(&end_tv);
222 247
@@ -263,6 +288,27 @@ static void ring_buffer_producer(void)
263 288
264 if (kill_test) 289 if (kill_test)
265 trace_printk("ERROR!\n"); 290 trace_printk("ERROR!\n");
291
292 if (!disable_reader) {
293 if (consumer_fifo < 0)
294 trace_printk("Running Consumer at nice: %d\n",
295 consumer_nice);
296 else
297 trace_printk("Running Consumer at SCHED_FIFO %d\n",
298 consumer_fifo);
299 }
300 if (producer_fifo < 0)
301 trace_printk("Running Producer at nice: %d\n",
302 producer_nice);
303 else
304 trace_printk("Running Producer at SCHED_FIFO %d\n",
305 producer_fifo);
306
307 /* Let the user know that the test is running at low priority */
308 if (producer_fifo < 0 && consumer_fifo < 0 &&
309 producer_nice == 19 && consumer_nice == 19)
310 trace_printk("WARNING!!! This test is running at lowest priority.\n");
311
266 trace_printk("Time: %lld (usecs)\n", time); 312 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns); 313 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader) 314 if (disable_reader)
@@ -392,6 +438,27 @@ static int __init ring_buffer_benchmark_init(void)
392 if (IS_ERR(producer)) 438 if (IS_ERR(producer))
393 goto out_kill; 439 goto out_kill;
394 440
441 /*
442 * Run them as low-prio background tasks by default:
443 */
444 if (!disable_reader) {
445 if (consumer_fifo >= 0) {
446 struct sched_param param = {
447 .sched_priority = consumer_fifo
448 };
449 sched_setscheduler(consumer, SCHED_FIFO, &param);
450 } else
451 set_user_nice(consumer, consumer_nice);
452 }
453
454 if (producer_fifo >= 0) {
455 struct sched_param param = {
456 .sched_priority = consumer_fifo
457 };
458 sched_setscheduler(producer, SCHED_FIFO, &param);
459 } else
460 set_user_nice(producer, producer_nice);
461
395 return 0; 462 return 0;
396 463
397 out_kill: 464 out_kill:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b20d3ec75de9..88bd9ae2a9ed 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf);
129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 130static char *default_bootup_tracer;
131 131
132static int __init set_ftrace(char *str) 132static int __init set_cmdline_ftrace(char *str)
133{ 133{
134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
@@ -137,7 +137,7 @@ static int __init set_ftrace(char *str)
137 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
138 return 1; 138 return 1;
139} 139}
140__setup("ftrace=", set_ftrace); 140__setup("ftrace=", set_cmdline_ftrace);
141 141
142static int __init set_ftrace_dump_on_oops(char *str) 142static int __init set_ftrace_dump_on_oops(char *str)
143{ 143{
@@ -1363,9 +1363,6 @@ int trace_array_vprintk(struct trace_array *tr,
1363 __raw_spin_lock(&trace_buf_lock); 1363 __raw_spin_lock(&trace_buf_lock);
1364 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1364 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1365 1365
1366 len = min(len, TRACE_BUF_SIZE-1);
1367 trace_buf[len] = 0;
1368
1369 size = sizeof(*entry) + len + 1; 1366 size = sizeof(*entry) + len + 1;
1370 buffer = tr->buffer; 1367 buffer = tr->buffer;
1371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1368 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
@@ -1373,10 +1370,10 @@ int trace_array_vprintk(struct trace_array *tr,
1373 if (!event) 1370 if (!event)
1374 goto out_unlock; 1371 goto out_unlock;
1375 entry = ring_buffer_event_data(event); 1372 entry = ring_buffer_event_data(event);
1376 entry->ip = ip; 1373 entry->ip = ip;
1377 1374
1378 memcpy(&entry->buf, trace_buf, len); 1375 memcpy(&entry->buf, trace_buf, len);
1379 entry->buf[len] = 0; 1376 entry->buf[len] = '\0';
1380 if (!filter_check_discard(call, entry, buffer, event)) 1377 if (!filter_check_discard(call, entry, buffer, event))
1381 ring_buffer_unlock_commit(buffer, event); 1378 ring_buffer_unlock_commit(buffer, event);
1382 1379
@@ -1515,6 +1512,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1515 int i = (int)*pos; 1512 int i = (int)*pos;
1516 void *ent; 1513 void *ent;
1517 1514
1515 WARN_ON_ONCE(iter->leftover);
1516
1518 (*pos)++; 1517 (*pos)++;
1519 1518
1520 /* can't go backwards */ 1519 /* can't go backwards */
@@ -1613,8 +1612,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1613 ; 1612 ;
1614 1613
1615 } else { 1614 } else {
1616 l = *pos - 1; 1615 /*
1617 p = s_next(m, p, &l); 1616 * If we overflowed the seq_file before, then we want
1617 * to just reuse the trace_seq buffer again.
1618 */
1619 if (iter->leftover)
1620 p = iter;
1621 else {
1622 l = *pos - 1;
1623 p = s_next(m, p, &l);
1624 }
1618 } 1625 }
1619 1626
1620 trace_event_read_lock(); 1627 trace_event_read_lock();
@@ -1922,6 +1929,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1922static int s_show(struct seq_file *m, void *v) 1929static int s_show(struct seq_file *m, void *v)
1923{ 1930{
1924 struct trace_iterator *iter = v; 1931 struct trace_iterator *iter = v;
1932 int ret;
1925 1933
1926 if (iter->ent == NULL) { 1934 if (iter->ent == NULL) {
1927 if (iter->tr) { 1935 if (iter->tr) {
@@ -1941,9 +1949,27 @@ static int s_show(struct seq_file *m, void *v)
1941 if (!(trace_flags & TRACE_ITER_VERBOSE)) 1949 if (!(trace_flags & TRACE_ITER_VERBOSE))
1942 print_func_help_header(m); 1950 print_func_help_header(m);
1943 } 1951 }
1952 } else if (iter->leftover) {
1953 /*
1954 * If we filled the seq_file buffer earlier, we
1955 * want to just show it now.
1956 */
1957 ret = trace_print_seq(m, &iter->seq);
1958
1959 /* ret should this time be zero, but you never know */
1960 iter->leftover = ret;
1961
1944 } else { 1962 } else {
1945 print_trace_line(iter); 1963 print_trace_line(iter);
1946 trace_print_seq(m, &iter->seq); 1964 ret = trace_print_seq(m, &iter->seq);
1965 /*
1966 * If we overflow the seq_file buffer, then it will
1967 * ask us for this data again at start up.
1968 * Use that instead.
1969 * ret is 0 if seq_file write succeeded.
1970 * -1 otherwise.
1971 */
1972 iter->leftover = ret;
1947 } 1973 }
1948 1974
1949 return 0; 1975 return 0;
@@ -2897,6 +2923,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2897 else 2923 else
2898 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); 2924 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2899 2925
2926
2927 if (iter->trace->pipe_close)
2928 iter->trace->pipe_close(iter);
2929
2900 mutex_unlock(&trace_types_lock); 2930 mutex_unlock(&trace_types_lock);
2901 2931
2902 free_cpumask_var(iter->started); 2932 free_cpumask_var(iter->started);
@@ -3334,7 +3364,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3334 size_t cnt, loff_t *fpos) 3364 size_t cnt, loff_t *fpos)
3335{ 3365{
3336 char *buf; 3366 char *buf;
3337 char *end;
3338 3367
3339 if (tracing_disabled) 3368 if (tracing_disabled)
3340 return -EINVAL; 3369 return -EINVAL;
@@ -3342,7 +3371,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3342 if (cnt > TRACE_BUF_SIZE) 3371 if (cnt > TRACE_BUF_SIZE)
3343 cnt = TRACE_BUF_SIZE; 3372 cnt = TRACE_BUF_SIZE;
3344 3373
3345 buf = kmalloc(cnt + 1, GFP_KERNEL); 3374 buf = kmalloc(cnt + 2, GFP_KERNEL);
3346 if (buf == NULL) 3375 if (buf == NULL)
3347 return -ENOMEM; 3376 return -ENOMEM;
3348 3377
@@ -3350,14 +3379,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3350 kfree(buf); 3379 kfree(buf);
3351 return -EFAULT; 3380 return -EFAULT;
3352 } 3381 }
3382 if (buf[cnt-1] != '\n') {
3383 buf[cnt] = '\n';
3384 buf[cnt+1] = '\0';
3385 } else
3386 buf[cnt] = '\0';
3353 3387
3354 /* Cut from the first nil or newline. */ 3388 cnt = mark_printk("%s", buf);
3355 buf[cnt] = '\0';
3356 end = strchr(buf, '\n');
3357 if (end)
3358 *end = '\0';
3359
3360 cnt = mark_printk("%s\n", buf);
3361 kfree(buf); 3389 kfree(buf);
3362 *fpos += cnt; 3390 *fpos += cnt;
3363 3391
@@ -3730,7 +3758,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3730 3758
3731 s = kmalloc(sizeof(*s), GFP_KERNEL); 3759 s = kmalloc(sizeof(*s), GFP_KERNEL);
3732 if (!s) 3760 if (!s)
3733 return ENOMEM; 3761 return -ENOMEM;
3734 3762
3735 trace_seq_init(s); 3763 trace_seq_init(s);
3736 3764
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 405cb850b75d..7fa33cab6962 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
40 42
41 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
42}; 44};
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
98struct syscall_trace_exit { 100struct syscall_trace_exit {
99 struct trace_entry ent; 101 struct trace_entry ent;
100 int nr; 102 int nr;
101 unsigned long ret; 103 long ret;
102}; 104};
103 105
106struct kprobe_trace_entry {
107 struct trace_entry ent;
108 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111};
112
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent;
119 unsigned long func;
120 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123};
124
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
104/* 129/*
105 * trace_flag_type is an enumeration that holds different 130 * trace_flag_type is an enumeration that holds different
106 * states when a trace occurs. These are: 131 * states when a trace occurs. These are:
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
209 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
211 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
212 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
213 } while (0) 239 } while (0)
214 240
@@ -246,6 +272,7 @@ struct tracer_flags {
246 * @pipe_open: called when the trace_pipe file is opened 272 * @pipe_open: called when the trace_pipe file is opened
247 * @wait_pipe: override how the user waits for traces on trace_pipe 273 * @wait_pipe: override how the user waits for traces on trace_pipe
248 * @close: called when the trace file is released 274 * @close: called when the trace file is released
275 * @pipe_close: called when the trace_pipe file is released
249 * @read: override the default read callback on trace_pipe 276 * @read: override the default read callback on trace_pipe
250 * @splice_read: override the default splice_read callback on trace_pipe 277 * @splice_read: override the default splice_read callback on trace_pipe
251 * @selftest: selftest to run on boot (see trace_selftest.c) 278 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -264,6 +291,7 @@ struct tracer {
264 void (*pipe_open)(struct trace_iterator *iter); 291 void (*pipe_open)(struct trace_iterator *iter);
265 void (*wait_pipe)(struct trace_iterator *iter); 292 void (*wait_pipe)(struct trace_iterator *iter);
266 void (*close)(struct trace_iterator *iter); 293 void (*close)(struct trace_iterator *iter);
294 void (*pipe_close)(struct trace_iterator *iter);
267 ssize_t (*read)(struct trace_iterator *iter, 295 ssize_t (*read)(struct trace_iterator *iter,
268 struct file *filp, char __user *ubuf, 296 struct file *filp, char __user *ubuf,
269 size_t cnt, loff_t *ppos); 297 size_t cnt, loff_t *ppos);
@@ -364,6 +392,8 @@ int register_tracer(struct tracer *type);
364void unregister_tracer(struct tracer *type); 392void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void); 393int is_tracing_stopped(void);
366 394
395extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396
367extern unsigned long nsecs_to_usecs(unsigned long nsecs); 397extern unsigned long nsecs_to_usecs(unsigned long nsecs);
368 398
369#ifdef CONFIG_TRACER_MAX_TRACE 399#ifdef CONFIG_TRACER_MAX_TRACE
@@ -438,6 +468,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
438 struct trace_array *tr); 468 struct trace_array *tr);
439extern int trace_selftest_startup_hw_branches(struct tracer *trace, 469extern int trace_selftest_startup_hw_branches(struct tracer *trace,
440 struct trace_array *tr); 470 struct trace_array *tr);
471extern int trace_selftest_startup_ksym(struct tracer *trace,
472 struct trace_array *tr);
441#endif /* CONFIG_FTRACE_STARTUP_TEST */ 473#endif /* CONFIG_FTRACE_STARTUP_TEST */
442 474
443extern void *head_page(struct trace_array_cpu *data); 475extern void *head_page(struct trace_array_cpu *data);
@@ -483,10 +515,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
483 return 0; 515 return 0;
484} 516}
485#else 517#else
486static inline int ftrace_trace_addr(unsigned long addr)
487{
488 return 1;
489}
490static inline int ftrace_graph_addr(unsigned long addr) 518static inline int ftrace_graph_addr(unsigned long addr)
491{ 519{
492 return 1; 520 return 1;
@@ -500,12 +528,12 @@ print_graph_function(struct trace_iterator *iter)
500} 528}
501#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 529#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
502 530
503extern struct pid *ftrace_pid_trace; 531extern struct list_head ftrace_pids;
504 532
505#ifdef CONFIG_FUNCTION_TRACER 533#ifdef CONFIG_FUNCTION_TRACER
506static inline int ftrace_trace_task(struct task_struct *task) 534static inline int ftrace_trace_task(struct task_struct *task)
507{ 535{
508 if (!ftrace_pid_trace) 536 if (list_empty(&ftrace_pids))
509 return 1; 537 return 1;
510 538
511 return test_tsk_trace_trace(task); 539 return test_tsk_trace_trace(task);
@@ -687,7 +715,6 @@ struct event_filter {
687 int n_preds; 715 int n_preds;
688 struct filter_pred **preds; 716 struct filter_pred **preds;
689 char *filter_string; 717 char *filter_string;
690 bool no_reset;
691}; 718};
692 719
693struct event_subsystem { 720struct event_subsystem {
@@ -699,22 +726,40 @@ struct event_subsystem {
699}; 726};
700 727
701struct filter_pred; 728struct filter_pred;
729struct regex;
702 730
703typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 731typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
704 int val1, int val2); 732 int val1, int val2);
705 733
734typedef int (*regex_match_func)(char *str, struct regex *r, int len);
735
736enum regex_type {
737 MATCH_FULL = 0,
738 MATCH_FRONT_ONLY,
739 MATCH_MIDDLE_ONLY,
740 MATCH_END_ONLY,
741};
742
743struct regex {
744 char pattern[MAX_FILTER_STR_VAL];
745 int len;
746 int field_len;
747 regex_match_func match;
748};
749
706struct filter_pred { 750struct filter_pred {
707 filter_pred_fn_t fn; 751 filter_pred_fn_t fn;
708 u64 val; 752 u64 val;
709 char str_val[MAX_FILTER_STR_VAL]; 753 struct regex regex;
710 int str_len; 754 char *field_name;
711 char *field_name; 755 int offset;
712 int offset; 756 int not;
713 int not; 757 int op;
714 int op; 758 int pop_n;
715 int pop_n;
716}; 759};
717 760
761extern enum regex_type
762filter_parse_regex(char *buff, int len, char **search, int *not);
718extern void print_event_filter(struct ftrace_event_call *call, 763extern void print_event_filter(struct ftrace_event_call *call,
719 struct trace_seq *s); 764 struct trace_seq *s);
720extern int apply_event_filter(struct ftrace_event_call *call, 765extern int apply_event_filter(struct ftrace_event_call *call,
@@ -730,7 +775,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
730 struct ring_buffer *buffer, 775 struct ring_buffer *buffer,
731 struct ring_buffer_event *event) 776 struct ring_buffer_event *event)
732{ 777{
733 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { 778 if (unlikely(call->filter_active) &&
779 !filter_match_preds(call->filter, rec)) {
734 ring_buffer_discard_commit(buffer, event); 780 ring_buffer_discard_commit(buffer, event);
735 return 1; 781 return 1;
736 } 782 }
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 20c5f92e28a8..878c03f386ba 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -20,6 +20,8 @@
20#include <linux/ktime.h> 20#include <linux/ktime.h>
21#include <linux/trace_clock.h> 21#include <linux/trace_clock.h>
22 22
23#include "trace.h"
24
23/* 25/*
24 * trace_clock_local(): the simplest and least coherent tracing clock. 26 * trace_clock_local(): the simplest and least coherent tracing clock.
25 * 27 *
@@ -28,17 +30,17 @@
28 */ 30 */
29u64 notrace trace_clock_local(void) 31u64 notrace trace_clock_local(void)
30{ 32{
31 unsigned long flags;
32 u64 clock; 33 u64 clock;
34 int resched;
33 35
34 /* 36 /*
35 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
36 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
37 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
38 */ 40 */
39 raw_local_irq_save(flags); 41 resched = ftrace_preempt_disable();
40 clock = sched_clock(); 42 clock = sched_clock();
41 raw_local_irq_restore(flags); 43 ftrace_preempt_enable(resched);
42 44
43 return clock; 45 return clock;
44} 46}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599d..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
364 F_printk("type:%u call_site:%lx ptr:%p", 364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr) 365 __entry->type_id, __entry->call_site, __entry->ptr)
366); 366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 8d5c171cc998..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,17 +8,14 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "trace.h" 9#include "trace.h"
10 10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16 11
17char *trace_profile_buf; 12char *perf_trace_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf); 13EXPORT_SYMBOL_GPL(perf_trace_buf);
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
19 17
20char *trace_profile_buf_nmi; 18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22 19
23/* Count the events in use (per event id, not per instance) */ 20/* Count the events in use (per event id, not per instance) */
24static int total_profile_count; 21static int total_profile_count;
@@ -32,20 +29,20 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
32 return 0; 29 return 0;
33 30
34 if (!total_profile_count) { 31 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t); 32 buf = (char *)alloc_percpu(perf_trace_t);
36 if (!buf) 33 if (!buf)
37 goto fail_buf; 34 goto fail_buf;
38 35
39 rcu_assign_pointer(trace_profile_buf, buf); 36 rcu_assign_pointer(perf_trace_buf, buf);
40 37
41 buf = (char *)alloc_percpu(profile_buf_t); 38 buf = (char *)alloc_percpu(perf_trace_t);
42 if (!buf) 39 if (!buf)
43 goto fail_buf_nmi; 40 goto fail_buf_nmi;
44 41
45 rcu_assign_pointer(trace_profile_buf_nmi, buf); 42 rcu_assign_pointer(perf_trace_buf_nmi, buf);
46 } 43 }
47 44
48 ret = event->profile_enable(); 45 ret = event->profile_enable(event);
49 if (!ret) { 46 if (!ret) {
50 total_profile_count++; 47 total_profile_count++;
51 return 0; 48 return 0;
@@ -53,10 +50,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
53 50
54fail_buf_nmi: 51fail_buf_nmi:
55 if (!total_profile_count) { 52 if (!total_profile_count) {
56 free_percpu(trace_profile_buf_nmi); 53 free_percpu(perf_trace_buf_nmi);
57 free_percpu(trace_profile_buf); 54 free_percpu(perf_trace_buf);
58 trace_profile_buf_nmi = NULL; 55 perf_trace_buf_nmi = NULL;
59 trace_profile_buf = NULL; 56 perf_trace_buf = NULL;
60 } 57 }
61fail_buf: 58fail_buf:
62 atomic_dec(&event->profile_count); 59 atomic_dec(&event->profile_count);
@@ -89,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
89 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (!atomic_add_negative(-1, &event->profile_count))
90 return; 87 return;
91 88
92 event->profile_disable(); 89 event->profile_disable(event);
93 90
94 if (!--total_profile_count) { 91 if (!--total_profile_count) {
95 buf = trace_profile_buf; 92 buf = perf_trace_buf;
96 rcu_assign_pointer(trace_profile_buf, NULL); 93 rcu_assign_pointer(perf_trace_buf, NULL);
97 94
98 nmi_buf = trace_profile_buf_nmi; 95 nmi_buf = perf_trace_buf_nmi;
99 rcu_assign_pointer(trace_profile_buf_nmi, NULL); 96 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
100 97
101 /* 98 /*
102 * Ensure every events in profiling have finished before 99 * Ensure every events in profiling have finished before
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d128f65778e6..1d18315dc836 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields); 94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 95
96#ifdef CONFIG_MODULES 96void trace_destroy_fields(struct ftrace_event_call *call)
97
98static void trace_destroy_fields(struct ftrace_event_call *call)
99{ 97{
100 struct ftrace_event_field *field, *next; 98 struct ftrace_event_field *field, *next;
101 99
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
107 } 105 }
108} 106}
109 107
110#endif /* CONFIG_MODULES */
111
112static void ftrace_event_enable_disable(struct ftrace_event_call *call, 108static void ftrace_event_enable_disable(struct ftrace_event_call *call,
113 int enable) 109 int enable)
114{ 110{
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
117 if (call->enabled) { 113 if (call->enabled) {
118 call->enabled = 0; 114 call->enabled = 0;
119 tracing_stop_cmdline_record(); 115 tracing_stop_cmdline_record();
120 call->unregfunc(call->data); 116 call->unregfunc(call);
121 } 117 }
122 break; 118 break;
123 case 1: 119 case 1:
124 if (!call->enabled) { 120 if (!call->enabled) {
125 call->enabled = 1; 121 call->enabled = 1;
126 tracing_start_cmdline_record(); 122 tracing_start_cmdline_record();
127 call->regfunc(call->data); 123 call->regfunc(call);
128 } 124 }
129 break; 125 break;
130 } 126 }
@@ -507,7 +503,7 @@ extern char *__bad_type_size(void);
507#define FIELD(type, name) \ 503#define FIELD(type, name) \
508 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ 504 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
509 #type, "common_" #name, offsetof(typeof(field), name), \ 505 #type, "common_" #name, offsetof(typeof(field), name), \
510 sizeof(field.name) 506 sizeof(field.name), is_signed_type(type)
511 507
512static int trace_write_header(struct trace_seq *s) 508static int trace_write_header(struct trace_seq *s)
513{ 509{
@@ -515,17 +511,17 @@ static int trace_write_header(struct trace_seq *s)
515 511
516 /* struct trace_entry */ 512 /* struct trace_entry */
517 return trace_seq_printf(s, 513 return trace_seq_printf(s,
518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 514 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
519 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 515 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
520 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 516 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
521 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 517 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
522 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
523 "\n", 519 "\n",
524 FIELD(unsigned short, type), 520 FIELD(unsigned short, type),
525 FIELD(unsigned char, flags), 521 FIELD(unsigned char, flags),
526 FIELD(unsigned char, preempt_count), 522 FIELD(unsigned char, preempt_count),
527 FIELD(int, pid), 523 FIELD(int, pid),
528 FIELD(int, lock_depth)); 524 FIELD(int, lock_depth));
529} 525}
530 526
531static ssize_t 527static ssize_t
@@ -878,9 +874,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
878 "'%s/filter' entry\n", name); 874 "'%s/filter' entry\n", name);
879 } 875 }
880 876
881 entry = trace_create_file("enable", 0644, system->entry, 877 trace_create_file("enable", 0644, system->entry,
882 (void *)system->name, 878 (void *)system->name,
883 &ftrace_system_enable_fops); 879 &ftrace_system_enable_fops);
884 880
885 return system->entry; 881 return system->entry;
886} 882}
@@ -892,7 +888,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
892 const struct file_operations *filter, 888 const struct file_operations *filter,
893 const struct file_operations *format) 889 const struct file_operations *format)
894{ 890{
895 struct dentry *entry;
896 int ret; 891 int ret;
897 892
898 /* 893 /*
@@ -910,12 +905,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
910 } 905 }
911 906
912 if (call->regfunc) 907 if (call->regfunc)
913 entry = trace_create_file("enable", 0644, call->dir, call, 908 trace_create_file("enable", 0644, call->dir, call,
914 enable); 909 enable);
915 910
916 if (call->id && call->profile_enable) 911 if (call->id && call->profile_enable)
917 entry = trace_create_file("id", 0444, call->dir, call, 912 trace_create_file("id", 0444, call->dir, call,
918 id); 913 id);
919 914
920 if (call->define_fields) { 915 if (call->define_fields) {
921 ret = call->define_fields(call); 916 ret = call->define_fields(call);
@@ -924,41 +919,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
924 " events/%s\n", call->name); 919 " events/%s\n", call->name);
925 return ret; 920 return ret;
926 } 921 }
927 entry = trace_create_file("filter", 0644, call->dir, call, 922 trace_create_file("filter", 0644, call->dir, call,
928 filter); 923 filter);
929 } 924 }
930 925
931 /* A trace may not want to export its format */ 926 /* A trace may not want to export its format */
932 if (!call->show_format) 927 if (!call->show_format)
933 return 0; 928 return 0;
934 929
935 entry = trace_create_file("format", 0444, call->dir, call, 930 trace_create_file("format", 0444, call->dir, call,
936 format); 931 format);
937 932
938 return 0; 933 return 0;
939} 934}
940 935
941#define for_each_event(event, start, end) \ 936static int __trace_add_event_call(struct ftrace_event_call *call)
942 for (event = start; \ 937{
943 (unsigned long)event < (unsigned long)end; \ 938 struct dentry *d_events;
944 event++) 939 int ret;
945 940
946#ifdef CONFIG_MODULES 941 if (!call->name)
942 return -EINVAL;
947 943
948static LIST_HEAD(ftrace_module_file_list); 944 if (call->raw_init) {
945 ret = call->raw_init(call);
946 if (ret < 0) {
947 if (ret != -ENOSYS)
948 pr_warning("Could not initialize trace "
949 "events/%s\n", call->name);
950 return ret;
951 }
952 }
949 953
950/* 954 d_events = event_trace_events_dir();
951 * Modules must own their file_operations to keep up with 955 if (!d_events)
952 * reference counting. 956 return -ENOENT;
953 */ 957
954struct ftrace_module_file_ops { 958 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
955 struct list_head list; 959 &ftrace_enable_fops, &ftrace_event_filter_fops,
956 struct module *mod; 960 &ftrace_event_format_fops);
957 struct file_operations id; 961 if (!ret)
958 struct file_operations enable; 962 list_add(&call->list, &ftrace_events);
959 struct file_operations format; 963
960 struct file_operations filter; 964 return ret;
961}; 965}
966
967/* Add an additional event_call dynamically */
968int trace_add_event_call(struct ftrace_event_call *call)
969{
970 int ret;
971 mutex_lock(&event_mutex);
972 ret = __trace_add_event_call(call);
973 mutex_unlock(&event_mutex);
974 return ret;
975}
962 976
963static void remove_subsystem_dir(const char *name) 977static void remove_subsystem_dir(const char *name)
964{ 978{
@@ -986,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
986 } 1000 }
987} 1001}
988 1002
1003/*
1004 * Must be called under locking both of event_mutex and trace_event_mutex.
1005 */
1006static void __trace_remove_event_call(struct ftrace_event_call *call)
1007{
1008 ftrace_event_enable_disable(call, 0);
1009 if (call->event)
1010 __unregister_ftrace_event(call->event);
1011 debugfs_remove_recursive(call->dir);
1012 list_del(&call->list);
1013 trace_destroy_fields(call);
1014 destroy_preds(call);
1015 remove_subsystem_dir(call->system);
1016}
1017
1018/* Remove an event_call */
1019void trace_remove_event_call(struct ftrace_event_call *call)
1020{
1021 mutex_lock(&event_mutex);
1022 down_write(&trace_event_mutex);
1023 __trace_remove_event_call(call);
1024 up_write(&trace_event_mutex);
1025 mutex_unlock(&event_mutex);
1026}
1027
1028#define for_each_event(event, start, end) \
1029 for (event = start; \
1030 (unsigned long)event < (unsigned long)end; \
1031 event++)
1032
1033#ifdef CONFIG_MODULES
1034
1035static LIST_HEAD(ftrace_module_file_list);
1036
1037/*
1038 * Modules must own their file_operations to keep up with
1039 * reference counting.
1040 */
1041struct ftrace_module_file_ops {
1042 struct list_head list;
1043 struct module *mod;
1044 struct file_operations id;
1045 struct file_operations enable;
1046 struct file_operations format;
1047 struct file_operations filter;
1048};
1049
989static struct ftrace_module_file_ops * 1050static struct ftrace_module_file_ops *
990trace_create_file_ops(struct module *mod) 1051trace_create_file_ops(struct module *mod)
991{ 1052{
@@ -1043,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
1043 if (!call->name) 1104 if (!call->name)
1044 continue; 1105 continue;
1045 if (call->raw_init) { 1106 if (call->raw_init) {
1046 ret = call->raw_init(); 1107 ret = call->raw_init(call);
1047 if (ret < 0) { 1108 if (ret < 0) {
1048 if (ret != -ENOSYS) 1109 if (ret != -ENOSYS)
1049 pr_warning("Could not initialize trace " 1110 pr_warning("Could not initialize trace "
@@ -1061,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
1061 return; 1122 return;
1062 } 1123 }
1063 call->mod = mod; 1124 call->mod = mod;
1064 list_add(&call->list, &ftrace_events); 1125 ret = event_create_dir(call, d_events,
1065 event_create_dir(call, d_events, 1126 &file_ops->id, &file_ops->enable,
1066 &file_ops->id, &file_ops->enable, 1127 &file_ops->filter, &file_ops->format);
1067 &file_ops->filter, &file_ops->format); 1128 if (!ret)
1129 list_add(&call->list, &ftrace_events);
1068 } 1130 }
1069} 1131}
1070 1132
@@ -1078,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
1078 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1140 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1079 if (call->mod == mod) { 1141 if (call->mod == mod) {
1080 found = true; 1142 found = true;
1081 ftrace_event_enable_disable(call, 0); 1143 __trace_remove_event_call(call);
1082 if (call->event)
1083 __unregister_ftrace_event(call->event);
1084 debugfs_remove_recursive(call->dir);
1085 list_del(&call->list);
1086 trace_destroy_fields(call);
1087 destroy_preds(call);
1088 remove_subsystem_dir(call->system);
1089 } 1144 }
1090 } 1145 }
1091 1146
@@ -1203,7 +1258,7 @@ static __init int event_trace_init(void)
1203 if (!call->name) 1258 if (!call->name)
1204 continue; 1259 continue;
1205 if (call->raw_init) { 1260 if (call->raw_init) {
1206 ret = call->raw_init(); 1261 ret = call->raw_init(call);
1207 if (ret < 0) { 1262 if (ret < 0) {
1208 if (ret != -ENOSYS) 1263 if (ret != -ENOSYS)
1209 pr_warning("Could not initialize trace " 1264 pr_warning("Could not initialize trace "
@@ -1211,10 +1266,12 @@ static __init int event_trace_init(void)
1211 continue; 1266 continue;
1212 } 1267 }
1213 } 1268 }
1214 list_add(&call->list, &ftrace_events); 1269 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1215 event_create_dir(call, d_events, &ftrace_event_id_fops, 1270 &ftrace_enable_fops,
1216 &ftrace_enable_fops, &ftrace_event_filter_fops, 1271 &ftrace_event_filter_fops,
1217 &ftrace_event_format_fops); 1272 &ftrace_event_format_fops);
1273 if (!ret)
1274 list_add(&call->list, &ftrace_events);
1218 } 1275 }
1219 1276
1220 while (true) { 1277 while (true) {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 98a6cc5c64ed..50504cb228de 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,10 @@
18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> 18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
19 */ 19 */
20 20
21#include <linux/debugfs.h>
22#include <linux/uaccess.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/ctype.h> 22#include <linux/ctype.h>
25#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h>
26 25
27#include "trace.h" 26#include "trace.h"
28#include "trace_output.h" 27#include "trace_output.h"
@@ -31,6 +30,7 @@ enum filter_op_ids
31{ 30{
32 OP_OR, 31 OP_OR,
33 OP_AND, 32 OP_AND,
33 OP_GLOB,
34 OP_NE, 34 OP_NE,
35 OP_EQ, 35 OP_EQ,
36 OP_LT, 36 OP_LT,
@@ -48,16 +48,17 @@ struct filter_op {
48}; 48};
49 49
50static struct filter_op filter_ops[] = { 50static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 }, 51 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 }, 52 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 }, 53 { OP_GLOB, "~", 4 },
54 { OP_EQ, "==", 4 }, 54 { OP_NE, "!=", 4 },
55 { OP_LT, "<", 5 }, 55 { OP_EQ, "==", 4 },
56 { OP_LE, "<=", 5 }, 56 { OP_LT, "<", 5 },
57 { OP_GT, ">", 5 }, 57 { OP_LE, "<=", 5 },
58 { OP_GE, ">=", 5 }, 58 { OP_GT, ">", 5 },
59 { OP_NONE, "OP_NONE", 0 }, 59 { OP_GE, ">=", 5 },
60 { OP_OPEN_PAREN, "(", 0 }, 60 { OP_NONE, "OP_NONE", 0 },
61 { OP_OPEN_PAREN, "(", 0 },
61}; 62};
62 63
63enum { 64enum {
@@ -197,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
197 char *addr = (char *)(event + pred->offset); 198 char *addr = (char *)(event + pred->offset);
198 int cmp, match; 199 int cmp, match;
199 200
200 cmp = strncmp(addr, pred->str_val, pred->str_len); 201 cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
201 202
202 match = (!cmp) ^ pred->not; 203 match = cmp ^ pred->not;
203 204
204 return match; 205 return match;
205} 206}
@@ -211,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
211 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
212 int cmp, match; 213 int cmp, match;
213 214
214 cmp = strncmp(*addr, pred->str_val, pred->str_len); 215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len);
215 216
216 match = (!cmp) ^ pred->not; 217 match = cmp ^ pred->not;
217 218
218 return match; 219 return match;
219} 220}
@@ -237,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
237 char *addr = (char *)(event + str_loc); 238 char *addr = (char *)(event + str_loc);
238 int cmp, match; 239 int cmp, match;
239 240
240 cmp = strncmp(addr, pred->str_val, str_len); 241 cmp = pred->regex.match(addr, &pred->regex, str_len);
241 242
242 match = (!cmp) ^ pred->not; 243 match = cmp ^ pred->not;
243 244
244 return match; 245 return match;
245} 246}
@@ -250,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
250 return 0; 251 return 0;
251} 252}
252 253
254/* Basic regex callbacks */
255static int regex_match_full(char *str, struct regex *r, int len)
256{
257 if (strncmp(str, r->pattern, len) == 0)
258 return 1;
259 return 0;
260}
261
262static int regex_match_front(char *str, struct regex *r, int len)
263{
264 if (strncmp(str, r->pattern, len) == 0)
265 return 1;
266 return 0;
267}
268
269static int regex_match_middle(char *str, struct regex *r, int len)
270{
271 if (strstr(str, r->pattern))
272 return 1;
273 return 0;
274}
275
276static int regex_match_end(char *str, struct regex *r, int len)
277{
278 char *ptr = strstr(str, r->pattern);
279
280 if (ptr && (ptr[r->len] == 0))
281 return 1;
282 return 0;
283}
284
285/**
286 * filter_parse_regex - parse a basic regex
287 * @buff: the raw regex
288 * @len: length of the regex
289 * @search: will point to the beginning of the string to compare
290 * @not: tell whether the match will have to be inverted
291 *
292 * This passes in a buffer containing a regex and this function will
293 * set search to point to the search part of the buffer and
294 * return the type of search it is (see enum above).
295 * This does modify buff.
296 *
297 * Returns enum type.
298 * search returns the pointer to use for comparison.
299 * not returns 1 if buff started with a '!'
300 * 0 otherwise.
301 */
302enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
303{
304 int type = MATCH_FULL;
305 int i;
306
307 if (buff[0] == '!') {
308 *not = 1;
309 buff++;
310 len--;
311 } else
312 *not = 0;
313
314 *search = buff;
315
316 for (i = 0; i < len; i++) {
317 if (buff[i] == '*') {
318 if (!i) {
319 *search = buff + 1;
320 type = MATCH_END_ONLY;
321 } else {
322 if (type == MATCH_END_ONLY)
323 type = MATCH_MIDDLE_ONLY;
324 else
325 type = MATCH_FRONT_ONLY;
326 buff[i] = 0;
327 break;
328 }
329 }
330 }
331
332 return type;
333}
334
335static void filter_build_regex(struct filter_pred *pred)
336{
337 struct regex *r = &pred->regex;
338 char *search;
339 enum regex_type type = MATCH_FULL;
340 int not = 0;
341
342 if (pred->op == OP_GLOB) {
343 type = filter_parse_regex(r->pattern, r->len, &search, &not);
344 r->len = strlen(search);
345 memmove(r->pattern, search, r->len+1);
346 }
347
348 switch (type) {
349 case MATCH_FULL:
350 r->match = regex_match_full;
351 break;
352 case MATCH_FRONT_ONLY:
353 r->match = regex_match_front;
354 break;
355 case MATCH_MIDDLE_ONLY:
356 r->match = regex_match_middle;
357 break;
358 case MATCH_END_ONLY:
359 r->match = regex_match_end;
360 break;
361 }
362
363 pred->not ^= not;
364}
365
253/* return 1 if event matches, 0 otherwise (discard) */ 366/* return 1 if event matches, 0 otherwise (discard) */
254int filter_match_preds(struct ftrace_event_call *call, void *rec) 367int filter_match_preds(struct event_filter *filter, void *rec)
255{ 368{
256 struct event_filter *filter = call->filter;
257 int match, top = 0, val1 = 0, val2 = 0; 369 int match, top = 0, val1 = 0, val2 = 0;
258 int stack[MAX_FILTER_PRED]; 370 int stack[MAX_FILTER_PRED];
259 struct filter_pred *pred; 371 struct filter_pred *pred;
@@ -396,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred)
396{ 508{
397 kfree(pred->field_name); 509 kfree(pred->field_name);
398 pred->field_name = NULL; 510 pred->field_name = NULL;
399 pred->str_len = 0; 511 pred->regex.len = 0;
400} 512}
401 513
402static int filter_set_pred(struct filter_pred *dest, 514static int filter_set_pred(struct filter_pred *dest,
@@ -426,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
426 filter->preds[i]->fn = filter_pred_none; 538 filter->preds[i]->fn = filter_pred_none;
427} 539}
428 540
429void destroy_preds(struct ftrace_event_call *call) 541static void __free_preds(struct event_filter *filter)
430{ 542{
431 struct event_filter *filter = call->filter;
432 int i; 543 int i;
433 544
434 if (!filter) 545 if (!filter)
@@ -441,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
441 kfree(filter->preds); 552 kfree(filter->preds);
442 kfree(filter->filter_string); 553 kfree(filter->filter_string);
443 kfree(filter); 554 kfree(filter);
555}
556
557void destroy_preds(struct ftrace_event_call *call)
558{
559 __free_preds(call->filter);
444 call->filter = NULL; 560 call->filter = NULL;
561 call->filter_active = 0;
445} 562}
446 563
447static int init_preds(struct ftrace_event_call *call) 564static struct event_filter *__alloc_preds(void)
448{ 565{
449 struct event_filter *filter; 566 struct event_filter *filter;
450 struct filter_pred *pred; 567 struct filter_pred *pred;
451 int i; 568 int i;
452 569
453 if (call->filter) 570 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
454 return 0; 571 if (!filter)
455 572 return ERR_PTR(-ENOMEM);
456 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
457 if (!call->filter)
458 return -ENOMEM;
459 573
460 filter->n_preds = 0; 574 filter->n_preds = 0;
461 575
@@ -471,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
471 filter->preds[i] = pred; 585 filter->preds[i] = pred;
472 } 586 }
473 587
474 return 0; 588 return filter;
475 589
476oom: 590oom:
477 destroy_preds(call); 591 __free_preds(filter);
592 return ERR_PTR(-ENOMEM);
593}
478 594
479 return -ENOMEM; 595static int init_preds(struct ftrace_event_call *call)
596{
597 if (call->filter)
598 return 0;
599
600 call->filter_active = 0;
601 call->filter = __alloc_preds();
602 if (IS_ERR(call->filter))
603 return PTR_ERR(call->filter);
604
605 return 0;
480} 606}
481 607
482static int init_subsystem_preds(struct event_subsystem *system) 608static int init_subsystem_preds(struct event_subsystem *system)
@@ -499,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
499 return 0; 625 return 0;
500} 626}
501 627
502enum { 628static void filter_free_subsystem_preds(struct event_subsystem *system)
503 FILTER_DISABLE_ALL,
504 FILTER_INIT_NO_RESET,
505 FILTER_SKIP_NO_RESET,
506};
507
508static void filter_free_subsystem_preds(struct event_subsystem *system,
509 int flag)
510{ 629{
511 struct ftrace_event_call *call; 630 struct ftrace_event_call *call;
512 631
@@ -517,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
517 if (strcmp(call->system, system->name) != 0) 636 if (strcmp(call->system, system->name) != 0)
518 continue; 637 continue;
519 638
520 if (flag == FILTER_INIT_NO_RESET) {
521 call->filter->no_reset = false;
522 continue;
523 }
524
525 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
526 continue;
527
528 filter_disable_preds(call); 639 filter_disable_preds(call);
529 remove_filter_string(call->filter); 640 remove_filter_string(call->filter);
530 } 641 }
@@ -532,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
532 643
533static int filter_add_pred_fn(struct filter_parse_state *ps, 644static int filter_add_pred_fn(struct filter_parse_state *ps,
534 struct ftrace_event_call *call, 645 struct ftrace_event_call *call,
646 struct event_filter *filter,
535 struct filter_pred *pred, 647 struct filter_pred *pred,
536 filter_pred_fn_t fn) 648 filter_pred_fn_t fn)
537{ 649{
538 struct event_filter *filter = call->filter;
539 int idx, err; 650 int idx, err;
540 651
541 if (filter->n_preds == MAX_FILTER_PRED) { 652 if (filter->n_preds == MAX_FILTER_PRED) {
@@ -550,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
550 return err; 661 return err;
551 662
552 filter->n_preds++; 663 filter->n_preds++;
553 call->filter_active = 1;
554 664
555 return 0; 665 return 0;
556} 666}
@@ -575,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field)
575 685
576static int is_legal_op(struct ftrace_event_field *field, int op) 686static int is_legal_op(struct ftrace_event_field *field, int op)
577{ 687{
578 if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) 688 if (is_string_field(field) &&
689 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
690 return 0;
691 if (!is_string_field(field) && op == OP_GLOB)
579 return 0; 692 return 0;
580 693
581 return 1; 694 return 1;
@@ -626,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
626 739
627static int filter_add_pred(struct filter_parse_state *ps, 740static int filter_add_pred(struct filter_parse_state *ps,
628 struct ftrace_event_call *call, 741 struct ftrace_event_call *call,
742 struct event_filter *filter,
629 struct filter_pred *pred, 743 struct filter_pred *pred,
630 bool dry_run) 744 bool dry_run)
631{ 745{
@@ -660,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps,
660 } 774 }
661 775
662 if (is_string_field(field)) { 776 if (is_string_field(field)) {
663 pred->str_len = field->size; 777 filter_build_regex(pred);
664 778
665 if (field->filter_type == FILTER_STATIC_STRING) 779 if (field->filter_type == FILTER_STATIC_STRING) {
666 fn = filter_pred_string; 780 fn = filter_pred_string;
667 else if (field->filter_type == FILTER_DYN_STRING) 781 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING)
668 fn = filter_pred_strloc; 783 fn = filter_pred_strloc;
669 else { 784 else {
670 fn = filter_pred_pchar; 785 fn = filter_pred_pchar;
671 pred->str_len = strlen(pred->str_val); 786 pred->regex.field_len = strlen(pred->regex.pattern);
672 } 787 }
673 } else { 788 } else {
674 if (field->is_signed) 789 if (field->is_signed)
675 ret = strict_strtoll(pred->str_val, 0, &val); 790 ret = strict_strtoll(pred->regex.pattern, 0, &val);
676 else 791 else
677 ret = strict_strtoull(pred->str_val, 0, &val); 792 ret = strict_strtoull(pred->regex.pattern, 0, &val);
678 if (ret) { 793 if (ret) {
679 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 794 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
680 return -EINVAL; 795 return -EINVAL;
@@ -694,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
694 809
695add_pred_fn: 810add_pred_fn:
696 if (!dry_run) 811 if (!dry_run)
697 return filter_add_pred_fn(ps, call, pred, fn); 812 return filter_add_pred_fn(ps, call, filter, pred, fn);
698 return 0;
699}
700
701static int filter_add_subsystem_pred(struct filter_parse_state *ps,
702 struct event_subsystem *system,
703 struct filter_pred *pred,
704 char *filter_string,
705 bool dry_run)
706{
707 struct ftrace_event_call *call;
708 int err = 0;
709 bool fail = true;
710
711 list_for_each_entry(call, &ftrace_events, list) {
712
713 if (!call->define_fields)
714 continue;
715
716 if (strcmp(call->system, system->name))
717 continue;
718
719 if (call->filter->no_reset)
720 continue;
721
722 err = filter_add_pred(ps, call, pred, dry_run);
723 if (err)
724 call->filter->no_reset = true;
725 else
726 fail = false;
727
728 if (!dry_run)
729 replace_filter_string(call->filter, filter_string);
730 }
731
732 if (fail) {
733 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
734 return err;
735 }
736 return 0; 813 return 0;
737} 814}
738 815
@@ -1045,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
1045 return NULL; 1122 return NULL;
1046 } 1123 }
1047 1124
1048 strcpy(pred->str_val, operand2); 1125 strcpy(pred->regex.pattern, operand2);
1049 pred->str_len = strlen(operand2); 1126 pred->regex.len = strlen(pred->regex.pattern);
1050 1127
1051 pred->op = op; 1128 pred->op = op;
1052 1129
@@ -1090,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps)
1090 return 0; 1167 return 0;
1091} 1168}
1092 1169
1093static int replace_preds(struct event_subsystem *system, 1170static int replace_preds(struct ftrace_event_call *call,
1094 struct ftrace_event_call *call, 1171 struct event_filter *filter,
1095 struct filter_parse_state *ps, 1172 struct filter_parse_state *ps,
1096 char *filter_string, 1173 char *filter_string,
1097 bool dry_run) 1174 bool dry_run)
@@ -1138,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system,
1138add_pred: 1215add_pred:
1139 if (!pred) 1216 if (!pred)
1140 return -ENOMEM; 1217 return -ENOMEM;
1141 if (call) 1218 err = filter_add_pred(ps, call, filter, pred, dry_run);
1142 err = filter_add_pred(ps, call, pred, false);
1143 else
1144 err = filter_add_subsystem_pred(ps, system, pred,
1145 filter_string, dry_run);
1146 filter_free_pred(pred); 1219 filter_free_pred(pred);
1147 if (err) 1220 if (err)
1148 return err; 1221 return err;
@@ -1153,10 +1226,50 @@ add_pred:
1153 return 0; 1226 return 0;
1154} 1227}
1155 1228
1156int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1229static int replace_system_preds(struct event_subsystem *system,
1230 struct filter_parse_state *ps,
1231 char *filter_string)
1157{ 1232{
1233 struct ftrace_event_call *call;
1234 bool fail = true;
1158 int err; 1235 int err;
1159 1236
1237 list_for_each_entry(call, &ftrace_events, list) {
1238 struct event_filter *filter = call->filter;
1239
1240 if (!call->define_fields)
1241 continue;
1242
1243 if (strcmp(call->system, system->name) != 0)
1244 continue;
1245
1246 /* try to see if the filter can be applied */
1247 err = replace_preds(call, filter, ps, filter_string, true);
1248 if (err)
1249 continue;
1250
1251 /* really apply the filter */
1252 filter_disable_preds(call);
1253 err = replace_preds(call, filter, ps, filter_string, false);
1254 if (err)
1255 filter_disable_preds(call);
1256 else {
1257 call->filter_active = 1;
1258 replace_filter_string(filter, filter_string);
1259 }
1260 fail = false;
1261 }
1262
1263 if (fail) {
1264 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1265 return -EINVAL;
1266 }
1267 return 0;
1268}
1269
1270int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1271{
1272 int err;
1160 struct filter_parse_state *ps; 1273 struct filter_parse_state *ps;
1161 1274
1162 mutex_lock(&event_mutex); 1275 mutex_lock(&event_mutex);
@@ -1168,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1168 if (!strcmp(strstrip(filter_string), "0")) { 1281 if (!strcmp(strstrip(filter_string), "0")) {
1169 filter_disable_preds(call); 1282 filter_disable_preds(call);
1170 remove_filter_string(call->filter); 1283 remove_filter_string(call->filter);
1171 mutex_unlock(&event_mutex); 1284 goto out_unlock;
1172 return 0;
1173 } 1285 }
1174 1286
1175 err = -ENOMEM; 1287 err = -ENOMEM;
@@ -1187,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1187 goto out; 1299 goto out;
1188 } 1300 }
1189 1301
1190 err = replace_preds(NULL, call, ps, filter_string, false); 1302 err = replace_preds(call, call->filter, ps, filter_string, false);
1191 if (err) 1303 if (err)
1192 append_filter_err(ps, call->filter); 1304 append_filter_err(ps, call->filter);
1193 1305 else
1306 call->filter_active = 1;
1194out: 1307out:
1195 filter_opstack_clear(ps); 1308 filter_opstack_clear(ps);
1196 postfix_clear(ps); 1309 postfix_clear(ps);
@@ -1205,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1205 char *filter_string) 1318 char *filter_string)
1206{ 1319{
1207 int err; 1320 int err;
1208
1209 struct filter_parse_state *ps; 1321 struct filter_parse_state *ps;
1210 1322
1211 mutex_lock(&event_mutex); 1323 mutex_lock(&event_mutex);
@@ -1215,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1215 goto out_unlock; 1327 goto out_unlock;
1216 1328
1217 if (!strcmp(strstrip(filter_string), "0")) { 1329 if (!strcmp(strstrip(filter_string), "0")) {
1218 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); 1330 filter_free_subsystem_preds(system);
1219 remove_filter_string(system->filter); 1331 remove_filter_string(system->filter);
1220 mutex_unlock(&event_mutex); 1332 goto out_unlock;
1221 return 0;
1222 } 1333 }
1223 1334
1224 err = -ENOMEM; 1335 err = -ENOMEM;
@@ -1235,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1235 goto out; 1346 goto out;
1236 } 1347 }
1237 1348
1238 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); 1349 err = replace_system_preds(system, ps, filter_string);
1239 1350 if (err)
1240 /* try to see the filter can be applied to which events */
1241 err = replace_preds(system, NULL, ps, filter_string, true);
1242 if (err) {
1243 append_filter_err(ps, system->filter); 1351 append_filter_err(ps, system->filter);
1244 goto out; 1352
1353out:
1354 filter_opstack_clear(ps);
1355 postfix_clear(ps);
1356 kfree(ps);
1357out_unlock:
1358 mutex_unlock(&event_mutex);
1359
1360 return err;
1361}
1362
1363#ifdef CONFIG_EVENT_PROFILE
1364
1365void ftrace_profile_free_filter(struct perf_event *event)
1366{
1367 struct event_filter *filter = event->filter;
1368
1369 event->filter = NULL;
1370 __free_preds(filter);
1371}
1372
1373int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1374 char *filter_str)
1375{
1376 int err;
1377 struct event_filter *filter;
1378 struct filter_parse_state *ps;
1379 struct ftrace_event_call *call = NULL;
1380
1381 mutex_lock(&event_mutex);
1382
1383 list_for_each_entry(call, &ftrace_events, list) {
1384 if (call->id == event_id)
1385 break;
1245 } 1386 }
1246 1387
1247 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); 1388 err = -EINVAL;
1389 if (!call)
1390 goto out_unlock;
1248 1391
1249 /* really apply the filter to the events */ 1392 err = -EEXIST;
1250 err = replace_preds(system, NULL, ps, filter_string, false); 1393 if (event->filter)
1251 if (err) { 1394 goto out_unlock;
1252 append_filter_err(ps, system->filter); 1395
1253 filter_free_subsystem_preds(system, 2); 1396 filter = __alloc_preds();
1397 if (IS_ERR(filter)) {
1398 err = PTR_ERR(filter);
1399 goto out_unlock;
1254 } 1400 }
1255 1401
1256out: 1402 err = -ENOMEM;
1403 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1404 if (!ps)
1405 goto free_preds;
1406
1407 parse_init(ps, filter_ops, filter_str);
1408 err = filter_parse(ps);
1409 if (err)
1410 goto free_ps;
1411
1412 err = replace_preds(call, filter, ps, filter_str, false);
1413 if (!err)
1414 event->filter = filter;
1415
1416free_ps:
1257 filter_opstack_clear(ps); 1417 filter_opstack_clear(ps);
1258 postfix_clear(ps); 1418 postfix_clear(ps);
1259 kfree(ps); 1419 kfree(ps);
1420
1421free_preds:
1422 if (err)
1423 __free_preds(filter);
1424
1260out_unlock: 1425out_unlock:
1261 mutex_unlock(&event_mutex); 1426 mutex_unlock(&event_mutex);
1262 1427
1263 return err; 1428 return err;
1264} 1429}
1265 1430
1431#endif /* CONFIG_EVENT_PROFILE */
1432
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 9753fcc61bc5..dff8c84ddf17 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -48,11 +48,11 @@
48struct ____ftrace_##name { \ 48struct ____ftrace_##name { \
49 tstruct \ 49 tstruct \
50}; \ 50}; \
51static void __used ____ftrace_check_##name(void) \ 51static void __always_unused ____ftrace_check_##name(void) \
52{ \ 52{ \
53 struct ____ftrace_##name *__entry = NULL; \ 53 struct ____ftrace_##name *__entry = NULL; \
54 \ 54 \
55 /* force cmpile-time check on F_printk() */ \ 55 /* force compile-time check on F_printk() */ \
56 printk(print); \ 56 printk(print); \
57} 57}
58 58
@@ -66,44 +66,47 @@ static void __used ____ftrace_check_##name(void) \
66#undef __field 66#undef __field
67#define __field(type, item) \ 67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\n", \ 69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
71 sizeof(field.item)); \ 71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \ 72 if (!ret) \
73 return 0; 73 return 0;
74 74
75#undef __field_desc 75#undef __field_desc
76#define __field_desc(type, container, item) \ 76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \ 78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \ 79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \ 80 sizeof(field.container.item), \
81 is_signed_type(type)); \
81 if (!ret) \ 82 if (!ret) \
82 return 0; 83 return 0;
83 84
84#undef __array 85#undef __array
85#define __array(type, item, len) \ 86#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \ 88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
88 offsetof(typeof(field), item), \ 89 offsetof(typeof(field), item), \
89 sizeof(field.item)); \ 90 sizeof(field.item), is_signed_type(type)); \
90 if (!ret) \ 91 if (!ret) \
91 return 0; 92 return 0;
92 93
93#undef __array_desc 94#undef __array_desc
94#define __array_desc(type, container, item, len) \ 95#define __array_desc(type, container, item, len) \
95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
96 "offset:%zu;\tsize:%zu;\n", \ 97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
97 offsetof(typeof(field), container.item), \ 98 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \ 99 sizeof(field.container.item), \
100 is_signed_type(type)); \
99 if (!ret) \ 101 if (!ret) \
100 return 0; 102 return 0;
101 103
102#undef __dynamic_array 104#undef __dynamic_array
103#define __dynamic_array(type, item) \ 105#define __dynamic_array(type, item) \
104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \ 107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
106 offsetof(typeof(field), item)); \ 108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
107 if (!ret) \ 110 if (!ret) \
108 return 0; 111 return 0;
109 112
@@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
131 134
132#include "trace_entries.h" 135#include "trace_entries.h"
133 136
134
135#undef __field 137#undef __field
136#define __field(type, item) \ 138#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \ 139 ret = trace_define_field(event_call, #type, #item, \
@@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
193 195
194#include "trace_entries.h" 196#include "trace_entries.h"
195 197
198static int ftrace_raw_init_event(struct ftrace_event_call *call)
199{
200 INIT_LIST_HEAD(&call->fields);
201 return 0;
202}
196 203
197#undef __field 204#undef __field
198#define __field(type, item) 205#define __field(type, item)
@@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
211 218
212#undef FTRACE_ENTRY 219#undef FTRACE_ENTRY
213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 220#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
214static int ftrace_raw_init_event_##call(void); \
215 \ 221 \
216struct ftrace_event_call __used \ 222struct ftrace_event_call __used \
217__attribute__((__aligned__(4))) \ 223__attribute__((__aligned__(4))) \
@@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
219 .name = #call, \ 225 .name = #call, \
220 .id = type, \ 226 .id = type, \
221 .system = __stringify(TRACE_SYSTEM), \ 227 .system = __stringify(TRACE_SYSTEM), \
222 .raw_init = ftrace_raw_init_event_##call, \ 228 .raw_init = ftrace_raw_init_event, \
223 .show_format = ftrace_format_##call, \ 229 .show_format = ftrace_format_##call, \
224 .define_fields = ftrace_define_fields_##call, \ 230 .define_fields = ftrace_define_fields_##call, \
225}; \ 231}; \
226static int ftrace_raw_init_event_##call(void) \
227{ \
228 INIT_LIST_HEAD(&event_##call.fields); \
229 return 0; \
230} \
231 232
232#include "trace_entries.h" 233#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4d..a43d009c561a 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,9 +14,20 @@
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h" 15#include "trace_output.h"
16 16
17struct fgraph_data { 17struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore;
21};
22
23struct fgraph_data {
24 struct fgraph_cpu_data *cpu_data;
25
26 /* Place to preserve last processed entry. */
27 struct ftrace_graph_ent_entry ent;
28 struct ftrace_graph_ret_entry ret;
29 int failed;
30 int cpu;
20}; 31};
21 32
22#define TRACE_GRAPH_INDENT 2 33#define TRACE_GRAPH_INDENT 2
@@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
384 if (!data) 395 if (!data)
385 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
386 397
387 last_pid = &(per_cpu_ptr(data, cpu)->last_pid); 398 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
388 399
389 if (*last_pid == pid) 400 if (*last_pid == pid)
390 return TRACE_TYPE_HANDLED; 401 return TRACE_TYPE_HANDLED;
@@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry *
435get_return_for_leaf(struct trace_iterator *iter, 446get_return_for_leaf(struct trace_iterator *iter,
436 struct ftrace_graph_ent_entry *curr) 447 struct ftrace_graph_ent_entry *curr)
437{ 448{
438 struct ring_buffer_iter *ring_iter; 449 struct fgraph_data *data = iter->private;
450 struct ring_buffer_iter *ring_iter = NULL;
439 struct ring_buffer_event *event; 451 struct ring_buffer_event *event;
440 struct ftrace_graph_ret_entry *next; 452 struct ftrace_graph_ret_entry *next;
441 453
442 ring_iter = iter->buffer_iter[iter->cpu]; 454 /*
455 * If the previous output failed to write to the seq buffer,
456 * then we just reuse the data from before.
457 */
458 if (data && data->failed) {
459 curr = &data->ent;
460 next = &data->ret;
461 } else {
443 462
444 /* First peek to compare current entry and the next one */ 463 ring_iter = iter->buffer_iter[iter->cpu];
445 if (ring_iter) 464
446 event = ring_buffer_iter_peek(ring_iter, NULL); 465 /* First peek to compare current entry and the next one */
447 else { 466 if (ring_iter)
448 /* We need to consume the current entry to see the next one */ 467 event = ring_buffer_iter_peek(ring_iter, NULL);
449 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 468 else {
450 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 469 /*
451 NULL); 470 * We need to consume the current entry to see
452 } 471 * the next one.
472 */
473 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
474 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
475 NULL);
476 }
453 477
454 if (!event) 478 if (!event)
455 return NULL; 479 return NULL;
480
481 next = ring_buffer_event_data(event);
456 482
457 next = ring_buffer_event_data(event); 483 if (data) {
484 /*
485 * Save current and next entries for later reference
486 * if the output fails.
487 */
488 data->ent = *curr;
489 data->ret = *next;
490 }
491 }
458 492
459 if (next->ent.type != TRACE_GRAPH_RET) 493 if (next->ent.type != TRACE_GRAPH_RET)
460 return NULL; 494 return NULL;
@@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
640 674
641 if (data) { 675 if (data) {
642 int cpu = iter->cpu; 676 int cpu = iter->cpu;
643 int *depth = &(per_cpu_ptr(data, cpu)->depth); 677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
644 678
645 /* 679 /*
646 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
@@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
688 722
689 if (data) { 723 if (data) {
690 int cpu = iter->cpu; 724 int cpu = iter->cpu;
691 int *depth = &(per_cpu_ptr(data, cpu)->depth); 725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
692 726
693 *depth = call->depth; 727 *depth = call->depth;
694 } 728 }
@@ -782,19 +816,34 @@ static enum print_line_t
782print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 816print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
783 struct trace_iterator *iter) 817 struct trace_iterator *iter)
784{ 818{
785 int cpu = iter->cpu; 819 struct fgraph_data *data = iter->private;
786 struct ftrace_graph_ent *call = &field->graph_ent; 820 struct ftrace_graph_ent *call = &field->graph_ent;
787 struct ftrace_graph_ret_entry *leaf_ret; 821 struct ftrace_graph_ret_entry *leaf_ret;
822 static enum print_line_t ret;
823 int cpu = iter->cpu;
788 824
789 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 825 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
790 return TRACE_TYPE_PARTIAL_LINE; 826 return TRACE_TYPE_PARTIAL_LINE;
791 827
792 leaf_ret = get_return_for_leaf(iter, field); 828 leaf_ret = get_return_for_leaf(iter, field);
793 if (leaf_ret) 829 if (leaf_ret)
794 return print_graph_entry_leaf(iter, field, leaf_ret, s); 830 ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
795 else 831 else
796 return print_graph_entry_nested(iter, field, s, cpu); 832 ret = print_graph_entry_nested(iter, field, s, cpu);
797 833
834 if (data) {
835 /*
836 * If we failed to write our output, then we need to make
837 * note of it. Because we already consumed our entry.
838 */
839 if (s->full) {
840 data->failed = 1;
841 data->cpu = cpu;
842 } else
843 data->failed = 0;
844 }
845
846 return ret;
798} 847}
799 848
800static enum print_line_t 849static enum print_line_t
@@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
810 859
811 if (data) { 860 if (data) {
812 int cpu = iter->cpu; 861 int cpu = iter->cpu;
813 int *depth = &(per_cpu_ptr(data, cpu)->depth); 862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
814 863
815 /* 864 /*
816 * Comments display at + 1 to depth. This is the 865 * Comments display at + 1 to depth. This is the
@@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
873 int i; 922 int i;
874 923
875 if (data) 924 if (data)
876 depth = per_cpu_ptr(data, iter->cpu)->depth; 925 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
877 926
878 if (print_graph_prologue(iter, s, 0, 0)) 927 if (print_graph_prologue(iter, s, 0, 0))
879 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
941enum print_line_t 990enum print_line_t
942print_graph_function(struct trace_iterator *iter) 991print_graph_function(struct trace_iterator *iter)
943{ 992{
993 struct ftrace_graph_ent_entry *field;
994 struct fgraph_data *data = iter->private;
944 struct trace_entry *entry = iter->ent; 995 struct trace_entry *entry = iter->ent;
945 struct trace_seq *s = &iter->seq; 996 struct trace_seq *s = &iter->seq;
997 int cpu = iter->cpu;
998 int ret;
999
1000 if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
1001 per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
1002 return TRACE_TYPE_HANDLED;
1003 }
1004
1005 /*
1006 * If the last output failed, there's a possibility we need
1007 * to print out the missing entry which would never go out.
1008 */
1009 if (data && data->failed) {
1010 field = &data->ent;
1011 iter->cpu = data->cpu;
1012 ret = print_graph_entry(field, s, iter);
1013 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1014 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1015 ret = TRACE_TYPE_NO_CONSUME;
1016 }
1017 iter->cpu = cpu;
1018 return ret;
1019 }
946 1020
947 switch (entry->type) { 1021 switch (entry->type) {
948 case TRACE_GRAPH_ENT: { 1022 case TRACE_GRAPH_ENT: {
@@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter)
952 * sizeof(struct ftrace_graph_ent_entry) is very small, 1026 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * it can be safely saved at the stack. 1027 * it can be safely saved at the stack.
954 */ 1028 */
955 struct ftrace_graph_ent_entry *field, saved; 1029 struct ftrace_graph_ent_entry saved;
956 trace_assign_type(field, entry); 1030 trace_assign_type(field, entry);
957 saved = *field; 1031 saved = *field;
958 return print_graph_entry(&saved, s, iter); 1032 return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s)
1030static void graph_trace_open(struct trace_iterator *iter) 1104static void graph_trace_open(struct trace_iterator *iter)
1031{ 1105{
1032 /* pid and depth on the last trace processed */ 1106 /* pid and depth on the last trace processed */
1033 struct fgraph_data *data = alloc_percpu(struct fgraph_data); 1107 struct fgraph_data *data;
1034 int cpu; 1108 int cpu;
1035 1109
1110 iter->private = NULL;
1111
1112 data = kzalloc(sizeof(*data), GFP_KERNEL);
1036 if (!data) 1113 if (!data)
1037 pr_warning("function graph tracer: not enough memory\n"); 1114 goto out_err;
1038 else 1115
1039 for_each_possible_cpu(cpu) { 1116 data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
1040 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); 1117 if (!data->cpu_data)
1041 int *depth = &(per_cpu_ptr(data, cpu)->depth); 1118 goto out_err_free;
1042 *pid = -1; 1119
1043 *depth = 0; 1120 for_each_possible_cpu(cpu) {
1044 } 1121 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1122 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1123 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1124 *pid = -1;
1125 *depth = 0;
1126 *ignore = 0;
1127 }
1045 1128
1046 iter->private = data; 1129 iter->private = data;
1130
1131 return;
1132
1133 out_err_free:
1134 kfree(data);
1135 out_err:
1136 pr_warning("function graph tracer: not enough memory\n");
1047} 1137}
1048 1138
1049static void graph_trace_close(struct trace_iterator *iter) 1139static void graph_trace_close(struct trace_iterator *iter)
1050{ 1140{
1051 free_percpu(iter->private); 1141 struct fgraph_data *data = iter->private;
1142
1143 if (data) {
1144 free_percpu(data->cpu_data);
1145 kfree(data);
1146 }
1052} 1147}
1053 1148
1054static struct tracer graph_trace __read_mostly = { 1149static struct tracer graph_trace __read_mostly = {
1055 .name = "function_graph", 1150 .name = "function_graph",
1056 .open = graph_trace_open, 1151 .open = graph_trace_open,
1152 .pipe_open = graph_trace_open,
1057 .close = graph_trace_close, 1153 .close = graph_trace_close,
1154 .pipe_close = graph_trace_close,
1058 .wait_pipe = poll_wait_pipe, 1155 .wait_pipe = poll_wait_pipe,
1059 .init = graph_trace_init, 1156 .init = graph_trace_init,
1060 .reset = graph_trace_reset, 1157 .reset = graph_trace_reset,
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..b52d397e57eb
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1542 @@
1/*
2 * Kprobes-based tracing events
3 *
4 * Created by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32
33#include "trace.h"
34#include "trace_output.h"
35
36#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64
39#define KPROBE_EVENT_SYSTEM "kprobes"
40
41/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func"
46
47const char *reserved_field_names[] = {
48 "common_type",
49 "common_flags",
50 "common_preempt_count",
51 "common_pid",
52 "common_tgid",
53 "common_lock_depth",
54 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC,
58};
59
60struct fetch_func {
61 unsigned long (*func)(struct pt_regs *, void *);
62 void *data;
63};
64
65static __kprobes unsigned long call_fetch(struct fetch_func *f,
66 struct pt_regs *regs)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{
81 return regs_get_kernel_stack_nth(regs,
82 (unsigned int)((unsigned long)num));
83}
84
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
86{
87 unsigned long retval;
88
89 if (probe_kernel_address(addr, retval))
90 return 0;
91 return retval;
92}
93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy)
101{
102 return regs_return_value(regs);
103}
104
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
106 void *dummy)
107{
108 return kernel_stack_pointer(regs);
109}
110
111/* Memory fetching by symbol */
112struct symbol_cache {
113 char *symbol;
114 long offset;
115 unsigned long addr;
116};
117
118static unsigned long update_symbol_cache(struct symbol_cache *sc)
119{
120 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
121 if (sc->addr)
122 sc->addr += sc->offset;
123 return sc->addr;
124}
125
126static void free_symbol_cache(struct symbol_cache *sc)
127{
128 kfree(sc->symbol);
129 kfree(sc);
130}
131
132static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
133{
134 struct symbol_cache *sc;
135
136 if (!sym || strlen(sym) == 0)
137 return NULL;
138 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
139 if (!sc)
140 return NULL;
141
142 sc->symbol = kstrdup(sym, GFP_KERNEL);
143 if (!sc->symbol) {
144 kfree(sc);
145 return NULL;
146 }
147 sc->offset = offset;
148
149 update_symbol_cache(sc);
150 return sc;
151}
152
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
154{
155 struct symbol_cache *sc = data;
156
157 if (sc->addr)
158 return fetch_memory(regs, (void *)sc->addr);
159 else
160 return 0;
161}
162
163/* Special indirect memory access interface */
164struct indirect_fetch_data {
165 struct fetch_func orig;
166 long offset;
167};
168
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
170{
171 struct indirect_fetch_data *ind = data;
172 unsigned long addr;
173
174 addr = call_fetch(&ind->orig, regs);
175 if (addr) {
176 addr += ind->offset;
177 return fetch_memory(regs, (void *)addr);
178 } else
179 return 0;
180}
181
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
183{
184 if (data->orig.func == fetch_indirect)
185 free_indirect_fetch_data(data->orig.data);
186 else if (data->orig.func == fetch_symbol)
187 free_symbol_cache(data->orig.data);
188 kfree(data);
189}
190
191/**
192 * Kprobe event core functions
193 */
194
195struct probe_arg {
196 struct fetch_func fetch;
197 const char *name;
198};
199
200/* Flags for trace_probe */
201#define TP_FLAG_TRACE 1
202#define TP_FLAG_PROFILE 2
203
204struct trace_probe {
205 struct list_head list;
206 struct kretprobe rp; /* Use rp.kp for kprobe use */
207 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */
210 struct ftrace_event_call call;
211 struct trace_event event;
212 unsigned int nr_args;
213 struct probe_arg args[];
214};
215
216#define SIZEOF_TRACE_PROBE(n) \
217 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n)))
219
220static __kprobes int probe_is_return(struct trace_probe *tp)
221{
222 return tp->rp.handler != NULL;
223}
224
225static __kprobes const char *probe_symbol(struct trace_probe *tp)
226{
227 return tp->symbol ? tp->symbol : "unknown";
228}
229
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
251 } else if (ff->func == fetch_retvalue)
252 ret = snprintf(buf, n, "$retval");
253 else if (ff->func == fetch_stack_address)
254 ret = snprintf(buf, n, "$stack");
255 else if (ff->func == fetch_indirect) {
256 struct indirect_fetch_data *id = ff->data;
257 size_t l = 0;
258 ret = snprintf(buf, n, "%+ld(", id->offset);
259 if (ret >= n)
260 goto end;
261 l += ret;
262 ret = probe_arg_string(buf + l, n - l, &id->orig);
263 if (ret < 0)
264 goto end;
265 l += ret;
266 ret = snprintf(buf + l, n - l, ")");
267 ret += l;
268 }
269end:
270 if (ret >= n)
271 return -ENOSPC;
272 return ret;
273}
274
275static int register_probe_event(struct trace_probe *tp);
276static void unregister_probe_event(struct trace_probe *tp);
277
278static DEFINE_MUTEX(probe_lock);
279static LIST_HEAD(probe_list);
280
281static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs);
284
285/*
286 * Allocate new trace_probe and initialize it (including kprobes).
287 */
288static struct trace_probe *alloc_trace_probe(const char *group,
289 const char *event,
290 void *addr,
291 const char *symbol,
292 unsigned long offs,
293 int nargs, int is_return)
294{
295 struct trace_probe *tp;
296
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp)
299 return ERR_PTR(-ENOMEM);
300
301 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL);
303 if (!tp->symbol)
304 goto error;
305 tp->rp.kp.symbol_name = tp->symbol;
306 tp->rp.kp.offset = offs;
307 } else
308 tp->rp.kp.addr = addr;
309
310 if (is_return)
311 tp->rp.handler = kretprobe_dispatcher;
312 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher;
314
315 if (!event)
316 goto error;
317 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name)
319 goto error;
320
321 if (!group)
322 goto error;
323 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system)
325 goto error;
326
327 INIT_LIST_HEAD(&tp->list);
328 return tp;
329error:
330 kfree(tp->call.name);
331 kfree(tp->symbol);
332 kfree(tp);
333 return ERR_PTR(-ENOMEM);
334}
335
336static void free_probe_arg(struct probe_arg *arg)
337{
338 if (arg->fetch.func == fetch_symbol)
339 free_symbol_cache(arg->fetch.data);
340 else if (arg->fetch.func == fetch_indirect)
341 free_indirect_fetch_data(arg->fetch.data);
342 kfree(arg->name);
343}
344
345static void free_trace_probe(struct trace_probe *tp)
346{
347 int i;
348
349 for (i = 0; i < tp->nr_args; i++)
350 free_probe_arg(&tp->args[i]);
351
352 kfree(tp->call.system);
353 kfree(tp->call.name);
354 kfree(tp->symbol);
355 kfree(tp);
356}
357
358static struct trace_probe *find_probe_event(const char *event,
359 const char *group)
360{
361 struct trace_probe *tp;
362
363 list_for_each_entry(tp, &probe_list, list)
364 if (strcmp(tp->call.name, event) == 0 &&
365 strcmp(tp->call.system, group) == 0)
366 return tp;
367 return NULL;
368}
369
370/* Unregister a trace_probe and probe_event: call with locking probe_lock */
371static void unregister_trace_probe(struct trace_probe *tp)
372{
373 if (probe_is_return(tp))
374 unregister_kretprobe(&tp->rp);
375 else
376 unregister_kprobe(&tp->rp.kp);
377 list_del(&tp->list);
378 unregister_probe_event(tp);
379}
380
381/* Register a trace_probe and probe_event */
382static int register_trace_probe(struct trace_probe *tp)
383{
384 struct trace_probe *old_tp;
385 int ret;
386
387 mutex_lock(&probe_lock);
388
389 /* register as an event */
390 old_tp = find_probe_event(tp->call.name, tp->call.system);
391 if (old_tp) {
392 /* delete old event */
393 unregister_trace_probe(old_tp);
394 free_trace_probe(old_tp);
395 }
396 ret = register_probe_event(tp);
397 if (ret) {
398 pr_warning("Faild to register probe event(%d)\n", ret);
399 goto end;
400 }
401
402 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
403 if (probe_is_return(tp))
404 ret = register_kretprobe(&tp->rp);
405 else
406 ret = register_kprobe(&tp->rp.kp);
407
408 if (ret) {
409 pr_warning("Could not insert probe(%d)\n", ret);
410 if (ret == -EILSEQ) {
411 pr_warning("Probing address(0x%p) is not an "
412 "instruction boundary.\n",
413 tp->rp.kp.addr);
414 ret = -EINVAL;
415 }
416 unregister_probe_event(tp);
417 } else
418 list_add_tail(&tp->list, &probe_list);
419end:
420 mutex_unlock(&probe_lock);
421 return ret;
422}
423
424/* Split symbol and offset. */
425static int split_symbol_offset(char *symbol, unsigned long *offset)
426{
427 char *tmp;
428 int ret;
429
430 if (!offset)
431 return -EINVAL;
432
433 tmp = strchr(symbol, '+');
434 if (tmp) {
435 /* skip sign because strict_strtol doesn't accept '+' */
436 ret = strict_strtoul(tmp + 1, 0, offset);
437 if (ret)
438 return ret;
439 *tmp = '\0';
440 } else
441 *offset = 0;
442 return 0;
443}
444
445#define PARAM_MAX_ARGS 16
446#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
447
448static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
449{
450 int ret = 0;
451 unsigned long param;
452
453 if (strcmp(arg, "retval") == 0) {
454 if (is_return) {
455 ff->func = fetch_retvalue;
456 ff->data = NULL;
457 } else
458 ret = -EINVAL;
459 } else if (strncmp(arg, "stack", 5) == 0) {
460 if (arg[5] == '\0') {
461 ff->func = fetch_stack_address;
462 ff->data = NULL;
463 } else if (isdigit(arg[5])) {
464 ret = strict_strtoul(arg + 5, 10, &param);
465 if (ret || param > PARAM_MAX_STACK)
466 ret = -EINVAL;
467 else {
468 ff->func = fetch_stack;
469 ff->data = (void *)param;
470 }
471 } else
472 ret = -EINVAL;
473 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
474 ret = strict_strtoul(arg + 3, 10, &param);
475 if (ret || param > PARAM_MAX_ARGS)
476 ret = -EINVAL;
477 else {
478 ff->func = fetch_argument;
479 ff->data = (void *)param;
480 }
481 } else
482 ret = -EINVAL;
483 return ret;
484}
485
486/* Recursive argument parser */
487static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
488{
489 int ret = 0;
490 unsigned long param;
491 long offset;
492 char *tmp;
493
494 switch (arg[0]) {
495 case '$':
496 ret = parse_probe_vars(arg + 1, ff, is_return);
497 break;
498 case '%': /* named register */
499 ret = regs_query_register_offset(arg + 1);
500 if (ret >= 0) {
501 ff->func = fetch_register;
502 ff->data = (void *)(unsigned long)ret;
503 ret = 0;
504 }
505 break;
506 case '@': /* memory or symbol */
507 if (isdigit(arg[1])) {
508 ret = strict_strtoul(arg + 1, 0, &param);
509 if (ret)
510 break;
511 ff->func = fetch_memory;
512 ff->data = (void *)param;
513 } else {
514 ret = split_symbol_offset(arg + 1, &offset);
515 if (ret)
516 break;
517 ff->data = alloc_symbol_cache(arg + 1, offset);
518 if (ff->data)
519 ff->func = fetch_symbol;
520 else
521 ret = -EINVAL;
522 }
523 break;
524 case '+': /* indirect memory */
525 case '-':
526 tmp = strchr(arg, '(');
527 if (!tmp) {
528 ret = -EINVAL;
529 break;
530 }
531 *tmp = '\0';
532 ret = strict_strtol(arg + 1, 0, &offset);
533 if (ret)
534 break;
535 if (arg[0] == '-')
536 offset = -offset;
537 arg = tmp + 1;
538 tmp = strrchr(arg, ')');
539 if (tmp) {
540 struct indirect_fetch_data *id;
541 *tmp = '\0';
542 id = kzalloc(sizeof(struct indirect_fetch_data),
543 GFP_KERNEL);
544 if (!id)
545 return -ENOMEM;
546 id->offset = offset;
547 ret = __parse_probe_arg(arg, &id->orig, is_return);
548 if (ret)
549 kfree(id);
550 else {
551 ff->func = fetch_indirect;
552 ff->data = (void *)id;
553 }
554 } else
555 ret = -EINVAL;
556 break;
557 default:
558 /* TODO: support custom handler */
559 ret = -EINVAL;
560 }
561 return ret;
562}
563
564/* String length checking wrapper */
565static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
566{
567 if (strlen(arg) > MAX_ARGSTR_LEN) {
568 pr_info("Argument is too long.: %s\n", arg);
569 return -ENOSPC;
570 }
571 return __parse_probe_arg(arg, ff, is_return);
572}
573
574/* Return 1 if name is reserved or already used by another argument */
575static int conflict_field_name(const char *name,
576 struct probe_arg *args, int narg)
577{
578 int i;
579 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
580 if (strcmp(reserved_field_names[i], name) == 0)
581 return 1;
582 for (i = 0; i < narg; i++)
583 if (strcmp(args[i].name, name) == 0)
584 return 1;
585 return 0;
586}
587
588static int create_trace_probe(int argc, char **argv)
589{
590 /*
591 * Argument syntax:
592 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
593 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
594 * Fetch args:
595 * $argN : fetch Nth of function argument. (N:0-)
596 * $retval : fetch return value
597 * $stack : fetch stack address
598 * $stackN : fetch Nth of stack (N:0-)
599 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
600 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
601 * %REG : fetch register REG
602 * Indirect memory fetch:
603 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
604 * Alias name of args:
605 * NAME=FETCHARG : set NAME as alias of FETCHARG.
606 */
607 struct trace_probe *tp;
608 int i, ret = 0;
609 int is_return = 0, is_delete = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0;
612 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN];
614
615 /* argc must be >= 1 */
616 if (argv[0][0] == 'p')
617 is_return = 0;
618 else if (argv[0][0] == 'r')
619 is_return = 1;
620 else if (argv[0][0] == '-')
621 is_delete = 1;
622 else {
623 pr_info("Probe definition must be started with 'p', 'r' or"
624 " '-'.\n");
625 return -EINVAL;
626 }
627
628 if (argv[0][1] == ':') {
629 event = &argv[0][2];
630 if (strchr(event, '/')) {
631 group = event;
632 event = strchr(group, '/') + 1;
633 event[-1] = '\0';
634 if (strlen(group) == 0) {
635 pr_info("Group name is not specifiled\n");
636 return -EINVAL;
637 }
638 }
639 if (strlen(event) == 0) {
640 pr_info("Event name is not specifiled\n");
641 return -EINVAL;
642 }
643 }
644 if (!group)
645 group = KPROBE_EVENT_SYSTEM;
646
647 if (is_delete) {
648 if (!event) {
649 pr_info("Delete command needs an event name.\n");
650 return -EINVAL;
651 }
652 tp = find_probe_event(event, group);
653 if (!tp) {
654 pr_info("Event %s/%s doesn't exist.\n", group, event);
655 return -ENOENT;
656 }
657 /* delete an event */
658 unregister_trace_probe(tp);
659 free_trace_probe(tp);
660 return 0;
661 }
662
663 if (argc < 2) {
664 pr_info("Probe point is not specified.\n");
665 return -EINVAL;
666 }
667 if (isdigit(argv[1][0])) {
668 if (is_return) {
669 pr_info("Return probe point must be a symbol.\n");
670 return -EINVAL;
671 }
672 /* an address specified */
673 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
674 if (ret) {
675 pr_info("Failed to parse address.\n");
676 return ret;
677 }
678 } else {
679 /* a symbol specified */
680 symbol = argv[1];
681 /* TODO: support .init module functions */
682 ret = split_symbol_offset(symbol, &offset);
683 if (ret) {
684 pr_info("Failed to parse symbol.\n");
685 return ret;
686 }
687 if (offset && is_return) {
688 pr_info("Return probe must be used without offset.\n");
689 return -EINVAL;
690 }
691 }
692 argc -= 2; argv += 2;
693
694 /* setup a probe */
695 if (!event) {
696 /* Make a new event name */
697 if (symbol)
698 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
699 is_return ? 'r' : 'p', symbol, offset);
700 else
701 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
702 is_return ? 'r' : 'p', addr);
703 event = buf;
704 }
705 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
706 is_return);
707 if (IS_ERR(tp)) {
708 pr_info("Failed to allocate trace_probe.(%d)\n",
709 (int)PTR_ERR(tp));
710 return PTR_ERR(tp);
711 }
712
713 /* parse arguments */
714 ret = 0;
715 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
716 /* Parse argument name */
717 arg = strchr(argv[i], '=');
718 if (arg)
719 *arg++ = '\0';
720 else
721 arg = argv[i];
722
723 if (conflict_field_name(argv[i], tp->args, i)) {
724 pr_info("Argument%d name '%s' conflicts with "
725 "another field.\n", i, argv[i]);
726 ret = -EINVAL;
727 goto error;
728 }
729
730 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
731 if (!tp->args[i].name) {
732 pr_info("Failed to allocate argument%d name '%s'.\n",
733 i, argv[i]);
734 ret = -ENOMEM;
735 goto error;
736 }
737
738 /* Parse fetch argument */
739 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
740 if (ret) {
741 pr_info("Parse error at argument%d. (%d)\n", i, ret);
742 kfree(tp->args[i].name);
743 goto error;
744 }
745
746 tp->nr_args++;
747 }
748
749 ret = register_trace_probe(tp);
750 if (ret)
751 goto error;
752 return 0;
753
754error:
755 free_trace_probe(tp);
756 return ret;
757}
758
759static void cleanup_all_probes(void)
760{
761 struct trace_probe *tp;
762
763 mutex_lock(&probe_lock);
764 /* TODO: Use batch unregistration */
765 while (!list_empty(&probe_list)) {
766 tp = list_entry(probe_list.next, struct trace_probe, list);
767 unregister_trace_probe(tp);
768 free_trace_probe(tp);
769 }
770 mutex_unlock(&probe_lock);
771}
772
773
774/* Probes listing interfaces */
775static void *probes_seq_start(struct seq_file *m, loff_t *pos)
776{
777 mutex_lock(&probe_lock);
778 return seq_list_start(&probe_list, *pos);
779}
780
781static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
782{
783 return seq_list_next(v, &probe_list, pos);
784}
785
786static void probes_seq_stop(struct seq_file *m, void *v)
787{
788 mutex_unlock(&probe_lock);
789}
790
791static int probes_seq_show(struct seq_file *m, void *v)
792{
793 struct trace_probe *tp = v;
794 int i, ret;
795 char buf[MAX_ARGSTR_LEN + 1];
796
797 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
798 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
799
800 if (!tp->symbol)
801 seq_printf(m, " 0x%p", tp->rp.kp.addr);
802 else if (tp->rp.kp.offset)
803 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
804 else
805 seq_printf(m, " %s", probe_symbol(tp));
806
807 for (i = 0; i < tp->nr_args; i++) {
808 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
809 if (ret < 0) {
810 pr_warning("Argument%d decoding error(%d).\n", i, ret);
811 return ret;
812 }
813 seq_printf(m, " %s=%s", tp->args[i].name, buf);
814 }
815 seq_printf(m, "\n");
816 return 0;
817}
818
819static const struct seq_operations probes_seq_op = {
820 .start = probes_seq_start,
821 .next = probes_seq_next,
822 .stop = probes_seq_stop,
823 .show = probes_seq_show
824};
825
826static int probes_open(struct inode *inode, struct file *file)
827{
828 if ((file->f_mode & FMODE_WRITE) &&
829 (file->f_flags & O_TRUNC))
830 cleanup_all_probes();
831
832 return seq_open(file, &probes_seq_op);
833}
834
835static int command_trace_probe(const char *buf)
836{
837 char **argv;
838 int argc = 0, ret = 0;
839
840 argv = argv_split(GFP_KERNEL, buf, &argc);
841 if (!argv)
842 return -ENOMEM;
843
844 if (argc)
845 ret = create_trace_probe(argc, argv);
846
847 argv_free(argv);
848 return ret;
849}
850
851#define WRITE_BUFSIZE 128
852
853static ssize_t probes_write(struct file *file, const char __user *buffer,
854 size_t count, loff_t *ppos)
855{
856 char *kbuf, *tmp;
857 int ret;
858 size_t done;
859 size_t size;
860
861 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
862 if (!kbuf)
863 return -ENOMEM;
864
865 ret = done = 0;
866 while (done < count) {
867 size = count - done;
868 if (size >= WRITE_BUFSIZE)
869 size = WRITE_BUFSIZE - 1;
870 if (copy_from_user(kbuf, buffer + done, size)) {
871 ret = -EFAULT;
872 goto out;
873 }
874 kbuf[size] = '\0';
875 tmp = strchr(kbuf, '\n');
876 if (tmp) {
877 *tmp = '\0';
878 size = tmp - kbuf + 1;
879 } else if (done + size < count) {
880 pr_warning("Line length is too long: "
881 "Should be less than %d.", WRITE_BUFSIZE);
882 ret = -EINVAL;
883 goto out;
884 }
885 done += size;
886 /* Remove comments */
887 tmp = strchr(kbuf, '#');
888 if (tmp)
889 *tmp = '\0';
890
891 ret = command_trace_probe(kbuf);
892 if (ret)
893 goto out;
894 }
895 ret = done;
896out:
897 kfree(kbuf);
898 return ret;
899}
900
901static const struct file_operations kprobe_events_ops = {
902 .owner = THIS_MODULE,
903 .open = probes_open,
904 .read = seq_read,
905 .llseek = seq_lseek,
906 .release = seq_release,
907 .write = probes_write,
908};
909
910/* Probes profiling interfaces */
911static int probes_profile_seq_show(struct seq_file *m, void *v)
912{
913 struct trace_probe *tp = v;
914
915 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
916 tp->rp.kp.nmissed);
917
918 return 0;
919}
920
921static const struct seq_operations profile_seq_op = {
922 .start = probes_seq_start,
923 .next = probes_seq_next,
924 .stop = probes_seq_stop,
925 .show = probes_profile_seq_show
926};
927
928static int profile_open(struct inode *inode, struct file *file)
929{
930 return seq_open(file, &profile_seq_op);
931}
932
933static const struct file_operations kprobe_profile_ops = {
934 .owner = THIS_MODULE,
935 .open = profile_open,
936 .read = seq_read,
937 .llseek = seq_lseek,
938 .release = seq_release,
939};
940
941/* Kprobe handler */
942static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
943{
944 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
945 struct kprobe_trace_entry *entry;
946 struct ring_buffer_event *event;
947 struct ring_buffer *buffer;
948 int size, i, pc;
949 unsigned long irq_flags;
950 struct ftrace_event_call *call = &tp->call;
951
952 tp->nhit++;
953
954 local_save_flags(irq_flags);
955 pc = preempt_count();
956
957 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
958
959 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
960 irq_flags, pc);
961 if (!event)
962 return 0;
963
964 entry = ring_buffer_event_data(event);
965 entry->nargs = tp->nr_args;
966 entry->ip = (unsigned long)kp->addr;
967 for (i = 0; i < tp->nr_args; i++)
968 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
969
970 if (!filter_current_check_discard(buffer, call, entry, event))
971 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
972 return 0;
973}
974
975/* Kretprobe handler */
976static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
977 struct pt_regs *regs)
978{
979 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
980 struct kretprobe_trace_entry *entry;
981 struct ring_buffer_event *event;
982 struct ring_buffer *buffer;
983 int size, i, pc;
984 unsigned long irq_flags;
985 struct ftrace_event_call *call = &tp->call;
986
987 local_save_flags(irq_flags);
988 pc = preempt_count();
989
990 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
991
992 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
993 irq_flags, pc);
994 if (!event)
995 return 0;
996
997 entry = ring_buffer_event_data(event);
998 entry->nargs = tp->nr_args;
999 entry->func = (unsigned long)tp->rp.kp.addr;
1000 entry->ret_ip = (unsigned long)ri->ret_addr;
1001 for (i = 0; i < tp->nr_args; i++)
1002 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1003
1004 if (!filter_current_check_discard(buffer, call, entry, event))
1005 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1006
1007 return 0;
1008}
1009
1010/* Event entry printers */
1011enum print_line_t
1012print_kprobe_event(struct trace_iterator *iter, int flags)
1013{
1014 struct kprobe_trace_entry *field;
1015 struct trace_seq *s = &iter->seq;
1016 struct trace_event *event;
1017 struct trace_probe *tp;
1018 int i;
1019
1020 field = (struct kprobe_trace_entry *)iter->ent;
1021 event = ftrace_find_event(field->ent.type);
1022 tp = container_of(event, struct trace_probe, event);
1023
1024 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1025 goto partial;
1026
1027 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1028 goto partial;
1029
1030 if (!trace_seq_puts(s, ")"))
1031 goto partial;
1032
1033 for (i = 0; i < field->nargs; i++)
1034 if (!trace_seq_printf(s, " %s=%lx",
1035 tp->args[i].name, field->args[i]))
1036 goto partial;
1037
1038 if (!trace_seq_puts(s, "\n"))
1039 goto partial;
1040
1041 return TRACE_TYPE_HANDLED;
1042partial:
1043 return TRACE_TYPE_PARTIAL_LINE;
1044}
1045
1046enum print_line_t
1047print_kretprobe_event(struct trace_iterator *iter, int flags)
1048{
1049 struct kretprobe_trace_entry *field;
1050 struct trace_seq *s = &iter->seq;
1051 struct trace_event *event;
1052 struct trace_probe *tp;
1053 int i;
1054
1055 field = (struct kretprobe_trace_entry *)iter->ent;
1056 event = ftrace_find_event(field->ent.type);
1057 tp = container_of(event, struct trace_probe, event);
1058
1059 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1060 goto partial;
1061
1062 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1063 goto partial;
1064
1065 if (!trace_seq_puts(s, " <- "))
1066 goto partial;
1067
1068 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1069 goto partial;
1070
1071 if (!trace_seq_puts(s, ")"))
1072 goto partial;
1073
1074 for (i = 0; i < field->nargs; i++)
1075 if (!trace_seq_printf(s, " %s=%lx",
1076 tp->args[i].name, field->args[i]))
1077 goto partial;
1078
1079 if (!trace_seq_puts(s, "\n"))
1080 goto partial;
1081
1082 return TRACE_TYPE_HANDLED;
1083partial:
1084 return TRACE_TYPE_PARTIAL_LINE;
1085}
1086
1087static int probe_event_enable(struct ftrace_event_call *call)
1088{
1089 struct trace_probe *tp = (struct trace_probe *)call->data;
1090
1091 tp->flags |= TP_FLAG_TRACE;
1092 if (probe_is_return(tp))
1093 return enable_kretprobe(&tp->rp);
1094 else
1095 return enable_kprobe(&tp->rp.kp);
1096}
1097
1098static void probe_event_disable(struct ftrace_event_call *call)
1099{
1100 struct trace_probe *tp = (struct trace_probe *)call->data;
1101
1102 tp->flags &= ~TP_FLAG_TRACE;
1103 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1104 if (probe_is_return(tp))
1105 disable_kretprobe(&tp->rp);
1106 else
1107 disable_kprobe(&tp->rp.kp);
1108 }
1109}
1110
1111static int probe_event_raw_init(struct ftrace_event_call *event_call)
1112{
1113 INIT_LIST_HEAD(&event_call->fields);
1114
1115 return 0;
1116}
1117
1118#undef DEFINE_FIELD
1119#define DEFINE_FIELD(type, item, name, is_signed) \
1120 do { \
1121 ret = trace_define_field(event_call, #type, name, \
1122 offsetof(typeof(field), item), \
1123 sizeof(field.item), is_signed, \
1124 FILTER_OTHER); \
1125 if (ret) \
1126 return ret; \
1127 } while (0)
1128
1129static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1130{
1131 int ret, i;
1132 struct kprobe_trace_entry field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134
1135 ret = trace_define_common_fields(event_call);
1136 if (ret)
1137 return ret;
1138
1139 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1141 /* Set argument names as fields */
1142 for (i = 0; i < tp->nr_args; i++)
1143 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1144 return 0;
1145}
1146
1147static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1148{
1149 int ret, i;
1150 struct kretprobe_trace_entry field;
1151 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1152
1153 ret = trace_define_common_fields(event_call);
1154 if (ret)
1155 return ret;
1156
1157 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1158 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1159 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1160 /* Set argument names as fields */
1161 for (i = 0; i < tp->nr_args; i++)
1162 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1163 return 0;
1164}
1165
1166static int __probe_event_show_format(struct trace_seq *s,
1167 struct trace_probe *tp, const char *fmt,
1168 const char *arg)
1169{
1170 int i;
1171
1172 /* Show format */
1173 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1174 return 0;
1175
1176 for (i = 0; i < tp->nr_args; i++)
1177 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
1178 return 0;
1179
1180 if (!trace_seq_printf(s, "\", %s", arg))
1181 return 0;
1182
1183 for (i = 0; i < tp->nr_args; i++)
1184 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1185 return 0;
1186
1187 return trace_seq_puts(s, "\n");
1188}
1189
1190#undef SHOW_FIELD
1191#define SHOW_FIELD(type, item, name) \
1192 do { \
1193 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
1194 "offset:%u;\tsize:%u;\n", name, \
1195 (unsigned int)offsetof(typeof(field), item),\
1196 (unsigned int)sizeof(type)); \
1197 if (!ret) \
1198 return 0; \
1199 } while (0)
1200
1201static int kprobe_event_show_format(struct ftrace_event_call *call,
1202 struct trace_seq *s)
1203{
1204 struct kprobe_trace_entry field __attribute__((unused));
1205 int ret, i;
1206 struct trace_probe *tp = (struct trace_probe *)call->data;
1207
1208 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1209 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1210
1211 /* Show fields */
1212 for (i = 0; i < tp->nr_args; i++)
1213 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1214 trace_seq_puts(s, "\n");
1215
1216 return __probe_event_show_format(s, tp, "(%lx)",
1217 "REC->" FIELD_STRING_IP);
1218}
1219
1220static int kretprobe_event_show_format(struct ftrace_event_call *call,
1221 struct trace_seq *s)
1222{
1223 struct kretprobe_trace_entry field __attribute__((unused));
1224 int ret, i;
1225 struct trace_probe *tp = (struct trace_probe *)call->data;
1226
1227 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
1228 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
1229 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1230
1231 /* Show fields */
1232 for (i = 0; i < tp->nr_args; i++)
1233 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1234 trace_seq_puts(s, "\n");
1235
1236 return __probe_event_show_format(s, tp, "(%lx <- %lx)",
1237 "REC->" FIELD_STRING_FUNC
1238 ", REC->" FIELD_STRING_RETIP);
1239}
1240
1241#ifdef CONFIG_EVENT_PROFILE
1242
1243/* Kprobe profile handler */
1244static __kprobes int kprobe_profile_func(struct kprobe *kp,
1245 struct pt_regs *regs)
1246{
1247 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1248 struct ftrace_event_call *call = &tp->call;
1249 struct kprobe_trace_entry *entry;
1250 struct trace_entry *ent;
1251 int size, __size, i, pc, __cpu;
1252 unsigned long irq_flags;
1253 char *trace_buf;
1254 char *raw_data;
1255 int rctx;
1256
1257 pc = preempt_count();
1258 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1259 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1260 size -= sizeof(u32);
1261 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1262 "profile buffer not large enough"))
1263 return 0;
1264
1265 /*
1266 * Protect the non nmi buffer
1267 * This also protects the rcu read side
1268 */
1269 local_irq_save(irq_flags);
1270
1271 rctx = perf_swevent_get_recursion_context();
1272 if (rctx < 0)
1273 goto end_recursion;
1274
1275 __cpu = smp_processor_id();
1276
1277 if (in_nmi())
1278 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1279 else
1280 trace_buf = rcu_dereference(perf_trace_buf);
1281
1282 if (!trace_buf)
1283 goto end;
1284
1285 raw_data = per_cpu_ptr(trace_buf, __cpu);
1286
1287 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1288 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1289 entry = (struct kprobe_trace_entry *)raw_data;
1290 ent = &entry->ent;
1291
1292 tracing_generic_entry_update(ent, irq_flags, pc);
1293 ent->type = call->id;
1294 entry->nargs = tp->nr_args;
1295 entry->ip = (unsigned long)kp->addr;
1296 for (i = 0; i < tp->nr_args; i++)
1297 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1298 perf_tp_event(call->id, entry->ip, 1, entry, size);
1299
1300end:
1301 perf_swevent_put_recursion_context(rctx);
1302end_recursion:
1303 local_irq_restore(irq_flags);
1304
1305 return 0;
1306}
1307
1308/* Kretprobe profile handler */
1309static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1310 struct pt_regs *regs)
1311{
1312 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1313 struct ftrace_event_call *call = &tp->call;
1314 struct kretprobe_trace_entry *entry;
1315 struct trace_entry *ent;
1316 int size, __size, i, pc, __cpu;
1317 unsigned long irq_flags;
1318 char *trace_buf;
1319 char *raw_data;
1320 int rctx;
1321
1322 pc = preempt_count();
1323 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1324 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1325 size -= sizeof(u32);
1326 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1327 "profile buffer not large enough"))
1328 return 0;
1329
1330 /*
1331 * Protect the non nmi buffer
1332 * This also protects the rcu read side
1333 */
1334 local_irq_save(irq_flags);
1335
1336 rctx = perf_swevent_get_recursion_context();
1337 if (rctx < 0)
1338 goto end_recursion;
1339
1340 __cpu = smp_processor_id();
1341
1342 if (in_nmi())
1343 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1344 else
1345 trace_buf = rcu_dereference(perf_trace_buf);
1346
1347 if (!trace_buf)
1348 goto end;
1349
1350 raw_data = per_cpu_ptr(trace_buf, __cpu);
1351
1352 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1353 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1354 entry = (struct kretprobe_trace_entry *)raw_data;
1355 ent = &entry->ent;
1356
1357 tracing_generic_entry_update(ent, irq_flags, pc);
1358 ent->type = call->id;
1359 entry->nargs = tp->nr_args;
1360 entry->func = (unsigned long)tp->rp.kp.addr;
1361 entry->ret_ip = (unsigned long)ri->ret_addr;
1362 for (i = 0; i < tp->nr_args; i++)
1363 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1364 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1365
1366end:
1367 perf_swevent_put_recursion_context(rctx);
1368end_recursion:
1369 local_irq_restore(irq_flags);
1370
1371 return 0;
1372}
1373
1374static int probe_profile_enable(struct ftrace_event_call *call)
1375{
1376 struct trace_probe *tp = (struct trace_probe *)call->data;
1377
1378 tp->flags |= TP_FLAG_PROFILE;
1379
1380 if (probe_is_return(tp))
1381 return enable_kretprobe(&tp->rp);
1382 else
1383 return enable_kprobe(&tp->rp.kp);
1384}
1385
1386static void probe_profile_disable(struct ftrace_event_call *call)
1387{
1388 struct trace_probe *tp = (struct trace_probe *)call->data;
1389
1390 tp->flags &= ~TP_FLAG_PROFILE;
1391
1392 if (!(tp->flags & TP_FLAG_TRACE)) {
1393 if (probe_is_return(tp))
1394 disable_kretprobe(&tp->rp);
1395 else
1396 disable_kprobe(&tp->rp.kp);
1397 }
1398}
1399#endif /* CONFIG_EVENT_PROFILE */
1400
1401
1402static __kprobes
1403int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1404{
1405 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1406
1407 if (tp->flags & TP_FLAG_TRACE)
1408 kprobe_trace_func(kp, regs);
1409#ifdef CONFIG_EVENT_PROFILE
1410 if (tp->flags & TP_FLAG_PROFILE)
1411 kprobe_profile_func(kp, regs);
1412#endif /* CONFIG_EVENT_PROFILE */
1413 return 0; /* We don't tweek kernel, so just return 0 */
1414}
1415
1416static __kprobes
1417int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1418{
1419 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1420
1421 if (tp->flags & TP_FLAG_TRACE)
1422 kretprobe_trace_func(ri, regs);
1423#ifdef CONFIG_EVENT_PROFILE
1424 if (tp->flags & TP_FLAG_PROFILE)
1425 kretprobe_profile_func(ri, regs);
1426#endif /* CONFIG_EVENT_PROFILE */
1427 return 0; /* We don't tweek kernel, so just return 0 */
1428}
1429
1430static int register_probe_event(struct trace_probe *tp)
1431{
1432 struct ftrace_event_call *call = &tp->call;
1433 int ret;
1434
1435 /* Initialize ftrace_event_call */
1436 if (probe_is_return(tp)) {
1437 tp->event.trace = print_kretprobe_event;
1438 call->raw_init = probe_event_raw_init;
1439 call->show_format = kretprobe_event_show_format;
1440 call->define_fields = kretprobe_event_define_fields;
1441 } else {
1442 tp->event.trace = print_kprobe_event;
1443 call->raw_init = probe_event_raw_init;
1444 call->show_format = kprobe_event_show_format;
1445 call->define_fields = kprobe_event_define_fields;
1446 }
1447 call->event = &tp->event;
1448 call->id = register_ftrace_event(&tp->event);
1449 if (!call->id)
1450 return -ENODEV;
1451 call->enabled = 0;
1452 call->regfunc = probe_event_enable;
1453 call->unregfunc = probe_event_disable;
1454
1455#ifdef CONFIG_EVENT_PROFILE
1456 atomic_set(&call->profile_count, -1);
1457 call->profile_enable = probe_profile_enable;
1458 call->profile_disable = probe_profile_disable;
1459#endif
1460 call->data = tp;
1461 ret = trace_add_event_call(call);
1462 if (ret) {
1463 pr_info("Failed to register kprobe event: %s\n", call->name);
1464 unregister_ftrace_event(&tp->event);
1465 }
1466 return ret;
1467}
1468
1469static void unregister_probe_event(struct trace_probe *tp)
1470{
1471 /* tp->event is unregistered in trace_remove_event_call() */
1472 trace_remove_event_call(&tp->call);
1473}
1474
1475/* Make a debugfs interface for controling probe points */
1476static __init int init_kprobe_trace(void)
1477{
1478 struct dentry *d_tracer;
1479 struct dentry *entry;
1480
1481 d_tracer = tracing_init_dentry();
1482 if (!d_tracer)
1483 return 0;
1484
1485 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
1486 NULL, &kprobe_events_ops);
1487
1488 /* Event list interface */
1489 if (!entry)
1490 pr_warning("Could not create debugfs "
1491 "'kprobe_events' entry\n");
1492
1493 /* Profile interface */
1494 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
1495 NULL, &kprobe_profile_ops);
1496
1497 if (!entry)
1498 pr_warning("Could not create debugfs "
1499 "'kprobe_profile' entry\n");
1500 return 0;
1501}
1502fs_initcall(init_kprobe_trace);
1503
1504
1505#ifdef CONFIG_FTRACE_STARTUP_TEST
1506
1507static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1508 int a4, int a5, int a6)
1509{
1510 return a1 + a2 + a3 + a4 + a5 + a6;
1511}
1512
1513static __init int kprobe_trace_self_tests_init(void)
1514{
1515 int ret;
1516 int (*target)(int, int, int, int, int, int);
1517
1518 target = kprobe_trace_selftest_target;
1519
1520 pr_info("Testing kprobe tracing: ");
1521
1522 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1523 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
1524 if (WARN_ON_ONCE(ret))
1525 pr_warning("error enabling function entry\n");
1526
1527 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1528 "$retval");
1529 if (WARN_ON_ONCE(ret))
1530 pr_warning("error enabling function return\n");
1531
1532 ret = target(1, 2, 3, 4, 5, 6);
1533
1534 cleanup_all_probes();
1535
1536 pr_cont("OK\n");
1537 return 0;
1538}
1539
1540late_initcall(kprobe_trace_self_tests_init);
1541
1542#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..acb87d4a4ac1
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,551 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27
28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35/*
36 * For now, let us restrict the no. of symbols traced simultaneously to number
37 * of available hardware breakpoint registers.
38 */
39#define KSYM_TRACER_MAX HBP_NUM
40
41#define KSYM_TRACER_OP_LEN 3 /* rw- */
42
43struct trace_ksym {
44 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter;
48#endif
49 struct hlist_node ksym_hlist;
50};
51
52static struct trace_array *ksym_trace_array;
53
54static unsigned int ksym_filter_entry_count;
55static unsigned int ksym_tracing_enabled;
56
57static HLIST_HEAD(ksym_filter_head);
58
59static DEFINE_MUTEX(ksym_tracer_mutex);
60
61#ifdef CONFIG_PROFILE_KSYM_TRACER
62
63#define MAX_UL_INT 0xffffffff
64
65void ksym_collect_stats(unsigned long hbp_hit_addr)
66{
67 struct hlist_node *node;
68 struct trace_ksym *entry;
69
70 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) &&
73 (entry->counter <= MAX_UL_INT)) {
74 entry->counter++;
75 break;
76 }
77 }
78 rcu_read_unlock();
79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81
82void ksym_hbp_handler(struct perf_event *hbp, int nmi,
83 struct perf_sample_data *data,
84 struct pt_regs *regs)
85{
86 struct ring_buffer_event *event;
87 struct ksym_trace_entry *entry;
88 struct ring_buffer *buffer;
89 int pc;
90
91 if (!ksym_tracing_enabled)
92 return;
93
94 buffer = ksym_trace_array->buffer;
95
96 pc = preempt_count();
97
98 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
99 sizeof(*entry), 0, pc);
100 if (!event)
101 return;
102
103 entry = ring_buffer_event_data(event);
104 entry->ip = instruction_pointer(regs);
105 entry->type = hw_breakpoint_type(hbp);
106 entry->addr = hw_breakpoint_addr(hbp);
107 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
108
109#ifdef CONFIG_PROFILE_KSYM_TRACER
110 ksym_collect_stats(hw_breakpoint_addr(hbp));
111#endif /* CONFIG_PROFILE_KSYM_TRACER */
112
113 trace_buffer_unlock_commit(buffer, event, 0, pc);
114}
115
116/* Valid access types are represented as
117 *
118 * rw- : Set Read/Write Access Breakpoint
119 * -w- : Set Write Access Breakpoint
120 * --- : Clear Breakpoints
121 * --x : Set Execution Break points (Not available yet)
122 *
123 */
124static int ksym_trace_get_access_type(char *str)
125{
126 int access = 0;
127
128 if (str[0] == 'r')
129 access |= HW_BREAKPOINT_R;
130
131 if (str[1] == 'w')
132 access |= HW_BREAKPOINT_W;
133
134 if (str[2] == 'x')
135 access |= HW_BREAKPOINT_X;
136
137 switch (access) {
138 case HW_BREAKPOINT_R:
139 case HW_BREAKPOINT_W:
140 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
141 return access;
142 default:
143 return -EINVAL;
144 }
145}
146
147/*
148 * There can be several possible malformed requests and we attempt to capture
149 * all of them. We enumerate some of the rules
150 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
151 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
152 * <module>:<ksym_name>:<op>.
153 * 2. No delimiter symbol ':' in the input string
154 * 3. Spurious operator symbols or symbols not in their respective positions
155 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
156 * 5. Kernel symbol not a part of /proc/kallsyms
157 * 6. Duplicate requests
158 */
159static int parse_ksym_trace_str(char *input_string, char **ksymname,
160 unsigned long *addr)
161{
162 int ret;
163
164 *ksymname = strsep(&input_string, ":");
165 *addr = kallsyms_lookup_name(*ksymname);
166
167 /* Check for malformed request: (2), (1) and (5) */
168 if ((!input_string) ||
169 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
170 (*addr == 0))
171 return -EINVAL;;
172
173 ret = ksym_trace_get_access_type(input_string);
174
175 return ret;
176}
177
178int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
179{
180 struct trace_ksym *entry;
181 int ret = -ENOMEM;
182
183 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
184 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
185 " new requests for tracing can be accepted now.\n",
186 KSYM_TRACER_MAX);
187 return -ENOSPC;
188 }
189
190 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
191 if (!entry)
192 return -ENOMEM;
193
194 hw_breakpoint_init(&entry->attr);
195
196 entry->attr.bp_type = op;
197 entry->attr.bp_addr = addr;
198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
199
200 ret = -EAGAIN;
201 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
202 ksym_hbp_handler);
203
204 if (IS_ERR(entry->ksym_hbp)) {
205 ret = PTR_ERR(entry->ksym_hbp);
206 printk(KERN_INFO "ksym_tracer request failed. Try again"
207 " later!!\n");
208 goto err;
209 }
210
211 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
212 ksym_filter_entry_count++;
213
214 return 0;
215
216err:
217 kfree(entry);
218
219 return ret;
220}
221
222static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
223 size_t count, loff_t *ppos)
224{
225 struct trace_ksym *entry;
226 struct hlist_node *node;
227 struct trace_seq *s;
228 ssize_t cnt = 0;
229 int ret;
230
231 s = kmalloc(sizeof(*s), GFP_KERNEL);
232 if (!s)
233 return -ENOMEM;
234 trace_seq_init(s);
235
236 mutex_lock(&ksym_tracer_mutex);
237
238 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
239 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
240 if (entry->attr.bp_type == HW_BREAKPOINT_R)
241 ret = trace_seq_puts(s, "r--\n");
242 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
243 ret = trace_seq_puts(s, "-w-\n");
244 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
245 ret = trace_seq_puts(s, "rw-\n");
246 WARN_ON_ONCE(!ret);
247 }
248
249 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
250
251 mutex_unlock(&ksym_tracer_mutex);
252
253 kfree(s);
254
255 return cnt;
256}
257
258static void __ksym_trace_reset(void)
259{
260 struct trace_ksym *entry;
261 struct hlist_node *node, *node1;
262
263 mutex_lock(&ksym_tracer_mutex);
264 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
265 ksym_hlist) {
266 unregister_wide_hw_breakpoint(entry->ksym_hbp);
267 ksym_filter_entry_count--;
268 hlist_del_rcu(&(entry->ksym_hlist));
269 synchronize_rcu();
270 kfree(entry);
271 }
272 mutex_unlock(&ksym_tracer_mutex);
273}
274
275static ssize_t ksym_trace_filter_write(struct file *file,
276 const char __user *buffer,
277 size_t count, loff_t *ppos)
278{
279 struct trace_ksym *entry;
280 struct hlist_node *node;
281 char *input_string, *ksymname = NULL;
282 unsigned long ksym_addr = 0;
283 int ret, op, changed = 0;
284
285 input_string = kzalloc(count + 1, GFP_KERNEL);
286 if (!input_string)
287 return -ENOMEM;
288
289 if (copy_from_user(input_string, buffer, count)) {
290 kfree(input_string);
291 return -EFAULT;
292 }
293 input_string[count] = '\0';
294
295 strstrip(input_string);
296
297 /*
298 * Clear all breakpoints if:
299 * 1: echo > ksym_trace_filter
300 * 2: echo 0 > ksym_trace_filter
301 * 3: echo "*:---" > ksym_trace_filter
302 */
303 if (!input_string[0] || !strcmp(input_string, "0") ||
304 !strcmp(input_string, "*:---")) {
305 __ksym_trace_reset();
306 kfree(input_string);
307 return count;
308 }
309
310 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
311 if (ret < 0) {
312 kfree(input_string);
313 return ret;
314 }
315
316 mutex_lock(&ksym_tracer_mutex);
317
318 ret = -EINVAL;
319 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
320 if (entry->attr.bp_addr == ksym_addr) {
321 /* Check for malformed request: (6) */
322 if (entry->attr.bp_type != op)
323 changed = 1;
324 else
325 goto out;
326 break;
327 }
328 }
329 if (changed) {
330 unregister_wide_hw_breakpoint(entry->ksym_hbp);
331 entry->attr.bp_type = op;
332 ret = 0;
333 if (op > 0) {
334 entry->ksym_hbp =
335 register_wide_hw_breakpoint(&entry->attr,
336 ksym_hbp_handler);
337 if (IS_ERR(entry->ksym_hbp))
338 ret = PTR_ERR(entry->ksym_hbp);
339 else
340 goto out;
341 }
342 /* Error or "symbol:---" case: drop it */
343 ksym_filter_entry_count--;
344 hlist_del_rcu(&(entry->ksym_hlist));
345 synchronize_rcu();
346 kfree(entry);
347 goto out;
348 } else {
349 /* Check for malformed request: (4) */
350 if (op == 0)
351 goto out;
352 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
353 }
354out:
355 mutex_unlock(&ksym_tracer_mutex);
356
357 kfree(input_string);
358
359 if (!ret)
360 ret = count;
361 return ret;
362}
363
364static const struct file_operations ksym_tracing_fops = {
365 .open = tracing_open_generic,
366 .read = ksym_trace_filter_read,
367 .write = ksym_trace_filter_write,
368};
369
370static void ksym_trace_reset(struct trace_array *tr)
371{
372 ksym_tracing_enabled = 0;
373 __ksym_trace_reset();
374}
375
376static int ksym_trace_init(struct trace_array *tr)
377{
378 int cpu, ret = 0;
379
380 for_each_online_cpu(cpu)
381 tracing_reset(tr, cpu);
382 ksym_tracing_enabled = 1;
383 ksym_trace_array = tr;
384
385 return ret;
386}
387
388static void ksym_trace_print_header(struct seq_file *m)
389{
390 seq_puts(m,
391 "# TASK-PID CPU# Symbol "
392 "Type Function\n");
393 seq_puts(m,
394 "# | | | "
395 " | |\n");
396}
397
398static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
399{
400 struct trace_entry *entry = iter->ent;
401 struct trace_seq *s = &iter->seq;
402 struct ksym_trace_entry *field;
403 char str[KSYM_SYMBOL_LEN];
404 int ret;
405
406 if (entry->type != TRACE_KSYM)
407 return TRACE_TYPE_UNHANDLED;
408
409 trace_assign_type(field, entry);
410
411 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
412 entry->pid, iter->cpu, (char *)field->addr);
413 if (!ret)
414 return TRACE_TYPE_PARTIAL_LINE;
415
416 switch (field->type) {
417 case HW_BREAKPOINT_R:
418 ret = trace_seq_printf(s, " R ");
419 break;
420 case HW_BREAKPOINT_W:
421 ret = trace_seq_printf(s, " W ");
422 break;
423 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
424 ret = trace_seq_printf(s, " RW ");
425 break;
426 default:
427 return TRACE_TYPE_PARTIAL_LINE;
428 }
429
430 if (!ret)
431 return TRACE_TYPE_PARTIAL_LINE;
432
433 sprint_symbol(str, field->ip);
434 ret = trace_seq_printf(s, "%s\n", str);
435 if (!ret)
436 return TRACE_TYPE_PARTIAL_LINE;
437
438 return TRACE_TYPE_HANDLED;
439}
440
441struct tracer ksym_tracer __read_mostly =
442{
443 .name = "ksym_tracer",
444 .init = ksym_trace_init,
445 .reset = ksym_trace_reset,
446#ifdef CONFIG_FTRACE_SELFTEST
447 .selftest = trace_selftest_startup_ksym,
448#endif
449 .print_header = ksym_trace_print_header,
450 .print_line = ksym_trace_output
451};
452
453__init static int init_ksym_trace(void)
454{
455 struct dentry *d_tracer;
456 struct dentry *entry;
457
458 d_tracer = tracing_init_dentry();
459 ksym_filter_entry_count = 0;
460
461 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
462 NULL, &ksym_tracing_fops);
463 if (!entry)
464 pr_warning("Could not create debugfs "
465 "'ksym_trace_filter' file\n");
466
467 return register_tracer(&ksym_tracer);
468}
469device_initcall(init_ksym_trace);
470
471
472#ifdef CONFIG_PROFILE_KSYM_TRACER
473static int ksym_tracer_stat_headers(struct seq_file *m)
474{
475 seq_puts(m, " Access Type ");
476 seq_puts(m, " Symbol Counter\n");
477 seq_puts(m, " ----------- ");
478 seq_puts(m, " ------ -------\n");
479 return 0;
480}
481
482static int ksym_tracer_stat_show(struct seq_file *m, void *v)
483{
484 struct hlist_node *stat = v;
485 struct trace_ksym *entry;
486 int access_type = 0;
487 char fn_name[KSYM_NAME_LEN];
488
489 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
490
491 access_type = entry->attr.bp_type;
492
493 switch (access_type) {
494 case HW_BREAKPOINT_R:
495 seq_puts(m, " R ");
496 break;
497 case HW_BREAKPOINT_W:
498 seq_puts(m, " W ");
499 break;
500 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
501 seq_puts(m, " RW ");
502 break;
503 default:
504 seq_puts(m, " NA ");
505 }
506
507 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
508 seq_printf(m, " %-36s", fn_name);
509 else
510 seq_printf(m, " %-36s", "<NA>");
511 seq_printf(m, " %15lu\n", entry->counter);
512
513 return 0;
514}
515
516static void *ksym_tracer_stat_start(struct tracer_stat *trace)
517{
518 return ksym_filter_head.first;
519}
520
521static void *
522ksym_tracer_stat_next(void *v, int idx)
523{
524 struct hlist_node *stat = v;
525
526 return stat->next;
527}
528
529static struct tracer_stat ksym_tracer_stats = {
530 .name = "ksym_tracer",
531 .stat_start = ksym_tracer_stat_start,
532 .stat_next = ksym_tracer_stat_next,
533 .stat_headers = ksym_tracer_stat_headers,
534 .stat_show = ksym_tracer_stat_show
535};
536
537__init static int ksym_tracer_stat_init(void)
538{
539 int ret;
540
541 ret = register_stat_tracer(&ksym_tracer_stats);
542 if (ret) {
543 printk(KERN_WARNING "Warning: could not register "
544 "ksym tracer stats\n");
545 return 1;
546 }
547
548 return 0;
549}
550fs_initcall(ksym_tracer_stat_init);
551#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b6c12c6a1bcd..8e46b3323cdc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 23
24static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
25 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s) 26int trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 int ret;
30
31 ret = seq_write(m, s->buffer, len);
29 32
30 seq_write(m, s->buffer, len); 33 /*
34 * Only reset this buffer if we successfully wrote to the
35 * seq_file buffer.
36 */
37 if (!ret)
38 trace_seq_init(s);
31 39
32 trace_seq_init(s); 40 return ret;
33} 41}
34 42
35enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 43enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
85 va_list ap; 93 va_list ap;
86 int ret; 94 int ret;
87 95
88 if (!len) 96 if (s->full || !len)
89 return 0; 97 return 0;
90 98
91 va_start(ap, fmt); 99 va_start(ap, fmt);
@@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
93 va_end(ap); 101 va_end(ap);
94 102
95 /* If we can't write it all, don't bother writing anything */ 103 /* If we can't write it all, don't bother writing anything */
96 if (ret >= len) 104 if (ret >= len) {
105 s->full = 1;
97 return 0; 106 return 0;
107 }
98 108
99 s->len += ret; 109 s->len += ret;
100 110
@@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
119 int len = (PAGE_SIZE - 1) - s->len; 129 int len = (PAGE_SIZE - 1) - s->len;
120 int ret; 130 int ret;
121 131
122 if (!len) 132 if (s->full || !len)
123 return 0; 133 return 0;
124 134
125 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 135 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
126 136
127 /* If we can't write it all, don't bother writing anything */ 137 /* If we can't write it all, don't bother writing anything */
128 if (ret >= len) 138 if (ret >= len) {
139 s->full = 1;
129 return 0; 140 return 0;
141 }
130 142
131 s->len += ret; 143 s->len += ret;
132 144
@@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
139 int len = (PAGE_SIZE - 1) - s->len; 151 int len = (PAGE_SIZE - 1) - s->len;
140 int ret; 152 int ret;
141 153
142 if (!len) 154 if (s->full || !len)
143 return 0; 155 return 0;
144 156
145 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 157 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
146 158
147 /* If we can't write it all, don't bother writing anything */ 159 /* If we can't write it all, don't bother writing anything */
148 if (ret >= len) 160 if (ret >= len) {
161 s->full = 1;
149 return 0; 162 return 0;
163 }
150 164
151 s->len += ret; 165 s->len += ret;
152 166
@@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
167{ 181{
168 int len = strlen(str); 182 int len = strlen(str);
169 183
170 if (len > ((PAGE_SIZE - 1) - s->len)) 184 if (s->full)
185 return 0;
186
187 if (len > ((PAGE_SIZE - 1) - s->len)) {
188 s->full = 1;
171 return 0; 189 return 0;
190 }
172 191
173 memcpy(s->buffer + s->len, str, len); 192 memcpy(s->buffer + s->len, str, len);
174 s->len += len; 193 s->len += len;
@@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
178 197
179int trace_seq_putc(struct trace_seq *s, unsigned char c) 198int trace_seq_putc(struct trace_seq *s, unsigned char c)
180{ 199{
181 if (s->len >= (PAGE_SIZE - 1)) 200 if (s->full)
182 return 0; 201 return 0;
183 202
203 if (s->len >= (PAGE_SIZE - 1)) {
204 s->full = 1;
205 return 0;
206 }
207
184 s->buffer[s->len++] = c; 208 s->buffer[s->len++] = c;
185 209
186 return 1; 210 return 1;
@@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
188 212
189int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
190{ 214{
191 if (len > ((PAGE_SIZE - 1) - s->len)) 215 if (s->full)
192 return 0; 216 return 0;
193 217
218 if (len > ((PAGE_SIZE - 1) - s->len)) {
219 s->full = 1;
220 return 0;
221 }
222
194 memcpy(s->buffer + s->len, mem, len); 223 memcpy(s->buffer + s->len, mem, len);
195 s->len += len; 224 s->len += len;
196 225
@@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
203 const unsigned char *data = mem; 232 const unsigned char *data = mem;
204 int i, j; 233 int i, j;
205 234
235 if (s->full)
236 return 0;
237
206#ifdef __BIG_ENDIAN 238#ifdef __BIG_ENDIAN
207 for (i = 0, j = 0; i < len; i++) { 239 for (i = 0, j = 0; i < len; i++) {
208#else 240#else
@@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
220{ 252{
221 void *ret; 253 void *ret;
222 254
223 if (len > ((PAGE_SIZE - 1) - s->len)) 255 if (s->full)
256 return 0;
257
258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1;
224 return NULL; 260 return NULL;
261 }
225 262
226 ret = s->buffer + s->len; 263 ret = s->buffer + s->len;
227 s->len += len; 264 s->len += len;
@@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
233{ 270{
234 unsigned char *p; 271 unsigned char *p;
235 272
236 if (s->len >= (PAGE_SIZE - 1)) 273 if (s->full)
274 return 0;
275
276 if (s->len >= (PAGE_SIZE - 1)) {
277 s->full = 1;
237 return 0; 278 return 0;
279 }
280
238 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 281 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
239 if (!IS_ERR(p)) { 282 if (!IS_ERR(p)) {
240 p = mangle_path(s->buffer + s->len, p, "\n"); 283 p = mangle_path(s->buffer + s->len, p, "\n");
@@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
247 return 1; 290 return 1;
248 } 291 }
249 292
293 s->full = 1;
250 return 0; 294 return 0;
251} 295}
252 296
@@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
373 unsigned long vmstart = 0; 417 unsigned long vmstart = 0;
374 int ret = 1; 418 int ret = 1;
375 419
420 if (s->full)
421 return 0;
422
376 if (mm) { 423 if (mm) {
377 const struct vm_area_struct *vma; 424 const struct vm_area_struct *vma;
378 425
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..dc98309e839a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM:
20 return 1; 21 return 1;
21 } 22 }
22 return 0; 23 return 0;
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 809 return ret;
809} 810}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy;
815
816int
817trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
818{
819 unsigned long count;
820 int ret;
821
822 /* start the tracing */
823 ret = tracer_init(trace, tr);
824 if (ret) {
825 warn_failed_init_tracer(trace, ret);
826 return ret;
827 }
828
829 ksym_selftest_dummy = 0;
830 /* Register the read-write tracing request */
831
832 ret = process_new_ksym_entry("ksym_selftest_dummy",
833 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
834 (unsigned long)(&ksym_selftest_dummy));
835
836 if (ret < 0) {
837 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
838 goto ret_path;
839 }
840 /* Perform a read and a write operation over the dummy variable to
841 * trigger the tracer
842 */
843 if (ksym_selftest_dummy == 0)
844 ksym_selftest_dummy++;
845
846 /* stop the tracing. */
847 tracing_stop();
848 /* check the trace buffer */
849 ret = trace_test_buffer(tr, &count);
850 trace->reset(tr);
851 tracing_start();
852
853 /* read & write operations - one each is performed on the dummy variable
854 * triggering two entries in the trace buffer
855 */
856 if (!ret && count != 2) {
857 printk(KERN_CONT "Ksym tracer startup test failed");
858 ret = -1;
859 }
860
861ret_path:
862 return ret;
863}
864#endif /* CONFIG_KSYM_TRACER */
865
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 527e17eae575..57501d90096a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -14,6 +14,43 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 16
17extern unsigned long __start_syscalls_metadata[];
18extern unsigned long __stop_syscalls_metadata[];
19
20static struct syscall_metadata **syscalls_metadata;
21
22static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
23{
24 struct syscall_metadata *start;
25 struct syscall_metadata *stop;
26 char str[KSYM_SYMBOL_LEN];
27
28
29 start = (struct syscall_metadata *)__start_syscalls_metadata;
30 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
31 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
32
33 for ( ; start < stop; start++) {
34 /*
35 * Only compare after the "sys" prefix. Archs that use
36 * syscall wrappers may have syscalls symbols aliases prefixed
37 * with "SyS" instead of "sys", leading to an unwanted
38 * mismatch.
39 */
40 if (start->name && !strcmp(start->name + 3, str + 3))
41 return start;
42 }
43 return NULL;
44}
45
46static struct syscall_metadata *syscall_nr_to_meta(int nr)
47{
48 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
49 return NULL;
50
51 return syscalls_metadata[nr];
52}
53
17enum print_line_t 54enum print_line_t
18print_syscall_enter(struct trace_iterator *iter, int flags) 55print_syscall_enter(struct trace_iterator *iter, int flags)
19{ 56{
@@ -30,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
30 if (!entry) 67 if (!entry)
31 goto end; 68 goto end;
32 69
33 if (entry->enter_id != ent->type) { 70 if (entry->enter_event->id != ent->type) {
34 WARN_ON_ONCE(1); 71 WARN_ON_ONCE(1);
35 goto end; 72 goto end;
36 } 73 }
@@ -85,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
85 return TRACE_TYPE_HANDLED; 122 return TRACE_TYPE_HANDLED;
86 } 123 }
87 124
88 if (entry->exit_id != ent->type) { 125 if (entry->exit_event->id != ent->type) {
89 WARN_ON_ONCE(1); 126 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED; 127 return TRACE_TYPE_UNHANDLED;
91 } 128 }
@@ -103,24 +140,19 @@ extern char *__bad_type_size(void);
103#define SYSCALL_FIELD(type, name) \ 140#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \ 141 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \ 142 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type)
107 145
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
109{ 147{
110 int i; 148 int i;
111 int nr;
112 int ret; 149 int ret;
113 struct syscall_metadata *entry; 150 struct syscall_metadata *entry = call->data;
114 struct syscall_trace_enter trace; 151 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args); 152 int offset = offsetof(struct syscall_trace_enter, args);
116 153
117 nr = syscall_name_to_nr(call->data); 154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
118 entry = syscall_nr_to_meta(nr); 155 "\tsigned:%u;\n",
119
120 if (!entry)
121 return 0;
122
123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
124 SYSCALL_FIELD(int, nr)); 156 SYSCALL_FIELD(int, nr));
125 if (!ret) 157 if (!ret)
126 return 0; 158 return 0;
@@ -130,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
130 entry->args[i]); 162 entry->args[i]);
131 if (!ret) 163 if (!ret)
132 return 0; 164 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, 165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
134 sizeof(unsigned long)); 166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
135 if (!ret) 169 if (!ret)
136 return 0; 170 return 0;
137 offset += sizeof(unsigned long); 171 offset += sizeof(unsigned long);
@@ -163,8 +197,10 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
163 struct syscall_trace_exit trace; 197 struct syscall_trace_exit trace;
164 198
165 ret = trace_seq_printf(s, 199 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
168 SYSCALL_FIELD(int, nr), 204 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(long, ret)); 205 SYSCALL_FIELD(long, ret));
170 if (!ret) 206 if (!ret)
@@ -176,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
176int syscall_enter_define_fields(struct ftrace_event_call *call) 212int syscall_enter_define_fields(struct ftrace_event_call *call)
177{ 213{
178 struct syscall_trace_enter trace; 214 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta; 215 struct syscall_metadata *meta = call->data;
180 int ret; 216 int ret;
181 int nr;
182 int i; 217 int i;
183 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
184 219
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call); 220 ret = trace_define_common_fields(call);
192 if (ret) 221 if (ret)
193 return ret; 222 return ret;
194 223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret)
226 return ret;
227
195 for (i = 0; i < meta->nb_args; i++) { 228 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i], 229 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset, 230 meta->args[i], offset,
@@ -212,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
212 if (ret) 245 if (ret)
213 return ret; 246 return ret;
214 247
215 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, 248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret)
250 return ret;
251
252 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
216 FILTER_OTHER); 253 FILTER_OTHER);
217 254
218 return ret; 255 return ret;
@@ -239,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
239 276
240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 277 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
241 278
242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, 279 event = trace_current_buffer_lock_reserve(&buffer,
243 size, 0, 0); 280 sys_data->enter_event->id, size, 0, 0);
244 if (!event) 281 if (!event)
245 return; 282 return;
246 283
@@ -271,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
271 if (!sys_data) 308 if (!sys_data)
272 return; 309 return;
273 310
274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, 311 event = trace_current_buffer_lock_reserve(&buffer,
275 sizeof(*entry), 0, 0); 312 sys_data->exit_event->id, sizeof(*entry), 0, 0);
276 if (!event) 313 if (!event)
277 return; 314 return;
278 315
@@ -285,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 322 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
286} 323}
287 324
288int reg_event_syscall_enter(void *ptr) 325int reg_event_syscall_enter(struct ftrace_event_call *call)
289{ 326{
290 int ret = 0; 327 int ret = 0;
291 int num; 328 int num;
292 char *name;
293 329
294 name = (char *)ptr; 330 num = ((struct syscall_metadata *)call->data)->syscall_nr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls) 331 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS; 332 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock); 333 mutex_lock(&syscall_trace_lock);
@@ -309,13 +344,11 @@ int reg_event_syscall_enter(void *ptr)
309 return ret; 344 return ret;
310} 345}
311 346
312void unreg_event_syscall_enter(void *ptr) 347void unreg_event_syscall_enter(struct ftrace_event_call *call)
313{ 348{
314 int num; 349 int num;
315 char *name;
316 350
317 name = (char *)ptr; 351 num = ((struct syscall_metadata *)call->data)->syscall_nr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls) 352 if (num < 0 || num >= NR_syscalls)
320 return; 353 return;
321 mutex_lock(&syscall_trace_lock); 354 mutex_lock(&syscall_trace_lock);
@@ -326,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr)
326 mutex_unlock(&syscall_trace_lock); 359 mutex_unlock(&syscall_trace_lock);
327} 360}
328 361
329int reg_event_syscall_exit(void *ptr) 362int reg_event_syscall_exit(struct ftrace_event_call *call)
330{ 363{
331 int ret = 0; 364 int ret = 0;
332 int num; 365 int num;
333 char *name;
334 366
335 name = (char *)ptr; 367 num = ((struct syscall_metadata *)call->data)->syscall_nr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls) 368 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS; 369 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock); 370 mutex_lock(&syscall_trace_lock);
@@ -350,13 +381,11 @@ int reg_event_syscall_exit(void *ptr)
350 return ret; 381 return ret;
351} 382}
352 383
353void unreg_event_syscall_exit(void *ptr) 384void unreg_event_syscall_exit(struct ftrace_event_call *call)
354{ 385{
355 int num; 386 int num;
356 char *name;
357 387
358 name = (char *)ptr; 388 num = ((struct syscall_metadata *)call->data)->syscall_nr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls) 389 if (num < 0 || num >= NR_syscalls)
361 return; 390 return;
362 mutex_lock(&syscall_trace_lock); 391 mutex_lock(&syscall_trace_lock);
@@ -367,13 +396,44 @@ void unreg_event_syscall_exit(void *ptr)
367 mutex_unlock(&syscall_trace_lock); 396 mutex_unlock(&syscall_trace_lock);
368} 397}
369 398
370struct trace_event event_syscall_enter = { 399int init_syscall_trace(struct ftrace_event_call *call)
371 .trace = print_syscall_enter, 400{
372}; 401 int id;
402
403 id = register_ftrace_event(call->event);
404 if (!id)
405 return -ENODEV;
406 call->id = id;
407 INIT_LIST_HEAD(&call->fields);
408 return 0;
409}
410
411int __init init_ftrace_syscalls(void)
412{
413 struct syscall_metadata *meta;
414 unsigned long addr;
415 int i;
416
417 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
418 NR_syscalls, GFP_KERNEL);
419 if (!syscalls_metadata) {
420 WARN_ON(1);
421 return -ENOMEM;
422 }
423
424 for (i = 0; i < NR_syscalls; i++) {
425 addr = arch_syscall_addr(i);
426 meta = find_syscall_meta(addr);
427 if (!meta)
428 continue;
429
430 meta->syscall_nr = i;
431 syscalls_metadata[i] = meta;
432 }
373 433
374struct trace_event event_syscall_exit = { 434 return 0;
375 .trace = print_syscall_exit, 435}
376}; 436core_initcall(init_ftrace_syscalls);
377 437
378#ifdef CONFIG_EVENT_PROFILE 438#ifdef CONFIG_EVENT_PROFILE
379 439
@@ -387,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
387 struct syscall_metadata *sys_data; 447 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec; 448 struct syscall_trace_enter *rec;
389 unsigned long flags; 449 unsigned long flags;
450 char *trace_buf;
390 char *raw_data; 451 char *raw_data;
391 int syscall_nr; 452 int syscall_nr;
453 int rctx;
392 int size; 454 int size;
393 int cpu; 455 int cpu;
394 456
@@ -412,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
412 /* Protect the per cpu buffer, begin the rcu read side */ 474 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags); 475 local_irq_save(flags);
414 476
477 rctx = perf_swevent_get_recursion_context();
478 if (rctx < 0)
479 goto end_recursion;
480
415 cpu = smp_processor_id(); 481 cpu = smp_processor_id();
416 482
417 if (in_nmi()) 483 trace_buf = rcu_dereference(perf_trace_buf);
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421 484
422 if (!raw_data) 485 if (!trace_buf)
423 goto end; 486 goto end;
424 487
425 raw_data = per_cpu_ptr(raw_data, cpu); 488 raw_data = per_cpu_ptr(trace_buf, cpu);
426 489
427 /* zero the dead bytes from align to not leak stack to user */ 490 /* zero the dead bytes from align to not leak stack to user */
428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 491 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
429 492
430 rec = (struct syscall_trace_enter *) raw_data; 493 rec = (struct syscall_trace_enter *) raw_data;
431 tracing_generic_entry_update(&rec->ent, 0, 0); 494 tracing_generic_entry_update(&rec->ent, 0, 0);
432 rec->ent.type = sys_data->enter_id; 495 rec->ent.type = sys_data->enter_event->id;
433 rec->nr = syscall_nr; 496 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 497 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args); 498 (unsigned long *)&rec->args);
436 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 499 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
437 500
438end: 501end:
502 perf_swevent_put_recursion_context(rctx);
503end_recursion:
439 local_irq_restore(flags); 504 local_irq_restore(flags);
440} 505}
441 506
442int reg_prof_syscall_enter(char *name) 507int prof_sysenter_enable(struct ftrace_event_call *call)
443{ 508{
444 int ret = 0; 509 int ret = 0;
445 int num; 510 int num;
446 511
447 num = syscall_name_to_nr(name); 512 num = ((struct syscall_metadata *)call->data)->syscall_nr;
448 if (num < 0 || num >= NR_syscalls)
449 return -ENOSYS;
450 513
451 mutex_lock(&syscall_trace_lock); 514 mutex_lock(&syscall_trace_lock);
452 if (!sys_prof_refcount_enter) 515 if (!sys_prof_refcount_enter)
@@ -462,13 +525,11 @@ int reg_prof_syscall_enter(char *name)
462 return ret; 525 return ret;
463} 526}
464 527
465void unreg_prof_syscall_enter(char *name) 528void prof_sysenter_disable(struct ftrace_event_call *call)
466{ 529{
467 int num; 530 int num;
468 531
469 num = syscall_name_to_nr(name); 532 num = ((struct syscall_metadata *)call->data)->syscall_nr;
470 if (num < 0 || num >= NR_syscalls)
471 return;
472 533
473 mutex_lock(&syscall_trace_lock); 534 mutex_lock(&syscall_trace_lock);
474 sys_prof_refcount_enter--; 535 sys_prof_refcount_enter--;
@@ -484,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
484 struct syscall_trace_exit *rec; 545 struct syscall_trace_exit *rec;
485 unsigned long flags; 546 unsigned long flags;
486 int syscall_nr; 547 int syscall_nr;
548 char *trace_buf;
487 char *raw_data; 549 char *raw_data;
550 int rctx;
488 int size; 551 int size;
489 int cpu; 552 int cpu;
490 553
@@ -510,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
510 573
511 /* Protect the per cpu buffer, begin the rcu read side */ 574 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags); 575 local_irq_save(flags);
576
577 rctx = perf_swevent_get_recursion_context();
578 if (rctx < 0)
579 goto end_recursion;
580
513 cpu = smp_processor_id(); 581 cpu = smp_processor_id();
514 582
515 if (in_nmi()) 583 trace_buf = rcu_dereference(perf_trace_buf);
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519 584
520 if (!raw_data) 585 if (!trace_buf)
521 goto end; 586 goto end;
522 587
523 raw_data = per_cpu_ptr(raw_data, cpu); 588 raw_data = per_cpu_ptr(trace_buf, cpu);
524 589
525 /* zero the dead bytes from align to not leak stack to user */ 590 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 591 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -528,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
528 rec = (struct syscall_trace_exit *)raw_data; 593 rec = (struct syscall_trace_exit *)raw_data;
529 594
530 tracing_generic_entry_update(&rec->ent, 0, 0); 595 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id; 596 rec->ent.type = sys_data->exit_event->id;
532 rec->nr = syscall_nr; 597 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs); 598 rec->ret = syscall_get_return_value(current, regs);
534 599
535 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 600 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
536 601
537end: 602end:
603 perf_swevent_put_recursion_context(rctx);
604end_recursion:
538 local_irq_restore(flags); 605 local_irq_restore(flags);
539} 606}
540 607
541int reg_prof_syscall_exit(char *name) 608int prof_sysexit_enable(struct ftrace_event_call *call)
542{ 609{
543 int ret = 0; 610 int ret = 0;
544 int num; 611 int num;
545 612
546 num = syscall_name_to_nr(name); 613 num = ((struct syscall_metadata *)call->data)->syscall_nr;
547 if (num < 0 || num >= NR_syscalls)
548 return -ENOSYS;
549 614
550 mutex_lock(&syscall_trace_lock); 615 mutex_lock(&syscall_trace_lock);
551 if (!sys_prof_refcount_exit) 616 if (!sys_prof_refcount_exit)
@@ -561,13 +626,11 @@ int reg_prof_syscall_exit(char *name)
561 return ret; 626 return ret;
562} 627}
563 628
564void unreg_prof_syscall_exit(char *name) 629void prof_sysexit_disable(struct ftrace_event_call *call)
565{ 630{
566 int num; 631 int num;
567 632
568 num = syscall_name_to_nr(name); 633 num = ((struct syscall_metadata *)call->data)->syscall_nr;
569 if (num < 0 || num >= NR_syscalls)
570 return;
571 634
572 mutex_lock(&syscall_trace_lock); 635 mutex_lock(&syscall_trace_lock);
573 sys_prof_refcount_exit--; 636 sys_prof_refcount_exit--;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 000000000000..eb27fd3430a2
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,44 @@
1
2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h>
4#include <linux/sched.h>
5#include <linux/module.h>
6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8
9/*
10 * Request a notification when the current cpu returns to userspace. Must be
11 * called in atomic context. The notifier will also be called in atomic
12 * context.
13 */
14void user_return_notifier_register(struct user_return_notifier *urn)
15{
16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
17 hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
18}
19EXPORT_SYMBOL_GPL(user_return_notifier_register);
20
21/*
22 * Removes a registered user return notifier. Must be called from atomic
23 * context, and from the same cpu registration occured in.
24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{
27 hlist_del(&urn->link);
28 if (hlist_empty(&__get_cpu_var(return_notifier_list)))
29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
30}
31EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
32
33/* Calls registered user return notifiers */
34void fire_user_return_notifiers(void)
35{
36 struct user_return_notifier *urn;
37 struct hlist_node *tmp1, *tmp2;
38 struct hlist_head *head;
39
40 head = &get_cpu_var(return_notifier_list);
41 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
42 urn->on_user_return(urn);
43 put_cpu_var(return_notifier_list);
44}
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 69eae358a726..a2cd77e70d4d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write,
57#define proc_do_uts_string NULL 57#define proc_do_uts_string NULL
58#endif 58#endif
59 59
60
61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen)
66{
67 struct ctl_table uts_table;
68 int r, write;
69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 put_uts(table, write, uts_table.data);
74 return r;
75}
76#else
77#define sysctl_uts_string NULL
78#endif
79
80static struct ctl_table uts_kern_table[] = { 60static struct ctl_table uts_kern_table[] = {
81 { 61 {
82 .ctl_name = KERN_OSTYPE,
83 .procname = "ostype", 62 .procname = "ostype",
84 .data = init_uts_ns.name.sysname, 63 .data = init_uts_ns.name.sysname,
85 .maxlen = sizeof(init_uts_ns.name.sysname), 64 .maxlen = sizeof(init_uts_ns.name.sysname),
86 .mode = 0444, 65 .mode = 0444,
87 .proc_handler = proc_do_uts_string, 66 .proc_handler = proc_do_uts_string,
88 .strategy = sysctl_uts_string,
89 }, 67 },
90 { 68 {
91 .ctl_name = KERN_OSRELEASE,
92 .procname = "osrelease", 69 .procname = "osrelease",
93 .data = init_uts_ns.name.release, 70 .data = init_uts_ns.name.release,
94 .maxlen = sizeof(init_uts_ns.name.release), 71 .maxlen = sizeof(init_uts_ns.name.release),
95 .mode = 0444, 72 .mode = 0444,
96 .proc_handler = proc_do_uts_string, 73 .proc_handler = proc_do_uts_string,
97 .strategy = sysctl_uts_string,
98 }, 74 },
99 { 75 {
100 .ctl_name = KERN_VERSION,
101 .procname = "version", 76 .procname = "version",
102 .data = init_uts_ns.name.version, 77 .data = init_uts_ns.name.version,
103 .maxlen = sizeof(init_uts_ns.name.version), 78 .maxlen = sizeof(init_uts_ns.name.version),
104 .mode = 0444, 79 .mode = 0444,
105 .proc_handler = proc_do_uts_string, 80 .proc_handler = proc_do_uts_string,
106 .strategy = sysctl_uts_string,
107 }, 81 },
108 { 82 {
109 .ctl_name = KERN_NODENAME,
110 .procname = "hostname", 83 .procname = "hostname",
111 .data = init_uts_ns.name.nodename, 84 .data = init_uts_ns.name.nodename,
112 .maxlen = sizeof(init_uts_ns.name.nodename), 85 .maxlen = sizeof(init_uts_ns.name.nodename),
113 .mode = 0644, 86 .mode = 0644,
114 .proc_handler = proc_do_uts_string, 87 .proc_handler = proc_do_uts_string,
115 .strategy = sysctl_uts_string,
116 }, 88 },
117 { 89 {
118 .ctl_name = KERN_DOMAINNAME,
119 .procname = "domainname", 90 .procname = "domainname",
120 .data = init_uts_ns.name.domainname, 91 .data = init_uts_ns.name.domainname,
121 .maxlen = sizeof(init_uts_ns.name.domainname), 92 .maxlen = sizeof(init_uts_ns.name.domainname),
122 .mode = 0644, 93 .mode = 0644,
123 .proc_handler = proc_do_uts_string, 94 .proc_handler = proc_do_uts_string,
124 .strategy = sysctl_uts_string,
125 }, 95 },
126 {} 96 {}
127}; 97};
128 98
129static struct ctl_table uts_root_table[] = { 99static struct ctl_table uts_root_table[] = {
130 { 100 {
131 .ctl_name = CTL_KERN,
132 .procname = "kernel", 101 .procname = "kernel",
133 .mode = 0555, 102 .mode = 0555,
134 .child = uts_kern_table, 103 .child = uts_kern_table,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae81..dee48658805c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
68#endif 68#endif
69}; 69};
70 70
71#ifdef CONFIG_DEBUG_OBJECTS_WORK
72
73static struct debug_obj_descr work_debug_descr;
74
75/*
76 * fixup_init is called when:
77 * - an active object is initialized
78 */
79static int work_fixup_init(void *addr, enum debug_obj_state state)
80{
81 struct work_struct *work = addr;
82
83 switch (state) {
84 case ODEBUG_STATE_ACTIVE:
85 cancel_work_sync(work);
86 debug_object_init(work, &work_debug_descr);
87 return 1;
88 default:
89 return 0;
90 }
91}
92
93/*
94 * fixup_activate is called when:
95 * - an active object is activated
96 * - an unknown object is activated (might be a statically initialized object)
97 */
98static int work_fixup_activate(void *addr, enum debug_obj_state state)
99{
100 struct work_struct *work = addr;
101
102 switch (state) {
103
104 case ODEBUG_STATE_NOTAVAILABLE:
105 /*
106 * This is not really a fixup. The work struct was
107 * statically initialized. We just make sure that it
108 * is tracked in the object tracker.
109 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr);
113 return 0;
114 }
115 WARN_ON_ONCE(1);
116 return 0;
117
118 case ODEBUG_STATE_ACTIVE:
119 WARN_ON(1);
120
121 default:
122 return 0;
123 }
124}
125
126/*
127 * fixup_free is called when:
128 * - an active object is freed
129 */
130static int work_fixup_free(void *addr, enum debug_obj_state state)
131{
132 struct work_struct *work = addr;
133
134 switch (state) {
135 case ODEBUG_STATE_ACTIVE:
136 cancel_work_sync(work);
137 debug_object_free(work, &work_debug_descr);
138 return 1;
139 default:
140 return 0;
141 }
142}
143
144static struct debug_obj_descr work_debug_descr = {
145 .name = "work_struct",
146 .fixup_init = work_fixup_init,
147 .fixup_activate = work_fixup_activate,
148 .fixup_free = work_fixup_free,
149};
150
151static inline void debug_work_activate(struct work_struct *work)
152{
153 debug_object_activate(work, &work_debug_descr);
154}
155
156static inline void debug_work_deactivate(struct work_struct *work)
157{
158 debug_object_deactivate(work, &work_debug_descr);
159}
160
161void __init_work(struct work_struct *work, int onstack)
162{
163 if (onstack)
164 debug_object_init_on_stack(work, &work_debug_descr);
165 else
166 debug_object_init(work, &work_debug_descr);
167}
168EXPORT_SYMBOL_GPL(__init_work);
169
170void destroy_work_on_stack(struct work_struct *work)
171{
172 debug_object_free(work, &work_debug_descr);
173}
174EXPORT_SYMBOL_GPL(destroy_work_on_stack);
175
176#else
177static inline void debug_work_activate(struct work_struct *work) { }
178static inline void debug_work_deactivate(struct work_struct *work) { }
179#endif
180
71/* Serializes the accesses to the list of workqueues. */ 181/* Serializes the accesses to the list of workqueues. */
72static DEFINE_SPINLOCK(workqueue_lock); 182static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 183static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
145{ 255{
146 unsigned long flags; 256 unsigned long flags;
147 257
258 debug_work_activate(work);
148 spin_lock_irqsave(&cwq->lock, flags); 259 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, &cwq->worklist); 260 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 261 spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
280 struct lockdep_map lockdep_map = work->lockdep_map; 391 struct lockdep_map lockdep_map = work->lockdep_map;
281#endif 392#endif
282 trace_workqueue_execution(cwq->thread, work); 393 trace_workqueue_execution(cwq->thread, work);
394 debug_work_deactivate(work);
283 cwq->current_work = work; 395 cwq->current_work = work;
284 list_del_init(cwq->worklist.next); 396 list_del_init(cwq->worklist.next);
285 spin_unlock_irq(&cwq->lock); 397 spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
350static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 462static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
351 struct wq_barrier *barr, struct list_head *head) 463 struct wq_barrier *barr, struct list_head *head)
352{ 464{
353 INIT_WORK(&barr->work, wq_barrier_func); 465 /*
466 * debugobject calls are safe here even with cwq->lock locked
467 * as we know for sure that this will not trigger any of the
468 * checks and call back into the fixup functions where we
469 * might deadlock.
470 */
471 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
354 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 472 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
355 473
356 init_completion(&barr->done); 474 init_completion(&barr->done);
357 475
476 debug_work_activate(&barr->work);
358 insert_work(cwq, &barr->work, head); 477 insert_work(cwq, &barr->work, head);
359} 478}
360 479
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
372 } 491 }
373 spin_unlock_irq(&cwq->lock); 492 spin_unlock_irq(&cwq->lock);
374 493
375 if (active) 494 if (active) {
376 wait_for_completion(&barr.done); 495 wait_for_completion(&barr.done);
496 destroy_work_on_stack(&barr.work);
497 }
377 498
378 return active; 499 return active;
379} 500}
@@ -451,6 +572,7 @@ out:
451 return 0; 572 return 0;
452 573
453 wait_for_completion(&barr.done); 574 wait_for_completion(&barr.done);
575 destroy_work_on_stack(&barr.work);
454 return 1; 576 return 1;
455} 577}
456EXPORT_SYMBOL_GPL(flush_work); 578EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
485 */ 607 */
486 smp_rmb(); 608 smp_rmb();
487 if (cwq == get_wq_data(work)) { 609 if (cwq == get_wq_data(work)) {
610 debug_work_deactivate(work);
488 list_del_init(&work->entry); 611 list_del_init(&work->entry);
489 ret = 1; 612 ret = 1;
490 } 613 }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
507 } 630 }
508 spin_unlock_irq(&cwq->lock); 631 spin_unlock_irq(&cwq->lock);
509 632
510 if (unlikely(running)) 633 if (unlikely(running)) {
511 wait_for_completion(&barr.done); 634 wait_for_completion(&barr.done);
635 destroy_work_on_stack(&barr.work);
636 }
512} 637}
513 638
514static void wait_on_work(struct work_struct *work) 639static void wait_on_work(struct work_struct *work)