Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 202
-rw-r--r--  kernel/Makefile | 8
-rw-r--r--  kernel/acct.c | 3
-rw-r--r--  kernel/audit.c | 18
-rw-r--r--  kernel/audit_watch.c | 2
-rw-r--r--  kernel/auditsc.c | 6
-rw-r--r--  kernel/capability.c | 15
-rw-r--r--  kernel/cgroup.c | 1130
-rw-r--r--  kernel/cgroup_debug.c | 105
-rw-r--r--  kernel/cgroup_freezer.c | 15
-rw-r--r--  kernel/cpu.c | 23
-rw-r--r--  kernel/cpuset.c | 111
-rw-r--r--  kernel/cred.c | 19
-rw-r--r--  kernel/exit.c | 185
-rw-r--r--  kernel/fork.c | 69
-rw-r--r--  kernel/futex.c | 220
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/hrtimer.c | 225
-rw-r--r--  kernel/hung_task.c | 6
-rw-r--r--  kernel/hw_breakpoint.c | 453
-rw-r--r--  kernel/irq/autoprobe.c | 20
-rw-r--r--  kernel/irq/chip.c | 92
-rw-r--r--  kernel/irq/handle.c | 23
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/irq/manage.c | 52
-rw-r--r--  kernel/irq/migration.c | 2
-rw-r--r--  kernel/irq/numa_migrate.c | 8
-rw-r--r--  kernel/irq/pm.c | 8
-rw-r--r--  kernel/irq/proc.c | 44
-rw-r--r--  kernel/irq/spurious.c | 32
-rw-r--r--  kernel/itimer.c | 7
-rw-r--r--  kernel/kallsyms.c | 1
-rw-r--r--  kernel/kexec.c | 59
-rw-r--r--  kernel/kgdb.c | 58
-rw-r--r--  kernel/kmod.c | 21
-rw-r--r--  kernel/kprobes.c | 76
-rw-r--r--  kernel/ksysfs.c | 21
-rw-r--r--  kernel/kthread.c | 23
-rw-r--r--  kernel/lockdep.c | 69
-rw-r--r--  kernel/module.c | 316
-rw-r--r--  kernel/mutex-debug.c | 1
-rw-r--r--  kernel/mutex-debug.h | 12
-rw-r--r--  kernel/mutex.c | 4
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/ns_cgroup.c | 16
-rw-r--r--  kernel/panic.c | 3
-rw-r--r--  kernel/params.c | 28
-rw-r--r--  kernel/perf_event.c | 1114
-rw-r--r--  kernel/pid.c | 12
-rw-r--r--  kernel/pid_namespace.c | 2
-rw-r--r--  kernel/pm_qos_params.c | 20
-rw-r--r--  kernel/posix-cpu-timers.c | 5
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/console.c | 7
-rw-r--r--  kernel/power/hibernate.c | 41
-rw-r--r--  kernel/power/main.c | 1
-rw-r--r--  kernel/power/process.c | 14
-rw-r--r--  kernel/power/suspend_test.c | 5
-rw-r--r--  kernel/power/swap.c | 147
-rw-r--r--  kernel/power/swsusp.c | 130
-rw-r--r--  kernel/printk.c | 7
-rw-r--r--  kernel/ptrace.c | 11
-rw-r--r--  kernel/rcupdate.c | 260
-rw-r--r--  kernel/rcutiny.c | 282
-rw-r--r--  kernel/rcutorture.c | 77
-rw-r--r--  kernel/rcutree.c | 787
-rw-r--r--  kernel/rcutree.h | 160
-rw-r--r--  kernel/rcutree_plugin.h | 442
-rw-r--r--  kernel/rcutree_trace.c | 26
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/res_counter.c | 3
-rw-r--r--  kernel/resource.c | 26
-rw-r--r--  kernel/rtmutex-debug.c | 4
-rw-r--r--  kernel/rtmutex.c | 106
-rw-r--r--  kernel/sched.c | 824
-rw-r--r--  kernel/sched_clock.c | 4
-rw-r--r--  kernel/sched_cpupri.c | 10
-rw-r--r--  kernel/sched_cpupri.h | 2
-rw-r--r--  kernel/sched_debug.c | 21
-rw-r--r--  kernel/sched_fair.c | 268
-rw-r--r--  kernel/sched_features.h | 5
-rw-r--r--  kernel/sched_idletask.c | 6
-rw-r--r--  kernel/sched_rt.c | 123
-rw-r--r--  kernel/signal.c | 279
-rw-r--r--  kernel/slow-work-debugfs.c | 227
-rw-r--r--  kernel/slow-work.c | 531
-rw-r--r--  kernel/slow-work.h | 72
-rw-r--r--  kernel/smp.c | 98
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/softlockup.c | 58
-rw-r--r--  kernel/spinlock.c | 448
-rw-r--r--  kernel/srcu.c | 74
-rw-r--r--  kernel/sys.c | 80
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  kernel/sysctl.c | 1029
-rw-r--r--  kernel/sysctl_binary.c | 1507
-rw-r--r--  kernel/sysctl_check.c | 1378
-rw-r--r--  kernel/time.c | 31
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clockevents.c | 27
-rw-r--r--  kernel/time/clocksource.c | 109
-rw-r--r--  kernel/time/tick-broadcast.c | 42
-rw-r--r--  kernel/time/tick-common.c | 20
-rw-r--r--  kernel/time/tick-internal.h | 1
-rw-r--r--  kernel/time/tick-oneshot.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 150
-rw-r--r--  kernel/time/timecompare.c | 8
-rw-r--r--  kernel/time/timeconv.c | 127
-rw-r--r--  kernel/time/timekeeping.c | 126
-rw-r--r--  kernel/time/timer_list.c | 23
-rw-r--r--  kernel/time/timer_stats.c | 20
-rw-r--r--  kernel/trace/Kconfig | 38
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 39
-rw-r--r--  kernel/trace/ftrace.c | 437
-rw-r--r--  kernel/trace/kmemtrace.c | 2
-rw-r--r--  kernel/trace/ring_buffer.c | 54
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 85
-rw-r--r--  kernel/trace/trace.c | 153
-rw-r--r--  kernel/trace/trace.h | 84
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 16
-rw-r--r--  kernel/trace/trace_entries.h | 16
-rw-r--r--  kernel/trace/trace_event_profile.c | 50
-rw-r--r--  kernel/trace/trace_events.c | 198
-rw-r--r--  kernel/trace/trace_events_filter.c | 426
-rw-r--r--  kernel/trace/trace_export.c | 43
-rw-r--r--  kernel/trace/trace_functions_graph.c | 169
-rw-r--r--  kernel/trace/trace_hw_branches.c | 59
-rw-r--r--  kernel/trace/trace_kprobe.c | 1542
-rw-r--r--  kernel/trace/trace_ksym.c | 551
-rw-r--r--  kernel/trace/trace_output.c | 98
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 16
-rw-r--r--  kernel/trace/trace_selftest.c | 59
-rw-r--r--  kernel/trace/trace_stack.c | 20
-rw-r--r--  kernel/trace/trace_syscalls.c | 231
-rw-r--r--  kernel/uid16.c | 1
-rw-r--r--  kernel/user-return-notifier.c | 44
-rw-r--r--  kernel/user.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 35
-rw-r--r--  kernel/workqueue.c | 166
141 files changed, 13274 insertions, 6626 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000000..88c92fb44618
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,202 @@
#
# The ARCH_INLINE foo is necessary because select ignores "depends on"
#
config ARCH_INLINE_SPIN_TRYLOCK
	bool

config ARCH_INLINE_SPIN_TRYLOCK_BH
	bool

config ARCH_INLINE_SPIN_LOCK
	bool

config ARCH_INLINE_SPIN_LOCK_BH
	bool

config ARCH_INLINE_SPIN_LOCK_IRQ
	bool

config ARCH_INLINE_SPIN_LOCK_IRQSAVE
	bool

config ARCH_INLINE_SPIN_UNLOCK
	bool

config ARCH_INLINE_SPIN_UNLOCK_BH
	bool

config ARCH_INLINE_SPIN_UNLOCK_IRQ
	bool

config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
	bool


config ARCH_INLINE_READ_TRYLOCK
	bool

config ARCH_INLINE_READ_LOCK
	bool

config ARCH_INLINE_READ_LOCK_BH
	bool

config ARCH_INLINE_READ_LOCK_IRQ
	bool

config ARCH_INLINE_READ_LOCK_IRQSAVE
	bool

config ARCH_INLINE_READ_UNLOCK
	bool

config ARCH_INLINE_READ_UNLOCK_BH
	bool

config ARCH_INLINE_READ_UNLOCK_IRQ
	bool

config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
	bool


config ARCH_INLINE_WRITE_TRYLOCK
	bool

config ARCH_INLINE_WRITE_LOCK
	bool

config ARCH_INLINE_WRITE_LOCK_BH
	bool

config ARCH_INLINE_WRITE_LOCK_IRQ
	bool

config ARCH_INLINE_WRITE_LOCK_IRQSAVE
	bool

config ARCH_INLINE_WRITE_UNLOCK
	bool

config ARCH_INLINE_WRITE_UNLOCK_BH
	bool

config ARCH_INLINE_WRITE_UNLOCK_IRQ
	bool

config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
	bool

#
# lock_* functions are inlined when:
#   - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
#
# trylock_* functions are inlined when:
#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
#
# unlock and unlock_irq functions are inlined when:
#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
#  or
#   - DEBUG_SPINLOCK=n and PREEMPT=n
#
# unlock_bh and unlock_irqrestore functions are inlined when:
#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
#

config INLINE_SPIN_TRYLOCK
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK

config INLINE_SPIN_TRYLOCK_BH
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH

config INLINE_SPIN_LOCK
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK

config INLINE_SPIN_LOCK_BH
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
		 ARCH_INLINE_SPIN_LOCK_BH

config INLINE_SPIN_LOCK_IRQ
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
		 ARCH_INLINE_SPIN_LOCK_IRQ

config INLINE_SPIN_LOCK_IRQSAVE
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
		 ARCH_INLINE_SPIN_LOCK_IRQSAVE

config INLINE_SPIN_UNLOCK
	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)

config INLINE_SPIN_UNLOCK_BH
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH

config INLINE_SPIN_UNLOCK_IRQ
	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)

config INLINE_SPIN_UNLOCK_IRQRESTORE
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE


config INLINE_READ_TRYLOCK
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK

config INLINE_READ_LOCK
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK

config INLINE_READ_LOCK_BH
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
		 ARCH_INLINE_READ_LOCK_BH

config INLINE_READ_LOCK_IRQ
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
		 ARCH_INLINE_READ_LOCK_IRQ

config INLINE_READ_LOCK_IRQSAVE
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
		 ARCH_INLINE_READ_LOCK_IRQSAVE

config INLINE_READ_UNLOCK
	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)

config INLINE_READ_UNLOCK_BH
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH

config INLINE_READ_UNLOCK_IRQ
	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)

config INLINE_READ_UNLOCK_IRQRESTORE
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE


config INLINE_WRITE_TRYLOCK
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK

config INLINE_WRITE_LOCK
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK

config INLINE_WRITE_LOCK_BH
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
		 ARCH_INLINE_WRITE_LOCK_BH

config INLINE_WRITE_LOCK_IRQ
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
		 ARCH_INLINE_WRITE_LOCK_IRQ

config INLINE_WRITE_LOCK_IRQSAVE
	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
		 ARCH_INLINE_WRITE_LOCK_IRQSAVE

config INLINE_WRITE_UNLOCK
	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)

config INLINE_WRITE_UNLOCK_BH
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH

config INLINE_WRITE_UNLOCK_IRQ
	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)

config INLINE_WRITE_UNLOCK_IRQRESTORE
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE

config MUTEX_SPIN_ON_OWNER
	def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
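
The comment block in the new Kconfig.locks above spells out when the generic lock functions get inlined. As a purely illustrative sketch (not part of this patch; MYARCH is a made-up symbol), an architecture whose unlock fastpaths are small enough to be worth inlining would opt in from its own Kconfig by selecting the corresponding ARCH_INLINE_* switches:

config MYARCH
	def_bool y
	# With these selected, the matching INLINE_*_UNLOCK options above
	# turn on automatically whenever DEBUG_SPINLOCK is disabled.
	select ARCH_INLINE_SPIN_UNLOCK
	select ARCH_INLINE_SPIN_UNLOCK_IRQ
	select ARCH_INLINE_READ_UNLOCK
	select ARCH_INLINE_WRITE_UNLOCK

The lock_* variants additionally require GENERIC_LOCKBREAK=n, as the INLINE_*_LOCK definitions above encode.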
diff --git a/kernel/Makefile b/kernel/Makefile
index 187c89b4783d..864ff75d65f2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
24endif 25endif
25 26
26obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
@@ -58,7 +59,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 59obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
59obj-$(CONFIG_COMPAT) += compat.o 60obj-$(CONFIG_COMPAT) += compat.o
60obj-$(CONFIG_CGROUPS) += cgroup.o 61obj-$(CONFIG_CGROUPS) += cgroup.o
61obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
63obj-$(CONFIG_CPUSETS) += cpuset.o 63obj-$(CONFIG_CPUSETS) += cpuset.o
64obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 64obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
@@ -83,6 +83,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
83obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
86obj-$(CONFIG_TINY_RCU) += rcutiny.o
86obj-$(CONFIG_RELAY) += relay.o 87obj-$(CONFIG_RELAY) += relay.o
87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 88obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 89obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -95,7 +96,10 @@ obj-$(CONFIG_X86_DS) += trace/
95obj-$(CONFIG_RING_BUFFER) += trace/ 96obj-$(CONFIG_RING_BUFFER) += trace/
96obj-$(CONFIG_SMP) += sched_cpupri.o 97obj-$(CONFIG_SMP) += sched_cpupri.o
97obj-$(CONFIG_SLOW_WORK) += slow-work.o 98obj-$(CONFIG_SLOW_WORK) += slow-work.o
99obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
98obj-$(CONFIG_PERF_EVENTS) += perf_event.o 100obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
99 103
100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 9a4715a2f6bf..a6605ca921b6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -536,7 +536,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
536 do_div(elapsed, AHZ); 536 do_div(elapsed, AHZ);
537 ac.ac_btime = get_seconds() - elapsed; 537 ac.ac_btime = get_seconds() - elapsed;
538 /* we really need to bite the bullet and change layout */ 538 /* we really need to bite the bullet and change layout */
539 current_uid_gid(&ac.ac_uid, &ac.ac_gid); 539 ac.ac_uid = orig_cred->uid;
540 ac.ac_gid = orig_cred->gid;
540#if ACCT_VERSION==2 541#if ACCT_VERSION==2
541 ac.ac_ahz = AHZ; 542 ac.ac_ahz = AHZ;
542#endif 543#endif
diff --git a/kernel/audit.c b/kernel/audit.c
index defc2e6f1e3b..5feed232be9d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
855 break; 855 break;
856 } 856 }
857 case AUDIT_SIGNAL_INFO: 857 case AUDIT_SIGNAL_INFO:
858 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); 858 len = 0;
859 if (err) 859 if (audit_sig_sid) {
860 return err; 860 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
861 if (err)
862 return err;
863 }
861 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); 864 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
862 if (!sig_data) { 865 if (!sig_data) {
863 security_release_secctx(ctx, len); 866 if (audit_sig_sid)
867 security_release_secctx(ctx, len);
864 return -ENOMEM; 868 return -ENOMEM;
865 } 869 }
866 sig_data->uid = audit_sig_uid; 870 sig_data->uid = audit_sig_uid;
867 sig_data->pid = audit_sig_pid; 871 sig_data->pid = audit_sig_pid;
868 memcpy(sig_data->ctx, ctx, len); 872 if (audit_sig_sid) {
869 security_release_secctx(ctx, len); 873 memcpy(sig_data->ctx, ctx, len);
874 security_release_secctx(ctx, len);
875 }
870 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 876 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
871 0, 0, sig_data, sizeof(*sig_data) + len); 877 0, 0, sig_data, sizeof(*sig_data) + len);
872 kfree(sig_data); 878 kfree(sig_data);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0e96dbc60ea9..cc7e87936cbc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -45,8 +45,8 @@
45 45
46struct audit_watch { 46struct audit_watch {
47 atomic_t count; /* reference count */ 47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */ 48 dev_t dev; /* associated superblock device */
49 char *path; /* insertion path */
50 unsigned long ino; /* associated inode number */ 50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */ 51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */ 52 struct list_head wlist; /* entry in parent->watches list */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 68d3c6a0ecd6..267e484f0198 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -168,12 +168,12 @@ struct audit_context {
168 int in_syscall; /* 1 if task is in a syscall */ 168 int in_syscall; /* 1 if task is in a syscall */
169 enum audit_state state, current_state; 169 enum audit_state state, current_state;
170 unsigned int serial; /* serial number for record */ 170 unsigned int serial; /* serial number for record */
171 struct timespec ctime; /* time of syscall entry */
172 int major; /* syscall number */ 171 int major; /* syscall number */
172 struct timespec ctime; /* time of syscall entry */
173 unsigned long argv[4]; /* syscall arguments */ 173 unsigned long argv[4]; /* syscall arguments */
174 int return_valid; /* return code is valid */
175 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
176 u64 prio; 175 u64 prio;
176 int return_valid; /* return code is valid */
177 int name_count; 177 int name_count;
178 struct audit_names names[AUDIT_NAMES]; 178 struct audit_names names[AUDIT_NAMES];
179 char * filterkey; /* key for rule that triggered record */ 179 char * filterkey; /* key for rule that triggered record */
@@ -198,8 +198,8 @@ struct audit_context {
198 char target_comm[TASK_COMM_LEN]; 198 char target_comm[TASK_COMM_LEN];
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count;
202 struct list_head killed_trees; 201 struct list_head killed_trees;
202 int tree_count;
203 203
204 int type; 204 int type;
205 union { 205 union {
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f5..7f876e60521f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
29EXPORT_SYMBOL(__cap_full_set); 29EXPORT_SYMBOL(__cap_full_set);
30EXPORT_SYMBOL(__cap_init_eff_set); 30EXPORT_SYMBOL(__cap_init_eff_set);
31 31
32#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
33int file_caps_enabled = 1; 32int file_caps_enabled = 1;
34 33
35static int __init file_caps_disable(char *str) 34static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
38 return 1; 37 return 1;
39} 38}
40__setup("no_file_caps", file_caps_disable); 39__setup("no_file_caps", file_caps_disable);
41#endif
42 40
43/* 41/*
44 * More recent versions of libcap are available from: 42 * More recent versions of libcap are available from:
@@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
169 kernel_cap_t pE, pI, pP; 167 kernel_cap_t pE, pI, pP;
170 168
171 ret = cap_validate_magic(header, &tocopy); 169 ret = cap_validate_magic(header, &tocopy);
172 if (ret != 0) 170 if ((dataptr == NULL) || (ret != 0))
173 return ret; 171 return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
174 172
175 if (get_user(pid, &header->pid)) 173 if (get_user(pid, &header->pid))
176 return -EFAULT; 174 return -EFAULT;
@@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
238SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) 236SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
239{ 237{
240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 238 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
241 unsigned i, tocopy; 239 unsigned i, tocopy, copybytes;
242 kernel_cap_t inheritable, permitted, effective; 240 kernel_cap_t inheritable, permitted, effective;
243 struct cred *new; 241 struct cred *new;
244 int ret; 242 int ret;
@@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
255 if (pid != 0 && pid != task_pid_vnr(current)) 253 if (pid != 0 && pid != task_pid_vnr(current))
256 return -EPERM; 254 return -EPERM;
257 255
258 if (copy_from_user(&kdata, data, 256 copybytes = tocopy * sizeof(struct __user_cap_data_struct);
259 tocopy * sizeof(struct __user_cap_data_struct))) 257 if (copybytes > sizeof(kdata))
258 return -EFAULT;
259
260 if (copy_from_user(&kdata, data, copybytes))
260 return -EFAULT; 261 return -EFAULT;
261 262
262 for (i = 0; i < tocopy; i++) { 263 for (i = 0; i < tocopy; i++) {
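The capget() change above makes a NULL data pointer a valid way to probe the kernel's preferred capability ABI version instead of getting -EINVAL back. A minimal userspace sketch of that probe (illustrative only, not part of the patch):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
	struct __user_cap_header_struct hdr = {
		.version = 0,	/* deliberately invalid version */
		.pid = 0,	/* current task */
	};

	/* With dataptr == NULL, cap_validate_magic() still writes the
	 * preferred version into hdr.version, and the hunk above turns
	 * the would-be -EINVAL into a successful return. */
	long ret = syscall(SYS_capget, &hdr, NULL);

	printf("capget probe: ret=%ld, preferred version=0x%x\n",
	       ret, hdr.version);
	return 0;
}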
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cd83d9933b6b..0249f4be9b5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/ctype.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
@@ -48,6 +49,8 @@
48#include <linux/namei.h> 49#include <linux/namei.h>
49#include <linux/smp_lock.h> 50#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
52#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
51 54
52#include <asm/atomic.h> 55#include <asm/atomic.h>
53 56
@@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = {
60#include <linux/cgroup_subsys.h> 63#include <linux/cgroup_subsys.h>
61}; 64};
62 65
66#define MAX_CGROUP_ROOT_NAMELEN 64
67
63/* 68/*
64 * A cgroupfs_root represents the root of a cgroup hierarchy, 69 * A cgroupfs_root represents the root of a cgroup hierarchy,
65 * and may be associated with a superblock to form an active 70 * and may be associated with a superblock to form an active
@@ -74,6 +79,9 @@ struct cgroupfs_root {
74 */ 79 */
75 unsigned long subsys_bits; 80 unsigned long subsys_bits;
76 81
82 /* Unique id for this hierarchy. */
83 int hierarchy_id;
84
77 /* The bitmask of subsystems currently attached to this hierarchy */ 85 /* The bitmask of subsystems currently attached to this hierarchy */
78 unsigned long actual_subsys_bits; 86 unsigned long actual_subsys_bits;
79 87
@@ -94,6 +102,9 @@ struct cgroupfs_root {
94 102
95 /* The path to use for release notifications. */ 103 /* The path to use for release notifications. */
96 char release_agent_path[PATH_MAX]; 104 char release_agent_path[PATH_MAX];
105
106 /* The name for this hierarchy - may be empty */
107 char name[MAX_CGROUP_ROOT_NAMELEN];
97}; 108};
98 109
99/* 110/*
@@ -141,6 +152,10 @@ struct css_id {
141static LIST_HEAD(roots); 152static LIST_HEAD(roots);
142static int root_count; 153static int root_count;
143 154
155static DEFINE_IDA(hierarchy_ida);
156static int next_hierarchy_id;
157static DEFINE_SPINLOCK(hierarchy_id_lock);
158
144/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 159/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
145#define dummytop (&rootnode.top_cgroup) 160#define dummytop (&rootnode.top_cgroup)
146 161
@@ -201,6 +216,7 @@ struct cg_cgroup_link {
201 * cgroup, anchored on cgroup->css_sets 216 * cgroup, anchored on cgroup->css_sets
202 */ 217 */
203 struct list_head cgrp_link_list; 218 struct list_head cgrp_link_list;
219 struct cgroup *cgrp;
204 /* 220 /*
205 * List running through cg_cgroup_links pointing at a 221 * List running through cg_cgroup_links pointing at a
206 * single css_set object, anchored on css_set->cg_links 222 * single css_set object, anchored on css_set->cg_links
@@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
227static DEFINE_RWLOCK(css_set_lock); 243static DEFINE_RWLOCK(css_set_lock);
228static int css_set_count; 244static int css_set_count;
229 245
230/* hash table for cgroup groups. This improves the performance to 246/*
231 * find an existing css_set */ 247 * hash table for cgroup groups. This improves the performance to find
248 * an existing css_set. This hash doesn't (currently) take into
249 * account cgroups in empty hierarchies.
250 */
232#define CSS_SET_HASH_BITS 7 251#define CSS_SET_HASH_BITS 7
233#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 252#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
234static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; 253static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
248 return &css_set_table[index]; 267 return &css_set_table[index];
249} 268}
250 269
270static void free_css_set_rcu(struct rcu_head *obj)
271{
272 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
273 kfree(cg);
274}
275
251/* We don't maintain the lists running through each css_set to its 276/* We don't maintain the lists running through each css_set to its
252 * task until after the first call to cgroup_iter_start(). This 277 * task until after the first call to cgroup_iter_start(). This
253 * reduces the fork()/exit() overhead for people who have cgroups 278 * reduces the fork()/exit() overhead for people who have cgroups
254 * compiled into their kernel but not actually in use */ 279 * compiled into their kernel but not actually in use */
255static int use_task_css_set_links __read_mostly; 280static int use_task_css_set_links __read_mostly;
256 281
257/* When we create or destroy a css_set, the operation simply 282static void __put_css_set(struct css_set *cg, int taskexit)
258 * takes/releases a reference count on all the cgroups referenced
259 * by subsystems in this css_set. This can end up multiple-counting
260 * some cgroups, but that's OK - the ref-count is just a
261 * busy/not-busy indicator; ensuring that we only count each cgroup
262 * once would require taking a global lock to ensure that no
263 * subsystems moved between hierarchies while we were doing so.
264 *
265 * Possible TODO: decide at boot time based on the number of
266 * registered subsystems and the number of CPUs or NUMA nodes whether
267 * it's better for performance to ref-count every subsystem, or to
268 * take a global lock and only add one ref count to each hierarchy.
269 */
270
271/*
272 * unlink a css_set from the list and free it
273 */
274static void unlink_css_set(struct css_set *cg)
275{ 283{
276 struct cg_cgroup_link *link; 284 struct cg_cgroup_link *link;
277 struct cg_cgroup_link *saved_link; 285 struct cg_cgroup_link *saved_link;
278
279 hlist_del(&cg->hlist);
280 css_set_count--;
281
282 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
283 cg_link_list) {
284 list_del(&link->cg_link_list);
285 list_del(&link->cgrp_link_list);
286 kfree(link);
287 }
288}
289
290static void __put_css_set(struct css_set *cg, int taskexit)
291{
292 int i;
293 /* 286 /*
294 * Ensure that the refcount doesn't hit zero while any readers 287 * Ensure that the refcount doesn't hit zero while any readers
295 * can see it. Similar to atomic_dec_and_lock(), but for an 288 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit)
302 write_unlock(&css_set_lock); 295 write_unlock(&css_set_lock);
303 return; 296 return;
304 } 297 }
305 unlink_css_set(cg);
306 write_unlock(&css_set_lock);
307 298
308 rcu_read_lock(); 299 /* This css_set is dead. unlink it and release cgroup refcounts */
309 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 300 hlist_del(&cg->hlist);
310 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); 301 css_set_count--;
302
303 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
304 cg_link_list) {
305 struct cgroup *cgrp = link->cgrp;
306 list_del(&link->cg_link_list);
307 list_del(&link->cgrp_link_list);
311 if (atomic_dec_and_test(&cgrp->count) && 308 if (atomic_dec_and_test(&cgrp->count) &&
312 notify_on_release(cgrp)) { 309 notify_on_release(cgrp)) {
313 if (taskexit) 310 if (taskexit)
314 set_bit(CGRP_RELEASABLE, &cgrp->flags); 311 set_bit(CGRP_RELEASABLE, &cgrp->flags);
315 check_for_release(cgrp); 312 check_for_release(cgrp);
316 } 313 }
314
315 kfree(link);
317 } 316 }
318 rcu_read_unlock(); 317
319 kfree(cg); 318 write_unlock(&css_set_lock);
319 call_rcu(&cg->rcu_head, free_css_set_rcu);
320} 320}
321 321
322/* 322/*
@@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
338} 338}
339 339
340/* 340/*
341 * compare_css_sets - helper function for find_existing_css_set().
342 * @cg: candidate css_set being tested
343 * @old_cg: existing css_set for a task
344 * @new_cgrp: cgroup that's being entered by the task
345 * @template: desired set of css pointers in css_set (pre-calculated)
346 *
347 * Returns true if "cg" matches "old_cg" except for the hierarchy
348 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
349 */
350static bool compare_css_sets(struct css_set *cg,
351 struct css_set *old_cg,
352 struct cgroup *new_cgrp,
353 struct cgroup_subsys_state *template[])
354{
355 struct list_head *l1, *l2;
356
357 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
358 /* Not all subsystems matched */
359 return false;
360 }
361
362 /*
363 * Compare cgroup pointers in order to distinguish between
364 * different cgroups in heirarchies with no subsystems. We
365 * could get by with just this check alone (and skip the
366 * memcmp above) but on most setups the memcmp check will
367 * avoid the need for this more expensive check on almost all
368 * candidates.
369 */
370
371 l1 = &cg->cg_links;
372 l2 = &old_cg->cg_links;
373 while (1) {
374 struct cg_cgroup_link *cgl1, *cgl2;
375 struct cgroup *cg1, *cg2;
376
377 l1 = l1->next;
378 l2 = l2->next;
379 /* See if we reached the end - both lists are equal length. */
380 if (l1 == &cg->cg_links) {
381 BUG_ON(l2 != &old_cg->cg_links);
382 break;
383 } else {
384 BUG_ON(l2 == &old_cg->cg_links);
385 }
386 /* Locate the cgroups associated with these links. */
387 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
388 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
389 cg1 = cgl1->cgrp;
390 cg2 = cgl2->cgrp;
391 /* Hierarchies should be linked in the same order. */
392 BUG_ON(cg1->root != cg2->root);
393
394 /*
395 * If this hierarchy is the hierarchy of the cgroup
396 * that's changing, then we need to check that this
397 * css_set points to the new cgroup; if it's any other
398 * hierarchy, then this css_set should point to the
399 * same cgroup as the old css_set.
400 */
401 if (cg1->root == new_cgrp->root) {
402 if (cg1 != new_cgrp)
403 return false;
404 } else {
405 if (cg1 != cg2)
406 return false;
407 }
408 }
409 return true;
410}
411
412/*
341 * find_existing_css_set() is a helper for 413 * find_existing_css_set() is a helper for
342 * find_css_set(), and checks to see whether an existing 414 * find_css_set(), and checks to see whether an existing
343 * css_set is suitable. 415 * css_set is suitable.
@@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set(
378 450
379 hhead = css_set_hash(template); 451 hhead = css_set_hash(template);
380 hlist_for_each_entry(cg, node, hhead, hlist) { 452 hlist_for_each_entry(cg, node, hhead, hlist) {
381 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 453 if (!compare_css_sets(cg, oldcg, cgrp, template))
382 /* All subsystems matched */ 454 continue;
383 return cg; 455
384 } 456 /* This css_set matches what we need */
457 return cg;
385 } 458 }
386 459
387 /* No existing cgroup group matched */ 460 /* No existing cgroup group matched */
@@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links,
435 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 508 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
436 cgrp_link_list); 509 cgrp_link_list);
437 link->cg = cg; 510 link->cg = cg;
511 link->cgrp = cgrp;
512 atomic_inc(&cgrp->count);
438 list_move(&link->cgrp_link_list, &cgrp->css_sets); 513 list_move(&link->cgrp_link_list, &cgrp->css_sets);
439 list_add(&link->cg_link_list, &cg->cg_links); 514 /*
515 * Always add links to the tail of the list so that the list
516 * is sorted by order of hierarchy creation
517 */
518 list_add_tail(&link->cg_link_list, &cg->cg_links);
440} 519}
441 520
442/* 521/*
@@ -451,11 +530,11 @@ static struct css_set *find_css_set(
451{ 530{
452 struct css_set *res; 531 struct css_set *res;
453 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 532 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
454 int i;
455 533
456 struct list_head tmp_cg_links; 534 struct list_head tmp_cg_links;
457 535
458 struct hlist_head *hhead; 536 struct hlist_head *hhead;
537 struct cg_cgroup_link *link;
459 538
460 /* First see if we already have a cgroup group that matches 539 /* First see if we already have a cgroup group that matches
461 * the desired set */ 540 * the desired set */
@@ -489,20 +568,12 @@ static struct css_set *find_css_set(
489 568
490 write_lock(&css_set_lock); 569 write_lock(&css_set_lock);
491 /* Add reference counts and links from the new css_set. */ 570 /* Add reference counts and links from the new css_set. */
492 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 571 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
493 struct cgroup *cgrp = res->subsys[i]->cgroup; 572 struct cgroup *c = link->cgrp;
494 struct cgroup_subsys *ss = subsys[i]; 573 if (c->root == cgrp->root)
495 atomic_inc(&cgrp->count); 574 c = cgrp;
496 /* 575 link_css_set(&tmp_cg_links, res, c);
497 * We want to add a link once per cgroup, so we
498 * only do it for the first subsystem in each
499 * hierarchy
500 */
501 if (ss->root->subsys_list.next == &ss->sibling)
502 link_css_set(&tmp_cg_links, res, cgrp);
503 } 576 }
504 if (list_empty(&rootnode.subsys_list))
505 link_css_set(&tmp_cg_links, res, dummytop);
506 577
507 BUG_ON(!list_empty(&tmp_cg_links)); 578 BUG_ON(!list_empty(&tmp_cg_links));
508 579
@@ -518,6 +589,41 @@ static struct css_set *find_css_set(
518} 589}
519 590
520/* 591/*
592 * Return the cgroup for "task" from the given hierarchy. Must be
593 * called with cgroup_mutex held.
594 */
595static struct cgroup *task_cgroup_from_root(struct task_struct *task,
596 struct cgroupfs_root *root)
597{
598 struct css_set *css;
599 struct cgroup *res = NULL;
600
601 BUG_ON(!mutex_is_locked(&cgroup_mutex));
602 read_lock(&css_set_lock);
603 /*
604 * No need to lock the task - since we hold cgroup_mutex the
605 * task can't change groups, so the only thing that can happen
606 * is that it exits and its css is set back to init_css_set.
607 */
608 css = task->cgroups;
609 if (css == &init_css_set) {
610 res = &root->top_cgroup;
611 } else {
612 struct cg_cgroup_link *link;
613 list_for_each_entry(link, &css->cg_links, cg_link_list) {
614 struct cgroup *c = link->cgrp;
615 if (c->root == root) {
616 res = c;
617 break;
618 }
619 }
620 }
621 read_unlock(&css_set_lock);
622 BUG_ON(!res);
623 return res;
624}
625
626/*
521 * There is one global cgroup mutex. We also require taking 627 * There is one global cgroup mutex. We also require taking
522 * task_lock() when dereferencing a task's cgroup subsys pointers. 628 * task_lock() when dereferencing a task's cgroup subsys pointers.
523 * See "The task_lock() exception", at the end of this comment. 629 * See "The task_lock() exception", at the end of this comment.
@@ -597,7 +703,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
597static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 703static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
598static int cgroup_populate_dir(struct cgroup *cgrp); 704static int cgroup_populate_dir(struct cgroup *cgrp);
599static const struct inode_operations cgroup_dir_inode_operations; 705static const struct inode_operations cgroup_dir_inode_operations;
600static struct file_operations proc_cgroupstats_operations; 706static const struct file_operations proc_cgroupstats_operations;
601 707
602static struct backing_dev_info cgroup_backing_dev_info = { 708static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup", 709 .name = "cgroup",
@@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
677 */ 783 */
678 deactivate_super(cgrp->root->sb); 784 deactivate_super(cgrp->root->sb);
679 785
786 /*
787 * if we're getting rid of the cgroup, refcount should ensure
788 * that there are no pidlists left.
789 */
790 BUG_ON(!list_empty(&cgrp->pidlists));
791
680 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 792 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
681 } 793 }
682 iput(inode); 794 iput(inode);
@@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
841 seq_puts(seq, ",noprefix"); 953 seq_puts(seq, ",noprefix");
842 if (strlen(root->release_agent_path)) 954 if (strlen(root->release_agent_path))
843 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 955 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
956 if (strlen(root->name))
957 seq_printf(seq, ",name=%s", root->name);
844 mutex_unlock(&cgroup_mutex); 958 mutex_unlock(&cgroup_mutex);
845 return 0; 959 return 0;
846} 960}
@@ -849,6 +963,12 @@ struct cgroup_sb_opts {
849 unsigned long subsys_bits; 963 unsigned long subsys_bits;
850 unsigned long flags; 964 unsigned long flags;
851 char *release_agent; 965 char *release_agent;
966 char *name;
967 /* User explicitly requested empty subsystem */
968 bool none;
969
970 struct cgroupfs_root *new_root;
971
852}; 972};
853 973
854/* Convert a hierarchy specifier into a bitmask of subsystems and 974/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data,
863 mask = ~(1UL << cpuset_subsys_id); 983 mask = ~(1UL << cpuset_subsys_id);
864#endif 984#endif
865 985
866 opts->subsys_bits = 0; 986 memset(opts, 0, sizeof(*opts));
867 opts->flags = 0;
868 opts->release_agent = NULL;
869 987
870 while ((token = strsep(&o, ",")) != NULL) { 988 while ((token = strsep(&o, ",")) != NULL) {
871 if (!*token) 989 if (!*token)
@@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data,
879 if (!ss->disabled) 997 if (!ss->disabled)
880 opts->subsys_bits |= 1ul << i; 998 opts->subsys_bits |= 1ul << i;
881 } 999 }
1000 } else if (!strcmp(token, "none")) {
1001 /* Explicitly have no subsystems */
1002 opts->none = true;
882 } else if (!strcmp(token, "noprefix")) { 1003 } else if (!strcmp(token, "noprefix")) {
883 set_bit(ROOT_NOPREFIX, &opts->flags); 1004 set_bit(ROOT_NOPREFIX, &opts->flags);
884 } else if (!strncmp(token, "release_agent=", 14)) { 1005 } else if (!strncmp(token, "release_agent=", 14)) {
885 /* Specifying two release agents is forbidden */ 1006 /* Specifying two release agents is forbidden */
886 if (opts->release_agent) 1007 if (opts->release_agent)
887 return -EINVAL; 1008 return -EINVAL;
888 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); 1009 opts->release_agent =
1010 kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
889 if (!opts->release_agent) 1011 if (!opts->release_agent)
890 return -ENOMEM; 1012 return -ENOMEM;
891 strncpy(opts->release_agent, token + 14, PATH_MAX - 1); 1013 } else if (!strncmp(token, "name=", 5)) {
892 opts->release_agent[PATH_MAX - 1] = 0; 1014 int i;
1015 const char *name = token + 5;
1016 /* Can't specify an empty name */
1017 if (!strlen(name))
1018 return -EINVAL;
1019 /* Must match [\w.-]+ */
1020 for (i = 0; i < strlen(name); i++) {
1021 char c = name[i];
1022 if (isalnum(c))
1023 continue;
1024 if ((c == '.') || (c == '-') || (c == '_'))
1025 continue;
1026 return -EINVAL;
1027 }
1028 /* Specifying two names is forbidden */
1029 if (opts->name)
1030 return -EINVAL;
1031 opts->name = kstrndup(name,
1032 MAX_CGROUP_ROOT_NAMELEN,
1033 GFP_KERNEL);
1034 if (!opts->name)
1035 return -ENOMEM;
893 } else { 1036 } else {
894 struct cgroup_subsys *ss; 1037 struct cgroup_subsys *ss;
895 int i; 1038 int i;
@@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data,
906 } 1049 }
907 } 1050 }
908 1051
1052 /* Consistency checks */
1053
909 /* 1054 /*
910 * Option noprefix was introduced just for backward compatibility 1055 * Option noprefix was introduced just for backward compatibility
911 * with the old cpuset, so we allow noprefix only if mounting just 1056 * with the old cpuset, so we allow noprefix only if mounting just
@@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data,
915 (opts->subsys_bits & mask)) 1060 (opts->subsys_bits & mask))
916 return -EINVAL; 1061 return -EINVAL;
917 1062
918 /* We can't have an empty hierarchy */ 1063
919 if (!opts->subsys_bits) 1064 /* Can't specify "none" and some subsystems */
1065 if (opts->subsys_bits && opts->none)
1066 return -EINVAL;
1067
1068 /*
1069 * We either have to specify by name or by subsystems. (So all
1070 * empty hierarchies must have a name).
1071 */
1072 if (!opts->subsys_bits && !opts->name)
920 return -EINVAL; 1073 return -EINVAL;
921 1074
922 return 0; 1075 return 0;
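The option parsing above adds "none" (explicitly no subsystems) and "name=" (a [\w.-]+ hierarchy label), which together allow mounting an empty, named hierarchy. A hedged usage sketch in C (the mount point and name are made up for illustration):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Mount a cgroup hierarchy with no controllers attached,
	 * identified only by its name, using the "none" and "name="
	 * options parsed above. */
	if (mount("none", "/mnt/cgroup-mytracker", "cgroup", 0,
		  "none,name=mytracker") != 0)
		perror("mount cgroup");
	return 0;
}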
@@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
944 goto out_unlock; 1097 goto out_unlock;
945 } 1098 }
946 1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL;
1103 goto out_unlock;
1104 }
1105
947 ret = rebind_subsystems(root, opts.subsys_bits); 1106 ret = rebind_subsystems(root, opts.subsys_bits);
948 if (ret) 1107 if (ret)
949 goto out_unlock; 1108 goto out_unlock;
@@ -955,6 +1114,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
955 strcpy(root->release_agent_path, opts.release_agent); 1114 strcpy(root->release_agent_path, opts.release_agent);
956 out_unlock: 1115 out_unlock:
957 kfree(opts.release_agent); 1116 kfree(opts.release_agent);
1117 kfree(opts.name);
958 mutex_unlock(&cgroup_mutex); 1118 mutex_unlock(&cgroup_mutex);
959 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1119 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
960 unlock_kernel(); 1120 unlock_kernel();
@@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
974 INIT_LIST_HEAD(&cgrp->children); 1134 INIT_LIST_HEAD(&cgrp->children);
975 INIT_LIST_HEAD(&cgrp->css_sets); 1135 INIT_LIST_HEAD(&cgrp->css_sets);
976 INIT_LIST_HEAD(&cgrp->release_list); 1136 INIT_LIST_HEAD(&cgrp->release_list);
977 INIT_LIST_HEAD(&cgrp->pids_list); 1137 INIT_LIST_HEAD(&cgrp->pidlists);
978 init_rwsem(&cgrp->pids_mutex); 1138 mutex_init(&cgrp->pidlist_mutex);
979} 1139}
1140
980static void init_cgroup_root(struct cgroupfs_root *root) 1141static void init_cgroup_root(struct cgroupfs_root *root)
981{ 1142{
982 struct cgroup *cgrp = &root->top_cgroup; 1143 struct cgroup *cgrp = &root->top_cgroup;
@@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root)
988 init_cgroup_housekeeping(cgrp); 1149 init_cgroup_housekeeping(cgrp);
989} 1150}
990 1151
1152static bool init_root_id(struct cgroupfs_root *root)
1153{
1154 int ret = 0;
1155
1156 do {
1157 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1158 return false;
1159 spin_lock(&hierarchy_id_lock);
1160 /* Try to allocate the next unused ID */
1161 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1162 &root->hierarchy_id);
1163 if (ret == -ENOSPC)
1164 /* Try again starting from 0 */
1165 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1166 if (!ret) {
1167 next_hierarchy_id = root->hierarchy_id + 1;
1168 } else if (ret != -EAGAIN) {
1169 /* Can only get here if the 31-bit IDR is full ... */
1170 BUG_ON(ret);
1171 }
1172 spin_unlock(&hierarchy_id_lock);
1173 } while (ret);
1174 return true;
1175}
1176
991static int cgroup_test_super(struct super_block *sb, void *data) 1177static int cgroup_test_super(struct super_block *sb, void *data)
992{ 1178{
993 struct cgroupfs_root *new = data; 1179 struct cgroup_sb_opts *opts = data;
994 struct cgroupfs_root *root = sb->s_fs_info; 1180 struct cgroupfs_root *root = sb->s_fs_info;
995 1181
996 /* First check subsystems */ 1182 /* If we asked for a name then it must match */
997 if (new->subsys_bits != root->subsys_bits) 1183 if (opts->name && strcmp(opts->name, root->name))
998 return 0; 1184 return 0;
999 1185
1000 /* Next check flags */ 1186 /*
1001 if (new->flags != root->flags) 1187 * If we asked for subsystems (or explicitly for no
1188 * subsystems) then they must match
1189 */
1190 if ((opts->subsys_bits || opts->none)
1191 && (opts->subsys_bits != root->subsys_bits))
1002 return 0; 1192 return 0;
1003 1193
1004 return 1; 1194 return 1;
1005} 1195}
1006 1196
1197static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1198{
1199 struct cgroupfs_root *root;
1200
1201 if (!opts->subsys_bits && !opts->none)
1202 return NULL;
1203
1204 root = kzalloc(sizeof(*root), GFP_KERNEL);
1205 if (!root)
1206 return ERR_PTR(-ENOMEM);
1207
1208 if (!init_root_id(root)) {
1209 kfree(root);
1210 return ERR_PTR(-ENOMEM);
1211 }
1212 init_cgroup_root(root);
1213
1214 root->subsys_bits = opts->subsys_bits;
1215 root->flags = opts->flags;
1216 if (opts->release_agent)
1217 strcpy(root->release_agent_path, opts->release_agent);
1218 if (opts->name)
1219 strcpy(root->name, opts->name);
1220 return root;
1221}
1222
1223static void cgroup_drop_root(struct cgroupfs_root *root)
1224{
1225 if (!root)
1226 return;
1227
1228 BUG_ON(!root->hierarchy_id);
1229 spin_lock(&hierarchy_id_lock);
1230 ida_remove(&hierarchy_ida, root->hierarchy_id);
1231 spin_unlock(&hierarchy_id_lock);
1232 kfree(root);
1233}
1234
1007static int cgroup_set_super(struct super_block *sb, void *data) 1235static int cgroup_set_super(struct super_block *sb, void *data)
1008{ 1236{
1009 int ret; 1237 int ret;
1010 struct cgroupfs_root *root = data; 1238 struct cgroup_sb_opts *opts = data;
1239
1240 /* If we don't have a new root, we can't set up a new sb */
1241 if (!opts->new_root)
1242 return -EINVAL;
1243
1244 BUG_ON(!opts->subsys_bits && !opts->none);
1011 1245
1012 ret = set_anon_super(sb, NULL); 1246 ret = set_anon_super(sb, NULL);
1013 if (ret) 1247 if (ret)
1014 return ret; 1248 return ret;
1015 1249
1016 sb->s_fs_info = root; 1250 sb->s_fs_info = opts->new_root;
1017 root->sb = sb; 1251 opts->new_root->sb = sb;
1018 1252
1019 sb->s_blocksize = PAGE_CACHE_SIZE; 1253 sb->s_blocksize = PAGE_CACHE_SIZE;
1020 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1254 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1051 void *data, struct vfsmount *mnt) 1285 void *data, struct vfsmount *mnt)
1052{ 1286{
1053 struct cgroup_sb_opts opts; 1287 struct cgroup_sb_opts opts;
1288 struct cgroupfs_root *root;
1054 int ret = 0; 1289 int ret = 0;
1055 struct super_block *sb; 1290 struct super_block *sb;
1056 struct cgroupfs_root *root; 1291 struct cgroupfs_root *new_root;
1057 struct list_head tmp_cg_links;
1058 1292
1059 /* First find the desired set of subsystems */ 1293 /* First find the desired set of subsystems */
1060 ret = parse_cgroupfs_options(data, &opts); 1294 ret = parse_cgroupfs_options(data, &opts);
1061 if (ret) { 1295 if (ret)
1062 kfree(opts.release_agent); 1296 goto out_err;
1063 return ret;
1064 }
1065
1066 root = kzalloc(sizeof(*root), GFP_KERNEL);
1067 if (!root) {
1068 kfree(opts.release_agent);
1069 return -ENOMEM;
1070 }
1071 1297
1072 init_cgroup_root(root); 1298 /*
1073 root->subsys_bits = opts.subsys_bits; 1299 * Allocate a new cgroup root. We may not need it if we're
1074 root->flags = opts.flags; 1300 * reusing an existing hierarchy.
1075 if (opts.release_agent) { 1301 */
1076 strcpy(root->release_agent_path, opts.release_agent); 1302 new_root = cgroup_root_from_opts(&opts);
1077 kfree(opts.release_agent); 1303 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root);
1305 goto out_err;
1078 } 1306 }
1307 opts.new_root = new_root;
1079 1308
1080 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); 1309 /* Locate an existing or new sb for this hierarchy */
1081 1310 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1082 if (IS_ERR(sb)) { 1311 if (IS_ERR(sb)) {
1083 kfree(root); 1312 ret = PTR_ERR(sb);
1084 return PTR_ERR(sb); 1313 cgroup_drop_root(opts.new_root);
1314 goto out_err;
1085 } 1315 }
1086 1316
1087 if (sb->s_fs_info != root) { 1317 root = sb->s_fs_info;
1088 /* Reusing an existing superblock */ 1318 BUG_ON(!root);
1089 BUG_ON(sb->s_root == NULL); 1319 if (root == opts.new_root) {
1090 kfree(root); 1320 /* We used the new root structure, so this is a new hierarchy */
1091 root = NULL; 1321 struct list_head tmp_cg_links;
1092 } else {
1093 /* New superblock */
1094 struct cgroup *root_cgrp = &root->top_cgroup; 1322 struct cgroup *root_cgrp = &root->top_cgroup;
1095 struct inode *inode; 1323 struct inode *inode;
1324 struct cgroupfs_root *existing_root;
1096 int i; 1325 int i;
1097 1326
1098 BUG_ON(sb->s_root != NULL); 1327 BUG_ON(sb->s_root != NULL);
@@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1105 mutex_lock(&inode->i_mutex); 1334 mutex_lock(&inode->i_mutex);
1106 mutex_lock(&cgroup_mutex); 1335 mutex_lock(&cgroup_mutex);
1107 1336
1337 if (strlen(root->name)) {
1338 /* Check for name clashes with existing mounts */
1339 for_each_active_root(existing_root) {
1340 if (!strcmp(existing_root->name, root->name)) {
1341 ret = -EBUSY;
1342 mutex_unlock(&cgroup_mutex);
1343 mutex_unlock(&inode->i_mutex);
1344 goto drop_new_super;
1345 }
1346 }
1347 }
1348
1108 /* 1349 /*
1109 * We're accessing css_set_count without locking 1350 * We're accessing css_set_count without locking
1110 * css_set_lock here, but that's OK - it can only be 1351 * css_set_lock here, but that's OK - it can only be
@@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1123 if (ret == -EBUSY) { 1364 if (ret == -EBUSY) {
1124 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1125 mutex_unlock(&inode->i_mutex); 1366 mutex_unlock(&inode->i_mutex);
1126 goto free_cg_links; 1367 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super;
1127 } 1369 }
1128 1370
1129 /* EBUSY should be the only error here */ 1371 /* EBUSY should be the only error here */
@@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1155 BUG_ON(root->number_of_cgroups != 1); 1397 BUG_ON(root->number_of_cgroups != 1);
1156 1398
1157 cgroup_populate_dir(root_cgrp); 1399 cgroup_populate_dir(root_cgrp);
1158 mutex_unlock(&inode->i_mutex);
1159 mutex_unlock(&cgroup_mutex); 1400 mutex_unlock(&cgroup_mutex);
1401 mutex_unlock(&inode->i_mutex);
1402 } else {
1403 /*
1404 * We re-used an existing hierarchy - the new root (if
1405 * any) is not needed
1406 */
1407 cgroup_drop_root(opts.new_root);
1160 } 1408 }
1161 1409
1162 simple_set_mnt(mnt, sb); 1410 simple_set_mnt(mnt, sb);
1411 kfree(opts.release_agent);
1412 kfree(opts.name);
1163 return 0; 1413 return 0;
1164 1414
1165 free_cg_links:
1166 free_cg_links(&tmp_cg_links);
1167 drop_new_super: 1415 drop_new_super:
1168 deactivate_locked_super(sb); 1416 deactivate_locked_super(sb);
1417 out_err:
1418 kfree(opts.release_agent);
1419 kfree(opts.name);
1420
1169 return ret; 1421 return ret;
1170} 1422}
1171 1423
@@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1211 mutex_unlock(&cgroup_mutex); 1463 mutex_unlock(&cgroup_mutex);
1212 1464
1213 kill_litter_super(sb); 1465 kill_litter_super(sb);
1214 kfree(root); 1466 cgroup_drop_root(root);
1215} 1467}
1216 1468
1217static struct file_system_type cgroup_fs_type = { 1469static struct file_system_type cgroup_fs_type = {
@@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1276 return 0; 1528 return 0;
1277} 1529}
1278 1530
1279/*
1280 * Return the first subsystem attached to a cgroup's hierarchy, and
1281 * its subsystem id.
1282 */
1283
1284static void get_first_subsys(const struct cgroup *cgrp,
1285 struct cgroup_subsys_state **css, int *subsys_id)
1286{
1287 const struct cgroupfs_root *root = cgrp->root;
1288 const struct cgroup_subsys *test_ss;
1289 BUG_ON(list_empty(&root->subsys_list));
1290 test_ss = list_entry(root->subsys_list.next,
1291 struct cgroup_subsys, sibling);
1292 if (css) {
1293 *css = cgrp->subsys[test_ss->subsys_id];
1294 BUG_ON(!*css);
1295 }
1296 if (subsys_id)
1297 *subsys_id = test_ss->subsys_id;
1298}
1299
1300/** 1531/**
1301 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1302 * @cgrp: the cgroup the task is attaching to 1533 * @cgrp: the cgroup the task is attaching to
@@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1313 struct css_set *cg; 1544 struct css_set *cg;
1314 struct css_set *newcg; 1545 struct css_set *newcg;
1315 struct cgroupfs_root *root = cgrp->root; 1546 struct cgroupfs_root *root = cgrp->root;
1316 int subsys_id;
1317
1318 get_first_subsys(cgrp, NULL, &subsys_id);
1319 1547
1320 /* Nothing to do if the task is already in that cgroup */ 1548 /* Nothing to do if the task is already in that cgroup */
1321 oldcgrp = task_cgroup(tsk, subsys_id); 1549 oldcgrp = task_cgroup_from_root(tsk, root);
1322 if (cgrp == oldcgrp) 1550 if (cgrp == oldcgrp)
1323 return 0; 1551 return 0;
1324 1552
1325 for_each_subsys(root, ss) { 1553 for_each_subsys(root, ss) {
1326 if (ss->can_attach) { 1554 if (ss->can_attach) {
1327 retval = ss->can_attach(ss, cgrp, tsk); 1555 retval = ss->can_attach(ss, cgrp, tsk, false);
1328 if (retval) 1556 if (retval)
1329 return retval; 1557 return retval;
1330 } 1558 }
@@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1362 1590
1363 for_each_subsys(root, ss) { 1591 for_each_subsys(root, ss) {
1364 if (ss->attach) 1592 if (ss->attach)
1365 ss->attach(ss, cgrp, oldcgrp, tsk); 1593 ss->attach(ss, cgrp, oldcgrp, tsk, false);
1366 } 1594 }
1367 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1595 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1368 synchronize_rcu(); 1596 synchronize_rcu();
@@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1423 return ret; 1651 return ret;
1424} 1652}
1425 1653
1426/* The various types of files and directories in a cgroup file system */
1427enum cgroup_filetype {
1428 FILE_ROOT,
1429 FILE_DIR,
1430 FILE_TASKLIST,
1431 FILE_NOTIFY_ON_RELEASE,
1432 FILE_RELEASE_AGENT,
1433};
1434
1435/** 1654/**
1436 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 1655 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1437 * @cgrp: the cgroup to be checked for liveness 1656 * @cgrp: the cgroup to be checked for liveness
@@ -1491,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1491 return -EFAULT; 1710 return -EFAULT;
1492 1711
1493 buffer[nbytes] = 0; /* nul-terminate */ 1712 buffer[nbytes] = 0; /* nul-terminate */
1494 strstrip(buffer);
1495 if (cft->write_u64) { 1713 if (cft->write_u64) {
1496 u64 val = simple_strtoull(buffer, &end, 0); 1714 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
1497 if (*end) 1715 if (*end)
1498 return -EINVAL; 1716 return -EINVAL;
1499 retval = cft->write_u64(cgrp, cft, val); 1717 retval = cft->write_u64(cgrp, cft, val);
1500 } else { 1718 } else {
1501 s64 val = simple_strtoll(buffer, &end, 0); 1719 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
1502 if (*end) 1720 if (*end)
1503 return -EINVAL; 1721 return -EINVAL;
1504 retval = cft->write_s64(cgrp, cft, val); 1722 retval = cft->write_s64(cgrp, cft, val);
@@ -1534,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1534 } 1752 }
1535 1753
1536 buffer[nbytes] = 0; /* nul-terminate */ 1754 buffer[nbytes] = 0; /* nul-terminate */
1537 strstrip(buffer); 1755 retval = cft->write_string(cgrp, cft, strstrip(buffer));
1538 retval = cft->write_string(cgrp, cft, buffer);
1539 if (!retval) 1756 if (!retval)
1540 retval = nbytes; 1757 retval = nbytes;
1541out: 1758out:
@@ -1644,7 +1861,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file)
1644 return single_release(inode, file); 1861 return single_release(inode, file);
1645} 1862}
1646 1863
1647static struct file_operations cgroup_seqfile_operations = { 1864static const struct file_operations cgroup_seqfile_operations = {
1648 .read = seq_read, 1865 .read = seq_read,
1649 .write = cgroup_file_write, 1866 .write = cgroup_file_write,
1650 .llseek = seq_lseek, 1867 .llseek = seq_lseek,
@@ -1703,7 +1920,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1703 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1920 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1704} 1921}
1705 1922
1706static struct file_operations cgroup_file_operations = { 1923static const struct file_operations cgroup_file_operations = {
1707 .read = cgroup_file_read, 1924 .read = cgroup_file_read,
1708 .write = cgroup_file_write, 1925 .write = cgroup_file_write,
1709 .llseek = generic_file_llseek, 1926 .llseek = generic_file_llseek,
@@ -1876,7 +2093,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1876 * the start of a css_set 2093 * the start of a css_set
1877 */ 2094 */
1878static void cgroup_advance_iter(struct cgroup *cgrp, 2095static void cgroup_advance_iter(struct cgroup *cgrp,
1879 struct cgroup_iter *it) 2096 struct cgroup_iter *it)
1880{ 2097{
1881 struct list_head *l = it->cg_link; 2098 struct list_head *l = it->cg_link;
1882 struct cg_cgroup_link *link; 2099 struct cg_cgroup_link *link;
@@ -2129,7 +2346,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2129} 2346}
2130 2347
2131/* 2348/*
2132 * Stuff for reading the 'tasks' file. 2349 * Stuff for reading the 'tasks'/'procs' files.
2133 * 2350 *
2134 * Reading this file can return large amounts of data if a cgroup has 2351 * Reading this file can return large amounts of data if a cgroup has
2135 * *lots* of attached tasks. So it may need several calls to read(), 2352 * *lots* of attached tasks. So it may need several calls to read(),
@@ -2139,27 +2356,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2139 */ 2356 */
2140 2357
2141/* 2358/*
2142 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2359 * The following two functions "fix" the issue where there are more pids
2143 * 'cgrp'. Return actual number of pids loaded. No need to 2360 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
2144 * task_lock(p) when reading out p->cgroup, since we're in an RCU 2361 * TODO: replace with a kernel-wide solution to this problem
2145 * read section, so the css_set can't go away, and is 2362 */
2146 * immutable after creation. 2363#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
2364static void *pidlist_allocate(int count)
2365{
2366 if (PIDLIST_TOO_LARGE(count))
2367 return vmalloc(count * sizeof(pid_t));
2368 else
2369 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
2370}
2371static void pidlist_free(void *p)
2372{
2373 if (is_vmalloc_addr(p))
2374 vfree(p);
2375 else
2376 kfree(p);
2377}
2378static void *pidlist_resize(void *p, int newcount)
2379{
2380 void *newlist;
2381 /* note: if new alloc fails, old p will still be valid either way */
2382 if (is_vmalloc_addr(p)) {
2383 newlist = vmalloc(newcount * sizeof(pid_t));
2384 if (!newlist)
2385 return NULL;
2386 memcpy(newlist, p, newcount * sizeof(pid_t));
2387 vfree(p);
2388 } else {
2389 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
2390 }
2391 return newlist;
2392}
2393
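The helpers above pick kmalloc() for small pid arrays and fall back to vmalloc() once the request exceeds roughly two pages, then let the free path dispatch on is_vmalloc_addr(). A minimal userspace sketch of the same size-threshold fallback, with malloc()/mmap() standing in for the two kernel allocators and a stored flag standing in for is_vmalloc_addr(); the names and the threshold constant are illustrative, not part of the patch:

#define _DEFAULT_SOURCE            /* for MAP_ANONYMOUS on glibc */
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>

#define FALLBACK_THRESHOLD (2 * 4096)     /* stand-in for PAGE_SIZE * 2 */

struct pidbuf {
	size_t bytes;
	int    mapped;                    /* userspace stand-in for is_vmalloc_addr() */
	pid_t  *data;
};

static int pidbuf_alloc(struct pidbuf *b, size_t count)
{
	b->bytes  = count * sizeof(pid_t);
	b->mapped = b->bytes > FALLBACK_THRESHOLD;
	if (b->mapped)
		b->data = mmap(NULL, b->bytes, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	else
		b->data = malloc(b->bytes);
	if (b->mapped && b->data == MAP_FAILED)
		b->data = NULL;
	return b->data ? 0 : -1;
}

static void pidbuf_free(struct pidbuf *b)
{
	if (!b->data)
		return;
	if (b->mapped)
		munmap(b->data, b->bytes);    /* kernel code picks vfree() here */
	else
		free(b->data);                /* ... and kfree() here */
	b->data = NULL;
}

int main(void)
{
	struct pidbuf small, big;
	pidbuf_alloc(&small, 16);         /* small: takes the malloc path */
	pidbuf_alloc(&big, 1 << 16);      /* exceeds the threshold: takes mmap */
	pidbuf_free(&small);
	pidbuf_free(&big);
	return 0;
}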
2394/*
2395 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
2396 * If the new stripped list is sufficiently smaller and there's enough memory
2397 * to allocate a new buffer, will let go of the unneeded memory. Returns the
2398 * number of unique elements.
2399 */
2400/* is the size difference enough that we should re-allocate the array? */
2401#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
2402static int pidlist_uniq(pid_t **p, int length)
2403{
2404 int src, dest = 1;
2405 pid_t *list = *p;
2406 pid_t *newlist;
2407
2408 /*
2409 * we presume the 0th element is unique, so i starts at 1. trivial
2410 * edge cases first; no work needs to be done for either
2411 */
2412 if (length == 0 || length == 1)
2413 return length;
2414 /* src and dest walk down the list; dest counts unique elements */
2415 for (src = 1; src < length; src++) {
2416 /* find next unique element */
2417 while (list[src] == list[src-1]) {
2418 src++;
2419 if (src == length)
2420 goto after;
2421 }
2422 /* dest always points to where the next unique element goes */
2423 list[dest] = list[src];
2424 dest++;
2425 }
2426after:
2427 /*
2428 * if the length difference is large enough, we want to allocate a
2429 * smaller buffer to save memory. if this fails due to out of memory,
2430 * we'll just stay with what we've got.
2431 */
2432 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
2433 newlist = pidlist_resize(list, dest);
2434 if (newlist)
2435 *p = newlist;
2436 }
2437 return dest;
2438}
2439
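pidlist_uniq() above relies on the array already being sorted: src scans forward, dest trails it, and runs of equal neighbours are skipped, so the unique values end up compacted at the front of the same buffer. A small self-contained sketch of that two-pointer pass on a sorted int array (the follow-up shrink-the-buffer-if-much-smaller step is left out):

#include <stdio.h>

/* Compact a sorted array in place; returns the number of unique elements. */
static int uniq_sorted(int *list, int length)
{
	int src, dest = 1;

	if (length < 2)
		return length;
	for (src = 1; src < length; src++) {
		if (list[src] == list[src - 1])
			continue;                 /* duplicate of the previous element */
		list[dest++] = list[src];         /* next unique value lands at dest */
	}
	return dest;
}

int main(void)
{
	int pids[] = { 3, 3, 7, 7, 7, 12, 40, 40 };
	int n = uniq_sorted(pids, 8);
	for (int i = 0; i < n; i++)
		printf("%d\n", pids[i]);          /* prints 3 7 12 40 */
	return 0;
}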
2440static int cmppid(const void *a, const void *b)
2441{
2442 return *(pid_t *)a - *(pid_t *)b;
2443}
2444
2445/*
2446 * find the appropriate pidlist for our purpose (given procs vs tasks)
2447 * returns with the lock on that pidlist already held, and takes care
2448 * of the use count, or returns NULL with no locks held if we're out of
2449 * memory.
2147 */ 2450 */
2148static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2451static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2452 enum cgroup_filetype type)
2149{ 2453{
2150 int n = 0, pid; 2454 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
2457 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same
2460 * time. Holding the pidlist_mutex precludes somebody taking whichever
2461 * list we find out from under us - compare release_pid_array().
2462 */
2463 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l;
2473 }
2474 }
2475 /* entry not found; create a new one */
2476 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2477 if (!l) {
2478 mutex_unlock(&cgrp->pidlist_mutex);
2479 put_pid_ns(ns);
2480 return l;
2481 }
2482 init_rwsem(&l->mutex);
2483 down_write(&l->mutex);
2484 l->key.type = type;
2485 l->key.ns = ns;
2486 l->use_count = 0; /* don't increment here */
2487 l->list = NULL;
2488 l->owner = cgrp;
2489 list_add(&l->links, &cgrp->pidlists);
2490 mutex_unlock(&cgrp->pidlist_mutex);
2491 return l;
2492}
2493
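cgroup_pidlist_find() is a find-or-create lookup keyed on (file type, pid namespace): the per-cgroup pidlist_mutex protects the list walk, and the entry's own rwsem is taken before the outer mutex is dropped so a concurrent release cannot tear the entry down underneath the caller. A stripped-down userspace analogue of the find-or-create shape, using pthreads and an integer key; the locking is reduced to the single outer mutex, so this shows only the lookup pattern, not the full two-lock handover:

#include <pthread.h>
#include <stdlib.h>

struct pidlist {
	int key;                       /* stand-in for the (type, namespace) pair */
	int use_count;
	struct pidlist *next;
};

static struct pidlist *lists;
static pthread_mutex_t lists_lock = PTHREAD_MUTEX_INITIALIZER;

/* Return the entry for @key, creating and linking it on first use. */
static struct pidlist *pidlist_find(int key)
{
	struct pidlist *l;

	pthread_mutex_lock(&lists_lock);
	for (l = lists; l; l = l->next) {
		if (l->key == key) {
			l->use_count++;
			pthread_mutex_unlock(&lists_lock);
			return l;
		}
	}
	l = calloc(1, sizeof(*l));
	if (l) {
		l->key = key;
		l->use_count = 1;      /* the kernel code defers this for a fresh entry */
		l->next = lists;
		lists = l;
	}
	pthread_mutex_unlock(&lists_lock);
	return l;
}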
2494/*
2495 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
2496 */
2497static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
2498 struct cgroup_pidlist **lp)
2499{
2500 pid_t *array;
2501 int length;
2502 int pid, n = 0; /* used for populating the array */
2151 struct cgroup_iter it; 2503 struct cgroup_iter it;
2152 struct task_struct *tsk; 2504 struct task_struct *tsk;
2505 struct cgroup_pidlist *l;
2506
2507 /*
2508 * If cgroup gets more users after we read count, we won't have
2509 * enough space - tough. This race is indistinguishable to the
2510 * caller from the case that the additional cgroup users didn't
2511 * show up until sometime later on.
2512 */
2513 length = cgroup_task_count(cgrp);
2514 array = pidlist_allocate(length);
2515 if (!array)
2516 return -ENOMEM;
2517 /* now, populate the array */
2153 cgroup_iter_start(cgrp, &it); 2518 cgroup_iter_start(cgrp, &it);
2154 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2519 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2155 if (unlikely(n == npids)) 2520 if (unlikely(n == length))
2156 break; 2521 break;
2157 pid = task_pid_vnr(tsk); 2522 /* get tgid or pid for procs or tasks file respectively */
2158 if (pid > 0) 2523 if (type == CGROUP_FILE_PROCS)
2159 pidarray[n++] = pid; 2524 pid = task_tgid_vnr(tsk);
2525 else
2526 pid = task_pid_vnr(tsk);
2527 if (pid > 0) /* make sure to only use valid results */
2528 array[n++] = pid;
2160 } 2529 }
2161 cgroup_iter_end(cgrp, &it); 2530 cgroup_iter_end(cgrp, &it);
2162 return n; 2531 length = n;
2532 /* now sort & (if procs) strip out duplicates */
2533 sort(array, length, sizeof(pid_t), cmppid, NULL);
2534 if (type == CGROUP_FILE_PROCS)
2535 length = pidlist_uniq(&array, length);
2536 l = cgroup_pidlist_find(cgrp, type);
2537 if (!l) {
2538 pidlist_free(array);
2539 return -ENOMEM;
2540 }
2541 /* store array, freeing old if necessary - lock already held */
2542 pidlist_free(l->list);
2543 l->list = array;
2544 l->length = length;
2545 l->use_count++;
2546 up_write(&l->mutex);
2547 *lp = l;
2548 return 0;
2163} 2549}
2164 2550
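pidlist_array_load() snapshots the member tasks into a flat array, recording tgids for the procs file and pids for the tasks file, sorts the result, and deduplicates only the procs variant, since several threads share one tgid while pids are already unique. A compact userspace sketch of that collect/sort/dedup-if-procs decision over a fixed task table, with task_pid_vnr()/task_tgid_vnr() replaced by plain struct fields:

#include <stdio.h>
#include <stdlib.h>

struct fake_task { int pid; int tgid; };

static int cmppid(const void *a, const void *b)
{
	return *(const int *)a - *(const int *)b;
}

static int uniq_sorted(int *list, int n)          /* same pass as sketched earlier */
{
	int src, dest = 1;
	if (n < 2)
		return n;
	for (src = 1; src < n; src++)
		if (list[src] != list[src - 1])
			list[dest++] = list[src];
	return dest;
}

static int load_pidlist(const struct fake_task *tasks, int ntasks,
			int want_procs, int *out)
{
	int i, n = 0;

	for (i = 0; i < ntasks; i++)
		out[n++] = want_procs ? tasks[i].tgid : tasks[i].pid;
	qsort(out, n, sizeof(int), cmppid);
	if (want_procs)
		n = uniq_sorted(out, n);          /* threads collapse to one tgid */
	return n;
}

int main(void)
{
	struct fake_task tasks[] = { {101, 100}, {100, 100}, {102, 100}, {200, 200} };
	int buf[4];
	int n = load_pidlist(tasks, 4, 1, buf);   /* procs view */
	for (int i = 0; i < n; i++)
		printf("%d\n", buf[i]);           /* prints 100 200 */
	return 0;
}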
2165/** 2551/**
@@ -2216,37 +2602,14 @@ err:
2216 return ret; 2602 return ret;
2217} 2603}
2218 2604
2219/*
2220 * Cache pids for all threads in the same pid namespace that are
2221 * opening the same "tasks" file.
2222 */
2223struct cgroup_pids {
2224 /* The node in cgrp->pids_list */
2225 struct list_head list;
2226 /* The cgroup those pids belong to */
2227 struct cgroup *cgrp;
2228	/* The namespace those pids belong to */
2229 struct pid_namespace *ns;
2230 /* Array of process ids in the cgroup */
2231 pid_t *tasks_pids;
2232	/* How many files are using this tasks_pids array */
2233 int use_count;
2234 /* Length of the current tasks_pids array */
2235 int length;
2236};
2237
2238static int cmppid(const void *a, const void *b)
2239{
2240 return *(pid_t *)a - *(pid_t *)b;
2241}
2242 2605
2243/* 2606/*
2244 * seq_file methods for the "tasks" file. The seq_file position is the 2607 * seq_file methods for the tasks/procs files. The seq_file position is the
2245 * next pid to display; the seq_file iterator is a pointer to the pid 2608 * next pid to display; the seq_file iterator is a pointer to the pid
2246 * in the cgroup->tasks_pids array. 2609 * in the cgroup->l->list array.
2247 */ 2610 */
2248 2611
2249static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) 2612static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
2250{ 2613{
2251 /* 2614 /*
2252 * Initially we receive a position value that corresponds to 2615 * Initially we receive a position value that corresponds to
@@ -2254,48 +2617,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2254 * after a seek to the start). Use a binary-search to find the 2617 * after a seek to the start). Use a binary-search to find the
2255 * next pid to display, if any 2618 * next pid to display, if any
2256 */ 2619 */
2257 struct cgroup_pids *cp = s->private; 2620 struct cgroup_pidlist *l = s->private;
2258 struct cgroup *cgrp = cp->cgrp;
2259 int index = 0, pid = *pos; 2621 int index = 0, pid = *pos;
2260 int *iter; 2622 int *iter;
2261 2623
2262 down_read(&cgrp->pids_mutex); 2624 down_read(&l->mutex);
2263 if (pid) { 2625 if (pid) {
2264 int end = cp->length; 2626 int end = l->length;
2265 2627
2266 while (index < end) { 2628 while (index < end) {
2267 int mid = (index + end) / 2; 2629 int mid = (index + end) / 2;
2268 if (cp->tasks_pids[mid] == pid) { 2630 if (l->list[mid] == pid) {
2269 index = mid; 2631 index = mid;
2270 break; 2632 break;
2271 } else if (cp->tasks_pids[mid] <= pid) 2633 } else if (l->list[mid] <= pid)
2272 index = mid + 1; 2634 index = mid + 1;
2273 else 2635 else
2274 end = mid; 2636 end = mid;
2275 } 2637 }
2276 } 2638 }
2277 /* If we're off the end of the array, we're done */ 2639 /* If we're off the end of the array, we're done */
2278 if (index >= cp->length) 2640 if (index >= l->length)
2279 return NULL; 2641 return NULL;
2280 /* Update the abstract position to be the actual pid that we found */ 2642 /* Update the abstract position to be the actual pid that we found */
2281 iter = cp->tasks_pids + index; 2643 iter = l->list + index;
2282 *pos = *iter; 2644 *pos = *iter;
2283 return iter; 2645 return iter;
2284} 2646}
2285 2647
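The start routine above treats *pos as the next pid to display and binary-searches the sorted snapshot for it, falling forward to the first larger entry if that exact pid is not in the array (for example after a seek), so iteration can resume from an arbitrary position. The same search, isolated into a small standalone function; it is effectively a lower-bound search that stops early on an exact hit:

#include <stdio.h>

/* Return the index of @pid in the sorted array, or of the first larger
 * element; returns @len if every element is smaller. */
static int pidlist_resume_index(const int *list, int len, int pid)
{
	int index = 0, end = len;

	while (index < end) {
		int mid = (index + end) / 2;
		if (list[mid] == pid)
			return mid;
		else if (list[mid] <= pid)
			index = mid + 1;
		else
			end = mid;
	}
	return index;
}

int main(void)
{
	int pids[] = { 3, 7, 12, 40 };
	printf("%d\n", pidlist_resume_index(pids, 4, 12));  /* 2: exact hit */
	printf("%d\n", pidlist_resume_index(pids, 4, 9));   /* 2: next larger is 12 */
	printf("%d\n", pidlist_resume_index(pids, 4, 99));  /* 4: off the end */
	return 0;
}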
2286static void cgroup_tasks_stop(struct seq_file *s, void *v) 2648static void cgroup_pidlist_stop(struct seq_file *s, void *v)
2287{ 2649{
2288 struct cgroup_pids *cp = s->private; 2650 struct cgroup_pidlist *l = s->private;
2289 struct cgroup *cgrp = cp->cgrp; 2651 up_read(&l->mutex);
2290 up_read(&cgrp->pids_mutex);
2291} 2652}
2292 2653
2293static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2654static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
2294{ 2655{
2295 struct cgroup_pids *cp = s->private; 2656 struct cgroup_pidlist *l = s->private;
2296 int *p = v; 2657 pid_t *p = v;
2297 int *end = cp->tasks_pids + cp->length; 2658 pid_t *end = l->list + l->length;
2298
2299 /* 2659 /*
2300 * Advance to the next pid in the array. If this goes off the 2660 * Advance to the next pid in the array. If this goes off the
2301 * end, we're done 2661 * end, we're done
@@ -2309,124 +2669,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2309 } 2669 }
2310} 2670}
2311 2671
2312static int cgroup_tasks_show(struct seq_file *s, void *v) 2672static int cgroup_pidlist_show(struct seq_file *s, void *v)
2313{ 2673{
2314 return seq_printf(s, "%d\n", *(int *)v); 2674 return seq_printf(s, "%d\n", *(int *)v);
2315} 2675}
2316 2676
2317static const struct seq_operations cgroup_tasks_seq_operations = { 2677/*
2318 .start = cgroup_tasks_start, 2678 * seq_operations functions for iterating on pidlists through seq_file -
2319 .stop = cgroup_tasks_stop, 2679 * independent of whether it's tasks or procs
2320 .next = cgroup_tasks_next, 2680 */
2321 .show = cgroup_tasks_show, 2681static const struct seq_operations cgroup_pidlist_seq_operations = {
2682 .start = cgroup_pidlist_start,
2683 .stop = cgroup_pidlist_stop,
2684 .next = cgroup_pidlist_next,
2685 .show = cgroup_pidlist_show,
2322}; 2686};
2323 2687
2324static void release_cgroup_pid_array(struct cgroup_pids *cp) 2688static void cgroup_release_pid_array(struct cgroup_pidlist *l)
2325{ 2689{
2326 struct cgroup *cgrp = cp->cgrp; 2690 /*
2327 2691 * the case where we're the last user of this particular pidlist will
2328 down_write(&cgrp->pids_mutex); 2692 * have us remove it from the cgroup's list, which entails taking the
2329 BUG_ON(!cp->use_count); 2693 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
2330 if (!--cp->use_count) { 2694 * pidlist_mutex, we have to take pidlist_mutex first.
2331 list_del(&cp->list); 2695 */
2332 put_pid_ns(cp->ns); 2696 mutex_lock(&l->owner->pidlist_mutex);
2333 kfree(cp->tasks_pids); 2697 down_write(&l->mutex);
2334 kfree(cp); 2698 BUG_ON(!l->use_count);
2699 if (!--l->use_count) {
2700 /* we're the last user if refcount is 0; remove and free */
2701 list_del(&l->links);
2702 mutex_unlock(&l->owner->pidlist_mutex);
2703 pidlist_free(l->list);
2704 put_pid_ns(l->key.ns);
2705 up_write(&l->mutex);
2706 kfree(l);
2707 return;
2335 } 2708 }
2336 up_write(&cgrp->pids_mutex); 2709 mutex_unlock(&l->owner->pidlist_mutex);
2710 up_write(&l->mutex);
2337} 2711}
2338 2712
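cgroup_release_pid_array() has to honour the same lock order as the lookup path: the cgroup-wide pidlist_mutex first, then the entry's rwsem, and only a use count that drops to zero unlinks and frees the entry. A hedged continuation of the pidlist_find() sketch above showing just the drop-ref-and-maybe-unlink step with pthreads; the per-entry lock is omitted, so only the outer-lock-around-refcount shape is shown:

/* Companion to the pidlist_find() sketch above: drop one reference and
 * unlink/free the entry when the last user goes away. */
static void pidlist_release(struct pidlist *victim)
{
	struct pidlist **pp;

	pthread_mutex_lock(&lists_lock);
	if (--victim->use_count == 0) {
		for (pp = &lists; *pp; pp = &(*pp)->next) {
			if (*pp == victim) {
				*pp = victim->next;   /* unlink from the global list */
				break;
			}
		}
		free(victim);
	}
	pthread_mutex_unlock(&lists_lock);
}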
2339static int cgroup_tasks_release(struct inode *inode, struct file *file) 2713static int cgroup_pidlist_release(struct inode *inode, struct file *file)
2340{ 2714{
2341 struct seq_file *seq; 2715 struct cgroup_pidlist *l;
2342 struct cgroup_pids *cp;
2343
2344 if (!(file->f_mode & FMODE_READ)) 2716 if (!(file->f_mode & FMODE_READ))
2345 return 0; 2717 return 0;
2346 2718 /*
2347 seq = file->private_data; 2719 * the seq_file will only be initialized if the file was opened for
2348 cp = seq->private; 2720 * reading; hence we check if it's not null only in that case.
2349 2721 */
2350 release_cgroup_pid_array(cp); 2722 l = ((struct seq_file *)file->private_data)->private;
2723 cgroup_release_pid_array(l);
2351 return seq_release(inode, file); 2724 return seq_release(inode, file);
2352} 2725}
2353 2726
2354static struct file_operations cgroup_tasks_operations = { 2727static const struct file_operations cgroup_pidlist_operations = {
2355 .read = seq_read, 2728 .read = seq_read,
2356 .llseek = seq_lseek, 2729 .llseek = seq_lseek,
2357 .write = cgroup_file_write, 2730 .write = cgroup_file_write,
2358 .release = cgroup_tasks_release, 2731 .release = cgroup_pidlist_release,
2359}; 2732};
2360 2733
2361/* 2734/*
2362 * Handle an open on 'tasks' file. Prepare an array containing the 2735 * The following functions handle opens on a file that displays a pidlist
2363 * process id's of tasks currently attached to the cgroup being opened. 2736 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
2737 * in the cgroup.
2364 */ 2738 */
2365 2739/* helper function for the two below it */
2366static int cgroup_tasks_open(struct inode *unused, struct file *file) 2740static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
2367{ 2741{
2368 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2742 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2369 struct pid_namespace *ns = current->nsproxy->pid_ns; 2743 struct cgroup_pidlist *l;
2370 struct cgroup_pids *cp;
2371 pid_t *pidarray;
2372 int npids;
2373 int retval; 2744 int retval;
2374 2745
2375 /* Nothing to do for write-only files */ 2746 /* Nothing to do for write-only files */
2376 if (!(file->f_mode & FMODE_READ)) 2747 if (!(file->f_mode & FMODE_READ))
2377 return 0; 2748 return 0;
2378 2749
2379 /* 2750 /* have the array populated */
2380 * If cgroup gets more users after we read count, we won't have 2751 retval = pidlist_array_load(cgrp, type, &l);
2381 * enough space - tough. This race is indistinguishable to the 2752 if (retval)
2382 * caller from the case that the additional cgroup users didn't 2753 return retval;
2383 * show up until sometime later on. 2754 /* configure file information */
2384 */ 2755 file->f_op = &cgroup_pidlist_operations;
2385 npids = cgroup_task_count(cgrp);
2386 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2387 if (!pidarray)
2388 return -ENOMEM;
2389 npids = pid_array_load(pidarray, npids, cgrp);
2390 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2391
2392 /*
2393 * Store the array in the cgroup, freeing the old
2394 * array if necessary
2395 */
2396 down_write(&cgrp->pids_mutex);
2397
2398 list_for_each_entry(cp, &cgrp->pids_list, list) {
2399 if (ns == cp->ns)
2400 goto found;
2401 }
2402
2403 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2404 if (!cp) {
2405 up_write(&cgrp->pids_mutex);
2406 kfree(pidarray);
2407 return -ENOMEM;
2408 }
2409 cp->cgrp = cgrp;
2410 cp->ns = ns;
2411 get_pid_ns(ns);
2412 list_add(&cp->list, &cgrp->pids_list);
2413found:
2414 kfree(cp->tasks_pids);
2415 cp->tasks_pids = pidarray;
2416 cp->length = npids;
2417 cp->use_count++;
2418 up_write(&cgrp->pids_mutex);
2419
2420 file->f_op = &cgroup_tasks_operations;
2421 2756
2422 retval = seq_open(file, &cgroup_tasks_seq_operations); 2757 retval = seq_open(file, &cgroup_pidlist_seq_operations);
2423 if (retval) { 2758 if (retval) {
2424 release_cgroup_pid_array(cp); 2759 cgroup_release_pid_array(l);
2425 return retval; 2760 return retval;
2426 } 2761 }
2427 ((struct seq_file *)file->private_data)->private = cp; 2762 ((struct seq_file *)file->private_data)->private = l;
2428 return 0; 2763 return 0;
2429} 2764}
2765static int cgroup_tasks_open(struct inode *unused, struct file *file)
2766{
2767 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
2768}
2769static int cgroup_procs_open(struct inode *unused, struct file *file)
2770{
2771 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
2772}
2430 2773
2431static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2774static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2432 struct cftype *cft) 2775 struct cftype *cft)
@@ -2449,21 +2792,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2449/* 2792/*
2450 * for the common functions, 'private' gives the type of file 2793 * for the common functions, 'private' gives the type of file
2451 */ 2794 */
2795/* for hysterical raisins, we can't put this on the older files */
2796#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
2452static struct cftype files[] = { 2797static struct cftype files[] = {
2453 { 2798 {
2454 .name = "tasks", 2799 .name = "tasks",
2455 .open = cgroup_tasks_open, 2800 .open = cgroup_tasks_open,
2456 .write_u64 = cgroup_tasks_write, 2801 .write_u64 = cgroup_tasks_write,
2457 .release = cgroup_tasks_release, 2802 .release = cgroup_pidlist_release,
2458 .private = FILE_TASKLIST,
2459 .mode = S_IRUGO | S_IWUSR, 2803 .mode = S_IRUGO | S_IWUSR,
2460 }, 2804 },
2461 2805 {
2806 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
2807 .open = cgroup_procs_open,
2808 /* .write_u64 = cgroup_procs_write, TODO */
2809 .release = cgroup_pidlist_release,
2810 .mode = S_IRUGO,
2811 },
2462 { 2812 {
2463 .name = "notify_on_release", 2813 .name = "notify_on_release",
2464 .read_u64 = cgroup_read_notify_on_release, 2814 .read_u64 = cgroup_read_notify_on_release,
2465 .write_u64 = cgroup_write_notify_on_release, 2815 .write_u64 = cgroup_write_notify_on_release,
2466 .private = FILE_NOTIFY_ON_RELEASE,
2467 }, 2816 },
2468}; 2817};
2469 2818
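The new control file is named via compile-time string-literal concatenation, so CGROUP_FILE_GENERIC_PREFIX "procs" becomes "cgroup.procs", while the legacy "tasks" and "notify_on_release" names stay unprefixed for compatibility. A two-line illustration of that preprocessor behaviour:

#include <stdio.h>

#define CGROUP_FILE_GENERIC_PREFIX "cgroup."

int main(void)
{
	/* Adjacent string literals are concatenated by the compiler. */
	printf("%s\n", CGROUP_FILE_GENERIC_PREFIX "procs");   /* prints cgroup.procs */
	return 0;
}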
@@ -2472,7 +2821,6 @@ static struct cftype cft_release_agent = {
2472 .read_seq_string = cgroup_release_agent_show, 2821 .read_seq_string = cgroup_release_agent_show,
2473 .write_string = cgroup_release_agent_write, 2822 .write_string = cgroup_release_agent_write,
2474 .max_write_len = PATH_MAX, 2823 .max_write_len = PATH_MAX,
2475 .private = FILE_RELEASE_AGENT,
2476}; 2824};
2477 2825
2478static int cgroup_populate_dir(struct cgroup *cgrp) 2826static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -2879,6 +3227,7 @@ int __init cgroup_init_early(void)
2879 init_task.cgroups = &init_css_set; 3227 init_task.cgroups = &init_css_set;
2880 3228
2881 init_css_set_link.cg = &init_css_set; 3229 init_css_set_link.cg = &init_css_set;
3230 init_css_set_link.cgrp = dummytop;
2882 list_add(&init_css_set_link.cgrp_link_list, 3231 list_add(&init_css_set_link.cgrp_link_list,
2883 &rootnode.top_cgroup.css_sets); 3232 &rootnode.top_cgroup.css_sets);
2884 list_add(&init_css_set_link.cg_link_list, 3233 list_add(&init_css_set_link.cg_link_list,
@@ -2933,7 +3282,7 @@ int __init cgroup_init(void)
2933 /* Add init_css_set to the hash table */ 3282 /* Add init_css_set to the hash table */
2934 hhead = css_set_hash(init_css_set.subsys); 3283 hhead = css_set_hash(init_css_set.subsys);
2935 hlist_add_head(&init_css_set.hlist, hhead); 3284 hlist_add_head(&init_css_set.hlist, hhead);
2936 3285 BUG_ON(!init_root_id(&rootnode));
2937 err = register_filesystem(&cgroup_fs_type); 3286 err = register_filesystem(&cgroup_fs_type);
2938 if (err < 0) 3287 if (err < 0)
2939 goto out; 3288 goto out;
@@ -2986,15 +3335,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2986 for_each_active_root(root) { 3335 for_each_active_root(root) {
2987 struct cgroup_subsys *ss; 3336 struct cgroup_subsys *ss;
2988 struct cgroup *cgrp; 3337 struct cgroup *cgrp;
2989 int subsys_id;
2990 int count = 0; 3338 int count = 0;
2991 3339
2992 seq_printf(m, "%lu:", root->subsys_bits); 3340 seq_printf(m, "%d:", root->hierarchy_id);
2993 for_each_subsys(root, ss) 3341 for_each_subsys(root, ss)
2994 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 3342 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3343 if (strlen(root->name))
3344 seq_printf(m, "%sname=%s", count ? "," : "",
3345 root->name);
2995 seq_putc(m, ':'); 3346 seq_putc(m, ':');
2996 get_first_subsys(&root->top_cgroup, NULL, &subsys_id); 3347 cgrp = task_cgroup_from_root(tsk, root);
2997 cgrp = task_cgroup(tsk, subsys_id);
2998 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 3348 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2999 if (retval < 0) 3349 if (retval < 0)
3000 goto out_unlock; 3350 goto out_unlock;
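With this change each line of /proc/<pid>/cgroup starts with the numeric hierarchy_id rather than the old subsystem bitmask, followed by the comma-separated controller names, an optional name= tag when the hierarchy was mounted with a name, and finally the cgroup path. Purely illustrative output for a process attached to two hypothetical hierarchies (ids, names and paths are made up, but the field layout follows the seq_printf calls above):

    2:cpuset,cpu:/webservers
    1:freezer,name=mygroup:/batch/low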
@@ -3017,7 +3367,7 @@ static int cgroup_open(struct inode *inode, struct file *file)
3017 return single_open(file, proc_cgroup_show, pid); 3367 return single_open(file, proc_cgroup_show, pid);
3018} 3368}
3019 3369
3020struct file_operations proc_cgroup_operations = { 3370const struct file_operations proc_cgroup_operations = {
3021 .open = cgroup_open, 3371 .open = cgroup_open,
3022 .read = seq_read, 3372 .read = seq_read,
3023 .llseek = seq_lseek, 3373 .llseek = seq_lseek,
@@ -3033,8 +3383,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3033 mutex_lock(&cgroup_mutex); 3383 mutex_lock(&cgroup_mutex);
3034 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3384 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3035 struct cgroup_subsys *ss = subsys[i]; 3385 struct cgroup_subsys *ss = subsys[i];
3036 seq_printf(m, "%s\t%lu\t%d\t%d\n", 3386 seq_printf(m, "%s\t%d\t%d\t%d\n",
3037 ss->name, ss->root->subsys_bits, 3387 ss->name, ss->root->hierarchy_id,
3038 ss->root->number_of_cgroups, !ss->disabled); 3388 ss->root->number_of_cgroups, !ss->disabled);
3039 } 3389 }
3040 mutex_unlock(&cgroup_mutex); 3390 mutex_unlock(&cgroup_mutex);
@@ -3046,7 +3396,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file)
3046 return single_open(file, proc_cgroupstats_show, NULL); 3396 return single_open(file, proc_cgroupstats_show, NULL);
3047} 3397}
3048 3398
3049static struct file_operations proc_cgroupstats_operations = { 3399static const struct file_operations proc_cgroupstats_operations = {
3050 .open = cgroupstats_open, 3400 .open = cgroupstats_open,
3051 .read = seq_read, 3401 .read = seq_read,
3052 .llseek = seq_lseek, 3402 .llseek = seq_lseek,
@@ -3320,13 +3670,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3320{ 3670{
3321 int ret; 3671 int ret;
3322 struct cgroup *target; 3672 struct cgroup *target;
3323 int subsys_id;
3324 3673
3325 if (cgrp == dummytop) 3674 if (cgrp == dummytop)
3326 return 1; 3675 return 1;
3327 3676
3328 get_first_subsys(cgrp, NULL, &subsys_id); 3677 target = task_cgroup_from_root(task, cgrp->root);
3329 target = task_cgroup(task, subsys_id);
3330 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3678 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3331 cgrp = cgrp->parent; 3679 cgrp = cgrp->parent;
3332 ret = (cgrp == target); 3680 ret = (cgrp == target);
@@ -3358,8 +3706,10 @@ static void check_for_release(struct cgroup *cgrp)
3358void __css_put(struct cgroup_subsys_state *css) 3706void __css_put(struct cgroup_subsys_state *css)
3359{ 3707{
3360 struct cgroup *cgrp = css->cgroup; 3708 struct cgroup *cgrp = css->cgroup;
3709 int val;
3361 rcu_read_lock(); 3710 rcu_read_lock();
3362 if (atomic_dec_return(&css->refcnt) == 1) { 3711 val = atomic_dec_return(&css->refcnt);
3712 if (val == 1) {
3363 if (notify_on_release(cgrp)) { 3713 if (notify_on_release(cgrp)) {
3364 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3714 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3365 check_for_release(cgrp); 3715 check_for_release(cgrp);
@@ -3367,6 +3717,7 @@ void __css_put(struct cgroup_subsys_state *css)
3367 cgroup_wakeup_rmdir_waiter(cgrp); 3717 cgroup_wakeup_rmdir_waiter(cgrp);
3368 } 3718 }
3369 rcu_read_unlock(); 3719 rcu_read_unlock();
3720 WARN_ON_ONCE(val < 1);
3370} 3721}
3371 3722
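The __css_put() change captures the post-decrement refcount once, uses it for the release-notification test, and then, after dropping the RCU read lock, warns via WARN_ON_ONCE() if the count ever fell below its biased floor. A small C11-atomics sketch of the capture-once-then-check shape, with the bias and the RCU details left out and a plain fprintf standing in for WARN_ON_ONCE():

#include <stdatomic.h>
#include <stdio.h>

static atomic_int refcnt = 1;

static void put_ref(void)
{
	/* atomic_fetch_sub returns the value before the subtraction. */
	int val = atomic_fetch_sub(&refcnt, 1) - 1;

	if (val == 0) {
		/* last reference dropped: release-side work would go here */
	}
	if (val < 0)
		fprintf(stderr, "refcount underflow (%d)\n", val);  /* WARN_ON_ONCE stand-in */
}

int main(void)
{
	put_ref();      /* drops to 0: release path */
	put_ref();      /* drops to -1: would trigger the warning */
	return 0;
}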
3372/* 3723/*
@@ -3693,3 +4044,154 @@ css_get_next(struct cgroup_subsys *ss, int id,
3693 return ret; 4044 return ret;
3694} 4045}
3695 4046
4047#ifdef CONFIG_CGROUP_DEBUG
4048static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4049 struct cgroup *cont)
4050{
4051 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4052
4053 if (!css)
4054 return ERR_PTR(-ENOMEM);
4055
4056 return css;
4057}
4058
4059static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
4060{
4061 kfree(cont->subsys[debug_subsys_id]);
4062}
4063
4064static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
4065{
4066 return atomic_read(&cont->count);
4067}
4068
4069static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
4070{
4071 return cgroup_task_count(cont);
4072}
4073
4074static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
4075{
4076 return (u64)(unsigned long)current->cgroups;
4077}
4078
4079static u64 current_css_set_refcount_read(struct cgroup *cont,
4080 struct cftype *cft)
4081{
4082 u64 count;
4083
4084 rcu_read_lock();
4085 count = atomic_read(&current->cgroups->refcount);
4086 rcu_read_unlock();
4087 return count;
4088}
4089
4090static int current_css_set_cg_links_read(struct cgroup *cont,
4091 struct cftype *cft,
4092 struct seq_file *seq)
4093{
4094 struct cg_cgroup_link *link;
4095 struct css_set *cg;
4096
4097 read_lock(&css_set_lock);
4098 rcu_read_lock();
4099 cg = rcu_dereference(current->cgroups);
4100 list_for_each_entry(link, &cg->cg_links, cg_link_list) {
4101 struct cgroup *c = link->cgrp;
4102 const char *name;
4103
4104 if (c->dentry)
4105 name = c->dentry->d_name.name;
4106 else
4107 name = "?";
4108 seq_printf(seq, "Root %d group %s\n",
4109 c->root->hierarchy_id, name);
4110 }
4111 rcu_read_unlock();
4112 read_unlock(&css_set_lock);
4113 return 0;
4114}
4115
4116#define MAX_TASKS_SHOWN_PER_CSS 25
4117static int cgroup_css_links_read(struct cgroup *cont,
4118 struct cftype *cft,
4119 struct seq_file *seq)
4120{
4121 struct cg_cgroup_link *link;
4122
4123 read_lock(&css_set_lock);
4124 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
4125 struct css_set *cg = link->cg;
4126 struct task_struct *task;
4127 int count = 0;
4128 seq_printf(seq, "css_set %p\n", cg);
4129 list_for_each_entry(task, &cg->tasks, cg_list) {
4130 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
4131 seq_puts(seq, " ...\n");
4132 break;
4133 } else {
4134 seq_printf(seq, " task %d\n",
4135 task_pid_vnr(task));
4136 }
4137 }
4138 }
4139 read_unlock(&css_set_lock);
4140 return 0;
4141}
4142
4143static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
4144{
4145 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
4146}
4147
4148static struct cftype debug_files[] = {
4149 {
4150 .name = "cgroup_refcount",
4151 .read_u64 = cgroup_refcount_read,
4152 },
4153 {
4154 .name = "taskcount",
4155 .read_u64 = debug_taskcount_read,
4156 },
4157
4158 {
4159 .name = "current_css_set",
4160 .read_u64 = current_css_set_read,
4161 },
4162
4163 {
4164 .name = "current_css_set_refcount",
4165 .read_u64 = current_css_set_refcount_read,
4166 },
4167
4168 {
4169 .name = "current_css_set_cg_links",
4170 .read_seq_string = current_css_set_cg_links_read,
4171 },
4172
4173 {
4174 .name = "cgroup_css_links",
4175 .read_seq_string = cgroup_css_links_read,
4176 },
4177
4178 {
4179 .name = "releasable",
4180 .read_u64 = releasable_read,
4181 },
4182};
4183
4184static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
4185{
4186 return cgroup_add_files(cont, ss, debug_files,
4187 ARRAY_SIZE(debug_files));
4188}
4189
4190struct cgroup_subsys debug_subsys = {
4191 .name = "debug",
4192 .create = debug_create,
4193 .destroy = debug_destroy,
4194 .populate = debug_populate,
4195 .subsys_id = debug_subsys_id,
4196};
4197#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d797baa6..000000000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 count = cgroup_task_count(cont);
44 return count;
45}
46
47static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
48{
49 return (u64)(long)current->cgroups;
50}
51
52static u64 current_css_set_refcount_read(struct cgroup *cont,
53 struct cftype *cft)
54{
55 u64 count;
56
57 rcu_read_lock();
58 count = atomic_read(&current->cgroups->refcount);
59 rcu_read_unlock();
60 return count;
61}
62
63static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
64{
65 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
66}
67
68static struct cftype files[] = {
69 {
70 .name = "cgroup_refcount",
71 .read_u64 = cgroup_refcount_read,
72 },
73 {
74 .name = "taskcount",
75 .read_u64 = taskcount_read,
76 },
77
78 {
79 .name = "current_css_set",
80 .read_u64 = current_css_set_read,
81 },
82
83 {
84 .name = "current_css_set_refcount",
85 .read_u64 = current_css_set_refcount_read,
86 },
87
88 {
89 .name = "releasable",
90 .read_u64 = releasable_read,
91 },
92};
93
94static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
95{
96 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
97}
98
99struct cgroup_subsys debug_subsys = {
100 .name = "debug",
101 .create = debug_create,
102 .destroy = debug_destroy,
103 .populate = debug_populate,
104 .subsys_id = debug_subsys_id,
105};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcada..59e9ef6aab40 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
159 */ 159 */
160static int freezer_can_attach(struct cgroup_subsys *ss, 160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup, 161 struct cgroup *new_cgroup,
162 struct task_struct *task) 162 struct task_struct *task, bool threadgroup)
163{ 163{
164 struct freezer *freezer; 164 struct freezer *freezer;
165 165
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
177 if (freezer->state == CGROUP_FROZEN) 177 if (freezer->state == CGROUP_FROZEN)
178 return -EBUSY; 178 return -EBUSY;
179 179
180 if (threadgroup) {
181 struct task_struct *c;
182
183 rcu_read_lock();
184 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
185 if (is_task_frozen_enough(c)) {
186 rcu_read_unlock();
187 return -EBUSY;
188 }
189 }
190 rcu_read_unlock();
191 }
192
180 return 0; 193 return 0;
181} 194}
182 195
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb212..291ac586f37f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 213 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 214 if (err == NOTIFY_BAD) {
215 set_cpu_active(cpu, true);
216
215 nr_calls--; 217 nr_calls--;
216 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 218 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
217 hcpu, nr_calls, NULL); 219 hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 225
224 /* Ensure that we are not runnable on dying cpu */ 226 /* Ensure that we are not runnable on dying cpu */
225 cpumask_copy(old_allowed, &current->cpus_allowed); 227 cpumask_copy(old_allowed, &current->cpus_allowed);
226 set_cpus_allowed_ptr(current, 228 set_cpus_allowed_ptr(current, cpu_active_mask);
227 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
228 229
229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 230 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
230 if (err) { 231 if (err) {
232 set_cpu_active(cpu, true);
231 /* CPU didn't die: tell everyone. Can't complain. */ 233 /* CPU didn't die: tell everyone. Can't complain. */
232 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 234 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
233 hcpu) == NOTIFY_BAD) 235 hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)
292 294
293 err = _cpu_down(cpu, 0); 295 err = _cpu_down(cpu, 0);
294 296
295 if (cpu_online(cpu))
296 set_cpu_active(cpu, true);
297
298out: 297out:
299 cpu_maps_update_done(); 298 cpu_maps_update_done();
300 stop_machine_destroy(); 299 stop_machine_destroy();
@@ -387,15 +386,23 @@ int disable_nonboot_cpus(void)
387 * with the userspace trying to use the CPU hotplug at the same time 386 * with the userspace trying to use the CPU hotplug at the same time
388 */ 387 */
389 cpumask_clear(frozen_cpus); 388 cpumask_clear(frozen_cpus);
389
390 for_each_online_cpu(cpu) {
391 if (cpu == first_cpu)
392 continue;
393 set_cpu_active(cpu, false);
394 }
395
396 synchronize_sched();
397
390 printk("Disabling non-boot CPUs ...\n"); 398 printk("Disabling non-boot CPUs ...\n");
391 for_each_online_cpu(cpu) { 399 for_each_online_cpu(cpu) {
392 if (cpu == first_cpu) 400 if (cpu == first_cpu)
393 continue; 401 continue;
394 error = _cpu_down(cpu, 1); 402 error = _cpu_down(cpu, 1);
395 if (!error) { 403 if (!error)
396 cpumask_set_cpu(cpu, frozen_cpus); 404 cpumask_set_cpu(cpu, frozen_cpus);
397 printk("CPU%d is down\n", cpu); 405 else {
398 } else {
399 printk(KERN_ERR "Error taking CPU%d down: %d\n", 406 printk(KERN_ERR "Error taking CPU%d down: %d\n",
400 cpu, error); 407 cpu, error);
401 break; 408 break;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd508..ba401fab459f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
537 * element of the partition (one sched domain) to be passed to 537 * element of the partition (one sched domain) to be passed to
538 * partition_sched_domains(). 538 * partition_sched_domains().
539 */ 539 */
540/* FIXME: see the FIXME in partition_sched_domains() */ 540static int generate_sched_domains(cpumask_var_t **domains,
541static int generate_sched_domains(struct cpumask **domains,
542 struct sched_domain_attr **attributes) 541 struct sched_domain_attr **attributes)
543{ 542{
544 LIST_HEAD(q); /* queue of cpusets to be scanned */ 543 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
546 struct cpuset **csa; /* array of all cpuset ptrs */ 545 struct cpuset **csa; /* array of all cpuset ptrs */
547 int csn; /* how many cpuset ptrs in csa so far */ 546 int csn; /* how many cpuset ptrs in csa so far */
548 int i, j, k; /* indices for partition finding loops */ 547 int i, j, k; /* indices for partition finding loops */
549 struct cpumask *doms; /* resulting partition; i.e. sched domains */ 548 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
550 struct sched_domain_attr *dattr; /* attributes for custom domains */ 549 struct sched_domain_attr *dattr; /* attributes for custom domains */
551 int ndoms = 0; /* number of sched domains in result */ 550 int ndoms = 0; /* number of sched domains in result */
552 int nslot; /* next empty doms[] struct cpumask slot */ 551 int nslot; /* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
557 556
558 /* Special case for the 99% of systems with one, full, sched domain */ 557 /* Special case for the 99% of systems with one, full, sched domain */
559 if (is_sched_load_balance(&top_cpuset)) { 558 if (is_sched_load_balance(&top_cpuset)) {
560 doms = kmalloc(cpumask_size(), GFP_KERNEL); 559 ndoms = 1;
560 doms = alloc_sched_domains(ndoms);
561 if (!doms) 561 if (!doms)
562 goto done; 562 goto done;
563 563
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
566 *dattr = SD_ATTR_INIT; 566 *dattr = SD_ATTR_INIT;
567 update_domain_attr_tree(dattr, &top_cpuset); 567 update_domain_attr_tree(dattr, &top_cpuset);
568 } 568 }
569 cpumask_copy(doms, top_cpuset.cpus_allowed); 569 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
570 570
571 ndoms = 1;
572 goto done; 571 goto done;
573 } 572 }
574 573
@@ -636,7 +635,7 @@ restart:
636 * Now we know how many domains to create. 635 * Now we know how many domains to create.
637 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 636 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
638 */ 637 */
639 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); 638 doms = alloc_sched_domains(ndoms);
640 if (!doms) 639 if (!doms)
641 goto done; 640 goto done;
642 641
@@ -656,7 +655,7 @@ restart:
656 continue; 655 continue;
657 } 656 }
658 657
659 dp = doms + nslot; 658 dp = doms[nslot];
660 659
661 if (nslot == ndoms) { 660 if (nslot == ndoms) {
662 static int warnings = 10; 661 static int warnings = 10;
@@ -718,7 +717,7 @@ done:
718static void do_rebuild_sched_domains(struct work_struct *unused) 717static void do_rebuild_sched_domains(struct work_struct *unused)
719{ 718{
720 struct sched_domain_attr *attr; 719 struct sched_domain_attr *attr;
721 struct cpumask *doms; 720 cpumask_var_t *doms;
722 int ndoms; 721 int ndoms;
723 722
724 get_online_cpus(); 723 get_online_cpus();
@@ -738,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
738{ 737{
739} 738}
740 739
741static int generate_sched_domains(struct cpumask **domains, 740static int generate_sched_domains(cpumask_var_t **domains,
742 struct sched_domain_attr **attributes) 741 struct sched_domain_attr **attributes)
743{ 742{
744 *domains = NULL; 743 *domains = NULL;
@@ -873,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
873 if (retval < 0) 872 if (retval < 0)
874 return retval; 873 return retval;
875 874
876 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) 875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
877 return -EINVAL; 876 return -EINVAL;
878 } 877 }
879 retval = validate_change(cs, trialcs); 878 retval = validate_change(cs, trialcs);
@@ -1324,9 +1323,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1324static cpumask_var_t cpus_attach; 1323static cpumask_var_t cpus_attach;
1325 1324
1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1325/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1327static int cpuset_can_attach(struct cgroup_subsys *ss, 1326static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1328 struct cgroup *cont, struct task_struct *tsk) 1327 struct task_struct *tsk, bool threadgroup)
1329{ 1328{
1329 int ret;
1330 struct cpuset *cs = cgroup_cs(cont); 1330 struct cpuset *cs = cgroup_cs(cont);
1331 1331
1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1343,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1343 if (tsk->flags & PF_THREAD_BOUND) 1343 if (tsk->flags & PF_THREAD_BOUND)
1344 return -EINVAL; 1344 return -EINVAL;
1345 1345
1346 return security_task_setscheduler(tsk, 0, NULL); 1346 ret = security_task_setscheduler(tsk, 0, NULL);
1347 if (ret)
1348 return ret;
1349 if (threadgroup) {
1350 struct task_struct *c;
1351
1352 rcu_read_lock();
1353 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1354 ret = security_task_setscheduler(c, 0, NULL);
1355 if (ret) {
1356 rcu_read_unlock();
1357 return ret;
1358 }
1359 }
1360 rcu_read_unlock();
1361 }
1362 return 0;
1363}
1364
1365static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1366 struct cpuset *cs)
1367{
1368 int err;
1369 /*
1370 * can_attach beforehand should guarantee that this doesn't fail.
1371 * TODO: have a better way to handle failure here
1372 */
1373 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1374 WARN_ON_ONCE(err);
1375
1376 task_lock(tsk);
1377 cpuset_change_task_nodemask(tsk, to);
1378 task_unlock(tsk);
1379 cpuset_update_task_spread_flag(cs, tsk);
1380
1347} 1381}
1348 1382
1349static void cpuset_attach(struct cgroup_subsys *ss, 1383static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1350 struct cgroup *cont, struct cgroup *oldcont, 1384 struct cgroup *oldcont, struct task_struct *tsk,
1351 struct task_struct *tsk) 1385 bool threadgroup)
1352{ 1386{
1353 nodemask_t from, to; 1387 nodemask_t from, to;
1354 struct mm_struct *mm; 1388 struct mm_struct *mm;
1355 struct cpuset *cs = cgroup_cs(cont); 1389 struct cpuset *cs = cgroup_cs(cont);
1356 struct cpuset *oldcs = cgroup_cs(oldcont); 1390 struct cpuset *oldcs = cgroup_cs(oldcont);
1357 int err;
1358 1391
1359 if (cs == &top_cpuset) { 1392 if (cs == &top_cpuset) {
1360 cpumask_copy(cpus_attach, cpu_possible_mask); 1393 cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1396,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1363 guarantee_online_cpus(cs, cpus_attach); 1396 guarantee_online_cpus(cs, cpus_attach);
1364 guarantee_online_mems(cs, &to); 1397 guarantee_online_mems(cs, &to);
1365 } 1398 }
1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1367 if (err)
1368 return;
1369 1399
1370 task_lock(tsk); 1400 /* do per-task migration stuff possibly for each in the threadgroup */
1371 cpuset_change_task_nodemask(tsk, &to); 1401 cpuset_attach_task(tsk, &to, cs);
1372 task_unlock(tsk); 1402 if (threadgroup) {
1373 cpuset_update_task_spread_flag(cs, tsk); 1403 struct task_struct *c;
1404 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs);
1407 }
1408 rcu_read_unlock();
1409 }
1374 1410
1411 /* change mm; only needs to be done once even if threadgroup */
1375 from = oldcs->mems_allowed; 1412 from = oldcs->mems_allowed;
1376 to = cs->mems_allowed; 1413 to = cs->mems_allowed;
1377 mm = get_task_mm(tsk); 1414 mm = get_task_mm(tsk);
@@ -1973,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1973 } 2010 }
1974 2011
1975 /* Continue past cpusets with all cpus, mems online */ 2012 /* Continue past cpusets with all cpus, mems online */
1976 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && 2013 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
1977 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1978 continue; 2015 continue;
1979 2016
@@ -1982,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1982 /* Remove offline cpus and mems from this cpuset. */ 2019 /* Remove offline cpus and mems from this cpuset. */
1983 mutex_lock(&callback_mutex); 2020 mutex_lock(&callback_mutex);
1984 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
1985 cpu_online_mask); 2022 cpu_active_mask);
1986 nodes_and(cp->mems_allowed, cp->mems_allowed, 2023 nodes_and(cp->mems_allowed, cp->mems_allowed,
1987 node_states[N_HIGH_MEMORY]); 2024 node_states[N_HIGH_MEMORY]);
1988 mutex_unlock(&callback_mutex); 2025 mutex_unlock(&callback_mutex);
@@ -2014,14 +2051,16 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2014 unsigned long phase, void *unused_cpu) 2051 unsigned long phase, void *unused_cpu)
2015{ 2052{
2016 struct sched_domain_attr *attr; 2053 struct sched_domain_attr *attr;
2017 struct cpumask *doms; 2054 cpumask_var_t *doms;
2018 int ndoms; 2055 int ndoms;
2019 2056
2020 switch (phase) { 2057 switch (phase) {
2021 case CPU_ONLINE: 2058 case CPU_ONLINE:
2022 case CPU_ONLINE_FROZEN: 2059 case CPU_ONLINE_FROZEN:
2023 case CPU_DEAD: 2060 case CPU_DOWN_PREPARE:
2024 case CPU_DEAD_FROZEN: 2061 case CPU_DOWN_PREPARE_FROZEN:
2062 case CPU_DOWN_FAILED:
2063 case CPU_DOWN_FAILED_FROZEN:
2025 break; 2064 break;
2026 2065
2027 default: 2066 default:
@@ -2030,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2030 2069
2031 cgroup_lock(); 2070 cgroup_lock();
2032 mutex_lock(&callback_mutex); 2071 mutex_lock(&callback_mutex);
2033 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2034 mutex_unlock(&callback_mutex); 2073 mutex_unlock(&callback_mutex);
2035 scan_for_empty_cpusets(&top_cpuset); 2074 scan_for_empty_cpusets(&top_cpuset);
2036 ndoms = generate_sched_domains(&doms, &attr); 2075 ndoms = generate_sched_domains(&doms, &attr);
@@ -2077,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2077 2116
2078void __init cpuset_init_smp(void) 2117void __init cpuset_init_smp(void)
2079{ 2118{
2080 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2119 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2081 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2120 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2082 2121
2083 hotcpu_notifier(cpuset_track_online_cpus, 0); 2122 hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2499,15 +2538,9 @@ const struct file_operations proc_cpuset_operations = {
2499}; 2538};
2500#endif /* CONFIG_PROC_PID_CPUSET */ 2539#endif /* CONFIG_PROC_PID_CPUSET */
2501 2540
2502/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2541/* Display task mems_allowed in /proc/<pid>/status file. */
2503void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2542void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2504{ 2543{
2505 seq_printf(m, "Cpus_allowed:\t");
2506 seq_cpumask(m, &task->cpus_allowed);
2507 seq_printf(m, "\n");
2508 seq_printf(m, "Cpus_allowed_list:\t");
2509 seq_cpumask_list(m, &task->cpus_allowed);
2510 seq_printf(m, "\n");
2511 seq_printf(m, "Mems_allowed:\t"); 2544 seq_printf(m, "Mems_allowed:\t");
2512 seq_nodemask(m, &task->mems_allowed); 2545 seq_nodemask(m, &task->mems_allowed);
2513 seq_printf(m, "\n"); 2546 seq_printf(m, "\n");
diff --git a/kernel/cred.c b/kernel/cred.c
index d7f7a01082eb..dd76cfe5f5b0 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as);
782 782
783#ifdef CONFIG_DEBUG_CREDENTIALS 783#ifdef CONFIG_DEBUG_CREDENTIALS
784 784
785bool creds_are_invalid(const struct cred *cred)
786{
787 if (cred->magic != CRED_MAGIC)
788 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE)
794 return true;
795 if ((*(u32 *)cred->security & 0xffffff00) ==
796 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
797 return true;
798 }
799#endif
800 return false;
801}
802EXPORT_SYMBOL(creds_are_invalid);
803
785/* 804/*
786 * dump invalid credentials 805 * dump invalid credentials
787 */ 806 */
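creds_are_invalid() is a cheap sanity probe: the magic value must still be intact, the usage count must not have fallen below the subscriber count, and (with SELinux enabled) the security pointer must neither look like a small integer nor point at memory carrying the slab poison byte pattern. A standalone sketch of the magic-plus-poison style of check on a toy struct; the 0x6b byte mirrors POISON_FREE, everything else here is made up for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_MAGIC   0x43524544u   /* "CRED", invented for this sketch */
#define POISON_BYTE 0x6b          /* same value slab poisoning uses */

struct toy_cred {
	uint32_t magic;
	int usage;
	int subscribers;
	void *security;
};

static bool toy_creds_are_invalid(const struct toy_cred *c)
{
	if (c->magic != TOY_MAGIC)
		return true;
	if (c->usage < c->subscribers)
		return true;
	if (c->security) {
		uint32_t word;
		memcpy(&word, c->security, sizeof(word));
		/* three poison bytes in the top 24 bits => likely freed memory */
		if ((word & 0xffffff00u) ==
		    ((uint32_t)POISON_BYTE << 24 | POISON_BYTE << 16 | POISON_BYTE << 8))
			return true;
	}
	return false;
}

int main(void)
{
	struct toy_cred ok = { TOY_MAGIC, 2, 1, NULL };
	printf("%d\n", toy_creds_are_invalid(&ok));   /* prints 0 */
	return 0;
}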
diff --git a/kernel/exit.c b/kernel/exit.c
index 60d6fdcc9265..5962d7ccf243 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -110,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk)
110 * We won't ever get here for the group leader, since it 111 * We won't ever get here for the group leader, since it
111 * will have been the last reference on the signal_struct. 112 * will have been the last reference on the signal_struct.
112 */ 113 */
113 sig->utime = cputime_add(sig->utime, task_utime(tsk)); 114 sig->utime = cputime_add(sig->utime, tsk->utime);
114 sig->stime = cputime_add(sig->stime, task_stime(tsk)); 115 sig->stime = cputime_add(sig->stime, tsk->stime);
115 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 116 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
116 sig->min_flt += tsk->min_flt; 117 sig->min_flt += tsk->min_flt;
117 sig->maj_flt += tsk->maj_flt; 118 sig->maj_flt += tsk->maj_flt;
118 sig->nvcsw += tsk->nvcsw; 119 sig->nvcsw += tsk->nvcsw;
@@ -359,10 +360,8 @@ void __set_special_pids(struct pid *pid)
359{ 360{
360 struct task_struct *curr = current->group_leader; 361 struct task_struct *curr = current->group_leader;
361 362
362 if (task_session(curr) != pid) { 363 if (task_session(curr) != pid)
363 change_pid(curr, PIDTYPE_SID, pid); 364 change_pid(curr, PIDTYPE_SID, pid);
364 proc_sid_connector(curr);
365 }
366 365
367 if (task_pgrp(curr) != pid) 366 if (task_pgrp(curr) != pid)
368 change_pid(curr, PIDTYPE_PGID, pid); 367 change_pid(curr, PIDTYPE_PGID, pid);
@@ -934,7 +933,7 @@ NORET_TYPE void do_exit(long code)
934 * an exiting task cleaning up the robust pi futexes. 933 * an exiting task cleaning up the robust pi futexes.
935 */ 934 */
936 smp_mb(); 935 smp_mb();
937 spin_unlock_wait(&tsk->pi_lock); 936 raw_spin_unlock_wait(&tsk->pi_lock);
938 937
939 if (unlikely(in_atomic())) 938 if (unlikely(in_atomic()))
940 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 939 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -972,16 +971,18 @@ NORET_TYPE void do_exit(long code)
972 exit_thread(); 971 exit_thread();
973 cgroup_exit(tsk, 1); 972 cgroup_exit(tsk, 1);
974 973
975 if (group_dead && tsk->signal->leader) 974 if (group_dead)
976 disassociate_ctty(1); 975 disassociate_ctty(1);
977 976
978 module_put(task_thread_info(tsk)->exec_domain->module); 977 module_put(task_thread_info(tsk)->exec_domain->module);
979 if (tsk->binfmt)
980 module_put(tsk->binfmt->module);
981 978
982 proc_exit_connector(tsk); 979 proc_exit_connector(tsk);
983 980
984 /* 981 /*
982 * FIXME: do that only when needed, using sched_exit tracepoint
983 */
984 flush_ptrace_hw_breakpoint(tsk);
985 /*
985 * Flush inherited counters to the parent - before the parent 986 * Flush inherited counters to the parent - before the parent
986 * gets woken up by child-exit notifications. 987 * gets woken up by child-exit notifications.
987 */ 988 */
@@ -993,8 +994,6 @@ NORET_TYPE void do_exit(long code)
993 tsk->mempolicy = NULL; 994 tsk->mempolicy = NULL;
994#endif 995#endif
995#ifdef CONFIG_FUTEX 996#ifdef CONFIG_FUTEX
996 if (unlikely(!list_empty(&tsk->pi_state_list)))
997 exit_pi_state_list(tsk);
998 if (unlikely(current->pi_state_cache)) 997 if (unlikely(current->pi_state_cache))
999 kfree(current->pi_state_cache); 998 kfree(current->pi_state_cache);
1000#endif 999#endif
@@ -1010,7 +1009,7 @@ NORET_TYPE void do_exit(long code)
1010 tsk->flags |= PF_EXITPIDONE; 1009 tsk->flags |= PF_EXITPIDONE;
1011 1010
1012 if (tsk->io_context) 1011 if (tsk->io_context)
1013 exit_io_context(); 1012 exit_io_context(tsk);
1014 1013
1015 if (tsk->splice_pipe) 1014 if (tsk->splice_pipe)
1016 __free_pipe_info(tsk->splice_pipe); 1015 __free_pipe_info(tsk->splice_pipe);
@@ -1097,28 +1096,28 @@ struct wait_opts {
1097 int __user *wo_stat; 1096 int __user *wo_stat;
1098 struct rusage __user *wo_rusage; 1097 struct rusage __user *wo_rusage;
1099 1098
1099 wait_queue_t child_wait;
1100 int notask_error; 1100 int notask_error;
1101}; 1101};
1102 1102
1103static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1103static inline
1104struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1104{ 1105{
1105 struct pid *pid = NULL; 1106 if (type != PIDTYPE_PID)
1106 if (type == PIDTYPE_PID) 1107 task = task->group_leader;
1107 pid = task->pids[type].pid; 1108 return task->pids[type].pid;
1108 else if (type < PIDTYPE_MAX)
1109 pid = task->group_leader->pids[type].pid;
1110 return pid;
1111} 1109}
1112 1110
1113static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1111static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1114{ 1112{
1115 int err; 1113 return wo->wo_type == PIDTYPE_MAX ||
1116 1114 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1117 if (wo->wo_type < PIDTYPE_MAX) { 1115}
1118 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1119 return 0;
1120 }
1121 1116
1117static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1118{
1119 if (!eligible_pid(wo, p))
1120 return 0;
1122 /* Wait for all children (clone and not) if __WALL is set; 1121 /* Wait for all children (clone and not) if __WALL is set;
1123 * otherwise, wait for clone children *only* if __WCLONE is 1122 * otherwise, wait for clone children *only* if __WCLONE is
1124 * set; otherwise, wait for non-clone children *only*. (Note: 1123 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1128,10 +1127,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1128 && !(wo->wo_flags & __WALL)) 1127 && !(wo->wo_flags & __WALL))
1129 return 0; 1128 return 0;
1130 1129
1131 err = security_task_wait(p);
1132 if (err)
1133 return err;
1134
1135 return 1; 1130 return 1;
1136} 1131}
1137 1132
@@ -1144,18 +1139,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1144 1139
1145 put_task_struct(p); 1140 put_task_struct(p);
1146 infop = wo->wo_info; 1141 infop = wo->wo_info;
1147 if (!retval) 1142 if (infop) {
1148 retval = put_user(SIGCHLD, &infop->si_signo); 1143 if (!retval)
1149 if (!retval) 1144 retval = put_user(SIGCHLD, &infop->si_signo);
1150 retval = put_user(0, &infop->si_errno); 1145 if (!retval)
1151 if (!retval) 1146 retval = put_user(0, &infop->si_errno);
1152 retval = put_user((short)why, &infop->si_code); 1147 if (!retval)
1153 if (!retval) 1148 retval = put_user((short)why, &infop->si_code);
1154 retval = put_user(pid, &infop->si_pid); 1149 if (!retval)
1155 if (!retval) 1150 retval = put_user(pid, &infop->si_pid);
1156 retval = put_user(uid, &infop->si_uid); 1151 if (!retval)
1157 if (!retval) 1152 retval = put_user(uid, &infop->si_uid);
1158 retval = put_user(status, &infop->si_status); 1153 if (!retval)
1154 retval = put_user(status, &infop->si_status);
1155 }
1159 if (!retval) 1156 if (!retval)
1160 retval = pid; 1157 retval = pid;
1161 return retval; 1158 return retval;
@@ -1213,6 +1210,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1213 struct signal_struct *psig; 1210 struct signal_struct *psig;
1214 struct signal_struct *sig; 1211 struct signal_struct *sig;
1215 unsigned long maxrss; 1212 unsigned long maxrss;
1213 cputime_t tgutime, tgstime;
1216 1214
1217 /* 1215 /*
1218 * The resource counters for the group leader are in its 1216 * The resource counters for the group leader are in its
@@ -1228,20 +1226,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1228 * need to protect the access to parent->signal fields, 1226 * need to protect the access to parent->signal fields,
1229 * as other threads in the parent group can be right 1227 * as other threads in the parent group can be right
1230 * here reaping other children at the same time. 1228 * here reaping other children at the same time.
1229 *
1230 * We use thread_group_times() to get times for the thread
1231 * group, which consolidates times for all threads in the
1232 * group including the group leader.
1231 */ 1233 */
1234 thread_group_times(p, &tgutime, &tgstime);
1232 spin_lock_irq(&p->real_parent->sighand->siglock); 1235 spin_lock_irq(&p->real_parent->sighand->siglock);
1233 psig = p->real_parent->signal; 1236 psig = p->real_parent->signal;
1234 sig = p->signal; 1237 sig = p->signal;
1235 psig->cutime = 1238 psig->cutime =
1236 cputime_add(psig->cutime, 1239 cputime_add(psig->cutime,
1237 cputime_add(p->utime, 1240 cputime_add(tgutime,
1238 cputime_add(sig->utime, 1241 sig->cutime));
1239 sig->cutime)));
1240 psig->cstime = 1242 psig->cstime =
1241 cputime_add(psig->cstime, 1243 cputime_add(psig->cstime,
1242 cputime_add(p->stime, 1244 cputime_add(tgstime,
1243 cputime_add(sig->stime, 1245 sig->cstime));
1244 sig->cstime)));
1245 psig->cgtime = 1246 psig->cgtime =
1246 cputime_add(psig->cgtime, 1247 cputime_add(psig->cgtime,
1247 cputime_add(p->gtime, 1248 cputime_add(p->gtime,
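
The cutime/cstime sums updated in this hunk are what userspace later reads back through getrusage(RUSAGE_CHILDREN) or times(); with thread_group_times() they are consolidated for the whole thread group of the reaped child. A minimal userspace sketch, not part of this patch, that observes those accumulated child times:

#include <stdio.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        volatile unsigned long i;
        struct rusage ru;

        if (fork() == 0) {
                /* Child: burn a little CPU time, then exit. */
                for (i = 0; i < 100000000UL; i++)
                        ;
                _exit(0);
        }

        wait(NULL);     /* reaping is what folds the child into cutime/cstime */

        getrusage(RUSAGE_CHILDREN, &ru);
        printf("children: user %ld.%06lds, sys %ld.%06lds\n",
               (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
               (long)ru.ru_stime.tv_sec, (long)ru.ru_stime.tv_usec);
        return 0;
}
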
@@ -1485,13 +1486,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1485 * then ->notask_error is 0 if @p is an eligible child, 1486 * then ->notask_error is 0 if @p is an eligible child,
1486 * or another error from security_task_wait(), or still -ECHILD. 1487 * or another error from security_task_wait(), or still -ECHILD.
1487 */ 1488 */
1488static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, 1489static int wait_consider_task(struct wait_opts *wo, int ptrace,
1489 int ptrace, struct task_struct *p) 1490 struct task_struct *p)
1490{ 1491{
1491 int ret = eligible_child(wo, p); 1492 int ret = eligible_child(wo, p);
1492 if (!ret) 1493 if (!ret)
1493 return ret; 1494 return ret;
1494 1495
1496 ret = security_task_wait(p);
1495 if (unlikely(ret < 0)) { 1497 if (unlikely(ret < 0)) {
1496 /* 1498 /*
1497 * If we have not yet seen any eligible child, 1499 * If we have not yet seen any eligible child,
@@ -1553,7 +1555,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1553 * Do not consider detached threads. 1555 * Do not consider detached threads.
1554 */ 1556 */
1555 if (!task_detached(p)) { 1557 if (!task_detached(p)) {
1556 int ret = wait_consider_task(wo, tsk, 0, p); 1558 int ret = wait_consider_task(wo, 0, p);
1557 if (ret) 1559 if (ret)
1558 return ret; 1560 return ret;
1559 } 1561 }
@@ -1567,7 +1569,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1567 struct task_struct *p; 1569 struct task_struct *p;
1568 1570
1569 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1571 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1570 int ret = wait_consider_task(wo, tsk, 1, p); 1572 int ret = wait_consider_task(wo, 1, p);
1571 if (ret) 1573 if (ret)
1572 return ret; 1574 return ret;
1573 } 1575 }
@@ -1575,15 +1577,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1575 return 0; 1577 return 0;
1576} 1578}
1577 1579
1580static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1581 int sync, void *key)
1582{
1583 struct wait_opts *wo = container_of(wait, struct wait_opts,
1584 child_wait);
1585 struct task_struct *p = key;
1586
1587 if (!eligible_pid(wo, p))
1588 return 0;
1589
1590 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1591 return 0;
1592
1593 return default_wake_function(wait, mode, sync, key);
1594}
1595
1596void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1597{
1598 __wake_up_sync_key(&parent->signal->wait_chldexit,
1599 TASK_INTERRUPTIBLE, 1, p);
1600}
1601
1578static long do_wait(struct wait_opts *wo) 1602static long do_wait(struct wait_opts *wo)
1579{ 1603{
1580 DECLARE_WAITQUEUE(wait, current);
1581 struct task_struct *tsk; 1604 struct task_struct *tsk;
1582 int retval; 1605 int retval;
1583 1606
1584 trace_sched_process_wait(wo->wo_pid); 1607 trace_sched_process_wait(wo->wo_pid);
1585 1608
1586 add_wait_queue(&current->signal->wait_chldexit,&wait); 1609 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1610 wo->child_wait.private = current;
1611 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1587repeat: 1612repeat:
1588 /* 1613 /*
1589 * If there is nothing that can match our criteria just get out. 1614 * If there is nothing that can match our criteria just get out.
@@ -1624,32 +1649,7 @@ notask:
1624 } 1649 }
1625end: 1650end:
1626 __set_current_state(TASK_RUNNING); 1651 __set_current_state(TASK_RUNNING);
1627 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1652 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1628 if (wo->wo_info) {
1629 struct siginfo __user *infop = wo->wo_info;
1630
1631 if (retval > 0)
1632 retval = 0;
1633 else {
1634 /*
1635 * For a WNOHANG return, clear out all the fields
1636 * we would set so the user can easily tell the
1637 * difference.
1638 */
1639 if (!retval)
1640 retval = put_user(0, &infop->si_signo);
1641 if (!retval)
1642 retval = put_user(0, &infop->si_errno);
1643 if (!retval)
1644 retval = put_user(0, &infop->si_code);
1645 if (!retval)
1646 retval = put_user(0, &infop->si_pid);
1647 if (!retval)
1648 retval = put_user(0, &infop->si_uid);
1649 if (!retval)
1650 retval = put_user(0, &infop->si_status);
1651 }
1652 }
1653 return retval; 1653 return retval;
1654} 1654}
1655 1655
@@ -1694,6 +1694,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1694 wo.wo_stat = NULL; 1694 wo.wo_stat = NULL;
1695 wo.wo_rusage = ru; 1695 wo.wo_rusage = ru;
1696 ret = do_wait(&wo); 1696 ret = do_wait(&wo);
1697
1698 if (ret > 0) {
1699 ret = 0;
1700 } else if (infop) {
1701 /*
1702 * For a WNOHANG return, clear out all the fields
1703 * we would set so the user can easily tell the
1704 * difference.
1705 */
1706 if (!ret)
1707 ret = put_user(0, &infop->si_signo);
1708 if (!ret)
1709 ret = put_user(0, &infop->si_errno);
1710 if (!ret)
1711 ret = put_user(0, &infop->si_code);
1712 if (!ret)
1713 ret = put_user(0, &infop->si_pid);
1714 if (!ret)
1715 ret = put_user(0, &infop->si_uid);
1716 if (!ret)
1717 ret = put_user(0, &infop->si_status);
1718 }
1719
1697 put_pid(pid); 1720 put_pid(pid);
1698 1721
1699 /* avoid REGPARM breakage on x86: */ 1722 /* avoid REGPARM breakage on x86: */
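
The waitid() hunk above moves the WNOHANG bookkeeping out of do_wait(), but the userspace contract is unchanged: when no child has changed state the call still returns 0 with the siginfo fields cleared, which is how a caller tells "nothing yet" apart from a real exit. A minimal userspace sketch, not part of this patch:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        siginfo_t info;

        if (fork() == 0) {              /* child: exit after a short sleep */
                sleep(1);
                _exit(42);
        }

        memset(&info, 0xff, sizeof(info));      /* poison to show the clearing */
        if (waitid(P_ALL, 0, &info, WEXITED | WNOHANG) == 0 && info.si_pid == 0)
                printf("WNOHANG: no child ready yet, si_pid cleared to 0\n");

        if (waitid(P_ALL, 0, &info, WEXITED) == 0)
                printf("child %d exited with status %d\n",
                       (int)info.si_pid, info.si_status);
        return 0;
}
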
diff --git a/kernel/fork.c b/kernel/fork.c
index 51ad0b0b7266..202a0ba63d3c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,7 @@
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h>
67 68
68#include <asm/pgtable.h> 69#include <asm/pgtable.h>
69#include <asm/pgalloc.h> 70#include <asm/pgalloc.h>
@@ -91,7 +92,7 @@ int nr_processes(void)
91 int cpu; 92 int cpu;
92 int total = 0; 93 int total = 0;
93 94
94 for_each_online_cpu(cpu) 95 for_each_possible_cpu(cpu)
95 total += per_cpu(process_counts, cpu); 96 total += per_cpu(process_counts, cpu);
96 97
97 return total; 98 return total;
@@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
249 goto out; 250 goto out;
250 251
251 setup_thread_stack(tsk, orig); 252 setup_thread_stack(tsk, orig);
253 clear_user_return_notifier(tsk);
252 stackend = end_of_stack(tsk); 254 stackend = end_of_stack(tsk);
253 *stackend = STACK_END_MAGIC; /* for overflow detection */ 255 *stackend = STACK_END_MAGIC; /* for overflow detection */
254 256
@@ -434,6 +436,14 @@ __setup("coredump_filter=", coredump_filter_setup);
434 436
435#include <linux/init_task.h> 437#include <linux/init_task.h>
436 438
439static void mm_init_aio(struct mm_struct *mm)
440{
441#ifdef CONFIG_AIO
442 spin_lock_init(&mm->ioctx_lock);
443 INIT_HLIST_HEAD(&mm->ioctx_list);
444#endif
445}
446
437static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 447static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
438{ 448{
439 atomic_set(&mm->mm_users, 1); 449 atomic_set(&mm->mm_users, 1);
@@ -447,10 +457,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
447 set_mm_counter(mm, file_rss, 0); 457 set_mm_counter(mm, file_rss, 0);
448 set_mm_counter(mm, anon_rss, 0); 458 set_mm_counter(mm, anon_rss, 0);
449 spin_lock_init(&mm->page_table_lock); 459 spin_lock_init(&mm->page_table_lock);
450 spin_lock_init(&mm->ioctx_lock);
451 INIT_HLIST_HEAD(&mm->ioctx_list);
452 mm->free_area_cache = TASK_UNMAPPED_BASE; 460 mm->free_area_cache = TASK_UNMAPPED_BASE;
453 mm->cached_hole_size = ~0UL; 461 mm->cached_hole_size = ~0UL;
462 mm_init_aio(mm);
454 mm_init_owner(mm, p); 463 mm_init_owner(mm, p);
455 464
456 if (likely(!mm_alloc_pgd(mm))) { 465 if (likely(!mm_alloc_pgd(mm))) {
@@ -511,6 +520,8 @@ void mmput(struct mm_struct *mm)
511 spin_unlock(&mmlist_lock); 520 spin_unlock(&mmlist_lock);
512 } 521 }
513 put_swap_token(mm); 522 put_swap_token(mm);
523 if (mm->binfmt)
524 module_put(mm->binfmt->module);
514 mmdrop(mm); 525 mmdrop(mm);
515 } 526 }
516} 527}
@@ -561,12 +572,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
561 572
562 /* Get rid of any futexes when releasing the mm */ 573 /* Get rid of any futexes when releasing the mm */
563#ifdef CONFIG_FUTEX 574#ifdef CONFIG_FUTEX
564 if (unlikely(tsk->robust_list)) 575 if (unlikely(tsk->robust_list)) {
565 exit_robust_list(tsk); 576 exit_robust_list(tsk);
577 tsk->robust_list = NULL;
578 }
566#ifdef CONFIG_COMPAT 579#ifdef CONFIG_COMPAT
567 if (unlikely(tsk->compat_robust_list)) 580 if (unlikely(tsk->compat_robust_list)) {
568 compat_exit_robust_list(tsk); 581 compat_exit_robust_list(tsk);
582 tsk->compat_robust_list = NULL;
583 }
569#endif 584#endif
585 if (unlikely(!list_empty(&tsk->pi_state_list)))
586 exit_pi_state_list(tsk);
570#endif 587#endif
571 588
572 /* Get rid of any cached register state */ 589 /* Get rid of any cached register state */
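
The robust-list and PI-state cleanup done here in mm_release() is the kernel side of the robust-mutex contract userspace relies on: if a holder dies, the next locker gets EOWNERDEAD instead of blocking forever. A minimal pthread sketch, not part of this patch and assuming a glibc that exposes the POSIX robust-mutex calls (older glibc spells them with an _np suffix):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock;

static void *die_with_lock(void *arg)
{
        pthread_mutex_lock(&lock);
        return NULL;                    /* exits while still holding the lock */
}

int main(void)
{
        pthread_mutexattr_t attr;
        pthread_t t;
        int err;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST);
        pthread_mutex_init(&lock, &attr);

        pthread_create(&t, NULL, die_with_lock, NULL);
        pthread_join(t, NULL);

        err = pthread_mutex_lock(&lock);
        if (err == EOWNERDEAD) {        /* the exit path marked the futex owner dead */
                printf("previous owner died, making the mutex consistent\n");
                pthread_mutex_consistent(&lock);
        }
        pthread_mutex_unlock(&lock);
        return 0;
}
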
@@ -636,9 +653,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
636 mm->hiwater_rss = get_mm_rss(mm); 653 mm->hiwater_rss = get_mm_rss(mm);
637 mm->hiwater_vm = mm->total_vm; 654 mm->hiwater_vm = mm->total_vm;
638 655
656 if (mm->binfmt && !try_module_get(mm->binfmt->module))
657 goto free_pt;
658
639 return mm; 659 return mm;
640 660
641free_pt: 661free_pt:
662 /* don't put binfmt in mmput, we haven't got module yet */
663 mm->binfmt = NULL;
642 mmput(mm); 664 mmput(mm);
643 665
644fail_nomem: 666fail_nomem:
@@ -864,6 +886,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
864 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 886 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
865 sig->gtime = cputime_zero; 887 sig->gtime = cputime_zero;
866 sig->cgtime = cputime_zero; 888 sig->cgtime = cputime_zero;
889#ifndef CONFIG_VIRT_CPU_ACCOUNTING
890 sig->prev_utime = sig->prev_stime = cputime_zero;
891#endif
867 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 892 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
868 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 893 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
869 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 894 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -914,9 +939,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
914 939
915static void rt_mutex_init_task(struct task_struct *p) 940static void rt_mutex_init_task(struct task_struct *p)
916{ 941{
917 spin_lock_init(&p->pi_lock); 942 raw_spin_lock_init(&p->pi_lock);
918#ifdef CONFIG_RT_MUTEXES 943#ifdef CONFIG_RT_MUTEXES
919 plist_head_init(&p->pi_waiters, &p->pi_lock); 944 plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
920 p->pi_blocked_on = NULL; 945 p->pi_blocked_on = NULL;
921#endif 946#endif
922} 947}
@@ -979,6 +1004,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
979 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) 1004 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
980 return ERR_PTR(-EINVAL); 1005 return ERR_PTR(-EINVAL);
981 1006
1007 /*
1008 * Siblings of global init remain as zombies on exit since they are
1009 * not reaped by their parent (swapper). To solve this and to avoid
1010 * multi-rooted process trees, prevent global and container-inits
1011 * from creating siblings.
1012 */
1013 if ((clone_flags & CLONE_PARENT) &&
1014 current->signal->flags & SIGNAL_UNKILLABLE)
1015 return ERR_PTR(-EINVAL);
1016
982 retval = security_task_create(clone_flags); 1017 retval = security_task_create(clone_flags);
983 if (retval) 1018 if (retval)
984 goto fork_out; 1019 goto fork_out;
@@ -1020,9 +1055,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1020 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1055 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1021 goto bad_fork_cleanup_count; 1056 goto bad_fork_cleanup_count;
1022 1057
1023 if (p->binfmt && !try_module_get(p->binfmt->module))
1024 goto bad_fork_cleanup_put_domain;
1025
1026 p->did_exec = 0; 1058 p->did_exec = 0;
1027 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1059 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1028 copy_flags(clone_flags, p); 1060 copy_flags(clone_flags, p);
@@ -1039,8 +1071,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1039 p->gtime = cputime_zero; 1071 p->gtime = cputime_zero;
1040 p->utimescaled = cputime_zero; 1072 p->utimescaled = cputime_zero;
1041 p->stimescaled = cputime_zero; 1073 p->stimescaled = cputime_zero;
1074#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1042 p->prev_utime = cputime_zero; 1075 p->prev_utime = cputime_zero;
1043 p->prev_stime = cputime_zero; 1076 p->prev_stime = cputime_zero;
1077#endif
1044 1078
1045 p->default_timer_slack_ns = current->timer_slack_ns; 1079 p->default_timer_slack_ns = current->timer_slack_ns;
1046 1080
@@ -1093,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1093#ifdef CONFIG_DEBUG_MUTEXES 1127#ifdef CONFIG_DEBUG_MUTEXES
1094 p->blocked_on = NULL; /* not blocked yet */ 1128 p->blocked_on = NULL; /* not blocked yet */
1095#endif 1129#endif
1130#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1131 p->memcg_batch.do_batch = 0;
1132 p->memcg_batch.memcg = NULL;
1133#endif
1096 1134
1097 p->bts = NULL; 1135 p->bts = NULL;
1098 1136
@@ -1172,9 +1210,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1172 p->sas_ss_sp = p->sas_ss_size = 0; 1210 p->sas_ss_sp = p->sas_ss_size = 0;
1173 1211
1174 /* 1212 /*
1175 * Syscall tracing should be turned off in the child regardless 1213 * Syscall tracing and stepping should be turned off in the
1176 * of CLONE_PTRACE. 1214 * child regardless of CLONE_PTRACE.
1177 */ 1215 */
1216 user_disable_single_step(p);
1178 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1217 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1179#ifdef TIF_SYSCALL_EMU 1218#ifdef TIF_SYSCALL_EMU
1180 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1219 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
@@ -1283,7 +1322,8 @@ bad_fork_free_pid:
1283 if (pid != &init_struct_pid) 1322 if (pid != &init_struct_pid)
1284 free_pid(pid); 1323 free_pid(pid);
1285bad_fork_cleanup_io: 1324bad_fork_cleanup_io:
1286 put_io_context(p->io_context); 1325 if (p->io_context)
1326 exit_io_context(p);
1287bad_fork_cleanup_namespaces: 1327bad_fork_cleanup_namespaces:
1288 exit_task_namespaces(p); 1328 exit_task_namespaces(p);
1289bad_fork_cleanup_mm: 1329bad_fork_cleanup_mm:
@@ -1310,9 +1350,6 @@ bad_fork_cleanup_cgroup:
1310#endif 1350#endif
1311 cgroup_exit(p, cgroup_callbacks_done); 1351 cgroup_exit(p, cgroup_callbacks_done);
1312 delayacct_tsk_free(p); 1352 delayacct_tsk_free(p);
1313 if (p->binfmt)
1314 module_put(p->binfmt->module);
1315bad_fork_cleanup_put_domain:
1316 module_put(task_thread_info(p)->exec_domain->module); 1353 module_put(task_thread_info(p)->exec_domain->module);
1317bad_fork_cleanup_count: 1354bad_fork_cleanup_count:
1318 atomic_dec(&p->cred->user->processes); 1355 atomic_dec(&p->cred->user->processes);
diff --git a/kernel/futex.c b/kernel/futex.c
index 248dd119a86e..8e3c3ffe1b9a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -89,36 +89,36 @@ struct futex_pi_state {
89 union futex_key key; 89 union futex_key key;
90}; 90};
91 91
92/* 92/**
93 * We use this hashed waitqueue instead of a normal wait_queue_t, so 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on
97 * @pi_state: optional priority inheritance state
98 * @rt_waiter: rt_waiter storage for use with requeue_pi
99 * @requeue_pi_key: the requeue_pi target futex key
100 * @bitset: bitset for the optional bitmasked wakeup
101 *
102 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
94 * we can wake only the relevant ones (hashed queues may be shared). 103 * we can wake only the relevant ones (hashed queues may be shared).
95 * 104 *
96 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 105 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
97 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
98 * The order of wakup is always to make the first condition true, then 107 * The order of wakup is always to make the first condition true, then
99 * wake up q->waiter, then make the second condition true. 108 * the second.
109 *
110 * PI futexes are typically woken before they are removed from the hash list via
111 * the rt_mutex code. See unqueue_me_pi().
100 */ 112 */
101struct futex_q { 113struct futex_q {
102 struct plist_node list; 114 struct plist_node list;
103 /* Waiter reference */
104 struct task_struct *task;
105 115
106 /* Which hash list lock to use: */ 116 struct task_struct *task;
107 spinlock_t *lock_ptr; 117 spinlock_t *lock_ptr;
108
109 /* Key which the futex is hashed on: */
110 union futex_key key; 118 union futex_key key;
111
112 /* Optional priority inheritance state: */
113 struct futex_pi_state *pi_state; 119 struct futex_pi_state *pi_state;
114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 120 struct rt_mutex_waiter *rt_waiter;
117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key; 121 union futex_key *requeue_pi_key;
120
121 /* Bitset for the optional bitmasked wakeup */
122 u32 bitset; 122 u32 bitset;
123}; 123};
124 124
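
For readers less familiar with this machinery: each FUTEX_WAIT call queues one futex_q on a hashed bucket, and FUTEX_WAKE walks that bucket waking only the entries whose key matches. A minimal userspace sketch of the raw syscall pair, not part of this patch, with the usual re-check loop around the wait:

#include <linux/futex.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static volatile int futex_word;         /* 0 = not ready, 1 = ready */

static long futex(volatile int *uaddr, int op, int val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
        /* FUTEX_WAIT can return early (EAGAIN, signals), so re-check the value. */
        while (futex_word == 0)
                futex(&futex_word, FUTEX_WAIT, 0);
        printf("waiter: woken, futex_word=%d\n", futex_word);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);
        sleep(1);

        futex_word = 1;
        __sync_synchronize();           /* publish the store before waking */
        futex(&futex_word, FUTEX_WAKE, 1);

        pthread_join(t, NULL);
        return 0;
}
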
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
150 */ 150 */
151static inline int match_futex(union futex_key *key1, union futex_key *key2) 151static inline int match_futex(union futex_key *key1, union futex_key *key2)
152{ 152{
153 return (key1->both.word == key2->both.word 153 return (key1 && key2
154 && key1->both.word == key2->both.word
154 && key1->both.ptr == key2->both.ptr 155 && key1->both.ptr == key2->both.ptr
155 && key1->both.offset == key2->both.offset); 156 && key1->both.offset == key2->both.offset);
156} 157}
@@ -198,11 +199,12 @@ static void drop_futex_key_refs(union futex_key *key)
198} 199}
199 200
200/** 201/**
201 * get_futex_key - Get parameters which are the keys for a futex. 202 * get_futex_key() - Get parameters which are the keys for a futex
202 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
203 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
204 * @key: address where result is stored. 205 * @key: address where result is stored.
205 * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) 206 * @rw: mapping needs to be read/write (values: VERIFY_READ,
207 * VERIFY_WRITE)
206 * 208 *
207 * Returns a negative error code or 0 209 * Returns a negative error code or 0
208 * The key words are stored in *key on success. 210 * The key words are stored in *key on success.
@@ -288,8 +290,8 @@ void put_futex_key(int fshared, union futex_key *key)
288 drop_futex_key_refs(key); 290 drop_futex_key_refs(key);
289} 291}
290 292
291/* 293/**
292 * fault_in_user_writeable - fault in user address and verify RW access 294 * fault_in_user_writeable() - Fault in user address and verify RW access
293 * @uaddr: pointer to faulting user space address 295 * @uaddr: pointer to faulting user space address
294 * 296 *
295 * Slow path to fixup the fault we just took in the atomic write 297 * Slow path to fixup the fault we just took in the atomic write
@@ -302,15 +304,21 @@ void put_futex_key(int fshared, union futex_key *key)
302 */ 304 */
303static int fault_in_user_writeable(u32 __user *uaddr) 305static int fault_in_user_writeable(u32 __user *uaddr)
304{ 306{
305 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, 307 struct mm_struct *mm = current->mm;
306 1, 1, 0, NULL, NULL); 308 int ret;
309
310 down_read(&mm->mmap_sem);
311 ret = get_user_pages(current, mm, (unsigned long)uaddr,
312 1, 1, 0, NULL, NULL);
313 up_read(&mm->mmap_sem);
314
307 return ret < 0 ? ret : 0; 315 return ret < 0 ? ret : 0;
308} 316}
309 317
310/** 318/**
311 * futex_top_waiter() - Return the highest priority waiter on a futex 319 * futex_top_waiter() - Return the highest priority waiter on a futex
312 * @hb: the hash bucket the futex_q's reside in 320 * @hb: the hash bucket the futex_q's reside in
313 * @key: the futex key (to distinguish it from other futex futex_q's) 321 * @key: the futex key (to distinguish it from other futex futex_q's)
314 * 322 *
315 * Must be called with the hb lock held. 323 * Must be called with the hb lock held.
316 */ 324 */
@@ -395,9 +403,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
395 * and has cleaned up the pi_state already 403 * and has cleaned up the pi_state already
396 */ 404 */
397 if (pi_state->owner) { 405 if (pi_state->owner) {
398 spin_lock_irq(&pi_state->owner->pi_lock); 406 raw_spin_lock_irq(&pi_state->owner->pi_lock);
399 list_del_init(&pi_state->list); 407 list_del_init(&pi_state->list);
400 spin_unlock_irq(&pi_state->owner->pi_lock); 408 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
401 409
402 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 410 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
403 } 411 }
@@ -462,18 +470,18 @@ void exit_pi_state_list(struct task_struct *curr)
462 * pi_state_list anymore, but we have to be careful 470 * pi_state_list anymore, but we have to be careful
463 * versus waiters unqueueing themselves: 471 * versus waiters unqueueing themselves:
464 */ 472 */
465 spin_lock_irq(&curr->pi_lock); 473 raw_spin_lock_irq(&curr->pi_lock);
466 while (!list_empty(head)) { 474 while (!list_empty(head)) {
467 475
468 next = head->next; 476 next = head->next;
469 pi_state = list_entry(next, struct futex_pi_state, list); 477 pi_state = list_entry(next, struct futex_pi_state, list);
470 key = pi_state->key; 478 key = pi_state->key;
471 hb = hash_futex(&key); 479 hb = hash_futex(&key);
472 spin_unlock_irq(&curr->pi_lock); 480 raw_spin_unlock_irq(&curr->pi_lock);
473 481
474 spin_lock(&hb->lock); 482 spin_lock(&hb->lock);
475 483
476 spin_lock_irq(&curr->pi_lock); 484 raw_spin_lock_irq(&curr->pi_lock);
477 /* 485 /*
478 * We dropped the pi-lock, so re-check whether this 486 * We dropped the pi-lock, so re-check whether this
479 * task still owns the PI-state: 487 * task still owns the PI-state:
@@ -487,15 +495,15 @@ void exit_pi_state_list(struct task_struct *curr)
487 WARN_ON(list_empty(&pi_state->list)); 495 WARN_ON(list_empty(&pi_state->list));
488 list_del_init(&pi_state->list); 496 list_del_init(&pi_state->list);
489 pi_state->owner = NULL; 497 pi_state->owner = NULL;
490 spin_unlock_irq(&curr->pi_lock); 498 raw_spin_unlock_irq(&curr->pi_lock);
491 499
492 rt_mutex_unlock(&pi_state->pi_mutex); 500 rt_mutex_unlock(&pi_state->pi_mutex);
493 501
494 spin_unlock(&hb->lock); 502 spin_unlock(&hb->lock);
495 503
496 spin_lock_irq(&curr->pi_lock); 504 raw_spin_lock_irq(&curr->pi_lock);
497 } 505 }
498 spin_unlock_irq(&curr->pi_lock); 506 raw_spin_unlock_irq(&curr->pi_lock);
499} 507}
500 508
501static int 509static int
@@ -550,7 +558,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
550 * change of the task flags, we do this protected by 558 * change of the task flags, we do this protected by
551 * p->pi_lock: 559 * p->pi_lock:
552 */ 560 */
553 spin_lock_irq(&p->pi_lock); 561 raw_spin_lock_irq(&p->pi_lock);
554 if (unlikely(p->flags & PF_EXITING)) { 562 if (unlikely(p->flags & PF_EXITING)) {
555 /* 563 /*
556 * The task is on the way out. When PF_EXITPIDONE is 564 * The task is on the way out. When PF_EXITPIDONE is
@@ -559,7 +567,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
559 */ 567 */
560 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 568 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
561 569
562 spin_unlock_irq(&p->pi_lock); 570 raw_spin_unlock_irq(&p->pi_lock);
563 put_task_struct(p); 571 put_task_struct(p);
564 return ret; 572 return ret;
565 } 573 }
@@ -578,7 +586,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
578 WARN_ON(!list_empty(&pi_state->list)); 586 WARN_ON(!list_empty(&pi_state->list));
579 list_add(&pi_state->list, &p->pi_state_list); 587 list_add(&pi_state->list, &p->pi_state_list);
580 pi_state->owner = p; 588 pi_state->owner = p;
581 spin_unlock_irq(&p->pi_lock); 589 raw_spin_unlock_irq(&p->pi_lock);
582 590
583 put_task_struct(p); 591 put_task_struct(p);
584 592
@@ -588,7 +596,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
588} 596}
589 597
590/** 598/**
591 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex 599 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
592 * @uaddr: the pi futex user address 600 * @uaddr: the pi futex user address
593 * @hb: the pi futex hash bucket 601 * @hb: the pi futex hash bucket
594 * @key: the futex key associated with uaddr and hb 602 * @key: the futex key associated with uaddr and hb
@@ -752,7 +760,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
752 if (!pi_state) 760 if (!pi_state)
753 return -EINVAL; 761 return -EINVAL;
754 762
755 spin_lock(&pi_state->pi_mutex.wait_lock); 763 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
756 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 764 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
757 765
758 /* 766 /*
@@ -781,23 +789,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
781 else if (curval != uval) 789 else if (curval != uval)
782 ret = -EINVAL; 790 ret = -EINVAL;
783 if (ret) { 791 if (ret) {
784 spin_unlock(&pi_state->pi_mutex.wait_lock); 792 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
785 return ret; 793 return ret;
786 } 794 }
787 } 795 }
788 796
789 spin_lock_irq(&pi_state->owner->pi_lock); 797 raw_spin_lock_irq(&pi_state->owner->pi_lock);
790 WARN_ON(list_empty(&pi_state->list)); 798 WARN_ON(list_empty(&pi_state->list));
791 list_del_init(&pi_state->list); 799 list_del_init(&pi_state->list);
792 spin_unlock_irq(&pi_state->owner->pi_lock); 800 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
793 801
794 spin_lock_irq(&new_owner->pi_lock); 802 raw_spin_lock_irq(&new_owner->pi_lock);
795 WARN_ON(!list_empty(&pi_state->list)); 803 WARN_ON(!list_empty(&pi_state->list));
796 list_add(&pi_state->list, &new_owner->pi_state_list); 804 list_add(&pi_state->list, &new_owner->pi_state_list);
797 pi_state->owner = new_owner; 805 pi_state->owner = new_owner;
798 spin_unlock_irq(&new_owner->pi_lock); 806 raw_spin_unlock_irq(&new_owner->pi_lock);
799 807
800 spin_unlock(&pi_state->pi_mutex.wait_lock); 808 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
801 rt_mutex_unlock(&pi_state->pi_mutex); 809 rt_mutex_unlock(&pi_state->pi_mutex);
802 810
803 return 0; 811 return 0;
@@ -915,8 +923,8 @@ retry:
915 hb1 = hash_futex(&key1); 923 hb1 = hash_futex(&key1);
916 hb2 = hash_futex(&key2); 924 hb2 = hash_futex(&key2);
917 925
918 double_lock_hb(hb1, hb2);
919retry_private: 926retry_private:
927 double_lock_hb(hb1, hb2);
920 op_ret = futex_atomic_op_inuser(op, uaddr2); 928 op_ret = futex_atomic_op_inuser(op, uaddr2);
921 if (unlikely(op_ret < 0)) { 929 if (unlikely(op_ret < 0)) {
922 930
@@ -1002,7 +1010,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1002 plist_add(&q->list, &hb2->chain); 1010 plist_add(&q->list, &hb2->chain);
1003 q->lock_ptr = &hb2->lock; 1011 q->lock_ptr = &hb2->lock;
1004#ifdef CONFIG_DEBUG_PI_LIST 1012#ifdef CONFIG_DEBUG_PI_LIST
1005 q->list.plist.lock = &hb2->lock; 1013 q->list.plist.spinlock = &hb2->lock;
1006#endif 1014#endif
1007 } 1015 }
1008 get_futex_key_refs(key2); 1016 get_futex_key_refs(key2);
@@ -1011,9 +1019,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1011 1019
1012/** 1020/**
1013 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1021 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1014 * q: the futex_q 1022 * @q: the futex_q
1015 * key: the key of the requeue target futex 1023 * @key: the key of the requeue target futex
1016 * hb: the hash_bucket of the requeue target futex 1024 * @hb: the hash_bucket of the requeue target futex
1017 * 1025 *
1018 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1026 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1019 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1027 * target futex if it is uncontended or via a lock steal. Set the futex_q key
@@ -1027,7 +1035,6 @@ static inline
1027void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, 1035void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1028 struct futex_hash_bucket *hb) 1036 struct futex_hash_bucket *hb)
1029{ 1037{
1030 drop_futex_key_refs(&q->key);
1031 get_futex_key_refs(key); 1038 get_futex_key_refs(key);
1032 q->key = *key; 1039 q->key = *key;
1033 1040
@@ -1039,7 +1046,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1039 1046
1040 q->lock_ptr = &hb->lock; 1047 q->lock_ptr = &hb->lock;
1041#ifdef CONFIG_DEBUG_PI_LIST 1048#ifdef CONFIG_DEBUG_PI_LIST
1042 q->list.plist.lock = &hb->lock; 1049 q->list.plist.spinlock = &hb->lock;
1043#endif 1050#endif
1044 1051
1045 wake_up_state(q->task, TASK_NORMAL); 1052 wake_up_state(q->task, TASK_NORMAL);
@@ -1225,6 +1232,7 @@ retry_private:
1225 */ 1232 */
1226 if (ret == 1) { 1233 if (ret == 1) {
1227 WARN_ON(pi_state); 1234 WARN_ON(pi_state);
1235 drop_count++;
1228 task_count++; 1236 task_count++;
1229 ret = get_futex_value_locked(&curval2, uaddr2); 1237 ret = get_futex_value_locked(&curval2, uaddr2);
1230 if (!ret) 1238 if (!ret)
@@ -1303,6 +1311,7 @@ retry_private:
1303 if (ret == 1) { 1311 if (ret == 1) {
1304 /* We got the lock. */ 1312 /* We got the lock. */
1305 requeue_pi_wake_futex(this, &key2, hb2); 1313 requeue_pi_wake_futex(this, &key2, hb2);
1314 drop_count++;
1306 continue; 1315 continue;
1307 } else if (ret) { 1316 } else if (ret) {
1308 /* -EDEADLK */ 1317 /* -EDEADLK */
@@ -1350,6 +1359,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1350 return hb; 1359 return hb;
1351} 1360}
1352 1361
1362static inline void
1363queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1364{
1365 spin_unlock(&hb->lock);
1366 drop_futex_key_refs(&q->key);
1367}
1368
1369/**
1370 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1371 * @q: The futex_q to enqueue
1372 * @hb: The destination hash bucket
1373 *
1374 * The hb->lock must be held by the caller, and is released here. A call to
1375 * queue_me() is typically paired with exactly one call to unqueue_me(). The
1376 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1377 * or nothing if the unqueue is done as part of the wake process and the unqueue
1378 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
1379 * an example).
1380 */
1353static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1381static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1354{ 1382{
1355 int prio; 1383 int prio;
@@ -1366,26 +1394,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1366 1394
1367 plist_node_init(&q->list, prio); 1395 plist_node_init(&q->list, prio);
1368#ifdef CONFIG_DEBUG_PI_LIST 1396#ifdef CONFIG_DEBUG_PI_LIST
1369 q->list.plist.lock = &hb->lock; 1397 q->list.plist.spinlock = &hb->lock;
1370#endif 1398#endif
1371 plist_add(&q->list, &hb->chain); 1399 plist_add(&q->list, &hb->chain);
1372 q->task = current; 1400 q->task = current;
1373 spin_unlock(&hb->lock); 1401 spin_unlock(&hb->lock);
1374} 1402}
1375 1403
1376static inline void 1404/**
1377queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1405 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1378{ 1406 * @q: The futex_q to unqueue
1379 spin_unlock(&hb->lock); 1407 *
1380 drop_futex_key_refs(&q->key); 1408 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1381} 1409 * be paired with exactly one earlier call to queue_me().
1382 1410 *
1383/* 1411 * Returns:
1384 * queue_me and unqueue_me must be called as a pair, each 1412 * 1 - if the futex_q was still queued (and we unqueued it)
1385 * exactly once. They are called with the hashed spinlock held. 1413 * 0 - if the futex_q was already removed by the waking thread
1386 */ 1414 */
1387
1388/* Return 1 if we were still queued (ie. 0 means we were woken) */
1389static int unqueue_me(struct futex_q *q) 1415static int unqueue_me(struct futex_q *q)
1390{ 1416{
1391 spinlock_t *lock_ptr; 1417 spinlock_t *lock_ptr;
@@ -1503,18 +1529,18 @@ retry:
1503 * itself. 1529 * itself.
1504 */ 1530 */
1505 if (pi_state->owner != NULL) { 1531 if (pi_state->owner != NULL) {
1506 spin_lock_irq(&pi_state->owner->pi_lock); 1532 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1507 WARN_ON(list_empty(&pi_state->list)); 1533 WARN_ON(list_empty(&pi_state->list));
1508 list_del_init(&pi_state->list); 1534 list_del_init(&pi_state->list);
1509 spin_unlock_irq(&pi_state->owner->pi_lock); 1535 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1510 } 1536 }
1511 1537
1512 pi_state->owner = newowner; 1538 pi_state->owner = newowner;
1513 1539
1514 spin_lock_irq(&newowner->pi_lock); 1540 raw_spin_lock_irq(&newowner->pi_lock);
1515 WARN_ON(!list_empty(&pi_state->list)); 1541 WARN_ON(!list_empty(&pi_state->list));
1516 list_add(&pi_state->list, &newowner->pi_state_list); 1542 list_add(&pi_state->list, &newowner->pi_state_list);
1517 spin_unlock_irq(&newowner->pi_lock); 1543 raw_spin_unlock_irq(&newowner->pi_lock);
1518 return 0; 1544 return 0;
1519 1545
1520 /* 1546 /*
@@ -1638,17 +1664,14 @@ out:
1638static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 1664static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1639 struct hrtimer_sleeper *timeout) 1665 struct hrtimer_sleeper *timeout)
1640{ 1666{
1641 queue_me(q, hb);
1642
1643 /* 1667 /*
1644 * There might have been scheduling since the queue_me(), as we 1668 * The task state is guaranteed to be set before another task can
1645 * cannot hold a spinlock across the get_user() in case it 1669 * wake it. set_current_state() is implemented using set_mb() and
1646 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1670 * queue_me() calls spin_unlock() upon completion, both serializing
1647 * queueing ourselves into the futex hash. This code thus has to 1671 * access to the hash list and forcing another memory barrier.
1648 * rely on the futex_wake() code removing us from hash when it
1649 * wakes us up.
1650 */ 1672 */
1651 set_current_state(TASK_INTERRUPTIBLE); 1673 set_current_state(TASK_INTERRUPTIBLE);
1674 queue_me(q, hb);
1652 1675
1653 /* Arm the timer */ 1676 /* Arm the timer */
1654 if (timeout) { 1677 if (timeout) {
@@ -1658,8 +1681,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1658 } 1681 }
1659 1682
1660 /* 1683 /*
1661 * !plist_node_empty() is safe here without any lock. 1684 * If we have been removed from the hash list, then another task
1662 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1685 * has tried to wake us, and we can skip the call to schedule().
1663 */ 1686 */
1664 if (likely(!plist_node_empty(&q->list))) { 1687 if (likely(!plist_node_empty(&q->list))) {
1665 /* 1688 /*
@@ -1776,6 +1799,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1776 current->timer_slack_ns); 1799 current->timer_slack_ns);
1777 } 1800 }
1778 1801
1802retry:
1779 /* Prepare to wait on uaddr. */ 1803 /* Prepare to wait on uaddr. */
1780 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1804 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1781 if (ret) 1805 if (ret)
@@ -1793,9 +1817,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1793 goto out_put_key; 1817 goto out_put_key;
1794 1818
1795 /* 1819 /*
1796 * We expect signal_pending(current), but another thread may 1820 * We expect signal_pending(current), but we might be the
1797 * have handled it for us already. 1821 * victim of a spurious wakeup as well.
1798 */ 1822 */
1823 if (!signal_pending(current)) {
1824 put_futex_key(fshared, &q.key);
1825 goto retry;
1826 }
1827
1799 ret = -ERESTARTSYS; 1828 ret = -ERESTARTSYS;
1800 if (!abs_time) 1829 if (!abs_time)
1801 goto out_put_key; 1830 goto out_put_key;
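
The retry added above treats a wakeup with no signal pending and the futex still queued as spurious and simply waits again. The same discipline applies in userspace: blocking waits are wrapped in a loop that re-checks the predicate rather than trusting the wakeup itself. A short sketch, not part of this patch, using POSIX condition variables, which are also allowed to wake spuriously:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool ready;

static void *consumer(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!ready)                  /* re-check after every wakeup */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        printf("consumer: predicate finally true\n");
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, consumer, NULL);

        pthread_mutex_lock(&lock);
        ready = true;                   /* set the predicate before signalling */
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
}
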
@@ -2102,11 +2131,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2102 * Unqueue the futex_q and determine which it was. 2131 * Unqueue the futex_q and determine which it was.
2103 */ 2132 */
2104 plist_del(&q->list, &q->list.plist); 2133 plist_del(&q->list, &q->list.plist);
2105 drop_futex_key_refs(&q->key);
2106 2134
2135 /* Handle spurious wakeups gracefully */
2136 ret = -EWOULDBLOCK;
2107 if (timeout && !timeout->task) 2137 if (timeout && !timeout->task)
2108 ret = -ETIMEDOUT; 2138 ret = -ETIMEDOUT;
2109 else 2139 else if (signal_pending(current))
2110 ret = -ERESTARTNOINTR; 2140 ret = -ERESTARTNOINTR;
2111 } 2141 }
2112 return ret; 2142 return ret;
@@ -2114,12 +2144,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2114 2144
2115/** 2145/**
2116 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2146 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2117 * @uaddr: the futex we initialyl wait on (non-pi) 2147 * @uaddr: the futex we initially wait on (non-pi)
2118 * @fshared: whether the futexes are shared (1) or not (0). They must be 2148 * @fshared: whether the futexes are shared (1) or not (0). They must be
2119 * the same type, no requeueing from private to shared, etc. 2149 * the same type, no requeueing from private to shared, etc.
2120 * @val: the expected value of uaddr 2150 * @val: the expected value of uaddr
2121 * @abs_time: absolute timeout 2151 * @abs_time: absolute timeout
2122 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. 2152 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2123 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) 2153 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2124 * @uaddr2: the pi futex we will take prior to returning to user-space 2154 * @uaddr2: the pi futex we will take prior to returning to user-space
2125 * 2155 *
@@ -2246,7 +2276,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2246 res = fixup_owner(uaddr2, fshared, &q, !ret); 2276 res = fixup_owner(uaddr2, fshared, &q, !ret);
2247 /* 2277 /*
2248 * If fixup_owner() returned an error, propagate that. If it 2278 * If fixup_owner() returned an error, propagate that. If it
2249 * acquired the lock, clear our -ETIMEDOUT or -EINTR. 2279 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2250 */ 2280 */
2251 if (res) 2281 if (res)
2252 ret = (res < 0) ? res : 0; 2282 ret = (res < 0) ? res : 0;
@@ -2302,9 +2332,9 @@ out:
2302 */ 2332 */
2303 2333
2304/** 2334/**
2305 * sys_set_robust_list - set the robust-futex list head of a task 2335 * sys_set_robust_list() - Set the robust-futex list head of a task
2306 * @head: pointer to the list-head 2336 * @head: pointer to the list-head
2307 * @len: length of the list-head, as userspace expects 2337 * @len: length of the list-head, as userspace expects
2308 */ 2338 */
2309SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 2339SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2310 size_t, len) 2340 size_t, len)
@@ -2323,10 +2353,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2323} 2353}
2324 2354
2325/** 2355/**
2326 * sys_get_robust_list - get the robust-futex list head of a task 2356 * sys_get_robust_list() - Get the robust-futex list head of a task
2327 * @pid: pid of the process [zero for current task] 2357 * @pid: pid of the process [zero for current task]
2328 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 2358 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
2329 * @len_ptr: pointer to a length field, the kernel fills in the header size 2359 * @len_ptr: pointer to a length field, the kernel fills in the header size
2330 */ 2360 */
2331SYSCALL_DEFINE3(get_robust_list, int, pid, 2361SYSCALL_DEFINE3(get_robust_list, int, pid,
2332 struct robust_list_head __user * __user *, head_ptr, 2362 struct robust_list_head __user * __user *, head_ptr,
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 654efd09f6a9..70a298d6da71 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL) 37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This option activates profiling for the entire kernel. 40 This option activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e5d98ce50f89..0086628b6e97 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -127,11 +127,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
127 for (;;) { 127 for (;;) {
128 base = timer->base; 128 base = timer->base;
129 if (likely(base != NULL)) { 129 if (likely(base != NULL)) {
130 spin_lock_irqsave(&base->cpu_base->lock, *flags); 130 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
131 if (likely(base == timer->base)) 131 if (likely(base == timer->base))
132 return base; 132 return base;
133 /* The timer has migrated to another CPU: */ 133 /* The timer has migrated to another CPU: */
134 spin_unlock_irqrestore(&base->cpu_base->lock, *flags); 134 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
135 } 135 }
136 cpu_relax(); 136 cpu_relax();
137 } 137 }
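
lock_hrtimer_base() is an instance of a common pattern: the lock protecting an object can change while you acquire it (the timer may migrate to another CPU base), so you lock whatever the object currently points at and then re-check the pointer under that lock. A reduced userspace sketch of the same idea, with hypothetical types and not taken from the kernel:

#include <pthread.h>

struct base {
        pthread_mutex_t lock;
};

struct timer_obj {
        struct base *base;      /* may be switched to another base concurrently */
};

/* Lock the base the timer currently belongs to. */
static struct base *lock_timer_base(struct timer_obj *t)
{
        for (;;) {
                struct base *b = __atomic_load_n(&t->base, __ATOMIC_ACQUIRE);

                pthread_mutex_lock(&b->lock);
                if (b == __atomic_load_n(&t->base, __ATOMIC_ACQUIRE))
                        return b;       /* still the right base, and we hold its lock */

                /* The timer migrated while we were acquiring the lock: retry. */
                pthread_mutex_unlock(&b->lock);
        }
}
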
@@ -208,13 +208,13 @@ again:
208 208
209 /* See the comment in lock_timer_base() */ 209 /* See the comment in lock_timer_base() */
210 timer->base = NULL; 210 timer->base = NULL;
211 spin_unlock(&base->cpu_base->lock); 211 raw_spin_unlock(&base->cpu_base->lock);
212 spin_lock(&new_base->cpu_base->lock); 212 raw_spin_lock(&new_base->cpu_base->lock);
213 213
214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { 214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
215 cpu = this_cpu; 215 cpu = this_cpu;
216 spin_unlock(&new_base->cpu_base->lock); 216 raw_spin_unlock(&new_base->cpu_base->lock);
217 spin_lock(&base->cpu_base->lock); 217 raw_spin_lock(&base->cpu_base->lock);
218 timer->base = base; 218 timer->base = base;
219 goto again; 219 goto again;
220 } 220 }
@@ -230,7 +230,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
230{ 230{
231 struct hrtimer_clock_base *base = timer->base; 231 struct hrtimer_clock_base *base = timer->base;
232 232
233 spin_lock_irqsave(&base->cpu_base->lock, *flags); 233 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
234 234
235 return base; 235 return base;
236} 236}
@@ -509,13 +509,14 @@ static inline int hrtimer_hres_active(void)
509 * next event 509 * next event
510 * Called with interrupts disabled and base->lock held 510 * Called with interrupts disabled and base->lock held
511 */ 511 */
512static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) 512static void
513hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
513{ 514{
514 int i; 515 int i;
515 struct hrtimer_clock_base *base = cpu_base->clock_base; 516 struct hrtimer_clock_base *base = cpu_base->clock_base;
516 ktime_t expires; 517 ktime_t expires, expires_next;
517 518
518 cpu_base->expires_next.tv64 = KTIME_MAX; 519 expires_next.tv64 = KTIME_MAX;
519 520
520 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 521 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
521 struct hrtimer *timer; 522 struct hrtimer *timer;
@@ -531,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
531 */ 532 */
532 if (expires.tv64 < 0) 533 if (expires.tv64 < 0)
533 expires.tv64 = 0; 534 expires.tv64 = 0;
534 if (expires.tv64 < cpu_base->expires_next.tv64) 535 if (expires.tv64 < expires_next.tv64)
535 cpu_base->expires_next = expires; 536 expires_next = expires;
536 } 537 }
537 538
539 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
540 return;
541
542 cpu_base->expires_next.tv64 = expires_next.tv64;
543
538 if (cpu_base->expires_next.tv64 != KTIME_MAX) 544 if (cpu_base->expires_next.tv64 != KTIME_MAX)
539 tick_program_event(cpu_base->expires_next, 1); 545 tick_program_event(cpu_base->expires_next, 1);
540} 546}
@@ -551,7 +557,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
551static int hrtimer_reprogram(struct hrtimer *timer, 557static int hrtimer_reprogram(struct hrtimer *timer,
552 struct hrtimer_clock_base *base) 558 struct hrtimer_clock_base *base)
553{ 559{
554 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 560 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
555 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
556 int res; 562 int res;
557 563
@@ -576,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
576 if (expires.tv64 < 0) 582 if (expires.tv64 < 0)
577 return -ETIME; 583 return -ETIME;
578 584
579 if (expires.tv64 >= expires_next->tv64) 585 if (expires.tv64 >= cpu_base->expires_next.tv64)
586 return 0;
587
588 /*
589 * If a hang was detected in the last timer interrupt then we
590 * do not schedule a timer which is earlier than the expiry
591 * which we enforced in the hang detection. We want the system
592 * to make progress.
593 */
594 if (cpu_base->hang_detected)
580 return 0; 595 return 0;
581 596
582 /* 597 /*
@@ -584,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
584 */ 599 */
585 res = tick_program_event(expires, 0); 600 res = tick_program_event(expires, 0);
586 if (!IS_ERR_VALUE(res)) 601 if (!IS_ERR_VALUE(res))
587 *expires_next = expires; 602 cpu_base->expires_next = expires;
588 return res; 603 return res;
589} 604}
590 605
@@ -613,12 +628,12 @@ static void retrigger_next_event(void *arg)
613 base = &__get_cpu_var(hrtimer_bases); 628 base = &__get_cpu_var(hrtimer_bases);
614 629
615 /* Adjust CLOCK_REALTIME offset */ 630 /* Adjust CLOCK_REALTIME offset */
616 spin_lock(&base->lock); 631 raw_spin_lock(&base->lock);
617 base->clock_base[CLOCK_REALTIME].offset = 632 base->clock_base[CLOCK_REALTIME].offset =
618 timespec_to_ktime(realtime_offset); 633 timespec_to_ktime(realtime_offset);
619 634
620 hrtimer_force_reprogram(base); 635 hrtimer_force_reprogram(base, 0);
621 spin_unlock(&base->lock); 636 raw_spin_unlock(&base->lock);
622} 637}
623 638
624/* 639/*
@@ -679,9 +694,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
679{ 694{
680 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 695 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
681 if (wakeup) { 696 if (wakeup) {
682 spin_unlock(&base->cpu_base->lock); 697 raw_spin_unlock(&base->cpu_base->lock);
683 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 698 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
684 spin_lock(&base->cpu_base->lock); 699 raw_spin_lock(&base->cpu_base->lock);
685 } else 700 } else
686 __raise_softirq_irqoff(HRTIMER_SOFTIRQ); 701 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
687 702
@@ -720,8 +735,6 @@ static int hrtimer_switch_to_hres(void)
720 /* "Retrigger" the interrupt to get things going */ 735 /* "Retrigger" the interrupt to get things going */
721 retrigger_next_event(NULL); 736 retrigger_next_event(NULL);
722 local_irq_restore(flags); 737 local_irq_restore(flags);
723 printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
724 smp_processor_id());
725 return 1; 738 return 1;
726} 739}
727 740
@@ -730,7 +743,8 @@ static int hrtimer_switch_to_hres(void)
730static inline int hrtimer_hres_active(void) { return 0; } 743static inline int hrtimer_hres_active(void) { return 0; }
731static inline int hrtimer_is_hres_enabled(void) { return 0; } 744static inline int hrtimer_is_hres_enabled(void) { return 0; }
732static inline int hrtimer_switch_to_hres(void) { return 0; } 745static inline int hrtimer_switch_to_hres(void) { return 0; }
733static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } 746static inline void
747hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
734static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 748static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
735 struct hrtimer_clock_base *base, 749 struct hrtimer_clock_base *base,
736 int wakeup) 750 int wakeup)
@@ -742,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
742 756
743#endif /* CONFIG_HIGH_RES_TIMERS */ 757#endif /* CONFIG_HIGH_RES_TIMERS */
744 758
745#ifdef CONFIG_TIMER_STATS 759static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
746void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
747{ 760{
761#ifdef CONFIG_TIMER_STATS
748 if (timer->start_site) 762 if (timer->start_site)
749 return; 763 return;
750 764 timer->start_site = __builtin_return_address(0);
751 timer->start_site = addr;
752 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 765 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
753 timer->start_pid = current->pid; 766 timer->start_pid = current->pid;
767#endif
768}
769
770static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
771{
772#ifdef CONFIG_TIMER_STATS
773 timer->start_site = NULL;
774#endif
754} 775}
776
777static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
778{
779#ifdef CONFIG_TIMER_STATS
780 if (likely(!timer_stats_active))
781 return;
782 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
783 timer->function, timer->start_comm, 0);
755#endif 784#endif
785}
756 786
757/* 787/*
758 * Counterpart to lock_hrtimer_base above: 788 * Counterpart to lock_hrtimer_base above:
@@ -760,7 +790,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
760static inline 790static inline
761void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 791void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
762{ 792{
763 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); 793 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
764} 794}
765 795
766/** 796/**
@@ -873,19 +903,29 @@ static void __remove_hrtimer(struct hrtimer *timer,
873 struct hrtimer_clock_base *base, 903 struct hrtimer_clock_base *base,
874 unsigned long newstate, int reprogram) 904 unsigned long newstate, int reprogram)
875{ 905{
876 if (timer->state & HRTIMER_STATE_ENQUEUED) { 906 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
877 /* 907 goto out;
878 * Remove the timer from the rbtree and replace the 908
879 * first entry pointer if necessary. 909 /*
880 */ 910 * Remove the timer from the rbtree and replace the first
881 if (base->first == &timer->node) { 911 * entry pointer if necessary.
882 base->first = rb_next(&timer->node); 912 */
883 /* Reprogram the clock event device. if enabled */ 913 if (base->first == &timer->node) {
884 if (reprogram && hrtimer_hres_active()) 914 base->first = rb_next(&timer->node);
885 hrtimer_force_reprogram(base->cpu_base); 915#ifdef CONFIG_HIGH_RES_TIMERS
916 /* Reprogram the clock event device. if enabled */
917 if (reprogram && hrtimer_hres_active()) {
918 ktime_t expires;
919
920 expires = ktime_sub(hrtimer_get_expires(timer),
921 base->offset);
922 if (base->cpu_base->expires_next.tv64 == expires.tv64)
923 hrtimer_force_reprogram(base->cpu_base, 1);
886 } 924 }
887 rb_erase(&timer->node, &base->active); 925#endif
888 } 926 }
927 rb_erase(&timer->node, &base->active);
928out:
889 timer->state = newstate; 929 timer->state = newstate;
890} 930}
891 931
@@ -1083,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void)
1083 unsigned long flags; 1123 unsigned long flags;
1084 int i; 1124 int i;
1085 1125
1086 spin_lock_irqsave(&cpu_base->lock, flags); 1126 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1087 1127
1088 if (!hrtimer_hres_active()) { 1128 if (!hrtimer_hres_active()) {
1089 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
@@ -1100,7 +1140,7 @@ ktime_t hrtimer_get_next_event(void)
1100 } 1140 }
1101 } 1141 }
1102 1142
1103 spin_unlock_irqrestore(&cpu_base->lock, flags); 1143 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1104 1144
1105 if (mindelta.tv64 < 0) 1145 if (mindelta.tv64 < 0)
1106 mindelta.tv64 = 0; 1146 mindelta.tv64 = 0;
@@ -1182,11 +1222,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1182 * they get migrated to another cpu, therefore its safe to unlock 1222 * they get migrated to another cpu, therefore its safe to unlock
1183 * the timer base. 1223 * the timer base.
1184 */ 1224 */
1185 spin_unlock(&cpu_base->lock); 1225 raw_spin_unlock(&cpu_base->lock);
1186 trace_hrtimer_expire_entry(timer, now); 1226 trace_hrtimer_expire_entry(timer, now);
1187 restart = fn(timer); 1227 restart = fn(timer);
1188 trace_hrtimer_expire_exit(timer); 1228 trace_hrtimer_expire_exit(timer);
1189 spin_lock(&cpu_base->lock); 1229 raw_spin_lock(&cpu_base->lock);
1190 1230
1191 /* 1231 /*
1192 * Note: We clear the CALLBACK bit after enqueue_hrtimer and 1232 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
@@ -1202,29 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1202 1242
1203#ifdef CONFIG_HIGH_RES_TIMERS 1243#ifdef CONFIG_HIGH_RES_TIMERS
1204 1244
1205static int force_clock_reprogram;
1206
1207/*
1208 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1209 * is hanging, which could happen with something that slows the interrupt
1210 * such as the tracing. Then we force the clock reprogramming for each future
1211 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1212 * threshold that we will overwrite.
1213 * The next tick event will be scheduled to 3 times we currently spend on
1214 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1215 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1216 * let it running without serious starvation.
1217 */
1218
1219static inline void
1220hrtimer_interrupt_hanging(struct clock_event_device *dev,
1221 ktime_t try_time)
1222{
1223 force_clock_reprogram = 1;
1224 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1225 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1226 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1227}
1228/* 1245/*
1229 * High resolution timer interrupt 1246 * High resolution timer interrupt
1230 * Called with interrupts disabled 1247 * Called with interrupts disabled
@@ -1233,24 +1250,18 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1233{ 1250{
1234 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1251 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1235 struct hrtimer_clock_base *base; 1252 struct hrtimer_clock_base *base;
1236 ktime_t expires_next, now; 1253 ktime_t expires_next, now, entry_time, delta;
1237 int nr_retries = 0; 1254 int i, retries = 0;
1238 int i;
1239 1255
1240 BUG_ON(!cpu_base->hres_active); 1256 BUG_ON(!cpu_base->hres_active);
1241 cpu_base->nr_events++; 1257 cpu_base->nr_events++;
1242 dev->next_event.tv64 = KTIME_MAX; 1258 dev->next_event.tv64 = KTIME_MAX;
1243 1259
1244 retry: 1260 entry_time = now = ktime_get();
1245 /* 5 retries is enough to notice a hang */ 1261retry:
1246 if (!(++nr_retries % 5))
1247 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1248
1249 now = ktime_get();
1250
1251 expires_next.tv64 = KTIME_MAX; 1262 expires_next.tv64 = KTIME_MAX;
1252 1263
1253 spin_lock(&cpu_base->lock); 1264 raw_spin_lock(&cpu_base->lock);
1254 /* 1265 /*
1255 * We set expires_next to KTIME_MAX here with cpu_base->lock 1266 * We set expires_next to KTIME_MAX here with cpu_base->lock
1256 * held to prevent that a timer is enqueued in our queue via 1267 * held to prevent that a timer is enqueued in our queue via
@@ -1306,13 +1317,51 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1306 * against it. 1317 * against it.
1307 */ 1318 */
1308 cpu_base->expires_next = expires_next; 1319 cpu_base->expires_next = expires_next;
1309 spin_unlock(&cpu_base->lock); 1320 raw_spin_unlock(&cpu_base->lock);
1310 1321
1311 /* Reprogramming necessary ? */ 1322 /* Reprogramming necessary ? */
1312 if (expires_next.tv64 != KTIME_MAX) { 1323 if (expires_next.tv64 == KTIME_MAX ||
1313 if (tick_program_event(expires_next, force_clock_reprogram)) 1324 !tick_program_event(expires_next, 0)) {
1314 goto retry; 1325 cpu_base->hang_detected = 0;
1326 return;
1315 } 1327 }
1328
1329 /*
1330 * The next timer was already expired due to:
1331 * - tracing
1332 * - long lasting callbacks
1333 * - being scheduled away when running in a VM
1334 *
1335 * We need to prevent that we loop forever in the hrtimer
1336 * interrupt routine. We give it 3 attempts to avoid
1337 * overreacting on some spurious event.
1338 */
1339 now = ktime_get();
1340 cpu_base->nr_retries++;
1341 if (++retries < 3)
1342 goto retry;
1343 /*
1344 * Give the system a chance to do something else than looping
1345 * here. We stored the entry time, so we know exactly how long
1346 * we spent here. We schedule the next event this amount of
1347 * time away.
1348 */
1349 cpu_base->nr_hangs++;
1350 cpu_base->hang_detected = 1;
1351 delta = ktime_sub(now, entry_time);
1352 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1353 cpu_base->max_hang_time = delta;
1354 /*
1355 * Limit it to a sensible value as we enforce a longer
1356 * delay. Give the CPU at least 100ms to catch up.
1357 */
1358 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1359 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1360 else
1361 expires_next = ktime_add(now, delta);
1362 tick_program_event(expires_next, 1);
1363 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1364 ktime_to_ns(delta));
1316} 1365}
1317 1366
1318/* 1367/*
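The rewritten tail of hrtimer_interrupt() above retries the expiry loop at most three times, then records a hang and pushes the next event out by the time already spent in the handler, capped at 100 ms. A standalone sketch of that backoff policy, using plain 64-bit nanosecond counters instead of ktime_t (the 50 ms cost per pass is made up for the example):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000LL

static int64_t next_event_after_hang(int64_t entry_time, int64_t now)
{
	int64_t delta = now - entry_time;	/* time burnt in the handler */

	/* Enforce a longer delay, but hand back at most 100ms at a time. */
	if (delta > 100 * NSEC_PER_MSEC)
		return now + 100 * NSEC_PER_MSEC;
	return now + delta;
}

int main(void)
{
	int retries = 0;
	int64_t entry_time = 0, now = 0;

	for (;;) {
		now += 50 * NSEC_PER_MSEC;	/* pretend each pass costs 50ms */
		/* ... expire timers, try to reprogram, assume it keeps failing ... */
		if (++retries < 3)
			continue;		/* the kernel's "goto retry" */
		break;				/* hang detected */
	}
	printf("hang after %d passes, next event programmed for %lld ns\n",
	       retries, (long long)next_event_after_hang(entry_time, now));
	return 0;
}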
@@ -1408,7 +1457,7 @@ void hrtimer_run_queues(void)
1408 gettime = 0; 1457 gettime = 0;
1409 } 1458 }
1410 1459
1411 spin_lock(&cpu_base->lock); 1460 raw_spin_lock(&cpu_base->lock);
1412 1461
1413 while ((node = base->first)) { 1462 while ((node = base->first)) {
1414 struct hrtimer *timer; 1463 struct hrtimer *timer;
@@ -1420,7 +1469,7 @@ void hrtimer_run_queues(void)
1420 1469
1421 __run_hrtimer(timer, &base->softirq_time); 1470 __run_hrtimer(timer, &base->softirq_time);
1422 } 1471 }
1423 spin_unlock(&cpu_base->lock); 1472 raw_spin_unlock(&cpu_base->lock);
1424 } 1473 }
1425} 1474}
1426 1475
@@ -1576,7 +1625,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1576 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1625 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1577 int i; 1626 int i;
1578 1627
1579 spin_lock_init(&cpu_base->lock); 1628 raw_spin_lock_init(&cpu_base->lock);
1580 1629
1581 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1630 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1582 cpu_base->clock_base[i].cpu_base = cpu_base; 1631 cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1634,16 +1683,16 @@ static void migrate_hrtimers(int scpu)
1634 * The caller is globally serialized and nobody else 1683 * The caller is globally serialized and nobody else
1635 * takes two locks at once, deadlock is not possible. 1684 * takes two locks at once, deadlock is not possible.
1636 */ 1685 */
1637 spin_lock(&new_base->lock); 1686 raw_spin_lock(&new_base->lock);
1638 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1687 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1639 1688
1640 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1689 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1641 migrate_hrtimer_list(&old_base->clock_base[i], 1690 migrate_hrtimer_list(&old_base->clock_base[i],
1642 &new_base->clock_base[i]); 1691 &new_base->clock_base[i]);
1643 } 1692 }
1644 1693
1645 spin_unlock(&old_base->lock); 1694 raw_spin_unlock(&old_base->lock);
1646 spin_unlock(&new_base->lock); 1695 raw_spin_unlock(&new_base->lock);
1647 1696
1648 /* Check, if we got expired work to do */ 1697 /* Check, if we got expired work to do */
1649 __hrtimer_peek_ahead_timers(); 1698 __hrtimer_peek_ahead_timers();
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 022a4927b785..0c642d51aac2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
144 144
145 rcu_read_lock(); 145 rcu_read_lock();
146 do_each_thread(g, t) { 146 do_each_thread(g, t) {
147 if (!--max_count) 147 if (!max_count--)
148 goto unlock; 148 goto unlock;
149 if (!--batch_count) { 149 if (!--batch_count) {
150 batch_count = HUNG_TASK_BATCHING; 150 batch_count = HUNG_TASK_BATCHING;
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
171 * Process updating of timeout sysctl 171 * Process updating of timeout sysctl
172 */ 172 */
173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
174 struct file *filp, void __user *buffer, 174 void __user *buffer,
175 size_t *lenp, loff_t *ppos) 175 size_t *lenp, loff_t *ppos)
176{ 176{
177 int ret; 177 int ret;
178 178
179 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 179 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
180 180
181 if (ret || !write) 181 if (ret || !write)
182 goto out; 182 goto out;
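The hung_task change above swaps a pre-decrement test for a post-decrement one. A small standalone program showing the arithmetic difference: with an unsigned budget the old form stops one task early, and a budget of 0 wraps around and never stops at all (the budgets and task counts below are arbitrary):

#include <stdio.h>

static unsigned long checked_old(unsigned long max_count, unsigned long tasks)
{
	unsigned long n = 0;

	for (unsigned long t = 0; t < tasks; t++) {
		if (!--max_count)
			break;		/* bails before checking the last budgeted task */
		n++;
	}
	return n;
}

static unsigned long checked_new(unsigned long max_count, unsigned long tasks)
{
	unsigned long n = 0;

	for (unsigned long t = 0; t < tasks; t++) {
		if (!max_count--)
			break;		/* bails only once the budget is exhausted */
		n++;
	}
	return n;
}

int main(void)
{
	printf("budget 3: old checks %lu, new checks %lu tasks\n",
	       checked_old(3, 10), checked_new(3, 10));
	printf("budget 0: old checks %lu, new checks %lu tasks\n",
	       checked_old(0, 10), checked_new(0, 10));
	return 0;
}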
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..dbcbf6a33a08
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,453 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) IBM Corporation, 2009
18 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Thanks to Ingo Molnar for his many suggestions.
21 *
22 * Authors: Alan Stern <stern@rowland.harvard.edu>
23 * K.Prasad <prasad@linux.vnet.ibm.com>
24 * Frederic Weisbecker <fweisbec@gmail.com>
25 */
26
27/*
28 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
29 * using the CPU's debug registers.
30 * This file contains the arch-independent routines.
31 */
32
33#include <linux/irqflags.h>
34#include <linux/kallsyms.h>
35#include <linux/notifier.h>
36#include <linux/kprobes.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/percpu.h>
41#include <linux/sched.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44
45#include <linux/hw_breakpoint.h>
46
47/*
48 * Constraints data
49 */
50
51/* Number of pinned cpu breakpoints in a cpu */
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53
54/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
56
57/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
59
60/* Gather the number of total pinned and un-pinned bp in a cpuset */
61struct bp_busy_slots {
62 unsigned int pinned;
63 unsigned int flexible;
64};
65
66/* Serialize accesses to the above constraints */
67static DEFINE_MUTEX(nr_bp_mutex);
68
69/*
70 * Report the maximum number of pinned breakpoints a task
71 * have in this cpu
72 * has in this cpu
73static unsigned int max_task_bp_pinned(int cpu)
74{
75 int i;
76 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
77
78 for (i = HBP_NUM -1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0)
80 return i + 1;
81 }
82
83 return 0;
84}
85
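tsk_pinned[] works as a histogram: slot i counts how many tasks currently hold i + 1 pinned breakpoints on this cpu, so scanning it from the top gives the worst-case task. A standalone worked example, with HBP_NUM fixed to 4 for illustration:

#include <stdio.h>

#define HBP_NUM 4	/* e.g. four debug registers */

static unsigned int max_task_bp_pinned_model(const unsigned int tsk_pinned[HBP_NUM])
{
	for (int i = HBP_NUM - 1; i >= 0; i--) {
		if (tsk_pinned[i] > 0)
			return i + 1;
	}
	return 0;
}

int main(void)
{
	/* two tasks holding 1 breakpoint, one task holding 3 -> max is 3 */
	unsigned int tsk_pinned[HBP_NUM] = { 2, 0, 1, 0 };

	printf("max pinned breakpoints per task on this cpu: %u\n",
	       max_task_bp_pinned_model(tsk_pinned));
	return 0;
}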
86static int task_bp_pinned(struct task_struct *tsk)
87{
88 struct perf_event_context *ctx = tsk->perf_event_ctxp;
89 struct list_head *list;
90 struct perf_event *bp;
91 unsigned long flags;
92 int count = 0;
93
94 if (WARN_ONCE(!ctx, "No perf context for this task"))
95 return 0;
96
97 list = &ctx->event_list;
98
99 raw_spin_lock_irqsave(&ctx->lock, flags);
100
101 /*
102 * The current breakpoint counter is not included in the list
103 * at the open() callback time
104 */
105 list_for_each_entry(bp, list, event_entry) {
106 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
107 count++;
108 }
109
110 raw_spin_unlock_irqrestore(&ctx->lock, flags);
111
112 return count;
113}
114
115/*
116 * Report the number of pinned/un-pinned breakpoints we have in
117 * a given cpu (cpu > -1) or in all of them (cpu = -1).
118 */
119static void
120fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
121{
122 int cpu = bp->cpu;
123 struct task_struct *tsk = bp->ctx->task;
124
125 if (cpu >= 0) {
126 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
127 if (!tsk)
128 slots->pinned += max_task_bp_pinned(cpu);
129 else
130 slots->pinned += task_bp_pinned(tsk);
131 slots->flexible = per_cpu(nr_bp_flexible, cpu);
132
133 return;
134 }
135
136 for_each_online_cpu(cpu) {
137 unsigned int nr;
138
139 nr = per_cpu(nr_cpu_bp_pinned, cpu);
140 if (!tsk)
141 nr += max_task_bp_pinned(cpu);
142 else
143 nr += task_bp_pinned(tsk);
144
145 if (nr > slots->pinned)
146 slots->pinned = nr;
147
148 nr = per_cpu(nr_bp_flexible, cpu);
149
150 if (nr > slots->flexible)
151 slots->flexible = nr;
152 }
153}
154
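fetch_bp_busy_slots() reads a single cpu's counters for a cpu-bound breakpoint, but takes the maximum over all online cpus for a wide one, because the new breakpoint has to fit on every cpu. A simplified standalone model (plain arrays instead of per-cpu variables, NR_CPUS and the sample counts invented, the per-task contribution left out):

#include <stdio.h>

#define NR_CPUS 4

struct slots {
	unsigned int pinned;
	unsigned int flexible;
};

static const unsigned int cpu_pinned[NR_CPUS]   = { 1, 3, 0, 2 };
static const unsigned int cpu_flexible[NR_CPUS] = { 0, 1, 0, 0 };

static struct slots fetch_busy_slots(int cpu)
{
	struct slots s = { 0, 0 };

	if (cpu >= 0) {
		/* breakpoint bound to one cpu: only that cpu's load matters */
		s.pinned = cpu_pinned[cpu];
		s.flexible = cpu_flexible[cpu];
		return s;
	}
	/* wide breakpoint: keep the busiest cpu's numbers */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu_pinned[cpu] > s.pinned)
			s.pinned = cpu_pinned[cpu];
		if (cpu_flexible[cpu] > s.flexible)
			s.flexible = cpu_flexible[cpu];
	}
	return s;
}

int main(void)
{
	struct slots wide = fetch_busy_slots(-1);
	struct slots one = fetch_busy_slots(2);

	printf("wide: pinned=%u flexible=%u\n", wide.pinned, wide.flexible);
	printf("cpu2: pinned=%u flexible=%u\n", one.pinned, one.flexible);
	return 0;
}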
155/*
156 * Add a pinned breakpoint for the given task in our constraint table
157 */
158static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
159{
160 unsigned int *tsk_pinned;
161 int count = 0;
162
163 count = task_bp_pinned(tsk);
164
165 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
166 if (enable) {
167 tsk_pinned[count]++;
168 if (count > 0)
169 tsk_pinned[count-1]--;
170 } else {
171 tsk_pinned[count]--;
172 if (count > 0)
173 tsk_pinned[count-1]++;
174 }
175}
176
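toggle_bp_task_slot() maintains that histogram by moving a task between adjacent buckets: a task that already owns count breakpoints and gains one moves from bucket count - 1 to bucket count, and releasing one moves it back. A standalone trace of two consecutive enables for the same task:

#include <stdbool.h>
#include <stdio.h>

#define HBP_NUM 4

static void toggle_task_slot(unsigned int tsk_pinned[HBP_NUM], int count, bool enable)
{
	if (enable) {
		tsk_pinned[count]++;		/* task now owns count + 1 breakpoints */
		if (count > 0)
			tsk_pinned[count - 1]--;
	} else {
		tsk_pinned[count]--;
		if (count > 0)
			tsk_pinned[count - 1]++;
	}
}

int main(void)
{
	unsigned int tsk_pinned[HBP_NUM] = { 0 };

	toggle_task_slot(tsk_pinned, 0, true);	/* the task gets its first bp */
	toggle_task_slot(tsk_pinned, 1, true);	/* the same task gets a second */
	printf("buckets: %u %u %u %u\n",	/* expected: 0 1 0 0 */
	       tsk_pinned[0], tsk_pinned[1], tsk_pinned[2], tsk_pinned[3]);
	return 0;
}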
177/*
178 * Add/remove the given breakpoint in our constraint table
179 */
180static void toggle_bp_slot(struct perf_event *bp, bool enable)
181{
182 int cpu = bp->cpu;
183 struct task_struct *tsk = bp->ctx->task;
184
185 /* Pinned counter task profiling */
186 if (tsk) {
187 if (cpu >= 0) {
188 toggle_bp_task_slot(tsk, cpu, enable);
189 return;
190 }
191
192 for_each_online_cpu(cpu)
193 toggle_bp_task_slot(tsk, cpu, enable);
194 return;
195 }
196
197 /* Pinned counter cpu profiling */
198 if (enable)
199 per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
200 else
201 per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
202}
203
204/*
205 * Constraints to check before allowing this new breakpoint counter:
206 *
207 * == Non-pinned counter == (Considered as pinned for now)
208 *
209 * - If attached to a single cpu, check:
210 *
211 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
212 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
213 *
214 * -> If there are already non-pinned counters in this cpu, it means
215 * there is already a free slot for them.
216 * Otherwise, we check that the maximum number of per task
217 * breakpoints (for this cpu) plus the number of per cpu breakpoints
218 * (for this cpu) doesn't cover every register.
219 *
220 * - If attached to every cpu, check:
221 *
222 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
223 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
224 *
225 * -> This is roughly the same, except we check the number of per cpu
226 * bp for every cpu and we keep the max one. Same for the per task
227 * breakpoints.
228 *
229 *
230 * == Pinned counter ==
231 *
232 * - If attached to a single cpu, check:
233 *
234 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
235 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
236 *
237 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
238 * one register at least (or they will never be fed).
239 *
240 * - If attached to every cpu, check:
241 *
242 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
243 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
244 */
245int reserve_bp_slot(struct perf_event *bp)
246{
247 struct bp_busy_slots slots = {0};
248 int ret = 0;
249
250 mutex_lock(&nr_bp_mutex);
251
252 fetch_bp_busy_slots(&slots, bp);
253
254 /* Flexible counters need to keep at least one slot */
255 if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
256 ret = -ENOSPC;
257 goto end;
258 }
259
260 toggle_bp_slot(bp, true);
261
262end:
263 mutex_unlock(&nr_bp_mutex);
264
265 return ret;
266}
267
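reserve_bp_slot() treats every breakpoint as pinned for now and, through the !!slots.flexible term, keeps one debug register back whenever any flexible counters exist at all. A standalone sketch of the admission test exactly as written in the patch, with HBP_NUM fixed to 4 here:

#include <errno.h>
#include <stdio.h>

#define HBP_NUM 4

static int can_reserve(unsigned int pinned, unsigned int flexible)
{
	/* !!flexible reserves exactly one slot for all flexible breakpoints */
	if (pinned + !!flexible == HBP_NUM)
		return -ENOSPC;
	return 0;
}

int main(void)
{
	printf("%d\n", can_reserve(3, 0));	/* 0: the last register is still free */
	printf("%d\n", can_reserve(3, 1));	/* -ENOSPC: flexible bps need that slot */
	printf("%d\n", can_reserve(4, 0));	/* -ENOSPC: all registers pinned */
	return 0;
}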
268void release_bp_slot(struct perf_event *bp)
269{
270 mutex_lock(&nr_bp_mutex);
271
272 toggle_bp_slot(bp, false);
273
274 mutex_unlock(&nr_bp_mutex);
275}
276
277
278int register_perf_hw_breakpoint(struct perf_event *bp)
279{
280 int ret;
281
282 ret = reserve_bp_slot(bp);
283 if (ret)
284 return ret;
285
286 /*
287 * Ptrace breakpoints can be temporary perf events only
288 * meant to reserve a slot. In this case, it is created disabled and
289 * we don't want to check the params right now (as we put a null addr)
290 * But perf tools create events as disabled and we want to check
291 * the params for them.
292 * This is a quick hack that will be removed soon, once we remove
293 * the tmp breakpoints from ptrace
294 */
295 if (!bp->attr.disabled || !bp->overflow_handler)
296 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
297
298 return ret;
299}
300
301/**
302 * register_user_hw_breakpoint - register a hardware breakpoint for user space
303 * @attr: breakpoint attributes
304 * @triggered: callback to trigger when we hit the breakpoint
305 * @tsk: pointer to 'task_struct' of the process to which the address belongs
306 */
307struct perf_event *
308register_user_hw_breakpoint(struct perf_event_attr *attr,
309 perf_overflow_handler_t triggered,
310 struct task_struct *tsk)
311{
312 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
313}
314EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
315
316/**
317 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
318 * @bp: the breakpoint structure to modify
319 * @attr: new breakpoint attributes
320 * @triggered: callback to trigger when we hit the breakpoint
321 * @tsk: pointer to 'task_struct' of the process to which the address belongs
322 */
323int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
324{
325 u64 old_addr = bp->attr.bp_addr;
326 int old_type = bp->attr.bp_type;
327 int old_len = bp->attr.bp_len;
328 int err = 0;
329
330 perf_event_disable(bp);
331
332 bp->attr.bp_addr = attr->bp_addr;
333 bp->attr.bp_type = attr->bp_type;
334 bp->attr.bp_len = attr->bp_len;
335
336 if (attr->disabled)
337 goto end;
338
339 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
340 if (!err)
341 perf_event_enable(bp);
342
343 if (err) {
344 bp->attr.bp_addr = old_addr;
345 bp->attr.bp_type = old_type;
346 bp->attr.bp_len = old_len;
347 if (!bp->attr.disabled)
348 perf_event_enable(bp);
349
350 return err;
351 }
352
353end:
354 bp->attr.disabled = attr->disabled;
355
356 return 0;
357}
358EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
359
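modify_user_hw_breakpoint() above follows a save/modify/validate/rollback pattern: stash the old address, type and length, apply the new ones while the event is disabled, and restore the saved state if the arch validation fails so the breakpoint keeps its previous settings. A generic standalone illustration of that pattern (the bp_attr struct and the validate() stub are stand-ins, not the kernel API):

#include <stdint.h>
#include <stdio.h>

struct bp_attr {
	uint64_t addr;
	int type;
	int len;
};

static int validate(const struct bp_attr *a)
{
	return a->len > 0 ? 0 : -1;	/* stand-in for arch validation */
}

static int modify_bp(struct bp_attr *cur, const struct bp_attr *new_attr)
{
	struct bp_attr old = *cur;	/* save for rollback */

	*cur = *new_attr;		/* apply while "disabled" */
	if (validate(cur)) {
		*cur = old;		/* validation failed: roll back */
		return -1;
	}
	return 0;			/* re-"enable" with the new settings */
}

int main(void)
{
	struct bp_attr bp   = { 0x1000, 1, 4 };
	struct bp_attr bad  = { 0x2000, 1, 0 };
	struct bp_attr good = { 0x2000, 1, 8 };

	printf("bad:  %d (addr stays %#llx)\n", modify_bp(&bp, &bad),
	       (unsigned long long)bp.addr);
	printf("good: %d (addr now  %#llx)\n", modify_bp(&bp, &good),
	       (unsigned long long)bp.addr);
	return 0;
}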
360/**
361 * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
362 * @bp: the breakpoint structure to unregister
363 */
364void unregister_hw_breakpoint(struct perf_event *bp)
365{
366 if (!bp)
367 return;
368 perf_event_release_kernel(bp);
369}
370EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
371
372/**
373 * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
374 * @attr: breakpoint attributes
375 * @triggered: callback to trigger when we hit the breakpoint
376 *
377 * @return a set of per_cpu pointers to perf events
378 */
379struct perf_event **
380register_wide_hw_breakpoint(struct perf_event_attr *attr,
381 perf_overflow_handler_t triggered)
382{
383 struct perf_event **cpu_events, **pevent, *bp;
384 long err;
385 int cpu;
386
387 cpu_events = alloc_percpu(typeof(*cpu_events));
388 if (!cpu_events)
389 return ERR_PTR(-ENOMEM);
390
391 for_each_possible_cpu(cpu) {
392 pevent = per_cpu_ptr(cpu_events, cpu);
393 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
394
395 *pevent = bp;
396
397 if (IS_ERR(bp)) {
398 err = PTR_ERR(bp);
399 goto fail;
400 }
401 }
402
403 return cpu_events;
404
405fail:
406 for_each_possible_cpu(cpu) {
407 pevent = per_cpu_ptr(cpu_events, cpu);
408 if (IS_ERR(*pevent))
409 break;
410 unregister_hw_breakpoint(*pevent);
411 }
412 free_percpu(cpu_events);
413 /* return the error if any */
414 return ERR_PTR(err);
415}
416EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
417
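register_wide_hw_breakpoint() is all-or-nothing: it creates one event per possible cpu and, on the first failure, unwinds everything created so far before returning the error (the real code walks the per-cpu array until it hits the ERR_PTR slot). A userspace sketch of the same unwind pattern; create_on_cpu() and the simulated failure on cpu 2 are invented for the example:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

static void *create_on_cpu(int cpu)
{
	return cpu == 2 ? NULL : malloc(16);	/* pretend cpu 2 fails */
}

static void **register_wide(void)
{
	void **events = calloc(NR_CPUS, sizeof(*events));
	int cpu;

	if (!events)
		return NULL;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		events[cpu] = create_on_cpu(cpu);
		if (!events[cpu])
			goto fail;
	}
	return events;

fail:
	while (--cpu >= 0)			/* unwind the ones that succeeded */
		free(events[cpu]);
	free(events);
	return NULL;				/* the kernel returns ERR_PTR(err) here */
}

int main(void)
{
	void **events = register_wide();

	printf("register_wide() -> %s\n", events ? "ok" : "failed, fully unwound");
	free(events);
	return 0;
}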
418/**
419 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
420 * @cpu_events: the per cpu set of events to unregister
421 */
422void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
423{
424 int cpu;
425 struct perf_event **pevent;
426
427 for_each_possible_cpu(cpu) {
428 pevent = per_cpu_ptr(cpu_events, cpu);
429 unregister_hw_breakpoint(*pevent);
430 }
431 free_percpu(cpu_events);
432}
433EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
434
435static struct notifier_block hw_breakpoint_exceptions_nb = {
436 .notifier_call = hw_breakpoint_exceptions_notify,
437 /* we need to be notified first */
438 .priority = 0x7fffffff
439};
440
441static int __init init_hw_breakpoint(void)
442{
443 return register_die_notifier(&hw_breakpoint_exceptions_nb);
444}
445core_initcall(init_hw_breakpoint);
446
447
448struct pmu perf_ops_bp = {
449 .enable = arch_install_hw_breakpoint,
450 .disable = arch_uninstall_hw_breakpoint,
451 .read = hw_breakpoint_pmu_read,
452 .unthrottle = hw_breakpoint_pmu_unthrottle
453};
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 1de9700f416e..2295a31ef110 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -45,7 +45,7 @@ unsigned long probe_irq_on(void)
45 * flush such a longstanding irq before considering it as spurious. 45 * flush such a longstanding irq before considering it as spurious.
46 */ 46 */
47 for_each_irq_desc_reverse(i, desc) { 47 for_each_irq_desc_reverse(i, desc) {
48 spin_lock_irq(&desc->lock); 48 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
50 /* 50 /*
51 * An old-style architecture might still have 51 * An old-style architecture might still have
@@ -61,7 +61,7 @@ unsigned long probe_irq_on(void)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->chip->set_type(i, IRQ_TYPE_PROBE);
62 desc->chip->startup(i); 62 desc->chip->startup(i);
63 } 63 }
64 spin_unlock_irq(&desc->lock); 64 raw_spin_unlock_irq(&desc->lock);
65 } 65 }
66 66
67 /* Wait for longstanding interrupts to trigger. */ 67 /* Wait for longstanding interrupts to trigger. */
@@ -73,13 +73,13 @@ unsigned long probe_irq_on(void)
73 * happened in the previous stage, it may have masked itself) 73 * happened in the previous stage, it may have masked itself)
74 */ 74 */
75 for_each_irq_desc_reverse(i, desc) { 75 for_each_irq_desc_reverse(i, desc) {
76 spin_lock_irq(&desc->lock); 76 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 79 if (desc->chip->startup(i))
80 desc->status |= IRQ_PENDING; 80 desc->status |= IRQ_PENDING;
81 } 81 }
82 spin_unlock_irq(&desc->lock); 82 raw_spin_unlock_irq(&desc->lock);
83 } 83 }
84 84
85 /* 85 /*
@@ -91,7 +91,7 @@ unsigned long probe_irq_on(void)
91 * Now filter out any obviously spurious interrupts 91 * Now filter out any obviously spurious interrupts
92 */ 92 */
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 spin_lock_irq(&desc->lock); 94 raw_spin_lock_irq(&desc->lock);
95 status = desc->status; 95 status = desc->status;
96 96
97 if (status & IRQ_AUTODETECT) { 97 if (status & IRQ_AUTODETECT) {
@@ -103,7 +103,7 @@ unsigned long probe_irq_on(void)
103 if (i < 32) 103 if (i < 32)
104 mask |= 1 << i; 104 mask |= 1 << i;
105 } 105 }
106 spin_unlock_irq(&desc->lock); 106 raw_spin_unlock_irq(&desc->lock);
107 } 107 }
108 108
109 return mask; 109 return mask;
@@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val)
129 int i; 129 int i;
130 130
131 for_each_irq_desc(i, desc) { 131 for_each_irq_desc(i, desc) {
132 spin_lock_irq(&desc->lock); 132 raw_spin_lock_irq(&desc->lock);
133 status = desc->status; 133 status = desc->status;
134 134
135 if (status & IRQ_AUTODETECT) { 135 if (status & IRQ_AUTODETECT) {
@@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val)
139 desc->status = status & ~IRQ_AUTODETECT; 139 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 140 desc->chip->shutdown(i);
141 } 141 }
142 spin_unlock_irq(&desc->lock); 142 raw_spin_unlock_irq(&desc->lock);
143 } 143 }
144 mutex_unlock(&probing_active); 144 mutex_unlock(&probing_active);
145 145
@@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val)
171 unsigned int status; 171 unsigned int status;
172 172
173 for_each_irq_desc(i, desc) { 173 for_each_irq_desc(i, desc) {
174 spin_lock_irq(&desc->lock); 174 raw_spin_lock_irq(&desc->lock);
175 status = desc->status; 175 status = desc->status;
176 176
177 if (status & IRQ_AUTODETECT) { 177 if (status & IRQ_AUTODETECT) {
@@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val)
183 desc->status = status & ~IRQ_AUTODETECT; 183 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 184 desc->chip->shutdown(i);
185 } 185 }
186 spin_unlock_irq(&desc->lock); 186 raw_spin_unlock_irq(&desc->lock);
187 } 187 }
188 mutex_unlock(&probing_active); 188 mutex_unlock(&probing_active);
189 189
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d115..ecc3fa28f666 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq)
34 } 34 }
35 35
36 /* Ensure we don't have left over values from a previous use of this irq */ 36 /* Ensure we don't have left over values from a previous use of this irq */
37 spin_lock_irqsave(&desc->lock, flags); 37 raw_spin_lock_irqsave(&desc->lock, flags);
38 desc->status = IRQ_DISABLED; 38 desc->status = IRQ_DISABLED;
39 desc->chip = &no_irq_chip; 39 desc->chip = &no_irq_chip;
40 desc->handle_irq = handle_bad_irq; 40 desc->handle_irq = handle_bad_irq;
@@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq)
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
52#endif 52#endif
53#endif 53#endif
54 spin_unlock_irqrestore(&desc->lock, flags); 54 raw_spin_unlock_irqrestore(&desc->lock, flags);
55} 55}
56 56
57/** 57/**
@@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq)
68 return; 68 return;
69 } 69 }
70 70
71 spin_lock_irqsave(&desc->lock, flags); 71 raw_spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 72 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 73 raw_spin_unlock_irqrestore(&desc->lock, flags);
74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", 74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 75 irq);
76 return; 76 return;
@@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq)
82 desc->chip = &no_irq_chip; 82 desc->chip = &no_irq_chip;
83 desc->name = NULL; 83 desc->name = NULL;
84 clear_kstat_irqs(desc); 84 clear_kstat_irqs(desc);
85 spin_unlock_irqrestore(&desc->lock, flags); 85 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 86}
87 87
88 88
@@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
104 if (!chip) 104 if (!chip)
105 chip = &no_irq_chip; 105 chip = &no_irq_chip;
106 106
107 spin_lock_irqsave(&desc->lock, flags); 107 raw_spin_lock_irqsave(&desc->lock, flags);
108 irq_chip_set_defaults(chip); 108 irq_chip_set_defaults(chip);
109 desc->chip = chip; 109 desc->chip = chip;
110 spin_unlock_irqrestore(&desc->lock, flags); 110 raw_spin_unlock_irqrestore(&desc->lock, flags);
111 111
112 return 0; 112 return 0;
113} 113}
@@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type)
133 if (type == IRQ_TYPE_NONE) 133 if (type == IRQ_TYPE_NONE)
134 return 0; 134 return 0;
135 135
136 spin_lock_irqsave(&desc->lock, flags); 136 raw_spin_lock_irqsave(&desc->lock, flags);
137 ret = __irq_set_trigger(desc, irq, type); 137 ret = __irq_set_trigger(desc, irq, type);
138 spin_unlock_irqrestore(&desc->lock, flags); 138 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return ret; 139 return ret;
140} 140}
141EXPORT_SYMBOL(set_irq_type); 141EXPORT_SYMBOL(set_irq_type);
@@ -158,19 +158,19 @@ int set_irq_data(unsigned int irq, void *data)
158 return -EINVAL; 158 return -EINVAL;
159 } 159 }
160 160
161 spin_lock_irqsave(&desc->lock, flags); 161 raw_spin_lock_irqsave(&desc->lock, flags);
162 desc->handler_data = data; 162 desc->handler_data = data;
163 spin_unlock_irqrestore(&desc->lock, flags); 163 raw_spin_unlock_irqrestore(&desc->lock, flags);
164 return 0; 164 return 0;
165} 165}
166EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
167 167
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_msi - set MSI descriptor data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @entry: Pointer to MSI descriptor data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the MSI descriptor entry for an irq
174 */ 174 */
175int set_irq_msi(unsigned int irq, struct msi_desc *entry) 175int set_irq_msi(unsigned int irq, struct msi_desc *entry)
176{ 176{
@@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
183 return -EINVAL; 183 return -EINVAL;
184 } 184 }
185 185
186 spin_lock_irqsave(&desc->lock, flags); 186 raw_spin_lock_irqsave(&desc->lock, flags);
187 desc->msi_desc = entry; 187 desc->msi_desc = entry;
188 if (entry) 188 if (entry)
189 entry->irq = irq; 189 entry->irq = irq;
190 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return 0; 191 return 0;
192} 192}
193 193
@@ -214,9 +214,9 @@ int set_irq_chip_data(unsigned int irq, void *data)
214 return -EINVAL; 214 return -EINVAL;
215 } 215 }
216 216
217 spin_lock_irqsave(&desc->lock, flags); 217 raw_spin_lock_irqsave(&desc->lock, flags);
218 desc->chip_data = data; 218 desc->chip_data = data;
219 spin_unlock_irqrestore(&desc->lock, flags); 219 raw_spin_unlock_irqrestore(&desc->lock, flags);
220 220
221 return 0; 221 return 0;
222} 222}
@@ -241,12 +241,12 @@ void set_irq_nested_thread(unsigned int irq, int nest)
241 if (!desc) 241 if (!desc)
242 return; 242 return;
243 243
244 spin_lock_irqsave(&desc->lock, flags); 244 raw_spin_lock_irqsave(&desc->lock, flags);
245 if (nest) 245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD; 246 desc->status |= IRQ_NESTED_THREAD;
247 else 247 else
248 desc->status &= ~IRQ_NESTED_THREAD; 248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags); 249 raw_spin_unlock_irqrestore(&desc->lock, flags);
250} 250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread); 251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252 252
@@ -343,7 +343,7 @@ void handle_nested_irq(unsigned int irq)
343 343
344 might_sleep(); 344 might_sleep();
345 345
346 spin_lock_irq(&desc->lock); 346 raw_spin_lock_irq(&desc->lock);
347 347
348 kstat_incr_irqs_this_cpu(irq, desc); 348 kstat_incr_irqs_this_cpu(irq, desc);
349 349
@@ -352,17 +352,17 @@ void handle_nested_irq(unsigned int irq)
352 goto out_unlock; 352 goto out_unlock;
353 353
354 desc->status |= IRQ_INPROGRESS; 354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock); 355 raw_spin_unlock_irq(&desc->lock);
356 356
357 action_ret = action->thread_fn(action->irq, action->dev_id); 357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug) 358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret); 359 note_interrupt(irq, desc, action_ret);
360 360
361 spin_lock_irq(&desc->lock); 361 raw_spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS; 362 desc->status &= ~IRQ_INPROGRESS;
363 363
364out_unlock: 364out_unlock:
365 spin_unlock_irq(&desc->lock); 365 raw_spin_unlock_irq(&desc->lock);
366} 366}
367EXPORT_SYMBOL_GPL(handle_nested_irq); 367EXPORT_SYMBOL_GPL(handle_nested_irq);
368 368
@@ -384,7 +384,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
384 struct irqaction *action; 384 struct irqaction *action;
385 irqreturn_t action_ret; 385 irqreturn_t action_ret;
386 386
387 spin_lock(&desc->lock); 387 raw_spin_lock(&desc->lock);
388 388
389 if (unlikely(desc->status & IRQ_INPROGRESS)) 389 if (unlikely(desc->status & IRQ_INPROGRESS))
390 goto out_unlock; 390 goto out_unlock;
@@ -396,16 +396,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
396 goto out_unlock; 396 goto out_unlock;
397 397
398 desc->status |= IRQ_INPROGRESS; 398 desc->status |= IRQ_INPROGRESS;
399 spin_unlock(&desc->lock); 399 raw_spin_unlock(&desc->lock);
400 400
401 action_ret = handle_IRQ_event(irq, action); 401 action_ret = handle_IRQ_event(irq, action);
402 if (!noirqdebug) 402 if (!noirqdebug)
403 note_interrupt(irq, desc, action_ret); 403 note_interrupt(irq, desc, action_ret);
404 404
405 spin_lock(&desc->lock); 405 raw_spin_lock(&desc->lock);
406 desc->status &= ~IRQ_INPROGRESS; 406 desc->status &= ~IRQ_INPROGRESS;
407out_unlock: 407out_unlock:
408 spin_unlock(&desc->lock); 408 raw_spin_unlock(&desc->lock);
409} 409}
410 410
411/** 411/**
@@ -424,7 +424,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
424 struct irqaction *action; 424 struct irqaction *action;
425 irqreturn_t action_ret; 425 irqreturn_t action_ret;
426 426
427 spin_lock(&desc->lock); 427 raw_spin_lock(&desc->lock);
428 mask_ack_irq(desc, irq); 428 mask_ack_irq(desc, irq);
429 429
430 if (unlikely(desc->status & IRQ_INPROGRESS)) 430 if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -441,13 +441,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
441 goto out_unlock; 441 goto out_unlock;
442 442
443 desc->status |= IRQ_INPROGRESS; 443 desc->status |= IRQ_INPROGRESS;
444 spin_unlock(&desc->lock); 444 raw_spin_unlock(&desc->lock);
445 445
446 action_ret = handle_IRQ_event(irq, action); 446 action_ret = handle_IRQ_event(irq, action);
447 if (!noirqdebug) 447 if (!noirqdebug)
448 note_interrupt(irq, desc, action_ret); 448 note_interrupt(irq, desc, action_ret);
449 449
450 spin_lock(&desc->lock); 450 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
452 452
453 if (unlikely(desc->status & IRQ_ONESHOT)) 453 if (unlikely(desc->status & IRQ_ONESHOT))
@@ -455,7 +455,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
457out_unlock: 457out_unlock:
458 spin_unlock(&desc->lock); 458 raw_spin_unlock(&desc->lock);
459} 459}
460EXPORT_SYMBOL_GPL(handle_level_irq); 460EXPORT_SYMBOL_GPL(handle_level_irq);
461 461
@@ -475,7 +475,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
475 struct irqaction *action; 475 struct irqaction *action;
476 irqreturn_t action_ret; 476 irqreturn_t action_ret;
477 477
478 spin_lock(&desc->lock); 478 raw_spin_lock(&desc->lock);
479 479
480 if (unlikely(desc->status & IRQ_INPROGRESS)) 480 if (unlikely(desc->status & IRQ_INPROGRESS))
481 goto out; 481 goto out;
@@ -497,18 +497,18 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
497 497
498 desc->status |= IRQ_INPROGRESS; 498 desc->status |= IRQ_INPROGRESS;
499 desc->status &= ~IRQ_PENDING; 499 desc->status &= ~IRQ_PENDING;
500 spin_unlock(&desc->lock); 500 raw_spin_unlock(&desc->lock);
501 501
502 action_ret = handle_IRQ_event(irq, action); 502 action_ret = handle_IRQ_event(irq, action);
503 if (!noirqdebug) 503 if (!noirqdebug)
504 note_interrupt(irq, desc, action_ret); 504 note_interrupt(irq, desc, action_ret);
505 505
506 spin_lock(&desc->lock); 506 raw_spin_lock(&desc->lock);
507 desc->status &= ~IRQ_INPROGRESS; 507 desc->status &= ~IRQ_INPROGRESS;
508out: 508out:
509 desc->chip->eoi(irq); 509 desc->chip->eoi(irq);
510 510
511 spin_unlock(&desc->lock); 511 raw_spin_unlock(&desc->lock);
512} 512}
513 513
514/** 514/**
@@ -530,7 +530,7 @@ out:
530void 530void
531handle_edge_irq(unsigned int irq, struct irq_desc *desc) 531handle_edge_irq(unsigned int irq, struct irq_desc *desc)
532{ 532{
533 spin_lock(&desc->lock); 533 raw_spin_lock(&desc->lock);
534 534
535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
536 536
@@ -576,21 +576,21 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
576 } 576 }
577 577
578 desc->status &= ~IRQ_PENDING; 578 desc->status &= ~IRQ_PENDING;
579 spin_unlock(&desc->lock); 579 raw_spin_unlock(&desc->lock);
580 action_ret = handle_IRQ_event(irq, action); 580 action_ret = handle_IRQ_event(irq, action);
581 if (!noirqdebug) 581 if (!noirqdebug)
582 note_interrupt(irq, desc, action_ret); 582 note_interrupt(irq, desc, action_ret);
583 spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
584 584
585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
586 586
587 desc->status &= ~IRQ_INPROGRESS; 587 desc->status &= ~IRQ_INPROGRESS;
588out_unlock: 588out_unlock:
589 spin_unlock(&desc->lock); 589 raw_spin_unlock(&desc->lock);
590} 590}
591 591
592/** 592/**
593 * handle_percpu_IRQ - Per CPU local irq handler 593 * handle_percpu_irq - Per CPU local irq handler
594 * @irq: the interrupt number 594 * @irq: the interrupt number
595 * @desc: the interrupt description structure for this irq 595 * @desc: the interrupt description structure for this irq
596 * 596 *
@@ -643,7 +643,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
643 } 643 }
644 644
645 chip_bus_lock(irq, desc); 645 chip_bus_lock(irq, desc);
646 spin_lock_irqsave(&desc->lock, flags); 646 raw_spin_lock_irqsave(&desc->lock, flags);
647 647
648 /* Uninstall? */ 648 /* Uninstall? */
649 if (handle == handle_bad_irq) { 649 if (handle == handle_bad_irq) {
@@ -661,7 +661,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
661 desc->depth = 0; 661 desc->depth = 0;
662 desc->chip->startup(irq); 662 desc->chip->startup(irq);
663 } 663 }
664 spin_unlock_irqrestore(&desc->lock, flags); 664 raw_spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc); 665 chip_bus_sync_unlock(irq, desc);
666} 666}
667EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -692,9 +692,9 @@ void __init set_irq_noprobe(unsigned int irq)
692 return; 692 return;
693 } 693 }
694 694
695 spin_lock_irqsave(&desc->lock, flags); 695 raw_spin_lock_irqsave(&desc->lock, flags);
696 desc->status |= IRQ_NOPROBE; 696 desc->status |= IRQ_NOPROBE;
697 spin_unlock_irqrestore(&desc->lock, flags); 697 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 698}
699 699
700void __init set_irq_probe(unsigned int irq) 700void __init set_irq_probe(unsigned int irq)
@@ -707,7 +707,7 @@ void __init set_irq_probe(unsigned int irq)
707 return; 707 return;
708 } 708 }
709 709
710 spin_lock_irqsave(&desc->lock, flags); 710 raw_spin_lock_irqsave(&desc->lock, flags);
711 desc->status &= ~IRQ_NOPROBE; 711 desc->status &= ~IRQ_NOPROBE;
712 spin_unlock_irqrestore(&desc->lock, flags); 712 raw_spin_unlock_irqrestore(&desc->lock, flags);
713} 713}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a81cf80554db..814940e7f485 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/random.h> 17#include <linux/random.h>
@@ -79,7 +80,7 @@ static struct irq_desc irq_desc_init = {
79 .chip = &no_irq_chip, 80 .chip = &no_irq_chip,
80 .handle_irq = handle_bad_irq, 81 .handle_irq = handle_bad_irq,
81 .depth = 1, 82 .depth = 1,
82 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
83}; 84};
84 85
85void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) 86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
@@ -107,7 +108,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
107{ 108{
108 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
109 110
110 spin_lock_init(&desc->lock); 111 raw_spin_lock_init(&desc->lock);
111 desc->irq = irq; 112 desc->irq = irq;
112#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
113 desc->node = node; 114 desc->node = node;
@@ -129,7 +130,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
129/* 130/*
130 * Protect the sparse_irqs: 131 * Protect the sparse_irqs:
131 */ 132 */
132DEFINE_SPINLOCK(sparse_irq_lock); 133DEFINE_RAW_SPINLOCK(sparse_irq_lock);
133 134
134struct irq_desc **irq_desc_ptrs __read_mostly; 135struct irq_desc **irq_desc_ptrs __read_mostly;
135 136
@@ -140,7 +141,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
140 .chip = &no_irq_chip, 141 .chip = &no_irq_chip,
141 .handle_irq = handle_bad_irq, 142 .handle_irq = handle_bad_irq,
142 .depth = 1, 143 .depth = 1,
143 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 144 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
144 } 145 }
145}; 146};
146 147
@@ -211,7 +212,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
211 if (desc) 212 if (desc)
212 return desc; 213 return desc;
213 214
214 spin_lock_irqsave(&sparse_irq_lock, flags); 215 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
215 216
216 /* We have to check it to avoid races with another CPU */ 217 /* We have to check it to avoid races with another CPU */
217 desc = irq_desc_ptrs[irq]; 218 desc = irq_desc_ptrs[irq];
@@ -233,7 +234,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
233 irq_desc_ptrs[irq] = desc; 234 irq_desc_ptrs[irq] = desc;
234 235
235out_unlock: 236out_unlock:
236 spin_unlock_irqrestore(&sparse_irq_lock, flags); 237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
237 238
238 return desc; 239 return desc;
239} 240}
@@ -246,7 +247,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
246 .chip = &no_irq_chip, 247 .chip = &no_irq_chip,
247 .handle_irq = handle_bad_irq, 248 .handle_irq = handle_bad_irq,
248 .depth = 1, 249 .depth = 1,
249 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 250 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
250 } 251 }
251}; 252};
252 253
@@ -472,7 +473,7 @@ unsigned int __do_IRQ(unsigned int irq)
472 return 1; 473 return 1;
473 } 474 }
474 475
475 spin_lock(&desc->lock); 476 raw_spin_lock(&desc->lock);
476 if (desc->chip->ack) 477 if (desc->chip->ack)
477 desc->chip->ack(irq); 478 desc->chip->ack(irq);
478 /* 479 /*
@@ -516,13 +517,13 @@ unsigned int __do_IRQ(unsigned int irq)
516 for (;;) { 517 for (;;) {
517 irqreturn_t action_ret; 518 irqreturn_t action_ret;
518 519
519 spin_unlock(&desc->lock); 520 raw_spin_unlock(&desc->lock);
520 521
521 action_ret = handle_IRQ_event(irq, action); 522 action_ret = handle_IRQ_event(irq, action);
522 if (!noirqdebug) 523 if (!noirqdebug)
523 note_interrupt(irq, desc, action_ret); 524 note_interrupt(irq, desc, action_ret);
524 525
525 spin_lock(&desc->lock); 526 raw_spin_lock(&desc->lock);
526 if (likely(!(desc->status & IRQ_PENDING))) 527 if (likely(!(desc->status & IRQ_PENDING)))
527 break; 528 break;
528 desc->status &= ~IRQ_PENDING; 529 desc->status &= ~IRQ_PENDING;
@@ -535,7 +536,7 @@ out:
535 * disabled while the handler was running. 536 * disabled while the handler was running.
536 */ 537 */
537 desc->chip->end(irq); 538 desc->chip->end(irq);
538 spin_unlock(&desc->lock); 539 raw_spin_unlock(&desc->lock);
539 540
540 return 1; 541 return 1;
541} 542}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 1b5d742c6a77..b2821f070a3d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24/* irq_desc_ptrs allocated at boot time */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24d..eb6078ca60c7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq)
46 cpu_relax(); 46 cpu_relax();
47 47
48 /* Ok, that indicated we're done: double-check carefully. */ 48 /* Ok, that indicated we're done: double-check carefully. */
49 spin_lock_irqsave(&desc->lock, flags); 49 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 50 status = desc->status;
51 spin_unlock_irqrestore(&desc->lock, flags); 51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 52
53 /* Oops, that failed? */ 53 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 54 } while (status & IRQ_INPROGRESS);
@@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
114 if (!desc->chip->set_affinity) 114 if (!desc->chip->set_affinity)
115 return -EINVAL; 115 return -EINVAL;
116 116
117 spin_lock_irqsave(&desc->lock, flags); 117 raw_spin_lock_irqsave(&desc->lock, flags);
118 118
119#ifdef CONFIG_GENERIC_PENDING_IRQ 119#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
@@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
134 } 134 }
135#endif 135#endif
136 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
137 spin_unlock_irqrestore(&desc->lock, flags); 137 raw_spin_unlock_irqrestore(&desc->lock, flags);
138 return 0; 138 return 0;
139} 139}
140 140
@@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq)
181 unsigned long flags; 181 unsigned long flags;
182 int ret; 182 int ret;
183 183
184 spin_lock_irqsave(&desc->lock, flags); 184 raw_spin_lock_irqsave(&desc->lock, flags);
185 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
186 if (!ret) 186 if (!ret)
187 irq_set_thread_affinity(desc); 187 irq_set_thread_affinity(desc);
188 spin_unlock_irqrestore(&desc->lock, flags); 188 raw_spin_unlock_irqrestore(&desc->lock, flags);
189 189
190 return ret; 190 return ret;
191} 191}
@@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc); 233 chip_bus_lock(irq, desc);
234 spin_lock_irqsave(&desc->lock, flags); 234 raw_spin_lock_irqsave(&desc->lock, flags);
235 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
236 spin_unlock_irqrestore(&desc->lock, flags); 236 raw_spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc); 237 chip_bus_sync_unlock(irq, desc);
238} 238}
239EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
@@ -308,9 +308,9 @@ void enable_irq(unsigned int irq)
308 return; 308 return;
309 309
310 chip_bus_lock(irq, desc); 310 chip_bus_lock(irq, desc);
311 spin_lock_irqsave(&desc->lock, flags); 311 raw_spin_lock_irqsave(&desc->lock, flags);
312 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
313 spin_unlock_irqrestore(&desc->lock, flags); 313 raw_spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc); 314 chip_bus_sync_unlock(irq, desc);
315} 315}
316EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
@@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
347 /* wakeup-capable irqs can be shared between drivers that 347 /* wakeup-capable irqs can be shared between drivers that
348 * don't need to have the same sleep mode behaviors. 348 * don't need to have the same sleep mode behaviors.
349 */ 349 */
350 spin_lock_irqsave(&desc->lock, flags); 350 raw_spin_lock_irqsave(&desc->lock, flags);
351 if (on) { 351 if (on) {
352 if (desc->wake_depth++ == 0) { 352 if (desc->wake_depth++ == 0) {
353 ret = set_irq_wake_real(irq, on); 353 ret = set_irq_wake_real(irq, on);
@@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
368 } 368 }
369 } 369 }
370 370
371 spin_unlock_irqrestore(&desc->lock, flags); 371 raw_spin_unlock_irqrestore(&desc->lock, flags);
372 return ret; 372 return ret;
373} 373}
374EXPORT_SYMBOL(set_irq_wake); 374EXPORT_SYMBOL(set_irq_wake);
@@ -484,12 +484,12 @@ static int irq_wait_for_interrupt(struct irqaction *action)
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 485{
486 chip_bus_lock(irq, desc); 486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock); 487 raw_spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 490 desc->chip->unmask(irq);
491 } 491 }
492 spin_unlock_irq(&desc->lock); 492 raw_spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc); 493 chip_bus_sync_unlock(irq, desc);
494} 494}
495 495
@@ -514,9 +514,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
514 return; 514 return;
515 } 515 }
516 516
517 spin_lock_irq(&desc->lock); 517 raw_spin_lock_irq(&desc->lock);
518 cpumask_copy(mask, desc->affinity); 518 cpumask_copy(mask, desc->affinity);
519 spin_unlock_irq(&desc->lock); 519 raw_spin_unlock_irq(&desc->lock);
520 520
521 set_cpus_allowed_ptr(current, mask); 521 set_cpus_allowed_ptr(current, mask);
522 free_cpumask_var(mask); 522 free_cpumask_var(mask);
@@ -545,7 +545,7 @@ static int irq_thread(void *data)
545 545
546 atomic_inc(&desc->threads_active); 546 atomic_inc(&desc->threads_active);
547 547
548 spin_lock_irq(&desc->lock); 548 raw_spin_lock_irq(&desc->lock);
549 if (unlikely(desc->status & IRQ_DISABLED)) { 549 if (unlikely(desc->status & IRQ_DISABLED)) {
550 /* 550 /*
551 * CHECKME: We might need a dedicated 551 * CHECKME: We might need a dedicated
@@ -555,9 +555,9 @@ static int irq_thread(void *data)
555 * retriggers the interrupt itself --- tglx 555 * retriggers the interrupt itself --- tglx
556 */ 556 */
557 desc->status |= IRQ_PENDING; 557 desc->status |= IRQ_PENDING;
558 spin_unlock_irq(&desc->lock); 558 raw_spin_unlock_irq(&desc->lock);
559 } else { 559 } else {
560 spin_unlock_irq(&desc->lock); 560 raw_spin_unlock_irq(&desc->lock);
561 561
562 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563 563
@@ -679,7 +679,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
679 /* 679 /*
680 * The following block of code has to be executed atomically 680 * The following block of code has to be executed atomically
681 */ 681 */
682 spin_lock_irqsave(&desc->lock, flags); 682 raw_spin_lock_irqsave(&desc->lock, flags);
683 old_ptr = &desc->action; 683 old_ptr = &desc->action;
684 old = *old_ptr; 684 old = *old_ptr;
685 if (old) { 685 if (old) {
@@ -775,7 +775,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
775 __enable_irq(desc, irq, false); 775 __enable_irq(desc, irq, false);
776 } 776 }
777 777
778 spin_unlock_irqrestore(&desc->lock, flags); 778 raw_spin_unlock_irqrestore(&desc->lock, flags);
779 779
780 /* 780 /*
781 * Strictly no need to wake it up, but hung_task complains 781 * Strictly no need to wake it up, but hung_task complains
@@ -802,7 +802,7 @@ mismatch:
802 ret = -EBUSY; 802 ret = -EBUSY;
803 803
804out_thread: 804out_thread:
805 spin_unlock_irqrestore(&desc->lock, flags); 805 raw_spin_unlock_irqrestore(&desc->lock, flags);
806 if (new->thread) { 806 if (new->thread) {
807 struct task_struct *t = new->thread; 807 struct task_struct *t = new->thread;
808 808
@@ -844,7 +844,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
844 if (!desc) 844 if (!desc)
845 return NULL; 845 return NULL;
846 846
847 spin_lock_irqsave(&desc->lock, flags); 847 raw_spin_lock_irqsave(&desc->lock, flags);
848 848
849 /* 849 /*
850 * There can be multiple actions per IRQ descriptor, find the right 850 * There can be multiple actions per IRQ descriptor, find the right
@@ -856,7 +856,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
856 856
857 if (!action) { 857 if (!action) {
858 WARN(1, "Trying to free already-free IRQ %d\n", irq); 858 WARN(1, "Trying to free already-free IRQ %d\n", irq);
859 spin_unlock_irqrestore(&desc->lock, flags); 859 raw_spin_unlock_irqrestore(&desc->lock, flags);
860 860
861 return NULL; 861 return NULL;
862 } 862 }
@@ -884,7 +884,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 884 desc->chip->disable(irq);
885 } 885 }
886 886
887 spin_unlock_irqrestore(&desc->lock, flags); 887 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 888
889 unregister_handler_proc(irq, action); 889 unregister_handler_proc(irq, action);
890 890
@@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1067 kfree(action); 1067 kfree(action);
1068 1068
1069#ifdef CONFIG_DEBUG_SHIRQ 1069#ifdef CONFIG_DEBUG_SHIRQ
1070 if (irqflags & IRQF_SHARED) { 1070 if (!retval && (irqflags & IRQF_SHARED)) {
1071 /* 1071 /*
1072 * It's a shared IRQ -- the driver ought to be prepared for it 1072 * It's a shared IRQ -- the driver ought to be prepared for it
1073 * to happen immediately, so let's make sure.... 1073 * to happen immediately, so let's make sure....
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index fcb6c96f2627..241962280836 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -27,7 +27,7 @@ void move_masked_irq(int irq)
27 if (!desc->chip->set_affinity) 27 if (!desc->chip->set_affinity)
28 return; 28 return;
29 29
30 assert_spin_locked(&desc->lock); 30 assert_raw_spin_locked(&desc->lock);
31 31
32 /* 32 /*
33 * If there was a valid mask to work with, please 33 * If there was a valid mask to work with, please
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 3fd30197da2e..26bac9d8f860 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 raw_spin_lock_init(&desc->lock);
46 desc->node = node; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
@@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
67 67
68 irq = old_desc->irq; 68 irq = old_desc->irq;
69 69
70 spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_desc_ptrs[irq];
@@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 irq_desc_ptrs[irq] = desc;
94 spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
97 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
@@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
100 return desc; 100 return desc;
101 101
102out_unlock: 102out_unlock:
103 spin_unlock_irqrestore(&sparse_irq_lock, flags); 103 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
104 104
105 return desc; 105 return desc;
106} 106}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index a0bb09e79867..0d4005d85b03 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -28,9 +28,9 @@ void suspend_device_irqs(void)
28 for_each_irq_desc(irq, desc) { 28 for_each_irq_desc(irq, desc) {
29 unsigned long flags; 29 unsigned long flags;
30 30
31 spin_lock_irqsave(&desc->lock, flags); 31 raw_spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true); 32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags); 33 raw_spin_unlock_irqrestore(&desc->lock, flags);
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
@@ -56,9 +56,9 @@ void resume_device_irqs(void)
56 if (!(desc->status & IRQ_SUSPENDED)) 56 if (!(desc->status & IRQ_SUSPENDED))
57 continue; 57 continue;
58 58
59 spin_lock_irqsave(&desc->lock, flags); 59 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags); 61 raw_spin_unlock_irqrestore(&desc->lock, flags);
62 } 62 }
63} 63}
64EXPORT_SYMBOL_GPL(resume_device_irqs); 64EXPORT_SYMBOL_GPL(resume_device_irqs);
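
The hunks above in migration.c, numa_migrate.c and pm.c are the same mechanical conversion: irq_desc->lock becomes a raw_spinlock_t, so every caller moves from the spin_lock_*/assert_spin_locked family to the raw_spin_lock_*/assert_raw_spin_locked family. A minimal sketch of the resulting pattern (not taken from this diff; demo_desc is an invented stand-in for struct irq_desc):

#include <linux/spinlock.h>

/* Sketch only: demo_desc stands in for struct irq_desc. */
struct demo_desc {
	raw_spinlock_t		lock;	/* was spinlock_t before the conversion */
	unsigned int		status;
};

static void demo_set_status(struct demo_desc *d, unsigned int bits)
{
	unsigned long flags;

	/* raw_* locks stay true spinning locks even in configurations where
	 * spinlock_t may become a sleeping lock, which is why the low-level
	 * irq paths use them */
	raw_spin_lock_irqsave(&d->lock, flags);
	d->status |= bits;
	raw_spin_unlock_irqrestore(&d->lock, flags);
}
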
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 692363dd591f..6f50eccc79c0 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -136,7 +136,7 @@ out:
136 136
137static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
138{ 138{
139 return single_open(file, default_affinity_show, NULL); 139 return single_open(file, default_affinity_show, PDE(inode)->data);
140} 140}
141 141
142static const struct file_operations default_affinity_proc_fops = { 142static const struct file_operations default_affinity_proc_fops = {
@@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = {
148}; 148};
149#endif 149#endif
150 150
151static int irq_spurious_read(char *page, char **start, off_t off, 151static int irq_spurious_proc_show(struct seq_file *m, void *v)
152 int count, int *eof, void *data)
153{ 152{
154 struct irq_desc *desc = irq_to_desc((long) data); 153 struct irq_desc *desc = irq_to_desc((long) m->private);
155 return sprintf(page, "count %u\n" 154
156 "unhandled %u\n" 155 seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
157 "last_unhandled %u ms\n", 156 desc->irq_count, desc->irqs_unhandled,
158 desc->irq_count, 157 jiffies_to_msecs(desc->last_unhandled));
159 desc->irqs_unhandled, 158 return 0;
160 jiffies_to_msecs(desc->last_unhandled)); 159}
160
161static int irq_spurious_proc_open(struct inode *inode, struct file *file)
162{
163 return single_open(file, irq_spurious_proc_show, NULL);
161} 164}
162 165
166static const struct file_operations irq_spurious_proc_fops = {
167 .open = irq_spurious_proc_open,
168 .read = seq_read,
169 .llseek = seq_lseek,
170 .release = single_release,
171};
172
163#define MAX_NAMELEN 128 173#define MAX_NAMELEN 128
164 174
165static int name_unique(unsigned int irq, struct irqaction *new_action) 175static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -169,7 +179,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
169 unsigned long flags; 179 unsigned long flags;
170 int ret = 1; 180 int ret = 1;
171 181
172 spin_lock_irqsave(&desc->lock, flags); 182 raw_spin_lock_irqsave(&desc->lock, flags);
173 for (action = desc->action ; action; action = action->next) { 183 for (action = desc->action ; action; action = action->next) {
174 if ((action != new_action) && action->name && 184 if ((action != new_action) && action->name &&
175 !strcmp(new_action->name, action->name)) { 185 !strcmp(new_action->name, action->name)) {
@@ -177,7 +187,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
177 break; 187 break;
178 } 188 }
179 } 189 }
180 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
181 return ret; 191 return ret;
182} 192}
183 193
@@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
204void register_irq_proc(unsigned int irq, struct irq_desc *desc) 214void register_irq_proc(unsigned int irq, struct irq_desc *desc)
205{ 215{
206 char name [MAX_NAMELEN]; 216 char name [MAX_NAMELEN];
207 struct proc_dir_entry *entry;
208 217
209 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 218 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
210 return; 219 return;
@@ -214,6 +223,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
214 223
215 /* create /proc/irq/1234 */ 224 /* create /proc/irq/1234 */
216 desc->dir = proc_mkdir(name, root_irq_dir); 225 desc->dir = proc_mkdir(name, root_irq_dir);
226 if (!desc->dir)
227 return;
217 228
218#ifdef CONFIG_SMP 229#ifdef CONFIG_SMP
219 /* create /proc/irq/<irq>/smp_affinity */ 230 /* create /proc/irq/<irq>/smp_affinity */
@@ -221,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
221 &irq_affinity_proc_fops, (void *)(long)irq); 232 &irq_affinity_proc_fops, (void *)(long)irq);
222#endif 233#endif
223 234
224 entry = create_proc_entry("spurious", 0444, desc->dir); 235 proc_create_data("spurious", 0444, desc->dir,
225 if (entry) { 236 &irq_spurious_proc_fops, (void *)(long)irq);
226 entry->data = (void *)(long)irq;
227 entry->read_proc = irq_spurious_read;
228 }
229} 237}
230 238
231#undef MAX_NAMELEN 239#undef MAX_NAMELEN
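
The proc.c hunk replaces the old create_proc_entry()/read_proc interface for /proc/irq/*/spurious with a seq_file backed by single_open() and registered through proc_create_data(). A minimal sketch of that pattern follows; the demo_* names are invented, and note that whatever pointer is handed to single_open() (or stored by proc_create_data()) is what later shows up as m->private or PDE(inode)->data.

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	long nr = (long)m->private;	/* the pointer passed to single_open() */

	seq_printf(m, "value %ld\n", nr);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, PDE(inode)->data);
}

static const struct file_operations demo_proc_fops = {
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/* registration against some existing 'parent' directory, data passed per file:
 *	proc_create_data("demo", 0444, parent, &demo_proc_fops, (void *)(long)42);
 */
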
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760fe..89fb90ae534f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
28 struct irqaction *action; 28 struct irqaction *action;
29 int ok = 0, work = 0; 29 int ok = 0, work = 0;
30 30
31 spin_lock(&desc->lock); 31 raw_spin_lock(&desc->lock);
32 /* Already running on another processor */ 32 /* Already running on another processor */
33 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
34 /* 34 /*
@@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc)
37 */ 37 */
38 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
39 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
40 spin_unlock(&desc->lock); 40 raw_spin_unlock(&desc->lock);
41 return ok; 41 return ok;
42 } 42 }
43 /* Honour the normal IRQ locking */ 43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS; 44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action; 45 action = desc->action;
46 spin_unlock(&desc->lock); 46 raw_spin_unlock(&desc->lock);
47 47
48 while (action) { 48 while (action) {
49 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
@@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
56 } 56 }
57 local_irq_disable(); 57 local_irq_disable();
58 /* Now clean up the flags */ 58 /* Now clean up the flags */
59 spin_lock(&desc->lock); 59 raw_spin_lock(&desc->lock);
60 action = desc->action; 60 action = desc->action;
61 61
62 /* 62 /*
@@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc)
68 * Perform real IRQ processing for the IRQ we deferred 68 * Perform real IRQ processing for the IRQ we deferred
69 */ 69 */
70 work = 1; 70 work = 1;
71 spin_unlock(&desc->lock); 71 raw_spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action); 72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock); 73 raw_spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING; 74 desc->status &= ~IRQ_PENDING;
75 } 75 }
76 desc->status &= ~IRQ_INPROGRESS; 76 desc->status &= ~IRQ_INPROGRESS;
@@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
80 */ 80 */
81 if (work && desc->chip && desc->chip->end) 81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq); 82 desc->chip->end(irq);
83 spin_unlock(&desc->lock); 83 raw_spin_unlock(&desc->lock);
84 84
85 return ok; 85 return ok;
86} 86}
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_all_shared_irqs(void) 107static void poll_spurious_irqs(unsigned long dummy)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -121,25 +121,15 @@ static void poll_all_shared_irqs(void)
121 if (!(status & IRQ_SPURIOUS_DISABLED)) 121 if (!(status & IRQ_SPURIOUS_DISABLED))
122 continue; 122 continue;
123 123
124 local_irq_disable();
124 try_one_irq(i, desc); 125 try_one_irq(i, desc);
126 local_irq_enable();
125 } 127 }
126}
127
128static void poll_spurious_irqs(unsigned long dummy)
129{
130 poll_all_shared_irqs();
131 128
132 mod_timer(&poll_spurious_irq_timer, 129 mod_timer(&poll_spurious_irq_timer,
133 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 130 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
134} 131}
135 132
136#ifdef CONFIG_DEBUG_SHIRQ
137void debug_poll_all_shared_irqs(void)
138{
139 poll_all_shared_irqs();
140}
141#endif
142
143/* 133/*
144 * If 99,900 of the previous 100,000 interrupts have not been handled 134 * If 99,900 of the previous 100,000 interrupts have not been handled
145 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 135 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -230,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
230 /* 220 /*
231 * If we are seeing only the odd spurious IRQ caused by 221 * If we are seeing only the odd spurious IRQ caused by
232 * bus asynchronicity then don't eventually trigger an error, 222 * bus asynchronicity then don't eventually trigger an error,
233 * otherwise the couter becomes a doomsday timer for otherwise 223 * otherwise the counter becomes a doomsday timer for otherwise
234 * working systems 224 * working systems
235 */ 225 */
236 if (time_after(jiffies, desc->last_unhandled + HZ/10)) 226 if (time_after(jiffies, desc->last_unhandled + HZ/10))
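
The spurious.c hunk folds poll_all_shared_irqs() into the timer callback itself, disables local interrupts only around each try_one_irq() call, and drops the debug_poll_all_shared_irqs() wrapper from this file. The callback stays alive by re-arming itself, which is the usual self-rearming timer pattern; a sketch with invented demo_* names, using the timer API of this tree:

#include <linux/timer.h>
#include <linux/jiffies.h>

#define DEMO_POLL_INTERVAL	(HZ / 10)	/* arbitrary example interval */

static void demo_poll(unsigned long dummy);
static DEFINE_TIMER(demo_poll_timer, demo_poll, 0, 0);

static void demo_poll(unsigned long dummy)
{
	/* ... periodic work goes here ... */

	/* a timer fires once per mod_timer(), so the callback re-arms itself */
	mod_timer(&demo_poll_timer, jiffies + DEMO_POLL_INTERVAL);
}
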
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede528..d802883153da 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
146{ 146{
147 cputime_t cval, nval, cinterval, ninterval; 147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval; 148 s64 ns_ninterval, ns_nval;
149 u32 error, incr_error;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id]; 150 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150 151
151 nval = timeval_to_cputime(&value->it_value); 152 nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
153 ninterval = timeval_to_cputime(&value->it_interval); 154 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval); 155 ns_ninterval = timeval_to_ns(&value->it_interval);
155 156
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); 157 error = cputime_sub_ns(nval, ns_nval);
157 it->error = cputime_sub_ns(nval, ns_nval); 158 incr_error = cputime_sub_ns(ninterval, ns_ninterval);
158 159
159 spin_lock_irq(&tsk->sighand->siglock); 160 spin_lock_irq(&tsk->sighand->siglock);
160 161
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
168 } 169 }
169 it->expires = nval; 170 it->expires = nval;
170 it->incr = ninterval; 171 it->incr = ninterval;
172 it->error = error;
173 it->incr_error = incr_error;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ? 174 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval); 175 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173 176
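
The itimer.c hunk computes the rounding-error values before taking siglock and publishes them, together with expires and incr, only once the lock is held, so readers never observe a half-updated itimer. A small sketch of that compute-first, publish-under-the-lock shape (demo_state and the shift are placeholders for the real cputime conversion):

#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_state {
	spinlock_t	lock;
	u32		error;
	u32		incr_error;
};

static void demo_update(struct demo_state *s, u64 val_ns, u64 interval_ns)
{
	/* do the conversions outside the lock ... */
	u32 err = (u32)(val_ns >> 10);		/* placeholder for cputime_sub_ns() */
	u32 incr_err = (u32)(interval_ns >> 10);

	/* ... and publish every derived field in one critical section */
	spin_lock_irq(&s->lock);
	s->error = err;
	s->incr_error = incr_err;
	spin_unlock_irq(&s->lock);
}
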
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..8e5288a8a355 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
181 } 181 }
182 return module_kallsyms_lookup_name(name); 182 return module_kallsyms_lookup_name(name);
183} 183}
184EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
184 185
185int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, 186int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
186 unsigned long), 187 unsigned long),
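
kallsyms.c only gains an export: kallsyms_lookup_name() becomes available to GPL modules. A purely illustrative sketch of what that enables (module and function names are invented; the lookup returns 0 when the symbol is unknown):

#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int __init demo_init(void)
{
	unsigned long addr = kallsyms_lookup_name("jiffies");

	pr_info("demo: jiffies resolved to %p\n", (void *)addr);
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
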
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f336e2107f98..433e9fcc1fc5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -31,6 +31,7 @@
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h>
34 35
35#include <asm/page.h> 36#include <asm/page.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -1082,6 +1083,64 @@ void crash_kexec(struct pt_regs *regs)
1082 } 1083 }
1083} 1084}
1084 1085
1086size_t crash_get_memory_size(void)
1087{
1088 size_t size;
1089 mutex_lock(&kexec_mutex);
1090 size = crashk_res.end - crashk_res.start + 1;
1091 mutex_unlock(&kexec_mutex);
1092 return size;
1093}
1094
1095static void free_reserved_phys_range(unsigned long begin, unsigned long end)
1096{
1097 unsigned long addr;
1098
1099 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1100 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1101 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1102 free_page((unsigned long)__va(addr));
1103 totalram_pages++;
1104 }
1105}
1106
1107int crash_shrink_memory(unsigned long new_size)
1108{
1109 int ret = 0;
1110 unsigned long start, end;
1111
1112 mutex_lock(&kexec_mutex);
1113
1114 if (kexec_crash_image) {
1115 ret = -ENOENT;
1116 goto unlock;
1117 }
1118 start = crashk_res.start;
1119 end = crashk_res.end;
1120
1121 if (new_size >= end - start + 1) {
1122 ret = -EINVAL;
1123 if (new_size == end - start + 1)
1124 ret = 0;
1125 goto unlock;
1126 }
1127
1128 start = roundup(start, PAGE_SIZE);
1129 end = roundup(start + new_size, PAGE_SIZE);
1130
1131 free_reserved_phys_range(end, crashk_res.end);
1132
1133 if (start == end) {
1134 crashk_res.end = end;
1135 release_resource(&crashk_res);
1136 } else
1137 crashk_res.end = end - 1;
1138
1139unlock:
1140 mutex_unlock(&kexec_mutex);
1141 return ret;
1142}
1143
1085static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1144static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1086 size_t data_len) 1145 size_t data_len)
1087{ 1146{
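
The new free_reserved_phys_range() in the kexec.c hunk uses the standard idiom for handing reserved boot-time pages back to the page allocator, one page at a time; crash_shrink_memory() then trims crashk_res to the page-rounded size and releases the resource entirely when nothing is left. A sketch of the per-page idiom for a single page-aligned physical address (the caller must keep phys inside the freed range):

#include <linux/mm.h>
#include <asm/page.h>

static void demo_free_reserved_page(unsigned long phys)
{
	struct page *page = pfn_to_page(phys >> PAGE_SHIFT);

	ClearPageReserved(page);		/* eligible for the allocator again */
	init_page_count(page);			/* reset refcount so free_page() frees it */
	free_page((unsigned long)__va(phys));	/* release the page */
	totalram_pages++;			/* account it as ordinary RAM */
}
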
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..2eb517e23514 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread; 129struct task_struct *kgdb_contthread;
130 130
131int kgdb_single_step; 131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
132 133
133/* Our I/O buffers. */ 134/* Our I/O buffers. */
134static char remcom_in_buffer[BUFMAX]; 135static char remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
541 */ 542 */
542 if (tid == 0 || tid == -1) 543 if (tid == 0 || tid == -1)
543 tid = -atomic_read(&kgdb_active) - 2; 544 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) { 545 if (tid < -1 && tid > -NR_CPUS - 2) {
545 if (kgdb_info[-tid - 2].task) 546 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task; 547 return kgdb_info[-tid - 2].task;
547 else 548 else
548 return idle_task(-tid - 2); 549 return idle_task(-tid - 2);
549 } 550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
550 556
551 /* 557 /*
552 * find_task_by_pid_ns() does not take the tasklist lock anymore 558 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -619,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
619static int kgdb_activate_sw_breakpoints(void) 625static int kgdb_activate_sw_breakpoints(void)
620{ 626{
621 unsigned long addr; 627 unsigned long addr;
622 int error = 0; 628 int error;
629 int ret = 0;
623 int i; 630 int i;
624 631
625 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 632 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void)
629 addr = kgdb_break[i].bpt_addr; 636 addr = kgdb_break[i].bpt_addr;
630 error = kgdb_arch_set_breakpoint(addr, 637 error = kgdb_arch_set_breakpoint(addr,
631 kgdb_break[i].saved_instr); 638 kgdb_break[i].saved_instr);
632 if (error) 639 if (error) {
633 return error; 640 ret = error;
641 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
642 continue;
643 }
634 644
635 kgdb_flush_swbreak_addr(addr); 645 kgdb_flush_swbreak_addr(addr);
636 kgdb_break[i].state = BP_ACTIVE; 646 kgdb_break[i].state = BP_ACTIVE;
637 } 647 }
638 return 0; 648 return ret;
639} 649}
640 650
641static int kgdb_set_sw_break(unsigned long addr) 651static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr)
682static int kgdb_deactivate_sw_breakpoints(void) 692static int kgdb_deactivate_sw_breakpoints(void)
683{ 693{
684 unsigned long addr; 694 unsigned long addr;
685 int error = 0; 695 int error;
696 int ret = 0;
686 int i; 697 int i;
687 698
688 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 699 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
691 addr = kgdb_break[i].bpt_addr; 702 addr = kgdb_break[i].bpt_addr;
692 error = kgdb_arch_remove_breakpoint(addr, 703 error = kgdb_arch_remove_breakpoint(addr,
693 kgdb_break[i].saved_instr); 704 kgdb_break[i].saved_instr);
694 if (error) 705 if (error) {
695 return error; 706 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
707 ret = error;
708 }
696 709
697 kgdb_flush_swbreak_addr(addr); 710 kgdb_flush_swbreak_addr(addr);
698 kgdb_break[i].state = BP_SET; 711 kgdb_break[i].state = BP_SET;
699 } 712 }
700 return 0; 713 return ret;
701} 714}
702 715
703static int kgdb_remove_sw_break(unsigned long addr) 716static int kgdb_remove_sw_break(unsigned long addr)
@@ -870,7 +883,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
870 883
871 /* 884 /*
872 * All threads that don't have debuggerinfo should be 885 * All threads that don't have debuggerinfo should be
873 * in __schedule() sleeping, since all other CPUs 886 * in schedule() sleeping, since all other CPUs
874 * are in kgdb_wait, and thus have debuggerinfo. 887 * are in kgdb_wait, and thus have debuggerinfo.
875 */ 888 */
876 if (local_debuggerinfo) { 889 if (local_debuggerinfo) {
@@ -1204,8 +1217,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1204 return 1; 1217 return 1;
1205 1218
1206 } else { 1219 } else {
1207 error_packet(remcom_out_buffer, -EINVAL); 1220 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1208 return 0; 1221 " and 15 (pass and disconnect)\n"
1222 "Executing a continue without signal passing\n", 0);
1223 remcom_in_buffer[0] = 'c';
1209 } 1224 }
1210 1225
1211 /* Indicate fall through */ 1226 /* Indicate fall through */
@@ -1395,6 +1410,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1395 struct kgdb_state kgdb_var; 1410 struct kgdb_state kgdb_var;
1396 struct kgdb_state *ks = &kgdb_var; 1411 struct kgdb_state *ks = &kgdb_var;
1397 unsigned long flags; 1412 unsigned long flags;
1413 int sstep_tries = 100;
1398 int error = 0; 1414 int error = 0;
1399 int i, cpu; 1415 int i, cpu;
1400 1416
@@ -1425,13 +1441,14 @@ acquirelock:
1425 cpu_relax(); 1441 cpu_relax();
1426 1442
1427 /* 1443 /*
1428 * Do not start the debugger connection on this CPU if the last 1444 * For single stepping, try to only enter on the processor
 1429 * instance of the exception handler wanted to come into the 1445 * that was single stepping. To guard against a deadlock, the

1430 * debugger on a different CPU via a single step 1446 * kernel will only try for the value of sstep_tries before
1447 * giving up and continuing on.
1431 */ 1448 */
1432 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && 1449 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1450 (kgdb_info[cpu].task &&
1434 1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1435 atomic_set(&kgdb_active, -1); 1452 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog(); 1453 touch_softlockup_watchdog();
1437 clocksource_touch_watchdog(); 1454 clocksource_touch_watchdog();
@@ -1524,6 +1541,13 @@ acquirelock:
1524 } 1541 }
1525 1542
1526kgdb_restore: 1543kgdb_restore:
1544 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1545 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1546 if (kgdb_info[sstep_cpu].task)
1547 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1548 else
1549 kgdb_sstep_pid = 0;
1550 }
1527 /* Free kgdb_active */ 1551 /* Free kgdb_active */
1528 atomic_set(&kgdb_active, -1); 1552 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog(); 1553 touch_softlockup_watchdog();
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 689d20f39305..25b103190364 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
81 static int kmod_loop_msg; 81 static int kmod_loop_msg;
82 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
87 va_start(args, fmt); 83 va_start(args, fmt);
88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 84 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
89 va_end(args); 85 va_end(args);
90 if (ret >= MODULE_NAME_LEN) 86 if (ret >= MODULE_NAME_LEN)
91 return -ENAMETOOLONG; 87 return -ENAMETOOLONG;
92 88
89 ret = security_kernel_module_request(module_name);
90 if (ret)
91 return ret;
92
93 /* If modprobe needs a service that is in a module, we get a recursive 93 /* If modprobe needs a service that is in a module, we get a recursive
94 * loop. Limit the number of running kmod threads to max_threads/2 or 94 * loop. Limit the number of running kmod threads to max_threads/2 or
95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 95 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
@@ -143,7 +143,6 @@ struct subprocess_info {
143static int ____call_usermodehelper(void *data) 143static int ____call_usermodehelper(void *data)
144{ 144{
145 struct subprocess_info *sub_info = data; 145 struct subprocess_info *sub_info = data;
146 enum umh_wait wait = sub_info->wait;
147 int retval; 146 int retval;
148 147
149 BUG_ON(atomic_read(&sub_info->cred->usage) != 1); 148 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
@@ -185,14 +184,10 @@ static int ____call_usermodehelper(void *data)
185 */ 184 */
186 set_user_nice(current, 0); 185 set_user_nice(current, 0);
187 186
188 if (wait == UMH_WAIT_EXEC)
189 complete(sub_info->complete);
190
191 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 187 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
192 188
193 /* Exec failed? */ 189 /* Exec failed? */
194 if (wait != UMH_WAIT_EXEC) 190 sub_info->retval = retval;
195 sub_info->retval = retval;
196 do_exit(0); 191 do_exit(0);
197} 192}
198 193
@@ -271,14 +266,16 @@ static void __call_usermodehelper(struct work_struct *work)
271 266
272 switch (wait) { 267 switch (wait) {
273 case UMH_NO_WAIT: 268 case UMH_NO_WAIT:
274 case UMH_WAIT_EXEC:
275 break; 269 break;
276 270
277 case UMH_WAIT_PROC: 271 case UMH_WAIT_PROC:
278 if (pid > 0) 272 if (pid > 0)
279 break; 273 break;
280 sub_info->retval = pid; 274 sub_info->retval = pid;
281 break; 275 /* FALLTHROUGH */
276
277 case UMH_WAIT_EXEC:
278 complete(sub_info->complete);
282 } 279 }
283} 280}
284 281
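
Two things change in kmod.c: security_kernel_module_request() is now passed the formatted module name, so it runs after vsnprintf(), and for UMH_WAIT_EXEC the completion is signalled from __call_usermodehelper() once the helper has been forked, rather than from the child just before kernel_execve(). From a caller's point of view the UMH_* modes keep their meaning; a hedged sketch of a typical call, where the helper path, argv and envp are examples only:

#include <linux/kmod.h>

static int demo_run_helper(void)
{
	char *argv[] = { "/sbin/demo-helper", "--once", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	/*
	 * UMH_NO_WAIT:   fire and forget
	 * UMH_WAIT_EXEC: return once the helper has been started
	 * UMH_WAIT_PROC: wait for the helper to exit and return its status
	 */
	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
}
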
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index cfadc1291d0b..e5342a344c43 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
90 */ 90 */
91static struct kprobe_blackpoint kprobe_blacklist[] = { 91static struct kprobe_blackpoint kprobe_blacklist[] = {
92 {"preempt_schedule",}, 92 {"preempt_schedule",},
93 {"native_get_debugreg",},
94 {"irq_entries_start",},
95 {"common_interrupt",},
93 {NULL} /* Terminator */ 96 {NULL} /* Terminator */
94}; 97};
95 98
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
673 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 676 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
674} 677}
675 678
679/* Check passed kprobe is valid and return kprobe in kprobe_table. */
680static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
681{
682 struct kprobe *old_p, *list_p;
683
684 old_p = get_kprobe(p->addr);
685 if (unlikely(!old_p))
686 return NULL;
687
688 if (p != old_p) {
689 list_for_each_entry_rcu(list_p, &old_p->list, list)
690 if (list_p == p)
691 /* kprobe p is a valid probe */
692 goto valid;
693 return NULL;
694 }
695valid:
696 return old_p;
697}
698
699/* Return error if the kprobe is being re-registered */
700static inline int check_kprobe_rereg(struct kprobe *p)
701{
702 int ret = 0;
703 struct kprobe *old_p;
704
705 mutex_lock(&kprobe_mutex);
706 old_p = __get_valid_kprobe(p);
707 if (old_p)
708 ret = -EINVAL;
709 mutex_unlock(&kprobe_mutex);
710 return ret;
711}
712
676int __kprobes register_kprobe(struct kprobe *p) 713int __kprobes register_kprobe(struct kprobe *p)
677{ 714{
678 int ret = 0; 715 int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
685 return -EINVAL; 722 return -EINVAL;
686 p->addr = addr; 723 p->addr = addr;
687 724
725 ret = check_kprobe_rereg(p);
726 if (ret)
727 return ret;
728
688 preempt_disable(); 729 preempt_disable();
689 if (!kernel_text_address((unsigned long) p->addr) || 730 if (!kernel_text_address((unsigned long) p->addr) ||
690 in_kprobes_functions((unsigned long) p->addr)) { 731 in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
754} 795}
755EXPORT_SYMBOL_GPL(register_kprobe); 796EXPORT_SYMBOL_GPL(register_kprobe);
756 797
757/* Check passed kprobe is valid and return kprobe in kprobe_table. */
758static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
759{
760 struct kprobe *old_p, *list_p;
761
762 old_p = get_kprobe(p->addr);
763 if (unlikely(!old_p))
764 return NULL;
765
766 if (p != old_p) {
767 list_for_each_entry_rcu(list_p, &old_p->list, list)
768 if (list_p == p)
769 /* kprobe p is a valid probe */
770 goto valid;
771 return NULL;
772 }
773valid:
774 return old_p;
775}
776
777/* 798/*
778 * Unregister a kprobe without a scheduler synchronization. 799 * Unregister a kprobe without a scheduler synchronization.
779 */ 800 */
@@ -1014,9 +1035,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1014 /* Pre-allocate memory for max kretprobe instances */ 1035 /* Pre-allocate memory for max kretprobe instances */
1015 if (rp->maxactive <= 0) { 1036 if (rp->maxactive <= 0) {
1016#ifdef CONFIG_PREEMPT 1037#ifdef CONFIG_PREEMPT
1017 rp->maxactive = max(10, 2 * NR_CPUS); 1038 rp->maxactive = max(10, 2 * num_possible_cpus());
1018#else 1039#else
1019 rp->maxactive = NR_CPUS; 1040 rp->maxactive = num_possible_cpus();
1020#endif 1041#endif
1021 } 1042 }
1022 spin_lock_init(&rp->lock); 1043 spin_lock_init(&rp->lock);
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1141 arch_remove_kprobe(p); 1162 arch_remove_kprobe(p);
1142} 1163}
1143 1164
1165void __kprobes dump_kprobe(struct kprobe *kp)
1166{
1167 printk(KERN_WARNING "Dumping kprobe:\n");
1168 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
1169 kp->symbol_name, kp->addr, kp->offset);
1170}
1171
1144/* Module notifier call back, checking kprobes on the module */ 1172/* Module notifier call back, checking kprobes on the module */
1145static int __kprobes kprobes_module_callback(struct notifier_block *nb, 1173static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1146 unsigned long val, void *data) 1174 unsigned long val, void *data)
@@ -1333,7 +1361,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
1333 return seq_open(filp, &kprobes_seq_ops); 1361 return seq_open(filp, &kprobes_seq_ops);
1334} 1362}
1335 1363
1336static struct file_operations debugfs_kprobes_operations = { 1364static const struct file_operations debugfs_kprobes_operations = {
1337 .open = kprobes_open, 1365 .open = kprobes_open,
1338 .read = seq_read, 1366 .read = seq_read,
1339 .llseek = seq_lseek, 1367 .llseek = seq_lseek,
@@ -1515,7 +1543,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1515 return count; 1543 return count;
1516} 1544}
1517 1545
1518static struct file_operations fops_kp = { 1546static const struct file_operations fops_kp = {
1519 .read = read_enabled_file_bool, 1547 .read = read_enabled_file_bool,
1520 .write = write_enabled_file_bool, 1548 .write = write_enabled_file_bool,
1521}; 1549};
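
Besides extending the blacklist and switching maxactive to num_possible_cpus(), the kprobes.c hunk moves __get_valid_kprobe() ahead of register_kprobe() so the new check_kprobe_rereg() can reject a kprobe object that is already registered. A sketch of the caller-visible effect; the probed symbol and handler are examples only:

#include <linux/kprobes.h>
#include <linux/module.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* do nothing, let the probed instruction execute */
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",	/* example target */
	.pre_handler	= demo_pre,
};

static int __init demo_init(void)
{
	/* first registration succeeds; registering the same object again
	 * without unregistering now returns -EINVAL from check_kprobe_rereg()
	 * instead of going through the normal registration path again */
	return register_kprobe(&demo_kp);
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
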
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 528dd78e7e7e..3feaf5a74514 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
100} 100}
101KERNEL_ATTR_RO(kexec_crash_loaded); 101KERNEL_ATTR_RO(kexec_crash_loaded);
102 102
103static ssize_t kexec_crash_size_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf)
105{
106 return sprintf(buf, "%zu\n", crash_get_memory_size());
107}
108static ssize_t kexec_crash_size_store(struct kobject *kobj,
109 struct kobj_attribute *attr,
110 const char *buf, size_t count)
111{
112 unsigned long cnt;
113 int ret;
114
115 if (strict_strtoul(buf, 0, &cnt))
116 return -EINVAL;
117
118 ret = crash_shrink_memory(cnt);
119 return ret < 0 ? ret : count;
120}
121KERNEL_ATTR_RW(kexec_crash_size);
122
103static ssize_t vmcoreinfo_show(struct kobject *kobj, 123static ssize_t vmcoreinfo_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf) 124 struct kobj_attribute *attr, char *buf)
105{ 125{
@@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = {
147#ifdef CONFIG_KEXEC 167#ifdef CONFIG_KEXEC
148 &kexec_loaded_attr.attr, 168 &kexec_loaded_attr.attr,
149 &kexec_crash_loaded_attr.attr, 169 &kexec_crash_loaded_attr.attr,
170 &kexec_crash_size_attr.attr,
150 &vmcoreinfo_attr.attr, 171 &vmcoreinfo_attr.attr,
151#endif 172#endif
152 NULL 173 NULL
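
ksysfs.c exposes the new kexec code as /sys/kernel/kexec_crash_size: reading reports crash_get_memory_size(), and writing a smaller value calls crash_shrink_memory() to return the unused part of the crash-kernel reservation. The attribute follows the ordinary kobj_attribute show/store pattern, sketched below with an invented demo attribute:

#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>

static unsigned long demo_value;

static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%lu\n", demo_value);
}

static ssize_t demo_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	unsigned long val;

	if (strict_strtoul(buf, 0, &val))	/* same parsing as above */
		return -EINVAL;
	demo_value = val;
	return count;
}

static struct kobj_attribute demo_attr = __ATTR(demo, 0644, demo_show, demo_store);

Adding &demo_attr.attr to a kobject's attribute list (as kernel_attrs[] does above for kexec_crash_size_attr) is what makes the file appear.
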
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5fe709982caa..ab7ae57773e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
150EXPORT_SYMBOL(kthread_create); 150EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @k: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @k to run on.
156 *
157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()).
160 */
161void kthread_bind(struct task_struct *k, unsigned int cpu)
162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1);
166 return;
167 }
168 set_task_cpu(k, cpu);
169 k->cpus_allowed = cpumask_of_cpu(cpu);
170 k->rt.nr_cpus_allowed = 1;
171 k->flags |= PF_THREAD_BOUND;
172}
173EXPORT_SYMBOL(kthread_bind);
174
175/**
176 * kthread_stop - stop a thread created by kthread_create(). 153 * kthread_stop - stop a thread created by kthread_create().
177 * @k: thread created by kthread_create(). 154 * @k: thread created by kthread_create().
178 * 155 *
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3815ac1d58b2..5feaddcdbe49 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
49#include "lockdep_internals.h" 49#include "lockdep_internals.h"
50 50
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/lockdep.h> 52#include <trace/events/lock.h>
53 53
54#ifdef CONFIG_PROVE_LOCKING 54#ifdef CONFIG_PROVE_LOCKING
55int prove_locking = 1; 55int prove_locking = 1;
@@ -73,11 +73,11 @@ module_param(lock_stat, int, 0644);
73 * to use a raw spinlock - we really dont want the spinlock 73 * to use a raw spinlock - we really dont want the spinlock
74 * code to recurse back into the lockdep code... 74 * code to recurse back into the lockdep code...
75 */ 75 */
76static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 76static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
77 77
78static int graph_lock(void) 78static int graph_lock(void)
79{ 79{
80 __raw_spin_lock(&lockdep_lock); 80 arch_spin_lock(&lockdep_lock);
81 /* 81 /*
82 * Make sure that if another CPU detected a bug while 82 * Make sure that if another CPU detected a bug while
83 * walking the graph we dont change it (while the other 83 * walking the graph we dont change it (while the other
@@ -85,7 +85,7 @@ static int graph_lock(void)
85 * dropped already) 85 * dropped already)
86 */ 86 */
87 if (!debug_locks) { 87 if (!debug_locks) {
88 __raw_spin_unlock(&lockdep_lock); 88 arch_spin_unlock(&lockdep_lock);
89 return 0; 89 return 0;
90 } 90 }
91 /* prevent any recursions within lockdep from causing deadlocks */ 91 /* prevent any recursions within lockdep from causing deadlocks */
@@ -95,11 +95,11 @@ static int graph_lock(void)
95 95
96static inline int graph_unlock(void) 96static inline int graph_unlock(void)
97{ 97{
98 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 98 if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
99 return DEBUG_LOCKS_WARN_ON(1); 99 return DEBUG_LOCKS_WARN_ON(1);
100 100
101 current->lockdep_recursion--; 101 current->lockdep_recursion--;
102 __raw_spin_unlock(&lockdep_lock); 102 arch_spin_unlock(&lockdep_lock);
103 return 0; 103 return 0;
104} 104}
105 105
@@ -111,7 +111,7 @@ static inline int debug_locks_off_graph_unlock(void)
111{ 111{
112 int ret = debug_locks_off(); 112 int ret = debug_locks_off();
113 113
114 __raw_spin_unlock(&lockdep_lock); 114 arch_spin_unlock(&lockdep_lock);
115 115
116 return ret; 116 return ret;
117} 117}
@@ -140,7 +140,13 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
140} 140}
141 141
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
144 cpu_lock_stats);
145
146static inline u64 lockstat_clock(void)
147{
148 return cpu_clock(smp_processor_id());
149}
144 150
145static int lock_point(unsigned long points[], unsigned long ip) 151static int lock_point(unsigned long points[], unsigned long ip)
146{ 152{
@@ -158,12 +164,12 @@ static int lock_point(unsigned long points[], unsigned long ip)
158 return i; 164 return i;
159} 165}
160 166
161static void lock_time_inc(struct lock_time *lt, s64 time) 167static void lock_time_inc(struct lock_time *lt, u64 time)
162{ 168{
163 if (time > lt->max) 169 if (time > lt->max)
164 lt->max = time; 170 lt->max = time;
165 171
166 if (time < lt->min || !lt->min) 172 if (time < lt->min || !lt->nr)
167 lt->min = time; 173 lt->min = time;
168 174
169 lt->total += time; 175 lt->total += time;
@@ -172,8 +178,15 @@ static void lock_time_inc(struct lock_time *lt, s64 time)
172 178
173static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) 179static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
174{ 180{
175 dst->min += src->min; 181 if (!src->nr)
176 dst->max += src->max; 182 return;
183
184 if (src->max > dst->max)
185 dst->max = src->max;
186
187 if (src->min < dst->min || !dst->nr)
188 dst->min = src->min;
189
177 dst->total += src->total; 190 dst->total += src->total;
178 dst->nr += src->nr; 191 dst->nr += src->nr;
179} 192}
@@ -186,7 +199,7 @@ struct lock_class_stats lock_stats(struct lock_class *class)
186 memset(&stats, 0, sizeof(struct lock_class_stats)); 199 memset(&stats, 0, sizeof(struct lock_class_stats));
187 for_each_possible_cpu(cpu) { 200 for_each_possible_cpu(cpu) {
188 struct lock_class_stats *pcs = 201 struct lock_class_stats *pcs =
189 &per_cpu(lock_stats, cpu)[class - lock_classes]; 202 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
190 203
191 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 204 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
192 stats.contention_point[i] += pcs->contention_point[i]; 205 stats.contention_point[i] += pcs->contention_point[i];
@@ -213,7 +226,7 @@ void clear_lock_stats(struct lock_class *class)
213 226
214 for_each_possible_cpu(cpu) { 227 for_each_possible_cpu(cpu) {
215 struct lock_class_stats *cpu_stats = 228 struct lock_class_stats *cpu_stats =
216 &per_cpu(lock_stats, cpu)[class - lock_classes]; 229 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
217 230
218 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 231 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
219 } 232 }
@@ -223,23 +236,23 @@ void clear_lock_stats(struct lock_class *class)
223 236
224static struct lock_class_stats *get_lock_stats(struct lock_class *class) 237static struct lock_class_stats *get_lock_stats(struct lock_class *class)
225{ 238{
226 return &get_cpu_var(lock_stats)[class - lock_classes]; 239 return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
227} 240}
228 241
229static void put_lock_stats(struct lock_class_stats *stats) 242static void put_lock_stats(struct lock_class_stats *stats)
230{ 243{
231 put_cpu_var(lock_stats); 244 put_cpu_var(cpu_lock_stats);
232} 245}
233 246
234static void lock_release_holdtime(struct held_lock *hlock) 247static void lock_release_holdtime(struct held_lock *hlock)
235{ 248{
236 struct lock_class_stats *stats; 249 struct lock_class_stats *stats;
237 s64 holdtime; 250 u64 holdtime;
238 251
239 if (!lock_stat) 252 if (!lock_stat)
240 return; 253 return;
241 254
242 holdtime = sched_clock() - hlock->holdtime_stamp; 255 holdtime = lockstat_clock() - hlock->holdtime_stamp;
243 256
244 stats = get_lock_stats(hlock_class(hlock)); 257 stats = get_lock_stats(hlock_class(hlock));
245 if (hlock->read) 258 if (hlock->read)
@@ -374,7 +387,8 @@ static int save_trace(struct stack_trace *trace)
374 * complete trace that maxes out the entries provided will be reported 387 * complete trace that maxes out the entries provided will be reported
375 * as incomplete, friggin useless </rant> 388 * as incomplete, friggin useless </rant>
376 */ 389 */
377 if (trace->entries[trace->nr_entries-1] == ULONG_MAX) 390 if (trace->nr_entries != 0 &&
391 trace->entries[trace->nr_entries-1] == ULONG_MAX)
378 trace->nr_entries--; 392 trace->nr_entries--;
379 393
380 trace->max_entries = trace->nr_entries; 394 trace->max_entries = trace->nr_entries;
@@ -1156,9 +1170,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1156 this.class = class; 1170 this.class = class;
1157 1171
1158 local_irq_save(flags); 1172 local_irq_save(flags);
1159 __raw_spin_lock(&lockdep_lock); 1173 arch_spin_lock(&lockdep_lock);
1160 ret = __lockdep_count_forward_deps(&this); 1174 ret = __lockdep_count_forward_deps(&this);
1161 __raw_spin_unlock(&lockdep_lock); 1175 arch_spin_unlock(&lockdep_lock);
1162 local_irq_restore(flags); 1176 local_irq_restore(flags);
1163 1177
1164 return ret; 1178 return ret;
@@ -1183,9 +1197,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1183 this.class = class; 1197 this.class = class;
1184 1198
1185 local_irq_save(flags); 1199 local_irq_save(flags);
1186 __raw_spin_lock(&lockdep_lock); 1200 arch_spin_lock(&lockdep_lock);
1187 ret = __lockdep_count_backward_deps(&this); 1201 ret = __lockdep_count_backward_deps(&this);
1188 __raw_spin_unlock(&lockdep_lock); 1202 arch_spin_unlock(&lockdep_lock);
1189 local_irq_restore(flags); 1203 local_irq_restore(flags);
1190 1204
1191 return ret; 1205 return ret;
@@ -2792,7 +2806,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2792 hlock->references = references; 2806 hlock->references = references;
2793#ifdef CONFIG_LOCK_STAT 2807#ifdef CONFIG_LOCK_STAT
2794 hlock->waittime_stamp = 0; 2808 hlock->waittime_stamp = 0;
2795 hlock->holdtime_stamp = sched_clock(); 2809 hlock->holdtime_stamp = lockstat_clock();
2796#endif 2810#endif
2797 2811
2798 if (check == 2 && !mark_irqflags(curr, hlock)) 2812 if (check == 2 && !mark_irqflags(curr, hlock))
@@ -3322,7 +3336,7 @@ found_it:
3322 if (hlock->instance != lock) 3336 if (hlock->instance != lock)
3323 return; 3337 return;
3324 3338
3325 hlock->waittime_stamp = sched_clock(); 3339 hlock->waittime_stamp = lockstat_clock();
3326 3340
3327 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3341 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3328 contending_point = lock_point(hlock_class(hlock)->contending_point, 3342 contending_point = lock_point(hlock_class(hlock)->contending_point,
@@ -3345,8 +3359,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3345 struct held_lock *hlock, *prev_hlock; 3359 struct held_lock *hlock, *prev_hlock;
3346 struct lock_class_stats *stats; 3360 struct lock_class_stats *stats;
3347 unsigned int depth; 3361 unsigned int depth;
3348 u64 now; 3362 u64 now, waittime = 0;
3349 s64 waittime = 0;
3350 int i, cpu; 3363 int i, cpu;
3351 3364
3352 depth = curr->lockdep_depth; 3365 depth = curr->lockdep_depth;
@@ -3374,7 +3387,7 @@ found_it:
3374 3387
3375 cpu = smp_processor_id(); 3388 cpu = smp_processor_id();
3376 if (hlock->waittime_stamp) { 3389 if (hlock->waittime_stamp) {
3377 now = sched_clock(); 3390 now = lockstat_clock();
3378 waittime = now - hlock->waittime_stamp; 3391 waittime = now - hlock->waittime_stamp;
3379 hlock->holdtime_stamp = now; 3392 hlock->holdtime_stamp = now;
3380 } 3393 }
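
The lockdep.c changes rename the per-cpu stats array, route timestamps through lockstat_clock(), and fix lock_time_add(): previously it added up per-cpu min and max values, whereas now it takes the true minimum and maximum across CPUs. A self-contained restatement of the corrected aggregation (demo_time mirrors the shape of struct lock_time for illustration):

#include <linux/types.h>

struct demo_time {
	u64		min;
	u64		max;
	u64		total;
	unsigned long	nr;
};

static void demo_time_add(const struct demo_time *src, struct demo_time *dst)
{
	if (!src->nr)				/* nothing recorded on this CPU */
		return;

	if (src->max > dst->max)		/* keep the largest maximum ... */
		dst->max = src->max;
	if (src->min < dst->min || !dst->nr)	/* ... and the smallest minimum */
		dst->min = src->min;

	dst->total += src->total;		/* totals and counts do sum */
	dst->nr += src->nr;
}
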
diff --git a/kernel/module.c b/kernel/module.c
index e6bc4b28aa62..12afc5a3ddd3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module);
370 370
371#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
372 372
373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
374
375static void *percpu_modalloc(unsigned long size, unsigned long align, 373static void *percpu_modalloc(unsigned long size, unsigned long align,
376 const char *name) 374 const char *name)
377{ 375{
@@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme)
395 free_percpu(freeme); 393 free_percpu(freeme);
396} 394}
397 395
398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
399
400/* Number of blocks used and allocated. */
401static unsigned int pcpu_num_used, pcpu_num_allocated;
402/* Size of each block. -ve means used. */
403static int *pcpu_size;
404
405static int split_block(unsigned int i, unsigned short size)
406{
407 /* Reallocation required? */
408 if (pcpu_num_used + 1 > pcpu_num_allocated) {
409 int *new;
410
411 new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
412 GFP_KERNEL);
413 if (!new)
414 return 0;
415
416 pcpu_num_allocated *= 2;
417 pcpu_size = new;
418 }
419
420 /* Insert a new subblock */
421 memmove(&pcpu_size[i+1], &pcpu_size[i],
422 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
423 pcpu_num_used++;
424
425 pcpu_size[i+1] -= size;
426 pcpu_size[i] = size;
427 return 1;
428}
429
430static inline unsigned int block_size(int val)
431{
432 if (val < 0)
433 return -val;
434 return val;
435}
436
437static void *percpu_modalloc(unsigned long size, unsigned long align,
438 const char *name)
439{
440 unsigned long extra;
441 unsigned int i;
442 void *ptr;
443 int cpu;
444
445 if (align > PAGE_SIZE) {
446 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
447 name, align, PAGE_SIZE);
448 align = PAGE_SIZE;
449 }
450
451 ptr = __per_cpu_start;
452 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
453 /* Extra for alignment requirement. */
454 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
455 BUG_ON(i == 0 && extra != 0);
456
457 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
458 continue;
459
460 /* Transfer extra to previous block. */
461 if (pcpu_size[i-1] < 0)
462 pcpu_size[i-1] -= extra;
463 else
464 pcpu_size[i-1] += extra;
465 pcpu_size[i] -= extra;
466 ptr += extra;
467
468 /* Split block if warranted */
469 if (pcpu_size[i] - size > sizeof(unsigned long))
470 if (!split_block(i, size))
471 return NULL;
472
473 /* add the per-cpu scanning areas */
474 for_each_possible_cpu(cpu)
475 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
476 GFP_KERNEL);
477
478 /* Mark allocated */
479 pcpu_size[i] = -pcpu_size[i];
480 return ptr;
481 }
482
483 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
484 size);
485 return NULL;
486}
487
488static void percpu_modfree(void *freeme)
489{
490 unsigned int i;
491 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
492 int cpu;
493
494 /* First entry is core kernel percpu data. */
495 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
496 if (ptr == freeme) {
497 pcpu_size[i] = -pcpu_size[i];
498 goto free;
499 }
500 }
501 BUG();
502
503 free:
504 /* remove the per-cpu scanning areas */
505 for_each_possible_cpu(cpu)
506 kmemleak_free(freeme + per_cpu_offset(cpu));
507
508 /* Merge with previous? */
509 if (pcpu_size[i-1] >= 0) {
510 pcpu_size[i-1] += pcpu_size[i];
511 pcpu_num_used--;
512 memmove(&pcpu_size[i], &pcpu_size[i+1],
513 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
514 i--;
515 }
516 /* Merge with next? */
517 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
518 pcpu_size[i] += pcpu_size[i+1];
519 pcpu_num_used--;
520 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
521 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
522 }
523}
524
525static int percpu_modinit(void)
526{
527 pcpu_num_used = 2;
528 pcpu_num_allocated = 2;
529 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
530 GFP_KERNEL);
531 /* Static in-kernel percpu data (used). */
532 pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
533 /* Free room. */
534 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
535 if (pcpu_size[1] < 0) {
536 printk(KERN_ERR "No per-cpu room for modules.\n");
537 pcpu_num_used = 1;
538 }
539
540 return 0;
541}
542__initcall(percpu_modinit);
543
544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
545
546static unsigned int find_pcpusec(Elf_Ehdr *hdr, 396static unsigned int find_pcpusec(Elf_Ehdr *hdr,
547 Elf_Shdr *sechdrs, 397 Elf_Shdr *sechdrs,
548 const char *secstrings) 398 const char *secstrings)
@@ -1187,7 +1037,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1187 1037
1188 /* Count loaded sections and allocate structures */ 1038 /* Count loaded sections and allocate structures */
1189 for (i = 0; i < nsect; i++) 1039 for (i = 0; i < nsect; i++)
1190 if (sechdrs[i].sh_flags & SHF_ALLOC) 1040 if (sechdrs[i].sh_flags & SHF_ALLOC
1041 && sechdrs[i].sh_size)
1191 nloaded++; 1042 nloaded++;
1192 size[0] = ALIGN(sizeof(*sect_attrs) 1043 size[0] = ALIGN(sizeof(*sect_attrs)
1193 + nloaded * sizeof(sect_attrs->attrs[0]), 1044 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1207,6 +1058,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1207 for (i = 0; i < nsect; i++) { 1058 for (i = 0; i < nsect; i++) {
1208 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1059 if (! (sechdrs[i].sh_flags & SHF_ALLOC))
1209 continue; 1060 continue;
1061 if (!sechdrs[i].sh_size)
1062 continue;
1210 sattr->address = sechdrs[i].sh_addr; 1063 sattr->address = sechdrs[i].sh_addr;
1211 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1064 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
1212 GFP_KERNEL); 1065 GFP_KERNEL);
@@ -1797,6 +1650,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1797 } 1650 }
1798} 1651}
1799 1652
1653static void free_modinfo(struct module *mod)
1654{
1655 struct module_attribute *attr;
1656 int i;
1657
1658 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1659 if (attr->free)
1660 attr->free(mod);
1661 }
1662}
1663
1800#ifdef CONFIG_KALLSYMS 1664#ifdef CONFIG_KALLSYMS
1801 1665
1802/* lookup symbol in given range of kernel_symbols */ 1666/* lookup symbol in given range of kernel_symbols */
@@ -1862,13 +1726,93 @@ static char elf_type(const Elf_Sym *sym,
1862 return '?'; 1726 return '?';
1863} 1727}
1864 1728
1729static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1730 unsigned int shnum)
1731{
1732 const Elf_Shdr *sec;
1733
1734 if (src->st_shndx == SHN_UNDEF
1735 || src->st_shndx >= shnum
1736 || !src->st_name)
1737 return false;
1738
1739 sec = sechdrs + src->st_shndx;
1740 if (!(sec->sh_flags & SHF_ALLOC)
1741#ifndef CONFIG_KALLSYMS_ALL
1742 || !(sec->sh_flags & SHF_EXECINSTR)
1743#endif
1744 || (sec->sh_entsize & INIT_OFFSET_MASK))
1745 return false;
1746
1747 return true;
1748}
1749
1750static unsigned long layout_symtab(struct module *mod,
1751 Elf_Shdr *sechdrs,
1752 unsigned int symindex,
1753 unsigned int strindex,
1754 const Elf_Ehdr *hdr,
1755 const char *secstrings,
1756 unsigned long *pstroffs,
1757 unsigned long *strmap)
1758{
1759 unsigned long symoffs;
1760 Elf_Shdr *symsect = sechdrs + symindex;
1761 Elf_Shdr *strsect = sechdrs + strindex;
1762 const Elf_Sym *src;
1763 const char *strtab;
1764 unsigned int i, nsrc, ndst;
1765
1766 /* Put symbol section at end of init part of module. */
1767 symsect->sh_flags |= SHF_ALLOC;
1768 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1769 symindex) | INIT_OFFSET_MASK;
1770 DEBUGP("\t%s\n", secstrings + symsect->sh_name);
1771
1772 src = (void *)hdr + symsect->sh_offset;
1773 nsrc = symsect->sh_size / sizeof(*src);
1774 strtab = (void *)hdr + strsect->sh_offset;
1775 for (ndst = i = 1; i < nsrc; ++i, ++src)
1776 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
1777 unsigned int j = src->st_name;
1778
1779 while(!__test_and_set_bit(j, strmap) && strtab[j])
1780 ++j;
1781 ++ndst;
1782 }
1783
1784 /* Append room for core symbols at end of core part. */
1785 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1786 mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
1787
1788 /* Put string table section at end of init part of module. */
1789 strsect->sh_flags |= SHF_ALLOC;
1790 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1791 strindex) | INIT_OFFSET_MASK;
1792 DEBUGP("\t%s\n", secstrings + strsect->sh_name);
1793
1794 /* Append room for core symbols' strings at end of core part. */
1795 *pstroffs = mod->core_size;
1796 __set_bit(0, strmap);
1797 mod->core_size += bitmap_weight(strmap, strsect->sh_size);
1798
1799 return symoffs;
1800}
1801
1865static void add_kallsyms(struct module *mod, 1802static void add_kallsyms(struct module *mod,
1866 Elf_Shdr *sechdrs, 1803 Elf_Shdr *sechdrs,
1804 unsigned int shnum,
1867 unsigned int symindex, 1805 unsigned int symindex,
1868 unsigned int strindex, 1806 unsigned int strindex,
1869 const char *secstrings) 1807 unsigned long symoffs,
1808 unsigned long stroffs,
1809 const char *secstrings,
1810 unsigned long *strmap)
1870{ 1811{
1871 unsigned int i; 1812 unsigned int i, ndst;
1813 const Elf_Sym *src;
1814 Elf_Sym *dst;
1815 char *s;
1872 1816
1873 mod->symtab = (void *)sechdrs[symindex].sh_addr; 1817 mod->symtab = (void *)sechdrs[symindex].sh_addr;
1874 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1818 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1878,13 +1822,46 @@ static void add_kallsyms(struct module *mod,
1878 for (i = 0; i < mod->num_symtab; i++) 1822 for (i = 0; i < mod->num_symtab; i++)
1879 mod->symtab[i].st_info 1823 mod->symtab[i].st_info
1880 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); 1824 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1825
1826 mod->core_symtab = dst = mod->module_core + symoffs;
1827 src = mod->symtab;
1828 *dst = *src;
1829 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
1830 if (!is_core_symbol(src, sechdrs, shnum))
1831 continue;
1832 dst[ndst] = *src;
1833 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
1834 ++ndst;
1835 }
1836 mod->core_num_syms = ndst;
1837
1838 mod->core_strtab = s = mod->module_core + stroffs;
1839 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
1840 if (test_bit(i, strmap))
1841 *++s = mod->strtab[i];
1881} 1842}
1882#else 1843#else
1844static inline unsigned long layout_symtab(struct module *mod,
1845 Elf_Shdr *sechdrs,
1846 unsigned int symindex,
1847 unsigned int strindex,
1848 const Elf_Ehdr *hdr,
1849 const char *secstrings,
1850 unsigned long *pstroffs,
1851 unsigned long *strmap)
1852{
1853 return 0;
1854}
1855
1883static inline void add_kallsyms(struct module *mod, 1856static inline void add_kallsyms(struct module *mod,
1884 Elf_Shdr *sechdrs, 1857 Elf_Shdr *sechdrs,
1858 unsigned int shnum,
1885 unsigned int symindex, 1859 unsigned int symindex,
1886 unsigned int strindex, 1860 unsigned int strindex,
1887 const char *secstrings) 1861 unsigned long symoffs,
1862 unsigned long stroffs,
1863 const char *secstrings,
1864 const unsigned long *strmap)
1888{ 1865{
1889} 1866}
1890#endif /* CONFIG_KALLSYMS */ 1867#endif /* CONFIG_KALLSYMS */
@@ -1959,6 +1936,8 @@ static noinline struct module *load_module(void __user *umod,
1959 struct module *mod; 1936 struct module *mod;
1960 long err = 0; 1937 long err = 0;
1961 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1938 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1939 unsigned long symoffs, stroffs, *strmap;
1940
1962 mm_segment_t old_fs; 1941 mm_segment_t old_fs;
1963 1942
1964 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1943 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2040,11 +2019,6 @@ static noinline struct module *load_module(void __user *umod,
2040 /* Don't keep modinfo and version sections. */ 2019 /* Don't keep modinfo and version sections. */
2041 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2020 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2042 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2021 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2043#ifdef CONFIG_KALLSYMS
2044 /* Keep symbol and string tables for decoding later. */
2045 sechdrs[symindex].sh_flags |= SHF_ALLOC;
2046 sechdrs[strindex].sh_flags |= SHF_ALLOC;
2047#endif
2048 2022
2049 /* Check module struct version now, before we try to use module. */ 2023 /* Check module struct version now, before we try to use module. */
2050 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2024 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2080,6 +2054,13 @@ static noinline struct module *load_module(void __user *umod,
2080 goto free_hdr; 2054 goto free_hdr;
2081 } 2055 }
2082 2056
2057 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
2058 * sizeof(long), GFP_KERNEL);
2059 if (!strmap) {
2060 err = -ENOMEM;
2061 goto free_mod;
2062 }
2063
2083 if (find_module(mod->name)) { 2064 if (find_module(mod->name)) {
2084 err = -EEXIST; 2065 err = -EEXIST;
2085 goto free_mod; 2066 goto free_mod;
@@ -2109,6 +2090,8 @@ static noinline struct module *load_module(void __user *umod,
2109 this is done generically; there doesn't appear to be any 2090 this is done generically; there doesn't appear to be any
2110 special cases for the architectures. */ 2091 special cases for the architectures. */
2111 layout_sections(mod, hdr, sechdrs, secstrings); 2092 layout_sections(mod, hdr, sechdrs, secstrings);
2093 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
2094 secstrings, &stroffs, strmap);
2112 2095
2113 /* Do the allocs. */ 2096 /* Do the allocs. */
2114 ptr = module_alloc_update_bounds(mod->core_size); 2097 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2313,7 +2296,10 @@ static noinline struct module *load_module(void __user *umod,
2313 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2296 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
2314 sechdrs[pcpuindex].sh_size); 2297 sechdrs[pcpuindex].sh_size);
2315 2298
2316 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2299 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2300 symoffs, stroffs, secstrings, strmap);
2301 kfree(strmap);
2302 strmap = NULL;
2317 2303
2318 if (!mod->taints) { 2304 if (!mod->taints) {
2319 struct _ddebug *debug; 2305 struct _ddebug *debug;
@@ -2385,13 +2371,14 @@ static noinline struct module *load_module(void __user *umod,
2385 synchronize_sched(); 2371 synchronize_sched();
2386 module_arch_cleanup(mod); 2372 module_arch_cleanup(mod);
2387 cleanup: 2373 cleanup:
2374 free_modinfo(mod);
2388 kobject_del(&mod->mkobj.kobj); 2375 kobject_del(&mod->mkobj.kobj);
2389 kobject_put(&mod->mkobj.kobj); 2376 kobject_put(&mod->mkobj.kobj);
2390 free_unload: 2377 free_unload:
2391 module_unload_free(mod); 2378 module_unload_free(mod);
2392#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2379#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2393 free_init:
2394 percpu_modfree(mod->refptr); 2380 percpu_modfree(mod->refptr);
2381 free_init:
2395#endif 2382#endif
2396 module_free(mod, mod->module_init); 2383 module_free(mod, mod->module_init);
2397 free_core: 2384 free_core:
@@ -2402,6 +2389,7 @@ static noinline struct module *load_module(void __user *umod,
2402 percpu_modfree(percpu); 2389 percpu_modfree(percpu);
2403 free_mod: 2390 free_mod:
2404 kfree(args); 2391 kfree(args);
2392 kfree(strmap);
2405 free_hdr: 2393 free_hdr:
2406 vfree(hdr); 2394 vfree(hdr);
2407 return ERR_PTR(err); 2395 return ERR_PTR(err);
@@ -2491,6 +2479,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2491 /* Drop initial reference. */ 2479 /* Drop initial reference. */
2492 module_put(mod); 2480 module_put(mod);
2493 trim_init_extable(mod); 2481 trim_init_extable(mod);
2482#ifdef CONFIG_KALLSYMS
2483 mod->num_symtab = mod->core_num_syms;
2484 mod->symtab = mod->core_symtab;
2485 mod->strtab = mod->core_strtab;
2486#endif
2494 module_free(mod, mod->module_init); 2487 module_free(mod, mod->module_init);
2495 mod->module_init = NULL; 2488 mod->module_init = NULL;
2496 mod->init_size = 0; 2489 mod->init_size = 0;
@@ -2952,7 +2945,6 @@ void module_layout(struct module *mod,
2952 struct modversion_info *ver, 2945 struct modversion_info *ver,
2953 struct kernel_param *kp, 2946 struct kernel_param *kp,
2954 struct kernel_symbol *ks, 2947 struct kernel_symbol *ks,
2955 struct marker *marker,
2956 struct tracepoint *tp) 2948 struct tracepoint *tp)
2957{ 2949{
2958} 2950}
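
The load_module() hunks above add a bitmap (strmap) with one bit per byte of the ELF string table, so that layout_symtab()/add_kallsyms() can copy only the strings still needed after init into the module core; the SYSCALL_DEFINE3(init_module) hunk then switches mod->symtab/strtab to those core copies once the init sections are freed. A minimal userspace sketch of that mark-and-compact pattern, with invented helper names (mark_string, compact_strtab) and a toy string table, could look like this:

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

#define BITS_PER_LONG    (CHAR_BIT * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

static void set_bit(unsigned long *map, size_t i)
{
        map[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
}

static int test_bit(const unsigned long *map, size_t i)
{
        return (map[i / BITS_PER_LONG] >> (i % BITS_PER_LONG)) & 1;
}

/* Mark every byte of the string starting at 'off', including its NUL. */
static void mark_string(unsigned long *map, const char *strtab, size_t off)
{
        do {
                set_bit(map, off);
        } while (strtab[off++]);
}

/* Copy only the marked bytes into 'core'; returns the compacted size. */
static size_t compact_strtab(char *core, const char *strtab, size_t size,
                             const unsigned long *map)
{
        size_t i, n = 0;

        for (i = 0; i < size; i++)
                if (test_bit(map, i))
                        core[n++] = strtab[i];
        return n;
}

int main(void)
{
        const char strtab[] = "\0init_fn\0core_fn\0exit_fn";
        size_t size = sizeof(strtab);
        unsigned long *map = calloc(BITS_TO_LONGS(size), sizeof(long));
        char core[sizeof(strtab)];

        if (!map)
                return 1;
        mark_string(map, strtab, 9);    /* keep only "core_fn" */
        printf("kept %zu of %zu string-table bytes\n",
               compact_strtab(core, strtab, size, map), size);
        free(map);
        return 0;
}

The same idea explains why kfree(strmap) appears both on the success path (after add_kallsyms()) and on the free_mod error path: the bitmap is purely a load-time scratch structure.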
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 50d022e5a560..ec815a960b5d 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 6b2d735846a5..57d527a16f9d 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock)
43 \ 43 \
44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \ 44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \
45 local_irq_save(flags); \ 45 local_irq_save(flags); \
46 __raw_spin_lock(&(lock)->raw_lock); \ 46 arch_spin_lock(&(lock)->rlock.raw_lock);\
47 DEBUG_LOCKS_WARN_ON(l->magic != l); \ 47 DEBUG_LOCKS_WARN_ON(l->magic != l); \
48 } while (0) 48 } while (0)
49 49
50#define spin_unlock_mutex(lock, flags) \ 50#define spin_unlock_mutex(lock, flags) \
51 do { \ 51 do { \
52 __raw_spin_unlock(&(lock)->raw_lock); \ 52 arch_spin_unlock(&(lock)->rlock.raw_lock); \
53 local_irq_restore(flags); \ 53 local_irq_restore(flags); \
54 preempt_check_resched(); \ 54 preempt_check_resched(); \
55 } while (0) 55 } while (0)
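
The spin_lock_mutex()/spin_unlock_mutex() change follows the spinlock rework elsewhere in this series: spinlock_t now wraps a raw_spinlock_t called rlock, which itself wraps the architecture-level arch_spinlock_t, so the lowest layer is reached as lock->rlock.raw_lock and operated on with arch_spin_lock()/arch_spin_unlock(). A rough structural sketch of that layering, with the fields reduced to a single lock word:

#include <stdio.h>

/* Architecture-level lock word, what arch_spin_lock() really touches. */
typedef struct {
        volatile unsigned int slock;
} arch_spinlock_t;

/* Raw spinlock: the arch lock plus (omitted here) debug/lockdep fields. */
typedef struct {
        arch_spinlock_t raw_lock;
} raw_spinlock_t;

/* The spinlock_t most kernel code sees embeds the raw lock as ->rlock. */
typedef struct {
        raw_spinlock_t rlock;
} spinlock_t;

int main(void)
{
        spinlock_t lock = { { { 0 } } };
        arch_spinlock_t *arch = &lock.rlock.raw_lock;   /* path used by the macro */

        printf("arch lock word at %p, value %u\n", (void *)arch, arch->slock);
        return 0;
}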
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 947b3ad551f8..632f04c57d82 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
148 148
149 preempt_disable(); 149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip); 150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ 151
152 !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) 152#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
153 /* 153 /*
154 * Optimistic spinning. 154 * Optimistic spinning.
155 * 155 *
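
The #ifdef change collapses the three-way Kconfig test into a single CONFIG_MUTEX_SPIN_ON_OWNER symbol guarding the optimistic-spinning path. As a toy userspace model of what that path does (spin for the lock only while the current owner appears to be running, otherwise fall back to the sleeping slowpath), with every name invented for the sketch:

#include <stdio.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

struct toy_mutex {
        atomic_long owner;          /* 0 = unlocked, otherwise an owner id   */
        atomic_bool owner_on_cpu;   /* stand-in for "owner is still running" */
};

/* Spin while the owner runs; bail out (to a sleeping slowpath) otherwise. */
static bool try_spin_acquire(struct toy_mutex *m, long me)
{
        for (;;) {
                long expected = 0;

                if (atomic_compare_exchange_strong(&m->owner, &expected, me))
                        return true;
                if (!atomic_load(&m->owner_on_cpu))
                        return false;
                sched_yield();      /* cpu_relax() stand-in */
        }
}

int main(void)
{
        struct toy_mutex m = { 0, false };

        printf("uncontended: %s\n",
               try_spin_acquire(&m, 1) ? "acquired by spinning" : "slowpath");
        printf("owner off-cpu: %s\n",
               try_spin_acquire(&m, 2) ? "acquired by spinning" : "slowpath");
        return 0;
}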
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..acd24e7643eb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
558 558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 559static ATOMIC_NOTIFIER_HEAD(die_chain);
560 560
561int notrace notify_die(enum die_val val, const char *str, 561int notrace __kprobes notify_die(enum die_val val, const char *str,
562 struct pt_regs *regs, long err, int trap, int sig) 562 struct pt_regs *regs, long err, int trap, int sig)
563{ 563{
564 struct die_args args = { 564 struct die_args args = {
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f9e5ae..2a5dfec8efe0 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
42 * (hence either you are in the same cgroup as task, or in an 42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof) 43 * ancestor cgroup thereof)
44 */ 44 */
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct task_struct *task, bool threadgroup)
47{ 47{
48 if (current != task) { 48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
56 if (!cgroup_is_descendant(new_cgroup, task)) 56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM; 57 return -EPERM;
58 58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
59 return 0; 71 return 0;
60} 72}
61 73
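
The new threadgroup branch extends the permission check from the single task to every thread in its group: the kernel walks the RCU-protected task->thread_group list and rejects the attach with -EPERM if any member fails cgroup_is_descendant(). A compressed userspace illustration of that all-or-nothing walk, using a hand-rolled circular list and a deliberately hypothetical descendant rule:

#include <stdio.h>
#include <stdbool.h>

/* Simplified stand-ins for task_struct and the descendant test. */
struct task {
        int cgroup_id;
        struct task *next_in_group;   /* circular thread_group list in the kernel */
};

static bool is_descendant(int new_cgroup, const struct task *t)
{
        /* Hypothetical rule for the sketch: may only move deeper (larger id). */
        return t->cgroup_id <= new_cgroup;
}

/*
 * Mirror of the threadgroup branch: every thread in the group must pass
 * the same check, otherwise the whole attach is rejected.
 */
static int can_attach_group(int new_cgroup, struct task *leader)
{
        struct task *c = leader;

        do {
                if (!is_descendant(new_cgroup, c))
                        return -1;              /* -EPERM in the kernel */
                c = c->next_in_group;
        } while (c != leader);

        return 0;
}

int main(void)
{
        struct task t3 = { 5, NULL };
        struct task t2 = { 2, &t3 };
        struct task t1 = { 1, &t2 };

        t3.next_in_group = &t1;                 /* close the circular list */

        printf("attach to cgroup 4: %s\n",
               can_attach_group(4, &t1) ? "denied" : "allowed");
        printf("attach to cgroup 6: %s\n",
               can_attach_group(6, &t1) ? "denied" : "allowed");
        return 0;
}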
diff --git a/kernel/panic.c b/kernel/panic.c
index 8c43226a544d..5827f7b97254 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -92,6 +92,8 @@ NORET_TYPE void panic(const char * fmt, ...)
92 92
93 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 93 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
94 94
95 bust_spinlocks(0);
96
95 if (!panic_blink) 97 if (!panic_blink)
96 panic_blink = no_blink; 98 panic_blink = no_blink;
97 99
@@ -138,7 +140,6 @@ NORET_TYPE void panic(const char * fmt, ...)
138 mdelay(1); 140 mdelay(1);
139 i++; 141 i++;
140 } 142 }
141 bust_spinlocks(0);
142} 143}
143 144
144EXPORT_SYMBOL(panic); 145EXPORT_SYMBOL(panic);
diff --git a/kernel/params.c b/kernel/params.c
index 7f6912ced2ba..cf1b69183127 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,8 @@
23#include <linux/device.h> 23#include <linux/device.h>
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h>
27#include <linux/string.h>
26 28
27#if 0 29#if 0
28#define DEBUGP printk 30#define DEBUGP printk
@@ -87,7 +89,7 @@ static char *next_arg(char *args, char **param, char **val)
87 } 89 }
88 90
89 for (i = 0; args[i]; i++) { 91 for (i = 0; args[i]; i++) {
90 if (args[i] == ' ' && !in_quote) 92 if (isspace(args[i]) && !in_quote)
91 break; 93 break;
92 if (equals == 0) { 94 if (equals == 0) {
93 if (args[i] == '=') 95 if (args[i] == '=')
@@ -121,9 +123,7 @@ static char *next_arg(char *args, char **param, char **val)
121 next = args + i; 123 next = args + i;
122 124
123 /* Chew up trailing spaces. */ 125 /* Chew up trailing spaces. */
124 while (*next == ' ') 126 return skip_spaces(next);
125 next++;
126 return next;
127} 127}
128 128
129/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 129/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
@@ -138,8 +138,7 @@ int parse_args(const char *name,
138 DEBUGP("Parsing ARGS: %s\n", args); 138 DEBUGP("Parsing ARGS: %s\n", args);
139 139
140 /* Chew leading spaces */ 140 /* Chew leading spaces */
141 while (*args == ' ') 141 args = skip_spaces(args);
142 args++;
143 142
144 while (*args) { 143 while (*args) {
145 int ret; 144 int ret;
@@ -217,15 +216,11 @@ int param_set_charp(const char *val, struct kernel_param *kp)
217 return -ENOSPC; 216 return -ENOSPC;
218 } 217 }
219 218
220 if (kp->flags & KPARAM_KMALLOCED)
221 kfree(*(char **)kp->arg);
222
223 /* This is a hack. We can't need to strdup in early boot, and we 219 /* This is a hack. We can't need to strdup in early boot, and we
224 * don't need to; this mangled commandline is preserved. */ 220 * don't need to; this mangled commandline is preserved. */
225 if (slab_is_available()) { 221 if (slab_is_available()) {
226 kp->flags |= KPARAM_KMALLOCED;
227 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 222 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
228 if (!kp->arg) 223 if (!*(char **)kp->arg)
229 return -ENOMEM; 224 return -ENOMEM;
230 } else 225 } else
231 *(const char **)kp->arg = val; 226 *(const char **)kp->arg = val;
@@ -303,6 +298,7 @@ static int param_array(const char *name,
303 unsigned int min, unsigned int max, 298 unsigned int min, unsigned int max,
304 void *elem, int elemsize, 299 void *elem, int elemsize,
305 int (*set)(const char *, struct kernel_param *kp), 300 int (*set)(const char *, struct kernel_param *kp),
301 u16 flags,
306 unsigned int *num) 302 unsigned int *num)
307{ 303{
308 int ret; 304 int ret;
@@ -312,6 +308,7 @@ static int param_array(const char *name,
312 /* Get the name right for errors. */ 308 /* Get the name right for errors. */
313 kp.name = name; 309 kp.name = name;
314 kp.arg = elem; 310 kp.arg = elem;
311 kp.flags = flags;
315 312
316 /* No equals sign? */ 313 /* No equals sign? */
317 if (!val) { 314 if (!val) {
@@ -357,7 +354,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
357 unsigned int temp_num; 354 unsigned int temp_num;
358 355
359 return param_array(kp->name, val, 1, arr->max, arr->elem, 356 return param_array(kp->name, val, 1, arr->max, arr->elem,
360 arr->elemsize, arr->set, arr->num ?: &temp_num); 357 arr->elemsize, arr->set, kp->flags,
358 arr->num ?: &temp_num);
361} 359}
362 360
363int param_array_get(char *buffer, struct kernel_param *kp) 361int param_array_get(char *buffer, struct kernel_param *kp)
@@ -604,11 +602,7 @@ void module_param_sysfs_remove(struct module *mod)
604 602
605void destroy_params(const struct kernel_param *params, unsigned num) 603void destroy_params(const struct kernel_param *params, unsigned num)
606{ 604{
607 unsigned int i; 605 /* FIXME: This should free kmalloced charp parameters. It doesn't. */
608
609 for (i = 0; i < num; i++)
610 if (params[i].flags & KPARAM_KMALLOCED)
611 kfree(*(char **)params[i].arg);
612} 606}
613 607
614static void __init kernel_add_sysfs_param(const char *name, 608static void __init kernel_add_sysfs_param(const char *name,
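
The params.c hunks drop the open-coded ' ' comparisons and the hand-written skip loops in favour of isspace() and the shared skip_spaces() helper, so tabs and newlines on the command line are now treated as parameter separators too. A userspace double of that helper, shown only to make the changed behaviour concrete:

#include <stdio.h>
#include <ctype.h>

/* Userspace double of the lib/string.c helper the patch switches to. */
static char *skip_spaces(const char *str)
{
        while (isspace((unsigned char)*str))
                str++;
        return (char *)str;
}

int main(void)
{
        /* With isspace(), tabs and newlines also count as separators. */
        char args[] = " \t foo=bar\nbaz=1";
        char *p = skip_spaces(args);

        printf("first parameter starts at: \"%s\"\n", p);
        return 0;
}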
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 76ac4db405e9..9052d6c8c9fd 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -20,6 +20,7 @@
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/vmstat.h> 22#include <linux/vmstat.h>
23#include <linux/vmalloc.h>
23#include <linux/hardirq.h> 24#include <linux/hardirq.h>
24#include <linux/rculist.h> 25#include <linux/rculist.h>
25#include <linux/uaccess.h> 26#include <linux/uaccess.h>
@@ -27,13 +28,15 @@
27#include <linux/anon_inodes.h> 28#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h> 29#include <linux/kernel_stat.h>
29#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h>
32#include <linux/hw_breakpoint.h>
30 33
31#include <asm/irq_regs.h> 34#include <asm/irq_regs.h>
32 35
33/* 36/*
34 * Each CPU has a list of per CPU events: 37 * Each CPU has a list of per CPU events:
35 */ 38 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 39static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37 40
38int perf_max_events __read_mostly = 1; 41int perf_max_events __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly; 42static int perf_reserved_percpu __read_mostly;
@@ -200,14 +203,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
200 * if so. If we locked the right context, then it 203 * if so. If we locked the right context, then it
201 * can't get swapped on us any more. 204 * can't get swapped on us any more.
202 */ 205 */
203 spin_lock_irqsave(&ctx->lock, *flags); 206 raw_spin_lock_irqsave(&ctx->lock, *flags);
204 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 207 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
205 spin_unlock_irqrestore(&ctx->lock, *flags); 208 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
206 goto retry; 209 goto retry;
207 } 210 }
208 211
209 if (!atomic_inc_not_zero(&ctx->refcount)) { 212 if (!atomic_inc_not_zero(&ctx->refcount)) {
210 spin_unlock_irqrestore(&ctx->lock, *flags); 213 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
211 ctx = NULL; 214 ctx = NULL;
212 } 215 }
213 } 216 }
@@ -228,7 +231,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task
228 ctx = perf_lock_task_context(task, &flags); 231 ctx = perf_lock_task_context(task, &flags);
229 if (ctx) { 232 if (ctx) {
230 ++ctx->pin_count; 233 ++ctx->pin_count;
231 spin_unlock_irqrestore(&ctx->lock, flags); 234 raw_spin_unlock_irqrestore(&ctx->lock, flags);
232 } 235 }
233 return ctx; 236 return ctx;
234} 237}
@@ -237,12 +240,55 @@ static void perf_unpin_context(struct perf_event_context *ctx)
237{ 240{
238 unsigned long flags; 241 unsigned long flags;
239 242
240 spin_lock_irqsave(&ctx->lock, flags); 243 raw_spin_lock_irqsave(&ctx->lock, flags);
241 --ctx->pin_count; 244 --ctx->pin_count;
242 spin_unlock_irqrestore(&ctx->lock, flags); 245 raw_spin_unlock_irqrestore(&ctx->lock, flags);
243 put_ctx(ctx); 246 put_ctx(ctx);
244} 247}
245 248
249static inline u64 perf_clock(void)
250{
251 return cpu_clock(smp_processor_id());
252}
253
254/*
255 * Update the record of the current time in a context.
256 */
257static void update_context_time(struct perf_event_context *ctx)
258{
259 u64 now = perf_clock();
260
261 ctx->time += now - ctx->timestamp;
262 ctx->timestamp = now;
263}
264
265/*
266 * Update the total_time_enabled and total_time_running fields for a event.
267 */
268static void update_event_times(struct perf_event *event)
269{
270 struct perf_event_context *ctx = event->ctx;
271 u64 run_end;
272
273 if (event->state < PERF_EVENT_STATE_INACTIVE ||
274 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275 return;
276
277 if (ctx->is_active)
278 run_end = ctx->time;
279 else
280 run_end = event->tstamp_stopped;
281
282 event->total_time_enabled = run_end - event->tstamp_enabled;
283
284 if (event->state == PERF_EVENT_STATE_INACTIVE)
285 run_end = event->tstamp_stopped;
286 else
287 run_end = ctx->time;
288
289 event->total_time_running = run_end - event->tstamp_running;
290}
291
246/* 292/*
247 * Add a event from the lists for its context. 293 * Add a event from the lists for its context.
248 * Must be called with ctx->mutex and ctx->lock held. 294 * Must be called with ctx->mutex and ctx->lock held.
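
The hunk above moves perf_clock(), update_context_time() and update_event_times() earlier in the file and teaches update_event_times() to clamp at tstamp_stopped when the context is no longer active. A reduced userspace model of that bookkeeping, with a fake clock standing in for perf_clock() and a single context and event, makes the enabled/running arithmetic easier to follow:

#include <stdio.h>
#include <stdint.h>

/* Trimmed-down view of the context/event time fields. */
struct ctx   { uint64_t time, timestamp; int is_active; };
struct event {
        struct ctx *ctx;
        int         running;            /* stand-in for PERF_EVENT_STATE_ACTIVE */
        uint64_t    tstamp_enabled, tstamp_running, tstamp_stopped;
        uint64_t    total_time_enabled, total_time_running;
};

static uint64_t fake_clock;             /* perf_clock() stand-in */

static void update_context_time(struct ctx *ctx)
{
        uint64_t now = fake_clock;

        ctx->time += now - ctx->timestamp;
        ctx->timestamp = now;
}

static void update_event_times(struct event *e)
{
        uint64_t run_end;

        /* Enabled time runs until "now", or until the context stopped. */
        run_end = e->ctx->is_active ? e->ctx->time : e->tstamp_stopped;
        e->total_time_enabled = run_end - e->tstamp_enabled;

        /* Running time stops when the event itself stops. */
        run_end = e->running ? e->ctx->time : e->tstamp_stopped;
        e->total_time_running = run_end - e->tstamp_running;
}

int main(void)
{
        struct ctx ctx = { .is_active = 1 };
        struct event ev = { .ctx = &ctx, .running = 1 };

        fake_clock = 100;               /* event enabled and started at t=0 */
        update_context_time(&ctx);
        update_event_times(&ev);
        printf("enabled=%llu running=%llu\n",
               (unsigned long long)ev.total_time_enabled,
               (unsigned long long)ev.total_time_running);
        return 0;
}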
@@ -291,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
291 if (event->group_leader != event) 337 if (event->group_leader != event)
292 event->group_leader->nr_siblings--; 338 event->group_leader->nr_siblings--;
293 339
340 update_event_times(event);
341
342 /*
343 * If event was in error state, then keep it
344 * that way, otherwise bogus counts will be
345 * returned on read(). The only way to get out
346 * of error state is by explicit re-enabling
347 * of the event
348 */
349 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF;
351
294 /* 352 /*
295 * If this was a group event with sibling events then 353 * If this was a group event with sibling events then
296 * upgrade the siblings to singleton events by adding them 354 * upgrade the siblings to singleton events by adding them
@@ -369,7 +427,7 @@ static void __perf_event_remove_from_context(void *info)
369 if (ctx->task && cpuctx->task_ctx != ctx) 427 if (ctx->task && cpuctx->task_ctx != ctx)
370 return; 428 return;
371 429
372 spin_lock(&ctx->lock); 430 raw_spin_lock(&ctx->lock);
373 /* 431 /*
374 * Protect the list operation against NMI by disabling the 432 * Protect the list operation against NMI by disabling the
375 * events on a global level. 433 * events on a global level.
@@ -391,7 +449,7 @@ static void __perf_event_remove_from_context(void *info)
391 } 449 }
392 450
393 perf_enable(); 451 perf_enable();
394 spin_unlock(&ctx->lock); 452 raw_spin_unlock(&ctx->lock);
395} 453}
396 454
397 455
@@ -418,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
418 if (!task) { 476 if (!task) {
419 /* 477 /*
420 * Per cpu events are removed via an smp call and 478 * Per cpu events are removed via an smp call and
421 * the removal is always sucessful. 479 * the removal is always successful.
422 */ 480 */
423 smp_call_function_single(event->cpu, 481 smp_call_function_single(event->cpu,
424 __perf_event_remove_from_context, 482 __perf_event_remove_from_context,
@@ -430,12 +488,12 @@ retry:
430 task_oncpu_function_call(task, __perf_event_remove_from_context, 488 task_oncpu_function_call(task, __perf_event_remove_from_context,
431 event); 489 event);
432 490
433 spin_lock_irq(&ctx->lock); 491 raw_spin_lock_irq(&ctx->lock);
434 /* 492 /*
435 * If the context is active we need to retry the smp call. 493 * If the context is active we need to retry the smp call.
436 */ 494 */
437 if (ctx->nr_active && !list_empty(&event->group_entry)) { 495 if (ctx->nr_active && !list_empty(&event->group_entry)) {
438 spin_unlock_irq(&ctx->lock); 496 raw_spin_unlock_irq(&ctx->lock);
439 goto retry; 497 goto retry;
440 } 498 }
441 499
@@ -444,48 +502,9 @@ retry:
444 * can remove the event safely, if the call above did not 502 * can remove the event safely, if the call above did not
445 * succeed. 503 * succeed.
446 */ 504 */
447 if (!list_empty(&event->group_entry)) { 505 if (!list_empty(&event->group_entry))
448 list_del_event(event, ctx); 506 list_del_event(event, ctx);
449 } 507 raw_spin_unlock_irq(&ctx->lock);
450 spin_unlock_irq(&ctx->lock);
451}
452
453static inline u64 perf_clock(void)
454{
455 return cpu_clock(smp_processor_id());
456}
457
458/*
459 * Update the record of the current time in a context.
460 */
461static void update_context_time(struct perf_event_context *ctx)
462{
463 u64 now = perf_clock();
464
465 ctx->time += now - ctx->timestamp;
466 ctx->timestamp = now;
467}
468
469/*
470 * Update the total_time_enabled and total_time_running fields for a event.
471 */
472static void update_event_times(struct perf_event *event)
473{
474 struct perf_event_context *ctx = event->ctx;
475 u64 run_end;
476
477 if (event->state < PERF_EVENT_STATE_INACTIVE ||
478 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
479 return;
480
481 event->total_time_enabled = ctx->time - event->tstamp_enabled;
482
483 if (event->state == PERF_EVENT_STATE_INACTIVE)
484 run_end = event->tstamp_stopped;
485 else
486 run_end = ctx->time;
487
488 event->total_time_running = run_end - event->tstamp_running;
489} 508}
490 509
491/* 510/*
@@ -516,7 +535,7 @@ static void __perf_event_disable(void *info)
516 if (ctx->task && cpuctx->task_ctx != ctx) 535 if (ctx->task && cpuctx->task_ctx != ctx)
517 return; 536 return;
518 537
519 spin_lock(&ctx->lock); 538 raw_spin_lock(&ctx->lock);
520 539
521 /* 540 /*
522 * If the event is on, turn it off. 541 * If the event is on, turn it off.
@@ -532,7 +551,7 @@ static void __perf_event_disable(void *info)
532 event->state = PERF_EVENT_STATE_OFF; 551 event->state = PERF_EVENT_STATE_OFF;
533 } 552 }
534 553
535 spin_unlock(&ctx->lock); 554 raw_spin_unlock(&ctx->lock);
536} 555}
537 556
538/* 557/*
@@ -548,7 +567,7 @@ static void __perf_event_disable(void *info)
548 * is the current context on this CPU and preemption is disabled, 567 * is the current context on this CPU and preemption is disabled,
549 * hence we can't get into perf_event_task_sched_out for this context. 568 * hence we can't get into perf_event_task_sched_out for this context.
550 */ 569 */
551static void perf_event_disable(struct perf_event *event) 570void perf_event_disable(struct perf_event *event)
552{ 571{
553 struct perf_event_context *ctx = event->ctx; 572 struct perf_event_context *ctx = event->ctx;
554 struct task_struct *task = ctx->task; 573 struct task_struct *task = ctx->task;
@@ -565,12 +584,12 @@ static void perf_event_disable(struct perf_event *event)
565 retry: 584 retry:
566 task_oncpu_function_call(task, __perf_event_disable, event); 585 task_oncpu_function_call(task, __perf_event_disable, event);
567 586
568 spin_lock_irq(&ctx->lock); 587 raw_spin_lock_irq(&ctx->lock);
569 /* 588 /*
570 * If the event is still active, we need to retry the cross-call. 589 * If the event is still active, we need to retry the cross-call.
571 */ 590 */
572 if (event->state == PERF_EVENT_STATE_ACTIVE) { 591 if (event->state == PERF_EVENT_STATE_ACTIVE) {
573 spin_unlock_irq(&ctx->lock); 592 raw_spin_unlock_irq(&ctx->lock);
574 goto retry; 593 goto retry;
575 } 594 }
576 595
@@ -583,7 +602,7 @@ static void perf_event_disable(struct perf_event *event)
583 event->state = PERF_EVENT_STATE_OFF; 602 event->state = PERF_EVENT_STATE_OFF;
584 } 603 }
585 604
586 spin_unlock_irq(&ctx->lock); 605 raw_spin_unlock_irq(&ctx->lock);
587} 606}
588 607
589static int 608static int
@@ -751,7 +770,7 @@ static void __perf_install_in_context(void *info)
751 cpuctx->task_ctx = ctx; 770 cpuctx->task_ctx = ctx;
752 } 771 }
753 772
754 spin_lock(&ctx->lock); 773 raw_spin_lock(&ctx->lock);
755 ctx->is_active = 1; 774 ctx->is_active = 1;
756 update_context_time(ctx); 775 update_context_time(ctx);
757 776
@@ -801,7 +820,7 @@ static void __perf_install_in_context(void *info)
801 unlock: 820 unlock:
802 perf_enable(); 821 perf_enable();
803 822
804 spin_unlock(&ctx->lock); 823 raw_spin_unlock(&ctx->lock);
805} 824}
806 825
807/* 826/*
@@ -826,7 +845,7 @@ perf_install_in_context(struct perf_event_context *ctx,
826 if (!task) { 845 if (!task) {
827 /* 846 /*
828 * Per cpu events are installed via an smp call and 847 * Per cpu events are installed via an smp call and
829 * the install is always sucessful. 848 * the install is always successful.
830 */ 849 */
831 smp_call_function_single(cpu, __perf_install_in_context, 850 smp_call_function_single(cpu, __perf_install_in_context,
832 event, 1); 851 event, 1);
@@ -837,12 +856,12 @@ retry:
837 task_oncpu_function_call(task, __perf_install_in_context, 856 task_oncpu_function_call(task, __perf_install_in_context,
838 event); 857 event);
839 858
840 spin_lock_irq(&ctx->lock); 859 raw_spin_lock_irq(&ctx->lock);
841 /* 860 /*
842 * we need to retry the smp call. 861 * we need to retry the smp call.
843 */ 862 */
844 if (ctx->is_active && list_empty(&event->group_entry)) { 863 if (ctx->is_active && list_empty(&event->group_entry)) {
845 spin_unlock_irq(&ctx->lock); 864 raw_spin_unlock_irq(&ctx->lock);
846 goto retry; 865 goto retry;
847 } 866 }
848 867
@@ -853,7 +872,7 @@ retry:
853 */ 872 */
854 if (list_empty(&event->group_entry)) 873 if (list_empty(&event->group_entry))
855 add_event_to_ctx(event, ctx); 874 add_event_to_ctx(event, ctx);
856 spin_unlock_irq(&ctx->lock); 875 raw_spin_unlock_irq(&ctx->lock);
857} 876}
858 877
859/* 878/*
@@ -898,7 +917,7 @@ static void __perf_event_enable(void *info)
898 cpuctx->task_ctx = ctx; 917 cpuctx->task_ctx = ctx;
899 } 918 }
900 919
901 spin_lock(&ctx->lock); 920 raw_spin_lock(&ctx->lock);
902 ctx->is_active = 1; 921 ctx->is_active = 1;
903 update_context_time(ctx); 922 update_context_time(ctx);
904 923
@@ -940,7 +959,7 @@ static void __perf_event_enable(void *info)
940 } 959 }
941 960
942 unlock: 961 unlock:
943 spin_unlock(&ctx->lock); 962 raw_spin_unlock(&ctx->lock);
944} 963}
945 964
946/* 965/*
@@ -952,7 +971,7 @@ static void __perf_event_enable(void *info)
952 * perf_event_for_each_child or perf_event_for_each as described 971 * perf_event_for_each_child or perf_event_for_each as described
953 * for perf_event_disable. 972 * for perf_event_disable.
954 */ 973 */
955static void perf_event_enable(struct perf_event *event) 974void perf_event_enable(struct perf_event *event)
956{ 975{
957 struct perf_event_context *ctx = event->ctx; 976 struct perf_event_context *ctx = event->ctx;
958 struct task_struct *task = ctx->task; 977 struct task_struct *task = ctx->task;
@@ -966,7 +985,7 @@ static void perf_event_enable(struct perf_event *event)
966 return; 985 return;
967 } 986 }
968 987
969 spin_lock_irq(&ctx->lock); 988 raw_spin_lock_irq(&ctx->lock);
970 if (event->state >= PERF_EVENT_STATE_INACTIVE) 989 if (event->state >= PERF_EVENT_STATE_INACTIVE)
971 goto out; 990 goto out;
972 991
@@ -981,10 +1000,10 @@ static void perf_event_enable(struct perf_event *event)
981 event->state = PERF_EVENT_STATE_OFF; 1000 event->state = PERF_EVENT_STATE_OFF;
982 1001
983 retry: 1002 retry:
984 spin_unlock_irq(&ctx->lock); 1003 raw_spin_unlock_irq(&ctx->lock);
985 task_oncpu_function_call(task, __perf_event_enable, event); 1004 task_oncpu_function_call(task, __perf_event_enable, event);
986 1005
987 spin_lock_irq(&ctx->lock); 1006 raw_spin_lock_irq(&ctx->lock);
988 1007
989 /* 1008 /*
990 * If the context is active and the event is still off, 1009 * If the context is active and the event is still off,
@@ -1001,7 +1020,7 @@ static void perf_event_enable(struct perf_event *event)
1001 __perf_event_mark_enabled(event, ctx); 1020 __perf_event_mark_enabled(event, ctx);
1002 1021
1003 out: 1022 out:
1004 spin_unlock_irq(&ctx->lock); 1023 raw_spin_unlock_irq(&ctx->lock);
1005} 1024}
1006 1025
1007static int perf_event_refresh(struct perf_event *event, int refresh) 1026static int perf_event_refresh(struct perf_event *event, int refresh)
@@ -1023,7 +1042,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1023{ 1042{
1024 struct perf_event *event; 1043 struct perf_event *event;
1025 1044
1026 spin_lock(&ctx->lock); 1045 raw_spin_lock(&ctx->lock);
1027 ctx->is_active = 0; 1046 ctx->is_active = 0;
1028 if (likely(!ctx->nr_events)) 1047 if (likely(!ctx->nr_events))
1029 goto out; 1048 goto out;
@@ -1031,16 +1050,12 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1031 1050
1032 perf_disable(); 1051 perf_disable();
1033 if (ctx->nr_active) { 1052 if (ctx->nr_active) {
1034 list_for_each_entry(event, &ctx->group_list, group_entry) { 1053 list_for_each_entry(event, &ctx->group_list, group_entry)
1035 if (event != event->group_leader) 1054 group_sched_out(event, cpuctx, ctx);
1036 event_sched_out(event, cpuctx, ctx);
1037 else
1038 group_sched_out(event, cpuctx, ctx);
1039 }
1040 } 1055 }
1041 perf_enable(); 1056 perf_enable();
1042 out: 1057 out:
1043 spin_unlock(&ctx->lock); 1058 raw_spin_unlock(&ctx->lock);
1044} 1059}
1045 1060
1046/* 1061/*
@@ -1062,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1062 && !ctx1->pin_count && !ctx2->pin_count; 1077 && !ctx1->pin_count && !ctx2->pin_count;
1063} 1078}
1064 1079
1065static void __perf_event_read(void *event);
1066
1067static void __perf_event_sync_stat(struct perf_event *event, 1080static void __perf_event_sync_stat(struct perf_event *event,
1068 struct perf_event *next_event) 1081 struct perf_event *next_event)
1069{ 1082{
@@ -1081,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1081 */ 1094 */
1082 switch (event->state) { 1095 switch (event->state) {
1083 case PERF_EVENT_STATE_ACTIVE: 1096 case PERF_EVENT_STATE_ACTIVE:
1084 __perf_event_read(event); 1097 event->pmu->read(event);
1085 break; 1098 /* fall-through */
1086 1099
1087 case PERF_EVENT_STATE_INACTIVE: 1100 case PERF_EVENT_STATE_INACTIVE:
1088 update_event_times(event); 1101 update_event_times(event);
@@ -1121,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1121 if (!ctx->nr_stat) 1134 if (!ctx->nr_stat)
1122 return; 1135 return;
1123 1136
1137 update_context_time(ctx);
1138
1124 event = list_first_entry(&ctx->event_list, 1139 event = list_first_entry(&ctx->event_list,
1125 struct perf_event, event_entry); 1140 struct perf_event, event_entry);
1126 1141
@@ -1164,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task,
1164 if (likely(!ctx || !cpuctx->task_ctx)) 1179 if (likely(!ctx || !cpuctx->task_ctx))
1165 return; 1180 return;
1166 1181
1167 update_context_time(ctx);
1168
1169 rcu_read_lock(); 1182 rcu_read_lock();
1170 parent = rcu_dereference(ctx->parent_ctx); 1183 parent = rcu_dereference(ctx->parent_ctx);
1171 next_ctx = next->perf_event_ctxp; 1184 next_ctx = next->perf_event_ctxp;
@@ -1180,8 +1193,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1180 * order we take the locks because no other cpu could 1193 * order we take the locks because no other cpu could
1181 * be trying to lock both of these tasks. 1194 * be trying to lock both of these tasks.
1182 */ 1195 */
1183 spin_lock(&ctx->lock); 1196 raw_spin_lock(&ctx->lock);
1184 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 1197 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1185 if (context_equiv(ctx, next_ctx)) { 1198 if (context_equiv(ctx, next_ctx)) {
1186 /* 1199 /*
1187 * XXX do we need a memory barrier of sorts 1200 * XXX do we need a memory barrier of sorts
@@ -1195,8 +1208,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1195 1208
1196 perf_event_sync_stat(ctx, next_ctx); 1209 perf_event_sync_stat(ctx, next_ctx);
1197 } 1210 }
1198 spin_unlock(&next_ctx->lock); 1211 raw_spin_unlock(&next_ctx->lock);
1199 spin_unlock(&ctx->lock); 1212 raw_spin_unlock(&ctx->lock);
1200 } 1213 }
1201 rcu_read_unlock(); 1214 rcu_read_unlock();
1202 1215
@@ -1238,7 +1251,7 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1238 struct perf_event *event; 1251 struct perf_event *event;
1239 int can_add_hw = 1; 1252 int can_add_hw = 1;
1240 1253
1241 spin_lock(&ctx->lock); 1254 raw_spin_lock(&ctx->lock);
1242 ctx->is_active = 1; 1255 ctx->is_active = 1;
1243 if (likely(!ctx->nr_events)) 1256 if (likely(!ctx->nr_events))
1244 goto out; 1257 goto out;
@@ -1258,12 +1271,8 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1258 if (event->cpu != -1 && event->cpu != cpu) 1271 if (event->cpu != -1 && event->cpu != cpu)
1259 continue; 1272 continue;
1260 1273
1261 if (event != event->group_leader) 1274 if (group_can_go_on(event, cpuctx, 1))
1262 event_sched_in(event, cpuctx, ctx, cpu); 1275 group_sched_in(event, cpuctx, ctx, cpu);
1263 else {
1264 if (group_can_go_on(event, cpuctx, 1))
1265 group_sched_in(event, cpuctx, ctx, cpu);
1266 }
1267 1276
1268 /* 1277 /*
1269 * If this pinned group hasn't been scheduled, 1278 * If this pinned group hasn't been scheduled,
@@ -1291,19 +1300,13 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1291 if (event->cpu != -1 && event->cpu != cpu) 1300 if (event->cpu != -1 && event->cpu != cpu)
1292 continue; 1301 continue;
1293 1302
1294 if (event != event->group_leader) { 1303 if (group_can_go_on(event, cpuctx, can_add_hw))
1295 if (event_sched_in(event, cpuctx, ctx, cpu)) 1304 if (group_sched_in(event, cpuctx, ctx, cpu))
1296 can_add_hw = 0; 1305 can_add_hw = 0;
1297 } else {
1298 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1299 if (group_sched_in(event, cpuctx, ctx, cpu))
1300 can_add_hw = 0;
1301 }
1302 }
1303 } 1306 }
1304 perf_enable(); 1307 perf_enable();
1305 out: 1308 out:
1306 spin_unlock(&ctx->lock); 1309 raw_spin_unlock(&ctx->lock);
1307} 1310}
1308 1311
1309/* 1312/*
@@ -1367,8 +1370,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1367 struct hw_perf_event *hwc; 1370 struct hw_perf_event *hwc;
1368 u64 interrupts, freq; 1371 u64 interrupts, freq;
1369 1372
1370 spin_lock(&ctx->lock); 1373 raw_spin_lock(&ctx->lock);
1371 list_for_each_entry(event, &ctx->group_list, group_entry) { 1374 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1372 if (event->state != PERF_EVENT_STATE_ACTIVE) 1375 if (event->state != PERF_EVENT_STATE_ACTIVE)
1373 continue; 1376 continue;
1374 1377
@@ -1422,7 +1425,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1422 perf_enable(); 1425 perf_enable();
1423 } 1426 }
1424 } 1427 }
1425 spin_unlock(&ctx->lock); 1428 raw_spin_unlock(&ctx->lock);
1426} 1429}
1427 1430
1428/* 1431/*
@@ -1435,7 +1438,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1435 if (!ctx->nr_events) 1438 if (!ctx->nr_events)
1436 return; 1439 return;
1437 1440
1438 spin_lock(&ctx->lock); 1441 raw_spin_lock(&ctx->lock);
1439 /* 1442 /*
1440 * Rotate the first entry last (works just fine for group events too): 1443 * Rotate the first entry last (works just fine for group events too):
1441 */ 1444 */
@@ -1446,7 +1449,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1446 } 1449 }
1447 perf_enable(); 1450 perf_enable();
1448 1451
1449 spin_unlock(&ctx->lock); 1452 raw_spin_unlock(&ctx->lock);
1450} 1453}
1451 1454
1452void perf_event_task_tick(struct task_struct *curr, int cpu) 1455void perf_event_task_tick(struct task_struct *curr, int cpu)
@@ -1495,7 +1498,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1495 1498
1496 __perf_event_task_sched_out(ctx); 1499 __perf_event_task_sched_out(ctx);
1497 1500
1498 spin_lock(&ctx->lock); 1501 raw_spin_lock(&ctx->lock);
1499 1502
1500 list_for_each_entry(event, &ctx->group_list, group_entry) { 1503 list_for_each_entry(event, &ctx->group_list, group_entry) {
1501 if (!event->attr.enable_on_exec) 1504 if (!event->attr.enable_on_exec)
@@ -1513,7 +1516,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1513 if (enabled) 1516 if (enabled)
1514 unclone_ctx(ctx); 1517 unclone_ctx(ctx);
1515 1518
1516 spin_unlock(&ctx->lock); 1519 raw_spin_unlock(&ctx->lock);
1517 1520
1518 perf_event_task_sched_in(task, smp_processor_id()); 1521 perf_event_task_sched_in(task, smp_processor_id());
1519 out: 1522 out:
@@ -1528,7 +1531,6 @@ static void __perf_event_read(void *info)
1528 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1531 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1529 struct perf_event *event = info; 1532 struct perf_event *event = info;
1530 struct perf_event_context *ctx = event->ctx; 1533 struct perf_event_context *ctx = event->ctx;
1531 unsigned long flags;
1532 1534
1533 /* 1535 /*
1534 * If this is a task context, we need to check whether it is 1536 * If this is a task context, we need to check whether it is
@@ -1540,12 +1542,12 @@ static void __perf_event_read(void *info)
1540 if (ctx->task && cpuctx->task_ctx != ctx) 1542 if (ctx->task && cpuctx->task_ctx != ctx)
1541 return; 1543 return;
1542 1544
1543 local_irq_save(flags); 1545 raw_spin_lock(&ctx->lock);
1544 if (ctx->is_active) 1546 update_context_time(ctx);
1545 update_context_time(ctx);
1546 event->pmu->read(event);
1547 update_event_times(event); 1547 update_event_times(event);
1548 local_irq_restore(flags); 1548 raw_spin_unlock(&ctx->lock);
1549
1550 event->pmu->read(event);
1549} 1551}
1550 1552
1551static u64 perf_event_read(struct perf_event *event) 1553static u64 perf_event_read(struct perf_event *event)
@@ -1558,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event)
1558 smp_call_function_single(event->oncpu, 1560 smp_call_function_single(event->oncpu,
1559 __perf_event_read, event, 1); 1561 __perf_event_read, event, 1);
1560 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1562 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1563 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags;
1565
1566 raw_spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx);
1561 update_event_times(event); 1568 update_event_times(event);
1569 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1562 } 1570 }
1563 1571
1564 return atomic64_read(&event->count); 1572 return atomic64_read(&event->count);
@@ -1571,8 +1579,7 @@ static void
1571__perf_event_init_context(struct perf_event_context *ctx, 1579__perf_event_init_context(struct perf_event_context *ctx,
1572 struct task_struct *task) 1580 struct task_struct *task)
1573{ 1581{
1574 memset(ctx, 0, sizeof(*ctx)); 1582 raw_spin_lock_init(&ctx->lock);
1575 spin_lock_init(&ctx->lock);
1576 mutex_init(&ctx->mutex); 1583 mutex_init(&ctx->mutex);
1577 INIT_LIST_HEAD(&ctx->group_list); 1584 INIT_LIST_HEAD(&ctx->group_list);
1578 INIT_LIST_HEAD(&ctx->event_list); 1585 INIT_LIST_HEAD(&ctx->event_list);
@@ -1642,11 +1649,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1642 ctx = perf_lock_task_context(task, &flags); 1649 ctx = perf_lock_task_context(task, &flags);
1643 if (ctx) { 1650 if (ctx) {
1644 unclone_ctx(ctx); 1651 unclone_ctx(ctx);
1645 spin_unlock_irqrestore(&ctx->lock, flags); 1652 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1646 } 1653 }
1647 1654
1648 if (!ctx) { 1655 if (!ctx) {
1649 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1656 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1650 err = -ENOMEM; 1657 err = -ENOMEM;
1651 if (!ctx) 1658 if (!ctx)
1652 goto errout; 1659 goto errout;
@@ -1671,6 +1678,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1671 return ERR_PTR(err); 1678 return ERR_PTR(err);
1672} 1679}
1673 1680
1681static void perf_event_free_filter(struct perf_event *event);
1682
1674static void free_event_rcu(struct rcu_head *head) 1683static void free_event_rcu(struct rcu_head *head)
1675{ 1684{
1676 struct perf_event *event; 1685 struct perf_event *event;
@@ -1678,6 +1687,7 @@ static void free_event_rcu(struct rcu_head *head)
1678 event = container_of(head, struct perf_event, rcu_head); 1687 event = container_of(head, struct perf_event, rcu_head);
1679 if (event->ns) 1688 if (event->ns)
1680 put_pid_ns(event->ns); 1689 put_pid_ns(event->ns);
1690 perf_event_free_filter(event);
1681 kfree(event); 1691 kfree(event);
1682} 1692}
1683 1693
@@ -1709,16 +1719,10 @@ static void free_event(struct perf_event *event)
1709 call_rcu(&event->rcu_head, free_event_rcu); 1719 call_rcu(&event->rcu_head, free_event_rcu);
1710} 1720}
1711 1721
1712/* 1722int perf_event_release_kernel(struct perf_event *event)
1713 * Called when the last reference to the file is gone.
1714 */
1715static int perf_release(struct inode *inode, struct file *file)
1716{ 1723{
1717 struct perf_event *event = file->private_data;
1718 struct perf_event_context *ctx = event->ctx; 1724 struct perf_event_context *ctx = event->ctx;
1719 1725
1720 file->private_data = NULL;
1721
1722 WARN_ON_ONCE(ctx->parent_ctx); 1726 WARN_ON_ONCE(ctx->parent_ctx);
1723 mutex_lock(&ctx->mutex); 1727 mutex_lock(&ctx->mutex);
1724 perf_event_remove_from_context(event); 1728 perf_event_remove_from_context(event);
@@ -1733,6 +1737,19 @@ static int perf_release(struct inode *inode, struct file *file)
1733 1737
1734 return 0; 1738 return 0;
1735} 1739}
1740EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1741
1742/*
1743 * Called when the last reference to the file is gone.
1744 */
1745static int perf_release(struct inode *inode, struct file *file)
1746{
1747 struct perf_event *event = file->private_data;
1748
1749 file->private_data = NULL;
1750
1751 return perf_event_release_kernel(event);
1752}
1736 1753
1737static int perf_event_read_size(struct perf_event *event) 1754static int perf_event_read_size(struct perf_event *event)
1738{ 1755{
@@ -1759,91 +1776,94 @@ static int perf_event_read_size(struct perf_event *event)
1759 return size; 1776 return size;
1760} 1777}
1761 1778
1762static u64 perf_event_read_value(struct perf_event *event) 1779u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1763{ 1780{
1764 struct perf_event *child; 1781 struct perf_event *child;
1765 u64 total = 0; 1782 u64 total = 0;
1766 1783
1784 *enabled = 0;
1785 *running = 0;
1786
1787 mutex_lock(&event->child_mutex);
1767 total += perf_event_read(event); 1788 total += perf_event_read(event);
1768 list_for_each_entry(child, &event->child_list, child_list) 1789 *enabled += event->total_time_enabled +
1790 atomic64_read(&event->child_total_time_enabled);
1791 *running += event->total_time_running +
1792 atomic64_read(&event->child_total_time_running);
1793
1794 list_for_each_entry(child, &event->child_list, child_list) {
1769 total += perf_event_read(child); 1795 total += perf_event_read(child);
1796 *enabled += child->total_time_enabled;
1797 *running += child->total_time_running;
1798 }
1799 mutex_unlock(&event->child_mutex);
1770 1800
1771 return total; 1801 return total;
1772} 1802}
1773 1803EXPORT_SYMBOL_GPL(perf_event_read_value);
1774static int perf_event_read_entry(struct perf_event *event,
1775 u64 read_format, char __user *buf)
1776{
1777 int n = 0, count = 0;
1778 u64 values[2];
1779
1780 values[n++] = perf_event_read_value(event);
1781 if (read_format & PERF_FORMAT_ID)
1782 values[n++] = primary_event_id(event);
1783
1784 count = n * sizeof(u64);
1785
1786 if (copy_to_user(buf, values, count))
1787 return -EFAULT;
1788
1789 return count;
1790}
1791 1804
1792static int perf_event_read_group(struct perf_event *event, 1805static int perf_event_read_group(struct perf_event *event,
1793 u64 read_format, char __user *buf) 1806 u64 read_format, char __user *buf)
1794{ 1807{
1795 struct perf_event *leader = event->group_leader, *sub; 1808 struct perf_event *leader = event->group_leader, *sub;
1796 int n = 0, size = 0, err = -EFAULT; 1809 int n = 0, size = 0, ret = -EFAULT;
1797 u64 values[3]; 1810 struct perf_event_context *ctx = leader->ctx;
1811 u64 values[5];
1812 u64 count, enabled, running;
1813
1814 mutex_lock(&ctx->mutex);
1815 count = perf_event_read_value(leader, &enabled, &running);
1798 1816
1799 values[n++] = 1 + leader->nr_siblings; 1817 values[n++] = 1 + leader->nr_siblings;
1800 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1818 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1801 values[n++] = leader->total_time_enabled + 1819 values[n++] = enabled;
1802 atomic64_read(&leader->child_total_time_enabled); 1820 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1803 } 1821 values[n++] = running;
1804 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1822 values[n++] = count;
1805 values[n++] = leader->total_time_running + 1823 if (read_format & PERF_FORMAT_ID)
1806 atomic64_read(&leader->child_total_time_running); 1824 values[n++] = primary_event_id(leader);
1807 }
1808 1825
1809 size = n * sizeof(u64); 1826 size = n * sizeof(u64);
1810 1827
1811 if (copy_to_user(buf, values, size)) 1828 if (copy_to_user(buf, values, size))
1812 return -EFAULT; 1829 goto unlock;
1813
1814 err = perf_event_read_entry(leader, read_format, buf + size);
1815 if (err < 0)
1816 return err;
1817 1830
1818 size += err; 1831 ret = size;
1819 1832
1820 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1833 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1821 err = perf_event_read_entry(sub, read_format, 1834 n = 0;
1822 buf + size); 1835
1823 if (err < 0) 1836 values[n++] = perf_event_read_value(sub, &enabled, &running);
1824 return err; 1837 if (read_format & PERF_FORMAT_ID)
1838 values[n++] = primary_event_id(sub);
1839
1840 size = n * sizeof(u64);
1841
1842 if (copy_to_user(buf + ret, values, size)) {
1843 ret = -EFAULT;
1844 goto unlock;
1845 }
1825 1846
1826 size += err; 1847 ret += size;
1827 } 1848 }
1849unlock:
1850 mutex_unlock(&ctx->mutex);
1828 1851
1829 return size; 1852 return ret;
1830} 1853}
1831 1854
1832static int perf_event_read_one(struct perf_event *event, 1855static int perf_event_read_one(struct perf_event *event,
1833 u64 read_format, char __user *buf) 1856 u64 read_format, char __user *buf)
1834{ 1857{
1858 u64 enabled, running;
1835 u64 values[4]; 1859 u64 values[4];
1836 int n = 0; 1860 int n = 0;
1837 1861
1838 values[n++] = perf_event_read_value(event); 1862 values[n++] = perf_event_read_value(event, &enabled, &running);
1839 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1863 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1840 values[n++] = event->total_time_enabled + 1864 values[n++] = enabled;
1841 atomic64_read(&event->child_total_time_enabled); 1865 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1842 } 1866 values[n++] = running;
1843 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1844 values[n++] = event->total_time_running +
1845 atomic64_read(&event->child_total_time_running);
1846 }
1847 if (read_format & PERF_FORMAT_ID) 1867 if (read_format & PERF_FORMAT_ID)
1848 values[n++] = primary_event_id(event); 1868 values[n++] = primary_event_id(event);
1849 1869
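
The rewritten perf_event_read_group()/perf_event_read_one() now feed the enabled/running totals gathered by perf_event_read_value() straight into the values[] array, in the order nr, [TOTAL_TIME_ENABLED], [TOTAL_TIME_RUNNING], value, [ID] for the leader, followed by value, [ID] per sibling. A small userspace parser for a buffer laid out that way; the FMT_* constants are local stand-ins for the PERF_FORMAT_* flags so the example stays self-contained:

#include <stdio.h>
#include <stdint.h>

#define FMT_TOTAL_TIME_ENABLED  (1U << 0)
#define FMT_TOTAL_TIME_RUNNING  (1U << 1)
#define FMT_ID                  (1U << 2)

/*
 * Walk a buffer laid out the way perf_event_read_group() now writes it:
 *   nr, [enabled], [running], value, [id], then { value, [id] } per sibling.
 */
static void parse_group_read(const uint64_t *buf, unsigned int fmt)
{
        const uint64_t *p = buf;
        uint64_t nr = *p++;
        uint64_t enabled = (fmt & FMT_TOTAL_TIME_ENABLED) ? *p++ : 0;
        uint64_t running = (fmt & FMT_TOTAL_TIME_RUNNING) ? *p++ : 0;
        uint64_t i;

        printf("group of %llu, enabled=%llu running=%llu\n",
               (unsigned long long)nr, (unsigned long long)enabled,
               (unsigned long long)running);

        for (i = 0; i < nr; i++) {
                uint64_t value = *p++;
                uint64_t id = (fmt & FMT_ID) ? *p++ : 0;

                printf("  event %llu: value=%llu id=%llu\n",
                       (unsigned long long)i, (unsigned long long)value,
                       (unsigned long long)id);
        }
}

int main(void)
{
        /* Leader plus one sibling, both time fields, with IDs. */
        uint64_t buf[] = { 2, 1000, 900, 1234, 42, 5678, 43 };

        parse_group_read(buf, FMT_TOTAL_TIME_ENABLED |
                              FMT_TOTAL_TIME_RUNNING | FMT_ID);
        return 0;
}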
@@ -1874,12 +1894,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1874 return -ENOSPC; 1894 return -ENOSPC;
1875 1895
1876 WARN_ON_ONCE(event->ctx->parent_ctx); 1896 WARN_ON_ONCE(event->ctx->parent_ctx);
1877 mutex_lock(&event->child_mutex);
1878 if (read_format & PERF_FORMAT_GROUP) 1897 if (read_format & PERF_FORMAT_GROUP)
1879 ret = perf_event_read_group(event, read_format, buf); 1898 ret = perf_event_read_group(event, read_format, buf);
1880 else 1899 else
1881 ret = perf_event_read_one(event, read_format, buf); 1900 ret = perf_event_read_one(event, read_format, buf);
1882 mutex_unlock(&event->child_mutex);
1883 1901
1884 return ret; 1902 return ret;
1885} 1903}
@@ -1969,7 +1987,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1969 if (!value) 1987 if (!value)
1970 return -EINVAL; 1988 return -EINVAL;
1971 1989
1972 spin_lock_irq(&ctx->lock); 1990 raw_spin_lock_irq(&ctx->lock);
1973 if (event->attr.freq) { 1991 if (event->attr.freq) {
1974 if (value > sysctl_perf_event_sample_rate) { 1992 if (value > sysctl_perf_event_sample_rate) {
1975 ret = -EINVAL; 1993 ret = -EINVAL;
@@ -1982,12 +2000,13 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1982 event->hw.sample_period = value; 2000 event->hw.sample_period = value;
1983 } 2001 }
1984unlock: 2002unlock:
1985 spin_unlock_irq(&ctx->lock); 2003 raw_spin_unlock_irq(&ctx->lock);
1986 2004
1987 return ret; 2005 return ret;
1988} 2006}
1989 2007
1990int perf_event_set_output(struct perf_event *event, int output_fd); 2008static int perf_event_set_output(struct perf_event *event, int output_fd);
2009static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1991 2010
1992static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2011static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1993{ 2012{
@@ -2015,6 +2034,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2015 case PERF_EVENT_IOC_SET_OUTPUT: 2034 case PERF_EVENT_IOC_SET_OUTPUT:
2016 return perf_event_set_output(event, arg); 2035 return perf_event_set_output(event, arg);
2017 2036
2037 case PERF_EVENT_IOC_SET_FILTER:
2038 return perf_event_set_filter(event, (void __user *)arg);
2039
2018 default: 2040 default:
2019 return -ENOTTY; 2041 return -ENOTTY;
2020 } 2042 }
@@ -2105,49 +2127,31 @@ unlock:
2105 rcu_read_unlock(); 2127 rcu_read_unlock();
2106} 2128}
2107 2129
2108static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2130static unsigned long perf_data_size(struct perf_mmap_data *data)
2109{ 2131{
2110 struct perf_event *event = vma->vm_file->private_data; 2132 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2111 struct perf_mmap_data *data; 2133}
2112 int ret = VM_FAULT_SIGBUS;
2113
2114 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2115 if (vmf->pgoff == 0)
2116 ret = 0;
2117 return ret;
2118 }
2119
2120 rcu_read_lock();
2121 data = rcu_dereference(event->data);
2122 if (!data)
2123 goto unlock;
2124
2125 if (vmf->pgoff == 0) {
2126 vmf->page = virt_to_page(data->user_page);
2127 } else {
2128 int nr = vmf->pgoff - 1;
2129 2134
2130 if ((unsigned)nr > data->nr_pages) 2135#ifndef CONFIG_PERF_USE_VMALLOC
2131 goto unlock;
2132 2136
2133 if (vmf->flags & FAULT_FLAG_WRITE) 2137/*
2134 goto unlock; 2138 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2139 */
2135 2140
2136 vmf->page = virt_to_page(data->data_pages[nr]); 2141static struct page *
2137 } 2142perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2143{
2144 if (pgoff > data->nr_pages)
2145 return NULL;
2138 2146
2139 get_page(vmf->page); 2147 if (pgoff == 0)
2140 vmf->page->mapping = vma->vm_file->f_mapping; 2148 return virt_to_page(data->user_page);
2141 vmf->page->index = vmf->pgoff;
2142 2149
2143 ret = 0; 2150 return virt_to_page(data->data_pages[pgoff - 1]);
2144unlock:
2145 rcu_read_unlock();
2146
2147 return ret;
2148} 2151}
2149 2152
2150static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2153static struct perf_mmap_data *
2154perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2151{ 2155{
2152 struct perf_mmap_data *data; 2156 struct perf_mmap_data *data;
2153 unsigned long size; 2157 unsigned long size;
@@ -2172,19 +2176,10 @@ static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2172 goto fail_data_pages; 2176 goto fail_data_pages;
2173 } 2177 }
2174 2178
2179 data->data_order = 0;
2175 data->nr_pages = nr_pages; 2180 data->nr_pages = nr_pages;
2176 atomic_set(&data->lock, -1);
2177 2181
2178 if (event->attr.watermark) { 2182 return data;
2179 data->watermark = min_t(long, PAGE_SIZE * nr_pages,
2180 event->attr.wakeup_watermark);
2181 }
2182 if (!data->watermark)
2183 data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
2184
2185 rcu_assign_pointer(event->data, data);
2186
2187 return 0;
2188 2183
2189fail_data_pages: 2184fail_data_pages:
2190 for (i--; i >= 0; i--) 2185 for (i--; i >= 0; i--)
@@ -2196,7 +2191,7 @@ fail_user_page:
2196 kfree(data); 2191 kfree(data);
2197 2192
2198fail: 2193fail:
2199 return -ENOMEM; 2194 return NULL;
2200} 2195}
2201 2196
2202static void perf_mmap_free_page(unsigned long addr) 2197static void perf_mmap_free_page(unsigned long addr)
@@ -2207,28 +2202,170 @@ static void perf_mmap_free_page(unsigned long addr)
2207 __free_page(page); 2202 __free_page(page);
2208} 2203}
2209 2204
2210static void __perf_mmap_data_free(struct rcu_head *rcu_head) 2205static void perf_mmap_data_free(struct perf_mmap_data *data)
2211{ 2206{
2212 struct perf_mmap_data *data;
2213 int i; 2207 int i;
2214 2208
2215 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2216
2217 perf_mmap_free_page((unsigned long)data->user_page); 2209 perf_mmap_free_page((unsigned long)data->user_page);
2218 for (i = 0; i < data->nr_pages; i++) 2210 for (i = 0; i < data->nr_pages; i++)
2219 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2211 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2212 kfree(data);
2213}
2214
2215#else
2216
2217/*
2218 * Back perf_mmap() with vmalloc memory.
2219 *
2220 * Required for architectures that have d-cache aliasing issues.
2221 */
2222
2223static struct page *
2224perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2225{
2226 if (pgoff > (1UL << data->data_order))
2227 return NULL;
2228
2229 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2230}
2231
2232static void perf_mmap_unmark_page(void *addr)
2233{
2234 struct page *page = vmalloc_to_page(addr);
2235
2236 page->mapping = NULL;
2237}
2238
2239static void perf_mmap_data_free_work(struct work_struct *work)
2240{
2241 struct perf_mmap_data *data;
2242 void *base;
2243 int i, nr;
2220 2244
2245 data = container_of(work, struct perf_mmap_data, work);
2246 nr = 1 << data->data_order;
2247
2248 base = data->user_page;
2249 for (i = 0; i < nr + 1; i++)
2250 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2251
2252 vfree(base);
2253 kfree(data);
2254}
2255
2256static void perf_mmap_data_free(struct perf_mmap_data *data)
2257{
2258 schedule_work(&data->work);
2259}
2260
2261static struct perf_mmap_data *
2262perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2263{
2264 struct perf_mmap_data *data;
2265 unsigned long size;
2266 void *all_buf;
2267
2268 WARN_ON(atomic_read(&event->mmap_count));
2269
2270 size = sizeof(struct perf_mmap_data);
2271 size += sizeof(void *);
2272
2273 data = kzalloc(size, GFP_KERNEL);
2274 if (!data)
2275 goto fail;
2276
2277 INIT_WORK(&data->work, perf_mmap_data_free_work);
2278
2279 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2280 if (!all_buf)
2281 goto fail_all_buf;
2282
2283 data->user_page = all_buf;
2284 data->data_pages[0] = all_buf + PAGE_SIZE;
2285 data->data_order = ilog2(nr_pages);
2286 data->nr_pages = 1;
2287
2288 return data;
2289
2290fail_all_buf:
2221 kfree(data); 2291 kfree(data);
2292
2293fail:
2294 return NULL;
2295}
2296
2297#endif
2298
2299static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2300{
2301 struct perf_event *event = vma->vm_file->private_data;
2302 struct perf_mmap_data *data;
2303 int ret = VM_FAULT_SIGBUS;
2304
2305 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2306 if (vmf->pgoff == 0)
2307 ret = 0;
2308 return ret;
2309 }
2310
2311 rcu_read_lock();
2312 data = rcu_dereference(event->data);
2313 if (!data)
2314 goto unlock;
2315
2316 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2317 goto unlock;
2318
2319 vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2320 if (!vmf->page)
2321 goto unlock;
2322
2323 get_page(vmf->page);
2324 vmf->page->mapping = vma->vm_file->f_mapping;
2325 vmf->page->index = vmf->pgoff;
2326
2327 ret = 0;
2328unlock:
2329 rcu_read_unlock();
2330
2331 return ret;
2332}
2333
2334static void
2335perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2336{
2337 long max_size = perf_data_size(data);
2338
2339 atomic_set(&data->lock, -1);
2340
2341 if (event->attr.watermark) {
2342 data->watermark = min_t(long, max_size,
2343 event->attr.wakeup_watermark);
2344 }
2345
2346 if (!data->watermark)
2347 data->watermark = max_size / 2;
2348
2349
2350 rcu_assign_pointer(event->data, data);
2351}
2352
2353static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2354{
2355 struct perf_mmap_data *data;
2356
2357 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2358 perf_mmap_data_free(data);
2222} 2359}
2223 2360
2224static void perf_mmap_data_free(struct perf_event *event) 2361static void perf_mmap_data_release(struct perf_event *event)
2225{ 2362{
2226 struct perf_mmap_data *data = event->data; 2363 struct perf_mmap_data *data = event->data;
2227 2364
2228 WARN_ON(atomic_read(&event->mmap_count)); 2365 WARN_ON(atomic_read(&event->mmap_count));
2229 2366
2230 rcu_assign_pointer(event->data, NULL); 2367 rcu_assign_pointer(event->data, NULL);
2231 call_rcu(&data->rcu_head, __perf_mmap_data_free); 2368 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2232} 2369}
2233 2370
2234static void perf_mmap_open(struct vm_area_struct *vma) 2371static void perf_mmap_open(struct vm_area_struct *vma)
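
The new CONFIG_PERF_USE_VMALLOC variant above backs the whole mmap area (control page plus data pages) with a single vmalloc_user() allocation and resolves page offsets arithmetically in perf_mmap_to_page(), which is what lets architectures with D-cache aliasing share one mapping. A condensed userspace analogue, with calloc() standing in for vmalloc_user() and the bounds check taken from the patch:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Condensed view of the vmalloc-backed mmap data. */
struct mmap_data {
        void  *user_page;       /* control page at offset 0 */
        void  *data_pages[1];   /* only slot 0 is used in this mode */
        int    data_order;      /* log2 of data pages in the single chunk */
        int    nr_pages;
};

/* Contiguous-buffer lookup: offset arithmetic instead of a page array. */
static void *mmap_to_addr(struct mmap_data *d, unsigned long pgoff)
{
        if (pgoff > (1UL << d->data_order))
                return NULL;
        return (char *)d->user_page + pgoff * PAGE_SIZE;
}

static struct mmap_data *data_alloc(unsigned long nr_pages, int order)
{
        struct mmap_data *d = calloc(1, sizeof(*d));
        void *all_buf = calloc(nr_pages + 1, PAGE_SIZE);   /* vmalloc_user() stand-in */

        if (!d || !all_buf) {
                free(d);
                free(all_buf);
                return NULL;
        }
        d->user_page = all_buf;
        d->data_pages[0] = (char *)all_buf + PAGE_SIZE;
        d->data_order = order;
        d->nr_pages = 1;
        return d;
}

int main(void)
{
        struct mmap_data *d = data_alloc(4, 2);             /* 4 data pages */

        if (!d)
                return 1;
        printf("control page %p, data page 3 at %p\n",
               d->user_page, mmap_to_addr(d, 3));
        printf("pgoff 5 is %s\n", mmap_to_addr(d, 5) ? "valid" : "out of range");
        free(d->user_page);
        free(d);
        return 0;
}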
@@ -2244,16 +2381,17 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2244 2381
2245 WARN_ON_ONCE(event->ctx->parent_ctx); 2382 WARN_ON_ONCE(event->ctx->parent_ctx);
2246 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2383 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2384 unsigned long size = perf_data_size(event->data);
2247 struct user_struct *user = current_user(); 2385 struct user_struct *user = current_user();
2248 2386
2249 atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); 2387 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2250 vma->vm_mm->locked_vm -= event->data->nr_locked; 2388 vma->vm_mm->locked_vm -= event->data->nr_locked;
2251 perf_mmap_data_free(event); 2389 perf_mmap_data_release(event);
2252 mutex_unlock(&event->mmap_mutex); 2390 mutex_unlock(&event->mmap_mutex);
2253 } 2391 }
2254} 2392}
2255 2393
2256static struct vm_operations_struct perf_mmap_vmops = { 2394static const struct vm_operations_struct perf_mmap_vmops = {
2257 .open = perf_mmap_open, 2395 .open = perf_mmap_open,
2258 .close = perf_mmap_close, 2396 .close = perf_mmap_close,
2259 .fault = perf_mmap_fault, 2397 .fault = perf_mmap_fault,
@@ -2266,6 +2404,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2266 unsigned long user_locked, user_lock_limit; 2404 unsigned long user_locked, user_lock_limit;
2267 struct user_struct *user = current_user(); 2405 struct user_struct *user = current_user();
2268 unsigned long locked, lock_limit; 2406 unsigned long locked, lock_limit;
2407 struct perf_mmap_data *data;
2269 unsigned long vma_size; 2408 unsigned long vma_size;
2270 unsigned long nr_pages; 2409 unsigned long nr_pages;
2271 long user_extra, extra; 2410 long user_extra, extra;
@@ -2328,10 +2467,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2328 } 2467 }
2329 2468
2330 WARN_ON(event->data); 2469 WARN_ON(event->data);
2331 ret = perf_mmap_data_alloc(event, nr_pages); 2470
2332 if (ret) 2471 data = perf_mmap_data_alloc(event, nr_pages);
2472 ret = -ENOMEM;
2473 if (!data)
2333 goto unlock; 2474 goto unlock;
2334 2475
2476 ret = 0;
2477 perf_mmap_data_init(event, data);
2478
2335 atomic_set(&event->mmap_count, 1); 2479 atomic_set(&event->mmap_count, 1);
2336 atomic_long_add(user_extra, &user->locked_vm); 2480 atomic_long_add(user_extra, &user->locked_vm);
2337 vma->vm_mm->locked_vm += extra; 2481 vma->vm_mm->locked_vm += extra;
@@ -2519,7 +2663,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2519 if (!data->writable) 2663 if (!data->writable)
2520 return true; 2664 return true;
2521 2665
2522 mask = (data->nr_pages << PAGE_SHIFT) - 1; 2666 mask = perf_data_size(data) - 1;
2523 2667
2524 offset = (offset - tail) & mask; 2668 offset = (offset - tail) & mask;
2525 head = (head - tail) & mask; 2669 head = (head - tail) & mask;
@@ -2558,20 +2702,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2558static void perf_output_lock(struct perf_output_handle *handle) 2702static void perf_output_lock(struct perf_output_handle *handle)
2559{ 2703{
2560 struct perf_mmap_data *data = handle->data; 2704 struct perf_mmap_data *data = handle->data;
2561 int cpu; 2705 int cur, cpu = get_cpu();
2562 2706
2563 handle->locked = 0; 2707 handle->locked = 0;
2564 2708
2565 local_irq_save(handle->flags); 2709 for (;;) {
2566 cpu = smp_processor_id(); 2710 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2567 2711 if (cur == -1) {
2568 if (in_nmi() && atomic_read(&data->lock) == cpu) 2712 handle->locked = 1;
2569 return; 2713 break;
2714 }
2715 if (cur == cpu)
2716 break;
2570 2717
2571 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2572 cpu_relax(); 2718 cpu_relax();
2573 2719 }
2574 handle->locked = 1;
2575} 2720}
2576 2721
2577static void perf_output_unlock(struct perf_output_handle *handle) 2722static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2617,14 +2762,14 @@ again:
2617 if (atomic_xchg(&data->wakeup, 0)) 2762 if (atomic_xchg(&data->wakeup, 0))
2618 perf_output_wakeup(handle); 2763 perf_output_wakeup(handle);
2619out: 2764out:
2620 local_irq_restore(handle->flags); 2765 put_cpu();
2621} 2766}
2622 2767
2623void perf_output_copy(struct perf_output_handle *handle, 2768void perf_output_copy(struct perf_output_handle *handle,
2624 const void *buf, unsigned int len) 2769 const void *buf, unsigned int len)
2625{ 2770{
2626 unsigned int pages_mask; 2771 unsigned int pages_mask;
2627 unsigned int offset; 2772 unsigned long offset;
2628 unsigned int size; 2773 unsigned int size;
2629 void **pages; 2774 void **pages;
2630 2775
@@ -2633,12 +2778,14 @@ void perf_output_copy(struct perf_output_handle *handle,
2633 pages = handle->data->data_pages; 2778 pages = handle->data->data_pages;
2634 2779
2635 do { 2780 do {
2636 unsigned int page_offset; 2781 unsigned long page_offset;
2782 unsigned long page_size;
2637 int nr; 2783 int nr;
2638 2784
2639 nr = (offset >> PAGE_SHIFT) & pages_mask; 2785 nr = (offset >> PAGE_SHIFT) & pages_mask;
2640 page_offset = offset & (PAGE_SIZE - 1); 2786 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2641 size = min_t(unsigned int, PAGE_SIZE - page_offset, len); 2787 page_offset = offset & (page_size - 1);
2788 size = min_t(unsigned int, page_size - page_offset, len);
2642 2789
2643 memcpy(pages[nr] + page_offset, buf, size); 2790 memcpy(pages[nr] + page_offset, buf, size);
2644 2791
@@ -3126,15 +3273,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3126{ 3273{
3127 struct perf_event *event; 3274 struct perf_event *event;
3128 3275
3129 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3130 return;
3131
3132 rcu_read_lock();
3133 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3276 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3134 if (perf_event_task_match(event)) 3277 if (perf_event_task_match(event))
3135 perf_event_task_output(event, task_event); 3278 perf_event_task_output(event, task_event);
3136 } 3279 }
3137 rcu_read_unlock();
3138} 3280}
3139 3281
3140static void perf_event_task_event(struct perf_task_event *task_event) 3282static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3142,11 +3284,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3142 struct perf_cpu_context *cpuctx; 3284 struct perf_cpu_context *cpuctx;
3143 struct perf_event_context *ctx = task_event->task_ctx; 3285 struct perf_event_context *ctx = task_event->task_ctx;
3144 3286
3287 rcu_read_lock();
3145 cpuctx = &get_cpu_var(perf_cpu_context); 3288 cpuctx = &get_cpu_var(perf_cpu_context);
3146 perf_event_task_ctx(&cpuctx->ctx, task_event); 3289 perf_event_task_ctx(&cpuctx->ctx, task_event);
3147 put_cpu_var(perf_cpu_context); 3290 put_cpu_var(perf_cpu_context);
3148 3291
3149 rcu_read_lock();
3150 if (!ctx) 3292 if (!ctx)
3151 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3293 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3152 if (ctx) 3294 if (ctx)
@@ -3238,15 +3380,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3238{ 3380{
3239 struct perf_event *event; 3381 struct perf_event *event;
3240 3382
3241 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3242 return;
3243
3244 rcu_read_lock();
3245 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3383 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3246 if (perf_event_comm_match(event)) 3384 if (perf_event_comm_match(event))
3247 perf_event_comm_output(event, comm_event); 3385 perf_event_comm_output(event, comm_event);
3248 } 3386 }
3249 rcu_read_unlock();
3250} 3387}
3251 3388
3252static void perf_event_comm_event(struct perf_comm_event *comm_event) 3389static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3257,7 +3394,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3257 char comm[TASK_COMM_LEN]; 3394 char comm[TASK_COMM_LEN];
3258 3395
3259 memset(comm, 0, sizeof(comm)); 3396 memset(comm, 0, sizeof(comm));
3260 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3397 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3261 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3398 size = ALIGN(strlen(comm)+1, sizeof(u64));
3262 3399
3263 comm_event->comm = comm; 3400 comm_event->comm = comm;
@@ -3265,11 +3402,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3265 3402
3266 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3403 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3267 3404
3405 rcu_read_lock();
3268 cpuctx = &get_cpu_var(perf_cpu_context); 3406 cpuctx = &get_cpu_var(perf_cpu_context);
3269 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3407 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3270 put_cpu_var(perf_cpu_context); 3408 put_cpu_var(perf_cpu_context);
3271 3409
3272 rcu_read_lock();
3273 /* 3410 /*
3274 * doesn't really matter which of the child contexts the 3411 * doesn't really matter which of the child contexts the
3275 * events end up in. 3412
@@ -3362,15 +3499,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3362{ 3499{
3363 struct perf_event *event; 3500 struct perf_event *event;
3364 3501
3365 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3366 return;
3367
3368 rcu_read_lock();
3369 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3502 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3370 if (perf_event_mmap_match(event, mmap_event)) 3503 if (perf_event_mmap_match(event, mmap_event))
3371 perf_event_mmap_output(event, mmap_event); 3504 perf_event_mmap_output(event, mmap_event);
3372 } 3505 }
3373 rcu_read_unlock();
3374} 3506}
3375 3507
3376static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3508static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3426,11 +3558,11 @@ got_name:
3426 3558
3427 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3559 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3428 3560
3561 rcu_read_lock();
3429 cpuctx = &get_cpu_var(perf_cpu_context); 3562 cpuctx = &get_cpu_var(perf_cpu_context);
3430 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3563 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3431 put_cpu_var(perf_cpu_context); 3564 put_cpu_var(perf_cpu_context);
3432 3565
3433 rcu_read_lock();
3434 /* 3566 /*
3435 * doesn't really matter which of the child contexts the 3567 * doesn't really matter which of the child contexts the
3436 * events ends up in. 3568 * events ends up in.
@@ -3569,7 +3701,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3569 perf_event_disable(event); 3701 perf_event_disable(event);
3570 } 3702 }
3571 3703
3572 perf_event_output(event, nmi, data, regs); 3704 if (event->overflow_handler)
3705 event->overflow_handler(event, nmi, data, regs);
3706 else
3707 perf_event_output(event, nmi, data, regs);
3708
3573 return ret; 3709 return ret;
3574} 3710}
3575 3711
@@ -3614,16 +3750,16 @@ again:
3614 return nr; 3750 return nr;
3615} 3751}
3616 3752
3617static void perf_swevent_overflow(struct perf_event *event, 3753static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3618 int nmi, struct perf_sample_data *data, 3754 int nmi, struct perf_sample_data *data,
3619 struct pt_regs *regs) 3755 struct pt_regs *regs)
3620{ 3756{
3621 struct hw_perf_event *hwc = &event->hw; 3757 struct hw_perf_event *hwc = &event->hw;
3622 int throttle = 0; 3758 int throttle = 0;
3623 u64 overflow;
3624 3759
3625 data->period = event->hw.last_period; 3760 data->period = event->hw.last_period;
3626 overflow = perf_swevent_set_period(event); 3761 if (!overflow)
3762 overflow = perf_swevent_set_period(event);
3627 3763
3628 if (hwc->interrupts == MAX_INTERRUPTS) 3764 if (hwc->interrupts == MAX_INTERRUPTS)
3629 return; 3765 return;
@@ -3656,14 +3792,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3656 3792
3657 atomic64_add(nr, &event->count); 3793 atomic64_add(nr, &event->count);
3658 3794
3795 if (!regs)
3796 return;
3797
3659 if (!hwc->sample_period) 3798 if (!hwc->sample_period)
3660 return; 3799 return;
3661 3800
3662 if (!regs) 3801 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3802 return perf_swevent_overflow(event, 1, nmi, data, regs);
3803
3804 if (atomic64_add_negative(nr, &hwc->period_left))
3663 return; 3805 return;
3664 3806
3665 if (!atomic64_add_negative(nr, &hwc->period_left)) 3807 perf_swevent_overflow(event, 0, nmi, data, regs);
3666 perf_swevent_overflow(event, nmi, data, regs);
3667} 3808}
3668 3809
3669static int perf_swevent_is_counting(struct perf_event *event) 3810static int perf_swevent_is_counting(struct perf_event *event)
@@ -3696,25 +3837,44 @@ static int perf_swevent_is_counting(struct perf_event *event)
3696 return 1; 3837 return 1;
3697} 3838}
3698 3839
3840static int perf_tp_event_match(struct perf_event *event,
3841 struct perf_sample_data *data);
3842
3843static int perf_exclude_event(struct perf_event *event,
3844 struct pt_regs *regs)
3845{
3846 if (regs) {
3847 if (event->attr.exclude_user && user_mode(regs))
3848 return 1;
3849
3850 if (event->attr.exclude_kernel && !user_mode(regs))
3851 return 1;
3852 }
3853
3854 return 0;
3855}
3856
3699static int perf_swevent_match(struct perf_event *event, 3857static int perf_swevent_match(struct perf_event *event,
3700 enum perf_type_id type, 3858 enum perf_type_id type,
3701 u32 event_id, struct pt_regs *regs) 3859 u32 event_id,
3860 struct perf_sample_data *data,
3861 struct pt_regs *regs)
3702{ 3862{
3703 if (!perf_swevent_is_counting(event)) 3863 if (!perf_swevent_is_counting(event))
3704 return 0; 3864 return 0;
3705 3865
3706 if (event->attr.type != type) 3866 if (event->attr.type != type)
3707 return 0; 3867 return 0;
3868
3708 if (event->attr.config != event_id) 3869 if (event->attr.config != event_id)
3709 return 0; 3870 return 0;
3710 3871
3711 if (regs) { 3872 if (perf_exclude_event(event, regs))
3712 if (event->attr.exclude_user && user_mode(regs)) 3873 return 0;
3713 return 0;
3714 3874
3715 if (event->attr.exclude_kernel && !user_mode(regs)) 3875 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3716 return 0; 3876 !perf_tp_event_match(event, data))
3717 } 3877 return 0;
3718 3878
3719 return 1; 3879 return 1;
3720} 3880}
@@ -3727,49 +3887,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3727{ 3887{
3728 struct perf_event *event; 3888 struct perf_event *event;
3729 3889
3730 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3731 return;
3732
3733 rcu_read_lock();
3734 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3890 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3735 if (perf_swevent_match(event, type, event_id, regs)) 3891 if (perf_swevent_match(event, type, event_id, data, regs))
3736 perf_swevent_add(event, nr, nmi, data, regs); 3892 perf_swevent_add(event, nr, nmi, data, regs);
3737 } 3893 }
3738 rcu_read_unlock();
3739} 3894}
3740 3895
3741static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 3896int perf_swevent_get_recursion_context(void)
3742{ 3897{
3898 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3899 int rctx;
3900
3743 if (in_nmi()) 3901 if (in_nmi())
3744 return &cpuctx->recursion[3]; 3902 rctx = 3;
3903 else if (in_irq())
3904 rctx = 2;
3905 else if (in_softirq())
3906 rctx = 1;
3907 else
3908 rctx = 0;
3745 3909
3746 if (in_irq()) 3910 if (cpuctx->recursion[rctx]) {
3747 return &cpuctx->recursion[2]; 3911 put_cpu_var(perf_cpu_context);
3912 return -1;
3913 }
3748 3914
3749 if (in_softirq()) 3915 cpuctx->recursion[rctx]++;
3750 return &cpuctx->recursion[1]; 3916 barrier();
3751 3917
3752 return &cpuctx->recursion[0]; 3918 return rctx;
3919}
3920EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3921
3922void perf_swevent_put_recursion_context(int rctx)
3923{
3924 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3925 barrier();
3926 cpuctx->recursion[rctx]--;
3927 put_cpu_var(perf_cpu_context);
3753} 3928}
3929EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3754 3930
3755static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 3931static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3756 u64 nr, int nmi, 3932 u64 nr, int nmi,
3757 struct perf_sample_data *data, 3933 struct perf_sample_data *data,
3758 struct pt_regs *regs) 3934 struct pt_regs *regs)
3759{ 3935{
3760 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3936 struct perf_cpu_context *cpuctx;
3761 int *recursion = perf_swevent_recursion_context(cpuctx);
3762 struct perf_event_context *ctx; 3937 struct perf_event_context *ctx;
3763 3938
3764 if (*recursion) 3939 cpuctx = &__get_cpu_var(perf_cpu_context);
3765 goto out; 3940 rcu_read_lock();
3766
3767 (*recursion)++;
3768 barrier();
3769
3770 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 3941 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3771 nr, nmi, data, regs); 3942 nr, nmi, data, regs);
3772 rcu_read_lock();
3773 /* 3943 /*
3774 * doesn't really matter which of the child contexts the 3944 * doesn't really matter which of the child contexts the
3775 * events end up in. 3945
@@ -3778,23 +3948,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3778 if (ctx) 3948 if (ctx)
3779 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 3949 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3780 rcu_read_unlock(); 3950 rcu_read_unlock();
3781
3782 barrier();
3783 (*recursion)--;
3784
3785out:
3786 put_cpu_var(perf_cpu_context);
3787} 3951}
3788 3952
3789void __perf_sw_event(u32 event_id, u64 nr, int nmi, 3953void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3790 struct pt_regs *regs, u64 addr) 3954 struct pt_regs *regs, u64 addr)
3791{ 3955{
3792 struct perf_sample_data data = { 3956 struct perf_sample_data data;
3793 .addr = addr, 3957 int rctx;
3794 };
3795 3958
3796 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 3959 rctx = perf_swevent_get_recursion_context();
3797 &data, regs); 3960 if (rctx < 0)
3961 return;
3962
3963 data.addr = addr;
3964 data.raw = NULL;
3965
3966 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3967
3968 perf_swevent_put_recursion_context(rctx);
3798} 3969}
3799 3970
3800static void perf_swevent_read(struct perf_event *event) 3971static void perf_swevent_read(struct perf_event *event)
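The recursion-context helpers exported above are what callers outside this file are expected to use to bracket software-event injection, exactly as the reworked __perf_sw_event() does; the tracepoint path relies on the same protection (see the "Trace events already protected against recursion" comment further down). A minimal sketch of such a caller, assuming the usual kernel headers; the function name is illustrative only:

	/* Illustrative sketch, not part of this patch. */
	static void example_emit_tp_sample(int event_id, u64 addr, u64 count,
					   void *record, int entry_size)
	{
		int rctx;

		rctx = perf_swevent_get_recursion_context();
		if (rctx < 0)
			return;		/* this context is already inside a swevent */

		/* perf_tp_event() is exported by this file, see below */
		perf_tp_event(event_id, addr, count, record, entry_size);

		perf_swevent_put_recursion_context(rctx);
	}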
@@ -3839,6 +4010,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3839 event->pmu->read(event); 4010 event->pmu->read(event);
3840 4011
3841 data.addr = 0; 4012 data.addr = 0;
4013 data.raw = NULL;
4014 data.period = event->hw.last_period;
3842 regs = get_irq_regs(); 4015 regs = get_irq_regs();
3843 /* 4016 /*
3844 * In case we exclude kernel IPs or are somehow not in interrupt 4017 * In case we exclude kernel IPs or are somehow not in interrupt
@@ -3849,8 +4022,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3849 regs = task_pt_regs(current); 4022 regs = task_pt_regs(current);
3850 4023
3851 if (regs) { 4024 if (regs) {
3852 if (perf_event_overflow(event, 0, &data, regs)) 4025 if (!(event->attr.exclude_idle && current->pid == 0))
3853 ret = HRTIMER_NORESTART; 4026 if (perf_event_overflow(event, 0, &data, regs))
4027 ret = HRTIMER_NORESTART;
3854 } 4028 }
3855 4029
3856 period = max_t(u64, 10000, event->hw.sample_period); 4030 period = max_t(u64, 10000, event->hw.sample_period);
@@ -3859,6 +4033,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3859 return ret; 4033 return ret;
3860} 4034}
3861 4035
4036static void perf_swevent_start_hrtimer(struct perf_event *event)
4037{
4038 struct hw_perf_event *hwc = &event->hw;
4039
4040 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4041 hwc->hrtimer.function = perf_swevent_hrtimer;
4042 if (hwc->sample_period) {
4043 u64 period;
4044
4045 if (hwc->remaining) {
4046 if (hwc->remaining < 0)
4047 period = 10000;
4048 else
4049 period = hwc->remaining;
4050 hwc->remaining = 0;
4051 } else {
4052 period = max_t(u64, 10000, hwc->sample_period);
4053 }
4054 __hrtimer_start_range_ns(&hwc->hrtimer,
4055 ns_to_ktime(period), 0,
4056 HRTIMER_MODE_REL, 0);
4057 }
4058}
4059
4060static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4061{
4062 struct hw_perf_event *hwc = &event->hw;
4063
4064 if (hwc->sample_period) {
4065 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4066 hwc->remaining = ktime_to_ns(remaining);
4067
4068 hrtimer_cancel(&hwc->hrtimer);
4069 }
4070}
4071
3862/* 4072/*
3863 * Software event: cpu wall time clock 4073 * Software event: cpu wall time clock
3864 */ 4074 */
@@ -3870,8 +4080,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
3870 u64 now; 4080 u64 now;
3871 4081
3872 now = cpu_clock(cpu); 4082 now = cpu_clock(cpu);
3873 prev = atomic64_read(&event->hw.prev_count); 4083 prev = atomic64_xchg(&event->hw.prev_count, now);
3874 atomic64_set(&event->hw.prev_count, now);
3875 atomic64_add(now - prev, &event->count); 4084 atomic64_add(now - prev, &event->count);
3876} 4085}
3877 4086
@@ -3881,22 +4090,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
3881 int cpu = raw_smp_processor_id(); 4090 int cpu = raw_smp_processor_id();
3882 4091
3883 atomic64_set(&hwc->prev_count, cpu_clock(cpu)); 4092 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3884 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4093 perf_swevent_start_hrtimer(event);
3885 hwc->hrtimer.function = perf_swevent_hrtimer;
3886 if (hwc->sample_period) {
3887 u64 period = max_t(u64, 10000, hwc->sample_period);
3888 __hrtimer_start_range_ns(&hwc->hrtimer,
3889 ns_to_ktime(period), 0,
3890 HRTIMER_MODE_REL, 0);
3891 }
3892 4094
3893 return 0; 4095 return 0;
3894} 4096}
3895 4097
3896static void cpu_clock_perf_event_disable(struct perf_event *event) 4098static void cpu_clock_perf_event_disable(struct perf_event *event)
3897{ 4099{
3898 if (event->hw.sample_period) 4100 perf_swevent_cancel_hrtimer(event);
3899 hrtimer_cancel(&event->hw.hrtimer);
3900 cpu_clock_perf_event_update(event); 4101 cpu_clock_perf_event_update(event);
3901} 4102}
3902 4103
@@ -3933,22 +4134,15 @@ static int task_clock_perf_event_enable(struct perf_event *event)
3933 now = event->ctx->time; 4134 now = event->ctx->time;
3934 4135
3935 atomic64_set(&hwc->prev_count, now); 4136 atomic64_set(&hwc->prev_count, now);
3936 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4137
3937 hwc->hrtimer.function = perf_swevent_hrtimer; 4138 perf_swevent_start_hrtimer(event);
3938 if (hwc->sample_period) {
3939 u64 period = max_t(u64, 10000, hwc->sample_period);
3940 __hrtimer_start_range_ns(&hwc->hrtimer,
3941 ns_to_ktime(period), 0,
3942 HRTIMER_MODE_REL, 0);
3943 }
3944 4139
3945 return 0; 4140 return 0;
3946} 4141}
3947 4142
3948static void task_clock_perf_event_disable(struct perf_event *event) 4143static void task_clock_perf_event_disable(struct perf_event *event)
3949{ 4144{
3950 if (event->hw.sample_period) 4145 perf_swevent_cancel_hrtimer(event);
3951 hrtimer_cancel(&event->hw.hrtimer);
3952 task_clock_perf_event_update(event, event->ctx->time); 4146 task_clock_perf_event_update(event, event->ctx->time);
3953 4147
3954} 4148}
@@ -3976,6 +4170,7 @@ static const struct pmu perf_ops_task_clock = {
3976}; 4170};
3977 4171
3978#ifdef CONFIG_EVENT_PROFILE 4172#ifdef CONFIG_EVENT_PROFILE
4173
3979void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4174void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
3980 int entry_size) 4175 int entry_size)
3981{ 4176{
@@ -3994,13 +4189,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
3994 if (!regs) 4189 if (!regs)
3995 regs = task_pt_regs(current); 4190 regs = task_pt_regs(current);
3996 4191
4192 /* Trace events already protected against recursion */
3997 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4193 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
3998 &data, regs); 4194 &data, regs);
3999} 4195}
4000EXPORT_SYMBOL_GPL(perf_tp_event); 4196EXPORT_SYMBOL_GPL(perf_tp_event);
4001 4197
4002extern int ftrace_profile_enable(int); 4198static int perf_tp_event_match(struct perf_event *event,
4003extern void ftrace_profile_disable(int); 4199 struct perf_sample_data *data)
4200{
4201 void *record = data->raw->data;
4202
4203 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4204 return 1;
4205 return 0;
4206}
4004 4207
4005static void tp_perf_event_destroy(struct perf_event *event) 4208static void tp_perf_event_destroy(struct perf_event *event)
4006{ 4209{
@@ -4025,11 +4228,93 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4025 4228
4026 return &perf_ops_generic; 4229 return &perf_ops_generic;
4027} 4230}
4231
4232static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4233{
4234 char *filter_str;
4235 int ret;
4236
4237 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4238 return -EINVAL;
4239
4240 filter_str = strndup_user(arg, PAGE_SIZE);
4241 if (IS_ERR(filter_str))
4242 return PTR_ERR(filter_str);
4243
4244 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4245
4246 kfree(filter_str);
4247 return ret;
4248}
4249
4250static void perf_event_free_filter(struct perf_event *event)
4251{
4252 ftrace_profile_free_filter(event);
4253}
4254
4028#else 4255#else
4256
4257static int perf_tp_event_match(struct perf_event *event,
4258 struct perf_sample_data *data)
4259{
4260 return 1;
4261}
4262
4029static const struct pmu *tp_perf_event_init(struct perf_event *event) 4263static const struct pmu *tp_perf_event_init(struct perf_event *event)
4030{ 4264{
4031 return NULL; 4265 return NULL;
4032} 4266}
4267
4268static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4269{
4270 return -ENOENT;
4271}
4272
4273static void perf_event_free_filter(struct perf_event *event)
4274{
4275}
4276
4277#endif /* CONFIG_EVENT_PROFILE */
4278
4279#ifdef CONFIG_HAVE_HW_BREAKPOINT
4280static void bp_perf_event_destroy(struct perf_event *event)
4281{
4282 release_bp_slot(event);
4283}
4284
4285static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4286{
4287 int err;
4288
4289 err = register_perf_hw_breakpoint(bp);
4290 if (err)
4291 return ERR_PTR(err);
4292
4293 bp->destroy = bp_perf_event_destroy;
4294
4295 return &perf_ops_bp;
4296}
4297
4298void perf_bp_event(struct perf_event *bp, void *data)
4299{
4300 struct perf_sample_data sample;
4301 struct pt_regs *regs = data;
4302
4303 sample.raw = NULL;
4304 sample.addr = bp->attr.bp_addr;
4305
4306 if (!perf_exclude_event(bp, regs))
4307 perf_swevent_add(bp, 1, 1, &sample, regs);
4308}
4309#else
4310static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4311{
4312 return NULL;
4313}
4314
4315void perf_bp_event(struct perf_event *bp, void *regs)
4316{
4317}
4033#endif 4318#endif
4034 4319
4035atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4320atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
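The new perf_event_set_filter()/perf_event_free_filter() pair above hooks tracepoint events into the ftrace filter engine. A hedged userspace sketch of how it would be driven; it assumes fd comes from perf_event_open() on a PERF_TYPE_TRACEPOINT event and that the PERF_EVENT_IOC_SET_FILTER ioctl (defined outside kernel/, so not visible in this diffstat) routes to perf_event_set_filter():

	/* Hedged userspace sketch, not part of this patch. */
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/perf_event.h>

	/* Attach an ftrace filter string to an already-open tracepoint event fd. */
	static int set_tp_filter(int fd, const char *filter)
	{
		if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, filter) < 0) {
			perror("PERF_EVENT_IOC_SET_FILTER");
			return -1;
		}
		return 0;
	}

Once a filter is set, only samples whose raw record satisfies filter_match_preds() pass perf_tp_event_match() above; perf_event_free_filter() is the matching teardown helper.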
@@ -4076,6 +4361,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4076 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4361 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4077 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4362 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4078 case PERF_COUNT_SW_CPU_MIGRATIONS: 4363 case PERF_COUNT_SW_CPU_MIGRATIONS:
4364 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4365 case PERF_COUNT_SW_EMULATION_FAULTS:
4079 if (!event->parent) { 4366 if (!event->parent) {
4080 atomic_inc(&perf_swevent_enabled[event_id]); 4367 atomic_inc(&perf_swevent_enabled[event_id]);
4081 event->destroy = sw_perf_event_destroy; 4368 event->destroy = sw_perf_event_destroy;
@@ -4096,6 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4096 struct perf_event_context *ctx, 4383 struct perf_event_context *ctx,
4097 struct perf_event *group_leader, 4384 struct perf_event *group_leader,
4098 struct perf_event *parent_event, 4385 struct perf_event *parent_event,
4386 perf_overflow_handler_t overflow_handler,
4099 gfp_t gfpflags) 4387 gfp_t gfpflags)
4100{ 4388{
4101 const struct pmu *pmu; 4389 const struct pmu *pmu;
@@ -4138,6 +4426,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4138 4426
4139 event->state = PERF_EVENT_STATE_INACTIVE; 4427 event->state = PERF_EVENT_STATE_INACTIVE;
4140 4428
4429 if (!overflow_handler && parent_event)
4430 overflow_handler = parent_event->overflow_handler;
4431
4432 event->overflow_handler = overflow_handler;
4433
4141 if (attr->disabled) 4434 if (attr->disabled)
4142 event->state = PERF_EVENT_STATE_OFF; 4435 event->state = PERF_EVENT_STATE_OFF;
4143 4436
@@ -4172,6 +4465,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4172 pmu = tp_perf_event_init(event); 4465 pmu = tp_perf_event_init(event);
4173 break; 4466 break;
4174 4467
4468 case PERF_TYPE_BREAKPOINT:
4469 pmu = bp_perf_event_init(event);
4470 break;
4471
4472
4175 default: 4473 default:
4176 break; 4474 break;
4177 } 4475 }
@@ -4284,7 +4582,7 @@ err_size:
4284 goto out; 4582 goto out;
4285} 4583}
4286 4584
4287int perf_event_set_output(struct perf_event *event, int output_fd) 4585static int perf_event_set_output(struct perf_event *event, int output_fd)
4288{ 4586{
4289 struct perf_event *output_event = NULL; 4587 struct perf_event *output_event = NULL;
4290 struct file *output_file = NULL; 4588 struct file *output_file = NULL;
@@ -4414,7 +4712,7 @@ SYSCALL_DEFINE5(perf_event_open,
4414 } 4712 }
4415 4713
4416 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4714 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4417 NULL, GFP_KERNEL); 4715 NULL, NULL, GFP_KERNEL);
4418 err = PTR_ERR(event); 4716 err = PTR_ERR(event);
4419 if (IS_ERR(event)) 4717 if (IS_ERR(event))
4420 goto err_put_context; 4718 goto err_put_context;
@@ -4462,6 +4760,61 @@ err_put_context:
4462 return err; 4760 return err;
4463} 4761}
4464 4762
4763/**
4764 * perf_event_create_kernel_counter
4765 *
4766 * @attr: attributes of the counter to create
 4767 * @cpu: cpu to which the counter is bound
4768 * @pid: task to profile
4769 */
4770struct perf_event *
4771perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4772 pid_t pid,
4773 perf_overflow_handler_t overflow_handler)
4774{
4775 struct perf_event *event;
4776 struct perf_event_context *ctx;
4777 int err;
4778
4779 /*
4780 * Get the target context (task or percpu):
4781 */
4782
4783 ctx = find_get_context(pid, cpu);
4784 if (IS_ERR(ctx)) {
4785 err = PTR_ERR(ctx);
4786 goto err_exit;
4787 }
4788
4789 event = perf_event_alloc(attr, cpu, ctx, NULL,
4790 NULL, overflow_handler, GFP_KERNEL);
4791 if (IS_ERR(event)) {
4792 err = PTR_ERR(event);
4793 goto err_put_context;
4794 }
4795
4796 event->filp = NULL;
4797 WARN_ON_ONCE(ctx->parent_ctx);
4798 mutex_lock(&ctx->mutex);
4799 perf_install_in_context(ctx, event, cpu);
4800 ++ctx->generation;
4801 mutex_unlock(&ctx->mutex);
4802
4803 event->owner = current;
4804 get_task_struct(current);
4805 mutex_lock(&current->perf_event_mutex);
4806 list_add_tail(&event->owner_entry, &current->perf_event_list);
4807 mutex_unlock(&current->perf_event_mutex);
4808
4809 return event;
4810
4811 err_put_context:
4812 put_ctx(ctx);
4813 err_exit:
4814 return ERR_PTR(err);
4815}
4816EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4817
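perf_event_create_kernel_counter() gives in-kernel users (the hw-breakpoint layer above being the first) the same setup path as the perf_event_open() syscall, with an optional private overflow callback. A minimal, hedged sketch of a per-CPU software counter; the pid = -1 / cpu = 0 combination follows the usual perf convention for a CPU-bound counter, the names are illustrative, and teardown/error handling is omitted:

	/* Illustrative sketch, not from this patch; assumes the usual kernel
	 * headers (<linux/perf_event.h>, <linux/err.h>). */
	static atomic64_t example_hits = ATOMIC64_INIT(0);

	static void example_overflow(struct perf_event *event, int nmi,
				     struct perf_sample_data *data,
				     struct pt_regs *regs)
	{
		/* Runs instead of perf_event_output(); see __perf_event_overflow(). */
		atomic64_inc(&example_hits);
	}

	static struct perf_event *example_counter;

	static int example_counter_start(void)
	{
		struct perf_event_attr attr = {
			.type		= PERF_TYPE_SOFTWARE,
			.config		= PERF_COUNT_SW_CONTEXT_SWITCHES,
			.size		= sizeof(attr),
			.sample_period	= 1,	/* callback on every event */
		};

		example_counter = perf_event_create_kernel_counter(&attr, 0 /* cpu */,
								   -1 /* pid */,
								   example_overflow);
		return IS_ERR(example_counter) ? PTR_ERR(example_counter) : 0;
	}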
4465/* 4818/*
4466 * inherit an event from parent task to child task: 4819
4467 */ 4820 */
@@ -4487,7 +4840,7 @@ inherit_event(struct perf_event *parent_event,
4487 child_event = perf_event_alloc(&parent_event->attr, 4840 child_event = perf_event_alloc(&parent_event->attr,
4488 parent_event->cpu, child_ctx, 4841 parent_event->cpu, child_ctx,
4489 group_leader, parent_event, 4842 group_leader, parent_event,
4490 GFP_KERNEL); 4843 NULL, GFP_KERNEL);
4491 if (IS_ERR(child_event)) 4844 if (IS_ERR(child_event))
4492 return child_event; 4845 return child_event;
4493 get_ctx(child_ctx); 4846 get_ctx(child_ctx);
@@ -4505,6 +4858,8 @@ inherit_event(struct perf_event *parent_event,
4505 if (parent_event->attr.freq) 4858 if (parent_event->attr.freq)
4506 child_event->hw.sample_period = parent_event->hw.sample_period; 4859 child_event->hw.sample_period = parent_event->hw.sample_period;
4507 4860
4861 child_event->overflow_handler = parent_event->overflow_handler;
4862
4508 /* 4863 /*
4509 * Link it up in the child's context: 4864 * Link it up in the child's context:
4510 */ 4865 */
@@ -4594,7 +4949,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4594{ 4949{
4595 struct perf_event *parent_event; 4950 struct perf_event *parent_event;
4596 4951
4597 update_event_times(child_event);
4598 perf_event_remove_from_context(child_event); 4952 perf_event_remove_from_context(child_event);
4599 4953
4600 parent_event = child_event->parent; 4954 parent_event = child_event->parent;
@@ -4638,7 +4992,7 @@ void perf_event_exit_task(struct task_struct *child)
4638 * reading child->perf_event_ctxp, we wait until it has 4992 * reading child->perf_event_ctxp, we wait until it has
4639 * incremented the context's refcount before we do put_ctx below. 4993 * incremented the context's refcount before we do put_ctx below.
4640 */ 4994 */
4641 spin_lock(&child_ctx->lock); 4995 raw_spin_lock(&child_ctx->lock);
4642 child->perf_event_ctxp = NULL; 4996 child->perf_event_ctxp = NULL;
4643 /* 4997 /*
4644 * If this context is a clone; unclone it so it can't get 4998 * If this context is a clone; unclone it so it can't get
@@ -4646,7 +5000,8 @@ void perf_event_exit_task(struct task_struct *child)
4646 * the events from it. 5000 * the events from it.
4647 */ 5001 */
4648 unclone_ctx(child_ctx); 5002 unclone_ctx(child_ctx);
4649 spin_unlock_irqrestore(&child_ctx->lock, flags); 5003 update_context_time(child_ctx);
5004 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
4650 5005
4651 /* 5006 /*
4652 * Report the task dead after unscheduling the events so that we 5007 * Report the task dead after unscheduling the events so that we
@@ -4729,7 +5084,7 @@ again:
4729 */ 5084 */
4730int perf_event_init_task(struct task_struct *child) 5085int perf_event_init_task(struct task_struct *child)
4731{ 5086{
4732 struct perf_event_context *child_ctx, *parent_ctx; 5087 struct perf_event_context *child_ctx = NULL, *parent_ctx;
4733 struct perf_event_context *cloned_ctx; 5088 struct perf_event_context *cloned_ctx;
4734 struct perf_event *event; 5089 struct perf_event *event;
4735 struct task_struct *parent = current; 5090 struct task_struct *parent = current;
@@ -4745,20 +5100,6 @@ int perf_event_init_task(struct task_struct *child)
4745 return 0; 5100 return 0;
4746 5101
4747 /* 5102 /*
4748 * This is executed from the parent task context, so inherit
4749 * events that have been marked for cloning.
4750 * First allocate and initialize a context for the child.
4751 */
4752
4753 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4754 if (!child_ctx)
4755 return -ENOMEM;
4756
4757 __perf_event_init_context(child_ctx, child);
4758 child->perf_event_ctxp = child_ctx;
4759 get_task_struct(child);
4760
4761 /*
4762 * If the parent's context is a clone, pin it so it won't get 5103 * If the parent's context is a clone, pin it so it won't get
4763 * swapped under us. 5104 * swapped under us.
4764 */ 5105 */
@@ -4781,15 +5122,33 @@ int perf_event_init_task(struct task_struct *child)
4781 * We dont have to disable NMIs - we are only looking at 5122 * We dont have to disable NMIs - we are only looking at
4782 * the list, not manipulating it: 5123 * the list, not manipulating it:
4783 */ 5124 */
4784 list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { 5125 list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
4785 if (event != event->group_leader)
4786 continue;
4787 5126
4788 if (!event->attr.inherit) { 5127 if (!event->attr.inherit) {
4789 inherited_all = 0; 5128 inherited_all = 0;
4790 continue; 5129 continue;
4791 } 5130 }
4792 5131
5132 if (!child->perf_event_ctxp) {
5133 /*
5134 * This is executed from the parent task context, so
5135 * inherit events that have been marked for cloning.
5136 * First allocate and initialize a context for the
5137 * child.
5138 */
5139
5140 child_ctx = kzalloc(sizeof(struct perf_event_context),
5141 GFP_KERNEL);
5142 if (!child_ctx) {
5143 ret = -ENOMEM;
5144 goto exit;
5145 }
5146
5147 __perf_event_init_context(child_ctx, child);
5148 child->perf_event_ctxp = child_ctx;
5149 get_task_struct(child);
5150 }
5151
4793 ret = inherit_group(event, parent, parent_ctx, 5152 ret = inherit_group(event, parent, parent_ctx,
4794 child, child_ctx); 5153 child, child_ctx);
4795 if (ret) { 5154 if (ret) {
@@ -4818,6 +5177,7 @@ int perf_event_init_task(struct task_struct *child)
4818 get_ctx(child_ctx->parent_ctx); 5177 get_ctx(child_ctx->parent_ctx);
4819 } 5178 }
4820 5179
5180exit:
4821 mutex_unlock(&parent_ctx->mutex); 5181 mutex_unlock(&parent_ctx->mutex);
4822 5182
4823 perf_unpin_context(parent_ctx); 5183 perf_unpin_context(parent_ctx);
@@ -4932,11 +5292,11 @@ perf_set_reserve_percpu(struct sysdev_class *class,
4932 perf_reserved_percpu = val; 5292 perf_reserved_percpu = val;
4933 for_each_online_cpu(cpu) { 5293 for_each_online_cpu(cpu) {
4934 cpuctx = &per_cpu(perf_cpu_context, cpu); 5294 cpuctx = &per_cpu(perf_cpu_context, cpu);
4935 spin_lock_irq(&cpuctx->ctx.lock); 5295 raw_spin_lock_irq(&cpuctx->ctx.lock);
4936 mpt = min(perf_max_events - cpuctx->ctx.nr_events, 5296 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
4937 perf_max_events - perf_reserved_percpu); 5297 perf_max_events - perf_reserved_percpu);
4938 cpuctx->max_pertask = mpt; 5298 cpuctx->max_pertask = mpt;
4939 spin_unlock_irq(&cpuctx->ctx.lock); 5299 raw_spin_unlock_irq(&cpuctx->ctx.lock);
4940 } 5300 }
4941 spin_unlock(&perf_resource_lock); 5301 spin_unlock(&perf_resource_lock);
4942 5302
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d20f9c..2e17c9c92cbe 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
141 * installing it: 141 * installing it:
142 */ 142 */
143 spin_lock_irq(&pidmap_lock); 143 spin_lock_irq(&pidmap_lock);
144 if (map->page) 144 if (!map->page) {
145 kfree(page);
146 else
147 map->page = page; 145 map->page = page;
146 page = NULL;
147 }
148 spin_unlock_irq(&pidmap_lock); 148 spin_unlock_irq(&pidmap_lock);
149 kfree(page);
149 if (unlikely(!map->page)) 150 if (unlikely(!map->page))
150 break; 151 break;
151 } 152 }
@@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
268 for (type = 0; type < PIDTYPE_MAX; ++type) 269 for (type = 0; type < PIDTYPE_MAX; ++type)
269 INIT_HLIST_HEAD(&pid->tasks[type]); 270 INIT_HLIST_HEAD(&pid->tasks[type]);
270 271
272 upid = pid->numbers + ns->level;
271 spin_lock_irq(&pidmap_lock); 273 spin_lock_irq(&pidmap_lock);
272 for (i = ns->level; i >= 0; i--) { 274 for ( ; upid >= pid->numbers; --upid)
273 upid = &pid->numbers[i];
274 hlist_add_head_rcu(&upid->pid_chain, 275 hlist_add_head_rcu(&upid->pid_chain,
275 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 276 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
276 }
277 spin_unlock_irq(&pidmap_lock); 277 spin_unlock_irq(&pidmap_lock);
278 278
279out: 279out:
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 821722ae58a7..86b3796b0436 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
118{ 118{
119 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
120 return get_pid_ns(old_ns); 120 return get_pid_ns(old_ns);
121 if (flags & CLONE_THREAD) 121 if (flags & (CLONE_THREAD|CLONE_PARENT))
122 return ERR_PTR(-EINVAL); 122 return ERR_PTR(-EINVAL);
123 return create_pid_namespace(old_ns); 123 return create_pid_namespace(old_ns);
124} 124}
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/time.h> 34#include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344} 343}
345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
346 345
347#define PID_NAME_LEN sizeof("process_1234567890") 346#define PID_NAME_LEN 32
348static char name[PID_NAME_LEN];
349 347
350static int pm_qos_power_open(struct inode *inode, struct file *filp) 348static int pm_qos_power_open(struct inode *inode, struct file *filp)
351{ 349{
352 int ret; 350 int ret;
353 long pm_qos_class; 351 long pm_qos_class;
352 char name[PID_NAME_LEN];
354 353
355 lock_kernel();
356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
357 if (pm_qos_class >= 0) { 355 if (pm_qos_class >= 0) {
358 filp->private_data = (void *)pm_qos_class; 356 filp->private_data = (void *)pm_qos_class;
359 sprintf(name, "process_%d", current->pid); 357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
360 ret = pm_qos_add_requirement(pm_qos_class, name, 358 ret = pm_qos_add_requirement(pm_qos_class, name,
361 PM_QOS_DEFAULT_VALUE); 359 PM_QOS_DEFAULT_VALUE);
362 if (ret >= 0) { 360 if (ret >= 0)
363 unlock_kernel();
364 return 0; 361 return 0;
365 }
366 } 362 }
367 unlock_kernel();
368
369 return -EPERM; 363 return -EPERM;
370} 364}
371 365
372static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
373{ 367{
374 int pm_qos_class; 368 int pm_qos_class;
369 char name[PID_NAME_LEN];
375 370
376 pm_qos_class = (long)filp->private_data; 371 pm_qos_class = (long)filp->private_data;
377 sprintf(name, "process_%d", current->pid); 372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
378 pm_qos_remove_requirement(pm_qos_class, name); 373 pm_qos_remove_requirement(pm_qos_class, name);
379 374
380 return 0; 375 return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
385{ 380{
386 s32 value; 381 s32 value;
387 int pm_qos_class; 382 int pm_qos_class;
383 char name[PID_NAME_LEN];
388 384
389 pm_qos_class = (long)filp->private_data; 385 pm_qos_class = (long)filp->private_data;
390 if (count != sizeof(s32)) 386 if (count != sizeof(s32))
391 return -EINVAL; 387 return -EINVAL;
392 if (copy_from_user(&value, buf, sizeof(s32))) 388 if (copy_from_user(&value, buf, sizeof(s32)))
393 return -EFAULT; 389 return -EFAULT;
394 sprintf(name, "process_%d", current->pid); 390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
395 pm_qos_update_requirement(pm_qos_class, name, value); 391 pm_qos_update_requirement(pm_qos_class, name, value);
396 392
397 return sizeof(s32); 393 return sizeof(s32);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747b..438ff4523513 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
384 384
385/* 385/*
386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. 386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
387 * This is called from sys_timer_create with the new timer already locked. 387 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
388 * new timer already all-zeros initialized.
388 */ 389 */
389int posix_cpu_timer_create(struct k_itimer *new_timer) 390int posix_cpu_timer_create(struct k_itimer *new_timer)
390{ 391{
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
396 return -EINVAL; 397 return -EINVAL;
397 398
398 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 399 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
399 new_timer->it.cpu.incr.sched = 0;
400 new_timer->it.cpu.expires.sched = 0;
401 400
402 read_lock(&tasklist_lock); 401 read_lock(&tasklist_lock);
403 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 402 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c3b81c30e5d5..43191815f874 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 13
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 5187136fe1de..218e5af90156 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/vt.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
@@ -21,8 +21,7 @@ int pm_prepare_console(void)
21 if (orig_fgconsole < 0) 21 if (orig_fgconsole < 0)
22 return 1; 22 return 1;
23 23
24 orig_kmsg = kmsg_redirect; 24 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
25 kmsg_redirect = SUSPEND_CONSOLE;
26 return 0; 25 return 0;
27} 26}
28 27
@@ -30,7 +29,7 @@ void pm_restore_console(void)
30{ 29{
31 if (orig_fgconsole >= 0) { 30 if (orig_fgconsole >= 0) {
32 vt_move_to_console(orig_fgconsole, 0); 31 vt_move_to_console(orig_fgconsole, 0);
33 kmsg_redirect = orig_kmsg; 32 vt_kmsg_redirect(orig_kmsg);
34 } 33 }
35} 34}
36#endif 35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04b3a83d686f..bbfe472d7524 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -32,6 +32,7 @@ static int noresume = 0;
32static char resume_file[256] = CONFIG_PM_STD_PARTITION; 32static char resume_file[256] = CONFIG_PM_STD_PARTITION;
33dev_t swsusp_resume_device; 33dev_t swsusp_resume_device;
34sector_t swsusp_resume_block; 34sector_t swsusp_resume_block;
35int in_suspend __nosavedata = 0;
35 36
36enum { 37enum {
37 HIBERNATION_INVALID, 38 HIBERNATION_INVALID,
@@ -202,6 +203,35 @@ static void platform_recover(int platform_mode)
202} 203}
203 204
204/** 205/**
206 * swsusp_show_speed - print the time elapsed between two events.
207 * @start: Starting event.
208 * @stop: Final event.
 209 * @nr_pages: number of pages processed between @start and @stop
 210 * @msg: introductory message to print
211 */
212
213void swsusp_show_speed(struct timeval *start, struct timeval *stop,
214 unsigned nr_pages, char *msg)
215{
216 s64 elapsed_centisecs64;
217 int centisecs;
218 int k;
219 int kps;
220
221 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
222 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
223 centisecs = elapsed_centisecs64;
224 if (centisecs == 0)
225 centisecs = 1; /* avoid div-by-zero */
226 k = nr_pages * (PAGE_SIZE / 1024);
227 kps = (k * 100) / centisecs;
228 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
229 msg, k,
230 centisecs / 100, centisecs % 100,
231 kps / 1000, (kps % 1000) / 10);
232}
233
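For concreteness (illustrative numbers, assuming 4 KiB pages): writing 51200 pages in 2.56 s gives k = 51200 * 4 = 204800 kbytes and centisecs = 256, so kps = 204800 * 100 / 256 = 80000, and swsusp_show_speed() prints "PM: Wrote 204800 kbytes in 2.56 seconds (80.00 MB/s)" when called with msg = "Wrote", as save_image() does further down.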
234/**
205 * create_image - freeze devices that need to be frozen with interrupts 235 * create_image - freeze devices that need to be frozen with interrupts
206 * off, create the hibernation image and thaw those devices. Control 236 * off, create the hibernation image and thaw those devices. Control
207 * reappears in this routine after a restore. 237 * reappears in this routine after a restore.
@@ -693,21 +723,22 @@ static int software_resume(void)
693 /* The snapshot device should not be opened while we're running */ 723 /* The snapshot device should not be opened while we're running */
694 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 724 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
695 error = -EBUSY; 725 error = -EBUSY;
726 swsusp_close(FMODE_READ);
696 goto Unlock; 727 goto Unlock;
697 } 728 }
698 729
699 pm_prepare_console(); 730 pm_prepare_console();
700 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 731 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
701 if (error) 732 if (error)
702 goto Finish; 733 goto close_finish;
703 734
704 error = usermodehelper_disable(); 735 error = usermodehelper_disable();
705 if (error) 736 if (error)
706 goto Finish; 737 goto close_finish;
707 738
708 error = create_basic_memory_bitmaps(); 739 error = create_basic_memory_bitmaps();
709 if (error) 740 if (error)
710 goto Finish; 741 goto close_finish;
711 742
712 pr_debug("PM: Preparing processes for restore.\n"); 743 pr_debug("PM: Preparing processes for restore.\n");
713 error = prepare_processes(); 744 error = prepare_processes();
@@ -719,6 +750,7 @@ static int software_resume(void)
719 pr_debug("PM: Reading hibernation image.\n"); 750 pr_debug("PM: Reading hibernation image.\n");
720 751
721 error = swsusp_read(&flags); 752 error = swsusp_read(&flags);
753 swsusp_close(FMODE_READ);
722 if (!error) 754 if (!error)
723 hibernation_restore(flags & SF_PLATFORM_MODE); 755 hibernation_restore(flags & SF_PLATFORM_MODE);
724 756
@@ -737,6 +769,9 @@ static int software_resume(void)
737 mutex_unlock(&pm_mutex); 769 mutex_unlock(&pm_mutex);
738 pr_debug("PM: Resume from disk failed.\n"); 770 pr_debug("PM: Resume from disk failed.\n");
739 return error; 771 return error;
772close_finish:
773 swsusp_close(FMODE_READ);
774 goto Finish;
740} 775}
741 776
742late_initcall(software_resume); 777late_initcall(software_resume);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 347d2cc88cd0..0998c7139053 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -220,6 +220,7 @@ static struct attribute_group attr_group = {
220 220
221#ifdef CONFIG_PM_RUNTIME 221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq; 222struct workqueue_struct *pm_wq;
223EXPORT_SYMBOL_GPL(pm_wq);
223 224
224static int __init pm_start_workqueue(void) 225static int __init pm_start_workqueue(void)
225{ 226{
diff --git a/kernel/power/process.c b/kernel/power/process.c
index cc2e55373b68..5ade1bdcf366 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h>
17 18
18/* 19/*
19 * Timeout for stopping processes 20 * Timeout for stopping processes
@@ -41,7 +42,7 @@ static int try_to_freeze_tasks(bool sig_only)
41 do_gettimeofday(&start); 42 do_gettimeofday(&start);
42 43
43 end_time = jiffies + TIMEOUT; 44 end_time = jiffies + TIMEOUT;
44 do { 45 while (true) {
45 todo = 0; 46 todo = 0;
46 read_lock(&tasklist_lock); 47 read_lock(&tasklist_lock);
47 do_each_thread(g, p) { 48 do_each_thread(g, p) {
@@ -62,10 +63,15 @@ static int try_to_freeze_tasks(bool sig_only)
62 todo++; 63 todo++;
63 } while_each_thread(g, p); 64 } while_each_thread(g, p);
64 read_unlock(&tasklist_lock); 65 read_unlock(&tasklist_lock);
65 yield(); /* Yield is okay here */ 66 if (!todo || time_after(jiffies, end_time))
66 if (time_after(jiffies, end_time))
67 break; 67 break;
68 } while (todo); 68
69 /*
70 * We need to retry, but first give the freezing tasks some
 71 * time to enter the refrigerator.
72 */
73 msleep(10);
74 }
69 75
70 do_gettimeofday(&end); 76 do_gettimeofday(&end);
71 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); 77 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9c..25596e450ac7 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
19 * The time it takes is system-specific though, so when we test this 19 * The time it takes is system-specific though, so when we test this
20 * during system bootup we allow a LOT of time. 20 * during system bootup we allow a LOT of time.
21 */ 21 */
22#define TEST_SUSPEND_SECONDS 5 22#define TEST_SUSPEND_SECONDS 10
23 23
24static unsigned long suspend_test_start_time; 24static unsigned long suspend_test_start_time;
25 25
@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
49 * has some performance issues. The stack dump of a WARN_ON 49 * has some performance issues. The stack dump of a WARN_ON
50 * is more likely to get the right attention than a printk... 50 * is more likely to get the right attention than a printk...
51 */ 51 */
52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); 52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
53 "Component: %s, time: %u\n", label, msec);
53} 54}
54 55
55/* 56/*
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8ba052c86d48..09b2b0ae9e9d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -13,7 +13,6 @@
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/utsname.h>
17#include <linux/delay.h> 16#include <linux/delay.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
19#include <linux/genhd.h> 18#include <linux/genhd.h>
@@ -39,6 +38,107 @@ struct swsusp_header {
39 38
40static struct swsusp_header *swsusp_header; 39static struct swsusp_header *swsusp_header;
41 40
41/**
42 * The following functions are used for tracing the allocated
43 * swap pages, so that they can be freed in case of an error.
44 */
45
46struct swsusp_extent {
47 struct rb_node node;
48 unsigned long start;
49 unsigned long end;
50};
51
52static struct rb_root swsusp_extents = RB_ROOT;
53
54static int swsusp_extents_insert(unsigned long swap_offset)
55{
56 struct rb_node **new = &(swsusp_extents.rb_node);
57 struct rb_node *parent = NULL;
58 struct swsusp_extent *ext;
59
60 /* Figure out where to put the new node */
61 while (*new) {
62 ext = container_of(*new, struct swsusp_extent, node);
63 parent = *new;
64 if (swap_offset < ext->start) {
65 /* Try to merge */
66 if (swap_offset == ext->start - 1) {
67 ext->start--;
68 return 0;
69 }
70 new = &((*new)->rb_left);
71 } else if (swap_offset > ext->end) {
72 /* Try to merge */
73 if (swap_offset == ext->end + 1) {
74 ext->end++;
75 return 0;
76 }
77 new = &((*new)->rb_right);
78 } else {
79 /* It already is in the tree */
80 return -EINVAL;
81 }
82 }
83 /* Add the new node and rebalance the tree. */
84 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
85 if (!ext)
86 return -ENOMEM;
87
88 ext->start = swap_offset;
89 ext->end = swap_offset;
90 rb_link_node(&ext->node, parent, new);
91 rb_insert_color(&ext->node, &swsusp_extents);
92 return 0;
93}
94
95/**
96 * alloc_swapdev_block - allocate a swap page and register that it has
97 * been allocated, so that it can be freed in case of an error.
98 */
99
100sector_t alloc_swapdev_block(int swap)
101{
102 unsigned long offset;
103
104 offset = swp_offset(get_swap_page_of_type(swap));
105 if (offset) {
106 if (swsusp_extents_insert(offset))
107 swap_free(swp_entry(swap, offset));
108 else
109 return swapdev_block(swap, offset);
110 }
111 return 0;
112}
113
114/**
115 * free_all_swap_pages - free swap pages allocated for saving image data.
 116 * It also frees the extents used to register which swap entries had been
117 * allocated.
118 */
119
120void free_all_swap_pages(int swap)
121{
122 struct rb_node *node;
123
124 while ((node = swsusp_extents.rb_node)) {
125 struct swsusp_extent *ext;
126 unsigned long offset;
127
128 ext = container_of(node, struct swsusp_extent, node);
129 rb_erase(node, &swsusp_extents);
130 for (offset = ext->start; offset <= ext->end; offset++)
131 swap_free(swp_entry(swap, offset));
132
133 kfree(ext);
134 }
135}
136
137int swsusp_swap_in_use(void)
138{
139 return (swsusp_extents.rb_node != NULL);
140}
141
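A concrete walk through swsusp_extents_insert() (offsets are made up): inserting swap offsets 10, 11 and 13 leaves two extents, [10,11] and [13,13], since 11 merges onto the end of the first extent while 13 is not adjacent to it. A later insert of 12 widens [10,11] to [10,12] without allocating a new node, although the now-adjacent [13,13] stays a separate node because the helper only extends the extent the search lands on; inserting 11 again returns -EINVAL because that offset already lies inside an extent. free_all_swap_pages() then walks the tree and calls swap_free() on every offset in every extent.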
42/* 142/*
43 * General things 143 * General things
44 */ 144 */
@@ -315,7 +415,6 @@ static int save_image(struct swap_map_handle *handle,
315{ 415{
316 unsigned int m; 416 unsigned int m;
317 int ret; 417 int ret;
318 int error = 0;
319 int nr_pages; 418 int nr_pages;
320 int err2; 419 int err2;
321 struct bio *bio; 420 struct bio *bio;
@@ -330,26 +429,27 @@ static int save_image(struct swap_map_handle *handle,
330 nr_pages = 0; 429 nr_pages = 0;
331 bio = NULL; 430 bio = NULL;
332 do_gettimeofday(&start); 431 do_gettimeofday(&start);
333 do { 432 while (1) {
334 ret = snapshot_read_next(snapshot, PAGE_SIZE); 433 ret = snapshot_read_next(snapshot, PAGE_SIZE);
335 if (ret > 0) { 434 if (ret <= 0)
336 error = swap_write_page(handle, data_of(*snapshot), 435 break;
337 &bio); 436 ret = swap_write_page(handle, data_of(*snapshot), &bio);
338 if (error) 437 if (ret)
339 break; 438 break;
340 if (!(nr_pages % m)) 439 if (!(nr_pages % m))
341 printk("\b\b\b\b%3d%%", nr_pages / m); 440 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
342 nr_pages++; 441 nr_pages++;
343 } 442 }
344 } while (ret > 0);
345 err2 = wait_on_bio_chain(&bio); 443 err2 = wait_on_bio_chain(&bio);
346 do_gettimeofday(&stop); 444 do_gettimeofday(&stop);
347 if (!error) 445 if (!ret)
348 error = err2; 446 ret = err2;
349 if (!error) 447 if (!ret)
350 printk("\b\b\b\bdone\n"); 448 printk(KERN_CONT "\b\b\b\bdone\n");
449 else
450 printk(KERN_CONT "\n");
351 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 451 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
352 return error; 452 return ret;
353} 453}
354 454
355/** 455/**
@@ -537,7 +637,8 @@ static int load_image(struct swap_map_handle *handle,
537 snapshot_write_finalize(snapshot); 637 snapshot_write_finalize(snapshot);
538 if (!snapshot_image_loaded(snapshot)) 638 if (!snapshot_image_loaded(snapshot))
539 error = -ENODATA; 639 error = -ENODATA;
540 } 640 } else
641 printk("\n");
541 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 642 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
542 return error; 643 return error;
543} 644}
@@ -573,8 +674,6 @@ int swsusp_read(unsigned int *flags_p)
573 error = load_image(&handle, &snapshot, header->pages - 1); 674 error = load_image(&handle, &snapshot, header->pages - 1);
574 release_swap_reader(&handle); 675 release_swap_reader(&handle);
575 676
576 blkdev_put(resume_bdev, FMODE_READ);
577
578 if (!error) 677 if (!error)
579 pr_debug("PM: Image successfully loaded\n"); 678 pr_debug("PM: Image successfully loaded\n");
580 else 679 else
@@ -597,7 +696,7 @@ int swsusp_check(void)
597 error = bio_read_page(swsusp_resume_block, 696 error = bio_read_page(swsusp_resume_block,
598 swsusp_header, NULL); 697 swsusp_header, NULL);
599 if (error) 698 if (error)
600 return error; 699 goto put;
601 700
602 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 701 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
603 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 702 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
@@ -605,8 +704,10 @@ int swsusp_check(void)
605 error = bio_write_page(swsusp_resume_block, 704 error = bio_write_page(swsusp_resume_block,
606 swsusp_header, NULL); 705 swsusp_header, NULL);
607 } else { 706 } else {
608 return -EINVAL; 707 error = -EINVAL;
609 } 708 }
709
710put:
610 if (error) 711 if (error)
611 blkdev_put(resume_bdev, FMODE_READ); 712 blkdev_put(resume_bdev, FMODE_READ);
612 else 713 else
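
The swsusp_check() change above replaces an early "return error" (which would have skipped the blkdev_put() at the end of the function and leaked the reference on resume_bdev) with a jump to a shared "put:" label, so the device reference is dropped on every error path. A minimal sketch of the same cleanup-label idiom, with made-up helper names (illustration only, not from the patch):

        /* Stand-ins for bio_read_page() and blkdev_put(); made up for this sketch. */
        static int probe_header(void) { return 0; }
        static void drop_device(void) { }

        static int check_resume_device(void)
        {
                int error;

                error = probe_header();
                if (error)
                        goto put;       /* previously: return error -- skipped the put */

                /* ... verify the signature, possibly rewrite it ... */
        put:
                if (error)
                        drop_device();
                return error;
        }
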
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 6a07f4dbf2f8..5b3601bd1893 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -56,133 +56,3 @@
56#include "power.h" 56#include "power.h"
57 57
58int in_suspend __nosavedata = 0; 58int in_suspend __nosavedata = 0;
59
60/**
61 * The following functions are used for tracing the allocated
62 * swap pages, so that they can be freed in case of an error.
63 */
64
65struct swsusp_extent {
66 struct rb_node node;
67 unsigned long start;
68 unsigned long end;
69};
70
71static struct rb_root swsusp_extents = RB_ROOT;
72
73static int swsusp_extents_insert(unsigned long swap_offset)
74{
75 struct rb_node **new = &(swsusp_extents.rb_node);
76 struct rb_node *parent = NULL;
77 struct swsusp_extent *ext;
78
79 /* Figure out where to put the new node */
80 while (*new) {
81 ext = container_of(*new, struct swsusp_extent, node);
82 parent = *new;
83 if (swap_offset < ext->start) {
84 /* Try to merge */
85 if (swap_offset == ext->start - 1) {
86 ext->start--;
87 return 0;
88 }
89 new = &((*new)->rb_left);
90 } else if (swap_offset > ext->end) {
91 /* Try to merge */
92 if (swap_offset == ext->end + 1) {
93 ext->end++;
94 return 0;
95 }
96 new = &((*new)->rb_right);
97 } else {
98 /* It already is in the tree */
99 return -EINVAL;
100 }
101 }
102 /* Add the new node and rebalance the tree. */
103 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
104 if (!ext)
105 return -ENOMEM;
106
107 ext->start = swap_offset;
108 ext->end = swap_offset;
109 rb_link_node(&ext->node, parent, new);
110 rb_insert_color(&ext->node, &swsusp_extents);
111 return 0;
112}
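
A short worked trace of the insert/merge logic above (offsets made up for illustration):

        insert 5 : empty tree                -> new extent [5, 5]
        insert 7 : 7 > 5 and 7 != 5 + 1      -> new extent [7, 7]
        insert 6 : 6 > 5 and 6 == 5 + 1      -> grows [5, 5] into [5, 6]

Note that [5, 6] and [7, 7] remain two adjacent extents: the insert path only merges with the single node it is compared against, which is all that free_all_swap_pages() below requires.
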
113
114/**
115 * alloc_swapdev_block - allocate a swap page and register that it has
116 * been allocated, so that it can be freed in case of an error.
117 */
118
119sector_t alloc_swapdev_block(int swap)
120{
121 unsigned long offset;
122
123 offset = swp_offset(get_swap_page_of_type(swap));
124 if (offset) {
125 if (swsusp_extents_insert(offset))
126 swap_free(swp_entry(swap, offset));
127 else
128 return swapdev_block(swap, offset);
129 }
130 return 0;
131}
132
133/**
134 * free_all_swap_pages - free swap pages allocated for saving image data.
135 * It also frees the extents used to register which swap entries had been
136 * allocated.
137 */
138
139void free_all_swap_pages(int swap)
140{
141 struct rb_node *node;
142
143 while ((node = swsusp_extents.rb_node)) {
144 struct swsusp_extent *ext;
145 unsigned long offset;
146
147 ext = container_of(node, struct swsusp_extent, node);
148 rb_erase(node, &swsusp_extents);
149 for (offset = ext->start; offset <= ext->end; offset++)
150 swap_free(swp_entry(swap, offset));
151
152 kfree(ext);
153 }
154}
155
156int swsusp_swap_in_use(void)
157{
158 return (swsusp_extents.rb_node != NULL);
159}
160
161/**
162 * swsusp_show_speed - print the time elapsed between two events represented by
163 * @start and @stop
164 *
165 * @nr_pages - number of pages processed between @start and @stop
166 * @msg - introductory message to print
167 */
168
169void swsusp_show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
185 msg, k,
186 centisecs / 100, centisecs % 100,
187 kps / 1000, (kps % 1000) / 10);
188}
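
A quick sanity check of the swsusp_show_speed() arithmetic above (numbers made up; assumes PAGE_SIZE is 4096 and msg is "Wrote"):

        nr_pages  = 20000, elapsed = 2.56 s
        k         = 20000 * (4096 / 1024)        = 80000 kbytes
        centisecs = 2560000000 ns / 10000000     = 256
        kps       = (80000 * 100) / 256          = 31250
        printed   : "PM: Wrote 80000 kbytes in 2.56 seconds (31.25 MB/s)"
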
diff --git a/kernel/printk.c b/kernel/printk.c
index 2a564570f822..1ded8e7dd19b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h>
36#include <linux/kmsg_dump.h> 37#include <linux/kmsg_dump.h>
37 38
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -1377,11 +1378,11 @@ late_initcall(disable_boot_consoles);
1377 */ 1378 */
1378DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); 1379DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1379 1380
1380int printk_ratelimit(void) 1381int __printk_ratelimit(const char *func)
1381{ 1382{
1382 return __ratelimit(&printk_ratelimit_state); 1383 return ___ratelimit(&printk_ratelimit_state, func);
1383} 1384}
1384EXPORT_SYMBOL(printk_ratelimit); 1385EXPORT_SYMBOL(__printk_ratelimit);
1385 1386
1386/** 1387/**
1387 * printk_timed_ratelimit - caller-controlled printk ratelimiting 1388 * printk_timed_ratelimit - caller-controlled printk ratelimiting
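
The hunk above renames the exported helper so that it takes the calling function's name; the ratelimit state itself is unchanged. Callers keep using printk_ratelimit(), which is presumably now a wrapper that expands to __printk_ratelimit(__func__) so suppression reports can name the noisy caller. A minimal usage sketch (illustration only; the function below is made up):

        #include <linux/kernel.h>

        /* Typical ratelimited warning; callers like this are unchanged by the rename. */
        static void example_warn_overrun(int queue)
        {
                if (printk_ratelimit())
                        printk(KERN_WARNING "example: queue %d overrun\n", queue);
        }
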
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 307c285af59e..23bd09cd042e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
266 * or self-reaping. Do notification now if it would have happened earlier. 266 * or self-reaping. Do notification now if it would have happened earlier.
267 * If it should reap itself, return true. 267 * If it should reap itself, return true.
268 * 268 *
269 * If it's our own child, there is no notification to do. 269 * If it's our own child, there is no notification to do. But if our normal
270 * But if our normal children self-reap, then this child 270 * children self-reap, then this child was prevented by ptrace and we must
271 * was prevented by ptrace and we must reap it now. 271 * reap it now; in that case we must also wake up sub-threads sleeping in
272 * do_wait().
272 */ 273 */
273static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 274static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
274{ 275{
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
278 if (!task_detached(p) && thread_group_empty(p)) { 279 if (!task_detached(p) && thread_group_empty(p)) {
279 if (!same_thread_group(p->real_parent, tracer)) 280 if (!same_thread_group(p->real_parent, tracer))
280 do_notify_parent(p, p->exit_signal); 281 do_notify_parent(p, p->exit_signal);
281 else if (ignoring_children(tracer->sighand)) 282 else if (ignoring_children(tracer->sighand)) {
283 __wake_up_parent(p, tracer);
282 p->exit_signal = -1; 284 p->exit_signal = -1;
285 }
283 } 286 }
284 if (task_detached(p)) { 287 if (task_detached(p)) {
285 /* Mark it as in the process of being reaped. */ 288 /* Mark it as in the process of being reaped. */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 37ac45483082..9b7fd4723878 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,23 +44,13 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48 47
49enum rcu_barrier { 48#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 RCU_BARRIER_STD, 49static struct lock_class_key rcu_lock_key;
51 RCU_BARRIER_BH, 50struct lockdep_map rcu_lock_map =
52 RCU_BARRIER_SCHED, 51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53}; 52EXPORT_SYMBOL_GPL(rcu_lock_map);
54 53#endif
55static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
56static atomic_t rcu_barrier_cpu_count;
57static DEFINE_MUTEX(rcu_barrier_mutex);
58static struct completion rcu_barrier_completion;
59int rcu_scheduler_active __read_mostly;
60
61static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
62static struct rcu_head rcu_migrate_head[3];
63static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
64 54
65/* 55/*
66 * Awaken the corresponding synchronize_rcu() instance now that a 56 * Awaken the corresponding synchronize_rcu() instance now that a
@@ -73,241 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head)
73 rcu = container_of(head, struct rcu_synchronize, head); 63 rcu = container_of(head, struct rcu_synchronize, head);
74 complete(&rcu->completion); 64 complete(&rcu->completion);
75} 65}
76
77#ifdef CONFIG_TREE_PREEMPT_RCU
78
79/**
80 * synchronize_rcu - wait until a grace period has elapsed.
81 *
82 * Control will return to the caller some time after a full grace
83 * period has elapsed, in other words after all currently executing RCU
84 * read-side critical sections have completed. RCU read-side critical
85 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
86 * and may be nested.
87 */
88void synchronize_rcu(void)
89{
90 struct rcu_synchronize rcu;
91
92 if (!rcu_scheduler_active)
93 return;
94
95 init_completion(&rcu.completion);
96 /* Will wake me after RCU finished. */
97 call_rcu(&rcu.head, wakeme_after_rcu);
98 /* Wait for it. */
99 wait_for_completion(&rcu.completion);
100}
101EXPORT_SYMBOL_GPL(synchronize_rcu);
102
103#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
104
105/**
106 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
107 *
108 * Control will return to the caller some time after a full rcu-sched
109 * grace period has elapsed, in other words after all currently executing
110 * rcu-sched read-side critical sections have completed. These read-side
111 * critical sections are delimited by rcu_read_lock_sched() and
112 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
113 * local_irq_disable(), and so on may be used in place of
114 * rcu_read_lock_sched().
115 *
116 * This means that all preempt_disable code sequences, including NMI and
117 * hardware-interrupt handlers, in progress on entry will have completed
118 * before this primitive returns. However, this does not guarantee that
119 * softirq handlers will have completed, since in some kernels, these
120 * handlers can run in process context, and can block.
121 *
122 * This primitive provides the guarantees made by the (now removed)
123 * synchronize_kernel() API. In contrast, synchronize_rcu() only
124 * guarantees that rcu_read_lock() sections will have completed.
125 * In "classic RCU", these two guarantees happen to be one and
126 * the same, but can differ in realtime RCU implementations.
127 */
128void synchronize_sched(void)
129{
130 struct rcu_synchronize rcu;
131
132 if (rcu_blocking_is_gp())
133 return;
134
135 init_completion(&rcu.completion);
136 /* Will wake me after RCU finished. */
137 call_rcu_sched(&rcu.head, wakeme_after_rcu);
138 /* Wait for it. */
139 wait_for_completion(&rcu.completion);
140}
141EXPORT_SYMBOL_GPL(synchronize_sched);
142
143/**
144 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
145 *
146 * Control will return to the caller some time after a full rcu_bh grace
147 * period has elapsed, in other words after all currently executing rcu_bh
148 * read-side critical sections have completed. RCU read-side critical
149 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
150 * and may be nested.
151 */
152void synchronize_rcu_bh(void)
153{
154 struct rcu_synchronize rcu;
155
156 if (rcu_blocking_is_gp())
157 return;
158
159 init_completion(&rcu.completion);
160 /* Will wake me after RCU finished. */
161 call_rcu_bh(&rcu.head, wakeme_after_rcu);
162 /* Wait for it. */
163 wait_for_completion(&rcu.completion);
164}
165EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
166
167static void rcu_barrier_callback(struct rcu_head *notused)
168{
169 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
170 complete(&rcu_barrier_completion);
171}
172
173/*
174 * Called with preemption disabled, and from cross-cpu IRQ context.
175 */
176static void rcu_barrier_func(void *type)
177{
178 int cpu = smp_processor_id();
179 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
180
181 atomic_inc(&rcu_barrier_cpu_count);
182 switch ((enum rcu_barrier)type) {
183 case RCU_BARRIER_STD:
184 call_rcu(head, rcu_barrier_callback);
185 break;
186 case RCU_BARRIER_BH:
187 call_rcu_bh(head, rcu_barrier_callback);
188 break;
189 case RCU_BARRIER_SCHED:
190 call_rcu_sched(head, rcu_barrier_callback);
191 break;
192 }
193}
194
195static inline void wait_migrated_callbacks(void)
196{
197 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
198 smp_mb(); /* In case we didn't sleep. */
199}
200
201/*
202 * Orchestrate the specified type of RCU barrier, waiting for all
203 * RCU callbacks of the specified type to complete.
204 */
205static void _rcu_barrier(enum rcu_barrier type)
206{
207 BUG_ON(in_interrupt());
208 /* Take cpucontrol mutex to protect against CPU hotplug */
209 mutex_lock(&rcu_barrier_mutex);
210 init_completion(&rcu_barrier_completion);
211 /*
212 * Initialize rcu_barrier_cpu_count to 1, then invoke
213 * rcu_barrier_func() on each CPU, so that each CPU also has
214 * incremented rcu_barrier_cpu_count. Only then is it safe to
215 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
216 * might complete its grace period before all of the other CPUs
217 * did their increment, causing this function to return too
218 * early.
219 */
220 atomic_set(&rcu_barrier_cpu_count, 1);
221 on_each_cpu(rcu_barrier_func, (void *)type, 1);
222 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
223 complete(&rcu_barrier_completion);
224 wait_for_completion(&rcu_barrier_completion);
225 mutex_unlock(&rcu_barrier_mutex);
226 wait_migrated_callbacks();
227}
228
229/**
230 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
231 */
232void rcu_barrier(void)
233{
234 _rcu_barrier(RCU_BARRIER_STD);
235}
236EXPORT_SYMBOL_GPL(rcu_barrier);
237
238/**
239 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
240 */
241void rcu_barrier_bh(void)
242{
243 _rcu_barrier(RCU_BARRIER_BH);
244}
245EXPORT_SYMBOL_GPL(rcu_barrier_bh);
246
247/**
248 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
249 */
250void rcu_barrier_sched(void)
251{
252 _rcu_barrier(RCU_BARRIER_SCHED);
253}
254EXPORT_SYMBOL_GPL(rcu_barrier_sched);
255
256static void rcu_migrate_callback(struct rcu_head *notused)
257{
258 if (atomic_dec_and_test(&rcu_migrate_type_count))
259 wake_up(&rcu_migrate_wq);
260}
261
262extern int rcu_cpu_notify(struct notifier_block *self,
263 unsigned long action, void *hcpu);
264
265static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
266 unsigned long action, void *hcpu)
267{
268 rcu_cpu_notify(self, action, hcpu);
269 if (action == CPU_DYING) {
270 /*
271 * preempt_disable() in on_each_cpu() prevents stop_machine(),
272 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
273 * returns, all online cpus have queued rcu_barrier_func(),
274 * and the dead cpu (if it exists) queues rcu_migrate_callback()s.
275 *
276 * These callbacks ensure _rcu_barrier() waits for all
277 * RCU callbacks of the specified type to complete.
278 */
279 atomic_set(&rcu_migrate_type_count, 3);
280 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
281 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
282 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
283 } else if (action == CPU_DOWN_PREPARE) {
284 /* Don't need to wait until next removal operation. */
285 /* rcu_migrate_head is protected by cpu_add_remove_lock */
286 wait_migrated_callbacks();
287 }
288
289 return NOTIFY_OK;
290}
291
292void __init rcu_init(void)
293{
294 int i;
295
296 __rcu_init();
297 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
298
299 /*
300 * We don't need protection against CPU-hotplug here because
301 * this is called early in boot, before either interrupts
302 * or the scheduler are operational.
303 */
304 for_each_online_cpu(i)
305 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
306}
307
308void rcu_scheduler_starting(void)
309{
310 WARN_ON(num_online_cpus() != 1);
311 WARN_ON(nr_context_switches() > 0);
312 rcu_scheduler_active = 1;
313}
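
The generic rcu_barrier()/_rcu_barrier() machinery above is removed from rcupdate.c; the rcutiny.c file added below supplies its own rcu_barrier() variants, and the rcutree.c code further down refers to its own _rcu_barrier() in the orphan-callback comments. The caller-visible contract is unchanged. A typical use, sketched with made-up names (illustration only, not from the patch):

        #include <linux/module.h>
        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        static struct kmem_cache *foo_cache;    /* made-up example cache */

        static void __exit foo_exit(void)
        {
                /*
                 * synchronize_rcu() would only wait for readers; rcu_barrier()
                 * also waits for every already-queued call_rcu() callback to be
                 * invoked, so none of them can touch foo_cache (or this module's
                 * code) after this point.
                 */
                rcu_barrier();
                kmem_cache_destroy(foo_cache);
        }
        module_exit(foo_exit);
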
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..9f6d9ff2572c
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,282 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h>
27#include <linux/interrupt.h>
28#include <linux/notifier.h>
29#include <linux/rcupdate.h>
30#include <linux/kernel.h>
31#include <linux/module.h>
32#include <linux/mutex.h>
33#include <linux/sched.h>
34#include <linux/types.h>
35#include <linux/init.h>
36#include <linux/time.h>
37#include <linux/cpu.h>
38
39/* Global control variables for rcupdate callback mechanism. */
40struct rcu_ctrlblk {
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43 struct rcu_head **curtail; /* ->next pointer of last CB. */
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
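
For orientation (not part of the file): both tail fields point at ->next pointers within the single list, so an empty list has donetail == curtail == &rcucblist. A short trace of how they move:

        after two call_rcu() invocations, before any quiescent state:
                rcucblist -> cb1 -> cb2 -> NULL
                donetail  = &rcucblist       (nothing ready to invoke yet)
                curtail   = &cb2->next       (end of list)

        after rcu_sched_qs() calls rcu_qsctr_help() below:
                donetail  = curtail = &cb2->next    (cb1 and cb2 are now "done")

__rcu_process_callbacks() then splices the ready callbacks onto a local list and invokes them, exactly as in the code further down.
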
57#ifdef CONFIG_NO_HZ
58
59static long rcu_dynticks_nesting = 1;
60
61/*
62 * Enter dynticks-idle mode, which is an extended quiescent state
63 * if we have fully entered that mode (i.e., if the new value of
64 * dynticks_nesting is zero).
65 */
66void rcu_enter_nohz(void)
67{
68 if (--rcu_dynticks_nesting == 0)
69 rcu_sched_qs(0); /* implies rcu_bh_qs(0) */
70}
71
72/*
73 * Exit dynticks-idle mode, so that we are no longer in an extended
74 * quiescent state.
75 */
76void rcu_exit_nohz(void)
77{
78 rcu_dynticks_nesting++;
79}
80
81#endif /* #ifdef CONFIG_NO_HZ */
82
83/*
84 * Helper function for rcu_sched_qs() and rcu_bh_qs().
85 * Also disable irqs to avoid confusion due to interrupt handlers
86 * invoking call_rcu().
87 */
88static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
89{
90 unsigned long flags;
91
92 local_irq_save(flags);
93 if (rcp->rcucblist != NULL &&
94 rcp->donetail != rcp->curtail) {
95 rcp->donetail = rcp->curtail;
96 local_irq_restore(flags);
97 return 1;
98 }
99 local_irq_restore(flags);
100
101 return 0;
102}
103
104/*
105 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
106 * are at it, given that any rcu quiescent state is also an rcu_bh
107 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
108 */
109void rcu_sched_qs(int cpu)
110{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ);
113}
114
115/*
116 * Record an rcu_bh quiescent state.
117 */
118void rcu_bh_qs(int cpu)
119{
120 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
121 raise_softirq(RCU_SOFTIRQ);
122}
123
124/*
125 * Check to see if the scheduling-clock interrupt came from an extended
126 * quiescent state, and, if so, tell RCU about it.
127 */
128void rcu_check_callbacks(int cpu, int user)
129{
130 if (user ||
131 (idle_cpu(cpu) &&
132 !in_softirq() &&
133 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
134 rcu_sched_qs(cpu);
135 else if (!in_softirq())
136 rcu_bh_qs(cpu);
137}
138
139/*
140 * Helper function for rcu_process_callbacks() that operates on the
141 * specified rcu_ctrlblk structure.
142 */
143static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
144{
145 struct rcu_head *next, *list;
146 unsigned long flags;
147
148 /* If no RCU callbacks ready to invoke, just return. */
149 if (&rcp->rcucblist == rcp->donetail)
150 return;
151
152 /* Move the ready-to-invoke callbacks to a local list. */
153 local_irq_save(flags);
154 list = rcp->rcucblist;
155 rcp->rcucblist = *rcp->donetail;
156 *rcp->donetail = NULL;
157 if (rcp->curtail == rcp->donetail)
158 rcp->curtail = &rcp->rcucblist;
159 rcp->donetail = &rcp->rcucblist;
160 local_irq_restore(flags);
161
162 /* Invoke the callbacks on the local list. */
163 while (list) {
164 next = list->next;
165 prefetch(next);
166 list->func(list);
167 list = next;
168 }
169}
170
171/*
172 * Invoke any callbacks whose grace period has completed.
173 */
174static void rcu_process_callbacks(struct softirq_action *unused)
175{
176 __rcu_process_callbacks(&rcu_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178}
179
180/*
181 * Wait for a grace period to elapse. But it is illegal to invoke
182 * synchronize_sched() from within an RCU read-side critical section.
183 * Therefore, any legal call to synchronize_sched() is a quiescent
184 * state, and so on a UP system, synchronize_sched() need do nothing.
185 * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the
186 * benefits of doing might_sleep() to reduce latency.)
187 *
188 * Cool, huh? (Due to Josh Triplett.)
189 *
190 * But we want to make this a static inline later.
191 */
192void synchronize_sched(void)
193{
194 cond_resched();
195}
196EXPORT_SYMBOL_GPL(synchronize_sched);
197
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/*
205 * Helper function for call_rcu() and call_rcu_bh().
206 */
207static void __call_rcu(struct rcu_head *head,
208 void (*func)(struct rcu_head *rcu),
209 struct rcu_ctrlblk *rcp)
210{
211 unsigned long flags;
212
213 head->func = func;
214 head->next = NULL;
215
216 local_irq_save(flags);
217 *rcp->curtail = head;
218 rcp->curtail = &head->next;
219 local_irq_restore(flags);
220}
221
222/*
223 * Post an RCU callback to be invoked after the end of an RCU grace
224 * period. But since we have but one CPU, that would be after any
225 * quiescent state.
226 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{
229 __call_rcu(head, func, &rcu_ctrlblk);
230}
231EXPORT_SYMBOL_GPL(call_rcu);
232
233/*
234 * Post an RCU bottom-half callback to be invoked after any subsequent
235 * quiescent state.
236 */
237void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
238{
239 __call_rcu(head, func, &rcu_bh_ctrlblk);
240}
241EXPORT_SYMBOL_GPL(call_rcu_bh);
242
243void rcu_barrier(void)
244{
245 struct rcu_synchronize rcu;
246
247 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */
251 wait_for_completion(&rcu.completion);
252}
253EXPORT_SYMBOL_GPL(rcu_barrier);
254
255void rcu_barrier_bh(void)
256{
257 struct rcu_synchronize rcu;
258
259 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */
263 wait_for_completion(&rcu.completion);
264}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266
267void rcu_barrier_sched(void)
268{
269 struct rcu_synchronize rcu;
270
271 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */
275 wait_for_completion(&rcu.completion);
276}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278
279void __init rcu_init(void)
280{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282}
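
A minimal usage sketch for the call_rcu() interface implemented above (illustration only; struct foo and the function names are made up):

        #include <linux/kernel.h>
        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct foo {
                int data;
                struct rcu_head rcu;
        };

        /* Invoked after a grace period; no reader can still hold fp. */
        static void foo_reclaim(struct rcu_head *head)
        {
                struct foo *fp = container_of(head, struct foo, rcu);

                kfree(fp);
        }

        static void foo_release(struct foo *fp)
        {
                /* Readers found via rcu_dereference() may still be using fp. */
                call_rcu(&fp->rcu, foo_reclaim);
        }
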
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 233768f21f97..9bb52177af02 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p)
327 cur_ops->deferred_free(rp); 327 cur_ops->deferred_free(rp);
328} 328}
329 329
330static int rcu_no_completed(void)
331{
332 return 0;
333}
334
330static void rcu_torture_deferred_free(struct rcu_torture *p) 335static void rcu_torture_deferred_free(struct rcu_torture *p)
331{ 336{
332 call_rcu(&p->rtort_rcu, rcu_torture_cb); 337 call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .name = "rcu_sync" 393 .name = "rcu_sync"
389}; 394};
390 395
396static struct rcu_torture_ops rcu_expedited_ops = {
397 .init = rcu_sync_torture_init,
398 .cleanup = NULL,
399 .readlock = rcu_torture_read_lock,
400 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
401 .readunlock = rcu_torture_read_unlock,
402 .completed = rcu_no_completed,
403 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL,
406 .stats = NULL,
407 .irq_capable = 1,
408 .name = "rcu_expedited"
409};
410
391/* 411/*
392 * Definitions for rcu_bh torture testing. 412 * Definitions for rcu_bh torture testing.
393 */ 413 */
@@ -547,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = {
547 .name = "srcu" 567 .name = "srcu"
548}; 568};
549 569
570static void srcu_torture_synchronize_expedited(void)
571{
572 synchronize_srcu_expedited(&srcu_ctl);
573}
574
575static struct rcu_torture_ops srcu_expedited_ops = {
576 .init = srcu_torture_init,
577 .cleanup = srcu_torture_cleanup,
578 .readlock = srcu_torture_read_lock,
579 .read_delay = srcu_read_delay,
580 .readunlock = srcu_torture_read_unlock,
581 .completed = srcu_torture_completed,
582 .deferred_free = rcu_sync_torture_deferred_free,
583 .sync = srcu_torture_synchronize_expedited,
584 .cb_barrier = NULL,
585 .stats = srcu_torture_stats,
586 .name = "srcu_expedited"
587};
588
550/* 589/*
551 * Definitions for sched torture testing. 590 * Definitions for sched torture testing.
552 */ 591 */
@@ -562,11 +601,6 @@ static void sched_torture_read_unlock(int idx)
562 preempt_enable(); 601 preempt_enable();
563} 602}
564 603
565static int sched_torture_completed(void)
566{
567 return 0;
568}
569
570static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 604static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
571{ 605{
572 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 606 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -583,7 +617,7 @@ static struct rcu_torture_ops sched_ops = {
583 .readlock = sched_torture_read_lock, 617 .readlock = sched_torture_read_lock,
584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 618 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
585 .readunlock = sched_torture_read_unlock, 619 .readunlock = sched_torture_read_unlock,
586 .completed = sched_torture_completed, 620 .completed = rcu_no_completed,
587 .deferred_free = rcu_sched_torture_deferred_free, 621 .deferred_free = rcu_sched_torture_deferred_free,
588 .sync = sched_torture_synchronize, 622 .sync = sched_torture_synchronize,
589 .cb_barrier = rcu_barrier_sched, 623 .cb_barrier = rcu_barrier_sched,
@@ -592,13 +626,13 @@ static struct rcu_torture_ops sched_ops = {
592 .name = "sched" 626 .name = "sched"
593}; 627};
594 628
595static struct rcu_torture_ops sched_ops_sync = { 629static struct rcu_torture_ops sched_sync_ops = {
596 .init = rcu_sync_torture_init, 630 .init = rcu_sync_torture_init,
597 .cleanup = NULL, 631 .cleanup = NULL,
598 .readlock = sched_torture_read_lock, 632 .readlock = sched_torture_read_lock,
599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 633 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
600 .readunlock = sched_torture_read_unlock, 634 .readunlock = sched_torture_read_unlock,
601 .completed = sched_torture_completed, 635 .completed = rcu_no_completed,
602 .deferred_free = rcu_sync_torture_deferred_free, 636 .deferred_free = rcu_sync_torture_deferred_free,
603 .sync = sched_torture_synchronize, 637 .sync = sched_torture_synchronize,
604 .cb_barrier = NULL, 638 .cb_barrier = NULL,
@@ -606,15 +640,13 @@ static struct rcu_torture_ops sched_ops_sync = {
606 .name = "sched_sync" 640 .name = "sched_sync"
607}; 641};
608 642
609extern int rcu_expedited_torture_stats(char *page);
610
611static struct rcu_torture_ops sched_expedited_ops = { 643static struct rcu_torture_ops sched_expedited_ops = {
612 .init = rcu_sync_torture_init, 644 .init = rcu_sync_torture_init,
613 .cleanup = NULL, 645 .cleanup = NULL,
614 .readlock = sched_torture_read_lock, 646 .readlock = sched_torture_read_lock,
615 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 647 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
616 .readunlock = sched_torture_read_unlock, 648 .readunlock = sched_torture_read_unlock,
617 .completed = sched_torture_completed, 649 .completed = rcu_no_completed,
618 .deferred_free = rcu_sync_torture_deferred_free, 650 .deferred_free = rcu_sync_torture_deferred_free,
619 .sync = synchronize_sched_expedited, 651 .sync = synchronize_sched_expedited,
620 .cb_barrier = NULL, 652 .cb_barrier = NULL,
@@ -650,7 +682,7 @@ rcu_torture_writer(void *arg)
650 old_rp = rcu_torture_current; 682 old_rp = rcu_torture_current;
651 rp->rtort_mbtest = 1; 683 rp->rtort_mbtest = 1;
652 rcu_assign_pointer(rcu_torture_current, rp); 684 rcu_assign_pointer(rcu_torture_current, rp);
653 smp_wmb(); 685 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
654 if (old_rp) { 686 if (old_rp) {
655 i = old_rp->rtort_pipe_count; 687 i = old_rp->rtort_pipe_count;
656 if (i > RCU_TORTURE_PIPE_LEN) 688 if (i > RCU_TORTURE_PIPE_LEN)
@@ -731,13 +763,13 @@ static void rcu_torture_timer(unsigned long unused)
731 /* Should not happen, but... */ 763 /* Should not happen, but... */
732 pipe_count = RCU_TORTURE_PIPE_LEN; 764 pipe_count = RCU_TORTURE_PIPE_LEN;
733 } 765 }
734 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
735 completed = cur_ops->completed() - completed; 767 completed = cur_ops->completed() - completed;
736 if (completed > RCU_TORTURE_PIPE_LEN) { 768 if (completed > RCU_TORTURE_PIPE_LEN) {
737 /* Should not happen, but... */ 769 /* Should not happen, but... */
738 completed = RCU_TORTURE_PIPE_LEN; 770 completed = RCU_TORTURE_PIPE_LEN;
739 } 771 }
740 ++__get_cpu_var(rcu_torture_batch)[completed]; 772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
741 preempt_enable(); 773 preempt_enable();
742 cur_ops->readunlock(idx); 774 cur_ops->readunlock(idx);
743} 775}
@@ -786,13 +818,13 @@ rcu_torture_reader(void *arg)
786 /* Should not happen, but... */ 818 /* Should not happen, but... */
787 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
788 } 820 }
789 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
790 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
791 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
792 /* Should not happen, but... */ 824 /* Should not happen, but... */
793 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
794 } 826 }
795 ++__get_cpu_var(rcu_torture_batch)[completed]; 827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
796 preempt_enable(); 828 preempt_enable();
797 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
798 schedule(); 830 schedule();
@@ -1099,9 +1131,10 @@ rcu_torture_init(void)
1099 int cpu; 1131 int cpu;
1100 int firsterr = 0; 1132 int firsterr = 0;
1101 static struct rcu_torture_ops *torture_ops[] = 1133 static struct rcu_torture_ops *torture_ops[] =
1102 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1134 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1103 &sched_expedited_ops, 1135 &rcu_bh_ops, &rcu_bh_sync_ops,
1104 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1136 &srcu_ops, &srcu_expedited_ops,
1137 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1105 1138
1106 mutex_lock(&fullstop_mutex); 1139 mutex_lock(&fullstop_mutex);
1107 1140
@@ -1112,8 +1145,12 @@ rcu_torture_init(void)
1112 break; 1145 break;
1113 } 1146 }
1114 if (i == ARRAY_SIZE(torture_ops)) { 1147 if (i == ARRAY_SIZE(torture_ops)) {
1115 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1148 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1116 torture_type); 1149 torture_type);
1150 printk(KERN_ALERT "rcu-torture types:");
1151 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1152 printk(KERN_ALERT " %s", torture_ops[i]->name);
1153 printk(KERN_ALERT "\n");
1117 mutex_unlock(&fullstop_mutex); 1154 mutex_unlock(&fullstop_mutex);
1118 return -EINVAL; 1155 return -EINVAL;
1119 } 1156 }
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 52b06f6e158c..53ae9598f798 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,30 +46,30 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59/* Data structures. */ 53/* Data structures. */
60 54
55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
56
61#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(name) { \
62 .level = { &name.node[0] }, \ 58 .level = { &name.node[0] }, \
63 .levelcnt = { \ 59 .levelcnt = { \
64 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
65 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
66 NUM_RCU_LVL_2, \ 62 NUM_RCU_LVL_2, \
67 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ 63 NUM_RCU_LVL_3, \
64 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
68 }, \ 65 }, \
69 .signaled = RCU_SIGNAL_INIT, \ 66 .signaled = RCU_GP_IDLE, \
70 .gpnum = -300, \ 67 .gpnum = -300, \
71 .completed = -300, \ 68 .completed = -300, \
72 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
@@ -81,24 +81,18 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 83
84extern long rcu_batches_completed_sched(void); 84static int rcu_scheduler_active __read_mostly;
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100 85
101#include "rcutree_plugin.h" 86
87/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node
90 * structure's ->lock, but of course results can be subject to change.
91 */
92static int rcu_gp_in_progress(struct rcu_state *rsp)
93{
94 return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
95}
102 96
103/* 97/*
104 * Note a quiescent state. Because we do not need to know 98 * Note a quiescent state. Because we do not need to know
@@ -110,7 +104,7 @@ void rcu_sched_qs(int cpu)
110 struct rcu_data *rdp; 104 struct rcu_data *rdp;
111 105
112 rdp = &per_cpu(rcu_sched_data, cpu); 106 rdp = &per_cpu(rcu_sched_data, cpu);
113 rdp->passed_quiesc_completed = rdp->completed; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
114 barrier(); 108 barrier();
115 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
116 rcu_preempt_note_context_switch(cpu); 110 rcu_preempt_note_context_switch(cpu);
@@ -121,7 +115,7 @@ void rcu_bh_qs(int cpu)
121 struct rcu_data *rdp; 115 struct rcu_data *rdp;
122 116
123 rdp = &per_cpu(rcu_bh_data, cpu); 117 rdp = &per_cpu(rcu_bh_data, cpu);
124 rdp->passed_quiesc_completed = rdp->completed; 118 rdp->passed_quiesc_completed = rdp->gpnum - 1;
125 barrier(); 119 barrier();
126 rdp->passed_quiesc = 1; 120 rdp->passed_quiesc = 1;
127} 121}
@@ -137,6 +131,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */
137static int qhimark = 10000; /* If this many pending, ignore blimit. */ 131static int qhimark = 10000; /* If this many pending, ignore blimit. */
138static int qlowmark = 100; /* Once only this many pending, use blimit. */ 132static int qlowmark = 100; /* Once only this many pending, use blimit. */
139 133
134module_param(blimit, int, 0);
135module_param(qhimark, int, 0);
136module_param(qlowmark, int, 0);
137
140static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 138static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
141static int rcu_pending(int cpu); 139static int rcu_pending(int cpu);
142 140
@@ -173,9 +171,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
173static int 171static int
174cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 172cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
175{ 173{
176 /* ACCESS_ONCE() because we are accessing outside of lock. */ 174 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
177 return *rdp->nxttail[RCU_DONE_TAIL] &&
178 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
179} 175}
180 176
181/* 177/*
@@ -345,31 +341,12 @@ void rcu_irq_exit(void)
345 set_need_resched(); 341 set_need_resched();
346} 342}
347 343
348/*
349 * Record the specified "completed" value, which is later used to validate
350 * dynticks counter manipulations. Specify "rsp->completed - 1" to
351 * unconditionally invalidate any future dynticks manipulations (which is
352 * useful at the beginning of a grace period).
353 */
354static void dyntick_record_completed(struct rcu_state *rsp, long comp)
355{
356 rsp->dynticks_completed = comp;
357}
358
359#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
360 345
361/* 346/*
362 * Recall the previously recorded value of the completion for dynticks.
363 */
364static long dyntick_recall_completed(struct rcu_state *rsp)
365{
366 return rsp->dynticks_completed;
367}
368
369/*
370 * Snapshot the specified CPU's dynticks counter so that we can later 347 * Snapshot the specified CPU's dynticks counter so that we can later
371 * credit them with an implicit quiescent state. Return 1 if this CPU 348 * credit them with an implicit quiescent state. Return 1 if this CPU
372 * is already in a quiescent state courtesy of dynticks idle mode. 349 * is in dynticks idle mode, which is an extended quiescent state.
373 */ 350 */
374static int dyntick_save_progress_counter(struct rcu_data *rdp) 351static int dyntick_save_progress_counter(struct rcu_data *rdp)
375{ 352{
@@ -429,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
429 406
430#else /* #ifdef CONFIG_NO_HZ */ 407#else /* #ifdef CONFIG_NO_HZ */
431 408
432static void dyntick_record_completed(struct rcu_state *rsp, long comp)
433{
434}
435
436#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
437 410
438/*
439 * If there are no dynticks, then the only way that a CPU can passively
440 * be in a quiescent state is to be offline. Unlike dynticks idle, which
441 * is a point in time during the prior (already finished) grace period,
442 * an offline CPU is always in a quiescent state, and thus can be
443 * unconditionally applied. So just return the current value of completed.
444 */
445static long dyntick_recall_completed(struct rcu_state *rsp)
446{
447 return rsp->completed;
448}
449
450static int dyntick_save_progress_counter(struct rcu_data *rdp) 411static int dyntick_save_progress_counter(struct rcu_data *rdp)
451{ 412{
452 return 0; 413 return 0;
@@ -475,30 +436,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
475 long delta; 436 long delta;
476 unsigned long flags; 437 unsigned long flags;
477 struct rcu_node *rnp = rcu_get_root(rsp); 438 struct rcu_node *rnp = rcu_get_root(rsp);
478 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
479 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
480 439
481 /* Only let one CPU complain about others per time interval. */ 440 /* Only let one CPU complain about others per time interval. */
482 441
483 spin_lock_irqsave(&rnp->lock, flags); 442 spin_lock_irqsave(&rnp->lock, flags);
484 delta = jiffies - rsp->jiffies_stall; 443 delta = jiffies - rsp->jiffies_stall;
485 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { 444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
486 spin_unlock_irqrestore(&rnp->lock, flags); 445 spin_unlock_irqrestore(&rnp->lock, flags);
487 return; 446 return;
488 } 447 }
489 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
449
450 /*
451 * Now rat on any tasks that got kicked up to the root rcu_node
452 * due to CPU offlining.
453 */
454 rcu_print_task_stall(rnp);
490 spin_unlock_irqrestore(&rnp->lock, flags); 455 spin_unlock_irqrestore(&rnp->lock, flags);
491 456
492 /* OK, time to rat on our buddy... */ 457 /* OK, time to rat on our buddy... */
493 458
494 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 459 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
495 for (; rnp_cur < rnp_end; rnp_cur++) { 460 rcu_for_each_leaf_node(rsp, rnp) {
496 rcu_print_task_stall(rnp); 461 rcu_print_task_stall(rnp);
497 if (rnp_cur->qsmask == 0) 462 if (rnp->qsmask == 0)
498 continue; 463 continue;
499 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
500 if (rnp_cur->qsmask & (1UL << cpu)) 465 if (rnp->qsmask & (1UL << cpu))
501 printk(" %d", rnp_cur->grplo + cpu); 466 printk(" %d", rnp->grplo + cpu);
502 } 467 }
503 printk(" (detected by %d, t=%ld jiffies)\n", 468 printk(" (detected by %d, t=%ld jiffies)\n",
504 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 469 smp_processor_id(), (long)(jiffies - rsp->gp_start));
@@ -537,8 +502,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
537 /* We haven't checked in, so go dump stack. */ 502 /* We haven't checked in, so go dump stack. */
538 print_cpu_stall(rsp); 503 print_cpu_stall(rsp);
539 504
540 } else if (rsp->gpnum != rsp->completed && 505 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
541 delta >= RCU_STALL_RAT_DELAY) {
542 506
543 /* They had two time units to dump stack, so complain. */ 507 /* They had two time units to dump stack, so complain. */
544 print_other_cpu_stall(rsp); 508 print_other_cpu_stall(rsp);
@@ -560,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
560/* 524/*
561 * Update CPU-local rcu_data state to record the newly noticed grace period. 525 * Update CPU-local rcu_data state to record the newly noticed grace period.
562 * This is used both when we started the grace period and when we notice 526 * This is used both when we started the grace period and when we notice
563 * that someone else started the grace period. 527 * that someone else started the grace period. The caller must hold the
528 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
529 * and must have irqs disabled.
564 */ 530 */
531static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
532{
533 if (rdp->gpnum != rnp->gpnum) {
534 rdp->qs_pending = 1;
535 rdp->passed_quiesc = 0;
536 rdp->gpnum = rnp->gpnum;
537 }
538}
539
565static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) 540static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
566{ 541{
567 rdp->qs_pending = 1; 542 unsigned long flags;
568 rdp->passed_quiesc = 0; 543 struct rcu_node *rnp;
569 rdp->gpnum = rsp->gpnum; 544
545 local_irq_save(flags);
546 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
549 local_irq_restore(flags);
550 return;
551 }
552 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags);
570} 554}
571 555
572/* 556/*
@@ -590,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
590} 574}
591 575
592/* 576/*
577 * Advance this CPU's callbacks, but only if the current grace period
578 * has ended. This may be called only from the CPU to whom the rdp
579 * belongs. In addition, the corresponding leaf rcu_node structure's
580 * ->lock must be held by the caller, with irqs disabled.
581 */
582static void
583__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
584{
585 /* Did another grace period end? */
586 if (rdp->completed != rnp->completed) {
587
588 /* Advance callbacks. No harm if list empty. */
589 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
590 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
591 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
592
593 /* Remember that we saw this grace-period completion. */
594 rdp->completed = rnp->completed;
595 }
596}
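
For orientation (not part of the patch): rdp->nxttail[] holds four pointers into the CPU's single callback list, each marking the end of one segment:

        nxtlist --> done CBs --> waiting CBs --> next-ready CBs --> new CBs --> NULL

        nxttail[RCU_DONE_TAIL]        ends the callbacks ready to invoke
        nxttail[RCU_WAIT_TAIL]        ends the callbacks waiting on the current grace period
        nxttail[RCU_NEXT_READY_TAIL]  ends the callbacks queued for the next grace period
        nxttail[RCU_NEXT_TAIL]        ends the list (callbacks not yet assigned a grace period)

The three assignments in __rcu_process_gp_end() above slide the first three tails forward, so callbacks that were waiting on the just-completed grace period become ready to invoke.
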
597
598/*
599 * Advance this CPU's callbacks, but only if the current grace period
600 * has ended. This may be called only from the CPU to whom the rdp
601 * belongs.
602 */
603static void
604rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
605{
606 unsigned long flags;
607 struct rcu_node *rnp;
608
609 local_irq_save(flags);
610 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
613 local_irq_restore(flags);
614 return;
615 }
616 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags);
618}
619
620/*
621 * Do per-CPU grace-period initialization for running CPU. The caller
622 * must hold the lock of the leaf rcu_node structure corresponding to
623 * this CPU.
624 */
625static void
626rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
627{
628 /* Prior grace period ended, so advance callbacks for current CPU. */
629 __rcu_process_gp_end(rsp, rnp, rdp);
630
631 /*
632 * Because this CPU just now started the new grace period, we know
633 * that all of its callbacks will be covered by this upcoming grace
634 * period, even the ones that were registered arbitrarily recently.
635 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
636 *
637 * Other CPUs cannot be sure exactly when the grace period started.
638 * Therefore, their recently registered callbacks must pass through
639 * an additional RCU_NEXT_READY stage, so that they will be handled
640 * by the next RCU grace period.
641 */
642 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
643 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
644
645 /* Set state so that this CPU will detect the next quiescent state. */
646 __note_new_gpnum(rsp, rnp, rdp);
647}
648
649/*
593 * Start a new RCU grace period if warranted, re-initializing the hierarchy 650 * Start a new RCU grace period if warranted, re-initializing the hierarchy
594 * in preparation for detecting the next grace period. The caller must hold 651 * in preparation for detecting the next grace period. The caller must hold
595 * the root node's ->lock, which is released before return. Hard irqs must 652 * the root node's ->lock, which is released before return. Hard irqs must
@@ -603,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
603 struct rcu_node *rnp = rcu_get_root(rsp); 660 struct rcu_node *rnp = rcu_get_root(rsp);
604 661
605 if (!cpu_needs_another_gp(rsp, rdp)) { 662 if (!cpu_needs_another_gp(rsp, rdp)) {
606 spin_unlock_irqrestore(&rnp->lock, flags); 663 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags);
665 return;
666 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */
668
669 /*
670 * Propagate new ->completed value to rcu_node structures
671 * so that other CPUs don't have to wait until the start
672 * of the next grace period to process their callbacks.
673 */
674 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 }
679 local_irq_restore(flags);
607 return; 680 return;
608 } 681 }
609 682
@@ -613,23 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
613 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 686 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
614 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 687 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
615 record_gp_stall_check_time(rsp); 688 record_gp_stall_check_time(rsp);
616 dyntick_record_completed(rsp, rsp->completed - 1);
617 note_new_gpnum(rsp, rdp);
618
619 /*
620 * Because we are first, we know that all our callbacks will
621 * be covered by this upcoming grace period, even the ones
622 * that were registered arbitrarily recently.
623 */
624 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
625 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
626 689
627 /* Special-case the common single-level case. */ 690 /* Special-case the common single-level case. */
628 if (NUM_RCU_NODES == 1) { 691 if (NUM_RCU_NODES == 1) {
629 rcu_preempt_check_blocked_tasks(rnp); 692 rcu_preempt_check_blocked_tasks(rnp);
630 rnp->qsmask = rnp->qsmaskinit; 693 rnp->qsmask = rnp->qsmaskinit;
631 rnp->gpnum = rsp->gpnum; 694 rnp->gpnum = rsp->gpnum;
695 rnp->completed = rsp->completed;
632 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp);
633 spin_unlock_irqrestore(&rnp->lock, flags); 698 spin_unlock_irqrestore(&rnp->lock, flags);
634 return; 699 return;
635 } 700 }
@@ -657,70 +722,51 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
657 * one corresponding to this CPU, due to the fact that we have 722 * one corresponding to this CPU, due to the fact that we have
658 * irqs disabled. 723 * irqs disabled.
659 */ 724 */
660 for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { 725 rcu_for_each_node_breadth_first(rsp, rnp) {
661 spin_lock(&rnp->lock); /* irqs already disabled. */ 726 spin_lock(&rnp->lock); /* irqs already disabled. */
662 rcu_preempt_check_blocked_tasks(rnp); 727 rcu_preempt_check_blocked_tasks(rnp);
663 rnp->qsmask = rnp->qsmaskinit; 728 rnp->qsmask = rnp->qsmaskinit;
664 rnp->gpnum = rsp->gpnum; 729 rnp->gpnum = rsp->gpnum;
665 spin_unlock(&rnp->lock); /* irqs already disabled. */ 730 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */
666 } 734 }
667 735
736 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */
668 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */
669 spin_unlock_irqrestore(&rsp->onofflock, flags); 740 spin_unlock_irqrestore(&rsp->onofflock, flags);
670} 741}
671 742
672/* 743/*
673 * Advance this CPU's callbacks, but only if the current grace period 744 * Report a full set of quiescent states to the specified rcu_state
674 * has ended. This may be called only from the CPU to whom the rdp 745 * data structure. This involves cleaning up after the prior grace
675 * belongs. 746 * period and letting rcu_start_gp() start up the next grace period
676 */ 747 * if one is needed. Note that the caller must hold rnp->lock, as
677static void 748 * required by rcu_start_gp(), which will release it.
678rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
679{
680 long completed_snap;
681 unsigned long flags;
682
683 local_irq_save(flags);
684 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
685
686 /* Did another grace period end? */
687 if (rdp->completed != completed_snap) {
688
689 /* Advance callbacks. No harm if list empty. */
690 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
691 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
692 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
693
694 /* Remember that we saw this grace-period completion. */
695 rdp->completed = completed_snap;
696 }
697 local_irq_restore(flags);
698}
699
700/*
701 * Clean up after the prior grace period and let rcu_start_gp() start up
702 * the next grace period if one is needed. Note that the caller must
703 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
704 */ 749 */
705static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) 750static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
706 __releases(rnp->lock) 751 __releases(rcu_get_root(rsp)->lock)
707{ 752{
708 WARN_ON_ONCE(rsp->completed == rsp->gpnum); 753 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
709 rsp->completed = rsp->gpnum; 754 rsp->completed = rsp->gpnum;
710 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); 755 rsp->signaled = RCU_GP_IDLE;
711 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 756 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
712} 757}
713 758
714/* 759/*
715 * Similar to cpu_quiet(), for which it is a helper function. Allows 760 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
716 * a group of CPUs to be quieted at one go, though all the CPUs in the 761 * Allows quiescent states for a group of CPUs to be reported at one go
717 * group must be represented by the same leaf rcu_node structure. 762 * to the specified rcu_node structure, though all the CPUs in the group
718 * That structure's lock must be held upon entry, and it is released 763 * must be represented by the same rcu_node structure (which need not be
719 * before return. 764 * a leaf rcu_node structure, though it often will be). That structure's
765 * lock must be held upon entry, and it is released before return.
720 */ 766 */
721static void 767static void
722cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, 768rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
723 unsigned long flags) 769 struct rcu_node *rnp, unsigned long flags)
724 __releases(rnp->lock) 770 __releases(rnp->lock)
725{ 771{
726 struct rcu_node *rnp_c; 772 struct rcu_node *rnp_c;
@@ -756,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
756 802
757 /* 803 /*
758 * Get here if we are the last CPU to pass through a quiescent 804 * Get here if we are the last CPU to pass through a quiescent
759 * state for this grace period. Invoke cpu_quiet_msk_finish() 805 * state for this grace period. Invoke rcu_report_qs_rsp()
760 * to clean up and start the next grace period if one is needed. 806 * to clean up and start the next grace period if one is needed.
761 */ 807 */
762 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ 808 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
763} 809}
764 810
765/* 811/*
766 * Record a quiescent state for the specified CPU, which must either be 812 * Record a quiescent state for the specified CPU to that CPU's rcu_data
767 * the current CPU. The lastcomp argument is used to make sure we are 813 * structure. This must be either called from the specified CPU, or
768 * still in the grace period of interest. We don't want to end the current 814 * called when the specified CPU is known to be offline (and when it is
769 * grace period based on quiescent states detected in an earlier grace 815 * also known that no other CPU is concurrently trying to help the offline
770 * period! 816 * CPU). The lastcomp argument is used to make sure we are still in the
817 * grace period of interest. We don't want to end the current grace period
818 * based on quiescent states detected in an earlier grace period!
771 */ 819 */
772static void 820static void
773cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 821rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
774{ 822{
775 unsigned long flags; 823 unsigned long flags;
776 unsigned long mask; 824 unsigned long mask;
@@ -778,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
778 826
779 rnp = rdp->mynode; 827 rnp = rdp->mynode;
780 spin_lock_irqsave(&rnp->lock, flags); 828 spin_lock_irqsave(&rnp->lock, flags);
781 if (lastcomp != ACCESS_ONCE(rsp->completed)) { 829 if (lastcomp != rnp->completed) {
782 830
783 /* 831 /*
784 * Someone beat us to it for this grace period, so leave. 832 * Someone beat us to it for this grace period, so leave.
785 * The race with GP start is resolved by the fact that we 833 * The race with GP start is resolved by the fact that we
786 * hold the leaf rcu_node lock, so that the per-CPU bits 834 * hold the leaf rcu_node lock, so that the per-CPU bits
787 * cannot yet be initialized -- so we would simply find our 835 * cannot yet be initialized -- so we would simply find our
788 * CPU's bit already cleared in cpu_quiet_msk() if this race 836 * CPU's bit already cleared in rcu_report_qs_rnp() if this
789 * occurred. 837 * race occurred.
790 */ 838 */
791 rdp->passed_quiesc = 0; /* try again later! */ 839 rdp->passed_quiesc = 0; /* try again later! */
792 spin_unlock_irqrestore(&rnp->lock, flags); 840 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -804,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
804 */ 852 */
805 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 853 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
806 854
807 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 855 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* releases rnp->lock */
808 } 856 }
809} 857}
810 858
@@ -835,24 +883,73 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
835 if (!rdp->passed_quiesc) 883 if (!rdp->passed_quiesc)
836 return; 884 return;
837 885
838 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ 886 /*
839 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 887 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
888 * judge of that).
889 */
890 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
840} 891}
841 892
842#ifdef CONFIG_HOTPLUG_CPU 893#ifdef CONFIG_HOTPLUG_CPU
843 894
844/* 895/*
896 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
897 * specified flavor of RCU. The callbacks will be adopted by the next
898 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
899 * comes first. Because this is invoked from the CPU_DYING notifier,
900 * irqs are already disabled.
901 */
902static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
903{
904 int i;
905 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
906
907 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL;
913 for (i = 0; i < RCU_NEXT_SIZE; i++)
914 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918}
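
rcu_send_cbs_to_orphanage() splices the dying CPU's whole callback list onto rsp->orphan_cbs_list in constant time because both lists are kept as a head pointer plus a tail pointer-to-pointer. Below is a self-contained userspace sketch of that splice; struct cb and struct cb_list are hypothetical stand-ins for rcu_head and the nxtlist/nxttail (or orphan_cbs_list/orphan_cbs_tail) pair, and nothing here beyond the general idea is taken from the kernel.

#include <stdio.h>
#include <stdlib.h>

struct cb {
	struct cb *next;
	int id;
};

struct cb_list {
	struct cb *head;
	struct cb **tail;	/* points at head, or at the last element's ->next */
};

static void cb_list_init(struct cb_list *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

static void cb_list_enqueue(struct cb_list *l, struct cb *c)
{
	c->next = NULL;
	*l->tail = c;		/* link at the current tail position */
	l->tail = &c->next;	/* advance the tail pointer */
}

/* Splice all of @from onto the end of @to, leaving @from empty. */
static void cb_list_splice(struct cb_list *to, struct cb_list *from)
{
	if (from->head == NULL)
		return;
	*to->tail = from->head;
	to->tail = from->tail;
	cb_list_init(from);
}

int main(void)
{
	struct cb_list orphans, dying;
	struct cb a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct cb *p;

	cb_list_init(&orphans);
	cb_list_init(&dying);
	cb_list_enqueue(&dying, &a);
	cb_list_enqueue(&dying, &b);
	cb_list_enqueue(&orphans, &c);
	cb_list_splice(&orphans, &dying);
	for (p = orphans.head; p != NULL; p = p->next)
		printf("cb %d\n", p->id);	/* prints 3, 1, 2 */
	return 0;
}

Keeping the tail as a pointer to the last ->next field (or to the head pointer when the list is empty) is what lets both enqueue and splice avoid walking the list.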
919
920/*
921 * Adopt previously orphaned RCU callbacks.
922 */
923static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
924{
925 unsigned long flags;
926 struct rcu_data *rdp;
927
928 spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return;
933 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
935 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
936 rdp->qlen += rsp->orphan_qlen;
937 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags);
941}
942
943/*
845 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 944 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
846 * and move all callbacks from the outgoing CPU to the current one. 945 * and move all callbacks from the outgoing CPU to the current one.
847 */ 946 */
848static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 947static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
849{ 948{
850 int i;
851 unsigned long flags; 949 unsigned long flags;
852 long lastcomp;
853 unsigned long mask; 950 unsigned long mask;
951 int need_report = 0;
854 struct rcu_data *rdp = rsp->rda[cpu]; 952 struct rcu_data *rdp = rsp->rda[cpu];
855 struct rcu_data *rdp_me;
856 struct rcu_node *rnp; 953 struct rcu_node *rnp;
857 954
858 /* Exclude any attempts to start a new grace period. */ 955 /* Exclude any attempts to start a new grace period. */
@@ -865,42 +962,34 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
865 spin_lock(&rnp->lock); /* irqs already disabled. */ 962 spin_lock(&rnp->lock); /* irqs already disabled. */
866 rnp->qsmaskinit &= ~mask; 963 rnp->qsmaskinit &= ~mask;
867 if (rnp->qsmaskinit != 0) { 964 if (rnp->qsmaskinit != 0) {
868 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 965 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */
869 break; 967 break;
870 } 968 }
871 rcu_preempt_offline_tasks(rsp, rnp, rdp); 969 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */
872 mask = rnp->grpmask; 973 mask = rnp->grpmask;
873 spin_unlock(&rnp->lock); /* irqs remain disabled. */
874 rnp = rnp->parent; 974 rnp = rnp->parent;
875 } while (rnp != NULL); 975 } while (rnp != NULL);
876 lastcomp = rsp->completed;
877
878 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
879 976
880 /* 977 /*
881 * Move callbacks from the outgoing CPU to the running CPU. 978 * We still hold the leaf rcu_node structure lock here, and
882 * Note that the outgoing CPU is now quiescent, so it is now 979 * irqs are still disabled. The reason for this subterfuge is
883 * (uncharacteristically) safe to access its rcu_data structure. 980 * that invoking rcu_report_unblock_qs_rnp() with ->onofflock
884 * Note also that we must carefully retain the order of the 981 * held leads to deadlock.
885 * outgoing CPU's callbacks in order for rcu_barrier() to work
886 * correctly. Finally, note that we start all the callbacks
887 * afresh, even those that have passed through a grace period
888 * and are therefore ready to invoke. The theory is that hotplug
889 * events are rare, and that if they are frequent enough to
890 * indefinitely delay callbacks, you have far worse things to
891 * be worrying about.
892 */ 982 */
893 rdp_me = rsp->rda[smp_processor_id()]; 983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
894 if (rdp->nxtlist != NULL) { 984 rnp = rdp->mynode;
895 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 985 if (need_report & RCU_OFL_TASKS_NORM_GP)
896 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 986 rcu_report_unblock_qs_rnp(rnp, flags);
897 rdp->nxtlist = NULL; 987 else
898 for (i = 0; i < RCU_NEXT_SIZE; i++) 988 spin_unlock_irqrestore(&rnp->lock, flags);
899 rdp->nxttail[i] = &rdp->nxtlist; 989 if (need_report & RCU_OFL_TASKS_EXP_GP)
900 rdp_me->qlen += rdp->qlen; 990 rcu_report_exp_rnp(rsp, rnp);
901 rdp->qlen = 0; 991
902 } 992 rcu_adopt_orphan_cbs(rsp);
903 local_irq_restore(flags);
904} 993}
905 994
906/* 995/*
@@ -918,6 +1007,14 @@ static void rcu_offline_cpu(int cpu)
918 1007
919#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1008#else /* #ifdef CONFIG_HOTPLUG_CPU */
920 1009
1010static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
1011{
1012}
1013
1014static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1015{
1016}
1017
921static void rcu_offline_cpu(int cpu) 1018static void rcu_offline_cpu(int cpu)
922{ 1019{
923} 1020}
@@ -928,7 +1025,7 @@ static void rcu_offline_cpu(int cpu)
928 * Invoke any RCU callbacks that have made it to the end of their grace 1025 * Invoke any RCU callbacks that have made it to the end of their grace
929 * period. Throttle as specified by rdp->blimit. 1026 * period. Throttle as specified by rdp->blimit.
930 */ 1027 */
931static void rcu_do_batch(struct rcu_data *rdp) 1028static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
932{ 1029{
933 unsigned long flags; 1030 unsigned long flags;
934 struct rcu_head *next, *list, **tail; 1031 struct rcu_head *next, *list, **tail;
@@ -981,6 +1078,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
981 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1078 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
982 rdp->blimit = blimit; 1079 rdp->blimit = blimit;
983 1080
1081 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
1082 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
1083 rdp->qlen_last_fqs_check = 0;
1084 rdp->n_force_qs_snap = rsp->n_force_qs;
1085 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1086 rdp->qlen_last_fqs_check = rdp->qlen;
1087
984 local_irq_restore(flags); 1088 local_irq_restore(flags);
985 1089
986 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1090 /* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1050,33 +1154,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1050 int cpu; 1154 int cpu;
1051 unsigned long flags; 1155 unsigned long flags;
1052 unsigned long mask; 1156 unsigned long mask;
1053 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 1157 struct rcu_node *rnp;
1054 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
1055 1158
1056 for (; rnp_cur < rnp_end; rnp_cur++) { 1159 rcu_for_each_leaf_node(rsp, rnp) {
1057 mask = 0; 1160 mask = 0;
1058 spin_lock_irqsave(&rnp_cur->lock, flags); 1161 spin_lock_irqsave(&rnp->lock, flags);
1059 if (rsp->completed != lastcomp) { 1162 if (rnp->completed != lastcomp) {
1060 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1163 spin_unlock_irqrestore(&rnp->lock, flags);
1061 return 1; 1164 return 1;
1062 } 1165 }
1063 if (rnp_cur->qsmask == 0) { 1166 if (rnp->qsmask == 0) {
1064 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1167 spin_unlock_irqrestore(&rnp->lock, flags);
1065 continue; 1168 continue;
1066 } 1169 }
1067 cpu = rnp_cur->grplo; 1170 cpu = rnp->grplo;
1068 bit = 1; 1171 bit = 1;
1069 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { 1172 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1070 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1071 mask |= bit; 1174 mask |= bit;
1072 } 1175 }
1073 if (mask != 0 && rsp->completed == lastcomp) { 1176 if (mask != 0 && rnp->completed == lastcomp) {
1074 1177
1075 /* cpu_quiet_msk() releases rnp_cur->lock. */ 1178 /* rcu_report_qs_rnp() releases rnp->lock. */
1076 cpu_quiet_msk(mask, rsp, rnp_cur, flags); 1179 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1077 continue; 1180 continue;
1078 } 1181 }
1079 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1182 spin_unlock_irqrestore(&rnp->lock, flags);
1080 } 1183 }
1081 return 0; 1184 return 0;
1082} 1185}
@@ -1091,8 +1194,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1091 long lastcomp; 1194 long lastcomp;
1092 struct rcu_node *rnp = rcu_get_root(rsp); 1195 struct rcu_node *rnp = rcu_get_root(rsp);
1093 u8 signaled; 1196 u8 signaled;
1197 u8 forcenow;
1094 1198
1095 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) 1199 if (!rcu_gp_in_progress(rsp))
1096 return; /* No grace period in progress, nothing to force. */ 1200 return; /* No grace period in progress, nothing to force. */
1097 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1098 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
@@ -1103,19 +1207,20 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1103 goto unlock_ret; /* no emergency and done recently. */ 1207 goto unlock_ret; /* no emergency and done recently. */
1104 rsp->n_force_qs++; 1208 rsp->n_force_qs++;
1105 spin_lock(&rnp->lock); 1209 spin_lock(&rnp->lock);
1106 lastcomp = rsp->completed; 1210 lastcomp = rsp->gpnum - 1;
1107 signaled = rsp->signaled; 1211 signaled = rsp->signaled;
1108 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1109 if (lastcomp == rsp->gpnum) { 1213 if (!rcu_gp_in_progress(rsp)) {
1110 rsp->n_force_qs_ngp++; 1214 rsp->n_force_qs_ngp++;
1111 spin_unlock(&rnp->lock); 1215 spin_unlock(&rnp->lock);
1112 goto unlock_ret; /* no GP in progress, time updated. */ 1216 goto unlock_ret; /* no GP in progress, time updated. */
1113 } 1217 }
1114 spin_unlock(&rnp->lock); 1218 spin_unlock(&rnp->lock);
1115 switch (signaled) { 1219 switch (signaled) {
1220 case RCU_GP_IDLE:
1116 case RCU_GP_INIT: 1221 case RCU_GP_INIT:
1117 1222
1118 break; /* grace period still initializing, ignore. */ 1223 break; /* grace period idle or initializing, ignore. */
1119 1224
1120 case RCU_SAVE_DYNTICK: 1225 case RCU_SAVE_DYNTICK:
1121 1226
@@ -1126,20 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1126 if (rcu_process_dyntick(rsp, lastcomp, 1231 if (rcu_process_dyntick(rsp, lastcomp,
1127 dyntick_save_progress_counter)) 1232 dyntick_save_progress_counter))
1128 goto unlock_ret; 1233 goto unlock_ret;
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1129 1237
1130 /* Update state, record completion counter. */ 1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1131 spin_lock(&rnp->lock); 1240 spin_lock(&rnp->lock);
1132 if (lastcomp == rsp->completed) { 1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1133 rsp->signaled = RCU_FORCE_QS; 1244 rsp->signaled = RCU_FORCE_QS;
1134 dyntick_record_completed(rsp, lastcomp); 1245 rsp->completed_fqs = lastcomp;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1135 } 1247 }
1136 spin_unlock(&rnp->lock); 1248 spin_unlock(&rnp->lock);
1137 break; 1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1138 1252
1139 case RCU_FORCE_QS: 1253 case RCU_FORCE_QS:
1140 1254
1141 /* Check dyntick-idle state, send IPI to laggards. */ 1255 /* Check dyntick-idle state, send IPI to laggards. */
1142 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), 1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs,
1143 rcu_implicit_dynticks_qs)) 1257 rcu_implicit_dynticks_qs))
1144 goto unlock_ret; 1258 goto unlock_ret;
1145 1259
@@ -1195,7 +1309,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1195 } 1309 }
1196 1310
1197 /* If there are callbacks ready, invoke them. */ 1311 /* If there are callbacks ready, invoke them. */
1198 rcu_do_batch(rdp); 1312 rcu_do_batch(rsp, rdp);
1199} 1313}
1200 1314
1201/* 1315/*
@@ -1251,7 +1365,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1251 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1365 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1252 1366
1253 /* Start a new grace period if one not already started. */ 1367 /* Start a new grace period if one not already started. */
1254 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { 1368 if (!rcu_gp_in_progress(rsp)) {
1255 unsigned long nestflag; 1369 unsigned long nestflag;
1256 struct rcu_node *rnp_root = rcu_get_root(rsp); 1370 struct rcu_node *rnp_root = rcu_get_root(rsp);
1257 1371
@@ -1259,10 +1373,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1259 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1260 } 1374 }
1261 1375
1262 /* Force the grace period if too many callbacks or too long waiting. */ 1376 /*
1263 if (unlikely(++rdp->qlen > qhimark)) { 1377 * Force the grace period if too many callbacks or too long waiting.
1378 * Enforce hysteresis, and don't invoke force_quiescent_state()
1379 * if some other CPU has recently done so. Also, don't bother
1380 * invoking force_quiescent_state() if the newly enqueued callback
1381 * is the only one waiting for a grace period to complete.
1382 */
1383 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1264 rdp->blimit = LONG_MAX; 1384 rdp->blimit = LONG_MAX;
1265 force_quiescent_state(rsp, 0); 1385 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1386 *rdp->nxttail[RCU_DONE_TAIL] != head)
1387 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen;
1266 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1267 force_quiescent_state(rsp, 1); 1391 force_quiescent_state(rsp, 1);
1268 local_irq_restore(flags); 1392 local_irq_restore(flags);
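
To make the new hysteresis concrete (assuming the default qhimark of 10000): a CPU that enqueues callbacks steadily first trips the check above when its qlen exceeds qlen_last_fqs_check + 10000; force_quiescent_state() is then invoked only if no other CPU has forced quiescent states since the last snapshot and the new callback is not the only one queued, after which qlen_last_fqs_check is advanced to the current qlen. Another enqueue-path forcing attempt is therefore deferred until roughly 10000 more callbacks accumulate, unless rcu_do_batch() drains the queue to zero, or the queue drops more than qhimark below the last snapshot, either of which re-arms the trigger by lowering qlen_last_fqs_check.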
@@ -1286,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1286} 1410}
1287EXPORT_SYMBOL_GPL(call_rcu_bh); 1411EXPORT_SYMBOL_GPL(call_rcu_bh);
1288 1412
1413/**
1414 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1415 *
1416 * Control will return to the caller some time after a full rcu-sched
1417 * grace period has elapsed, in other words after all currently executing
1418 * rcu-sched read-side critical sections have completed. These read-side
1419 * critical sections are delimited by rcu_read_lock_sched() and
1420 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
1421 * local_irq_disable(), and so on may be used in place of
1422 * rcu_read_lock_sched().
1423 *
1424 * This means that all preempt_disable code sequences, including NMI and
1425 * hardware-interrupt handlers, in progress on entry will have completed
1426 * before this primitive returns. However, this does not guarantee that
1427 * softirq handlers will have completed, since in some kernels, these
1428 * handlers can run in process context, and can block.
1429 *
1430 * This primitive provides the guarantees made by the (now removed)
1431 * synchronize_kernel() API. In contrast, synchronize_rcu() only
1432 * guarantees that rcu_read_lock() sections will have completed.
1433 * In "classic RCU", these two guarantees happen to be one and
1434 * the same, but can differ in realtime RCU implementations.
1435 */
1436void synchronize_sched(void)
1437{
1438 struct rcu_synchronize rcu;
1439
1440 if (rcu_blocking_is_gp())
1441 return;
1442
1443 init_completion(&rcu.completion);
1444 /* Will wake me after RCU finished. */
1445 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1446 /* Wait for it. */
1447 wait_for_completion(&rcu.completion);
1448}
1449EXPORT_SYMBOL_GPL(synchronize_sched);
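
As a usage illustration of the guarantee documented above, here is a hedged sketch of the classic updater pattern that synchronize_sched() supports; struct gadget, global_gadget, and the two helpers are hypothetical and not part of this patch. Readers run with preemption disabled via rcu_read_lock_sched(), so once synchronize_sched() returns, no reader can still hold a reference to the old structure.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct gadget {
	int data;
};

static struct gadget *global_gadget;	/* published via rcu_assign_pointer() */

static int gadget_read(void)
{
	struct gadget *g;
	int val = -1;

	rcu_read_lock_sched();			/* disables preemption */
	g = rcu_dereference(global_gadget);
	if (g)
		val = g->data;
	rcu_read_unlock_sched();
	return val;
}

static void gadget_replace(struct gadget *new_g)
{
	struct gadget *old_g = global_gadget;

	rcu_assign_pointer(global_gadget, new_g);	/* publish the new version */
	synchronize_sched();		/* wait out all preempt-disabled readers */
	kfree(old_g);			/* no reader can still reference old_g */
}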
1450
1451/**
1452 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
1453 *
1454 * Control will return to the caller some time after a full rcu_bh grace
1455 * period has elapsed, in other words after all currently executing rcu_bh
1456 * read-side critical sections have completed. RCU read-side critical
1457 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
1458 * and may be nested.
1459 */
1460void synchronize_rcu_bh(void)
1461{
1462 struct rcu_synchronize rcu;
1463
1464 if (rcu_blocking_is_gp())
1465 return;
1466
1467 init_completion(&rcu.completion);
1468 /* Will wake me after RCU finished. */
1469 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1470 /* Wait for it. */
1471 wait_for_completion(&rcu.completion);
1472}
1473EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1474
1289/* 1475/*
1290 * Check to see if there is any immediate RCU-related work to be done 1476 * Check to see if there is any immediate RCU-related work to be done
1291 * by the current CPU, for the specified type of RCU, returning 1 if so. 1477 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1295,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1295 */ 1481 */
1296static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 1482static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1297{ 1483{
1484 struct rcu_node *rnp = rdp->mynode;
1485
1298 rdp->n_rcu_pending++; 1486 rdp->n_rcu_pending++;
1299 1487
1300 /* Check for CPU stalls, if enabled. */ 1488 /* Check for CPU stalls, if enabled. */
@@ -1319,19 +1507,19 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1319 } 1507 }
1320 1508
1321 /* Has another RCU grace period completed? */ 1509 /* Has another RCU grace period completed? */
1322 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ 1510 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
1323 rdp->n_rp_gp_completed++; 1511 rdp->n_rp_gp_completed++;
1324 return 1; 1512 return 1;
1325 } 1513 }
1326 1514
1327 /* Has a new RCU grace period started? */ 1515 /* Has a new RCU grace period started? */
1328 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ 1516 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
1329 rdp->n_rp_gp_started++; 1517 rdp->n_rp_gp_started++;
1330 return 1; 1518 return 1;
1331 } 1519 }
1332 1520
1333 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1334 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1522 if (rcu_gp_in_progress(rsp) &&
1335 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1336 rdp->n_rp_need_fqs++; 1524 rdp->n_rp_need_fqs++;
1337 return 1; 1525 return 1;
@@ -1369,6 +1557,97 @@ int rcu_needs_cpu(int cpu)
1369} 1557}
1370 1558
1371/* 1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex);
1577static struct completion rcu_barrier_completion;
1578
1579static void rcu_barrier_callback(struct rcu_head *notused)
1580{
1581 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1582 complete(&rcu_barrier_completion);
1583}
1584
1585/*
1586 * Called with preemption disabled, and from cross-cpu IRQ context.
1587 */
1588static void rcu_barrier_func(void *type)
1589{
1590 int cpu = smp_processor_id();
1591 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
1592 void (*call_rcu_func)(struct rcu_head *head,
1593 void (*func)(struct rcu_head *head));
1594
1595 atomic_inc(&rcu_barrier_cpu_count);
1596 call_rcu_func = type;
1597 call_rcu_func(head, rcu_barrier_callback);
1598}
1599
1600/*
1601 * Orchestrate the specified type of RCU barrier, waiting for all
1602 * RCU callbacks of the specified type to complete.
1603 */
1604static void _rcu_barrier(struct rcu_state *rsp,
1605 void (*call_rcu_func)(struct rcu_head *head,
1606 void (*func)(struct rcu_head *head)))
1607{
1608 BUG_ON(in_interrupt());
1609 /* Take mutex to serialize concurrent rcu_barrier() requests. */
1610 mutex_lock(&rcu_barrier_mutex);
1611 init_completion(&rcu_barrier_completion);
1612 /*
1613 * Initialize rcu_barrier_cpu_count to 1, then invoke
1614 * rcu_barrier_func() on each CPU, so that each CPU also has
1615 * incremented rcu_barrier_cpu_count. Only then is it safe to
1616 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1617 * might complete its grace period before all of the other CPUs
1618 * did their increment, causing this function to return too
1619 * early.
1620 */
1621 atomic_set(&rcu_barrier_cpu_count, 1);
1622 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1623 rcu_adopt_orphan_cbs(rsp);
1624 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1625 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1626 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1627 complete(&rcu_barrier_completion);
1628 wait_for_completion(&rcu_barrier_completion);
1629 mutex_unlock(&rcu_barrier_mutex);
1630}
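
The "initialize rcu_barrier_cpu_count to 1" trick described in the comment above is a general completion-counting idiom. Below is a self-contained userspace sketch of it using pthreads and C11 atomics (hypothetical names; it mirrors the structure of _rcu_barrier() but is not kernel code): the orchestrating thread holds one reference of its own while it hands out work, so completion can only fire on the final decrement, never while workers are still being registered.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int count;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
static int finished;

static void put_ref(void)
{
	if (atomic_fetch_sub(&count, 1) == 1) {	/* dropped the last reference */
		pthread_mutex_lock(&lock);
		finished = 1;
		pthread_cond_signal(&done);
		pthread_mutex_unlock(&lock);
	}
}

static void *worker(void *arg)
{
	(void)arg;
	put_ref();		/* analogous to rcu_barrier_callback() */
	return NULL;
}

int main(void)
{
	pthread_t tid[4];
	int i;

	atomic_store(&count, 1);	/* hold one reference ourselves */
	for (i = 0; i < 4; i++) {
		atomic_fetch_add(&count, 1);	/* one reference per worker */
		pthread_create(&tid[i], NULL, worker, NULL);
	}
	put_ref();			/* drop our own reference */
	pthread_mutex_lock(&lock);
	while (!finished)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
	for (i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	printf("all workers accounted for\n");
	return 0;
}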
1631
1632/**
1633 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
1634 */
1635void rcu_barrier_bh(void)
1636{
1637 _rcu_barrier(&rcu_bh_state, call_rcu_bh);
1638}
1639EXPORT_SYMBOL_GPL(rcu_barrier_bh);
1640
1641/**
1642 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
1643 */
1644void rcu_barrier_sched(void)
1645{
1646 _rcu_barrier(&rcu_sched_state, call_rcu_sched);
1647}
1648EXPORT_SYMBOL_GPL(rcu_barrier_sched);
1649
1650/*
1372 * Do boot-time initialization of a CPU's per-CPU RCU data. 1651 * Do boot-time initialization of a CPU's per-CPU RCU data.
1373 */ 1652 */
1374static void __init 1653static void __init
@@ -1403,21 +1682,18 @@ static void __cpuinit
1403rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1682rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1404{ 1683{
1405 unsigned long flags; 1684 unsigned long flags;
1406 long lastcomp;
1407 unsigned long mask; 1685 unsigned long mask;
1408 struct rcu_data *rdp = rsp->rda[cpu]; 1686 struct rcu_data *rdp = rsp->rda[cpu];
1409 struct rcu_node *rnp = rcu_get_root(rsp); 1687 struct rcu_node *rnp = rcu_get_root(rsp);
1410 1688
1411 /* Set up local state, ensuring consistent view of global state. */ 1689 /* Set up local state, ensuring consistent view of global state. */
1412 spin_lock_irqsave(&rnp->lock, flags); 1690 spin_lock_irqsave(&rnp->lock, flags);
1413 lastcomp = rsp->completed;
1414 rdp->completed = lastcomp;
1415 rdp->gpnum = lastcomp;
1416 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1417 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1418 rdp->beenonline = 1; /* We have now been online. */ 1693 rdp->beenonline = 1; /* We have now been online. */
1419 rdp->preemptable = preemptable; 1694 rdp->preemptable = preemptable;
1420 rdp->passed_quiesc_completed = lastcomp - 1; 1695 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs;
1421 rdp->blimit = blimit; 1697 rdp->blimit = blimit;
1422 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1423 1699
@@ -1437,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1437 spin_lock(&rnp->lock); /* irqs already disabled. */ 1713 spin_lock(&rnp->lock); /* irqs already disabled. */
1438 rnp->qsmaskinit |= mask; 1714 rnp->qsmaskinit |= mask;
1439 mask = rnp->grpmask; 1715 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) {
1717 rdp->gpnum = rnp->completed; /* if GP in progress... */
1718 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 }
1440 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1721 spin_unlock(&rnp->lock); /* irqs already disabled. */
1441 rnp = rnp->parent; 1722 rnp = rnp->parent;
1442 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
@@ -1454,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
1454/* 1735/*
1455 * Handle CPU online/offline notification events. 1736 * Handle CPU online/offline notification events.
1456 */ 1737 */
1457int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1738static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1458 unsigned long action, void *hcpu) 1739 unsigned long action, void *hcpu)
1459{ 1740{
1460 long cpu = (long)hcpu; 1741 long cpu = (long)hcpu;
1461 1742
@@ -1464,6 +1745,22 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1464 case CPU_UP_PREPARE_FROZEN: 1745 case CPU_UP_PREPARE_FROZEN:
1465 rcu_online_cpu(cpu); 1746 rcu_online_cpu(cpu);
1466 break; 1747 break;
1748 case CPU_DYING:
1749 case CPU_DYING_FROZEN:
1750 /*
1751 * preempt_disable() in _rcu_barrier() prevents stop_machine(),
1752 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
1753 * returns, all online cpus have queued rcu_barrier_func().
1754 * The dying CPU clears its cpu_online_mask bit and
1755 * moves all of its RCU callbacks to ->orphan_cbs_list
1756 * in the context of stop_machine(), so subsequent calls
1757 * to _rcu_barrier() will adopt these callbacks and only
1758 * then queue rcu_barrier_func() on all remaining CPUs.
1759 */
1760 rcu_send_cbs_to_orphanage(&rcu_bh_state);
1761 rcu_send_cbs_to_orphanage(&rcu_sched_state);
1762 rcu_preempt_send_cbs_to_orphanage();
1763 break;
1467 case CPU_DEAD: 1764 case CPU_DEAD:
1468 case CPU_DEAD_FROZEN: 1765 case CPU_DEAD_FROZEN:
1469 case CPU_UP_CANCELED: 1766 case CPU_UP_CANCELED:
@@ -1527,6 +1824,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1527 rnp = rsp->level[i]; 1824 rnp = rsp->level[i];
1528 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1529 spin_lock_init(&rnp->lock); 1826 spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1530 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1531 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1532 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1547,6 +1845,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1547 rnp->level = i; 1845 rnp->level = i;
1548 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 1846 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1549 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1847 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1848 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1550 } 1850 }
1551 } 1851 }
1552} 1852}
@@ -1558,6 +1858,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1558 */ 1858 */
1559#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1560do { \ 1860do { \
1861 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \
1561 rcu_init_one(rsp); \ 1865 rcu_init_one(rsp); \
1562 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1563 j = 0; \ 1867 j = 0; \
@@ -1570,41 +1874,30 @@ do { \
1570 } \ 1874 } \
1571} while (0) 1875} while (0)
1572 1876
1573#ifdef CONFIG_TREE_PREEMPT_RCU 1877void __init rcu_init(void)
1574
1575void __init __rcu_init_preempt(void)
1576{
1577 int i; /* All used by RCU_INIT_FLAVOR(). */
1578 int j;
1579 struct rcu_node *rnp;
1580
1581 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1582}
1583
1584#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1585
1586void __init __rcu_init_preempt(void)
1587{ 1878{
1588} 1879 int i;
1589
1590#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1591
1592void __init __rcu_init(void)
1593{
1594 int i; /* All used by RCU_INIT_FLAVOR(). */
1595 int j;
1596 struct rcu_node *rnp;
1597 1880
1598 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1599#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1600 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1601#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1602 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1603 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1604 __rcu_init_preempt(); 1890 __rcu_init_preempt();
1605 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1891 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1892
1893 /*
1894 * We don't need protection against CPU-hotplug here because
1895 * this is called early in boot, before either interrupts
1896 * or the scheduler are operational.
1897 */
1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);
1606} 1901}
1607 1902
1608module_param(blimit, int, 0); 1903#include "rcutree_plugin.h"
1609module_param(qhimark, int, 0);
1610module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 8e8287a983c2..d2a0046f63b2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere. 35 * bug somewhere.
36 */ 36 */
37#define MAX_RCU_LVLS 3 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
41 42
42#if NR_CPUS <= RCU_FANOUT 43#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1 44# define NUM_RCU_LVLS 1
@@ -45,23 +46,33 @@
45# define NUM_RCU_LVL_1 (NR_CPUS) 46# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0 47# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0 48# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0
48#elif NR_CPUS <= RCU_FANOUT_SQ 50#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 51# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 52# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) 53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 54# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 55# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 57#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 58# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 59# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) 60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) 61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 62# define NUM_RCU_LVL_3 NR_CPUS
63# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH
65# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70# define NUM_RCU_LVL_4 NR_CPUS
60#else 71#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 73#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63 74
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) 75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
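
As a worked example of these definitions (assuming CONFIG_RCU_FANOUT=64 and NR_CPUS=4096): RCU_FANOUT_SQ is 4096, so the NR_CPUS <= RCU_FANOUT_SQ branch applies, giving NUM_RCU_LVLS=2, NUM_RCU_LVL_0=1, NUM_RCU_LVL_1=64 and NUM_RCU_LVL_2=4096. RCU_SUM is then 4161 and NUM_RCU_NODES is 65: one root rcu_node plus 64 leaf rcu_node structures, each leaf covering up to 64 CPUs. The new fourth level only comes into play beyond RCU_FANOUT_CUBE CPUs (262144 with a fanout of 64).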
66 77
67/* 78/*
@@ -79,24 +90,67 @@ struct rcu_dynticks {
79 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
80 */ 91 */
81struct rcu_node { 92struct rcu_node {
82 spinlock_t lock; 93 spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */
83 long gpnum; /* Current grace period for this node. */ 95 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */
99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/ 102 /* order for current grace period to proceed.*/
103 /* In leaf rcu_node, each bit corresponds to */
104 /* an rcu_data structure, otherwise, each */
105 /* bit corresponds to a child rcu_node */
106 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */
108 /* elements that need to drain to allow the */
109 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */
88 unsigned long qsmaskinit; 111 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */ 112 /* Per-GP initial value for qsmask & expmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 113 unsigned long grpmask; /* Mask to apply to parent qsmask. */
114 /* Only one bit will be set in this mask. */
91 int grplo; /* lowest-numbered CPU or group here. */ 115 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */ 116 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */ 117 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */ 118 u8 level; /* root is at level 0. */
95 struct rcu_node *parent; 119 struct rcu_node *parent;
96 struct list_head blocked_tasks[2]; 120 struct list_head blocked_tasks[4];
97 /* Tasks blocked in RCU read-side critsect. */ 121 /* Tasks blocked in RCU read-side critsect. */
122 /* Grace period number (->gpnum) x blocked */
123 /* by tasks on the (x & 0x1) element of the */
124 /* blocked_tasks[] array. */
98} ____cacheline_internodealigned_in_smp; 125} ____cacheline_internodealigned_in_smp;
99 126
127/*
128 * Do a full breadth-first scan of the rcu_node structures for the
129 * specified rcu_state structure.
130 */
131#define rcu_for_each_node_breadth_first(rsp, rnp) \
132 for ((rnp) = &(rsp)->node[0]; \
133 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
134
135/*
136 * Do a breadth-first scan of the non-leaf rcu_node structures for the
137 * specified rcu_state structure. Note that if there is a singleton
138 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
139 */
140#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
141 for ((rnp) = &(rsp)->node[0]; \
142 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
143
144/*
145 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
146 * structure. Note that if there is a singleton rcu_node tree with but
147 * one rcu_node structure, this loop -will- visit the rcu_node structure.
148 * It is still a leaf node, even if it is also the root node.
149 */
150#define rcu_for_each_leaf_node(rsp, rnp) \
151 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
152 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
153
100/* Index values for nxttail array in struct rcu_data. */ 154/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 155#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ 156#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
@@ -126,23 +180,30 @@ struct rcu_data {
126 * Any of the partitions might be empty, in which case the 180 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for 181 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of 182 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL. 183 * the nxttail elements point to the ->nxtlist pointer itself,
184 * which in that case is NULL.
130 * 185 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]): 186 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed 187 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and 188 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved 189 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks(). 190 * here temporarily in rcu_process_callbacks().
191 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
192 * Entries that batch # <= ->completed - 1: waiting for current GP
193 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
194 * Entries known to have arrived before current GP ended
195 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
196 * Entries that might have arrived after current GP ended
197 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
198 * always be NULL, as this is the end of the list.
142 */ 199 */
143 struct rcu_head *nxtlist; 200 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 201 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */
205 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */
146 long blimit; /* Upper limit on a processed batch */ 207 long blimit; /* Upper limit on a processed batch */
147 208
148#ifdef CONFIG_NO_HZ 209#ifdef CONFIG_NO_HZ
@@ -173,13 +234,15 @@ struct rcu_data {
173}; 234};
174 235
175/* Values for signaled field in struct rcu_state. */ 236/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ 242#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */ 244#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS 245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED
183#endif /* #else #ifdef CONFIG_NO_HZ */ 246#endif /* #else #ifdef CONFIG_NO_HZ */
184 247
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -216,10 +279,23 @@ struct rcu_state {
216 /* Force QS state. */ 279 /* Force QS state. */
217 long gpnum; /* Current gp number. */ 280 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */ 281 long completed; /* # of last completed gp. */
282
283 /* End of fields guarded by root rcu_node's lock. */
284
219 spinlock_t onofflock; /* exclude on/offline and */ 285 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */ 286 /* starting new GP. Also */
287 /* protects the following */
288 /* orphan_cbs fields. */
289 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
290 /* orphaned by all CPUs in */
291 /* a given leaf rcu_node */
292 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */
221 spinlock_t fqslock; /* Only one task forcing */ 295 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */ 296 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */ 299 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */ 300 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */ 301 unsigned long n_force_qs; /* Number of calls to */
@@ -234,11 +310,15 @@ struct rcu_state {
234 unsigned long jiffies_stall; /* Time at which to check */ 310 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */ 311 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 312#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240}; 313};
241 314
315/* Return values for rcu_preempt_offline_tasks(). */
316
317#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
318 /* GP were moved to root. */
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */
321
242#ifdef RCU_TREE_NONCORE 322#ifdef RCU_TREE_NONCORE
243 323
244/* 324/*
@@ -255,5 +335,37 @@ extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257 337
258#endif /* #ifdef RCU_TREE_NONCORE */ 338#else /* #ifdef RCU_TREE_NONCORE */
339
340/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void);
342long rcu_batches_completed(void);
343static void rcu_preempt_note_context_switch(int cpu);
344static int rcu_preempted_readers(struct rcu_node *rnp);
345#ifdef CONFIG_HOTPLUG_CPU
346static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
350static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
353#ifdef CONFIG_HOTPLUG_CPU
354static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
355 struct rcu_node *rnp,
356 struct rcu_data *rdp);
357static void rcu_preempt_offline_cpu(int cpu);
358#endif /* #ifdef CONFIG_HOTPLUG_CPU */
359static void rcu_preempt_check_callbacks(int cpu);
360static void rcu_preempt_process_callbacks(void);
361void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
362#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
363static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
364#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
365static int rcu_preempt_pending(int cpu);
366static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void);
259 370
371#endif /* #else #ifdef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 1cee04f627eb..37fbccdf41d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,16 +24,19 @@
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */ 25 */
26 26
27#include <linux/delay.h>
27 28
28#ifdef CONFIG_TREE_PREEMPT_RCU 29#ifdef CONFIG_TREE_PREEMPT_RCU
29 30
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 32DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32 33
34static int rcu_preempted_readers_exp(struct rcu_node *rnp);
35
33/* 36/*
34 * Tell them what RCU they are running. 37 * Tell them what RCU they are running.
35 */ 38 */
36static inline void rcu_bootup_announce(void) 39static void __init rcu_bootup_announce(void)
37{ 40{
38 printk(KERN_INFO 41 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n"); 42 "Experimental preemptable hierarchical RCU implementation.\n");
@@ -67,7 +70,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
67static void rcu_preempt_qs(int cpu) 70static void rcu_preempt_qs(int cpu)
68{ 71{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 72 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed; 73 rdp->passed_quiesc_completed = rdp->gpnum - 1;
71 barrier(); 74 barrier();
72 rdp->passed_quiesc = 1; 75 rdp->passed_quiesc = 1;
73} 76}
@@ -150,11 +153,65 @@ void __rcu_read_lock(void)
150} 153}
151EXPORT_SYMBOL_GPL(__rcu_read_lock); 154EXPORT_SYMBOL_GPL(__rcu_read_lock);
152 155
156/*
157 * Check for preempted RCU readers blocking the current grace period
158 * for the specified rcu_node structure. If the caller needs a reliable
159 * answer, it must hold the rcu_node's ->lock.
160 */
161static int rcu_preempted_readers(struct rcu_node *rnp)
162{
163 int phase = rnp->gpnum & 0x1;
164
165 return !list_empty(&rnp->blocked_tasks[phase]) ||
166 !list_empty(&rnp->blocked_tasks[phase + 2]);
167}
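
The indexing in rcu_preempted_readers() above relies on the grace-period number's parity: lists 0 and 2 hold readers relevant to even-numbered grace periods (normal and expedited, respectively), while lists 1 and 3 serve odd-numbered ones, so tasks that need only block the next grace period can be queued on the opposite-parity pair without disturbing the current one. A trivial userspace illustration of the arithmetic (nothing here is kernel code):

#include <stdio.h>

int main(void)
{
	long gpnum;

	for (gpnum = 4; gpnum < 8; gpnum++) {
		int phase = gpnum & 0x1;	/* parity selects the list pair */

		printf("gp %ld: normal list %d, expedited list %d\n",
		       gpnum, phase, phase + 2);
	}
	return 0;
}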
168
169/*
170 * Record a quiescent state for all tasks that were previously queued
171 * on the specified rcu_node structure and that were blocking the current
172 * RCU grace period. The caller must hold the specified rnp->lock with
173 * irqs disabled, and this lock is released upon return, but irqs remain
174 * disabled.
175 */
176static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
177 __releases(rnp->lock)
178{
179 unsigned long mask;
180 struct rcu_node *rnp_p;
181
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */
185 }
186
187 rnp_p = rnp->parent;
188 if (rnp_p == NULL) {
189 /*
190 * Either there is only one rcu_node in the tree,
191 * or tasks were kicked up to root rcu_node due to
192 * CPUs going offline.
193 */
194 rcu_report_qs_rsp(&rcu_preempt_state, flags);
195 return;
196 }
197
198 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203}
204
205/*
206 * Handle special cases during rcu_read_unlock(), such as needing to
207 * notify RCU core processing or task having blocked during the RCU
208 * read-side critical section.
209 */
153static void rcu_read_unlock_special(struct task_struct *t) 210static void rcu_read_unlock_special(struct task_struct *t)
154{ 211{
155 int empty; 212 int empty;
213 int empty_exp;
156 unsigned long flags; 214 unsigned long flags;
157 unsigned long mask;
158 struct rcu_node *rnp; 215 struct rcu_node *rnp;
159 int special; 216 int special;
160 217
@@ -196,37 +253,31 @@ static void rcu_read_unlock_special(struct task_struct *t)
196 break; 253 break;
197 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 254 spin_unlock(&rnp->lock); /* irqs remain disabled. */
198 } 255 }
199 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 256 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp);
258 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
200 list_del_init(&t->rcu_node_entry); 259 list_del_init(&t->rcu_node_entry);
201 t->rcu_blocked_node = NULL; 260 t->rcu_blocked_node = NULL;
202 261
203 /* 262 /*
204 * If this was the last task on the current list, and if 263 * If this was the last task on the current list, and if
205 * we aren't waiting on any CPUs, report the quiescent state. 264 * we aren't waiting on any CPUs, report the quiescent state.
206 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() 265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
207 * drop rnp->lock and restore irq.
208 */ 266 */
209 if (!empty && rnp->qsmask == 0 && 267 if (empty)
210 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
211 struct rcu_node *rnp_p;
212
213 if (rnp->parent == NULL) {
214 /* Only one rcu_node in the tree. */
215 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
216 return;
217 }
218 /* Report up the rest of the hierarchy. */
219 mask = rnp->grpmask;
220 spin_unlock_irqrestore(&rnp->lock, flags); 268 spin_unlock_irqrestore(&rnp->lock, flags);
221 rnp_p = rnp->parent; 269 else
222 spin_lock_irqsave(&rnp_p->lock, flags); 270 rcu_report_unblock_qs_rnp(rnp, flags);
223 WARN_ON_ONCE(rnp->qsmask); 271
224 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); 272 /*
225 return; 273 * If this was the last task on the expedited lists,
226 } 274 * then we need to report up the rcu_node hierarchy.
227 spin_unlock(&rnp->lock); 275 */
276 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
277 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
278 } else {
279 local_irq_restore(flags);
228 } 280 }
229 local_irq_restore(flags);
230} 281}
231 282
232/* 283/*
@@ -257,12 +308,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
257{ 308{
258 unsigned long flags; 309 unsigned long flags;
259 struct list_head *lp; 310 struct list_head *lp;
260 int phase = rnp->gpnum & 0x1; 311 int phase;
261 struct task_struct *t; 312 struct task_struct *t;
262 313
263 if (!list_empty(&rnp->blocked_tasks[phase])) { 314 if (rcu_preempted_readers(rnp)) {
264 spin_lock_irqsave(&rnp->lock, flags); 315 spin_lock_irqsave(&rnp->lock, flags);
265 phase = rnp->gpnum & 0x1; /* re-read under lock. */ 316 phase = rnp->gpnum & 0x1;
266 lp = &rnp->blocked_tasks[phase]; 317 lp = &rnp->blocked_tasks[phase];
267 list_for_each_entry(t, lp, rcu_node_entry) 318 list_for_each_entry(t, lp, rcu_node_entry)
268 printk(" P%d", t->pid); 319 printk(" P%d", t->pid);
@@ -281,20 +332,10 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
281 */ 332 */
282static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 333static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
283{ 334{
284 WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); 335 WARN_ON_ONCE(rcu_preempted_readers(rnp));
285 WARN_ON_ONCE(rnp->qsmask); 336 WARN_ON_ONCE(rnp->qsmask);
286} 337}
287 338
288/*
289 * Check for preempted RCU readers for the specified rcu_node structure.
290 * If the caller needs a reliable answer, it must hold the rcu_node's
291 * >lock.
292 */
293static int rcu_preempted_readers(struct rcu_node *rnp)
294{
295 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
296}
297
298#ifdef CONFIG_HOTPLUG_CPU 339#ifdef CONFIG_HOTPLUG_CPU
299 340
300/* 341/*
@@ -303,26 +344,34 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
303 * rcu_node. The reason for not just moving them to the immediate 344 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to 345 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock. 346 * make more than two attempts to acquire the target rcu_node's lock.
347 * Returns a bit mask indicating which grace periods were being
348 * blocked by tasks queued on this rcu_node structure when the CPU
349 * went offline: RCU_OFL_TASKS_NORM_GP for the current normal grace
350 * period and RCU_OFL_TASKS_EXP_GP for the current expedited grace
351 * period, so the caller knows which reports it still must make.
306 * 352 *
307 * The caller must hold rnp->lock with irqs disabled. 353 * The caller must hold rnp->lock with irqs disabled.
308 */ 354 */
309static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 355static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
310 struct rcu_node *rnp, 356 struct rcu_node *rnp,
311 struct rcu_data *rdp) 357 struct rcu_data *rdp)
312{ 358{
313 int i; 359 int i;
314 struct list_head *lp; 360 struct list_head *lp;
315 struct list_head *lp_root; 361 struct list_head *lp_root;
362 int retval = 0;
316 struct rcu_node *rnp_root = rcu_get_root(rsp); 363 struct rcu_node *rnp_root = rcu_get_root(rsp);
317 struct task_struct *tp; 364 struct task_struct *tp;
318 365
319 if (rnp == rnp_root) { 366 if (rnp == rnp_root) {
320 WARN_ONCE(1, "Last CPU thought to be offlined?"); 367 WARN_ONCE(1, "Last CPU thought to be offlined?");
321 return; /* Shouldn't happen: at least one CPU online. */ 368 return 0; /* Shouldn't happen: at least one CPU online. */
322 } 369 }
323 WARN_ON_ONCE(rnp != rdp->mynode && 370 WARN_ON_ONCE(rnp != rdp->mynode &&
324 (!list_empty(&rnp->blocked_tasks[0]) || 371 (!list_empty(&rnp->blocked_tasks[0]) ||
325 !list_empty(&rnp->blocked_tasks[1]))); 372 !list_empty(&rnp->blocked_tasks[1]) ||
373 !list_empty(&rnp->blocked_tasks[2]) ||
374 !list_empty(&rnp->blocked_tasks[3])));
326 375
327 /* 376 /*
328 * Move tasks up to root rcu_node. Rely on the fact that the 377 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -330,7 +379,11 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
330 * rcu_nodes in terms of gp_num value. This fact allows us to 379 * rcu_nodes in terms of gp_num value. This fact allows us to
331 * move the blocked_tasks[] array directly, element by element. 380 * move the blocked_tasks[] array directly, element by element.
332 */ 381 */
333 for (i = 0; i < 2; i++) { 382 if (rcu_preempted_readers(rnp))
383 retval |= RCU_OFL_TASKS_NORM_GP;
384 if (rcu_preempted_readers_exp(rnp))
385 retval |= RCU_OFL_TASKS_EXP_GP;
386 for (i = 0; i < 4; i++) {
334 lp = &rnp->blocked_tasks[i]; 387 lp = &rnp->blocked_tasks[i];
335 lp_root = &rnp_root->blocked_tasks[i]; 388 lp_root = &rnp_root->blocked_tasks[i];
336 while (!list_empty(lp)) { 389 while (!list_empty(lp)) {
@@ -342,6 +395,7 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
342 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
343 } 396 }
344 } 397 }
398 return retval;
345} 399}
346 400
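The RCU_OFL_TASKS_NORM_GP and RCU_OFL_TASKS_EXP_GP bits returned above are consumed by the CPU-offline path in rcutree.c, which is not part of this hunk. A rough sketch of such a caller, reconstructed from the flag and function names rather than quoted from the patch (it relies on kernel-internal types from rcutree.h, and rcu_report_unblock_qs_rnp() is assumed to release rnp->lock):

static void example_offline_report(struct rcu_state *rsp, struct rcu_node *rnp,
				   struct rcu_data *rdp, unsigned long flags)
{
	int need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);

	if (need_report & RCU_OFL_TASKS_NORM_GP)
		rcu_report_unblock_qs_rnp(rnp, flags);	/* drops rnp->lock */
	else
		spin_unlock_irqrestore(&rnp->lock, flags);
	if (need_report & RCU_OFL_TASKS_EXP_GP)
		rcu_report_exp_rnp(rsp, rnp);
}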
347/* 401/*
@@ -392,6 +446,186 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
392} 446}
393EXPORT_SYMBOL_GPL(call_rcu); 447EXPORT_SYMBOL_GPL(call_rcu);
394 448
449/**
450 * synchronize_rcu - wait until a grace period has elapsed.
451 *
452 * Control will return to the caller some time after a full grace
453 * period has elapsed, in other words after all currently executing RCU
454 * read-side critical sections have completed. RCU read-side critical
455 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
456 * and may be nested.
457 */
458void synchronize_rcu(void)
459{
460 struct rcu_synchronize rcu;
461
462 if (!rcu_scheduler_active)
463 return;
464
465 init_completion(&rcu.completion);
466 /* Will wake me after RCU finished. */
467 call_rcu(&rcu.head, wakeme_after_rcu);
468 /* Wait for it. */
469 wait_for_completion(&rcu.completion);
470}
471EXPORT_SYMBOL_GPL(synchronize_rcu);
472
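As a reminder of what this primitive gives its callers, the classic updater pattern built on synchronize_rcu() looks roughly like the sketch below; struct foo, foo_list, foo_lock and foo_del() are illustrative names, not taken from this patch.

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	struct list_head list;
	int data;
};

static LIST_HEAD(foo_list);		/* readers walk this under rcu_read_lock() */
static DEFINE_SPINLOCK(foo_lock);	/* serializes updaters only */

static void foo_del(struct foo *fp)
{
	spin_lock(&foo_lock);
	list_del_rcu(&fp->list);	/* unlink; pre-existing readers may still see fp */
	spin_unlock(&foo_lock);
	synchronize_rcu();		/* wait for those pre-existing readers to finish */
	kfree(fp);			/* now no reader can hold a reference */
}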
473static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
474static long sync_rcu_preempt_exp_count;
475static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
476
477/*
478 * Return non-zero if there are any tasks in RCU read-side critical
479 * sections blocking the current preemptible-RCU expedited grace period.
480 * If there is no preemptible-RCU expedited grace period currently in
481 * progress, returns zero unconditionally.
482 */
483static int rcu_preempted_readers_exp(struct rcu_node *rnp)
484{
485 return !list_empty(&rnp->blocked_tasks[2]) ||
486 !list_empty(&rnp->blocked_tasks[3]);
487}
488
489/*
490 * return non-zero if there is no RCU expedited grace period in progress
491 * for the specified rcu_node structure, in other words, if all CPUs and
492 * tasks covered by the specified rcu_node structure have done their bit
493 * for the current expedited grace period. Works only for preemptible
 494 * RCU -- other RCU implementations use other means.
495 *
496 * Caller must hold sync_rcu_preempt_exp_mutex.
497 */
498static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
499{
500 return !rcu_preempted_readers_exp(rnp) &&
501 ACCESS_ONCE(rnp->expmask) == 0;
502}
503
504/*
505 * Report the exit from RCU read-side critical section for the last task
506 * that queued itself during or before the current expedited preemptible-RCU
507 * grace period. This event is reported either to the rcu_node structure on
508 * which the task was queued or to one of that rcu_node structure's ancestors,
509 * recursively up the tree. (Calm down, calm down, we do the recursion
510 * iteratively!)
511 *
512 * Caller must hold sync_rcu_preempt_exp_mutex.
513 */
514static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
515{
516 unsigned long flags;
517 unsigned long mask;
518
519 spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp))
522 break;
523 if (rnp->parent == NULL) {
524 wake_up(&sync_rcu_preempt_exp_wq);
525 break;
526 }
527 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask;
532 }
533 spin_unlock_irqrestore(&rnp->lock, flags);
534}
535
536/*
537 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
538 * grace period for the specified rcu_node structure. If there are no such
539 * tasks, report it up the rcu_node hierarchy.
540 *
541 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
542 */
543static void
544sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{
546 int must_wait;
547
548 spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp);
555}
556
557/*
558 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
559 * is to invoke synchronize_sched_expedited() to push all the tasks to
560 * the ->blocked_tasks[] lists, move all entries from the first set of
561 * ->blocked_tasks[] lists to the second set, and finally wait for this
562 * second set to drain.
563 */
564void synchronize_rcu_expedited(void)
565{
566 unsigned long flags;
567 struct rcu_node *rnp;
568 struct rcu_state *rsp = &rcu_preempt_state;
569 long snap;
570 int trycount = 0;
571
572 smp_mb(); /* Caller's modifications seen first by other CPUs. */
573 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
574 smp_mb(); /* Above access cannot bleed into critical section. */
575
576 /*
577 * Acquire lock, falling back to synchronize_rcu() if too many
578 * lock-acquisition failures. Of course, if someone does the
579 * expedited grace period for us, just leave.
580 */
581 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
582 if (trycount++ < 10)
583 udelay(trycount * num_online_cpus());
584 else {
585 synchronize_rcu();
586 return;
587 }
588 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
589 goto mb_ret; /* Others did our work for us. */
590 }
591 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
592 goto unlock_mb_ret; /* Others did our work for us. */
593
594 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited();
596
597 spin_lock_irqsave(&rsp->onofflock, flags);
598
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 }
605
606 /* Snapshot current state of ->blocked_tasks[] lists. */
607 rcu_for_each_leaf_node(rsp, rnp)
608 sync_rcu_preempt_exp_init(rsp, rnp);
609 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611
612 spin_unlock_irqrestore(&rsp->onofflock, flags);
613
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp);
616 wait_event(sync_rcu_preempt_exp_wq,
617 sync_rcu_preempt_exp_done(rnp));
618
619 /* Clean up and exit. */
620 smp_mb(); /* ensure expedited GP seen before counter increment. */
621 ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
622unlock_mb_ret:
623 mutex_unlock(&sync_rcu_preempt_exp_mutex);
624mb_ret:
625 smp_mb(); /* ensure subsequent action seen after grace period. */
626}
627EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
628
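From a caller's point of view synchronize_rcu_expedited() is a drop-in replacement for synchronize_rcu() that trades extra CPU overhead (it leans on synchronize_sched_expedited()) for a much shorter wait, so it suits rare but latency-sensitive update paths. An illustrative caller, reusing the made-up struct foo from the earlier sketch and assuming a single updater:

#include <linux/rcupdate.h>
#include <linux/slab.h>

static struct foo *global_foo;			/* illustrative pointer read under RCU */

static void foo_replace(struct foo *newp)
{
	struct foo *oldp = global_foo;		/* sole updater, so a plain read suffices */

	rcu_assign_pointer(global_foo, newp);	/* publish the new version */
	synchronize_rcu_expedited();		/* all readers of oldp have finished */
	kfree(oldp);
}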
395/* 629/*
396 * Check to see if there is any immediate preemptable-RCU-related work 630 * Check to see if there is any immediate preemptable-RCU-related work
397 * to be done. 631 * to be done.
@@ -410,6 +644,15 @@ static int rcu_preempt_needs_cpu(int cpu)
410 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 644 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
411} 645}
412 646
647/**
648 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
649 */
650void rcu_barrier(void)
651{
652 _rcu_barrier(&rcu_preempt_state, call_rcu);
653}
654EXPORT_SYMBOL_GPL(rcu_barrier);
655
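rcu_barrier() matters mostly at module-unload time: a module must not be freed while call_rcu() callbacks that point into its code or data are still queued. A minimal, illustrative exit path (the helper and cache names are made up):

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

static struct kmem_cache *foo_cachep;		/* illustrative */
static void unregister_foo_notifiers(void);	/* illustrative */

static void __exit foo_exit(void)
{
	unregister_foo_notifiers();		/* stop queueing new callbacks first */
	rcu_barrier();				/* wait for every callback already queued */
	kmem_cache_destroy(foo_cachep);		/* only now is teardown safe */
}
module_exit(foo_exit);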
413/* 656/*
414 * Initialize preemptable RCU's per-CPU data. 657 * Initialize preemptable RCU's per-CPU data.
415 */ 658 */
@@ -419,6 +662,22 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
419} 662}
420 663
421/* 664/*
665 * Move preemptable RCU's callbacks to ->orphan_cbs_list.
666 */
667static void rcu_preempt_send_cbs_to_orphanage(void)
668{
669 rcu_send_cbs_to_orphanage(&rcu_preempt_state);
670}
671
672/*
673 * Initialize preemptable RCU's state structures.
674 */
675static void __init __rcu_init_preempt(void)
676{
677 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
678}
679
680/*
422 * Check for a task exiting while in a preemptable-RCU read-side 681 * Check for a task exiting while in a preemptable-RCU read-side
423 * critical section, clean up if so. No need to issue warnings, 682 * critical section, clean up if so. No need to issue warnings,
424 * as debug_check_no_locks_held() already does this if lockdep 683 * as debug_check_no_locks_held() already does this if lockdep
@@ -439,7 +698,7 @@ void exit_rcu(void)
439/* 698/*
440 * Tell them what RCU they are running. 699 * Tell them what RCU they are running.
441 */ 700 */
442static inline void rcu_bootup_announce(void) 701static void __init rcu_bootup_announce(void)
443{ 702{
444 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 703 printk(KERN_INFO "Hierarchical RCU implementation.\n");
445} 704}
@@ -461,6 +720,25 @@ static void rcu_preempt_note_context_switch(int cpu)
461{ 720{
462} 721}
463 722
723/*
724 * Because preemptable RCU does not exist, there are never any preempted
725 * RCU readers.
726 */
727static int rcu_preempted_readers(struct rcu_node *rnp)
728{
729 return 0;
730}
731
732#ifdef CONFIG_HOTPLUG_CPU
733
734/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{
737 spin_unlock_irqrestore(&rnp->lock, flags);
738}
739
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */
741
464#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 742#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
465 743
466/* 744/*
@@ -483,25 +761,19 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
483 WARN_ON_ONCE(rnp->qsmask); 761 WARN_ON_ONCE(rnp->qsmask);
484} 762}
485 763
486/*
487 * Because preemptable RCU does not exist, there are never any preempted
488 * RCU readers.
489 */
490static int rcu_preempted_readers(struct rcu_node *rnp)
491{
492 return 0;
493}
494
495#ifdef CONFIG_HOTPLUG_CPU 764#ifdef CONFIG_HOTPLUG_CPU
496 765
497/* 766/*
498 * Because preemptable RCU does not exist, it never needs to migrate 767 * Because preemptable RCU does not exist, it never needs to migrate
499 * tasks that were blocked within RCU read-side critical sections. 768 * tasks that were blocked within RCU read-side critical sections, and
769 * such non-existent tasks cannot possibly have been blocking the current
770 * grace period.
500 */ 771 */
501static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 772static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
502 struct rcu_node *rnp, 773 struct rcu_node *rnp,
503 struct rcu_data *rdp) 774 struct rcu_data *rdp)
504{ 775{
776 return 0;
505} 777}
506 778
507/* 779/*
@@ -518,7 +790,7 @@ static void rcu_preempt_offline_cpu(int cpu)
518 * Because preemptable RCU does not exist, it never has any callbacks 790 * Because preemptable RCU does not exist, it never has any callbacks
519 * to check. 791 * to check.
520 */ 792 */
521void rcu_preempt_check_callbacks(int cpu) 793static void rcu_preempt_check_callbacks(int cpu)
522{ 794{
523} 795}
524 796
@@ -526,7 +798,7 @@ void rcu_preempt_check_callbacks(int cpu)
526 * Because preemptable RCU does not exist, it never has any callbacks 798 * Because preemptable RCU does not exist, it never has any callbacks
527 * to process. 799 * to process.
528 */ 800 */
529void rcu_preempt_process_callbacks(void) 801static void rcu_preempt_process_callbacks(void)
530{ 802{
531} 803}
532 804
@@ -540,6 +812,30 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
540EXPORT_SYMBOL_GPL(call_rcu); 812EXPORT_SYMBOL_GPL(call_rcu);
541 813
542/* 814/*
815 * Wait for an rcu-preempt grace period, but make it happen quickly.
816 * But because preemptable RCU does not exist, map to rcu-sched.
817 */
818void synchronize_rcu_expedited(void)
819{
820 synchronize_sched_expedited();
821}
822EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
823
824#ifdef CONFIG_HOTPLUG_CPU
825
826/*
827 * Because preemptable RCU does not exist, there is never any need to
828 * report on tasks preempted in RCU read-side critical sections during
829 * expedited RCU grace periods.
830 */
831static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
832{
833 return;
834}
835
836#endif /* #ifdef CONFIG_HOTPLUG_CPU */
837
838/*
543 * Because preemptable RCU does not exist, it never has any work to do. 839 * Because preemptable RCU does not exist, it never has any work to do.
544 */ 840 */
545static int rcu_preempt_pending(int cpu) 841static int rcu_preempt_pending(int cpu)
@@ -556,6 +852,16 @@ static int rcu_preempt_needs_cpu(int cpu)
556} 852}
557 853
558/* 854/*
855 * Because preemptable RCU does not exist, rcu_barrier() is just
856 * another name for rcu_barrier_sched().
857 */
858void rcu_barrier(void)
859{
860 rcu_barrier_sched();
861}
862EXPORT_SYMBOL_GPL(rcu_barrier);
863
864/*
559 * Because preemptable RCU does not exist, there is no per-CPU 865 * Because preemptable RCU does not exist, there is no per-CPU
560 * data to initialize. 866 * data to initialize.
561 */ 867 */
@@ -563,4 +869,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
563{ 869{
564} 870}
565 871
872/*
873 * Because there is no preemptable RCU, there are no callbacks to move.
874 */
875static void rcu_preempt_send_cbs_to_orphanage(void)
876{
877}
878
879/*
880 * Because preemptable RCU does not exist, it need not be initialized.
881 */
882static void __init __rcu_init_preempt(void)
883{
884}
885
566#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c89f5e9fd173..9d2c88423b31 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file)
93 return single_open(file, show_rcudata, NULL); 93 return single_open(file, show_rcudata, NULL);
94} 94}
95 95
96static struct file_operations rcudata_fops = { 96static const struct file_operations rcudata_fops = {
97 .owner = THIS_MODULE, 97 .owner = THIS_MODULE,
98 .open = rcudata_open, 98 .open = rcudata_open,
99 .read = seq_read, 99 .read = seq_read,
@@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file)
145 return single_open(file, show_rcudata_csv, NULL); 145 return single_open(file, show_rcudata_csv, NULL);
146} 146}
147 147
148static struct file_operations rcudata_csv_fops = { 148static const struct file_operations rcudata_csv_fops = {
149 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
150 .open = rcudata_csv_open, 150 .open = rcudata_csv_open,
151 .read = seq_read, 151 .read = seq_read,
@@ -155,24 +155,32 @@ static struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum;
158 int level = 0; 159 int level = 0;
160 int phase;
159 struct rcu_node *rnp; 161 struct rcu_node *rnp;
160 162
163 gpnum = rsp->gpnum;
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 168 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 169 rsp->n_force_qs, rsp->n_force_qs_ngp,
167 rsp->n_force_qs - rsp->n_force_qs_ngp, 170 rsp->n_force_qs - rsp->n_force_qs_ngp,
168 rsp->n_force_qs_lh); 171 rsp->n_force_qs_lh, rsp->orphan_qlen);
169 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 172 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
170 if (rnp->level != level) { 173 if (rnp->level != level) {
171 seq_puts(m, "\n"); 174 seq_puts(m, "\n");
172 level = rnp->level; 175 level = rnp->level;
173 } 176 }
174 seq_printf(m, "%lx/%lx %d:%d ^%d ", 177 phase = gpnum & 0x1;
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
175 rnp->qsmask, rnp->qsmaskinit, 179 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
176 rnp->grplo, rnp->grphi, rnp->grpnum); 184 rnp->grplo, rnp->grphi, rnp->grpnum);
177 } 185 }
178 seq_puts(m, "\n"); 186 seq_puts(m, "\n");
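The "T."[...] and "E."[...] expressions above are ordinary C string indexing: a string literal indexed with 0 or 1 yields its first or second character, so a non-empty blocked-tasks list prints 'T' (or 'E') and an empty one prints '.'. A standalone illustration:

#include <stdio.h>

int main(void)
{
	int empty = 1;

	printf("%c\n", "T."[empty]);	/* prints '.' */
	printf("%c\n", "T."[!empty]);	/* prints 'T' */
	return 0;
}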
@@ -196,7 +204,7 @@ static int rcuhier_open(struct inode *inode, struct file *file)
196 return single_open(file, show_rcuhier, NULL); 204 return single_open(file, show_rcuhier, NULL);
197} 205}
198 206
199static struct file_operations rcuhier_fops = { 207static const struct file_operations rcuhier_fops = {
200 .owner = THIS_MODULE, 208 .owner = THIS_MODULE,
201 .open = rcuhier_open, 209 .open = rcuhier_open,
202 .read = seq_read, 210 .read = seq_read,
@@ -222,7 +230,7 @@ static int rcugp_open(struct inode *inode, struct file *file)
222 return single_open(file, show_rcugp, NULL); 230 return single_open(file, show_rcugp, NULL);
223} 231}
224 232
225static struct file_operations rcugp_fops = { 233static const struct file_operations rcugp_fops = {
226 .owner = THIS_MODULE, 234 .owner = THIS_MODULE,
227 .open = rcugp_open, 235 .open = rcugp_open,
228 .read = seq_read, 236 .read = seq_read,
@@ -276,7 +284,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file)
276 return single_open(file, show_rcu_pending, NULL); 284 return single_open(file, show_rcu_pending, NULL);
277} 285}
278 286
279static struct file_operations rcu_pending_fops = { 287static const struct file_operations rcu_pending_fops = {
280 .owner = THIS_MODULE, 288 .owner = THIS_MODULE,
281 .open = rcu_pending_open, 289 .open = rcu_pending_open,
282 .read = seq_read, 290 .read = seq_read,
diff --git a/kernel/relay.c b/kernel/relay.c
index bc188549788f..c705a41b4ba3 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
60/* 60/*
61 * vm_ops for relay file mappings. 61 * vm_ops for relay file mappings.
62 */ 62 */
63static struct vm_operations_struct relay_file_mmap_ops = { 63static const struct vm_operations_struct relay_file_mmap_ops = {
64 .fault = relay_buf_fault, 64 .fault = relay_buf_fault,
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
@@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1198 relay_consume_bytes(rbuf, buf->private); 1198 relay_consume_bytes(rbuf, buf->private);
1199} 1199}
1200 1200
1201static struct pipe_buf_operations relay_pipe_buf_ops = { 1201static const struct pipe_buf_operations relay_pipe_buf_ops = {
1202 .can_merge = 0, 1202 .can_merge = 0,
1203 .map = generic_pipe_buf_map, 1203 .map = generic_pipe_buf_map,
1204 .unmap = generic_pipe_buf_unmap, 1204 .unmap = generic_pipe_buf_unmap,
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index e1338f074314..bcdabf37c40b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = RESOURCE_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->soft_limit = RESOURCE_MAX;
22 counter->parent = parent; 23 counter->parent = parent;
23} 24}
24 25
@@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member)
101 return &counter->limit; 102 return &counter->limit;
102 case RES_FAILCNT: 103 case RES_FAILCNT:
103 return &counter->failcnt; 104 return &counter->failcnt;
105 case RES_SOFT_LIMIT:
106 return &counter->soft_limit;
104 }; 107 };
105 108
106 BUG(); 109 BUG();
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..dc15686b7a77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
308 void *alignf_data) 308 void *alignf_data)
309{ 309{
310 struct resource *this = root->child; 310 struct resource *this = root->child;
311 resource_size_t start, end;
311 312
312 new->start = root->start; 313 start = root->start;
313 /* 314 /*
314 * Skip past an allocated resource that starts at 0, since the assignment 315 * Skip past an allocated resource that starts at 0, since the assignment
315 * of this->start - 1 to new->end below would cause an underflow. 316 * of this->start - 1 to new->end below would cause an underflow.
316 */ 317 */
317 if (this && this->start == 0) { 318 if (this && this->start == 0) {
318 new->start = this->end + 1; 319 start = this->end + 1;
319 this = this->sibling; 320 this = this->sibling;
320 } 321 }
321 for(;;) { 322 for(;;) {
322 if (this) 323 if (this)
323 new->end = this->start - 1; 324 end = this->start - 1;
324 else 325 else
325 new->end = root->end; 326 end = root->end;
326 if (new->start < min) 327 if (start < min)
327 new->start = min; 328 start = min;
328 if (new->end > max) 329 if (end > max)
329 new->end = max; 330 end = max;
330 new->start = ALIGN(new->start, align); 331 start = ALIGN(start, align);
331 if (alignf) 332 if (alignf)
332 alignf(alignf_data, new, size, align); 333 alignf(alignf_data, new, size, align);
333 if (new->start < new->end && new->end - new->start >= size - 1) { 334 if (start < end && end - start >= size - 1) {
334 new->end = new->start + size - 1; 335 new->start = start;
336 new->end = start + size - 1;
335 return 0; 337 return 0;
336 } 338 }
337 if (!this) 339 if (!this)
338 break; 340 break;
339 new->start = this->end + 1; 341 start = this->end + 1;
340 this = this->sibling; 342 this = this->sibling;
341 } 343 }
342 return -EBUSY; 344 return -EBUSY;
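The rewritten loop keeps the candidate window in local start/end variables and writes new->start/new->end only once a window of the requested size has actually been found. The core fit-and-align test can be shown in isolation; align_up() below stands in for the kernel's ALIGN() macro and the numbers are arbitrary:

#include <stdio.h>

typedef unsigned long long resource_size_t;

static resource_size_t align_up(resource_size_t x, resource_size_t a)
{
	return (x + a - 1) & ~(a - 1);		/* a must be a power of two */
}

/* Fill *res_start/*res_end and return 0 if [gap_start, gap_end] can hold
 * an aligned block of the given size, -1 otherwise.
 */
static int fit_in_gap(resource_size_t gap_start, resource_size_t gap_end,
		      resource_size_t size, resource_size_t align,
		      resource_size_t *res_start, resource_size_t *res_end)
{
	resource_size_t start = align_up(gap_start, align);

	if (start < gap_end && gap_end - start >= size - 1) {
		*res_start = start;		/* commit only on success */
		*res_end = start + size - 1;
		return 0;
	}
	return -1;
}

int main(void)
{
	resource_size_t s, e;

	if (!fit_in_gap(0x1001, 0x2fff, 0x800, 0x1000, &s, &e))
		printf("fits at [%#llx, %#llx]\n", s, e);	/* 0x2000..0x27ff */
	return 0;
}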
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 5fcb4fe645e2..ddabb54bb5c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -37,8 +37,8 @@ do { \
37 if (rt_trace_on) { \ 37 if (rt_trace_on) { \
38 rt_trace_on = 0; \ 38 rt_trace_on = 0; \
39 console_verbose(); \ 39 console_verbose(); \
40 if (spin_is_locked(&current->pi_lock)) \ 40 if (raw_spin_is_locked(&current->pi_lock)) \
41 spin_unlock(&current->pi_lock); \ 41 raw_spin_unlock(&current->pi_lock); \
42 } \ 42 } \
43} while (0) 43} while (0)
44 44
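The rtmutex and scheduler changes that follow are part of the spinlock_t/raw_spinlock_t split: raw_spinlock_t always stays a true spinning lock, whereas spinlock_t is intended to become a sleeping lock on PREEMPT_RT, so locks such as pi_lock, the rt-mutex wait_lock and the runqueue lock move to the raw API. That API mirrors the familiar one with a raw_ prefix; a minimal sketch with an illustrative lock:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);	/* illustrative, not from the patch */
static unsigned long example_counter;

static void example_bump(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);	/* never sleeps, even on -rt */
	example_counter++;
	raw_spin_unlock_irqrestore(&example_lock, flags);
}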
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 29bd4baf9e75..a9604815786a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
138{ 138{
139 unsigned long flags; 139 unsigned long flags;
140 140
141 spin_lock_irqsave(&task->pi_lock, flags); 141 raw_spin_lock_irqsave(&task->pi_lock, flags);
142 __rt_mutex_adjust_prio(task); 142 __rt_mutex_adjust_prio(task);
143 spin_unlock_irqrestore(&task->pi_lock, flags); 143 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
144} 144}
145 145
146/* 146/*
@@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
195 /* 195 /*
196 * Task can not go away as we did a get_task() before ! 196 * Task can not go away as we did a get_task() before !
197 */ 197 */
198 spin_lock_irqsave(&task->pi_lock, flags); 198 raw_spin_lock_irqsave(&task->pi_lock, flags);
199 199
200 waiter = task->pi_blocked_on; 200 waiter = task->pi_blocked_on;
201 /* 201 /*
@@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
231 goto out_unlock_pi; 231 goto out_unlock_pi;
232 232
233 lock = waiter->lock; 233 lock = waiter->lock;
234 if (!spin_trylock(&lock->wait_lock)) { 234 if (!raw_spin_trylock(&lock->wait_lock)) {
235 spin_unlock_irqrestore(&task->pi_lock, flags); 235 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
236 cpu_relax(); 236 cpu_relax();
237 goto retry; 237 goto retry;
238 } 238 }
@@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
240 /* Deadlock detection */ 240 /* Deadlock detection */
241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
243 spin_unlock(&lock->wait_lock); 243 raw_spin_unlock(&lock->wait_lock);
244 ret = deadlock_detect ? -EDEADLK : 0; 244 ret = deadlock_detect ? -EDEADLK : 0;
245 goto out_unlock_pi; 245 goto out_unlock_pi;
246 } 246 }
@@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
253 plist_add(&waiter->list_entry, &lock->wait_list); 253 plist_add(&waiter->list_entry, &lock->wait_list);
254 254
255 /* Release the task */ 255 /* Release the task */
256 spin_unlock_irqrestore(&task->pi_lock, flags); 256 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
257 put_task_struct(task); 257 put_task_struct(task);
258 258
259 /* Grab the next task */ 259 /* Grab the next task */
260 task = rt_mutex_owner(lock); 260 task = rt_mutex_owner(lock);
261 get_task_struct(task); 261 get_task_struct(task);
262 spin_lock_irqsave(&task->pi_lock, flags); 262 raw_spin_lock_irqsave(&task->pi_lock, flags);
263 263
264 if (waiter == rt_mutex_top_waiter(lock)) { 264 if (waiter == rt_mutex_top_waiter(lock)) {
265 /* Boost the owner */ 265 /* Boost the owner */
@@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
277 __rt_mutex_adjust_prio(task); 277 __rt_mutex_adjust_prio(task);
278 } 278 }
279 279
280 spin_unlock_irqrestore(&task->pi_lock, flags); 280 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
281 281
282 top_waiter = rt_mutex_top_waiter(lock); 282 top_waiter = rt_mutex_top_waiter(lock);
283 spin_unlock(&lock->wait_lock); 283 raw_spin_unlock(&lock->wait_lock);
284 284
285 if (!detect_deadlock && waiter != top_waiter) 285 if (!detect_deadlock && waiter != top_waiter)
286 goto out_put_task; 286 goto out_put_task;
@@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
288 goto again; 288 goto again;
289 289
290 out_unlock_pi: 290 out_unlock_pi:
291 spin_unlock_irqrestore(&task->pi_lock, flags); 291 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
292 out_put_task: 292 out_put_task:
293 put_task_struct(task); 293 put_task_struct(task);
294 294
@@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
313 if (pendowner == task) 313 if (pendowner == task)
314 return 1; 314 return 1;
315 315
316 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0; 319 return 0;
320 } 320 }
321 321
@@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
325 * priority. 325 * priority.
326 */ 326 */
327 if (likely(!rt_mutex_has_waiters(lock))) { 327 if (likely(!rt_mutex_has_waiters(lock))) {
328 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1; 329 return 1;
330 } 330 }
331 331
@@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
333 next = rt_mutex_top_waiter(lock); 333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters); 334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner); 335 __rt_mutex_adjust_prio(pendowner);
336 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337 337
338 /* 338 /*
339 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
@@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
350 * might be task: 350 * might be task:
351 */ 351 */
352 if (likely(next->task != task)) { 352 if (likely(next->task != task)) {
353 spin_lock_irqsave(&task->pi_lock, flags); 353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task); 355 __rt_mutex_adjust_prio(task);
356 spin_unlock_irqrestore(&task->pi_lock, flags); 356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 } 357 }
358 return 1; 358 return 1;
359} 359}
@@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
420 unsigned long flags; 420 unsigned long flags;
421 int chain_walk = 0, res; 421 int chain_walk = 0, res;
422 422
423 spin_lock_irqsave(&task->pi_lock, flags); 423 raw_spin_lock_irqsave(&task->pi_lock, flags);
424 __rt_mutex_adjust_prio(task); 424 __rt_mutex_adjust_prio(task);
425 waiter->task = task; 425 waiter->task = task;
426 waiter->lock = lock; 426 waiter->lock = lock;
@@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
434 434
435 task->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
436 436
437 spin_unlock_irqrestore(&task->pi_lock, flags); 437 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 438
439 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
440 spin_lock_irqsave(&owner->pi_lock, flags); 440 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
443 443
444 __rt_mutex_adjust_prio(owner); 444 __rt_mutex_adjust_prio(owner);
445 if (owner->pi_blocked_on) 445 if (owner->pi_blocked_on)
446 chain_walk = 1; 446 chain_walk = 1;
447 spin_unlock_irqrestore(&owner->pi_lock, flags); 447 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
448 } 448 }
449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) 449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
450 chain_walk = 1; 450 chain_walk = 1;
@@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
459 */ 459 */
460 get_task_struct(owner); 460 get_task_struct(owner);
461 461
462 spin_unlock(&lock->wait_lock); 462 raw_spin_unlock(&lock->wait_lock);
463 463
464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
465 task); 465 task);
466 466
467 spin_lock(&lock->wait_lock); 467 raw_spin_lock(&lock->wait_lock);
468 468
469 return res; 469 return res;
470} 470}
@@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
483 struct task_struct *pendowner; 483 struct task_struct *pendowner;
484 unsigned long flags; 484 unsigned long flags;
485 485
486 spin_lock_irqsave(&current->pi_lock, flags); 486 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 487
488 waiter = rt_mutex_top_waiter(lock); 488 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list); 489 plist_del(&waiter->list_entry, &lock->wait_list);
@@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
500 500
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
502 502
503 spin_unlock_irqrestore(&current->pi_lock, flags); 503 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 504
505 /* 505 /*
506 * Clear the pi_blocked_on variable and enqueue a possible 506 * Clear the pi_blocked_on variable and enqueue a possible
@@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
509 * waiter with higher priority than pending-owner->normal_prio 509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner. 510 * is blocked on the unboosted (pending) owner.
511 */ 511 */
512 spin_lock_irqsave(&pendowner->pi_lock, flags); 512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513 513
514 WARN_ON(!pendowner->pi_blocked_on); 514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter); 515 WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
523 next = rt_mutex_top_waiter(lock); 523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters); 524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 } 525 }
526 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527 527
528 wake_up_process(pendowner); 528 wake_up_process(pendowner);
529} 529}
@@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock,
541 unsigned long flags; 541 unsigned long flags;
542 int chain_walk = 0; 542 int chain_walk = 0;
543 543
544 spin_lock_irqsave(&current->pi_lock, flags); 544 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 545 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL; 546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 547 current->pi_blocked_on = NULL;
548 spin_unlock_irqrestore(&current->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 549
550 if (first && owner != current) { 550 if (first && owner != current) {
551 551
552 spin_lock_irqsave(&owner->pi_lock, flags); 552 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 553
554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
555 555
@@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock,
564 if (owner->pi_blocked_on) 564 if (owner->pi_blocked_on)
565 chain_walk = 1; 565 chain_walk = 1;
566 566
567 spin_unlock_irqrestore(&owner->pi_lock, flags); 567 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
568 } 568 }
569 569
570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock,
575 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 575 /* gets dropped in rt_mutex_adjust_prio_chain()! */
576 get_task_struct(owner); 576 get_task_struct(owner);
577 577
578 spin_unlock(&lock->wait_lock); 578 raw_spin_unlock(&lock->wait_lock);
579 579
580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
581 581
582 spin_lock(&lock->wait_lock); 582 raw_spin_lock(&lock->wait_lock);
583} 583}
584 584
585/* 585/*
@@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
592 struct rt_mutex_waiter *waiter; 592 struct rt_mutex_waiter *waiter;
593 unsigned long flags; 593 unsigned long flags;
594 594
595 spin_lock_irqsave(&task->pi_lock, flags); 595 raw_spin_lock_irqsave(&task->pi_lock, flags);
596 596
597 waiter = task->pi_blocked_on; 597 waiter = task->pi_blocked_on;
598 if (!waiter || waiter->list_entry.prio == task->prio) { 598 if (!waiter || waiter->list_entry.prio == task->prio) {
599 spin_unlock_irqrestore(&task->pi_lock, flags); 599 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
600 return; 600 return;
601 } 601 }
602 602
603 spin_unlock_irqrestore(&task->pi_lock, flags); 603 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
604 604
605 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 605 /* gets dropped in rt_mutex_adjust_prio_chain()! */
606 get_task_struct(task); 606 get_task_struct(task);
@@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
672 break; 672 break;
673 } 673 }
674 674
675 spin_unlock(&lock->wait_lock); 675 raw_spin_unlock(&lock->wait_lock);
676 676
677 debug_rt_mutex_print_deadlock(waiter); 677 debug_rt_mutex_print_deadlock(waiter);
678 678
679 if (waiter->task) 679 if (waiter->task)
680 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
681 681
682 spin_lock(&lock->wait_lock); 682 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 683 set_current_state(state);
684 } 684 }
685 685
@@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
700 debug_rt_mutex_init_waiter(&waiter); 700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL; 701 waiter.task = NULL;
702 702
703 spin_lock(&lock->wait_lock); 703 raw_spin_lock(&lock->wait_lock);
704 704
705 /* Try to acquire the lock again: */ 705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock); 707 raw_spin_unlock(&lock->wait_lock);
708 return 0; 708 return 0;
709 } 709 }
710 710
@@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
731 */ 731 */
732 fixup_rt_mutex_waiters(lock); 732 fixup_rt_mutex_waiters(lock);
733 733
734 spin_unlock(&lock->wait_lock); 734 raw_spin_unlock(&lock->wait_lock);
735 735
736 /* Remove pending timer: */ 736 /* Remove pending timer: */
737 if (unlikely(timeout)) 737 if (unlikely(timeout))
@@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
758{ 758{
759 int ret = 0; 759 int ret = 0;
760 760
761 spin_lock(&lock->wait_lock); 761 raw_spin_lock(&lock->wait_lock);
762 762
763 if (likely(rt_mutex_owner(lock) != current)) { 763 if (likely(rt_mutex_owner(lock) != current)) {
764 764
@@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
770 fixup_rt_mutex_waiters(lock); 770 fixup_rt_mutex_waiters(lock);
771 } 771 }
772 772
773 spin_unlock(&lock->wait_lock); 773 raw_spin_unlock(&lock->wait_lock);
774 774
775 return ret; 775 return ret;
776} 776}
@@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
781static void __sched 781static void __sched
782rt_mutex_slowunlock(struct rt_mutex *lock) 782rt_mutex_slowunlock(struct rt_mutex *lock)
783{ 783{
784 spin_lock(&lock->wait_lock); 784 raw_spin_lock(&lock->wait_lock);
785 785
786 debug_rt_mutex_unlock(lock); 786 debug_rt_mutex_unlock(lock);
787 787
@@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
789 789
790 if (!rt_mutex_has_waiters(lock)) { 790 if (!rt_mutex_has_waiters(lock)) {
791 lock->owner = NULL; 791 lock->owner = NULL;
792 spin_unlock(&lock->wait_lock); 792 raw_spin_unlock(&lock->wait_lock);
793 return; 793 return;
794 } 794 }
795 795
796 wakeup_next_waiter(lock); 796 wakeup_next_waiter(lock);
797 797
798 spin_unlock(&lock->wait_lock); 798 raw_spin_unlock(&lock->wait_lock);
799 799
800 /* Undo pi boosting if necessary: */ 800 /* Undo pi boosting if necessary: */
801 rt_mutex_adjust_prio(current); 801 rt_mutex_adjust_prio(current);
@@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
970void __rt_mutex_init(struct rt_mutex *lock, const char *name) 970void __rt_mutex_init(struct rt_mutex *lock, const char *name)
971{ 971{
972 lock->owner = NULL; 972 lock->owner = NULL;
973 spin_lock_init(&lock->wait_lock); 973 raw_spin_lock_init(&lock->wait_lock);
974 plist_head_init(&lock->wait_list, &lock->wait_lock); 974 plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
975 975
976 debug_rt_mutex_init(lock, name); 976 debug_rt_mutex_init(lock, name);
977} 977}
@@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1032{ 1032{
1033 int ret; 1033 int ret;
1034 1034
1035 spin_lock(&lock->wait_lock); 1035 raw_spin_lock(&lock->wait_lock);
1036 1036
1037 mark_rt_mutex_waiters(lock); 1037 mark_rt_mutex_waiters(lock);
1038 1038
@@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1043 spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 1045 return 1;
1046 } 1046 }
@@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 1056 */
1057 ret = 0; 1057 ret = 0;
1058 } 1058 }
1059 spin_unlock(&lock->wait_lock); 1059 raw_spin_unlock(&lock->wait_lock);
1060 1060
1061 debug_rt_mutex_print_deadlock(waiter); 1061 debug_rt_mutex_print_deadlock(waiter);
1062 1062
@@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1106{ 1106{
1107 int ret; 1107 int ret;
1108 1108
1109 spin_lock(&lock->wait_lock); 1109 raw_spin_lock(&lock->wait_lock);
1110 1110
1111 set_current_state(TASK_INTERRUPTIBLE); 1111 set_current_state(TASK_INTERRUPTIBLE);
1112 1112
@@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1124 */ 1124 */
1125 fixup_rt_mutex_waiters(lock); 1125 fixup_rt_mutex_waiters(lock);
1126 1126
1127 spin_unlock(&lock->wait_lock); 1127 raw_spin_unlock(&lock->wait_lock);
1128 1128
1129 /* 1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been 1130 * Readjust priority, when we did not get the lock. We might have been
diff --git a/kernel/sched.c b/kernel/sched.c
index 2f76e06bea58..18cceeecce35 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -141,7 +141,7 @@ struct rt_prio_array {
141 141
142struct rt_bandwidth { 142struct rt_bandwidth {
143 /* nests inside the rq lock: */ 143 /* nests inside the rq lock: */
144 spinlock_t rt_runtime_lock; 144 raw_spinlock_t rt_runtime_lock;
145 ktime_t rt_period; 145 ktime_t rt_period;
146 u64 rt_runtime; 146 u64 rt_runtime;
147 struct hrtimer rt_period_timer; 147 struct hrtimer rt_period_timer;
@@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
178 rt_b->rt_period = ns_to_ktime(period); 178 rt_b->rt_period = ns_to_ktime(period);
179 rt_b->rt_runtime = runtime; 179 rt_b->rt_runtime = runtime;
180 180
181 spin_lock_init(&rt_b->rt_runtime_lock); 181 raw_spin_lock_init(&rt_b->rt_runtime_lock);
182 182
183 hrtimer_init(&rt_b->rt_period_timer, 183 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 184 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
200 if (hrtimer_active(&rt_b->rt_period_timer)) 200 if (hrtimer_active(&rt_b->rt_period_timer))
201 return; 201 return;
202 202
203 spin_lock(&rt_b->rt_runtime_lock); 203 raw_spin_lock(&rt_b->rt_runtime_lock);
204 for (;;) { 204 for (;;) {
205 unsigned long delta; 205 unsigned long delta;
206 ktime_t soft, hard; 206 ktime_t soft, hard;
@@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
218 HRTIMER_MODE_ABS_PINNED, 0); 218 HRTIMER_MODE_ABS_PINNED, 0);
219 } 219 }
220 spin_unlock(&rt_b->rt_runtime_lock); 220 raw_spin_unlock(&rt_b->rt_runtime_lock);
221} 221}
222 222
223#ifdef CONFIG_RT_GROUP_SCHED 223#ifdef CONFIG_RT_GROUP_SCHED
@@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
298 298
299#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
309 */ 309 */
310static DEFINE_SPINLOCK(task_group_lock); 310static DEFINE_SPINLOCK(task_group_lock);
311 311
312#ifdef CONFIG_FAIR_GROUP_SCHED
313
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
313static int root_task_group_empty(void) 315static int root_task_group_empty(void)
314{ 316{
@@ -316,7 +318,6 @@ static int root_task_group_empty(void)
316} 318}
317#endif 319#endif
318 320
319#ifdef CONFIG_FAIR_GROUP_SCHED
320#ifdef CONFIG_USER_SCHED 321#ifdef CONFIG_USER_SCHED
321# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
322#else /* !CONFIG_USER_SCHED */ 323#else /* !CONFIG_USER_SCHED */
@@ -469,7 +470,7 @@ struct rt_rq {
469 u64 rt_time; 470 u64 rt_time;
470 u64 rt_runtime; 471 u64 rt_runtime;
471 /* Nests inside the rq lock: */ 472 /* Nests inside the rq lock: */
472 spinlock_t rt_runtime_lock; 473 raw_spinlock_t rt_runtime_lock;
473 474
474#ifdef CONFIG_RT_GROUP_SCHED 475#ifdef CONFIG_RT_GROUP_SCHED
475 unsigned long rt_nr_boosted; 476 unsigned long rt_nr_boosted;
@@ -524,7 +525,7 @@ static struct root_domain def_root_domain;
524 */ 525 */
525struct rq { 526struct rq {
526 /* runqueue lock: */ 527 /* runqueue lock: */
527 spinlock_t lock; 528 raw_spinlock_t lock;
528 529
529 /* 530 /*
530 * nr_running and cpu_load should be in the same cacheline because 531 * nr_running and cpu_load should be in the same cacheline because
@@ -534,14 +535,12 @@ struct rq {
534 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
535 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
536#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
537 unsigned long last_tick_seen;
538 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
539#endif 539#endif
540 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
541 struct load_weight load; 541 struct load_weight load;
542 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
543 u64 nr_switches; 543 u64 nr_switches;
544 u64 nr_migrations_in;
545 544
546 struct cfs_rq cfs; 545 struct cfs_rq cfs;
547 struct rt_rq rt; 546 struct rt_rq rt;
@@ -590,6 +589,8 @@ struct rq {
590 589
591 u64 rt_avg; 590 u64 rt_avg;
592 u64 age_stamp; 591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
593#endif 594#endif
594 595
595 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -676,6 +677,7 @@ inline void update_rq_clock(struct rq *rq)
676 677
677/** 678/**
678 * runqueue_is_locked 679 * runqueue_is_locked
680 * @cpu: the processor in question.
679 * 681 *
680 * Returns true if the current cpu runqueue is locked. 682 * Returns true if the current cpu runqueue is locked.
681 * This interface allows printk to be called with the runqueue lock 683 * This interface allows printk to be called with the runqueue lock
@@ -683,7 +685,7 @@ inline void update_rq_clock(struct rq *rq)
683 */ 685 */
684int runqueue_is_locked(int cpu) 686int runqueue_is_locked(int cpu)
685{ 687{
686 return spin_is_locked(&cpu_rq(cpu)->lock); 688 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
687} 689}
688 690
689/* 691/*
@@ -770,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
770 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
771 return -EINVAL; 773 return -EINVAL;
772 774
773 filp->f_pos += cnt; 775 *ppos += cnt;
774 776
775 return cnt; 777 return cnt;
776} 778}
@@ -780,7 +782,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)
780 return single_open(filp, sched_feat_show, NULL); 782 return single_open(filp, sched_feat_show, NULL);
781} 783}
782 784
783static struct file_operations sched_feat_fops = { 785static const struct file_operations sched_feat_fops = {
784 .open = sched_feat_open, 786 .open = sched_feat_open,
785 .write = sched_feat_write, 787 .write = sched_feat_write,
786 .read = seq_read, 788 .read = seq_read,
@@ -812,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
812 * default: 0.25ms 814 * default: 0.25ms
813 */ 815 */
814unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815 818
816/* 819/*
817 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -890,7 +893,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
890 */ 893 */
891 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 894 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
892 895
893 spin_unlock_irq(&rq->lock); 896 raw_spin_unlock_irq(&rq->lock);
894} 897}
895 898
896#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 899#else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -914,9 +917,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
914 next->oncpu = 1; 917 next->oncpu = 1;
915#endif 918#endif
916#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 919#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
917 spin_unlock_irq(&rq->lock); 920 raw_spin_unlock_irq(&rq->lock);
918#else 921#else
919 spin_unlock(&rq->lock); 922 raw_spin_unlock(&rq->lock);
920#endif 923#endif
921} 924}
922 925
@@ -946,10 +949,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
946{ 949{
947 for (;;) { 950 for (;;) {
948 struct rq *rq = task_rq(p); 951 struct rq *rq = task_rq(p);
949 spin_lock(&rq->lock); 952 raw_spin_lock(&rq->lock);
950 if (likely(rq == task_rq(p))) 953 if (likely(rq == task_rq(p)))
951 return rq; 954 return rq;
952 spin_unlock(&rq->lock); 955 raw_spin_unlock(&rq->lock);
953 } 956 }
954} 957}
955 958
@@ -966,10 +969,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
966 for (;;) { 969 for (;;) {
967 local_irq_save(*flags); 970 local_irq_save(*flags);
968 rq = task_rq(p); 971 rq = task_rq(p);
969 spin_lock(&rq->lock); 972 raw_spin_lock(&rq->lock);
970 if (likely(rq == task_rq(p))) 973 if (likely(rq == task_rq(p)))
971 return rq; 974 return rq;
972 spin_unlock_irqrestore(&rq->lock, *flags); 975 raw_spin_unlock_irqrestore(&rq->lock, *flags);
973 } 976 }
974} 977}
975 978
@@ -978,19 +981,19 @@ void task_rq_unlock_wait(struct task_struct *p)
978 struct rq *rq = task_rq(p); 981 struct rq *rq = task_rq(p);
979 982
980 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 983 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
981 spin_unlock_wait(&rq->lock); 984 raw_spin_unlock_wait(&rq->lock);
982} 985}
983 986
984static void __task_rq_unlock(struct rq *rq) 987static void __task_rq_unlock(struct rq *rq)
985 __releases(rq->lock) 988 __releases(rq->lock)
986{ 989{
987 spin_unlock(&rq->lock); 990 raw_spin_unlock(&rq->lock);
988} 991}
989 992
990static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 993static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
991 __releases(rq->lock) 994 __releases(rq->lock)
992{ 995{
993 spin_unlock_irqrestore(&rq->lock, *flags); 996 raw_spin_unlock_irqrestore(&rq->lock, *flags);
994} 997}
995 998
996/* 999/*
@@ -1003,7 +1006,7 @@ static struct rq *this_rq_lock(void)
1003 1006
1004 local_irq_disable(); 1007 local_irq_disable();
1005 rq = this_rq(); 1008 rq = this_rq();
1006 spin_lock(&rq->lock); 1009 raw_spin_lock(&rq->lock);
1007 1010
1008 return rq; 1011 return rq;
1009} 1012}
@@ -1050,10 +1053,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1050 1053
1051 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1054 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1052 1055
1053 spin_lock(&rq->lock); 1056 raw_spin_lock(&rq->lock);
1054 update_rq_clock(rq); 1057 update_rq_clock(rq);
1055 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1058 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1056 spin_unlock(&rq->lock); 1059 raw_spin_unlock(&rq->lock);
1057 1060
1058 return HRTIMER_NORESTART; 1061 return HRTIMER_NORESTART;
1059} 1062}
@@ -1066,10 +1069,10 @@ static void __hrtick_start(void *arg)
1066{ 1069{
1067 struct rq *rq = arg; 1070 struct rq *rq = arg;
1068 1071
1069 spin_lock(&rq->lock); 1072 raw_spin_lock(&rq->lock);
1070 hrtimer_restart(&rq->hrtick_timer); 1073 hrtimer_restart(&rq->hrtick_timer);
1071 rq->hrtick_csd_pending = 0; 1074 rq->hrtick_csd_pending = 0;
1072 spin_unlock(&rq->lock); 1075 raw_spin_unlock(&rq->lock);
1073} 1076}
1074 1077
1075/* 1078/*
@@ -1176,7 +1179,7 @@ static void resched_task(struct task_struct *p)
1176{ 1179{
1177 int cpu; 1180 int cpu;
1178 1181
1179 assert_spin_locked(&task_rq(p)->lock); 1182 assert_raw_spin_locked(&task_rq(p)->lock);
1180 1183
1181 if (test_tsk_need_resched(p)) 1184 if (test_tsk_need_resched(p))
1182 return; 1185 return;
@@ -1198,10 +1201,10 @@ static void resched_cpu(int cpu)
1198 struct rq *rq = cpu_rq(cpu); 1201 struct rq *rq = cpu_rq(cpu);
1199 unsigned long flags; 1202 unsigned long flags;
1200 1203
1201 if (!spin_trylock_irqsave(&rq->lock, flags)) 1204 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1202 return; 1205 return;
1203 resched_task(cpu_curr(cpu)); 1206 resched_task(cpu_curr(cpu));
1204 spin_unlock_irqrestore(&rq->lock, flags); 1207 raw_spin_unlock_irqrestore(&rq->lock, flags);
1205} 1208}
1206 1209
1207#ifdef CONFIG_NO_HZ 1210#ifdef CONFIG_NO_HZ
@@ -1270,7 +1273,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1270#else /* !CONFIG_SMP */ 1273#else /* !CONFIG_SMP */
1271static void resched_task(struct task_struct *p) 1274static void resched_task(struct task_struct *p)
1272{ 1275{
1273 assert_spin_locked(&task_rq(p)->lock); 1276 assert_raw_spin_locked(&task_rq(p)->lock);
1274 set_tsk_need_resched(p); 1277 set_tsk_need_resched(p);
1275} 1278}
1276 1279
@@ -1563,11 +1566,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1563 1566
1564#ifdef CONFIG_FAIR_GROUP_SCHED 1567#ifdef CONFIG_FAIR_GROUP_SCHED
1565 1568
1566struct update_shares_data { 1569static __read_mostly unsigned long *update_shares_data;
1567 unsigned long rq_weight[NR_CPUS];
1568};
1569
1570static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1571 1570
1572static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1571static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1573 1572
@@ -1577,12 +1576,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1577static void update_group_shares_cpu(struct task_group *tg, int cpu, 1576static void update_group_shares_cpu(struct task_group *tg, int cpu,
1578 unsigned long sd_shares, 1577 unsigned long sd_shares,
1579 unsigned long sd_rq_weight, 1578 unsigned long sd_rq_weight,
1580 struct update_shares_data *usd) 1579 unsigned long *usd_rq_weight)
1581{ 1580{
1582 unsigned long shares, rq_weight; 1581 unsigned long shares, rq_weight;
1583 int boost = 0; 1582 int boost = 0;
1584 1583
1585 rq_weight = usd->rq_weight[cpu]; 1584 rq_weight = usd_rq_weight[cpu];
1586 if (!rq_weight) { 1585 if (!rq_weight) {
1587 boost = 1; 1586 boost = 1;
1588 rq_weight = NICE_0_LOAD; 1587 rq_weight = NICE_0_LOAD;
@@ -1601,11 +1600,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1601 struct rq *rq = cpu_rq(cpu); 1600 struct rq *rq = cpu_rq(cpu);
1602 unsigned long flags; 1601 unsigned long flags;
1603 1602
1604 spin_lock_irqsave(&rq->lock, flags); 1603 raw_spin_lock_irqsave(&rq->lock, flags);
1605 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1604 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1606 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1605 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1607 __set_se_shares(tg->se[cpu], shares); 1606 __set_se_shares(tg->se[cpu], shares);
1608 spin_unlock_irqrestore(&rq->lock, flags); 1607 raw_spin_unlock_irqrestore(&rq->lock, flags);
1609 } 1608 }
1610} 1609}
1611 1610
@@ -1616,8 +1615,8 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1616 */ 1615 */
1617static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1618{ 1617{
1619 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1620 struct update_shares_data *usd; 1619 unsigned long *usd_rq_weight;
1621 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1622 unsigned long flags; 1621 unsigned long flags;
1623 int i; 1622 int i;
@@ -1626,12 +1625,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1626 return 0; 1625 return 0;
1627 1626
1628 local_irq_save(flags); 1627 local_irq_save(flags);
1629 usd = &__get_cpu_var(update_shares_data); 1628 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1630 1629
1631 for_each_cpu(i, sched_domain_span(sd)) { 1630 for_each_cpu(i, sched_domain_span(sd)) {
1632 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1633 usd->rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1634 1633
1634 rq_weight += weight;
1635 /* 1635 /*
1636 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1637 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1640,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1640 if (!weight) 1640 if (!weight)
1641 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1642 1642
1643 rq_weight += weight; 1643 sum_weight += weight;
1644 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1645 } 1645 }
1646 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1647 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1648 shares = tg->shares; 1651 shares = tg->shares;
1649 1652
@@ -1651,7 +1654,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1651 shares = tg->shares; 1654 shares = tg->shares;
1652 1655
1653 for_each_cpu(i, sched_domain_span(sd)) 1656 for_each_cpu(i, sched_domain_span(sd))
1654 update_group_shares_cpu(tg, i, shares, rq_weight, usd); 1657 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1655 1658
1656 local_irq_restore(flags); 1659 local_irq_restore(flags);
1657 1660
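Two things change in the shares path here. The per-CPU scratch array used by tg_shares_up() shrinks from a statically sized NR_CPUS array embedded in a per-cpu struct to a per-cpu buffer reached through per_cpu_ptr() (presumably allocated once at init time elsewhere in the patch), and the weight bookkeeping now keeps the real aggregate (rq_weight) separate from the idle-boosted sum (sum_weight), falling back to the boosted value only when every CPU in the domain is idle. A standalone toy of that split; the names and the clamping the kernel applies on top are simplified:

    #include <stdio.h>

    #define NICE_0_LOAD 1024UL

    /*
     * Toy version of the weight bookkeeping in tg_shares_up(): idle CPUs
     * are "boosted" to one average task's worth of load when their share
     * is computed, but the divisor stays the real aggregate weight unless
     * every CPU in the domain turned out to be idle.
     */
    static void split_shares(const unsigned long *weight, int ncpus,
                             unsigned long tg_shares)
    {
            unsigned long rq_weight = 0, sum_weight = 0;
            int i;

            for (i = 0; i < ncpus; i++) {
                    unsigned long w = weight[i];

                    rq_weight += w;                 /* real load */
                    if (!w)
                            w = NICE_0_LOAD;        /* pretend one average task */
                    sum_weight += w;                /* boosted load */
            }
            if (!rq_weight)
                    rq_weight = sum_weight;

            for (i = 0; i < ncpus; i++) {
                    unsigned long w = weight[i] ? weight[i] : NICE_0_LOAD;
                    /* the kernel additionally clamps this per-cpu result */
                    unsigned long shares = tg_shares * w / rq_weight;

                    printf("cpu%d: weight %5lu -> shares %lu\n",
                           i, weight[i], shares);
            }
    }

    int main(void)
    {
            unsigned long weight[] = { 2048, 0, 1024, 0 };

            split_shares(weight, 4, 1024);
            return 0;
    }

With weights {2048, 0, 1024, 0} and 1024 group shares, the idle CPUs still receive a non-zero boosted share while the busy CPUs are divided against the real load of 3072 rather than the inflated sum.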
@@ -1703,9 +1706,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1703 if (root_task_group_empty()) 1706 if (root_task_group_empty())
1704 return; 1707 return;
1705 1708
1706 spin_unlock(&rq->lock); 1709 raw_spin_unlock(&rq->lock);
1707 update_shares(sd); 1710 update_shares(sd);
1708 spin_lock(&rq->lock); 1711 raw_spin_lock(&rq->lock);
1709} 1712}
1710 1713
1711static void update_h_load(long cpu) 1714static void update_h_load(long cpu)
@@ -1745,7 +1748,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1745 __acquires(busiest->lock) 1748 __acquires(busiest->lock)
1746 __acquires(this_rq->lock) 1749 __acquires(this_rq->lock)
1747{ 1750{
1748 spin_unlock(&this_rq->lock); 1751 raw_spin_unlock(&this_rq->lock);
1749 double_rq_lock(this_rq, busiest); 1752 double_rq_lock(this_rq, busiest);
1750 1753
1751 return 1; 1754 return 1;
@@ -1766,14 +1769,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1766{ 1769{
1767 int ret = 0; 1770 int ret = 0;
1768 1771
1769 if (unlikely(!spin_trylock(&busiest->lock))) { 1772 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1770 if (busiest < this_rq) { 1773 if (busiest < this_rq) {
1771 spin_unlock(&this_rq->lock); 1774 raw_spin_unlock(&this_rq->lock);
1772 spin_lock(&busiest->lock); 1775 raw_spin_lock(&busiest->lock);
1773 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1776 raw_spin_lock_nested(&this_rq->lock,
1777 SINGLE_DEPTH_NESTING);
1774 ret = 1; 1778 ret = 1;
1775 } else 1779 } else
1776 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1780 raw_spin_lock_nested(&busiest->lock,
1781 SINGLE_DEPTH_NESTING);
1777 } 1782 }
1778 return ret; 1783 return ret;
1779} 1784}
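When the trylock on the second runqueue fails, _double_lock_balance() falls back to taking the two rq locks in a fixed global order, lowest address first, which is the standard way to make an ABBA deadlock impossible when two CPUs try to lock the same pair from opposite ends. The same discipline can be shown with ordinary pthread mutexes; the sketch below is only an analogy, not the kernel code:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct rq_like {
            pthread_mutex_t lock;
    };

    /*
     * Take both locks in a fixed global order (lowest address first), the
     * same trick the "busiest < this_rq" comparison plays above: two CPUs
     * locking the same pair from opposite ends can never deadlock.
     */
    static void double_lock(struct rq_like *a, struct rq_like *b)
    {
            if (a == b) {
                    pthread_mutex_lock(&a->lock);
            } else if ((uintptr_t)a < (uintptr_t)b) {
                    pthread_mutex_lock(&a->lock);
                    pthread_mutex_lock(&b->lock);
            } else {
                    pthread_mutex_lock(&b->lock);
                    pthread_mutex_lock(&a->lock);
            }
    }

    static void double_unlock(struct rq_like *a, struct rq_like *b)
    {
            pthread_mutex_unlock(&a->lock);
            if (a != b)
                    pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
            struct rq_like x = { PTHREAD_MUTEX_INITIALIZER };
            struct rq_like y = { PTHREAD_MUTEX_INITIALIZER };

            double_lock(&x, &y);
            puts("both locks held");
            double_unlock(&x, &y);
            return 0;
    }

Compile with -pthread; the only point is the address comparison that decides the acquisition order.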
@@ -1787,7 +1792,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1787{ 1792{
1788 if (unlikely(!irqs_disabled())) { 1793 if (unlikely(!irqs_disabled())) {
 1789 /* printk() doesn't work well under rq->lock */ 1794 /* printk() doesn't work well under rq->lock */
 1789 /* printk() doesn't work well under rq->lock */ 1794 /* printk() doesn't work well under rq->lock */
1790 spin_unlock(&this_rq->lock); 1795 raw_spin_unlock(&this_rq->lock);
1791 BUG_ON(1); 1796 BUG_ON(1);
1792 } 1797 }
1793 1798
@@ -1797,7 +1802,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1797static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1802static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1798 __releases(busiest->lock) 1803 __releases(busiest->lock)
1799{ 1804{
1800 spin_unlock(&busiest->lock); 1805 raw_spin_unlock(&busiest->lock);
1801 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1802} 1807}
1803#endif 1808#endif
@@ -1812,6 +1817,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1812#endif 1817#endif
1813 1818
1814static void calc_load_account_active(struct rq *this_rq); 1819static void calc_load_account_active(struct rq *this_rq);
1820static void update_sysctl(void);
1821static int get_update_sysctl_factor(void);
1822
1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1824{
1825 set_task_rq(p, cpu);
1826#ifdef CONFIG_SMP
1827 /*
1828 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
 1829 * successfully executed on another CPU. We must ensure that updates of
1830 * per-task data have been completed by this moment.
1831 */
1832 smp_wmb();
1833 task_thread_info(p)->cpu = cpu;
1834#endif
1835}
1815 1836
1816#include "sched_stats.h" 1837#include "sched_stats.h"
1817#include "sched_idletask.c" 1838#include "sched_idletask.c"
@@ -1969,20 +1990,6 @@ inline int task_curr(const struct task_struct *p)
1969 return cpu_curr(task_cpu(p)) == p; 1990 return cpu_curr(task_cpu(p)) == p;
1970} 1991}
1971 1992
1972static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1973{
1974 set_task_rq(p, cpu);
1975#ifdef CONFIG_SMP
1976 /*
1977 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
 1978 * successfully executed on another CPU. We must ensure that updates of
1979 * per-task data have been completed by this moment.
1980 */
1981 smp_wmb();
1982 task_thread_info(p)->cpu = cpu;
1983#endif
1984}
1985
1986static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1993static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1987 const struct sched_class *prev_class, 1994 const struct sched_class *prev_class,
1988 int oldprio, int running) 1995 int oldprio, int running)
@@ -1995,6 +2002,39 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1995 p->sched_class->prio_changed(rq, p, oldprio, running); 2002 p->sched_class->prio_changed(rq, p, oldprio, running);
1996} 2003}
1997 2004
2005/**
2006 * kthread_bind - bind a just-created kthread to a cpu.
2007 * @p: thread created by kthread_create().
 2008 * @cpu: cpu (might not be online, must be possible) for @p to run on.
2009 *
2010 * Description: This function is equivalent to set_cpus_allowed(),
2011 * except that @cpu doesn't need to be online, and the thread must be
2012 * stopped (i.e., just returned from kthread_create()).
2013 *
2014 * Function lives here instead of kthread.c because it messes with
2015 * scheduler internals which require locking.
2016 */
2017void kthread_bind(struct task_struct *p, unsigned int cpu)
2018{
2019 struct rq *rq = cpu_rq(cpu);
2020 unsigned long flags;
2021
2022 /* Must have done schedule() in kthread() before we set_task_cpu */
2023 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2024 WARN_ON(1);
2025 return;
2026 }
2027
2028 raw_spin_lock_irqsave(&rq->lock, flags);
2029 update_rq_clock(rq);
2030 set_task_cpu(p, cpu);
2031 p->cpus_allowed = cpumask_of_cpu(cpu);
2032 p->rt.nr_cpus_allowed = 1;
2033 p->flags |= PF_THREAD_BOUND;
2034 raw_spin_unlock_irqrestore(&rq->lock, flags);
2035}
2036EXPORT_SYMBOL(kthread_bind);
2037
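kthread_bind() migrates into sched.c because pinning a not-yet-running kthread means writing rq-protected state (cpus_allowed, rt.nr_cpus_allowed, PF_THREAD_BOUND) under rq->lock. The expected call sequence is unchanged; a minimal kernel-side fragment, where worker_fn, data and the thread name are placeholders and error handling is trimmed to the essentials:

    struct task_struct *t;

    t = kthread_create(worker_fn, data, "worker/%u", cpu);
    if (!IS_ERR(t)) {
            /* must run before the new thread has ever been scheduled */
            kthread_bind(t, cpu);
            wake_up_process(t);
    }

Calling it on a thread that has already run trips the wait_task_inactive() check above and only emits a warning.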
1998#ifdef CONFIG_SMP 2038#ifdef CONFIG_SMP
1999/* 2039/*
2000 * Is this task likely cache-hot: 2040 * Is this task likely cache-hot:
@@ -2007,7 +2047,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2007 /* 2047 /*
2008 * Buddy candidates are cache hot: 2048 * Buddy candidates are cache hot:
2009 */ 2049 */
2010 if (sched_feat(CACHE_HOT_BUDDY) && 2050 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2011 (&p->se == cfs_rq_of(&p->se)->next || 2051 (&p->se == cfs_rq_of(&p->se)->next ||
2012 &p->se == cfs_rq_of(&p->se)->last)) 2052 &p->se == cfs_rq_of(&p->se)->last))
2013 return 1; 2053 return 1;
@@ -2029,30 +2069,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2029void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2069void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2030{ 2070{
2031 int old_cpu = task_cpu(p); 2071 int old_cpu = task_cpu(p);
2032 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2033 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2072 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2034 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2073 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2035 u64 clock_offset;
2036
2037 clock_offset = old_rq->clock - new_rq->clock;
2038 2074
2039 trace_sched_migrate_task(p, new_cpu); 2075 trace_sched_migrate_task(p, new_cpu);
2040 2076
2041#ifdef CONFIG_SCHEDSTATS
2042 if (p->se.wait_start)
2043 p->se.wait_start -= clock_offset;
2044 if (p->se.sleep_start)
2045 p->se.sleep_start -= clock_offset;
2046 if (p->se.block_start)
2047 p->se.block_start -= clock_offset;
2048#endif
2049 if (old_cpu != new_cpu) { 2077 if (old_cpu != new_cpu) {
2050 p->se.nr_migrations++; 2078 p->se.nr_migrations++;
2051 new_rq->nr_migrations_in++;
2052#ifdef CONFIG_SCHEDSTATS
2053 if (task_hot(p, old_rq->clock, NULL))
2054 schedstat_inc(p, se.nr_forced2_migrations);
2055#endif
2056 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2079 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2057 1, 1, NULL, 0); 2080 1, 1, NULL, 0);
2058 } 2081 }
@@ -2085,6 +2108,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2085 * it is sufficient to simply update the task's cpu field. 2108 * it is sufficient to simply update the task's cpu field.
2086 */ 2109 */
2087 if (!p->se.on_rq && !task_running(rq, p)) { 2110 if (!p->se.on_rq && !task_running(rq, p)) {
2111 update_rq_clock(rq);
2088 set_task_cpu(p, dest_cpu); 2112 set_task_cpu(p, dest_cpu);
2089 return 0; 2113 return 0;
2090 } 2114 }
@@ -2292,6 +2316,14 @@ void task_oncpu_function_call(struct task_struct *p,
2292 preempt_enable(); 2316 preempt_enable();
2293} 2317}
2294 2318
2319#ifdef CONFIG_SMP
2320static inline
2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2322{
2323 return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2324}
2325#endif
2326
2295/*** 2327/***
2296 * try_to_wake_up - wake up a thread 2328 * try_to_wake_up - wake up a thread
2297 * @p: the to-be-woken-up thread 2329 * @p: the to-be-woken-up thread
@@ -2311,7 +2343,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2311{ 2343{
2312 int cpu, orig_cpu, this_cpu, success = 0; 2344 int cpu, orig_cpu, this_cpu, success = 0;
2313 unsigned long flags; 2345 unsigned long flags;
2314 struct rq *rq; 2346 struct rq *rq, *orig_rq;
2315 2347
2316 if (!sched_feat(SYNC_WAKEUPS)) 2348 if (!sched_feat(SYNC_WAKEUPS))
2317 wake_flags &= ~WF_SYNC; 2349 wake_flags &= ~WF_SYNC;
@@ -2319,7 +2351,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2319 this_cpu = get_cpu(); 2351 this_cpu = get_cpu();
2320 2352
2321 smp_wmb(); 2353 smp_wmb();
2322 rq = task_rq_lock(p, &flags); 2354 rq = orig_rq = task_rq_lock(p, &flags);
2323 update_rq_clock(rq); 2355 update_rq_clock(rq);
2324 if (!(p->state & state)) 2356 if (!(p->state & state))
2325 goto out; 2357 goto out;
@@ -2343,13 +2375,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2343 if (task_contributes_to_load(p)) 2375 if (task_contributes_to_load(p))
2344 rq->nr_uninterruptible--; 2376 rq->nr_uninterruptible--;
2345 p->state = TASK_WAKING; 2377 p->state = TASK_WAKING;
2346 task_rq_unlock(rq, &flags); 2378 __task_rq_unlock(rq);
2347 2379
2348 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2380 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2349 if (cpu != orig_cpu) 2381 if (cpu != orig_cpu)
2350 set_task_cpu(p, cpu); 2382 set_task_cpu(p, cpu);
2351 2383
2352 rq = task_rq_lock(p, &flags); 2384 rq = __task_rq_lock(p);
2385 update_rq_clock(rq);
2386
2353 WARN_ON(p->state != TASK_WAKING); 2387 WARN_ON(p->state != TASK_WAKING);
2354 cpu = task_cpu(p); 2388 cpu = task_cpu(p);
2355 2389
@@ -2406,6 +2440,17 @@ out_running:
2406#ifdef CONFIG_SMP 2440#ifdef CONFIG_SMP
2407 if (p->sched_class->task_wake_up) 2441 if (p->sched_class->task_wake_up)
2408 p->sched_class->task_wake_up(rq, p); 2442 p->sched_class->task_wake_up(rq, p);
2443
2444 if (unlikely(rq->idle_stamp)) {
2445 u64 delta = rq->clock - rq->idle_stamp;
2446 u64 max = 2*sysctl_sched_migration_cost;
2447
2448 if (delta > max)
2449 rq->avg_idle = max;
2450 else
2451 update_avg(&rq->avg_idle, delta);
2452 rq->idle_stamp = 0;
2453 }
2409#endif 2454#endif
2410out: 2455out:
2411 task_rq_unlock(rq, &flags); 2456 task_rq_unlock(rq, &flags);
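The new idle_stamp/avg_idle pair estimates how long this runqueue typically stays idle: idle_balance() (patched further down) stamps the moment the CPU goes idle, the wakeup path above turns that into a sample, clamps it at twice sysctl_sched_migration_cost, and folds it into a running average; idle_balance() then skips newidle balancing entirely when the average idle period is shorter than the migration cost, on the theory that work will arrive by itself before a pulled task pays off. A standalone sketch of the averaging and the resulting gate; the 0.5 ms figure is a common default for the migration cost, used here only for illustration:

    #include <stdio.h>

    typedef unsigned long long u64;

    /* same smoothing the scheduler's update_avg() uses: move 1/8 of the
     * way toward each new sample (arithmetic shift, as in the kernel) */
    static void update_avg(u64 *avg, u64 sample)
    {
            long long diff = (long long)sample - (long long)*avg;

            *avg += diff >> 3;
    }

    int main(void)
    {
            const u64 migration_cost = 500000;      /* 0.5 ms, illustrative */
            u64 avg_idle = 2 * migration_cost;      /* start out permissive */
            u64 samples[] = { 50000, 50000, 50000, 50000,
                              50000, 50000, 50000, 3000000 };
            unsigned int i;

            for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                    u64 delta = samples[i];

                    if (delta > 2 * migration_cost)
                            avg_idle = 2 * migration_cost;  /* long idle: reset */
                    else
                            update_avg(&avg_idle, delta);

                    printf("idle %7llu ns -> avg_idle %7llu ns, newidle balance %s\n",
                           delta, avg_idle,
                           avg_idle < migration_cost ? "skipped" : "attempted");
            }
            return 0;
    }

A run of short idle periods eventually pushes the average below the threshold and the balance attempt is skipped; one long idle period resets the average and re-enables it.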
@@ -2452,7 +2497,6 @@ static void __sched_fork(struct task_struct *p)
2452 p->se.avg_overlap = 0; 2497 p->se.avg_overlap = 0;
2453 p->se.start_runtime = 0; 2498 p->se.start_runtime = 0;
2454 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2499 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2455 p->se.avg_running = 0;
2456 2500
2457#ifdef CONFIG_SCHEDSTATS 2501#ifdef CONFIG_SCHEDSTATS
2458 p->se.wait_start = 0; 2502 p->se.wait_start = 0;
@@ -2474,7 +2518,6 @@ static void __sched_fork(struct task_struct *p)
2474 p->se.nr_failed_migrations_running = 0; 2518 p->se.nr_failed_migrations_running = 0;
2475 p->se.nr_failed_migrations_hot = 0; 2519 p->se.nr_failed_migrations_hot = 0;
2476 p->se.nr_forced_migrations = 0; 2520 p->se.nr_forced_migrations = 0;
2477 p->se.nr_forced2_migrations = 0;
2478 2521
2479 p->se.nr_wakeups = 0; 2522 p->se.nr_wakeups = 0;
2480 p->se.nr_wakeups_sync = 0; 2523 p->se.nr_wakeups_sync = 0;
@@ -2515,22 +2558,17 @@ void sched_fork(struct task_struct *p, int clone_flags)
2515 __sched_fork(p); 2558 __sched_fork(p);
2516 2559
2517 /* 2560 /*
2518 * Make sure we do not leak PI boosting priority to the child.
2519 */
2520 p->prio = current->normal_prio;
2521
2522 /*
2523 * Revert to default priority/policy on fork if requested. 2561 * Revert to default priority/policy on fork if requested.
2524 */ 2562 */
2525 if (unlikely(p->sched_reset_on_fork)) { 2563 if (unlikely(p->sched_reset_on_fork)) {
2526 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) 2564 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2527 p->policy = SCHED_NORMAL; 2565 p->policy = SCHED_NORMAL;
2528 2566 p->normal_prio = p->static_prio;
2529 if (p->normal_prio < DEFAULT_PRIO) 2567 }
2530 p->prio = DEFAULT_PRIO;
2531 2568
2532 if (PRIO_TO_NICE(p->static_prio) < 0) { 2569 if (PRIO_TO_NICE(p->static_prio) < 0) {
2533 p->static_prio = NICE_TO_PRIO(0); 2570 p->static_prio = NICE_TO_PRIO(0);
2571 p->normal_prio = p->static_prio;
2534 set_load_weight(p); 2572 set_load_weight(p);
2535 } 2573 }
2536 2574
@@ -2541,11 +2579,19 @@ void sched_fork(struct task_struct *p, int clone_flags)
2541 p->sched_reset_on_fork = 0; 2579 p->sched_reset_on_fork = 0;
2542 } 2580 }
2543 2581
2582 /*
2583 * Make sure we do not leak PI boosting priority to the child.
2584 */
2585 p->prio = current->normal_prio;
2586
2544 if (!rt_prio(p->prio)) 2587 if (!rt_prio(p->prio))
2545 p->sched_class = &fair_sched_class; 2588 p->sched_class = &fair_sched_class;
2546 2589
2590 if (p->sched_class->task_fork)
2591 p->sched_class->task_fork(p);
2592
2547#ifdef CONFIG_SMP 2593#ifdef CONFIG_SMP
2548 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2594 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2549#endif 2595#endif
2550 set_task_cpu(p, cpu); 2596 set_task_cpu(p, cpu);
2551 2597
@@ -2580,19 +2626,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2580 rq = task_rq_lock(p, &flags); 2626 rq = task_rq_lock(p, &flags);
2581 BUG_ON(p->state != TASK_RUNNING); 2627 BUG_ON(p->state != TASK_RUNNING);
2582 update_rq_clock(rq); 2628 update_rq_clock(rq);
2583 2629 activate_task(rq, p, 0);
2584 p->prio = effective_prio(p);
2585
2586 if (!p->sched_class->task_new || !current->se.on_rq) {
2587 activate_task(rq, p, 0);
2588 } else {
2589 /*
2590 * Let the scheduling class do new task startup
2591 * management (if any):
2592 */
2593 p->sched_class->task_new(rq, p);
2594 inc_nr_running(rq);
2595 }
2596 trace_sched_wakeup_new(rq, p, 1); 2630 trace_sched_wakeup_new(rq, p, 1);
2597 check_preempt_curr(rq, p, WF_FORK); 2631 check_preempt_curr(rq, p, WF_FORK);
2598#ifdef CONFIG_SMP 2632#ifdef CONFIG_SMP
@@ -2749,10 +2783,10 @@ static inline void post_schedule(struct rq *rq)
2749 if (rq->post_schedule) { 2783 if (rq->post_schedule) {
2750 unsigned long flags; 2784 unsigned long flags;
2751 2785
2752 spin_lock_irqsave(&rq->lock, flags); 2786 raw_spin_lock_irqsave(&rq->lock, flags);
2753 if (rq->curr->sched_class->post_schedule) 2787 if (rq->curr->sched_class->post_schedule)
2754 rq->curr->sched_class->post_schedule(rq); 2788 rq->curr->sched_class->post_schedule(rq);
2755 spin_unlock_irqrestore(&rq->lock, flags); 2789 raw_spin_unlock_irqrestore(&rq->lock, flags);
2756 2790
2757 rq->post_schedule = 0; 2791 rq->post_schedule = 0;
2758 } 2792 }
@@ -2816,14 +2850,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2816 */ 2850 */
2817 arch_start_context_switch(prev); 2851 arch_start_context_switch(prev);
2818 2852
2819 if (unlikely(!mm)) { 2853 if (likely(!mm)) {
2820 next->active_mm = oldmm; 2854 next->active_mm = oldmm;
2821 atomic_inc(&oldmm->mm_count); 2855 atomic_inc(&oldmm->mm_count);
2822 enter_lazy_tlb(oldmm, next); 2856 enter_lazy_tlb(oldmm, next);
2823 } else 2857 } else
2824 switch_mm(oldmm, mm, next); 2858 switch_mm(oldmm, mm, next);
2825 2859
2826 if (unlikely(!prev->mm)) { 2860 if (likely(!prev->mm)) {
2827 prev->active_mm = NULL; 2861 prev->active_mm = NULL;
2828 rq->prev_mm = oldmm; 2862 rq->prev_mm = oldmm;
2829 } 2863 }
@@ -2986,15 +3020,6 @@ static void calc_load_account_active(struct rq *this_rq)
2986} 3020}
2987 3021
2988/* 3022/*
2989 * Externally visible per-cpu scheduler statistics:
2990 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2991 */
2992u64 cpu_nr_migrations(int cpu)
2993{
2994 return cpu_rq(cpu)->nr_migrations_in;
2995}
2996
2997/*
2998 * Update rq->cpu_load[] statistics. This function is usually called every 3023 * Update rq->cpu_load[] statistics. This function is usually called every
2999 * scheduler tick (TICK_NSEC). 3024 * scheduler tick (TICK_NSEC).
3000 */ 3025 */
@@ -3043,15 +3068,15 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3043{ 3068{
3044 BUG_ON(!irqs_disabled()); 3069 BUG_ON(!irqs_disabled());
3045 if (rq1 == rq2) { 3070 if (rq1 == rq2) {
3046 spin_lock(&rq1->lock); 3071 raw_spin_lock(&rq1->lock);
3047 __acquire(rq2->lock); /* Fake it out ;) */ 3072 __acquire(rq2->lock); /* Fake it out ;) */
3048 } else { 3073 } else {
3049 if (rq1 < rq2) { 3074 if (rq1 < rq2) {
3050 spin_lock(&rq1->lock); 3075 raw_spin_lock(&rq1->lock);
3051 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 3076 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3052 } else { 3077 } else {
3053 spin_lock(&rq2->lock); 3078 raw_spin_lock(&rq2->lock);
3054 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 3079 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3055 } 3080 }
3056 } 3081 }
3057 update_rq_clock(rq1); 3082 update_rq_clock(rq1);
@@ -3068,9 +3093,9 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3068 __releases(rq1->lock) 3093 __releases(rq1->lock)
3069 __releases(rq2->lock) 3094 __releases(rq2->lock)
3070{ 3095{
3071 spin_unlock(&rq1->lock); 3096 raw_spin_unlock(&rq1->lock);
3072 if (rq1 != rq2) 3097 if (rq1 != rq2)
3073 spin_unlock(&rq2->lock); 3098 raw_spin_unlock(&rq2->lock);
3074 else 3099 else
3075 __release(rq2->lock); 3100 __release(rq2->lock);
3076} 3101}
@@ -3116,7 +3141,7 @@ out:
3116void sched_exec(void) 3141void sched_exec(void)
3117{ 3142{
3118 int new_cpu, this_cpu = get_cpu(); 3143 int new_cpu, this_cpu = get_cpu();
3119 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); 3144 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3120 put_cpu(); 3145 put_cpu();
3121 if (new_cpu != this_cpu) 3146 if (new_cpu != this_cpu)
3122 sched_migrate_task(current, new_cpu); 3147 sched_migrate_task(current, new_cpu);
@@ -3132,10 +3157,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3132 deactivate_task(src_rq, p, 0); 3157 deactivate_task(src_rq, p, 0);
3133 set_task_cpu(p, this_cpu); 3158 set_task_cpu(p, this_cpu);
3134 activate_task(this_rq, p, 0); 3159 activate_task(this_rq, p, 0);
3135 /*
3136 * Note that idle threads have a prio of MAX_PRIO, for this test
3137 * to be always true for them.
3138 */
3139 check_preempt_curr(this_rq, p, 0); 3160 check_preempt_curr(this_rq, p, 0);
3140} 3161}
3141 3162
@@ -3658,6 +3679,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
3658 3679
3659/** 3680/**
3660 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3681 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3682 * @sd: The sched_domain whose statistics are to be updated.
3661 * @group: sched_group whose statistics are to be updated. 3683 * @group: sched_group whose statistics are to be updated.
3662 * @this_cpu: Cpu for which load balance is currently performed. 3684 * @this_cpu: Cpu for which load balance is currently performed.
3663 * @idle: Idle status of this_cpu 3685 * @idle: Idle status of this_cpu
@@ -4093,7 +4115,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4093 unsigned long flags; 4115 unsigned long flags;
4094 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4116 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4095 4117
4096 cpumask_setall(cpus); 4118 cpumask_copy(cpus, cpu_active_mask);
4097 4119
4098 /* 4120 /*
4099 * When power savings policy is enabled for the parent domain, idle 4121 * When power savings policy is enabled for the parent domain, idle
@@ -4166,14 +4188,15 @@ redo:
4166 4188
4167 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 4189 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4168 4190
4169 spin_lock_irqsave(&busiest->lock, flags); 4191 raw_spin_lock_irqsave(&busiest->lock, flags);
4170 4192
4171 /* don't kick the migration_thread, if the curr 4193 /* don't kick the migration_thread, if the curr
4172 * task on busiest cpu can't be moved to this_cpu 4194 * task on busiest cpu can't be moved to this_cpu
4173 */ 4195 */
4174 if (!cpumask_test_cpu(this_cpu, 4196 if (!cpumask_test_cpu(this_cpu,
4175 &busiest->curr->cpus_allowed)) { 4197 &busiest->curr->cpus_allowed)) {
4176 spin_unlock_irqrestore(&busiest->lock, flags); 4198 raw_spin_unlock_irqrestore(&busiest->lock,
4199 flags);
4177 all_pinned = 1; 4200 all_pinned = 1;
4178 goto out_one_pinned; 4201 goto out_one_pinned;
4179 } 4202 }
@@ -4183,7 +4206,7 @@ redo:
4183 busiest->push_cpu = this_cpu; 4206 busiest->push_cpu = this_cpu;
4184 active_balance = 1; 4207 active_balance = 1;
4185 } 4208 }
4186 spin_unlock_irqrestore(&busiest->lock, flags); 4209 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4187 if (active_balance) 4210 if (active_balance)
4188 wake_up_process(busiest->migration_thread); 4211 wake_up_process(busiest->migration_thread);
4189 4212
@@ -4256,7 +4279,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4256 int all_pinned = 0; 4279 int all_pinned = 0;
4257 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4280 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4258 4281
4259 cpumask_setall(cpus); 4282 cpumask_copy(cpus, cpu_active_mask);
4260 4283
4261 /* 4284 /*
4262 * When power savings policy is enabled for the parent domain, idle 4285 * When power savings policy is enabled for the parent domain, idle
@@ -4365,10 +4388,10 @@ redo:
4365 /* 4388 /*
4366 * Should not call ttwu while holding a rq->lock 4389 * Should not call ttwu while holding a rq->lock
4367 */ 4390 */
4368 spin_unlock(&this_rq->lock); 4391 raw_spin_unlock(&this_rq->lock);
4369 if (active_balance) 4392 if (active_balance)
4370 wake_up_process(busiest->migration_thread); 4393 wake_up_process(busiest->migration_thread);
4371 spin_lock(&this_rq->lock); 4394 raw_spin_lock(&this_rq->lock);
4372 4395
4373 } else 4396 } else
4374 sd->nr_balance_failed = 0; 4397 sd->nr_balance_failed = 0;
@@ -4396,6 +4419,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4396 int pulled_task = 0; 4419 int pulled_task = 0;
4397 unsigned long next_balance = jiffies + HZ; 4420 unsigned long next_balance = jiffies + HZ;
4398 4421
4422 this_rq->idle_stamp = this_rq->clock;
4423
4424 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4425 return;
4426
4399 for_each_domain(this_cpu, sd) { 4427 for_each_domain(this_cpu, sd) {
4400 unsigned long interval; 4428 unsigned long interval;
4401 4429
@@ -4410,8 +4438,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4410 interval = msecs_to_jiffies(sd->balance_interval); 4438 interval = msecs_to_jiffies(sd->balance_interval);
4411 if (time_after(next_balance, sd->last_balance + interval)) 4439 if (time_after(next_balance, sd->last_balance + interval))
4412 next_balance = sd->last_balance + interval; 4440 next_balance = sd->last_balance + interval;
4413 if (pulled_task) 4441 if (pulled_task) {
4442 this_rq->idle_stamp = 0;
4414 break; 4443 break;
4444 }
4415 } 4445 }
4416 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4446 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4417 /* 4447 /*
@@ -4646,7 +4676,7 @@ int select_nohz_load_balancer(int stop_tick)
4646 cpumask_set_cpu(cpu, nohz.cpu_mask); 4676 cpumask_set_cpu(cpu, nohz.cpu_mask);
4647 4677
4648 /* time for ilb owner also to sleep */ 4678 /* time for ilb owner also to sleep */
4649 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4679 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4650 if (atomic_read(&nohz.load_balancer) == cpu) 4680 if (atomic_read(&nohz.load_balancer) == cpu)
4651 atomic_set(&nohz.load_balancer, -1); 4681 atomic_set(&nohz.load_balancer, -1);
4652 return 0; 4682 return 0;
@@ -5013,8 +5043,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5013 p->gtime = cputime_add(p->gtime, cputime); 5043 p->gtime = cputime_add(p->gtime, cputime);
5014 5044
5015 /* Add guest time to cpustat. */ 5045 /* Add guest time to cpustat. */
5016 cpustat->user = cputime64_add(cpustat->user, tmp); 5046 if (TASK_NICE(p) > 0) {
5017 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5047 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5048 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5049 } else {
5050 cpustat->user = cputime64_add(cpustat->user, tmp);
5051 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5052 }
5018} 5053}
5019 5054
5020/* 5055/*
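Guest time is still charged to the task as before, but the cpustat side now honours the task's nice level: a vCPU thread running at positive nice feeds the nice and guest_nice buckets instead of user and guest, so /proc/stat and the tools built on it attribute nice'd KVM guests to the right column.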
@@ -5129,60 +5164,86 @@ void account_idle_ticks(unsigned long ticks)
5129 * Use precise platform statistics if available: 5164 * Use precise platform statistics if available:
5130 */ 5165 */
5131#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5166#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5132cputime_t task_utime(struct task_struct *p) 5167void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5133{ 5168{
5134 return p->utime; 5169 *ut = p->utime;
5170 *st = p->stime;
5135} 5171}
5136 5172
5137cputime_t task_stime(struct task_struct *p) 5173void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5138{ 5174{
5139 return p->stime; 5175 struct task_cputime cputime;
5176
5177 thread_group_cputime(p, &cputime);
5178
5179 *ut = cputime.utime;
5180 *st = cputime.stime;
5140} 5181}
5141#else 5182#else
5142cputime_t task_utime(struct task_struct *p) 5183
5184#ifndef nsecs_to_cputime
5185# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5186#endif
5187
5188void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5143{ 5189{
5144 clock_t utime = cputime_to_clock_t(p->utime), 5190 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5145 total = utime + cputime_to_clock_t(p->stime);
5146 u64 temp;
5147 5191
5148 /* 5192 /*
5149 * Use CFS's precise accounting: 5193 * Use CFS's precise accounting:
5150 */ 5194 */
5151 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5195 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5152 5196
5153 if (total) { 5197 if (total) {
5154 temp *= utime; 5198 u64 temp;
5199
5200 temp = (u64)(rtime * utime);
5155 do_div(temp, total); 5201 do_div(temp, total);
5156 } 5202 utime = (cputime_t)temp;
5157 utime = (clock_t)temp; 5203 } else
5204 utime = rtime;
5158 5205
5159 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5206 /*
5160 return p->prev_utime; 5207 * Compare with previous values, to keep monotonicity:
5208 */
5209 p->prev_utime = max(p->prev_utime, utime);
5210 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5211
5212 *ut = p->prev_utime;
5213 *st = p->prev_stime;
5161} 5214}
5162 5215
5163cputime_t task_stime(struct task_struct *p) 5216/*
5217 * Must be called with siglock held.
5218 */
5219void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5164{ 5220{
5165 clock_t stime; 5221 struct signal_struct *sig = p->signal;
5222 struct task_cputime cputime;
5223 cputime_t rtime, utime, total;
5166 5224
5167 /* 5225 thread_group_cputime(p, &cputime);
5168 * Use CFS's precise accounting. (we subtract utime from
5169 * the total, to make sure the total observed by userspace
5170 * grows monotonically - apps rely on that):
5171 */
5172 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5173 cputime_to_clock_t(task_utime(p));
5174 5226
5175 if (stime >= 0) 5227 total = cputime_add(cputime.utime, cputime.stime);
5176 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5228 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5177 5229
5178 return p->prev_stime; 5230 if (total) {
5179} 5231 u64 temp;
5180#endif
5181 5232
5182inline cputime_t task_gtime(struct task_struct *p) 5233 temp = (u64)(rtime * cputime.utime);
5183{ 5234 do_div(temp, total);
5184 return p->gtime; 5235 utime = (cputime_t)temp;
5236 } else
5237 utime = rtime;
5238
5239 sig->prev_utime = max(sig->prev_utime, utime);
5240 sig->prev_stime = max(sig->prev_stime,
5241 cputime_sub(rtime, sig->prev_utime));
5242
5243 *ut = sig->prev_utime;
5244 *st = sig->prev_stime;
5185} 5245}
5246#endif
5186 5247
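The task_utime()/task_stime() pair is replaced by task_times() and thread_group_times(), which return both values at once. Without CONFIG_VIRT_CPU_ACCOUNTING the idea is: take the precisely measured runtime (se.sum_exec_runtime, converted via nsecs_to_cputime), split it in the utime:stime ratio the tick-based counters happened to sample, and then clamp against the values reported last time so neither number can move backwards between reads; the group variant does the same against signal_struct and, per the comment, must run under siglock. A standalone sketch of the arithmetic, with types and names simplified:

    #include <stdio.h>

    typedef unsigned long long u64;

    static u64 max_u64(u64 a, u64 b)
    {
            return a > b ? a : b;
    }

    /*
     * Split the precisely measured runtime 'rtime' in the utime:stime ratio
     * seen by the tick counters, then clamp against what was reported last
     * time (prev_utime/prev_stime) so the values only ever grow.
     */
    static void task_times(u64 tick_utime, u64 tick_stime, u64 rtime,
                           u64 *prev_utime, u64 *prev_stime,
                           u64 *ut, u64 *st)
    {
            u64 total = tick_utime + tick_stime;
            u64 utime;

            if (total)
                    utime = rtime * tick_utime / total;
            else
                    utime = rtime;

            *prev_utime = max_u64(*prev_utime, utime);
            *prev_stime = max_u64(*prev_stime, rtime - *prev_utime);

            *ut = *prev_utime;
            *st = *prev_stime;
    }

    int main(void)
    {
            u64 prev_ut = 0, prev_st = 0, ut, st;

            /* ticks sampled 60:40 user:sys, precise runtime is 1000 units */
            task_times(60, 40, 1000, &prev_ut, &prev_st, &ut, &st);
            printf("first read:  ut=%llu st=%llu\n", ut, st);

            /* a later, "worse" tick ratio must not move utime backwards */
            task_times(55, 45, 1010, &prev_ut, &prev_st, &ut, &st);
            printf("second read: ut=%llu st=%llu\n", ut, st);
            return 0;
    }

Run back to back with a slightly different tick ratio, the second read keeps utime at its previous value and only lets stime grow, which is exactly the monotonicity userspace (top and friends) relies on.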
5187/* 5248/*
5188 * This function gets called by the timer code, with HZ frequency. 5249 * This function gets called by the timer code, with HZ frequency.
@@ -5199,11 +5260,11 @@ void scheduler_tick(void)
5199 5260
5200 sched_clock_tick(); 5261 sched_clock_tick();
5201 5262
5202 spin_lock(&rq->lock); 5263 raw_spin_lock(&rq->lock);
5203 update_rq_clock(rq); 5264 update_rq_clock(rq);
5204 update_cpu_load(rq); 5265 update_cpu_load(rq);
5205 curr->sched_class->task_tick(rq, curr, 0); 5266 curr->sched_class->task_tick(rq, curr, 0);
5206 spin_unlock(&rq->lock); 5267 raw_spin_unlock(&rq->lock);
5207 5268
5208 perf_event_task_tick(curr, cpu); 5269 perf_event_task_tick(curr, cpu);
5209 5270
@@ -5317,13 +5378,14 @@ static inline void schedule_debug(struct task_struct *prev)
5317#endif 5378#endif
5318} 5379}
5319 5380
5320static void put_prev_task(struct rq *rq, struct task_struct *p) 5381static void put_prev_task(struct rq *rq, struct task_struct *prev)
5321{ 5382{
5322 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5383 if (prev->state == TASK_RUNNING) {
5384 u64 runtime = prev->se.sum_exec_runtime;
5323 5385
5324 update_avg(&p->se.avg_running, runtime); 5386 runtime -= prev->se.prev_sum_exec_runtime;
5387 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5325 5388
5326 if (p->state == TASK_RUNNING) {
5327 /* 5389 /*
5328 * In order to avoid avg_overlap growing stale when we are 5390 * In order to avoid avg_overlap growing stale when we are
5329 * indeed overlapping and hence not getting put to sleep, grow 5391 * indeed overlapping and hence not getting put to sleep, grow
@@ -5333,12 +5395,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5333 * correlates to the amount of cache footprint a task can 5395 * correlates to the amount of cache footprint a task can
5334 * build up. 5396 * build up.
5335 */ 5397 */
5336 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5398 update_avg(&prev->se.avg_overlap, runtime);
5337 update_avg(&p->se.avg_overlap, runtime);
5338 } else {
5339 update_avg(&p->se.avg_running, 0);
5340 } 5399 }
5341 p->sched_class->put_prev_task(rq, p); 5400 prev->sched_class->put_prev_task(rq, prev);
5342} 5401}
5343 5402
5344/* 5403/*
@@ -5399,7 +5458,7 @@ need_resched_nonpreemptible:
5399 if (sched_feat(HRTICK)) 5458 if (sched_feat(HRTICK))
5400 hrtick_clear(rq); 5459 hrtick_clear(rq);
5401 5460
5402 spin_lock_irq(&rq->lock); 5461 raw_spin_lock_irq(&rq->lock);
5403 update_rq_clock(rq); 5462 update_rq_clock(rq);
5404 clear_tsk_need_resched(prev); 5463 clear_tsk_need_resched(prev);
5405 5464
@@ -5435,7 +5494,7 @@ need_resched_nonpreemptible:
5435 cpu = smp_processor_id(); 5494 cpu = smp_processor_id();
5436 rq = cpu_rq(cpu); 5495 rq = cpu_rq(cpu);
5437 } else 5496 } else
5438 spin_unlock_irq(&rq->lock); 5497 raw_spin_unlock_irq(&rq->lock);
5439 5498
5440 post_schedule(rq); 5499 post_schedule(rq);
5441 5500
@@ -5448,7 +5507,7 @@ need_resched_nonpreemptible:
5448} 5507}
5449EXPORT_SYMBOL(schedule); 5508EXPORT_SYMBOL(schedule);
5450 5509
5451#ifdef CONFIG_SMP 5510#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5452/* 5511/*
5453 * Look out! "owner" is an entirely speculative pointer 5512 * Look out! "owner" is an entirely speculative pointer
5454 * access and not reliable. 5513 * access and not reliable.
@@ -6142,22 +6201,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6142 BUG_ON(p->se.on_rq); 6201 BUG_ON(p->se.on_rq);
6143 6202
6144 p->policy = policy; 6203 p->policy = policy;
6145 switch (p->policy) {
6146 case SCHED_NORMAL:
6147 case SCHED_BATCH:
6148 case SCHED_IDLE:
6149 p->sched_class = &fair_sched_class;
6150 break;
6151 case SCHED_FIFO:
6152 case SCHED_RR:
6153 p->sched_class = &rt_sched_class;
6154 break;
6155 }
6156
6157 p->rt_priority = prio; 6204 p->rt_priority = prio;
6158 p->normal_prio = normal_prio(p); 6205 p->normal_prio = normal_prio(p);
6159 /* we are holding p->pi_lock already */ 6206 /* we are holding p->pi_lock already */
6160 p->prio = rt_mutex_getprio(p); 6207 p->prio = rt_mutex_getprio(p);
6208 if (rt_prio(p->prio))
6209 p->sched_class = &rt_sched_class;
6210 else
6211 p->sched_class = &fair_sched_class;
6161 set_load_weight(p); 6212 set_load_weight(p);
6162} 6213}
6163 6214
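__setscheduler() no longer switches on the requested policy to pick a scheduling class; it derives the class from the effective priority it just computed. Since p->prio comes from rt_mutex_getprio(), this also behaves sensibly while a priority-inheritance boost is active: anything sitting in the RT priority range gets the RT class, everything else the fair class, and the explicit SCHED_* switch becomes redundant.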
@@ -6272,7 +6323,7 @@ recheck:
6272 * make sure no PI-waiters arrive (or leave) while we are 6323 * make sure no PI-waiters arrive (or leave) while we are
6273 * changing the priority of the task: 6324 * changing the priority of the task:
6274 */ 6325 */
6275 spin_lock_irqsave(&p->pi_lock, flags); 6326 raw_spin_lock_irqsave(&p->pi_lock, flags);
6276 /* 6327 /*
 6277 * To be able to change p->policy safely, the appropriate 6328
6278 * runqueue lock must be held. 6329 * runqueue lock must be held.
@@ -6282,7 +6333,7 @@ recheck:
6282 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 6333 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6283 policy = oldpolicy = -1; 6334 policy = oldpolicy = -1;
6284 __task_rq_unlock(rq); 6335 __task_rq_unlock(rq);
6285 spin_unlock_irqrestore(&p->pi_lock, flags); 6336 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6286 goto recheck; 6337 goto recheck;
6287 } 6338 }
6288 update_rq_clock(rq); 6339 update_rq_clock(rq);
@@ -6306,7 +6357,7 @@ recheck:
6306 check_class_changed(rq, p, prev_class, oldprio, running); 6357 check_class_changed(rq, p, prev_class, oldprio, running);
6307 } 6358 }
6308 __task_rq_unlock(rq); 6359 __task_rq_unlock(rq);
6309 spin_unlock_irqrestore(&p->pi_lock, flags); 6360 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6310 6361
6311 rt_mutex_adjust_pi(p); 6362 rt_mutex_adjust_pi(p);
6312 6363
@@ -6560,6 +6611,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6560long sched_getaffinity(pid_t pid, struct cpumask *mask) 6611long sched_getaffinity(pid_t pid, struct cpumask *mask)
6561{ 6612{
6562 struct task_struct *p; 6613 struct task_struct *p;
6614 unsigned long flags;
6615 struct rq *rq;
6563 int retval; 6616 int retval;
6564 6617
6565 get_online_cpus(); 6618 get_online_cpus();
@@ -6574,7 +6627,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6574 if (retval) 6627 if (retval)
6575 goto out_unlock; 6628 goto out_unlock;
6576 6629
6630 rq = task_rq_lock(p, &flags);
6577 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6631 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6632 task_rq_unlock(rq, &flags);
6578 6633
6579out_unlock: 6634out_unlock:
6580 read_unlock(&tasklist_lock); 6635 read_unlock(&tasklist_lock);
@@ -6632,7 +6687,7 @@ SYSCALL_DEFINE0(sched_yield)
6632 */ 6687 */
6633 __release(rq->lock); 6688 __release(rq->lock);
6634 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 6689 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6635 _raw_spin_unlock(&rq->lock); 6690 do_raw_spin_unlock(&rq->lock);
6636 preempt_enable_no_resched(); 6691 preempt_enable_no_resched();
6637 6692
6638 schedule(); 6693 schedule();
@@ -6720,9 +6775,6 @@ EXPORT_SYMBOL(yield);
6720/* 6775/*
6721 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6776 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6722 * that process accounting knows that this is a task in IO wait state. 6777 * that process accounting knows that this is a task in IO wait state.
6723 *
6724 * But don't do that if it is a deliberate, throttling IO wait (this task
6725 * has set its backing_dev_info: the queue against which it should throttle)
6726 */ 6778 */
6727void __sched io_schedule(void) 6779void __sched io_schedule(void)
6728{ 6780{
@@ -6815,6 +6867,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6815{ 6867{
6816 struct task_struct *p; 6868 struct task_struct *p;
6817 unsigned int time_slice; 6869 unsigned int time_slice;
6870 unsigned long flags;
6871 struct rq *rq;
6818 int retval; 6872 int retval;
6819 struct timespec t; 6873 struct timespec t;
6820 6874
@@ -6831,7 +6885,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6831 if (retval) 6885 if (retval)
6832 goto out_unlock; 6886 goto out_unlock;
6833 6887
6834 time_slice = p->sched_class->get_rr_interval(p); 6888 rq = task_rq_lock(p, &flags);
6889 time_slice = p->sched_class->get_rr_interval(rq, p);
6890 task_rq_unlock(rq, &flags);
6835 6891
6836 read_unlock(&tasklist_lock); 6892 read_unlock(&tasklist_lock);
6837 jiffies_to_timespec(time_slice, &t); 6893 jiffies_to_timespec(time_slice, &t);
@@ -6905,7 +6961,7 @@ void show_state_filter(unsigned long state_filter)
6905 /* 6961 /*
6906 * Only show locks if all tasks are dumped: 6962 * Only show locks if all tasks are dumped:
6907 */ 6963 */
6908 if (state_filter == -1) 6964 if (!state_filter)
6909 debug_show_all_locks(); 6965 debug_show_all_locks();
6910} 6966}
6911 6967
@@ -6927,12 +6983,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6927 struct rq *rq = cpu_rq(cpu); 6983 struct rq *rq = cpu_rq(cpu);
6928 unsigned long flags; 6984 unsigned long flags;
6929 6985
6930 spin_lock_irqsave(&rq->lock, flags); 6986 raw_spin_lock_irqsave(&rq->lock, flags);
6931 6987
6932 __sched_fork(idle); 6988 __sched_fork(idle);
6933 idle->se.exec_start = sched_clock(); 6989 idle->se.exec_start = sched_clock();
6934 6990
6935 idle->prio = idle->normal_prio = MAX_PRIO;
6936 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6991 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6937 __set_task_cpu(idle, cpu); 6992 __set_task_cpu(idle, cpu);
6938 6993
@@ -6940,7 +6995,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6940#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 6995#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6941 idle->oncpu = 1; 6996 idle->oncpu = 1;
6942#endif 6997#endif
6943 spin_unlock_irqrestore(&rq->lock, flags); 6998 raw_spin_unlock_irqrestore(&rq->lock, flags);
6944 6999
6945 /* Set the preempt count _outside_ the spinlocks! */ 7000 /* Set the preempt count _outside_ the spinlocks! */
6946#if defined(CONFIG_PREEMPT) 7001#if defined(CONFIG_PREEMPT)
@@ -6973,22 +7028,43 @@ cpumask_var_t nohz_cpu_mask;
6973 * 7028 *
6974 * This idea comes from the SD scheduler of Con Kolivas: 7029 * This idea comes from the SD scheduler of Con Kolivas:
6975 */ 7030 */
6976static inline void sched_init_granularity(void) 7031static int get_update_sysctl_factor(void)
6977{ 7032{
6978 unsigned int factor = 1 + ilog2(num_online_cpus()); 7033 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6979 const unsigned long limit = 200000000; 7034 unsigned int factor;
6980 7035
6981 sysctl_sched_min_granularity *= factor; 7036 switch (sysctl_sched_tunable_scaling) {
6982 if (sysctl_sched_min_granularity > limit) 7037 case SCHED_TUNABLESCALING_NONE:
6983 sysctl_sched_min_granularity = limit; 7038 factor = 1;
7039 break;
7040 case SCHED_TUNABLESCALING_LINEAR:
7041 factor = cpus;
7042 break;
7043 case SCHED_TUNABLESCALING_LOG:
7044 default:
7045 factor = 1 + ilog2(cpus);
7046 break;
7047 }
6984 7048
6985 sysctl_sched_latency *= factor; 7049 return factor;
6986 if (sysctl_sched_latency > limit) 7050}
6987 sysctl_sched_latency = limit;
6988 7051
6989 sysctl_sched_wakeup_granularity *= factor; 7052static void update_sysctl(void)
7053{
7054 unsigned int factor = get_update_sysctl_factor();
7055
7056#define SET_SYSCTL(name) \
7057 (sysctl_##name = (factor) * normalized_sysctl_##name)
7058 SET_SYSCTL(sched_min_granularity);
7059 SET_SYSCTL(sched_latency);
7060 SET_SYSCTL(sched_wakeup_granularity);
7061 SET_SYSCTL(sched_shares_ratelimit);
7062#undef SET_SYSCTL
7063}
6990 7064
6991 sysctl_sched_shares_ratelimit *= factor; 7065static inline void sched_init_granularity(void)
7066{
7067 update_sysctl();
6992} 7068}
6993 7069
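Tunable scaling is reworked from a one-shot multiplication at boot into something that can be recomputed: get_update_sysctl_factor() maps sysctl_sched_tunable_scaling (none, linear, or the default logarithmic mode) and the online CPU count, capped at 8, to a factor, and update_sysctl() re-derives each sysctl from a normalized_sysctl_* baseline instead of scaling the live value in place, so repeated calls (for example on CPU hotplug) do not compound. A standalone sketch of the factor and one scaled tunable; the baseline value is illustrative, not the kernel default:

    #include <stdio.h>

    enum tunable_scaling { SCALING_NONE, SCALING_LINEAR, SCALING_LOG };

    static unsigned int ilog2_u32(unsigned int x)
    {
            unsigned int r = 0;

            while (x >>= 1)
                    r++;
            return r;
    }

    /* mirrors get_update_sysctl_factor(): CPU count capped at 8 */
    static unsigned int scale_factor(enum tunable_scaling mode, unsigned int cpus)
    {
            if (cpus > 8)
                    cpus = 8;

            switch (mode) {
            case SCALING_NONE:
                    return 1;
            case SCALING_LINEAR:
                    return cpus;
            case SCALING_LOG:
            default:
                    return 1 + ilog2_u32(cpus);
            }
    }

    int main(void)
    {
            /* illustrative "normalized" baseline, in ns */
            const unsigned long normalized_min_granularity = 750000;
            unsigned int cpus;

            for (cpus = 1; cpus <= 16; cpus *= 2) {
                    unsigned int f = scale_factor(SCALING_LOG, cpus);

                    printf("%2u cpus -> factor %u -> min_granularity %lu ns\n",
                           cpus, f, f * normalized_min_granularity);
            }
            return 0;
    }

In the default logarithmic mode the factor saturates at 4 once 8 or more CPUs are online, so large machines no longer stretch the latency targets without bound.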
6994#ifdef CONFIG_SMP 7070#ifdef CONFIG_SMP
@@ -7025,7 +7101,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7025 int ret = 0; 7101 int ret = 0;
7026 7102
7027 rq = task_rq_lock(p, &flags); 7103 rq = task_rq_lock(p, &flags);
7028 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7104 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7029 ret = -EINVAL; 7105 ret = -EINVAL;
7030 goto out; 7106 goto out;
7031 } 7107 }
@@ -7047,7 +7123,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7047 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7123 if (cpumask_test_cpu(task_cpu(p), new_mask))
7048 goto out; 7124 goto out;
7049 7125
7050 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7126 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7051 /* Need help from migration thread: drop lock and wait. */ 7127 /* Need help from migration thread: drop lock and wait. */
7052 struct task_struct *mt = rq->migration_thread; 7128 struct task_struct *mt = rq->migration_thread;
7053 7129
@@ -7136,10 +7212,10 @@ static int migration_thread(void *data)
7136 struct migration_req *req; 7212 struct migration_req *req;
7137 struct list_head *head; 7213 struct list_head *head;
7138 7214
7139 spin_lock_irq(&rq->lock); 7215 raw_spin_lock_irq(&rq->lock);
7140 7216
7141 if (cpu_is_offline(cpu)) { 7217 if (cpu_is_offline(cpu)) {
7142 spin_unlock_irq(&rq->lock); 7218 raw_spin_unlock_irq(&rq->lock);
7143 break; 7219 break;
7144 } 7220 }
7145 7221
@@ -7151,7 +7227,7 @@ static int migration_thread(void *data)
7151 head = &rq->migration_queue; 7227 head = &rq->migration_queue;
7152 7228
7153 if (list_empty(head)) { 7229 if (list_empty(head)) {
7154 spin_unlock_irq(&rq->lock); 7230 raw_spin_unlock_irq(&rq->lock);
7155 schedule(); 7231 schedule();
7156 set_current_state(TASK_INTERRUPTIBLE); 7232 set_current_state(TASK_INTERRUPTIBLE);
7157 continue; 7233 continue;
@@ -7160,14 +7236,14 @@ static int migration_thread(void *data)
7160 list_del_init(head->next); 7236 list_del_init(head->next);
7161 7237
7162 if (req->task != NULL) { 7238 if (req->task != NULL) {
7163 spin_unlock(&rq->lock); 7239 raw_spin_unlock(&rq->lock);
7164 __migrate_task(req->task, cpu, req->dest_cpu); 7240 __migrate_task(req->task, cpu, req->dest_cpu);
7165 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7241 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7166 req->dest_cpu = RCU_MIGRATION_GOT_QS; 7242 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7167 spin_unlock(&rq->lock); 7243 raw_spin_unlock(&rq->lock);
7168 } else { 7244 } else {
7169 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7245 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7170 spin_unlock(&rq->lock); 7246 raw_spin_unlock(&rq->lock);
7171 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7247 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7172 } 7248 }
7173 local_irq_enable(); 7249 local_irq_enable();
@@ -7201,19 +7277,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7201 7277
7202again: 7278again:
7203 /* Look for allowed, online CPU in same node. */ 7279 /* Look for allowed, online CPU in same node. */
7204 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) 7280 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7205 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7281 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7206 goto move; 7282 goto move;
7207 7283
7208 /* Any allowed, online CPU? */ 7284 /* Any allowed, online CPU? */
7209 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); 7285 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7210 if (dest_cpu < nr_cpu_ids) 7286 if (dest_cpu < nr_cpu_ids)
7211 goto move; 7287 goto move;
7212 7288
7213 /* No more Mr. Nice Guy. */ 7289 /* No more Mr. Nice Guy. */
7214 if (dest_cpu >= nr_cpu_ids) { 7290 if (dest_cpu >= nr_cpu_ids) {
7215 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 7291 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7216 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); 7292 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7217 7293
7218 /* 7294 /*
7219 * Don't tell them about moving exiting tasks or 7295 * Don't tell them about moving exiting tasks or
@@ -7242,7 +7318,7 @@ move:
7242 */ 7318 */
7243static void migrate_nr_uninterruptible(struct rq *rq_src) 7319static void migrate_nr_uninterruptible(struct rq *rq_src)
7244{ 7320{
7245 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7321 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7246 unsigned long flags; 7322 unsigned long flags;
7247 7323
7248 local_irq_save(flags); 7324 local_irq_save(flags);
@@ -7290,14 +7366,14 @@ void sched_idle_next(void)
7290 * Strictly not necessary since rest of the CPUs are stopped by now 7366 * Strictly not necessary since rest of the CPUs are stopped by now
7291 * and interrupts disabled on the current cpu. 7367 * and interrupts disabled on the current cpu.
7292 */ 7368 */
7293 spin_lock_irqsave(&rq->lock, flags); 7369 raw_spin_lock_irqsave(&rq->lock, flags);
7294 7370
7295 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7371 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7296 7372
7297 update_rq_clock(rq); 7373 update_rq_clock(rq);
7298 activate_task(rq, p, 0); 7374 activate_task(rq, p, 0);
7299 7375
7300 spin_unlock_irqrestore(&rq->lock, flags); 7376 raw_spin_unlock_irqrestore(&rq->lock, flags);
7301} 7377}
7302 7378
7303/* 7379/*
@@ -7333,9 +7409,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7333 * that's OK. No task can be added to this CPU, so iteration is 7409 * that's OK. No task can be added to this CPU, so iteration is
7334 * fine. 7410 * fine.
7335 */ 7411 */
7336 spin_unlock_irq(&rq->lock); 7412 raw_spin_unlock_irq(&rq->lock);
7337 move_task_off_dead_cpu(dead_cpu, p); 7413 move_task_off_dead_cpu(dead_cpu, p);
7338 spin_lock_irq(&rq->lock); 7414 raw_spin_lock_irq(&rq->lock);
7339 7415
7340 put_task_struct(p); 7416 put_task_struct(p);
7341} 7417}
@@ -7376,17 +7452,16 @@ static struct ctl_table sd_ctl_dir[] = {
7376 .procname = "sched_domain", 7452 .procname = "sched_domain",
7377 .mode = 0555, 7453 .mode = 0555,
7378 }, 7454 },
7379 {0, }, 7455 {}
7380}; 7456};
7381 7457
7382static struct ctl_table sd_ctl_root[] = { 7458static struct ctl_table sd_ctl_root[] = {
7383 { 7459 {
7384 .ctl_name = CTL_KERN,
7385 .procname = "kernel", 7460 .procname = "kernel",
7386 .mode = 0555, 7461 .mode = 0555,
7387 .child = sd_ctl_dir, 7462 .child = sd_ctl_dir,
7388 }, 7463 },
7389 {0, }, 7464 {}
7390}; 7465};
7391 7466
7392static struct ctl_table *sd_alloc_ctl_entry(int n) 7467static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7496,7 +7571,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7496static struct ctl_table_header *sd_sysctl_header; 7571static struct ctl_table_header *sd_sysctl_header;
7497static void register_sched_domain_sysctl(void) 7572static void register_sched_domain_sysctl(void)
7498{ 7573{
7499 int i, cpu_num = num_online_cpus(); 7574 int i, cpu_num = num_possible_cpus();
7500 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7575 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7501 char buf[32]; 7576 char buf[32];
7502 7577
@@ -7506,7 +7581,7 @@ static void register_sched_domain_sysctl(void)
7506 if (entry == NULL) 7581 if (entry == NULL)
7507 return; 7582 return;
7508 7583
7509 for_each_online_cpu(i) { 7584 for_each_possible_cpu(i) {
7510 snprintf(buf, 32, "cpu%d", i); 7585 snprintf(buf, 32, "cpu%d", i);
7511 entry->procname = kstrdup(buf, GFP_KERNEL); 7586 entry->procname = kstrdup(buf, GFP_KERNEL);
7512 entry->mode = 0555; 7587 entry->mode = 0555;
@@ -7602,13 +7677,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7602 7677
7603 /* Update our root-domain */ 7678 /* Update our root-domain */
7604 rq = cpu_rq(cpu); 7679 rq = cpu_rq(cpu);
7605 spin_lock_irqsave(&rq->lock, flags); 7680 raw_spin_lock_irqsave(&rq->lock, flags);
7606 if (rq->rd) { 7681 if (rq->rd) {
7607 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7682 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7608 7683
7609 set_rq_online(rq); 7684 set_rq_online(rq);
7610 } 7685 }
7611 spin_unlock_irqrestore(&rq->lock, flags); 7686 raw_spin_unlock_irqrestore(&rq->lock, flags);
7612 break; 7687 break;
7613 7688
7614#ifdef CONFIG_HOTPLUG_CPU 7689#ifdef CONFIG_HOTPLUG_CPU
@@ -7633,14 +7708,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7633 put_task_struct(rq->migration_thread); 7708 put_task_struct(rq->migration_thread);
7634 rq->migration_thread = NULL; 7709 rq->migration_thread = NULL;
7635 /* Idle task back to normal (off runqueue, low prio) */ 7710 /* Idle task back to normal (off runqueue, low prio) */
7636 spin_lock_irq(&rq->lock); 7711 raw_spin_lock_irq(&rq->lock);
7637 update_rq_clock(rq); 7712 update_rq_clock(rq);
7638 deactivate_task(rq, rq->idle, 0); 7713 deactivate_task(rq, rq->idle, 0);
7639 rq->idle->static_prio = MAX_PRIO;
7640 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7714 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7641 rq->idle->sched_class = &idle_sched_class; 7715 rq->idle->sched_class = &idle_sched_class;
7642 migrate_dead_tasks(cpu); 7716 migrate_dead_tasks(cpu);
7643 spin_unlock_irq(&rq->lock); 7717 raw_spin_unlock_irq(&rq->lock);
7644 cpuset_unlock(); 7718 cpuset_unlock();
7645 migrate_nr_uninterruptible(rq); 7719 migrate_nr_uninterruptible(rq);
7646 BUG_ON(rq->nr_running != 0); 7720 BUG_ON(rq->nr_running != 0);
@@ -7650,30 +7724,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7650 * they didn't take sched_hotcpu_mutex. Just wake up 7724 * they didn't take sched_hotcpu_mutex. Just wake up
7651 * the requestors. 7725 * the requestors.
7652 */ 7726 */
7653 spin_lock_irq(&rq->lock); 7727 raw_spin_lock_irq(&rq->lock);
7654 while (!list_empty(&rq->migration_queue)) { 7728 while (!list_empty(&rq->migration_queue)) {
7655 struct migration_req *req; 7729 struct migration_req *req;
7656 7730
7657 req = list_entry(rq->migration_queue.next, 7731 req = list_entry(rq->migration_queue.next,
7658 struct migration_req, list); 7732 struct migration_req, list);
7659 list_del_init(&req->list); 7733 list_del_init(&req->list);
7660 spin_unlock_irq(&rq->lock); 7734 raw_spin_unlock_irq(&rq->lock);
7661 complete(&req->done); 7735 complete(&req->done);
7662 spin_lock_irq(&rq->lock); 7736 raw_spin_lock_irq(&rq->lock);
7663 } 7737 }
7664 spin_unlock_irq(&rq->lock); 7738 raw_spin_unlock_irq(&rq->lock);
7665 break; 7739 break;
7666 7740
7667 case CPU_DYING: 7741 case CPU_DYING:
7668 case CPU_DYING_FROZEN: 7742 case CPU_DYING_FROZEN:
7669 /* Update our root-domain */ 7743 /* Update our root-domain */
7670 rq = cpu_rq(cpu); 7744 rq = cpu_rq(cpu);
7671 spin_lock_irqsave(&rq->lock, flags); 7745 raw_spin_lock_irqsave(&rq->lock, flags);
7672 if (rq->rd) { 7746 if (rq->rd) {
7673 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7747 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7674 set_rq_offline(rq); 7748 set_rq_offline(rq);
7675 } 7749 }
7676 spin_unlock_irqrestore(&rq->lock, flags); 7750 raw_spin_unlock_irqrestore(&rq->lock, flags);
7677 break; 7751 break;
7678#endif 7752#endif
7679 } 7753 }
@@ -7710,6 +7784,16 @@ early_initcall(migration_init);
7710 7784
7711#ifdef CONFIG_SCHED_DEBUG 7785#ifdef CONFIG_SCHED_DEBUG
7712 7786
7787static __read_mostly int sched_domain_debug_enabled;
7788
7789static int __init sched_domain_debug_setup(char *str)
7790{
7791 sched_domain_debug_enabled = 1;
7792
7793 return 0;
7794}
7795early_param("sched_debug", sched_domain_debug_setup);
7796
7713static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7797static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7714 struct cpumask *groupmask) 7798 struct cpumask *groupmask)
7715{ 7799{
@@ -7796,6 +7880,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7796 cpumask_var_t groupmask; 7880 cpumask_var_t groupmask;
7797 int level = 0; 7881 int level = 0;
7798 7882
7883 if (!sched_domain_debug_enabled)
7884 return;
7885
7799 if (!sd) { 7886 if (!sd) {
7800 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7887 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7801 return; 7888 return;
@@ -7875,6 +7962,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7875 7962
7876static void free_rootdomain(struct root_domain *rd) 7963static void free_rootdomain(struct root_domain *rd)
7877{ 7964{
7965 synchronize_sched();
7966
7878 cpupri_cleanup(&rd->cpupri); 7967 cpupri_cleanup(&rd->cpupri);
7879 7968
7880 free_cpumask_var(rd->rto_mask); 7969 free_cpumask_var(rd->rto_mask);
@@ -7888,7 +7977,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7888 struct root_domain *old_rd = NULL; 7977 struct root_domain *old_rd = NULL;
7889 unsigned long flags; 7978 unsigned long flags;
7890 7979
7891 spin_lock_irqsave(&rq->lock, flags); 7980 raw_spin_lock_irqsave(&rq->lock, flags);
7892 7981
7893 if (rq->rd) { 7982 if (rq->rd) {
7894 old_rd = rq->rd; 7983 old_rd = rq->rd;
@@ -7914,7 +8003,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7914 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 8003 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7915 set_rq_online(rq); 8004 set_rq_online(rq);
7916 8005
7917 spin_unlock_irqrestore(&rq->lock, flags); 8006 raw_spin_unlock_irqrestore(&rq->lock, flags);
7918 8007
7919 if (old_rd) 8008 if (old_rd)
7920 free_rootdomain(old_rd); 8009 free_rootdomain(old_rd);
@@ -8015,6 +8104,7 @@ static cpumask_var_t cpu_isolated_map;
8015/* Setup the mask of cpus configured for isolated domains */ 8104/* Setup the mask of cpus configured for isolated domains */
8016static int __init isolated_cpu_setup(char *str) 8105static int __init isolated_cpu_setup(char *str)
8017{ 8106{
8107 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8018 cpulist_parse(str, cpu_isolated_map); 8108 cpulist_parse(str, cpu_isolated_map);
8019 return 1; 8109 return 1;
8020} 8110}
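Because "isolcpus=" is parsed before the normal allocators exist, the cpumask is now allocated with alloc_bootmem_cpumask_var() inside the parser itself; sched_init() (see the hunk around new line 9658 further down) only allocates it when the option was never given. A condensed sketch of that pairing, with illustrative names standing in for isolated_cpu_setup() and cpu_isolated_map:

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/init.h>

static cpumask_var_t demo_isolated_map;	/* stand-in for cpu_isolated_map */

static int __init demo_isolated_setup(char *str)
{
	/* boot-time parse: use the bootmem-backed allocation, the slab
	 * allocator is not available this early */
	alloc_bootmem_cpumask_var(&demo_isolated_map);
	cpulist_parse(str, demo_isolated_map);
	return 1;
}
__setup("demo_isolcpus=", demo_isolated_setup);

static void __init demo_late_init(void)
{
	/* may already have been allocated at cmdline parse time */
	if (demo_isolated_map == NULL)
		zalloc_cpumask_var(&demo_isolated_map, GFP_NOWAIT);
}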
@@ -8199,14 +8289,14 @@ enum s_alloc {
8199 */ 8289 */
8200#ifdef CONFIG_SCHED_SMT 8290#ifdef CONFIG_SCHED_SMT
8201static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 8291static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8202static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 8292static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
8203 8293
8204static int 8294static int
8205cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 8295cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8206 struct sched_group **sg, struct cpumask *unused) 8296 struct sched_group **sg, struct cpumask *unused)
8207{ 8297{
8208 if (sg) 8298 if (sg)
8209 *sg = &per_cpu(sched_group_cpus, cpu).sg; 8299 *sg = &per_cpu(sched_groups, cpu).sg;
8210 return cpu; 8300 return cpu;
8211} 8301}
8212#endif /* CONFIG_SCHED_SMT */ 8302#endif /* CONFIG_SCHED_SMT */
@@ -8851,7 +8941,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8851 return __build_sched_domains(cpu_map, NULL); 8941 return __build_sched_domains(cpu_map, NULL);
8852} 8942}
8853 8943
8854static struct cpumask *doms_cur; /* current sched domains */ 8944static cpumask_var_t *doms_cur; /* current sched domains */
8855static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8945static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8856static struct sched_domain_attr *dattr_cur; 8946static struct sched_domain_attr *dattr_cur;
8857 /* attributes of custom domains in 'doms_cur' */ 8947 /* attributes of custom domains in 'doms_cur' */
@@ -8873,6 +8963,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8873 return 0; 8963 return 0;
8874} 8964}
8875 8965
8966cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
8967{
8968 int i;
8969 cpumask_var_t *doms;
8970
8971 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
8972 if (!doms)
8973 return NULL;
8974 for (i = 0; i < ndoms; i++) {
8975 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
8976 free_sched_domains(doms, i);
8977 return NULL;
8978 }
8979 }
8980 return doms;
8981}
8982
8983void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
8984{
8985 unsigned int i;
8986 for (i = 0; i < ndoms; i++)
8987 free_cpumask_var(doms[i]);
8988 kfree(doms);
8989}
8990
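alloc_sched_domains() hands back an array of ndoms independently allocated cpumask_var_t masks (so CONFIG_CPUMASK_OFFSTACK kernels no longer need the domain masks laid out as one flat cpumask array), and free_sched_domains() releases them; note the partial-failure path frees whatever was already allocated. A minimal usage sketch, assuming a hypothetical caller that wants two non-overlapping partitions (the real callers live in kernel/cpuset.c):

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/sched.h>

static int demo_two_way_partition(const struct cpumask *seta,
				  const struct cpumask *setb)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms)
		return -ENOMEM;

	cpumask_copy(doms[0], seta);
	cpumask_copy(doms[1], setb);

	/* partition_sched_domains() wants the hotplug lock held and takes
	 * ownership of 'doms'; it will free_sched_domains() it later. */
	get_online_cpus();
	partition_sched_domains(2, doms, NULL);
	put_online_cpus();
	return 0;
}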
8876/* 8991/*
8877 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8992 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8878 * For now this just excludes isolated cpus, but could be used to 8993 * For now this just excludes isolated cpus, but could be used to
@@ -8884,12 +8999,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
8884 8999
8885 arch_update_cpu_topology(); 9000 arch_update_cpu_topology();
8886 ndoms_cur = 1; 9001 ndoms_cur = 1;
8887 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 9002 doms_cur = alloc_sched_domains(ndoms_cur);
8888 if (!doms_cur) 9003 if (!doms_cur)
8889 doms_cur = fallback_doms; 9004 doms_cur = &fallback_doms;
8890 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 9005 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8891 dattr_cur = NULL; 9006 dattr_cur = NULL;
8892 err = build_sched_domains(doms_cur); 9007 err = build_sched_domains(doms_cur[0]);
8893 register_sched_domain_sysctl(); 9008 register_sched_domain_sysctl();
8894 9009
8895 return err; 9010 return err;
@@ -8939,19 +9054,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8939 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9054 * doms_new[] to the current sched domain partitioning, doms_cur[].
8940 * It destroys each deleted domain and builds each new domain. 9055 * It destroys each deleted domain and builds each new domain.
8941 * 9056 *
8942 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9057 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
8943 * The masks don't intersect (don't overlap.) We should set up one 9058 * The masks don't intersect (don't overlap.) We should set up one
8944 * sched domain for each mask. CPUs not in any of the cpumasks will 9059 * sched domain for each mask. CPUs not in any of the cpumasks will
8945 * not be load balanced. If the same cpumask appears both in the 9060 * not be load balanced. If the same cpumask appears both in the
8946 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9061 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8947 * it as it is. 9062 * it as it is.
8948 * 9063 *
8949 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9064 * The passed in 'doms_new' should be allocated using
8950 * ownership of it and will kfree it when done with it. If the caller 9065 * alloc_sched_domains. This routine takes ownership of it and will
8951 * failed the kmalloc call, then it can pass in doms_new == NULL && 9066 * free_sched_domains it when done with it. If the caller failed the
8952 * ndoms_new == 1, and partition_sched_domains() will fall back to 9067 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
8953 * the single partition 'fallback_doms', it also forces the domains 9068 * and partition_sched_domains() will fall back to the single partition
8954 * to be rebuilt. 9069 * 'fallback_doms', it also forces the domains to be rebuilt.
8955 * 9070 *
8956 * If doms_new == NULL it will be replaced with cpu_online_mask. 9071 * If doms_new == NULL it will be replaced with cpu_online_mask.
8957 * ndoms_new == 0 is a special case for destroying existing domains, 9072 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8959,8 +9074,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8959 * 9074 *
8960 * Call with hotplug lock held 9075 * Call with hotplug lock held
8961 */ 9076 */
8962/* FIXME: Change to struct cpumask *doms_new[] */ 9077void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8963void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8964 struct sched_domain_attr *dattr_new) 9078 struct sched_domain_attr *dattr_new)
8965{ 9079{
8966 int i, j, n; 9080 int i, j, n;
@@ -8979,40 +9093,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8979 /* Destroy deleted domains */ 9093 /* Destroy deleted domains */
8980 for (i = 0; i < ndoms_cur; i++) { 9094 for (i = 0; i < ndoms_cur; i++) {
8981 for (j = 0; j < n && !new_topology; j++) { 9095 for (j = 0; j < n && !new_topology; j++) {
8982 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9096 if (cpumask_equal(doms_cur[i], doms_new[j])
8983 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9097 && dattrs_equal(dattr_cur, i, dattr_new, j))
8984 goto match1; 9098 goto match1;
8985 } 9099 }
8986 /* no match - a current sched domain not in new doms_new[] */ 9100 /* no match - a current sched domain not in new doms_new[] */
8987 detach_destroy_domains(doms_cur + i); 9101 detach_destroy_domains(doms_cur[i]);
8988match1: 9102match1:
8989 ; 9103 ;
8990 } 9104 }
8991 9105
8992 if (doms_new == NULL) { 9106 if (doms_new == NULL) {
8993 ndoms_cur = 0; 9107 ndoms_cur = 0;
8994 doms_new = fallback_doms; 9108 doms_new = &fallback_doms;
8995 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9109 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
8996 WARN_ON_ONCE(dattr_new); 9110 WARN_ON_ONCE(dattr_new);
8997 } 9111 }
8998 9112
8999 /* Build new domains */ 9113 /* Build new domains */
9000 for (i = 0; i < ndoms_new; i++) { 9114 for (i = 0; i < ndoms_new; i++) {
9001 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9115 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9002 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9116 if (cpumask_equal(doms_new[i], doms_cur[j])
9003 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9117 && dattrs_equal(dattr_new, i, dattr_cur, j))
9004 goto match2; 9118 goto match2;
9005 } 9119 }
9006 /* no match - add a new doms_new */ 9120 /* no match - add a new doms_new */
9007 __build_sched_domains(doms_new + i, 9121 __build_sched_domains(doms_new[i],
9008 dattr_new ? dattr_new + i : NULL); 9122 dattr_new ? dattr_new + i : NULL);
9009match2: 9123match2:
9010 ; 9124 ;
9011 } 9125 }
9012 9126
9013 /* Remember the new sched domains */ 9127 /* Remember the new sched domains */
9014 if (doms_cur != fallback_doms) 9128 if (doms_cur != &fallback_doms)
9015 kfree(doms_cur); 9129 free_sched_domains(doms_cur, ndoms_cur);
9016 kfree(dattr_cur); /* kfree(NULL) is safe */ 9130 kfree(dattr_cur); /* kfree(NULL) is safe */
9017 doms_cur = doms_new; 9131 doms_cur = doms_new;
9018 dattr_cur = dattr_new; 9132 dattr_cur = dattr_new;
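As documented above, a caller that cannot allocate a doms array can still force a rebuild onto the single default partition (every active CPU except the isolated ones) by passing doms_new == NULL and ndoms_new == 1. A tiny sketch of that fallback call, wrapped in a hypothetical helper:

#include <linux/cpu.h>
#include <linux/sched.h>

static void demo_reset_sched_domains(void)
{
	get_online_cpus();	/* hotplug lock, as partition_sched_domains() requires */
	partition_sched_domains(1, NULL, NULL);	/* falls back to fallback_doms */
	put_online_cpus();
}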
@@ -9123,8 +9237,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9123 switch (action) { 9237 switch (action) {
9124 case CPU_ONLINE: 9238 case CPU_ONLINE:
9125 case CPU_ONLINE_FROZEN: 9239 case CPU_ONLINE_FROZEN:
9126 case CPU_DEAD: 9240 case CPU_DOWN_PREPARE:
9127 case CPU_DEAD_FROZEN: 9241 case CPU_DOWN_PREPARE_FROZEN:
9242 case CPU_DOWN_FAILED:
9243 case CPU_DOWN_FAILED_FROZEN:
9128 partition_sched_domains(1, NULL, NULL); 9244 partition_sched_domains(1, NULL, NULL);
9129 return NOTIFY_OK; 9245 return NOTIFY_OK;
9130 9246
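update_sched_domains() is a CPU-hotplug notifier; the hunk above switches the rebuild from the post-mortem CPU_DEAD notifications to CPU_DOWN_PREPARE, and also rebuilds when an offline attempt fails (CPU_DOWN_FAILED), in addition to when a CPU comes online. A bare-bones sketch of a notifier reacting to the same set of events (registration via register_cpu_notifier() from an initcall is assumed and not shown; names are illustrative):

#include <linux/cpu.h>
#include <linux/notifier.h>

static int demo_cpu_callback(struct notifier_block *nfb,
			     unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		/* the CPU topology changed, or is about to: react here */
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static struct notifier_block demo_cpu_nb = {
	.notifier_call = demo_cpu_callback,
};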
@@ -9171,7 +9287,7 @@ void __init sched_init_smp(void)
9171#endif 9287#endif
9172 get_online_cpus(); 9288 get_online_cpus();
9173 mutex_lock(&sched_domains_mutex); 9289 mutex_lock(&sched_domains_mutex);
9174 arch_init_sched_domains(cpu_online_mask); 9290 arch_init_sched_domains(cpu_active_mask);
9175 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9291 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9176 if (cpumask_empty(non_isolated_cpus)) 9292 if (cpumask_empty(non_isolated_cpus))
9177 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9293 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9244,13 +9360,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9244#ifdef CONFIG_SMP 9360#ifdef CONFIG_SMP
9245 rt_rq->rt_nr_migratory = 0; 9361 rt_rq->rt_nr_migratory = 0;
9246 rt_rq->overloaded = 0; 9362 rt_rq->overloaded = 0;
9247 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 9363 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
9248#endif 9364#endif
9249 9365
9250 rt_rq->rt_time = 0; 9366 rt_rq->rt_time = 0;
9251 rt_rq->rt_throttled = 0; 9367 rt_rq->rt_throttled = 0;
9252 rt_rq->rt_runtime = 0; 9368 rt_rq->rt_runtime = 0;
9253 spin_lock_init(&rt_rq->rt_runtime_lock); 9369 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
9254 9370
9255#ifdef CONFIG_RT_GROUP_SCHED 9371#ifdef CONFIG_RT_GROUP_SCHED
9256 rt_rq->rt_nr_boosted = 0; 9372 rt_rq->rt_nr_boosted = 0;
@@ -9334,10 +9450,6 @@ void __init sched_init(void)
9334#ifdef CONFIG_CPUMASK_OFFSTACK 9450#ifdef CONFIG_CPUMASK_OFFSTACK
9335 alloc_size += num_possible_cpus() * cpumask_size(); 9451 alloc_size += num_possible_cpus() * cpumask_size();
9336#endif 9452#endif
9337 /*
9338 * As sched_init() is called before page_alloc is setup,
9339 * we use alloc_bootmem().
9340 */
9341 if (alloc_size) { 9453 if (alloc_size) {
9342 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9454 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9343 9455
@@ -9406,11 +9518,15 @@ void __init sched_init(void)
9406#endif /* CONFIG_USER_SCHED */ 9518#endif /* CONFIG_USER_SCHED */
9407#endif /* CONFIG_GROUP_SCHED */ 9519#endif /* CONFIG_GROUP_SCHED */
9408 9520
9521#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9522 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
9523 __alignof__(unsigned long));
9524#endif
9409 for_each_possible_cpu(i) { 9525 for_each_possible_cpu(i) {
9410 struct rq *rq; 9526 struct rq *rq;
9411 9527
9412 rq = cpu_rq(i); 9528 rq = cpu_rq(i);
9413 spin_lock_init(&rq->lock); 9529 raw_spin_lock_init(&rq->lock);
9414 rq->nr_running = 0; 9530 rq->nr_running = 0;
9415 rq->calc_load_active = 0; 9531 rq->calc_load_active = 0;
9416 rq->calc_load_update = jiffies + LOAD_FREQ; 9532 rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9470,7 +9586,7 @@ void __init sched_init(void)
9470#elif defined CONFIG_USER_SCHED 9586#elif defined CONFIG_USER_SCHED
9471 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); 9587 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9472 init_tg_rt_entry(&init_task_group, 9588 init_tg_rt_entry(&init_task_group,
9473 &per_cpu(init_rt_rq, i), 9589 &per_cpu(init_rt_rq_var, i),
9474 &per_cpu(init_sched_rt_entity, i), i, 1, 9590 &per_cpu(init_sched_rt_entity, i), i, 1,
9475 root_task_group.rt_se[i]); 9591 root_task_group.rt_se[i]);
9476#endif 9592#endif
@@ -9488,6 +9604,8 @@ void __init sched_init(void)
9488 rq->cpu = i; 9604 rq->cpu = i;
9489 rq->online = 0; 9605 rq->online = 0;
9490 rq->migration_thread = NULL; 9606 rq->migration_thread = NULL;
9607 rq->idle_stamp = 0;
9608 rq->avg_idle = 2*sysctl_sched_migration_cost;
9491 INIT_LIST_HEAD(&rq->migration_queue); 9609 INIT_LIST_HEAD(&rq->migration_queue);
9492 rq_attach_root(rq, &def_root_domain); 9610 rq_attach_root(rq, &def_root_domain);
9493#endif 9611#endif
@@ -9506,7 +9624,7 @@ void __init sched_init(void)
9506#endif 9624#endif
9507 9625
9508#ifdef CONFIG_RT_MUTEXES 9626#ifdef CONFIG_RT_MUTEXES
9509 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 9627 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
9510#endif 9628#endif
9511 9629
9512 /* 9630 /*
@@ -9531,13 +9649,15 @@ void __init sched_init(void)
9531 current->sched_class = &fair_sched_class; 9649 current->sched_class = &fair_sched_class;
9532 9650
9533 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9651 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9534 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 9652 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9535#ifdef CONFIG_SMP 9653#ifdef CONFIG_SMP
9536#ifdef CONFIG_NO_HZ 9654#ifdef CONFIG_NO_HZ
9537 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9655 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9538 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9656 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9539#endif 9657#endif
9540 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9658 /* May be allocated at isolcpus cmdline parse time */
9659 if (cpu_isolated_map == NULL)
9660 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9541#endif /* SMP */ 9661#endif /* SMP */
9542 9662
9543 perf_event_init(); 9663 perf_event_init();
@@ -9629,13 +9749,13 @@ void normalize_rt_tasks(void)
9629 continue; 9749 continue;
9630 } 9750 }
9631 9751
9632 spin_lock(&p->pi_lock); 9752 raw_spin_lock(&p->pi_lock);
9633 rq = __task_rq_lock(p); 9753 rq = __task_rq_lock(p);
9634 9754
9635 normalize_task(rq, p); 9755 normalize_task(rq, p);
9636 9756
9637 __task_rq_unlock(rq); 9757 __task_rq_unlock(rq);
9638 spin_unlock(&p->pi_lock); 9758 raw_spin_unlock(&p->pi_lock);
9639 } while_each_thread(g, p); 9759 } while_each_thread(g, p);
9640 9760
9641 read_unlock_irqrestore(&tasklist_lock, flags); 9761 read_unlock_irqrestore(&tasklist_lock, flags);
@@ -9731,13 +9851,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9731 se = kzalloc_node(sizeof(struct sched_entity), 9851 se = kzalloc_node(sizeof(struct sched_entity),
9732 GFP_KERNEL, cpu_to_node(i)); 9852 GFP_KERNEL, cpu_to_node(i));
9733 if (!se) 9853 if (!se)
9734 goto err; 9854 goto err_free_rq;
9735 9855
9736 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9856 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9737 } 9857 }
9738 9858
9739 return 1; 9859 return 1;
9740 9860
9861 err_free_rq:
9862 kfree(cfs_rq);
9741 err: 9863 err:
9742 return 0; 9864 return 0;
9743} 9865}
@@ -9819,13 +9941,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9819 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9941 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9820 GFP_KERNEL, cpu_to_node(i)); 9942 GFP_KERNEL, cpu_to_node(i));
9821 if (!rt_se) 9943 if (!rt_se)
9822 goto err; 9944 goto err_free_rq;
9823 9945
9824 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9946 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9825 } 9947 }
9826 9948
9827 return 1; 9949 return 1;
9828 9950
9951 err_free_rq:
9952 kfree(rt_rq);
9829 err: 9953 err:
9830 return 0; 9954 return 0;
9831} 9955}
@@ -9994,9 +10118,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
9994 struct rq *rq = cfs_rq->rq; 10118 struct rq *rq = cfs_rq->rq;
9995 unsigned long flags; 10119 unsigned long flags;
9996 10120
9997 spin_lock_irqsave(&rq->lock, flags); 10121 raw_spin_lock_irqsave(&rq->lock, flags);
9998 __set_se_shares(se, shares); 10122 __set_se_shares(se, shares);
9999 spin_unlock_irqrestore(&rq->lock, flags); 10123 raw_spin_unlock_irqrestore(&rq->lock, flags);
10000} 10124}
10001 10125
10002static DEFINE_MUTEX(shares_mutex); 10126static DEFINE_MUTEX(shares_mutex);
@@ -10181,18 +10305,18 @@ static int tg_set_bandwidth(struct task_group *tg,
10181 if (err) 10305 if (err)
10182 goto unlock; 10306 goto unlock;
10183 10307
10184 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10308 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10185 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 10309 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10186 tg->rt_bandwidth.rt_runtime = rt_runtime; 10310 tg->rt_bandwidth.rt_runtime = rt_runtime;
10187 10311
10188 for_each_possible_cpu(i) { 10312 for_each_possible_cpu(i) {
10189 struct rt_rq *rt_rq = tg->rt_rq[i]; 10313 struct rt_rq *rt_rq = tg->rt_rq[i];
10190 10314
10191 spin_lock(&rt_rq->rt_runtime_lock); 10315 raw_spin_lock(&rt_rq->rt_runtime_lock);
10192 rt_rq->rt_runtime = rt_runtime; 10316 rt_rq->rt_runtime = rt_runtime;
10193 spin_unlock(&rt_rq->rt_runtime_lock); 10317 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10194 } 10318 }
10195 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10319 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10196 unlock: 10320 unlock:
10197 read_unlock(&tasklist_lock); 10321 read_unlock(&tasklist_lock);
10198 mutex_unlock(&rt_constraints_mutex); 10322 mutex_unlock(&rt_constraints_mutex);
@@ -10297,22 +10421,22 @@ static int sched_rt_global_constraints(void)
10297 if (sysctl_sched_rt_runtime == 0) 10421 if (sysctl_sched_rt_runtime == 0)
10298 return -EBUSY; 10422 return -EBUSY;
10299 10423
10300 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10424 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10301 for_each_possible_cpu(i) { 10425 for_each_possible_cpu(i) {
10302 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10426 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10303 10427
10304 spin_lock(&rt_rq->rt_runtime_lock); 10428 raw_spin_lock(&rt_rq->rt_runtime_lock);
10305 rt_rq->rt_runtime = global_rt_runtime(); 10429 rt_rq->rt_runtime = global_rt_runtime();
10306 spin_unlock(&rt_rq->rt_runtime_lock); 10430 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10307 } 10431 }
10308 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 10432 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10309 10433
10310 return 0; 10434 return 0;
10311} 10435}
10312#endif /* CONFIG_RT_GROUP_SCHED */ 10436#endif /* CONFIG_RT_GROUP_SCHED */
10313 10437
10314int sched_rt_handler(struct ctl_table *table, int write, 10438int sched_rt_handler(struct ctl_table *table, int write,
10315 struct file *filp, void __user *buffer, size_t *lenp, 10439 void __user *buffer, size_t *lenp,
10316 loff_t *ppos) 10440 loff_t *ppos)
10317{ 10441{
10318 int ret; 10442 int ret;
@@ -10323,7 +10447,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
10323 old_period = sysctl_sched_rt_period; 10447 old_period = sysctl_sched_rt_period;
10324 old_runtime = sysctl_sched_rt_runtime; 10448 old_runtime = sysctl_sched_rt_runtime;
10325 10449
10326 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); 10450 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10327 10451
10328 if (!ret && write) { 10452 if (!ret && write) {
10329 ret = sched_rt_global_constraints(); 10453 ret = sched_rt_global_constraints();
@@ -10377,8 +10501,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10377} 10501}
10378 10502
10379static int 10503static int
10380cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10504cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10381 struct task_struct *tsk)
10382{ 10505{
10383#ifdef CONFIG_RT_GROUP_SCHED 10506#ifdef CONFIG_RT_GROUP_SCHED
10384 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10507 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10388,15 +10511,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10388 if (tsk->sched_class != &fair_sched_class) 10511 if (tsk->sched_class != &fair_sched_class)
10389 return -EINVAL; 10512 return -EINVAL;
10390#endif 10513#endif
10514 return 0;
10515}
10391 10516
10517static int
10518cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10519 struct task_struct *tsk, bool threadgroup)
10520{
10521 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10522 if (retval)
10523 return retval;
10524 if (threadgroup) {
10525 struct task_struct *c;
10526 rcu_read_lock();
10527 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10528 retval = cpu_cgroup_can_attach_task(cgrp, c);
10529 if (retval) {
10530 rcu_read_unlock();
10531 return retval;
10532 }
10533 }
10534 rcu_read_unlock();
10535 }
10392 return 0; 10536 return 0;
10393} 10537}
10394 10538
10395static void 10539static void
10396cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10540cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10397 struct cgroup *old_cont, struct task_struct *tsk) 10541 struct cgroup *old_cont, struct task_struct *tsk,
10542 bool threadgroup)
10398{ 10543{
10399 sched_move_task(tsk); 10544 sched_move_task(tsk);
10545 if (threadgroup) {
10546 struct task_struct *c;
10547 rcu_read_lock();
10548 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10549 sched_move_task(c);
10550 }
10551 rcu_read_unlock();
10552 }
10400} 10553}
10401 10554
10402#ifdef CONFIG_FAIR_GROUP_SCHED 10555#ifdef CONFIG_FAIR_GROUP_SCHED
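Both new cgroup callbacks take a threadgroup flag; when it is set they walk tsk->thread_group under rcu_read_lock() so every thread in the group is checked (can_attach) or moved (attach), not just the leader. The iteration pattern, sketched with a hypothetical per-thread callback:

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* visit_thread() is a hypothetical helper standing in for
 * cpu_cgroup_can_attach_task() or sched_move_task(). */
static int demo_visit_threadgroup(struct task_struct *tsk,
				  int (*visit_thread)(struct task_struct *t))
{
	struct task_struct *c;
	int ret = visit_thread(tsk);		/* the group leader first */

	if (ret)
		return ret;

	rcu_read_lock();
	list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
		ret = visit_thread(c);
		if (ret)
			break;			/* bail out as can_attach does */
	}
	rcu_read_unlock();
	return ret;
}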
@@ -10567,9 +10720,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10567 /* 10720 /*
10568 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 10721 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10569 */ 10722 */
10570 spin_lock_irq(&cpu_rq(cpu)->lock); 10723 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10571 data = *cpuusage; 10724 data = *cpuusage;
10572 spin_unlock_irq(&cpu_rq(cpu)->lock); 10725 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10573#else 10726#else
10574 data = *cpuusage; 10727 data = *cpuusage;
10575#endif 10728#endif
@@ -10585,9 +10738,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10585 /* 10738 /*
10586 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 10739 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10587 */ 10740 */
10588 spin_lock_irq(&cpu_rq(cpu)->lock); 10741 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10589 *cpuusage = val; 10742 *cpuusage = val;
10590 spin_unlock_irq(&cpu_rq(cpu)->lock); 10743 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10591#else 10744#else
10592 *cpuusage = val; 10745 *cpuusage = val;
10593#endif 10746#endif
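On 32-bit platforms a u64 load or store is not a single atomic access, so cpuacct_cpuusage_read()/cpuacct_cpuusage_write() bracket it with the (now raw) runqueue lock; 64-bit builds skip the lock entirely. The pattern, condensed with an illustrative lock and counter in place of rq->lock and *cpuusage:

#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RAW_SPINLOCK(demo_lock);
static u64 demo_usage;

static u64 demo_usage_read(void)
{
	u64 data;

#ifndef CONFIG_64BIT
	/* two 32-bit accesses: hold the lock so a concurrent writer can
	 * never be observed half way through an update */
	raw_spin_lock_irq(&demo_lock);
	data = demo_usage;
	raw_spin_unlock_irq(&demo_lock);
#else
	data = demo_usage;	/* naturally atomic on 64-bit */
#endif
	return data;
}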
@@ -10821,9 +10974,9 @@ void synchronize_sched_expedited(void)
10821 init_completion(&req->done); 10974 init_completion(&req->done);
10822 req->task = NULL; 10975 req->task = NULL;
10823 req->dest_cpu = RCU_MIGRATION_NEED_QS; 10976 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10824 spin_lock_irqsave(&rq->lock, flags); 10977 raw_spin_lock_irqsave(&rq->lock, flags);
10825 list_add(&req->list, &rq->migration_queue); 10978 list_add(&req->list, &rq->migration_queue);
10826 spin_unlock_irqrestore(&rq->lock, flags); 10979 raw_spin_unlock_irqrestore(&rq->lock, flags);
10827 wake_up_process(rq->migration_thread); 10980 wake_up_process(rq->migration_thread);
10828 } 10981 }
10829 for_each_online_cpu(cpu) { 10982 for_each_online_cpu(cpu) {
@@ -10831,13 +10984,14 @@ void synchronize_sched_expedited(void)
10831 req = &per_cpu(rcu_migration_req, cpu); 10984 req = &per_cpu(rcu_migration_req, cpu);
10832 rq = cpu_rq(cpu); 10985 rq = cpu_rq(cpu);
10833 wait_for_completion(&req->done); 10986 wait_for_completion(&req->done);
10834 spin_lock_irqsave(&rq->lock, flags); 10987 raw_spin_lock_irqsave(&rq->lock, flags);
10835 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 10988 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10836 need_full_sync = 1; 10989 need_full_sync = 1;
10837 req->dest_cpu = RCU_MIGRATION_IDLE; 10990 req->dest_cpu = RCU_MIGRATION_IDLE;
10838 spin_unlock_irqrestore(&rq->lock, flags); 10991 raw_spin_unlock_irqrestore(&rq->lock, flags);
10839 } 10992 }
10840 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10993 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10994 synchronize_sched_expedited_count++;
10841 mutex_unlock(&rcu_sched_expedited_mutex); 10995 mutex_unlock(&rcu_sched_expedited_mutex);
10842 put_online_cpus(); 10996 put_online_cpus();
10843 if (need_full_sync) 10997 if (need_full_sync)
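synchronize_sched_expedited() drives each CPU by queueing a migration_req carrying an embedded completion on that CPU's migration queue, waking the migration thread, and then waiting for every completion in a second pass. The underlying submit-and-wait handshake, reduced to a sketch with illustrative names (the worker that drains the queue is assumed and not shown):

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct demo_req {
	struct list_head list;
	struct completion done;
};

static LIST_HEAD(demo_queue);
static DEFINE_RAW_SPINLOCK(demo_queue_lock);

static void demo_submit_and_wait(struct demo_req *req)
{
	unsigned long flags;

	init_completion(&req->done);
	raw_spin_lock_irqsave(&demo_queue_lock, flags);
	list_add(&req->list, &demo_queue);
	raw_spin_unlock_irqrestore(&demo_queue_lock, flags);

	/* ...wake whatever worker drains demo_queue here... */

	wait_for_completion(&req->done);	/* the worker calls complete() */
}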
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ac2e1dc708bd..479ce5682d7c 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -127,7 +127,7 @@ again:
127 clock = wrap_max(clock, min_clock); 127 clock = wrap_max(clock, min_clock);
128 clock = wrap_min(clock, max_clock); 128 clock = wrap_min(clock, max_clock);
129 129
130 if (cmpxchg(&scd->clock, old_clock, clock) != old_clock) 130 if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
131 goto again; 131 goto again;
132 132
133 return clock; 133 return clock;
@@ -163,7 +163,7 @@ again:
163 val = remote_clock; 163 val = remote_clock;
164 } 164 }
165 165
166 if (cmpxchg(ptr, old_val, val) != old_val) 166 if (cmpxchg64(ptr, old_val, val) != old_val)
167 goto again; 167 goto again;
168 168
169 return val; 169 return val;
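Both hunks in sched_clock.c widen the compare-and-swap from cmpxchg() to cmpxchg64() so the whole 64-bit clock value is updated atomically on 32-bit architectures as well. The surrounding lock-free retry loop looks roughly like this sketch; compute_new_clock() is a placeholder for the clamp-and-advance logic, and cmpxchg64() is assumed to be provided by the architecture headers:

#include <linux/types.h>

static u64 compute_new_clock(u64 old_clock)
{
	return old_clock + 1;	/* placeholder for the real clamp/advance */
}

static u64 demo_update_clock(u64 *clock)
{
	u64 old_clock, new_clock;

again:
	old_clock = *clock;
	new_clock = compute_new_clock(old_clock);

	/* publish only if nobody updated the value in the meantime */
	if (cmpxchg64(clock, old_clock, new_clock) != old_clock)
		goto again;

	return new_clock;
}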
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 0f052fc674d5..597b33099dfa 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -135,26 +135,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
135 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
137 137
138 spin_lock_irqsave(&vec->lock, flags); 138 raw_spin_lock_irqsave(&vec->lock, flags);
139 139
140 cpumask_set_cpu(cpu, vec->mask); 140 cpumask_set_cpu(cpu, vec->mask);
141 vec->count++; 141 vec->count++;
142 if (vec->count == 1) 142 if (vec->count == 1)
143 set_bit(newpri, cp->pri_active); 143 set_bit(newpri, cp->pri_active);
144 144
145 spin_unlock_irqrestore(&vec->lock, flags); 145 raw_spin_unlock_irqrestore(&vec->lock, flags);
146 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) { 147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149 149
150 spin_lock_irqsave(&vec->lock, flags); 150 raw_spin_lock_irqsave(&vec->lock, flags);
151 151
152 vec->count--; 152 vec->count--;
153 if (!vec->count) 153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active); 154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask); 155 cpumask_clear_cpu(cpu, vec->mask);
156 156
157 spin_unlock_irqrestore(&vec->lock, flags); 157 raw_spin_unlock_irqrestore(&vec->lock, flags);
158 } 158 }
159 159
160 *currpri = newpri; 160 *currpri = newpri;
@@ -180,7 +180,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
181 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 181 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
182 182
183 spin_lock_init(&vec->lock); 183 raw_spin_lock_init(&vec->lock);
184 vec->count = 0; 184 vec->count = 0;
185 if (!zalloc_cpumask_var(&vec->mask, gfp)) 185 if (!zalloc_cpumask_var(&vec->mask, gfp))
186 goto cleanup; 186 goto cleanup;
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9a7e859b8fbf..7cb5bb6b95be 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -12,7 +12,7 @@
12/* values 2-101 are RT priorities 0-99 */ 12/* values 2-101 are RT priorities 0-99 */
13 13
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 raw_spinlock_t lock;
16 int count; 16 int count;
17 cpumask_var_t mask; 17 cpumask_var_t mask;
18}; 18};
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc43..67f95aada4b9 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
185 SPLIT_NS(cfs_rq->exec_clock)); 185 SPLIT_NS(cfs_rq->exec_clock));
186 186
187 spin_lock_irqsave(&rq->lock, flags); 187 raw_spin_lock_irqsave(&rq->lock, flags);
188 if (cfs_rq->rb_leftmost) 188 if (cfs_rq->rb_leftmost)
189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
190 last = __pick_last_entity(cfs_rq); 190 last = __pick_last_entity(cfs_rq);
@@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 raw_spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", 198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
285 285
286#ifdef CONFIG_SCHEDSTATS 286#ifdef CONFIG_SCHEDSTATS
287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
288#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
288 289
289 P(yld_count); 290 P(yld_count);
290 291
291 P(sched_switch); 292 P(sched_switch);
292 P(sched_count); 293 P(sched_count);
293 P(sched_goidle); 294 P(sched_goidle);
295#ifdef CONFIG_SMP
296 P64(avg_idle);
297#endif
294 298
295 P(ttwu_count); 299 P(ttwu_count);
296 P(ttwu_local); 300 P(ttwu_local);
@@ -305,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
305 print_rq(m, rq, cpu); 309 print_rq(m, rq, cpu);
306} 310}
307 311
312static const char *sched_tunable_scaling_names[] = {
313 "none",
314 "logarithmic",
315 "linear"
316};
317
308static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
309{ 319{
310 u64 now = ktime_to_ns(ktime_get()); 320 u64 now = ktime_to_ns(ktime_get());
@@ -330,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
330#undef PN 340#undef PN
331#undef P 341#undef P
332 342
343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
344 sysctl_sched_tunable_scaling,
345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
346
333 for_each_online_cpu(cpu) 347 for_each_online_cpu(cpu)
334 print_cpu(m, cpu); 348 print_cpu(m, cpu);
335 349
@@ -395,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 409 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 410 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 411 PN(se.avg_wakeup);
398 PN(se.avg_running);
399 412
400 nr_switches = p->nvcsw + p->nivcsw; 413 nr_switches = p->nvcsw + p->nivcsw;
401 414
@@ -419,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
419 P(se.nr_failed_migrations_running); 432 P(se.nr_failed_migrations_running);
420 P(se.nr_failed_migrations_hot); 433 P(se.nr_failed_migrations_hot);
421 P(se.nr_forced_migrations); 434 P(se.nr_forced_migrations);
422 P(se.nr_forced2_migrations);
423 P(se.nr_wakeups); 435 P(se.nr_wakeups);
424 P(se.nr_wakeups_sync); 436 P(se.nr_wakeups_sync);
425 P(se.nr_wakeups_migrate); 437 P(se.nr_wakeups_migrate);
@@ -495,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
495 p->se.nr_failed_migrations_running = 0; 507 p->se.nr_failed_migrations_running = 0;
496 p->se.nr_failed_migrations_hot = 0; 508 p->se.nr_failed_migrations_hot = 0;
497 p->se.nr_forced_migrations = 0; 509 p->se.nr_forced_migrations = 0;
498 p->se.nr_forced2_migrations = 0;
499 p->se.nr_wakeups = 0; 510 p->se.nr_wakeups = 0;
500 p->se.nr_wakeups_sync = 0; 511 p->se.nr_wakeups_sync = 0;
501 p->se.nr_wakeups_migrate = 0; 512 p->se.nr_wakeups_migrate = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ecc637a0d591..5bedf6e3ebf3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h>
24 25
25/* 26/*
26 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
36 */ 37 */
37unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 5000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL;
40
41/*
42 * The initial- and re-scaling of tunables is configurable
43 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
44 *
45 * Options are:
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
48 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
49 */
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
38 52
39/* 53/*
40 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 56 */
43unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 1000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44 59
45/* 60/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70 * have immediate wakeup/sleep latencies. 85 * have immediate wakeup/sleep latencies.
71 */ 86 */
72unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73 89
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 91
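The new normalized_sysctl_* variables hold the CPU-count-independent defaults; the effective sysctls are those values scaled by a factor chosen by sysctl_sched_tunable_scaling: 1 for NONE, 1 + ilog2(number of online CPUs) for LOG, and the plain CPU count for LINEAR. A condensed sketch of that factor selection (the real helper is get_update_sysctl_factor() in kernel/sched.c, which may also clamp the CPU count; any clamping is omitted here):

#include <linux/cpumask.h>
#include <linux/log2.h>
#include <linux/sched.h>

static unsigned int demo_sysctl_factor(enum sched_tunable_scaling mode)
{
	unsigned int cpus = num_online_cpus();

	switch (mode) {
	case SCHED_TUNABLESCALING_NONE:
		return 1;
	case SCHED_TUNABLESCALING_LINEAR:
		return cpus;
	case SCHED_TUNABLESCALING_LOG:
	default:
		return 1 + ilog2(cpus);
	}
}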
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
383 */ 399 */
384 400
385#ifdef CONFIG_SCHED_DEBUG 401#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 402int sched_proc_update_handler(struct ctl_table *table, int write,
387 struct file *filp, void __user *buffer, size_t *lenp, 403 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 404 loff_t *ppos)
389{ 405{
390 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 406 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
407 int factor = get_update_sysctl_factor();
391 408
392 if (ret || !write) 409 if (ret || !write)
393 return ret; 410 return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 412 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396 sysctl_sched_min_granularity); 413 sysctl_sched_min_granularity);
397 414
415#define WRT_SYSCTL(name) \
416 (normalized_sysctl_##name = sysctl_##name / (factor))
417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL
422
398 return 0; 423 return 0;
399} 424}
400#endif 425#endif
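The WRT_SYSCTL() macro above refreshes the normalized copies whenever an administrator writes one of the knobs: normalized = effective / factor. The opposite direction, applied when the online-CPU count changes (update_sysctl(), called from the new rq_online_fair()/rq_offline_fair() hooks further down), is effective = normalized * factor, roughly:

/* Rough sketch of the rescale direction; assumes it lives in
 * kernel/sched_fair.c next to the variables defined above, where the
 * real code is update_sysctl(). */
static void demo_rescale_sysctls(unsigned int factor)
{
	sysctl_sched_min_granularity =
		normalized_sysctl_sched_min_granularity * factor;
	sysctl_sched_latency =
		normalized_sysctl_sched_latency * factor;
	sysctl_sched_wakeup_granularity =
		normalized_sysctl_sched_wakeup_granularity * factor;
}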
@@ -822,6 +847,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
822 * re-elected due to buddy favours. 847 * re-elected due to buddy favours.
823 */ 848 */
824 clear_buddies(cfs_rq, curr); 849 clear_buddies(cfs_rq, curr);
850 return;
851 }
852
853 /*
854 * Ensure that a task that missed wakeup preemption by a
855 * narrow margin doesn't have to wait for a full slice.
856 * This also mitigates buddy induced latencies under load.
857 */
858 if (!sched_feat(WAKEUP_PREEMPT))
859 return;
860
861 if (delta_exec < sysctl_sched_min_granularity)
862 return;
863
864 if (cfs_rq->nr_running > 1) {
865 struct sched_entity *se = __pick_next_entity(cfs_rq);
866 s64 delta = curr->vruntime - se->vruntime;
867
868 if (delta > ideal_runtime)
869 resched_task(rq_of(cfs_rq)->curr);
825 } 870 }
826} 871}
827 872
@@ -861,12 +906,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
861static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 906static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
862{ 907{
863 struct sched_entity *se = __pick_next_entity(cfs_rq); 908 struct sched_entity *se = __pick_next_entity(cfs_rq);
909 struct sched_entity *left = se;
864 910
865 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) 911 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
866 return cfs_rq->next; 912 se = cfs_rq->next;
867 913
868 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) 914 /*
869 return cfs_rq->last; 915 * Prefer last buddy, try to return the CPU to a preempted task.
916 */
917 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
918 se = cfs_rq->last;
919
920 clear_buddies(cfs_rq, se);
870 921
871 return se; 922 return se;
872} 923}
@@ -1319,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1319} 1370}
1320 1371
1321/* 1372/*
1373 * Try and locate an idle CPU in the sched_domain.
1374 */
1375static int
1376select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1377{
1378 int cpu = smp_processor_id();
1379 int prev_cpu = task_cpu(p);
1380 int i;
1381
1382 /*
1383 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
1384 * test in select_task_rq_fair) and the prev_cpu is idle, then that's
1385 * always a better target than the current cpu.
1386 */
1387 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
1388 return prev_cpu;
1389
1390 /*
1391 * Otherwise, iterate the domain and find an elegible idle cpu.
1392 */
1393 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1394 if (!cpu_rq(i)->cfs.nr_running) {
1395 target = i;
1396 break;
1397 }
1398 }
1399
1400 return target;
1401}
1402
1403/*
1322 * sched_balance_self: balance the current task (running on cpu) in domains 1404 * sched_balance_self: balance the current task (running on cpu) in domains
1323 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1405 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1324 * SD_BALANCE_EXEC. 1406 * SD_BALANCE_EXEC.
@@ -1346,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1346 new_cpu = prev_cpu; 1428 new_cpu = prev_cpu;
1347 } 1429 }
1348 1430
1349 rcu_read_lock();
1350 for_each_domain(cpu, tmp) { 1431 for_each_domain(cpu, tmp) {
1351 /* 1432 /*
1352 * If power savings logic is enabled for a domain, see if we 1433 * If power savings logic is enabled for a domain, see if we
@@ -1372,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1372 want_sd = 0; 1453 want_sd = 0;
1373 } 1454 }
1374 1455
1375 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 1456 /*
1376 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 1457 * While iterating the domains looking for a spanning
1458 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
1459 * in cache sharing domains along the way.
1460 */
1461 if (want_affine) {
1462 int target = -1;
1377 1463
1378 affine_sd = tmp; 1464 /*
1379 want_affine = 0; 1465 * If both cpu and prev_cpu are part of this domain,
1466 * cpu is a valid SD_WAKE_AFFINE target.
1467 */
1468 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1469 target = cpu;
1470
1471 /*
1472 * If there's an idle sibling in this domain, make that
1473 * the wake_affine target instead of the current cpu.
1474 */
1475 if (tmp->flags & SD_PREFER_SIBLING)
1476 target = select_idle_sibling(p, tmp, target);
1477
1478 if (target >= 0) {
1479 if (tmp->flags & SD_WAKE_AFFINE) {
1480 affine_sd = tmp;
1481 want_affine = 0;
1482 }
1483 cpu = target;
1484 }
1380 } 1485 }
1381 1486
1382 if (!want_sd && !want_affine) 1487 if (!want_sd && !want_affine)
@@ -1403,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1403 update_shares(tmp); 1508 update_shares(tmp);
1404 } 1509 }
1405 1510
1406 if (affine_sd && wake_affine(affine_sd, p, sync)) { 1511 if (affine_sd && wake_affine(affine_sd, p, sync))
1407 new_cpu = cpu; 1512 return cpu;
1408 goto out;
1409 }
1410 1513
1411 while (sd) { 1514 while (sd) {
1412 int load_idx = sd->forkexec_idx; 1515 int load_idx = sd->forkexec_idx;
@@ -1447,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1447 /* while loop will break here if sd == NULL */ 1550 /* while loop will break here if sd == NULL */
1448 } 1551 }
1449 1552
1450out:
1451 rcu_read_unlock();
1452 return new_cpu; 1553 return new_cpu;
1453} 1554}
1454#endif /* CONFIG_SMP */ 1555#endif /* CONFIG_SMP */
@@ -1568,13 +1669,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1568 struct sched_entity *se = &curr->se, *pse = &p->se; 1669 struct sched_entity *se = &curr->se, *pse = &p->se;
1569 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1670 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1570 int sync = wake_flags & WF_SYNC; 1671 int sync = wake_flags & WF_SYNC;
1672 int scale = cfs_rq->nr_running >= sched_nr_latency;
1571 1673
1572 update_curr(cfs_rq); 1674 if (unlikely(rt_prio(p->prio)))
1573 1675 goto preempt;
1574 if (unlikely(rt_prio(p->prio))) {
1575 resched_task(curr);
1576 return;
1577 }
1578 1676
1579 if (unlikely(p->sched_class != &fair_sched_class)) 1677 if (unlikely(p->sched_class != &fair_sched_class))
1580 return; 1678 return;
@@ -1582,18 +1680,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1582 if (unlikely(se == pse)) 1680 if (unlikely(se == pse))
1583 return; 1681 return;
1584 1682
1585 /* 1683 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
1586 * Only set the backward buddy when the current task is still on the
1587 * rq. This can happen when a wakeup gets interleaved with schedule on
1588 * the ->pre_schedule() or idle_balance() point, either of which can
1589 * drop the rq lock.
1590 *
1591 * Also, during early boot the idle thread is in the fair class, for
1592 * obvious reasons its a bad idea to schedule back to the idle thread.
1593 */
1594 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1595 set_last_buddy(se);
1596 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1597 set_next_buddy(pse); 1684 set_next_buddy(pse);
1598 1685
1599 /* 1686 /*
@@ -1611,36 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1611 return; 1698 return;
1612 1699
1613 /* Idle tasks are by definition preempted by everybody. */ 1700 /* Idle tasks are by definition preempted by everybody. */
1614 if (unlikely(curr->policy == SCHED_IDLE)) { 1701 if (unlikely(curr->policy == SCHED_IDLE))
1615 resched_task(curr); 1702 goto preempt;
1616 return;
1617 }
1618 1703
1619 if ((sched_feat(WAKEUP_SYNC) && sync) || 1704 if (sched_feat(WAKEUP_SYNC) && sync)
1620 (sched_feat(WAKEUP_OVERLAP) && 1705 goto preempt;
1621 (se->avg_overlap < sysctl_sched_migration_cost &&
1622 pse->avg_overlap < sysctl_sched_migration_cost))) {
1623 resched_task(curr);
1624 return;
1625 }
1626 1706
1627 if (sched_feat(WAKEUP_RUNNING)) { 1707 if (sched_feat(WAKEUP_OVERLAP) &&
1628 if (pse->avg_running < se->avg_running) { 1708 se->avg_overlap < sysctl_sched_migration_cost &&
1629 set_next_buddy(pse); 1709 pse->avg_overlap < sysctl_sched_migration_cost)
1630 resched_task(curr); 1710 goto preempt;
1631 return;
1632 }
1633 }
1634 1711
1635 if (!sched_feat(WAKEUP_PREEMPT)) 1712 if (!sched_feat(WAKEUP_PREEMPT))
1636 return; 1713 return;
1637 1714
1715 update_curr(cfs_rq);
1638 find_matching_se(&se, &pse); 1716 find_matching_se(&se, &pse);
1639
1640 BUG_ON(!pse); 1717 BUG_ON(!pse);
1641
1642 if (wakeup_preempt_entity(se, pse) == 1) 1718 if (wakeup_preempt_entity(se, pse) == 1)
1643 resched_task(curr); 1719 goto preempt;
1720
1721 return;
1722
1723preempt:
1724 resched_task(curr);
1725 /*
1726 * Only set the backward buddy when the current task is still
1727 * on the rq. This can happen when a wakeup gets interleaved
1728 * with schedule on the ->pre_schedule() or idle_balance()
1729 * point, either of which can drop the rq lock.
1730 *
1731 * Also, during early boot the idle thread is in the fair class,
1732 * for obvious reasons it's a bad idea to schedule back to it.
1733 */
1734 if (unlikely(!se->on_rq || curr == rq->idle))
1735 return;
1736
1737 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1738 set_last_buddy(se);
1644} 1739}
1645 1740
1646static struct task_struct *pick_next_task_fair(struct rq *rq) 1741static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1649,21 +1744,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1649 struct cfs_rq *cfs_rq = &rq->cfs; 1744 struct cfs_rq *cfs_rq = &rq->cfs;
1650 struct sched_entity *se; 1745 struct sched_entity *se;
1651 1746
1652 if (unlikely(!cfs_rq->nr_running)) 1747 if (!cfs_rq->nr_running)
1653 return NULL; 1748 return NULL;
1654 1749
1655 do { 1750 do {
1656 se = pick_next_entity(cfs_rq); 1751 se = pick_next_entity(cfs_rq);
1657 /*
1658 * If se was a buddy, clear it so that it will have to earn
1659 * the favour again.
1660 *
1661 * If se was not a buddy, clear the buddies because neither
1662 * was eligible to run, let them earn it again.
1663 *
1664 * IOW. unconditionally clear buddies.
1665 */
1666 __clear_buddies(cfs_rq, NULL);
1667 set_next_entity(cfs_rq, se); 1752 set_next_entity(cfs_rq, se);
1668 cfs_rq = group_cfs_rq(se); 1753 cfs_rq = group_cfs_rq(se);
1669 } while (cfs_rq); 1754 } while (cfs_rq);
@@ -1830,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1830 1915
1831 return 0; 1916 return 0;
1832} 1917}
1918
1919static void rq_online_fair(struct rq *rq)
1920{
1921 update_sysctl();
1922}
1923
1924static void rq_offline_fair(struct rq *rq)
1925{
1926 update_sysctl();
1927}
1928
1833#endif /* CONFIG_SMP */ 1929#endif /* CONFIG_SMP */
1834 1930
1835/* 1931/*
@@ -1847,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1847} 1943}
1848 1944
1849/* 1945/*
1850 * Share the fairness runtime between parent and child, thus the 1946 * called on fork with the child task as argument from the parent's context
1851 * total amount of pressure for CPU stays equal - new tasks 1947 * - child not yet on the tasklist
1852 * get a chance to run but frequent forkers are not allowed to 1948 * - preemption disabled
1853 * monopolize the CPU. Note: the parent runqueue is locked,
1854 * the child is not running yet.
1855 */ 1949 */
1856static void task_new_fair(struct rq *rq, struct task_struct *p) 1950static void task_fork_fair(struct task_struct *p)
1857{ 1951{
1858 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1952 struct cfs_rq *cfs_rq = task_cfs_rq(current);
1859 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 1953 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1860 int this_cpu = smp_processor_id(); 1954 int this_cpu = smp_processor_id();
1955 struct rq *rq = this_rq();
1956 unsigned long flags;
1957
1958 raw_spin_lock_irqsave(&rq->lock, flags);
1861 1959
1862 sched_info_queued(p); 1960 if (unlikely(task_cpu(p) != this_cpu))
1961 __set_task_cpu(p, this_cpu);
1863 1962
1864 update_curr(cfs_rq); 1963 update_curr(cfs_rq);
1964
1865 if (curr) 1965 if (curr)
1866 se->vruntime = curr->vruntime; 1966 se->vruntime = curr->vruntime;
1867 place_entity(cfs_rq, se, 1); 1967 place_entity(cfs_rq, se, 1);
1868 1968
1869 /* 'curr' will be NULL if the child belongs to a different group */ 1969 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1870 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1871 curr && entity_before(curr, se)) {
1872 /* 1970 /*
1873 * Upon rescheduling, sched_class::put_prev_task() will place 1971 * Upon rescheduling, sched_class::put_prev_task() will place
1874 * 'current' within the tree based on its new key value. 1972 * 'current' within the tree based on its new key value.
@@ -1877,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1877 resched_task(rq->curr); 1975 resched_task(rq->curr);
1878 } 1976 }
1879 1977
1880 enqueue_task_fair(rq, p, 0); 1978 raw_spin_unlock_irqrestore(&rq->lock, flags);
1881} 1979}
1882 1980
1883/* 1981/*
@@ -1939,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
1939} 2037}
1940#endif 2038#endif
1941 2039
1942unsigned int get_rr_interval_fair(struct task_struct *task) 2040unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
1943{ 2041{
1944 struct sched_entity *se = &task->se; 2042 struct sched_entity *se = &task->se;
1945 unsigned long flags;
1946 struct rq *rq;
1947 unsigned int rr_interval = 0; 2043 unsigned int rr_interval = 0;
1948 2044
1949 /* 2045 /*
1950 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 2046 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1951 * idle runqueue: 2047 * idle runqueue:
1952 */ 2048 */
1953 rq = task_rq_lock(task, &flags);
1954 if (rq->cfs.load.weight) 2049 if (rq->cfs.load.weight)
1955 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 2050 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1956 task_rq_unlock(rq, &flags);
1957 2051
1958 return rr_interval; 2052 return rr_interval;
1959} 2053}
@@ -1977,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
1977 2071
1978 .load_balance = load_balance_fair, 2072 .load_balance = load_balance_fair,
1979 .move_one_task = move_one_task_fair, 2073 .move_one_task = move_one_task_fair,
2074 .rq_online = rq_online_fair,
2075 .rq_offline = rq_offline_fair,
1980#endif 2076#endif
1981 2077
1982 .set_curr_task = set_curr_task_fair, 2078 .set_curr_task = set_curr_task_fair,
1983 .task_tick = task_tick_fair, 2079 .task_tick = task_tick_fair,
1984 .task_new = task_new_fair, 2080 .task_fork = task_fork_fair,
1985 2081
1986 .prio_changed = prio_changed_fair, 2082 .prio_changed = prio_changed_fair,
1987 .switched_to = switched_to_fair, 2083 .switched_to = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c7..d5059fd761d9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
54SCHED_FEAT(WAKEUP_OVERLAP, 0) 54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55 55
56/* 56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate 57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and 58 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see 59 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde3..5f93b570d383 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -34,10 +34,10 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
36{ 36{
37 spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
39 dump_stack(); 39 dump_stack();
40 spin_lock_irq(&rq->lock); 40 raw_spin_lock_irq(&rq->lock);
41} 41}
42 42
43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task) 100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 101{
102 return 0; 102 return 0;
103} 103}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790cddb19..d2ea2828164e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -327,7 +327,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
327 327
328 weight = cpumask_weight(rd->span); 328 weight = cpumask_weight(rd->span);
329 329
330 spin_lock(&rt_b->rt_runtime_lock); 330 raw_spin_lock(&rt_b->rt_runtime_lock);
331 rt_period = ktime_to_ns(rt_b->rt_period); 331 rt_period = ktime_to_ns(rt_b->rt_period);
332 for_each_cpu(i, rd->span) { 332 for_each_cpu(i, rd->span) {
333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
@@ -336,7 +336,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
336 if (iter == rt_rq) 336 if (iter == rt_rq)
337 continue; 337 continue;
338 338
339 spin_lock(&iter->rt_runtime_lock); 339 raw_spin_lock(&iter->rt_runtime_lock);
340 /* 340 /*
341 * Either all rqs have inf runtime and there's nothing to steal 341 * Either all rqs have inf runtime and there's nothing to steal
342 * or __disable_runtime() below sets a specific rq to inf to 342 * or __disable_runtime() below sets a specific rq to inf to
@@ -358,14 +358,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
358 rt_rq->rt_runtime += diff; 358 rt_rq->rt_runtime += diff;
359 more = 1; 359 more = 1;
360 if (rt_rq->rt_runtime == rt_period) { 360 if (rt_rq->rt_runtime == rt_period) {
361 spin_unlock(&iter->rt_runtime_lock); 361 raw_spin_unlock(&iter->rt_runtime_lock);
362 break; 362 break;
363 } 363 }
364 } 364 }
365next: 365next:
366 spin_unlock(&iter->rt_runtime_lock); 366 raw_spin_unlock(&iter->rt_runtime_lock);
367 } 367 }
368 spin_unlock(&rt_b->rt_runtime_lock); 368 raw_spin_unlock(&rt_b->rt_runtime_lock);
369 369
370 return more; 370 return more;
371} 371}
@@ -386,8 +386,8 @@ static void __disable_runtime(struct rq *rq)
386 s64 want; 386 s64 want;
387 int i; 387 int i;
388 388
389 spin_lock(&rt_b->rt_runtime_lock); 389 raw_spin_lock(&rt_b->rt_runtime_lock);
390 spin_lock(&rt_rq->rt_runtime_lock); 390 raw_spin_lock(&rt_rq->rt_runtime_lock);
391 /* 391 /*
392 * Either we're all inf and nobody needs to borrow, or we're 392 * Either we're all inf and nobody needs to borrow, or we're
393 * already disabled and thus have nothing to do, or we have 393 * already disabled and thus have nothing to do, or we have
@@ -396,7 +396,7 @@ static void __disable_runtime(struct rq *rq)
396 if (rt_rq->rt_runtime == RUNTIME_INF || 396 if (rt_rq->rt_runtime == RUNTIME_INF ||
397 rt_rq->rt_runtime == rt_b->rt_runtime) 397 rt_rq->rt_runtime == rt_b->rt_runtime)
398 goto balanced; 398 goto balanced;
399 spin_unlock(&rt_rq->rt_runtime_lock); 399 raw_spin_unlock(&rt_rq->rt_runtime_lock);
400 400
401 /* 401 /*
402 * Calculate the difference between what we started out with 402 * Calculate the difference between what we started out with
@@ -418,7 +418,7 @@ static void __disable_runtime(struct rq *rq)
418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
419 continue; 419 continue;
420 420
421 spin_lock(&iter->rt_runtime_lock); 421 raw_spin_lock(&iter->rt_runtime_lock);
422 if (want > 0) { 422 if (want > 0) {
423 diff = min_t(s64, iter->rt_runtime, want); 423 diff = min_t(s64, iter->rt_runtime, want);
424 iter->rt_runtime -= diff; 424 iter->rt_runtime -= diff;
@@ -427,13 +427,13 @@ static void __disable_runtime(struct rq *rq)
427 iter->rt_runtime -= want; 427 iter->rt_runtime -= want;
428 want -= want; 428 want -= want;
429 } 429 }
430 spin_unlock(&iter->rt_runtime_lock); 430 raw_spin_unlock(&iter->rt_runtime_lock);
431 431
432 if (!want) 432 if (!want)
433 break; 433 break;
434 } 434 }
435 435
436 spin_lock(&rt_rq->rt_runtime_lock); 436 raw_spin_lock(&rt_rq->rt_runtime_lock);
437 /* 437 /*
438 * We cannot be left wanting - that would mean some runtime 438 * We cannot be left wanting - that would mean some runtime
439 * leaked out of the system. 439 * leaked out of the system.
@@ -445,8 +445,8 @@ balanced:
445 * runtime - in which case borrowing doesn't make sense. 445 * runtime - in which case borrowing doesn't make sense.
446 */ 446 */
447 rt_rq->rt_runtime = RUNTIME_INF; 447 rt_rq->rt_runtime = RUNTIME_INF;
448 spin_unlock(&rt_rq->rt_runtime_lock); 448 raw_spin_unlock(&rt_rq->rt_runtime_lock);
449 spin_unlock(&rt_b->rt_runtime_lock); 449 raw_spin_unlock(&rt_b->rt_runtime_lock);
450 } 450 }
451} 451}
452 452
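Throughout do_balance_runtime() and __disable_runtime() the conversion preserves the existing two-level lock order: the bandwidth pool's rt_runtime_lock is taken first and each per-runqueue rt_runtime_lock nests inside it. A minimal sketch of that ordering with placeholder structures (illustrative only; the locks are assumed to have been set up with raw_spin_lock_init()):

	#include <linux/spinlock.h>
	#include <linux/types.h>

	struct bw_pool { raw_spinlock_t lock; u64 runtime; };	/* like rt_bandwidth */
	struct rq_rt   { raw_spinlock_t lock; u64 runtime; };	/* like rt_rq */

	static void move_runtime(struct bw_pool *pool, struct rq_rt *rt, u64 diff)
	{
		raw_spin_lock(&pool->lock);	/* outer: bandwidth pool */
		raw_spin_lock(&rt->lock);	/* inner: this runqueue */
		rt->runtime += diff;
		raw_spin_unlock(&rt->lock);
		raw_spin_unlock(&pool->lock);
	}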
@@ -454,9 +454,9 @@ static void disable_runtime(struct rq *rq)
454{ 454{
455 unsigned long flags; 455 unsigned long flags;
456 456
457 spin_lock_irqsave(&rq->lock, flags); 457 raw_spin_lock_irqsave(&rq->lock, flags);
458 __disable_runtime(rq); 458 __disable_runtime(rq);
459 spin_unlock_irqrestore(&rq->lock, flags); 459 raw_spin_unlock_irqrestore(&rq->lock, flags);
460} 460}
461 461
462static void __enable_runtime(struct rq *rq) 462static void __enable_runtime(struct rq *rq)
@@ -472,13 +472,13 @@ static void __enable_runtime(struct rq *rq)
472 for_each_leaf_rt_rq(rt_rq, rq) { 472 for_each_leaf_rt_rq(rt_rq, rq) {
473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
474 474
475 spin_lock(&rt_b->rt_runtime_lock); 475 raw_spin_lock(&rt_b->rt_runtime_lock);
476 spin_lock(&rt_rq->rt_runtime_lock); 476 raw_spin_lock(&rt_rq->rt_runtime_lock);
477 rt_rq->rt_runtime = rt_b->rt_runtime; 477 rt_rq->rt_runtime = rt_b->rt_runtime;
478 rt_rq->rt_time = 0; 478 rt_rq->rt_time = 0;
479 rt_rq->rt_throttled = 0; 479 rt_rq->rt_throttled = 0;
480 spin_unlock(&rt_rq->rt_runtime_lock); 480 raw_spin_unlock(&rt_rq->rt_runtime_lock);
481 spin_unlock(&rt_b->rt_runtime_lock); 481 raw_spin_unlock(&rt_b->rt_runtime_lock);
482 } 482 }
483} 483}
484 484
@@ -486,9 +486,9 @@ static void enable_runtime(struct rq *rq)
486{ 486{
487 unsigned long flags; 487 unsigned long flags;
488 488
489 spin_lock_irqsave(&rq->lock, flags); 489 raw_spin_lock_irqsave(&rq->lock, flags);
490 __enable_runtime(rq); 490 __enable_runtime(rq);
491 spin_unlock_irqrestore(&rq->lock, flags); 491 raw_spin_unlock_irqrestore(&rq->lock, flags);
492} 492}
493 493
494static int balance_runtime(struct rt_rq *rt_rq) 494static int balance_runtime(struct rt_rq *rt_rq)
@@ -496,9 +496,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
496 int more = 0; 496 int more = 0;
497 497
498 if (rt_rq->rt_time > rt_rq->rt_runtime) { 498 if (rt_rq->rt_time > rt_rq->rt_runtime) {
499 spin_unlock(&rt_rq->rt_runtime_lock); 499 raw_spin_unlock(&rt_rq->rt_runtime_lock);
500 more = do_balance_runtime(rt_rq); 500 more = do_balance_runtime(rt_rq);
501 spin_lock(&rt_rq->rt_runtime_lock); 501 raw_spin_lock(&rt_rq->rt_runtime_lock);
502 } 502 }
503 503
504 return more; 504 return more;
@@ -524,11 +524,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
525 struct rq *rq = rq_of_rt_rq(rt_rq); 525 struct rq *rq = rq_of_rt_rq(rt_rq);
526 526
527 spin_lock(&rq->lock); 527 raw_spin_lock(&rq->lock);
528 if (rt_rq->rt_time) { 528 if (rt_rq->rt_time) {
529 u64 runtime; 529 u64 runtime;
530 530
531 spin_lock(&rt_rq->rt_runtime_lock); 531 raw_spin_lock(&rt_rq->rt_runtime_lock);
532 if (rt_rq->rt_throttled) 532 if (rt_rq->rt_throttled)
533 balance_runtime(rt_rq); 533 balance_runtime(rt_rq);
534 runtime = rt_rq->rt_runtime; 534 runtime = rt_rq->rt_runtime;
@@ -539,13 +539,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
539 } 539 }
540 if (rt_rq->rt_time || rt_rq->rt_nr_running) 540 if (rt_rq->rt_time || rt_rq->rt_nr_running)
541 idle = 0; 541 idle = 0;
542 spin_unlock(&rt_rq->rt_runtime_lock); 542 raw_spin_unlock(&rt_rq->rt_runtime_lock);
543 } else if (rt_rq->rt_nr_running) 543 } else if (rt_rq->rt_nr_running)
544 idle = 0; 544 idle = 0;
545 545
546 if (enqueue) 546 if (enqueue)
547 sched_rt_rq_enqueue(rt_rq); 547 sched_rt_rq_enqueue(rt_rq);
548 spin_unlock(&rq->lock); 548 raw_spin_unlock(&rq->lock);
549 } 549 }
550 550
551 return idle; 551 return idle;
@@ -624,11 +624,11 @@ static void update_curr_rt(struct rq *rq)
624 rt_rq = rt_rq_of_se(rt_se); 624 rt_rq = rt_rq_of_se(rt_se);
625 625
626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
627 spin_lock(&rt_rq->rt_runtime_lock); 627 raw_spin_lock(&rt_rq->rt_runtime_lock);
628 rt_rq->rt_time += delta_exec; 628 rt_rq->rt_time += delta_exec;
629 if (sched_rt_runtime_exceeded(rt_rq)) 629 if (sched_rt_runtime_exceeded(rt_rq))
630 resched_task(curr); 630 resched_task(curr);
631 spin_unlock(&rt_rq->rt_runtime_lock); 631 raw_spin_unlock(&rt_rq->rt_runtime_lock);
632 } 632 }
633 } 633 }
634} 634}
@@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1153 1153
1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1154static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1155 1155
1156static inline int pick_optimal_cpu(int this_cpu,
1157 const struct cpumask *mask)
1158{
1159 int first;
1160
1161 /* "this_cpu" is cheaper to preempt than a remote processor */
1162 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1163 return this_cpu;
1164
1165 first = cpumask_first(mask);
1166 if (first < nr_cpu_ids)
1167 return first;
1168
1169 return -1;
1170}
1171
1172static int find_lowest_rq(struct task_struct *task) 1156static int find_lowest_rq(struct task_struct *task)
1173{ 1157{
1174 struct sched_domain *sd; 1158 struct sched_domain *sd;
1175 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1159 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1176 int this_cpu = smp_processor_id(); 1160 int this_cpu = smp_processor_id();
1177 int cpu = task_cpu(task); 1161 int cpu = task_cpu(task);
1178 cpumask_var_t domain_mask;
1179 1162
1180 if (task->rt.nr_cpus_allowed == 1) 1163 if (task->rt.nr_cpus_allowed == 1)
1181 return -1; /* No other targets possible */ 1164 return -1; /* No other targets possible */
@@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
1198 * Otherwise, we consult the sched_domains span maps to figure 1181 * Otherwise, we consult the sched_domains span maps to figure
1199 * out which cpu is logically closest to our hot cache data. 1182 * out which cpu is logically closest to our hot cache data.
1200 */ 1183 */
1201 if (this_cpu == cpu) 1184 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1202 this_cpu = -1; /* Skip this_cpu opt if the same */ 1185 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1203
1204 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1205 for_each_domain(cpu, sd) {
1206 if (sd->flags & SD_WAKE_AFFINE) {
1207 int best_cpu;
1208 1186
1209 cpumask_and(domain_mask, 1187 for_each_domain(cpu, sd) {
1210 sched_domain_span(sd), 1188 if (sd->flags & SD_WAKE_AFFINE) {
1211 lowest_mask); 1189 int best_cpu;
1212 1190
1213 best_cpu = pick_optimal_cpu(this_cpu, 1191 /*
1214 domain_mask); 1192 * "this_cpu" is cheaper to preempt than a
1215 1193 * remote processor.
1216 if (best_cpu != -1) { 1194 */
1217 free_cpumask_var(domain_mask); 1195 if (this_cpu != -1 &&
1218 return best_cpu; 1196 cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
1219 } 1197 return this_cpu;
1220 } 1198
1199 best_cpu = cpumask_first_and(lowest_mask,
1200 sched_domain_span(sd));
1201 if (best_cpu < nr_cpu_ids)
1202 return best_cpu;
1221 } 1203 }
1222 free_cpumask_var(domain_mask);
1223 } 1204 }
1224 1205
1225 /* 1206 /*
@@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
1227 * just give the caller *something* to work with from the compatible 1208 * just give the caller *something* to work with from the compatible
1228 * locations. 1209 * locations.
1229 */ 1210 */
1230 return pick_optimal_cpu(this_cpu, lowest_mask); 1211 if (this_cpu != -1)
1212 return this_cpu;
1213
1214 cpu = cpumask_any(lowest_mask);
1215 if (cpu < nr_cpu_ids)
1216 return cpu;
1217 return -1;
1231} 1218}
1232 1219
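The rewritten find_lowest_rq() above drops pick_optimal_cpu() and the GFP_ATOMIC cpumask allocation: within each SD_WAKE_AFFINE domain it prefers this_cpu when it lies in the domain span (this_cpu has already been cleared to -1 if it is not in lowest_mask), and otherwise takes the first CPU in the intersection via cpumask_first_and(). A stripped-down sketch of that per-domain selection step (hypothetical helper name, not the committed code):

	#include <linux/cpumask.h>

	/*
	 * Pick a CPU from @lowest_mask that also lies in @span, preferring
	 * @this_cpu because it is cheapest to preempt.  The caller passes
	 * this_cpu == -1 if it is not itself in @lowest_mask.
	 * Returns -1 if no CPU qualifies.
	 */
	static int pick_from_span(int this_cpu, const struct cpumask *lowest_mask,
				  const struct cpumask *span)
	{
		int cpu;

		if (this_cpu != -1 && cpumask_test_cpu(this_cpu, span))
			return this_cpu;

		cpu = cpumask_first_and(lowest_mask, span);
		return cpu < nr_cpu_ids ? cpu : -1;
	}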
1233/* Will lock the rq it finds */ 1220/* Will lock the rq it finds */
@@ -1259,7 +1246,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1259 task_running(rq, task) || 1246 task_running(rq, task) ||
1260 !task->se.on_rq)) { 1247 !task->se.on_rq)) {
1261 1248
1262 spin_unlock(&lowest_rq->lock); 1249 raw_spin_unlock(&lowest_rq->lock);
1263 lowest_rq = NULL; 1250 lowest_rq = NULL;
1264 break; 1251 break;
1265 } 1252 }
@@ -1734,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
1734 dequeue_pushable_task(rq, p); 1721 dequeue_pushable_task(rq, p);
1735} 1722}
1736 1723
1737unsigned int get_rr_interval_rt(struct task_struct *task) 1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1738{ 1725{
1739 /* 1726 /*
1740 * Time slice is 0 for SCHED_FIFO tasks 1727 * Time slice is 0 for SCHED_FIFO tasks
diff --git a/kernel/signal.c b/kernel/signal.c
index 64c5deeaca5d..1814e68e4de3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,12 +22,14 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/ratelimit.h>
25#include <linux/tracehook.h> 26#include <linux/tracehook.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
27#include <linux/freezer.h> 28#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
30#include <trace/events/sched.h> 31#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h>
31 33
32#include <asm/param.h> 34#include <asm/param.h>
33#include <asm/uaccess.h> 35#include <asm/uaccess.h>
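The include hunk above replaces the generic sched tracepoint with a dedicated trace/events/signal.h and defines CREATE_TRACE_POINTS in this one compilation unit, following the usual convention: exactly one .c file instantiates the tracepoint bodies, every other user just includes the header. A generic illustration of the pattern (the event call shown is the one this commit adds later in __send_signal()):

	/* in exactly one .c file: emit the tracepoint definitions */
	#define CREATE_TRACE_POINTS
	#include <trace/events/signal.h>

	/*
	 * Everywhere else, include the header without CREATE_TRACE_POINTS and
	 * call the generated stubs, e.g.:
	 *
	 *	trace_signal_generate(sig, info, t);
	 *
	 * which compiles to almost nothing unless the event is enabled.
	 */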
@@ -41,6 +43,8 @@
41 43
42static struct kmem_cache *sigqueue_cachep; 44static struct kmem_cache *sigqueue_cachep;
43 45
46int print_fatal_signals __read_mostly;
47
44static void __user *sig_handler(struct task_struct *t, int sig) 48static void __user *sig_handler(struct task_struct *t, int sig)
45{ 49{
46 return t->sighand->action[sig - 1].sa.sa_handler; 50 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -159,7 +163,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
159{ 163{
160 unsigned long i, *s, *m, x; 164 unsigned long i, *s, *m, x;
161 int sig = 0; 165 int sig = 0;
162 166
163 s = pending->signal.sig; 167 s = pending->signal.sig;
164 m = mask->sig; 168 m = mask->sig;
165 switch (_NSIG_WORDS) { 169 switch (_NSIG_WORDS) {
@@ -184,17 +188,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
184 sig = ffz(~x) + 1; 188 sig = ffz(~x) + 1;
185 break; 189 break;
186 } 190 }
187 191
188 return sig; 192 return sig;
189} 193}
190 194
195static inline void print_dropped_signal(int sig)
196{
197 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
198
199 if (!print_fatal_signals)
200 return;
201
202 if (!__ratelimit(&ratelimit_state))
203 return;
204
205 printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
206 current->comm, current->pid, sig);
207}
208
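print_dropped_signal() above is why <linux/ratelimit.h> is now included: a static DEFINE_RATELIMIT_STATE allows a bounded burst of messages per interval, and __ratelimit() returns nonzero only while that budget lasts. The same pattern in a self-contained sketch (interval and burst values copied from the function above):

	#include <linux/jiffies.h>
	#include <linux/kernel.h>
	#include <linux/ratelimit.h>

	static void warn_throttled(const char *what)
	{
		/* at most 10 messages every 5 seconds */
		static DEFINE_RATELIMIT_STATE(rs, 5 * HZ, 10);

		if (__ratelimit(&rs))
			printk(KERN_INFO "throttled warning: %s\n", what);
	}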
191/* 209/*
192 * allocate a new signal queue record 210 * allocate a new signal queue record
193 * - this may be called without locks if and only if t == current, otherwise an 211 * - this may be called without locks if and only if t == current, otherwise an
194 * appropriate lock must be held to stop the target task from exiting 212 * appropriate lock must be held to stop the target task from exiting
195 */ 213 */
196static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 214static struct sigqueue *
197 int override_rlimit) 215__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
198{ 216{
199 struct sigqueue *q = NULL; 217 struct sigqueue *q = NULL;
200 struct user_struct *user; 218 struct user_struct *user;
@@ -207,10 +225,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
207 */ 225 */
208 user = get_uid(__task_cred(t)->user); 226 user = get_uid(__task_cred(t)->user);
209 atomic_inc(&user->sigpending); 227 atomic_inc(&user->sigpending);
228
210 if (override_rlimit || 229 if (override_rlimit ||
211 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
212 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
213 q = kmem_cache_alloc(sigqueue_cachep, flags); 232 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else {
234 print_dropped_signal(sig);
235 }
236
214 if (unlikely(q == NULL)) { 237 if (unlikely(q == NULL)) {
215 atomic_dec(&user->sigpending); 238 atomic_dec(&user->sigpending);
216 free_uid(user); 239 free_uid(user);
@@ -400,7 +423,7 @@ still_pending:
400 */ 423 */
401 info->si_signo = sig; 424 info->si_signo = sig;
402 info->si_errno = 0; 425 info->si_errno = 0;
403 info->si_code = 0; 426 info->si_code = SI_USER;
404 info->si_pid = 0; 427 info->si_pid = 0;
405 info->si_uid = 0; 428 info->si_uid = 0;
406 } 429 }
@@ -584,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
584 return 1; 607 return 1;
585} 608}
586 609
610static inline int is_si_special(const struct siginfo *info)
611{
612 return info <= SEND_SIG_FORCED;
613}
614
615static inline bool si_fromuser(const struct siginfo *info)
616{
617 return info == SEND_SIG_NOINFO ||
618 (!is_si_special(info) && SI_FROMUSER(info));
619}
620
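is_si_special() and si_fromuser() work because SEND_SIG_NOINFO, SEND_SIG_PRIV and SEND_SIG_FORCED are tiny sentinel pointer values rather than real siginfo structures, so the comparison info <= SEND_SIG_FORCED catches all three. For reference, the sentinels are defined roughly like this in linux/sched.h of this era:

	#define SEND_SIG_NOINFO	((struct siginfo *) 0)
	#define SEND_SIG_PRIV	((struct siginfo *) 1)
	#define SEND_SIG_FORCED	((struct siginfo *) 2)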
587/* 621/*
588 * Bad permissions for sending the signal 622 * Bad permissions for sending the signal
589 * - the caller must hold at least the RCU read lock 623 * - the caller must hold at least the RCU read lock
@@ -598,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
598 if (!valid_signal(sig)) 632 if (!valid_signal(sig))
599 return -EINVAL; 633 return -EINVAL;
600 634
601 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) 635 if (!si_fromuser(info))
602 return 0; 636 return 0;
603 637
604 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 638 error = audit_signal_info(sig, t); /* Let audit system see the signal */
@@ -705,7 +739,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
705 739
706 if (why) { 740 if (why) {
707 /* 741 /*
708 * The first thread which returns from finish_stop() 742 * The first thread which returns from do_signal_stop()
709 * will take ->siglock, notice SIGNAL_CLD_MASK, and 743 * will take ->siglock, notice SIGNAL_CLD_MASK, and
710 * notify its parent. See get_signal_to_deliver(). 744 * notify its parent. See get_signal_to_deliver().
711 */ 745 */
@@ -834,7 +868,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
834 struct sigqueue *q; 868 struct sigqueue *q;
835 int override_rlimit; 869 int override_rlimit;
836 870
837 trace_sched_signal_send(sig, t); 871 trace_signal_generate(sig, info, t);
838 872
839 assert_spin_locked(&t->sighand->siglock); 873 assert_spin_locked(&t->sighand->siglock);
840 874
@@ -869,7 +903,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
869 else 903 else
870 override_rlimit = 0; 904 override_rlimit = 0;
871 905
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, 906 q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit); 907 override_rlimit);
874 if (q) { 908 if (q) {
875 list_add_tail(&q->list, &pending->list); 909 list_add_tail(&q->list, &pending->list);
@@ -896,12 +930,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
896 break; 930 break;
897 } 931 }
898 } else if (!is_si_special(info)) { 932 } else if (!is_si_special(info)) {
899 if (sig >= SIGRTMIN && info->si_code != SI_USER) 933 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
900 /* 934 /*
901 * Queue overflow, abort. We may abort if the signal was rt 935 * Queue overflow, abort. We may abort if the
902 * and sent by user using something other than kill(). 936 * signal was rt and sent by user using something
903 */ 937 * other than kill().
938 */
939 trace_signal_overflow_fail(sig, group, info);
904 return -EAGAIN; 940 return -EAGAIN;
941 } else {
942 /*
943 * This is a silent loss of information. We still
944 * send the signal, but the *info bits are lost.
945 */
946 trace_signal_lose_info(sig, group, info);
947 }
905 } 948 }
906 949
907out_set: 950out_set:
@@ -917,16 +960,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
917 int from_ancestor_ns = 0; 960 int from_ancestor_ns = 0;
918 961
919#ifdef CONFIG_PID_NS 962#ifdef CONFIG_PID_NS
920 if (!is_si_special(info) && SI_FROMUSER(info) && 963 from_ancestor_ns = si_fromuser(info) &&
921 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) 964 !task_pid_nr_ns(current, task_active_pid_ns(t));
922 from_ancestor_ns = 1;
923#endif 965#endif
924 966
925 return __send_signal(sig, info, t, group, from_ancestor_ns); 967 return __send_signal(sig, info, t, group, from_ancestor_ns);
926} 968}
927 969
928int print_fatal_signals;
929
930static void print_fatal_signal(struct pt_regs *regs, int signr) 970static void print_fatal_signal(struct pt_regs *regs, int signr)
931{ 971{
932 printk("%s/%d: potentially unexpected fatal signal %d.\n", 972 printk("%s/%d: potentially unexpected fatal signal %d.\n",
@@ -971,6 +1011,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
971 return send_signal(sig, info, t, 0); 1011 return send_signal(sig, info, t, 0);
972} 1012}
973 1013
1014int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
1015 bool group)
1016{
1017 unsigned long flags;
1018 int ret = -ESRCH;
1019
1020 if (lock_task_sighand(p, &flags)) {
1021 ret = send_signal(sig, info, p, group);
1022 unlock_task_sighand(p, &flags);
1023 }
1024
1025 return ret;
1026}
1027
974/* 1028/*
975 * Force a signal that the process can't ignore: if necessary 1029 * Force a signal that the process can't ignore: if necessary
976 * we unblock the signal and change any SIG_IGN to SIG_DFL. 1030 * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1008,12 +1062,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1008 return ret; 1062 return ret;
1009} 1063}
1010 1064
1011void
1012force_sig_specific(int sig, struct task_struct *t)
1013{
1014 force_sig_info(sig, SEND_SIG_FORCED, t);
1015}
1016
1017/* 1065/*
1018 * Nuke all other threads in the group. 1066 * Nuke all other threads in the group.
1019 */ 1067 */
@@ -1036,12 +1084,6 @@ void zap_other_threads(struct task_struct *p)
1036 } 1084 }
1037} 1085}
1038 1086
1039int __fatal_signal_pending(struct task_struct *tsk)
1040{
1041 return sigismember(&tsk->pending.signal, SIGKILL);
1042}
1043EXPORT_SYMBOL(__fatal_signal_pending);
1044
1045struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1087struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1046{ 1088{
1047 struct sighand_struct *sighand; 1089 struct sighand_struct *sighand;
@@ -1068,18 +1110,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1068 */ 1110 */
1069int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1111int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1070{ 1112{
1071 unsigned long flags; 1113 int ret = check_kill_permission(sig, info, p);
1072 int ret;
1073
1074 ret = check_kill_permission(sig, info, p);
1075 1114
1076 if (!ret && sig) { 1115 if (!ret && sig)
1077 ret = -ESRCH; 1116 ret = do_send_sig_info(sig, info, p, true);
1078 if (lock_task_sighand(p, &flags)) {
1079 ret = __group_send_sig_info(sig, info, p);
1080 unlock_task_sighand(p, &flags);
1081 }
1082 }
1083 1117
1084 return ret; 1118 return ret;
1085} 1119}
@@ -1156,8 +1190,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1156 goto out_unlock; 1190 goto out_unlock;
1157 } 1191 }
1158 pcred = __task_cred(p); 1192 pcred = __task_cred(p);
1159 if ((info == SEND_SIG_NOINFO || 1193 if (si_fromuser(info) &&
1160 (!is_si_special(info) && SI_FROMUSER(info))) &&
1161 euid != pcred->suid && euid != pcred->uid && 1194 euid != pcred->suid && euid != pcred->uid &&
1162 uid != pcred->suid && uid != pcred->uid) { 1195 uid != pcred->suid && uid != pcred->uid) {
1163 ret = -EPERM; 1196 ret = -EPERM;
@@ -1224,15 +1257,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1224 * These are for backward compatibility with the rest of the kernel source. 1257 * These are for backward compatibility with the rest of the kernel source.
1225 */ 1258 */
1226 1259
1227/*
1228 * The caller must ensure the task can't exit.
1229 */
1230int 1260int
1231send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1261send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1232{ 1262{
1233 int ret;
1234 unsigned long flags;
1235
1236 /* 1263 /*
1237 * Make sure legacy kernel users don't send in bad values 1264 * Make sure legacy kernel users don't send in bad values
1238 * (normal paths check this in check_kill_permission). 1265 * (normal paths check this in check_kill_permission).
@@ -1240,10 +1267,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1240 if (!valid_signal(sig)) 1267 if (!valid_signal(sig))
1241 return -EINVAL; 1268 return -EINVAL;
1242 1269
1243 spin_lock_irqsave(&p->sighand->siglock, flags); 1270 return do_send_sig_info(sig, info, p, false);
1244 ret = specific_send_sig_info(sig, info, p);
1245 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1246 return ret;
1247} 1271}
1248 1272
1249#define __si_special(priv) \ 1273#define __si_special(priv) \
@@ -1302,19 +1326,19 @@ EXPORT_SYMBOL(kill_pid);
1302 * These functions support sending signals using preallocated sigqueue 1326 * These functions support sending signals using preallocated sigqueue
1303 * structures. This is needed "because realtime applications cannot 1327 * structures. This is needed "because realtime applications cannot
1304 * afford to lose notifications of asynchronous events, like timer 1328 * afford to lose notifications of asynchronous events, like timer
1305 * expirations or I/O completions". In the case of Posix Timers 1329 * expirations or I/O completions". In the case of Posix Timers
1306 * we allocate the sigqueue structure from the timer_create. If this 1330 * we allocate the sigqueue structure from the timer_create. If this
1307 * allocation fails we are able to report the failure to the application 1331 * allocation fails we are able to report the failure to the application
1308 * with an EAGAIN error. 1332 * with an EAGAIN error.
1309 */ 1333 */
1310
1311struct sigqueue *sigqueue_alloc(void) 1334struct sigqueue *sigqueue_alloc(void)
1312{ 1335{
1313 struct sigqueue *q; 1336 struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
1314 1337
1315 if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) 1338 if (q)
1316 q->flags |= SIGQUEUE_PREALLOC; 1339 q->flags |= SIGQUEUE_PREALLOC;
1317 return(q); 1340
1341 return q;
1318} 1342}
1319 1343
1320void sigqueue_free(struct sigqueue *q) 1344void sigqueue_free(struct sigqueue *q)
@@ -1383,15 +1407,6 @@ ret:
1383} 1407}
1384 1408
1385/* 1409/*
1386 * Wake up any threads in the parent blocked in wait* syscalls.
1387 */
1388static inline void __wake_up_parent(struct task_struct *p,
1389 struct task_struct *parent)
1390{
1391 wake_up_interruptible_sync(&parent->signal->wait_chldexit);
1392}
1393
1394/*
1395 * Let a parent know about the death of a child. 1410 * Let a parent know about the death of a child.
1396 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1411 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1397 * 1412 *
@@ -1673,29 +1688,6 @@ void ptrace_notify(int exit_code)
1673 spin_unlock_irq(&current->sighand->siglock); 1688 spin_unlock_irq(&current->sighand->siglock);
1674} 1689}
1675 1690
1676static void
1677finish_stop(int stop_count)
1678{
1679 /*
1680 * If there are no other threads in the group, or if there is
1681 * a group stop in progress and we are the last to stop,
1682 * report to the parent. When ptraced, every thread reports itself.
1683 */
1684 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1685 read_lock(&tasklist_lock);
1686 do_notify_parent_cldstop(current, CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689
1690 do {
1691 schedule();
1692 } while (try_to_freeze());
1693 /*
1694 * Now we don't run again until continued.
1695 */
1696 current->exit_code = 0;
1697}
1698
1699/* 1691/*
1700 * This performs the stopping for SIGSTOP and other stop signals. 1692 * This performs the stopping for SIGSTOP and other stop signals.
1701 * We have to stop all threads in the thread group. 1693 * We have to stop all threads in the thread group.
@@ -1705,15 +1697,9 @@ finish_stop(int stop_count)
1705static int do_signal_stop(int signr) 1697static int do_signal_stop(int signr)
1706{ 1698{
1707 struct signal_struct *sig = current->signal; 1699 struct signal_struct *sig = current->signal;
1708 int stop_count; 1700 int notify;
1709 1701
1710 if (sig->group_stop_count > 0) { 1702 if (!sig->group_stop_count) {
1711 /*
1712 * There is a group stop in progress. We don't need to
1713 * start another one.
1714 */
1715 stop_count = --sig->group_stop_count;
1716 } else {
1717 struct task_struct *t; 1703 struct task_struct *t;
1718 1704
1719 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1705 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1711,7 @@ static int do_signal_stop(int signr)
1725 */ 1711 */
1726 sig->group_exit_code = signr; 1712 sig->group_exit_code = signr;
1727 1713
1728 stop_count = 0; 1714 sig->group_stop_count = 1;
1729 for (t = next_thread(current); t != current; t = next_thread(t)) 1715 for (t = next_thread(current); t != current; t = next_thread(t))
1730 /* 1716 /*
1731 * Setting state to TASK_STOPPED for a group 1717 * Setting state to TASK_STOPPED for a group
@@ -1734,19 +1720,44 @@ static int do_signal_stop(int signr)
1734 */ 1720 */
1735 if (!(t->flags & PF_EXITING) && 1721 if (!(t->flags & PF_EXITING) &&
1736 !task_is_stopped_or_traced(t)) { 1722 !task_is_stopped_or_traced(t)) {
1737 stop_count++; 1723 sig->group_stop_count++;
1738 signal_wake_up(t, 0); 1724 signal_wake_up(t, 0);
1739 } 1725 }
1740 sig->group_stop_count = stop_count;
1741 } 1726 }
1727 /*
1728 * If there are no other threads in the group, or if there is
1729 * a group stop in progress and we are the last to stop, report
1730 * to the parent. When ptraced, every thread reports itself.
1731 */
1732 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
1733 notify = tracehook_notify_jctl(notify, CLD_STOPPED);
1734 /*
1735 * tracehook_notify_jctl() can drop and reacquire siglock, so
1736 * we keep ->group_stop_count != 0 before the call. If SIGCONT
1737 * or SIGKILL comes in between, ->group_stop_count == 0.
1738 */
1739 if (sig->group_stop_count) {
1740 if (!--sig->group_stop_count)
1741 sig->flags = SIGNAL_STOP_STOPPED;
1742 current->exit_code = sig->group_exit_code;
1743 __set_current_state(TASK_STOPPED);
1744 }
1745 spin_unlock_irq(&current->sighand->siglock);
1742 1746
1743 if (stop_count == 0) 1747 if (notify) {
1744 sig->flags = SIGNAL_STOP_STOPPED; 1748 read_lock(&tasklist_lock);
1745 current->exit_code = sig->group_exit_code; 1749 do_notify_parent_cldstop(current, notify);
1746 __set_current_state(TASK_STOPPED); 1750 read_unlock(&tasklist_lock);
1751 }
1752
1753 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1754 do {
1755 schedule();
1756 } while (try_to_freeze());
1757
1758 tracehook_finish_jctl();
1759 current->exit_code = 0;
1747 1760
1748 spin_unlock_irq(&current->sighand->siglock);
1749 finish_stop(stop_count);
1750 return 1; 1761 return 1;
1751} 1762}
1752 1763
@@ -1815,24 +1826,20 @@ relock:
1815 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 1826 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1816 ? CLD_CONTINUED : CLD_STOPPED; 1827 ? CLD_CONTINUED : CLD_STOPPED;
1817 signal->flags &= ~SIGNAL_CLD_MASK; 1828 signal->flags &= ~SIGNAL_CLD_MASK;
1818 spin_unlock_irq(&sighand->siglock);
1819 1829
1820 if (unlikely(!tracehook_notify_jctl(1, why))) 1830 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1821 goto relock; 1831 spin_unlock_irq(&sighand->siglock);
1822 1832
1823 read_lock(&tasklist_lock); 1833 if (why) {
1824 do_notify_parent_cldstop(current->group_leader, why); 1834 read_lock(&tasklist_lock);
1825 read_unlock(&tasklist_lock); 1835 do_notify_parent_cldstop(current->group_leader, why);
1836 read_unlock(&tasklist_lock);
1837 }
1826 goto relock; 1838 goto relock;
1827 } 1839 }
1828 1840
1829 for (;;) { 1841 for (;;) {
1830 struct k_sigaction *ka; 1842 struct k_sigaction *ka;
1831
1832 if (unlikely(signal->group_stop_count > 0) &&
1833 do_signal_stop(0))
1834 goto relock;
1835
1836 /* 1843 /*
1837 * Tracing can induce an artificial signal and choose sigaction. 1844 * Tracing can induce an artificial signal and choose sigaction.
1838 * The return value in @signr determines the default action, 1845 * The return value in @signr determines the default action,
@@ -1844,6 +1851,10 @@ relock:
1844 if (unlikely(signr != 0)) 1851 if (unlikely(signr != 0))
1845 ka = return_ka; 1852 ka = return_ka;
1846 else { 1853 else {
1854 if (unlikely(signal->group_stop_count > 0) &&
1855 do_signal_stop(0))
1856 goto relock;
1857
1847 signr = dequeue_signal(current, &current->blocked, 1858 signr = dequeue_signal(current, &current->blocked,
1848 info); 1859 info);
1849 1860
@@ -1860,6 +1871,9 @@ relock:
1860 ka = &sighand->action[signr-1]; 1871 ka = &sighand->action[signr-1];
1861 } 1872 }
1862 1873
1874 /* Trace actually delivered signals. */
1875 trace_signal_deliver(signr, info, ka);
1876
1863 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1877 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1864 continue; 1878 continue;
1865 if (ka->sa.sa_handler != SIG_DFL) { 1879 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1987,14 +2001,14 @@ void exit_signals(struct task_struct *tsk)
1987 if (unlikely(tsk->signal->group_stop_count) && 2001 if (unlikely(tsk->signal->group_stop_count) &&
1988 !--tsk->signal->group_stop_count) { 2002 !--tsk->signal->group_stop_count) {
1989 tsk->signal->flags = SIGNAL_STOP_STOPPED; 2003 tsk->signal->flags = SIGNAL_STOP_STOPPED;
1990 group_stop = 1; 2004 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
1991 } 2005 }
1992out: 2006out:
1993 spin_unlock_irq(&tsk->sighand->siglock); 2007 spin_unlock_irq(&tsk->sighand->siglock);
1994 2008
1995 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { 2009 if (unlikely(group_stop)) {
1996 read_lock(&tasklist_lock); 2010 read_lock(&tasklist_lock);
1997 do_notify_parent_cldstop(tsk, CLD_STOPPED); 2011 do_notify_parent_cldstop(tsk, group_stop);
1998 read_unlock(&tasklist_lock); 2012 read_unlock(&tasklist_lock);
1999 } 2013 }
2000} 2014}
@@ -2290,7 +2304,6 @@ static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) 2304do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2291{ 2305{
2292 struct task_struct *p; 2306 struct task_struct *p;
2293 unsigned long flags;
2294 int error = -ESRCH; 2307 int error = -ESRCH;
2295 2308
2296 rcu_read_lock(); 2309 rcu_read_lock();
@@ -2300,14 +2313,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2300 /* 2313 /*
2301 * The null signal is a permissions and process existence 2314 * The null signal is a permissions and process existence
2302 * probe. No signal is actually delivered. 2315 * probe. No signal is actually delivered.
2303 *
2304 * If lock_task_sighand() fails we pretend the task dies
2305 * after receiving the signal. The window is tiny, and the
2306 * signal is private anyway.
2307 */ 2316 */
2308 if (!error && sig && lock_task_sighand(p, &flags)) { 2317 if (!error && sig) {
2309 error = specific_send_sig_info(sig, info, p); 2318 error = do_send_sig_info(sig, info, p, false);
2310 unlock_task_sighand(p, &flags); 2319 /*
2320 * If lock_task_sighand() failed we pretend the task
2321 * dies after receiving the signal. The window is tiny,
2322 * and the signal is private anyway.
2323 */
2324 if (unlikely(error == -ESRCH))
2325 error = 0;
2311 } 2326 }
2312 } 2327 }
2313 rcu_read_unlock(); 2328 rcu_read_unlock();
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
new file mode 100644
index 000000000000..e45c43645298
--- /dev/null
+++ b/kernel/slow-work-debugfs.c
@@ -0,0 +1,227 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
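The new slow-work-debugfs.c only defines slow_work_runqueue_fops; per the comment above it is meant to appear as /sys/kernel/debug/slow_work/runqueue, with the actual debugfs registration living elsewhere in the series. A minimal sketch of how such a file is typically wired up (the init function here is illustrative, not the committed registration code):

	#include <linux/debugfs.h>
	#include <linux/err.h>
	#include <linux/fs.h>
	#include <linux/init.h>

	extern const struct file_operations slow_work_runqueue_fops;

	static int __init slow_work_debug_init(void)
	{
		struct dentry *dir;

		dir = debugfs_create_dir("slow_work", NULL);
		if (dir && !IS_ERR(dir))
			debugfs_create_file("runqueue", S_IFREG | 0400, dir,
					    NULL, &slow_work_runqueue_fops);
		return 0;
	}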
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 09d7519557d3..7494bbf5a270 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,20 +16,17 @@
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/wait.h> 18#include <linux/wait.h>
19 19#include <linux/debugfs.h>
20#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of 20#include "slow-work.h"
21 * things to do */
22#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */
24 21
25static void slow_work_cull_timeout(unsigned long); 22static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long); 23static void slow_work_oom_timeout(unsigned long);
27 24
28#ifdef CONFIG_SYSCTL 25#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, 26static int slow_work_min_threads_sysctl(struct ctl_table *, int,
30 void __user *, size_t *, loff_t *); 27 void __user *, size_t *, loff_t *);
31 28
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, 29static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
33 void __user *, size_t *, loff_t *); 30 void __user *, size_t *, loff_t *);
34#endif 31#endif
35 32
@@ -46,13 +43,12 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
46 43
47#ifdef CONFIG_SYSCTL 44#ifdef CONFIG_SYSCTL
48static const int slow_work_min_min_threads = 2; 45static const int slow_work_min_min_threads = 2;
49static int slow_work_max_max_threads = 255; 46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
50static const int slow_work_min_vslow = 1; 47static const int slow_work_min_vslow = 1;
51static const int slow_work_max_vslow = 99; 48static const int slow_work_max_vslow = 99;
52 49
53ctl_table slow_work_sysctls[] = { 50ctl_table slow_work_sysctls[] = {
54 { 51 {
55 .ctl_name = CTL_UNNUMBERED,
56 .procname = "min-threads", 52 .procname = "min-threads",
57 .data = &slow_work_min_threads, 53 .data = &slow_work_min_threads,
58 .maxlen = sizeof(unsigned), 54 .maxlen = sizeof(unsigned),
@@ -62,7 +58,6 @@ ctl_table slow_work_sysctls[] = {
62 .extra2 = &slow_work_max_threads, 58 .extra2 = &slow_work_max_threads,
63 }, 59 },
64 { 60 {
65 .ctl_name = CTL_UNNUMBERED,
66 .procname = "max-threads", 61 .procname = "max-threads",
67 .data = &slow_work_max_threads, 62 .data = &slow_work_max_threads,
68 .maxlen = sizeof(unsigned), 63 .maxlen = sizeof(unsigned),
@@ -72,16 +67,15 @@ ctl_table slow_work_sysctls[] = {
72 .extra2 = (void *) &slow_work_max_max_threads, 67 .extra2 = (void *) &slow_work_max_max_threads,
73 }, 68 },
74 { 69 {
75 .ctl_name = CTL_UNNUMBERED,
76 .procname = "vslow-percentage", 70 .procname = "vslow-percentage",
77 .data = &vslow_work_proportion, 71 .data = &vslow_work_proportion,
78 .maxlen = sizeof(unsigned), 72 .maxlen = sizeof(unsigned),
79 .mode = 0644, 73 .mode = 0644,
80 .proc_handler = &proc_dointvec_minmax, 74 .proc_handler = proc_dointvec_minmax,
81 .extra1 = (void *) &slow_work_min_vslow, 75 .extra1 = (void *) &slow_work_min_vslow,
82 .extra2 = (void *) &slow_work_max_vslow, 76 .extra2 = (void *) &slow_work_max_vslow,
83 }, 77 },
84 { .ctl_name = 0 } 78 {}
85}; 79};
86#endif 80#endif
87 81
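The sysctl hunk above is part of the tree-wide removal of binary sysctl numbers: the .ctl_name = CTL_UNNUMBERED initialisers disappear, the handlers lose their struct file * argument, and the table now ends with an empty initialiser instead of { .ctl_name = 0 }. A representative entry in the new style (names and limits here are illustrative):

	#include <linux/sysctl.h>

	static unsigned example_threads = 4;
	static const int example_min = 1;
	static const int example_max = 255;

	static ctl_table example_sysctls[] = {
		{
			.procname	= "example-threads",
			.data		= &example_threads,
			.maxlen		= sizeof(unsigned),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= (void *) &example_min,
			.extra2		= (void *) &example_max,
		},
		{}	/* terminator: no .ctl_name needed any more */
	};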
@@ -98,6 +92,56 @@ static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98static struct slow_work slow_work_new_thread; /* new thread starter */ 92static struct slow_work slow_work_new_thread; /* new thread starter */
99 93
100/* 94/*
95 * slow work ID allocation (use slow_work_queue_lock)
96 */
97static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
98
99/*
100 * Unregistration tracking to prevent put_ref() from disappearing during module
101 * unload
102 */
103#ifdef CONFIG_MODULES
104static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
105static struct module *slow_work_unreg_module;
106static struct slow_work *slow_work_unreg_work_item;
107static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
108static DEFINE_MUTEX(slow_work_unreg_sync_lock);
109
110static void slow_work_set_thread_processing(int id, struct slow_work *work)
111{
112 if (work)
113 slow_work_thread_processing[id] = work->owner;
114}
115static void slow_work_done_thread_processing(int id, struct slow_work *work)
116{
117 struct module *module = slow_work_thread_processing[id];
118
119 slow_work_thread_processing[id] = NULL;
120 smp_mb();
121 if (slow_work_unreg_work_item == work ||
122 slow_work_unreg_module == module)
123 wake_up_all(&slow_work_unreg_wq);
124}
125static void slow_work_clear_thread_processing(int id)
126{
127 slow_work_thread_processing[id] = NULL;
128}
129#else
130static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
131static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
132static void slow_work_clear_thread_processing(int id) {}
133#endif
134
135/*
136 * Data for tracking currently executing items for indication through /proc
137 */
138#ifdef CONFIG_SLOW_WORK_DEBUG
139struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
140pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
141DEFINE_RWLOCK(slow_work_execs_lock);
142#endif
143
144/*
101 * The queues of work items and the lock governing access to them. These are 145 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues 146 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs. 147 * as the number of threads bears no relation to the number of CPUs.
@@ -105,9 +149,18 @@ static struct slow_work slow_work_new_thread; /* new thread starter */
105 * There are two queues of work items: one for slow work items, and one for 149 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items. 150 * very slow work items.
107 */ 151 */
108static LIST_HEAD(slow_work_queue); 152LIST_HEAD(slow_work_queue);
109static LIST_HEAD(vslow_work_queue); 153LIST_HEAD(vslow_work_queue);
110static DEFINE_SPINLOCK(slow_work_queue_lock); 154DEFINE_SPINLOCK(slow_work_queue_lock);
155
156/*
157 * The following are two wait queues that get pinged when a work item is placed
158 * on an empty queue. These let a work item that is hogging a thread by
159 * sleeping on something deferrable yield its thread and requeue itself
160 * when another item needs a thread.
161 */
162static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
163static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
111 164
112/* 165/*
113 * The thread controls. A variable used to signal to the threads that they 166 * The thread controls. A variable used to signal to the threads that they
@@ -126,6 +179,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited);
126static int slow_work_user_count; 179static int slow_work_user_count;
127static DEFINE_MUTEX(slow_work_user_lock); 180static DEFINE_MUTEX(slow_work_user_lock);
128 181
182static inline int slow_work_get_ref(struct slow_work *work)
183{
184 if (work->ops->get_ref)
185 return work->ops->get_ref(work);
186
187 return 0;
188}
189
190static inline void slow_work_put_ref(struct slow_work *work)
191{
192 if (work->ops->put_ref)
193 work->ops->put_ref(work);
194}
195
129/* 196/*
130 * Calculate the maximum number of active threads in the pool that are 197 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items. 198 * permitted to process very slow work items.
@@ -149,7 +216,7 @@ static unsigned slow_work_calc_vsmax(void)
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed 216 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do. 217 * it, false if there was nothing to do.
151 */ 218 */
152static bool slow_work_execute(void) 219static noinline bool slow_work_execute(int id)
153{ 220{
154 struct slow_work *work = NULL; 221 struct slow_work *work = NULL;
155 unsigned vsmax; 222 unsigned vsmax;
@@ -186,6 +253,13 @@ static bool slow_work_execute(void)
186 } else { 253 } else {
187 very_slow = false; /* avoid the compiler warning */ 254 very_slow = false; /* avoid the compiler warning */
188 } 255 }
256
257 slow_work_set_thread_processing(id, work);
258 if (work) {
259 slow_work_mark_time(work);
260 slow_work_begin_exec(id, work);
261 }
262
189 spin_unlock_irq(&slow_work_queue_lock); 263 spin_unlock_irq(&slow_work_queue_lock);
190 264
191 if (!work) 265 if (!work)
@@ -194,12 +268,19 @@ static bool slow_work_execute(void)
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) 268 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG(); 269 BUG();
196 270
197 work->ops->execute(work); 271 /* don't execute if the work is in the process of being cancelled */
272 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
273 work->ops->execute(work);
198 274
199 if (very_slow) 275 if (very_slow)
200 atomic_dec(&vslow_work_executing_count); 276 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); 277 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202 278
279 /* wake up anyone waiting for this work to be complete */
280 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
281
282 slow_work_end_exec(id, work);
283
203 /* if someone tried to enqueue the item whilst we were executing it, 284 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to 285 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously 286 * execute it simultaneously
@@ -219,7 +300,10 @@ static bool slow_work_execute(void)
219 spin_unlock_irq(&slow_work_queue_lock); 300 spin_unlock_irq(&slow_work_queue_lock);
220 } 301 }
221 302
222 work->ops->put_ref(work); 303 /* sort out the race between module unloading and put_ref() */
304 slow_work_put_ref(work);
305 slow_work_done_thread_processing(id, work);
306
223 return true; 307 return true;
224 308
225auto_requeue: 309auto_requeue:
@@ -227,15 +311,61 @@ auto_requeue:
227 * - we transfer our ref on the item back to the appropriate queue 311 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already 312 * - don't wake another thread up as we're awake already
229 */ 313 */
314 slow_work_mark_time(work);
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 315 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue); 316 list_add_tail(&work->link, &vslow_work_queue);
232 else 317 else
233 list_add_tail(&work->link, &slow_work_queue); 318 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock); 319 spin_unlock_irq(&slow_work_queue_lock);
320 slow_work_clear_thread_processing(id);
235 return true; 321 return true;
236} 322}
237 323
238/** 324/**
325 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
326 * @work: The work item under execution that wants to sleep
327 * @_timeout: Scheduler sleep timeout
328 *
329 * Allow a requeueable work item to sleep on a slow-work processor thread until
330 * that thread is needed to do some other work or the sleep is interrupted by
331 * some other event.
332 *
333 * The caller must set up a wake up event before calling this and must have set
334 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
335 * condition before calling this function as no test is made here.
336 *
337 * False is returned if there is nothing on the queue; true is returned if the
338 * work item should be requeued
339 */
340bool slow_work_sleep_till_thread_needed(struct slow_work *work,
341 signed long *_timeout)
342{
343 wait_queue_head_t *wfo_wq;
344 struct list_head *queue;
345
346 DEFINE_WAIT(wait);
347
348 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
349 wfo_wq = &vslow_work_queue_waits_for_occupation;
350 queue = &vslow_work_queue;
351 } else {
352 wfo_wq = &slow_work_queue_waits_for_occupation;
353 queue = &slow_work_queue;
354 }
355
356 if (!list_empty(queue))
357 return true;
358
359 add_wait_queue_exclusive(wfo_wq, &wait);
360 if (list_empty(queue))
361 *_timeout = schedule_timeout(*_timeout);
362 finish_wait(wfo_wq, &wait);
363
364 return !list_empty(queue);
365}
366EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
367
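As the kernel-doc above requires, a caller of slow_work_sleep_till_thread_needed() must already have armed its own wake-up source, set its sleep state and tested its condition; the helper merely adds "another item is waiting for a thread" as an extra reason to wake up and return true so the caller can requeue itself. A hedged sketch of such a caller (the wait queue and condition are hypothetical):

	#include <linux/sched.h>
	#include <linux/slow-work.h>
	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(my_event_wq);
	static bool my_event_ready;

	/* returns true if @work should give up its thread and requeue itself */
	static bool wait_for_event_or_yield(struct slow_work *work)
	{
		signed long timeout = HZ;
		bool requeue = false;
		DEFINE_WAIT(wait);

		prepare_to_wait(&my_event_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (!my_event_ready)
			requeue = slow_work_sleep_till_thread_needed(work, &timeout);
		finish_wait(&my_event_wq, &wait);
		return requeue;
	}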
368/**
239 * slow_work_enqueue - Schedule a slow work item for processing 369 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue 370 * @work: The work item to queue
241 * 371 *
@@ -260,16 +390,22 @@ auto_requeue:
260 * allowed to pick items to execute. This ensures that very slow items won't 390 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow. 391 * overly block ones that are just ordinarily slow.
262 * 392 *
263 * Returns 0 if successful, -EAGAIN if not. 393 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
394 * attempted to be queued)
264 */ 395 */
265int slow_work_enqueue(struct slow_work *work) 396int slow_work_enqueue(struct slow_work *work)
266{ 397{
398 wait_queue_head_t *wfo_wq;
399 struct list_head *queue;
267 unsigned long flags; 400 unsigned long flags;
401 int ret;
402
403 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
404 return -ECANCELED;
268 405
269 BUG_ON(slow_work_user_count <= 0); 406 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work); 407 BUG_ON(!work);
271 BUG_ON(!work->ops); 408 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref);
273 409
274 /* when honouring an enqueue request, we only promise that we will run 410 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once 411 * the work function in the future; we do not promise to run it once
@@ -280,8 +416,19 @@ int slow_work_enqueue(struct slow_work *work)
280 * maintaining our promise 416 * maintaining our promise
281 */ 417 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { 418 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
419 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
420 wfo_wq = &vslow_work_queue_waits_for_occupation;
421 queue = &vslow_work_queue;
422 } else {
423 wfo_wq = &slow_work_queue_waits_for_occupation;
424 queue = &slow_work_queue;
425 }
426
283 spin_lock_irqsave(&slow_work_queue_lock, flags); 427 spin_lock_irqsave(&slow_work_queue_lock, flags);
284 428
429 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
430 goto cancelled;
431
285 /* we promise that we will not attempt to execute the work 432 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously 433 * function in more than one thread simultaneously
287 * 434 *
@@ -299,25 +446,221 @@ int slow_work_enqueue(struct slow_work *work)
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { 446 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); 447 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else { 448 } else {
302 if (work->ops->get_ref(work) < 0) 449 ret = slow_work_get_ref(work);
303 goto cant_get_ref; 450 if (ret < 0)
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 451 goto failed;
305 list_add_tail(&work->link, &vslow_work_queue); 452 slow_work_mark_time(work);
306 else 453 list_add_tail(&work->link, queue);
307 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq); 454 wake_up(&slow_work_thread_wq);
455
456 /* if someone who could be requeued is sleeping on a
457 * thread, then ask them to yield their thread */
458 if (work->link.prev == queue)
459 wake_up(wfo_wq);
309 } 460 }
310 461
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 462 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 } 463 }
313 return 0; 464 return 0;
314 465
315cant_get_ref: 466cancelled:
467 ret = -ECANCELED;
468failed:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 469 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN; 470 return ret;
318} 471}
319EXPORT_SYMBOL(slow_work_enqueue); 472EXPORT_SYMBOL(slow_work_enqueue);
320 473
474static int slow_work_wait(void *word)
475{
476 schedule();
477 return 0;
478}
479
480/**
481 * slow_work_cancel - Cancel a slow work item
482 * @work: The work item to cancel
483 *
484 * This function will cancel a previously enqueued work item. If we cannot
485 * cancel the work item, it is guaranteed to have run when this function
486 * returns.
487 */
488void slow_work_cancel(struct slow_work *work)
489{
490 bool wait = true, put = false;
491
492 set_bit(SLOW_WORK_CANCELLING, &work->flags);
493 smp_mb();
494
495 /* if the work item is a delayed work item with an active timer, we
496 * need to wait for the timer to finish _before_ getting the spinlock,
497 * lest we deadlock against the timer routine
498 *
499 * the timer routine will leave DELAYED set if it notices the
500 * CANCELLING flag in time
501 */
502 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
503 struct delayed_slow_work *dwork =
504 container_of(work, struct delayed_slow_work, work);
505 del_timer_sync(&dwork->timer);
506 }
507
508 spin_lock_irq(&slow_work_queue_lock);
509
510 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
511 /* the timer routine aborted or never happened, so we are left
512 * holding the timer's reference on the item and should just
513 * drop the pending flag and wait for any ongoing execution to
514 * finish */
515 struct delayed_slow_work *dwork =
516 container_of(work, struct delayed_slow_work, work);
517
518 BUG_ON(timer_pending(&dwork->timer));
519 BUG_ON(!list_empty(&work->link));
520
521 clear_bit(SLOW_WORK_DELAYED, &work->flags);
522 put = true;
523 clear_bit(SLOW_WORK_PENDING, &work->flags);
524
525 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
526 !list_empty(&work->link)) {
527 /* the link in the pending queue holds a reference on the item
528 * that we will need to release */
529 list_del_init(&work->link);
530 wait = false;
531 put = true;
532 clear_bit(SLOW_WORK_PENDING, &work->flags);
533
534 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
535 /* the executor is holding our only reference on the item, so
536 * we merely need to wait for it to finish executing */
537 clear_bit(SLOW_WORK_PENDING, &work->flags);
538 }
539
540 spin_unlock_irq(&slow_work_queue_lock);
541
 542	/* the EXECUTING flag is set by the executor whilst the spinlock is held
543 * and before the item is dequeued - so assuming the above doesn't
544 * actually dequeue it, simply waiting for the EXECUTING flag to be
545 * released here should be sufficient */
546 if (wait)
547 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
548 TASK_UNINTERRUPTIBLE);
549
550 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
551 if (put)
552 slow_work_put_ref(work);
553}
554EXPORT_SYMBOL(slow_work_cancel);
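
As a brief usage sketch (an editorial illustration, not part of this patch): a user embeds a struct slow_work in its own object, initialises it against an ops table, queues it, and may later cancel it; on return from slow_work_cancel() the item has either never run or has finished running. The my_object/my_execute names are hypothetical, slow_work_init() is assumed to keep its existing prototype from <linux/slow-work.h>, and get_ref/put_ref are assumed to have become optional now that the BUG_ON(!work->ops->get_ref) above has been dropped in favour of slow_work_get_ref().

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slow-work.h>

struct my_object {
	struct slow_work work;		/* hypothetical user of the facility */
	int payload;
};

static void my_execute(struct slow_work *work)
{
	struct my_object *obj = container_of(work, struct my_object, work);

	/* long-running, sleepable processing happens here */
	pr_info("processing payload %d\n", obj->payload);
}

static const struct slow_work_ops my_slow_work_ops = {
	.owner		= THIS_MODULE,	/* new field introduced by this patch */
	.execute	= my_execute,
};

static void my_queue_then_cancel(struct my_object *obj)
{
	slow_work_init(&obj->work, &my_slow_work_ops);

	if (slow_work_enqueue(&obj->work) < 0)
		return;			/* -ECANCELED or -EAGAIN */

	/* some time later: afterwards the item is not running and will not run */
	slow_work_cancel(&obj->work);
}
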
555
556/*
557 * Handle expiry of the delay timer, indicating that a delayed slow work item
558 * should now be queued if not cancelled
559 */
560static void delayed_slow_work_timer(unsigned long data)
561{
562 wait_queue_head_t *wfo_wq;
563 struct list_head *queue;
564 struct slow_work *work = (struct slow_work *) data;
565 unsigned long flags;
566 bool queued = false, put = false, first = false;
567
568 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
569 wfo_wq = &vslow_work_queue_waits_for_occupation;
570 queue = &vslow_work_queue;
571 } else {
572 wfo_wq = &slow_work_queue_waits_for_occupation;
573 queue = &slow_work_queue;
574 }
575
576 spin_lock_irqsave(&slow_work_queue_lock, flags);
577 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
578 clear_bit(SLOW_WORK_DELAYED, &work->flags);
579
580 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
581 /* we discard the reference the timer was holding in
582 * favour of the one the executor holds */
583 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
584 put = true;
585 } else {
586 slow_work_mark_time(work);
587 list_add_tail(&work->link, queue);
588 queued = true;
589 if (work->link.prev == queue)
590 first = true;
591 }
592 }
593
594 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
595 if (put)
596 slow_work_put_ref(work);
597 if (first)
598 wake_up(wfo_wq);
599 if (queued)
600 wake_up(&slow_work_thread_wq);
601}
602
603/**
604 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
605 * @dwork: The delayed work item to queue
606 * @delay: When to start executing the work, in jiffies from now
607 *
608 * This is similar to slow_work_enqueue(), but it adds a delay before the work
609 * is actually queued for processing.
610 *
611 * The item can have delayed processing requested on it whilst it is being
612 * executed. The delay will begin immediately, and if it expires before the
613 * item finishes executing, the item will be placed back on the queue when it
614 * has done executing.
615 */
616int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
617 unsigned long delay)
618{
619 struct slow_work *work = &dwork->work;
620 unsigned long flags;
621 int ret;
622
623 if (delay == 0)
624 return slow_work_enqueue(&dwork->work);
625
626 BUG_ON(slow_work_user_count <= 0);
627 BUG_ON(!work);
628 BUG_ON(!work->ops);
629
630 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
631 return -ECANCELED;
632
633 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
634 spin_lock_irqsave(&slow_work_queue_lock, flags);
635
636 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
637 goto cancelled;
638
639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work);
641 if (ret < 0)
642 goto cant_get_ref;
643
644 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
645 BUG();
646 dwork->timer.expires = jiffies + delay;
647 dwork->timer.data = (unsigned long) work;
648 dwork->timer.function = delayed_slow_work_timer;
649 add_timer(&dwork->timer);
650
651 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
652 }
653
654 return 0;
655
656cancelled:
657 ret = -ECANCELED;
658cant_get_ref:
659 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
660 return ret;
661}
662EXPORT_SYMBOL(delayed_slow_work_enqueue);
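
A matching sketch for the delayed path (again illustrative): delayed_slow_work_init() is assumed to pair with delayed_slow_work_enqueue() in <linux/slow-work.h>, and my_slow_work_ops is the hypothetical ops table from the earlier sketch.

static struct delayed_slow_work my_dwork;

static int my_schedule_in_a_second(void)
{
	delayed_slow_work_init(&my_dwork, &my_slow_work_ops);

	/* the timer holds the item's reference until it expires or is cancelled */
	return delayed_slow_work_enqueue(&my_dwork, HZ);
}
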
663
321/* 664/*
322 * Schedule a cull of the thread pool at some time in the near future 665 * Schedule a cull of the thread pool at some time in the near future
323 */ 666 */
@@ -368,13 +711,23 @@ static inline bool slow_work_available(int vsmax)
368 */ 711 */
369static int slow_work_thread(void *_data) 712static int slow_work_thread(void *_data)
370{ 713{
371 int vsmax; 714 int vsmax, id;
372 715
373 DEFINE_WAIT(wait); 716 DEFINE_WAIT(wait);
374 717
375 set_freezable(); 718 set_freezable();
376 set_user_nice(current, -5); 719 set_user_nice(current, -5);
377 720
721 /* allocate ourselves an ID */
722 spin_lock_irq(&slow_work_queue_lock);
723 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
724 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
725 __set_bit(id, slow_work_ids);
726 slow_work_set_thread_pid(id, current->pid);
727 spin_unlock_irq(&slow_work_queue_lock);
728
729 sprintf(current->comm, "kslowd%03u", id);
730
378 for (;;) { 731 for (;;) {
379 vsmax = vslow_work_proportion; 732 vsmax = vslow_work_proportion;
380 vsmax *= atomic_read(&slow_work_thread_count); 733 vsmax *= atomic_read(&slow_work_thread_count);
@@ -395,7 +748,7 @@ static int slow_work_thread(void *_data)
395 vsmax *= atomic_read(&slow_work_thread_count); 748 vsmax *= atomic_read(&slow_work_thread_count);
396 vsmax /= 100; 749 vsmax /= 100;
397 750
398 if (slow_work_available(vsmax) && slow_work_execute()) { 751 if (slow_work_available(vsmax) && slow_work_execute(id)) {
399 cond_resched(); 752 cond_resched();
400 if (list_empty(&slow_work_queue) && 753 if (list_empty(&slow_work_queue) &&
401 list_empty(&vslow_work_queue) && 754 list_empty(&vslow_work_queue) &&
@@ -412,6 +765,11 @@ static int slow_work_thread(void *_data)
412 break; 765 break;
413 } 766 }
414 767
768 spin_lock_irq(&slow_work_queue_lock);
769 slow_work_set_thread_pid(id, 0);
770 __clear_bit(id, slow_work_ids);
771 spin_unlock_irq(&slow_work_queue_lock);
772
415 if (atomic_dec_and_test(&slow_work_thread_count)) 773 if (atomic_dec_and_test(&slow_work_thread_count))
416 complete_and_exit(&slow_work_last_thread_exited, 0); 774 complete_and_exit(&slow_work_last_thread_exited, 0);
417 return 0; 775 return 0;
@@ -427,21 +785,6 @@ static void slow_work_cull_timeout(unsigned long data)
427} 785}
428 786
429/* 787/*
430 * Get a reference on slow work thread starter
431 */
432static int slow_work_new_thread_get_ref(struct slow_work *work)
433{
434 return 0;
435}
436
437/*
438 * Drop a reference on slow work thread starter
439 */
440static void slow_work_new_thread_put_ref(struct slow_work *work)
441{
442}
443
444/*
445 * Start a new slow work thread 788 * Start a new slow work thread
446 */ 789 */
447static void slow_work_new_thread_execute(struct slow_work *work) 790static void slow_work_new_thread_execute(struct slow_work *work)
@@ -475,9 +818,11 @@ static void slow_work_new_thread_execute(struct slow_work *work)
475} 818}
476 819
477static const struct slow_work_ops slow_work_new_thread_ops = { 820static const struct slow_work_ops slow_work_new_thread_ops = {
478 .get_ref = slow_work_new_thread_get_ref, 821 .owner = THIS_MODULE,
479 .put_ref = slow_work_new_thread_put_ref,
480 .execute = slow_work_new_thread_execute, 822 .execute = slow_work_new_thread_execute,
823#ifdef CONFIG_SLOW_WORK_DEBUG
824 .desc = slow_work_new_thread_desc,
825#endif
481}; 826};
482 827
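
Where CONFIG_SLOW_WORK_DEBUG is enabled, an ops table can also supply a ->desc() callback with the signature declared in slow-work.h further below. A hypothetical implementation for the my_object sketch above might look like this (seq_printf() semantics assumed):

#ifdef CONFIG_SLOW_WORK_DEBUG
#include <linux/seq_file.h>

/* one-line description of the item for the debugfs "runqueue" listing */
static void my_slow_work_desc(struct slow_work *work, struct seq_file *m)
{
	struct my_object *obj = container_of(work, struct my_object, work);

	seq_printf(m, "my_object payload=%d", obj->payload);
}
#endif
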
483/* 828/*
@@ -493,10 +838,10 @@ static void slow_work_oom_timeout(unsigned long data)
493 * Handle adjustment of the minimum number of threads 838 * Handle adjustment of the minimum number of threads
494 */ 839 */
495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, 840static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
496 struct file *filp, void __user *buffer, 841 void __user *buffer,
497 size_t *lenp, loff_t *ppos) 842 size_t *lenp, loff_t *ppos)
498{ 843{
499 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 844 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
500 int n; 845 int n;
501 846
502 if (ret == 0) { 847 if (ret == 0) {
@@ -521,10 +866,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
521 * Handle adjustment of the maximum number of threads 866 * Handle adjustment of the maximum number of threads
522 */ 867 */
523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, 868static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
524 struct file *filp, void __user *buffer, 869 void __user *buffer,
525 size_t *lenp, loff_t *ppos) 870 size_t *lenp, loff_t *ppos)
526{ 871{
527 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 872 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 int n; 873 int n;
529 874
530 if (ret == 0) { 875 if (ret == 0) {
@@ -546,12 +891,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
546 891
547/** 892/**
548 * slow_work_register_user - Register a user of the facility 893 * slow_work_register_user - Register a user of the facility
894 * @module: The module about to make use of the facility
549 * 895 *
550 * Register a user of the facility, starting up the initial threads if there 896 * Register a user of the facility, starting up the initial threads if there
551 * aren't any other users at this point. This will return 0 if successful, or 897 * aren't any other users at this point. This will return 0 if successful, or
552 * an error if not. 898 * an error if not.
553 */ 899 */
554int slow_work_register_user(void) 900int slow_work_register_user(struct module *module)
555{ 901{
556 struct task_struct *p; 902 struct task_struct *p;
557 int loop; 903 int loop;
@@ -598,14 +944,81 @@ error:
598} 944}
599EXPORT_SYMBOL(slow_work_register_user); 945EXPORT_SYMBOL(slow_work_register_user);
600 946
947/*
948 * wait for all outstanding items from the calling module to complete
949 * - note that more items may be queued whilst we're waiting
950 */
951static void slow_work_wait_for_items(struct module *module)
952{
953#ifdef CONFIG_MODULES
954 DECLARE_WAITQUEUE(myself, current);
955 struct slow_work *work;
956 int loop;
957
958 mutex_lock(&slow_work_unreg_sync_lock);
959 add_wait_queue(&slow_work_unreg_wq, &myself);
960
961 for (;;) {
962 spin_lock_irq(&slow_work_queue_lock);
963
964 /* first of all, we wait for the last queued item in each list
965 * to be processed */
966 list_for_each_entry_reverse(work, &vslow_work_queue, link) {
967 if (work->owner == module) {
968 set_current_state(TASK_UNINTERRUPTIBLE);
969 slow_work_unreg_work_item = work;
970 goto do_wait;
971 }
972 }
973 list_for_each_entry_reverse(work, &slow_work_queue, link) {
974 if (work->owner == module) {
975 set_current_state(TASK_UNINTERRUPTIBLE);
976 slow_work_unreg_work_item = work;
977 goto do_wait;
978 }
979 }
980
981 /* then we wait for the items being processed to finish */
982 slow_work_unreg_module = module;
983 smp_mb();
984 for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
985 if (slow_work_thread_processing[loop] == module)
986 goto do_wait;
987 }
988 spin_unlock_irq(&slow_work_queue_lock);
989 break; /* okay, we're done */
990
991 do_wait:
992 spin_unlock_irq(&slow_work_queue_lock);
993 schedule();
994 slow_work_unreg_work_item = NULL;
995 slow_work_unreg_module = NULL;
996 }
997
998 remove_wait_queue(&slow_work_unreg_wq, &myself);
999 mutex_unlock(&slow_work_unreg_sync_lock);
1000#endif /* CONFIG_MODULES */
1001}
1002
601/** 1003/**
602 * slow_work_unregister_user - Unregister a user of the facility 1004 * slow_work_unregister_user - Unregister a user of the facility
1005 * @module: The module whose items should be cleared
603 * 1006 *
604 * Unregister a user of the facility, killing all the threads if this was the 1007 * Unregister a user of the facility, killing all the threads if this was the
605 * last one. 1008 * last one.
1009 *
1010 * This waits for all the work items belonging to the nominated module to go
1011 * away before proceeding.
606 */ 1012 */
607void slow_work_unregister_user(void) 1013void slow_work_unregister_user(struct module *module)
608{ 1014{
1015 /* first of all, wait for all outstanding items from the calling module
1016 * to complete */
1017 if (module)
1018 slow_work_wait_for_items(module);
1019
1020 /* then we can actually go about shutting down the facility if need
1021 * be */
609 mutex_lock(&slow_work_user_lock); 1022 mutex_lock(&slow_work_user_lock);
610 1023
611 BUG_ON(slow_work_user_count <= 0); 1024 BUG_ON(slow_work_user_count <= 0);
@@ -639,6 +1052,16 @@ static int __init init_slow_work(void)
639 if (slow_work_max_max_threads < nr_cpus * 2) 1052 if (slow_work_max_max_threads < nr_cpus * 2)
640 slow_work_max_max_threads = nr_cpus * 2; 1053 slow_work_max_max_threads = nr_cpus * 2;
641#endif 1054#endif
1055#ifdef CONFIG_SLOW_WORK_DEBUG
1056 {
1057 struct dentry *dbdir;
1058
1059 dbdir = debugfs_create_dir("slow_work", NULL);
1060 if (dbdir && !IS_ERR(dbdir))
1061 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1062 NULL, &slow_work_runqueue_fops);
1063 }
1064#endif
642 return 0; 1065 return 0;
643} 1066}
644 1067
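
With the module argument added to registration, a module-based user of the facility would now do something along these lines (names hypothetical; prototypes taken from the definitions above):

static int __init my_module_init(void)
{
	return slow_work_register_user(THIS_MODULE);
}

static void __exit my_module_exit(void)
{
	/* blocks until every queued or executing item owned by this module is done */
	slow_work_unregister_user(THIS_MODULE);
}

module_init(my_module_init);
module_exit(my_module_exit);
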
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
new file mode 100644
index 000000000000..321f3c59d732
--- /dev/null
+++ b/kernel/slow-work.h
@@ -0,0 +1,72 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_PROC
47 slow_work_pids[id] = pid;
48#endif
49}
50
51static inline void slow_work_mark_time(struct slow_work *work)
52{
53#ifdef CONFIG_SLOW_WORK_PROC
54 work->mark = CURRENT_TIME;
55#endif
56}
57
58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{
60#ifdef CONFIG_SLOW_WORK_PROC
61 slow_work_execs[id] = work;
62#endif
63}
64
65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{
67#ifdef CONFIG_SLOW_WORK_PROC
68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock);
71#endif
72}
diff --git a/kernel/smp.c b/kernel/smp.c
index fd47a256a24e..de735a6637d0 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,11 +16,11 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16 16
17static struct { 17static struct {
18 struct list_head queue; 18 struct list_head queue;
19 spinlock_t lock; 19 raw_spinlock_t lock;
20} call_function __cacheline_aligned_in_smp = 20} call_function __cacheline_aligned_in_smp =
21 { 21 {
22 .queue = LIST_HEAD_INIT(call_function.queue), 22 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), 23 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
24 }; 24 };
25 25
26enum { 26enum {
@@ -35,7 +35,7 @@ struct call_function_data {
35 35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
@@ -80,7 +80,7 @@ static int __cpuinit init_call_single_data(void)
80 for_each_possible_cpu(i) { 80 for_each_possible_cpu(i) {
81 struct call_single_queue *q = &per_cpu(call_single_queue, i); 81 struct call_single_queue *q = &per_cpu(call_single_queue, i);
82 82
83 spin_lock_init(&q->lock); 83 raw_spin_lock_init(&q->lock);
84 INIT_LIST_HEAD(&q->list); 84 INIT_LIST_HEAD(&q->list);
85 } 85 }
86 86
@@ -141,10 +141,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
141 unsigned long flags; 141 unsigned long flags;
142 int ipi; 142 int ipi;
143 143
144 spin_lock_irqsave(&dst->lock, flags); 144 raw_spin_lock_irqsave(&dst->lock, flags);
145 ipi = list_empty(&dst->list); 145 ipi = list_empty(&dst->list);
146 list_add_tail(&data->list, &dst->list); 146 list_add_tail(&data->list, &dst->list);
147 spin_unlock_irqrestore(&dst->lock, flags); 147 raw_spin_unlock_irqrestore(&dst->lock, flags);
148 148
149 /* 149 /*
150 * The list addition should be visible before sending the IPI 150 * The list addition should be visible before sending the IPI
@@ -171,7 +171,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171void generic_smp_call_function_interrupt(void) 171void generic_smp_call_function_interrupt(void)
172{ 172{
173 struct call_function_data *data; 173 struct call_function_data *data;
174 int cpu = get_cpu(); 174 int cpu = smp_processor_id();
175 175
176 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online. 177 * Shouldn't receive this interrupt on a cpu that is not yet online.
@@ -201,9 +201,9 @@ void generic_smp_call_function_interrupt(void)
201 refs = atomic_dec_return(&data->refs); 201 refs = atomic_dec_return(&data->refs);
202 WARN_ON(refs < 0); 202 WARN_ON(refs < 0);
203 if (!refs) { 203 if (!refs) {
204 spin_lock(&call_function.lock); 204 raw_spin_lock(&call_function.lock);
205 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
206 spin_unlock(&call_function.lock); 206 raw_spin_unlock(&call_function.lock);
207 } 207 }
208 208
209 if (refs) 209 if (refs)
@@ -212,7 +212,6 @@ void generic_smp_call_function_interrupt(void)
212 csd_unlock(&data->csd); 212 csd_unlock(&data->csd);
213 } 213 }
214 214
215 put_cpu();
216} 215}
217 216
218/* 217/*
@@ -230,9 +229,9 @@ void generic_smp_call_function_single_interrupt(void)
230 */ 229 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 230 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232 231
233 spin_lock(&q->lock); 232 raw_spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 233 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 234 raw_spin_unlock(&q->lock);
236 235
237 while (!list_empty(&list)) { 236 while (!list_empty(&list)) {
238 struct call_single_data *data; 237 struct call_single_data *data;
@@ -265,9 +264,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
265 * @info: An arbitrary pointer to pass to the function. 264 * @info: An arbitrary pointer to pass to the function.
266 * @wait: If true, wait until function has completed on other CPUs. 265 * @wait: If true, wait until function has completed on other CPUs.
267 * 266 *
268 * Returns 0 on success, else a negative status code. Note that @wait 267 * Returns 0 on success, else a negative status code.
269 * will be implicitly turned on in case of allocation failures, since
270 * we fall back to on-stack allocation.
271 */ 268 */
272int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 269int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
273 int wait) 270 int wait)
@@ -321,6 +318,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
321} 318}
322EXPORT_SYMBOL(smp_call_function_single); 319EXPORT_SYMBOL(smp_call_function_single);
323 320
321/*
322 * smp_call_function_any - Run a function on any of the given cpus
323 * @mask: The mask of cpus it can run on.
324 * @func: The function to run. This must be fast and non-blocking.
325 * @info: An arbitrary pointer to pass to the function.
326 * @wait: If true, wait until function has completed.
327 *
328 * Returns 0 on success, else a negative status code (if no cpus were online).
329 * Note that @wait will be implicitly turned on in case of allocation failures,
330 * since we fall back to on-stack allocation.
331 *
332 * Selection preference:
333 * 1) current cpu if in @mask
334 * 2) any cpu of current node if in @mask
335 * 3) any other online cpu in @mask
336 */
337int smp_call_function_any(const struct cpumask *mask,
338 void (*func)(void *info), void *info, int wait)
339{
340 unsigned int cpu;
341 const struct cpumask *nodemask;
342 int ret;
343
344 /* Try for same CPU (cheapest) */
345 cpu = get_cpu();
346 if (cpumask_test_cpu(cpu, mask))
347 goto call;
348
349 /* Try for same node. */
350 nodemask = cpumask_of_node(cpu);
351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
352 cpu = cpumask_next_and(cpu, nodemask, mask)) {
353 if (cpu_online(cpu))
354 goto call;
355 }
356
357 /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
358 cpu = cpumask_any_and(mask, cpu_online_mask);
359call:
360 ret = smp_call_function_single(cpu, func, info, wait);
361 put_cpu();
362 return ret;
363}
364EXPORT_SYMBOL_GPL(smp_call_function_any);
365
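
A small usage sketch for the new helper (illustrative names, not from this patch): the callback runs in IPI context on whichever CPU in the mask is cheapest to reach, preferring the current CPU, then its node, then any other online CPU in the mask.

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

static void my_note_cpu(void *info)
{
	unsigned int *ran_on = info;

	*ran_on = smp_processor_id();	/* IPI context: must not sleep */
}

static int my_probe_mask(const struct cpumask *mask)
{
	unsigned int ran_on = UINT_MAX;
	int ret;

	/* wait=1: return only once my_note_cpu() has completed remotely */
	ret = smp_call_function_any(mask, my_note_cpu, &ran_on, 1);
	if (ret == 0)
		pr_info("callback ran on cpu %u\n", ran_on);
	return ret;
}
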
324/** 366/**
325 * __smp_call_function_single(): Run a function on another CPU 367 * __smp_call_function_single(): Run a function on another CPU
326 * @cpu: The CPU to run on. 368 * @cpu: The CPU to run on.
@@ -347,13 +389,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
347 generic_exec_single(cpu, data, wait); 389 generic_exec_single(cpu, data, wait);
348} 390}
349 391
350/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
351
352#ifndef arch_send_call_function_ipi_mask
353# define arch_send_call_function_ipi_mask(maskp) \
354 arch_send_call_function_ipi(*(maskp))
355#endif
356
357/** 392/**
358 * smp_call_function_many(): Run a function on a set of other CPUs. 393 * smp_call_function_many(): Run a function on a set of other CPUs.
359 * @mask: The set of cpus to run on (only runs on online subset). 394 * @mask: The set of cpus to run on (only runs on online subset).
@@ -362,9 +397,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
362 * @wait: If true, wait (atomically) until function has completed 397 * @wait: If true, wait (atomically) until function has completed
363 * on other CPUs. 398 * on other CPUs.
364 * 399 *
365 * If @wait is true, then returns once @func has returned. Note that @wait 400 * If @wait is true, then returns once @func has returned.
366 * will be implicitly turned on in case of allocation failures, since
367 * we fall back to on-stack allocation.
368 * 401 *
369 * You must not call this function with disabled interrupts or from a 402 * You must not call this function with disabled interrupts or from a
370 * hardware interrupt handler or from a bottom half handler. Preemption 403 * hardware interrupt handler or from a bottom half handler. Preemption
@@ -415,14 +448,14 @@ void smp_call_function_many(const struct cpumask *mask,
415 cpumask_clear_cpu(this_cpu, data->cpumask); 448 cpumask_clear_cpu(this_cpu, data->cpumask);
416 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 449 atomic_set(&data->refs, cpumask_weight(data->cpumask));
417 450
418 spin_lock_irqsave(&call_function.lock, flags); 451 raw_spin_lock_irqsave(&call_function.lock, flags);
419 /* 452 /*
420 * Place entry at the _HEAD_ of the list, so that any cpu still 453 * Place entry at the _HEAD_ of the list, so that any cpu still
421 * observing the entry in generic_smp_call_function_interrupt() 454 * observing the entry in generic_smp_call_function_interrupt()
422 * will not miss any other list entries: 455 * will not miss any other list entries:
423 */ 456 */
424 list_add_rcu(&data->csd.list, &call_function.queue); 457 list_add_rcu(&data->csd.list, &call_function.queue);
425 spin_unlock_irqrestore(&call_function.lock, flags); 458 raw_spin_unlock_irqrestore(&call_function.lock, flags);
426 459
427 /* 460 /*
428 * Make the list addition visible before sending the ipi. 461 * Make the list addition visible before sending the ipi.
@@ -450,8 +483,7 @@ EXPORT_SYMBOL(smp_call_function_many);
450 * Returns 0. 483 * Returns 0.
451 * 484 *
452 * If @wait is true, then returns once @func has returned; otherwise 485 * If @wait is true, then returns once @func has returned; otherwise
453 * it returns just before the target cpu calls @func. In case of allocation 486 * it returns just before the target cpu calls @func.
454 * failure, @wait will be implicitly turned on.
455 * 487 *
456 * You must not call this function with disabled interrupts or from a 488 * You must not call this function with disabled interrupts or from a
457 * hardware interrupt handler or from a bottom half handler. 489 * hardware interrupt handler or from a bottom half handler.
@@ -468,20 +500,20 @@ EXPORT_SYMBOL(smp_call_function);
468 500
469void ipi_call_lock(void) 501void ipi_call_lock(void)
470{ 502{
471 spin_lock(&call_function.lock); 503 raw_spin_lock(&call_function.lock);
472} 504}
473 505
474void ipi_call_unlock(void) 506void ipi_call_unlock(void)
475{ 507{
476 spin_unlock(&call_function.lock); 508 raw_spin_unlock(&call_function.lock);
477} 509}
478 510
479void ipi_call_lock_irq(void) 511void ipi_call_lock_irq(void)
480{ 512{
481 spin_lock_irq(&call_function.lock); 513 raw_spin_lock_irq(&call_function.lock);
482} 514}
483 515
484void ipi_call_unlock_irq(void) 516void ipi_call_unlock_irq(void)
485{ 517{
486 spin_unlock_irq(&call_function.lock); 518 raw_spin_unlock_irq(&call_function.lock);
487} 519}
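
The conversion above moves the call-function queues to raw_spinlock_t, which always behaves as a true spinning lock. A minimal sketch of the same pattern for a caller-owned structure (names illustrative):

#include <linux/spinlock.h>
#include <linux/list.h>

struct my_queue {
	struct list_head	list;
	raw_spinlock_t		lock;	/* may be taken from IPI context */
};

static void my_queue_init(struct my_queue *q)
{
	INIT_LIST_HEAD(&q->list);
	raw_spin_lock_init(&q->lock);
}

static void my_queue_add(struct my_queue *q, struct list_head *entry)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&q->lock, flags);
	list_add_tail(entry, &q->list);
	raw_spin_unlock_irqrestore(&q->lock, flags);
}
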
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5216e0..a09502e2ef75 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -302,9 +302,9 @@ void irq_exit(void)
302 if (!in_interrupt() && local_softirq_pending()) 302 if (!in_interrupt() && local_softirq_pending())
303 invoke_softirq(); 303 invoke_softirq();
304 304
305 rcu_irq_exit();
305#ifdef CONFIG_NO_HZ 306#ifdef CONFIG_NO_HZ
306 /* Make sure that timer wheel updates are propagated */ 307 /* Make sure that timer wheel updates are propagated */
307 rcu_irq_exit();
308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
309 tick_nohz_stop_sched_tick(0); 309 tick_nohz_stop_sched_tick(0);
310#endif 310#endif
@@ -697,7 +697,7 @@ void __init softirq_init(void)
697 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 697 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
698} 698}
699 699
700static int ksoftirqd(void * __bind_cpu) 700static int run_ksoftirqd(void * __bind_cpu)
701{ 701{
702 set_current_state(TASK_INTERRUPTIBLE); 702 set_current_state(TASK_INTERRUPTIBLE);
703 703
@@ -810,7 +810,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
810 switch (action) { 810 switch (action) {
811 case CPU_UP_PREPARE: 811 case CPU_UP_PREPARE:
812 case CPU_UP_PREPARE_FROZEN: 812 case CPU_UP_PREPARE_FROZEN:
813 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 813 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
814 if (IS_ERR(p)) { 814 if (IS_ERR(p)) {
815 printk("ksoftirqd for %i failed\n", hotcpu); 815 printk("ksoftirqd for %i failed\n", hotcpu);
816 return NOTIFY_BAD; 816 return NOTIFY_BAD;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 88796c330838..d22579087e27 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -22,9 +22,9 @@
22 22
23static DEFINE_SPINLOCK(print_lock); 23static DEFINE_SPINLOCK(print_lock);
24 24
25static DEFINE_PER_CPU(unsigned long, touch_timestamp); 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, print_timestamp); 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28 28
29static int __read_mostly did_panic; 29static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 30int __read_mostly softlockup_thresh = 60;
@@ -70,12 +70,12 @@ static void __touch_softlockup_watchdog(void)
70{ 70{
71 int this_cpu = raw_smp_processor_id(); 71 int this_cpu = raw_smp_processor_id();
72 72
73 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); 73 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
74} 74}
75 75
76void touch_softlockup_watchdog(void) 76void touch_softlockup_watchdog(void)
77{ 77{
78 __raw_get_cpu_var(touch_timestamp) = 0; 78 __raw_get_cpu_var(softlockup_touch_ts) = 0;
79} 79}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 80EXPORT_SYMBOL(touch_softlockup_watchdog);
81 81
@@ -85,16 +85,16 @@ void touch_all_softlockup_watchdogs(void)
85 85
86 /* Cause each CPU to re-update its timestamp rather than complain */ 86 /* Cause each CPU to re-update its timestamp rather than complain */
87 for_each_online_cpu(cpu) 87 for_each_online_cpu(cpu)
88 per_cpu(touch_timestamp, cpu) = 0; 88 per_cpu(softlockup_touch_ts, cpu) = 0;
89} 89}
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write, 92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer, 93 void __user *buffer,
94 size_t *lenp, loff_t *ppos) 94 size_t *lenp, loff_t *ppos)
95{ 95{
96 touch_all_softlockup_watchdogs(); 96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 97 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
98} 98}
99 99
100/* 100/*
@@ -104,28 +104,28 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
104void softlockup_tick(void) 104void softlockup_tick(void)
105{ 105{
106 int this_cpu = smp_processor_id(); 106 int this_cpu = smp_processor_id();
107 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 107 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
108 unsigned long print_timestamp; 108 unsigned long print_ts;
109 struct pt_regs *regs = get_irq_regs(); 109 struct pt_regs *regs = get_irq_regs();
110 unsigned long now; 110 unsigned long now;
111 111
112 /* Is detection switched off? */ 112 /* Is detection switched off? */
113 if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { 113 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
114 /* Be sure we don't false trigger if switched back on */ 114 /* Be sure we don't false trigger if switched back on */
115 if (touch_timestamp) 115 if (touch_ts)
116 per_cpu(touch_timestamp, this_cpu) = 0; 116 per_cpu(softlockup_touch_ts, this_cpu) = 0;
117 return; 117 return;
118 } 118 }
119 119
120 if (touch_timestamp == 0) { 120 if (touch_ts == 0) {
121 __touch_softlockup_watchdog(); 121 __touch_softlockup_watchdog();
122 return; 122 return;
123 } 123 }
124 124
125 print_timestamp = per_cpu(print_timestamp, this_cpu); 125 print_ts = per_cpu(softlockup_print_ts, this_cpu);
126 126
127 /* report at most once a second */ 127 /* report at most once a second */
128 if (print_timestamp == touch_timestamp || did_panic) 128 if (print_ts == touch_ts || did_panic)
129 return; 129 return;
130 130
131 /* do not print during early bootup: */ 131 /* do not print during early bootup: */
@@ -140,18 +140,18 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 140 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 141 * threshold timespan.
142 */ 142 */
143 if (now > touch_timestamp + softlockup_thresh/2) 143 if (now > touch_ts + softlockup_thresh/2)
144 wake_up_process(per_cpu(watchdog_task, this_cpu)); 144 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 145
146 /* Warn about unreasonable delays: */ 146 /* Warn about unreasonable delays: */
147 if (now <= (touch_timestamp + softlockup_thresh)) 147 if (now <= (touch_ts + softlockup_thresh))
148 return; 148 return;
149 149
150 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 150 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
151 151
152 spin_lock(&print_lock); 152 spin_lock(&print_lock);
153 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 153 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
154 this_cpu, now - touch_timestamp, 154 this_cpu, now - touch_ts,
155 current->comm, task_pid_nr(current)); 155 current->comm, task_pid_nr(current));
156 print_modules(); 156 print_modules();
157 print_irqtrace_events(current); 157 print_irqtrace_events(current);
@@ -209,32 +209,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
209 switch (action) { 209 switch (action) {
210 case CPU_UP_PREPARE: 210 case CPU_UP_PREPARE:
211 case CPU_UP_PREPARE_FROZEN: 211 case CPU_UP_PREPARE_FROZEN:
212 BUG_ON(per_cpu(watchdog_task, hotcpu)); 212 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
213 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 213 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
214 if (IS_ERR(p)) { 214 if (IS_ERR(p)) {
215 printk(KERN_ERR "watchdog for %i failed\n", hotcpu); 215 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
216 return NOTIFY_BAD; 216 return NOTIFY_BAD;
217 } 217 }
218 per_cpu(touch_timestamp, hotcpu) = 0; 218 per_cpu(softlockup_touch_ts, hotcpu) = 0;
219 per_cpu(watchdog_task, hotcpu) = p; 219 per_cpu(softlockup_watchdog, hotcpu) = p;
220 kthread_bind(p, hotcpu); 220 kthread_bind(p, hotcpu);
221 break; 221 break;
222 case CPU_ONLINE: 222 case CPU_ONLINE:
223 case CPU_ONLINE_FROZEN: 223 case CPU_ONLINE_FROZEN:
224 wake_up_process(per_cpu(watchdog_task, hotcpu)); 224 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
225 break; 225 break;
226#ifdef CONFIG_HOTPLUG_CPU 226#ifdef CONFIG_HOTPLUG_CPU
227 case CPU_UP_CANCELED: 227 case CPU_UP_CANCELED:
228 case CPU_UP_CANCELED_FROZEN: 228 case CPU_UP_CANCELED_FROZEN:
229 if (!per_cpu(watchdog_task, hotcpu)) 229 if (!per_cpu(softlockup_watchdog, hotcpu))
230 break; 230 break;
231 /* Unbind so it can run. Fall thru. */ 231 /* Unbind so it can run. Fall thru. */
232 kthread_bind(per_cpu(watchdog_task, hotcpu), 232 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
233 cpumask_any(cpu_online_mask)); 233 cpumask_any(cpu_online_mask));
234 case CPU_DEAD: 234 case CPU_DEAD:
235 case CPU_DEAD_FROZEN: 235 case CPU_DEAD_FROZEN:
236 p = per_cpu(watchdog_task, hotcpu); 236 p = per_cpu(softlockup_watchdog, hotcpu);
237 per_cpu(watchdog_task, hotcpu) = NULL; 237 per_cpu(softlockup_watchdog, hotcpu) = NULL;
238 kthread_stop(p); 238 kthread_stop(p);
239 break; 239 break;
240#endif /* CONFIG_HOTPLUG_CPU */ 240#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2f..be6517fb9c14 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,193 +21,72 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
25int __lockfunc _spin_trylock(spinlock_t *lock)
26{
27 return __spin_trylock(lock);
28}
29EXPORT_SYMBOL(_spin_trylock);
30#endif
31
32#ifndef _read_trylock
33int __lockfunc _read_trylock(rwlock_t *lock)
34{
35 return __read_trylock(lock);
36}
37EXPORT_SYMBOL(_read_trylock);
38#endif
39
40#ifndef _write_trylock
41int __lockfunc _write_trylock(rwlock_t *lock)
42{
43 return __write_trylock(lock);
44}
45EXPORT_SYMBOL(_write_trylock);
46#endif
47
48/* 24/*
49 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
50 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 26 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
51 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 27 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
52 */ 28 */
53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 29#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
54
55#ifndef _read_lock
56void __lockfunc _read_lock(rwlock_t *lock)
57{
58 __read_lock(lock);
59}
60EXPORT_SYMBOL(_read_lock);
61#endif
62
63#ifndef _spin_lock_irqsave
64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
65{
66 return __spin_lock_irqsave(lock);
67}
68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
70
71#ifndef _spin_lock_irq
72void __lockfunc _spin_lock_irq(spinlock_t *lock)
73{
74 __spin_lock_irq(lock);
75}
76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
78
79#ifndef _spin_lock_bh
80void __lockfunc _spin_lock_bh(spinlock_t *lock)
81{
82 __spin_lock_bh(lock);
83}
84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
86
87#ifndef _read_lock_irqsave
88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
89{
90 return __read_lock_irqsave(lock);
91}
92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
94
95#ifndef _read_lock_irq
96void __lockfunc _read_lock_irq(rwlock_t *lock)
97{
98 __read_lock_irq(lock);
99}
100EXPORT_SYMBOL(_read_lock_irq);
101#endif
102
103#ifndef _read_lock_bh
104void __lockfunc _read_lock_bh(rwlock_t *lock)
105{
106 __read_lock_bh(lock);
107}
108EXPORT_SYMBOL(_read_lock_bh);
109#endif
110
111#ifndef _write_lock_irqsave
112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
113{
114 return __write_lock_irqsave(lock);
115}
116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
118
119#ifndef _write_lock_irq
120void __lockfunc _write_lock_irq(rwlock_t *lock)
121{
122 __write_lock_irq(lock);
123}
124EXPORT_SYMBOL(_write_lock_irq);
125#endif
126
127#ifndef _write_lock_bh
128void __lockfunc _write_lock_bh(rwlock_t *lock)
129{
130 __write_lock_bh(lock);
131}
132EXPORT_SYMBOL(_write_lock_bh);
133#endif
134
135#ifndef _spin_lock
136void __lockfunc _spin_lock(spinlock_t *lock)
137{
138 __spin_lock(lock);
139}
140EXPORT_SYMBOL(_spin_lock);
141#endif
142
143#ifndef _write_lock
144void __lockfunc _write_lock(rwlock_t *lock)
145{
146 __write_lock(lock);
147}
148EXPORT_SYMBOL(_write_lock);
149#endif
150
151#else /* CONFIG_PREEMPT: */
152
153/* 30/*
31 * The __lock_function inlines are taken from
32 * include/linux/spinlock_api_smp.h
33 */
34#else
35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l)
37/*
38 * We build the __lock_function inlines here. They are too large for
 39 * inlining all over the place, but here there is only one user per function,
 40 * which embeds them into the calling _lock_function below.
41 *
154 * This could be a long-held lock. We both prepare to spin for a long 42 * This could be a long-held lock. We both prepare to spin for a long
155 * time (making _this_ CPU preemptable if possible), and we also signal 43 * time (making _this_ CPU preemptable if possible), and we also signal
156 * towards that other CPU that it should break the lock ASAP. 44 * towards that other CPU that it should break the lock ASAP.
157 *
158 * (We do this in a function because inlining it would be excessive.)
159 */ 45 */
160
161#define BUILD_LOCK_OPS(op, locktype) \ 46#define BUILD_LOCK_OPS(op, locktype) \
162void __lockfunc _##op##_lock(locktype##_t *lock) \ 47void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
163{ \ 48{ \
164 for (;;) { \ 49 for (;;) { \
165 preempt_disable(); \ 50 preempt_disable(); \
166 if (likely(_raw_##op##_trylock(lock))) \ 51 if (likely(do_raw_##op##_trylock(lock))) \
167 break; \ 52 break; \
168 preempt_enable(); \ 53 preempt_enable(); \
169 \ 54 \
170 if (!(lock)->break_lock) \ 55 if (!(lock)->break_lock) \
171 (lock)->break_lock = 1; \ 56 (lock)->break_lock = 1; \
172 while (!op##_can_lock(lock) && (lock)->break_lock) \ 57 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
173 _raw_##op##_relax(&lock->raw_lock); \ 58 arch_##op##_relax(&lock->raw_lock); \
174 } \ 59 } \
175 (lock)->break_lock = 0; \ 60 (lock)->break_lock = 0; \
176} \ 61} \
177 \ 62 \
178EXPORT_SYMBOL(_##op##_lock); \ 63unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
179 \
180unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
181{ \ 64{ \
182 unsigned long flags; \ 65 unsigned long flags; \
183 \ 66 \
184 for (;;) { \ 67 for (;;) { \
185 preempt_disable(); \ 68 preempt_disable(); \
186 local_irq_save(flags); \ 69 local_irq_save(flags); \
187 if (likely(_raw_##op##_trylock(lock))) \ 70 if (likely(do_raw_##op##_trylock(lock))) \
188 break; \ 71 break; \
189 local_irq_restore(flags); \ 72 local_irq_restore(flags); \
190 preempt_enable(); \ 73 preempt_enable(); \
191 \ 74 \
192 if (!(lock)->break_lock) \ 75 if (!(lock)->break_lock) \
193 (lock)->break_lock = 1; \ 76 (lock)->break_lock = 1; \
194 while (!op##_can_lock(lock) && (lock)->break_lock) \ 77 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
195 _raw_##op##_relax(&lock->raw_lock); \ 78 arch_##op##_relax(&lock->raw_lock); \
196 } \ 79 } \
197 (lock)->break_lock = 0; \ 80 (lock)->break_lock = 0; \
198 return flags; \ 81 return flags; \
199} \ 82} \
200 \ 83 \
201EXPORT_SYMBOL(_##op##_lock_irqsave); \ 84void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
202 \
203void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
204{ \ 85{ \
205 _##op##_lock_irqsave(lock); \ 86 _raw_##op##_lock_irqsave(lock); \
206} \ 87} \
207 \ 88 \
208EXPORT_SYMBOL(_##op##_lock_irq); \ 89void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
209 \
210void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
211{ \ 90{ \
212 unsigned long flags; \ 91 unsigned long flags; \
213 \ 92 \
@@ -216,164 +95,283 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
216 /* irq-disabling. We use the generic preemption-aware */ \ 95 /* irq-disabling. We use the generic preemption-aware */ \
217 /* function: */ \ 96 /* function: */ \
218 /**/ \ 97 /**/ \
219 flags = _##op##_lock_irqsave(lock); \ 98 flags = _raw_##op##_lock_irqsave(lock); \
220 local_bh_disable(); \ 99 local_bh_disable(); \
221 local_irq_restore(flags); \ 100 local_irq_restore(flags); \
222} \ 101} \
223 \
224EXPORT_SYMBOL(_##op##_lock_bh)
225 102
226/* 103/*
227 * Build preemption-friendly versions of the following 104 * Build preemption-friendly versions of the following
228 * lock-spinning functions: 105 * lock-spinning functions:
229 * 106 *
230 * _[spin|read|write]_lock() 107 * __[spin|read|write]_lock()
231 * _[spin|read|write]_lock_irq() 108 * __[spin|read|write]_lock_irq()
232 * _[spin|read|write]_lock_irqsave() 109 * __[spin|read|write]_lock_irqsave()
233 * _[spin|read|write]_lock_bh() 110 * __[spin|read|write]_lock_bh()
234 */ 111 */
235BUILD_LOCK_OPS(spin, spinlock); 112BUILD_LOCK_OPS(spin, raw_spinlock);
236BUILD_LOCK_OPS(read, rwlock); 113BUILD_LOCK_OPS(read, rwlock);
237BUILD_LOCK_OPS(write, rwlock); 114BUILD_LOCK_OPS(write, rwlock);
238 115
239#endif /* CONFIG_PREEMPT */ 116#endif
240 117
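
For reference, substituting op=spin, locktype=raw_spinlock into the macro above yields these four out-of-line helpers in the configuration that compiles the #else branch (prototypes reconstructed here; the read and write expansions for rwlock_t are analogous):

void __lockfunc __raw_spin_lock(raw_spinlock_t *lock);
unsigned long __lockfunc __raw_spin_lock_irqsave(raw_spinlock_t *lock);
void __lockfunc __raw_spin_lock_irq(raw_spinlock_t *lock);
void __lockfunc __raw_spin_lock_bh(raw_spinlock_t *lock);
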
241#ifdef CONFIG_DEBUG_LOCK_ALLOC 118#ifndef CONFIG_INLINE_SPIN_TRYLOCK
119int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
120{
121 return __raw_spin_trylock(lock);
122}
123EXPORT_SYMBOL(_raw_spin_trylock);
124#endif
242 125
243void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) 126#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
127int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
244{ 128{
245 preempt_disable(); 129 return __raw_spin_trylock_bh(lock);
246 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
247 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
248} 130}
249EXPORT_SYMBOL(_spin_lock_nested); 131EXPORT_SYMBOL(_raw_spin_trylock_bh);
132#endif
250 133
251unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 134#ifndef CONFIG_INLINE_SPIN_LOCK
135void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
252{ 136{
253 unsigned long flags; 137 __raw_spin_lock(lock);
138}
139EXPORT_SYMBOL(_raw_spin_lock);
140#endif
254 141
255 local_irq_save(flags); 142#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
256 preempt_disable(); 143unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
257 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 144{
258 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, 145 return __raw_spin_lock_irqsave(lock);
259 _raw_spin_lock_flags, &flags);
260 return flags;
261} 146}
262EXPORT_SYMBOL(_spin_lock_irqsave_nested); 147EXPORT_SYMBOL(_raw_spin_lock_irqsave);
148#endif
263 149
264void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, 150#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
265 struct lockdep_map *nest_lock) 151void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
266{ 152{
267 preempt_disable(); 153 __raw_spin_lock_irq(lock);
268 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
269 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
270} 154}
271EXPORT_SYMBOL(_spin_lock_nest_lock); 155EXPORT_SYMBOL(_raw_spin_lock_irq);
156#endif
272 157
158#ifndef CONFIG_INLINE_SPIN_LOCK_BH
159void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
160{
161 __raw_spin_lock_bh(lock);
162}
163EXPORT_SYMBOL(_raw_spin_lock_bh);
273#endif 164#endif
274 165
275#ifndef _spin_unlock 166#ifndef CONFIG_INLINE_SPIN_UNLOCK
276void __lockfunc _spin_unlock(spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
277{ 168{
278 __spin_unlock(lock); 169 __raw_spin_unlock(lock);
279} 170}
280EXPORT_SYMBOL(_spin_unlock); 171EXPORT_SYMBOL(_raw_spin_unlock);
281#endif 172#endif
282 173
283#ifndef _write_unlock 174#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
284void __lockfunc _write_unlock(rwlock_t *lock) 175void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
285{ 176{
286 __write_unlock(lock); 177 __raw_spin_unlock_irqrestore(lock, flags);
287} 178}
288EXPORT_SYMBOL(_write_unlock); 179EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
289#endif 180#endif
290 181
291#ifndef _read_unlock 182#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
292void __lockfunc _read_unlock(rwlock_t *lock) 183void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
293{ 184{
294 __read_unlock(lock); 185 __raw_spin_unlock_irq(lock);
295} 186}
296EXPORT_SYMBOL(_read_unlock); 187EXPORT_SYMBOL(_raw_spin_unlock_irq);
297#endif 188#endif
298 189
299#ifndef _spin_unlock_irqrestore 190#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 191void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
301{ 192{
302 __spin_unlock_irqrestore(lock, flags); 193 __raw_spin_unlock_bh(lock);
303} 194}
304EXPORT_SYMBOL(_spin_unlock_irqrestore); 195EXPORT_SYMBOL(_raw_spin_unlock_bh);
305#endif 196#endif
306 197
307#ifndef _spin_unlock_irq 198#ifndef CONFIG_INLINE_READ_TRYLOCK
308void __lockfunc _spin_unlock_irq(spinlock_t *lock) 199int __lockfunc _raw_read_trylock(rwlock_t *lock)
309{ 200{
310 __spin_unlock_irq(lock); 201 return __raw_read_trylock(lock);
311} 202}
312EXPORT_SYMBOL(_spin_unlock_irq); 203EXPORT_SYMBOL(_raw_read_trylock);
313#endif 204#endif
314 205
315#ifndef _spin_unlock_bh 206#ifndef CONFIG_INLINE_READ_LOCK
316void __lockfunc _spin_unlock_bh(spinlock_t *lock) 207void __lockfunc _raw_read_lock(rwlock_t *lock)
317{ 208{
318 __spin_unlock_bh(lock); 209 __raw_read_lock(lock);
319} 210}
320EXPORT_SYMBOL(_spin_unlock_bh); 211EXPORT_SYMBOL(_raw_read_lock);
321#endif 212#endif
322 213
323#ifndef _read_unlock_irqrestore 214#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 215unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
325{ 216{
326 __read_unlock_irqrestore(lock, flags); 217 return __raw_read_lock_irqsave(lock);
327} 218}
328EXPORT_SYMBOL(_read_unlock_irqrestore); 219EXPORT_SYMBOL(_raw_read_lock_irqsave);
329#endif 220#endif
330 221
331#ifndef _read_unlock_irq 222#ifndef CONFIG_INLINE_READ_LOCK_IRQ
332void __lockfunc _read_unlock_irq(rwlock_t *lock) 223void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
333{ 224{
334 __read_unlock_irq(lock); 225 __raw_read_lock_irq(lock);
335} 226}
336EXPORT_SYMBOL(_read_unlock_irq); 227EXPORT_SYMBOL(_raw_read_lock_irq);
337#endif 228#endif
338 229
339#ifndef _read_unlock_bh 230#ifndef CONFIG_INLINE_READ_LOCK_BH
340void __lockfunc _read_unlock_bh(rwlock_t *lock) 231void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
341{ 232{
342 __read_unlock_bh(lock); 233 __raw_read_lock_bh(lock);
343} 234}
344EXPORT_SYMBOL(_read_unlock_bh); 235EXPORT_SYMBOL(_raw_read_lock_bh);
345#endif 236#endif
346 237
347#ifndef _write_unlock_irqrestore 238#ifndef CONFIG_INLINE_READ_UNLOCK
348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 239void __lockfunc _raw_read_unlock(rwlock_t *lock)
349{ 240{
350 __write_unlock_irqrestore(lock, flags); 241 __raw_read_unlock(lock);
351} 242}
352EXPORT_SYMBOL(_write_unlock_irqrestore); 243EXPORT_SYMBOL(_raw_read_unlock);
353#endif 244#endif
354 245
355#ifndef _write_unlock_irq 246#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
356void __lockfunc _write_unlock_irq(rwlock_t *lock) 247void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
357{ 248{
358 __write_unlock_irq(lock); 249 __raw_read_unlock_irqrestore(lock, flags);
359} 250}
360EXPORT_SYMBOL(_write_unlock_irq); 251EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
361#endif 252#endif
362 253
363#ifndef _write_unlock_bh 254#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
364void __lockfunc _write_unlock_bh(rwlock_t *lock) 255void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
365{ 256{
366 __write_unlock_bh(lock); 257 __raw_read_unlock_irq(lock);
367} 258}
368EXPORT_SYMBOL(_write_unlock_bh); 259EXPORT_SYMBOL(_raw_read_unlock_irq);
369#endif 260#endif
370 261
371#ifndef _spin_trylock_bh 262#ifndef CONFIG_INLINE_READ_UNLOCK_BH
372int __lockfunc _spin_trylock_bh(spinlock_t *lock) 263void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
373{ 264{
374 return __spin_trylock_bh(lock); 265 __raw_read_unlock_bh(lock);
375} 266}
376EXPORT_SYMBOL(_spin_trylock_bh); 267EXPORT_SYMBOL(_raw_read_unlock_bh);
268#endif
269
270#ifndef CONFIG_INLINE_WRITE_TRYLOCK
271int __lockfunc _raw_write_trylock(rwlock_t *lock)
272{
273 return __raw_write_trylock(lock);
274}
275EXPORT_SYMBOL(_raw_write_trylock);
276#endif
277
278#ifndef CONFIG_INLINE_WRITE_LOCK
279void __lockfunc _raw_write_lock(rwlock_t *lock)
280{
281 __raw_write_lock(lock);
282}
283EXPORT_SYMBOL(_raw_write_lock);
284#endif
285
286#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
287unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
288{
289 return __raw_write_lock_irqsave(lock);
290}
291EXPORT_SYMBOL(_raw_write_lock_irqsave);
292#endif
293
294#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
295void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
296{
297 __raw_write_lock_irq(lock);
298}
299EXPORT_SYMBOL(_raw_write_lock_irq);
300#endif
301
302#ifndef CONFIG_INLINE_WRITE_LOCK_BH
303void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
304{
305 __raw_write_lock_bh(lock);
306}
307EXPORT_SYMBOL(_raw_write_lock_bh);
308#endif
309
310#ifndef CONFIG_INLINE_WRITE_UNLOCK
311void __lockfunc _raw_write_unlock(rwlock_t *lock)
312{
313 __raw_write_unlock(lock);
314}
315EXPORT_SYMBOL(_raw_write_unlock);
316#endif
317
318#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
319void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
320{
321 __raw_write_unlock_irqrestore(lock, flags);
322}
323EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
324#endif
325
326#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
327void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
328{
329 __raw_write_unlock_irq(lock);
330}
331EXPORT_SYMBOL(_raw_write_unlock_irq);
332#endif
333
334#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
335void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
336{
337 __raw_write_unlock_bh(lock);
338}
339EXPORT_SYMBOL(_raw_write_unlock_bh);
340#endif
341
342#ifdef CONFIG_DEBUG_LOCK_ALLOC
343
344void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
345{
346 preempt_disable();
347 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
348 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
349}
350EXPORT_SYMBOL(_raw_spin_lock_nested);
351
352unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
353 int subclass)
354{
355 unsigned long flags;
356
357 local_irq_save(flags);
358 preempt_disable();
359 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
360 LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
361 do_raw_spin_lock_flags, &flags);
362 return flags;
363}
364EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
365
366void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
367 struct lockdep_map *nest_lock)
368{
369 preempt_disable();
370 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
371 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
372}
373EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
374
377#endif 375#endif
378 376
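
The *_nested variants preserved above exist so that lockdep can distinguish two locks of the same class taken together. A common pattern (an editorial sketch, not from this patch) is a double-lock helper that orders by address and annotates the second acquisition; it assumes the two locks are distinct:

#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/lockdep.h>

static void my_lock_pair(spinlock_t *a, spinlock_t *b)
{
	if (a > b)
		swap(a, b);		/* impose a stable locking order */
	spin_lock(a);
	spin_lock_nested(b, SINGLE_DEPTH_NESTING);
}

static void my_unlock_pair(spinlock_t *a, spinlock_t *b)
{
	if (a > b)
		swap(a, b);
	spin_unlock(b);
	spin_unlock(a);
}
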
379notrace int in_lock_functions(unsigned long addr) 377notrace int in_lock_functions(unsigned long addr)
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..818d7d9aa03c 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp)
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM); 50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 51}
52EXPORT_SYMBOL_GPL(init_srcu_struct);
52 53
53/* 54/*
54 * srcu_readers_active_idx -- returns approximate number of readers 55 * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
97 free_percpu(sp->per_cpu_ref); 98 free_percpu(sp->per_cpu_ref);
98 sp->per_cpu_ref = NULL; 99 sp->per_cpu_ref = NULL;
99} 100}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
100 102
101/** 103/**
102 * srcu_read_lock - register a new reader for an SRCU-protected structure. 104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
@@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp)
118 preempt_enable(); 120 preempt_enable();
119 return idx; 121 return idx;
120} 122}
123EXPORT_SYMBOL_GPL(srcu_read_lock);
121 124
122/** 125/**
123 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. 126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
@@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx)
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
137 preempt_enable(); 140 preempt_enable();
138} 141}
142EXPORT_SYMBOL_GPL(srcu_read_unlock);
139 143
140/** 144/*
141 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
142 * @sp: srcu_struct with which to synchronize.
143 *
144 * Flip the completed counter, and wait for the old count to drain to zero.
145 * As with classic RCU, the updater must use some separate means of
146 * synchronizing concurrent updates. Can block; must be called from
147 * process context.
148 *
149 * Note that it is illegal to call synchornize_srcu() from the corresponding
150 * SRCU read-side critical section; doing so will result in deadlock.
151 * However, it is perfectly legal to call synchronize_srcu() on one
152 * srcu_struct from some other srcu_struct's read-side critical section.
153 */ 146 */
154void synchronize_srcu(struct srcu_struct *sp) 147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
155{ 148{
156 int idx; 149 int idx;
157 150
@@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp)
173 return; 166 return;
174 } 167 }
175 168
176 synchronize_sched(); /* Force memory barrier on all CPUs. */ 169 sync_func(); /* Force memory barrier on all CPUs. */
177 170
178 /* 171 /*
179 * The preceding synchronize_sched() ensures that any CPU that 172 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp)
190 idx = sp->completed & 0x1; 183 idx = sp->completed & 0x1;
191 sp->completed++; 184 sp->completed++;
192 185
193 synchronize_sched(); /* Force memory barrier on all CPUs. */ 186 sync_func(); /* Force memory barrier on all CPUs. */
194 187
195 /* 188 /*
196 * At this point, because of the preceding synchronize_sched(), 189 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp)
203 while (srcu_readers_active_idx(sp, idx)) 196 while (srcu_readers_active_idx(sp, idx))
204 schedule_timeout_interruptible(1); 197 schedule_timeout_interruptible(1);
205 198
206 synchronize_sched(); /* Force memory barrier on all CPUs. */ 199 sync_func(); /* Force memory barrier on all CPUs. */
207 200
208 /* 201 /*
209 * The preceding synchronize_sched() forces all srcu_read_unlock() 202 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp)
237} 230}
238 231
239/** 232/**
233 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
234 * @sp: srcu_struct with which to synchronize.
235 *
236 * Flip the completed counter, and wait for the old count to drain to zero.
237 * As with classic RCU, the updater must use some separate means of
238 * synchronizing concurrent updates. Can block; must be called from
239 * process context.
240 *
241 * Note that it is illegal to call synchronize_srcu() from the corresponding
242 * SRCU read-side critical section; doing so will result in deadlock.
243 * However, it is perfectly legal to call synchronize_srcu() on one
244 * srcu_struct from some other srcu_struct's read-side critical section.
245 */
246void synchronize_srcu(struct srcu_struct *sp)
247{
248 __synchronize_srcu(sp, synchronize_sched);
249}
250EXPORT_SYMBOL_GPL(synchronize_srcu);
251
252/**
253 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
254 * @sp: srcu_struct with which to synchronize.
255 *
256 * Flip the completed counter, and wait for the old count to drain to zero.
257 * As with classic RCU, the updater must use some separate means of
258 * synchronizing concurrent updates. Can block; must be called from
259 * process context.
260 *
261 * Note that it is illegal to call synchronize_srcu_expedited()
262 * from the corresponding SRCU read-side critical section; doing so
263 * will result in deadlock. However, it is perfectly legal to call
264 * synchronize_srcu_expedited() on one srcu_struct from some other
265 * srcu_struct's read-side critical section.
266 */
267void synchronize_srcu_expedited(struct srcu_struct *sp)
268{
269 __synchronize_srcu(sp, synchronize_sched_expedited);
270}
271EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
272
273/**
240 * srcu_batches_completed - return batches completed. 274 * srcu_batches_completed - return batches completed.
241 * @sp: srcu_struct on which to report batch completion. 275 * @sp: srcu_struct on which to report batch completion.
242 * 276 *
@@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
248{ 282{
249 return sp->completed; 283 return sp->completed;
250} 284}
251
252EXPORT_SYMBOL_GPL(init_srcu_struct);
253EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
254EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 285EXPORT_SYMBOL_GPL(srcu_batches_completed);
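For reference, a minimal sketch of how the SRCU primitives whose exports are added above fit together (struct my_data, my_srcu and global_ptr are illustrative names, and init_srcu_struct(&my_srcu) is assumed to have been called at init time):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_data {
	int value;
};

static struct srcu_struct my_srcu;
static struct my_data *global_ptr;

static int my_reader(void)
{
	struct my_data *p;
	int idx, val = 0;

	idx = srcu_read_lock(&my_srcu);		/* SRCU readers may sleep */
	p = rcu_dereference(global_ptr);
	if (p)
		val = p->value;
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

static void my_update(struct my_data *new)
{
	struct my_data *old = global_ptr;

	rcu_assign_pointer(global_ptr, new);
	/* Wait until every reader that might still see 'old' has finished;
	 * synchronize_srcu_expedited() does the same, trading CPU time for
	 * lower latency. */
	synchronize_srcu(&my_srcu);
	kfree(old);
}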
diff --git a/kernel/sys.c b/kernel/sys.c
index ebcb15611728..20ccfb5da6af 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/smp_lock.h>
12#include <linux/notifier.h> 11#include <linux/notifier.h>
13#include <linux/reboot.h> 12#include <linux/reboot.h>
14#include <linux/prctl.h> 13#include <linux/prctl.h>
@@ -190,10 +189,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
190 !(user = find_user(who))) 189 !(user = find_user(who)))
191 goto out_unlock; /* No processes for this user */ 190 goto out_unlock; /* No processes for this user */
192 191
193 do_each_thread(g, p) 192 do_each_thread(g, p) {
194 if (__task_cred(p)->uid == who) 193 if (__task_cred(p)->uid == who)
195 error = set_one_prio(p, niceval, error); 194 error = set_one_prio(p, niceval, error);
196 while_each_thread(g, p); 195 } while_each_thread(g, p);
197 if (who != cred->uid) 196 if (who != cred->uid)
198 free_uid(user); /* For find_user() */ 197 free_uid(user); /* For find_user() */
199 break; 198 break;
@@ -253,13 +252,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
253 !(user = find_user(who))) 252 !(user = find_user(who)))
254 goto out_unlock; /* No processes for this user */ 253 goto out_unlock; /* No processes for this user */
255 254
256 do_each_thread(g, p) 255 do_each_thread(g, p) {
257 if (__task_cred(p)->uid == who) { 256 if (__task_cred(p)->uid == who) {
258 niceval = 20 - task_nice(p); 257 niceval = 20 - task_nice(p);
259 if (niceval > retval) 258 if (niceval > retval)
260 retval = niceval; 259 retval = niceval;
261 } 260 }
262 while_each_thread(g, p); 261 } while_each_thread(g, p);
263 if (who != cred->uid) 262 if (who != cred->uid)
264 free_uid(user); /* for find_user() */ 263 free_uid(user); /* for find_user() */
265 break; 264 break;
@@ -349,6 +348,9 @@ void kernel_power_off(void)
349 machine_power_off(); 348 machine_power_off();
350} 349}
351EXPORT_SYMBOL_GPL(kernel_power_off); 350EXPORT_SYMBOL_GPL(kernel_power_off);
351
352static DEFINE_MUTEX(reboot_mutex);
353
352/* 354/*
353 * Reboot system call: for obvious reasons only root may call it, 355 * Reboot system call: for obvious reasons only root may call it,
354 * and even root needs to set up some magic numbers in the registers 356 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
381 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 383 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
382 cmd = LINUX_REBOOT_CMD_HALT; 384 cmd = LINUX_REBOOT_CMD_HALT;
383 385
384 lock_kernel(); 386 mutex_lock(&reboot_mutex);
385 switch (cmd) { 387 switch (cmd) {
386 case LINUX_REBOOT_CMD_RESTART: 388 case LINUX_REBOOT_CMD_RESTART:
387 kernel_restart(NULL); 389 kernel_restart(NULL);
@@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 399
398 case LINUX_REBOOT_CMD_HALT: 400 case LINUX_REBOOT_CMD_HALT:
399 kernel_halt(); 401 kernel_halt();
400 unlock_kernel();
401 do_exit(0); 402 do_exit(0);
402 panic("cannot halt"); 403 panic("cannot halt");
403 404
404 case LINUX_REBOOT_CMD_POWER_OFF: 405 case LINUX_REBOOT_CMD_POWER_OFF:
405 kernel_power_off(); 406 kernel_power_off();
406 unlock_kernel();
407 do_exit(0); 407 do_exit(0);
408 break; 408 break;
409 409
410 case LINUX_REBOOT_CMD_RESTART2: 410 case LINUX_REBOOT_CMD_RESTART2:
411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
412 unlock_kernel(); 412 ret = -EFAULT;
413 return -EFAULT; 413 break;
414 } 414 }
415 buffer[sizeof(buffer) - 1] = '\0'; 415 buffer[sizeof(buffer) - 1] = '\0';
416 416
@@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
433 ret = -EINVAL; 433 ret = -EINVAL;
434 break; 434 break;
435 } 435 }
436 unlock_kernel(); 436 mutex_unlock(&reboot_mutex);
437 return ret; 437 return ret;
438} 438}
439 439
@@ -911,16 +911,15 @@ change_okay:
911 911
912void do_sys_times(struct tms *tms) 912void do_sys_times(struct tms *tms)
913{ 913{
914 struct task_cputime cputime; 914 cputime_t tgutime, tgstime, cutime, cstime;
915 cputime_t cutime, cstime;
916 915
917 thread_group_cputime(current, &cputime);
918 spin_lock_irq(&current->sighand->siglock); 916 spin_lock_irq(&current->sighand->siglock);
917 thread_group_times(current, &tgutime, &tgstime);
919 cutime = current->signal->cutime; 918 cutime = current->signal->cutime;
920 cstime = current->signal->cstime; 919 cstime = current->signal->cstime;
921 spin_unlock_irq(&current->sighand->siglock); 920 spin_unlock_irq(&current->sighand->siglock);
922 tms->tms_utime = cputime_to_clock_t(cputime.utime); 921 tms->tms_utime = cputime_to_clock_t(tgutime);
923 tms->tms_stime = cputime_to_clock_t(cputime.stime); 922 tms->tms_stime = cputime_to_clock_t(tgstime);
924 tms->tms_cutime = cputime_to_clock_t(cutime); 923 tms->tms_cutime = cputime_to_clock_t(cutime);
925 tms->tms_cstime = cputime_to_clock_t(cstime); 924 tms->tms_cstime = cputime_to_clock_t(cstime);
926} 925}
@@ -1110,6 +1109,8 @@ SYSCALL_DEFINE0(setsid)
1110 err = session; 1109 err = session;
1111out: 1110out:
1112 write_unlock_irq(&tasklist_lock); 1111 write_unlock_irq(&tasklist_lock);
1112 if (err > 0)
1113 proc_sid_connector(group_leader);
1113 return err; 1114 return err;
1114} 1115}
1115 1116
@@ -1336,16 +1337,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1336{ 1337{
1337 struct task_struct *t; 1338 struct task_struct *t;
1338 unsigned long flags; 1339 unsigned long flags;
1339 cputime_t utime, stime; 1340 cputime_t tgutime, tgstime, utime, stime;
1340 struct task_cputime cputime;
1341 unsigned long maxrss = 0; 1341 unsigned long maxrss = 0;
1342 1342
1343 memset((char *) r, 0, sizeof *r); 1343 memset((char *) r, 0, sizeof *r);
1344 utime = stime = cputime_zero; 1344 utime = stime = cputime_zero;
1345 1345
1346 if (who == RUSAGE_THREAD) { 1346 if (who == RUSAGE_THREAD) {
1347 utime = task_utime(current); 1347 task_times(current, &utime, &stime);
1348 stime = task_stime(current);
1349 accumulate_thread_rusage(p, r); 1348 accumulate_thread_rusage(p, r);
1350 maxrss = p->signal->maxrss; 1349 maxrss = p->signal->maxrss;
1351 goto out; 1350 goto out;
@@ -1371,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1371 break; 1370 break;
1372 1371
1373 case RUSAGE_SELF: 1372 case RUSAGE_SELF:
1374 thread_group_cputime(p, &cputime); 1373 thread_group_times(p, &tgutime, &tgstime);
1375 utime = cputime_add(utime, cputime.utime); 1374 utime = cputime_add(utime, tgutime);
1376 stime = cputime_add(stime, cputime.stime); 1375 stime = cputime_add(stime, tgstime);
1377 r->ru_nvcsw += p->signal->nvcsw; 1376 r->ru_nvcsw += p->signal->nvcsw;
1378 r->ru_nivcsw += p->signal->nivcsw; 1377 r->ru_nivcsw += p->signal->nivcsw;
1379 r->ru_minflt += p->signal->min_flt; 1378 r->ru_minflt += p->signal->min_flt;
@@ -1542,6 +1541,41 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1542 current->timer_slack_ns = arg2; 1541 current->timer_slack_ns = arg2;
1543 error = 0; 1542 error = 0;
1544 break; 1543 break;
1544 case PR_MCE_KILL:
1545 if (arg4 | arg5)
1546 return -EINVAL;
1547 switch (arg2) {
1548 case PR_MCE_KILL_CLEAR:
1549 if (arg3 != 0)
1550 return -EINVAL;
1551 current->flags &= ~PF_MCE_PROCESS;
1552 break;
1553 case PR_MCE_KILL_SET:
1554 current->flags |= PF_MCE_PROCESS;
1555 if (arg3 == PR_MCE_KILL_EARLY)
1556 current->flags |= PF_MCE_EARLY;
1557 else if (arg3 == PR_MCE_KILL_LATE)
1558 current->flags &= ~PF_MCE_EARLY;
1559 else if (arg3 == PR_MCE_KILL_DEFAULT)
1560 current->flags &=
1561 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
1562 else
1563 return -EINVAL;
1564 break;
1565 default:
1566 return -EINVAL;
1567 }
1568 error = 0;
1569 break;
1570 case PR_MCE_KILL_GET:
1571 if (arg2 | arg3 | arg4 | arg5)
1572 return -EINVAL;
1573 if (current->flags & PF_MCE_PROCESS)
1574 error = (current->flags & PF_MCE_EARLY) ?
1575 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
1576 else
1577 error = PR_MCE_KILL_DEFAULT;
1578 break;
1545 default: 1579 default:
1546 error = -EINVAL; 1580 error = -EINVAL;
1547 break; 1581 break;
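The PR_MCE_KILL and PR_MCE_KILL_GET cases added above are driven from userspace through prctl(). A small sketch, assuming kernel headers that already define the PR_MCE_* constants (error handling trimmed):

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	/* Opt this process in to early machine-check kills. */
	if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0))
		perror("PR_MCE_KILL");

	/* Read the current per-process policy back. */
	printf("MCE kill policy: %d\n",
	       prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));
	return 0;
}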
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 515bc230ac2a..695384f12a7d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,7 +48,10 @@ cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(compat_sys_sendmsg); 49cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg);
51cond_syscall(compat_sys_recvmsg); 52cond_syscall(compat_sys_recvmsg);
53cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg);
52cond_syscall(sys_socketcall); 55cond_syscall(sys_socketcall);
53cond_syscall(sys_futex); 56cond_syscall(sys_futex);
54cond_syscall(compat_sys_futex); 57cond_syscall(compat_sys_futex);
@@ -138,7 +141,6 @@ cond_syscall(sys_pciconfig_read);
138cond_syscall(sys_pciconfig_write); 141cond_syscall(sys_pciconfig_write);
139cond_syscall(sys_pciconfig_iobase); 142cond_syscall(sys_pciconfig_iobase);
140cond_syscall(sys32_ipc); 143cond_syscall(sys32_ipc);
141cond_syscall(sys32_sysctl);
142cond_syscall(ppc_rtas); 144cond_syscall(ppc_rtas);
143cond_syscall(sys_spu_run); 145cond_syscall(sys_spu_run);
144cond_syscall(sys_spu_create); 146cond_syscall(sys_spu_create);
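cond_syscall() marks a system call as optional: when the real implementation is compiled out (here, sys_recvmmsg and the compat receive calls when the corresponding networking support is absent), the symbol falls back to sys_ni_syscall(), which simply returns -ENOSYS. The kernel does this with an assembler weak alias; a conceptual sketch in plain GCC C, not the kernel's actual macro:

#include <errno.h>

long sys_ni_syscall(void)
{
	return -ENOSYS;		/* "function not implemented" */
}

/* Weak alias: used only if no strong definition of sys_recvmmsg is
 * linked in; a real implementation elsewhere overrides it. */
long sys_recvmmsg(void) __attribute__((weak, alias("sys_ni_syscall")));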
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0dfaa47d7cb6..45e4bef0012a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -26,9 +26,7 @@
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h>
30#include <linux/kmemcheck.h> 29#include <linux/kmemcheck.h>
31#include <linux/smp_lock.h>
32#include <linux/fs.h> 30#include <linux/fs.h>
33#include <linux/init.h> 31#include <linux/init.h>
34#include <linux/kernel.h> 32#include <linux/kernel.h>
@@ -37,6 +35,7 @@
37#include <linux/sysrq.h> 35#include <linux/sysrq.h>
38#include <linux/highuid.h> 36#include <linux/highuid.h>
39#include <linux/writeback.h> 37#include <linux/writeback.h>
38#include <linux/ratelimit.h>
40#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
41#include <linux/initrd.h> 40#include <linux/initrd.h>
42#include <linux/key.h> 41#include <linux/key.h>
@@ -61,7 +60,6 @@
61#include <asm/io.h> 60#include <asm/io.h>
62#endif 61#endif
63 62
64static int deprecated_sysctl_warning(struct __sysctl_args *args);
65 63
66#if defined(CONFIG_SYSCTL) 64#if defined(CONFIG_SYSCTL)
67 65
@@ -77,6 +75,7 @@ extern int max_threads;
77extern int core_uses_pid; 75extern int core_uses_pid;
78extern int suid_dumpable; 76extern int suid_dumpable;
79extern char core_pattern[]; 77extern char core_pattern[];
78extern unsigned int core_pipe_limit;
80extern int pid_max; 79extern int pid_max;
81extern int min_free_kbytes; 80extern int min_free_kbytes;
82extern int pid_max_min, pid_max_max; 81extern int pid_max_min, pid_max_max;
@@ -158,14 +157,16 @@ extern int no_unaligned_warning;
158extern int unaligned_dump_stack; 157extern int unaligned_dump_stack;
159#endif 158#endif
160 159
160extern struct ratelimit_state printk_ratelimit_state;
161
161#ifdef CONFIG_RT_MUTEXES 162#ifdef CONFIG_RT_MUTEXES
162extern int max_lock_depth; 163extern int max_lock_depth;
163#endif 164#endif
164 165
165#ifdef CONFIG_PROC_SYSCTL 166#ifdef CONFIG_PROC_SYSCTL
166static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 167static int proc_do_cad_pid(struct ctl_table *table, int write,
167 void __user *buffer, size_t *lenp, loff_t *ppos); 168 void __user *buffer, size_t *lenp, loff_t *ppos);
168static int proc_taint(struct ctl_table *table, int write, struct file *filp, 169static int proc_taint(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
170#endif 171#endif
171 172
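The two prototypes above also show the interface change running through this file: proc handlers no longer take a struct file * argument. A handler written against the new signature looks roughly like this (my_handler and my_value are illustrative, and the table entry's .data is assumed to point at my_value):

#include <linux/kernel.h>
#include <linux/sysctl.h>

static int my_value;

static int my_handler(struct ctl_table *table, int write,
		      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (!ret && write)
		pr_info("my_value is now %d\n", my_value);
	return ret;
}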
@@ -207,31 +208,26 @@ extern int lock_stat;
207 208
208static struct ctl_table root_table[] = { 209static struct ctl_table root_table[] = {
209 { 210 {
210 .ctl_name = CTL_KERN,
211 .procname = "kernel", 211 .procname = "kernel",
212 .mode = 0555, 212 .mode = 0555,
213 .child = kern_table, 213 .child = kern_table,
214 }, 214 },
215 { 215 {
216 .ctl_name = CTL_VM,
217 .procname = "vm", 216 .procname = "vm",
218 .mode = 0555, 217 .mode = 0555,
219 .child = vm_table, 218 .child = vm_table,
220 }, 219 },
221 { 220 {
222 .ctl_name = CTL_FS,
223 .procname = "fs", 221 .procname = "fs",
224 .mode = 0555, 222 .mode = 0555,
225 .child = fs_table, 223 .child = fs_table,
226 }, 224 },
227 { 225 {
228 .ctl_name = CTL_DEBUG,
229 .procname = "debug", 226 .procname = "debug",
230 .mode = 0555, 227 .mode = 0555,
231 .child = debug_table, 228 .child = debug_table,
232 }, 229 },
233 { 230 {
234 .ctl_name = CTL_DEV,
235 .procname = "dev", 231 .procname = "dev",
236 .mode = 0555, 232 .mode = 0555,
237 .child = dev_table, 233 .child = dev_table,
@@ -240,7 +236,7 @@ static struct ctl_table root_table[] = {
240 * NOTE: do not add new entries to this table unless you have read 236 * NOTE: do not add new entries to this table unless you have read
241 * Documentation/sysctl/ctl_unnumbered.txt 237 * Documentation/sysctl/ctl_unnumbered.txt
242 */ 238 */
243 { .ctl_name = 0 } 239 { }
244}; 240};
245 241
246#ifdef CONFIG_SCHED_DEBUG 242#ifdef CONFIG_SCHED_DEBUG
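The root_table hunk above shows the pattern of the whole conversion: .ctl_name and .strategy are dropped, proc handlers are assigned without the leading '&', and the table terminator becomes an empty initializer. A table written in the new style and registered the usual way might look like this (all names are illustrative):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/sysctl.h>

static int my_value;
static struct ctl_table_header *my_header;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_value",
		.data		= &my_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* terminator; the old style was "{ .ctl_name = 0 }" */
};

static struct ctl_table my_dir[] = {
	{
		.procname	= "example",
		.mode		= 0555,
		.child		= my_table,
	},
	{ }
};

static int __init my_sysctl_init(void)
{
	my_header = register_sysctl_table(my_dir);
	return my_header ? 0 : -ENOMEM;
}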
@@ -248,188 +244,178 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
248static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
249static int min_wakeup_granularity_ns; /* 0 usecs */ 245static int min_wakeup_granularity_ns; /* 0 usecs */
250static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
247static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
248static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
249static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
251#endif 251#endif
252 252
253static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
254 { 254 {
255 .ctl_name = CTL_UNNUMBERED,
256 .procname = "sched_child_runs_first", 255 .procname = "sched_child_runs_first",
257 .data = &sysctl_sched_child_runs_first, 256 .data = &sysctl_sched_child_runs_first,
258 .maxlen = sizeof(unsigned int), 257 .maxlen = sizeof(unsigned int),
259 .mode = 0644, 258 .mode = 0644,
260 .proc_handler = &proc_dointvec, 259 .proc_handler = proc_dointvec,
261 }, 260 },
262#ifdef CONFIG_SCHED_DEBUG 261#ifdef CONFIG_SCHED_DEBUG
263 { 262 {
264 .ctl_name = CTL_UNNUMBERED,
265 .procname = "sched_min_granularity_ns", 263 .procname = "sched_min_granularity_ns",
266 .data = &sysctl_sched_min_granularity, 264 .data = &sysctl_sched_min_granularity,
267 .maxlen = sizeof(unsigned int), 265 .maxlen = sizeof(unsigned int),
268 .mode = 0644, 266 .mode = 0644,
269 .proc_handler = &sched_nr_latency_handler, 267 .proc_handler = sched_proc_update_handler,
270 .strategy = &sysctl_intvec,
271 .extra1 = &min_sched_granularity_ns, 268 .extra1 = &min_sched_granularity_ns,
272 .extra2 = &max_sched_granularity_ns, 269 .extra2 = &max_sched_granularity_ns,
273 }, 270 },
274 { 271 {
275 .ctl_name = CTL_UNNUMBERED,
276 .procname = "sched_latency_ns", 272 .procname = "sched_latency_ns",
277 .data = &sysctl_sched_latency, 273 .data = &sysctl_sched_latency,
278 .maxlen = sizeof(unsigned int), 274 .maxlen = sizeof(unsigned int),
279 .mode = 0644, 275 .mode = 0644,
280 .proc_handler = &sched_nr_latency_handler, 276 .proc_handler = sched_proc_update_handler,
281 .strategy = &sysctl_intvec,
282 .extra1 = &min_sched_granularity_ns, 277 .extra1 = &min_sched_granularity_ns,
283 .extra2 = &max_sched_granularity_ns, 278 .extra2 = &max_sched_granularity_ns,
284 }, 279 },
285 { 280 {
286 .ctl_name = CTL_UNNUMBERED,
287 .procname = "sched_wakeup_granularity_ns", 281 .procname = "sched_wakeup_granularity_ns",
288 .data = &sysctl_sched_wakeup_granularity, 282 .data = &sysctl_sched_wakeup_granularity,
289 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
290 .mode = 0644, 284 .mode = 0644,
291 .proc_handler = &proc_dointvec_minmax, 285 .proc_handler = sched_proc_update_handler,
292 .strategy = &sysctl_intvec,
293 .extra1 = &min_wakeup_granularity_ns, 286 .extra1 = &min_wakeup_granularity_ns,
294 .extra2 = &max_wakeup_granularity_ns, 287 .extra2 = &max_wakeup_granularity_ns,
295 }, 288 },
296 { 289 {
297 .ctl_name = CTL_UNNUMBERED,
298 .procname = "sched_shares_ratelimit", 290 .procname = "sched_shares_ratelimit",
299 .data = &sysctl_sched_shares_ratelimit, 291 .data = &sysctl_sched_shares_ratelimit,
300 .maxlen = sizeof(unsigned int), 292 .maxlen = sizeof(unsigned int),
301 .mode = 0644, 293 .mode = 0644,
302 .proc_handler = &proc_dointvec, 294 .proc_handler = sched_proc_update_handler,
295 .extra1 = &min_sched_shares_ratelimit,
296 .extra2 = &max_sched_shares_ratelimit,
303 }, 297 },
304 { 298 {
305 .ctl_name = CTL_UNNUMBERED, 299 .procname = "sched_tunable_scaling",
306 .procname = "sched_shares_thresh", 300 .data = &sysctl_sched_tunable_scaling,
307 .data = &sysctl_sched_shares_thresh, 301 .maxlen = sizeof(enum sched_tunable_scaling),
308 .maxlen = sizeof(unsigned int),
309 .mode = 0644, 302 .mode = 0644,
310 .proc_handler = &proc_dointvec_minmax, 303 .proc_handler = sched_proc_update_handler,
311 .strategy = &sysctl_intvec, 304 .extra1 = &min_sched_tunable_scaling,
312 .extra1 = &zero, 305 .extra2 = &max_sched_tunable_scaling,
313 }, 306 },
314 { 307 {
315 .ctl_name = CTL_UNNUMBERED, 308 .procname = "sched_shares_thresh",
316 .procname = "sched_features", 309 .data = &sysctl_sched_shares_thresh,
317 .data = &sysctl_sched_features,
318 .maxlen = sizeof(unsigned int), 310 .maxlen = sizeof(unsigned int),
319 .mode = 0644, 311 .mode = 0644,
320 .proc_handler = &proc_dointvec, 312 .proc_handler = proc_dointvec_minmax,
313 .extra1 = &zero,
321 }, 314 },
322 { 315 {
323 .ctl_name = CTL_UNNUMBERED,
324 .procname = "sched_migration_cost", 316 .procname = "sched_migration_cost",
325 .data = &sysctl_sched_migration_cost, 317 .data = &sysctl_sched_migration_cost,
326 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
327 .mode = 0644, 319 .mode = 0644,
328 .proc_handler = &proc_dointvec, 320 .proc_handler = proc_dointvec,
329 }, 321 },
330 { 322 {
331 .ctl_name = CTL_UNNUMBERED,
332 .procname = "sched_nr_migrate", 323 .procname = "sched_nr_migrate",
333 .data = &sysctl_sched_nr_migrate, 324 .data = &sysctl_sched_nr_migrate,
334 .maxlen = sizeof(unsigned int), 325 .maxlen = sizeof(unsigned int),
335 .mode = 0644, 326 .mode = 0644,
336 .proc_handler = &proc_dointvec, 327 .proc_handler = proc_dointvec,
337 }, 328 },
338 { 329 {
339 .ctl_name = CTL_UNNUMBERED,
340 .procname = "sched_time_avg", 330 .procname = "sched_time_avg",
341 .data = &sysctl_sched_time_avg, 331 .data = &sysctl_sched_time_avg,
342 .maxlen = sizeof(unsigned int), 332 .maxlen = sizeof(unsigned int),
343 .mode = 0644, 333 .mode = 0644,
344 .proc_handler = &proc_dointvec, 334 .proc_handler = proc_dointvec,
345 }, 335 },
346 { 336 {
347 .ctl_name = CTL_UNNUMBERED,
348 .procname = "timer_migration", 337 .procname = "timer_migration",
349 .data = &sysctl_timer_migration, 338 .data = &sysctl_timer_migration,
350 .maxlen = sizeof(unsigned int), 339 .maxlen = sizeof(unsigned int),
351 .mode = 0644, 340 .mode = 0644,
352 .proc_handler = &proc_dointvec_minmax, 341 .proc_handler = proc_dointvec_minmax,
353 .strategy = &sysctl_intvec,
354 .extra1 = &zero, 342 .extra1 = &zero,
355 .extra2 = &one, 343 .extra2 = &one,
356 }, 344 },
357#endif 345#endif
358 { 346 {
359 .ctl_name = CTL_UNNUMBERED,
360 .procname = "sched_rt_period_us", 347 .procname = "sched_rt_period_us",
361 .data = &sysctl_sched_rt_period, 348 .data = &sysctl_sched_rt_period,
362 .maxlen = sizeof(unsigned int), 349 .maxlen = sizeof(unsigned int),
363 .mode = 0644, 350 .mode = 0644,
364 .proc_handler = &sched_rt_handler, 351 .proc_handler = sched_rt_handler,
365 }, 352 },
366 { 353 {
367 .ctl_name = CTL_UNNUMBERED,
368 .procname = "sched_rt_runtime_us", 354 .procname = "sched_rt_runtime_us",
369 .data = &sysctl_sched_rt_runtime, 355 .data = &sysctl_sched_rt_runtime,
370 .maxlen = sizeof(int), 356 .maxlen = sizeof(int),
371 .mode = 0644, 357 .mode = 0644,
372 .proc_handler = &sched_rt_handler, 358 .proc_handler = sched_rt_handler,
373 }, 359 },
374 { 360 {
375 .ctl_name = CTL_UNNUMBERED,
376 .procname = "sched_compat_yield", 361 .procname = "sched_compat_yield",
377 .data = &sysctl_sched_compat_yield, 362 .data = &sysctl_sched_compat_yield,
378 .maxlen = sizeof(unsigned int), 363 .maxlen = sizeof(unsigned int),
379 .mode = 0644, 364 .mode = 0644,
380 .proc_handler = &proc_dointvec, 365 .proc_handler = proc_dointvec,
381 }, 366 },
382#ifdef CONFIG_PROVE_LOCKING 367#ifdef CONFIG_PROVE_LOCKING
383 { 368 {
384 .ctl_name = CTL_UNNUMBERED,
385 .procname = "prove_locking", 369 .procname = "prove_locking",
386 .data = &prove_locking, 370 .data = &prove_locking,
387 .maxlen = sizeof(int), 371 .maxlen = sizeof(int),
388 .mode = 0644, 372 .mode = 0644,
389 .proc_handler = &proc_dointvec, 373 .proc_handler = proc_dointvec,
390 }, 374 },
391#endif 375#endif
392#ifdef CONFIG_LOCK_STAT 376#ifdef CONFIG_LOCK_STAT
393 { 377 {
394 .ctl_name = CTL_UNNUMBERED,
395 .procname = "lock_stat", 378 .procname = "lock_stat",
396 .data = &lock_stat, 379 .data = &lock_stat,
397 .maxlen = sizeof(int), 380 .maxlen = sizeof(int),
398 .mode = 0644, 381 .mode = 0644,
399 .proc_handler = &proc_dointvec, 382 .proc_handler = proc_dointvec,
400 }, 383 },
401#endif 384#endif
402 { 385 {
403 .ctl_name = KERN_PANIC,
404 .procname = "panic", 386 .procname = "panic",
405 .data = &panic_timeout, 387 .data = &panic_timeout,
406 .maxlen = sizeof(int), 388 .maxlen = sizeof(int),
407 .mode = 0644, 389 .mode = 0644,
408 .proc_handler = &proc_dointvec, 390 .proc_handler = proc_dointvec,
409 }, 391 },
410 { 392 {
411 .ctl_name = KERN_CORE_USES_PID,
412 .procname = "core_uses_pid", 393 .procname = "core_uses_pid",
413 .data = &core_uses_pid, 394 .data = &core_uses_pid,
414 .maxlen = sizeof(int), 395 .maxlen = sizeof(int),
415 .mode = 0644, 396 .mode = 0644,
416 .proc_handler = &proc_dointvec, 397 .proc_handler = proc_dointvec,
417 }, 398 },
418 { 399 {
419 .ctl_name = KERN_CORE_PATTERN,
420 .procname = "core_pattern", 400 .procname = "core_pattern",
421 .data = core_pattern, 401 .data = core_pattern,
422 .maxlen = CORENAME_MAX_SIZE, 402 .maxlen = CORENAME_MAX_SIZE,
423 .mode = 0644, 403 .mode = 0644,
424 .proc_handler = &proc_dostring, 404 .proc_handler = proc_dostring,
425 .strategy = &sysctl_string, 405 },
406 {
407 .procname = "core_pipe_limit",
408 .data = &core_pipe_limit,
409 .maxlen = sizeof(unsigned int),
410 .mode = 0644,
411 .proc_handler = proc_dointvec,
426 }, 412 },
427#ifdef CONFIG_PROC_SYSCTL 413#ifdef CONFIG_PROC_SYSCTL
428 { 414 {
429 .procname = "tainted", 415 .procname = "tainted",
430 .maxlen = sizeof(long), 416 .maxlen = sizeof(long),
431 .mode = 0644, 417 .mode = 0644,
432 .proc_handler = &proc_taint, 418 .proc_handler = proc_taint,
433 }, 419 },
434#endif 420#endif
435#ifdef CONFIG_LATENCYTOP 421#ifdef CONFIG_LATENCYTOP
@@ -438,181 +424,160 @@ static struct ctl_table kern_table[] = {
438 .data = &latencytop_enabled, 424 .data = &latencytop_enabled,
439 .maxlen = sizeof(int), 425 .maxlen = sizeof(int),
440 .mode = 0644, 426 .mode = 0644,
441 .proc_handler = &proc_dointvec, 427 .proc_handler = proc_dointvec,
442 }, 428 },
443#endif 429#endif
444#ifdef CONFIG_BLK_DEV_INITRD 430#ifdef CONFIG_BLK_DEV_INITRD
445 { 431 {
446 .ctl_name = KERN_REALROOTDEV,
447 .procname = "real-root-dev", 432 .procname = "real-root-dev",
448 .data = &real_root_dev, 433 .data = &real_root_dev,
449 .maxlen = sizeof(int), 434 .maxlen = sizeof(int),
450 .mode = 0644, 435 .mode = 0644,
451 .proc_handler = &proc_dointvec, 436 .proc_handler = proc_dointvec,
452 }, 437 },
453#endif 438#endif
454 { 439 {
455 .ctl_name = CTL_UNNUMBERED,
456 .procname = "print-fatal-signals", 440 .procname = "print-fatal-signals",
457 .data = &print_fatal_signals, 441 .data = &print_fatal_signals,
458 .maxlen = sizeof(int), 442 .maxlen = sizeof(int),
459 .mode = 0644, 443 .mode = 0644,
460 .proc_handler = &proc_dointvec, 444 .proc_handler = proc_dointvec,
461 }, 445 },
462#ifdef CONFIG_SPARC 446#ifdef CONFIG_SPARC
463 { 447 {
464 .ctl_name = KERN_SPARC_REBOOT,
465 .procname = "reboot-cmd", 448 .procname = "reboot-cmd",
466 .data = reboot_command, 449 .data = reboot_command,
467 .maxlen = 256, 450 .maxlen = 256,
468 .mode = 0644, 451 .mode = 0644,
469 .proc_handler = &proc_dostring, 452 .proc_handler = proc_dostring,
470 .strategy = &sysctl_string,
471 }, 453 },
472 { 454 {
473 .ctl_name = KERN_SPARC_STOP_A,
474 .procname = "stop-a", 455 .procname = "stop-a",
475 .data = &stop_a_enabled, 456 .data = &stop_a_enabled,
476 .maxlen = sizeof (int), 457 .maxlen = sizeof (int),
477 .mode = 0644, 458 .mode = 0644,
478 .proc_handler = &proc_dointvec, 459 .proc_handler = proc_dointvec,
479 }, 460 },
480 { 461 {
481 .ctl_name = KERN_SPARC_SCONS_PWROFF,
482 .procname = "scons-poweroff", 462 .procname = "scons-poweroff",
483 .data = &scons_pwroff, 463 .data = &scons_pwroff,
484 .maxlen = sizeof (int), 464 .maxlen = sizeof (int),
485 .mode = 0644, 465 .mode = 0644,
486 .proc_handler = &proc_dointvec, 466 .proc_handler = proc_dointvec,
487 }, 467 },
488#endif 468#endif
489#ifdef CONFIG_SPARC64 469#ifdef CONFIG_SPARC64
490 { 470 {
491 .ctl_name = CTL_UNNUMBERED,
492 .procname = "tsb-ratio", 471 .procname = "tsb-ratio",
493 .data = &sysctl_tsb_ratio, 472 .data = &sysctl_tsb_ratio,
494 .maxlen = sizeof (int), 473 .maxlen = sizeof (int),
495 .mode = 0644, 474 .mode = 0644,
496 .proc_handler = &proc_dointvec, 475 .proc_handler = proc_dointvec,
497 }, 476 },
498#endif 477#endif
499#ifdef __hppa__ 478#ifdef __hppa__
500 { 479 {
501 .ctl_name = KERN_HPPA_PWRSW,
502 .procname = "soft-power", 480 .procname = "soft-power",
503 .data = &pwrsw_enabled, 481 .data = &pwrsw_enabled,
504 .maxlen = sizeof (int), 482 .maxlen = sizeof (int),
505 .mode = 0644, 483 .mode = 0644,
506 .proc_handler = &proc_dointvec, 484 .proc_handler = proc_dointvec,
507 }, 485 },
508 { 486 {
509 .ctl_name = KERN_HPPA_UNALIGNED,
510 .procname = "unaligned-trap", 487 .procname = "unaligned-trap",
511 .data = &unaligned_enabled, 488 .data = &unaligned_enabled,
512 .maxlen = sizeof (int), 489 .maxlen = sizeof (int),
513 .mode = 0644, 490 .mode = 0644,
514 .proc_handler = &proc_dointvec, 491 .proc_handler = proc_dointvec,
515 }, 492 },
516#endif 493#endif
517 { 494 {
518 .ctl_name = KERN_CTLALTDEL,
519 .procname = "ctrl-alt-del", 495 .procname = "ctrl-alt-del",
520 .data = &C_A_D, 496 .data = &C_A_D,
521 .maxlen = sizeof(int), 497 .maxlen = sizeof(int),
522 .mode = 0644, 498 .mode = 0644,
523 .proc_handler = &proc_dointvec, 499 .proc_handler = proc_dointvec,
524 }, 500 },
525#ifdef CONFIG_FUNCTION_TRACER 501#ifdef CONFIG_FUNCTION_TRACER
526 { 502 {
527 .ctl_name = CTL_UNNUMBERED,
528 .procname = "ftrace_enabled", 503 .procname = "ftrace_enabled",
529 .data = &ftrace_enabled, 504 .data = &ftrace_enabled,
530 .maxlen = sizeof(int), 505 .maxlen = sizeof(int),
531 .mode = 0644, 506 .mode = 0644,
532 .proc_handler = &ftrace_enable_sysctl, 507 .proc_handler = ftrace_enable_sysctl,
533 }, 508 },
534#endif 509#endif
535#ifdef CONFIG_STACK_TRACER 510#ifdef CONFIG_STACK_TRACER
536 { 511 {
537 .ctl_name = CTL_UNNUMBERED,
538 .procname = "stack_tracer_enabled", 512 .procname = "stack_tracer_enabled",
539 .data = &stack_tracer_enabled, 513 .data = &stack_tracer_enabled,
540 .maxlen = sizeof(int), 514 .maxlen = sizeof(int),
541 .mode = 0644, 515 .mode = 0644,
542 .proc_handler = &stack_trace_sysctl, 516 .proc_handler = stack_trace_sysctl,
543 }, 517 },
544#endif 518#endif
545#ifdef CONFIG_TRACING 519#ifdef CONFIG_TRACING
546 { 520 {
547 .ctl_name = CTL_UNNUMBERED,
548 .procname = "ftrace_dump_on_oops", 521 .procname = "ftrace_dump_on_oops",
549 .data = &ftrace_dump_on_oops, 522 .data = &ftrace_dump_on_oops,
550 .maxlen = sizeof(int), 523 .maxlen = sizeof(int),
551 .mode = 0644, 524 .mode = 0644,
552 .proc_handler = &proc_dointvec, 525 .proc_handler = proc_dointvec,
553 }, 526 },
554#endif 527#endif
555#ifdef CONFIG_MODULES 528#ifdef CONFIG_MODULES
556 { 529 {
557 .ctl_name = KERN_MODPROBE,
558 .procname = "modprobe", 530 .procname = "modprobe",
559 .data = &modprobe_path, 531 .data = &modprobe_path,
560 .maxlen = KMOD_PATH_LEN, 532 .maxlen = KMOD_PATH_LEN,
561 .mode = 0644, 533 .mode = 0644,
562 .proc_handler = &proc_dostring, 534 .proc_handler = proc_dostring,
563 .strategy = &sysctl_string,
564 }, 535 },
565 { 536 {
566 .ctl_name = CTL_UNNUMBERED,
567 .procname = "modules_disabled", 537 .procname = "modules_disabled",
568 .data = &modules_disabled, 538 .data = &modules_disabled,
569 .maxlen = sizeof(int), 539 .maxlen = sizeof(int),
570 .mode = 0644, 540 .mode = 0644,
571 /* only handle a transition from default "0" to "1" */ 541 /* only handle a transition from default "0" to "1" */
572 .proc_handler = &proc_dointvec_minmax, 542 .proc_handler = proc_dointvec_minmax,
573 .extra1 = &one, 543 .extra1 = &one,
574 .extra2 = &one, 544 .extra2 = &one,
575 }, 545 },
576#endif 546#endif
577#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 547#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
578 { 548 {
579 .ctl_name = KERN_HOTPLUG,
580 .procname = "hotplug", 549 .procname = "hotplug",
581 .data = &uevent_helper, 550 .data = &uevent_helper,
582 .maxlen = UEVENT_HELPER_PATH_LEN, 551 .maxlen = UEVENT_HELPER_PATH_LEN,
583 .mode = 0644, 552 .mode = 0644,
584 .proc_handler = &proc_dostring, 553 .proc_handler = proc_dostring,
585 .strategy = &sysctl_string,
586 }, 554 },
587#endif 555#endif
588#ifdef CONFIG_CHR_DEV_SG 556#ifdef CONFIG_CHR_DEV_SG
589 { 557 {
590 .ctl_name = KERN_SG_BIG_BUFF,
591 .procname = "sg-big-buff", 558 .procname = "sg-big-buff",
592 .data = &sg_big_buff, 559 .data = &sg_big_buff,
593 .maxlen = sizeof (int), 560 .maxlen = sizeof (int),
594 .mode = 0444, 561 .mode = 0444,
595 .proc_handler = &proc_dointvec, 562 .proc_handler = proc_dointvec,
596 }, 563 },
597#endif 564#endif
598#ifdef CONFIG_BSD_PROCESS_ACCT 565#ifdef CONFIG_BSD_PROCESS_ACCT
599 { 566 {
600 .ctl_name = KERN_ACCT,
601 .procname = "acct", 567 .procname = "acct",
602 .data = &acct_parm, 568 .data = &acct_parm,
603 .maxlen = 3*sizeof(int), 569 .maxlen = 3*sizeof(int),
604 .mode = 0644, 570 .mode = 0644,
605 .proc_handler = &proc_dointvec, 571 .proc_handler = proc_dointvec,
606 }, 572 },
607#endif 573#endif
608#ifdef CONFIG_MAGIC_SYSRQ 574#ifdef CONFIG_MAGIC_SYSRQ
609 { 575 {
610 .ctl_name = KERN_SYSRQ,
611 .procname = "sysrq", 576 .procname = "sysrq",
612 .data = &__sysrq_enabled, 577 .data = &__sysrq_enabled,
613 .maxlen = sizeof (int), 578 .maxlen = sizeof (int),
614 .mode = 0644, 579 .mode = 0644,
615 .proc_handler = &proc_dointvec, 580 .proc_handler = proc_dointvec,
616 }, 581 },
617#endif 582#endif
618#ifdef CONFIG_PROC_SYSCTL 583#ifdef CONFIG_PROC_SYSCTL
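Several entries above (e.g. timer_migration and modules_disabled) rely on the same clamping idiom: with proc_dointvec_minmax as the handler, accepted values are restricted to the range [*extra1, *extra2]. A sketch with illustrative names:

#include <linux/sysctl.h>

static int my_knob;
static int my_knob_min;		/* 0: lower bound */
static int my_knob_max = 100;	/* upper bound */

static struct ctl_table my_entries[] = {
	{
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &my_knob_min,
		.extra2		= &my_knob_max,
	},
	{ }
};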
@@ -621,215 +586,188 @@ static struct ctl_table kern_table[] = {
621 .data = NULL, 586 .data = NULL,
622 .maxlen = sizeof (int), 587 .maxlen = sizeof (int),
623 .mode = 0600, 588 .mode = 0600,
624 .proc_handler = &proc_do_cad_pid, 589 .proc_handler = proc_do_cad_pid,
625 }, 590 },
626#endif 591#endif
627 { 592 {
628 .ctl_name = KERN_MAX_THREADS,
629 .procname = "threads-max", 593 .procname = "threads-max",
630 .data = &max_threads, 594 .data = &max_threads,
631 .maxlen = sizeof(int), 595 .maxlen = sizeof(int),
632 .mode = 0644, 596 .mode = 0644,
633 .proc_handler = &proc_dointvec, 597 .proc_handler = proc_dointvec,
634 }, 598 },
635 { 599 {
636 .ctl_name = KERN_RANDOM,
637 .procname = "random", 600 .procname = "random",
638 .mode = 0555, 601 .mode = 0555,
639 .child = random_table, 602 .child = random_table,
640 }, 603 },
641 { 604 {
642 .ctl_name = KERN_OVERFLOWUID,
643 .procname = "overflowuid", 605 .procname = "overflowuid",
644 .data = &overflowuid, 606 .data = &overflowuid,
645 .maxlen = sizeof(int), 607 .maxlen = sizeof(int),
646 .mode = 0644, 608 .mode = 0644,
647 .proc_handler = &proc_dointvec_minmax, 609 .proc_handler = proc_dointvec_minmax,
648 .strategy = &sysctl_intvec,
649 .extra1 = &minolduid, 610 .extra1 = &minolduid,
650 .extra2 = &maxolduid, 611 .extra2 = &maxolduid,
651 }, 612 },
652 { 613 {
653 .ctl_name = KERN_OVERFLOWGID,
654 .procname = "overflowgid", 614 .procname = "overflowgid",
655 .data = &overflowgid, 615 .data = &overflowgid,
656 .maxlen = sizeof(int), 616 .maxlen = sizeof(int),
657 .mode = 0644, 617 .mode = 0644,
658 .proc_handler = &proc_dointvec_minmax, 618 .proc_handler = proc_dointvec_minmax,
659 .strategy = &sysctl_intvec,
660 .extra1 = &minolduid, 619 .extra1 = &minolduid,
661 .extra2 = &maxolduid, 620 .extra2 = &maxolduid,
662 }, 621 },
663#ifdef CONFIG_S390 622#ifdef CONFIG_S390
664#ifdef CONFIG_MATHEMU 623#ifdef CONFIG_MATHEMU
665 { 624 {
666 .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
667 .procname = "ieee_emulation_warnings", 625 .procname = "ieee_emulation_warnings",
668 .data = &sysctl_ieee_emulation_warnings, 626 .data = &sysctl_ieee_emulation_warnings,
669 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
670 .mode = 0644, 628 .mode = 0644,
671 .proc_handler = &proc_dointvec, 629 .proc_handler = proc_dointvec,
672 }, 630 },
673#endif 631#endif
674 { 632 {
675 .ctl_name = KERN_S390_USER_DEBUG_LOGGING,
676 .procname = "userprocess_debug", 633 .procname = "userprocess_debug",
677 .data = &sysctl_userprocess_debug, 634 .data = &sysctl_userprocess_debug,
678 .maxlen = sizeof(int), 635 .maxlen = sizeof(int),
679 .mode = 0644, 636 .mode = 0644,
680 .proc_handler = &proc_dointvec, 637 .proc_handler = proc_dointvec,
681 }, 638 },
682#endif 639#endif
683 { 640 {
684 .ctl_name = KERN_PIDMAX,
685 .procname = "pid_max", 641 .procname = "pid_max",
686 .data = &pid_max, 642 .data = &pid_max,
687 .maxlen = sizeof (int), 643 .maxlen = sizeof (int),
688 .mode = 0644, 644 .mode = 0644,
689 .proc_handler = &proc_dointvec_minmax, 645 .proc_handler = proc_dointvec_minmax,
690 .strategy = sysctl_intvec,
691 .extra1 = &pid_max_min, 646 .extra1 = &pid_max_min,
692 .extra2 = &pid_max_max, 647 .extra2 = &pid_max_max,
693 }, 648 },
694 { 649 {
695 .ctl_name = KERN_PANIC_ON_OOPS,
696 .procname = "panic_on_oops", 650 .procname = "panic_on_oops",
697 .data = &panic_on_oops, 651 .data = &panic_on_oops,
698 .maxlen = sizeof(int), 652 .maxlen = sizeof(int),
699 .mode = 0644, 653 .mode = 0644,
700 .proc_handler = &proc_dointvec, 654 .proc_handler = proc_dointvec,
701 }, 655 },
702#if defined CONFIG_PRINTK 656#if defined CONFIG_PRINTK
703 { 657 {
704 .ctl_name = KERN_PRINTK,
705 .procname = "printk", 658 .procname = "printk",
706 .data = &console_loglevel, 659 .data = &console_loglevel,
707 .maxlen = 4*sizeof(int), 660 .maxlen = 4*sizeof(int),
708 .mode = 0644, 661 .mode = 0644,
709 .proc_handler = &proc_dointvec, 662 .proc_handler = proc_dointvec,
710 }, 663 },
711 { 664 {
712 .ctl_name = KERN_PRINTK_RATELIMIT,
713 .procname = "printk_ratelimit", 665 .procname = "printk_ratelimit",
714 .data = &printk_ratelimit_state.interval, 666 .data = &printk_ratelimit_state.interval,
715 .maxlen = sizeof(int), 667 .maxlen = sizeof(int),
716 .mode = 0644, 668 .mode = 0644,
717 .proc_handler = &proc_dointvec_jiffies, 669 .proc_handler = proc_dointvec_jiffies,
718 .strategy = &sysctl_jiffies,
719 }, 670 },
720 { 671 {
721 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
722 .procname = "printk_ratelimit_burst", 672 .procname = "printk_ratelimit_burst",
723 .data = &printk_ratelimit_state.burst, 673 .data = &printk_ratelimit_state.burst,
724 .maxlen = sizeof(int), 674 .maxlen = sizeof(int),
725 .mode = 0644, 675 .mode = 0644,
726 .proc_handler = &proc_dointvec, 676 .proc_handler = proc_dointvec,
727 }, 677 },
728 { 678 {
729 .ctl_name = CTL_UNNUMBERED,
730 .procname = "printk_delay", 679 .procname = "printk_delay",
731 .data = &printk_delay_msec, 680 .data = &printk_delay_msec,
732 .maxlen = sizeof(int), 681 .maxlen = sizeof(int),
733 .mode = 0644, 682 .mode = 0644,
734 .proc_handler = &proc_dointvec_minmax, 683 .proc_handler = proc_dointvec_minmax,
735 .strategy = &sysctl_intvec,
736 .extra1 = &zero, 684 .extra1 = &zero,
737 .extra2 = &ten_thousand, 685 .extra2 = &ten_thousand,
738 }, 686 },
739#endif 687#endif
740 { 688 {
741 .ctl_name = KERN_NGROUPS_MAX,
742 .procname = "ngroups_max", 689 .procname = "ngroups_max",
743 .data = &ngroups_max, 690 .data = &ngroups_max,
744 .maxlen = sizeof (int), 691 .maxlen = sizeof (int),
745 .mode = 0444, 692 .mode = 0444,
746 .proc_handler = &proc_dointvec, 693 .proc_handler = proc_dointvec,
747 }, 694 },
748#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 695#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
749 { 696 {
750 .ctl_name = KERN_UNKNOWN_NMI_PANIC,
751 .procname = "unknown_nmi_panic", 697 .procname = "unknown_nmi_panic",
752 .data = &unknown_nmi_panic, 698 .data = &unknown_nmi_panic,
753 .maxlen = sizeof (int), 699 .maxlen = sizeof (int),
754 .mode = 0644, 700 .mode = 0644,
755 .proc_handler = &proc_dointvec, 701 .proc_handler = proc_dointvec,
756 }, 702 },
757 { 703 {
758 .procname = "nmi_watchdog", 704 .procname = "nmi_watchdog",
759 .data = &nmi_watchdog_enabled, 705 .data = &nmi_watchdog_enabled,
760 .maxlen = sizeof (int), 706 .maxlen = sizeof (int),
761 .mode = 0644, 707 .mode = 0644,
762 .proc_handler = &proc_nmi_enabled, 708 .proc_handler = proc_nmi_enabled,
763 }, 709 },
764#endif 710#endif
765#if defined(CONFIG_X86) 711#if defined(CONFIG_X86)
766 { 712 {
767 .ctl_name = KERN_PANIC_ON_NMI,
768 .procname = "panic_on_unrecovered_nmi", 713 .procname = "panic_on_unrecovered_nmi",
769 .data = &panic_on_unrecovered_nmi, 714 .data = &panic_on_unrecovered_nmi,
770 .maxlen = sizeof(int), 715 .maxlen = sizeof(int),
771 .mode = 0644, 716 .mode = 0644,
772 .proc_handler = &proc_dointvec, 717 .proc_handler = proc_dointvec,
773 }, 718 },
774 { 719 {
775 .ctl_name = CTL_UNNUMBERED,
776 .procname = "panic_on_io_nmi", 720 .procname = "panic_on_io_nmi",
777 .data = &panic_on_io_nmi, 721 .data = &panic_on_io_nmi,
778 .maxlen = sizeof(int), 722 .maxlen = sizeof(int),
779 .mode = 0644, 723 .mode = 0644,
780 .proc_handler = &proc_dointvec, 724 .proc_handler = proc_dointvec,
781 }, 725 },
782 { 726 {
783 .ctl_name = KERN_BOOTLOADER_TYPE,
784 .procname = "bootloader_type", 727 .procname = "bootloader_type",
785 .data = &bootloader_type, 728 .data = &bootloader_type,
786 .maxlen = sizeof (int), 729 .maxlen = sizeof (int),
787 .mode = 0444, 730 .mode = 0444,
788 .proc_handler = &proc_dointvec, 731 .proc_handler = proc_dointvec,
789 }, 732 },
790 { 733 {
791 .ctl_name = CTL_UNNUMBERED,
792 .procname = "bootloader_version", 734 .procname = "bootloader_version",
793 .data = &bootloader_version, 735 .data = &bootloader_version,
794 .maxlen = sizeof (int), 736 .maxlen = sizeof (int),
795 .mode = 0444, 737 .mode = 0444,
796 .proc_handler = &proc_dointvec, 738 .proc_handler = proc_dointvec,
797 }, 739 },
798 { 740 {
799 .ctl_name = CTL_UNNUMBERED,
800 .procname = "kstack_depth_to_print", 741 .procname = "kstack_depth_to_print",
801 .data = &kstack_depth_to_print, 742 .data = &kstack_depth_to_print,
802 .maxlen = sizeof(int), 743 .maxlen = sizeof(int),
803 .mode = 0644, 744 .mode = 0644,
804 .proc_handler = &proc_dointvec, 745 .proc_handler = proc_dointvec,
805 }, 746 },
806 { 747 {
807 .ctl_name = CTL_UNNUMBERED,
808 .procname = "io_delay_type", 748 .procname = "io_delay_type",
809 .data = &io_delay_type, 749 .data = &io_delay_type,
810 .maxlen = sizeof(int), 750 .maxlen = sizeof(int),
811 .mode = 0644, 751 .mode = 0644,
812 .proc_handler = &proc_dointvec, 752 .proc_handler = proc_dointvec,
813 }, 753 },
814#endif 754#endif
815#if defined(CONFIG_MMU) 755#if defined(CONFIG_MMU)
816 { 756 {
817 .ctl_name = KERN_RANDOMIZE,
818 .procname = "randomize_va_space", 757 .procname = "randomize_va_space",
819 .data = &randomize_va_space, 758 .data = &randomize_va_space,
820 .maxlen = sizeof(int), 759 .maxlen = sizeof(int),
821 .mode = 0644, 760 .mode = 0644,
822 .proc_handler = &proc_dointvec, 761 .proc_handler = proc_dointvec,
823 }, 762 },
824#endif 763#endif
825#if defined(CONFIG_S390) && defined(CONFIG_SMP) 764#if defined(CONFIG_S390) && defined(CONFIG_SMP)
826 { 765 {
827 .ctl_name = KERN_SPIN_RETRY,
828 .procname = "spin_retry", 766 .procname = "spin_retry",
829 .data = &spin_retry, 767 .data = &spin_retry,
830 .maxlen = sizeof (int), 768 .maxlen = sizeof (int),
831 .mode = 0644, 769 .mode = 0644,
832 .proc_handler = &proc_dointvec, 770 .proc_handler = proc_dointvec,
833 }, 771 },
834#endif 772#endif
835#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) 773#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
@@ -838,123 +776,104 @@ static struct ctl_table kern_table[] = {
838 .data = &acpi_realmode_flags, 776 .data = &acpi_realmode_flags,
839 .maxlen = sizeof (unsigned long), 777 .maxlen = sizeof (unsigned long),
840 .mode = 0644, 778 .mode = 0644,
841 .proc_handler = &proc_doulongvec_minmax, 779 .proc_handler = proc_doulongvec_minmax,
842 }, 780 },
843#endif 781#endif
844#ifdef CONFIG_IA64 782#ifdef CONFIG_IA64
845 { 783 {
846 .ctl_name = KERN_IA64_UNALIGNED,
847 .procname = "ignore-unaligned-usertrap", 784 .procname = "ignore-unaligned-usertrap",
848 .data = &no_unaligned_warning, 785 .data = &no_unaligned_warning,
849 .maxlen = sizeof (int), 786 .maxlen = sizeof (int),
850 .mode = 0644, 787 .mode = 0644,
851 .proc_handler = &proc_dointvec, 788 .proc_handler = proc_dointvec,
852 }, 789 },
853 { 790 {
854 .ctl_name = CTL_UNNUMBERED,
855 .procname = "unaligned-dump-stack", 791 .procname = "unaligned-dump-stack",
856 .data = &unaligned_dump_stack, 792 .data = &unaligned_dump_stack,
857 .maxlen = sizeof (int), 793 .maxlen = sizeof (int),
858 .mode = 0644, 794 .mode = 0644,
859 .proc_handler = &proc_dointvec, 795 .proc_handler = proc_dointvec,
860 }, 796 },
861#endif 797#endif
862#ifdef CONFIG_DETECT_SOFTLOCKUP 798#ifdef CONFIG_DETECT_SOFTLOCKUP
863 { 799 {
864 .ctl_name = CTL_UNNUMBERED,
865 .procname = "softlockup_panic", 800 .procname = "softlockup_panic",
866 .data = &softlockup_panic, 801 .data = &softlockup_panic,
867 .maxlen = sizeof(int), 802 .maxlen = sizeof(int),
868 .mode = 0644, 803 .mode = 0644,
869 .proc_handler = &proc_dointvec_minmax, 804 .proc_handler = proc_dointvec_minmax,
870 .strategy = &sysctl_intvec,
871 .extra1 = &zero, 805 .extra1 = &zero,
872 .extra2 = &one, 806 .extra2 = &one,
873 }, 807 },
874 { 808 {
875 .ctl_name = CTL_UNNUMBERED,
876 .procname = "softlockup_thresh", 809 .procname = "softlockup_thresh",
877 .data = &softlockup_thresh, 810 .data = &softlockup_thresh,
878 .maxlen = sizeof(int), 811 .maxlen = sizeof(int),
879 .mode = 0644, 812 .mode = 0644,
880 .proc_handler = &proc_dosoftlockup_thresh, 813 .proc_handler = proc_dosoftlockup_thresh,
881 .strategy = &sysctl_intvec,
882 .extra1 = &neg_one, 814 .extra1 = &neg_one,
883 .extra2 = &sixty, 815 .extra2 = &sixty,
884 }, 816 },
885#endif 817#endif
886#ifdef CONFIG_DETECT_HUNG_TASK 818#ifdef CONFIG_DETECT_HUNG_TASK
887 { 819 {
888 .ctl_name = CTL_UNNUMBERED,
889 .procname = "hung_task_panic", 820 .procname = "hung_task_panic",
890 .data = &sysctl_hung_task_panic, 821 .data = &sysctl_hung_task_panic,
891 .maxlen = sizeof(int), 822 .maxlen = sizeof(int),
892 .mode = 0644, 823 .mode = 0644,
893 .proc_handler = &proc_dointvec_minmax, 824 .proc_handler = proc_dointvec_minmax,
894 .strategy = &sysctl_intvec,
895 .extra1 = &zero, 825 .extra1 = &zero,
896 .extra2 = &one, 826 .extra2 = &one,
897 }, 827 },
898 { 828 {
899 .ctl_name = CTL_UNNUMBERED,
900 .procname = "hung_task_check_count", 829 .procname = "hung_task_check_count",
901 .data = &sysctl_hung_task_check_count, 830 .data = &sysctl_hung_task_check_count,
902 .maxlen = sizeof(unsigned long), 831 .maxlen = sizeof(unsigned long),
903 .mode = 0644, 832 .mode = 0644,
904 .proc_handler = &proc_doulongvec_minmax, 833 .proc_handler = proc_doulongvec_minmax,
905 .strategy = &sysctl_intvec,
906 }, 834 },
907 { 835 {
908 .ctl_name = CTL_UNNUMBERED,
909 .procname = "hung_task_timeout_secs", 836 .procname = "hung_task_timeout_secs",
910 .data = &sysctl_hung_task_timeout_secs, 837 .data = &sysctl_hung_task_timeout_secs,
911 .maxlen = sizeof(unsigned long), 838 .maxlen = sizeof(unsigned long),
912 .mode = 0644, 839 .mode = 0644,
913 .proc_handler = &proc_dohung_task_timeout_secs, 840 .proc_handler = proc_dohung_task_timeout_secs,
914 .strategy = &sysctl_intvec,
915 }, 841 },
916 { 842 {
917 .ctl_name = CTL_UNNUMBERED,
918 .procname = "hung_task_warnings", 843 .procname = "hung_task_warnings",
919 .data = &sysctl_hung_task_warnings, 844 .data = &sysctl_hung_task_warnings,
920 .maxlen = sizeof(unsigned long), 845 .maxlen = sizeof(unsigned long),
921 .mode = 0644, 846 .mode = 0644,
922 .proc_handler = &proc_doulongvec_minmax, 847 .proc_handler = proc_doulongvec_minmax,
923 .strategy = &sysctl_intvec,
924 }, 848 },
925#endif 849#endif
926#ifdef CONFIG_COMPAT 850#ifdef CONFIG_COMPAT
927 { 851 {
928 .ctl_name = KERN_COMPAT_LOG,
929 .procname = "compat-log", 852 .procname = "compat-log",
930 .data = &compat_log, 853 .data = &compat_log,
931 .maxlen = sizeof (int), 854 .maxlen = sizeof (int),
932 .mode = 0644, 855 .mode = 0644,
933 .proc_handler = &proc_dointvec, 856 .proc_handler = proc_dointvec,
934 }, 857 },
935#endif 858#endif
936#ifdef CONFIG_RT_MUTEXES 859#ifdef CONFIG_RT_MUTEXES
937 { 860 {
938 .ctl_name = KERN_MAX_LOCK_DEPTH,
939 .procname = "max_lock_depth", 861 .procname = "max_lock_depth",
940 .data = &max_lock_depth, 862 .data = &max_lock_depth,
941 .maxlen = sizeof(int), 863 .maxlen = sizeof(int),
942 .mode = 0644, 864 .mode = 0644,
943 .proc_handler = &proc_dointvec, 865 .proc_handler = proc_dointvec,
944 }, 866 },
945#endif 867#endif
946 { 868 {
947 .ctl_name = CTL_UNNUMBERED,
948 .procname = "poweroff_cmd", 869 .procname = "poweroff_cmd",
949 .data = &poweroff_cmd, 870 .data = &poweroff_cmd,
950 .maxlen = POWEROFF_CMD_PATH_LEN, 871 .maxlen = POWEROFF_CMD_PATH_LEN,
951 .mode = 0644, 872 .mode = 0644,
952 .proc_handler = &proc_dostring, 873 .proc_handler = proc_dostring,
953 .strategy = &sysctl_string,
954 }, 874 },
955#ifdef CONFIG_KEYS 875#ifdef CONFIG_KEYS
956 { 876 {
957 .ctl_name = CTL_UNNUMBERED,
958 .procname = "keys", 877 .procname = "keys",
959 .mode = 0555, 878 .mode = 0555,
960 .child = key_sysctls, 879 .child = key_sysctls,
@@ -962,17 +881,15 @@ static struct ctl_table kern_table[] = {
962#endif 881#endif
963#ifdef CONFIG_RCU_TORTURE_TEST 882#ifdef CONFIG_RCU_TORTURE_TEST
964 { 883 {
965 .ctl_name = CTL_UNNUMBERED,
966 .procname = "rcutorture_runnable", 884 .procname = "rcutorture_runnable",
967 .data = &rcutorture_runnable, 885 .data = &rcutorture_runnable,
968 .maxlen = sizeof(int), 886 .maxlen = sizeof(int),
969 .mode = 0644, 887 .mode = 0644,
970 .proc_handler = &proc_dointvec, 888 .proc_handler = proc_dointvec,
971 }, 889 },
972#endif 890#endif
973#ifdef CONFIG_SLOW_WORK 891#ifdef CONFIG_SLOW_WORK
974 { 892 {
975 .ctl_name = CTL_UNNUMBERED,
976 .procname = "slow-work", 893 .procname = "slow-work",
977 .mode = 0555, 894 .mode = 0555,
978 .child = slow_work_sysctls, 895 .child = slow_work_sysctls,
@@ -980,146 +897,127 @@ static struct ctl_table kern_table[] = {
980#endif 897#endif
981#ifdef CONFIG_PERF_EVENTS 898#ifdef CONFIG_PERF_EVENTS
982 { 899 {
983 .ctl_name = CTL_UNNUMBERED,
984 .procname = "perf_event_paranoid", 900 .procname = "perf_event_paranoid",
985 .data = &sysctl_perf_event_paranoid, 901 .data = &sysctl_perf_event_paranoid,
986 .maxlen = sizeof(sysctl_perf_event_paranoid), 902 .maxlen = sizeof(sysctl_perf_event_paranoid),
987 .mode = 0644, 903 .mode = 0644,
988 .proc_handler = &proc_dointvec, 904 .proc_handler = proc_dointvec,
989 }, 905 },
990 { 906 {
991 .ctl_name = CTL_UNNUMBERED,
992 .procname = "perf_event_mlock_kb", 907 .procname = "perf_event_mlock_kb",
993 .data = &sysctl_perf_event_mlock, 908 .data = &sysctl_perf_event_mlock,
994 .maxlen = sizeof(sysctl_perf_event_mlock), 909 .maxlen = sizeof(sysctl_perf_event_mlock),
995 .mode = 0644, 910 .mode = 0644,
996 .proc_handler = &proc_dointvec, 911 .proc_handler = proc_dointvec,
997 }, 912 },
998 { 913 {
999 .ctl_name = CTL_UNNUMBERED,
1000 .procname = "perf_event_max_sample_rate", 914 .procname = "perf_event_max_sample_rate",
1001 .data = &sysctl_perf_event_sample_rate, 915 .data = &sysctl_perf_event_sample_rate,
1002 .maxlen = sizeof(sysctl_perf_event_sample_rate), 916 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1003 .mode = 0644, 917 .mode = 0644,
1004 .proc_handler = &proc_dointvec, 918 .proc_handler = proc_dointvec,
1005 }, 919 },
1006#endif 920#endif
1007#ifdef CONFIG_KMEMCHECK 921#ifdef CONFIG_KMEMCHECK
1008 { 922 {
1009 .ctl_name = CTL_UNNUMBERED,
1010 .procname = "kmemcheck", 923 .procname = "kmemcheck",
1011 .data = &kmemcheck_enabled, 924 .data = &kmemcheck_enabled,
1012 .maxlen = sizeof(int), 925 .maxlen = sizeof(int),
1013 .mode = 0644, 926 .mode = 0644,
1014 .proc_handler = &proc_dointvec, 927 .proc_handler = proc_dointvec,
1015 }, 928 },
1016#endif 929#endif
1017#ifdef CONFIG_BLOCK 930#ifdef CONFIG_BLOCK
1018 { 931 {
1019 .ctl_name = CTL_UNNUMBERED,
1020 .procname = "blk_iopoll", 932 .procname = "blk_iopoll",
1021 .data = &blk_iopoll_enabled, 933 .data = &blk_iopoll_enabled,
1022 .maxlen = sizeof(int), 934 .maxlen = sizeof(int),
1023 .mode = 0644, 935 .mode = 0644,
1024 .proc_handler = &proc_dointvec, 936 .proc_handler = proc_dointvec,
1025 }, 937 },
1026#endif 938#endif
1027/* 939/*
1028 * NOTE: do not add new entries to this table unless you have read 940 * NOTE: do not add new entries to this table unless you have read
1029 * Documentation/sysctl/ctl_unnumbered.txt 941 * Documentation/sysctl/ctl_unnumbered.txt
1030 */ 942 */
1031 { .ctl_name = 0 } 943 { }
1032}; 944};
1033 945
1034static struct ctl_table vm_table[] = { 946static struct ctl_table vm_table[] = {
1035 { 947 {
1036 .ctl_name = VM_OVERCOMMIT_MEMORY,
1037 .procname = "overcommit_memory", 948 .procname = "overcommit_memory",
1038 .data = &sysctl_overcommit_memory, 949 .data = &sysctl_overcommit_memory,
1039 .maxlen = sizeof(sysctl_overcommit_memory), 950 .maxlen = sizeof(sysctl_overcommit_memory),
1040 .mode = 0644, 951 .mode = 0644,
1041 .proc_handler = &proc_dointvec, 952 .proc_handler = proc_dointvec,
1042 }, 953 },
1043 { 954 {
1044 .ctl_name = VM_PANIC_ON_OOM,
1045 .procname = "panic_on_oom", 955 .procname = "panic_on_oom",
1046 .data = &sysctl_panic_on_oom, 956 .data = &sysctl_panic_on_oom,
1047 .maxlen = sizeof(sysctl_panic_on_oom), 957 .maxlen = sizeof(sysctl_panic_on_oom),
1048 .mode = 0644, 958 .mode = 0644,
1049 .proc_handler = &proc_dointvec, 959 .proc_handler = proc_dointvec,
1050 }, 960 },
1051 { 961 {
1052 .ctl_name = CTL_UNNUMBERED,
1053 .procname = "oom_kill_allocating_task", 962 .procname = "oom_kill_allocating_task",
1054 .data = &sysctl_oom_kill_allocating_task, 963 .data = &sysctl_oom_kill_allocating_task,
1055 .maxlen = sizeof(sysctl_oom_kill_allocating_task), 964 .maxlen = sizeof(sysctl_oom_kill_allocating_task),
1056 .mode = 0644, 965 .mode = 0644,
1057 .proc_handler = &proc_dointvec, 966 .proc_handler = proc_dointvec,
1058 }, 967 },
1059 { 968 {
1060 .ctl_name = CTL_UNNUMBERED,
1061 .procname = "oom_dump_tasks", 969 .procname = "oom_dump_tasks",
1062 .data = &sysctl_oom_dump_tasks, 970 .data = &sysctl_oom_dump_tasks,
1063 .maxlen = sizeof(sysctl_oom_dump_tasks), 971 .maxlen = sizeof(sysctl_oom_dump_tasks),
1064 .mode = 0644, 972 .mode = 0644,
1065 .proc_handler = &proc_dointvec, 973 .proc_handler = proc_dointvec,
1066 }, 974 },
1067 { 975 {
1068 .ctl_name = VM_OVERCOMMIT_RATIO,
1069 .procname = "overcommit_ratio", 976 .procname = "overcommit_ratio",
1070 .data = &sysctl_overcommit_ratio, 977 .data = &sysctl_overcommit_ratio,
1071 .maxlen = sizeof(sysctl_overcommit_ratio), 978 .maxlen = sizeof(sysctl_overcommit_ratio),
1072 .mode = 0644, 979 .mode = 0644,
1073 .proc_handler = &proc_dointvec, 980 .proc_handler = proc_dointvec,
1074 }, 981 },
1075 { 982 {
1076 .ctl_name = VM_PAGE_CLUSTER,
1077 .procname = "page-cluster", 983 .procname = "page-cluster",
1078 .data = &page_cluster, 984 .data = &page_cluster,
1079 .maxlen = sizeof(int), 985 .maxlen = sizeof(int),
1080 .mode = 0644, 986 .mode = 0644,
1081 .proc_handler = &proc_dointvec, 987 .proc_handler = proc_dointvec,
1082 }, 988 },
1083 { 989 {
1084 .ctl_name = VM_DIRTY_BACKGROUND,
1085 .procname = "dirty_background_ratio", 990 .procname = "dirty_background_ratio",
1086 .data = &dirty_background_ratio, 991 .data = &dirty_background_ratio,
1087 .maxlen = sizeof(dirty_background_ratio), 992 .maxlen = sizeof(dirty_background_ratio),
1088 .mode = 0644, 993 .mode = 0644,
1089 .proc_handler = &dirty_background_ratio_handler, 994 .proc_handler = dirty_background_ratio_handler,
1090 .strategy = &sysctl_intvec,
1091 .extra1 = &zero, 995 .extra1 = &zero,
1092 .extra2 = &one_hundred, 996 .extra2 = &one_hundred,
1093 }, 997 },
1094 { 998 {
1095 .ctl_name = CTL_UNNUMBERED,
1096 .procname = "dirty_background_bytes", 999 .procname = "dirty_background_bytes",
1097 .data = &dirty_background_bytes, 1000 .data = &dirty_background_bytes,
1098 .maxlen = sizeof(dirty_background_bytes), 1001 .maxlen = sizeof(dirty_background_bytes),
1099 .mode = 0644, 1002 .mode = 0644,
1100 .proc_handler = &dirty_background_bytes_handler, 1003 .proc_handler = dirty_background_bytes_handler,
1101 .strategy = &sysctl_intvec,
1102 .extra1 = &one_ul, 1004 .extra1 = &one_ul,
1103 }, 1005 },
1104 { 1006 {
1105 .ctl_name = VM_DIRTY_RATIO,
1106 .procname = "dirty_ratio", 1007 .procname = "dirty_ratio",
1107 .data = &vm_dirty_ratio, 1008 .data = &vm_dirty_ratio,
1108 .maxlen = sizeof(vm_dirty_ratio), 1009 .maxlen = sizeof(vm_dirty_ratio),
1109 .mode = 0644, 1010 .mode = 0644,
1110 .proc_handler = &dirty_ratio_handler, 1011 .proc_handler = dirty_ratio_handler,
1111 .strategy = &sysctl_intvec,
1112 .extra1 = &zero, 1012 .extra1 = &zero,
1113 .extra2 = &one_hundred, 1013 .extra2 = &one_hundred,
1114 }, 1014 },
1115 { 1015 {
1116 .ctl_name = CTL_UNNUMBERED,
1117 .procname = "dirty_bytes", 1016 .procname = "dirty_bytes",
1118 .data = &vm_dirty_bytes, 1017 .data = &vm_dirty_bytes,
1119 .maxlen = sizeof(vm_dirty_bytes), 1018 .maxlen = sizeof(vm_dirty_bytes),
1120 .mode = 0644, 1019 .mode = 0644,
1121 .proc_handler = &dirty_bytes_handler, 1020 .proc_handler = dirty_bytes_handler,
1122 .strategy = &sysctl_intvec,
1123 .extra1 = &dirty_bytes_min, 1021 .extra1 = &dirty_bytes_min,
1124 }, 1022 },
1125 { 1023 {
@@ -1127,383 +1025,363 @@ static struct ctl_table vm_table[] = {
1127 .data = &dirty_writeback_interval, 1025 .data = &dirty_writeback_interval,
1128 .maxlen = sizeof(dirty_writeback_interval), 1026 .maxlen = sizeof(dirty_writeback_interval),
1129 .mode = 0644, 1027 .mode = 0644,
1130 .proc_handler = &dirty_writeback_centisecs_handler, 1028 .proc_handler = dirty_writeback_centisecs_handler,
1131 }, 1029 },
1132 { 1030 {
1133 .procname = "dirty_expire_centisecs", 1031 .procname = "dirty_expire_centisecs",
1134 .data = &dirty_expire_interval, 1032 .data = &dirty_expire_interval,
1135 .maxlen = sizeof(dirty_expire_interval), 1033 .maxlen = sizeof(dirty_expire_interval),
1136 .mode = 0644, 1034 .mode = 0644,
1137 .proc_handler = &proc_dointvec, 1035 .proc_handler = proc_dointvec,
1138 }, 1036 },
1139 { 1037 {
1140 .ctl_name = VM_NR_PDFLUSH_THREADS,
1141 .procname = "nr_pdflush_threads", 1038 .procname = "nr_pdflush_threads",
1142 .data = &nr_pdflush_threads, 1039 .data = &nr_pdflush_threads,
1143 .maxlen = sizeof nr_pdflush_threads, 1040 .maxlen = sizeof nr_pdflush_threads,
1144 .mode = 0444 /* read-only*/, 1041 .mode = 0444 /* read-only*/,
1145 .proc_handler = &proc_dointvec, 1042 .proc_handler = proc_dointvec,
1146 }, 1043 },
1147 { 1044 {
1148 .ctl_name = VM_SWAPPINESS,
1149 .procname = "swappiness", 1045 .procname = "swappiness",
1150 .data = &vm_swappiness, 1046 .data = &vm_swappiness,
1151 .maxlen = sizeof(vm_swappiness), 1047 .maxlen = sizeof(vm_swappiness),
1152 .mode = 0644, 1048 .mode = 0644,
1153 .proc_handler = &proc_dointvec_minmax, 1049 .proc_handler = proc_dointvec_minmax,
1154 .strategy = &sysctl_intvec,
1155 .extra1 = &zero, 1050 .extra1 = &zero,
1156 .extra2 = &one_hundred, 1051 .extra2 = &one_hundred,
1157 }, 1052 },
1158#ifdef CONFIG_HUGETLB_PAGE 1053#ifdef CONFIG_HUGETLB_PAGE
1159 { 1054 {
1160 .procname = "nr_hugepages", 1055 .procname = "nr_hugepages",
1161 .data = NULL, 1056 .data = NULL,
1162 .maxlen = sizeof(unsigned long), 1057 .maxlen = sizeof(unsigned long),
1163 .mode = 0644, 1058 .mode = 0644,
1164 .proc_handler = &hugetlb_sysctl_handler, 1059 .proc_handler = hugetlb_sysctl_handler,
1165 .extra1 = (void *)&hugetlb_zero, 1060 .extra1 = (void *)&hugetlb_zero,
1166 .extra2 = (void *)&hugetlb_infinity, 1061 .extra2 = (void *)&hugetlb_infinity,
1167 }, 1062 },
1063#ifdef CONFIG_NUMA
1064 {
1065 .procname = "nr_hugepages_mempolicy",
1066 .data = NULL,
1067 .maxlen = sizeof(unsigned long),
1068 .mode = 0644,
1069 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1070 .extra1 = (void *)&hugetlb_zero,
1071 .extra2 = (void *)&hugetlb_infinity,
1072 },
1073#endif
1168 { 1074 {
1169 .ctl_name = VM_HUGETLB_GROUP,
1170 .procname = "hugetlb_shm_group", 1075 .procname = "hugetlb_shm_group",
1171 .data = &sysctl_hugetlb_shm_group, 1076 .data = &sysctl_hugetlb_shm_group,
1172 .maxlen = sizeof(gid_t), 1077 .maxlen = sizeof(gid_t),
1173 .mode = 0644, 1078 .mode = 0644,
1174 .proc_handler = &proc_dointvec, 1079 .proc_handler = proc_dointvec,
1175 }, 1080 },
1176 { 1081 {
1177 .ctl_name = CTL_UNNUMBERED,
1178 .procname = "hugepages_treat_as_movable", 1082 .procname = "hugepages_treat_as_movable",
1179 .data = &hugepages_treat_as_movable, 1083 .data = &hugepages_treat_as_movable,
1180 .maxlen = sizeof(int), 1084 .maxlen = sizeof(int),
1181 .mode = 0644, 1085 .mode = 0644,
1182 .proc_handler = &hugetlb_treat_movable_handler, 1086 .proc_handler = hugetlb_treat_movable_handler,
1183 }, 1087 },
1184 { 1088 {
1185 .ctl_name = CTL_UNNUMBERED,
1186 .procname = "nr_overcommit_hugepages", 1089 .procname = "nr_overcommit_hugepages",
1187 .data = NULL, 1090 .data = NULL,
1188 .maxlen = sizeof(unsigned long), 1091 .maxlen = sizeof(unsigned long),
1189 .mode = 0644, 1092 .mode = 0644,
1190 .proc_handler = &hugetlb_overcommit_handler, 1093 .proc_handler = hugetlb_overcommit_handler,
1191 .extra1 = (void *)&hugetlb_zero, 1094 .extra1 = (void *)&hugetlb_zero,
1192 .extra2 = (void *)&hugetlb_infinity, 1095 .extra2 = (void *)&hugetlb_infinity,
1193 }, 1096 },
1194#endif 1097#endif
1195 { 1098 {
1196 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
1197 .procname = "lowmem_reserve_ratio", 1099 .procname = "lowmem_reserve_ratio",
1198 .data = &sysctl_lowmem_reserve_ratio, 1100 .data = &sysctl_lowmem_reserve_ratio,
1199 .maxlen = sizeof(sysctl_lowmem_reserve_ratio), 1101 .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
1200 .mode = 0644, 1102 .mode = 0644,
1201 .proc_handler = &lowmem_reserve_ratio_sysctl_handler, 1103 .proc_handler = lowmem_reserve_ratio_sysctl_handler,
1202 .strategy = &sysctl_intvec,
1203 }, 1104 },
1204 { 1105 {
1205 .ctl_name = VM_DROP_PAGECACHE,
1206 .procname = "drop_caches", 1106 .procname = "drop_caches",
1207 .data = &sysctl_drop_caches, 1107 .data = &sysctl_drop_caches,
1208 .maxlen = sizeof(int), 1108 .maxlen = sizeof(int),
1209 .mode = 0644, 1109 .mode = 0644,
1210 .proc_handler = drop_caches_sysctl_handler, 1110 .proc_handler = drop_caches_sysctl_handler,
1211 .strategy = &sysctl_intvec,
1212 }, 1111 },
1213 { 1112 {
1214 .ctl_name = VM_MIN_FREE_KBYTES,
1215 .procname = "min_free_kbytes", 1113 .procname = "min_free_kbytes",
1216 .data = &min_free_kbytes, 1114 .data = &min_free_kbytes,
1217 .maxlen = sizeof(min_free_kbytes), 1115 .maxlen = sizeof(min_free_kbytes),
1218 .mode = 0644, 1116 .mode = 0644,
1219 .proc_handler = &min_free_kbytes_sysctl_handler, 1117 .proc_handler = min_free_kbytes_sysctl_handler,
1220 .strategy = &sysctl_intvec,
1221 .extra1 = &zero, 1118 .extra1 = &zero,
1222 }, 1119 },
1223 { 1120 {
1224 .ctl_name = VM_PERCPU_PAGELIST_FRACTION,
1225 .procname = "percpu_pagelist_fraction", 1121 .procname = "percpu_pagelist_fraction",
1226 .data = &percpu_pagelist_fraction, 1122 .data = &percpu_pagelist_fraction,
1227 .maxlen = sizeof(percpu_pagelist_fraction), 1123 .maxlen = sizeof(percpu_pagelist_fraction),
1228 .mode = 0644, 1124 .mode = 0644,
1229 .proc_handler = &percpu_pagelist_fraction_sysctl_handler, 1125 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
1230 .strategy = &sysctl_intvec,
1231 .extra1 = &min_percpu_pagelist_fract, 1126 .extra1 = &min_percpu_pagelist_fract,
1232 }, 1127 },
1233#ifdef CONFIG_MMU 1128#ifdef CONFIG_MMU
1234 { 1129 {
1235 .ctl_name = VM_MAX_MAP_COUNT,
1236 .procname = "max_map_count", 1130 .procname = "max_map_count",
1237 .data = &sysctl_max_map_count, 1131 .data = &sysctl_max_map_count,
1238 .maxlen = sizeof(sysctl_max_map_count), 1132 .maxlen = sizeof(sysctl_max_map_count),
1239 .mode = 0644, 1133 .mode = 0644,
1240 .proc_handler = &proc_dointvec 1134 .proc_handler = proc_dointvec,
1135 .extra1 = &zero,
1241 }, 1136 },
1242#else 1137#else
1243 { 1138 {
1244 .ctl_name = CTL_UNNUMBERED,
1245 .procname = "nr_trim_pages", 1139 .procname = "nr_trim_pages",
1246 .data = &sysctl_nr_trim_pages, 1140 .data = &sysctl_nr_trim_pages,
1247 .maxlen = sizeof(sysctl_nr_trim_pages), 1141 .maxlen = sizeof(sysctl_nr_trim_pages),
1248 .mode = 0644, 1142 .mode = 0644,
1249 .proc_handler = &proc_dointvec_minmax, 1143 .proc_handler = proc_dointvec_minmax,
1250 .strategy = &sysctl_intvec,
1251 .extra1 = &zero, 1144 .extra1 = &zero,
1252 }, 1145 },
1253#endif 1146#endif
1254 { 1147 {
1255 .ctl_name = VM_LAPTOP_MODE,
1256 .procname = "laptop_mode", 1148 .procname = "laptop_mode",
1257 .data = &laptop_mode, 1149 .data = &laptop_mode,
1258 .maxlen = sizeof(laptop_mode), 1150 .maxlen = sizeof(laptop_mode),
1259 .mode = 0644, 1151 .mode = 0644,
1260 .proc_handler = &proc_dointvec_jiffies, 1152 .proc_handler = proc_dointvec_jiffies,
1261 .strategy = &sysctl_jiffies,
1262 }, 1153 },
1263 { 1154 {
1264 .ctl_name = VM_BLOCK_DUMP,
1265 .procname = "block_dump", 1155 .procname = "block_dump",
1266 .data = &block_dump, 1156 .data = &block_dump,
1267 .maxlen = sizeof(block_dump), 1157 .maxlen = sizeof(block_dump),
1268 .mode = 0644, 1158 .mode = 0644,
1269 .proc_handler = &proc_dointvec, 1159 .proc_handler = proc_dointvec,
1270 .strategy = &sysctl_intvec,
1271 .extra1 = &zero, 1160 .extra1 = &zero,
1272 }, 1161 },
1273 { 1162 {
1274 .ctl_name = VM_VFS_CACHE_PRESSURE,
1275 .procname = "vfs_cache_pressure", 1163 .procname = "vfs_cache_pressure",
1276 .data = &sysctl_vfs_cache_pressure, 1164 .data = &sysctl_vfs_cache_pressure,
1277 .maxlen = sizeof(sysctl_vfs_cache_pressure), 1165 .maxlen = sizeof(sysctl_vfs_cache_pressure),
1278 .mode = 0644, 1166 .mode = 0644,
1279 .proc_handler = &proc_dointvec, 1167 .proc_handler = proc_dointvec,
1280 .strategy = &sysctl_intvec,
1281 .extra1 = &zero, 1168 .extra1 = &zero,
1282 }, 1169 },
1283#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1170#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
1284 { 1171 {
1285 .ctl_name = VM_LEGACY_VA_LAYOUT,
1286 .procname = "legacy_va_layout", 1172 .procname = "legacy_va_layout",
1287 .data = &sysctl_legacy_va_layout, 1173 .data = &sysctl_legacy_va_layout,
1288 .maxlen = sizeof(sysctl_legacy_va_layout), 1174 .maxlen = sizeof(sysctl_legacy_va_layout),
1289 .mode = 0644, 1175 .mode = 0644,
1290 .proc_handler = &proc_dointvec, 1176 .proc_handler = proc_dointvec,
1291 .strategy = &sysctl_intvec,
1292 .extra1 = &zero, 1177 .extra1 = &zero,
1293 }, 1178 },
1294#endif 1179#endif
1295#ifdef CONFIG_NUMA 1180#ifdef CONFIG_NUMA
1296 { 1181 {
1297 .ctl_name = VM_ZONE_RECLAIM_MODE,
1298 .procname = "zone_reclaim_mode", 1182 .procname = "zone_reclaim_mode",
1299 .data = &zone_reclaim_mode, 1183 .data = &zone_reclaim_mode,
1300 .maxlen = sizeof(zone_reclaim_mode), 1184 .maxlen = sizeof(zone_reclaim_mode),
1301 .mode = 0644, 1185 .mode = 0644,
1302 .proc_handler = &proc_dointvec, 1186 .proc_handler = proc_dointvec,
1303 .strategy = &sysctl_intvec,
1304 .extra1 = &zero, 1187 .extra1 = &zero,
1305 }, 1188 },
1306 { 1189 {
1307 .ctl_name = VM_MIN_UNMAPPED,
1308 .procname = "min_unmapped_ratio", 1190 .procname = "min_unmapped_ratio",
1309 .data = &sysctl_min_unmapped_ratio, 1191 .data = &sysctl_min_unmapped_ratio,
1310 .maxlen = sizeof(sysctl_min_unmapped_ratio), 1192 .maxlen = sizeof(sysctl_min_unmapped_ratio),
1311 .mode = 0644, 1193 .mode = 0644,
1312 .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, 1194 .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
1313 .strategy = &sysctl_intvec,
1314 .extra1 = &zero, 1195 .extra1 = &zero,
1315 .extra2 = &one_hundred, 1196 .extra2 = &one_hundred,
1316 }, 1197 },
1317 { 1198 {
1318 .ctl_name = VM_MIN_SLAB,
1319 .procname = "min_slab_ratio", 1199 .procname = "min_slab_ratio",
1320 .data = &sysctl_min_slab_ratio, 1200 .data = &sysctl_min_slab_ratio,
1321 .maxlen = sizeof(sysctl_min_slab_ratio), 1201 .maxlen = sizeof(sysctl_min_slab_ratio),
1322 .mode = 0644, 1202 .mode = 0644,
1323 .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, 1203 .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
1324 .strategy = &sysctl_intvec,
1325 .extra1 = &zero, 1204 .extra1 = &zero,
1326 .extra2 = &one_hundred, 1205 .extra2 = &one_hundred,
1327 }, 1206 },
1328#endif 1207#endif
1329#ifdef CONFIG_SMP 1208#ifdef CONFIG_SMP
1330 { 1209 {
1331 .ctl_name = CTL_UNNUMBERED,
1332 .procname = "stat_interval", 1210 .procname = "stat_interval",
1333 .data = &sysctl_stat_interval, 1211 .data = &sysctl_stat_interval,
1334 .maxlen = sizeof(sysctl_stat_interval), 1212 .maxlen = sizeof(sysctl_stat_interval),
1335 .mode = 0644, 1213 .mode = 0644,
1336 .proc_handler = &proc_dointvec_jiffies, 1214 .proc_handler = proc_dointvec_jiffies,
1337 .strategy = &sysctl_jiffies,
1338 }, 1215 },
1339#endif 1216#endif
1340 { 1217 {
1341 .ctl_name = CTL_UNNUMBERED,
1342 .procname = "mmap_min_addr", 1218 .procname = "mmap_min_addr",
1343 .data = &dac_mmap_min_addr, 1219 .data = &dac_mmap_min_addr,
1344 .maxlen = sizeof(unsigned long), 1220 .maxlen = sizeof(unsigned long),
1345 .mode = 0644, 1221 .mode = 0644,
1346 .proc_handler = &mmap_min_addr_handler, 1222 .proc_handler = mmap_min_addr_handler,
1347 }, 1223 },
1348#ifdef CONFIG_NUMA 1224#ifdef CONFIG_NUMA
1349 { 1225 {
1350 .ctl_name = CTL_UNNUMBERED,
1351 .procname = "numa_zonelist_order", 1226 .procname = "numa_zonelist_order",
1352 .data = &numa_zonelist_order, 1227 .data = &numa_zonelist_order,
1353 .maxlen = NUMA_ZONELIST_ORDER_LEN, 1228 .maxlen = NUMA_ZONELIST_ORDER_LEN,
1354 .mode = 0644, 1229 .mode = 0644,
1355 .proc_handler = &numa_zonelist_order_handler, 1230 .proc_handler = numa_zonelist_order_handler,
1356 .strategy = &sysctl_string,
1357 }, 1231 },
1358#endif 1232#endif
1359#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ 1233#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
1360 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1234 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1361 { 1235 {
1362 .ctl_name = VM_VDSO_ENABLED,
1363 .procname = "vdso_enabled", 1236 .procname = "vdso_enabled",
1364 .data = &vdso_enabled, 1237 .data = &vdso_enabled,
1365 .maxlen = sizeof(vdso_enabled), 1238 .maxlen = sizeof(vdso_enabled),
1366 .mode = 0644, 1239 .mode = 0644,
1367 .proc_handler = &proc_dointvec, 1240 .proc_handler = proc_dointvec,
1368 .strategy = &sysctl_intvec,
1369 .extra1 = &zero, 1241 .extra1 = &zero,
1370 }, 1242 },
1371#endif 1243#endif
1372#ifdef CONFIG_HIGHMEM 1244#ifdef CONFIG_HIGHMEM
1373 { 1245 {
1374 .ctl_name = CTL_UNNUMBERED,
1375 .procname = "highmem_is_dirtyable", 1246 .procname = "highmem_is_dirtyable",
1376 .data = &vm_highmem_is_dirtyable, 1247 .data = &vm_highmem_is_dirtyable,
1377 .maxlen = sizeof(vm_highmem_is_dirtyable), 1248 .maxlen = sizeof(vm_highmem_is_dirtyable),
1378 .mode = 0644, 1249 .mode = 0644,
1379 .proc_handler = &proc_dointvec_minmax, 1250 .proc_handler = proc_dointvec_minmax,
1380 .strategy = &sysctl_intvec,
1381 .extra1 = &zero, 1251 .extra1 = &zero,
1382 .extra2 = &one, 1252 .extra2 = &one,
1383 }, 1253 },
1384#endif 1254#endif
1385 { 1255 {
1386 .ctl_name = CTL_UNNUMBERED,
1387 .procname = "scan_unevictable_pages", 1256 .procname = "scan_unevictable_pages",
1388 .data = &scan_unevictable_pages, 1257 .data = &scan_unevictable_pages,
1389 .maxlen = sizeof(scan_unevictable_pages), 1258 .maxlen = sizeof(scan_unevictable_pages),
1390 .mode = 0644, 1259 .mode = 0644,
1391 .proc_handler = &scan_unevictable_handler, 1260 .proc_handler = scan_unevictable_handler,
1392 }, 1261 },
1262#ifdef CONFIG_MEMORY_FAILURE
1263 {
1264 .procname = "memory_failure_early_kill",
1265 .data = &sysctl_memory_failure_early_kill,
1266 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1267 .mode = 0644,
1268 .proc_handler = proc_dointvec_minmax,
1269 .extra1 = &zero,
1270 .extra2 = &one,
1271 },
1272 {
1273 .procname = "memory_failure_recovery",
1274 .data = &sysctl_memory_failure_recovery,
1275 .maxlen = sizeof(sysctl_memory_failure_recovery),
1276 .mode = 0644,
1277 .proc_handler = proc_dointvec_minmax,
1278 .extra1 = &zero,
1279 .extra2 = &one,
1280 },
1281#endif
1282
1393/* 1283/*
1394 * NOTE: do not add new entries to this table unless you have read 1284 * NOTE: do not add new entries to this table unless you have read
1395 * Documentation/sysctl/ctl_unnumbered.txt 1285 * Documentation/sysctl/ctl_unnumbered.txt
1396 */ 1286 */
1397 { .ctl_name = 0 } 1287 { }
1398}; 1288};
1399 1289
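With the .strategy = &sysctl_intvec lines removed throughout vm_table, range checking for /proc/sys writes now rests entirely on the proc handler, so bounded entries pair proc_dointvec_minmax with extra1/extra2. A sketch of such an entry (example_percent and the local bound variables are assumptions, not part of this patch):

    #include <linux/sysctl.h>

    static int example_percent = 50;    /* hypothetical tunable clamped to 0..100 */
    static int example_zero;
    static int example_one_hundred = 100;

    static struct ctl_table example_vm_table[] = {
        {
            .procname     = "example_percent",
            .data         = &example_percent,
            .maxlen       = sizeof(int),
            .mode         = 0644,
            .proc_handler = proc_dointvec_minmax,    /* handler enforces the bounds itself */
            .extra1       = &example_zero,           /* minimum accepted value */
            .extra2       = &example_one_hundred,    /* maximum accepted value */
        },
        { }
    };
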
1400#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1290#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1401static struct ctl_table binfmt_misc_table[] = { 1291static struct ctl_table binfmt_misc_table[] = {
1402 { .ctl_name = 0 } 1292 { }
1403}; 1293};
1404#endif 1294#endif
1405 1295
1406static struct ctl_table fs_table[] = { 1296static struct ctl_table fs_table[] = {
1407 { 1297 {
1408 .ctl_name = FS_NRINODE,
1409 .procname = "inode-nr", 1298 .procname = "inode-nr",
1410 .data = &inodes_stat, 1299 .data = &inodes_stat,
1411 .maxlen = 2*sizeof(int), 1300 .maxlen = 2*sizeof(int),
1412 .mode = 0444, 1301 .mode = 0444,
1413 .proc_handler = &proc_dointvec, 1302 .proc_handler = proc_dointvec,
1414 }, 1303 },
1415 { 1304 {
1416 .ctl_name = FS_STATINODE,
1417 .procname = "inode-state", 1305 .procname = "inode-state",
1418 .data = &inodes_stat, 1306 .data = &inodes_stat,
1419 .maxlen = 7*sizeof(int), 1307 .maxlen = 7*sizeof(int),
1420 .mode = 0444, 1308 .mode = 0444,
1421 .proc_handler = &proc_dointvec, 1309 .proc_handler = proc_dointvec,
1422 }, 1310 },
1423 { 1311 {
1424 .procname = "file-nr", 1312 .procname = "file-nr",
1425 .data = &files_stat, 1313 .data = &files_stat,
1426 .maxlen = 3*sizeof(int), 1314 .maxlen = 3*sizeof(int),
1427 .mode = 0444, 1315 .mode = 0444,
1428 .proc_handler = &proc_nr_files, 1316 .proc_handler = proc_nr_files,
1429 }, 1317 },
1430 { 1318 {
1431 .ctl_name = FS_MAXFILE,
1432 .procname = "file-max", 1319 .procname = "file-max",
1433 .data = &files_stat.max_files, 1320 .data = &files_stat.max_files,
1434 .maxlen = sizeof(int), 1321 .maxlen = sizeof(int),
1435 .mode = 0644, 1322 .mode = 0644,
1436 .proc_handler = &proc_dointvec, 1323 .proc_handler = proc_dointvec,
1437 }, 1324 },
1438 { 1325 {
1439 .ctl_name = CTL_UNNUMBERED,
1440 .procname = "nr_open", 1326 .procname = "nr_open",
1441 .data = &sysctl_nr_open, 1327 .data = &sysctl_nr_open,
1442 .maxlen = sizeof(int), 1328 .maxlen = sizeof(int),
1443 .mode = 0644, 1329 .mode = 0644,
1444 .proc_handler = &proc_dointvec_minmax, 1330 .proc_handler = proc_dointvec_minmax,
1445 .extra1 = &sysctl_nr_open_min, 1331 .extra1 = &sysctl_nr_open_min,
1446 .extra2 = &sysctl_nr_open_max, 1332 .extra2 = &sysctl_nr_open_max,
1447 }, 1333 },
1448 { 1334 {
1449 .ctl_name = FS_DENTRY,
1450 .procname = "dentry-state", 1335 .procname = "dentry-state",
1451 .data = &dentry_stat, 1336 .data = &dentry_stat,
1452 .maxlen = 6*sizeof(int), 1337 .maxlen = 6*sizeof(int),
1453 .mode = 0444, 1338 .mode = 0444,
1454 .proc_handler = &proc_dointvec, 1339 .proc_handler = proc_dointvec,
1455 }, 1340 },
1456 { 1341 {
1457 .ctl_name = FS_OVERFLOWUID,
1458 .procname = "overflowuid", 1342 .procname = "overflowuid",
1459 .data = &fs_overflowuid, 1343 .data = &fs_overflowuid,
1460 .maxlen = sizeof(int), 1344 .maxlen = sizeof(int),
1461 .mode = 0644, 1345 .mode = 0644,
1462 .proc_handler = &proc_dointvec_minmax, 1346 .proc_handler = proc_dointvec_minmax,
1463 .strategy = &sysctl_intvec,
1464 .extra1 = &minolduid, 1347 .extra1 = &minolduid,
1465 .extra2 = &maxolduid, 1348 .extra2 = &maxolduid,
1466 }, 1349 },
1467 { 1350 {
1468 .ctl_name = FS_OVERFLOWGID,
1469 .procname = "overflowgid", 1351 .procname = "overflowgid",
1470 .data = &fs_overflowgid, 1352 .data = &fs_overflowgid,
1471 .maxlen = sizeof(int), 1353 .maxlen = sizeof(int),
1472 .mode = 0644, 1354 .mode = 0644,
1473 .proc_handler = &proc_dointvec_minmax, 1355 .proc_handler = proc_dointvec_minmax,
1474 .strategy = &sysctl_intvec,
1475 .extra1 = &minolduid, 1356 .extra1 = &minolduid,
1476 .extra2 = &maxolduid, 1357 .extra2 = &maxolduid,
1477 }, 1358 },
1478#ifdef CONFIG_FILE_LOCKING 1359#ifdef CONFIG_FILE_LOCKING
1479 { 1360 {
1480 .ctl_name = FS_LEASES,
1481 .procname = "leases-enable", 1361 .procname = "leases-enable",
1482 .data = &leases_enable, 1362 .data = &leases_enable,
1483 .maxlen = sizeof(int), 1363 .maxlen = sizeof(int),
1484 .mode = 0644, 1364 .mode = 0644,
1485 .proc_handler = &proc_dointvec, 1365 .proc_handler = proc_dointvec,
1486 }, 1366 },
1487#endif 1367#endif
1488#ifdef CONFIG_DNOTIFY 1368#ifdef CONFIG_DNOTIFY
1489 { 1369 {
1490 .ctl_name = FS_DIR_NOTIFY,
1491 .procname = "dir-notify-enable", 1370 .procname = "dir-notify-enable",
1492 .data = &dir_notify_enable, 1371 .data = &dir_notify_enable,
1493 .maxlen = sizeof(int), 1372 .maxlen = sizeof(int),
1494 .mode = 0644, 1373 .mode = 0644,
1495 .proc_handler = &proc_dointvec, 1374 .proc_handler = proc_dointvec,
1496 }, 1375 },
1497#endif 1376#endif
1498#ifdef CONFIG_MMU 1377#ifdef CONFIG_MMU
1499#ifdef CONFIG_FILE_LOCKING 1378#ifdef CONFIG_FILE_LOCKING
1500 { 1379 {
1501 .ctl_name = FS_LEASE_TIME,
1502 .procname = "lease-break-time", 1380 .procname = "lease-break-time",
1503 .data = &lease_break_time, 1381 .data = &lease_break_time,
1504 .maxlen = sizeof(int), 1382 .maxlen = sizeof(int),
1505 .mode = 0644, 1383 .mode = 0644,
1506 .proc_handler = &proc_dointvec, 1384 .proc_handler = proc_dointvec,
1507 }, 1385 },
1508#endif 1386#endif
1509#ifdef CONFIG_AIO 1387#ifdef CONFIG_AIO
@@ -1512,19 +1390,18 @@ static struct ctl_table fs_table[] = {
1512 .data = &aio_nr, 1390 .data = &aio_nr,
1513 .maxlen = sizeof(aio_nr), 1391 .maxlen = sizeof(aio_nr),
1514 .mode = 0444, 1392 .mode = 0444,
1515 .proc_handler = &proc_doulongvec_minmax, 1393 .proc_handler = proc_doulongvec_minmax,
1516 }, 1394 },
1517 { 1395 {
1518 .procname = "aio-max-nr", 1396 .procname = "aio-max-nr",
1519 .data = &aio_max_nr, 1397 .data = &aio_max_nr,
1520 .maxlen = sizeof(aio_max_nr), 1398 .maxlen = sizeof(aio_max_nr),
1521 .mode = 0644, 1399 .mode = 0644,
1522 .proc_handler = &proc_doulongvec_minmax, 1400 .proc_handler = proc_doulongvec_minmax,
1523 }, 1401 },
1524#endif /* CONFIG_AIO */ 1402#endif /* CONFIG_AIO */
1525#ifdef CONFIG_INOTIFY_USER 1403#ifdef CONFIG_INOTIFY_USER
1526 { 1404 {
1527 .ctl_name = FS_INOTIFY,
1528 .procname = "inotify", 1405 .procname = "inotify",
1529 .mode = 0555, 1406 .mode = 0555,
1530 .child = inotify_table, 1407 .child = inotify_table,
@@ -1539,19 +1416,16 @@ static struct ctl_table fs_table[] = {
1539#endif 1416#endif
1540#endif 1417#endif
1541 { 1418 {
1542 .ctl_name = KERN_SETUID_DUMPABLE,
1543 .procname = "suid_dumpable", 1419 .procname = "suid_dumpable",
1544 .data = &suid_dumpable, 1420 .data = &suid_dumpable,
1545 .maxlen = sizeof(int), 1421 .maxlen = sizeof(int),
1546 .mode = 0644, 1422 .mode = 0644,
1547 .proc_handler = &proc_dointvec_minmax, 1423 .proc_handler = proc_dointvec_minmax,
1548 .strategy = &sysctl_intvec,
1549 .extra1 = &zero, 1424 .extra1 = &zero,
1550 .extra2 = &two, 1425 .extra2 = &two,
1551 }, 1426 },
1552#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1427#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1553 { 1428 {
1554 .ctl_name = CTL_UNNUMBERED,
1555 .procname = "binfmt_misc", 1429 .procname = "binfmt_misc",
1556 .mode = 0555, 1430 .mode = 0555,
1557 .child = binfmt_misc_table, 1431 .child = binfmt_misc_table,
@@ -1561,13 +1435,12 @@ static struct ctl_table fs_table[] = {
1561 * NOTE: do not add new entries to this table unless you have read 1435 * NOTE: do not add new entries to this table unless you have read
1562 * Documentation/sysctl/ctl_unnumbered.txt 1436 * Documentation/sysctl/ctl_unnumbered.txt
1563 */ 1437 */
1564 { .ctl_name = 0 } 1438 { }
1565}; 1439};
1566 1440
1567static struct ctl_table debug_table[] = { 1441static struct ctl_table debug_table[] = {
1568#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1442#if defined(CONFIG_X86) || defined(CONFIG_PPC)
1569 { 1443 {
1570 .ctl_name = CTL_UNNUMBERED,
1571 .procname = "exception-trace", 1444 .procname = "exception-trace",
1572 .data = &show_unhandled_signals, 1445 .data = &show_unhandled_signals,
1573 .maxlen = sizeof(int), 1446 .maxlen = sizeof(int),
@@ -1575,11 +1448,11 @@ static struct ctl_table debug_table[] = {
1575 .proc_handler = proc_dointvec 1448 .proc_handler = proc_dointvec
1576 }, 1449 },
1577#endif 1450#endif
1578 { .ctl_name = 0 } 1451 { }
1579}; 1452};
1580 1453
1581static struct ctl_table dev_table[] = { 1454static struct ctl_table dev_table[] = {
1582 { .ctl_name = 0 } 1455 { }
1583}; 1456};
1584 1457
1585static DEFINE_SPINLOCK(sysctl_lock); 1458static DEFINE_SPINLOCK(sysctl_lock);
@@ -1733,122 +1606,6 @@ void register_sysctl_root(struct ctl_table_root *root)
1733 spin_unlock(&sysctl_lock); 1606 spin_unlock(&sysctl_lock);
1734} 1607}
1735 1608
1736#ifdef CONFIG_SYSCTL_SYSCALL
1737/* Perform the actual read/write of a sysctl table entry. */
1738static int do_sysctl_strategy(struct ctl_table_root *root,
1739 struct ctl_table *table,
1740 void __user *oldval, size_t __user *oldlenp,
1741 void __user *newval, size_t newlen)
1742{
1743 int op = 0, rc;
1744
1745 if (oldval)
1746 op |= MAY_READ;
1747 if (newval)
1748 op |= MAY_WRITE;
1749 if (sysctl_perm(root, table, op))
1750 return -EPERM;
1751
1752 if (table->strategy) {
1753 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1754 if (rc < 0)
1755 return rc;
1756 if (rc > 0)
1757 return 0;
1758 }
1759
1760 /* If there is no strategy routine, or if the strategy returns
1761 * zero, proceed with automatic r/w */
1762 if (table->data && table->maxlen) {
1763 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1764 if (rc < 0)
1765 return rc;
1766 }
1767 return 0;
1768}
1769
1770static int parse_table(int __user *name, int nlen,
1771 void __user *oldval, size_t __user *oldlenp,
1772 void __user *newval, size_t newlen,
1773 struct ctl_table_root *root,
1774 struct ctl_table *table)
1775{
1776 int n;
1777repeat:
1778 if (!nlen)
1779 return -ENOTDIR;
1780 if (get_user(n, name))
1781 return -EFAULT;
1782 for ( ; table->ctl_name || table->procname; table++) {
1783 if (!table->ctl_name)
1784 continue;
1785 if (n == table->ctl_name) {
1786 int error;
1787 if (table->child) {
1788 if (sysctl_perm(root, table, MAY_EXEC))
1789 return -EPERM;
1790 name++;
1791 nlen--;
1792 table = table->child;
1793 goto repeat;
1794 }
1795 error = do_sysctl_strategy(root, table,
1796 oldval, oldlenp,
1797 newval, newlen);
1798 return error;
1799 }
1800 }
1801 return -ENOTDIR;
1802}
1803
1804int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1805 void __user *newval, size_t newlen)
1806{
1807 struct ctl_table_header *head;
1808 int error = -ENOTDIR;
1809
1810 if (nlen <= 0 || nlen >= CTL_MAXNAME)
1811 return -ENOTDIR;
1812 if (oldval) {
1813 int old_len;
1814 if (!oldlenp || get_user(old_len, oldlenp))
1815 return -EFAULT;
1816 }
1817
1818 for (head = sysctl_head_next(NULL); head;
1819 head = sysctl_head_next(head)) {
1820 error = parse_table(name, nlen, oldval, oldlenp,
1821 newval, newlen,
1822 head->root, head->ctl_table);
1823 if (error != -ENOTDIR) {
1824 sysctl_head_finish(head);
1825 break;
1826 }
1827 }
1828 return error;
1829}
1830
1831SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1832{
1833 struct __sysctl_args tmp;
1834 int error;
1835
1836 if (copy_from_user(&tmp, args, sizeof(tmp)))
1837 return -EFAULT;
1838
1839 error = deprecated_sysctl_warning(&tmp);
1840 if (error)
1841 goto out;
1842
1843 lock_kernel();
1844 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
1845 tmp.newval, tmp.newlen);
1846 unlock_kernel();
1847out:
1848 return error;
1849}
1850#endif /* CONFIG_SYSCTL_SYSCALL */
1851
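The block removed above was the in-kernel plumbing behind the numeric sysctl(2) interface: parse_table walked the tables by .ctl_name and do_sysctl_strategy invoked the per-entry strategy routine. Userspace that still needs these values is expected to go through /proc/sys instead; a hedged userspace sketch, assuming the usual procfs layout:

    #include <stdio.h>

    /* Read an integer sysctl by pathname, e.g. "/proc/sys/vm/swappiness". */
    static int read_proc_sys_int(const char *path)
    {
        FILE *f = fopen(path, "r");
        int val = -1;

        if (!f)
            return -1;
        if (fscanf(f, "%d", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }

    int main(void)
    {
        printf("vm.swappiness = %d\n", read_proc_sys_int("/proc/sys/vm/swappiness"));
        return 0;
    }
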
1852/* 1609/*
1853 * sysctl_perm does NOT grant the superuser all rights automatically, because 1610 * sysctl_perm does NOT grant the superuser all rights automatically, because
1854 * some sysctl variables are readonly even to root. 1611 * some sysctl variables are readonly even to root.
@@ -1884,7 +1641,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1884 1641
1885static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) 1642static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1886{ 1643{
1887 for (; table->ctl_name || table->procname; table++) { 1644 for (; table->procname; table++) {
1888 table->parent = parent; 1645 table->parent = parent;
1889 if (table->child) 1646 if (table->child)
1890 sysctl_set_parent(table, table->child); 1647 sysctl_set_parent(table, table->child);
@@ -1916,11 +1673,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch,
1916 return NULL; 1673 return NULL;
1917 1674
1918 /* ... and nothing else */ 1675 /* ... and nothing else */
1919 if (branch[1].procname || branch[1].ctl_name) 1676 if (branch[1].procname)
1920 return NULL; 1677 return NULL;
1921 1678
1922 /* table should contain subdirectory with the same name */ 1679 /* table should contain subdirectory with the same name */
1923 for (p = table; p->procname || p->ctl_name; p++) { 1680 for (p = table; p->procname; p++) {
1924 if (!p->child) 1681 if (!p->child)
1925 continue; 1682 continue;
1926 if (p->procname && strcmp(p->procname, s) == 0) 1683 if (p->procname && strcmp(p->procname, s) == 0)
@@ -1965,9 +1722,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1965 * 1722 *
1966 * The members of the &struct ctl_table structure are used as follows: 1723 * The members of the &struct ctl_table structure are used as follows:
1967 * 1724 *
1968 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
1969 * must be unique within that level of sysctl
1970 *
1971 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not 1725 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1972 * enter a sysctl file 1726 * enter a sysctl file
1973 * 1727 *
@@ -1982,8 +1736,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1982 * 1736 *
1983 * proc_handler - the text handler routine (described below) 1737 * proc_handler - the text handler routine (described below)
1984 * 1738 *
1985 * strategy - the strategy routine (described below)
1986 *
1987 * de - for internal use by the sysctl routines 1739 * de - for internal use by the sysctl routines
1988 * 1740 *
1989 * extra1, extra2 - extra pointers usable by the proc handler routines 1741 * extra1, extra2 - extra pointers usable by the proc handler routines
@@ -1996,19 +1748,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1996 * struct enable minimal validation of the values being written to be 1748 * struct enable minimal validation of the values being written to be
1997 * performed, and the mode field allows minimal authentication. 1749 * performed, and the mode field allows minimal authentication.
1998 * 1750 *
1999 * More sophisticated management can be enabled by the provision of a
2000 * strategy routine with the table entry. This will be called before
2001 * any automatic read or write of the data is performed.
2002 *
2003 * The strategy routine may return
2004 *
2005 * < 0 - Error occurred (error is passed to user process)
2006 *
2007 * 0 - OK - proceed with automatic read or write.
2008 *
2009 * > 0 - OK - read or write has been done by the strategy routine, so
2010 * return immediately.
2011 *
2012 * There must be a proc_handler routine for any terminal nodes 1751 * There must be a proc_handler routine for any terminal nodes
2013 * mirrored under /proc/sys (non-terminals are handled by a built-in 1752 * mirrored under /proc/sys (non-terminals are handled by a built-in
2014 * directory handler). Several default handlers are available to 1753 * directory handler). Several default handlers are available to
@@ -2035,13 +1774,13 @@ struct ctl_table_header *__register_sysctl_paths(
2035 struct ctl_table_set *set; 1774 struct ctl_table_set *set;
2036 1775
2037 /* Count the path components */ 1776 /* Count the path components */
2038 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) 1777 for (npath = 0; path[npath].procname; ++npath)
2039 ; 1778 ;
2040 1779
2041 /* 1780 /*
2042 * For each path component, allocate a 2-element ctl_table array. 1781 * For each path component, allocate a 2-element ctl_table array.
2043 * The first array element will be filled with the sysctl entry 1782 * The first array element will be filled with the sysctl entry
2044 * for this, the second will be the sentinel (ctl_name == 0). 1783 * for this, the second will be the sentinel (procname == 0).
2045 * 1784 *
2046 * We allocate everything in one go so that we don't have to 1785 * We allocate everything in one go so that we don't have to
2047 * worry about freeing additional memory in unregister_sysctl_table. 1786 * worry about freeing additional memory in unregister_sysctl_table.
@@ -2058,7 +1797,6 @@ struct ctl_table_header *__register_sysctl_paths(
2058 for (n = 0; n < npath; ++n, ++path) { 1797 for (n = 0; n < npath; ++n, ++path) {
2059 /* Copy the procname */ 1798 /* Copy the procname */
2060 new->procname = path->procname; 1799 new->procname = path->procname;
2061 new->ctl_name = path->ctl_name;
2062 new->mode = 0555; 1800 new->mode = 0555;
2063 1801
2064 *prevp = new; 1802 *prevp = new;
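After this hunk, the path components handed to __register_sysctl_paths carry only a procname; the per-directory ctl_name copy is gone. A sketch of how a caller might register a table under a named directory in the new style (the "example" directory, table, and init function below are hypothetical):

    #include <linux/sysctl.h>
    #include <linux/init.h>
    #include <linux/errno.h>

    static int example_value;

    static struct ctl_table example_child_table[] = {
        {
            .procname     = "value",
            .data         = &example_value,
            .maxlen       = sizeof(int),
            .mode         = 0644,
            .proc_handler = proc_dointvec,
        },
        { }
    };

    /* Path components name directories only; no .ctl_name anywhere. */
    static struct ctl_path example_path[] = {
        { .procname = "kernel" },
        { .procname = "example" },
        { }
    };

    static struct ctl_table_header *example_header;

    static int __init example_sysctl_init(void)
    {
        example_header = register_sysctl_paths(example_path, example_child_table);
        return example_header ? 0 : -ENOMEM;
    }
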
@@ -2218,7 +1956,7 @@ void sysctl_head_put(struct ctl_table_header *head)
2218#ifdef CONFIG_PROC_SYSCTL 1956#ifdef CONFIG_PROC_SYSCTL
2219 1957
2220static int _proc_do_string(void* data, int maxlen, int write, 1958static int _proc_do_string(void* data, int maxlen, int write,
2221 struct file *filp, void __user *buffer, 1959 void __user *buffer,
2222 size_t *lenp, loff_t *ppos) 1960 size_t *lenp, loff_t *ppos)
2223{ 1961{
2224 size_t len; 1962 size_t len;
@@ -2279,7 +2017,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
2279 * proc_dostring - read a string sysctl 2017 * proc_dostring - read a string sysctl
2280 * @table: the sysctl table 2018 * @table: the sysctl table
2281 * @write: %TRUE if this is a write to the sysctl file 2019 * @write: %TRUE if this is a write to the sysctl file
2282 * @filp: the file structure
2283 * @buffer: the user buffer 2020 * @buffer: the user buffer
2284 * @lenp: the size of the user buffer 2021 * @lenp: the size of the user buffer
2285 * @ppos: file position 2022 * @ppos: file position
@@ -2293,10 +2030,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
2293 * 2030 *
2294 * Returns 0 on success. 2031 * Returns 0 on success.
2295 */ 2032 */
2296int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2033int proc_dostring(struct ctl_table *table, int write,
2297 void __user *buffer, size_t *lenp, loff_t *ppos) 2034 void __user *buffer, size_t *lenp, loff_t *ppos)
2298{ 2035{
2299 return _proc_do_string(table->data, table->maxlen, write, filp, 2036 return _proc_do_string(table->data, table->maxlen, write,
2300 buffer, lenp, ppos); 2037 buffer, lenp, ppos);
2301} 2038}
2302 2039
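Every proc handler in this file loses its struct file * argument; a handler now receives only the table, the write flag, the user buffer, the length, and the file position. A sketch of a custom handler written against the new signature (my_sysctl_handler and my_value are hypothetical), wrapping proc_dointvec_minmax and leaving room for post-write work:

    #include <linux/sysctl.h>

    static int my_value;

    /* New-style handler: no struct file * parameter. */
    static int my_sysctl_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp, loff_t *ppos)
    {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (ret == 0 && write) {
            /* react to the new value here (hypothetical hook) */
        }
        return ret;
    }
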
@@ -2321,7 +2058,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2321} 2058}
2322 2059
2323static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2060static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2324 int write, struct file *filp, void __user *buffer, 2061 int write, void __user *buffer,
2325 size_t *lenp, loff_t *ppos, 2062 size_t *lenp, loff_t *ppos,
2326 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2063 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2327 int write, void *data), 2064 int write, void *data),
@@ -2428,13 +2165,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2428#undef TMPBUFLEN 2165#undef TMPBUFLEN
2429} 2166}
2430 2167
2431static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2168static int do_proc_dointvec(struct ctl_table *table, int write,
2432 void __user *buffer, size_t *lenp, loff_t *ppos, 2169 void __user *buffer, size_t *lenp, loff_t *ppos,
2433 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2170 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2434 int write, void *data), 2171 int write, void *data),
2435 void *data) 2172 void *data)
2436{ 2173{
2437 return __do_proc_dointvec(table->data, table, write, filp, 2174 return __do_proc_dointvec(table->data, table, write,
2438 buffer, lenp, ppos, conv, data); 2175 buffer, lenp, ppos, conv, data);
2439} 2176}
2440 2177
@@ -2442,7 +2179,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2442 * proc_dointvec - read a vector of integers 2179 * proc_dointvec - read a vector of integers
2443 * @table: the sysctl table 2180 * @table: the sysctl table
2444 * @write: %TRUE if this is a write to the sysctl file 2181 * @write: %TRUE if this is a write to the sysctl file
2445 * @filp: the file structure
2446 * @buffer: the user buffer 2182 * @buffer: the user buffer
2447 * @lenp: the size of the user buffer 2183 * @lenp: the size of the user buffer
2448 * @ppos: file position 2184 * @ppos: file position
@@ -2452,10 +2188,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2452 * 2188 *
2453 * Returns 0 on success. 2189 * Returns 0 on success.
2454 */ 2190 */
2455int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2191int proc_dointvec(struct ctl_table *table, int write,
2456 void __user *buffer, size_t *lenp, loff_t *ppos) 2192 void __user *buffer, size_t *lenp, loff_t *ppos)
2457{ 2193{
2458 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2194 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2459 NULL,NULL); 2195 NULL,NULL);
2460} 2196}
2461 2197
@@ -2463,7 +2199,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2463 * Taint values can only be increased 2199 * Taint values can only be increased
2464 * This means we can safely use a temporary. 2200 * This means we can safely use a temporary.
2465 */ 2201 */
2466static int proc_taint(struct ctl_table *table, int write, struct file *filp, 2202static int proc_taint(struct ctl_table *table, int write,
2467 void __user *buffer, size_t *lenp, loff_t *ppos) 2203 void __user *buffer, size_t *lenp, loff_t *ppos)
2468{ 2204{
2469 struct ctl_table t; 2205 struct ctl_table t;
@@ -2475,7 +2211,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2475 2211
2476 t = *table; 2212 t = *table;
2477 t.data = &tmptaint; 2213 t.data = &tmptaint;
2478 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); 2214 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
2479 if (err < 0) 2215 if (err < 0)
2480 return err; 2216 return err;
2481 2217
@@ -2527,7 +2263,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2527 * proc_dointvec_minmax - read a vector of integers with min/max values 2263 * proc_dointvec_minmax - read a vector of integers with min/max values
2528 * @table: the sysctl table 2264 * @table: the sysctl table
2529 * @write: %TRUE if this is a write to the sysctl file 2265 * @write: %TRUE if this is a write to the sysctl file
2530 * @filp: the file structure
2531 * @buffer: the user buffer 2266 * @buffer: the user buffer
2532 * @lenp: the size of the user buffer 2267 * @lenp: the size of the user buffer
2533 * @ppos: file position 2268 * @ppos: file position
@@ -2540,19 +2275,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2540 * 2275 *
2541 * Returns 0 on success. 2276 * Returns 0 on success.
2542 */ 2277 */
2543int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2278int proc_dointvec_minmax(struct ctl_table *table, int write,
2544 void __user *buffer, size_t *lenp, loff_t *ppos) 2279 void __user *buffer, size_t *lenp, loff_t *ppos)
2545{ 2280{
2546 struct do_proc_dointvec_minmax_conv_param param = { 2281 struct do_proc_dointvec_minmax_conv_param param = {
2547 .min = (int *) table->extra1, 2282 .min = (int *) table->extra1,
2548 .max = (int *) table->extra2, 2283 .max = (int *) table->extra2,
2549 }; 2284 };
2550 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2285 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2551 do_proc_dointvec_minmax_conv, &param); 2286 do_proc_dointvec_minmax_conv, &param);
2552} 2287}
2553 2288
2554static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2289static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2555 struct file *filp,
2556 void __user *buffer, 2290 void __user *buffer,
2557 size_t *lenp, loff_t *ppos, 2291 size_t *lenp, loff_t *ppos,
2558 unsigned long convmul, 2292 unsigned long convmul,
@@ -2657,21 +2391,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2657} 2391}
2658 2392
2659static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2393static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2660 struct file *filp,
2661 void __user *buffer, 2394 void __user *buffer,
2662 size_t *lenp, loff_t *ppos, 2395 size_t *lenp, loff_t *ppos,
2663 unsigned long convmul, 2396 unsigned long convmul,
2664 unsigned long convdiv) 2397 unsigned long convdiv)
2665{ 2398{
2666 return __do_proc_doulongvec_minmax(table->data, table, write, 2399 return __do_proc_doulongvec_minmax(table->data, table, write,
2667 filp, buffer, lenp, ppos, convmul, convdiv); 2400 buffer, lenp, ppos, convmul, convdiv);
2668} 2401}
2669 2402
2670/** 2403/**
2671 * proc_doulongvec_minmax - read a vector of long integers with min/max values 2404 * proc_doulongvec_minmax - read a vector of long integers with min/max values
2672 * @table: the sysctl table 2405 * @table: the sysctl table
2673 * @write: %TRUE if this is a write to the sysctl file 2406 * @write: %TRUE if this is a write to the sysctl file
2674 * @filp: the file structure
2675 * @buffer: the user buffer 2407 * @buffer: the user buffer
2676 * @lenp: the size of the user buffer 2408 * @lenp: the size of the user buffer
2677 * @ppos: file position 2409 * @ppos: file position
@@ -2684,17 +2416,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2684 * 2416 *
2685 * Returns 0 on success. 2417 * Returns 0 on success.
2686 */ 2418 */
2687int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2419int proc_doulongvec_minmax(struct ctl_table *table, int write,
2688 void __user *buffer, size_t *lenp, loff_t *ppos) 2420 void __user *buffer, size_t *lenp, loff_t *ppos)
2689{ 2421{
2690 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); 2422 return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
2691} 2423}
2692 2424
2693/** 2425/**
2694 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values 2426 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
2695 * @table: the sysctl table 2427 * @table: the sysctl table
2696 * @write: %TRUE if this is a write to the sysctl file 2428 * @write: %TRUE if this is a write to the sysctl file
2697 * @filp: the file structure
2698 * @buffer: the user buffer 2429 * @buffer: the user buffer
2699 * @lenp: the size of the user buffer 2430 * @lenp: the size of the user buffer
2700 * @ppos: file position 2431 * @ppos: file position
@@ -2709,11 +2440,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
2709 * Returns 0 on success. 2440 * Returns 0 on success.
2710 */ 2441 */
2711int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2442int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2712 struct file *filp,
2713 void __user *buffer, 2443 void __user *buffer,
2714 size_t *lenp, loff_t *ppos) 2444 size_t *lenp, loff_t *ppos)
2715{ 2445{
2716 return do_proc_doulongvec_minmax(table, write, filp, buffer, 2446 return do_proc_doulongvec_minmax(table, write, buffer,
2717 lenp, ppos, HZ, 1000l); 2447 lenp, ppos, HZ, 1000l);
2718} 2448}
2719 2449
@@ -2789,7 +2519,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2789 * proc_dointvec_jiffies - read a vector of integers as seconds 2519 * proc_dointvec_jiffies - read a vector of integers as seconds
2790 * @table: the sysctl table 2520 * @table: the sysctl table
2791 * @write: %TRUE if this is a write to the sysctl file 2521 * @write: %TRUE if this is a write to the sysctl file
2792 * @filp: the file structure
2793 * @buffer: the user buffer 2522 * @buffer: the user buffer
2794 * @lenp: the size of the user buffer 2523 * @lenp: the size of the user buffer
2795 * @ppos: file position 2524 * @ppos: file position
@@ -2801,10 +2530,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2801 * 2530 *
2802 * Returns 0 on success. 2531 * Returns 0 on success.
2803 */ 2532 */
2804int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2533int proc_dointvec_jiffies(struct ctl_table *table, int write,
2805 void __user *buffer, size_t *lenp, loff_t *ppos) 2534 void __user *buffer, size_t *lenp, loff_t *ppos)
2806{ 2535{
2807 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2536 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2808 do_proc_dointvec_jiffies_conv,NULL); 2537 do_proc_dointvec_jiffies_conv,NULL);
2809} 2538}
2810 2539
@@ -2812,7 +2541,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2812 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds 2541 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
2813 * @table: the sysctl table 2542 * @table: the sysctl table
2814 * @write: %TRUE if this is a write to the sysctl file 2543 * @write: %TRUE if this is a write to the sysctl file
2815 * @filp: the file structure
2816 * @buffer: the user buffer 2544 * @buffer: the user buffer
2817 * @lenp: the size of the user buffer 2545 * @lenp: the size of the user buffer
2818 * @ppos: pointer to the file position 2546 * @ppos: pointer to the file position
@@ -2824,10 +2552,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2824 * 2552 *
2825 * Returns 0 on success. 2553 * Returns 0 on success.
2826 */ 2554 */
2827int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2555int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2828 void __user *buffer, size_t *lenp, loff_t *ppos) 2556 void __user *buffer, size_t *lenp, loff_t *ppos)
2829{ 2557{
2830 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2558 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2831 do_proc_dointvec_userhz_jiffies_conv,NULL); 2559 do_proc_dointvec_userhz_jiffies_conv,NULL);
2832} 2560}
2833 2561
@@ -2835,7 +2563,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2835 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds 2563 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
2836 * @table: the sysctl table 2564 * @table: the sysctl table
2837 * @write: %TRUE if this is a write to the sysctl file 2565 * @write: %TRUE if this is a write to the sysctl file
2838 * @filp: the file structure
2839 * @buffer: the user buffer 2566 * @buffer: the user buffer
2840 * @lenp: the size of the user buffer 2567 * @lenp: the size of the user buffer
2841 * @ppos: file position 2568 * @ppos: file position
@@ -2848,14 +2575,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2848 * 2575 *
2849 * Returns 0 on success. 2576 * Returns 0 on success.
2850 */ 2577 */
2851int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2578int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2852 void __user *buffer, size_t *lenp, loff_t *ppos) 2579 void __user *buffer, size_t *lenp, loff_t *ppos)
2853{ 2580{
2854 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2581 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2855 do_proc_dointvec_ms_jiffies_conv, NULL); 2582 do_proc_dointvec_ms_jiffies_conv, NULL);
2856} 2583}
2857 2584
2858static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 2585static int proc_do_cad_pid(struct ctl_table *table, int write,
2859 void __user *buffer, size_t *lenp, loff_t *ppos) 2586 void __user *buffer, size_t *lenp, loff_t *ppos)
2860{ 2587{
2861 struct pid *new_pid; 2588 struct pid *new_pid;
@@ -2864,7 +2591,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2864 2591
2865 tmp = pid_vnr(cad_pid); 2592 tmp = pid_vnr(cad_pid);
2866 2593
2867 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2594 r = __do_proc_dointvec(&tmp, table, write, buffer,
2868 lenp, ppos, NULL, NULL); 2595 lenp, ppos, NULL, NULL);
2869 if (r || !write) 2596 if (r || !write)
2870 return r; 2597 return r;
@@ -2879,50 +2606,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2879 2606
2880#else /* CONFIG_PROC_FS */ 2607#else /* CONFIG_PROC_FS */
2881 2608
2882int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2609int proc_dostring(struct ctl_table *table, int write,
2883 void __user *buffer, size_t *lenp, loff_t *ppos) 2610 void __user *buffer, size_t *lenp, loff_t *ppos)
2884{ 2611{
2885 return -ENOSYS; 2612 return -ENOSYS;
2886} 2613}
2887 2614
2888int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2615int proc_dointvec(struct ctl_table *table, int write,
2889 void __user *buffer, size_t *lenp, loff_t *ppos) 2616 void __user *buffer, size_t *lenp, loff_t *ppos)
2890{ 2617{
2891 return -ENOSYS; 2618 return -ENOSYS;
2892} 2619}
2893 2620
2894int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2621int proc_dointvec_minmax(struct ctl_table *table, int write,
2895 void __user *buffer, size_t *lenp, loff_t *ppos) 2622 void __user *buffer, size_t *lenp, loff_t *ppos)
2896{ 2623{
2897 return -ENOSYS; 2624 return -ENOSYS;
2898} 2625}
2899 2626
2900int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2627int proc_dointvec_jiffies(struct ctl_table *table, int write,
2901 void __user *buffer, size_t *lenp, loff_t *ppos) 2628 void __user *buffer, size_t *lenp, loff_t *ppos)
2902{ 2629{
2903 return -ENOSYS; 2630 return -ENOSYS;
2904} 2631}
2905 2632
2906int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2633int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2907 void __user *buffer, size_t *lenp, loff_t *ppos) 2634 void __user *buffer, size_t *lenp, loff_t *ppos)
2908{ 2635{
2909 return -ENOSYS; 2636 return -ENOSYS;
2910} 2637}
2911 2638
2912int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2639int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2913 void __user *buffer, size_t *lenp, loff_t *ppos) 2640 void __user *buffer, size_t *lenp, loff_t *ppos)
2914{ 2641{
2915 return -ENOSYS; 2642 return -ENOSYS;
2916} 2643}
2917 2644
2918int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2645int proc_doulongvec_minmax(struct ctl_table *table, int write,
2919 void __user *buffer, size_t *lenp, loff_t *ppos) 2646 void __user *buffer, size_t *lenp, loff_t *ppos)
2920{ 2647{
2921 return -ENOSYS; 2648 return -ENOSYS;
2922} 2649}
2923 2650
2924int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2651int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2925 struct file *filp,
2926 void __user *buffer, 2652 void __user *buffer,
2927 size_t *lenp, loff_t *ppos) 2653 size_t *lenp, loff_t *ppos)
2928{ 2654{
@@ -2932,286 +2658,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2932 2658
2933#endif /* CONFIG_PROC_FS */ 2659#endif /* CONFIG_PROC_FS */
2934 2660
2935
2936#ifdef CONFIG_SYSCTL_SYSCALL
2937/*
2938 * General sysctl support routines
2939 */
2940
2941/* The generic sysctl data routine (used if no strategy routine supplied) */
2942int sysctl_data(struct ctl_table *table,
2943 void __user *oldval, size_t __user *oldlenp,
2944 void __user *newval, size_t newlen)
2945{
2946 size_t len;
2947
2948 /* Get out of I don't have a variable */
2949 if (!table->data || !table->maxlen)
2950 return -ENOTDIR;
2951
2952 if (oldval && oldlenp) {
2953 if (get_user(len, oldlenp))
2954 return -EFAULT;
2955 if (len) {
2956 if (len > table->maxlen)
2957 len = table->maxlen;
2958 if (copy_to_user(oldval, table->data, len))
2959 return -EFAULT;
2960 if (put_user(len, oldlenp))
2961 return -EFAULT;
2962 }
2963 }
2964
2965 if (newval && newlen) {
2966 if (newlen > table->maxlen)
2967 newlen = table->maxlen;
2968
2969 if (copy_from_user(table->data, newval, newlen))
2970 return -EFAULT;
2971 }
2972 return 1;
2973}
2974
2975/* The generic string strategy routine: */
2976int sysctl_string(struct ctl_table *table,
2977 void __user *oldval, size_t __user *oldlenp,
2978 void __user *newval, size_t newlen)
2979{
2980 if (!table->data || !table->maxlen)
2981 return -ENOTDIR;
2982
2983 if (oldval && oldlenp) {
2984 size_t bufsize;
2985 if (get_user(bufsize, oldlenp))
2986 return -EFAULT;
2987 if (bufsize) {
2988 size_t len = strlen(table->data), copied;
2989
2990 /* This shouldn't trigger for a well-formed sysctl */
2991 if (len > table->maxlen)
2992 len = table->maxlen;
2993
2994 /* Copy up to a max of bufsize-1 bytes of the string */
2995 copied = (len >= bufsize) ? bufsize - 1 : len;
2996
2997 if (copy_to_user(oldval, table->data, copied) ||
2998 put_user(0, (char __user *)(oldval + copied)))
2999 return -EFAULT;
3000 if (put_user(len, oldlenp))
3001 return -EFAULT;
3002 }
3003 }
3004 if (newval && newlen) {
3005 size_t len = newlen;
3006 if (len > table->maxlen)
3007 len = table->maxlen;
3008 if(copy_from_user(table->data, newval, len))
3009 return -EFAULT;
3010 if (len == table->maxlen)
3011 len--;
3012 ((char *) table->data)[len] = 0;
3013 }
3014 return 1;
3015}
3016
3017/*
3018 * This function makes sure that all of the integers in the vector
3019 * are between the minimum and maximum values given in the arrays
3020 * table->extra1 and table->extra2, respectively.
3021 */
3022int sysctl_intvec(struct ctl_table *table,
3023 void __user *oldval, size_t __user *oldlenp,
3024 void __user *newval, size_t newlen)
3025{
3026
3027 if (newval && newlen) {
3028 int __user *vec = (int __user *) newval;
3029 int *min = (int *) table->extra1;
3030 int *max = (int *) table->extra2;
3031 size_t length;
3032 int i;
3033
3034 if (newlen % sizeof(int) != 0)
3035 return -EINVAL;
3036
3037 if (!table->extra1 && !table->extra2)
3038 return 0;
3039
3040 if (newlen > table->maxlen)
3041 newlen = table->maxlen;
3042 length = newlen / sizeof(int);
3043
3044 for (i = 0; i < length; i++) {
3045 int value;
3046 if (get_user(value, vec + i))
3047 return -EFAULT;
3048 if (min && value < min[i])
3049 return -EINVAL;
3050 if (max && value > max[i])
3051 return -EINVAL;
3052 }
3053 }
3054 return 0;
3055}
3056
3057/* Strategy function to convert jiffies to seconds */
3058int sysctl_jiffies(struct ctl_table *table,
3059 void __user *oldval, size_t __user *oldlenp,
3060 void __user *newval, size_t newlen)
3061{
3062 if (oldval && oldlenp) {
3063 size_t olen;
3064
3065 if (get_user(olen, oldlenp))
3066 return -EFAULT;
3067 if (olen) {
3068 int val;
3069
3070 if (olen < sizeof(int))
3071 return -EINVAL;
3072
3073 val = *(int *)(table->data) / HZ;
3074 if (put_user(val, (int __user *)oldval))
3075 return -EFAULT;
3076 if (put_user(sizeof(int), oldlenp))
3077 return -EFAULT;
3078 }
3079 }
3080 if (newval && newlen) {
3081 int new;
3082 if (newlen != sizeof(int))
3083 return -EINVAL;
3084 if (get_user(new, (int __user *)newval))
3085 return -EFAULT;
3086 *(int *)(table->data) = new*HZ;
3087 }
3088 return 1;
3089}
3090
3091/* Strategy function to convert jiffies to milliseconds */
3092int sysctl_ms_jiffies(struct ctl_table *table,
3093 void __user *oldval, size_t __user *oldlenp,
3094 void __user *newval, size_t newlen)
3095{
3096 if (oldval && oldlenp) {
3097 size_t olen;
3098
3099 if (get_user(olen, oldlenp))
3100 return -EFAULT;
3101 if (olen) {
3102 int val;
3103
3104 if (olen < sizeof(int))
3105 return -EINVAL;
3106
3107 val = jiffies_to_msecs(*(int *)(table->data));
3108 if (put_user(val, (int __user *)oldval))
3109 return -EFAULT;
3110 if (put_user(sizeof(int), oldlenp))
3111 return -EFAULT;
3112 }
3113 }
3114 if (newval && newlen) {
3115 int new;
3116 if (newlen != sizeof(int))
3117 return -EINVAL;
3118 if (get_user(new, (int __user *)newval))
3119 return -EFAULT;
3120 *(int *)(table->data) = msecs_to_jiffies(new);
3121 }
3122 return 1;
3123}
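
A short worked illustration of the two conversions, assuming HZ happens to be 250 (the actual value is a kernel configuration choice):

	unsigned long stored = 500;		/* jiffies held in table->data */
	int secs  = stored / HZ;		/* sysctl_jiffies() reports 2 when HZ == 250 */
	int msecs = jiffies_to_msecs(stored);	/* sysctl_ms_jiffies() reports 2000 */

	/* Writes run the other way: sysctl_jiffies() stores new * HZ, while
	 * sysctl_ms_jiffies() stores msecs_to_jiffies(new). */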
3124
3125
3126
3127#else /* CONFIG_SYSCTL_SYSCALL */
3128
3129
3130SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
3131{
3132 struct __sysctl_args tmp;
3133 int error;
3134
3135 if (copy_from_user(&tmp, args, sizeof(tmp)))
3136 return -EFAULT;
3137
3138 error = deprecated_sysctl_warning(&tmp);
3139
3140 /* If no error reading the parameters then just -ENOSYS ... */
3141 if (!error)
3142 error = -ENOSYS;
3143
3144 return error;
3145}
3146
3147int sysctl_data(struct ctl_table *table,
3148 void __user *oldval, size_t __user *oldlenp,
3149 void __user *newval, size_t newlen)
3150{
3151 return -ENOSYS;
3152}
3153
3154int sysctl_string(struct ctl_table *table,
3155 void __user *oldval, size_t __user *oldlenp,
3156 void __user *newval, size_t newlen)
3157{
3158 return -ENOSYS;
3159}
3160
3161int sysctl_intvec(struct ctl_table *table,
3162 void __user *oldval, size_t __user *oldlenp,
3163 void __user *newval, size_t newlen)
3164{
3165 return -ENOSYS;
3166}
3167
3168int sysctl_jiffies(struct ctl_table *table,
3169 void __user *oldval, size_t __user *oldlenp,
3170 void __user *newval, size_t newlen)
3171{
3172 return -ENOSYS;
3173}
3174
3175int sysctl_ms_jiffies(struct ctl_table *table,
3176 void __user *oldval, size_t __user *oldlenp,
3177 void __user *newval, size_t newlen)
3178{
3179 return -ENOSYS;
3180}
3181
3182#endif /* CONFIG_SYSCTL_SYSCALL */
3183
3184static int deprecated_sysctl_warning(struct __sysctl_args *args)
3185{
3186 static int msg_count;
3187 int name[CTL_MAXNAME];
3188 int i;
3189
3190 /* Check args->nlen. */
3191 if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
3192 return -ENOTDIR;
3193
3194 /* Read in the sysctl name for better debug message logging */
3195 for (i = 0; i < args->nlen; i++)
3196 if (get_user(name[i], args->name + i))
3197 return -EFAULT;
3198
3199 /* Ignore accesses to kernel.version */
3200 if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
3201 return 0;
3202
3203 if (msg_count < 5) {
3204 msg_count++;
3205 printk(KERN_INFO
3206 "warning: process `%s' used the deprecated sysctl "
3207 "system call with ", current->comm);
3208 for (i = 0; i < args->nlen; i++)
3209 printk("%d.", name[i]);
3210 printk("\n");
3211 }
3212 return 0;
3213}
3214
3215 2661 /*
3216 2662  * No sense putting this after each symbol definition, twice,
3217 2663  * exception granted :-)
@@ -3226,9 +2672,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax);
3226 2672 EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3227 2673 EXPORT_SYMBOL(register_sysctl_table);
3228 2674 EXPORT_SYMBOL(register_sysctl_paths);
3229 EXPORT_SYMBOL(sysctl_intvec);
3230 EXPORT_SYMBOL(sysctl_jiffies);
3231 EXPORT_SYMBOL(sysctl_ms_jiffies);
3232 EXPORT_SYMBOL(sysctl_string);
3233 EXPORT_SYMBOL(sysctl_data);
3234 2675 EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
new file mode 100644
index 000000000000..b75dbf40f573
--- /dev/null
+++ b/kernel/sysctl_binary.c
@@ -0,0 +1,1507 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7#include <linux/syscalls.h>
8#include <linux/namei.h>
9#include <linux/mount.h>
10#include <linux/fs.h>
11#include <linux/nsproxy.h>
12#include <linux/pid_namespace.h>
13#include <linux/file.h>
14#include <linux/ctype.h>
15#include <linux/netdevice.h>
16
17#ifdef CONFIG_SYSCTL_SYSCALL
18
19struct bin_table;
20typedef ssize_t bin_convert_t(struct file *file,
21 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
22
23static bin_convert_t bin_dir;
24static bin_convert_t bin_string;
25static bin_convert_t bin_intvec;
26static bin_convert_t bin_ulongvec;
27static bin_convert_t bin_uuid;
28static bin_convert_t bin_dn_node_address;
29
30#define CTL_DIR bin_dir
31#define CTL_STR bin_string
32#define CTL_INT bin_intvec
33#define CTL_ULONG bin_ulongvec
34#define CTL_UUID bin_uuid
35#define CTL_DNADR bin_dn_node_address
36
37#define BUFSZ 256
38
39struct bin_table {
40 bin_convert_t *convert;
41 int ctl_name;
42 const char *procname;
43 const struct bin_table *child;
44};
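
To illustrate how the fields are consumed by the lookup code further down, a hypothetical pair of tables (the names and numbers are assumptions, not entries in this file): a leaf entry supplies a converter and a procname, a CTL_DIR entry additionally links a child table, and the all-zero sentinel (NULL ->convert) terminates each table:

	static const struct bin_table example_leaf_table[] = {
		{ CTL_INT, 1, "some_value" },		/* converted by bin_intvec */
		{}					/* terminator: ->convert == NULL */
	};

	static const struct bin_table example_dir_table[] = {
		{ CTL_DIR, 2, "some_dir", example_leaf_table },	/* descend into child */
		{}
	};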
45
46static const struct bin_table bin_random_table[] = {
47 { CTL_INT, RANDOM_POOLSIZE, "poolsize" },
48 { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" },
49 { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" },
50 { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
51 { CTL_UUID, RANDOM_BOOT_ID, "boot_id" },
52 { CTL_UUID, RANDOM_UUID, "uuid" },
53 {}
54};
55
56static const struct bin_table bin_pty_table[] = {
57 { CTL_INT, PTY_MAX, "max" },
58 { CTL_INT, PTY_NR, "nr" },
59 {}
60};
61
62static const struct bin_table bin_kern_table[] = {
63 { CTL_STR, KERN_OSTYPE, "ostype" },
64 { CTL_STR, KERN_OSRELEASE, "osrelease" },
65 /* KERN_OSREV not used */
66 { CTL_STR, KERN_VERSION, "version" },
67 /* KERN_SECUREMASK not used */
68 /* KERN_PROF not used */
69 { CTL_STR, KERN_NODENAME, "hostname" },
70 { CTL_STR, KERN_DOMAINNAME, "domainname" },
71
72 { CTL_INT, KERN_PANIC, "panic" },
73 { CTL_INT, KERN_REALROOTDEV, "real-root-dev" },
74
75 { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" },
76 { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" },
77 { CTL_INT, KERN_PRINTK, "printk" },
78
79 /* KERN_NAMETRANS not used */
80 /* KERN_PPC_HTABRECLAIM not used */
81 /* KERN_PPC_ZEROPAGED not used */
82 { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
83
84 { CTL_STR, KERN_MODPROBE, "modprobe" },
85 { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" },
86 { CTL_INT, KERN_ACCT, "acct" },
87 /* KERN_PPC_L2CR "l2cr" no longer used */
88
89 /* KERN_RTSIGNR not used */
90 /* KERN_RTSIGMAX not used */
91
92 { CTL_ULONG, KERN_SHMMAX, "shmmax" },
93 { CTL_INT, KERN_MSGMAX, "msgmax" },
94 { CTL_INT, KERN_MSGMNB, "msgmnb" },
95 /* KERN_MSGPOOL not used*/
96 { CTL_INT, KERN_SYSRQ, "sysrq" },
97 { CTL_INT, KERN_MAX_THREADS, "threads-max" },
98 { CTL_DIR, KERN_RANDOM, "random", bin_random_table },
99 { CTL_ULONG, KERN_SHMALL, "shmall" },
100 { CTL_INT, KERN_MSGMNI, "msgmni" },
101 { CTL_INT, KERN_SEM, "sem" },
102 { CTL_INT, KERN_SPARC_STOP_A, "stop-a" },
103 { CTL_INT, KERN_SHMMNI, "shmmni" },
104
105 { CTL_INT, KERN_OVERFLOWUID, "overflowuid" },
106 { CTL_INT, KERN_OVERFLOWGID, "overflowgid" },
107
108 { CTL_STR, KERN_HOTPLUG, "hotplug", },
109 { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
110
111 { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
112 { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" },
113 /* KERN_TAINTED "tainted" no longer used */
114 { CTL_INT, KERN_CADPID, "cad_pid" },
115 { CTL_INT, KERN_PIDMAX, "pid_max" },
116 { CTL_STR, KERN_CORE_PATTERN, "core_pattern" },
117 { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" },
118 { CTL_INT, KERN_HPPA_PWRSW, "soft-power" },
119 { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" },
120
121 { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
122 { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
123
124 { CTL_DIR, KERN_PTY, "pty", bin_pty_table },
125 { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" },
126 { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
127 /* KERN_HZ_TIMER "hz_timer" no longer used */
128 { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
129 { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" },
130 { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" },
131
132 { CTL_INT, KERN_SPIN_RETRY, "spin_retry" },
133 /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
134 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
135 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
136 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
137 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
138 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
139 {}
140};
141
142static const struct bin_table bin_vm_table[] = {
143 { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
144 { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" },
145 { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
146 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
147 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
148 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
149 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
150 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
151 /* VM_PAGEBUF unused */
152 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
153 { CTL_INT, VM_SWAPPINESS, "swappiness" },
154 { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
155 { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" },
156 { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" },
157 { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" },
158 { CTL_INT, VM_BLOCK_DUMP, "block_dump" },
159 { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" },
160 { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
161 { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
162 /* VM_SWAP_TOKEN_TIMEOUT unused */
163 { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" },
164 { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
165 { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
166 { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" },
167 { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" },
168 { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" },
169 { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" },
170
171 {}
172};
173
174static const struct bin_table bin_net_core_table[] = {
175 { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" },
176 { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" },
177 { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" },
178 { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" },
179 /* NET_CORE_DESTROY_DELAY unused */
180 { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
181 /* NET_CORE_FASTROUTE unused */
182 { CTL_INT, NET_CORE_MSG_COST, "message_cost" },
183 { CTL_INT, NET_CORE_MSG_BURST, "message_burst" },
184 { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" },
185 /* NET_CORE_HOT_LIST_LENGTH unused */
186 /* NET_CORE_DIVERT_VERSION unused */
187 /* NET_CORE_NO_CONG_THRESH unused */
188 /* NET_CORE_NO_CONG unused */
189 /* NET_CORE_LO_CONG unused */
190 /* NET_CORE_MOD_CONG unused */
191 { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" },
192 { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" },
193 { CTL_INT, NET_CORE_BUDGET, "netdev_budget" },
194 { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
195 { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
196 { CTL_INT, NET_CORE_WARNINGS, "warnings" },
197 {},
198};
199
200static const struct bin_table bin_net_unix_table[] = {
201 /* NET_UNIX_DESTROY_DELAY unused */
202 /* NET_UNIX_DELETE_DELAY unused */
203 { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
204 {}
205};
206
207static const struct bin_table bin_net_ipv4_route_table[] = {
208 { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" },
209 /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
210 /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
211 { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
212 { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
213 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
217 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
220 { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
221 { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
222 { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
223 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
224 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
226 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
227 {}
228};
229
230static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
231 { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" },
232 { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
233
234 { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
235 { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
236 { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
237 { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
238 { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" },
239 { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
240 { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
241 { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
242 { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
243 { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
244 { CTL_INT, NET_IPV4_CONF_TAG, "tag" },
245 { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" },
246 { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
247 { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
248 { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
249 { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
250
251 { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
252 { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
253 { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
254 { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
255 {}
256};
257
258static const struct bin_table bin_net_ipv4_conf_table[] = {
259 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table },
260 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table },
261 { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table },
262 {}
263};
264
265static const struct bin_table bin_net_neigh_vars_table[] = {
266 { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
267 { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
268 { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" },
269 /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
270 { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
271 { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
272 { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
273 { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" },
274 { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
275 /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
276 /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
277 /* NET_NEIGH_LOCKTIME "locktime" no longer used */
278 { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" },
279 { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" },
280 { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" },
281 { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" },
282 { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
283 { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
284 {}
285};
286
287static const struct bin_table bin_net_neigh_table[] = {
288 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
289 { CTL_DIR, 0, NULL, bin_net_neigh_vars_table },
290 {}
291};
292
293static const struct bin_table bin_net_ipv4_netfilter_table[] = {
294 { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
295
296 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
297 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
298 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
299 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
300 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */
301 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
302 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
303 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
304
305 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
306 /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
307 /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
308 /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
309
310 { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
311 { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
312 /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
313 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
314 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
315 { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
316
317 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
318 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
319 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
320 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
321 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
322 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
323 /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
324
325 { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
326 { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
327 {}
328};
329
330static const struct bin_table bin_net_ipv4_table[] = {
331 { CTL_INT, NET_IPV4_FORWARD, "ip_forward" },
332
333 { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table },
334 { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table },
335 { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table },
336 /* NET_IPV4_FIB_HASH unused */
337 { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table },
338
339 { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
340 { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
341 { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" },
342 { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
343 { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
344 /* NET_IPV4_AUTOCONFIG unused */
345 { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
346 { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
347 { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
348 { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
349 { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
350 { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
351 { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" },
352 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
353 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
354 { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
355 { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
356 { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
357 { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
358 { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" },
359 { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
360 { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
361 { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" },
362 { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" },
363 { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
364 { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
365 { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
366 { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
367 { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
368 { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
369 { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
370 { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
371 { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
372 { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
373 { CTL_INT, NET_TCP_FACK, "tcp_fack" },
374 { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
375 { CTL_INT, NET_TCP_ECN, "tcp_ecn" },
376 { CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
377 { CTL_INT, NET_TCP_MEM, "tcp_mem" },
378 { CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
379 { CTL_INT, NET_TCP_RMEM, "tcp_rmem" },
380 { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" },
381 { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
382 { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" },
383 { CTL_INT, NET_TCP_FRTO, "tcp_frto" },
384 { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
385 { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" },
386 { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
394 { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
395 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
396 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
397 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
398 { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
399 { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
400 /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
401 { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
402 { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
403
404 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
405 { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
406 { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
407 { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
408 { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
409 { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
410
411 { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
412 { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
413 { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
414
415 { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
416 /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
417
418 { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
419
420 /* NET_TCP_DEFAULT_WIN_SCALE unused */
421 /* NET_TCP_BIC_BETA unused */
422 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
423 /* NET_IPV4_IP_MASQ_DEBUG unused */
424 /* NET_TCP_SYN_TAILDROP unused */
425 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
426 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
427 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
428 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
429 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
430 /* NET_IPV4_ALWAYS_DEFRAG unused */
431 {}
432};
433
434static const struct bin_table bin_net_ipx_table[] = {
435 { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
436 /* NET_IPX_FORWARDING unused */
437 {}
438};
439
440static const struct bin_table bin_net_atalk_table[] = {
441 { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
442 { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
443 { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
444 { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
445 {},
446};
447
448static const struct bin_table bin_net_netrom_table[] = {
449 { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
450 { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
451 { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
452 { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
453 { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
454 { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
455 { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
456 { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
457 { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
458 { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" },
459 { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
460 { CTL_INT, NET_NETROM_RESET, "reset" },
461 {}
462};
463
464static const struct bin_table bin_net_ax25_param_table[] = {
465 { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
466 { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
467 { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" },
468 { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" },
469 { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" },
470 { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
471 { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" },
472 { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" },
473 { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" },
474 { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
475 { CTL_INT, NET_AX25_N2, "maximum_retry_count" },
476 { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" },
477 { CTL_INT, NET_AX25_PROTOCOL, "protocol" },
478 { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
479 {}
480};
481
482static const struct bin_table bin_net_ax25_table[] = {
483 { CTL_DIR, 0, NULL, bin_net_ax25_param_table },
484 {}
485};
486
487static const struct bin_table bin_net_rose_table[] = {
488 { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
489 { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
490 { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
491 { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
492 { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
493 { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" },
494 { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
495 { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
496 { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" },
497 { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
498 {}
499};
500
501static const struct bin_table bin_net_ipv6_conf_var_table[] = {
502 { CTL_INT, NET_IPV6_FORWARDING, "forwarding" },
503 { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" },
504 { CTL_INT, NET_IPV6_MTU, "mtu" },
505 { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" },
506 { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
507 { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" },
508 { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
509 { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" },
510 { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
511 { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
512 { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
513 { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
514 { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
515 { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
516 { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
517 { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" },
518 { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
519 { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
520 { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
521 { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
522 { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
523 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
524 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
525 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
526 {}
527};
528
529static const struct bin_table bin_net_ipv6_conf_table[] = {
530 { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table },
531 { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table },
532 { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table },
533 {}
534};
535
536static const struct bin_table bin_net_ipv6_route_table[] = {
537 /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */
538 { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
539 { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
540 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
541 { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
542 { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
543 { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
544 { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
545 { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
546 { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
547 {}
548};
549
550static const struct bin_table bin_net_ipv6_icmp_table[] = {
551 { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
552 {}
553};
554
555static const struct bin_table bin_net_ipv6_table[] = {
556 { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
557 { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
558 { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
559 { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
560 { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
561 { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
562 { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
563 { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
564 { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
565 { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
566 { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
567 {}
568};
569
570static const struct bin_table bin_net_x25_table[] = {
571 { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
572 { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
573 { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
574 { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
575 { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
576 { CTL_INT, NET_X25_FORWARD, "x25_forward" },
577 {}
578};
579
580static const struct bin_table bin_net_tr_table[] = {
581 { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
582 {}
583};
584
585
586static const struct bin_table bin_net_decnet_conf_vars[] = {
587 { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
588 { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
589 { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
590 { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
591 {}
592};
593
594static const struct bin_table bin_net_decnet_conf[] = {
595 { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
596 { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
597 { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
598 { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
599 { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
600 { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
601 { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
602 {}
603};
604
605static const struct bin_table bin_net_decnet_table[] = {
606 { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
607 { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
608 { CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
609 { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
610 { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
611 { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
612 { CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
613 { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
614 { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
615 { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
616 { CTL_INT, NET_DECNET_MEM, "decnet_mem" },
617 { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
618 { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
619 { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
620 {}
621};
622
623static const struct bin_table bin_net_sctp_table[] = {
624 { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
625 { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
626 { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
627 { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
628 { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
629 { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
630 { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
631 { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
632 { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
633 { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
634 { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
635 { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
636 { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
637 { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
638 { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
639 { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
640 { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
641 {}
642};
643
644static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
645 { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
646 { CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
647 { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
648 { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
649 {}
650};
651
652static const struct bin_table bin_net_llc_station_table[] = {
653 { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
654 {}
655};
656
657static const struct bin_table bin_net_llc_llc2_table[] = {
658 { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
659 {}
660};
661
662static const struct bin_table bin_net_llc_table[] = {
663 { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
664 { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
665 {}
666};
667
668static const struct bin_table bin_net_netfilter_table[] = {
669 { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
670 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
671 /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
672 /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
673 /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
674 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
675 /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
676 /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
677 /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
678 /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
679 /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
680 /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
681 /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
682 { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
683 { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
684 /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
685 { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
686 { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
687 { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
688 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
689 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
690 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
691 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
692 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
693 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
694 /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
695 { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
696 /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
697 /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
698 { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
699 { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
700 { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
701
702 {}
703};
704
705static const struct bin_table bin_net_irda_table[] = {
706 { CTL_INT, NET_IRDA_DISCOVERY, "discovery" },
707 { CTL_STR, NET_IRDA_DEVNAME, "devname" },
708 { CTL_INT, NET_IRDA_DEBUG, "debug" },
709 { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" },
710 { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
711 { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
712 { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
713 { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
714 { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
715 { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
716 { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
717 { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
718 { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
719 { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
720 {}
721};
722
723static const struct bin_table bin_net_table[] = {
724 { CTL_DIR, NET_CORE, "core", bin_net_core_table },
725 /* NET_ETHER not used */
726 /* NET_802 not used */
727 { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
728 { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
729 { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
730 { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
731 { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
732 { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
733 /* NET_BRIDGE "bridge" no longer used */
734 { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
735 { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
736 { CTL_DIR, NET_X25, "x25", bin_net_x25_table },
737 { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
738 { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
739 /* NET_ECONET not used */
740 { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
741 { CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
742 { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
743 /* NET_DCCP "dccp" no longer used */
744 { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table },
745 { CTL_INT, 2089, "nf_conntrack_max" },
746 {}
747};
748
749static const struct bin_table bin_fs_quota_table[] = {
750 { CTL_INT, FS_DQ_LOOKUPS, "lookups" },
751 { CTL_INT, FS_DQ_DROPS, "drops" },
752 { CTL_INT, FS_DQ_READS, "reads" },
753 { CTL_INT, FS_DQ_WRITES, "writes" },
754 { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
755 { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
756 { CTL_INT, FS_DQ_FREE, "free_dquots" },
757 { CTL_INT, FS_DQ_SYNCS, "syncs" },
758 { CTL_INT, FS_DQ_WARNINGS, "warnings" },
759 {}
760};
761
762static const struct bin_table bin_fs_xfs_table[] = {
763 { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
764 { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
765 { CTL_INT, XFS_PANIC_MASK, "panic_mask" },
766
767 { CTL_INT, XFS_ERRLEVEL, "error_level" },
768 { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
769 { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
770 { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
771 { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
772 { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
773 { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
774 { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
775 { CTL_INT, XFS_ROTORSTEP, "rotorstep" },
776 { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
777 { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
778 { CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
779 {}
780};
781
782static const struct bin_table bin_fs_ocfs2_nm_table[] = {
783 { CTL_STR, 1, "hb_ctl_path" },
784 {}
785};
786
787static const struct bin_table bin_fs_ocfs2_table[] = {
788 { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
789 {}
790};
791
792static const struct bin_table bin_inotify_table[] = {
793 { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
794 { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
795 { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
796 {}
797};
798
799static const struct bin_table bin_fs_table[] = {
800 { CTL_INT, FS_NRINODE, "inode-nr" },
801 { CTL_INT, FS_STATINODE, "inode-state" },
802 /* FS_MAXINODE unused */
803 /* FS_NRDQUOT unused */
804 /* FS_MAXDQUOT unused */
805 /* FS_NRFILE "file-nr" no longer used */
806 { CTL_INT, FS_MAXFILE, "file-max" },
807 { CTL_INT, FS_DENTRY, "dentry-state" },
808 /* FS_NRSUPER unused */
809 /* FS_MAXUPSER unused */
810 { CTL_INT, FS_OVERFLOWUID, "overflowuid" },
811 { CTL_INT, FS_OVERFLOWGID, "overflowgid" },
812 { CTL_INT, FS_LEASES, "leases-enable" },
813 { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
814 { CTL_INT, FS_LEASE_TIME, "lease-break-time" },
815 { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
816 { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
817 { CTL_ULONG, FS_AIO_NR, "aio-nr" },
818 { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
819 { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
820 { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
821 { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
822 {}
823};
824
825static const struct bin_table bin_ipmi_table[] = {
826 { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
827 {}
828};
829
830static const struct bin_table bin_mac_hid_files[] = {
831 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
832 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
833 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
834 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
835 { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
836 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
837 {}
838};
839
840static const struct bin_table bin_raid_table[] = {
841 { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
842 { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
843 {}
844};
845
846static const struct bin_table bin_scsi_table[] = {
847 { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
848 {}
849};
850
851static const struct bin_table bin_dev_table[] = {
852 /* DEV_CDROM "cdrom" no longer used */
853 /* DEV_HWMON unused */
854 /* DEV_PARPORT "parport" no longer used */
855 { CTL_DIR, DEV_RAID, "raid", bin_raid_table },
856 { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
857 { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
858 { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
859 {}
860};
861
862static const struct bin_table bin_bus_isa_table[] = {
863 { CTL_INT, BUS_ISA_MEM_BASE, "membase" },
864 { CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
865 { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
866 {}
867};
868
869static const struct bin_table bin_bus_table[] = {
870 { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
871 {}
872};
873
874
875static const struct bin_table bin_s390dbf_table[] = {
876 { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
877 { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
878 {}
879};
880
881static const struct bin_table bin_sunrpc_table[] = {
882 /* CTL_RPCDEBUG "rpc_debug" no longer used */
883 /* CTL_NFSDEBUG "nfs_debug" no longer used */
884 /* CTL_NFSDDEBUG "nfsd_debug" no longer used */
885 /* CTL_NLMDEBUG "nlm_debug" no longer used */
886
887 { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
888 { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
889 { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
890 { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
891 {}
892};
893
894static const struct bin_table bin_pm_table[] = {
895 /* frv specific */
896 /* 1 == CTL_PM_SUSPEND "suspend" no longer used */
897 { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
898 { CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
899 { CTL_INT, 4 /* CTL_PM_CM */, "cm" },
900 {}
901};
902
903static const struct bin_table bin_root_table[] = {
904 { CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
905 { CTL_DIR, CTL_VM, "vm", bin_vm_table },
906 { CTL_DIR, CTL_NET, "net", bin_net_table },
907 /* CTL_PROC not used */
908 { CTL_DIR, CTL_FS, "fs", bin_fs_table },
909 /* CTL_DEBUG "debug" no longer used */
910 { CTL_DIR, CTL_DEV, "dev", bin_dev_table },
911 { CTL_DIR, CTL_BUS, "bus", bin_bus_table },
912 { CTL_DIR, CTL_ABI, "abi" },
913 /* CTL_CPU not used */
914 /* CTL_ARLAN "arlan" no longer used */
915 { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
916 { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
917 { CTL_DIR, CTL_PM, "pm", bin_pm_table },
918 {}
919};
920
921static ssize_t bin_dir(struct file *file,
922 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
923{
924 return -ENOTDIR;
925}
926
927
928static ssize_t bin_string(struct file *file,
929 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
930{
931 ssize_t result, copied = 0;
932
933 if (oldval && oldlen) {
934 char __user *lastp;
935 loff_t pos = 0;
936 int ch;
937
938 result = vfs_read(file, oldval, oldlen, &pos);
939 if (result < 0)
940 goto out;
941
942 copied = result;
943 lastp = oldval + copied - 1;
944
945 result = -EFAULT;
946 if (get_user(ch, lastp))
947 goto out;
948
949 /* Trim off the trailing newline */
950 if (ch == '\n') {
951 result = -EFAULT;
952 if (put_user('\0', lastp))
953 goto out;
954 copied -= 1;
955 }
956 }
957
958 if (newval && newlen) {
959 loff_t pos = 0;
960
961 result = vfs_write(file, newval, newlen, &pos);
962 if (result < 0)
963 goto out;
964 }
965
966 result = copied;
967out:
968 return result;
969}
970
971static ssize_t bin_intvec(struct file *file,
972 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
973{
974 mm_segment_t old_fs = get_fs();
975 ssize_t copied = 0;
976 char *buffer;
977 ssize_t result;
978
979 result = -ENOMEM;
980 buffer = kmalloc(BUFSZ, GFP_KERNEL);
981 if (!buffer)
982 goto out;
983
984 if (oldval && oldlen) {
985 unsigned __user *vec = oldval;
986 size_t length = oldlen / sizeof(*vec);
987 loff_t pos = 0;
988 char *str, *end;
989 int i;
990
991 set_fs(KERNEL_DS);
992 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
993 set_fs(old_fs);
994 if (result < 0)
995 goto out_kfree;
996
997 str = buffer;
998 end = str + result;
999 *end++ = '\0';
1000 for (i = 0; i < length; i++) {
1001 unsigned long value;
1002
1003 value = simple_strtoul(str, &str, 10);
1004 while (isspace(*str))
1005 str++;
1006
1007 result = -EFAULT;
1008 if (put_user(value, vec + i))
1009 goto out_kfree;
1010
1011 copied += sizeof(*vec);
1012 if (!isdigit(*str))
1013 break;
1014 }
1015 }
1016
1017 if (newval && newlen) {
1018 unsigned __user *vec = newval;
1019 size_t length = newlen / sizeof(*vec);
1020 loff_t pos = 0;
1021 char *str, *end;
1022 int i;
1023
1024 str = buffer;
1025 end = str + BUFSZ;
1026 for (i = 0; i < length; i++) {
1027 unsigned long value;
1028
1029 result = -EFAULT;
1030 if (get_user(value, vec + i))
1031 goto out_kfree;
1032
1033 str += snprintf(str, end - str, "%lu\t", value);
1034 }
1035
1036 set_fs(KERNEL_DS);
1037 result = vfs_write(file, buffer, str - buffer, &pos);
1038 set_fs(old_fs);
1039 if (result < 0)
1040 goto out_kfree;
1041 }
1042 result = copied;
1043out_kfree:
1044 kfree(buffer);
1045out:
1046 return result;
1047}
1048
1049static ssize_t bin_ulongvec(struct file *file,
1050 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1051{
1052 mm_segment_t old_fs = get_fs();
1053 ssize_t copied = 0;
1054 char *buffer;
1055 ssize_t result;
1056
1057 result = -ENOMEM;
1058 buffer = kmalloc(BUFSZ, GFP_KERNEL);
1059 if (!buffer)
1060 goto out;
1061
1062 if (oldval && oldlen) {
1063 unsigned long __user *vec = oldval;
1064 size_t length = oldlen / sizeof(*vec);
1065 loff_t pos = 0;
1066 char *str, *end;
1067 int i;
1068
1069 set_fs(KERNEL_DS);
1070 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1071 set_fs(old_fs);
1072 if (result < 0)
1073 goto out_kfree;
1074
1075 str = buffer;
1076 end = str + result;
1077 *end++ = '\0';
1078 for (i = 0; i < length; i++) {
1079 unsigned long value;
1080
1081 value = simple_strtoul(str, &str, 10);
1082 while (isspace(*str))
1083 str++;
1084
1085 result = -EFAULT;
1086 if (put_user(value, vec + i))
1087 goto out_kfree;
1088
1089 copied += sizeof(*vec);
1090 if (!isdigit(*str))
1091 break;
1092 }
1093 }
1094
1095 if (newval && newlen) {
1096 unsigned long __user *vec = newval;
1097 size_t length = newlen / sizeof(*vec);
1098 loff_t pos = 0;
1099 char *str, *end;
1100 int i;
1101
1102 str = buffer;
1103 end = str + BUFSZ;
1104 for (i = 0; i < length; i++) {
1105 unsigned long value;
1106
1107 result = -EFAULT;
1108 if (get_user(value, vec + i))
1109 goto out_kfree;
1110
1111 str += snprintf(str, end - str, "%lu\t", value);
1112 }
1113
1114 set_fs(KERNEL_DS);
1115 result = vfs_write(file, buffer, str - buffer, &pos);
1116 set_fs(old_fs);
1117 if (result < 0)
1118 goto out_kfree;
1119 }
1120 result = copied;
1121out_kfree:
1122 kfree(buffer);
1123out:
1124 return result;
1125}
1126
1127static unsigned hex_value(int ch)
1128{
1129 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1130}
1131
1132static ssize_t bin_uuid(struct file *file,
1133 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1134{
1135 mm_segment_t old_fs = get_fs();
1136 ssize_t result, copied = 0;
1137
1138 /* Only supports reads */
1139 if (oldval && oldlen) {
1140 loff_t pos = 0;
1141 char buf[40], *str = buf;
1142 unsigned char uuid[16];
1143 int i;
1144
1145 set_fs(KERNEL_DS);
1146 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1147 set_fs(old_fs);
1148 if (result < 0)
1149 goto out;
1150
1151 buf[result] = '\0';
1152
1153 /* Convert the uuid from a string to binary */
1154 for (i = 0; i < 16; i++) {
1155 result = -EIO;
1156 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1157 goto out;
1158
1159 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]);
1160 str += 2;
1161 if (*str == '-')
1162 str++;
1163 }
1164
1165 if (oldlen > 16)
1166 oldlen = 16;
1167
1168 result = -EFAULT;
1169 if (copy_to_user(oldval, uuid, oldlen))
1170 goto out;
1171
1172 copied = oldlen;
1173 }
1174 result = copied;
1175out:
1176 return result;
1177}
1178
1179static ssize_t bin_dn_node_address(struct file *file,
1180 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1181{
1182 mm_segment_t old_fs = get_fs();
1183 ssize_t result, copied = 0;
1184
1185 if (oldval && oldlen) {
1186 loff_t pos = 0;
1187 char buf[15], *nodep;
1188 unsigned long area, node;
1189 __le16 dnaddr;
1190
1191 set_fs(KERNEL_DS);
1192 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1193 set_fs(old_fs);
1194 if (result < 0)
1195 goto out;
1196
1197 buf[result] = '\0';
1198
1199 /* Convert the DECnet address to binary */
1200 result = -EIO;
1201 nodep = strchr(buf, '.');
1202 if (!nodep)
1203 goto out;
1204
1205 area = simple_strtoul(buf, NULL, 10);
1206 node = simple_strtoul(nodep + 1, NULL, 10);
1207
1208 result = -EIO;
1209 if ((area > 63) || (node > 1023))
1210 goto out;
1211
1212 dnaddr = cpu_to_le16((area << 10) | node);
1213
1214 result = -EFAULT;
1215 if (put_user(dnaddr, (__le16 __user *)oldval))
1216 goto out;
1217
1218 copied = sizeof(dnaddr);
1219 }
1220
1221 if (newval && newlen) {
1222 loff_t pos = 0;
1223 __le16 dnaddr;
1224 char buf[15];
1225 int len;
1226
1227 result = -EINVAL;
1228 if (newlen != sizeof(dnaddr))
1229 goto out;
1230
1231 result = -EFAULT;
1232 if (get_user(dnaddr, (__le16 __user *)newval))
1233 goto out;
1234
1235 len = snprintf(buf, sizeof(buf), "%hu.%hu",
1236 le16_to_cpu(dnaddr) >> 10,
1237 le16_to_cpu(dnaddr) & 0x3ff);
1238
1239 set_fs(KERNEL_DS);
1240 result = vfs_write(file, buf, len, &pos);
1241 set_fs(old_fs);
1242 if (result < 0)
1243 goto out;
1244 }
1245
1246 result = copied;
1247out:
1248 return result;
1249}
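
A worked instance of the encoding above, with values picked purely for illustration: the textual address "1.10" read back from /proc parses to area 1 and node 10, and the read path hands the binary caller the packed little-endian form:

	unsigned long area = 1, node = 10;			/* parsed from "1.10" */
	__le16 dnaddr = cpu_to_le16((area << 10) | node);	/* (1 << 10) | 10 == 1034 */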
1250
1251static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
1252{
1253 const struct bin_table *table = &bin_root_table[0];
1254 int ctl_name;
1255
1256 /* The binary sysctl tables have a small maximum depth so
1257 * there is no danger of overflowing our path, as it is PATH_MAX
1258 * bytes long.
1259 */
1260 memcpy(path, "sys/", 4);
1261 path += 4;
1262
1263repeat:
1264 if (!nlen)
1265 return ERR_PTR(-ENOTDIR);
1266 ctl_name = *name;
1267 name++;
1268 nlen--;
1269 for ( ; table->convert; table++) {
1270 int len = 0;
1271
1272 /*
1273 * For a wild card entry map from ifindex to network
1274 * device name.
1275 */
1276 if (!table->ctl_name) {
1277#ifdef CONFIG_NET
1278 struct net *net = current->nsproxy->net_ns;
1279 struct net_device *dev;
1280 dev = dev_get_by_index(net, ctl_name);
1281 if (dev) {
1282 len = strlen(dev->name);
1283 memcpy(path, dev->name, len);
1284 dev_put(dev);
1285 }
1286#endif
1287 /* Use the well-known sysctl number to procname mapping */
1288 } else if (ctl_name == table->ctl_name) {
1289 len = strlen(table->procname);
1290 memcpy(path, table->procname, len);
1291 }
1292 if (len) {
1293 path += len;
1294 if (table->child) {
1295 *path++ = '/';
1296 table = table->child;
1297 goto repeat;
1298 }
1299 *path = '\0';
1300 return table;
1301 }
1302 }
1303 return ERR_PTR(-ENOTDIR);
1304}
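
As a sketch of the translation this performs (illustrative only, not code that is built here): the binary name vector { CTL_NET, NET_IPV4, NET_IPV4_FORWARD } walks bin_root_table -> bin_net_table -> bin_net_ipv4_table and comes back with both the relative path and the converter:

	int name[] = { CTL_NET, NET_IPV4, NET_IPV4_FORWARD };
	char path[PATH_MAX];
	const struct bin_table *tbl = get_sysctl(name, 3, path);
	/* On success: strcmp(path, "sys/net/ipv4/ip_forward") == 0 and
	 * tbl->convert == bin_intvec, so binary_sysctl() below ends up
	 * reading or writing /proc/sys/net/ipv4/ip_forward. */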
1305
1306static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
1307{
1308 char *tmp, *result;
1309
1310 result = ERR_PTR(-ENOMEM);
1311 tmp = __getname();
1312 if (tmp) {
1313 const struct bin_table *table = get_sysctl(name, nlen, tmp);
1314 result = tmp;
1315 *tablep = table;
1316 if (IS_ERR(table)) {
1317 __putname(tmp);
1318 result = ERR_CAST(table);
1319 }
1320 }
1321 return result;
1322}
1323
1324static ssize_t binary_sysctl(const int *name, int nlen,
1325 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1326{
1327 const struct bin_table *table = NULL;
1328 struct nameidata nd;
1329 struct vfsmount *mnt;
1330 struct file *file;
1331 ssize_t result;
1332 char *pathname;
1333 int flags;
1334 int acc_mode, fmode;
1335
1336 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname);
1338 if (IS_ERR(pathname))
1339 goto out;
1340
1341 /* How should the sysctl be accessed? */
1342 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) {
1347 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) {
1351 flags = O_RDONLY;
1352 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else {
1355 result = 0;
1356 goto out_putname;
1357 }
1358
1359 mnt = current->nsproxy->pid_ns->proc_mnt;
1360 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
1361 if (result)
1362 goto out_putname;
1363
1364 result = may_open(&nd.path, acc_mode, fmode);
1365 if (result)
1366 goto out_putpath;
1367
1368 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1369 result = PTR_ERR(file);
1370 if (IS_ERR(file))
1371 goto out_putname;
1372
1373 result = table->convert(file, oldval, oldlen, newval, newlen);
1374
1375 fput(file);
1376out_putname:
1377 putname(pathname);
1378out:
1379 return result;
1380
1381out_putpath:
1382 path_put(&nd.path);
1383 goto out_putname;
1384}
1385
1386
1387#else /* CONFIG_SYSCTL_SYSCALL */
1388
1389static ssize_t binary_sysctl(const int *name, int nlen,
1390 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1391{
1392 return -ENOSYS;
1393}
1394
1395#endif /* CONFIG_SYSCTL_SYSCALL */
1396
1397
1398static void deprecated_sysctl_warning(const int *name, int nlen)
1399{
1400 int i;
1401
1402 if (printk_ratelimit()) {
1403 printk(KERN_INFO
1404 "warning: process `%s' used the deprecated sysctl "
1405 "system call with ", current->comm);
1406 for (i = 0; i < nlen; i++)
1407 printk("%d.", name[i]);
1408 printk("\n");
1409 }
1410 return;
1411}
1412
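With the format above, a triggering process yields a ratelimited kernel-log line of the following shape (the process name here is hypothetical; "1.1." is the binary name { CTL_KERN, KERN_OSTYPE } printed element by element):

	warning: process `procps' used the deprecated sysctl system call with 1.1.
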
1413static ssize_t do_sysctl(int __user *args_name, int nlen,
1414 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1415{
1416 int name[CTL_MAXNAME];
1417 int i;
1418
1419 /* Sanity check nlen. */
1420 if (nlen < 0 || nlen > CTL_MAXNAME)
1421 return -ENOTDIR;
1422 /* Read in the sysctl name for simplicity */
1423 for (i = 0; i < nlen; i++)
1424 if (get_user(name[i], args_name + i))
1425 return -EFAULT;
1426
1427 deprecated_sysctl_warning(name, nlen);
1428
1429 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1430}
1431
1432SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1433{
1434 struct __sysctl_args tmp;
1435 size_t oldlen = 0;
1436 ssize_t result;
1437
1438 if (copy_from_user(&tmp, args, sizeof(tmp)))
1439 return -EFAULT;
1440
1441 if (tmp.oldval && !tmp.oldlenp)
1442 return -EFAULT;
1443
1444 if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
1445 return -EFAULT;
1446
1447 result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
1448 tmp.newval, tmp.newlen);
1449
1450 if (result >= 0) {
1451 oldlen = result;
1452 result = 0;
1453 }
1454
1455 if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
1456 return -EFAULT;
1457
1458 return result;
1459}
1460
1461
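For context, a hedged sketch of how userspace exercised this deprecated path (illustrative, not part of the patch): it assumes a kernel built with CONFIG_SYSCTL_SYSCALL and an architecture that still defines __NR__sysctl (exposed by glibc as SYS__sysctl). It reads kernel.ostype by passing the binary name { CTL_KERN, KERN_OSTYPE }, which binary_sysctl() above translates to /proc/sys/kernel/ostype.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sysctl.h>

int main(void)
{
	int name[] = { CTL_KERN, KERN_OSTYPE };
	char ostype[64] = "";
	size_t len = sizeof(ostype);
	struct __sysctl_args args = {
		.name    = name,
		.nlen    = 2,
		.oldval  = ostype,
		.oldlenp = &len,
	};

	/* Invoke the raw syscall directly rather than relying on a libc wrapper. */
	if (syscall(SYS__sysctl, &args) < 0) {
		perror("_sysctl");
		return 1;
	}
	printf("%.*s\n", (int)len, ostype);	/* e.g. "Linux" */
	return 0;
}
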
1462#ifdef CONFIG_COMPAT
1463#include <asm/compat.h>
1464
1465struct compat_sysctl_args {
1466 compat_uptr_t name;
1467 int nlen;
1468 compat_uptr_t oldval;
1469 compat_uptr_t oldlenp;
1470 compat_uptr_t newval;
1471 compat_size_t newlen;
1472 compat_ulong_t __unused[4];
1473};
1474
1475asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
1476{
1477 struct compat_sysctl_args tmp;
1478 compat_size_t __user *compat_oldlenp;
1479 size_t oldlen = 0;
1480 ssize_t result;
1481
1482 if (copy_from_user(&tmp, args, sizeof(tmp)))
1483 return -EFAULT;
1484
1485 if (tmp.oldval && !tmp.oldlenp)
1486 return -EFAULT;
1487
1488 compat_oldlenp = compat_ptr(tmp.oldlenp);
1489 if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
1490 return -EFAULT;
1491
1492 result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
1493 compat_ptr(tmp.oldval), oldlen,
1494 compat_ptr(tmp.newval), tmp.newlen);
1495
1496 if (result >= 0) {
1497 oldlen = result;
1498 result = 0;
1499 }
1500
1501 if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
1502 return -EFAULT;
1503
1504 return result;
1505}
1506
1507#endif /* CONFIG_COMPAT */
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423ca711a..04cdcf72c827 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -5,1239 +5,6 @@
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
7 7
8struct trans_ctl_table {
9 int ctl_name;
10 const char *procname;
11 const struct trans_ctl_table *child;
12};
13
14static const struct trans_ctl_table trans_random_table[] = {
15 { RANDOM_POOLSIZE, "poolsize" },
16 { RANDOM_ENTROPY_COUNT, "entropy_avail" },
17 { RANDOM_READ_THRESH, "read_wakeup_threshold" },
18 { RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
19 { RANDOM_BOOT_ID, "boot_id" },
20 { RANDOM_UUID, "uuid" },
21 {}
22};
23
24static const struct trans_ctl_table trans_pty_table[] = {
25 { PTY_MAX, "max" },
26 { PTY_NR, "nr" },
27 {}
28};
29
30static const struct trans_ctl_table trans_kern_table[] = {
31 { KERN_OSTYPE, "ostype" },
32 { KERN_OSRELEASE, "osrelease" },
33 /* KERN_OSREV not used */
34 { KERN_VERSION, "version" },
35 /* KERN_SECUREMASK not used */
36 /* KERN_PROF not used */
37 { KERN_NODENAME, "hostname" },
38 { KERN_DOMAINNAME, "domainname" },
39
40 { KERN_PANIC, "panic" },
41 { KERN_REALROOTDEV, "real-root-dev" },
42
43 { KERN_SPARC_REBOOT, "reboot-cmd" },
44 { KERN_CTLALTDEL, "ctrl-alt-del" },
45 { KERN_PRINTK, "printk" },
46
47 /* KERN_NAMETRANS not used */
48 /* KERN_PPC_HTABRECLAIM not used */
49 /* KERN_PPC_ZEROPAGED not used */
50 { KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
51
52 { KERN_MODPROBE, "modprobe" },
53 { KERN_SG_BIG_BUFF, "sg-big-buff" },
54 { KERN_ACCT, "acct" },
55 { KERN_PPC_L2CR, "l2cr" },
56
57 /* KERN_RTSIGNR not used */
58 /* KERN_RTSIGMAX not used */
59
60 { KERN_SHMMAX, "shmmax" },
61 { KERN_MSGMAX, "msgmax" },
62 { KERN_MSGMNB, "msgmnb" },
63 /* KERN_MSGPOOL not used*/
64 { KERN_SYSRQ, "sysrq" },
65 { KERN_MAX_THREADS, "threads-max" },
66 { KERN_RANDOM, "random", trans_random_table },
67 { KERN_SHMALL, "shmall" },
68 { KERN_MSGMNI, "msgmni" },
69 { KERN_SEM, "sem" },
70 { KERN_SPARC_STOP_A, "stop-a" },
71 { KERN_SHMMNI, "shmmni" },
72
73 { KERN_OVERFLOWUID, "overflowuid" },
74 { KERN_OVERFLOWGID, "overflowgid" },
75
76 { KERN_HOTPLUG, "hotplug", },
77 { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
78
79 { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
80 { KERN_CORE_USES_PID, "core_uses_pid" },
81 { KERN_TAINTED, "tainted" },
82 { KERN_CADPID, "cad_pid" },
83 { KERN_PIDMAX, "pid_max" },
84 { KERN_CORE_PATTERN, "core_pattern" },
85 { KERN_PANIC_ON_OOPS, "panic_on_oops" },
86 { KERN_HPPA_PWRSW, "soft-power" },
87 { KERN_HPPA_UNALIGNED, "unaligned-trap" },
88
89 { KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
90 { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
91
92 { KERN_PTY, "pty", trans_pty_table },
93 { KERN_NGROUPS_MAX, "ngroups_max" },
94 { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
95 { KERN_HZ_TIMER, "hz_timer" },
96 { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
97 { KERN_BOOTLOADER_TYPE, "bootloader_type" },
98 { KERN_RANDOMIZE, "randomize_va_space" },
99
100 { KERN_SPIN_RETRY, "spin_retry" },
101 { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" },
102 { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
103 { KERN_COMPAT_LOG, "compat-log" },
104 { KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
105 { KERN_NMI_WATCHDOG, "nmi_watchdog" },
106 { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
107 {}
108};
109
110static const struct trans_ctl_table trans_vm_table[] = {
111 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
112 { VM_PAGE_CLUSTER, "page-cluster" },
113 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
114 { VM_DIRTY_RATIO, "dirty_ratio" },
115 { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
116 { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" },
117 { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
118 { VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
119 /* VM_PAGEBUF unused */
120 { VM_HUGETLB_PAGES, "nr_hugepages" },
121 { VM_SWAPPINESS, "swappiness" },
122 { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
123 { VM_MIN_FREE_KBYTES, "min_free_kbytes" },
124 { VM_MAX_MAP_COUNT, "max_map_count" },
125 { VM_LAPTOP_MODE, "laptop_mode" },
126 { VM_BLOCK_DUMP, "block_dump" },
127 { VM_HUGETLB_GROUP, "hugetlb_shm_group" },
128 { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
129 { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
130 /* VM_SWAP_TOKEN_TIMEOUT unused */
131 { VM_DROP_PAGECACHE, "drop_caches" },
132 { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
133 { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
134 { VM_MIN_UNMAPPED, "min_unmapped_ratio" },
135 { VM_PANIC_ON_OOM, "panic_on_oom" },
136 { VM_VDSO_ENABLED, "vdso_enabled" },
137 { VM_MIN_SLAB, "min_slab_ratio" },
138
139 {}
140};
141
142static const struct trans_ctl_table trans_net_core_table[] = {
143 { NET_CORE_WMEM_MAX, "wmem_max" },
144 { NET_CORE_RMEM_MAX, "rmem_max" },
145 { NET_CORE_WMEM_DEFAULT, "wmem_default" },
146 { NET_CORE_RMEM_DEFAULT, "rmem_default" },
147 /* NET_CORE_DESTROY_DELAY unused */
148 { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
149 /* NET_CORE_FASTROUTE unused */
150 { NET_CORE_MSG_COST, "message_cost" },
151 { NET_CORE_MSG_BURST, "message_burst" },
152 { NET_CORE_OPTMEM_MAX, "optmem_max" },
153 /* NET_CORE_HOT_LIST_LENGTH unused */
154 /* NET_CORE_DIVERT_VERSION unused */
155 /* NET_CORE_NO_CONG_THRESH unused */
156 /* NET_CORE_NO_CONG unused */
157 /* NET_CORE_LO_CONG unused */
158 /* NET_CORE_MOD_CONG unused */
159 { NET_CORE_DEV_WEIGHT, "dev_weight" },
160 { NET_CORE_SOMAXCONN, "somaxconn" },
161 { NET_CORE_BUDGET, "netdev_budget" },
162 { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
163 { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
164 { NET_CORE_WARNINGS, "warnings" },
165 {},
166};
167
168static const struct trans_ctl_table trans_net_unix_table[] = {
169 /* NET_UNIX_DESTROY_DELAY unused */
170 /* NET_UNIX_DELETE_DELAY unused */
171 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
172 {}
173};
174
175static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
176 { NET_IPV4_ROUTE_FLUSH, "flush" },
177 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
178 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
179 { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
180 { NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
181 { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
182 { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
183 { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
184 { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
185 { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
186 { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
187 { NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
188 { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
189 { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
190 { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
191 { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
192 { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
193 { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
194 { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
195 {}
196};
197
198static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
199 { NET_IPV4_CONF_FORWARDING, "forwarding" },
200 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
201
202 { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
203 { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
204 { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
205 { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
206 { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
207 { NET_IPV4_CONF_RP_FILTER, "rp_filter" },
208 { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
209 { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
210 { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
211 { NET_IPV4_CONF_TAG, "tag" },
212 { NET_IPV4_CONF_ARPFILTER, "arp_filter" },
213 { NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
214 { NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
215 { NET_IPV4_CONF_NOPOLICY, "disable_policy" },
216 { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
217
218 { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
222 { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
223 {}
224};
225
226static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
227 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
228 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
229 { 0, NULL, trans_net_ipv4_conf_vars_table },
230 {}
231};
232
233static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
234 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
235 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
236 { NET_NEIGH_APP_SOLICIT, "app_solicit" },
237 { NET_NEIGH_RETRANS_TIME, "retrans_time" },
238 { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
239 { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
240 { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
241 { NET_NEIGH_UNRES_QLEN, "unres_qlen" },
242 { NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
243 { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" },
244 { NET_NEIGH_PROXY_DELAY, "proxy_delay" },
245 { NET_NEIGH_LOCKTIME, "locktime" },
246 { NET_NEIGH_GC_INTERVAL, "gc_interval" },
247 { NET_NEIGH_GC_THRESH1, "gc_thresh1" },
248 { NET_NEIGH_GC_THRESH2, "gc_thresh2" },
249 { NET_NEIGH_GC_THRESH3, "gc_thresh3" },
250 { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
251 { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
252 {}
253};
254
255static const struct trans_ctl_table trans_net_neigh_table[] = {
256 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
257 { 0, NULL, trans_net_neigh_vars_table },
258 {}
259};
260
261static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
262 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
263
264 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
265 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" },
266 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" },
267 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" },
268 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" },
269 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" },
270 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" },
271 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" },
272
273 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" },
274 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" },
275 { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" },
276 { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" },
277
278 { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
279 { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
280 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" },
281 { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
282 { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
283 { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
284
285 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" },
286 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" },
287 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" },
288 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" },
289 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" },
290 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" },
291 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" },
292
293 { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
294 { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
295 {}
296};
297
298static const struct trans_ctl_table trans_net_ipv4_table[] = {
299 { NET_IPV4_FORWARD, "ip_forward" },
300 { NET_IPV4_DYNADDR, "ip_dynaddr" },
301
302 { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table },
303 { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table },
304 { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table },
305 /* NET_IPV4_FIB_HASH unused */
306 { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table },
307
308 { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
309 { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
310 { NET_IPV4_TCP_SACK, "tcp_sack" },
311 { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
312 { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
313 /* NET_IPV4_AUTOCONFIG unused */
314 { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
315 { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
316 { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
317 { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
318 { NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
319 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
320 { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
321 { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
322 { NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
323 { NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
324 { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
325 /* NET_IPV4_IP_MASQ_DEBUG unused */
326 { NET_TCP_SYNCOOKIES, "tcp_syncookies" },
327 { NET_TCP_STDURG, "tcp_stdurg" },
328 { NET_TCP_RFC1337, "tcp_rfc1337" },
329 /* NET_TCP_SYN_TAILDROP unused */
330 { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
331 { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
332 { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
333 { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
334 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
335 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
336 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
337 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
338 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
339 { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
340 { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
341 { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
342 /* NET_IPV4_ALWAYS_DEFRAG unused */
343 { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
344 { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
345 { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
346 { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
347 { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
348 { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
349 { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
350 { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
351 { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
352 { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
353 { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
354 { NET_TCP_FACK, "tcp_fack" },
355 { NET_TCP_REORDERING, "tcp_reordering" },
356 { NET_TCP_ECN, "tcp_ecn" },
357 { NET_TCP_DSACK, "tcp_dsack" },
358 { NET_TCP_MEM, "tcp_mem" },
359 { NET_TCP_WMEM, "tcp_wmem" },
360 { NET_TCP_RMEM, "tcp_rmem" },
361 { NET_TCP_APP_WIN, "tcp_app_win" },
362 { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
363 { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
364 { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
365 { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
366 { NET_TCP_TW_REUSE, "tcp_tw_reuse" },
367 { NET_TCP_FRTO, "tcp_frto" },
368 { NET_TCP_LOW_LATENCY, "tcp_low_latency" },
369 { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
370 { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
371 { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
372 /* NET_TCP_DEFAULT_WIN_SCALE unused */
373 { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
374 { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
375 /* NET_TCP_BIC_BETA unused */
376 { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
377 { NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
378 { NET_TCP_ABC, "tcp_abc" },
379 { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" },
380 { NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
381 { NET_TCP_BASE_MSS, "tcp_base_mss" },
382 { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
383 { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
384 { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
385 { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
386 { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
387 { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
388 { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
389 { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" },
390 { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
391 { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
392 { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
393 { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
394 {}
395};
396
397static const struct trans_ctl_table trans_net_ipx_table[] = {
398 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
399 /* NET_IPX_FORWARDING unused */
400 {}
401};
402
403static const struct trans_ctl_table trans_net_atalk_table[] = {
404 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
405 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
406 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
407 { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
408 {},
409};
410
411static const struct trans_ctl_table trans_net_netrom_table[] = {
412 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
413 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
414 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
415 { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
416 { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
417 { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
418 { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
419 { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
420 { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
421 { NET_NETROM_ROUTING_CONTROL, "routing_control" },
422 { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
423 { NET_NETROM_RESET, "reset" },
424 {}
425};
426
427static const struct trans_ctl_table trans_net_ax25_param_table[] = {
428 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
429 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
430 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
431 { NET_AX25_CONNECT_MODE, "connect_mode" },
432 { NET_AX25_STANDARD_WINDOW, "standard_window_size" },
433 { NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
434 { NET_AX25_T1_TIMEOUT, "t1_timeout" },
435 { NET_AX25_T2_TIMEOUT, "t2_timeout" },
436 { NET_AX25_T3_TIMEOUT, "t3_timeout" },
437 { NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
438 { NET_AX25_N2, "maximum_retry_count" },
439 { NET_AX25_PACLEN, "maximum_packet_length" },
440 { NET_AX25_PROTOCOL, "protocol" },
441 { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
442 {}
443};
444
445static const struct trans_ctl_table trans_net_ax25_table[] = {
446 { 0, NULL, trans_net_ax25_param_table },
447 {}
448};
449
450static const struct trans_ctl_table trans_net_bridge_table[] = {
451 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
452 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
453 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
454 { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" },
455 { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" },
456 {}
457};
458
459static const struct trans_ctl_table trans_net_rose_table[] = {
460 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
461 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
462 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
463 { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
464 { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
465 { NET_ROSE_ROUTING_CONTROL, "routing_control" },
466 { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
467 { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
468 { NET_ROSE_WINDOW_SIZE, "window_size" },
469 { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
470 {}
471};
472
473static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
474 { NET_IPV6_FORWARDING, "forwarding" },
475 { NET_IPV6_HOP_LIMIT, "hop_limit" },
476 { NET_IPV6_MTU, "mtu" },
477 { NET_IPV6_ACCEPT_RA, "accept_ra" },
478 { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
479 { NET_IPV6_AUTOCONF, "autoconf" },
480 { NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
481 { NET_IPV6_RTR_SOLICITS, "router_solicitations" },
482 { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
483 { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
484 { NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
485 { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
486 { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
487 { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
488 { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
489 { NET_IPV6_MAX_ADDRESSES, "max_addresses" },
490 { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
491 { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
492 { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
493 { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
494 { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
495 { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
496 { NET_IPV6_PROXY_NDP, "proxy_ndp" },
497 { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
498 {}
499};
500
501static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
502 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
503 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
504 { 0, NULL, trans_net_ipv6_conf_var_table },
505 {}
506};
507
508static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
509 { NET_IPV6_ROUTE_FLUSH, "flush" },
510 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
511 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
512 { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
513 { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
514 { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
515 { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
516 { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
517 { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
518 { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
519 {}
520};
521
522static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
523 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
524 {}
525};
526
527static const struct trans_ctl_table trans_net_ipv6_table[] = {
528 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
529 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
530 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
531 { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table },
532 { NET_IPV6_BINDV6ONLY, "bindv6only" },
533 { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
534 { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
535 { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
536 { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
537 { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
538 { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
539 {}
540};
541
542static const struct trans_ctl_table trans_net_x25_table[] = {
543 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
544 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
545 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
546 { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
547 { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
548 { NET_X25_FORWARD, "x25_forward" },
549 {}
550};
551
552static const struct trans_ctl_table trans_net_tr_table[] = {
553 { NET_TR_RIF_TIMEOUT, "rif_timeout" },
554 {}
555};
556
557
558static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
559 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
560 { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
561 { NET_DECNET_CONF_DEV_T2, "t2" },
562 { NET_DECNET_CONF_DEV_T3, "t3" },
563 {}
564};
565
566static const struct trans_ctl_table trans_net_decnet_conf[] = {
567 { 0, NULL, trans_net_decnet_conf_vars },
568 {}
569};
570
571static const struct trans_ctl_table trans_net_decnet_table[] = {
572 { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
573 { NET_DECNET_NODE_ADDRESS, "node_address" },
574 { NET_DECNET_NODE_NAME, "node_name" },
575 { NET_DECNET_DEFAULT_DEVICE, "default_device" },
576 { NET_DECNET_TIME_WAIT, "time_wait" },
577 { NET_DECNET_DN_COUNT, "dn_count" },
578 { NET_DECNET_DI_COUNT, "di_count" },
579 { NET_DECNET_DR_COUNT, "dr_count" },
580 { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
581 { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
582 { NET_DECNET_MEM, "decnet_mem" },
583 { NET_DECNET_RMEM, "decnet_rmem" },
584 { NET_DECNET_WMEM, "decnet_wmem" },
585 { NET_DECNET_DEBUG_LEVEL, "debug" },
586 {}
587};
588
589static const struct trans_ctl_table trans_net_sctp_table[] = {
590 { NET_SCTP_RTO_INITIAL, "rto_initial" },
591 { NET_SCTP_RTO_MIN, "rto_min" },
592 { NET_SCTP_RTO_MAX, "rto_max" },
593 { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
594 { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
595 { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
596 { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
597 { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
598 { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
599 { NET_SCTP_HB_INTERVAL, "hb_interval" },
600 { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
601 { NET_SCTP_MAX_BURST, "max_burst" },
602 { NET_SCTP_ADDIP_ENABLE, "addip_enable" },
603 { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
604 { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
605 { NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
606 { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
607 {}
608};
609
610static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
611 { NET_LLC2_ACK_TIMEOUT, "ack" },
612 { NET_LLC2_P_TIMEOUT, "p" },
613 { NET_LLC2_REJ_TIMEOUT, "rej" },
614 { NET_LLC2_BUSY_TIMEOUT, "busy" },
615 {}
616};
617
618static const struct trans_ctl_table trans_net_llc_station_table[] = {
619 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
620 {}
621};
622
623static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
624 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
625 {}
626};
627
628static const struct trans_ctl_table trans_net_llc_table[] = {
629 { NET_LLC2, "llc2", trans_net_llc_llc2_table },
630 { NET_LLC_STATION, "station", trans_net_llc_station_table },
631 {}
632};
633
634static const struct trans_ctl_table trans_net_netfilter_table[] = {
635 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
636 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
637 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
638 { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" },
639 { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" },
640 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" },
641 { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" },
642 { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" },
643 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" },
644 { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" },
645 { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" },
646 { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" },
647 { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" },
648 { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
649 { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
650 { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" },
651 { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
652 { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
653 { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
654 { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" },
655 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" },
656 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" },
657 { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" },
658 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" },
659 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" },
660 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" },
661 { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
662 { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" },
663 { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" },
664 { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
665 { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
666 { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
667
668 {}
669};
670
671static const struct trans_ctl_table trans_net_dccp_table[] = {
672 { NET_DCCP_DEFAULT, "default" },
673 {}
674};
675
676static const struct trans_ctl_table trans_net_irda_table[] = {
677 { NET_IRDA_DISCOVERY, "discovery" },
678 { NET_IRDA_DEVNAME, "devname" },
679 { NET_IRDA_DEBUG, "debug" },
680 { NET_IRDA_FAST_POLL, "fast_poll_increase" },
681 { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
682 { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
683 { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
684 { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
685 { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
686 { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
687 { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
688 { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
689 { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
690 { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
691 {}
692};
693
694static const struct trans_ctl_table trans_net_table[] = {
695 { NET_CORE, "core", trans_net_core_table },
696 /* NET_ETHER not used */
697 /* NET_802 not used */
698 { NET_UNIX, "unix", trans_net_unix_table },
699 { NET_IPV4, "ipv4", trans_net_ipv4_table },
700 { NET_IPX, "ipx", trans_net_ipx_table },
701 { NET_ATALK, "appletalk", trans_net_atalk_table },
702 { NET_NETROM, "netrom", trans_net_netrom_table },
703 { NET_AX25, "ax25", trans_net_ax25_table },
704 { NET_BRIDGE, "bridge", trans_net_bridge_table },
705 { NET_ROSE, "rose", trans_net_rose_table },
706 { NET_IPV6, "ipv6", trans_net_ipv6_table },
707 { NET_X25, "x25", trans_net_x25_table },
708 { NET_TR, "token-ring", trans_net_tr_table },
709 { NET_DECNET, "decnet", trans_net_decnet_table },
710 /* NET_ECONET not used */
711 { NET_SCTP, "sctp", trans_net_sctp_table },
712 { NET_LLC, "llc", trans_net_llc_table },
713 { NET_NETFILTER, "netfilter", trans_net_netfilter_table },
714 { NET_DCCP, "dccp", trans_net_dccp_table },
715 { NET_IRDA, "irda", trans_net_irda_table },
716 { 2089, "nf_conntrack_max" },
717 {}
718};
719
720static const struct trans_ctl_table trans_fs_quota_table[] = {
721 { FS_DQ_LOOKUPS, "lookups" },
722 { FS_DQ_DROPS, "drops" },
723 { FS_DQ_READS, "reads" },
724 { FS_DQ_WRITES, "writes" },
725 { FS_DQ_CACHE_HITS, "cache_hits" },
726 { FS_DQ_ALLOCATED, "allocated_dquots" },
727 { FS_DQ_FREE, "free_dquots" },
728 { FS_DQ_SYNCS, "syncs" },
729 { FS_DQ_WARNINGS, "warnings" },
730 {}
731};
732
733static const struct trans_ctl_table trans_fs_xfs_table[] = {
734 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
735 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
736 { XFS_PANIC_MASK, "panic_mask" },
737
738 { XFS_ERRLEVEL, "error_level" },
739 { XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
740 { XFS_INHERIT_SYNC, "inherit_sync" },
741 { XFS_INHERIT_NODUMP, "inherit_nodump" },
742 { XFS_INHERIT_NOATIME, "inherit_noatime" },
743 { XFS_BUF_TIMER, "xfsbufd_centisecs" },
744 { XFS_BUF_AGE, "age_buffer_centisecs" },
745 { XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
746 { XFS_ROTORSTEP, "rotorstep" },
747 { XFS_INHERIT_NODFRG, "inherit_nodefrag" },
748 { XFS_FILESTREAM_TIMER, "filestream_centisecs" },
749 { XFS_STATS_CLEAR, "stats_clear" },
750 {}
751};
752
753static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
754 { 1, "hb_ctl_path" },
755 {}
756};
757
758static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
759 { 1, "nm", trans_fs_ocfs2_nm_table },
760 {}
761};
762
763static const struct trans_ctl_table trans_inotify_table[] = {
764 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
765 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
766 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
767 {}
768};
769
770static const struct trans_ctl_table trans_fs_table[] = {
771 { FS_NRINODE, "inode-nr" },
772 { FS_STATINODE, "inode-state" },
773 /* FS_MAXINODE unused */
774 /* FS_NRDQUOT unused */
775 /* FS_MAXDQUOT unused */
776 { FS_NRFILE, "file-nr" },
777 { FS_MAXFILE, "file-max" },
778 { FS_DENTRY, "dentry-state" },
779 /* FS_NRSUPER unused */
780 /* FS_MAXUPSER unused */
781 { FS_OVERFLOWUID, "overflowuid" },
782 { FS_OVERFLOWGID, "overflowgid" },
783 { FS_LEASES, "leases-enable" },
784 { FS_DIR_NOTIFY, "dir-notify-enable" },
785 { FS_LEASE_TIME, "lease-break-time" },
786 { FS_DQSTATS, "quota", trans_fs_quota_table },
787 { FS_XFS, "xfs", trans_fs_xfs_table },
788 { FS_AIO_NR, "aio-nr" },
789 { FS_AIO_MAX_NR, "aio-max-nr" },
790 { FS_INOTIFY, "inotify", trans_inotify_table },
791 { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table },
792 { KERN_SETUID_DUMPABLE, "suid_dumpable" },
793 {}
794};
795
796static const struct trans_ctl_table trans_debug_table[] = {
797 {}
798};
799
800static const struct trans_ctl_table trans_cdrom_table[] = {
801 { DEV_CDROM_INFO, "info" },
802 { DEV_CDROM_AUTOCLOSE, "autoclose" },
803 { DEV_CDROM_AUTOEJECT, "autoeject" },
804 { DEV_CDROM_DEBUG, "debug" },
805 { DEV_CDROM_LOCK, "lock" },
806 { DEV_CDROM_CHECK_MEDIA, "check_media" },
807 {}
808};
809
810static const struct trans_ctl_table trans_ipmi_table[] = {
811 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
812 {}
813};
814
815static const struct trans_ctl_table trans_mac_hid_files[] = {
816 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
817 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
818 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
819 { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
820 { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
821 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
822 {}
823};
824
825static const struct trans_ctl_table trans_raid_table[] = {
826 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
827 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
828 {}
829};
830
831static const struct trans_ctl_table trans_scsi_table[] = {
832 { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
833 {}
834};
835
836static const struct trans_ctl_table trans_parport_default_table[] = {
837 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
838 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
839 {}
840};
841
842static const struct trans_ctl_table trans_parport_device_table[] = {
843 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
844 {}
845};
846
847static const struct trans_ctl_table trans_parport_devices_table[] = {
848 { DEV_PARPORT_DEVICES_ACTIVE, "active" },
849 { 0, NULL, trans_parport_device_table },
850 {}
851};
852
853static const struct trans_ctl_table trans_parport_parport_table[] = {
854 { DEV_PARPORT_SPINTIME, "spintime" },
855 { DEV_PARPORT_BASE_ADDR, "base-addr" },
856 { DEV_PARPORT_IRQ, "irq" },
857 { DEV_PARPORT_DMA, "dma" },
858 { DEV_PARPORT_MODES, "modes" },
859 { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table },
860 { DEV_PARPORT_AUTOPROBE, "autoprobe" },
861 { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" },
862 { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" },
863 { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" },
864 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
865 {}
866};
867static const struct trans_ctl_table trans_parport_table[] = {
868 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
869 { 0, NULL, trans_parport_parport_table },
870 {}
871};
872
873static const struct trans_ctl_table trans_dev_table[] = {
874 { DEV_CDROM, "cdrom", trans_cdrom_table },
875 /* DEV_HWMON unused */
876 { DEV_PARPORT, "parport", trans_parport_table },
877 { DEV_RAID, "raid", trans_raid_table },
878 { DEV_MAC_HID, "mac_hid", trans_mac_hid_files },
879 { DEV_SCSI, "scsi", trans_scsi_table },
880 { DEV_IPMI, "ipmi", trans_ipmi_table },
881 {}
882};
883
884static const struct trans_ctl_table trans_bus_isa_table[] = {
885 { BUS_ISA_MEM_BASE, "membase" },
886 { BUS_ISA_PORT_BASE, "portbase" },
887 { BUS_ISA_PORT_SHIFT, "portshift" },
888 {}
889};
890
891static const struct trans_ctl_table trans_bus_table[] = {
892 { CTL_BUS_ISA, "isa", trans_bus_isa_table },
893 {}
894};
895
896static const struct trans_ctl_table trans_arlan_conf_table0[] = {
897 { 1, "spreadingCode" },
898 { 2, "channelNumber" },
899 { 3, "scramblingDisable" },
900 { 4, "txAttenuation" },
901 { 5, "systemId" },
902 { 6, "maxDatagramSize" },
903 { 7, "maxFrameSize" },
904 { 8, "maxRetries" },
905 { 9, "receiveMode" },
906 { 10, "priority" },
907 { 11, "rootOrRepeater" },
908 { 12, "SID" },
909 { 13, "registrationMode" },
910 { 14, "registrationFill" },
911 { 15, "localTalkAddress" },
912 { 16, "codeFormat" },
913 { 17, "numChannels" },
914 { 18, "channel1" },
915 { 19, "channel2" },
916 { 20, "channel3" },
917 { 21, "channel4" },
918 { 22, "txClear" },
919 { 23, "txRetries" },
920 { 24, "txRouting" },
921 { 25, "txScrambled" },
922 { 26, "rxParameter" },
923 { 27, "txTimeoutMs" },
924 { 28, "waitCardTimeout" },
925 { 29, "channelSet" },
926 { 30, "name" },
927 { 31, "waitTime" },
928 { 32, "lParameter" },
929 { 33, "_15" },
930 { 34, "headerSize" },
931 { 36, "tx_delay_ms" },
932 { 37, "retries" },
933 { 38, "ReTransmitPacketMaxSize" },
934 { 39, "waitReTransmitPacketMaxSize" },
935 { 40, "fastReTransCount" },
936 { 41, "driverRetransmissions" },
937 { 42, "txAckTimeoutMs" },
938 { 43, "registrationInterrupts" },
939 { 44, "hardwareType" },
940 { 45, "radioType" },
941 { 46, "writeEEPROM" },
942 { 47, "writeRadioType" },
943 { 48, "entry_exit_debug" },
944 { 49, "debug" },
945 { 50, "in_speed" },
946 { 51, "out_speed" },
947 { 52, "in_speed10" },
948 { 53, "out_speed10" },
949 { 54, "in_speed_max" },
950 { 55, "out_speed_max" },
951 { 56, "measure_rate" },
952 { 57, "pre_Command_Wait" },
953 { 58, "rx_tweak1" },
954 { 59, "rx_tweak2" },
955 { 60, "tx_queue_len" },
956
957 { 150, "arlan0-txRing" },
958 { 151, "arlan0-rxRing" },
959 { 152, "arlan0-18" },
960 { 153, "arlan0-ring" },
961 { 154, "arlan0-shm-cpy" },
962 { 155, "config0" },
963 { 156, "reset0" },
964 {}
965};
966
967static const struct trans_ctl_table trans_arlan_conf_table1[] = {
968 { 1, "spreadingCode" },
969 { 2, "channelNumber" },
970 { 3, "scramblingDisable" },
971 { 4, "txAttenuation" },
972 { 5, "systemId" },
973 { 6, "maxDatagramSize" },
974 { 7, "maxFrameSize" },
975 { 8, "maxRetries" },
976 { 9, "receiveMode" },
977 { 10, "priority" },
978 { 11, "rootOrRepeater" },
979 { 12, "SID" },
980 { 13, "registrationMode" },
981 { 14, "registrationFill" },
982 { 15, "localTalkAddress" },
983 { 16, "codeFormat" },
984 { 17, "numChannels" },
985 { 18, "channel1" },
986 { 19, "channel2" },
987 { 20, "channel3" },
988 { 21, "channel4" },
989 { 22, "txClear" },
990 { 23, "txRetries" },
991 { 24, "txRouting" },
992 { 25, "txScrambled" },
993 { 26, "rxParameter" },
994 { 27, "txTimeoutMs" },
995 { 28, "waitCardTimeout" },
996 { 29, "channelSet" },
997 { 30, "name" },
998 { 31, "waitTime" },
999 { 32, "lParameter" },
1000 { 33, "_15" },
1001 { 34, "headerSize" },
1002 { 36, "tx_delay_ms" },
1003 { 37, "retries" },
1004 { 38, "ReTransmitPacketMaxSize" },
1005 { 39, "waitReTransmitPacketMaxSize" },
1006 { 40, "fastReTransCount" },
1007 { 41, "driverRetransmissions" },
1008 { 42, "txAckTimeoutMs" },
1009 { 43, "registrationInterrupts" },
1010 { 44, "hardwareType" },
1011 { 45, "radioType" },
1012 { 46, "writeEEPROM" },
1013 { 47, "writeRadioType" },
1014 { 48, "entry_exit_debug" },
1015 { 49, "debug" },
1016 { 50, "in_speed" },
1017 { 51, "out_speed" },
1018 { 52, "in_speed10" },
1019 { 53, "out_speed10" },
1020 { 54, "in_speed_max" },
1021 { 55, "out_speed_max" },
1022 { 56, "measure_rate" },
1023 { 57, "pre_Command_Wait" },
1024 { 58, "rx_tweak1" },
1025 { 59, "rx_tweak2" },
1026 { 60, "tx_queue_len" },
1027
1028 { 150, "arlan1-txRing" },
1029 { 151, "arlan1-rxRing" },
1030 { 152, "arlan1-18" },
1031 { 153, "arlan1-ring" },
1032 { 154, "arlan1-shm-cpy" },
1033 { 155, "config1" },
1034 { 156, "reset1" },
1035 {}
1036};
1037
1038static const struct trans_ctl_table trans_arlan_conf_table2[] = {
1039 { 1, "spreadingCode" },
1040 { 2, "channelNumber" },
1041 { 3, "scramblingDisable" },
1042 { 4, "txAttenuation" },
1043 { 5, "systemId" },
1044 { 6, "maxDatagramSize" },
1045 { 7, "maxFrameSize" },
1046 { 8, "maxRetries" },
1047 { 9, "receiveMode" },
1048 { 10, "priority" },
1049 { 11, "rootOrRepeater" },
1050 { 12, "SID" },
1051 { 13, "registrationMode" },
1052 { 14, "registrationFill" },
1053 { 15, "localTalkAddress" },
1054 { 16, "codeFormat" },
1055 { 17, "numChannels" },
1056 { 18, "channel1" },
1057 { 19, "channel2" },
1058 { 20, "channel3" },
1059 { 21, "channel4" },
1060 { 22, "txClear" },
1061 { 23, "txRetries" },
1062 { 24, "txRouting" },
1063 { 25, "txScrambled" },
1064 { 26, "rxParameter" },
1065 { 27, "txTimeoutMs" },
1066 { 28, "waitCardTimeout" },
1067 { 29, "channelSet" },
1068 { 30, "name" },
1069 { 31, "waitTime" },
1070 { 32, "lParameter" },
1071 { 33, "_15" },
1072 { 34, "headerSize" },
1073 { 36, "tx_delay_ms" },
1074 { 37, "retries" },
1075 { 38, "ReTransmitPacketMaxSize" },
1076 { 39, "waitReTransmitPacketMaxSize" },
1077 { 40, "fastReTransCount" },
1078 { 41, "driverRetransmissions" },
1079 { 42, "txAckTimeoutMs" },
1080 { 43, "registrationInterrupts" },
1081 { 44, "hardwareType" },
1082 { 45, "radioType" },
1083 { 46, "writeEEPROM" },
1084 { 47, "writeRadioType" },
1085 { 48, "entry_exit_debug" },
1086 { 49, "debug" },
1087 { 50, "in_speed" },
1088 { 51, "out_speed" },
1089 { 52, "in_speed10" },
1090 { 53, "out_speed10" },
1091 { 54, "in_speed_max" },
1092 { 55, "out_speed_max" },
1093 { 56, "measure_rate" },
1094 { 57, "pre_Command_Wait" },
1095 { 58, "rx_tweak1" },
1096 { 59, "rx_tweak2" },
1097 { 60, "tx_queue_len" },
1098
1099 { 150, "arlan2-txRing" },
1100 { 151, "arlan2-rxRing" },
1101 { 152, "arlan2-18" },
1102 { 153, "arlan2-ring" },
1103 { 154, "arlan2-shm-cpy" },
1104 { 155, "config2" },
1105 { 156, "reset2" },
1106 {}
1107};
1108
1109static const struct trans_ctl_table trans_arlan_conf_table3[] = {
1110 { 1, "spreadingCode" },
1111 { 2, "channelNumber" },
1112 { 3, "scramblingDisable" },
1113 { 4, "txAttenuation" },
1114 { 5, "systemId" },
1115 { 6, "maxDatagramSize" },
1116 { 7, "maxFrameSize" },
1117 { 8, "maxRetries" },
1118 { 9, "receiveMode" },
1119 { 10, "priority" },
1120 { 11, "rootOrRepeater" },
1121 { 12, "SID" },
1122 { 13, "registrationMode" },
1123 { 14, "registrationFill" },
1124 { 15, "localTalkAddress" },
1125 { 16, "codeFormat" },
1126 { 17, "numChannels" },
1127 { 18, "channel1" },
1128 { 19, "channel2" },
1129 { 20, "channel3" },
1130 { 21, "channel4" },
1131 { 22, "txClear" },
1132 { 23, "txRetries" },
1133 { 24, "txRouting" },
1134 { 25, "txScrambled" },
1135 { 26, "rxParameter" },
1136 { 27, "txTimeoutMs" },
1137 { 28, "waitCardTimeout" },
1138 { 29, "channelSet" },
1139 { 30, "name" },
1140 { 31, "waitTime" },
1141 { 32, "lParameter" },
1142 { 33, "_15" },
1143 { 34, "headerSize" },
1144 { 36, "tx_delay_ms" },
1145 { 37, "retries" },
1146 { 38, "ReTransmitPacketMaxSize" },
1147 { 39, "waitReTransmitPacketMaxSize" },
1148 { 40, "fastReTransCount" },
1149 { 41, "driverRetransmissions" },
1150 { 42, "txAckTimeoutMs" },
1151 { 43, "registrationInterrupts" },
1152 { 44, "hardwareType" },
1153 { 45, "radioType" },
1154 { 46, "writeEEPROM" },
1155 { 47, "writeRadioType" },
1156 { 48, "entry_exit_debug" },
1157 { 49, "debug" },
1158 { 50, "in_speed" },
1159 { 51, "out_speed" },
1160 { 52, "in_speed10" },
1161 { 53, "out_speed10" },
1162 { 54, "in_speed_max" },
1163 { 55, "out_speed_max" },
1164 { 56, "measure_rate" },
1165 { 57, "pre_Command_Wait" },
1166 { 58, "rx_tweak1" },
1167 { 59, "rx_tweak2" },
1168 { 60, "tx_queue_len" },
1169
1170 { 150, "arlan3-txRing" },
1171 { 151, "arlan3-rxRing" },
1172 { 152, "arlan3-18" },
1173 { 153, "arlan3-ring" },
1174 { 154, "arlan3-shm-cpy" },
1175 { 155, "config3" },
1176 { 156, "reset3" },
1177 {}
1178};
1179
1180static const struct trans_ctl_table trans_arlan_table[] = {
1181 { 1, "arlan0", trans_arlan_conf_table0 },
1182 { 2, "arlan1", trans_arlan_conf_table1 },
1183 { 3, "arlan2", trans_arlan_conf_table2 },
1184 { 4, "arlan3", trans_arlan_conf_table3 },
1185 {}
1186};
1187
1188static const struct trans_ctl_table trans_s390dbf_table[] = {
1189 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1190 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
1191 {}
1192};
1193
1194static const struct trans_ctl_table trans_sunrpc_table[] = {
1195 { CTL_RPCDEBUG, "rpc_debug" },
1196 { CTL_NFSDEBUG, "nfs_debug" },
1197 { CTL_NFSDDEBUG, "nfsd_debug" },
1198 { CTL_NLMDEBUG, "nlm_debug" },
1199 { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
1200 { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
1201 { CTL_MIN_RESVPORT, "min_resvport" },
1202 { CTL_MAX_RESVPORT, "max_resvport" },
1203 {}
1204};
1205
1206static const struct trans_ctl_table trans_pm_table[] = {
1207 { 1 /* CTL_PM_SUSPEND */, "suspend" },
1208 { 2 /* CTL_PM_CMODE */, "cmode" },
1209 { 3 /* CTL_PM_P0 */, "p0" },
1210 { 4 /* CTL_PM_CM */, "cm" },
1211 {}
1212};
1213
1214static const struct trans_ctl_table trans_frv_table[] = {
1215 { 1, "cache-mode" },
1216 { 2, "pin-cxnr" },
1217 {}
1218};
1219
1220static const struct trans_ctl_table trans_root_table[] = {
1221 { CTL_KERN, "kernel", trans_kern_table },
1222 { CTL_VM, "vm", trans_vm_table },
1223 { CTL_NET, "net", trans_net_table },
1224 /* CTL_PROC not used */
1225 { CTL_FS, "fs", trans_fs_table },
1226 { CTL_DEBUG, "debug", trans_debug_table },
1227 { CTL_DEV, "dev", trans_dev_table },
1228 { CTL_BUS, "bus", trans_bus_table },
1229 { CTL_ABI, "abi" },
1230 /* CTL_CPU not used */
1231 { CTL_ARLAN, "arlan", trans_arlan_table },
1232 { CTL_S390DBF, "s390dbf", trans_s390dbf_table },
1233 { CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
1234 { CTL_PM, "pm", trans_pm_table },
1235 { CTL_FRV, "frv", trans_frv_table },
1236 {}
1237};
1238
1239
1240
1241 8
1242static int sysctl_depth(struct ctl_table *table) 9static int sysctl_depth(struct ctl_table *table)
1243{ 10{
@@ -1261,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
1261 return table; 28 return table;
1262} 29}
1263 30
1264static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
1265{
1266 struct ctl_table *test;
1267 const struct trans_ctl_table *ref;
1268 int cur_depth;
1269
1270 cur_depth = sysctl_depth(table);
1271
1272 ref = trans_root_table;
1273repeat:
1274 test = sysctl_parent(table, cur_depth);
1275 for (; ref->ctl_name || ref->procname || ref->child; ref++) {
1276 int match = 0;
1277
1278 if (cur_depth && !ref->child)
1279 continue;
1280
1281 if (test->procname && ref->procname &&
1282 (strcmp(test->procname, ref->procname) == 0))
1283 match++;
1284
1285 if (test->ctl_name && ref->ctl_name &&
1286 (test->ctl_name == ref->ctl_name))
1287 match++;
1288
1289 if (!ref->ctl_name && !ref->procname)
1290 match++;
1291
1292 if (match) {
1293 if (cur_depth != 0) {
1294 cur_depth--;
1295 ref = ref->child;
1296 goto repeat;
1297 }
1298 goto out;
1299 }
1300 }
1301 ref = NULL;
1302out:
1303 return ref;
1304}
1305 31
1306static void sysctl_print_path(struct ctl_table *table) 32static void sysctl_print_path(struct ctl_table *table)
1307{ 33{
@@ -1315,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table)
1315 } 41 }
1316 } 42 }
1317 printk(" "); 43 printk(" ");
1318 if (table->ctl_name) {
1319 for (i = depth; i >= 0; i--) {
1320 tmp = sysctl_parent(table, i);
1321 printk(".%d", tmp->ctl_name);
1322 }
1323 }
1324}
1325
1326static void sysctl_repair_table(struct ctl_table *table)
1327{
1328 /* Don't complain about the classic default
1329 * sysctl strategy routine. Maybe later we
1330 * can get the tables fixed and complain about
1331 * this.
1332 */
1333 if (table->ctl_name && table->procname &&
1334 (table->proc_handler == proc_dointvec) &&
1335 (!table->strategy)) {
1336 table->strategy = sysctl_data;
1337 }
1338} 44}
1339 45
1340static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, 46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
@@ -1352,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
1352 ref = head->ctl_table; 58 ref = head->ctl_table;
1353repeat: 59repeat:
1354 test = sysctl_parent(table, cur_depth); 60 test = sysctl_parent(table, cur_depth);
1355 for (; ref->ctl_name || ref->procname; ref++) { 61 for (; ref->procname; ref++) {
1356 int match = 0; 62 int match = 0;
1357 if (cur_depth && !ref->child) 63 if (cur_depth && !ref->child)
1358 continue; 64 continue;
@@ -1361,10 +67,6 @@ repeat:
1361 (strcmp(test->procname, ref->procname) == 0)) 67 (strcmp(test->procname, ref->procname) == 0))
1362 match++; 68 match++;
1363 69
1364 if (test->ctl_name && ref->ctl_name &&
1365 (test->ctl_name == ref->ctl_name))
1366 match++;
1367
1368 if (match) { 70 if (match) {
1369 if (cur_depth != 0) { 71 if (cur_depth != 0) {
1370 cur_depth--; 72 cur_depth--;
@@ -1392,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
1392 *fail = str; 94 *fail = str;
1393} 95}
1394 96
1395static int sysctl_check_dir(struct nsproxy *namespaces,
1396 struct ctl_table *table)
1397{
1398 struct ctl_table *ref;
1399 int error;
1400
1401 error = 0;
1402 ref = sysctl_check_lookup(namespaces, table);
1403 if (ref) {
1404 int match = 0;
1405 if ((!table->procname && !ref->procname) ||
1406 (table->procname && ref->procname &&
1407 (strcmp(table->procname, ref->procname) == 0)))
1408 match++;
1409
1410 if ((!table->ctl_name && !ref->ctl_name) ||
1411 (table->ctl_name && ref->ctl_name &&
1412 (table->ctl_name == ref->ctl_name)))
1413 match++;
1414
1415 if (match != 2) {
1416 printk(KERN_ERR "%s: failed: ", __func__);
1417 sysctl_print_path(table);
1418 printk(" ref: ");
1419 sysctl_print_path(ref);
1420 printk("\n");
1421 error = -EINVAL;
1422 }
1423 }
1424 return error;
1425}
1426
1427static void sysctl_check_leaf(struct nsproxy *namespaces, 97static void sysctl_check_leaf(struct nsproxy *namespaces,
1428 struct ctl_table *table, const char **fail) 98 struct ctl_table *table, const char **fail)
1429{ 99{
@@ -1434,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
1434 set_fail(fail, table, "Sysctl already exists"); 104 set_fail(fail, table, "Sysctl already exists");
1435} 105}
1436 106
1437static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1438{
1439 const struct trans_ctl_table *ref;
1440
1441 ref = sysctl_binary_lookup(table);
1442 if (table->ctl_name && !ref)
1443 set_fail(fail, table, "Unknown sysctl binary path");
1444 if (ref) {
1445 if (ref->procname &&
1446 (!table->procname ||
1447 (strcmp(table->procname, ref->procname) != 0)))
1448 set_fail(fail, table, "procname does not match binary path procname");
1449
1450 if (ref->ctl_name && table->ctl_name &&
1451 (table->ctl_name != ref->ctl_name))
1452 set_fail(fail, table, "ctl_name does not match binary path ctl_name");
1453 }
1454}
1455
1456int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) 107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1457{ 108{
1458 int error = 0; 109 int error = 0;
1459 for (; table->ctl_name || table->procname; table++) { 110 for (; table->procname; table++) {
1460 const char *fail = NULL; 111 const char *fail = NULL;
1461 112
1462 sysctl_repair_table(table);
1463 if (table->parent) { 113 if (table->parent) {
1464 if (table->procname && !table->parent->procname) 114 if (table->procname && !table->parent->procname)
1465 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
1466 if (table->ctl_name && !table->parent->ctl_name)
1467 set_fail(&fail, table, "Parent without ctl_name");
1468 } 116 }
1469 if (!table->procname) 117 if (!table->procname)
1470 set_fail(&fail, table, "No procname"); 118 set_fail(&fail, table, "No procname");
@@ -1477,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1477 set_fail(&fail, table, "Writable sysctl directory"); 125 set_fail(&fail, table, "Writable sysctl directory");
1478 if (table->proc_handler) 126 if (table->proc_handler)
1479 set_fail(&fail, table, "Directory with proc_handler"); 127 set_fail(&fail, table, "Directory with proc_handler");
1480 if (table->strategy)
1481 set_fail(&fail, table, "Directory with strategy");
1482 if (table->extra1) 128 if (table->extra1)
1483 set_fail(&fail, table, "Directory with extra1"); 129 set_fail(&fail, table, "Directory with extra1");
1484 if (table->extra2) 130 if (table->extra2)
1485 set_fail(&fail, table, "Directory with extra2"); 131 set_fail(&fail, table, "Directory with extra2");
1486 if (sysctl_check_dir(namespaces, table))
1487 set_fail(&fail, table, "Inconsistent directory names");
1488 } else { 132 } else {
1489 if ((table->strategy == sysctl_data) || 133 if ((table->proc_handler == proc_dostring) ||
1490 (table->strategy == sysctl_string) ||
1491 (table->strategy == sysctl_intvec) ||
1492 (table->strategy == sysctl_jiffies) ||
1493 (table->strategy == sysctl_ms_jiffies) ||
1494 (table->proc_handler == proc_dostring) ||
1495 (table->proc_handler == proc_dointvec) || 134 (table->proc_handler == proc_dointvec) ||
1496 (table->proc_handler == proc_dointvec_minmax) || 135 (table->proc_handler == proc_dointvec_minmax) ||
1497 (table->proc_handler == proc_dointvec_jiffies) || 136 (table->proc_handler == proc_dointvec_jiffies) ||
@@ -1513,15 +152,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1513 set_fail(&fail, table, "No max"); 152 set_fail(&fail, table, "No max");
1514 } 153 }
1515 } 154 }
1516#ifdef CONFIG_SYSCTL_SYSCALL 155#ifdef CONFIG_PROC_SYSCTL
1517 if (table->ctl_name && !table->strategy)
1518 set_fail(&fail, table, "Missing strategy");
1519#endif
1520#if 0
1521 if (!table->ctl_name && table->strategy)
1522 set_fail(&fail, table, "Strategy without ctl_name");
1523#endif
1524#ifdef CONFIG_PROC_FS
1525 if (table->procname && !table->proc_handler) 156 if (table->procname && !table->proc_handler)
1526 set_fail(&fail, table, "No proc_handler"); 157 set_fail(&fail, table, "No proc_handler");
1527#endif 158#endif
@@ -1531,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1531#endif 162#endif
1532 sysctl_check_leaf(namespaces, table, &fail); 163 sysctl_check_leaf(namespaces, table, &fail);
1533 } 164 }
1534 sysctl_check_bin_path(table, &fail);
1535 if (table->mode > 0777) 165 if (table->mode > 0777)
1536 set_fail(&fail, table, "bogus .mode"); 166 set_fail(&fail, table, "bogus .mode");
1537 if (fail) { 167 if (fail) {
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fec..c6324d96009e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
136 write_seqlock_irq(&xtime_lock); 136 write_seqlock_irq(&xtime_lock);
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 138 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
139 update_xtime_cache(0);
140 write_sequnlock_irq(&xtime_lock); 139 write_sequnlock_irq(&xtime_lock);
141 clock_was_set(); 140 clock_was_set();
142} 141}
@@ -662,6 +661,36 @@ u64 nsec_to_clock_t(u64 x)
662#endif 661#endif
663} 662}
664 663
664/**
665 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
666 *
667 * @n: nsecs in u64
668 *
669 * Unlike {m,u}secs_to_jiffies, the input type is u64 rather than unsigned int.
670 * This also does not return MAX_JIFFY_OFFSET, since the function is designed
671 * for the scheduler, not for device drivers calculating timeout values.
672 *
673 * note:
674 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
675 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
676 */
677unsigned long nsecs_to_jiffies(u64 n)
678{
679#if (NSEC_PER_SEC % HZ) == 0
680 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
681 return div_u64(n, NSEC_PER_SEC / HZ);
682#elif (HZ % 512) == 0
683 /* overflow after 292 years if HZ = 1024 */
684 return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
685#else
686 /*
687 * Generic case - optimized for cases where HZ is a multiple of 3.
688 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
689 */
690 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
691#endif
692}
693
665#if (BITS_PER_LONG < 64) 694#if (BITS_PER_LONG < 64)
666u64 get_jiffies_64(void) 695u64 get_jiffies_64(void)
667{ 696{
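In the common case, where NSEC_PER_SEC is an exact multiple of HZ, the new nsecs_to_jiffies() reduces to a single 64-bit division by the nanoseconds per tick. A minimal user-space sketch of that branch, with HZ assumed to be 1000 for the example (the HZ value is an assumption, not taken from the patch):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ 1000ULL                      /* assumed for this sketch */

/* Mirrors the (NSEC_PER_SEC % HZ) == 0 branch of nsecs_to_jiffies(). */
static unsigned long nsecs_to_jiffies_sketch(uint64_t n)
{
        return (unsigned long)(n / (NSEC_PER_SEC / HZ));
}

int main(void)
{
        /* At HZ=1000 one jiffy is 1 ms, so 2.5 ms rounds down to 2 jiffies. */
        printf("%lu\n", nsecs_to_jiffies_sketch(2500000ULL));
        printf("%lu\n", nsecs_to_jiffies_sketch(NSEC_PER_SEC));   /* 1000 */
        return 0;
}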
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 0b0a6366c9d4..ee266620b06c 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc32..3d5fc0fd1cca 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h> 21#include <linux/tick.h>
22 22
23#include "tick-internal.h"
24
23/* The registered clock event devices */ 25/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 26static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 27static LIST_HEAD(clockevents_released);
@@ -28,7 +30,7 @@ static LIST_HEAD(clockevents_released);
28static RAW_NOTIFIER_HEAD(clockevents_chain); 30static RAW_NOTIFIER_HEAD(clockevents_chain);
29 31
30/* Protection for the above */ 32/* Protection for the above */
31static DEFINE_SPINLOCK(clockevents_lock); 33static DEFINE_RAW_SPINLOCK(clockevents_lock);
32 34
33/** 35/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 36 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
37 * 39 *
38 * Math helper, returns latch value converted to nanoseconds (bound checked) 40 * Math helper, returns latch value converted to nanoseconds (bound checked)
39 */ 41 */
40unsigned long clockevent_delta2ns(unsigned long latch, 42u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
41 struct clock_event_device *evt)
42{ 43{
43 u64 clc = ((u64) latch << evt->shift); 44 u64 clc = (u64) latch << evt->shift;
44 45
45 if (unlikely(!evt->mult)) { 46 if (unlikely(!evt->mult)) {
46 evt->mult = 1; 47 evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
50 do_div(clc, evt->mult); 51 do_div(clc, evt->mult);
51 if (clc < 1000) 52 if (clc < 1000)
52 clc = 1000; 53 clc = 1000;
53 if (clc > LONG_MAX) 54 if (clc > KTIME_MAX)
54 clc = LONG_MAX; 55 clc = KTIME_MAX;
55 56
56 return (unsigned long) clc; 57 return clc;
57} 58}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns); 59EXPORT_SYMBOL_GPL(clockevent_delta2ns);
59 60
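For reference, clockevent_delta2ns() converts device ticks back to nanoseconds as ns = (latch << shift) / mult, clamped below at 1000 ns and, now that the return type is u64, above at KTIME_MAX instead of LONG_MAX. A stand-alone sketch of that arithmetic for a hypothetical 1 MHz event device; the mult and shift values are assumptions chosen for the example, roughly (10^6 << 32) / 10^9:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical 1 MHz clock event device: cycles = (ns * mult) >> shift. */
        uint32_t mult = 4294967, shift = 32;
        uint64_t latch = 1000;                  /* 1000 device ticks */

        uint64_t clc = latch << shift;
        clc /= mult;
        if (clc < 1000)
                clc = 1000;                     /* lower bound, as in the patch */

        printf("%llu ns\n", (unsigned long long)clc);   /* ~1000000 ns = 1 ms */
        return 0;
}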
@@ -140,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb)
140 unsigned long flags; 141 unsigned long flags;
141 int ret; 142 int ret;
142 143
143 spin_lock_irqsave(&clockevents_lock, flags); 144 raw_spin_lock_irqsave(&clockevents_lock, flags);
144 ret = raw_notifier_chain_register(&clockevents_chain, nb); 145 ret = raw_notifier_chain_register(&clockevents_chain, nb);
145 spin_unlock_irqrestore(&clockevents_lock, flags); 146 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
146 147
147 return ret; 148 return ret;
148} 149}
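The spin_lock_irqsave() to raw_spin_lock_irqsave() conversions above (and in tick-broadcast.c, tick-common.c and timer_list.c further down) annotate these as raw spinlocks, which keep busy-waiting even on kernels where the real-time preemption work turns ordinary spinlocks into sleeping locks. A minimal kernel-side usage sketch of the same pattern; the lock and function names are hypothetical:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);       /* hypothetical lock */

static void example_critical_section(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&example_lock, flags);
        /* work that must run with interrupts off and must never sleep */
        raw_spin_unlock_irqrestore(&example_lock, flags);
}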
@@ -184,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev)
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 186 BUG_ON(!dev->cpumask);
186 187
187 spin_lock_irqsave(&clockevents_lock, flags); 188 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 189
189 list_add(&dev->list, &clockevent_devices); 190 list_add(&dev->list, &clockevent_devices);
190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 191 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
191 clockevents_notify_released(); 192 clockevents_notify_released();
192 193
193 spin_unlock_irqrestore(&clockevents_lock, flags); 194 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
194} 195}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 196EXPORT_SYMBOL_GPL(clockevents_register_device);
196 197
@@ -240,7 +241,7 @@ void clockevents_notify(unsigned long reason, void *arg)
240 struct list_head *node, *tmp; 241 struct list_head *node, *tmp;
241 unsigned long flags; 242 unsigned long flags;
242 243
243 spin_lock_irqsave(&clockevents_lock, flags); 244 raw_spin_lock_irqsave(&clockevents_lock, flags);
244 clockevents_do_notify(reason, arg); 245 clockevents_do_notify(reason, arg);
245 246
246 switch (reason) { 247 switch (reason) {
@@ -255,7 +256,7 @@ void clockevents_notify(unsigned long reason, void *arg)
255 default: 256 default:
256 break; 257 break;
257 } 258 }
258 spin_unlock_irqrestore(&clockevents_lock, flags); 259 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
259} 260}
260EXPORT_SYMBOL_GPL(clockevents_notify); 261EXPORT_SYMBOL_GPL(clockevents_notify);
261#endif 262#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 09113347d328..e85c23404d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -39,7 +39,7 @@ void timecounter_init(struct timecounter *tc,
39 tc->cycle_last = cc->read(cc); 39 tc->cycle_last = cc->read(cc);
40 tc->nsec = start_tstamp; 40 tc->nsec = start_tstamp;
41} 41}
42EXPORT_SYMBOL(timecounter_init); 42EXPORT_SYMBOL_GPL(timecounter_init);
43 43
44/** 44/**
45 * timecounter_read_delta - get nanoseconds since last call of this function 45 * timecounter_read_delta - get nanoseconds since last call of this function
@@ -83,7 +83,7 @@ u64 timecounter_read(struct timecounter *tc)
83 83
84 return nsec; 84 return nsec;
85} 85}
86EXPORT_SYMBOL(timecounter_read); 86EXPORT_SYMBOL_GPL(timecounter_read);
87 87
88u64 timecounter_cyc2time(struct timecounter *tc, 88u64 timecounter_cyc2time(struct timecounter *tc,
89 cycle_t cycle_tstamp) 89 cycle_t cycle_tstamp)
@@ -105,7 +105,60 @@ u64 timecounter_cyc2time(struct timecounter *tc,
105 105
106 return nsec; 106 return nsec;
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL_GPL(timecounter_cyc2time);
109
110/**
111 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
112 * @mult: pointer to mult variable
113 * @shift: pointer to shift variable
114 * @from: frequency to convert from
115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds
117 *
118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents.
120 *
121 * @to and @from are frequency values in HZ. For clock sources @to is
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 *
125 * The @minsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is
129 * multiplied with the calculated mult factor. Larger ranges may
130 * reduce the conversion accuracy by choosing smaller mult and shift
131 * factors.
132 */
133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
135{
136 u64 tmp;
137 u32 sft, sftacc= 32;
138
139 /*
140 * Calculate the shift factor which is limiting the conversion
141 * range:
142 */
143 tmp = ((u64)minsec * from) >> 32;
144 while (tmp) {
145 tmp >>=1;
146 sftacc--;
147 }
148
149 /*
150 * Find the conversion shift/mult pair which has the best
151 * accuracy and fits the minsec conversion range:
152 */
153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft;
155 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0)
157 break;
158 }
159 *mult = tmp;
160 *shift = sft;
161}
109 162
110/*[Clocksource internal variables]--------- 163/*[Clocksource internal variables]---------
111 * curr_clocksource: 164 * curr_clocksource:
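clocks_calc_mult_shift() searches downward from shift = 32 for the largest shift whose mult = (to << shift) / from still fits in the bits left over after reserving headroom for minsec seconds of input, so that a later value * mult >> shift cannot overflow 64 bits. A user-space sketch of the same search, fed a hypothetical 19.2 MHz counter converted to nanoseconds over a 600-second range (the frequency and range are assumptions for the example):

#include <stdio.h>
#include <stdint.h>

static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
                            uint32_t from, uint32_t to, uint32_t minsec)
{
        uint64_t tmp;
        uint32_t sft, sftacc = 32;

        /* How many of the upper 32 bits does minsec seconds of input need? */
        tmp = ((uint64_t)minsec * from) >> 32;
        while (tmp) {
                tmp >>= 1;
                sftacc--;
        }

        /* Largest shift whose mult still fits in the remaining bits. */
        for (sft = 32; sft > 0; sft--) {
                tmp = ((uint64_t)to << sft) / from;
                if ((tmp >> sftacc) == 0)
                        break;
        }
        *mult = (uint32_t)tmp;
        *shift = sft;
}

int main(void)
{
        uint32_t mult, shift;

        /* hypothetical 19.2 MHz counter -> nanoseconds, 600 s conversion range */
        calc_mult_shift(&mult, &shift, 19200000, 1000000000, 600);
        printf("mult=%u shift=%u\n", mult, shift);
        /* check: 19200000 cycles (1 s) * mult >> shift is close to 1e9 ns */
        printf("1s -> %llu ns\n",
               (unsigned long long)(((uint64_t)19200000 * mult) >> shift));
        return 0;
}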
@@ -394,15 +447,11 @@ void clocksource_resume(void)
394{ 447{
395 struct clocksource *cs; 448 struct clocksource *cs;
396 449
397 mutex_lock(&clocksource_mutex);
398
399 list_for_each_entry(cs, &clocksource_list, list) 450 list_for_each_entry(cs, &clocksource_list, list)
400 if (cs->resume) 451 if (cs->resume)
401 cs->resume(); 452 cs->resume();
402 453
403 clocksource_resume_watchdog(); 454 clocksource_resume_watchdog();
404
405 mutex_unlock(&clocksource_mutex);
406} 455}
407 456
408/** 457/**
@@ -417,6 +466,47 @@ void clocksource_touch_watchdog(void)
417 clocksource_resume_watchdog(); 466 clocksource_resume_watchdog();
418} 467}
419 468
469/**
470 * clocksource_max_deferment - Returns max time the clocksource can be deferred
471 * @cs: Pointer to clocksource
472 *
473 */
474static u64 clocksource_max_deferment(struct clocksource *cs)
475{
476 u64 max_nsecs, max_cycles;
477
478 /*
479 * Calculate the maximum number of cycles that we can pass to the
480 * cyc2ns function without overflowing a 64-bit signed result. The
481 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
482 * is equivalent to the below.
483 * max_cycles < (2^63)/cs->mult
484 * max_cycles < 2^(log2((2^63)/cs->mult))
485 * max_cycles < 2^(log2(2^63) - log2(cs->mult))
486 * max_cycles < 2^(63 - log2(cs->mult))
487 * max_cycles < 1 << (63 - log2(cs->mult))
488 * Please note that we add 1 to the result of the log2 to account for
489 * any rounding errors, to ensure the above inequality is satisfied and
490 * to guarantee that no overflow will occur.
491 */
492 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
493
494 /*
495 * The actual maximum number of cycles we can defer the clocksource is
496 * determined by the minimum of max_cycles and cs->mask.
497 */
498 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
499 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
500
501 /*
502 * To ensure that the clocksource does not wrap whilst we are idle,
503 * limit the time the clocksource can be deferred by a safety margin
504 * of max_nsecs >> 5, i.e. 1/32 (about 3%). A power-of-two margin is
505 * used because it can be computed with a shift rather than a division.
506 */
507 return max_nsecs - (max_nsecs >> 5);
508}
509
420#ifdef CONFIG_GENERIC_TIME 510#ifdef CONFIG_GENERIC_TIME
421 511
422/** 512/**
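clocksource_max_deferment() bounds idle time by the largest cycle count whose product with cs->mult still fits in a signed 64-bit value, further limited by the counter's wrap point cs->mask, and then subtracts max_nsecs >> 5 (1/32 of the range) as a safety margin. A user-space sketch with hypothetical clocksource parameters; the mult, shift and mask values are assumptions for the example:

#include <stdio.h>
#include <stdint.h>

static int ilog2_u32(uint32_t v)
{
        int r = -1;
        while (v) {
                v >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        /* Hypothetical ~19.2 MHz clocksource with a 56-bit wrapping counter. */
        uint32_t mult = 873813333, shift = 24;
        uint64_t mask = (1ULL << 56) - 1;

        uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult) + 1));
        if (max_cycles > mask)
                max_cycles = mask;

        uint64_t max_nsecs = (max_cycles * mult) >> shift;
        max_nsecs -= max_nsecs >> 5;            /* safety margin, as in the patch */

        printf("max idle: %llu ns (~%llu s)\n",
               (unsigned long long)max_nsecs,
               (unsigned long long)(max_nsecs / 1000000000ULL));
        return 0;
}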
@@ -515,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs)
515 */ 605 */
516int clocksource_register(struct clocksource *cs) 606int clocksource_register(struct clocksource *cs)
517{ 607{
608 /* calculate max idle time permitted for this clocksource */
609 cs->max_idle_ns = clocksource_max_deferment(cs);
610
518 mutex_lock(&clocksource_mutex); 611 mutex_lock(&clocksource_mutex);
519 clocksource_enqueue(cs); 612 clocksource_enqueue(cs);
520 clocksource_select(); 613 clocksource_select();
@@ -584,7 +677,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
584 * @count: length of buffer 677 * @count: length of buffer
585 * 678 *
586 * Takes input from sysfs interface for manually overriding the default 679 * Takes input from sysfs interface for manually overriding the default
587 * clocksource selction. 680 * clocksource selection.
588 */ 681 */
589static ssize_t sysfs_override_clocksource(struct sys_device *dev, 682static ssize_t sysfs_override_clocksource(struct sys_device *dev,
590 struct sysdev_attribute *attr, 683 struct sysdev_attribute *attr,
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c2ec25087a35..b3bafd5fc66d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 35static int tick_broadcast_force;
36 36
37#ifdef CONFIG_TICK_ONESHOT 37#ifdef CONFIG_TICK_ONESHOT
@@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
96 unsigned long flags; 96 unsigned long flags;
97 int ret = 0; 97 int ret = 0;
98 98
99 spin_lock_irqsave(&tick_broadcast_lock, flags); 99 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
100 100
101 /* 101 /*
102 * Devices might be registered with both periodic and oneshot 102 * Devices might be registered with both periodic and oneshot
@@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
122 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
123 } 123 }
124 } 124 }
125 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 125 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
126 return ret; 126 return ret;
127} 127}
128 128
@@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask)
161 */ 161 */
162static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
163{ 163{
164 spin_lock(&tick_broadcast_lock); 164 raw_spin_lock(&tick_broadcast_lock);
165 165
166 cpumask_and(to_cpumask(tmpmask), 166 cpumask_and(to_cpumask(tmpmask),
167 cpu_online_mask, tick_get_broadcast_mask()); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask)); 168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 raw_spin_unlock(&tick_broadcast_lock);
171} 171}
172 172
173/* 173/*
@@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
212 unsigned long flags; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
216 216
217 cpu = smp_processor_id(); 217 cpu = smp_processor_id();
218 td = &per_cpu(tick_cpu_device, cpu); 218 td = &per_cpu(tick_cpu_device, cpu);
@@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 tick_broadcast_setup_oneshot(bc); 263 tick_broadcast_setup_oneshot(bc);
264 } 264 }
265out: 265out:
266 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 266 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
267} 267}
268 268
269/* 269/*
@@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
299 unsigned long flags; 299 unsigned long flags;
300 unsigned int cpu = *cpup; 300 unsigned int cpu = *cpup;
301 301
302 spin_lock_irqsave(&tick_broadcast_lock, flags); 302 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
303 303
304 bc = tick_broadcast_device.evtdev; 304 bc = tick_broadcast_device.evtdev;
305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
@@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
309 clockevents_shutdown(bc); 309 clockevents_shutdown(bc);
310 } 310 }
311 311
312 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 312 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
313} 313}
314 314
315void tick_suspend_broadcast(void) 315void tick_suspend_broadcast(void)
@@ -317,13 +317,13 @@ void tick_suspend_broadcast(void)
317 struct clock_event_device *bc; 317 struct clock_event_device *bc;
318 unsigned long flags; 318 unsigned long flags;
319 319
320 spin_lock_irqsave(&tick_broadcast_lock, flags); 320 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
321 321
322 bc = tick_broadcast_device.evtdev; 322 bc = tick_broadcast_device.evtdev;
323 if (bc) 323 if (bc)
324 clockevents_shutdown(bc); 324 clockevents_shutdown(bc);
325 325
326 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 326 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
327} 327}
328 328
329int tick_resume_broadcast(void) 329int tick_resume_broadcast(void)
@@ -332,7 +332,7 @@ int tick_resume_broadcast(void)
332 unsigned long flags; 332 unsigned long flags;
333 int broadcast = 0; 333 int broadcast = 0;
334 334
335 spin_lock_irqsave(&tick_broadcast_lock, flags); 335 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
336 336
337 bc = tick_broadcast_device.evtdev; 337 bc = tick_broadcast_device.evtdev;
338 338
@@ -351,7 +351,7 @@ int tick_resume_broadcast(void)
351 break; 351 break;
352 } 352 }
353 } 353 }
354 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 354 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
355 355
356 return broadcast; 356 return broadcast;
357} 357}
@@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
405 ktime_t now, next_event; 405 ktime_t now, next_event;
406 int cpu; 406 int cpu;
407 407
408 spin_lock(&tick_broadcast_lock); 408 raw_spin_lock(&tick_broadcast_lock);
409again: 409again:
410 dev->next_event.tv64 = KTIME_MAX; 410 dev->next_event.tv64 = KTIME_MAX;
411 next_event.tv64 = KTIME_MAX; 411 next_event.tv64 = KTIME_MAX;
@@ -443,7 +443,7 @@ again:
443 if (tick_broadcast_set_event(next_event, 0)) 443 if (tick_broadcast_set_event(next_event, 0))
444 goto again; 444 goto again;
445 } 445 }
446 spin_unlock(&tick_broadcast_lock); 446 raw_spin_unlock(&tick_broadcast_lock);
447} 447}
448 448
449/* 449/*
@@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 457 unsigned long flags;
458 int cpu; 458 int cpu;
459 459
460 spin_lock_irqsave(&tick_broadcast_lock, flags); 460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461 461
462 /* 462 /*
463 * Periodic mode does not care about the enter/exit of power 463 * Periodic mode does not care about the enter/exit of power
@@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
492 } 492 }
493 493
494out: 494out:
495 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 496}
497 497
498/* 498/*
@@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void)
563 struct clock_event_device *bc; 563 struct clock_event_device *bc;
564 unsigned long flags; 564 unsigned long flags;
565 565
566 spin_lock_irqsave(&tick_broadcast_lock, flags); 566 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
567 567
568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
569 bc = tick_broadcast_device.evtdev; 569 bc = tick_broadcast_device.evtdev;
570 if (bc) 570 if (bc)
571 tick_broadcast_setup_oneshot(bc); 571 tick_broadcast_setup_oneshot(bc);
572 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 572 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
573} 573}
574 574
575 575
@@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
581 unsigned long flags; 581 unsigned long flags;
582 unsigned int cpu = *cpup; 582 unsigned int cpu = *cpup;
583 583
584 spin_lock_irqsave(&tick_broadcast_lock, flags); 584 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
585 585
586 /* 586 /*
587 * Clear the broadcast mask flag for the dead cpu, but do not 587 * Clear the broadcast mask flag for the dead cpu, but do not
@@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
589 */ 589 */
590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
591 591
592 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 592 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
593} 593}
594 594
595/* 595/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 83c4417b6a3c..b6b898d2eeef 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37static DEFINE_RAW_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
40 * Debugging: see timer_list.c 40 * Debugging: see timer_list.c
@@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
209 int cpu, ret = NOTIFY_OK; 209 int cpu, ret = NOTIFY_OK;
210 unsigned long flags; 210 unsigned long flags;
211 211
212 spin_lock_irqsave(&tick_device_lock, flags); 212 raw_spin_lock_irqsave(&tick_device_lock, flags);
213 213
214 cpu = smp_processor_id(); 214 cpu = smp_processor_id();
215 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 215 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
269 tick_oneshot_notify(); 269 tick_oneshot_notify();
270 270
271 spin_unlock_irqrestore(&tick_device_lock, flags); 271 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
272 return NOTIFY_STOP; 272 return NOTIFY_STOP;
273 273
274out_bc: 274out_bc:
@@ -278,7 +278,7 @@ out_bc:
278 if (tick_check_broadcast_device(newdev)) 278 if (tick_check_broadcast_device(newdev))
279 ret = NOTIFY_STOP; 279 ret = NOTIFY_STOP;
280 280
281 spin_unlock_irqrestore(&tick_device_lock, flags); 281 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
282 282
283 return ret; 283 return ret;
284} 284}
@@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup)
311 struct clock_event_device *dev = td->evtdev; 311 struct clock_event_device *dev = td->evtdev;
312 unsigned long flags; 312 unsigned long flags;
313 313
314 spin_lock_irqsave(&tick_device_lock, flags); 314 raw_spin_lock_irqsave(&tick_device_lock, flags);
315 td->mode = TICKDEV_MODE_PERIODIC; 315 td->mode = TICKDEV_MODE_PERIODIC;
316 if (dev) { 316 if (dev) {
317 /* 317 /*
@@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup)
322 clockevents_exchange_device(dev, NULL); 322 clockevents_exchange_device(dev, NULL);
323 td->evtdev = NULL; 323 td->evtdev = NULL;
324 } 324 }
325 spin_unlock_irqrestore(&tick_device_lock, flags); 325 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
326} 326}
327 327
328static void tick_suspend(void) 328static void tick_suspend(void)
@@ -330,9 +330,9 @@ static void tick_suspend(void)
330 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 330 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
331 unsigned long flags; 331 unsigned long flags;
332 332
333 spin_lock_irqsave(&tick_device_lock, flags); 333 raw_spin_lock_irqsave(&tick_device_lock, flags);
334 clockevents_shutdown(td->evtdev); 334 clockevents_shutdown(td->evtdev);
335 spin_unlock_irqrestore(&tick_device_lock, flags); 335 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
336} 336}
337 337
338static void tick_resume(void) 338static void tick_resume(void)
@@ -341,7 +341,7 @@ static void tick_resume(void)
341 unsigned long flags; 341 unsigned long flags;
342 int broadcast = tick_resume_broadcast(); 342 int broadcast = tick_resume_broadcast();
343 343
344 spin_lock_irqsave(&tick_device_lock, flags); 344 raw_spin_lock_irqsave(&tick_device_lock, flags);
345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
346 346
347 if (!broadcast) { 347 if (!broadcast) {
@@ -350,7 +350,7 @@ static void tick_resume(void)
350 else 350 else
351 tick_resume_oneshot(); 351 tick_resume_oneshot();
352 } 352 }
353 spin_unlock_irqrestore(&tick_device_lock, flags); 353 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
354} 354}
355 355
356/* 356/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b1c05bf75ee0..290eefbc1f60 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,7 +6,6 @@
6#define TICK_DO_TIMER_BOOT -2 6#define TICK_DO_TIMER_BOOT -2
7 7
8DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
9extern spinlock_t tick_device_lock;
10extern ktime_t tick_next_period; 9extern ktime_t tick_next_period;
11extern ktime_t tick_period; 10extern ktime_t tick_period;
12extern int tick_do_timer_cpu __read_mostly; 11extern int tick_do_timer_cpu __read_mostly;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89cf..0a8a213016f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51 51
52 printk(KERN_WARNING 52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n", 53 "CE: %s increasing min_delta_ns to %llu nsec\n",
54 dev->name ? dev->name : "?", 54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1); 55 (unsigned long long) dev->min_delta_ns << 1);
56 56
57 i = 0; 57 i = 0;
58 } 58 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e0f59a21c061..f992762d7f51 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137static void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(ktime_t now)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
141 unsigned long flags; 141 unsigned long flags;
142 ktime_t now;
143
144 if (!ts->tick_stopped)
145 return;
146 142
147 cpumask_clear_cpu(cpu, nohz_cpu_mask); 143 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get();
149 ts->idle_waketime = now; 144 ts->idle_waketime = now;
150 145
151 local_irq_save(flags); 146 local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
155 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
156} 151}
157 152
158static void tick_nohz_stop_idle(int cpu) 153static void tick_nohz_stop_idle(int cpu, ktime_t now)
159{ 154{
160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
161 157
162 if (ts->idle_active) { 158 delta = ktime_sub(now, ts->idle_entrytime);
163 ktime_t now, delta; 159 ts->idle_lastupdate = now;
164 now = ktime_get(); 160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 delta = ktime_sub(now, ts->idle_entrytime); 161 ts->idle_active = 0;
166 ts->idle_lastupdate = now;
167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
168 ts->idle_active = 0;
169 162
170 sched_clock_idle_wakeup_event(0); 163 sched_clock_idle_wakeup_event(0);
171 }
172} 164}
173 165
174static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 166static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
216 struct tick_sched *ts; 208 struct tick_sched *ts;
217 ktime_t last_update, expires, now; 209 ktime_t last_update, expires, now;
218 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 210 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
211 u64 time_delta;
219 int cpu; 212 int cpu;
220 213
221 local_irq_save(flags); 214 local_irq_save(flags);
@@ -231,6 +224,13 @@ void tick_nohz_stop_sched_tick(int inidle)
231 if (!inidle && !ts->inidle) 224 if (!inidle && !ts->inidle)
232 goto end; 225 goto end;
233 226
227 /*
228 * Set ts->inidle unconditionally. Even if the system did not
 229 * switch to NOHZ mode, the cpu frequency governors rely on the
230 * update of the idle time accounting in tick_nohz_start_idle().
231 */
232 ts->inidle = 1;
233
234 now = tick_nohz_start_idle(ts); 234 now = tick_nohz_start_idle(ts);
235 235
236 /* 236 /*
@@ -248,8 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
249 goto end; 249 goto end;
250 250
251 ts->inidle = 1;
252
253 if (need_resched()) 251 if (need_resched())
254 goto end; 252 goto end;
255 253
@@ -258,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
258 256
259 if (ratelimit < 10) { 257 if (ratelimit < 10) {
260 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 258 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
261 local_softirq_pending()); 259 (unsigned int) local_softirq_pending());
262 ratelimit++; 260 ratelimit++;
263 } 261 }
264 goto end; 262 goto end;
@@ -270,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
270 seq = read_seqbegin(&xtime_lock); 268 seq = read_seqbegin(&xtime_lock);
271 last_update = last_jiffies_update; 269 last_update = last_jiffies_update;
272 last_jiffies = jiffies; 270 last_jiffies = jiffies;
271 time_delta = timekeeping_max_deferment();
273 } while (read_seqretry(&xtime_lock, seq)); 272 } while (read_seqretry(&xtime_lock, seq));
274 273
275 /* Get the next timer wheel timer */ 274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
276 next_jiffies = get_next_timer_interrupt(last_jiffies); 275 arch_needs_cpu(cpu)) {
277 delta_jiffies = next_jiffies - last_jiffies; 276 next_jiffies = last_jiffies + 1;
278
279 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
280 delta_jiffies = 1; 277 delta_jiffies = 1;
278 } else {
279 /* Get the next timer wheel timer */
280 next_jiffies = get_next_timer_interrupt(last_jiffies);
281 delta_jiffies = next_jiffies - last_jiffies;
282 }
281 /* 283 /*
282 * Do not stop the tick, if we are only one off 284 * Do not stop the tick, if we are only one off
283 * or if the cpu is required for rcu 285 * or if the cpu is required for rcu
@@ -289,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
289 if ((long)delta_jiffies >= 1) { 291 if ((long)delta_jiffies >= 1) {
290 292
291 /* 293 /*
292 * calculate the expiry time for the next timer wheel
293 * timer
294 */
295 expires = ktime_add_ns(last_update, tick_period.tv64 *
296 delta_jiffies);
297
298 /*
299 * If this cpu is the one which updates jiffies, then 294 * If this cpu is the one which updates jiffies, then
300 * give up the assignment and let it be taken by the 295 * give up the assignment and let it be taken by the
301 * cpu which runs the tick timer next, which might be 296 * cpu which runs the tick timer next, which might be
302 * this cpu as well. If we don't drop this here the 297 * this cpu as well. If we don't drop this here the
303 * jiffies might be stale and do_timer() never 298 * jiffies might be stale and do_timer() never
304 * invoked. 299 * invoked. Keep track of the fact that it was the one
300 * which had the do_timer() duty last. If this cpu is
301 * the one which had the do_timer() duty last, we
302 * limit the sleep time to the timekeeping
 303 * max_deferment value which we retrieved
304 * above. Otherwise we can sleep as long as we want.
305 */ 305 */
306 if (cpu == tick_do_timer_cpu) 306 if (cpu == tick_do_timer_cpu) {
307 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 307 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
308 ts->do_timer_last = 1;
309 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
310 time_delta = KTIME_MAX;
311 ts->do_timer_last = 0;
312 } else if (!ts->do_timer_last) {
313 time_delta = KTIME_MAX;
314 }
315
316 /*
317 * calculate the expiry time for the next timer wheel
318 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
319 * that there is no timer pending or at least extremely
320 * far into the future (12 days for HZ=1000). In this
321 * case we set the expiry to the end of time.
322 */
323 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
324 /*
325 * Calculate the time delta for the next timer event.
326 * If the time delta exceeds the maximum time delta
327 * permitted by the current clocksource then adjust
328 * the time delta accordingly to ensure the
329 * clocksource does not wrap.
330 */
331 time_delta = min_t(u64, time_delta,
332 tick_period.tv64 * delta_jiffies);
333 }
334
335 if (time_delta < KTIME_MAX)
336 expires = ktime_add_ns(last_update, time_delta);
337 else
338 expires.tv64 = KTIME_MAX;
308 339
309 if (delta_jiffies > 1) 340 if (delta_jiffies > 1)
310 cpumask_set_cpu(cpu, nohz_cpu_mask); 341 cpumask_set_cpu(cpu, nohz_cpu_mask);
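The block above decides how far this cpu may defer the tick: only the cpu that last carried the do_timer() duty is clamped to the timekeeping max deferment, while every other cpu may sleep until its next timer, with KTIME_MAX standing in for "no limit". A compact sketch of just that decision; the helper name, its signature and the flag handling are hypothetical:

#include <stdio.h>
#include <stdint.h>

#define KTIME_MAX_SKETCH INT64_MAX
#define DO_TIMER_NONE    (-1)

/* Hypothetical helper: nanosecond cap on how long this cpu may stop its tick. */
static uint64_t nohz_sleep_cap(int cpu, int *tick_do_timer_cpu,
                               int *do_timer_last, uint64_t max_deferment_ns)
{
        if (cpu == *tick_do_timer_cpu) {
                *tick_do_timer_cpu = DO_TIMER_NONE; /* hand the duty back */
                *do_timer_last = 1;                 /* remember we held it last */
                return max_deferment_ns;            /* stay clamped */
        }
        if (*tick_do_timer_cpu != DO_TIMER_NONE) {
                *do_timer_last = 0;                 /* someone else owns the duty */
                return KTIME_MAX_SKETCH;            /* sleep as long as we want */
        }
        if (!*do_timer_last)
                return KTIME_MAX_SKETCH;            /* duty is free, never ours */
        return max_deferment_ns;                    /* we held it last: clamp */
}

int main(void)
{
        int owner = 3, last = 0;

        /* CPU 3 owns do_timer(): it gives the duty up and stays clamped. */
        printf("%llu ns\n", (unsigned long long)
               nohz_sleep_cap(3, &owner, &last, 400000000ULL));
        return 0;
}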
@@ -337,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
337 368
338 ts->idle_sleeps++; 369 ts->idle_sleeps++;
339 370
371 /* Mark expires */
372 ts->idle_expires = expires;
373
340 /* 374 /*
341 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 375 * If the expiration time == KTIME_MAX, then
342 * there is no timer pending or at least extremly far 376 * in this case we simply stop the tick timer.
343 * into the future (12 days for HZ=1000). In this case
344 * we simply stop the tick timer:
345 */ 377 */
346 if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { 378 if (unlikely(expires.tv64 == KTIME_MAX)) {
347 ts->idle_expires.tv64 = KTIME_MAX;
348 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 379 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
349 hrtimer_cancel(&ts->sched_timer); 380 hrtimer_cancel(&ts->sched_timer);
350 goto out; 381 goto out;
351 } 382 }
352 383
353 /* Mark expiries */
354 ts->idle_expires = expires;
355
356 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 384 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
357 hrtimer_start(&ts->sched_timer, expires, 385 hrtimer_start(&ts->sched_timer, expires,
358 HRTIMER_MODE_ABS_PINNED); 386 HRTIMER_MODE_ABS_PINNED);
@@ -431,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
431 ktime_t now; 459 ktime_t now;
432 460
433 local_irq_disable(); 461 local_irq_disable();
434 tick_nohz_stop_idle(cpu); 462 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
463 now = ktime_get();
464
465 if (ts->idle_active)
466 tick_nohz_stop_idle(cpu, now);
435 467
436 if (!ts->inidle || !ts->tick_stopped) { 468 if (!ts->inidle || !ts->tick_stopped) {
437 ts->inidle = 0; 469 ts->inidle = 0;
@@ -445,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
445 477
446 /* Update jiffies first */ 478 /* Update jiffies first */
447 select_nohz_load_balancer(0); 479 select_nohz_load_balancer(0);
448 now = ktime_get();
449 tick_do_update_jiffies64(now); 480 tick_do_update_jiffies64(now);
450 cpumask_clear_cpu(cpu, nohz_cpu_mask); 481 cpumask_clear_cpu(cpu, nohz_cpu_mask);
451 482
@@ -579,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
579 * timer and do not touch the other magic bits which need to be done 610 * timer and do not touch the other magic bits which need to be done
580 * when idle is left. 611 * when idle is left.
581 */ 612 */
582static void tick_nohz_kick_tick(int cpu) 613static void tick_nohz_kick_tick(int cpu, ktime_t now)
583{ 614{
584#if 0 615#if 0
585 /* Switch back to 2.6.27 behaviour */ 616 /* Switch back to 2.6.27 behaviour */
586 617
587 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 618 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
588 ktime_t delta, now; 619 ktime_t delta;
589
590 if (!ts->tick_stopped)
591 return;
592 620
593 /* 621 /*
594 * Do not touch the tick device, when the next expiry is either 622 * Do not touch the tick device, when the next expiry is either
595 * already reached or less/equal than the tick period. 623 * already reached or less/equal than the tick period.
596 */ 624 */
597 now = ktime_get();
598 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 625 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
599 if (delta.tv64 <= tick_period.tv64) 626 if (delta.tv64 <= tick_period.tv64)
600 return; 627 return;
@@ -603,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
603#endif 630#endif
604} 631}
605 632
633static inline void tick_check_nohz(int cpu)
634{
635 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
636 ktime_t now;
637
638 if (!ts->idle_active && !ts->tick_stopped)
639 return;
640 now = ktime_get();
641 if (ts->idle_active)
642 tick_nohz_stop_idle(cpu, now);
643 if (ts->tick_stopped) {
644 tick_nohz_update_jiffies(now);
645 tick_nohz_kick_tick(cpu, now);
646 }
647}
648
606#else 649#else
607 650
608static inline void tick_nohz_switch_to_nohz(void) { } 651static inline void tick_nohz_switch_to_nohz(void) { }
652static inline void tick_check_nohz(int cpu) { }
609 653
610#endif /* NO_HZ */ 654#endif /* NO_HZ */
611 655
@@ -615,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
615void tick_check_idle(int cpu) 659void tick_check_idle(int cpu)
616{ 660{
617 tick_check_oneshot_broadcast(cpu); 661 tick_check_oneshot_broadcast(cpu);
618#ifdef CONFIG_NO_HZ 662 tick_check_nohz(cpu);
619 tick_nohz_stop_idle(cpu);
620 tick_nohz_update_jiffies();
621 tick_nohz_kick_tick(cpu);
622#endif
623} 663}
624 664
625/* 665/*
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 71e7f1a19156..12f5c55090be 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -40,7 +40,7 @@ ktime_t timecompare_transform(struct timecompare *sync,
40 40
41 return ns_to_ktime(nsec); 41 return ns_to_ktime(nsec);
42} 42}
43EXPORT_SYMBOL(timecompare_transform); 43EXPORT_SYMBOL_GPL(timecompare_transform);
44 44
45int timecompare_offset(struct timecompare *sync, 45int timecompare_offset(struct timecompare *sync,
46 s64 *offset, 46 s64 *offset,
@@ -89,7 +89,7 @@ int timecompare_offset(struct timecompare *sync,
89 * source time 89 * source time
90 */ 90 */
91 sample.offset = 91 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 - 92 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
93 ts; 93 ts;
94 94
95 /* simple insertion sort based on duration */ 95 /* simple insertion sort based on duration */
@@ -131,7 +131,7 @@ int timecompare_offset(struct timecompare *sync,
131 131
132 return used; 132 return used;
133} 133}
134EXPORT_SYMBOL(timecompare_offset); 134EXPORT_SYMBOL_GPL(timecompare_offset);
135 135
136void __timecompare_update(struct timecompare *sync, 136void __timecompare_update(struct timecompare *sync,
137 u64 source_tstamp) 137 u64 source_tstamp)
@@ -188,4 +188,4 @@ void __timecompare_update(struct timecompare *sync,
188 } 188 }
189 } 189 }
190} 190}
191EXPORT_SYMBOL(__timecompare_update); 191EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 000000000000..86628e755f38
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
3 * This file is part of the GNU C Library.
4 * Contributed by Paul Eggert (eggert@twinsun.com).
5 *
6 * The GNU C Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The GNU C Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with the GNU C Library; see the file COPYING.LIB. If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22/*
23 * Converts the calendar time to broken-down time representation
24 * Based on code from glibc-2.6
25 *
26 * 2009-7-14:
27 * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
28 */
29
30#include <linux/time.h>
31#include <linux/module.h>
32
33/*
34 * Nonzero if YEAR is a leap year (every 4 years,
35 * except every 100th isn't, and every 400th is).
36 */
37static int __isleap(long year)
38{
39 return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
40}
41
42/* do a mathdiv for long type */
43static long math_div(long a, long b)
44{
45 return a / b - (a % b < 0);
46}
47
48/* How many leap years between y1 and y2, y1 must be less than or equal to y2 */
49static long leaps_between(long y1, long y2)
50{
51 long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
52 + math_div(y1 - 1, 400);
53 long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
54 + math_div(y2 - 1, 400);
55 return leaps2 - leaps1;
56}
57
58/* How many days come before each month (0-12). */
59static const unsigned short __mon_yday[2][13] = {
60 /* Normal years. */
61 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
62 /* Leap years. */
63 {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
64};
65
66#define SECS_PER_HOUR (60 * 60)
67#define SECS_PER_DAY (SECS_PER_HOUR * 24)
68
69/**
70 * time_to_tm - converts the calendar time to local broken-down time
71 *
 72 * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970,
 73 * Coordinated Universal Time (UTC).
 74 * @offset: offset in seconds to add to totalsecs.
 75 * @result: pointer to struct tm variable to receive broken-down time
76 */
77void time_to_tm(time_t totalsecs, int offset, struct tm *result)
78{
79 long days, rem, y;
80 const unsigned short *ip;
81
82 days = totalsecs / SECS_PER_DAY;
83 rem = totalsecs % SECS_PER_DAY;
84 rem += offset;
85 while (rem < 0) {
86 rem += SECS_PER_DAY;
87 --days;
88 }
89 while (rem >= SECS_PER_DAY) {
90 rem -= SECS_PER_DAY;
91 ++days;
92 }
93
94 result->tm_hour = rem / SECS_PER_HOUR;
95 rem %= SECS_PER_HOUR;
96 result->tm_min = rem / 60;
97 result->tm_sec = rem % 60;
98
99 /* January 1, 1970 was a Thursday. */
100 result->tm_wday = (4 + days) % 7;
101 if (result->tm_wday < 0)
102 result->tm_wday += 7;
103
104 y = 1970;
105
106 while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
107 /* Guess a corrected year, assuming 365 days per year. */
108 long yg = y + math_div(days, 365);
109
110 /* Adjust DAYS and Y to match the guessed year. */
111 days -= (yg - y) * 365 + leaps_between(y, yg);
112 y = yg;
113 }
114
115 result->tm_year = y - 1900;
116
117 result->tm_yday = days;
118
119 ip = __mon_yday[__isleap(y)];
120 for (y = 11; days < ip[y]; y--)
121 continue;
122 days -= ip[y];
123
124 result->tm_mon = y;
125 result->tm_mday = days + 1;
126}
127EXPORT_SYMBOL(time_to_tm);
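time_to_tm() is meant to fill the kernel's struct tm with the same broken-down values that user space gets from gmtime() for a UTC timestamp, plus an optional offset in seconds. A user-space analogue showing the expected field values for an arbitrary timestamp (the date is an example, not taken from the patch):

#include <stdio.h>
#include <time.h>

int main(void)
{
        time_t t = 1260316800;          /* 2009-12-09 00:00:00 UTC, arbitrary */
        struct tm *tm = gmtime(&t);     /* what time_to_tm(t, 0, &result) should match */

        printf("%04d-%02d-%02d (wday %d, yday %d)\n",
               tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
               tm->tm_wday, tm->tm_yday);
        return 0;
}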
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb0f46fa1ecd..af4135f05825 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -13,6 +13,7 @@
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h>
16#include <linux/sysdev.h> 17#include <linux/sysdev.h>
17#include <linux/clocksource.h> 18#include <linux/clocksource.h>
18#include <linux/jiffies.h> 19#include <linux/jiffies.h>
@@ -164,19 +165,12 @@ struct timespec raw_time;
164/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
165int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
166 167
167static struct timespec xtime_cache __attribute__ ((aligned (16)));
168void update_xtime_cache(u64 nsec)
169{
170 xtime_cache = xtime;
171 timespec_add_ns(&xtime_cache, nsec);
172}
173
174/* must hold xtime_lock */ 168/* must hold xtime_lock */
175void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
176{ 170{
177 xtime.tv_sec += leapsecond; 171 xtime.tv_sec += leapsecond;
178 wall_to_monotonic.tv_sec -= leapsecond; 172 wall_to_monotonic.tv_sec -= leapsecond;
179 update_vsyscall(&xtime, timekeeper.clock); 173 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
180} 174}
181 175
182#ifdef CONFIG_GENERIC_TIME 176#ifdef CONFIG_GENERIC_TIME
@@ -331,12 +325,10 @@ int do_settimeofday(struct timespec *tv)
331 325
332 xtime = *tv; 326 xtime = *tv;
333 327
334 update_xtime_cache(0);
335
336 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
337 ntp_clear(); 329 ntp_clear();
338 330
339 update_vsyscall(&xtime, timekeeper.clock); 331 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
340 332
341 write_sequnlock_irqrestore(&xtime_lock, flags); 333 write_sequnlock_irqrestore(&xtime_lock, flags);
342 334
@@ -487,6 +479,17 @@ int timekeeping_valid_for_hres(void)
487} 479}
488 480
489/** 481/**
482 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
483 *
484 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
485 * ensure that the clocksource does not change!
486 */
487u64 timekeeping_max_deferment(void)
488{
489 return timekeeper.clock->max_idle_ns;
490}
491
492/**
490 * read_persistent_clock - Return time from the persistent clock. 493 * read_persistent_clock - Return time from the persistent clock.
491 * 494 *
492 * Weak dummy function for arches that do not yet support it. 495 * Weak dummy function for arches that do not yet support it.
@@ -547,7 +550,6 @@ void __init timekeeping_init(void)
547 } 550 }
548 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
549 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
550 update_xtime_cache(0);
551 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
552 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
553 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -581,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
581 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
582 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
583 } 585 }
584 update_xtime_cache(0);
585 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
586 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
587 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -722,6 +723,49 @@ static void timekeeping_adjust(s64 offset)
722} 723}
723 724
724/** 725/**
726 * logarithmic_accumulation - shifted accumulation of cycles
727 *
 728 * This function accumulates a shifted interval of cycles into
 729 * a shifted interval of nanoseconds, allowing for an O(log) accumulation
 730 * loop.
731 *
732 * Returns the unconsumed cycles.
733 */
734static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
735{
736 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
737
 738 /* If the offset is smaller than a shifted interval, do nothing */
739 if (offset < timekeeper.cycle_interval<<shift)
740 return offset;
741
742 /* Accumulate one shifted interval */
743 offset -= timekeeper.cycle_interval << shift;
744 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
745
746 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
747 while (timekeeper.xtime_nsec >= nsecps) {
748 timekeeper.xtime_nsec -= nsecps;
749 xtime.tv_sec++;
750 second_overflow();
751 }
752
753 /* Accumulate into raw time */
 754 raw_time.tv_nsec += timekeeper.raw_interval << shift;
755 while (raw_time.tv_nsec >= NSEC_PER_SEC) {
756 raw_time.tv_nsec -= NSEC_PER_SEC;
757 raw_time.tv_sec++;
758 }
759
760 /* Accumulate error between NTP and clock interval */
761 timekeeper.ntp_error += tick_length << shift;
762 timekeeper.ntp_error -= timekeeper.xtime_interval <<
763 (timekeeper.ntp_error_shift + shift);
764
765 return offset;
766}
767
768/**
725 * update_wall_time - Uses the current clocksource to increment the wall time 769 * update_wall_time - Uses the current clocksource to increment the wall time
726 * 770 *
727 * Called from the timer interrupt, must hold a write on xtime_lock. 771 * Called from the timer interrupt, must hold a write on xtime_lock.
@@ -730,7 +774,7 @@ void update_wall_time(void)
730{ 774{
731 struct clocksource *clock; 775 struct clocksource *clock;
732 cycle_t offset; 776 cycle_t offset;
733 u64 nsecs; 777 int shift = 0, maxshift;
734 778
735 /* Make sure we're fully resumed: */ 779 /* Make sure we're fully resumed: */
736 if (unlikely(timekeeping_suspended)) 780 if (unlikely(timekeeping_suspended))
@@ -744,33 +788,22 @@ void update_wall_time(void)
744#endif 788#endif
745 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 789 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
746 790
747 /* normally this loop will run just once, however in the 791 /*
748 * case of lost or late ticks, it will accumulate correctly. 792 * With NO_HZ we may have to accumulate many cycle_intervals
793 * (think "ticks") worth of time at once. To do this efficiently,
794 * we calculate the largest doubling multiple of cycle_intervals
 795 * that is smaller than the offset. We then accumulate that
796 * chunk in one go, and then try to consume the next smaller
797 * doubled multiple.
749 */ 798 */
799 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
800 shift = max(0, shift);
 801 /* Bound shift to one less than what overflows tick_length */
802 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
803 shift = min(shift, maxshift);
750 while (offset >= timekeeper.cycle_interval) { 804 while (offset >= timekeeper.cycle_interval) {
751 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 805 offset = logarithmic_accumulation(offset, shift);
752 806 shift--;
753 /* accumulate one interval */
754 offset -= timekeeper.cycle_interval;
755 clock->cycle_last += timekeeper.cycle_interval;
756
757 timekeeper.xtime_nsec += timekeeper.xtime_interval;
758 if (timekeeper.xtime_nsec >= nsecps) {
759 timekeeper.xtime_nsec -= nsecps;
760 xtime.tv_sec++;
761 second_overflow();
762 }
763
764 raw_time.tv_nsec += timekeeper.raw_interval;
765 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
766 raw_time.tv_nsec -= NSEC_PER_SEC;
767 raw_time.tv_sec++;
768 }
769
770 /* accumulate error between NTP and clock interval */
771 timekeeper.ntp_error += tick_length;
772 timekeeper.ntp_error -= timekeeper.xtime_interval <<
773 timekeeper.ntp_error_shift;
774 } 807 }
775 808
776 /* correct the clock when NTP error is too big */ 809 /* correct the clock when NTP error is too big */
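The rewritten loop above, together with logarithmic_accumulation(), consumes the idle offset in power-of-two multiples of cycle_interval: the starting shift is the largest doubling that does not exceed the offset (bounded so that tick_length << shift cannot overflow), and each pass halves the chunk, so N accumulated intervals cost O(log N) iterations instead of N. A user-space sketch of just the chunking; the interval and offset values are made up for the example:

#include <stdio.h>
#include <stdint.h>

static int ilog2_u64(uint64_t v)
{
        int r = -1;
        while (v) {
                v >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        uint64_t cycle_interval = 1000;         /* cycles per tick (made up) */
        uint64_t offset = 1234567;              /* cycles slept while idle */
        uint64_t accumulated = 0;
        int shift, passes = 0;

        shift = ilog2_u64(offset) - ilog2_u64(cycle_interval);
        if (shift < 0)
                shift = 0;

        while (offset >= cycle_interval) {
                /* Consume one interval << shift if it fits, as
                 * logarithmic_accumulation() does, then halve the chunk. */
                if (offset >= (cycle_interval << shift)) {
                        offset -= cycle_interval << shift;
                        accumulated += cycle_interval << shift;
                }
                if (shift > 0)
                        shift--;
                passes++;
        }
        printf("accumulated %llu cycles in %d passes, %llu left over\n",
               (unsigned long long)accumulated, passes,
               (unsigned long long)offset);
        return 0;
}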
@@ -806,11 +839,8 @@ void update_wall_time(void)
806 timekeeper.ntp_error += timekeeper.xtime_nsec << 839 timekeeper.ntp_error += timekeeper.xtime_nsec <<
807 timekeeper.ntp_error_shift; 840 timekeeper.ntp_error_shift;
808 841
809 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
810 update_xtime_cache(nsecs);
811
812 /* check to see if there is a new clocksource to use */ 842 /* check to see if there is a new clocksource to use */
813 update_vsyscall(&xtime, timekeeper.clock); 843 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
814} 844}
815 845
816/** 846/**
@@ -845,13 +875,13 @@ void monotonic_to_bootbased(struct timespec *ts)
845 875
846unsigned long get_seconds(void) 876unsigned long get_seconds(void)
847{ 877{
848 return xtime_cache.tv_sec; 878 return xtime.tv_sec;
849} 879}
850EXPORT_SYMBOL(get_seconds); 880EXPORT_SYMBOL(get_seconds);
851 881
852struct timespec __current_kernel_time(void) 882struct timespec __current_kernel_time(void)
853{ 883{
854 return xtime_cache; 884 return xtime;
855} 885}
856 886
857struct timespec current_kernel_time(void) 887struct timespec current_kernel_time(void)
@@ -861,8 +891,7 @@ struct timespec current_kernel_time(void)
861 891
862 do { 892 do {
863 seq = read_seqbegin(&xtime_lock); 893 seq = read_seqbegin(&xtime_lock);
864 894 now = xtime;
865 now = xtime_cache;
866 } while (read_seqretry(&xtime_lock, seq)); 895 } while (read_seqretry(&xtime_lock, seq));
867 896
868 return now; 897 return now;
@@ -876,8 +905,7 @@ struct timespec get_monotonic_coarse(void)
876 905
877 do { 906 do {
878 seq = read_seqbegin(&xtime_lock); 907 seq = read_seqbegin(&xtime_lock);
879 908 now = xtime;
880 now = xtime_cache;
881 mono = wall_to_monotonic; 909 mono = wall_to_monotonic;
882 } while (read_seqretry(&xtime_lock, seq)); 910 } while (read_seqretry(&xtime_lock, seq));
883 911
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index fddd69d16e03..28265636b6c2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = base->first;
90 /* 90 /*
@@ -100,13 +100,13 @@ next_one:
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = rb_entry(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
105 print_timer(m, timer, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
106 next++; 106 next++;
107 goto next_one; 107 goto next_one;
108 } 108 }
109 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 109 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
110} 110}
111 111
112static void 112static void
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
204 return; 207 return;
205 } 208 }
206 SEQ_printf(m, "%s\n", dev->name); 209 SEQ_printf(m, "%s\n", dev->name);
207 SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); 210 SEQ_printf(m, " max_delta_ns: %llu\n",
208 SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); 211 (unsigned long long) dev->max_delta_ns);
209 SEQ_printf(m, " mult: %lu\n", dev->mult); 212 SEQ_printf(m, " min_delta_ns: %llu\n",
210 SEQ_printf(m, " shift: %d\n", dev->shift); 213 (unsigned long long) dev->min_delta_ns);
214 SEQ_printf(m, " mult: %u\n", dev->mult);
215 SEQ_printf(m, " shift: %u\n", dev->shift);
211 SEQ_printf(m, " mode: %d\n", dev->mode); 216 SEQ_printf(m, " mode: %d\n", dev->mode);
212 SEQ_printf(m, " next_event: %Ld nsecs\n", 217 SEQ_printf(m, " next_event: %Ld nsecs\n",
213 (unsigned long long) ktime_to_ns(dev->next_event)); 218 (unsigned long long) ktime_to_ns(dev->next_event));
@@ -252,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
252 u64 now = ktime_to_ns(ktime_get()); 257 u64 now = ktime_to_ns(ktime_get());
253 int cpu; 258 int cpu;
254 259
255 SEQ_printf(m, "Timer List Version: v0.4\n"); 260 SEQ_printf(m, "Timer List Version: v0.5\n");
256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
258 263
@@ -275,7 +280,7 @@ static int timer_list_open(struct inode *inode, struct file *filp)
275 return single_open(filp, timer_list_show, NULL); 280 return single_open(filp, timer_list_show, NULL);
276} 281}
277 282
278static struct file_operations timer_list_fops = { 283static const struct file_operations timer_list_fops = {
279 .open = timer_list_open, 284 .open = timer_list_open,
280 .read = seq_read, 285 .read = seq_read,
281 .llseek = seq_lseek, 286 .llseek = seq_lseek,
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 4cde8b9c716f..2f3b585b8d7d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock);
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
88 */ 88 */
89static DEFINE_PER_CPU(spinlock_t, lookup_lock); 89static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
90 90
91/* 91/*
92 * Mutex to serialize state changes with show-stats activities: 92 * Mutex to serialize state changes with show-stats activities:
@@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
238 /* 238 /*
 239 * It doesn't matter which lock we take: 239 * It doesn't matter which lock we take:
240 */ 240 */
241 spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!timer_stats_active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
249 249
250 input.timer = timer; 250 input.timer = timer;
251 input.start_func = startf; 251 input.start_func = startf;
@@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
253 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 raw_spin_lock_irqsave(lock, flags);
257 if (!timer_stats_active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
@@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
264 atomic_inc(&overflow_count); 264 atomic_inc(&overflow_count);
265 265
266 out_unlock: 266 out_unlock:
267 spin_unlock_irqrestore(lock, flags); 267 raw_spin_unlock_irqrestore(lock, flags);
268} 268}
269 269
270static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
@@ -348,9 +348,11 @@ static void sync_access(void)
348 int cpu; 348 int cpu;
349 349
350 for_each_online_cpu(cpu) { 350 for_each_online_cpu(cpu) {
351 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); 351 raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
352
353 raw_spin_lock_irqsave(lock, flags);
352 /* nothing */ 354 /* nothing */
353 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); 355 raw_spin_unlock_irqrestore(lock, flags);
354 } 356 }
355} 357}
356 358
@@ -395,7 +397,7 @@ static int tstats_open(struct inode *inode, struct file *filp)
395 return single_open(filp, tstats_show, NULL); 397 return single_open(filp, tstats_show, NULL);
396} 398}
397 399
398static struct file_operations tstats_fops = { 400static const struct file_operations tstats_fops = {
399 .open = tstats_open, 401 .open = tstats_open,
400 .read = seq_read, 402 .read = seq_read,
401 .write = tstats_write, 403 .write = tstats_write,
@@ -408,7 +410,7 @@ void __init init_timer_stats(void)
408 int cpu; 410 int cpu;
409 411
410 for_each_possible_cpu(cpu) 412 for_each_possible_cpu(cpu)
411 spin_lock_init(&per_cpu(lookup_lock, cpu)); 413 raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
412} 414}
413 415
414static int __init init_tstats_procfs(void) 416static int __init init_tstats_procfs(void)
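
The timer_stats hunks above keep one lookup lock per CPU so that concurrent updates to the shared table rarely contend. A rough userspace analogue of that pattern, using POSIX spinlocks instead of the kernel's per_cpu()/raw_spinlock_t machinery (illustration only, build with -lpthread):

/*
 * One spinlock per "CPU" slot, chosen by the caller's CPU index, so that
 * concurrent updates rarely contend.  Not kernel code.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

static pthread_spinlock_t lookup_lock[NR_CPUS];
static unsigned long hits[NR_CPUS];

static void init_locks(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_spin_init(&lookup_lock[cpu], PTHREAD_PROCESS_PRIVATE);
}

static void record_hit(int cpu)
{
	pthread_spinlock_t *lock = &lookup_lock[cpu];	/* "any lock will do" */

	pthread_spin_lock(lock);
	hits[cpu]++;					/* update under the lock */
	pthread_spin_unlock(lock);
}

int main(void)
{
	init_locks();
	record_hit(0);
	record_hit(1);
	printf("cpu0=%lu cpu1=%lu\n", hits[0], hits[1]);
	return 0;
}
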
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b416512ad17f..d006554888dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -339,6 +339,27 @@ config POWER_TRACER
339 power management decisions, specifically the C-state and P-state 339 power management decisions, specifically the C-state and P-state
340 behavior. 340 behavior.
341 341
342config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT
345 select TRACING
346 help
347 This tracer helps find read and write operations on any given kernel
 348 symbol, i.e. any symbol listed in /proc/kallsyms.
349
350config PROFILE_KSYM_TRACER
351 bool "Profile all kernel memory accesses on 'watched' variables"
352 depends on KSYM_TRACER
353 help
354 This tracer profiles kernel accesses on variables watched through the
355 ksym tracer ftrace plugin. Depending upon the hardware, all read
356 and write operations on kernel variables can be monitored for
357 accesses.
358
359 The results will be displayed in:
360 /debugfs/tracing/profile_ksym
361
362 Say N if unsure.
342 363
343config STACK_TRACER 364config STACK_TRACER
344 bool "Trace max stack" 365 bool "Trace max stack"
@@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE
428 449
429 If unsure, say N. 450 If unsure, say N.
430 451
452config KPROBE_EVENT
453 depends on KPROBES
454 depends on X86
455 bool "Enable kprobes-based dynamic events"
456 select TRACING
457 default y
458 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt
461 for more details.
462
463 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values.
465
 466 This option is also required by the perf-probe subcommand of perf tools. If
467 you want to use perf tools, this option is strongly recommended.
468
431config DYNAMIC_FTRACE 469config DYNAMIC_FTRACE
432 bool "enable/disable ftrace tracepoints dynamically" 470 bool "enable/disable ftrace tracepoints dynamically"
433 depends on FUNCTION_TRACER 471 depends on FUNCTION_TRACER
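
The KPROBE_EVENT help above refers to adding events on the fly through the ftrace interface. A minimal userspace sketch of that workflow follows; the mount point /sys/kernel/debug, the probe name "myprobe" and the target symbol do_sys_open are assumptions taken from Documentation/trace/kprobetrace.txt, not guarantees for any particular kernel:

/*
 * Minimal sketch of driving the kprobe_events interface mentioned above.
 * Assumes debugfs is mounted at /sys/kernel/debug and that "do_sys_open"
 * exists on the running kernel; see Documentation/trace/kprobetrace.txt
 * for the authoritative syntax.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/kprobe_events";
	const char *probe = "p:myprobe do_sys_open\n";	/* hypothetical event name */
	int fd = open(path, O_WRONLY | O_APPEND);	/* append: do not clear existing probes */

	if (fd < 0) {
		perror("open kprobe_events");
		return 1;
	}
	if (write(fd, probe, strlen(probe)) < 0)
		perror("write probe definition");
	close(fd);
	/* The new event then shows up under events/kprobes/myprobe/ in tracing. */
	return 0;
}
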
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 26f03ac07c2b..cd9ecd89ec77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o 58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 59
58libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3eb159c277c8..d9d6206e0b14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -856,6 +856,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
856} 856}
857 857
858/** 858/**
859 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
860 * @q: queue the io is for
861 * @rq: the source request
862 * @dev: target device
863 * @from: source sector
864 *
865 * Description:
 866 * Device mapper remaps requests to other devices.
867 * Add a trace for that action.
868 *
869 **/
870static void blk_add_trace_rq_remap(struct request_queue *q,
871 struct request *rq, dev_t dev,
872 sector_t from)
873{
874 struct blk_trace *bt = q->blk_trace;
875 struct blk_io_trace_remap r;
876
877 if (likely(!bt))
878 return;
879
880 r.device_from = cpu_to_be32(dev);
881 r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
882 r.sector_from = cpu_to_be64(from);
883
884 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
885 rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
886 sizeof(r), &r);
887}
888
889/**
859 * blk_add_driver_data - Add binary message with driver-specific data 890 * blk_add_driver_data - Add binary message with driver-specific data
860 * @q: queue the io is for 891 * @q: queue the io is for
861 * @rq: io request 892 * @rq: io request
@@ -922,10 +953,13 @@ static void blk_register_tracepoints(void)
922 WARN_ON(ret); 953 WARN_ON(ret);
923 ret = register_trace_block_remap(blk_add_trace_remap); 954 ret = register_trace_block_remap(blk_add_trace_remap);
924 WARN_ON(ret); 955 WARN_ON(ret);
956 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
957 WARN_ON(ret);
925} 958}
926 959
927static void blk_unregister_tracepoints(void) 960static void blk_unregister_tracepoints(void)
928{ 961{
962 unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
929 unregister_trace_block_remap(blk_add_trace_remap); 963 unregister_trace_block_remap(blk_add_trace_remap);
930 unregister_trace_block_split(blk_add_trace_split); 964 unregister_trace_block_split(blk_add_trace_split);
931 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 965 unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
@@ -1657,6 +1691,11 @@ int blk_trace_init_sysfs(struct device *dev)
1657 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); 1691 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1658} 1692}
1659 1693
1694void blk_trace_remove_sysfs(struct device *dev)
1695{
1696 sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
1697}
1698
1660#endif /* CONFIG_BLK_DEV_IO_TRACE */ 1699#endif /* CONFIG_BLK_DEV_IO_TRACE */
1661 1700
1662#ifdef CONFIG_EVENT_TRACING 1701#ifdef CONFIG_EVENT_TRACING
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 23df7771c937..e51a1bcb7bed 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -60,6 +60,13 @@ static int last_ftrace_enabled;
60/* Quick disabling of function tracer. */ 60/* Quick disabling of function tracer. */
61int function_trace_stop; 61int function_trace_stop;
62 62
63/* List for set_ftrace_pid's pids. */
64LIST_HEAD(ftrace_pids);
65struct ftrace_pid {
66 struct list_head list;
67 struct pid *pid;
68};
69
63/* 70/*
64 * ftrace_disabled is set when an anomaly is discovered. 71 * ftrace_disabled is set when an anomaly is discovered.
65 * ftrace_disabled is much stronger than ftrace_enabled. 72 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
78ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
79ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
80 87
88#ifdef CONFIG_FUNCTION_GRAPH_TRACER
89static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
90#endif
91
81static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 92static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
82{ 93{
83 struct ftrace_ops *op = ftrace_list; 94 struct ftrace_ops *op = ftrace_list;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 else 166 else
156 func = ftrace_list_func; 167 func = ftrace_list_func;
157 168
158 if (ftrace_pid_trace) { 169 if (!list_empty(&ftrace_pids)) {
159 set_ftrace_pid_function(func); 170 set_ftrace_pid_function(func);
160 func = ftrace_pid_func; 171 func = ftrace_pid_func;
161 } 172 }
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
203 if (ftrace_list->next == &ftrace_list_end) { 214 if (ftrace_list->next == &ftrace_list_end) {
204 ftrace_func_t func = ftrace_list->func; 215 ftrace_func_t func = ftrace_list->func;
205 216
206 if (ftrace_pid_trace) { 217 if (!list_empty(&ftrace_pids)) {
207 set_ftrace_pid_function(func); 218 set_ftrace_pid_function(func);
208 func = ftrace_pid_func; 219 func = ftrace_pid_func;
209 } 220 }
@@ -225,9 +236,13 @@ static void ftrace_update_pid_func(void)
225 if (ftrace_trace_function == ftrace_stub) 236 if (ftrace_trace_function == ftrace_stub)
226 return; 237 return;
227 238
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
228 func = ftrace_trace_function; 240 func = ftrace_trace_function;
241#else
242 func = __ftrace_trace_function;
243#endif
229 244
230 if (ftrace_pid_trace) { 245 if (!list_empty(&ftrace_pids)) {
231 set_ftrace_pid_function(func); 246 set_ftrace_pid_function(func);
232 func = ftrace_pid_func; 247 func = ftrace_pid_func;
233 } else { 248 } else {
@@ -736,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
736 out: 751 out:
737 mutex_unlock(&ftrace_profile_lock); 752 mutex_unlock(&ftrace_profile_lock);
738 753
739 filp->f_pos += cnt; 754 *ppos += cnt;
740 755
741 return cnt; 756 return cnt;
742} 757}
@@ -817,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
817} 832}
818#endif /* CONFIG_FUNCTION_PROFILER */ 833#endif /* CONFIG_FUNCTION_PROFILER */
819 834
820/* set when tracing only a pid */
821struct pid *ftrace_pid_trace;
822static struct pid * const ftrace_swapper_pid = &init_struct_pid; 835static struct pid * const ftrace_swapper_pid = &init_struct_pid;
823 836
824#ifdef CONFIG_DYNAMIC_FTRACE 837#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1074,14 +1087,9 @@ static void ftrace_replace_code(int enable)
1074 failed = __ftrace_replace_code(rec, enable); 1087 failed = __ftrace_replace_code(rec, enable);
1075 if (failed) { 1088 if (failed) {
1076 rec->flags |= FTRACE_FL_FAILED; 1089 rec->flags |= FTRACE_FL_FAILED;
1077 if ((system_state == SYSTEM_BOOTING) || 1090 ftrace_bug(failed, rec->ip);
1078 !core_kernel_text(rec->ip)) { 1091 /* Stop processing */
1079 ftrace_free_rec(rec); 1092 return;
1080 } else {
1081 ftrace_bug(failed, rec->ip);
1082 /* Stop processing */
1083 return;
1084 }
1085 } 1093 }
1086 } while_for_each_ftrace_rec(); 1094 } while_for_each_ftrace_rec();
1087} 1095}
@@ -1262,12 +1270,34 @@ static int ftrace_update_code(struct module *mod)
1262 ftrace_new_addrs = p->newlist; 1270 ftrace_new_addrs = p->newlist;
1263 p->flags = 0L; 1271 p->flags = 0L;
1264 1272
1265 /* convert record (i.e, patch mcount-call with NOP) */ 1273 /*
1266 /* convert record (i.e, patch mcount-call with NOP) */ 1273 /*
1266 if (ftrace_code_disable(mod, p)) { 1274 * Do the initial record conversion from mcount jump
1267 p->flags |= FTRACE_FL_CONVERTED; 1275 * to the NOP instructions.
1268 ftrace_update_cnt++; 1276 */
1269 } else 1277 if (!ftrace_code_disable(mod, p)) {
1270 ftrace_free_rec(p); 1278 ftrace_free_rec(p);
1279 continue;
1280 }
1281
1282 p->flags |= FTRACE_FL_CONVERTED;
1283 ftrace_update_cnt++;
1284
1285 /*
1286 * If the tracing is enabled, go ahead and enable the record.
1287 *
1288 * The reason not to enable the record immediately is the
1289 * inherent check of ftrace_make_nop/ftrace_make_call for
1290 * correct previous instructions. Doing the NOP
1291 * conversion first puts the module into the correct state, thus
1292 * passing the ftrace_make_call check.
1293 */
1294 if (ftrace_start_up) {
1295 int failed = __ftrace_replace_code(p, 1);
1296 if (failed) {
1297 ftrace_bug(failed, p->ip);
1298 ftrace_free_rec(p);
1299 }
1300 }
1271 } 1301 }
1272 1302
1273 stop = ftrace_now(raw_smp_processor_id()); 1303 stop = ftrace_now(raw_smp_processor_id());
@@ -1621,8 +1651,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1621 if (!ret) { 1651 if (!ret) {
1622 struct seq_file *m = file->private_data; 1652 struct seq_file *m = file->private_data;
1623 m->private = iter; 1653 m->private = iter;
1624 } else 1654 } else {
1655 trace_parser_put(&iter->parser);
1625 kfree(iter); 1656 kfree(iter);
1657 }
1626 } else 1658 } else
1627 file->private_data = iter; 1659 file->private_data = iter;
1628 mutex_unlock(&ftrace_regex_lock); 1660 mutex_unlock(&ftrace_regex_lock);
@@ -1655,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1655 return ret; 1687 return ret;
1656} 1688}
1657 1689
1658enum {
1659 MATCH_FULL,
1660 MATCH_FRONT_ONLY,
1661 MATCH_MIDDLE_ONLY,
1662 MATCH_END_ONLY,
1663};
1664
1665/*
1666 * (static function - no need for kernel doc)
1667 *
1668 * Pass in a buffer containing a glob and this function will
1669 * set search to point to the search part of the buffer and
1670 * return the type of search it is (see enum above).
1671 * This does modify buff.
1672 *
1673 * Returns enum type.
1674 * search returns the pointer to use for comparison.
1675 * not returns 1 if buff started with a '!'
1676 * 0 otherwise.
1677 */
1678static int
1679ftrace_setup_glob(char *buff, int len, char **search, int *not)
1680{
1681 int type = MATCH_FULL;
1682 int i;
1683
1684 if (buff[0] == '!') {
1685 *not = 1;
1686 buff++;
1687 len--;
1688 } else
1689 *not = 0;
1690
1691 *search = buff;
1692
1693 for (i = 0; i < len; i++) {
1694 if (buff[i] == '*') {
1695 if (!i) {
1696 *search = buff + 1;
1697 type = MATCH_END_ONLY;
1698 } else {
1699 if (type == MATCH_END_ONLY)
1700 type = MATCH_MIDDLE_ONLY;
1701 else
1702 type = MATCH_FRONT_ONLY;
1703 buff[i] = 0;
1704 break;
1705 }
1706 }
1707 }
1708
1709 return type;
1710}
1711
1712static int ftrace_match(char *str, char *regex, int len, int type) 1690static int ftrace_match(char *str, char *regex, int len, int type)
1713{ 1691{
1714 int matched = 0; 1692 int matched = 0;
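
The parser removed above classifies a glob into four match types before filter_parse_regex() takes over that job. A standalone sketch of how a caller can apply each type with standard C string functions (not the kernel's ftrace_match(), just the idea):

/*
 * Applying the four glob match types produced by the (removed) parser
 * above; the enum mirrors the one deleted in this hunk.
 */
#include <stdio.h>
#include <string.h>

enum { MATCH_FULL, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY };

static int glob_match(const char *str, const char *search, int len, int type)
{
	int slen;

	switch (type) {
	case MATCH_FULL:			/* pattern had no '*'   */
		return strcmp(str, search) == 0;
	case MATCH_FRONT_ONLY:			/* pattern was "foo*"   */
		return strncmp(str, search, len) == 0;
	case MATCH_MIDDLE_ONLY:			/* pattern was "*foo*"  */
		return strstr(str, search) != NULL;
	case MATCH_END_ONLY:			/* pattern was "*foo"   */
		slen = (int)strlen(str);
		return slen >= len && strcmp(str + slen - len, search) == 0;
	}
	return 0;
}

int main(void)
{
	/* "sched*" -> front-only match on "sched" */
	printf("%d\n", glob_match("schedule_timeout", "sched", 5, MATCH_FRONT_ONLY));
	/* "*lock" -> end-only match on "lock" */
	printf("%d\n", glob_match("spin_lock", "lock", 4, MATCH_END_ONLY));
	return 0;
}
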
@@ -1757,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1757 int not; 1735 int not;
1758 1736
1759 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1737 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1760 type = ftrace_setup_glob(buff, len, &search, &not); 1738 type = filter_parse_regex(buff, len, &search, &not);
1761 1739
1762 search_len = strlen(search); 1740 search_len = strlen(search);
1763 1741
@@ -1825,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1825 } 1803 }
1826 1804
1827 if (strlen(buff)) { 1805 if (strlen(buff)) {
1828 type = ftrace_setup_glob(buff, strlen(buff), &search, &not); 1806 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1829 search_len = strlen(search); 1807 search_len = strlen(search);
1830 } 1808 }
1831 1809
@@ -1990,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1990 int count = 0; 1968 int count = 0;
1991 char *search; 1969 char *search;
1992 1970
1993 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 1971 type = filter_parse_regex(glob, strlen(glob), &search, &not);
1994 len = strlen(search); 1972 len = strlen(search);
1995 1973
1996 /* we do not support '!' for function probes */ 1974 /* we do not support '!' for function probes */
@@ -2067,7 +2045,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2067 else if (glob) { 2045 else if (glob) {
2068 int not; 2046 int not;
2069 2047
2070 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2048 type = filter_parse_regex(glob, strlen(glob), &search, &not);
2071 len = strlen(search); 2049 len = strlen(search);
2072 2050
2073 /* we do not support '!' for function probes */ 2051 /* we do not support '!' for function probes */
@@ -2202,7 +2180,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2202 struct trace_parser *parser; 2180 struct trace_parser *parser;
2203 ssize_t ret, read; 2181 ssize_t ret, read;
2204 2182
2205 if (!cnt || cnt < 0) 2183 if (!cnt)
2206 return 0; 2184 return 0;
2207 2185
2208 mutex_lock(&ftrace_regex_lock); 2186 mutex_lock(&ftrace_regex_lock);
@@ -2216,20 +2194,20 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2216 parser = &iter->parser; 2194 parser = &iter->parser;
2217 read = trace_get_user(parser, ubuf, cnt, ppos); 2195 read = trace_get_user(parser, ubuf, cnt, ppos);
2218 2196
2219 if (trace_parser_loaded(parser) && 2197 if (read >= 0 && trace_parser_loaded(parser) &&
2220 !trace_parser_cont(parser)) { 2198 !trace_parser_cont(parser)) {
2221 ret = ftrace_process_regex(parser->buffer, 2199 ret = ftrace_process_regex(parser->buffer,
2222 parser->idx, enable); 2200 parser->idx, enable);
2223 if (ret) 2201 if (ret)
2224 goto out; 2202 goto out_unlock;
2225 2203
2226 trace_parser_clear(parser); 2204 trace_parser_clear(parser);
2227 } 2205 }
2228 2206
2229 ret = read; 2207 ret = read;
2230 2208out_unlock:
2231 mutex_unlock(&ftrace_regex_lock); 2209 mutex_unlock(&ftrace_regex_lock);
2232out: 2210
2233 return ret; 2211 return ret;
2234} 2212}
2235 2213
@@ -2311,6 +2289,32 @@ static int __init set_ftrace_filter(char *str)
2311} 2289}
2312__setup("ftrace_filter=", set_ftrace_filter); 2290__setup("ftrace_filter=", set_ftrace_filter);
2313 2291
2292#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2293static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2294static int __init set_graph_function(char *str)
2295{
2296 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
2297 return 1;
2298}
2299__setup("ftrace_graph_filter=", set_graph_function);
2300
2301static void __init set_ftrace_early_graph(char *buf)
2302{
2303 int ret;
2304 char *func;
2305
2306 while (buf) {
2307 func = strsep(&buf, ",");
2308 /* we allow only one expression at a time */
2309 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2310 func);
2311 if (ret)
2312 printk(KERN_DEBUG "ftrace: function %s not "
2313 "traceable\n", func);
2314 }
2315}
2316#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2317
2314static void __init set_ftrace_early_filter(char *buf, int enable) 2318static void __init set_ftrace_early_filter(char *buf, int enable)
2315{ 2319{
2316 char *func; 2320 char *func;
@@ -2327,6 +2331,10 @@ static void __init set_ftrace_early_filters(void)
2327 set_ftrace_early_filter(ftrace_filter_buf, 1); 2331 set_ftrace_early_filter(ftrace_filter_buf, 1);
2328 if (ftrace_notrace_buf[0]) 2332 if (ftrace_notrace_buf[0])
2329 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2333 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2334#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2335 if (ftrace_graph_buf[0])
2336 set_ftrace_early_graph(ftrace_graph_buf);
2337#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2330} 2338}
2331 2339
2332static int 2340static int
@@ -2512,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2512 return -ENODEV; 2520 return -ENODEV;
2513 2521
2514 /* decode regex */ 2522 /* decode regex */
2515 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not); 2523 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2516 if (not) 2524 if (not)
2517 return -EINVAL; 2525 return -EINVAL;
2518 2526
@@ -2552,8 +2560,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2552 size_t cnt, loff_t *ppos) 2560 size_t cnt, loff_t *ppos)
2553{ 2561{
2554 struct trace_parser parser; 2562 struct trace_parser parser;
2555 size_t read = 0; 2563 ssize_t read, ret;
2556 ssize_t ret;
2557 2564
2558 if (!cnt || cnt < 0) 2565 if (!cnt || cnt < 0)
2559 return 0; 2566 return 0;
@@ -2562,29 +2569,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2562 2569
2563 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { 2570 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2564 ret = -EBUSY; 2571 ret = -EBUSY;
2565 goto out; 2572 goto out_unlock;
2566 } 2573 }
2567 2574
2568 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2575 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2569 ret = -ENOMEM; 2576 ret = -ENOMEM;
2570 goto out; 2577 goto out_unlock;
2571 } 2578 }
2572 2579
2573 read = trace_get_user(&parser, ubuf, cnt, ppos); 2580 read = trace_get_user(&parser, ubuf, cnt, ppos);
2574 2581
2575 if (trace_parser_loaded((&parser))) { 2582 if (read >= 0 && trace_parser_loaded((&parser))) {
2576 parser.buffer[parser.idx] = 0; 2583 parser.buffer[parser.idx] = 0;
2577 2584
2578 /* we allow only one expression at a time */ 2585 /* we allow only one expression at a time */
2579 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 2586 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2580 parser.buffer); 2587 parser.buffer);
2581 if (ret) 2588 if (ret)
2582 goto out; 2589 goto out_free;
2583 } 2590 }
2584 2591
2585 ret = read; 2592 ret = read;
2586 out: 2593
2594out_free:
2587 trace_parser_put(&parser); 2595 trace_parser_put(&parser);
2596out_unlock:
2588 mutex_unlock(&graph_lock); 2597 mutex_unlock(&graph_lock);
2589 2598
2590 return ret; 2599 return ret;
@@ -2622,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2622 return 0; 2631 return 0;
2623} 2632}
2624 2633
2625static int ftrace_convert_nops(struct module *mod, 2634static int ftrace_process_locs(struct module *mod,
2626 unsigned long *start, 2635 unsigned long *start,
2627 unsigned long *end) 2636 unsigned long *end)
2628{ 2637{
@@ -2655,19 +2664,17 @@ static int ftrace_convert_nops(struct module *mod,
2655} 2664}
2656 2665
2657#ifdef CONFIG_MODULES 2666#ifdef CONFIG_MODULES
2658void ftrace_release(void *start, void *end) 2667void ftrace_release_mod(struct module *mod)
2659{ 2668{
2660 struct dyn_ftrace *rec; 2669 struct dyn_ftrace *rec;
2661 struct ftrace_page *pg; 2670 struct ftrace_page *pg;
2662 unsigned long s = (unsigned long)start;
2663 unsigned long e = (unsigned long)end;
2664 2671
2665 if (ftrace_disabled || !start || start == end) 2672 if (ftrace_disabled)
2666 return; 2673 return;
2667 2674
2668 mutex_lock(&ftrace_lock); 2675 mutex_lock(&ftrace_lock);
2669 do_for_each_ftrace_rec(pg, rec) { 2676 do_for_each_ftrace_rec(pg, rec) {
2670 if ((rec->ip >= s) && (rec->ip < e)) { 2677 if (within_module_core(rec->ip, mod)) {
2671 /* 2678 /*
2672 * rec->ip is changed in ftrace_free_rec() 2679 * rec->ip is changed in ftrace_free_rec()
2673 * It should not between s and e if record was freed. 2680 * It should not between s and e if record was freed.
@@ -2684,7 +2691,7 @@ static void ftrace_init_module(struct module *mod,
2684{ 2691{
2685 if (ftrace_disabled || start == end) 2692 if (ftrace_disabled || start == end)
2686 return; 2693 return;
2687 ftrace_convert_nops(mod, start, end); 2694 ftrace_process_locs(mod, start, end);
2688} 2695}
2689 2696
2690static int ftrace_module_notify(struct notifier_block *self, 2697static int ftrace_module_notify(struct notifier_block *self,
@@ -2699,9 +2706,7 @@ static int ftrace_module_notify(struct notifier_block *self,
2699 mod->num_ftrace_callsites); 2706 mod->num_ftrace_callsites);
2700 break; 2707 break;
2701 case MODULE_STATE_GOING: 2708 case MODULE_STATE_GOING:
2702 ftrace_release(mod->ftrace_callsites, 2709 ftrace_release_mod(mod);
2703 mod->ftrace_callsites +
2704 mod->num_ftrace_callsites);
2705 break; 2710 break;
2706 } 2711 }
2707 2712
@@ -2747,7 +2752,7 @@ void __init ftrace_init(void)
2747 2752
2748 last_ftrace_enabled = ftrace_enabled = 1; 2753 last_ftrace_enabled = ftrace_enabled = 1;
2749 2754
2750 ret = ftrace_convert_nops(NULL, 2755 ret = ftrace_process_locs(NULL,
2751 __start_mcount_loc, 2756 __start_mcount_loc,
2752 __stop_mcount_loc); 2757 __stop_mcount_loc);
2753 2758
@@ -2780,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { }
2780# define ftrace_shutdown_sysctl() do { } while (0) 2785# define ftrace_shutdown_sysctl() do { } while (0)
2781#endif /* CONFIG_DYNAMIC_FTRACE */ 2786#endif /* CONFIG_DYNAMIC_FTRACE */
2782 2787
2783static ssize_t
2784ftrace_pid_read(struct file *file, char __user *ubuf,
2785 size_t cnt, loff_t *ppos)
2786{
2787 char buf[64];
2788 int r;
2789
2790 if (ftrace_pid_trace == ftrace_swapper_pid)
2791 r = sprintf(buf, "swapper tasks\n");
2792 else if (ftrace_pid_trace)
2793 r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
2794 else
2795 r = sprintf(buf, "no pid\n");
2796
2797 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2798}
2799
2800static void clear_ftrace_swapper(void) 2788static void clear_ftrace_swapper(void)
2801{ 2789{
2802 struct task_struct *p; 2790 struct task_struct *p;
@@ -2847,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid)
2847 rcu_read_unlock(); 2835 rcu_read_unlock();
2848} 2836}
2849 2837
2850static void clear_ftrace_pid_task(struct pid **pid) 2838static void clear_ftrace_pid_task(struct pid *pid)
2851{ 2839{
2852 if (*pid == ftrace_swapper_pid) 2840 if (pid == ftrace_swapper_pid)
2853 clear_ftrace_swapper(); 2841 clear_ftrace_swapper();
2854 else 2842 else
2855 clear_ftrace_pid(*pid); 2843 clear_ftrace_pid(pid);
2856
2857 *pid = NULL;
2858} 2844}
2859 2845
2860static void set_ftrace_pid_task(struct pid *pid) 2846static void set_ftrace_pid_task(struct pid *pid)
@@ -2865,74 +2851,184 @@ static void set_ftrace_pid_task(struct pid *pid)
2865 set_ftrace_pid(pid); 2851 set_ftrace_pid(pid);
2866} 2852}
2867 2853
2868static ssize_t 2854static int ftrace_pid_add(int p)
2869ftrace_pid_write(struct file *filp, const char __user *ubuf,
2870 size_t cnt, loff_t *ppos)
2871{ 2855{
2872 struct pid *pid; 2856 struct pid *pid;
2873 char buf[64]; 2857 struct ftrace_pid *fpid;
2874 long val; 2858 int ret = -EINVAL;
2875 int ret;
2876 2859
2877 if (cnt >= sizeof(buf)) 2860 mutex_lock(&ftrace_lock);
2878 return -EINVAL;
2879 2861
2880 if (copy_from_user(&buf, ubuf, cnt)) 2862 if (!p)
2881 return -EFAULT; 2863 pid = ftrace_swapper_pid;
2864 else
2865 pid = find_get_pid(p);
2882 2866
2883 buf[cnt] = 0; 2867 if (!pid)
2868 goto out;
2884 2869
2885 ret = strict_strtol(buf, 10, &val); 2870 ret = 0;
2886 if (ret < 0)
2887 return ret;
2888 2871
2889 mutex_lock(&ftrace_lock); 2872 list_for_each_entry(fpid, &ftrace_pids, list)
2890 if (val < 0) { 2873 if (fpid->pid == pid)
2891 /* disable pid tracing */ 2874 goto out_put;
2892 if (!ftrace_pid_trace)
2893 goto out;
2894 2875
2895 clear_ftrace_pid_task(&ftrace_pid_trace); 2876 ret = -ENOMEM;
2896 2877
2897 } else { 2878 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
2898 /* swapper task is special */ 2879 if (!fpid)
2899 if (!val) { 2880 goto out_put;
2900 pid = ftrace_swapper_pid;
2901 if (pid == ftrace_pid_trace)
2902 goto out;
2903 } else {
2904 pid = find_get_pid(val);
2905 2881
2906 if (pid == ftrace_pid_trace) { 2882 list_add(&fpid->list, &ftrace_pids);
2907 put_pid(pid); 2883 fpid->pid = pid;
2908 goto out;
2909 }
2910 }
2911 2884
2912 if (ftrace_pid_trace) 2885 set_ftrace_pid_task(pid);
2913 clear_ftrace_pid_task(&ftrace_pid_trace);
2914 2886
2915 if (!pid) 2887 ftrace_update_pid_func();
2916 goto out; 2888 ftrace_startup_enable(0);
2889
2890 mutex_unlock(&ftrace_lock);
2891 return 0;
2892
2893out_put:
2894 if (pid != ftrace_swapper_pid)
2895 put_pid(pid);
2896
2897out:
2898 mutex_unlock(&ftrace_lock);
2899 return ret;
2900}
2901
2902static void ftrace_pid_reset(void)
2903{
2904 struct ftrace_pid *fpid, *safe;
2905
2906 mutex_lock(&ftrace_lock);
2907 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
2908 struct pid *pid = fpid->pid;
2917 2909
2918 ftrace_pid_trace = pid; 2910 clear_ftrace_pid_task(pid);
2919 2911
2920 set_ftrace_pid_task(ftrace_pid_trace); 2912 list_del(&fpid->list);
2913 kfree(fpid);
2921 } 2914 }
2922 2915
2923 /* update the function call */
2924 ftrace_update_pid_func(); 2916 ftrace_update_pid_func();
2925 ftrace_startup_enable(0); 2917 ftrace_startup_enable(0);
2926 2918
2927 out:
2928 mutex_unlock(&ftrace_lock); 2919 mutex_unlock(&ftrace_lock);
2920}
2929 2921
2930 return cnt; 2922static void *fpid_start(struct seq_file *m, loff_t *pos)
2923{
2924 mutex_lock(&ftrace_lock);
2925
2926 if (list_empty(&ftrace_pids) && (!*pos))
2927 return (void *) 1;
2928
2929 return seq_list_start(&ftrace_pids, *pos);
2930}
2931
2932static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
2933{
2934 if (v == (void *)1)
2935 return NULL;
2936
2937 return seq_list_next(v, &ftrace_pids, pos);
2938}
2939
2940static void fpid_stop(struct seq_file *m, void *p)
2941{
2942 mutex_unlock(&ftrace_lock);
2943}
2944
2945static int fpid_show(struct seq_file *m, void *v)
2946{
2947 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
2948
2949 if (v == (void *)1) {
2950 seq_printf(m, "no pid\n");
2951 return 0;
2952 }
2953
2954 if (fpid->pid == ftrace_swapper_pid)
2955 seq_printf(m, "swapper tasks\n");
2956 else
2957 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
2958
2959 return 0;
2960}
2961
2962static const struct seq_operations ftrace_pid_sops = {
2963 .start = fpid_start,
2964 .next = fpid_next,
2965 .stop = fpid_stop,
2966 .show = fpid_show,
2967};
2968
2969static int
2970ftrace_pid_open(struct inode *inode, struct file *file)
2971{
2972 int ret = 0;
2973
2974 if ((file->f_mode & FMODE_WRITE) &&
2975 (file->f_flags & O_TRUNC))
2976 ftrace_pid_reset();
2977
2978 if (file->f_mode & FMODE_READ)
2979 ret = seq_open(file, &ftrace_pid_sops);
2980
2981 return ret;
2982}
2983
2984static ssize_t
2985ftrace_pid_write(struct file *filp, const char __user *ubuf,
2986 size_t cnt, loff_t *ppos)
2987{
2988 char buf[64], *tmp;
2989 long val;
2990 int ret;
2991
2992 if (cnt >= sizeof(buf))
2993 return -EINVAL;
2994
2995 if (copy_from_user(&buf, ubuf, cnt))
2996 return -EFAULT;
2997
2998 buf[cnt] = 0;
2999
3000 /*
3001 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
3002 * to clear the filter quietly.
3003 */
3004 tmp = strstrip(buf);
3005 if (strlen(tmp) == 0)
3006 return 1;
3007
3008 ret = strict_strtol(tmp, 10, &val);
3009 if (ret < 0)
3010 return ret;
3011
3012 ret = ftrace_pid_add(val);
3013
3014 return ret ? ret : cnt;
3015}
3016
3017static int
3018ftrace_pid_release(struct inode *inode, struct file *file)
3019{
3020 if (file->f_mode & FMODE_READ)
3021 seq_release(inode, file);
3022
3023 return 0;
2931} 3024}
2932 3025
2933static const struct file_operations ftrace_pid_fops = { 3026static const struct file_operations ftrace_pid_fops = {
2934 .read = ftrace_pid_read, 3027 .open = ftrace_pid_open,
2935 .write = ftrace_pid_write, 3028 .write = ftrace_pid_write,
3029 .read = seq_read,
3030 .llseek = seq_lseek,
3031 .release = ftrace_pid_release,
2936}; 3032};
2937 3033
2938static __init int ftrace_init_debugfs(void) 3034static __init int ftrace_init_debugfs(void)
@@ -3015,7 +3111,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3015 3111
3016int 3112int
3017ftrace_enable_sysctl(struct ctl_table *table, int write, 3113ftrace_enable_sysctl(struct ctl_table *table, int write,
3018 struct file *file, void __user *buffer, size_t *lenp, 3114 void __user *buffer, size_t *lenp,
3019 loff_t *ppos) 3115 loff_t *ppos)
3020{ 3116{
3021 int ret; 3117 int ret;
@@ -3025,7 +3121,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3025 3121
3026 mutex_lock(&ftrace_lock); 3122 mutex_lock(&ftrace_lock);
3027 3123
3028 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3124 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3029 3125
3030 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3126 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3031 goto out; 3127 goto out;
@@ -3295,4 +3391,3 @@ void ftrace_graph_stop(void)
3295 ftrace_stop(); 3391 ftrace_stop();
3296} 3392}
3297#endif 3393#endif
3298
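
The ftrace.c changes above replace the single ftrace_pid_trace pointer with a list of ftrace_pid entries: add with a duplicate check, reset by walking and freeing every entry. The same pattern in plain, standalone C (no list_head or struct pid machinery, illustration only):

/*
 * Standalone sketch of the pid-filter-as-a-list pattern introduced above:
 * an empty list means "trace everything", duplicates are ignored on add,
 * and reset frees the whole list (as the O_TRUNC open path does).
 */
#include <stdio.h>
#include <stdlib.h>

struct pid_entry {
	int pid;
	struct pid_entry *next;
};

static struct pid_entry *pid_list;

static int pid_add(int pid)
{
	struct pid_entry *e;

	for (e = pid_list; e; e = e->next)
		if (e->pid == pid)
			return 0;		/* already filtered, nothing to do */

	e = malloc(sizeof(*e));
	if (!e)
		return -1;
	e->pid = pid;
	e->next = pid_list;			/* push onto the list */
	pid_list = e;
	return 0;
}

static void pid_reset(void)
{
	while (pid_list) {			/* free every entry */
		struct pid_entry *e = pid_list;

		pid_list = e->next;
		free(e);
	}
}

int main(void)
{
	pid_add(1234);
	pid_add(1234);				/* duplicate is ignored */
	pid_add(5678);
	for (struct pid_entry *e = pid_list; e; e = e->next)
		printf("filtering pid %d\n", e->pid);
	pid_reset();
	return 0;
}
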
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 81b1645c8549..a91da69f153a 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void)
501 return 1; 501 return 1;
502 } 502 }
503 503
504 if (!register_tracer(&kmem_tracer)) { 504 if (register_tracer(&kmem_tracer) != 0) {
505 pr_warning("Warning: could not register the kmem tracer\n"); 505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1; 506 return 1;
507 } 507 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d4ff01970547..f58c9ad15830 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
397 int ret; 397 int ret;
398 398
399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
400 "offset:0;\tsize:%u;\n", 400 "offset:0;\tsize:%u;\tsigned:%u;\n",
401 (unsigned int)sizeof(field.time_stamp)); 401 (unsigned int)sizeof(field.time_stamp),
402 (unsigned int)is_signed_type(u64));
402 403
403 ret = trace_seq_printf(s, "\tfield: local_t commit;\t" 404 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
404 "offset:%u;\tsize:%u;\n", 405 "offset:%u;\tsize:%u;\tsigned:%u;\n",
405 (unsigned int)offsetof(typeof(field), commit), 406 (unsigned int)offsetof(typeof(field), commit),
406 (unsigned int)sizeof(field.commit)); 407 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long));
407 409
408 ret = trace_seq_printf(s, "\tfield: char data;\t" 410 ret = trace_seq_printf(s, "\tfield: char data;\t"
409 "offset:%u;\tsize:%u;\n", 411 "offset:%u;\tsize:%u;\tsigned:%u;\n",
410 (unsigned int)offsetof(typeof(field), data), 412 (unsigned int)offsetof(typeof(field), data),
411 (unsigned int)BUF_PAGE_SIZE); 413 (unsigned int)BUF_PAGE_SIZE,
414 (unsigned int)is_signed_type(char));
412 415
413 return ret; 416 return ret;
414} 417}
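
The page-header change above reports a signed:%u flag per field via is_signed_type(). One way to test signedness of a type at compile time is to compare a value of that type cast from -1 against zero; a sketch of that idea (not necessarily the kernel's exact macro):

/*
 * (type)-1 is negative only for signed types.  Compilers may warn about
 * an always-false comparison for unsigned types; the result is still
 * correct.  Whether plain char prints 0 or 1 is implementation-defined.
 */
#include <stdio.h>

#define IS_SIGNED_TYPE(type)	(((type)-1) < (type)0)

int main(void)
{
	printf("u64 signed:  %u\n", (unsigned int)IS_SIGNED_TYPE(unsigned long long));
	printf("long signed: %u\n", (unsigned int)IS_SIGNED_TYPE(long));
	printf("char signed: %u\n", (unsigned int)IS_SIGNED_TYPE(char));
	return 0;
}
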
@@ -420,7 +423,7 @@ struct ring_buffer_per_cpu {
420 int cpu; 423 int cpu;
421 struct ring_buffer *buffer; 424 struct ring_buffer *buffer;
422 spinlock_t reader_lock; /* serialize readers */ 425 spinlock_t reader_lock; /* serialize readers */
423 raw_spinlock_t lock; 426 arch_spinlock_t lock;
424 struct lock_class_key lock_key; 427 struct lock_class_key lock_key;
425 struct list_head *pages; 428 struct list_head *pages;
426 struct buffer_page *head_page; /* read from head */ 429 struct buffer_page *head_page; /* read from head */
@@ -483,7 +486,7 @@ struct ring_buffer_iter {
483/* Up this if you want to test the TIME_EXTENTS and normalization */ 486/* Up this if you want to test the TIME_EXTENTS and normalization */
484#define DEBUG_SHIFT 0 487#define DEBUG_SHIFT 0
485 488
486static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) 489static inline u64 rb_time_stamp(struct ring_buffer *buffer)
487{ 490{
488 /* shift to debug/test normalization and TIME_EXTENTS */ 491 /* shift to debug/test normalization and TIME_EXTENTS */
489 return buffer->clock() << DEBUG_SHIFT; 492 return buffer->clock() << DEBUG_SHIFT;
@@ -494,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
494 u64 time; 497 u64 time;
495 498
496 preempt_disable_notrace(); 499 preempt_disable_notrace();
497 time = rb_time_stamp(buffer, cpu); 500 time = rb_time_stamp(buffer);
498 preempt_enable_no_resched_notrace(); 501 preempt_enable_no_resched_notrace();
499 502
500 return time; 503 return time;
@@ -599,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list)
599} 602}
600 603
601/* 604/*
602 * rb_is_head_page - test if the give page is the head page 605 * rb_is_head_page - test if the given page is the head page
603 * 606 *
604 * Because the reader may move the head_page pointer, we can 607 * Because the reader may move the head_page pointer, we can
605 * not trust what the head page is (it may be pointing to 608 * not trust what the head page is (it may be pointing to
@@ -995,7 +998,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
995 cpu_buffer->buffer = buffer; 998 cpu_buffer->buffer = buffer;
996 spin_lock_init(&cpu_buffer->reader_lock); 999 spin_lock_init(&cpu_buffer->reader_lock);
997 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1000 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
998 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1001 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
999 1002
1000 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1003 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1001 GFP_KERNEL, cpu_to_node(cpu)); 1004 GFP_KERNEL, cpu_to_node(cpu));
@@ -1193,6 +1196,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1193 atomic_inc(&cpu_buffer->record_disabled); 1196 atomic_inc(&cpu_buffer->record_disabled);
1194 synchronize_sched(); 1197 synchronize_sched();
1195 1198
1199 spin_lock_irq(&cpu_buffer->reader_lock);
1196 rb_head_page_deactivate(cpu_buffer); 1200 rb_head_page_deactivate(cpu_buffer);
1197 1201
1198 for (i = 0; i < nr_pages; i++) { 1202 for (i = 0; i < nr_pages; i++) {
@@ -1207,6 +1211,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1207 return; 1211 return;
1208 1212
1209 rb_reset_cpu(cpu_buffer); 1213 rb_reset_cpu(cpu_buffer);
1214 spin_unlock_irq(&cpu_buffer->reader_lock);
1210 1215
1211 rb_check_pages(cpu_buffer); 1216 rb_check_pages(cpu_buffer);
1212 1217
@@ -1785,9 +1790,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1785static struct ring_buffer_event * 1790static struct ring_buffer_event *
1786rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1791rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1787 unsigned long length, unsigned long tail, 1792 unsigned long length, unsigned long tail,
1788 struct buffer_page *commit_page,
1789 struct buffer_page *tail_page, u64 *ts) 1793 struct buffer_page *tail_page, u64 *ts)
1790{ 1794{
1795 struct buffer_page *commit_page = cpu_buffer->commit_page;
1791 struct ring_buffer *buffer = cpu_buffer->buffer; 1796 struct ring_buffer *buffer = cpu_buffer->buffer;
1792 struct buffer_page *next_page; 1797 struct buffer_page *next_page;
1793 int ret; 1798 int ret;
@@ -1868,7 +1873,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1868 * Nested commits always have zero deltas, so 1873 * Nested commits always have zero deltas, so
1869 * just reread the time stamp 1874 * just reread the time stamp
1870 */ 1875 */
1871 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1876 *ts = rb_time_stamp(buffer);
1872 next_page->page->time_stamp = *ts; 1877 next_page->page->time_stamp = *ts;
1873 } 1878 }
1874 1879
@@ -1890,13 +1895,10 @@ static struct ring_buffer_event *
1890__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1895__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1891 unsigned type, unsigned long length, u64 *ts) 1896 unsigned type, unsigned long length, u64 *ts)
1892{ 1897{
1893 struct buffer_page *tail_page, *commit_page; 1898 struct buffer_page *tail_page;
1894 struct ring_buffer_event *event; 1899 struct ring_buffer_event *event;
1895 unsigned long tail, write; 1900 unsigned long tail, write;
1896 1901
1897 commit_page = cpu_buffer->commit_page;
1898 /* we just need to protect against interrupts */
1899 barrier();
1900 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1901 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1902 1904
@@ -1907,7 +1909,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1907 /* See if we shot pass the end of this buffer page */ 1909 /* See if we shot pass the end of this buffer page */
1908 if (write > BUF_PAGE_SIZE) 1910 if (write > BUF_PAGE_SIZE)
1909 return rb_move_tail(cpu_buffer, length, tail, 1911 return rb_move_tail(cpu_buffer, length, tail,
1910 commit_page, tail_page, ts); 1912 tail_page, ts);
1911 1913
1912 /* We reserved something on the buffer */ 1914 /* We reserved something on the buffer */
1913 1915
@@ -2111,7 +2113,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2111 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2113 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
2112 goto out_fail; 2114 goto out_fail;
2113 2115
2114 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 2116 ts = rb_time_stamp(cpu_buffer->buffer);
2115 2117
2116 /* 2118 /*
2117 * Only the first commit can update the timestamp. 2119 * Only the first commit can update the timestamp.
@@ -2681,7 +2683,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2681EXPORT_SYMBOL_GPL(ring_buffer_entries); 2683EXPORT_SYMBOL_GPL(ring_buffer_entries);
2682 2684
2683/** 2685/**
2684 * ring_buffer_overrun_cpu - get the number of overruns in buffer 2686 * ring_buffer_overruns - get the number of overruns in buffer
2685 * @buffer: The ring buffer 2687 * @buffer: The ring buffer
2686 * 2688 *
2687 * Returns the total number of overruns in the ring buffer 2689 * Returns the total number of overruns in the ring buffer
@@ -2832,7 +2834,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2832 int ret; 2834 int ret;
2833 2835
2834 local_irq_save(flags); 2836 local_irq_save(flags);
2835 __raw_spin_lock(&cpu_buffer->lock); 2837 arch_spin_lock(&cpu_buffer->lock);
2836 2838
2837 again: 2839 again:
2838 /* 2840 /*
@@ -2921,7 +2923,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2921 goto again; 2923 goto again;
2922 2924
2923 out: 2925 out:
2924 __raw_spin_unlock(&cpu_buffer->lock); 2926 arch_spin_unlock(&cpu_buffer->lock);
2925 local_irq_restore(flags); 2927 local_irq_restore(flags);
2926 2928
2927 return reader; 2929 return reader;
@@ -3284,9 +3286,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3284 synchronize_sched(); 3286 synchronize_sched();
3285 3287
3286 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3288 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3287 __raw_spin_lock(&cpu_buffer->lock); 3289 arch_spin_lock(&cpu_buffer->lock);
3288 rb_iter_reset(iter); 3290 rb_iter_reset(iter);
3289 __raw_spin_unlock(&cpu_buffer->lock); 3291 arch_spin_unlock(&cpu_buffer->lock);
3290 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3292 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3291 3293
3292 return iter; 3294 return iter;
@@ -3406,11 +3408,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3406 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3408 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3407 goto out; 3409 goto out;
3408 3410
3409 __raw_spin_lock(&cpu_buffer->lock); 3411 arch_spin_lock(&cpu_buffer->lock);
3410 3412
3411 rb_reset_cpu(cpu_buffer); 3413 rb_reset_cpu(cpu_buffer);
3412 3414
3413 __raw_spin_unlock(&cpu_buffer->lock); 3415 arch_spin_unlock(&cpu_buffer->lock);
3414 3416
3415 out: 3417 out:
3416 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3418 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c3..b2477caf09c2 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -35,6 +35,28 @@ static int disable_reader;
35module_param(disable_reader, uint, 0644); 35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer"); 36MODULE_PARM_DESC(disable_reader, "only run producer");
37 37
38static int write_iteration = 50;
39module_param(write_iteration, uint, 0644);
40MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
41
42static int producer_nice = 19;
43static int consumer_nice = 19;
44
45static int producer_fifo = -1;
46static int consumer_fifo = -1;
47
48module_param(producer_nice, uint, 0644);
49MODULE_PARM_DESC(producer_nice, "nice prio for producer");
50
51module_param(consumer_nice, uint, 0644);
52MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
53
54module_param(producer_fifo, uint, 0644);
55MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
56
57module_param(consumer_fifo, uint, 0644);
58MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
59
38static int read_events; 60static int read_events;
39 61
40static int kill_test; 62static int kill_test;
@@ -208,15 +230,18 @@ static void ring_buffer_producer(void)
208 do { 230 do {
209 struct ring_buffer_event *event; 231 struct ring_buffer_event *event;
210 int *entry; 232 int *entry;
211 233 int i;
212 event = ring_buffer_lock_reserve(buffer, 10); 234
213 if (!event) { 235 for (i = 0; i < write_iteration; i++) {
214 missed++; 236 event = ring_buffer_lock_reserve(buffer, 10);
215 } else { 237 if (!event) {
216 hit++; 238 missed++;
217 entry = ring_buffer_event_data(event); 239 } else {
218 *entry = smp_processor_id(); 240 hit++;
219 ring_buffer_unlock_commit(buffer, event); 241 entry = ring_buffer_event_data(event);
242 *entry = smp_processor_id();
243 ring_buffer_unlock_commit(buffer, event);
244 }
220 } 245 }
221 do_gettimeofday(&end_tv); 246 do_gettimeofday(&end_tv);
222 247
@@ -263,6 +288,27 @@ static void ring_buffer_producer(void)
263 288
264 if (kill_test) 289 if (kill_test)
265 trace_printk("ERROR!\n"); 290 trace_printk("ERROR!\n");
291
292 if (!disable_reader) {
293 if (consumer_fifo < 0)
294 trace_printk("Running Consumer at nice: %d\n",
295 consumer_nice);
296 else
297 trace_printk("Running Consumer at SCHED_FIFO %d\n",
298 consumer_fifo);
299 }
300 if (producer_fifo < 0)
301 trace_printk("Running Producer at nice: %d\n",
302 producer_nice);
303 else
304 trace_printk("Running Producer at SCHED_FIFO %d\n",
305 producer_fifo);
306
307 /* Let the user know that the test is running at low priority */
308 if (producer_fifo < 0 && consumer_fifo < 0 &&
309 producer_nice == 19 && consumer_nice == 19)
310 trace_printk("WARNING!!! This test is running at lowest priority.\n");
311
266 trace_printk("Time: %lld (usecs)\n", time); 312 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns); 313 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader) 314 if (disable_reader)
@@ -392,6 +438,27 @@ static int __init ring_buffer_benchmark_init(void)
392 if (IS_ERR(producer)) 438 if (IS_ERR(producer))
393 goto out_kill; 439 goto out_kill;
394 440
441 /*
442 * Run them as low-prio background tasks by default:
443 */
444 if (!disable_reader) {
445 if (consumer_fifo >= 0) {
446 struct sched_param param = {
447 .sched_priority = consumer_fifo
448 };
449 sched_setscheduler(consumer, SCHED_FIFO, &param);
450 } else
451 set_user_nice(consumer, consumer_nice);
452 }
453
454 if (producer_fifo >= 0) {
455 struct sched_param param = {
 456 .sched_priority = producer_fifo
457 };
458 sched_setscheduler(producer, SCHED_FIFO, &param);
459 } else
460 set_user_nice(producer, producer_nice);
461
395 return 0; 462 return 0;
396 463
397 out_kill: 464 out_kill:
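
The write_iteration parameter added above batches several writes per timestamp reading so that clock reads do not dominate the measurement. A standalone sketch of the same batching idea (a stand-in assignment instead of a ring buffer write, illustration only):

/*
 * Batch work between clock reads: one gettimeofday() per batch of
 * WRITE_ITERATION "writes" rather than one per write.
 */
#include <stdio.h>
#include <sys/time.h>

#define WRITE_ITERATION	50

static volatile unsigned long sink;

int main(void)
{
	struct timeval start, end;
	unsigned long writes = 0, clock_reads = 0;
	long long usecs;

	gettimeofday(&start, NULL);
	for (int round = 0; round < 1000; round++) {
		for (int i = 0; i < WRITE_ITERATION; i++) {
			sink = i;		/* stand-in for one buffer write */
			writes++;
		}
		gettimeofday(&end, NULL);	/* one clock read per batch */
		clock_reads++;
	}

	usecs = (end.tv_sec - start.tv_sec) * 1000000LL +
		(end.tv_usec - start.tv_usec);
	printf("%lu writes, %lu clock reads, %lld usecs\n",
	       writes, clock_reads, usecs);
	return 0;
}
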
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6c0f6a8a22eb..31118ae16f03 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -86,17 +86,17 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
86 */ 86 */
87static int tracing_disabled = 1; 87static int tracing_disabled = 1;
88 88
89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(int, ftrace_cpu_disabled);
90 90
91static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
92{ 92{
93 preempt_disable(); 93 preempt_disable();
94 local_inc(&__get_cpu_var(ftrace_cpu_disabled)); 94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled));
95} 95}
96 96
97static inline void ftrace_enable_cpu(void) 97static inline void ftrace_enable_cpu(void)
98{ 98{
99 local_dec(&__get_cpu_var(ftrace_cpu_disabled)); 99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled));
100 preempt_enable(); 100 preempt_enable();
101} 101}
102 102
@@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf);
129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 130static char *default_bootup_tracer;
131 131
132static int __init set_ftrace(char *str) 132static int __init set_cmdline_ftrace(char *str)
133{ 133{
134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
@@ -137,7 +137,7 @@ static int __init set_ftrace(char *str)
137 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
138 return 1; 138 return 1;
139} 139}
140__setup("ftrace=", set_ftrace); 140__setup("ftrace=", set_cmdline_ftrace);
141 141
142static int __init set_ftrace_dump_on_oops(char *str) 142static int __init set_ftrace_dump_on_oops(char *str)
143{ 143{
@@ -203,7 +203,7 @@ cycle_t ftrace_now(int cpu)
203 */ 203 */
204static struct trace_array max_tr; 204static struct trace_array max_tr;
205 205
206static DEFINE_PER_CPU(struct trace_array_cpu, max_data); 206static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
207 207
208/* tracer_enabled is used to toggle activation of a tracer */ 208/* tracer_enabled is used to toggle activation of a tracer */
209static int tracer_enabled = 1; 209static int tracer_enabled = 1;
@@ -415,7 +415,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
415 415
416 /* read the non-space input */ 416 /* read the non-space input */
417 while (cnt && !isspace(ch)) { 417 while (cnt && !isspace(ch)) {
418 if (parser->idx < parser->size) 418 if (parser->idx < parser->size - 1)
419 parser->buffer[parser->idx++] = ch; 419 parser->buffer[parser->idx++] = ch;
420 else { 420 else {
421 ret = -EINVAL; 421 ret = -EINVAL;
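
The one-character fix above (parser->size becomes parser->size - 1) keeps room in the parser buffer for a terminating NUL. The same bounded-copy pattern in a standalone sketch:

/*
 * Fill at most size - 1 characters so the terminating '\0' always fits.
 */
#include <stdio.h>

static int copy_word(char *dst, int size, const char *src)
{
	int idx = 0;

	while (*src && *src != ' ') {
		if (idx < size - 1)		/* leave room for the NUL */
			dst[idx++] = *src++;
		else
			return -1;		/* token too long for the buffer */
	}
	dst[idx] = '\0';
	return idx;
}

int main(void)
{
	char buf[16];

	printf("%d '%s'\n", copy_word(buf, sizeof(buf), "sched_switch rest"), buf);
	printf("%d\n", copy_word(buf, sizeof(buf), "a_token_longer_than_the_buffer"));
	return 0;
}
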
@@ -493,15 +493,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
493 * protected by per_cpu spinlocks. But the action of the swap 493 * protected by per_cpu spinlocks. But the action of the swap
494 * needs its own lock. 494 * needs its own lock.
495 * 495 *
496 * This is defined as a raw_spinlock_t in order to help 496 * This is defined as a arch_spinlock_t in order to help
497 * with performance when lockdep debugging is enabled. 497 * with performance when lockdep debugging is enabled.
498 * 498 *
499 * It is also used in other places outside the update_max_tr 499 * It is also used in other places outside the update_max_tr
500 * so it needs to be defined outside of the 500 * so it needs to be defined outside of the
501 * CONFIG_TRACER_MAX_TRACE. 501 * CONFIG_TRACER_MAX_TRACE.
502 */ 502 */
503static raw_spinlock_t ftrace_max_lock = 503static arch_spinlock_t ftrace_max_lock =
504 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 504 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
505 505
506#ifdef CONFIG_TRACER_MAX_TRACE 506#ifdef CONFIG_TRACER_MAX_TRACE
507unsigned long __read_mostly tracing_max_latency; 507unsigned long __read_mostly tracing_max_latency;
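The raw_spinlock_t/__raw_spin_*() names are being renamed to arch_spinlock_t/arch_spin_*(): the type is the bare architecture-level lock with no lockdep or debug layering on top. Roughly what such a lock amounts to, sketched in user space with C11 atomics (illustrative only, not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>

typedef struct { atomic_flag locked; } arch_spinlock_sketch;

static void sketch_lock(arch_spinlock_sketch *l)
{
        while (atomic_flag_test_and_set_explicit(&l->locked,
                                                 memory_order_acquire))
                ;                               /* spin until acquired */
}

static void sketch_unlock(arch_spinlock_sketch *l)
{
        atomic_flag_clear_explicit(&l->locked, memory_order_release);
}

int main(void)
{
        arch_spinlock_sketch lock = { ATOMIC_FLAG_INIT };

        sketch_lock(&lock);
        puts("critical section");
        sketch_unlock(&lock);
        return 0;
}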
@@ -555,13 +555,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
555 return; 555 return;
556 556
557 WARN_ON_ONCE(!irqs_disabled()); 557 WARN_ON_ONCE(!irqs_disabled());
558 __raw_spin_lock(&ftrace_max_lock); 558 arch_spin_lock(&ftrace_max_lock);
559 559
560 tr->buffer = max_tr.buffer; 560 tr->buffer = max_tr.buffer;
561 max_tr.buffer = buf; 561 max_tr.buffer = buf;
562 562
563 __update_max_tr(tr, tsk, cpu); 563 __update_max_tr(tr, tsk, cpu);
564 __raw_spin_unlock(&ftrace_max_lock); 564 arch_spin_unlock(&ftrace_max_lock);
565} 565}
566 566
567/** 567/**
@@ -581,7 +581,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
581 return; 581 return;
582 582
583 WARN_ON_ONCE(!irqs_disabled()); 583 WARN_ON_ONCE(!irqs_disabled());
584 __raw_spin_lock(&ftrace_max_lock); 584 arch_spin_lock(&ftrace_max_lock);
585 585
586 ftrace_disable_cpu(); 586 ftrace_disable_cpu();
587 587
@@ -603,7 +603,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
604 604
605 __update_max_tr(tr, tsk, cpu); 605 __update_max_tr(tr, tsk, cpu);
606 __raw_spin_unlock(&ftrace_max_lock); 606 arch_spin_unlock(&ftrace_max_lock);
607} 607}
608#endif /* CONFIG_TRACER_MAX_TRACE */ 608#endif /* CONFIG_TRACER_MAX_TRACE */
609 609
@@ -802,7 +802,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
802static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 802static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
803static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 803static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
804static int cmdline_idx; 804static int cmdline_idx;
805static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; 805static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
806 806
807/* temporary disable recording */ 807/* temporary disable recording */
808static atomic_t trace_record_cmdline_disabled __read_mostly; 808static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -915,7 +915,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
915 * nor do we want to disable interrupts, 915 * nor do we want to disable interrupts,
916 * so if we miss here, then better luck next time. 916 * so if we miss here, then better luck next time.
917 */ 917 */
918 if (!__raw_spin_trylock(&trace_cmdline_lock)) 918 if (!arch_spin_trylock(&trace_cmdline_lock))
919 return; 919 return;
920 920
921 idx = map_pid_to_cmdline[tsk->pid]; 921 idx = map_pid_to_cmdline[tsk->pid];
@@ -940,7 +940,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
940 940
941 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 941 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
942 942
943 __raw_spin_unlock(&trace_cmdline_lock); 943 arch_spin_unlock(&trace_cmdline_lock);
944} 944}
945 945
946void trace_find_cmdline(int pid, char comm[]) 946void trace_find_cmdline(int pid, char comm[])
@@ -958,14 +958,14 @@ void trace_find_cmdline(int pid, char comm[])
958 } 958 }
959 959
960 preempt_disable(); 960 preempt_disable();
961 __raw_spin_lock(&trace_cmdline_lock); 961 arch_spin_lock(&trace_cmdline_lock);
962 map = map_pid_to_cmdline[pid]; 962 map = map_pid_to_cmdline[pid];
963 if (map != NO_CMDLINE_MAP) 963 if (map != NO_CMDLINE_MAP)
964 strcpy(comm, saved_cmdlines[map]); 964 strcpy(comm, saved_cmdlines[map]);
965 else 965 else
966 strcpy(comm, "<...>"); 966 strcpy(comm, "<...>");
967 967
968 __raw_spin_unlock(&trace_cmdline_lock); 968 arch_spin_unlock(&trace_cmdline_lock);
969 preempt_enable(); 969 preempt_enable();
970} 970}
971 971
@@ -1085,7 +1085,7 @@ trace_function(struct trace_array *tr,
1085 struct ftrace_entry *entry; 1085 struct ftrace_entry *entry;
1086 1086
1087 /* If we are reading the ring buffer, don't trace */ 1087 /* If we are reading the ring buffer, don't trace */
1088 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1088 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
1089 return; 1089 return;
1090 1090
1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1251,8 +1251,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1251 */ 1251 */
1252int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1252int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1253{ 1253{
1254 static raw_spinlock_t trace_buf_lock = 1254 static arch_spinlock_t trace_buf_lock =
1255 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1255 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1256 static u32 trace_buf[TRACE_BUF_SIZE]; 1256 static u32 trace_buf[TRACE_BUF_SIZE];
1257 1257
1258 struct ftrace_event_call *call = &event_bprint; 1258 struct ftrace_event_call *call = &event_bprint;
@@ -1283,7 +1283,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1283 1283
1284 /* Lockdep uses trace_printk for lock tracing */ 1284 /* Lockdep uses trace_printk for lock tracing */
1285 local_irq_save(flags); 1285 local_irq_save(flags);
1286 __raw_spin_lock(&trace_buf_lock); 1286 arch_spin_lock(&trace_buf_lock);
1287 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1287 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1288 1288
1289 if (len > TRACE_BUF_SIZE || len < 0) 1289 if (len > TRACE_BUF_SIZE || len < 0)
@@ -1304,7 +1304,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1304 ring_buffer_unlock_commit(buffer, event); 1304 ring_buffer_unlock_commit(buffer, event);
1305 1305
1306out_unlock: 1306out_unlock:
1307 __raw_spin_unlock(&trace_buf_lock); 1307 arch_spin_unlock(&trace_buf_lock);
1308 local_irq_restore(flags); 1308 local_irq_restore(flags);
1309 1309
1310out: 1310out:
@@ -1334,7 +1334,7 @@ int trace_array_printk(struct trace_array *tr,
1334int trace_array_vprintk(struct trace_array *tr, 1334int trace_array_vprintk(struct trace_array *tr,
1335 unsigned long ip, const char *fmt, va_list args) 1335 unsigned long ip, const char *fmt, va_list args)
1336{ 1336{
1337 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1337 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1338 static char trace_buf[TRACE_BUF_SIZE]; 1338 static char trace_buf[TRACE_BUF_SIZE];
1339 1339
1340 struct ftrace_event_call *call = &event_print; 1340 struct ftrace_event_call *call = &event_print;
@@ -1360,12 +1360,9 @@ int trace_array_vprintk(struct trace_array *tr,
1360 1360
1361 pause_graph_tracing(); 1361 pause_graph_tracing();
1362 raw_local_irq_save(irq_flags); 1362 raw_local_irq_save(irq_flags);
1363 __raw_spin_lock(&trace_buf_lock); 1363 arch_spin_lock(&trace_buf_lock);
1364 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1364 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1365 1365
1366 len = min(len, TRACE_BUF_SIZE-1);
1367 trace_buf[len] = 0;
1368
1369 size = sizeof(*entry) + len + 1; 1366 size = sizeof(*entry) + len + 1;
1370 buffer = tr->buffer; 1367 buffer = tr->buffer;
1371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1368 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
@@ -1373,15 +1370,15 @@ int trace_array_vprintk(struct trace_array *tr,
1373 if (!event) 1370 if (!event)
1374 goto out_unlock; 1371 goto out_unlock;
1375 entry = ring_buffer_event_data(event); 1372 entry = ring_buffer_event_data(event);
1376 entry->ip = ip; 1373 entry->ip = ip;
1377 1374
1378 memcpy(&entry->buf, trace_buf, len); 1375 memcpy(&entry->buf, trace_buf, len);
1379 entry->buf[len] = 0; 1376 entry->buf[len] = '\0';
1380 if (!filter_check_discard(call, entry, buffer, event)) 1377 if (!filter_check_discard(call, entry, buffer, event))
1381 ring_buffer_unlock_commit(buffer, event); 1378 ring_buffer_unlock_commit(buffer, event);
1382 1379
1383 out_unlock: 1380 out_unlock:
1384 __raw_spin_unlock(&trace_buf_lock); 1381 arch_spin_unlock(&trace_buf_lock);
1385 raw_local_irq_restore(irq_flags); 1382 raw_local_irq_restore(irq_flags);
1386 unpause_graph_tracing(); 1383 unpause_graph_tracing();
1387 out: 1384 out:
@@ -1393,7 +1390,7 @@ int trace_array_vprintk(struct trace_array *tr,
1393 1390
1394int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1391int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1395{ 1392{
1396 return trace_array_printk(&global_trace, ip, fmt, args); 1393 return trace_array_vprintk(&global_trace, ip, fmt, args);
1397} 1394}
1398EXPORT_SYMBOL_GPL(trace_vprintk); 1395EXPORT_SYMBOL_GPL(trace_vprintk);
1399 1396
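The one-line change above fixes a classic varargs bug: trace_vprintk() already holds a va_list, so it must forward to the v-style helper rather than the "..." variant. The same rule in a small user-space example (log_printf/log_vprintf are made-up names):

#include <stdarg.h>
#include <stdio.h>

static int log_vprintf(const char *fmt, va_list ap)
{
        return vfprintf(stderr, fmt, ap);       /* correct: v-variant */
}

static int log_printf(const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = log_vprintf(fmt, ap);             /* never the "..." variant */
        va_end(ap);
        return ret;
}

int main(void)
{
        log_printf("value=%d name=%s\n", 42, "demo");
        return 0;
}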
@@ -1515,6 +1512,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1515 int i = (int)*pos; 1512 int i = (int)*pos;
1516 void *ent; 1513 void *ent;
1517 1514
1515 WARN_ON_ONCE(iter->leftover);
1516
1518 (*pos)++; 1517 (*pos)++;
1519 1518
1520 /* can't go backwards */ 1519 /* can't go backwards */
@@ -1613,8 +1612,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 1613 			; 1612 			;
1614 1613
1615 } else { 1614 } else {
1616 l = *pos - 1; 1615 /*
1617 p = s_next(m, p, &l); 1616 * If we overflowed the seq_file before, then we want
1617 * to just reuse the trace_seq buffer again.
1618 */
1619 if (iter->leftover)
1620 p = iter;
1621 else {
1622 l = *pos - 1;
1623 p = s_next(m, p, &l);
1624 }
1618 } 1625 }
1619 1626
1620 trace_event_read_lock(); 1627 trace_event_read_lock();
@@ -1922,6 +1929,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1922static int s_show(struct seq_file *m, void *v) 1929static int s_show(struct seq_file *m, void *v)
1923{ 1930{
1924 struct trace_iterator *iter = v; 1931 struct trace_iterator *iter = v;
1932 int ret;
1925 1933
1926 if (iter->ent == NULL) { 1934 if (iter->ent == NULL) {
1927 if (iter->tr) { 1935 if (iter->tr) {
@@ -1941,9 +1949,27 @@ static int s_show(struct seq_file *m, void *v)
1941 if (!(trace_flags & TRACE_ITER_VERBOSE)) 1949 if (!(trace_flags & TRACE_ITER_VERBOSE))
1942 print_func_help_header(m); 1950 print_func_help_header(m);
1943 } 1951 }
1952 } else if (iter->leftover) {
1953 /*
1954 * If we filled the seq_file buffer earlier, we
1955 * want to just show it now.
1956 */
1957 ret = trace_print_seq(m, &iter->seq);
1958
1959 /* ret should this time be zero, but you never know */
1960 iter->leftover = ret;
1961
1944 } else { 1962 } else {
1945 print_trace_line(iter); 1963 print_trace_line(iter);
1946 trace_print_seq(m, &iter->seq); 1964 ret = trace_print_seq(m, &iter->seq);
1965 /*
1966 * If we overflow the seq_file buffer, then it will
1967 * ask us for this data again at start up.
1968 * Use that instead.
1969 * ret is 0 if seq_file write succeeded.
1970 * -1 otherwise.
1971 */
1972 iter->leftover = ret;
1947 } 1973 }
1948 1974
1949 return 0; 1975 return 0;
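The leftover logic added above handles a record whose formatted text overflows the seq_file page: the text stays in iter->seq and is shown again on the next ->show() call instead of being re-walked. A toy version of the retry-on-next-page idea (fixed-size page and made-up record strings):

#include <stdio.h>
#include <string.h>

#define PAGE 16

static int emit(char *page, int *used, const char *rec)
{
        int len = strlen(rec);

        if (*used + len > PAGE)
                return -1;              /* record does not fit this page */
        memcpy(page + *used, rec, len);
        *used += len;
        return 0;
}

int main(void)
{
        const char *records[] = { "alpha ", "beta ", "gamma-long ", "delta " };
        char page[PAGE + 1];
        int used = 0, i = 0;

        while (i < 4) {
                if (emit(page, &used, records[i]) < 0) {
                        page[used] = '\0';
                        printf("page: %s\n", page); /* flush the full page */
                        used = 0;
                        continue;       /* leftover: retry on a fresh page */
                }
                i++;
        }
        page[used] = '\0';
        printf("page: %s\n", page);
        return 0;
}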
@@ -1984,11 +2010,9 @@ __tracing_open(struct inode *inode, struct file *file)
1984 if (current_trace) 2010 if (current_trace)
1985 *iter->trace = *current_trace; 2011 *iter->trace = *current_trace;
1986 2012
1987 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) 2013 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
1988 goto fail; 2014 goto fail;
1989 2015
1990 cpumask_clear(iter->started);
1991
1992 if (current_trace && current_trace->print_max) 2016 if (current_trace && current_trace->print_max)
1993 iter->tr = &max_tr; 2017 iter->tr = &max_tr;
1994 else 2018 else
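zalloc_cpumask_var() folds the separate allocate-then-clear steps into one zeroed allocation. The user-space equivalent is simply preferring calloc() over malloc() plus memset():

#include <stdlib.h>
#include <string.h>

int main(void)
{
        size_t n = 128;

        /* before: two steps */
        unsigned char *a = malloc(n);
        if (a)
                memset(a, 0, n);

        /* after: one step, same result */
        unsigned char *b = calloc(1, n);

        free(a);
        free(b);
        return 0;
}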
@@ -2255,7 +2279,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2255 mutex_lock(&tracing_cpumask_update_lock); 2279 mutex_lock(&tracing_cpumask_update_lock);
2256 2280
2257 local_irq_disable(); 2281 local_irq_disable();
2258 __raw_spin_lock(&ftrace_max_lock); 2282 arch_spin_lock(&ftrace_max_lock);
2259 for_each_tracing_cpu(cpu) { 2283 for_each_tracing_cpu(cpu) {
2260 /* 2284 /*
2261 * Increase/decrease the disabled counter if we are 2285 * Increase/decrease the disabled counter if we are
@@ -2270,7 +2294,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2270 atomic_dec(&global_trace.data[cpu]->disabled); 2294 atomic_dec(&global_trace.data[cpu]->disabled);
2271 } 2295 }
2272 } 2296 }
2273 __raw_spin_unlock(&ftrace_max_lock); 2297 arch_spin_unlock(&ftrace_max_lock);
2274 local_irq_enable(); 2298 local_irq_enable();
2275 2299
2276 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 2300 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@ -2442,7 +2466,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2442 return ret; 2466 return ret;
2443 } 2467 }
2444 2468
2445 filp->f_pos += cnt; 2469 *ppos += cnt;
2446 2470
2447 return cnt; 2471 return cnt;
2448} 2472}
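This and the following hunks stop the write handlers from poking filp->f_pos directly and advance the *ppos argument instead, since pwrite()-style callers hand in an offset that is not the file position. A mock handler showing the intended contract (names are illustrative, not the VFS API):

#include <stdio.h>
#include <sys/types.h>                  /* ssize_t */

static ssize_t mock_write(const char *ubuf, size_t cnt, long long *ppos)
{
        (void)ubuf;
        printf("write of %zu bytes at offset %lld\n", cnt, *ppos);
        *ppos += cnt;                   /* advance the offset we were given */
        return (ssize_t)cnt;
}

int main(void)
{
        long long pos = 0;

        mock_write("abc", 3, &pos);
        mock_write("defg", 4, &pos);
        printf("final offset: %lld\n", pos);
        return 0;
}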
@@ -2584,7 +2608,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2584 } 2608 }
2585 mutex_unlock(&trace_types_lock); 2609 mutex_unlock(&trace_types_lock);
2586 2610
2587 filp->f_pos += cnt; 2611 *ppos += cnt;
2588 2612
2589 return cnt; 2613 return cnt;
2590} 2614}
@@ -2766,7 +2790,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2766 if (err) 2790 if (err)
2767 return err; 2791 return err;
2768 2792
2769 filp->f_pos += ret; 2793 *ppos += ret;
2770 2794
2771 return ret; 2795 return ret;
2772} 2796}
@@ -2899,6 +2923,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2899 else 2923 else
2900 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); 2924 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2901 2925
2926
2927 if (iter->trace->pipe_close)
2928 iter->trace->pipe_close(iter);
2929
2902 mutex_unlock(&trace_types_lock); 2930 mutex_unlock(&trace_types_lock);
2903 2931
2904 free_cpumask_var(iter->started); 2932 free_cpumask_var(iter->started);
@@ -3105,7 +3133,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
3105 __free_page(spd->pages[idx]); 3133 __free_page(spd->pages[idx]);
3106} 3134}
3107 3135
3108static struct pipe_buf_operations tracing_pipe_buf_ops = { 3136static const struct pipe_buf_operations tracing_pipe_buf_ops = {
3109 .can_merge = 0, 3137 .can_merge = 0,
3110 .map = generic_pipe_buf_map, 3138 .map = generic_pipe_buf_map,
3111 .unmap = generic_pipe_buf_unmap, 3139 .unmap = generic_pipe_buf_unmap,
@@ -3301,7 +3329,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3301 } 3329 }
3302 } 3330 }
3303 3331
3304 filp->f_pos += cnt; 3332 *ppos += cnt;
3305 3333
3306 /* If check pages failed, return ENOMEM */ 3334 /* If check pages failed, return ENOMEM */
3307 if (tracing_disabled) 3335 if (tracing_disabled)
@@ -3336,7 +3364,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3336 size_t cnt, loff_t *fpos) 3364 size_t cnt, loff_t *fpos)
3337{ 3365{
3338 char *buf; 3366 char *buf;
3339 char *end;
3340 3367
3341 if (tracing_disabled) 3368 if (tracing_disabled)
3342 return -EINVAL; 3369 return -EINVAL;
@@ -3344,7 +3371,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3344 if (cnt > TRACE_BUF_SIZE) 3371 if (cnt > TRACE_BUF_SIZE)
3345 cnt = TRACE_BUF_SIZE; 3372 cnt = TRACE_BUF_SIZE;
3346 3373
3347 buf = kmalloc(cnt + 1, GFP_KERNEL); 3374 buf = kmalloc(cnt + 2, GFP_KERNEL);
3348 if (buf == NULL) 3375 if (buf == NULL)
3349 return -ENOMEM; 3376 return -ENOMEM;
3350 3377
@@ -3352,14 +3379,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3352 kfree(buf); 3379 kfree(buf);
3353 return -EFAULT; 3380 return -EFAULT;
3354 } 3381 }
3382 if (buf[cnt-1] != '\n') {
3383 buf[cnt] = '\n';
3384 buf[cnt+1] = '\0';
3385 } else
3386 buf[cnt] = '\0';
3355 3387
3356 /* Cut from the first nil or newline. */ 3388 cnt = mark_printk("%s", buf);
3357 buf[cnt] = '\0';
3358 end = strchr(buf, '\n');
3359 if (end)
3360 *end = '\0';
3361
3362 cnt = mark_printk("%s\n", buf);
3363 kfree(buf); 3389 kfree(buf);
3364 *fpos += cnt; 3390 *fpos += cnt;
3365 3391
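The rewritten trace_marker path allocates one extra byte so it can guarantee the recorded string ends in a newline, appending one only when the writer did not supply it. The same string handling in isolation (hypothetical helper name):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *copy_with_newline(const char *src, size_t cnt)
{
        char *buf = malloc(cnt + 2);            /* room for '\n' and '\0' */

        if (!buf)
                return NULL;
        memcpy(buf, src, cnt);
        if (cnt == 0 || buf[cnt - 1] != '\n') {
                buf[cnt] = '\n';
                buf[cnt + 1] = '\0';
        } else {
                buf[cnt] = '\0';
        }
        return buf;
}

int main(void)
{
        char *a = copy_with_newline("hello", 5);
        char *b = copy_with_newline("world\n", 6);

        if (!a || !b)
                return 1;
        printf("%s%s", a, b);                   /* each on its own line */
        free(a);
        free(b);
        return 0;
}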
@@ -3591,7 +3617,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3591} 3617}
3592 3618
3593/* Pipe buffer operations for a buffer. */ 3619/* Pipe buffer operations for a buffer. */
3594static struct pipe_buf_operations buffer_pipe_buf_ops = { 3620static const struct pipe_buf_operations buffer_pipe_buf_ops = {
3595 .can_merge = 0, 3621 .can_merge = 0,
3596 .map = generic_pipe_buf_map, 3622 .map = generic_pipe_buf_map,
3597 .unmap = generic_pipe_buf_unmap, 3623 .unmap = generic_pipe_buf_unmap,
@@ -3732,7 +3758,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3732 3758
3733 s = kmalloc(sizeof(*s), GFP_KERNEL); 3759 s = kmalloc(sizeof(*s), GFP_KERNEL);
3734 if (!s) 3760 if (!s)
3735 return ENOMEM; 3761 return -ENOMEM;
3736 3762
3737 trace_seq_init(s); 3763 trace_seq_init(s);
3738 3764
@@ -4281,8 +4307,8 @@ trace_printk_seq(struct trace_seq *s)
4281 4307
4282static void __ftrace_dump(bool disable_tracing) 4308static void __ftrace_dump(bool disable_tracing)
4283{ 4309{
4284 static raw_spinlock_t ftrace_dump_lock = 4310 static arch_spinlock_t ftrace_dump_lock =
4285 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 4311 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
4286 /* use static because iter can be a bit big for the stack */ 4312 /* use static because iter can be a bit big for the stack */
4287 static struct trace_iterator iter; 4313 static struct trace_iterator iter;
4288 unsigned int old_userobj; 4314 unsigned int old_userobj;
@@ -4292,7 +4318,7 @@ static void __ftrace_dump(bool disable_tracing)
4292 4318
4293 /* only one dump */ 4319 /* only one dump */
4294 local_irq_save(flags); 4320 local_irq_save(flags);
4295 __raw_spin_lock(&ftrace_dump_lock); 4321 arch_spin_lock(&ftrace_dump_lock);
4296 if (dump_ran) 4322 if (dump_ran)
4297 goto out; 4323 goto out;
4298 4324
@@ -4367,7 +4393,7 @@ static void __ftrace_dump(bool disable_tracing)
4367 } 4393 }
4368 4394
4369 out: 4395 out:
4370 __raw_spin_unlock(&ftrace_dump_lock); 4396 arch_spin_unlock(&ftrace_dump_lock);
4371 local_irq_restore(flags); 4397 local_irq_restore(flags);
4372} 4398}
4373 4399
@@ -4389,7 +4415,7 @@ __init static int tracer_alloc_buffers(void)
4389 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4415 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4390 goto out_free_buffer_mask; 4416 goto out_free_buffer_mask;
4391 4417
4392 if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) 4418 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4393 goto out_free_tracing_cpumask; 4419 goto out_free_tracing_cpumask;
4394 4420
4395 /* To save memory, keep the ring buffer size to its minimum */ 4421 /* To save memory, keep the ring buffer size to its minimum */
@@ -4400,7 +4426,6 @@ __init static int tracer_alloc_buffers(void)
4400 4426
4401 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4427 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4402 cpumask_copy(tracing_cpumask, cpu_all_mask); 4428 cpumask_copy(tracing_cpumask, cpu_all_mask);
4403 cpumask_clear(tracing_reader_cpumask);
4404 4429
4405 /* TODO: make the number of buffers hot pluggable with CPUS */ 4430 /* TODO: make the number of buffers hot pluggable with CPUS */
4406 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4431 global_trace.buffer = ring_buffer_alloc(ring_buf_size,
@@ -4429,7 +4454,7 @@ __init static int tracer_alloc_buffers(void)
4429 /* Allocate the first page for all buffers */ 4454 /* Allocate the first page for all buffers */
4430 for_each_tracing_cpu(i) { 4455 for_each_tracing_cpu(i) {
4431 global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4456 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4432 max_tr.data[i] = &per_cpu(max_data, i); 4457 max_tr.data[i] = &per_cpu(max_tr_data, i);
4433 } 4458 }
4434 4459
4435 trace_init_cmdlines(); 4460 trace_init_cmdlines();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 405cb850b75d..a52bed2eedd8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
40 42
41 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
42}; 44};
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
98struct syscall_trace_exit { 100struct syscall_trace_exit {
99 struct trace_entry ent; 101 struct trace_entry ent;
100 int nr; 102 int nr;
101 unsigned long ret; 103 long ret;
102}; 104};
103 105
106struct kprobe_trace_entry {
107 struct trace_entry ent;
108 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111};
112
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent;
119 unsigned long func;
120 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123};
124
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
104/* 129/*
105 * trace_flag_type is an enumeration that holds different 130 * trace_flag_type is an enumeration that holds different
106 * states when a trace occurs. These are: 131 * states when a trace occurs. These are:
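The new kprobe/kretprobe entry structs above end in a flexible array member, and the SIZEOF_*_TRACE_ENTRY macros size a record as the offsetof() of that member plus the argument payload. A self-contained illustration of the pattern (sketch types, not the kernel structs):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct probe_entry_sketch {
        unsigned long ip;
        int nargs;
        unsigned long args[];                   /* flexible array member */
};

#define SIZEOF_PROBE_ENTRY(n) \
        (offsetof(struct probe_entry_sketch, args) + \
         sizeof(unsigned long) * (n))

int main(void)
{
        int nargs = 3;
        struct probe_entry_sketch *e = malloc(SIZEOF_PROBE_ENTRY(nargs));

        if (!e)
                return 1;
        e->ip = 0xdeadbeef;
        e->nargs = nargs;
        for (int i = 0; i < nargs; i++)
                e->args[i] = i;
        printf("entry size for %d args: %zu bytes\n",
               nargs, SIZEOF_PROBE_ENTRY(nargs));
        free(e);
        return 0;
}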
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
209 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
211 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
212 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
213 } while (0) 239 } while (0)
214 240
@@ -246,6 +272,7 @@ struct tracer_flags {
246 * @pipe_open: called when the trace_pipe file is opened 272 * @pipe_open: called when the trace_pipe file is opened
247 * @wait_pipe: override how the user waits for traces on trace_pipe 273 * @wait_pipe: override how the user waits for traces on trace_pipe
248 * @close: called when the trace file is released 274 * @close: called when the trace file is released
275 * @pipe_close: called when the trace_pipe file is released
249 * @read: override the default read callback on trace_pipe 276 * @read: override the default read callback on trace_pipe
250 * @splice_read: override the default splice_read callback on trace_pipe 277 * @splice_read: override the default splice_read callback on trace_pipe
251 * @selftest: selftest to run on boot (see trace_selftest.c) 278 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -264,6 +291,7 @@ struct tracer {
264 void (*pipe_open)(struct trace_iterator *iter); 291 void (*pipe_open)(struct trace_iterator *iter);
265 void (*wait_pipe)(struct trace_iterator *iter); 292 void (*wait_pipe)(struct trace_iterator *iter);
266 void (*close)(struct trace_iterator *iter); 293 void (*close)(struct trace_iterator *iter);
294 void (*pipe_close)(struct trace_iterator *iter);
267 ssize_t (*read)(struct trace_iterator *iter, 295 ssize_t (*read)(struct trace_iterator *iter,
268 struct file *filp, char __user *ubuf, 296 struct file *filp, char __user *ubuf,
269 size_t cnt, loff_t *ppos); 297 size_t cnt, loff_t *ppos);
@@ -364,6 +392,8 @@ int register_tracer(struct tracer *type);
364void unregister_tracer(struct tracer *type); 392void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void); 393int is_tracing_stopped(void);
366 394
395extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396
367extern unsigned long nsecs_to_usecs(unsigned long nsecs); 397extern unsigned long nsecs_to_usecs(unsigned long nsecs);
368 398
369#ifdef CONFIG_TRACER_MAX_TRACE 399#ifdef CONFIG_TRACER_MAX_TRACE
@@ -413,7 +443,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
413 443
414extern int ring_buffer_expanded; 444extern int ring_buffer_expanded;
415extern bool tracing_selftest_disabled; 445extern bool tracing_selftest_disabled;
416DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); 446DECLARE_PER_CPU(int, ftrace_cpu_disabled);
417 447
418#ifdef CONFIG_FTRACE_STARTUP_TEST 448#ifdef CONFIG_FTRACE_STARTUP_TEST
419extern int trace_selftest_startup_function(struct tracer *trace, 449extern int trace_selftest_startup_function(struct tracer *trace,
@@ -438,6 +468,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
438 struct trace_array *tr); 468 struct trace_array *tr);
439extern int trace_selftest_startup_hw_branches(struct tracer *trace, 469extern int trace_selftest_startup_hw_branches(struct tracer *trace,
440 struct trace_array *tr); 470 struct trace_array *tr);
471extern int trace_selftest_startup_ksym(struct tracer *trace,
472 struct trace_array *tr);
441#endif /* CONFIG_FTRACE_STARTUP_TEST */ 473#endif /* CONFIG_FTRACE_STARTUP_TEST */
442 474
443extern void *head_page(struct trace_array_cpu *data); 475extern void *head_page(struct trace_array_cpu *data);
@@ -483,10 +515,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
483 return 0; 515 return 0;
484} 516}
485#else 517#else
486static inline int ftrace_trace_addr(unsigned long addr)
487{
488 return 1;
489}
490static inline int ftrace_graph_addr(unsigned long addr) 518static inline int ftrace_graph_addr(unsigned long addr)
491{ 519{
492 return 1; 520 return 1;
@@ -500,12 +528,12 @@ print_graph_function(struct trace_iterator *iter)
500} 528}
501#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 529#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
502 530
503extern struct pid *ftrace_pid_trace; 531extern struct list_head ftrace_pids;
504 532
505#ifdef CONFIG_FUNCTION_TRACER 533#ifdef CONFIG_FUNCTION_TRACER
506static inline int ftrace_trace_task(struct task_struct *task) 534static inline int ftrace_trace_task(struct task_struct *task)
507{ 535{
508 if (!ftrace_pid_trace) 536 if (list_empty(&ftrace_pids))
509 return 1; 537 return 1;
510 538
511 return test_tsk_trace_trace(task); 539 return test_tsk_trace_trace(task);
@@ -687,7 +715,6 @@ struct event_filter {
687 int n_preds; 715 int n_preds;
688 struct filter_pred **preds; 716 struct filter_pred **preds;
689 char *filter_string; 717 char *filter_string;
690 bool no_reset;
691}; 718};
692 719
693struct event_subsystem { 720struct event_subsystem {
@@ -699,22 +726,40 @@ struct event_subsystem {
699}; 726};
700 727
701struct filter_pred; 728struct filter_pred;
729struct regex;
702 730
703typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 731typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
704 int val1, int val2); 732 int val1, int val2);
705 733
734typedef int (*regex_match_func)(char *str, struct regex *r, int len);
735
736enum regex_type {
737 MATCH_FULL = 0,
738 MATCH_FRONT_ONLY,
739 MATCH_MIDDLE_ONLY,
740 MATCH_END_ONLY,
741};
742
743struct regex {
744 char pattern[MAX_FILTER_STR_VAL];
745 int len;
746 int field_len;
747 regex_match_func match;
748};
749
706struct filter_pred { 750struct filter_pred {
707 filter_pred_fn_t fn; 751 filter_pred_fn_t fn;
708 u64 val; 752 u64 val;
709 char str_val[MAX_FILTER_STR_VAL]; 753 struct regex regex;
710 int str_len; 754 char *field_name;
711 char *field_name; 755 int offset;
712 int offset; 756 int not;
713 int not; 757 int op;
714 int op; 758 int pop_n;
715 int pop_n;
716}; 759};
717 760
761extern enum regex_type
762filter_parse_regex(char *buff, int len, char **search, int *not);
718extern void print_event_filter(struct ftrace_event_call *call, 763extern void print_event_filter(struct ftrace_event_call *call,
719 struct trace_seq *s); 764 struct trace_seq *s);
720extern int apply_event_filter(struct ftrace_event_call *call, 765extern int apply_event_filter(struct ftrace_event_call *call,
@@ -730,7 +775,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
730 struct ring_buffer *buffer, 775 struct ring_buffer *buffer,
731 struct ring_buffer_event *event) 776 struct ring_buffer_event *event)
732{ 777{
733 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { 778 if (unlikely(call->filter_active) &&
779 !filter_match_preds(call->filter, rec)) {
734 ring_buffer_discard_commit(buffer, event); 780 ring_buffer_discard_commit(buffer, event);
735 return 1; 781 return 1;
736 } 782 }
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7a7a9fd249a9..4a194f08f88c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct ring_buffer_event *event; 35 struct ring_buffer_event *event;
36 struct trace_branch *entry; 36 struct trace_branch *entry;
37 struct ring_buffer *buffer;
37 unsigned long flags; 38 unsigned long flags;
38 int cpu, pc; 39 int cpu, pc;
39 const char *p; 40 const char *p;
@@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
54 goto out; 55 goto out;
55 56
56 pc = preempt_count(); 57 pc = preempt_count();
57 event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, 58 buffer = tr->buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
58 sizeof(*entry), flags, pc); 60 sizeof(*entry), flags, pc);
59 if (!event) 61 if (!event)
60 goto out; 62 goto out;
@@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
74 entry->line = f->line; 76 entry->line = f->line;
75 entry->correct = val == expect; 77 entry->correct = val == expect;
76 78
77 if (!filter_check_discard(call, entry, tr->buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
78 ring_buffer_unlock_commit(tr->buffer, event); 80 ring_buffer_unlock_commit(buffer, event);
79 81
80 out: 82 out:
81 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 20c5f92e28a8..84a3a7ba072a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -20,6 +20,8 @@
20#include <linux/ktime.h> 20#include <linux/ktime.h>
21#include <linux/trace_clock.h> 21#include <linux/trace_clock.h>
22 22
23#include "trace.h"
24
23/* 25/*
24 * trace_clock_local(): the simplest and least coherent tracing clock. 26 * trace_clock_local(): the simplest and least coherent tracing clock.
25 * 27 *
@@ -28,17 +30,17 @@
28 */ 30 */
29u64 notrace trace_clock_local(void) 31u64 notrace trace_clock_local(void)
30{ 32{
31 unsigned long flags;
32 u64 clock; 33 u64 clock;
34 int resched;
33 35
34 /* 36 /*
35 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
36 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
37 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
38 */ 40 */
39 raw_local_irq_save(flags); 41 resched = ftrace_preempt_disable();
40 clock = sched_clock(); 42 clock = sched_clock();
41 raw_local_irq_restore(flags); 43 ftrace_preempt_enable(resched);
42 44
43 return clock; 45 return clock;
44} 46}
@@ -69,10 +71,10 @@ u64 notrace trace_clock(void)
69/* keep prev_time and lock in the same cacheline. */ 71/* keep prev_time and lock in the same cacheline. */
70static struct { 72static struct {
71 u64 prev_time; 73 u64 prev_time;
72 raw_spinlock_t lock; 74 arch_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp = 75} trace_clock_struct ____cacheline_aligned_in_smp =
74 { 76 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, 77 .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
76 }; 78 };
77 79
78u64 notrace trace_clock_global(void) 80u64 notrace trace_clock_global(void)
@@ -92,7 +94,7 @@ u64 notrace trace_clock_global(void)
92 if (unlikely(in_nmi())) 94 if (unlikely(in_nmi()))
93 goto out; 95 goto out;
94 96
95 __raw_spin_lock(&trace_clock_struct.lock); 97 arch_spin_lock(&trace_clock_struct.lock);
96 98
97 /* 99 /*
98 * TODO: if this happens often then maybe we should reset 100 * TODO: if this happens often then maybe we should reset
@@ -104,7 +106,7 @@ u64 notrace trace_clock_global(void)
104 106
105 trace_clock_struct.prev_time = now; 107 trace_clock_struct.prev_time = now;
106 108
107 __raw_spin_unlock(&trace_clock_struct.lock); 109 arch_spin_unlock(&trace_clock_struct.lock);
108 110
109 out: 111 out:
110 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599d..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
364 F_printk("type:%u call_site:%lx ptr:%p", 364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr) 365 __entry->type_id, __entry->call_site, __entry->ptr)
366); 366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index dd44b8768867..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,17 +8,14 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "trace.h" 9#include "trace.h"
10 10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16 11
17char *trace_profile_buf; 12char *perf_trace_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf); 13EXPORT_SYMBOL_GPL(perf_trace_buf);
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
19 17
20char *trace_profile_buf_nmi; 18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22 19
23/* Count the events in use (per event id, not per instance) */ 20/* Count the events in use (per event id, not per instance) */
24static int total_profile_count; 21static int total_profile_count;
@@ -31,29 +28,34 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
31 if (atomic_inc_return(&event->profile_count)) 28 if (atomic_inc_return(&event->profile_count))
32 return 0; 29 return 0;
33 30
34 if (!total_profile_count++) { 31 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t); 32 buf = (char *)alloc_percpu(perf_trace_t);
36 if (!buf) 33 if (!buf)
37 goto fail_buf; 34 goto fail_buf;
38 35
39 rcu_assign_pointer(trace_profile_buf, buf); 36 rcu_assign_pointer(perf_trace_buf, buf);
40 37
41 buf = (char *)alloc_percpu(profile_buf_t); 38 buf = (char *)alloc_percpu(perf_trace_t);
42 if (!buf) 39 if (!buf)
43 goto fail_buf_nmi; 40 goto fail_buf_nmi;
44 41
45 rcu_assign_pointer(trace_profile_buf_nmi, buf); 42 rcu_assign_pointer(perf_trace_buf_nmi, buf);
46 } 43 }
47 44
48 ret = event->profile_enable(); 45 ret = event->profile_enable(event);
49 if (!ret) 46 if (!ret) {
47 total_profile_count++;
50 return 0; 48 return 0;
49 }
51 50
52 kfree(trace_profile_buf_nmi);
53fail_buf_nmi: 51fail_buf_nmi:
54 kfree(trace_profile_buf); 52 if (!total_profile_count) {
53 free_percpu(perf_trace_buf_nmi);
54 free_percpu(perf_trace_buf);
55 perf_trace_buf_nmi = NULL;
56 perf_trace_buf = NULL;
57 }
55fail_buf: 58fail_buf:
56 total_profile_count--;
57 atomic_dec(&event->profile_count); 59 atomic_dec(&event->profile_count);
58 60
59 return ret; 61 return ret;
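The reworked enable path allocates the shared per-CPU profile buffers only for the first user and bumps total_profile_count only after the event's own profile_enable() succeeded, so the error path can tell whether it owns the buffers it is about to free. The same bookkeeping idea in miniature (plain malloc stands in for alloc_percpu; names are made up):

#include <stdio.h>
#include <stdlib.h>

static char *shared_buf;
static int total_users;

static int user_enable(int will_fail)
{
        if (!total_users) {
                shared_buf = malloc(4096);
                if (!shared_buf)
                        return -1;
        }

        if (will_fail) {                /* the user's own setup failed */
                if (!total_users) {     /* we were the one who allocated */
                        free(shared_buf);
                        shared_buf = NULL;
                }
                return -1;
        }

        total_users++;                  /* count only after success */
        return 0;
}

static void user_disable(void)
{
        if (--total_users == 0) {
                free(shared_buf);
                shared_buf = NULL;
        }
}

int main(void)
{
        printf("enable #1: %d\n", user_enable(0));
        printf("enable #2 (fails): %d\n", user_enable(1));
        user_disable();
        printf("buffer freed: %s\n", shared_buf ? "no" : "yes");
        return 0;
}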
@@ -84,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
84 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (!atomic_add_negative(-1, &event->profile_count))
85 return; 87 return;
86 88
87 event->profile_disable(); 89 event->profile_disable(event);
88 90
89 if (!--total_profile_count) { 91 if (!--total_profile_count) {
90 buf = trace_profile_buf; 92 buf = perf_trace_buf;
91 rcu_assign_pointer(trace_profile_buf, NULL); 93 rcu_assign_pointer(perf_trace_buf, NULL);
92 94
93 nmi_buf = trace_profile_buf_nmi; 95 nmi_buf = perf_trace_buf_nmi;
94 rcu_assign_pointer(trace_profile_buf_nmi, NULL); 96 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
95 97
96 /* 98 /*
97 * Ensure every events in profiling have finished before 99 * Ensure every events in profiling have finished before
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6f03c8a1105e..1d18315dc836 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields); 94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 95
96#ifdef CONFIG_MODULES 96void trace_destroy_fields(struct ftrace_event_call *call)
97
98static void trace_destroy_fields(struct ftrace_event_call *call)
99{ 97{
100 struct ftrace_event_field *field, *next; 98 struct ftrace_event_field *field, *next;
101 99
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
107 } 105 }
108} 106}
109 107
110#endif /* CONFIG_MODULES */
111
112static void ftrace_event_enable_disable(struct ftrace_event_call *call, 108static void ftrace_event_enable_disable(struct ftrace_event_call *call,
113 int enable) 109 int enable)
114{ 110{
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
117 if (call->enabled) { 113 if (call->enabled) {
118 call->enabled = 0; 114 call->enabled = 0;
119 tracing_stop_cmdline_record(); 115 tracing_stop_cmdline_record();
120 call->unregfunc(call->data); 116 call->unregfunc(call);
121 } 117 }
122 break; 118 break;
123 case 1: 119 case 1:
124 if (!call->enabled) { 120 if (!call->enabled) {
125 call->enabled = 1; 121 call->enabled = 1;
126 tracing_start_cmdline_record(); 122 tracing_start_cmdline_record();
127 call->regfunc(call->data); 123 call->regfunc(call);
128 } 124 }
129 break; 125 break;
130 } 126 }
@@ -232,10 +228,9 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
232 size_t cnt, loff_t *ppos) 228 size_t cnt, loff_t *ppos)
233{ 229{
234 struct trace_parser parser; 230 struct trace_parser parser;
235 size_t read = 0; 231 ssize_t read, ret;
236 ssize_t ret;
237 232
238 if (!cnt || cnt < 0) 233 if (!cnt)
239 return 0; 234 return 0;
240 235
241 ret = tracing_update_buffers(); 236 ret = tracing_update_buffers();
@@ -247,7 +242,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
247 242
248 read = trace_get_user(&parser, ubuf, cnt, ppos); 243 read = trace_get_user(&parser, ubuf, cnt, ppos);
249 244
250 if (trace_parser_loaded((&parser))) { 245 if (read >= 0 && trace_parser_loaded((&parser))) {
251 int set = 1; 246 int set = 1;
252 247
253 if (*parser.buffer == '!') 248 if (*parser.buffer == '!')
@@ -508,7 +503,7 @@ extern char *__bad_type_size(void);
508#define FIELD(type, name) \ 503#define FIELD(type, name) \
509 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ 504 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
510 #type, "common_" #name, offsetof(typeof(field), name), \ 505 #type, "common_" #name, offsetof(typeof(field), name), \
511 sizeof(field.name) 506 sizeof(field.name), is_signed_type(type)
512 507
513static int trace_write_header(struct trace_seq *s) 508static int trace_write_header(struct trace_seq *s)
514{ 509{
@@ -516,17 +511,17 @@ static int trace_write_header(struct trace_seq *s)
516 511
517 /* struct trace_entry */ 512 /* struct trace_entry */
518 return trace_seq_printf(s, 513 return trace_seq_printf(s,
519 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 514 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
520 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 515 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
521 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 516 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
522 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 517 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
523 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
524 "\n", 519 "\n",
525 FIELD(unsigned short, type), 520 FIELD(unsigned short, type),
526 FIELD(unsigned char, flags), 521 FIELD(unsigned char, flags),
527 FIELD(unsigned char, preempt_count), 522 FIELD(unsigned char, preempt_count),
528 FIELD(int, pid), 523 FIELD(int, pid),
529 FIELD(int, lock_depth)); 524 FIELD(int, lock_depth));
530} 525}
531 526
532static ssize_t 527static ssize_t
@@ -879,9 +874,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
879 "'%s/filter' entry\n", name); 874 "'%s/filter' entry\n", name);
880 } 875 }
881 876
882 entry = trace_create_file("enable", 0644, system->entry, 877 trace_create_file("enable", 0644, system->entry,
883 (void *)system->name, 878 (void *)system->name,
884 &ftrace_system_enable_fops); 879 &ftrace_system_enable_fops);
885 880
886 return system->entry; 881 return system->entry;
887} 882}
@@ -893,7 +888,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
893 const struct file_operations *filter, 888 const struct file_operations *filter,
894 const struct file_operations *format) 889 const struct file_operations *format)
895{ 890{
896 struct dentry *entry;
897 int ret; 891 int ret;
898 892
899 /* 893 /*
@@ -911,12 +905,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
911 } 905 }
912 906
913 if (call->regfunc) 907 if (call->regfunc)
914 entry = trace_create_file("enable", 0644, call->dir, call, 908 trace_create_file("enable", 0644, call->dir, call,
915 enable); 909 enable);
916 910
917 if (call->id && call->profile_enable) 911 if (call->id && call->profile_enable)
918 entry = trace_create_file("id", 0444, call->dir, call, 912 trace_create_file("id", 0444, call->dir, call,
919 id); 913 id);
920 914
921 if (call->define_fields) { 915 if (call->define_fields) {
922 ret = call->define_fields(call); 916 ret = call->define_fields(call);
@@ -925,41 +919,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
925 " events/%s\n", call->name); 919 " events/%s\n", call->name);
926 return ret; 920 return ret;
927 } 921 }
928 entry = trace_create_file("filter", 0644, call->dir, call, 922 trace_create_file("filter", 0644, call->dir, call,
929 filter); 923 filter);
930 } 924 }
931 925
932 /* A trace may not want to export its format */ 926 /* A trace may not want to export its format */
933 if (!call->show_format) 927 if (!call->show_format)
934 return 0; 928 return 0;
935 929
936 entry = trace_create_file("format", 0444, call->dir, call, 930 trace_create_file("format", 0444, call->dir, call,
937 format); 931 format);
938 932
939 return 0; 933 return 0;
940} 934}
941 935
942#define for_each_event(event, start, end) \ 936static int __trace_add_event_call(struct ftrace_event_call *call)
943 for (event = start; \ 937{
944 (unsigned long)event < (unsigned long)end; \ 938 struct dentry *d_events;
945 event++) 939 int ret;
946 940
947#ifdef CONFIG_MODULES 941 if (!call->name)
942 return -EINVAL;
948 943
949static LIST_HEAD(ftrace_module_file_list); 944 if (call->raw_init) {
945 ret = call->raw_init(call);
946 if (ret < 0) {
947 if (ret != -ENOSYS)
948 pr_warning("Could not initialize trace "
949 "events/%s\n", call->name);
950 return ret;
951 }
952 }
950 953
951/* 954 d_events = event_trace_events_dir();
952 * Modules must own their file_operations to keep up with 955 if (!d_events)
953 * reference counting. 956 return -ENOENT;
954 */ 957
955struct ftrace_module_file_ops { 958 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
956 struct list_head list; 959 &ftrace_enable_fops, &ftrace_event_filter_fops,
957 struct module *mod; 960 &ftrace_event_format_fops);
958 struct file_operations id; 961 if (!ret)
959 struct file_operations enable; 962 list_add(&call->list, &ftrace_events);
960 struct file_operations format; 963
961 struct file_operations filter; 964 return ret;
962}; 965}
966
967/* Add an additional event_call dynamically */
968int trace_add_event_call(struct ftrace_event_call *call)
969{
970 int ret;
971 mutex_lock(&event_mutex);
972 ret = __trace_add_event_call(call);
973 mutex_unlock(&event_mutex);
974 return ret;
975}
963 976
964static void remove_subsystem_dir(const char *name) 977static void remove_subsystem_dir(const char *name)
965{ 978{
@@ -987,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
987 } 1000 }
988} 1001}
989 1002
1003/*
1004 * Must be called under locking both of event_mutex and trace_event_mutex.
1005 */
1006static void __trace_remove_event_call(struct ftrace_event_call *call)
1007{
1008 ftrace_event_enable_disable(call, 0);
1009 if (call->event)
1010 __unregister_ftrace_event(call->event);
1011 debugfs_remove_recursive(call->dir);
1012 list_del(&call->list);
1013 trace_destroy_fields(call);
1014 destroy_preds(call);
1015 remove_subsystem_dir(call->system);
1016}
1017
1018/* Remove an event_call */
1019void trace_remove_event_call(struct ftrace_event_call *call)
1020{
1021 mutex_lock(&event_mutex);
1022 down_write(&trace_event_mutex);
1023 __trace_remove_event_call(call);
1024 up_write(&trace_event_mutex);
1025 mutex_unlock(&event_mutex);
1026}
1027
1028#define for_each_event(event, start, end) \
1029 for (event = start; \
1030 (unsigned long)event < (unsigned long)end; \
1031 event++)
1032
1033#ifdef CONFIG_MODULES
1034
1035static LIST_HEAD(ftrace_module_file_list);
1036
1037/*
1038 * Modules must own their file_operations to keep up with
1039 * reference counting.
1040 */
1041struct ftrace_module_file_ops {
1042 struct list_head list;
1043 struct module *mod;
1044 struct file_operations id;
1045 struct file_operations enable;
1046 struct file_operations format;
1047 struct file_operations filter;
1048};
1049
990static struct ftrace_module_file_ops * 1050static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod) 1051trace_create_file_ops(struct module *mod)
992{ 1052{
@@ -1044,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
1044 if (!call->name) 1104 if (!call->name)
1045 continue; 1105 continue;
1046 if (call->raw_init) { 1106 if (call->raw_init) {
1047 ret = call->raw_init(); 1107 ret = call->raw_init(call);
1048 if (ret < 0) { 1108 if (ret < 0) {
1049 if (ret != -ENOSYS) 1109 if (ret != -ENOSYS)
1050 pr_warning("Could not initialize trace " 1110 pr_warning("Could not initialize trace "
@@ -1062,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
1062 return; 1122 return;
1063 } 1123 }
1064 call->mod = mod; 1124 call->mod = mod;
1065 list_add(&call->list, &ftrace_events); 1125 ret = event_create_dir(call, d_events,
1066 event_create_dir(call, d_events, 1126 &file_ops->id, &file_ops->enable,
1067 &file_ops->id, &file_ops->enable, 1127 &file_ops->filter, &file_ops->format);
1068 &file_ops->filter, &file_ops->format); 1128 if (!ret)
1129 list_add(&call->list, &ftrace_events);
1069 } 1130 }
1070} 1131}
1071 1132
@@ -1079,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
1079 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1140 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1080 if (call->mod == mod) { 1141 if (call->mod == mod) {
1081 found = true; 1142 found = true;
1082 ftrace_event_enable_disable(call, 0); 1143 __trace_remove_event_call(call);
1083 if (call->event)
1084 __unregister_ftrace_event(call->event);
1085 debugfs_remove_recursive(call->dir);
1086 list_del(&call->list);
1087 trace_destroy_fields(call);
1088 destroy_preds(call);
1089 remove_subsystem_dir(call->system);
1090 } 1144 }
1091 } 1145 }
1092 1146
@@ -1204,7 +1258,7 @@ static __init int event_trace_init(void)
1204 if (!call->name) 1258 if (!call->name)
1205 continue; 1259 continue;
1206 if (call->raw_init) { 1260 if (call->raw_init) {
1207 ret = call->raw_init(); 1261 ret = call->raw_init(call);
1208 if (ret < 0) { 1262 if (ret < 0) {
1209 if (ret != -ENOSYS) 1263 if (ret != -ENOSYS)
1210 pr_warning("Could not initialize trace " 1264 pr_warning("Could not initialize trace "
@@ -1212,10 +1266,12 @@ static __init int event_trace_init(void)
1212 continue; 1266 continue;
1213 } 1267 }
1214 } 1268 }
1215 list_add(&call->list, &ftrace_events); 1269 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1216 event_create_dir(call, d_events, &ftrace_event_id_fops, 1270 &ftrace_enable_fops,
1217 &ftrace_enable_fops, &ftrace_event_filter_fops, 1271 &ftrace_event_filter_fops,
1218 &ftrace_event_format_fops); 1272 &ftrace_event_format_fops);
1273 if (!ret)
1274 list_add(&call->list, &ftrace_events);
1219 } 1275 }
1220 1276
1221 while (true) { 1277 while (true) {
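A recurring theme in this file's changes: an event is added to the global ftrace_events list only after event_create_dir() reported success, and removal undoes the steps via __trace_remove_event_call(). A small sketch of that publish-only-after-success ordering (illustrative list and names, not the kernel structures):

#include <stdio.h>
#include <string.h>

struct event_sketch {
        const char *name;
        struct event_sketch *next;      /* simple singly linked list */
        int dir_created;
};

static struct event_sketch *events;

static int create_dir(struct event_sketch *ev)
{
        if (!strcmp(ev->name, "broken"))
                return -1;              /* simulate a failed init */
        ev->dir_created = 1;
        return 0;
}

static int add_event(struct event_sketch *ev)
{
        int ret = create_dir(ev);

        if (ret)
                return ret;             /* never published: nothing to undo */
        ev->next = events;              /* publish only after success */
        events = ev;
        return 0;
}

int main(void)
{
        static struct event_sketch good = { .name = "sched_switch" };
        static struct event_sketch bad  = { .name = "broken" };

        printf("add good: %d\n", add_event(&good));
        printf("add bad:  %d\n", add_event(&bad));
        for (struct event_sketch *e = events; e; e = e->next)
                printf("registered: %s\n", e->name);
        return 0;
}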
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 23245785927f..50504cb228de 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,10 @@
18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> 18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
19 */ 19 */
20 20
21#include <linux/debugfs.h>
22#include <linux/uaccess.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/ctype.h> 22#include <linux/ctype.h>
25#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h>
26 25
27#include "trace.h" 26#include "trace.h"
28#include "trace_output.h" 27#include "trace_output.h"
@@ -31,6 +30,7 @@ enum filter_op_ids
31{ 30{
32 OP_OR, 31 OP_OR,
33 OP_AND, 32 OP_AND,
33 OP_GLOB,
34 OP_NE, 34 OP_NE,
35 OP_EQ, 35 OP_EQ,
36 OP_LT, 36 OP_LT,
@@ -48,16 +48,17 @@ struct filter_op {
48}; 48};
49 49
50static struct filter_op filter_ops[] = { 50static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 }, 51 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 }, 52 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 }, 53 { OP_GLOB, "~", 4 },
54 { OP_EQ, "==", 4 }, 54 { OP_NE, "!=", 4 },
55 { OP_LT, "<", 5 }, 55 { OP_EQ, "==", 4 },
56 { OP_LE, "<=", 5 }, 56 { OP_LT, "<", 5 },
57 { OP_GT, ">", 5 }, 57 { OP_LE, "<=", 5 },
58 { OP_GE, ">=", 5 }, 58 { OP_GT, ">", 5 },
59 { OP_NONE, "OP_NONE", 0 }, 59 { OP_GE, ">=", 5 },
60 { OP_OPEN_PAREN, "(", 0 }, 60 { OP_NONE, "OP_NONE", 0 },
61 { OP_OPEN_PAREN, "(", 0 },
61}; 62};
62 63
63enum { 64enum {
@@ -197,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
197 char *addr = (char *)(event + pred->offset); 198 char *addr = (char *)(event + pred->offset);
198 int cmp, match; 199 int cmp, match;
199 200
200 cmp = strncmp(addr, pred->str_val, pred->str_len); 201 cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
201 202
202 match = (!cmp) ^ pred->not; 203 match = cmp ^ pred->not;
203 204
204 return match; 205 return match;
205} 206}
@@ -211,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
211 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
212 int cmp, match; 213 int cmp, match;
213 214
214 cmp = strncmp(*addr, pred->str_val, pred->str_len); 215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len);
215 216
216 match = (!cmp) ^ pred->not; 217 match = cmp ^ pred->not;
217 218
218 return match; 219 return match;
219} 220}
@@ -237,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
237 char *addr = (char *)(event + str_loc); 238 char *addr = (char *)(event + str_loc);
238 int cmp, match; 239 int cmp, match;
239 240
240 cmp = strncmp(addr, pred->str_val, str_len); 241 cmp = pred->regex.match(addr, &pred->regex, str_len);
241 242
242 match = (!cmp) ^ pred->not; 243 match = cmp ^ pred->not;
243 244
244 return match; 245 return match;
245} 246}
@@ -250,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
250 return 0; 251 return 0;
251} 252}
252 253
254/* Basic regex callbacks */
255static int regex_match_full(char *str, struct regex *r, int len)
256{
257 if (strncmp(str, r->pattern, len) == 0)
258 return 1;
259 return 0;
260}
261
262static int regex_match_front(char *str, struct regex *r, int len)
263{
264 if (strncmp(str, r->pattern, len) == 0)
265 return 1;
266 return 0;
267}
268
269static int regex_match_middle(char *str, struct regex *r, int len)
270{
271 if (strstr(str, r->pattern))
272 return 1;
273 return 0;
274}
275
276static int regex_match_end(char *str, struct regex *r, int len)
277{
278 char *ptr = strstr(str, r->pattern);
279
280 if (ptr && (ptr[r->len] == 0))
281 return 1;
282 return 0;
283}
284
285/**
286 * filter_parse_regex - parse a basic regex
287 * @buff: the raw regex
288 * @len: length of the regex
289 * @search: will point to the beginning of the string to compare
290 * @not: tell whether the match will have to be inverted
291 *
292 * This passes in a buffer containing a regex and this function will
293 * set search to point to the search part of the buffer and
294 * return the type of search it is (see enum above).
295 * This does modify buff.
296 *
297 * Returns enum type.
298 * search returns the pointer to use for comparison.
299 * not returns 1 if buff started with a '!'
300 * 0 otherwise.
301 */
302enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
303{
304 int type = MATCH_FULL;
305 int i;
306
307 if (buff[0] == '!') {
308 *not = 1;
309 buff++;
310 len--;
311 } else
312 *not = 0;
313
314 *search = buff;
315
316 for (i = 0; i < len; i++) {
317 if (buff[i] == '*') {
318 if (!i) {
319 *search = buff + 1;
320 type = MATCH_END_ONLY;
321 } else {
322 if (type == MATCH_END_ONLY)
323 type = MATCH_MIDDLE_ONLY;
324 else
325 type = MATCH_FRONT_ONLY;
326 buff[i] = 0;
327 break;
328 }
329 }
330 }
331
332 return type;
333}
334
335static void filter_build_regex(struct filter_pred *pred)
336{
337 struct regex *r = &pred->regex;
338 char *search;
339 enum regex_type type = MATCH_FULL;
340 int not = 0;
341
342 if (pred->op == OP_GLOB) {
343 type = filter_parse_regex(r->pattern, r->len, &search, &not);
344 r->len = strlen(search);
345 memmove(r->pattern, search, r->len+1);
346 }
347
348 switch (type) {
349 case MATCH_FULL:
350 r->match = regex_match_full;
351 break;
352 case MATCH_FRONT_ONLY:
353 r->match = regex_match_front;
354 break;
355 case MATCH_MIDDLE_ONLY:
356 r->match = regex_match_middle;
357 break;
358 case MATCH_END_ONLY:
359 r->match = regex_match_end;
360 break;
361 }
362
363 pred->not ^= not;
364}
365
253/* return 1 if event matches, 0 otherwise (discard) */ 366/* return 1 if event matches, 0 otherwise (discard) */
254int filter_match_preds(struct ftrace_event_call *call, void *rec) 367int filter_match_preds(struct event_filter *filter, void *rec)
255{ 368{
256 struct event_filter *filter = call->filter;
257 int match, top = 0, val1 = 0, val2 = 0; 369 int match, top = 0, val1 = 0, val2 = 0;
258 int stack[MAX_FILTER_PRED]; 370 int stack[MAX_FILTER_PRED];
259 struct filter_pred *pred; 371 struct filter_pred *pred;
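The new filter code classifies a glob by where the '*' sits (MATCH_FULL, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY) and dispatches through regex.match(). A compact user-space rendition of that classification and matching; the '!' negation handling is left out and the names are not the kernel's:

#include <stdio.h>
#include <string.h>

enum match_type { FULL, FRONT_ONLY, MIDDLE_ONLY, END_ONLY };

static enum match_type classify(char *pat, char **search)
{
        size_t len = strlen(pat);

        *search = pat;
        if (len && pat[0] == '*') {             /* "*foo": ends with foo */
                *search = pat + 1;
                if (len > 1 && pat[len - 1] == '*') {   /* "*foo*" */
                        pat[len - 1] = '\0';
                        return MIDDLE_ONLY;
                }
                return END_ONLY;
        }
        if (len && pat[len - 1] == '*') {       /* "foo*": starts with foo */
                pat[len - 1] = '\0';
                return FRONT_ONLY;
        }
        return FULL;
}

static int matches(const char *str, char *pat)
{
        char *s;
        enum match_type t = classify(pat, &s);
        size_t n = strlen(s);

        switch (t) {
        case FULL:        return strcmp(str, s) == 0;
        case FRONT_ONLY:  return strncmp(str, s, n) == 0;
        case MIDDLE_ONLY: return strstr(str, s) != NULL;
        case END_ONLY: {
                size_t l = strlen(str);
                return l >= n && strcmp(str + l - n, s) == 0;
        }
        }
        return 0;
}

int main(void)
{
        char p1[] = "sched*", p2[] = "*wakeup", p3[] = "*switch*";

        printf("%d %d %d\n",
               matches("sched_switch", p1),     /* 1: front match  */
               matches("sched_wakeup", p2),     /* 1: end match    */
               matches("sched_switch", p3));    /* 1: middle match */
        return 0;
}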
@@ -396,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred)
396{ 508{
397 kfree(pred->field_name); 509 kfree(pred->field_name);
398 pred->field_name = NULL; 510 pred->field_name = NULL;
399 pred->str_len = 0; 511 pred->regex.len = 0;
400} 512}
401 513
402static int filter_set_pred(struct filter_pred *dest, 514static int filter_set_pred(struct filter_pred *dest,
@@ -426,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
426 filter->preds[i]->fn = filter_pred_none; 538 filter->preds[i]->fn = filter_pred_none;
427} 539}
428 540
429void destroy_preds(struct ftrace_event_call *call) 541static void __free_preds(struct event_filter *filter)
430{ 542{
431 struct event_filter *filter = call->filter;
432 int i; 543 int i;
433 544
434 if (!filter) 545 if (!filter)
@@ -441,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
441 kfree(filter->preds); 552 kfree(filter->preds);
442 kfree(filter->filter_string); 553 kfree(filter->filter_string);
443 kfree(filter); 554 kfree(filter);
555}
556
557void destroy_preds(struct ftrace_event_call *call)
558{
559 __free_preds(call->filter);
444 call->filter = NULL; 560 call->filter = NULL;
561 call->filter_active = 0;
445} 562}
446 563
447static int init_preds(struct ftrace_event_call *call) 564static struct event_filter *__alloc_preds(void)
448{ 565{
449 struct event_filter *filter; 566 struct event_filter *filter;
450 struct filter_pred *pred; 567 struct filter_pred *pred;
451 int i; 568 int i;
452 569
453 if (call->filter) 570 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
454 return 0; 571 if (!filter)
455 572 return ERR_PTR(-ENOMEM);
456 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
457 if (!call->filter)
458 return -ENOMEM;
459 573
460 filter->n_preds = 0; 574 filter->n_preds = 0;
461 575
@@ -471,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
471 filter->preds[i] = pred; 585 filter->preds[i] = pred;
472 } 586 }
473 587
474 return 0; 588 return filter;
475 589
476oom: 590oom:
477 destroy_preds(call); 591 __free_preds(filter);
592 return ERR_PTR(-ENOMEM);
593}
594
595static int init_preds(struct ftrace_event_call *call)
596{
597 if (call->filter)
598 return 0;
599
600 call->filter_active = 0;
601 call->filter = __alloc_preds();
602 if (IS_ERR(call->filter))
603 return PTR_ERR(call->filter);
478 604
479 return -ENOMEM; 605 return 0;
480} 606}
481 607
482static int init_subsystem_preds(struct event_subsystem *system) 608static int init_subsystem_preds(struct event_subsystem *system)
@@ -499,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
499 return 0; 625 return 0;
500} 626}
501 627
502enum { 628static void filter_free_subsystem_preds(struct event_subsystem *system)
503 FILTER_DISABLE_ALL,
504 FILTER_INIT_NO_RESET,
505 FILTER_SKIP_NO_RESET,
506};
507
508static void filter_free_subsystem_preds(struct event_subsystem *system,
509 int flag)
510{ 629{
511 struct ftrace_event_call *call; 630 struct ftrace_event_call *call;
512 631
@@ -517,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
517 if (strcmp(call->system, system->name) != 0) 636 if (strcmp(call->system, system->name) != 0)
518 continue; 637 continue;
519 638
520 if (flag == FILTER_INIT_NO_RESET) {
521 call->filter->no_reset = false;
522 continue;
523 }
524
525 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
526 continue;
527
528 filter_disable_preds(call); 639 filter_disable_preds(call);
529 remove_filter_string(call->filter); 640 remove_filter_string(call->filter);
530 } 641 }
@@ -532,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
532 643
533static int filter_add_pred_fn(struct filter_parse_state *ps, 644static int filter_add_pred_fn(struct filter_parse_state *ps,
534 struct ftrace_event_call *call, 645 struct ftrace_event_call *call,
646 struct event_filter *filter,
535 struct filter_pred *pred, 647 struct filter_pred *pred,
536 filter_pred_fn_t fn) 648 filter_pred_fn_t fn)
537{ 649{
538 struct event_filter *filter = call->filter;
539 int idx, err; 650 int idx, err;
540 651
541 if (filter->n_preds == MAX_FILTER_PRED) { 652 if (filter->n_preds == MAX_FILTER_PRED) {
@@ -550,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
550 return err; 661 return err;
551 662
552 filter->n_preds++; 663 filter->n_preds++;
553 call->filter_active = 1;
554 664
555 return 0; 665 return 0;
556} 666}
@@ -575,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field)
575 685
576static int is_legal_op(struct ftrace_event_field *field, int op) 686static int is_legal_op(struct ftrace_event_field *field, int op)
577{ 687{
578 if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) 688 if (is_string_field(field) &&
689 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
690 return 0;
691 if (!is_string_field(field) && op == OP_GLOB)
579 return 0; 692 return 0;
580 693
581 return 1; 694 return 1;
@@ -626,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
626 739
627static int filter_add_pred(struct filter_parse_state *ps, 740static int filter_add_pred(struct filter_parse_state *ps,
628 struct ftrace_event_call *call, 741 struct ftrace_event_call *call,
742 struct event_filter *filter,
629 struct filter_pred *pred, 743 struct filter_pred *pred,
630 bool dry_run) 744 bool dry_run)
631{ 745{
@@ -660,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps,
660 } 774 }
661 775
662 if (is_string_field(field)) { 776 if (is_string_field(field)) {
663 pred->str_len = field->size; 777 filter_build_regex(pred);
664 778
665 if (field->filter_type == FILTER_STATIC_STRING) 779 if (field->filter_type == FILTER_STATIC_STRING) {
666 fn = filter_pred_string; 780 fn = filter_pred_string;
667 else if (field->filter_type == FILTER_DYN_STRING) 781 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING)
668 fn = filter_pred_strloc; 783 fn = filter_pred_strloc;
669 else { 784 else {
670 fn = filter_pred_pchar; 785 fn = filter_pred_pchar;
671 pred->str_len = strlen(pred->str_val); 786 pred->regex.field_len = strlen(pred->regex.pattern);
672 } 787 }
673 } else { 788 } else {
674 if (field->is_signed) 789 if (field->is_signed)
675 ret = strict_strtoll(pred->str_val, 0, &val); 790 ret = strict_strtoll(pred->regex.pattern, 0, &val);
676 else 791 else
677 ret = strict_strtoull(pred->str_val, 0, &val); 792 ret = strict_strtoull(pred->regex.pattern, 0, &val);
678 if (ret) { 793 if (ret) {
679 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 794 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
680 return -EINVAL; 795 return -EINVAL;
@@ -694,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
694 809
695add_pred_fn: 810add_pred_fn:
696 if (!dry_run) 811 if (!dry_run)
697 return filter_add_pred_fn(ps, call, pred, fn); 812 return filter_add_pred_fn(ps, call, filter, pred, fn);
698 return 0;
699}
700
701static int filter_add_subsystem_pred(struct filter_parse_state *ps,
702 struct event_subsystem *system,
703 struct filter_pred *pred,
704 char *filter_string,
705 bool dry_run)
706{
707 struct ftrace_event_call *call;
708 int err = 0;
709 bool fail = true;
710
711 list_for_each_entry(call, &ftrace_events, list) {
712
713 if (!call->define_fields)
714 continue;
715
716 if (strcmp(call->system, system->name))
717 continue;
718
719 if (call->filter->no_reset)
720 continue;
721
722 err = filter_add_pred(ps, call, pred, dry_run);
723 if (err)
724 call->filter->no_reset = true;
725 else
726 fail = false;
727
728 if (!dry_run)
729 replace_filter_string(call->filter, filter_string);
730 }
731
732 if (fail) {
733 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
734 return err;
735 }
736 return 0; 813 return 0;
737} 814}
738 815
@@ -933,8 +1010,9 @@ static void postfix_clear(struct filter_parse_state *ps)
933 1010
934 while (!list_empty(&ps->postfix)) { 1011 while (!list_empty(&ps->postfix)) {
935 elt = list_first_entry(&ps->postfix, struct postfix_elt, list); 1012 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
936 kfree(elt->operand);
937 list_del(&elt->list); 1013 list_del(&elt->list);
1014 kfree(elt->operand);
1015 kfree(elt);
938 } 1016 }
939} 1017}
940 1018
@@ -1044,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
1044 return NULL; 1122 return NULL;
1045 } 1123 }
1046 1124
1047 strcpy(pred->str_val, operand2); 1125 strcpy(pred->regex.pattern, operand2);
1048 pred->str_len = strlen(operand2); 1126 pred->regex.len = strlen(pred->regex.pattern);
1049 1127
1050 pred->op = op; 1128 pred->op = op;
1051 1129
@@ -1089,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps)
1089 return 0; 1167 return 0;
1090} 1168}
1091 1169
1092static int replace_preds(struct event_subsystem *system, 1170static int replace_preds(struct ftrace_event_call *call,
1093 struct ftrace_event_call *call, 1171 struct event_filter *filter,
1094 struct filter_parse_state *ps, 1172 struct filter_parse_state *ps,
1095 char *filter_string, 1173 char *filter_string,
1096 bool dry_run) 1174 bool dry_run)
@@ -1137,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system,
1137add_pred: 1215add_pred:
1138 if (!pred) 1216 if (!pred)
1139 return -ENOMEM; 1217 return -ENOMEM;
1140 if (call) 1218 err = filter_add_pred(ps, call, filter, pred, dry_run);
1141 err = filter_add_pred(ps, call, pred, false);
1142 else
1143 err = filter_add_subsystem_pred(ps, system, pred,
1144 filter_string, dry_run);
1145 filter_free_pred(pred); 1219 filter_free_pred(pred);
1146 if (err) 1220 if (err)
1147 return err; 1221 return err;
@@ -1152,10 +1226,50 @@ add_pred:
1152 return 0; 1226 return 0;
1153} 1227}
1154 1228
1155int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1229static int replace_system_preds(struct event_subsystem *system,
1230 struct filter_parse_state *ps,
1231 char *filter_string)
1156{ 1232{
1233 struct ftrace_event_call *call;
1234 bool fail = true;
1157 int err; 1235 int err;
1158 1236
1237 list_for_each_entry(call, &ftrace_events, list) {
1238 struct event_filter *filter = call->filter;
1239
1240 if (!call->define_fields)
1241 continue;
1242
1243 if (strcmp(call->system, system->name) != 0)
1244 continue;
1245
1246 /* try to see if the filter can be applied */
1247 err = replace_preds(call, filter, ps, filter_string, true);
1248 if (err)
1249 continue;
1250
1251 /* really apply the filter */
1252 filter_disable_preds(call);
1253 err = replace_preds(call, filter, ps, filter_string, false);
1254 if (err)
1255 filter_disable_preds(call);
1256 else {
1257 call->filter_active = 1;
1258 replace_filter_string(filter, filter_string);
1259 }
1260 fail = false;
1261 }
1262
1263 if (fail) {
1264 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1265 return -EINVAL;
1266 }
1267 return 0;
1268}
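/*
 * The function above applies a subsystem filter in two passes per event:
 * a dry run (dry_run == true) that only checks whether the filter string
 * is valid for that event's fields, then a real pass that installs the
 * predicates, sets filter_active and records the filter string.  Events
 * that cannot take the filter are skipped; -EINVAL is returned only if no
 * event in the subsystem accepted it.
 */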
1269
1270int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1271{
1272 int err;
1159 struct filter_parse_state *ps; 1273 struct filter_parse_state *ps;
1160 1274
1161 mutex_lock(&event_mutex); 1275 mutex_lock(&event_mutex);
@@ -1167,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1167 if (!strcmp(strstrip(filter_string), "0")) { 1281 if (!strcmp(strstrip(filter_string), "0")) {
1168 filter_disable_preds(call); 1282 filter_disable_preds(call);
1169 remove_filter_string(call->filter); 1283 remove_filter_string(call->filter);
1170 mutex_unlock(&event_mutex); 1284 goto out_unlock;
1171 return 0;
1172 } 1285 }
1173 1286
1174 err = -ENOMEM; 1287 err = -ENOMEM;
@@ -1186,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1186 goto out; 1299 goto out;
1187 } 1300 }
1188 1301
1189 err = replace_preds(NULL, call, ps, filter_string, false); 1302 err = replace_preds(call, call->filter, ps, filter_string, false);
1190 if (err) 1303 if (err)
1191 append_filter_err(ps, call->filter); 1304 append_filter_err(ps, call->filter);
1192 1305 else
1306 call->filter_active = 1;
1193out: 1307out:
1194 filter_opstack_clear(ps); 1308 filter_opstack_clear(ps);
1195 postfix_clear(ps); 1309 postfix_clear(ps);
@@ -1204,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1204 char *filter_string) 1318 char *filter_string)
1205{ 1319{
1206 int err; 1320 int err;
1207
1208 struct filter_parse_state *ps; 1321 struct filter_parse_state *ps;
1209 1322
1210 mutex_lock(&event_mutex); 1323 mutex_lock(&event_mutex);
@@ -1214,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1214 goto out_unlock; 1327 goto out_unlock;
1215 1328
1216 if (!strcmp(strstrip(filter_string), "0")) { 1329 if (!strcmp(strstrip(filter_string), "0")) {
1217 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); 1330 filter_free_subsystem_preds(system);
1218 remove_filter_string(system->filter); 1331 remove_filter_string(system->filter);
1219 mutex_unlock(&event_mutex); 1332 goto out_unlock;
1220 return 0;
1221 } 1333 }
1222 1334
1223 err = -ENOMEM; 1335 err = -ENOMEM;
@@ -1234,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1234 goto out; 1346 goto out;
1235 } 1347 }
1236 1348
1237 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); 1349 err = replace_system_preds(system, ps, filter_string);
1238 1350 if (err)
1239 /* try to see the filter can be applied to which events */
1240 err = replace_preds(system, NULL, ps, filter_string, true);
1241 if (err) {
1242 append_filter_err(ps, system->filter); 1351 append_filter_err(ps, system->filter);
1243 goto out; 1352
1353out:
1354 filter_opstack_clear(ps);
1355 postfix_clear(ps);
1356 kfree(ps);
1357out_unlock:
1358 mutex_unlock(&event_mutex);
1359
1360 return err;
1361}
1362
1363#ifdef CONFIG_EVENT_PROFILE
1364
1365void ftrace_profile_free_filter(struct perf_event *event)
1366{
1367 struct event_filter *filter = event->filter;
1368
1369 event->filter = NULL;
1370 __free_preds(filter);
1371}
1372
1373int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1374 char *filter_str)
1375{
1376 int err;
1377 struct event_filter *filter;
1378 struct filter_parse_state *ps;
1379 struct ftrace_event_call *call = NULL;
1380
1381 mutex_lock(&event_mutex);
1382
1383 list_for_each_entry(call, &ftrace_events, list) {
1384 if (call->id == event_id)
1385 break;
1244 } 1386 }
1245 1387
1246 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); 1388 err = -EINVAL;
1389 if (!call)
1390 goto out_unlock;
1247 1391
1248 /* really apply the filter to the events */ 1392 err = -EEXIST;
1249 err = replace_preds(system, NULL, ps, filter_string, false); 1393 if (event->filter)
1250 if (err) { 1394 goto out_unlock;
1251 append_filter_err(ps, system->filter); 1395
1252 filter_free_subsystem_preds(system, 2); 1396 filter = __alloc_preds();
1397 if (IS_ERR(filter)) {
1398 err = PTR_ERR(filter);
1399 goto out_unlock;
1253 } 1400 }
1254 1401
1255out: 1402 err = -ENOMEM;
1403 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1404 if (!ps)
1405 goto free_preds;
1406
1407 parse_init(ps, filter_ops, filter_str);
1408 err = filter_parse(ps);
1409 if (err)
1410 goto free_ps;
1411
1412 err = replace_preds(call, filter, ps, filter_str, false);
1413 if (!err)
1414 event->filter = filter;
1415
1416free_ps:
1256 filter_opstack_clear(ps); 1417 filter_opstack_clear(ps);
1257 postfix_clear(ps); 1418 postfix_clear(ps);
1258 kfree(ps); 1419 kfree(ps);
1420
1421free_preds:
1422 if (err)
1423 __free_preds(filter);
1424
1259out_unlock: 1425out_unlock:
1260 mutex_unlock(&event_mutex); 1426 mutex_unlock(&event_mutex);
1261 1427
1262 return err; 1428 return err;
1263} 1429}
1264 1430
1431#endif /* CONFIG_EVENT_PROFILE */
1432
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 9753fcc61bc5..dff8c84ddf17 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -48,11 +48,11 @@
48struct ____ftrace_##name { \ 48struct ____ftrace_##name { \
49 tstruct \ 49 tstruct \
50}; \ 50}; \
51static void __used ____ftrace_check_##name(void) \ 51static void __always_unused ____ftrace_check_##name(void) \
52{ \ 52{ \
53 struct ____ftrace_##name *__entry = NULL; \ 53 struct ____ftrace_##name *__entry = NULL; \
54 \ 54 \
55 /* force cmpile-time check on F_printk() */ \ 55 /* force compile-time check on F_printk() */ \
56 printk(print); \ 56 printk(print); \
57} 57}
58 58
@@ -66,44 +66,47 @@ static void __used ____ftrace_check_##name(void) \
66#undef __field 66#undef __field
67#define __field(type, item) \ 67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\n", \ 69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
71 sizeof(field.item)); \ 71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \ 72 if (!ret) \
73 return 0; 73 return 0;
74 74
75#undef __field_desc 75#undef __field_desc
76#define __field_desc(type, container, item) \ 76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \ 78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \ 79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \ 80 sizeof(field.container.item), \
81 is_signed_type(type)); \
81 if (!ret) \ 82 if (!ret) \
82 return 0; 83 return 0;
83 84
84#undef __array 85#undef __array
85#define __array(type, item, len) \ 86#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \ 88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
88 offsetof(typeof(field), item), \ 89 offsetof(typeof(field), item), \
89 sizeof(field.item)); \ 90 sizeof(field.item), is_signed_type(type)); \
90 if (!ret) \ 91 if (!ret) \
91 return 0; 92 return 0;
92 93
93#undef __array_desc 94#undef __array_desc
94#define __array_desc(type, container, item, len) \ 95#define __array_desc(type, container, item, len) \
95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
96 "offset:%zu;\tsize:%zu;\n", \ 97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
97 offsetof(typeof(field), container.item), \ 98 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \ 99 sizeof(field.container.item), \
100 is_signed_type(type)); \
99 if (!ret) \ 101 if (!ret) \
100 return 0; 102 return 0;
101 103
102#undef __dynamic_array 104#undef __dynamic_array
103#define __dynamic_array(type, item) \ 105#define __dynamic_array(type, item) \
104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \ 107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
106 offsetof(typeof(field), item)); \ 108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
107 if (!ret) \ 110 if (!ret) \
108 return 0; 111 return 0;
109 112
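/*
 * With is_signed_type() added to the macros above, each field description
 * emitted to an event's "format" file also reports signedness, e.g.
 * (illustrative values only):
 *
 *	field:unsigned long ip;	offset:8;	size:8;	signed:0;
 *	field:int depth;	offset:16;	size:4;	signed:1;
 */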
@@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
131 134
132#include "trace_entries.h" 135#include "trace_entries.h"
133 136
134
135#undef __field 137#undef __field
136#define __field(type, item) \ 138#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \ 139 ret = trace_define_field(event_call, #type, #item, \
@@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
193 195
194#include "trace_entries.h" 196#include "trace_entries.h"
195 197
198static int ftrace_raw_init_event(struct ftrace_event_call *call)
199{
200 INIT_LIST_HEAD(&call->fields);
201 return 0;
202}
196 203
197#undef __field 204#undef __field
198#define __field(type, item) 205#define __field(type, item)
@@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
211 218
212#undef FTRACE_ENTRY 219#undef FTRACE_ENTRY
213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 220#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
214static int ftrace_raw_init_event_##call(void); \
215 \ 221 \
216struct ftrace_event_call __used \ 222struct ftrace_event_call __used \
217__attribute__((__aligned__(4))) \ 223__attribute__((__aligned__(4))) \
@@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
219 .name = #call, \ 225 .name = #call, \
220 .id = type, \ 226 .id = type, \
221 .system = __stringify(TRACE_SYSTEM), \ 227 .system = __stringify(TRACE_SYSTEM), \
222 .raw_init = ftrace_raw_init_event_##call, \ 228 .raw_init = ftrace_raw_init_event, \
223 .show_format = ftrace_format_##call, \ 229 .show_format = ftrace_format_##call, \
224 .define_fields = ftrace_define_fields_##call, \ 230 .define_fields = ftrace_define_fields_##call, \
225}; \ 231}; \
226static int ftrace_raw_init_event_##call(void) \
227{ \
228 INIT_LIST_HEAD(&event_##call.fields); \
229 return 0; \
230} \
231 232
232#include "trace_entries.h" 233#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4d..b1342c5d37cf 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,9 +14,20 @@
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h" 15#include "trace_output.h"
16 16
17struct fgraph_data { 17struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore;
21};
22
23struct fgraph_data {
24 struct fgraph_cpu_data *cpu_data;
25
26 /* Place to preserve last processed entry. */
27 struct ftrace_graph_ent_entry ent;
28 struct ftrace_graph_ret_entry ret;
29 int failed;
30 int cpu;
20}; 31};
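/*
 * The ent/ret/failed members above preserve the entry that was already
 * consumed from the ring buffer when writing it to the seq buffer failed
 * (s->full); print_graph_function() then re-emits the saved entry on the
 * next call instead of silently dropping it (see get_return_for_leaf()
 * and print_graph_entry() below).
 */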
21 32
22#define TRACE_GRAPH_INDENT 2 33#define TRACE_GRAPH_INDENT 2
@@ -176,7 +187,7 @@ static int __trace_graph_entry(struct trace_array *tr,
176 struct ring_buffer *buffer = tr->buffer; 187 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry; 188 struct ftrace_graph_ent_entry *entry;
178 189
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
180 return 0; 191 return 0;
181 192
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -240,7 +251,7 @@ static void __trace_graph_return(struct trace_array *tr,
240 struct ring_buffer *buffer = tr->buffer; 251 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry; 252 struct ftrace_graph_ret_entry *entry;
242 253
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
244 return; 255 return;
245 256
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
384 if (!data) 395 if (!data)
385 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
386 397
387 last_pid = &(per_cpu_ptr(data, cpu)->last_pid); 398 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
388 399
389 if (*last_pid == pid) 400 if (*last_pid == pid)
390 return TRACE_TYPE_HANDLED; 401 return TRACE_TYPE_HANDLED;
@@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry *
435get_return_for_leaf(struct trace_iterator *iter, 446get_return_for_leaf(struct trace_iterator *iter,
436 struct ftrace_graph_ent_entry *curr) 447 struct ftrace_graph_ent_entry *curr)
437{ 448{
438 struct ring_buffer_iter *ring_iter; 449 struct fgraph_data *data = iter->private;
450 struct ring_buffer_iter *ring_iter = NULL;
439 struct ring_buffer_event *event; 451 struct ring_buffer_event *event;
440 struct ftrace_graph_ret_entry *next; 452 struct ftrace_graph_ret_entry *next;
441 453
442 ring_iter = iter->buffer_iter[iter->cpu]; 454 /*
455 * If the previous output failed to write to the seq buffer,
456 * then we just reuse the data from before.
457 */
458 if (data && data->failed) {
459 curr = &data->ent;
460 next = &data->ret;
461 } else {
443 462
444 /* First peek to compare current entry and the next one */ 463 ring_iter = iter->buffer_iter[iter->cpu];
445 if (ring_iter) 464
446 event = ring_buffer_iter_peek(ring_iter, NULL); 465 /* First peek to compare current entry and the next one */
447 else { 466 if (ring_iter)
448 /* We need to consume the current entry to see the next one */ 467 event = ring_buffer_iter_peek(ring_iter, NULL);
449 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 468 else {
450 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 469 /*
451 NULL); 470 * We need to consume the current entry to see
452 } 471 * the next one.
472 */
473 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
474 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
475 NULL);
476 }
453 477
454 if (!event) 478 if (!event)
455 return NULL; 479 return NULL;
480
481 next = ring_buffer_event_data(event);
456 482
457 next = ring_buffer_event_data(event); 483 if (data) {
484 /*
485 * Save current and next entries for later reference
486 * if the output fails.
487 */
488 data->ent = *curr;
489 data->ret = *next;
490 }
491 }
458 492
459 if (next->ent.type != TRACE_GRAPH_RET) 493 if (next->ent.type != TRACE_GRAPH_RET)
460 return NULL; 494 return NULL;
@@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
640 674
641 if (data) { 675 if (data) {
642 int cpu = iter->cpu; 676 int cpu = iter->cpu;
643 int *depth = &(per_cpu_ptr(data, cpu)->depth); 677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
644 678
645 /* 679 /*
646 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
@@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
688 722
689 if (data) { 723 if (data) {
690 int cpu = iter->cpu; 724 int cpu = iter->cpu;
691 int *depth = &(per_cpu_ptr(data, cpu)->depth); 725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
692 726
693 *depth = call->depth; 727 *depth = call->depth;
694 } 728 }
@@ -782,19 +816,34 @@ static enum print_line_t
782print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 816print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
783 struct trace_iterator *iter) 817 struct trace_iterator *iter)
784{ 818{
785 int cpu = iter->cpu; 819 struct fgraph_data *data = iter->private;
786 struct ftrace_graph_ent *call = &field->graph_ent; 820 struct ftrace_graph_ent *call = &field->graph_ent;
787 struct ftrace_graph_ret_entry *leaf_ret; 821 struct ftrace_graph_ret_entry *leaf_ret;
822 static enum print_line_t ret;
823 int cpu = iter->cpu;
788 824
789 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 825 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
790 return TRACE_TYPE_PARTIAL_LINE; 826 return TRACE_TYPE_PARTIAL_LINE;
791 827
792 leaf_ret = get_return_for_leaf(iter, field); 828 leaf_ret = get_return_for_leaf(iter, field);
793 if (leaf_ret) 829 if (leaf_ret)
794 return print_graph_entry_leaf(iter, field, leaf_ret, s); 830 ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
795 else 831 else
796 return print_graph_entry_nested(iter, field, s, cpu); 832 ret = print_graph_entry_nested(iter, field, s, cpu);
797 833
834 if (data) {
835 /*
836 * If we failed to write our output, then we need to make
 837 * note of it, because we already consumed our entry.
838 */
839 if (s->full) {
840 data->failed = 1;
841 data->cpu = cpu;
842 } else
843 data->failed = 0;
844 }
845
846 return ret;
798} 847}
799 848
800static enum print_line_t 849static enum print_line_t
@@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
810 859
811 if (data) { 860 if (data) {
812 int cpu = iter->cpu; 861 int cpu = iter->cpu;
813 int *depth = &(per_cpu_ptr(data, cpu)->depth); 862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
814 863
815 /* 864 /*
816 * Comments display at + 1 to depth. This is the 865 * Comments display at + 1 to depth. This is the
@@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
873 int i; 922 int i;
874 923
875 if (data) 924 if (data)
876 depth = per_cpu_ptr(data, iter->cpu)->depth; 925 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
877 926
878 if (print_graph_prologue(iter, s, 0, 0)) 927 if (print_graph_prologue(iter, s, 0, 0))
879 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
941enum print_line_t 990enum print_line_t
942print_graph_function(struct trace_iterator *iter) 991print_graph_function(struct trace_iterator *iter)
943{ 992{
993 struct ftrace_graph_ent_entry *field;
994 struct fgraph_data *data = iter->private;
944 struct trace_entry *entry = iter->ent; 995 struct trace_entry *entry = iter->ent;
945 struct trace_seq *s = &iter->seq; 996 struct trace_seq *s = &iter->seq;
997 int cpu = iter->cpu;
998 int ret;
999
1000 if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
1001 per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
1002 return TRACE_TYPE_HANDLED;
1003 }
1004
1005 /*
1006 * If the last output failed, there's a possibility we need
1007 * to print out the missing entry which would never go out.
1008 */
1009 if (data && data->failed) {
1010 field = &data->ent;
1011 iter->cpu = data->cpu;
1012 ret = print_graph_entry(field, s, iter);
1013 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1014 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1015 ret = TRACE_TYPE_NO_CONSUME;
1016 }
1017 iter->cpu = cpu;
1018 return ret;
1019 }
946 1020
947 switch (entry->type) { 1021 switch (entry->type) {
948 case TRACE_GRAPH_ENT: { 1022 case TRACE_GRAPH_ENT: {
@@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter)
952 * sizeof(struct ftrace_graph_ent_entry) is very small, 1026 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * it can be safely saved at the stack. 1027 * it can be safely saved at the stack.
954 */ 1028 */
955 struct ftrace_graph_ent_entry *field, saved; 1029 struct ftrace_graph_ent_entry saved;
956 trace_assign_type(field, entry); 1030 trace_assign_type(field, entry);
957 saved = *field; 1031 saved = *field;
958 return print_graph_entry(&saved, s, iter); 1032 return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s)
1030static void graph_trace_open(struct trace_iterator *iter) 1104static void graph_trace_open(struct trace_iterator *iter)
1031{ 1105{
1032 /* pid and depth on the last trace processed */ 1106 /* pid and depth on the last trace processed */
1033 struct fgraph_data *data = alloc_percpu(struct fgraph_data); 1107 struct fgraph_data *data;
1034 int cpu; 1108 int cpu;
1035 1109
1110 iter->private = NULL;
1111
1112 data = kzalloc(sizeof(*data), GFP_KERNEL);
1036 if (!data) 1113 if (!data)
1037 pr_warning("function graph tracer: not enough memory\n"); 1114 goto out_err;
1038 else 1115
1039 for_each_possible_cpu(cpu) { 1116 data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
1040 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); 1117 if (!data->cpu_data)
1041 int *depth = &(per_cpu_ptr(data, cpu)->depth); 1118 goto out_err_free;
1042 *pid = -1; 1119
1043 *depth = 0; 1120 for_each_possible_cpu(cpu) {
1044 } 1121 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1122 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1123 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1124 *pid = -1;
1125 *depth = 0;
1126 *ignore = 0;
1127 }
1045 1128
1046 iter->private = data; 1129 iter->private = data;
1130
1131 return;
1132
1133 out_err_free:
1134 kfree(data);
1135 out_err:
1136 pr_warning("function graph tracer: not enough memory\n");
1047} 1137}
1048 1138
1049static void graph_trace_close(struct trace_iterator *iter) 1139static void graph_trace_close(struct trace_iterator *iter)
1050{ 1140{
1051 free_percpu(iter->private); 1141 struct fgraph_data *data = iter->private;
1142
1143 if (data) {
1144 free_percpu(data->cpu_data);
1145 kfree(data);
1146 }
1052} 1147}
1053 1148
1054static struct tracer graph_trace __read_mostly = { 1149static struct tracer graph_trace __read_mostly = {
1055 .name = "function_graph", 1150 .name = "function_graph",
1056 .open = graph_trace_open, 1151 .open = graph_trace_open,
1152 .pipe_open = graph_trace_open,
1057 .close = graph_trace_close, 1153 .close = graph_trace_close,
1154 .pipe_close = graph_trace_close,
1058 .wait_pipe = poll_wait_pipe, 1155 .wait_pipe = poll_wait_pipe,
1059 .init = graph_trace_init, 1156 .init = graph_trace_init,
1060 .reset = graph_trace_reset, 1157 .reset = graph_trace_reset,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 23b63859130e..7b97000745f5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -20,10 +20,10 @@
20 20
21#define BTS_BUFFER_SIZE (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
22 22
23static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25 25
26#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27 27
28static int trace_hw_branches_enabled __read_mostly; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly; 29static int trace_hw_branches_suspended __read_mostly;
@@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly;
32 32
33static void bts_trace_init_cpu(int cpu) 33static void bts_trace_init_cpu(int cpu)
34{ 34{
35 per_cpu(tracer, cpu) = 35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, 36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 NULL, (size_t)-1, BTS_KERNEL); 37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
38 39
39 if (IS_ERR(per_cpu(tracer, cpu))) 40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
40 per_cpu(tracer, cpu) = NULL; 41 per_cpu(hwb_tracer, cpu) = NULL;
41} 42}
42 43
43static int bts_trace_init(struct trace_array *tr) 44static int bts_trace_init(struct trace_array *tr)
@@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr)
51 for_each_online_cpu(cpu) { 52 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu); 53 bts_trace_init_cpu(cpu);
53 54
54 if (likely(per_cpu(tracer, cpu))) 55 if (likely(per_cpu(hwb_tracer, cpu)))
55 trace_hw_branches_enabled = 1; 56 trace_hw_branches_enabled = 1;
56 } 57 }
57 trace_hw_branches_suspended = 0; 58 trace_hw_branches_suspended = 0;
@@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr)
67 68
68 get_online_cpus(); 69 get_online_cpus();
69 for_each_online_cpu(cpu) { 70 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) { 71 if (likely(per_cpu(hwb_tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu)); 72 ds_release_bts(per_cpu(hwb_tracer, cpu));
72 per_cpu(tracer, cpu) = NULL; 73 per_cpu(hwb_tracer, cpu) = NULL;
73 } 74 }
74 } 75 }
75 trace_hw_branches_enabled = 0; 76 trace_hw_branches_enabled = 0;
@@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr)
83 84
84 get_online_cpus(); 85 get_online_cpus();
85 for_each_online_cpu(cpu) 86 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu))) 87 if (likely(per_cpu(hwb_tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu)); 88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
88 trace_hw_branches_suspended = 0; 89 trace_hw_branches_suspended = 0;
89 put_online_cpus(); 90 put_online_cpus();
90} 91}
@@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr)
95 96
96 get_online_cpus(); 97 get_online_cpus();
97 for_each_online_cpu(cpu) 98 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu))) 99 if (likely(per_cpu(hwb_tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu)); 100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
100 trace_hw_branches_suspended = 1; 101 trace_hw_branches_suspended = 1;
101 put_online_cpus(); 102 put_online_cpus();
102} 103}
@@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
114 bts_trace_init_cpu(cpu); 115 bts_trace_init_cpu(cpu);
115 116
116 if (trace_hw_branches_suspended && 117 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu))) 118 likely(per_cpu(hwb_tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu)); 119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
119 } 120 }
120 break; 121 break;
121 122
122 case CPU_DOWN_PREPARE: 123 case CPU_DOWN_PREPARE:
123 /* The notification is sent with interrupts enabled. */ 124 /* The notification is sent with interrupts enabled. */
124 if (likely(per_cpu(tracer, cpu))) { 125 if (likely(per_cpu(hwb_tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu)); 126 ds_release_bts(per_cpu(hwb_tracer, cpu));
126 per_cpu(tracer, cpu) = NULL; 127 per_cpu(hwb_tracer, cpu) = NULL;
127 } 128 }
128 } 129 }
129 130
@@ -165,6 +166,7 @@ void trace_hw_branch(u64 from, u64 to)
165 struct ftrace_event_call *call = &event_hw_branch; 166 struct ftrace_event_call *call = &event_hw_branch;
166 struct trace_array *tr = hw_branch_trace; 167 struct trace_array *tr = hw_branch_trace;
167 struct ring_buffer_event *event; 168 struct ring_buffer_event *event;
169 struct ring_buffer *buf;
168 struct hw_branch_entry *entry; 170 struct hw_branch_entry *entry;
169 unsigned long irq1; 171 unsigned long irq1;
170 int cpu; 172 int cpu;
@@ -180,7 +182,8 @@ void trace_hw_branch(u64 from, u64 to)
180 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 182 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
181 goto out; 183 goto out;
182 184
183 event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, 185 buf = tr->buffer;
186 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
184 sizeof(*entry), 0, 0); 187 sizeof(*entry), 0, 0);
185 if (!event) 188 if (!event)
186 goto out; 189 goto out;
@@ -189,8 +192,8 @@ void trace_hw_branch(u64 from, u64 to)
189 entry->ent.type = TRACE_HW_BRANCHES; 192 entry->ent.type = TRACE_HW_BRANCHES;
190 entry->from = from; 193 entry->from = from;
191 entry->to = to; 194 entry->to = to;
192 if (!filter_check_discard(call, entry, tr->buffer, event)) 195 if (!filter_check_discard(call, entry, buf, event))
193 trace_buffer_unlock_commit(tr, event, 0, 0); 196 trace_buffer_unlock_commit(buf, event, 0, 0);
194 197
195 out: 198 out:
196 atomic_dec(&tr->data[cpu]->disabled); 199 atomic_dec(&tr->data[cpu]->disabled);
@@ -256,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
256 259
257 get_online_cpus(); 260 get_online_cpus();
258 for_each_online_cpu(cpu) 261 for_each_online_cpu(cpu)
259 if (likely(per_cpu(tracer, cpu))) 262 if (likely(per_cpu(hwb_tracer, cpu)))
260 ds_suspend_bts(per_cpu(tracer, cpu)); 263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
261 /* 264 /*
262 * We need to collect the trace on the respective cpu since ftrace 265 * We need to collect the trace on the respective cpu since ftrace
263 * implicitly adds the record for the current cpu. 266 * implicitly adds the record for the current cpu.
@@ -266,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
266 on_each_cpu(trace_bts_cpu, iter->tr, 1); 269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
267 270
268 for_each_online_cpu(cpu) 271 for_each_online_cpu(cpu)
269 if (likely(per_cpu(tracer, cpu))) 272 if (likely(per_cpu(hwb_tracer, cpu)))
270 ds_resume_bts(per_cpu(tracer, cpu)); 273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
271 put_online_cpus(); 274 put_online_cpus();
272} 275}
273 276
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..b52d397e57eb
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1542 @@
1/*
2 * Kprobes-based tracing events
3 *
4 * Created by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32
33#include "trace.h"
34#include "trace_output.h"
35
36#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64
39#define KPROBE_EVENT_SYSTEM "kprobes"
40
41/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func"
46
47const char *reserved_field_names[] = {
48 "common_type",
49 "common_flags",
50 "common_preempt_count",
51 "common_pid",
52 "common_tgid",
53 "common_lock_depth",
54 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC,
58};
59
60struct fetch_func {
61 unsigned long (*func)(struct pt_regs *, void *);
62 void *data;
63};
64
65static __kprobes unsigned long call_fetch(struct fetch_func *f,
66 struct pt_regs *regs)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{
81 return regs_get_kernel_stack_nth(regs,
82 (unsigned int)((unsigned long)num));
83}
84
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
86{
87 unsigned long retval;
88
89 if (probe_kernel_address(addr, retval))
90 return 0;
91 return retval;
92}
93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy)
101{
102 return regs_return_value(regs);
103}
104
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
106 void *dummy)
107{
108 return kernel_stack_pointer(regs);
109}
110
111/* Memory fetching by symbol */
112struct symbol_cache {
113 char *symbol;
114 long offset;
115 unsigned long addr;
116};
117
118static unsigned long update_symbol_cache(struct symbol_cache *sc)
119{
120 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
121 if (sc->addr)
122 sc->addr += sc->offset;
123 return sc->addr;
124}
125
126static void free_symbol_cache(struct symbol_cache *sc)
127{
128 kfree(sc->symbol);
129 kfree(sc);
130}
131
132static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
133{
134 struct symbol_cache *sc;
135
136 if (!sym || strlen(sym) == 0)
137 return NULL;
138 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
139 if (!sc)
140 return NULL;
141
142 sc->symbol = kstrdup(sym, GFP_KERNEL);
143 if (!sc->symbol) {
144 kfree(sc);
145 return NULL;
146 }
147 sc->offset = offset;
148
149 update_symbol_cache(sc);
150 return sc;
151}
152
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
154{
155 struct symbol_cache *sc = data;
156
157 if (sc->addr)
158 return fetch_memory(regs, (void *)sc->addr);
159 else
160 return 0;
161}
162
163/* Special indirect memory access interface */
164struct indirect_fetch_data {
165 struct fetch_func orig;
166 long offset;
167};
168
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
170{
171 struct indirect_fetch_data *ind = data;
172 unsigned long addr;
173
174 addr = call_fetch(&ind->orig, regs);
175 if (addr) {
176 addr += ind->offset;
177 return fetch_memory(regs, (void *)addr);
178 } else
179 return 0;
180}
181
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
183{
184 if (data->orig.func == fetch_indirect)
185 free_indirect_fetch_data(data->orig.data);
186 else if (data->orig.func == fetch_symbol)
187 free_symbol_cache(data->orig.data);
188 kfree(data);
189}
190
191/**
192 * Kprobe event core functions
193 */
194
195struct probe_arg {
196 struct fetch_func fetch;
197 const char *name;
198};
199
200/* Flags for trace_probe */
201#define TP_FLAG_TRACE 1
202#define TP_FLAG_PROFILE 2
203
204struct trace_probe {
205 struct list_head list;
206 struct kretprobe rp; /* Use rp.kp for kprobe use */
207 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */
210 struct ftrace_event_call call;
211 struct trace_event event;
212 unsigned int nr_args;
213 struct probe_arg args[];
214};
215
216#define SIZEOF_TRACE_PROBE(n) \
217 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n)))
219
220static __kprobes int probe_is_return(struct trace_probe *tp)
221{
222 return tp->rp.handler != NULL;
223}
224
225static __kprobes const char *probe_symbol(struct trace_probe *tp)
226{
227 return tp->symbol ? tp->symbol : "unknown";
228}
229
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
251 } else if (ff->func == fetch_retvalue)
252 ret = snprintf(buf, n, "$retval");
253 else if (ff->func == fetch_stack_address)
254 ret = snprintf(buf, n, "$stack");
255 else if (ff->func == fetch_indirect) {
256 struct indirect_fetch_data *id = ff->data;
257 size_t l = 0;
258 ret = snprintf(buf, n, "%+ld(", id->offset);
259 if (ret >= n)
260 goto end;
261 l += ret;
262 ret = probe_arg_string(buf + l, n - l, &id->orig);
263 if (ret < 0)
264 goto end;
265 l += ret;
266 ret = snprintf(buf + l, n - l, ")");
267 ret += l;
268 }
269end:
270 if (ret >= n)
271 return -ENOSPC;
272 return ret;
273}
274
275static int register_probe_event(struct trace_probe *tp);
276static void unregister_probe_event(struct trace_probe *tp);
277
278static DEFINE_MUTEX(probe_lock);
279static LIST_HEAD(probe_list);
280
281static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs);
284
285/*
286 * Allocate new trace_probe and initialize it (including kprobes).
287 */
288static struct trace_probe *alloc_trace_probe(const char *group,
289 const char *event,
290 void *addr,
291 const char *symbol,
292 unsigned long offs,
293 int nargs, int is_return)
294{
295 struct trace_probe *tp;
296
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp)
299 return ERR_PTR(-ENOMEM);
300
301 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL);
303 if (!tp->symbol)
304 goto error;
305 tp->rp.kp.symbol_name = tp->symbol;
306 tp->rp.kp.offset = offs;
307 } else
308 tp->rp.kp.addr = addr;
309
310 if (is_return)
311 tp->rp.handler = kretprobe_dispatcher;
312 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher;
314
315 if (!event)
316 goto error;
317 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name)
319 goto error;
320
321 if (!group)
322 goto error;
323 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system)
325 goto error;
326
327 INIT_LIST_HEAD(&tp->list);
328 return tp;
329error:
330 kfree(tp->call.name);
331 kfree(tp->symbol);
332 kfree(tp);
333 return ERR_PTR(-ENOMEM);
334}
335
336static void free_probe_arg(struct probe_arg *arg)
337{
338 if (arg->fetch.func == fetch_symbol)
339 free_symbol_cache(arg->fetch.data);
340 else if (arg->fetch.func == fetch_indirect)
341 free_indirect_fetch_data(arg->fetch.data);
342 kfree(arg->name);
343}
344
345static void free_trace_probe(struct trace_probe *tp)
346{
347 int i;
348
349 for (i = 0; i < tp->nr_args; i++)
350 free_probe_arg(&tp->args[i]);
351
352 kfree(tp->call.system);
353 kfree(tp->call.name);
354 kfree(tp->symbol);
355 kfree(tp);
356}
357
358static struct trace_probe *find_probe_event(const char *event,
359 const char *group)
360{
361 struct trace_probe *tp;
362
363 list_for_each_entry(tp, &probe_list, list)
364 if (strcmp(tp->call.name, event) == 0 &&
365 strcmp(tp->call.system, group) == 0)
366 return tp;
367 return NULL;
368}
369
 370/* Unregister a trace_probe and probe_event: call with probe_lock held */
371static void unregister_trace_probe(struct trace_probe *tp)
372{
373 if (probe_is_return(tp))
374 unregister_kretprobe(&tp->rp);
375 else
376 unregister_kprobe(&tp->rp.kp);
377 list_del(&tp->list);
378 unregister_probe_event(tp);
379}
380
381/* Register a trace_probe and probe_event */
382static int register_trace_probe(struct trace_probe *tp)
383{
384 struct trace_probe *old_tp;
385 int ret;
386
387 mutex_lock(&probe_lock);
388
389 /* register as an event */
390 old_tp = find_probe_event(tp->call.name, tp->call.system);
391 if (old_tp) {
392 /* delete old event */
393 unregister_trace_probe(old_tp);
394 free_trace_probe(old_tp);
395 }
396 ret = register_probe_event(tp);
397 if (ret) {
 398 pr_warning("Failed to register probe event (%d)\n", ret);
399 goto end;
400 }
401
402 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
403 if (probe_is_return(tp))
404 ret = register_kretprobe(&tp->rp);
405 else
406 ret = register_kprobe(&tp->rp.kp);
407
408 if (ret) {
409 pr_warning("Could not insert probe(%d)\n", ret);
410 if (ret == -EILSEQ) {
411 pr_warning("Probing address(0x%p) is not an "
412 "instruction boundary.\n",
413 tp->rp.kp.addr);
414 ret = -EINVAL;
415 }
416 unregister_probe_event(tp);
417 } else
418 list_add_tail(&tp->list, &probe_list);
419end:
420 mutex_unlock(&probe_lock);
421 return ret;
422}
423
424/* Split symbol and offset. */
425static int split_symbol_offset(char *symbol, unsigned long *offset)
426{
427 char *tmp;
428 int ret;
429
430 if (!offset)
431 return -EINVAL;
432
433 tmp = strchr(symbol, '+');
434 if (tmp) {
435 /* skip sign because strict_strtol doesn't accept '+' */
436 ret = strict_strtoul(tmp + 1, 0, offset);
437 if (ret)
438 return ret;
439 *tmp = '\0';
440 } else
441 *offset = 0;
442 return 0;
443}
444
445#define PARAM_MAX_ARGS 16
446#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
447
448static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
449{
450 int ret = 0;
451 unsigned long param;
452
453 if (strcmp(arg, "retval") == 0) {
454 if (is_return) {
455 ff->func = fetch_retvalue;
456 ff->data = NULL;
457 } else
458 ret = -EINVAL;
459 } else if (strncmp(arg, "stack", 5) == 0) {
460 if (arg[5] == '\0') {
461 ff->func = fetch_stack_address;
462 ff->data = NULL;
463 } else if (isdigit(arg[5])) {
464 ret = strict_strtoul(arg + 5, 10, &param);
465 if (ret || param > PARAM_MAX_STACK)
466 ret = -EINVAL;
467 else {
468 ff->func = fetch_stack;
469 ff->data = (void *)param;
470 }
471 } else
472 ret = -EINVAL;
473 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
474 ret = strict_strtoul(arg + 3, 10, &param);
475 if (ret || param > PARAM_MAX_ARGS)
476 ret = -EINVAL;
477 else {
478 ff->func = fetch_argument;
479 ff->data = (void *)param;
480 }
481 } else
482 ret = -EINVAL;
483 return ret;
484}
485
486/* Recursive argument parser */
487static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
488{
489 int ret = 0;
490 unsigned long param;
491 long offset;
492 char *tmp;
493
494 switch (arg[0]) {
495 case '$':
496 ret = parse_probe_vars(arg + 1, ff, is_return);
497 break;
498 case '%': /* named register */
499 ret = regs_query_register_offset(arg + 1);
500 if (ret >= 0) {
501 ff->func = fetch_register;
502 ff->data = (void *)(unsigned long)ret;
503 ret = 0;
504 }
505 break;
506 case '@': /* memory or symbol */
507 if (isdigit(arg[1])) {
508 ret = strict_strtoul(arg + 1, 0, &param);
509 if (ret)
510 break;
511 ff->func = fetch_memory;
512 ff->data = (void *)param;
513 } else {
514 ret = split_symbol_offset(arg + 1, &offset);
515 if (ret)
516 break;
517 ff->data = alloc_symbol_cache(arg + 1, offset);
518 if (ff->data)
519 ff->func = fetch_symbol;
520 else
521 ret = -EINVAL;
522 }
523 break;
524 case '+': /* indirect memory */
525 case '-':
526 tmp = strchr(arg, '(');
527 if (!tmp) {
528 ret = -EINVAL;
529 break;
530 }
531 *tmp = '\0';
532 ret = strict_strtol(arg + 1, 0, &offset);
533 if (ret)
534 break;
535 if (arg[0] == '-')
536 offset = -offset;
537 arg = tmp + 1;
538 tmp = strrchr(arg, ')');
539 if (tmp) {
540 struct indirect_fetch_data *id;
541 *tmp = '\0';
542 id = kzalloc(sizeof(struct indirect_fetch_data),
543 GFP_KERNEL);
544 if (!id)
545 return -ENOMEM;
546 id->offset = offset;
547 ret = __parse_probe_arg(arg, &id->orig, is_return);
548 if (ret)
549 kfree(id);
550 else {
551 ff->func = fetch_indirect;
552 ff->data = (void *)id;
553 }
554 } else
555 ret = -EINVAL;
556 break;
557 default:
558 /* TODO: support custom handler */
559 ret = -EINVAL;
560 }
561 return ret;
562}
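/*
 * A few illustrative arguments and the fetch functions __parse_probe_arg()
 * selects for them (register and symbol names are architecture dependent
 * and shown only as examples):
 *
 *   "$arg2"        -> fetch_argument, data = 2
 *   "$retval"      -> fetch_retvalue (kretprobes only)
 *   "%ax"          -> fetch_register for the named register
 *   "@0xffff8000"  -> fetch_memory at that kernel address
 *   "@jiffies+4"   -> fetch_symbol via a symbol_cache for "jiffies", offset 4
 *   "+8($stack)"   -> fetch_indirect: read memory at $stack + 8
 */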
563
564/* String length checking wrapper */
565static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
566{
567 if (strlen(arg) > MAX_ARGSTR_LEN) {
 568 pr_info("Argument is too long: %s\n", arg);
569 return -ENOSPC;
570 }
571 return __parse_probe_arg(arg, ff, is_return);
572}
573
574/* Return 1 if name is reserved or already used by another argument */
575static int conflict_field_name(const char *name,
576 struct probe_arg *args, int narg)
577{
578 int i;
579 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
580 if (strcmp(reserved_field_names[i], name) == 0)
581 return 1;
582 for (i = 0; i < narg; i++)
583 if (strcmp(args[i].name, name) == 0)
584 return 1;
585 return 0;
586}
587
588static int create_trace_probe(int argc, char **argv)
589{
590 /*
591 * Argument syntax:
592 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
593 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
594 * Fetch args:
 595 * $argN : fetch the Nth function argument (N: 0-)
596 * $retval : fetch return value
597 * $stack : fetch stack address
 598 * $stackN : fetch the Nth entry of the stack (N: 0-)
599 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
600 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
601 * %REG : fetch register REG
602 * Indirect memory fetch:
 603 * +|-offs(ARG) : fetch memory at the address ARG +|- offs.
604 * Alias name of args:
605 * NAME=FETCHARG : set NAME as alias of FETCHARG.
606 */
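/*
 * For illustration (event and argument names are arbitrary), definitions
 * in the syntax above as they would be passed to probes_write() below:
 *
 *   p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3
 *   r:myretprobe do_sys_open $retval
 *   -:myprobe
 *
 * The first adds a kprobe event on do_sys_open fetching its first four
 * arguments, the second adds a kretprobe event recording the return value,
 * and the last deletes the "myprobe" event again.
 */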
607 struct trace_probe *tp;
608 int i, ret = 0;
609 int is_return = 0, is_delete = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0;
612 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN];
614
615 /* argc must be >= 1 */
616 if (argv[0][0] == 'p')
617 is_return = 0;
618 else if (argv[0][0] == 'r')
619 is_return = 1;
620 else if (argv[0][0] == '-')
621 is_delete = 1;
622 else {
 623 pr_info("Probe definition must start with 'p', 'r' or"
624 " '-'.\n");
625 return -EINVAL;
626 }
627
628 if (argv[0][1] == ':') {
629 event = &argv[0][2];
630 if (strchr(event, '/')) {
631 group = event;
632 event = strchr(group, '/') + 1;
633 event[-1] = '\0';
634 if (strlen(group) == 0) {
 635 pr_info("Group name is not specified\n");
636 return -EINVAL;
637 }
638 }
639 if (strlen(event) == 0) {
 640 pr_info("Event name is not specified\n");
641 return -EINVAL;
642 }
643 }
644 if (!group)
645 group = KPROBE_EVENT_SYSTEM;
646
647 if (is_delete) {
648 if (!event) {
649 pr_info("Delete command needs an event name.\n");
650 return -EINVAL;
651 }
652 tp = find_probe_event(event, group);
653 if (!tp) {
654 pr_info("Event %s/%s doesn't exist.\n", group, event);
655 return -ENOENT;
656 }
657 /* delete an event */
658 unregister_trace_probe(tp);
659 free_trace_probe(tp);
660 return 0;
661 }
662
663 if (argc < 2) {
664 pr_info("Probe point is not specified.\n");
665 return -EINVAL;
666 }
667 if (isdigit(argv[1][0])) {
668 if (is_return) {
669 pr_info("Return probe point must be a symbol.\n");
670 return -EINVAL;
671 }
672 /* an address specified */
673 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
674 if (ret) {
675 pr_info("Failed to parse address.\n");
676 return ret;
677 }
678 } else {
679 /* a symbol specified */
680 symbol = argv[1];
681 /* TODO: support .init module functions */
682 ret = split_symbol_offset(symbol, &offset);
683 if (ret) {
684 pr_info("Failed to parse symbol.\n");
685 return ret;
686 }
687 if (offset && is_return) {
688 pr_info("Return probe must be used without offset.\n");
689 return -EINVAL;
690 }
691 }
692 argc -= 2; argv += 2;
693
694 /* setup a probe */
695 if (!event) {
696 /* Make a new event name */
697 if (symbol)
698 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
699 is_return ? 'r' : 'p', symbol, offset);
700 else
701 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
702 is_return ? 'r' : 'p', addr);
703 event = buf;
704 }
705 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
706 is_return);
707 if (IS_ERR(tp)) {
708 pr_info("Failed to allocate trace_probe.(%d)\n",
709 (int)PTR_ERR(tp));
710 return PTR_ERR(tp);
711 }
712
713 /* parse arguments */
714 ret = 0;
715 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
716 /* Parse argument name */
717 arg = strchr(argv[i], '=');
718 if (arg)
719 *arg++ = '\0';
720 else
721 arg = argv[i];
722
723 if (conflict_field_name(argv[i], tp->args, i)) {
724 pr_info("Argument%d name '%s' conflicts with "
725 "another field.\n", i, argv[i]);
726 ret = -EINVAL;
727 goto error;
728 }
729
730 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
731 if (!tp->args[i].name) {
732 pr_info("Failed to allocate argument%d name '%s'.\n",
733 i, argv[i]);
734 ret = -ENOMEM;
735 goto error;
736 }
737
738 /* Parse fetch argument */
739 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
740 if (ret) {
741 pr_info("Parse error at argument%d. (%d)\n", i, ret);
742 kfree(tp->args[i].name);
743 goto error;
744 }
745
746 tp->nr_args++;
747 }
748
749 ret = register_trace_probe(tp);
750 if (ret)
751 goto error;
752 return 0;
753
754error:
755 free_trace_probe(tp);
756 return ret;
757}
758
759static void cleanup_all_probes(void)
760{
761 struct trace_probe *tp;
762
763 mutex_lock(&probe_lock);
764 /* TODO: Use batch unregistration */
765 while (!list_empty(&probe_list)) {
766 tp = list_entry(probe_list.next, struct trace_probe, list);
767 unregister_trace_probe(tp);
768 free_trace_probe(tp);
769 }
770 mutex_unlock(&probe_lock);
771}
772
773
774/* Probes listing interfaces */
775static void *probes_seq_start(struct seq_file *m, loff_t *pos)
776{
777 mutex_lock(&probe_lock);
778 return seq_list_start(&probe_list, *pos);
779}
780
781static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
782{
783 return seq_list_next(v, &probe_list, pos);
784}
785
786static void probes_seq_stop(struct seq_file *m, void *v)
787{
788 mutex_unlock(&probe_lock);
789}
790
791static int probes_seq_show(struct seq_file *m, void *v)
792{
793 struct trace_probe *tp = v;
794 int i, ret;
795 char buf[MAX_ARGSTR_LEN + 1];
796
797 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
798 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
799
800 if (!tp->symbol)
801 seq_printf(m, " 0x%p", tp->rp.kp.addr);
802 else if (tp->rp.kp.offset)
803 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
804 else
805 seq_printf(m, " %s", probe_symbol(tp));
806
807 for (i = 0; i < tp->nr_args; i++) {
808 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
809 if (ret < 0) {
810 pr_warning("Argument%d decoding error(%d).\n", i, ret);
811 return ret;
812 }
813 seq_printf(m, " %s=%s", tp->args[i].name, buf);
814 }
815 seq_printf(m, "\n");
816 return 0;
817}
818
819static const struct seq_operations probes_seq_op = {
820 .start = probes_seq_start,
821 .next = probes_seq_next,
822 .stop = probes_seq_stop,
823 .show = probes_seq_show
824};
825
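/*
 * Note on semantics: opening kprobe_events for writing with truncation
 * (e.g. a shell redirect using ">" rather than ">>") removes every
 * existing probe before the new definitions are parsed.
 */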
826static int probes_open(struct inode *inode, struct file *file)
827{
828 if ((file->f_mode & FMODE_WRITE) &&
829 (file->f_flags & O_TRUNC))
830 cleanup_all_probes();
831
832 return seq_open(file, &probes_seq_op);
833}
834
835static int command_trace_probe(const char *buf)
836{
837 char **argv;
838 int argc = 0, ret = 0;
839
840 argv = argv_split(GFP_KERNEL, buf, &argc);
841 if (!argv)
842 return -ENOMEM;
843
844 if (argc)
845 ret = create_trace_probe(argc, argv);
846
847 argv_free(argv);
848 return ret;
849}
850
851#define WRITE_BUFSIZE 128
852
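/*
 * probes_write() feeds each newline-separated line of the user buffer to
 * command_trace_probe(); anything following a '#' on a line is dropped as
 * a comment.  A sketch of the user-space side (assuming debugfs is
 * mounted at /sys/kernel/debug):
 *   echo 'r:myopen_ret do_sys_open $retval' >> \
 *		/sys/kernel/debug/tracing/kprobe_events
 *   echo '-:myopen_ret' >> /sys/kernel/debug/tracing/kprobe_events
 */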
853static ssize_t probes_write(struct file *file, const char __user *buffer,
854 size_t count, loff_t *ppos)
855{
856 char *kbuf, *tmp;
857 int ret;
858 size_t done;
859 size_t size;
860
861 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
862 if (!kbuf)
863 return -ENOMEM;
864
865 ret = done = 0;
866 while (done < count) {
867 size = count - done;
868 if (size >= WRITE_BUFSIZE)
869 size = WRITE_BUFSIZE - 1;
870 if (copy_from_user(kbuf, buffer + done, size)) {
871 ret = -EFAULT;
872 goto out;
873 }
874 kbuf[size] = '\0';
875 tmp = strchr(kbuf, '\n');
876 if (tmp) {
877 *tmp = '\0';
878 size = tmp - kbuf + 1;
879 } else if (done + size < count) {
880 pr_warning("Line is too long: "
881 "should be less than %d characters.\n", WRITE_BUFSIZE);
882 ret = -EINVAL;
883 goto out;
884 }
885 done += size;
886 /* Remove comments */
887 tmp = strchr(kbuf, '#');
888 if (tmp)
889 *tmp = '\0';
890
891 ret = command_trace_probe(kbuf);
892 if (ret)
893 goto out;
894 }
895 ret = done;
896out:
897 kfree(kbuf);
898 return ret;
899}
900
901static const struct file_operations kprobe_events_ops = {
902 .owner = THIS_MODULE,
903 .open = probes_open,
904 .read = seq_read,
905 .llseek = seq_lseek,
906 .release = seq_release,
907 .write = probes_write,
908};
909
910/* Probes profiling interfaces */
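/*
 * Each line of the kprobe_profile file shows the event name, the number
 * of times the probe has hit (nhit) and the kprobe's missed count
 * (rp.kp.nmissed).
 */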
911static int probes_profile_seq_show(struct seq_file *m, void *v)
912{
913 struct trace_probe *tp = v;
914
915 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
916 tp->rp.kp.nmissed);
917
918 return 0;
919}
920
921static const struct seq_operations profile_seq_op = {
922 .start = probes_seq_start,
923 .next = probes_seq_next,
924 .stop = probes_seq_stop,
925 .show = probes_profile_seq_show
926};
927
928static int profile_open(struct inode *inode, struct file *file)
929{
930 return seq_open(file, &profile_seq_op);
931}
932
933static const struct file_operations kprobe_profile_ops = {
934 .owner = THIS_MODULE,
935 .open = profile_open,
936 .read = seq_read,
937 .llseek = seq_lseek,
938 .release = seq_release,
939};
940
941/* Kprobe handler */
942static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
943{
944 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
945 struct kprobe_trace_entry *entry;
946 struct ring_buffer_event *event;
947 struct ring_buffer *buffer;
948 int size, i, pc;
949 unsigned long irq_flags;
950 struct ftrace_event_call *call = &tp->call;
951
952 tp->nhit++;
953
954 local_save_flags(irq_flags);
955 pc = preempt_count();
956
957 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
958
959 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
960 irq_flags, pc);
961 if (!event)
962 return 0;
963
964 entry = ring_buffer_event_data(event);
965 entry->nargs = tp->nr_args;
966 entry->ip = (unsigned long)kp->addr;
967 for (i = 0; i < tp->nr_args; i++)
968 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
969
970 if (!filter_current_check_discard(buffer, call, entry, event))
971 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
972 return 0;
973}
974
975/* Kretprobe handler */
976static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
977 struct pt_regs *regs)
978{
979 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
980 struct kretprobe_trace_entry *entry;
981 struct ring_buffer_event *event;
982 struct ring_buffer *buffer;
983 int size, i, pc;
984 unsigned long irq_flags;
985 struct ftrace_event_call *call = &tp->call;
986
987 local_save_flags(irq_flags);
988 pc = preempt_count();
989
990 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
991
992 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
993 irq_flags, pc);
994 if (!event)
995 return 0;
996
997 entry = ring_buffer_event_data(event);
998 entry->nargs = tp->nr_args;
999 entry->func = (unsigned long)tp->rp.kp.addr;
1000 entry->ret_ip = (unsigned long)ri->ret_addr;
1001 for (i = 0; i < tp->nr_args; i++)
1002 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1003
1004 if (!filter_current_check_discard(buffer, call, entry, event))
1005 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1006
1007 return 0;
1008}
1009
1010/* Event entry printers */
1011enum print_line_t
1012print_kprobe_event(struct trace_iterator *iter, int flags)
1013{
1014 struct kprobe_trace_entry *field;
1015 struct trace_seq *s = &iter->seq;
1016 struct trace_event *event;
1017 struct trace_probe *tp;
1018 int i;
1019
1020 field = (struct kprobe_trace_entry *)iter->ent;
1021 event = ftrace_find_event(field->ent.type);
1022 tp = container_of(event, struct trace_probe, event);
1023
1024 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1025 goto partial;
1026
1027 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1028 goto partial;
1029
1030 if (!trace_seq_puts(s, ")"))
1031 goto partial;
1032
1033 for (i = 0; i < field->nargs; i++)
1034 if (!trace_seq_printf(s, " %s=%lx",
1035 tp->args[i].name, field->args[i]))
1036 goto partial;
1037
1038 if (!trace_seq_puts(s, "\n"))
1039 goto partial;
1040
1041 return TRACE_TYPE_HANDLED;
1042partial:
1043 return TRACE_TYPE_PARTIAL_LINE;
1044}
1045
1046enum print_line_t
1047print_kretprobe_event(struct trace_iterator *iter, int flags)
1048{
1049 struct kretprobe_trace_entry *field;
1050 struct trace_seq *s = &iter->seq;
1051 struct trace_event *event;
1052 struct trace_probe *tp;
1053 int i;
1054
1055 field = (struct kretprobe_trace_entry *)iter->ent;
1056 event = ftrace_find_event(field->ent.type);
1057 tp = container_of(event, struct trace_probe, event);
1058
1059 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1060 goto partial;
1061
1062 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1063 goto partial;
1064
1065 if (!trace_seq_puts(s, " <- "))
1066 goto partial;
1067
1068 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1069 goto partial;
1070
1071 if (!trace_seq_puts(s, ")"))
1072 goto partial;
1073
1074 for (i = 0; i < field->nargs; i++)
1075 if (!trace_seq_printf(s, " %s=%lx",
1076 tp->args[i].name, field->args[i]))
1077 goto partial;
1078
1079 if (!trace_seq_puts(s, "\n"))
1080 goto partial;
1081
1082 return TRACE_TYPE_HANDLED;
1083partial:
1084 return TRACE_TYPE_PARTIAL_LINE;
1085}
1086
1087static int probe_event_enable(struct ftrace_event_call *call)
1088{
1089 struct trace_probe *tp = (struct trace_probe *)call->data;
1090
1091 tp->flags |= TP_FLAG_TRACE;
1092 if (probe_is_return(tp))
1093 return enable_kretprobe(&tp->rp);
1094 else
1095 return enable_kprobe(&tp->rp.kp);
1096}
1097
1098static void probe_event_disable(struct ftrace_event_call *call)
1099{
1100 struct trace_probe *tp = (struct trace_probe *)call->data;
1101
1102 tp->flags &= ~TP_FLAG_TRACE;
1103 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1104 if (probe_is_return(tp))
1105 disable_kretprobe(&tp->rp);
1106 else
1107 disable_kprobe(&tp->rp.kp);
1108 }
1109}
1110
1111static int probe_event_raw_init(struct ftrace_event_call *event_call)
1112{
1113 INIT_LIST_HEAD(&event_call->fields);
1114
1115 return 0;
1116}
1117
1118#undef DEFINE_FIELD
1119#define DEFINE_FIELD(type, item, name, is_signed) \
1120 do { \
1121 ret = trace_define_field(event_call, #type, name, \
1122 offsetof(typeof(field), item), \
1123 sizeof(field.item), is_signed, \
1124 FILTER_OTHER); \
1125 if (ret) \
1126 return ret; \
1127 } while (0)
1128
1129static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1130{
1131 int ret, i;
1132 struct kprobe_trace_entry field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134
1135 ret = trace_define_common_fields(event_call);
1136 if (ret)
1137 return ret;
1138
1139 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1141 /* Set argument names as fields */
1142 for (i = 0; i < tp->nr_args; i++)
1143 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1144 return 0;
1145}
1146
1147static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1148{
1149 int ret, i;
1150 struct kretprobe_trace_entry field;
1151 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1152
1153 ret = trace_define_common_fields(event_call);
1154 if (ret)
1155 return ret;
1156
1157 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1158 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1159 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1160 /* Set argument names as fields */
1161 for (i = 0; i < tp->nr_args; i++)
1162 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1163 return 0;
1164}
1165
1166static int __probe_event_show_format(struct trace_seq *s,
1167 struct trace_probe *tp, const char *fmt,
1168 const char *arg)
1169{
1170 int i;
1171
1172 /* Show format */
1173 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1174 return 0;
1175
1176 for (i = 0; i < tp->nr_args; i++)
1177 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
1178 return 0;
1179
1180 if (!trace_seq_printf(s, "\", %s", arg))
1181 return 0;
1182
1183 for (i = 0; i < tp->nr_args; i++)
1184 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1185 return 0;
1186
1187 return trace_seq_puts(s, "\n");
1188}
1189
1190#undef SHOW_FIELD
1191#define SHOW_FIELD(type, item, name) \
1192 do { \
1193 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
1194 "offset:%u;\tsize:%u;\n", name, \
1195 (unsigned int)offsetof(typeof(field), item),\
1196 (unsigned int)sizeof(type)); \
1197 if (!ret) \
1198 return 0; \
1199 } while (0)
1200
1201static int kprobe_event_show_format(struct ftrace_event_call *call,
1202 struct trace_seq *s)
1203{
1204 struct kprobe_trace_entry field __attribute__((unused));
1205 int ret, i;
1206 struct trace_probe *tp = (struct trace_probe *)call->data;
1207
1208 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1209 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1210
1211 /* Show fields */
1212 for (i = 0; i < tp->nr_args; i++)
1213 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1214 trace_seq_puts(s, "\n");
1215
1216 return __probe_event_show_format(s, tp, "(%lx)",
1217 "REC->" FIELD_STRING_IP);
1218}
1219
1220static int kretprobe_event_show_format(struct ftrace_event_call *call,
1221 struct trace_seq *s)
1222{
1223 struct kretprobe_trace_entry field __attribute__((unused));
1224 int ret, i;
1225 struct trace_probe *tp = (struct trace_probe *)call->data;
1226
1227 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
1228 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
1229 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1230
1231 /* Show fields */
1232 for (i = 0; i < tp->nr_args; i++)
1233 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1234 trace_seq_puts(s, "\n");
1235
1236 return __probe_event_show_format(s, tp, "(%lx <- %lx)",
1237 "REC->" FIELD_STRING_FUNC
1238 ", REC->" FIELD_STRING_RETIP);
1239}
1240
1241#ifdef CONFIG_EVENT_PROFILE
1242
1243/* Kprobe profile handler */
1244static __kprobes int kprobe_profile_func(struct kprobe *kp,
1245 struct pt_regs *regs)
1246{
1247 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1248 struct ftrace_event_call *call = &tp->call;
1249 struct kprobe_trace_entry *entry;
1250 struct trace_entry *ent;
1251 int size, __size, i, pc, __cpu;
1252 unsigned long irq_flags;
1253 char *trace_buf;
1254 char *raw_data;
1255 int rctx;
1256
1257 pc = preempt_count();
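	/*
	 * Round the record up so that, together with the u32 size header
	 * that precedes raw perf samples, the payload stays u64-aligned;
	 * the dead bytes this introduces are zeroed further below.
	 */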
1258 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1259 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1260 size -= sizeof(u32);
1261 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1262 "profile buffer not large enough"))
1263 return 0;
1264
1265 /*
1266 * Protect the non-NMI buffer.
1267 * This also protects the RCU read side.
1268 */
1269 local_irq_save(irq_flags);
1270
1271 rctx = perf_swevent_get_recursion_context();
1272 if (rctx < 0)
1273 goto end_recursion;
1274
1275 __cpu = smp_processor_id();
1276
1277 if (in_nmi())
1278 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1279 else
1280 trace_buf = rcu_dereference(perf_trace_buf);
1281
1282 if (!trace_buf)
1283 goto end;
1284
1285 raw_data = per_cpu_ptr(trace_buf, __cpu);
1286
1287 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1288 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1289 entry = (struct kprobe_trace_entry *)raw_data;
1290 ent = &entry->ent;
1291
1292 tracing_generic_entry_update(ent, irq_flags, pc);
1293 ent->type = call->id;
1294 entry->nargs = tp->nr_args;
1295 entry->ip = (unsigned long)kp->addr;
1296 for (i = 0; i < tp->nr_args; i++)
1297 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1298 perf_tp_event(call->id, entry->ip, 1, entry, size);
1299
1300end:
1301 perf_swevent_put_recursion_context(rctx);
1302end_recursion:
1303 local_irq_restore(irq_flags);
1304
1305 return 0;
1306}
1307
1308/* Kretprobe profile handler */
1309static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1310 struct pt_regs *regs)
1311{
1312 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1313 struct ftrace_event_call *call = &tp->call;
1314 struct kretprobe_trace_entry *entry;
1315 struct trace_entry *ent;
1316 int size, __size, i, pc, __cpu;
1317 unsigned long irq_flags;
1318 char *trace_buf;
1319 char *raw_data;
1320 int rctx;
1321
1322 pc = preempt_count();
1323 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1324 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1325 size -= sizeof(u32);
1326 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1327 "profile buffer not large enough"))
1328 return 0;
1329
1330 /*
1331 * Protect the non-NMI buffer.
1332 * This also protects the RCU read side.
1333 */
1334 local_irq_save(irq_flags);
1335
1336 rctx = perf_swevent_get_recursion_context();
1337 if (rctx < 0)
1338 goto end_recursion;
1339
1340 __cpu = smp_processor_id();
1341
1342 if (in_nmi())
1343 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1344 else
1345 trace_buf = rcu_dereference(perf_trace_buf);
1346
1347 if (!trace_buf)
1348 goto end;
1349
1350 raw_data = per_cpu_ptr(trace_buf, __cpu);
1351
1352 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1353 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1354 entry = (struct kretprobe_trace_entry *)raw_data;
1355 ent = &entry->ent;
1356
1357 tracing_generic_entry_update(ent, irq_flags, pc);
1358 ent->type = call->id;
1359 entry->nargs = tp->nr_args;
1360 entry->func = (unsigned long)tp->rp.kp.addr;
1361 entry->ret_ip = (unsigned long)ri->ret_addr;
1362 for (i = 0; i < tp->nr_args; i++)
1363 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1364 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1365
1366end:
1367 perf_swevent_put_recursion_context(rctx);
1368end_recursion:
1369 local_irq_restore(irq_flags);
1370
1371 return 0;
1372}
1373
1374static int probe_profile_enable(struct ftrace_event_call *call)
1375{
1376 struct trace_probe *tp = (struct trace_probe *)call->data;
1377
1378 tp->flags |= TP_FLAG_PROFILE;
1379
1380 if (probe_is_return(tp))
1381 return enable_kretprobe(&tp->rp);
1382 else
1383 return enable_kprobe(&tp->rp.kp);
1384}
1385
1386static void probe_profile_disable(struct ftrace_event_call *call)
1387{
1388 struct trace_probe *tp = (struct trace_probe *)call->data;
1389
1390 tp->flags &= ~TP_FLAG_PROFILE;
1391
1392 if (!(tp->flags & TP_FLAG_TRACE)) {
1393 if (probe_is_return(tp))
1394 disable_kretprobe(&tp->rp);
1395 else
1396 disable_kprobe(&tp->rp.kp);
1397 }
1398}
1399#endif /* CONFIG_EVENT_PROFILE */
1400
1401
1402static __kprobes
1403int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1404{
1405 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1406
1407 if (tp->flags & TP_FLAG_TRACE)
1408 kprobe_trace_func(kp, regs);
1409#ifdef CONFIG_EVENT_PROFILE
1410 if (tp->flags & TP_FLAG_PROFILE)
1411 kprobe_profile_func(kp, regs);
1412#endif /* CONFIG_EVENT_PROFILE */
1413 return 0;	/* We don't tweak the kernel, so just return 0 */
1414}
1415
1416static __kprobes
1417int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1418{
1419 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1420
1421 if (tp->flags & TP_FLAG_TRACE)
1422 kretprobe_trace_func(ri, regs);
1423#ifdef CONFIG_EVENT_PROFILE
1424 if (tp->flags & TP_FLAG_PROFILE)
1425 kretprobe_profile_func(ri, regs);
1426#endif /* CONFIG_EVENT_PROFILE */
1427 return 0; /* We don't tweek kernel, so just return 0 */
1428}
1429
1430static int register_probe_event(struct trace_probe *tp)
1431{
1432 struct ftrace_event_call *call = &tp->call;
1433 int ret;
1434
1435 /* Initialize ftrace_event_call */
1436 if (probe_is_return(tp)) {
1437 tp->event.trace = print_kretprobe_event;
1438 call->raw_init = probe_event_raw_init;
1439 call->show_format = kretprobe_event_show_format;
1440 call->define_fields = kretprobe_event_define_fields;
1441 } else {
1442 tp->event.trace = print_kprobe_event;
1443 call->raw_init = probe_event_raw_init;
1444 call->show_format = kprobe_event_show_format;
1445 call->define_fields = kprobe_event_define_fields;
1446 }
1447 call->event = &tp->event;
1448 call->id = register_ftrace_event(&tp->event);
1449 if (!call->id)
1450 return -ENODEV;
1451 call->enabled = 0;
1452 call->regfunc = probe_event_enable;
1453 call->unregfunc = probe_event_disable;
1454
1455#ifdef CONFIG_EVENT_PROFILE
1456 atomic_set(&call->profile_count, -1);
1457 call->profile_enable = probe_profile_enable;
1458 call->profile_disable = probe_profile_disable;
1459#endif
1460 call->data = tp;
1461 ret = trace_add_event_call(call);
1462 if (ret) {
1463 pr_info("Failed to register kprobe event: %s\n", call->name);
1464 unregister_ftrace_event(&tp->event);
1465 }
1466 return ret;
1467}
1468
1469static void unregister_probe_event(struct trace_probe *tp)
1470{
1471 /* tp->event is unregistered in trace_remove_event_call() */
1472 trace_remove_event_call(&tp->call);
1473}
1474
1475/* Make a debugfs interface for controlling probe points */
1476static __init int init_kprobe_trace(void)
1477{
1478 struct dentry *d_tracer;
1479 struct dentry *entry;
1480
1481 d_tracer = tracing_init_dentry();
1482 if (!d_tracer)
1483 return 0;
1484
1485 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
1486 NULL, &kprobe_events_ops);
1487
1488 /* Event list interface */
1489 if (!entry)
1490 pr_warning("Could not create debugfs "
1491 "'kprobe_events' entry\n");
1492
1493 /* Profile interface */
1494 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
1495 NULL, &kprobe_profile_ops);
1496
1497 if (!entry)
1498 pr_warning("Could not create debugfs "
1499 "'kprobe_profile' entry\n");
1500 return 0;
1501}
1502fs_initcall(init_kprobe_trace);
1503
1504
1505#ifdef CONFIG_FTRACE_STARTUP_TEST
1506
1507static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1508 int a4, int a5, int a6)
1509{
1510 return a1 + a2 + a3 + a4 + a5 + a6;
1511}
1512
1513static __init int kprobe_trace_self_tests_init(void)
1514{
1515 int ret;
1516 int (*target)(int, int, int, int, int, int);
1517
1518 target = kprobe_trace_selftest_target;
1519
1520 pr_info("Testing kprobe tracing: ");
1521
1522 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1523 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
1524 if (WARN_ON_ONCE(ret))
1525 pr_warning("error enabling function entry\n");
1526
1527 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1528 "$retval");
1529 if (WARN_ON_ONCE(ret))
1530 pr_warning("error enabling function return\n");
1531
1532 ret = target(1, 2, 3, 4, 5, 6);
1533
1534 cleanup_all_probes();
1535
1536 pr_cont("OK\n");
1537 return 0;
1538}
1539
1540late_initcall(kprobe_trace_self_tests_init);
1541
1542#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..acb87d4a4ac1
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,551 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27
28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35/*
36 * For now, restrict the number of symbols traced simultaneously to the
37 * number of available hardware breakpoint registers.
38 */
39#define KSYM_TRACER_MAX HBP_NUM
40
41#define KSYM_TRACER_OP_LEN 3 /* rw- */
42
43struct trace_ksym {
44 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter;
48#endif
49 struct hlist_node ksym_hlist;
50};
51
52static struct trace_array *ksym_trace_array;
53
54static unsigned int ksym_filter_entry_count;
55static unsigned int ksym_tracing_enabled;
56
57static HLIST_HEAD(ksym_filter_head);
58
59static DEFINE_MUTEX(ksym_tracer_mutex);
60
61#ifdef CONFIG_PROFILE_KSYM_TRACER
62
63#define MAX_UL_INT 0xffffffff
64
65void ksym_collect_stats(unsigned long hbp_hit_addr)
66{
67 struct hlist_node *node;
68 struct trace_ksym *entry;
69
70 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) &&
73 (entry->counter <= MAX_UL_INT)) {
74 entry->counter++;
75 break;
76 }
77 }
78 rcu_read_unlock();
79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81
82void ksym_hbp_handler(struct perf_event *hbp, int nmi,
83 struct perf_sample_data *data,
84 struct pt_regs *regs)
85{
86 struct ring_buffer_event *event;
87 struct ksym_trace_entry *entry;
88 struct ring_buffer *buffer;
89 int pc;
90
91 if (!ksym_tracing_enabled)
92 return;
93
94 buffer = ksym_trace_array->buffer;
95
96 pc = preempt_count();
97
98 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
99 sizeof(*entry), 0, pc);
100 if (!event)
101 return;
102
103 entry = ring_buffer_event_data(event);
104 entry->ip = instruction_pointer(regs);
105 entry->type = hw_breakpoint_type(hbp);
106 entry->addr = hw_breakpoint_addr(hbp);
107 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
108
109#ifdef CONFIG_PROFILE_KSYM_TRACER
110 ksym_collect_stats(hw_breakpoint_addr(hbp));
111#endif /* CONFIG_PROFILE_KSYM_TRACER */
112
113 trace_buffer_unlock_commit(buffer, event, 0, pc);
114}
115
116/* Valid access types are represented as
117 *
118 * rw- : Set Read/Write Access Breakpoint
119 * -w- : Set Write Access Breakpoint
120 * --- : Clear Breakpoints
121 * --x : Set Execution Breakpoint (Not available yet)
122 *
123 */
124static int ksym_trace_get_access_type(char *str)
125{
126 int access = 0;
127
128 if (str[0] == 'r')
129 access |= HW_BREAKPOINT_R;
130
131 if (str[1] == 'w')
132 access |= HW_BREAKPOINT_W;
133
134 if (str[2] == 'x')
135 access |= HW_BREAKPOINT_X;
136
137 switch (access) {
138 case HW_BREAKPOINT_R:
139 case HW_BREAKPOINT_W:
140 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
141 return access;
142 default:
143 return -EINVAL;
144 }
145}
146
147/*
148 * There can be several possible malformed requests and we attempt to capture
149 * all of them. The cases we check for and reject are:
150 * 1. Kernel symbols containing ':', since ':' is used as the delimiter;
151 *    i.e. multiple ':' symbols are disallowed. Such malformed requests
152 *    look like <module>:<ksym_name>:<op>.
153 * 2. No delimiter symbol ':' in the input string
154 * 3. Spurious operator symbols or symbols not in their respective positions
155 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
156 * 5. Kernel symbol not a part of /proc/kallsyms
157 * 6. Duplicate requests
158 */
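/*
 * Example filter writes (a sketch; "pid_max" is only an illustrative
 * kernel symbol and debugfs is assumed to be mounted at /sys/kernel/debug):
 *   echo 'pid_max:rw-' > /sys/kernel/debug/tracing/ksym_trace_filter
 *   echo '*:---'       > /sys/kernel/debug/tracing/ksym_trace_filter
 */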
159static int parse_ksym_trace_str(char *input_string, char **ksymname,
160 unsigned long *addr)
161{
162 int ret;
163
164 *ksymname = strsep(&input_string, ":");
165 *addr = kallsyms_lookup_name(*ksymname);
166
167 /* Check for malformed request: (2), (1) and (5) */
168 if ((!input_string) ||
169 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
170 (*addr == 0))
171 return -EINVAL;
172
173 ret = ksym_trace_get_access_type(input_string);
174
175 return ret;
176}
177
178int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
179{
180 struct trace_ksym *entry;
181 int ret = -ENOMEM;
182
183 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
184 printk(KERN_ERR "ksym_tracer: Maximum limit (%d) reached; no"
185 " new tracing requests can be accepted now.\n",
186 KSYM_TRACER_MAX);
187 return -ENOSPC;
188 }
189
190 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
191 if (!entry)
192 return -ENOMEM;
193
194 hw_breakpoint_init(&entry->attr);
195
196 entry->attr.bp_type = op;
197 entry->attr.bp_addr = addr;
198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
199
200 ret = -EAGAIN;
201 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
202 ksym_hbp_handler);
203
204 if (IS_ERR(entry->ksym_hbp)) {
205 ret = PTR_ERR(entry->ksym_hbp);
206 printk(KERN_INFO "ksym_tracer request failed. Try again"
207 " later!!\n");
208 goto err;
209 }
210
211 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
212 ksym_filter_entry_count++;
213
214 return 0;
215
216err:
217 kfree(entry);
218
219 return ret;
220}
221
222static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
223 size_t count, loff_t *ppos)
224{
225 struct trace_ksym *entry;
226 struct hlist_node *node;
227 struct trace_seq *s;
228 ssize_t cnt = 0;
229 int ret;
230
231 s = kmalloc(sizeof(*s), GFP_KERNEL);
232 if (!s)
233 return -ENOMEM;
234 trace_seq_init(s);
235
236 mutex_lock(&ksym_tracer_mutex);
237
238 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
239 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
240 if (entry->attr.bp_type == HW_BREAKPOINT_R)
241 ret = trace_seq_puts(s, "r--\n");
242 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
243 ret = trace_seq_puts(s, "-w-\n");
244 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
245 ret = trace_seq_puts(s, "rw-\n");
246 WARN_ON_ONCE(!ret);
247 }
248
249 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
250
251 mutex_unlock(&ksym_tracer_mutex);
252
253 kfree(s);
254
255 return cnt;
256}
257
258static void __ksym_trace_reset(void)
259{
260 struct trace_ksym *entry;
261 struct hlist_node *node, *node1;
262
263 mutex_lock(&ksym_tracer_mutex);
264 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
265 ksym_hlist) {
266 unregister_wide_hw_breakpoint(entry->ksym_hbp);
267 ksym_filter_entry_count--;
268 hlist_del_rcu(&(entry->ksym_hlist));
269 synchronize_rcu();
270 kfree(entry);
271 }
272 mutex_unlock(&ksym_tracer_mutex);
273}
274
275static ssize_t ksym_trace_filter_write(struct file *file,
276 const char __user *buffer,
277 size_t count, loff_t *ppos)
278{
279 struct trace_ksym *entry;
280 struct hlist_node *node;
281 char *input_string, *ksymname = NULL;
282 unsigned long ksym_addr = 0;
283 int ret, op, changed = 0;
284
285 input_string = kzalloc(count + 1, GFP_KERNEL);
286 if (!input_string)
287 return -ENOMEM;
288
289 if (copy_from_user(input_string, buffer, count)) {
290 kfree(input_string);
291 return -EFAULT;
292 }
293 input_string[count] = '\0';
294
295 strstrip(input_string);
296
297 /*
298 * Clear all breakpoints if:
299 * 1: echo > ksym_trace_filter
300 * 2: echo 0 > ksym_trace_filter
301 * 3: echo "*:---" > ksym_trace_filter
302 */
303 if (!input_string[0] || !strcmp(input_string, "0") ||
304 !strcmp(input_string, "*:---")) {
305 __ksym_trace_reset();
306 kfree(input_string);
307 return count;
308 }
309
310 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
311 if (ret < 0) {
312 kfree(input_string);
313 return ret;
314 }
315
316 mutex_lock(&ksym_tracer_mutex);
317
318 ret = -EINVAL;
319 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
320 if (entry->attr.bp_addr == ksym_addr) {
321 /* Check for malformed request: (6) */
322 if (entry->attr.bp_type != op)
323 changed = 1;
324 else
325 goto out;
326 break;
327 }
328 }
329 if (changed) {
330 unregister_wide_hw_breakpoint(entry->ksym_hbp);
331 entry->attr.bp_type = op;
332 ret = 0;
333 if (op > 0) {
334 entry->ksym_hbp =
335 register_wide_hw_breakpoint(&entry->attr,
336 ksym_hbp_handler);
337 if (IS_ERR(entry->ksym_hbp))
338 ret = PTR_ERR(entry->ksym_hbp);
339 else
340 goto out;
341 }
342 /* Error or "symbol:---" case: drop it */
343 ksym_filter_entry_count--;
344 hlist_del_rcu(&(entry->ksym_hlist));
345 synchronize_rcu();
346 kfree(entry);
347 goto out;
348 } else {
349 /* Check for malformed request: (4) */
350 if (op == 0)
351 goto out;
352 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
353 }
354out:
355 mutex_unlock(&ksym_tracer_mutex);
356
357 kfree(input_string);
358
359 if (!ret)
360 ret = count;
361 return ret;
362}
363
364static const struct file_operations ksym_tracing_fops = {
365 .open = tracing_open_generic,
366 .read = ksym_trace_filter_read,
367 .write = ksym_trace_filter_write,
368};
369
370static void ksym_trace_reset(struct trace_array *tr)
371{
372 ksym_tracing_enabled = 0;
373 __ksym_trace_reset();
374}
375
376static int ksym_trace_init(struct trace_array *tr)
377{
378 int cpu, ret = 0;
379
380 for_each_online_cpu(cpu)
381 tracing_reset(tr, cpu);
382 ksym_tracing_enabled = 1;
383 ksym_trace_array = tr;
384
385 return ret;
386}
387
388static void ksym_trace_print_header(struct seq_file *m)
389{
390 seq_puts(m,
391 "# TASK-PID CPU# Symbol "
392 "Type Function\n");
393 seq_puts(m,
394 "# | | | "
395 " | |\n");
396}
397
398static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
399{
400 struct trace_entry *entry = iter->ent;
401 struct trace_seq *s = &iter->seq;
402 struct ksym_trace_entry *field;
403 char str[KSYM_SYMBOL_LEN];
404 int ret;
405
406 if (entry->type != TRACE_KSYM)
407 return TRACE_TYPE_UNHANDLED;
408
409 trace_assign_type(field, entry);
410
411 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
412 entry->pid, iter->cpu, (char *)field->addr);
413 if (!ret)
414 return TRACE_TYPE_PARTIAL_LINE;
415
416 switch (field->type) {
417 case HW_BREAKPOINT_R:
418 ret = trace_seq_printf(s, " R ");
419 break;
420 case HW_BREAKPOINT_W:
421 ret = trace_seq_printf(s, " W ");
422 break;
423 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
424 ret = trace_seq_printf(s, " RW ");
425 break;
426 default:
427 return TRACE_TYPE_PARTIAL_LINE;
428 }
429
430 if (!ret)
431 return TRACE_TYPE_PARTIAL_LINE;
432
433 sprint_symbol(str, field->ip);
434 ret = trace_seq_printf(s, "%s\n", str);
435 if (!ret)
436 return TRACE_TYPE_PARTIAL_LINE;
437
438 return TRACE_TYPE_HANDLED;
439}
440
441struct tracer ksym_tracer __read_mostly =
442{
443 .name = "ksym_tracer",
444 .init = ksym_trace_init,
445 .reset = ksym_trace_reset,
446#ifdef CONFIG_FTRACE_SELFTEST
447 .selftest = trace_selftest_startup_ksym,
448#endif
449 .print_header = ksym_trace_print_header,
450 .print_line = ksym_trace_output
451};
452
453__init static int init_ksym_trace(void)
454{
455 struct dentry *d_tracer;
456 struct dentry *entry;
457
458 d_tracer = tracing_init_dentry();
459 ksym_filter_entry_count = 0;
460
461 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
462 NULL, &ksym_tracing_fops);
463 if (!entry)
464 pr_warning("Could not create debugfs "
465 "'ksym_trace_filter' file\n");
466
467 return register_tracer(&ksym_tracer);
468}
469device_initcall(init_ksym_trace);
470
471
472#ifdef CONFIG_PROFILE_KSYM_TRACER
473static int ksym_tracer_stat_headers(struct seq_file *m)
474{
475 seq_puts(m, " Access Type ");
476 seq_puts(m, " Symbol Counter\n");
477 seq_puts(m, " ----------- ");
478 seq_puts(m, " ------ -------\n");
479 return 0;
480}
481
482static int ksym_tracer_stat_show(struct seq_file *m, void *v)
483{
484 struct hlist_node *stat = v;
485 struct trace_ksym *entry;
486 int access_type = 0;
487 char fn_name[KSYM_NAME_LEN];
488
489 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
490
491 access_type = entry->attr.bp_type;
492
493 switch (access_type) {
494 case HW_BREAKPOINT_R:
495 seq_puts(m, " R ");
496 break;
497 case HW_BREAKPOINT_W:
498 seq_puts(m, " W ");
499 break;
500 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
501 seq_puts(m, " RW ");
502 break;
503 default:
504 seq_puts(m, " NA ");
505 }
506
507 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
508 seq_printf(m, " %-36s", fn_name);
509 else
510 seq_printf(m, " %-36s", "<NA>");
511 seq_printf(m, " %15lu\n", entry->counter);
512
513 return 0;
514}
515
516static void *ksym_tracer_stat_start(struct tracer_stat *trace)
517{
518 return ksym_filter_head.first;
519}
520
521static void *
522ksym_tracer_stat_next(void *v, int idx)
523{
524 struct hlist_node *stat = v;
525
526 return stat->next;
527}
528
529static struct tracer_stat ksym_tracer_stats = {
530 .name = "ksym_tracer",
531 .stat_start = ksym_tracer_stat_start,
532 .stat_next = ksym_tracer_stat_next,
533 .stat_headers = ksym_tracer_stat_headers,
534 .stat_show = ksym_tracer_stat_show
535};
536
537__init static int ksym_tracer_stat_init(void)
538{
539 int ret;
540
541 ret = register_stat_tracer(&ksym_tracer_stats);
542 if (ret) {
543 printk(KERN_WARNING "Warning: could not register "
544 "ksym tracer stats\n");
545 return 1;
546 }
547
548 return 0;
549}
550fs_initcall(ksym_tracer_stat_init);
551#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f572f44c6e1e..8e46b3323cdc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 23
24static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
25 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s) 26int trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 int ret;
30
31 ret = seq_write(m, s->buffer, len);
29 32
30 seq_write(m, s->buffer, len); 33 /*
34 * Only reset this buffer if we successfully wrote to the
35 * seq_file buffer.
36 */
37 if (!ret)
38 trace_seq_init(s);
31 39
32 trace_seq_init(s); 40 return ret;
33} 41}
34 42
35enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 43enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -69,6 +77,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
69 * @s: trace sequence descriptor 77 * @s: trace sequence descriptor
70 * @fmt: printf format string 78 * @fmt: printf format string
71 * 79 *
 80 * It returns 0 if the trace does not fit in the buffer's free
 81 * space, 1 otherwise.
82 *
72 * The tracer may use either sequence operations or its own 83 * The tracer may use either sequence operations or its own
73 * copy to user routines. To simplify formating of a trace 84 * copy to user routines. To simplify formating of a trace
74 * trace_seq_printf is used to store strings into a special 85 * trace_seq_printf is used to store strings into a special
@@ -82,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
82 va_list ap; 93 va_list ap;
83 int ret; 94 int ret;
84 95
85 if (!len) 96 if (s->full || !len)
86 return 0; 97 return 0;
87 98
88 va_start(ap, fmt); 99 va_start(ap, fmt);
@@ -90,12 +101,14 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
90 va_end(ap); 101 va_end(ap);
91 102
92 /* If we can't write it all, don't bother writing anything */ 103 /* If we can't write it all, don't bother writing anything */
93 if (ret >= len) 104 if (ret >= len) {
105 s->full = 1;
94 return 0; 106 return 0;
107 }
95 108
96 s->len += ret; 109 s->len += ret;
97 110
98 return len; 111 return 1;
99} 112}
100EXPORT_SYMBOL_GPL(trace_seq_printf); 113EXPORT_SYMBOL_GPL(trace_seq_printf);
101 114
@@ -116,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
116 int len = (PAGE_SIZE - 1) - s->len; 129 int len = (PAGE_SIZE - 1) - s->len;
117 int ret; 130 int ret;
118 131
119 if (!len) 132 if (s->full || !len)
120 return 0; 133 return 0;
121 134
122 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 135 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
123 136
124 /* If we can't write it all, don't bother writing anything */ 137 /* If we can't write it all, don't bother writing anything */
125 if (ret >= len) 138 if (ret >= len) {
139 s->full = 1;
126 return 0; 140 return 0;
141 }
127 142
128 s->len += ret; 143 s->len += ret;
129 144
@@ -136,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
136 int len = (PAGE_SIZE - 1) - s->len; 151 int len = (PAGE_SIZE - 1) - s->len;
137 int ret; 152 int ret;
138 153
139 if (!len) 154 if (s->full || !len)
140 return 0; 155 return 0;
141 156
142 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 157 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
143 158
144 /* If we can't write it all, don't bother writing anything */ 159 /* If we can't write it all, don't bother writing anything */
145 if (ret >= len) 160 if (ret >= len) {
161 s->full = 1;
146 return 0; 162 return 0;
163 }
147 164
148 s->len += ret; 165 s->len += ret;
149 166
@@ -164,9 +181,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
164{ 181{
165 int len = strlen(str); 182 int len = strlen(str);
166 183
167 if (len > ((PAGE_SIZE - 1) - s->len)) 184 if (s->full)
168 return 0; 185 return 0;
169 186
187 if (len > ((PAGE_SIZE - 1) - s->len)) {
188 s->full = 1;
189 return 0;
190 }
191
170 memcpy(s->buffer + s->len, str, len); 192 memcpy(s->buffer + s->len, str, len);
171 s->len += len; 193 s->len += len;
172 194
@@ -175,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
175 197
176int trace_seq_putc(struct trace_seq *s, unsigned char c) 198int trace_seq_putc(struct trace_seq *s, unsigned char c)
177{ 199{
178 if (s->len >= (PAGE_SIZE - 1)) 200 if (s->full)
179 return 0; 201 return 0;
180 202
203 if (s->len >= (PAGE_SIZE - 1)) {
204 s->full = 1;
205 return 0;
206 }
207
181 s->buffer[s->len++] = c; 208 s->buffer[s->len++] = c;
182 209
183 return 1; 210 return 1;
@@ -185,8 +212,13 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
185 212
186int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
187{ 214{
188 if (len > ((PAGE_SIZE - 1) - s->len)) 215 if (s->full)
216 return 0;
217
218 if (len > ((PAGE_SIZE - 1) - s->len)) {
219 s->full = 1;
189 return 0; 220 return 0;
221 }
190 222
191 memcpy(s->buffer + s->len, mem, len); 223 memcpy(s->buffer + s->len, mem, len);
192 s->len += len; 224 s->len += len;
@@ -200,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
200 const unsigned char *data = mem; 232 const unsigned char *data = mem;
201 int i, j; 233 int i, j;
202 234
235 if (s->full)
236 return 0;
237
203#ifdef __BIG_ENDIAN 238#ifdef __BIG_ENDIAN
204 for (i = 0, j = 0; i < len; i++) { 239 for (i = 0, j = 0; i < len; i++) {
205#else 240#else
@@ -217,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
217{ 252{
218 void *ret; 253 void *ret;
219 254
220 if (len > ((PAGE_SIZE - 1) - s->len)) 255 if (s->full)
256 return 0;
257
258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1;
221 return NULL; 260 return NULL;
261 }
222 262
223 ret = s->buffer + s->len; 263 ret = s->buffer + s->len;
224 s->len += len; 264 s->len += len;
@@ -230,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
230{ 270{
231 unsigned char *p; 271 unsigned char *p;
232 272
233 if (s->len >= (PAGE_SIZE - 1)) 273 if (s->full)
274 return 0;
275
276 if (s->len >= (PAGE_SIZE - 1)) {
277 s->full = 1;
234 return 0; 278 return 0;
279 }
280
235 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 281 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
236 if (!IS_ERR(p)) { 282 if (!IS_ERR(p)) {
237 p = mangle_path(s->buffer + s->len, p, "\n"); 283 p = mangle_path(s->buffer + s->len, p, "\n");
@@ -244,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
244 return 1; 290 return 1;
245 } 291 }
246 292
293 s->full = 1;
247 return 0; 294 return 0;
248} 295}
249 296
@@ -370,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
370 unsigned long vmstart = 0; 417 unsigned long vmstart = 0;
371 int ret = 1; 418 int ret = 1;
372 419
420 if (s->full)
421 return 0;
422
373 if (mm) { 423 if (mm) {
374 const struct vm_area_struct *vma; 424 const struct vm_area_struct *vma;
375 425
@@ -486,16 +536,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
486 hardirq ? 'h' : softirq ? 's' : '.')) 536 hardirq ? 'h' : softirq ? 's' : '.'))
487 return 0; 537 return 0;
488 538
489 if (entry->lock_depth < 0) 539 if (entry->preempt_count)
490 ret = trace_seq_putc(s, '.'); 540 ret = trace_seq_printf(s, "%x", entry->preempt_count);
491 else 541 else
492 ret = trace_seq_printf(s, "%d", entry->lock_depth); 542 ret = trace_seq_putc(s, '.');
543
493 if (!ret) 544 if (!ret)
494 return 0; 545 return 0;
495 546
496 if (entry->preempt_count) 547 if (entry->lock_depth < 0)
497 return trace_seq_printf(s, "%x", entry->preempt_count); 548 return trace_seq_putc(s, '.');
498 return trace_seq_putc(s, '.'); 549
550 return trace_seq_printf(s, "%d", entry->lock_depth);
499} 551}
500 552
501static int 553static int
@@ -883,7 +935,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
883 trace_assign_type(field, iter->ent); 935 trace_assign_type(field, iter->ent);
884 936
885 if (!S) 937 if (!S)
886 task_state_char(field->prev_state); 938 S = task_state_char(field->prev_state);
887 T = task_state_char(field->next_state); 939 T = task_state_char(field->next_state);
888 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 940 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
889 field->prev_pid, 941 field->prev_pid,
@@ -918,7 +970,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
918 trace_assign_type(field, iter->ent); 970 trace_assign_type(field, iter->ent);
919 971
920 if (!S) 972 if (!S)
921 task_state_char(field->prev_state); 973 S = task_state_char(field->prev_state);
922 T = task_state_char(field->next_state); 974 T = task_state_char(field->next_state);
923 975
924 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 976 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 26185d727676..0271742abb8d 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -28,8 +28,8 @@ static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
29static int wakeup_rt; 29static int wakeup_rt;
30 30
31static raw_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void __wakeup_reset(struct trace_array *tr); 34static void __wakeup_reset(struct trace_array *tr);
35 35
@@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
143 goto out; 143 goto out;
144 144
145 local_irq_save(flags); 145 local_irq_save(flags);
146 __raw_spin_lock(&wakeup_lock); 146 arch_spin_lock(&wakeup_lock);
147 147
148 /* We could race with grabbing wakeup_lock */ 148 /* We could race with grabbing wakeup_lock */
149 if (unlikely(!tracer_enabled || next != wakeup_task)) 149 if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
169 169
170out_unlock: 170out_unlock:
171 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
172 __raw_spin_unlock(&wakeup_lock); 172 arch_spin_unlock(&wakeup_lock);
173 local_irq_restore(flags); 173 local_irq_restore(flags);
174out: 174out:
175 atomic_dec(&wakeup_trace->data[cpu]->disabled); 175 atomic_dec(&wakeup_trace->data[cpu]->disabled);
@@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr)
193 tracing_reset_online_cpus(tr); 193 tracing_reset_online_cpus(tr);
194 194
195 local_irq_save(flags); 195 local_irq_save(flags);
196 __raw_spin_lock(&wakeup_lock); 196 arch_spin_lock(&wakeup_lock);
197 __wakeup_reset(tr); 197 __wakeup_reset(tr);
198 __raw_spin_unlock(&wakeup_lock); 198 arch_spin_unlock(&wakeup_lock);
199 local_irq_restore(flags); 199 local_irq_restore(flags);
200} 200}
201 201
@@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
225 goto out; 225 goto out;
226 226
227 /* interrupts should be off from try_to_wake_up */ 227 /* interrupts should be off from try_to_wake_up */
228 __raw_spin_lock(&wakeup_lock); 228 arch_spin_lock(&wakeup_lock);
229 229
230 /* check for races. */ 230 /* check for races. */
231 if (!tracer_enabled || p->prio >= wakeup_prio) 231 if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 256
257out_locked: 257out_locked:
258 __raw_spin_unlock(&wakeup_lock); 258 arch_spin_unlock(&wakeup_lock);
259out: 259out:
260 atomic_dec(&wakeup_trace->data[cpu]->disabled); 260 atomic_dec(&wakeup_trace->data[cpu]->disabled);
261} 261}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..280fea470d67 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM:
20 return 1; 21 return 1;
21 } 22 }
22 return 0; 23 return 0;
@@ -66,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
66 67
67 /* Don't allow flipping of max traces now */ 68 /* Don't allow flipping of max traces now */
68 local_irq_save(flags); 69 local_irq_save(flags);
69 __raw_spin_lock(&ftrace_max_lock); 70 arch_spin_lock(&ftrace_max_lock);
70 71
71 cnt = ring_buffer_entries(tr->buffer); 72 cnt = ring_buffer_entries(tr->buffer);
72 73
@@ -84,7 +85,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
84 break; 85 break;
85 } 86 }
86 tracing_on(); 87 tracing_on();
87 __raw_spin_unlock(&ftrace_max_lock); 88 arch_spin_unlock(&ftrace_max_lock);
88 local_irq_restore(flags); 89 local_irq_restore(flags);
89 90
90 if (count) 91 if (count)
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 809 return ret;
809} 810}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy;
815
816int
817trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
818{
819 unsigned long count;
820 int ret;
821
822 /* start the tracing */
823 ret = tracer_init(trace, tr);
824 if (ret) {
825 warn_failed_init_tracer(trace, ret);
826 return ret;
827 }
828
829 ksym_selftest_dummy = 0;
830 /* Register the read-write tracing request */
831
832 ret = process_new_ksym_entry("ksym_selftest_dummy",
833 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
834 (unsigned long)(&ksym_selftest_dummy));
835
836 if (ret < 0) {
837 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
838 goto ret_path;
839 }
840 /* Perform a read and a write operation over the dummy variable to
841 * trigger the tracer
842 */
843 if (ksym_selftest_dummy == 0)
844 ksym_selftest_dummy++;
845
846 /* stop the tracing. */
847 tracing_stop();
848 /* check the trace buffer */
849 ret = trace_test_buffer(tr, &count);
850 trace->reset(tr);
851 tracing_start();
852
853 /* read & write operations - one each is performed on the dummy variable
854 * triggering two entries in the trace buffer
855 */
856 if (!ret && count != 2) {
857 printk(KERN_CONT "Ksym tracer startup test failed");
858 ret = -1;
859 }
860
861ret_path:
862 return ret;
863}
864#endif /* CONFIG_KSYM_TRACER */
865
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0f6facb050a1..678a5120ee30 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = {
27}; 27};
28 28
29static unsigned long max_stack_size; 29static unsigned long max_stack_size;
30static raw_spinlock_t max_stack_lock = 30static arch_spinlock_t max_stack_lock =
31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 31 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
32 32
33static int stack_trace_disabled __read_mostly; 33static int stack_trace_disabled __read_mostly;
34static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
@@ -54,7 +54,7 @@ static inline void check_stack(void)
54 return; 54 return;
55 55
56 local_irq_save(flags); 56 local_irq_save(flags);
57 __raw_spin_lock(&max_stack_lock); 57 arch_spin_lock(&max_stack_lock);
58 58
59 /* a race could have already updated it */ 59 /* a race could have already updated it */
60 if (this_size <= max_stack_size) 60 if (this_size <= max_stack_size)
@@ -103,7 +103,7 @@ static inline void check_stack(void)
103 } 103 }
104 104
105 out: 105 out:
106 __raw_spin_unlock(&max_stack_lock); 106 arch_spin_unlock(&max_stack_lock);
107 local_irq_restore(flags); 107 local_irq_restore(flags);
108} 108}
109 109
@@ -171,9 +171,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 171 return ret;
172 172
173 local_irq_save(flags); 173 local_irq_save(flags);
174 __raw_spin_lock(&max_stack_lock); 174 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 175 *ptr = val;
176 __raw_spin_unlock(&max_stack_lock); 176 arch_spin_unlock(&max_stack_lock);
177 local_irq_restore(flags); 177 local_irq_restore(flags);
178 178
179 return count; 179 return count;
@@ -207,7 +207,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
207static void *t_start(struct seq_file *m, loff_t *pos) 207static void *t_start(struct seq_file *m, loff_t *pos)
208{ 208{
209 local_irq_disable(); 209 local_irq_disable();
210 __raw_spin_lock(&max_stack_lock); 210 arch_spin_lock(&max_stack_lock);
211 211
212 if (*pos == 0) 212 if (*pos == 0)
213 return SEQ_START_TOKEN; 213 return SEQ_START_TOKEN;
@@ -217,7 +217,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 217
218static void t_stop(struct seq_file *m, void *p) 218static void t_stop(struct seq_file *m, void *p)
219{ 219{
220 __raw_spin_unlock(&max_stack_lock); 220 arch_spin_unlock(&max_stack_lock);
221 local_irq_enable(); 221 local_irq_enable();
222} 222}
223 223
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
296 296
297int 297int
298stack_trace_sysctl(struct ctl_table *table, int write, 298stack_trace_sysctl(struct ctl_table *table, int write,
299 struct file *file, void __user *buffer, size_t *lenp, 299 void __user *buffer, size_t *lenp,
300 loff_t *ppos) 300 loff_t *ppos)
301{ 301{
302 int ret; 302 int ret;
303 303
304 mutex_lock(&stack_sysctl_mutex); 304 mutex_lock(&stack_sysctl_mutex);
305 305
306 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, buffer, lenp, ppos);
307 307
308 if (ret || !write || 308 if (ret || !write ||
309 (last_stack_tracer_enabled == !!stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9fbce6c9d2e1..57501d90096a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -14,6 +14,43 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 16
17extern unsigned long __start_syscalls_metadata[];
18extern unsigned long __stop_syscalls_metadata[];
19
20static struct syscall_metadata **syscalls_metadata;
21
22static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
23{
24 struct syscall_metadata *start;
25 struct syscall_metadata *stop;
26 char str[KSYM_SYMBOL_LEN];
27
28
29 start = (struct syscall_metadata *)__start_syscalls_metadata;
30 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
31 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
32
33 for ( ; start < stop; start++) {
34 /*
35 * Only compare after the "sys" prefix. Archs that use
36 * syscall wrappers may have syscalls symbols aliases prefixed
37 * with "SyS" instead of "sys", leading to an unwanted
38 * mismatch.
39 */
40 if (start->name && !strcmp(start->name + 3, str + 3))
41 return start;
42 }
43 return NULL;
44}
45
46static struct syscall_metadata *syscall_nr_to_meta(int nr)
47{
48 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
49 return NULL;
50
51 return syscalls_metadata[nr];
52}
53
17enum print_line_t 54enum print_line_t
18print_syscall_enter(struct trace_iterator *iter, int flags) 55print_syscall_enter(struct trace_iterator *iter, int flags)
19{ 56{
@@ -30,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
30 if (!entry) 67 if (!entry)
31 goto end; 68 goto end;
32 69
33 if (entry->enter_id != ent->type) { 70 if (entry->enter_event->id != ent->type) {
34 WARN_ON_ONCE(1); 71 WARN_ON_ONCE(1);
35 goto end; 72 goto end;
36 } 73 }
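
The matching rule in find_syscall_meta() above is easy to miss in the hunk: the comparison skips the first three characters on both sides, so a wrapper alias such as "SyS_read" still matches metadata recorded as "sys_read". A minimal userspace sketch of just that comparison follows; the metadata table and symbol names are invented for illustration, not taken from the kernel.

#include <stdio.h>
#include <string.h>

struct meta { const char *name; int nb_args; };

/* Stand-in for the __start/__stop_syscalls_metadata section. */
static struct meta table[] = {
        { "sys_read",  3 },
        { "sys_write", 3 },
        { "sys_open",  3 },
};

/* Compare past the "sys"/"SyS" prefix, as find_syscall_meta() does. */
static struct meta *find_meta(const char *symbol)
{
        size_t i;

        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                if (!strcmp(table[i].name + 3, symbol + 3))
                        return &table[i];
        }
        return NULL;
}

int main(void)
{
        struct meta *m = find_meta("SyS_write");        /* wrapper-style alias */

        printf("SyS_write -> %s\n", m ? m->name : "(none)");
        return 0;
}
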
@@ -85,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
85 return TRACE_TYPE_HANDLED; 122 return TRACE_TYPE_HANDLED;
86 } 123 }
87 124
88 if (entry->exit_id != ent->type) { 125 if (entry->exit_event->id != ent->type) {
89 WARN_ON_ONCE(1); 126 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED; 127 return TRACE_TYPE_UNHANDLED;
91 } 128 }
@@ -103,24 +140,19 @@ extern char *__bad_type_size(void);
103#define SYSCALL_FIELD(type, name) \ 140#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \ 141 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \ 142 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type)
107 145
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
109{ 147{
110 int i; 148 int i;
111 int nr;
112 int ret; 149 int ret;
113 struct syscall_metadata *entry; 150 struct syscall_metadata *entry = call->data;
114 struct syscall_trace_enter trace; 151 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args); 152 int offset = offsetof(struct syscall_trace_enter, args);
116 153
117 nr = syscall_name_to_nr(call->data); 154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
118 entry = syscall_nr_to_meta(nr); 155 "\tsigned:%u;\n",
119
120 if (!entry)
121 return 0;
122
123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
124 SYSCALL_FIELD(int, nr)); 156 SYSCALL_FIELD(int, nr));
125 if (!ret) 157 if (!ret)
126 return 0; 158 return 0;
@@ -130,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
130 entry->args[i]); 162 entry->args[i]);
131 if (!ret) 163 if (!ret)
132 return 0; 164 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, 165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
134 sizeof(unsigned long)); 166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
135 if (!ret) 169 if (!ret)
136 return 0; 170 return 0;
137 offset += sizeof(unsigned long); 171 offset += sizeof(unsigned long);
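
The new "signed:%u" column in the format output hinges on is_signed_type(type) collapsing to 0 or 1 at compile time. The kernel's own definition lives in its tracing headers; the macro below is an assumed stand-alone approximation, shown only to make the trick concrete: casting -1 to an unsigned type wraps to a huge value, so the comparison fails.

#include <stdio.h>

/* Assumed approximation of the kernel's is_signed_type() idea. */
#define IS_SIGNED_TYPE(type) (((type)-1) < (type)1)

int main(void)
{
        printf("int:           %u\n", (unsigned)IS_SIGNED_TYPE(int));
        printf("long:          %u\n", (unsigned)IS_SIGNED_TYPE(long));
        printf("unsigned long: %u\n", (unsigned)IS_SIGNED_TYPE(unsigned long));
        return 0;
}
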
@@ -163,10 +197,12 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
163 struct syscall_trace_exit trace; 197 struct syscall_trace_exit trace;
164 198
165 ret = trace_seq_printf(s, 199 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
168 SYSCALL_FIELD(int, nr), 204 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret)); 205 SYSCALL_FIELD(long, ret));
170 if (!ret) 206 if (!ret)
171 return 0; 207 return 0;
172 208
@@ -176,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
176int syscall_enter_define_fields(struct ftrace_event_call *call) 212int syscall_enter_define_fields(struct ftrace_event_call *call)
177{ 213{
178 struct syscall_trace_enter trace; 214 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta; 215 struct syscall_metadata *meta = call->data;
180 int ret; 216 int ret;
181 int nr;
182 int i; 217 int i;
183 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
184 219
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call); 220 ret = trace_define_common_fields(call);
192 if (ret) 221 if (ret)
193 return ret; 222 return ret;
194 223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret)
226 return ret;
227
195 for (i = 0; i < meta->nb_args; i++) { 228 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i], 229 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset, 230 meta->args[i], offset,
@@ -212,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
212 if (ret) 245 if (ret)
213 return ret; 246 return ret;
214 247
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, 248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret)
250 return ret;
251
252 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
216 FILTER_OTHER); 253 FILTER_OTHER);
217 254
218 return ret; 255 return ret;
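
The exit-event hunks now describe the leading nr field explicitly and record the return value as a signed long instead of an unsigned long. What SYSCALL_FIELD() hands to trace_define_field() is essentially a (type, name, offset, size) tuple derived from the record layout; the userspace sketch below mimics that bookkeeping with an invented record struct.

#include <stdio.h>
#include <stddef.h>

/* Hypothetical exit-record layout, matching the fields defined above. */
struct syscall_exit_rec {
        int  nr;
        long ret;
};

/* Roughly the tuple a SYSCALL_FIELD()-style macro would expand to. */
#define DESCRIBE_FIELD(rec, type, field)                                \
        printf("field:%s %s;\toffset:%zu;\tsize:%zu;\n",                \
               #type, #field, offsetof(struct rec, field),              \
               sizeof(((struct rec *)0)->field))

int main(void)
{
        DESCRIBE_FIELD(syscall_exit_rec, int, nr);
        DESCRIBE_FIELD(syscall_exit_rec, long, ret);
        return 0;
}
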
@@ -239,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
239 276
240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 277 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
241 278
242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, 279 event = trace_current_buffer_lock_reserve(&buffer,
243 size, 0, 0); 280 sys_data->enter_event->id, size, 0, 0);
244 if (!event) 281 if (!event)
245 return; 282 return;
246 283
@@ -271,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
271 if (!sys_data) 308 if (!sys_data)
272 return; 309 return;
273 310
274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, 311 event = trace_current_buffer_lock_reserve(&buffer,
275 sizeof(*entry), 0, 0); 312 sys_data->exit_event->id, sizeof(*entry), 0, 0);
276 if (!event) 313 if (!event)
277 return; 314 return;
278 315
@@ -285,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 322 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
286} 323}
287 324
288int reg_event_syscall_enter(void *ptr) 325int reg_event_syscall_enter(struct ftrace_event_call *call)
289{ 326{
290 int ret = 0; 327 int ret = 0;
291 int num; 328 int num;
292 char *name;
293 329
294 name = (char *)ptr; 330 num = ((struct syscall_metadata *)call->data)->syscall_nr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls) 331 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS; 332 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock); 333 mutex_lock(&syscall_trace_lock);
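
reg_event_syscall_enter() now reads the syscall number straight out of call->data instead of resolving it by name, but the surrounding pattern is unchanged: a global mutex plus a per-direction refcount, attaching the tracepoint only for the first user and detaching it with the last. Below is a small pthread-based sketch of that enable/disable pattern (build with cc -pthread); all names are invented and this is not kernel code.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount;

static void do_register(void)   { puts("probe attached"); }
static void do_unregister(void) { puts("probe detached"); }

static void event_enable(void)
{
        pthread_mutex_lock(&trace_lock);
        if (!refcount)                  /* first user attaches the probe */
                do_register();
        refcount++;
        pthread_mutex_unlock(&trace_lock);
}

static void event_disable(void)
{
        pthread_mutex_lock(&trace_lock);
        refcount--;
        if (!refcount)                  /* last user detaches it */
                do_unregister();
        pthread_mutex_unlock(&trace_lock);
}

int main(void)
{
        event_enable();
        event_enable();                 /* second enable only bumps the count */
        event_disable();
        event_disable();                /* count hits zero, probe goes away */
        return 0;
}
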
@@ -309,13 +344,11 @@ int reg_event_syscall_enter(void *ptr)
309 return ret; 344 return ret;
310} 345}
311 346
312void unreg_event_syscall_enter(void *ptr) 347void unreg_event_syscall_enter(struct ftrace_event_call *call)
313{ 348{
314 int num; 349 int num;
315 char *name;
316 350
317 name = (char *)ptr; 351 num = ((struct syscall_metadata *)call->data)->syscall_nr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls) 352 if (num < 0 || num >= NR_syscalls)
320 return; 353 return;
321 mutex_lock(&syscall_trace_lock); 354 mutex_lock(&syscall_trace_lock);
@@ -326,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr)
326 mutex_unlock(&syscall_trace_lock); 359 mutex_unlock(&syscall_trace_lock);
327} 360}
328 361
329int reg_event_syscall_exit(void *ptr) 362int reg_event_syscall_exit(struct ftrace_event_call *call)
330{ 363{
331 int ret = 0; 364 int ret = 0;
332 int num; 365 int num;
333 char *name;
334 366
335 name = (char *)ptr; 367 num = ((struct syscall_metadata *)call->data)->syscall_nr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls) 368 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS; 369 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock); 370 mutex_lock(&syscall_trace_lock);
@@ -350,13 +381,11 @@ int reg_event_syscall_exit(void *ptr)
350 return ret; 381 return ret;
351} 382}
352 383
353void unreg_event_syscall_exit(void *ptr) 384void unreg_event_syscall_exit(struct ftrace_event_call *call)
354{ 385{
355 int num; 386 int num;
356 char *name;
357 387
358 name = (char *)ptr; 388 num = ((struct syscall_metadata *)call->data)->syscall_nr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls) 389 if (num < 0 || num >= NR_syscalls)
361 return; 390 return;
362 mutex_lock(&syscall_trace_lock); 391 mutex_lock(&syscall_trace_lock);
@@ -367,13 +396,44 @@ void unreg_event_syscall_exit(void *ptr)
367 mutex_unlock(&syscall_trace_lock); 396 mutex_unlock(&syscall_trace_lock);
368} 397}
369 398
370struct trace_event event_syscall_enter = { 399int init_syscall_trace(struct ftrace_event_call *call)
371 .trace = print_syscall_enter, 400{
372}; 401 int id;
402
403 id = register_ftrace_event(call->event);
404 if (!id)
405 return -ENODEV;
406 call->id = id;
407 INIT_LIST_HEAD(&call->fields);
408 return 0;
409}
410
411int __init init_ftrace_syscalls(void)
412{
413 struct syscall_metadata *meta;
414 unsigned long addr;
415 int i;
416
417 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
418 NR_syscalls, GFP_KERNEL);
419 if (!syscalls_metadata) {
420 WARN_ON(1);
421 return -ENOMEM;
422 }
423
424 for (i = 0; i < NR_syscalls; i++) {
425 addr = arch_syscall_addr(i);
426 meta = find_syscall_meta(addr);
427 if (!meta)
428 continue;
429
430 meta->syscall_nr = i;
431 syscalls_metadata[i] = meta;
432 }
373 433
374struct trace_event event_syscall_exit = { 434 return 0;
375 .trace = print_syscall_exit, 435}
376}; 436core_initcall(init_ftrace_syscalls);
377 437
378#ifdef CONFIG_EVENT_PROFILE 438#ifdef CONFIG_EVENT_PROFILE
379 439
@@ -387,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
387 struct syscall_metadata *sys_data; 447 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec; 448 struct syscall_trace_enter *rec;
389 unsigned long flags; 449 unsigned long flags;
450 char *trace_buf;
390 char *raw_data; 451 char *raw_data;
391 int syscall_nr; 452 int syscall_nr;
453 int rctx;
392 int size; 454 int size;
393 int cpu; 455 int cpu;
394 456
@@ -412,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
412 /* Protect the per cpu buffer, begin the rcu read side */ 474 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags); 475 local_irq_save(flags);
414 476
477 rctx = perf_swevent_get_recursion_context();
478 if (rctx < 0)
479 goto end_recursion;
480
415 cpu = smp_processor_id(); 481 cpu = smp_processor_id();
416 482
417 if (in_nmi()) 483 trace_buf = rcu_dereference(perf_trace_buf);
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421 484
422 if (!raw_data) 485 if (!trace_buf)
423 goto end; 486 goto end;
424 487
425 raw_data = per_cpu_ptr(raw_data, cpu); 488 raw_data = per_cpu_ptr(trace_buf, cpu);
426 489
427 /* zero the dead bytes from align to not leak stack to user */ 490 /* zero the dead bytes from align to not leak stack to user */
428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 491 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
429 492
430 rec = (struct syscall_trace_enter *) raw_data; 493 rec = (struct syscall_trace_enter *) raw_data;
431 tracing_generic_entry_update(&rec->ent, 0, 0); 494 tracing_generic_entry_update(&rec->ent, 0, 0);
432 rec->ent.type = sys_data->enter_id; 495 rec->ent.type = sys_data->enter_event->id;
433 rec->nr = syscall_nr; 496 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 497 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args); 498 (unsigned long *)&rec->args);
436 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 499 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
437 500
438end: 501end:
502 perf_swevent_put_recursion_context(rctx);
503end_recursion:
439 local_irq_restore(flags); 504 local_irq_restore(flags);
440} 505}
441 506
442int reg_prof_syscall_enter(char *name) 507int prof_sysenter_enable(struct ftrace_event_call *call)
443{ 508{
444 int ret = 0; 509 int ret = 0;
445 int num; 510 int num;
446 511
447 num = syscall_name_to_nr(name); 512 num = ((struct syscall_metadata *)call->data)->syscall_nr;
448 if (num < 0 || num >= NR_syscalls)
449 return -ENOSYS;
450 513
451 mutex_lock(&syscall_trace_lock); 514 mutex_lock(&syscall_trace_lock);
452 if (!sys_prof_refcount_enter) 515 if (!sys_prof_refcount_enter)
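
The profiling path now brackets its use of the shared perf_trace_buf with perf_swevent_get_recursion_context()/perf_swevent_put_recursion_context(): if the handler ends up re-entering itself on the same CPU while the buffer is being filled, it backs off instead of overwriting data still in flight. Below is a loose userspace analogy using a thread-local depth flag; the helper names are invented and the real interface is considerably more involved.

#include <stdio.h>

/* Per-thread stand-in for the kernel's per-CPU recursion tracking. */
static __thread int in_handler;

static int get_recursion_context(void)
{
        if (in_handler)
                return -1;              /* already inside: refuse to recurse */
        in_handler = 1;
        return 0;
}

static void put_recursion_context(void)
{
        in_handler = 0;
}

static void trace_event(const char *what, int depth)
{
        if (get_recursion_context() < 0) {
                printf("dropped nested event: %s\n", what);
                return;
        }
        printf("recording: %s\n", what);
        if (depth)                      /* simulate a re-entrant hook */
                trace_event("nested", depth - 1);
        put_recursion_context();
}

int main(void)
{
        trace_event("top-level", 1);
        return 0;
}
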
@@ -462,13 +525,11 @@ int reg_prof_syscall_enter(char *name)
462 return ret; 525 return ret;
463} 526}
464 527
465void unreg_prof_syscall_enter(char *name) 528void prof_sysenter_disable(struct ftrace_event_call *call)
466{ 529{
467 int num; 530 int num;
468 531
469 num = syscall_name_to_nr(name); 532 num = ((struct syscall_metadata *)call->data)->syscall_nr;
470 if (num < 0 || num >= NR_syscalls)
471 return;
472 533
473 mutex_lock(&syscall_trace_lock); 534 mutex_lock(&syscall_trace_lock);
474 sys_prof_refcount_enter--; 535 sys_prof_refcount_enter--;
@@ -484,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
484 struct syscall_trace_exit *rec; 545 struct syscall_trace_exit *rec;
485 unsigned long flags; 546 unsigned long flags;
486 int syscall_nr; 547 int syscall_nr;
548 char *trace_buf;
487 char *raw_data; 549 char *raw_data;
550 int rctx;
488 int size; 551 int size;
489 int cpu; 552 int cpu;
490 553
@@ -510,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
510 573
511 /* Protect the per cpu buffer, begin the rcu read side */ 574 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags); 575 local_irq_save(flags);
576
577 rctx = perf_swevent_get_recursion_context();
578 if (rctx < 0)
579 goto end_recursion;
580
513 cpu = smp_processor_id(); 581 cpu = smp_processor_id();
514 582
515 if (in_nmi()) 583 trace_buf = rcu_dereference(perf_trace_buf);
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519 584
520 if (!raw_data) 585 if (!trace_buf)
521 goto end; 586 goto end;
522 587
523 raw_data = per_cpu_ptr(raw_data, cpu); 588 raw_data = per_cpu_ptr(trace_buf, cpu);
524 589
525 /* zero the dead bytes from align to not leak stack to user */ 590 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 591 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -528,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
528 rec = (struct syscall_trace_exit *)raw_data; 593 rec = (struct syscall_trace_exit *)raw_data;
529 594
530 tracing_generic_entry_update(&rec->ent, 0, 0); 595 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id; 596 rec->ent.type = sys_data->exit_event->id;
532 rec->nr = syscall_nr; 597 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs); 598 rec->ret = syscall_get_return_value(current, regs);
534 599
535 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 600 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
536 601
537end: 602end:
603 perf_swevent_put_recursion_context(rctx);
604end_recursion:
538 local_irq_restore(flags); 605 local_irq_restore(flags);
539} 606}
540 607
541int reg_prof_syscall_exit(char *name) 608int prof_sysexit_enable(struct ftrace_event_call *call)
542{ 609{
543 int ret = 0; 610 int ret = 0;
544 int num; 611 int num;
545 612
546 num = syscall_name_to_nr(name); 613 num = ((struct syscall_metadata *)call->data)->syscall_nr;
547 if (num < 0 || num >= NR_syscalls)
548 return -ENOSYS;
549 614
550 mutex_lock(&syscall_trace_lock); 615 mutex_lock(&syscall_trace_lock);
551 if (!sys_prof_refcount_exit) 616 if (!sys_prof_refcount_exit)
@@ -561,13 +626,11 @@ int reg_prof_syscall_exit(char *name)
561 return ret; 626 return ret;
562} 627}
563 628
564void unreg_prof_syscall_exit(char *name) 629void prof_sysexit_disable(struct ftrace_event_call *call)
565{ 630{
566 int num; 631 int num;
567 632
568 num = syscall_name_to_nr(name); 633 num = ((struct syscall_metadata *)call->data)->syscall_nr;
569 if (num < 0 || num >= NR_syscalls)
570 return;
571 634
572 mutex_lock(&syscall_trace_lock); 635 mutex_lock(&syscall_trace_lock);
573 sys_prof_refcount_exit--; 636 sys_prof_refcount_exit--;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 0314501688b9..419209893d87 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -4,7 +4,6 @@
4 */ 4 */
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/utsname.h>
8#include <linux/mman.h> 7#include <linux/mman.h>
9#include <linux/notifier.h> 8#include <linux/notifier.h>
10#include <linux/reboot.h> 9#include <linux/reboot.h>
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 000000000000..eb27fd3430a2
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,44 @@
1
2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h>
4#include <linux/sched.h>
5#include <linux/module.h>
6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8
9/*
10 * Request a notification when the current cpu returns to userspace. Must be
11 * called in atomic context. The notifier will also be called in atomic
12 * context.
13 */
14void user_return_notifier_register(struct user_return_notifier *urn)
15{
16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
17 hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
18}
19EXPORT_SYMBOL_GPL(user_return_notifier_register);
20
21/*
22 * Removes a registered user return notifier. Must be called from atomic
 23 * context, and from the same cpu the registration occurred on.
24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{
27 hlist_del(&urn->link);
28 if (hlist_empty(&__get_cpu_var(return_notifier_list)))
29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
30}
31EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
32
33/* Calls registered user return notifiers */
34void fire_user_return_notifiers(void)
35{
36 struct user_return_notifier *urn;
37 struct hlist_node *tmp1, *tmp2;
38 struct hlist_head *head;
39
40 head = &get_cpu_var(return_notifier_list);
41 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
42 urn->on_user_return(urn);
43 put_cpu_var(return_notifier_list);
44}
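
The new file is short enough to read in one pass: registration sets TIF_USER_RETURN_NOTIFY on the current task and links the notifier onto a per-CPU list, and fire_user_return_notifiers() walks that list with the _safe iterator, which tolerates a callback unregistering itself. Here is a stripped-down userspace sketch of the same register/fire/unregister flow over a plain singly linked list; every name below is made up.

#include <stdio.h>

struct notifier {
        void (*on_return)(struct notifier *n);
        struct notifier *next;
};

static struct notifier *head;

static void notifier_register(struct notifier *n)
{
        n->next = head;
        head = n;
}

static void notifier_unregister(struct notifier *n)
{
        struct notifier **pp;

        for (pp = &head; *pp; pp = &(*pp)->next) {
                if (*pp == n) {
                        *pp = n->next;
                        return;
                }
        }
}

static void fire_notifiers(void)
{
        struct notifier *n = head, *next;

        while (n) {
                next = n->next;         /* saved first, so callbacks may unregister */
                n->on_return(n);
                n = next;
        }
}

static void say_hi(struct notifier *n)
{
        (void)n;
        puts("returning to userspace (simulated)");
}

int main(void)
{
        struct notifier n = { .on_return = say_hi };

        notifier_register(&n);
        fire_notifiers();
        notifier_unregister(&n);
        return 0;
}
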
diff --git a/kernel/user.c b/kernel/user.c
index 2c000e7132ac..46d0165ca70c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -330,9 +330,9 @@ done:
330 */ 330 */
331static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
332{ 332{
333 spin_unlock_irqrestore(&uidhash_lock, flags);
334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct); 333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); 334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336} 336}
337 337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 92359cc747a7..a2cd77e70d4d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
44 */ 44 */
45static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 45static int proc_do_uts_string(ctl_table *table, int write,
46 void __user *buffer, size_t *lenp, loff_t *ppos) 46 void __user *buffer, size_t *lenp, loff_t *ppos)
47{ 47{
48 struct ctl_table uts_table; 48 struct ctl_table uts_table;
49 int r; 49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table)); 50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
54 return r; 54 return r;
55} 55}
@@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
57#define proc_do_uts_string NULL 57#define proc_do_uts_string NULL
58#endif 58#endif
59 59
60
61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen)
66{
67 struct ctl_table uts_table;
68 int r, write;
69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 put_uts(table, write, uts_table.data);
74 return r;
75}
76#else
77#define sysctl_uts_string NULL
78#endif
79
80static struct ctl_table uts_kern_table[] = { 60static struct ctl_table uts_kern_table[] = {
81 { 61 {
82 .ctl_name = KERN_OSTYPE,
83 .procname = "ostype", 62 .procname = "ostype",
84 .data = init_uts_ns.name.sysname, 63 .data = init_uts_ns.name.sysname,
85 .maxlen = sizeof(init_uts_ns.name.sysname), 64 .maxlen = sizeof(init_uts_ns.name.sysname),
86 .mode = 0444, 65 .mode = 0444,
87 .proc_handler = proc_do_uts_string, 66 .proc_handler = proc_do_uts_string,
88 .strategy = sysctl_uts_string,
89 }, 67 },
90 { 68 {
91 .ctl_name = KERN_OSRELEASE,
92 .procname = "osrelease", 69 .procname = "osrelease",
93 .data = init_uts_ns.name.release, 70 .data = init_uts_ns.name.release,
94 .maxlen = sizeof(init_uts_ns.name.release), 71 .maxlen = sizeof(init_uts_ns.name.release),
95 .mode = 0444, 72 .mode = 0444,
96 .proc_handler = proc_do_uts_string, 73 .proc_handler = proc_do_uts_string,
97 .strategy = sysctl_uts_string,
98 }, 74 },
99 { 75 {
100 .ctl_name = KERN_VERSION,
101 .procname = "version", 76 .procname = "version",
102 .data = init_uts_ns.name.version, 77 .data = init_uts_ns.name.version,
103 .maxlen = sizeof(init_uts_ns.name.version), 78 .maxlen = sizeof(init_uts_ns.name.version),
104 .mode = 0444, 79 .mode = 0444,
105 .proc_handler = proc_do_uts_string, 80 .proc_handler = proc_do_uts_string,
106 .strategy = sysctl_uts_string,
107 }, 81 },
108 { 82 {
109 .ctl_name = KERN_NODENAME,
110 .procname = "hostname", 83 .procname = "hostname",
111 .data = init_uts_ns.name.nodename, 84 .data = init_uts_ns.name.nodename,
112 .maxlen = sizeof(init_uts_ns.name.nodename), 85 .maxlen = sizeof(init_uts_ns.name.nodename),
113 .mode = 0644, 86 .mode = 0644,
114 .proc_handler = proc_do_uts_string, 87 .proc_handler = proc_do_uts_string,
115 .strategy = sysctl_uts_string,
116 }, 88 },
117 { 89 {
118 .ctl_name = KERN_DOMAINNAME,
119 .procname = "domainname", 90 .procname = "domainname",
120 .data = init_uts_ns.name.domainname, 91 .data = init_uts_ns.name.domainname,
121 .maxlen = sizeof(init_uts_ns.name.domainname), 92 .maxlen = sizeof(init_uts_ns.name.domainname),
122 .mode = 0644, 93 .mode = 0644,
123 .proc_handler = proc_do_uts_string, 94 .proc_handler = proc_do_uts_string,
124 .strategy = sysctl_uts_string,
125 }, 95 },
126 {} 96 {}
127}; 97};
128 98
129static struct ctl_table uts_root_table[] = { 99static struct ctl_table uts_root_table[] = {
130 { 100 {
131 .ctl_name = CTL_KERN,
132 .procname = "kernel", 101 .procname = "kernel",
133 .mode = 0555, 102 .mode = 0555,
134 .child = uts_kern_table, 103 .child = uts_kern_table,
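
With the binary-sysctl ctl_name/strategy machinery removed, proc_do_uts_string is the only handler left on these entries, and its trick is unchanged: copy the ctl_table descriptor, repoint .data at the live UTS string for the current namespace, and hand off to the generic proc_dostring. The userspace sketch below illustrates that copy-and-redirect pattern with an invented descriptor type; it is an analogy, not the sysctl API.

#include <stdio.h>
#include <string.h>

struct entry {
        const char *name;
        char *data;                     /* default, compile-time buffer */
        size_t maxlen;
};

static char live_hostname[64] = "build-box";

/* Generic handler: reads whatever buffer the entry points at. */
static void handle_string(const struct entry *e)
{
        printf("%s = %.*s\n", e->name, (int)e->maxlen, e->data);
}

/* Wrapper: copy the descriptor, swap in the live data, delegate. */
static void handle_uts_string(const struct entry *e)
{
        struct entry tmp;

        memcpy(&tmp, e, sizeof(tmp));
        tmp.data = live_hostname;
        handle_string(&tmp);
}

int main(void)
{
        static char boot_name[64] = "boot-time-name";
        struct entry hostname = { "hostname", boot_name, sizeof(boot_name) };

        handle_uts_string(&hostname);
        return 0;
}
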
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b1..dee48658805c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
68#endif 68#endif
69}; 69};
70 70
71#ifdef CONFIG_DEBUG_OBJECTS_WORK
72
73static struct debug_obj_descr work_debug_descr;
74
75/*
76 * fixup_init is called when:
77 * - an active object is initialized
78 */
79static int work_fixup_init(void *addr, enum debug_obj_state state)
80{
81 struct work_struct *work = addr;
82
83 switch (state) {
84 case ODEBUG_STATE_ACTIVE:
85 cancel_work_sync(work);
86 debug_object_init(work, &work_debug_descr);
87 return 1;
88 default:
89 return 0;
90 }
91}
92
93/*
94 * fixup_activate is called when:
95 * - an active object is activated
96 * - an unknown object is activated (might be a statically initialized object)
97 */
98static int work_fixup_activate(void *addr, enum debug_obj_state state)
99{
100 struct work_struct *work = addr;
101
102 switch (state) {
103
104 case ODEBUG_STATE_NOTAVAILABLE:
105 /*
106 * This is not really a fixup. The work struct was
107 * statically initialized. We just make sure that it
108 * is tracked in the object tracker.
109 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr);
113 return 0;
114 }
115 WARN_ON_ONCE(1);
116 return 0;
117
118 case ODEBUG_STATE_ACTIVE:
119 WARN_ON(1);
120
121 default:
122 return 0;
123 }
124}
125
126/*
127 * fixup_free is called when:
128 * - an active object is freed
129 */
130static int work_fixup_free(void *addr, enum debug_obj_state state)
131{
132 struct work_struct *work = addr;
133
134 switch (state) {
135 case ODEBUG_STATE_ACTIVE:
136 cancel_work_sync(work);
137 debug_object_free(work, &work_debug_descr);
138 return 1;
139 default:
140 return 0;
141 }
142}
143
144static struct debug_obj_descr work_debug_descr = {
145 .name = "work_struct",
146 .fixup_init = work_fixup_init,
147 .fixup_activate = work_fixup_activate,
148 .fixup_free = work_fixup_free,
149};
150
151static inline void debug_work_activate(struct work_struct *work)
152{
153 debug_object_activate(work, &work_debug_descr);
154}
155
156static inline void debug_work_deactivate(struct work_struct *work)
157{
158 debug_object_deactivate(work, &work_debug_descr);
159}
160
161void __init_work(struct work_struct *work, int onstack)
162{
163 if (onstack)
164 debug_object_init_on_stack(work, &work_debug_descr);
165 else
166 debug_object_init(work, &work_debug_descr);
167}
168EXPORT_SYMBOL_GPL(__init_work);
169
170void destroy_work_on_stack(struct work_struct *work)
171{
172 debug_object_free(work, &work_debug_descr);
173}
174EXPORT_SYMBOL_GPL(destroy_work_on_stack);
175
176#else
177static inline void debug_work_activate(struct work_struct *work) { }
178static inline void debug_work_deactivate(struct work_struct *work) { }
179#endif
180
71/* Serializes the accesses to the list of workqueues. */ 181/* Serializes the accesses to the list of workqueues. */
72static DEFINE_SPINLOCK(workqueue_lock); 182static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 183static LIST_HEAD(workqueues);
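
The CONFIG_DEBUG_OBJECTS_WORK block above boils down to a small state machine: every work item is conceptually untracked, initialized, or active; the debug layer is told about each transition; and the fixup callbacks repair illegal ones, for instance cancelling a still-active item before it is re-initialized or freed. The toy sketch below models that state machine in userspace; the enum and helpers are invented and are not the debugobjects API.

#include <stdio.h>

enum obj_state { ST_NOTAVAILABLE, ST_INITIALIZED, ST_ACTIVE };

struct tracked_work {
        enum obj_state state;
        const char *name;
};

static void cancel_work(struct tracked_work *w)
{
        printf("fixup: cancelling active work '%s'\n", w->name);
        w->state = ST_INITIALIZED;
}

static void debug_init(struct tracked_work *w)
{
        if (w->state == ST_ACTIVE)      /* cf. work_fixup_init() */
                cancel_work(w);
        w->state = ST_INITIALIZED;
}

static void debug_activate(struct tracked_work *w)
{
        if (w->state == ST_NOTAVAILABLE)
                w->state = ST_INITIALIZED;      /* start tracking a static object */
        w->state = ST_ACTIVE;
}

static void debug_free(struct tracked_work *w)
{
        if (w->state == ST_ACTIVE)      /* cf. work_fixup_free() */
                cancel_work(w);
        w->state = ST_NOTAVAILABLE;
}

int main(void)
{
        struct tracked_work w = { ST_NOTAVAILABLE, "demo" };

        debug_init(&w);
        debug_activate(&w);
        debug_init(&w);                 /* re-init while active: fixed up */
        debug_free(&w);
        return 0;
}
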
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
145{ 255{
146 unsigned long flags; 256 unsigned long flags;
147 257
258 debug_work_activate(work);
148 spin_lock_irqsave(&cwq->lock, flags); 259 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, &cwq->worklist); 260 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 261 spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
280 struct lockdep_map lockdep_map = work->lockdep_map; 391 struct lockdep_map lockdep_map = work->lockdep_map;
281#endif 392#endif
282 trace_workqueue_execution(cwq->thread, work); 393 trace_workqueue_execution(cwq->thread, work);
394 debug_work_deactivate(work);
283 cwq->current_work = work; 395 cwq->current_work = work;
284 list_del_init(cwq->worklist.next); 396 list_del_init(cwq->worklist.next);
285 spin_unlock_irq(&cwq->lock); 397 spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
350static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 462static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
351 struct wq_barrier *barr, struct list_head *head) 463 struct wq_barrier *barr, struct list_head *head)
352{ 464{
353 INIT_WORK(&barr->work, wq_barrier_func); 465 /*
466 * debugobject calls are safe here even with cwq->lock locked
467 * as we know for sure that this will not trigger any of the
468 * checks and call back into the fixup functions where we
469 * might deadlock.
470 */
471 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
354 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 472 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
355 473
356 init_completion(&barr->done); 474 init_completion(&barr->done);
357 475
476 debug_work_activate(&barr->work);
358 insert_work(cwq, &barr->work, head); 477 insert_work(cwq, &barr->work, head);
359} 478}
360 479
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
372 } 491 }
373 spin_unlock_irq(&cwq->lock); 492 spin_unlock_irq(&cwq->lock);
374 493
375 if (active) 494 if (active) {
376 wait_for_completion(&barr.done); 495 wait_for_completion(&barr.done);
496 destroy_work_on_stack(&barr.work);
497 }
377 498
378 return active; 499 return active;
379} 500}
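
flush_cpu_workqueue()'s barrier works by queueing a dummy work item from the caller's stack behind everything already on the list, sleeping on a completion that the dummy signals when it finally runs, and then tearing the on-stack item down with destroy_work_on_stack() so the debug-object tracking stays consistent. A rough pthread analogy of the completion half follows (the queueing itself is not modelled, all names are invented; build with cc -pthread).

#include <pthread.h>
#include <stdio.h>

/* Minimal "completion": a flag guarded by a mutex and a condvar. */
struct completion {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int done;
};

static void init_completion(struct completion *c)
{
        pthread_mutex_init(&c->lock, NULL);
        pthread_cond_init(&c->cond, NULL);
        c->done = 0;
}

static void complete(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = 1;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

/* Stand-in for the barrier work item running on the workqueue thread. */
static void *barrier_fn(void *arg)
{
        puts("earlier work has drained; waking the flusher");
        complete(arg);
        return NULL;
}

int main(void)
{
        struct completion done;         /* lives on the flusher's stack */
        pthread_t worker;

        init_completion(&done);
        pthread_create(&worker, NULL, barrier_fn, &done);
        wait_for_completion(&done);     /* like wait_for_completion(&barr.done) */
        pthread_join(worker, NULL);
        return 0;
}
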
@@ -451,6 +572,7 @@ out:
451 return 0; 572 return 0;
452 573
453 wait_for_completion(&barr.done); 574 wait_for_completion(&barr.done);
575 destroy_work_on_stack(&barr.work);
454 return 1; 576 return 1;
455} 577}
456EXPORT_SYMBOL_GPL(flush_work); 578EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
485 */ 607 */
486 smp_rmb(); 608 smp_rmb();
487 if (cwq == get_wq_data(work)) { 609 if (cwq == get_wq_data(work)) {
610 debug_work_deactivate(work);
488 list_del_init(&work->entry); 611 list_del_init(&work->entry);
489 ret = 1; 612 ret = 1;
490 } 613 }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
507 } 630 }
508 spin_unlock_irq(&cwq->lock); 631 spin_unlock_irq(&cwq->lock);
509 632
510 if (unlikely(running)) 633 if (unlikely(running)) {
511 wait_for_completion(&barr.done); 634 wait_for_completion(&barr.done);
635 destroy_work_on_stack(&barr.work);
636 }
512} 637}
513 638
514static void wait_on_work(struct work_struct *work) 639static void wait_on_work(struct work_struct *work)
@@ -640,6 +765,24 @@ int schedule_delayed_work(struct delayed_work *dwork,
640EXPORT_SYMBOL(schedule_delayed_work); 765EXPORT_SYMBOL(schedule_delayed_work);
641 766
642/** 767/**
768 * flush_delayed_work - block until a dwork_struct's callback has terminated
769 * @dwork: the delayed work which is to be flushed
770 *
771 * Any timeout is cancelled, and any pending work is run immediately.
772 */
773void flush_delayed_work(struct delayed_work *dwork)
774{
775 if (del_timer_sync(&dwork->timer)) {
776 struct cpu_workqueue_struct *cwq;
777 cwq = wq_per_cpu(keventd_wq, get_cpu());
778 __queue_work(cwq, &dwork->work);
779 put_cpu();
780 }
781 flush_work(&dwork->work);
782}
783EXPORT_SYMBOL(flush_delayed_work);
784
785/**
643 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 786 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
644 * @cpu: cpu to use 787 * @cpu: cpu to use
645 * @dwork: job to be done 788 * @dwork: job to be done
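
flush_delayed_work() is aimed at teardown paths: rather than waiting out the timer, the pending callback is pulled forward, run, and waited for before the caller continues. Below is a hedged sketch of how a module might use the new export; the module scaffolding and names are invented, and only schedule_delayed_work() and flush_delayed_work() are taken from the code above.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void demo_fn(struct work_struct *work)
{
        pr_info("demo: delayed work ran\n");
}

static DECLARE_DELAYED_WORK(demo_dwork, demo_fn);

static int __init demo_init(void)
{
        /* Arm the work ten seconds out; normally its timer would fire it. */
        schedule_delayed_work(&demo_dwork, msecs_to_jiffies(10 * 1000));
        return 0;
}

static void __exit demo_exit(void)
{
        /* Cancel the timer, run any pending callback now, and wait for it. */
        flush_delayed_work(&demo_dwork);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
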
@@ -667,6 +810,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
667int schedule_on_each_cpu(work_func_t func) 810int schedule_on_each_cpu(work_func_t func)
668{ 811{
669 int cpu; 812 int cpu;
813 int orig = -1;
670 struct work_struct *works; 814 struct work_struct *works;
671 815
672 works = alloc_percpu(struct work_struct); 816 works = alloc_percpu(struct work_struct);
@@ -674,14 +818,28 @@ int schedule_on_each_cpu(work_func_t func)
674 return -ENOMEM; 818 return -ENOMEM;
675 819
676 get_online_cpus(); 820 get_online_cpus();
821
822 /*
823 * When running in keventd don't schedule a work item on
 824 * itself. We can just call it directly because the work queue is
 825 * already bound. This is also faster.
826 */
827 if (current_is_keventd())
828 orig = raw_smp_processor_id();
829
677 for_each_online_cpu(cpu) { 830 for_each_online_cpu(cpu) {
678 struct work_struct *work = per_cpu_ptr(works, cpu); 831 struct work_struct *work = per_cpu_ptr(works, cpu);
679 832
680 INIT_WORK(work, func); 833 INIT_WORK(work, func);
681 schedule_work_on(cpu, work); 834 if (cpu != orig)
835 schedule_work_on(cpu, work);
682 } 836 }
837 if (orig >= 0)
838 func(per_cpu_ptr(works, orig));
839
683 for_each_online_cpu(cpu) 840 for_each_online_cpu(cpu)
684 flush_work(per_cpu_ptr(works, cpu)); 841 flush_work(per_cpu_ptr(works, cpu));
842
685 put_online_cpus(); 843 put_online_cpus();
686 free_percpu(works); 844 free_percpu(works);
687 return 0; 845 return 0;