author     Andrea Bastoni <bastoni@cs.unc.edu>    2010-10-23 01:01:49 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>    2010-10-23 01:01:49 -0400
commit     3dd41424090a0ca3a660218d06afe6ff4441bad3
tree       511ef1bb1799027fc5aad574adce49120ecadd87 /kernel
parent     5c5456402d467969b217d7fdd6670f8c8600f5a8
parent     f6f94e2ab1b33f0082ac22d71f66385a60d8157f
Merge commit 'v2.6.36' into wip-merge-2.6.36

Conflicts:
    Makefile
    arch/x86/include/asm/unistd_32.h
    arch/x86/kernel/syscall_table_32.S
    kernel/sched.c
    kernel/time/tick-sched.c

Relevant API and function changes (solved in this commit):

- (API) .enqueue_task() (enqueue_task_litmus), dequeue_task()
  (dequeue_task_litmus) [litmus/sched_litmus.c]
- (API) .select_task_rq() (select_task_rq_litmus) [litmus/sched_litmus.c]
- (API) sysrq_dump_trace_buffer() and sysrq_handle_kill_rt_tasks()
  [litmus/sched_trace.c]
- struct kfifo internal buffer name changed (buffer -> buf)
  [litmus/sched_trace.c]
- add_wait_queue_exclusive_locked -> __add_wait_queue_tail_exclusive
  [litmus/fmlp.c]
- syscall numbers for both x86_32 and x86_64
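
Editor's note on the wait-queue item above: a minimal illustrative sketch of the
kind of call-site update implied by the add_wait_queue_exclusive_locked ->
__add_wait_queue_tail_exclusive rename follows. The struct fmlp_sem, its wait
field, and fmlp_block_on() are hypothetical stand-ins only (the real change is
in litmus/fmlp.c, which lies outside this kernel/-limited diff); both helpers
add an exclusive waiter at the tail of the queue and assume the caller already
holds the wait-queue spinlock.

#include <linux/wait.h>
#include <linux/spinlock.h>

/* Hypothetical stand-in for the FMLP semaphore used in litmus/fmlp.c. */
struct fmlp_sem {
	wait_queue_head_t wait;
};

static void fmlp_block_on(struct fmlp_sem *sem, wait_queue_t *wq_entry)
{
	unsigned long flags;

	spin_lock_irqsave(&sem->wait.lock, flags);
	/* pre-2.6.36 name: add_wait_queue_exclusive_locked(&sem->wait, wq_entry); */
	__add_wait_queue_tail_exclusive(&sem->wait, wq_entry);
	spin_unlock_irqrestore(&sem->wait.lock, flags);
}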
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 13
-rw-r--r--  kernel/acct.c | 22
-rw-r--r--  kernel/async.c | 141
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit.h | 26
-rw-r--r--  kernel/audit_tree.c | 237
-rw-r--r--  kernel/audit_watch.c | 274
-rw-r--r--  kernel/auditfilter.c | 39
-rw-r--r--  kernel/auditsc.c | 19
-rw-r--r--  kernel/capability.c | 1
-rw-r--r--  kernel/cgroup.c | 48
-rw-r--r--  kernel/cgroup_freezer.c | 21
-rw-r--r--  kernel/compat.c | 63
-rw-r--r--  kernel/cpu.c | 165
-rw-r--r--  kernel/cpuset.c | 168
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 94
-rw-r--r--  kernel/debug/Makefile | 6
-rw-r--r--  kernel/debug/debug_core.c | 985
-rw-r--r--  kernel/debug/debug_core.h | 81
-rw-r--r--  kernel/debug/gdbstub.c | 1095
-rw-r--r--  kernel/debug/kdb/.gitignore | 1
-rw-r--r--  kernel/debug/kdb/Makefile | 25
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 562
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 210
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 35
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 169
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 826
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 212
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2956
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 305
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 927
-rw-r--r--  kernel/early_res.c | 6
-rw-r--r--  kernel/exec_domain.c | 38
-rw-r--r--  kernel/exit.c | 55
-rw-r--r--  kernel/fork.c | 75
-rw-r--r--  kernel/futex.c | 17
-rw-r--r--  kernel/gcov/fs.c | 244
-rw-r--r--  kernel/groups.c | 11
-rw-r--r--  kernel/hrtimer.c | 102
-rw-r--r--  kernel/hw_breakpoint.c | 267
-rw-r--r--  kernel/irq/handle.c | 3
-rw-r--r--  kernel/irq/manage.c | 94
-rw-r--r--  kernel/irq/proc.c | 60
-rw-r--r--  kernel/kallsyms.c | 21
-rw-r--r--  kernel/kexec.c | 15
-rw-r--r--  kernel/kfifo.c | 751
-rw-r--r--  kernel/kgdb.c | 1764
-rw-r--r--  kernel/kmod.c | 197
-rw-r--r--  kernel/kprobes.c | 132
-rw-r--r--  kernel/ksysfs.c | 3
-rw-r--r--  kernel/kthread.c | 164
-rw-r--r--  kernel/lockdep.c | 95
-rw-r--r--  kernel/lockdep_internals.h | 72
-rw-r--r--  kernel/lockdep_proc.c | 58
-rw-r--r--  kernel/module.c | 1386
-rw-r--r--  kernel/mutex.c | 30
-rw-r--r--  kernel/padata.c | 882
-rw-r--r--  kernel/panic.c | 87
-rw-r--r--  kernel/params.c | 233
-rw-r--r--  kernel/perf_event.c | 1402
-rw-r--r--  kernel/pid.c | 63
-rw-r--r--  kernel/pm_qos_params.c | 341
-rw-r--r--  kernel/posix-cpu-timers.c | 348
-rw-r--r--  kernel/posix-timers.c | 22
-rw-r--r--  kernel/power/Kconfig | 9
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/block_io.c | 103
-rw-r--r--  kernel/power/hibernate.c | 26
-rw-r--r--  kernel/power/main.c | 55
-rw-r--r--  kernel/power/nvs.c (renamed from kernel/power/hibernate_nvs.c) | 24
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 21
-rw-r--r--  kernel/power/snapshot.c | 232
-rw-r--r--  kernel/power/suspend.c | 19
-rw-r--r--  kernel/power/swap.c | 337
-rw-r--r--  kernel/power/user.c | 37
-rw-r--r--  kernel/printk.c | 68
-rw-r--r--  kernel/profile.c | 8
-rw-r--r--  kernel/ptrace.c | 50
-rw-r--r--  kernel/range.c | 4
-rw-r--r--  kernel/rcupdate.c | 179
-rw-r--r--  kernel/rcutiny.c | 37
-rw-r--r--  kernel/rcutiny_plugin.h | 39
-rw-r--r--  kernel/rcutorture.c | 7
-rw-r--r--  kernel/rcutree.c | 133
-rw-r--r--  kernel/rcutree.h | 2
-rw-r--r--  kernel/rcutree_plugin.h | 69
-rw-r--r--  kernel/rcutree_trace.c | 4
-rw-r--r--  kernel/relay.c | 17
-rw-r--r--  kernel/resource.c | 16
-rw-r--r--  kernel/sched.c | 1350
-rw-r--r--  kernel/sched_clock.c | 96
-rw-r--r--  kernel/sched_cpupri.c | 8
-rw-r--r--  kernel/sched_cpupri.h | 2
-rw-r--r--  kernel/sched_debug.c | 120
-rw-r--r--  kernel/sched_fair.c | 913
-rw-r--r--  kernel/sched_features.h | 55
-rw-r--r--  kernel/sched_idletask.c | 8
-rw-r--r--  kernel/sched_rt.c | 18
-rw-r--r--  kernel/sched_stats.h | 27
-rw-r--r--  kernel/signal.c | 80
-rw-r--r--  kernel/slow-work-debugfs.c | 227
-rw-r--r--  kernel/slow-work.c | 1068
-rw-r--r--  kernel/slow-work.h | 72
-rw-r--r--  kernel/smp.c | 19
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/softlockup.c | 293
-rw-r--r--  kernel/stop_machine.c | 537
-rw-r--r--  kernel/sys.c | 241
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  kernel/sysctl.c | 696
-rw-r--r--  kernel/sysctl_binary.c | 10
-rw-r--r--  kernel/sysctl_check.c | 9
-rw-r--r--  kernel/time.c | 27
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/clocksource.c | 71
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 93
-rw-r--r--  kernel/time/timekeeping.c | 128
-rw-r--r--  kernel/time/timer_list.c | 1
-rw-r--r--  kernel/timer.c | 205
-rw-r--r--  kernel/trace/Kconfig | 94
-rw-r--r--  kernel/trace/Makefile | 8
-rw-r--r--  kernel/trace/blktrace.c | 228
-rw-r--r--  kernel/trace/ftrace.c | 60
-rw-r--r--  kernel/trace/kmemtrace.c | 511
-rw-r--r--  kernel/trace/ring_buffer.c | 241
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 5
-rw-r--r--  kernel/trace/trace.c | 391
-rw-r--r--  kernel/trace/trace.h | 163
-rw-r--r--  kernel/trace/trace_boot.c | 185
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 7
-rw-r--r--  kernel/trace/trace_entries.h | 106
-rw-r--r--  kernel/trace/trace_event_perf.c | 184
-rw-r--r--  kernel/trace/trace_events.c | 481
-rw-r--r--  kernel/trace/trace_events_filter.c | 47
-rw-r--r--  kernel/trace/trace_export.c | 20
-rw-r--r--  kernel/trace/trace_functions.c | 6
-rw-r--r--  kernel/trace/trace_functions_graph.c | 189
-rw-r--r--  kernel/trace/trace_hw_branches.c | 312
-rw-r--r--  kernel/trace/trace_irqsoff.c | 274
-rw-r--r--  kernel/trace/trace_kdb.c | 136
-rw-r--r--  kernel/trace/trace_kprobe.c | 904
-rw-r--r--  kernel/trace/trace_ksym.c | 520
-rw-r--r--  kernel/trace/trace_output.c | 198
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 21
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 36
-rw-r--r--  kernel/trace/trace_selftest.c | 151
-rw-r--r--  kernel/trace/trace_stack.c | 8
-rw-r--r--  kernel/trace/trace_syscalls.c | 147
-rw-r--r--  kernel/trace/trace_sysprof.c | 329
-rw-r--r--  kernel/trace/trace_workqueue.c | 26
-rw-r--r--  kernel/tracepoint.c | 91
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  kernel/user_namespace.c | 48
-rw-r--r--  kernel/watchdog.c | 577
-rw-r--r--  kernel/workqueue.c | 3255
-rw-r--r--  kernel/workqueue_sched.h | 9
163 files changed, 23472 insertions, 13227 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b5..0b72d1a74be0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,16 +68,17 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 68obj-$(CONFIG_PID_NS) += pid_namespace.o
69obj-$(CONFIG_IKCONFIG) += configs.o 69obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
75obj-$(CONFIG_GCOV_KERNEL) += gcov/ 75obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
77obj-$(CONFIG_GCOV_KERNEL) += gcov/
77obj-$(CONFIG_KPROBES) += kprobes.o 78obj-$(CONFIG_KPROBES) += kprobes.o
78obj-$(CONFIG_KGDB) += kgdb.o 79obj-$(CONFIG_KGDB) += debug/
79obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
81obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
81obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 82obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
82obj-$(CONFIG_SECCOMP) += seccomp.o 83obj-$(CONFIG_SECCOMP) += seccomp.o
83obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -99,8 +100,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 100obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 101obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
102obj-$(CONFIG_SLOW_WORK) += slow-work.o
103obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
104obj-$(CONFIG_PERF_EVENTS) += perf_event.o 103obj-$(CONFIG_PERF_EVENTS) += perf_event.o
105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 104obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 105obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/acct.c b/kernel/acct.c
index e4c0e1fee9b0..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
122 spin_unlock(&acct_lock); 122 spin_unlock(&acct_lock);
123 123
124 /* May block */ 124 /* May block */
125 if (vfs_statfs(file->f_path.dentry, &sbuf)) 125 if (vfs_statfs(&file->f_path, &sbuf))
126 return res; 126 return res;
127 suspend = sbuf.f_blocks * SUSPEND; 127 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 128 resume = sbuf.f_blocks * RESUME;
@@ -216,7 +216,6 @@ static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt; 218 struct vfsmount *mnt;
219 int error;
220 struct pid_namespace *ns; 219 struct pid_namespace *ns;
221 struct bsd_acct_struct *acct = NULL; 220 struct bsd_acct_struct *acct = NULL;
222 221
@@ -244,13 +243,6 @@ static int acct_on(char *name)
244 } 243 }
245 } 244 }
246 245
247 error = security_acct(file);
248 if (error) {
249 kfree(acct);
250 filp_close(file, NULL);
251 return error;
252 }
253
254 spin_lock(&acct_lock); 246 spin_lock(&acct_lock);
255 if (ns->bacct == NULL) { 247 if (ns->bacct == NULL) {
256 ns->bacct = acct; 248 ns->bacct = acct;
@@ -281,7 +273,7 @@ static int acct_on(char *name)
281 */ 273 */
282SYSCALL_DEFINE1(acct, const char __user *, name) 274SYSCALL_DEFINE1(acct, const char __user *, name)
283{ 275{
284 int error; 276 int error = 0;
285 277
286 if (!capable(CAP_SYS_PACCT)) 278 if (!capable(CAP_SYS_PACCT))
287 return -EPERM; 279 return -EPERM;
@@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
299 if (acct == NULL) 291 if (acct == NULL)
300 return 0; 292 return 0;
301 293
302 error = security_acct(NULL); 294 spin_lock(&acct_lock);
303 if (!error) { 295 acct_file_reopen(acct, NULL, NULL);
304 spin_lock(&acct_lock); 296 spin_unlock(&acct_lock);
305 acct_file_reopen(acct, NULL, NULL);
306 spin_unlock(&acct_lock);
307 }
308 } 297 }
298
309 return error; 299 return error;
310} 300}
311 301
diff --git a/kernel/async.c b/kernel/async.c
index 15319d6c18fe..cd9dbb913c77 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/bug.h>
53#include <linux/module.h> 52#include <linux/module.h>
54#include <linux/wait.h> 53#include <linux/wait.h>
55#include <linux/sched.h> 54#include <linux/sched.h>
56#include <linux/init.h>
57#include <linux/kthread.h>
58#include <linux/delay.h>
59#include <linux/slab.h> 55#include <linux/slab.h>
56#include <linux/workqueue.h>
60#include <asm/atomic.h> 57#include <asm/atomic.h>
61 58
62static async_cookie_t next_cookie = 1; 59static async_cookie_t next_cookie = 1;
63 60
64#define MAX_THREADS 256
65#define MAX_WORK 32768 61#define MAX_WORK 32768
66 62
67static LIST_HEAD(async_pending); 63static LIST_HEAD(async_pending);
68static LIST_HEAD(async_running); 64static LIST_HEAD(async_running);
69static DEFINE_SPINLOCK(async_lock); 65static DEFINE_SPINLOCK(async_lock);
70 66
71static int async_enabled = 0;
72
73struct async_entry { 67struct async_entry {
74 struct list_head list; 68 struct list_head list;
75 async_cookie_t cookie; 69 struct work_struct work;
76 async_func_ptr *func; 70 async_cookie_t cookie;
77 void *data; 71 async_func_ptr *func;
78 struct list_head *running; 72 void *data;
73 struct list_head *running;
79}; 74};
80 75
81static DECLARE_WAIT_QUEUE_HEAD(async_done); 76static DECLARE_WAIT_QUEUE_HEAD(async_done);
82static DECLARE_WAIT_QUEUE_HEAD(async_new);
83 77
84static atomic_t entry_count; 78static atomic_t entry_count;
85static atomic_t thread_count;
86 79
87extern int initcall_debug; 80extern int initcall_debug;
88 81
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running)
117 spin_unlock_irqrestore(&async_lock, flags); 110 spin_unlock_irqrestore(&async_lock, flags);
118 return ret; 111 return ret;
119} 112}
113
120/* 114/*
121 * pick the first pending entry and run it 115 * pick the first pending entry and run it
122 */ 116 */
123static void run_one_entry(void) 117static void async_run_entry_fn(struct work_struct *work)
124{ 118{
119 struct async_entry *entry =
120 container_of(work, struct async_entry, work);
125 unsigned long flags; 121 unsigned long flags;
126 struct async_entry *entry;
127 ktime_t calltime, delta, rettime; 122 ktime_t calltime, delta, rettime;
128 123
129 /* 1) pick one task from the pending queue */ 124 /* 1) move self to the running queue */
130
131 spin_lock_irqsave(&async_lock, flags); 125 spin_lock_irqsave(&async_lock, flags);
132 if (list_empty(&async_pending))
133 goto out;
134 entry = list_first_entry(&async_pending, struct async_entry, list);
135
136 /* 2) move it to the running queue */
137 list_move_tail(&entry->list, entry->running); 126 list_move_tail(&entry->list, entry->running);
138 spin_unlock_irqrestore(&async_lock, flags); 127 spin_unlock_irqrestore(&async_lock, flags);
139 128
140 /* 3) run it (and print duration)*/ 129 /* 2) run (and print duration) */
141 if (initcall_debug && system_state == SYSTEM_BOOTING) { 130 if (initcall_debug && system_state == SYSTEM_BOOTING) {
142 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
143 entry->func, task_pid_nr(current)); 132 entry->func, task_pid_nr(current));
@@ -153,31 +142,25 @@ static void run_one_entry(void)
153 (long long)ktime_to_ns(delta) >> 10); 142 (long long)ktime_to_ns(delta) >> 10);
154 } 143 }
155 144
156 /* 4) remove it from the running queue */ 145 /* 3) remove self from the running queue */
157 spin_lock_irqsave(&async_lock, flags); 146 spin_lock_irqsave(&async_lock, flags);
158 list_del(&entry->list); 147 list_del(&entry->list);
159 148
160 /* 5) free the entry */ 149 /* 4) free the entry */
161 kfree(entry); 150 kfree(entry);
162 atomic_dec(&entry_count); 151 atomic_dec(&entry_count);
163 152
164 spin_unlock_irqrestore(&async_lock, flags); 153 spin_unlock_irqrestore(&async_lock, flags);
165 154
166 /* 6) wake up any waiters. */ 155 /* 5) wake up any waiters */
167 wake_up(&async_done); 156 wake_up(&async_done);
168 return;
169
170out:
171 spin_unlock_irqrestore(&async_lock, flags);
172} 157}
173 158
174
175static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) 159static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
176{ 160{
177 struct async_entry *entry; 161 struct async_entry *entry;
178 unsigned long flags; 162 unsigned long flags;
179 async_cookie_t newcookie; 163 async_cookie_t newcookie;
180
181 164
182 /* allow irq-off callers */ 165 /* allow irq-off callers */
183 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); 166 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
186 * If we're out of memory or if there's too much work 169 * If we're out of memory or if there's too much work
187 * pending already, we execute synchronously. 170 * pending already, we execute synchronously.
188 */ 171 */
189 if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { 172 if (!entry || atomic_read(&entry_count) > MAX_WORK) {
190 kfree(entry); 173 kfree(entry);
191 spin_lock_irqsave(&async_lock, flags); 174 spin_lock_irqsave(&async_lock, flags);
192 newcookie = next_cookie++; 175 newcookie = next_cookie++;
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
196 ptr(data, newcookie); 179 ptr(data, newcookie);
197 return newcookie; 180 return newcookie;
198 } 181 }
182 INIT_WORK(&entry->work, async_run_entry_fn);
199 entry->func = ptr; 183 entry->func = ptr;
200 entry->data = data; 184 entry->data = data;
201 entry->running = running; 185 entry->running = running;
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
205 list_add_tail(&entry->list, &async_pending); 189 list_add_tail(&entry->list, &async_pending);
206 atomic_inc(&entry_count); 190 atomic_inc(&entry_count);
207 spin_unlock_irqrestore(&async_lock, flags); 191 spin_unlock_irqrestore(&async_lock, flags);
208 wake_up(&async_new); 192
193 /* schedule for execution */
194 queue_work(system_unbound_wq, &entry->work);
195
209 return newcookie; 196 return newcookie;
210} 197}
211 198
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie)
312 async_synchronize_cookie_domain(cookie, &async_running); 299 async_synchronize_cookie_domain(cookie, &async_running);
313} 300}
314EXPORT_SYMBOL_GPL(async_synchronize_cookie); 301EXPORT_SYMBOL_GPL(async_synchronize_cookie);
315
316
317static int async_thread(void *unused)
318{
319 DECLARE_WAITQUEUE(wq, current);
320 add_wait_queue(&async_new, &wq);
321
322 while (!kthread_should_stop()) {
323 int ret = HZ;
324 set_current_state(TASK_INTERRUPTIBLE);
325 /*
326 * check the list head without lock.. false positives
327 * are dealt with inside run_one_entry() while holding
328 * the lock.
329 */
330 rmb();
331 if (!list_empty(&async_pending))
332 run_one_entry();
333 else
334 ret = schedule_timeout(HZ);
335
336 if (ret == 0) {
337 /*
338 * we timed out, this means we as thread are redundant.
339 * we sign off and die, but we to avoid any races there
340 * is a last-straw check to see if work snuck in.
341 */
342 atomic_dec(&thread_count);
343 wmb(); /* manager must see our departure first */
344 if (list_empty(&async_pending))
345 break;
346 /*
347 * woops work came in between us timing out and us
348 * signing off; we need to stay alive and keep working.
349 */
350 atomic_inc(&thread_count);
351 }
352 }
353 remove_wait_queue(&async_new, &wq);
354
355 return 0;
356}
357
358static int async_manager_thread(void *unused)
359{
360 DECLARE_WAITQUEUE(wq, current);
361 add_wait_queue(&async_new, &wq);
362
363 while (!kthread_should_stop()) {
364 int tc, ec;
365
366 set_current_state(TASK_INTERRUPTIBLE);
367
368 tc = atomic_read(&thread_count);
369 rmb();
370 ec = atomic_read(&entry_count);
371
372 while (tc < ec && tc < MAX_THREADS) {
373 if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
374 tc))) {
375 msleep(100);
376 continue;
377 }
378 atomic_inc(&thread_count);
379 tc++;
380 }
381
382 schedule();
383 }
384 remove_wait_queue(&async_new, &wq);
385
386 return 0;
387}
388
389static int __init async_init(void)
390{
391 async_enabled =
392 !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
393
394 WARN_ON(!async_enabled);
395 return 0;
396}
397
398core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#include <linux/netlink.h> 58#include <linux/netlink.h>
59#include <linux/inotify.h>
60#include <linux/freezer.h> 59#include <linux/freezer.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
62 61
@@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
407 audit_hold_skb(skb); 406 audit_hold_skb(skb);
408 } else 407 } else
409 /* drop the extra reference if sent ok */ 408 /* drop the extra reference if sent ok */
410 kfree_skb(skb); 409 consume_skb(skb);
411} 410}
412 411
413static int kauditd_thread(void *dummy) 412static int kauditd_thread(void *dummy)
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
103extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
104extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
105 105
106extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
107
106/* audit watch functions */ 108/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch); 109#ifdef CONFIG_AUDIT_WATCH
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch); 110extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch); 111extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); 112extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule); 113extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
113extern void audit_remove_watch(struct audit_watch *watch); 114extern void audit_remove_watch_rule(struct audit_krule *krule);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch); 115extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch); 116extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
118 117#else
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, 118#define audit_put_watch(w) {}
120 struct audit_watch *watch); 119#define audit_get_watch(w) {}
120#define audit_to_watch(k, p, l, o) (-EINVAL)
121#define audit_add_watch(k, l) (-EINVAL)
122#define audit_remove_watch_rule(k) BUG()
123#define audit_watch_path(w) ""
124#define audit_watch_compare(w, i, d) 0
125
126#endif /* CONFIG_AUDIT_WATCH */
121 127
122#ifdef CONFIG_AUDIT_TREE 128#ifdef CONFIG_AUDIT_TREE
123extern struct audit_chunk *audit_tree_lookup(const struct inode *); 129extern struct audit_chunk *audit_tree_lookup(const struct inode *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
1#include "audit.h" 1#include "audit.h"
2#include <linux/inotify.h> 2#include <linux/fsnotify_backend.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
22 22
23struct audit_chunk { 23struct audit_chunk {
24 struct list_head hash; 24 struct list_head hash;
25 struct inotify_watch watch; 25 struct fsnotify_mark mark;
26 struct list_head trees; /* with root here */ 26 struct list_head trees; /* with root here */
27 int dead; 27 int dead;
28 int count; 28 int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
59 * tree is refcounted; one reference for "some rules on rules_list refer to 59 * tree is refcounted; one reference for "some rules on rules_list refer to
60 * it", one for each chunk with pointer to it. 60 * it", one for each chunk with pointer to it.
61 * 61 *
62 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount 62 * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
63 * of watch contributes 1 to .refs). 63 * of watch contributes 1 to .refs).
64 * 64 *
65 * node.index allows to get from node.list to containing chunk. 65 * node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
68 * that makes a difference. Some. 68 * that makes a difference. Some.
69 */ 69 */
70 70
71static struct inotify_handle *rtree_ih; 71static struct fsnotify_group *audit_tree_group;
72 72
73static struct audit_tree *alloc_tree(const char *s) 73static struct audit_tree *alloc_tree(const char *s)
74{ 74{
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
111 return tree->pathname; 111 return tree->pathname;
112} 112}
113 113
114static struct audit_chunk *alloc_chunk(int count)
115{
116 struct audit_chunk *chunk;
117 size_t size;
118 int i;
119
120 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
121 chunk = kzalloc(size, GFP_KERNEL);
122 if (!chunk)
123 return NULL;
124
125 INIT_LIST_HEAD(&chunk->hash);
126 INIT_LIST_HEAD(&chunk->trees);
127 chunk->count = count;
128 atomic_long_set(&chunk->refs, 1);
129 for (i = 0; i < count; i++) {
130 INIT_LIST_HEAD(&chunk->owners[i].list);
131 chunk->owners[i].index = i;
132 }
133 inotify_init_watch(&chunk->watch);
134 return chunk;
135}
136
137static void free_chunk(struct audit_chunk *chunk) 114static void free_chunk(struct audit_chunk *chunk)
138{ 115{
139 int i; 116 int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
157 audit_put_chunk(chunk); 134 audit_put_chunk(chunk);
158} 135}
159 136
137static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
138{
139 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
140 call_rcu(&chunk->head, __put_chunk);
141}
142
143static struct audit_chunk *alloc_chunk(int count)
144{
145 struct audit_chunk *chunk;
146 size_t size;
147 int i;
148
149 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
150 chunk = kzalloc(size, GFP_KERNEL);
151 if (!chunk)
152 return NULL;
153
154 INIT_LIST_HEAD(&chunk->hash);
155 INIT_LIST_HEAD(&chunk->trees);
156 chunk->count = count;
157 atomic_long_set(&chunk->refs, 1);
158 for (i = 0; i < count; i++) {
159 INIT_LIST_HEAD(&chunk->owners[i].list);
160 chunk->owners[i].index = i;
161 }
162 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
163 return chunk;
164}
165
160enum {HASH_SIZE = 128}; 166enum {HASH_SIZE = 128};
161static struct list_head chunk_hash_heads[HASH_SIZE]; 167static struct list_head chunk_hash_heads[HASH_SIZE];
162static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); 168static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
167 return chunk_hash_heads + n % HASH_SIZE; 173 return chunk_hash_heads + n % HASH_SIZE;
168} 174}
169 175
170/* hash_lock is held by caller */ 176/* hash_lock & entry->lock is held by caller */
171static void insert_hash(struct audit_chunk *chunk) 177static void insert_hash(struct audit_chunk *chunk)
172{ 178{
173 struct list_head *list = chunk_hash(chunk->watch.inode); 179 struct fsnotify_mark *entry = &chunk->mark;
180 struct list_head *list;
181
182 if (!entry->i.inode)
183 return;
184 list = chunk_hash(entry->i.inode);
174 list_add_rcu(&chunk->hash, list); 185 list_add_rcu(&chunk->hash, list);
175} 186}
176 187
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
181 struct audit_chunk *p; 192 struct audit_chunk *p;
182 193
183 list_for_each_entry_rcu(p, list, hash) { 194 list_for_each_entry_rcu(p, list, hash) {
184 if (p->watch.inode == inode) { 195 /* mark.inode may have gone NULL, but who cares? */
196 if (p->mark.i.inode == inode) {
185 atomic_long_inc(&p->refs); 197 atomic_long_inc(&p->refs);
186 return p; 198 return p;
187 } 199 }
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
210static void untag_chunk(struct node *p) 222static void untag_chunk(struct node *p)
211{ 223{
212 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark;
213 struct audit_chunk *new; 226 struct audit_chunk *new;
214 struct audit_tree *owner; 227 struct audit_tree *owner;
215 int size = chunk->count - 1; 228 int size = chunk->count - 1;
216 int i, j; 229 int i, j;
217 230
218 if (!pin_inotify_watch(&chunk->watch)) { 231 fsnotify_get_mark(entry);
219 /*
220 * Filesystem is shutting down; all watches are getting
221 * evicted, just take it off the node list for this
222 * tree and let the eviction logics take care of the
223 * rest.
224 */
225 owner = p->owner;
226 if (owner->root == chunk) {
227 list_del_init(&owner->same_root);
228 owner->root = NULL;
229 }
230 list_del_init(&p->list);
231 p->owner = NULL;
232 put_tree(owner);
233 return;
234 }
235 232
236 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
237 234
238 /* 235 spin_lock(&entry->lock);
239 * pin_inotify_watch() succeeded, so the watch won't go away 236 if (chunk->dead || !entry->i.inode) {
240 * from under us. 237 spin_unlock(&entry->lock);
241 */
242 mutex_lock(&chunk->watch.inode->inotify_mutex);
243 if (chunk->dead) {
244 mutex_unlock(&chunk->watch.inode->inotify_mutex);
245 goto out; 238 goto out;
246 } 239 }
247 240
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
256 list_del_init(&p->list); 249 list_del_init(&p->list);
257 list_del_rcu(&chunk->hash); 250 list_del_rcu(&chunk->hash);
258 spin_unlock(&hash_lock); 251 spin_unlock(&hash_lock);
259 inotify_evict_watch(&chunk->watch); 252 spin_unlock(&entry->lock);
260 mutex_unlock(&chunk->watch.inode->inotify_mutex); 253 fsnotify_destroy_mark(entry);
261 put_inotify_watch(&chunk->watch); 254 fsnotify_put_mark(entry);
262 goto out; 255 goto out;
263 } 256 }
264 257
265 new = alloc_chunk(size); 258 new = alloc_chunk(size);
266 if (!new) 259 if (!new)
267 goto Fallback; 260 goto Fallback;
268 if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { 261 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
269 free_chunk(new); 263 free_chunk(new);
270 goto Fallback; 264 goto Fallback;
271 } 265 }
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
298 list_for_each_entry(owner, &new->trees, same_root) 292 list_for_each_entry(owner, &new->trees, same_root)
299 owner->root = new; 293 owner->root = new;
300 spin_unlock(&hash_lock); 294 spin_unlock(&hash_lock);
301 inotify_evict_watch(&chunk->watch); 295 spin_unlock(&entry->lock);
302 mutex_unlock(&chunk->watch.inode->inotify_mutex); 296 fsnotify_destroy_mark(entry);
303 put_inotify_watch(&chunk->watch); 297 fsnotify_put_mark(entry);
304 goto out; 298 goto out;
305 299
306Fallback: 300Fallback:
@@ -314,31 +308,33 @@ Fallback:
314 p->owner = NULL; 308 p->owner = NULL;
315 put_tree(owner); 309 put_tree(owner);
316 spin_unlock(&hash_lock); 310 spin_unlock(&hash_lock);
317 mutex_unlock(&chunk->watch.inode->inotify_mutex); 311 spin_unlock(&entry->lock);
318out: 312out:
319 unpin_inotify_watch(&chunk->watch); 313 fsnotify_put_mark(entry);
320 spin_lock(&hash_lock); 314 spin_lock(&hash_lock);
321} 315}
322 316
323static int create_chunk(struct inode *inode, struct audit_tree *tree) 317static int create_chunk(struct inode *inode, struct audit_tree *tree)
324{ 318{
319 struct fsnotify_mark *entry;
325 struct audit_chunk *chunk = alloc_chunk(1); 320 struct audit_chunk *chunk = alloc_chunk(1);
326 if (!chunk) 321 if (!chunk)
327 return -ENOMEM; 322 return -ENOMEM;
328 323
329 if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { 324 entry = &chunk->mark;
325 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
330 free_chunk(chunk); 326 free_chunk(chunk);
331 return -ENOSPC; 327 return -ENOSPC;
332 } 328 }
333 329
334 mutex_lock(&inode->inotify_mutex); 330 spin_lock(&entry->lock);
335 spin_lock(&hash_lock); 331 spin_lock(&hash_lock);
336 if (tree->goner) { 332 if (tree->goner) {
337 spin_unlock(&hash_lock); 333 spin_unlock(&hash_lock);
338 chunk->dead = 1; 334 chunk->dead = 1;
339 inotify_evict_watch(&chunk->watch); 335 spin_unlock(&entry->lock);
340 mutex_unlock(&inode->inotify_mutex); 336 fsnotify_destroy_mark(entry);
341 put_inotify_watch(&chunk->watch); 337 fsnotify_put_mark(entry);
342 return 0; 338 return 0;
343 } 339 }
344 chunk->owners[0].index = (1U << 31); 340 chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
351 } 347 }
352 insert_hash(chunk); 348 insert_hash(chunk);
353 spin_unlock(&hash_lock); 349 spin_unlock(&hash_lock);
354 mutex_unlock(&inode->inotify_mutex); 350 spin_unlock(&entry->lock);
355 return 0; 351 return 0;
356} 352}
357 353
358/* the first tagged inode becomes root of tree */ 354/* the first tagged inode becomes root of tree */
359static int tag_chunk(struct inode *inode, struct audit_tree *tree) 355static int tag_chunk(struct inode *inode, struct audit_tree *tree)
360{ 356{
361 struct inotify_watch *watch; 357 struct fsnotify_mark *old_entry, *chunk_entry;
362 struct audit_tree *owner; 358 struct audit_tree *owner;
363 struct audit_chunk *chunk, *old; 359 struct audit_chunk *chunk, *old;
364 struct node *p; 360 struct node *p;
365 int n; 361 int n;
366 362
367 if (inotify_find_watch(rtree_ih, inode, &watch) < 0) 363 old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
364 if (!old_entry)
368 return create_chunk(inode, tree); 365 return create_chunk(inode, tree);
369 366
370 old = container_of(watch, struct audit_chunk, watch); 367 old = container_of(old_entry, struct audit_chunk, mark);
371 368
372 /* are we already there? */ 369 /* are we already there? */
373 spin_lock(&hash_lock); 370 spin_lock(&hash_lock);
374 for (n = 0; n < old->count; n++) { 371 for (n = 0; n < old->count; n++) {
375 if (old->owners[n].owner == tree) { 372 if (old->owners[n].owner == tree) {
376 spin_unlock(&hash_lock); 373 spin_unlock(&hash_lock);
377 put_inotify_watch(&old->watch); 374 fsnotify_put_mark(old_entry);
378 return 0; 375 return 0;
379 } 376 }
380 } 377 }
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
382 379
383 chunk = alloc_chunk(old->count + 1); 380 chunk = alloc_chunk(old->count + 1);
384 if (!chunk) { 381 if (!chunk) {
385 put_inotify_watch(&old->watch); 382 fsnotify_put_mark(old_entry);
386 return -ENOMEM; 383 return -ENOMEM;
387 } 384 }
388 385
389 mutex_lock(&inode->inotify_mutex); 386 chunk_entry = &chunk->mark;
390 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 387
391 mutex_unlock(&inode->inotify_mutex); 388 spin_lock(&old_entry->lock);
392 put_inotify_watch(&old->watch); 389 if (!old_entry->i.inode) {
390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry);
393 free_chunk(chunk); 393 free_chunk(chunk);
394 return -ENOENT;
395 }
396
397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock);
400 free_chunk(chunk);
401 fsnotify_put_mark(old_entry);
394 return -ENOSPC; 402 return -ENOSPC;
395 } 403 }
404
405 /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
406 spin_lock(&chunk_entry->lock);
396 spin_lock(&hash_lock); 407 spin_lock(&hash_lock);
408
409 /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
397 if (tree->goner) { 410 if (tree->goner) {
398 spin_unlock(&hash_lock); 411 spin_unlock(&hash_lock);
399 chunk->dead = 1; 412 chunk->dead = 1;
400 inotify_evict_watch(&chunk->watch); 413 spin_unlock(&chunk_entry->lock);
401 mutex_unlock(&inode->inotify_mutex); 414 spin_unlock(&old_entry->lock);
402 put_inotify_watch(&old->watch); 415
403 put_inotify_watch(&chunk->watch); 416 fsnotify_destroy_mark(chunk_entry);
417
418 fsnotify_put_mark(chunk_entry);
419 fsnotify_put_mark(old_entry);
404 return 0; 420 return 0;
405 } 421 }
406 list_replace_init(&old->trees, &chunk->trees); 422 list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
426 list_add(&tree->same_root, &chunk->trees); 442 list_add(&tree->same_root, &chunk->trees);
427 } 443 }
428 spin_unlock(&hash_lock); 444 spin_unlock(&hash_lock);
429 inotify_evict_watch(&old->watch); 445 spin_unlock(&chunk_entry->lock);
430 mutex_unlock(&inode->inotify_mutex); 446 spin_unlock(&old_entry->lock);
431 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ 447 fsnotify_destroy_mark(old_entry);
432 put_inotify_watch(&old->watch); /* and kill it */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
449 fsnotify_put_mark(old_entry); /* and kill it */
433 return 0; 450 return 0;
434} 451}
435 452
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
584 601
585 spin_lock(&hash_lock); 602 spin_lock(&hash_lock);
586 list_for_each_entry(node, &tree->chunks, list) { 603 list_for_each_entry(node, &tree->chunks, list) {
587 struct inode *inode = find_chunk(node)->watch.inode; 604 struct audit_chunk *chunk = find_chunk(node);
605 /* this could be NULL if the watch is dieing else where... */
606 struct inode *inode = chunk->mark.i.inode;
588 node->index |= 1U<<31; 607 node->index |= 1U<<31;
589 if (iterate_mounts(compare_root, inode, root_mnt)) 608 if (iterate_mounts(compare_root, inode, root_mnt))
590 node->index &= ~(1U<<31); 609 node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
846 * Here comes the stuff asynchronous to auditctl operations 865 * Here comes the stuff asynchronous to auditctl operations
847 */ 866 */
848 867
849/* inode->inotify_mutex is locked */
850static void evict_chunk(struct audit_chunk *chunk) 868static void evict_chunk(struct audit_chunk *chunk)
851{ 869{
852 struct audit_tree *owner; 870 struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
885 mutex_unlock(&audit_filter_mutex); 903 mutex_unlock(&audit_filter_mutex);
886} 904}
887 905
888static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, 906static int audit_tree_handle_event(struct fsnotify_group *group,
889 u32 cookie, const char *dname, struct inode *inode) 907 struct fsnotify_mark *inode_mark,
908 struct fsnotify_mark *vfsmonut_mark,
909 struct fsnotify_event *event)
910{
911 BUG();
912 return -EOPNOTSUPP;
913}
914
915static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
890{ 916{
891 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 917 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
892 918
893 if (mask & IN_IGNORED) { 919 evict_chunk(chunk);
894 evict_chunk(chunk); 920 fsnotify_put_mark(entry);
895 put_inotify_watch(watch);
896 }
897} 921}
898 922
899static void destroy_watch(struct inotify_watch *watch) 923static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
924 struct fsnotify_mark *inode_mark,
925 struct fsnotify_mark *vfsmount_mark,
926 __u32 mask, void *data, int data_type)
900{ 927{
901 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 928 return false;
902 call_rcu(&chunk->head, __put_chunk);
903} 929}
904 930
905static const struct inotify_operations rtree_inotify_ops = { 931static const struct fsnotify_ops audit_tree_ops = {
906 .handle_event = handle_event, 932 .handle_event = audit_tree_handle_event,
907 .destroy_watch = destroy_watch, 933 .should_send_event = audit_tree_send_event,
934 .free_group_priv = NULL,
935 .free_event_priv = NULL,
936 .freeing_mark = audit_tree_freeing_mark,
908}; 937};
909 938
910static int __init audit_tree_init(void) 939static int __init audit_tree_init(void)
911{ 940{
912 int i; 941 int i;
913 942
914 rtree_ih = inotify_init(&rtree_inotify_ops); 943 audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
915 if (IS_ERR(rtree_ih)) 944 if (IS_ERR(audit_tree_group))
916 audit_panic("cannot initialize inotify handle for rectree watches"); 945 audit_panic("cannot initialize fsnotify group for rectree watches");
917 946
918 for (i = 0; i < HASH_SIZE; i++) 947 for (i = 0; i < HASH_SIZE; i++)
919 INIT_LIST_HEAD(&chunk_hash_heads[i]); 948 INIT_LIST_HEAD(&chunk_hash_heads[i]);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..f0c9b2e7542d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fsnotify_backend.h>
27#include <linux/namei.h> 28#include <linux/namei.h>
28#include <linux/netlink.h> 29#include <linux/netlink.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/slab.h> 31#include <linux/slab.h>
31#include <linux/inotify.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include "audit.h" 33#include "audit.h"
34 34
35/* 35/*
36 * Reference counting: 36 * Reference counting:
37 * 37 *
38 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED 38 * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
39 * event. Each audit_watch holds a reference to its associated parent. 39 * event. Each audit_watch holds a reference to its associated parent.
40 * 40 *
41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to 41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
51 unsigned long ino; /* associated inode number */ 51 unsigned long ino; /* associated inode number */
52 struct audit_parent *parent; /* associated parent */ 52 struct audit_parent *parent; /* associated parent */
53 struct list_head wlist; /* entry in parent->watches list */ 53 struct list_head wlist; /* entry in parent->watches list */
54 struct list_head rules; /* associated rules */ 54 struct list_head rules; /* anchor for krule->rlist */
55}; 55};
56 56
57struct audit_parent { 57struct audit_parent {
58 struct list_head ilist; /* entry in inotify registration list */ 58 struct list_head watches; /* anchor for audit_watch->wlist */
59 struct list_head watches; /* associated watches */ 59 struct fsnotify_mark mark; /* fsnotify mark on the inode */
60 struct inotify_watch wdata; /* inotify watch data */
61 unsigned flags; /* status flags */
62}; 60};
63 61
64/* Inotify handle. */ 62/* fsnotify handle. */
65struct inotify_handle *audit_ih; 63struct fsnotify_group *audit_watch_group;
66 64
67/* 65/* fsnotify events we care about. */
68 * audit_parent status flags: 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
69 * 67 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
70 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
71 * a filesystem event to ensure we're adding audit watches to a valid parent.
72 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
73 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
74 * we can receive while holding nameidata.
75 */
76#define AUDIT_PARENT_INVALID 0x001
77 68
78/* Inotify events we care about. */ 69static void audit_free_parent(struct audit_parent *parent)
79#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 70{
71 WARN_ON(!list_empty(&parent->watches));
72 kfree(parent);
73}
80 74
81static void audit_free_parent(struct inotify_watch *i_watch) 75static void audit_watch_free_mark(struct fsnotify_mark *entry)
82{ 76{
83 struct audit_parent *parent; 77 struct audit_parent *parent;
84 78
85 parent = container_of(i_watch, struct audit_parent, wdata); 79 parent = container_of(entry, struct audit_parent, mark);
86 WARN_ON(!list_empty(&parent->watches)); 80 audit_free_parent(parent);
87 kfree(parent); 81}
82
83static void audit_get_parent(struct audit_parent *parent)
84{
85 if (likely(parent))
86 fsnotify_get_mark(&parent->mark);
87}
88
89static void audit_put_parent(struct audit_parent *parent)
90{
91 if (likely(parent))
92 fsnotify_put_mark(&parent->mark);
93}
94
95/*
96 * Find and return the audit_parent on the given inode. If found a reference
97 * is taken on this parent.
98 */
99static inline struct audit_parent *audit_find_parent(struct inode *inode)
100{
101 struct audit_parent *parent = NULL;
102 struct fsnotify_mark *entry;
103
104 entry = fsnotify_find_inode_mark(audit_watch_group, inode);
105 if (entry)
106 parent = container_of(entry, struct audit_parent, mark);
107
108 return parent;
88} 109}
89 110
90void audit_get_watch(struct audit_watch *watch) 111void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
105void audit_remove_watch(struct audit_watch *watch) 126void audit_remove_watch(struct audit_watch *watch)
106{ 127{
107 list_del(&watch->wlist); 128 list_del(&watch->wlist);
108 put_inotify_watch(&watch->parent->wdata); 129 audit_put_parent(watch->parent);
109 watch->parent = NULL; 130 watch->parent = NULL;
110 audit_put_watch(watch); /* match initial get */ 131 audit_put_watch(watch); /* match initial get */
111} 132}
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
115 return watch->path; 136 return watch->path;
116} 137}
117 138
118struct list_head *audit_watch_rules(struct audit_watch *watch) 139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
119{
120 return &watch->rules;
121}
122
123unsigned long audit_watch_inode(struct audit_watch *watch)
124{ 140{
125 return watch->ino; 141 return (watch->ino != (unsigned long)-1) &&
126} 142 (watch->ino == ino) &&
127 143 (watch->dev == dev);
128dev_t audit_watch_dev(struct audit_watch *watch)
129{
130 return watch->dev;
131} 144}
132 145
133/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
134static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct nameidata *ndp)
135{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode;
136 struct audit_parent *parent; 150 struct audit_parent *parent;
137 s32 wd; 151 int ret;
138 152
139 parent = kzalloc(sizeof(*parent), GFP_KERNEL); 153 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
140 if (unlikely(!parent)) 154 if (unlikely(!parent))
141 return ERR_PTR(-ENOMEM); 155 return ERR_PTR(-ENOMEM);
142 156
143 INIT_LIST_HEAD(&parent->watches); 157 INIT_LIST_HEAD(&parent->watches);
144 parent->flags = 0; 158
145 159 fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
146 inotify_init_watch(&parent->wdata); 160 parent->mark.mask = AUDIT_FS_WATCH;
147 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ 161 ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
148 get_inotify_watch(&parent->wdata); 162 if (ret < 0) {
149 wd = inotify_add_watch(audit_ih, &parent->wdata, 163 audit_free_parent(parent);
150 ndp->path.dentry->d_inode, AUDIT_IN_WATCH); 164 return ERR_PTR(ret);
151 if (wd < 0) {
152 audit_free_parent(&parent->wdata);
153 return ERR_PTR(wd);
154 } 165 }
155 166
156 return parent; 167 return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
179{ 190{
180 struct audit_watch *watch; 191 struct audit_watch *watch;
181 192
182 if (!audit_ih) 193 if (!audit_watch_group)
183 return -EOPNOTSUPP; 194 return -EOPNOTSUPP;
184 195
185 if (path[0] != '/' || path[len-1] == '/' || 196 if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
217 228
218 new->dev = old->dev; 229 new->dev = old->dev;
219 new->ino = old->ino; 230 new->ino = old->ino;
220 get_inotify_watch(&old->parent->wdata); 231 audit_get_parent(old->parent);
221 new->parent = old->parent; 232 new->parent = old->parent;
222 233
223out: 234out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
251 struct audit_entry *oentry, *nentry; 262 struct audit_entry *oentry, *nentry;
252 263
253 mutex_lock(&audit_filter_mutex); 264 mutex_lock(&audit_filter_mutex);
265 /* Run all of the watches on this parent looking for the one that
266 * matches the given dname */
254 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 267 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
255 if (audit_compare_dname_path(dname, owatch->path, NULL)) 268 if (audit_compare_dname_path(dname, owatch->path, NULL))
256 continue; 269 continue;
257 270
258 /* If the update involves invalidating rules, do the inode-based 271 /* If the update involves invalidating rules, do the inode-based
259 * filtering now, so we don't omit records. */ 272 * filtering now, so we don't omit records. */
260 if (invalidating && current->audit_context) 273 if (invalidating && !audit_dummy_context())
261 audit_filter_inodes(current, current->audit_context); 274 audit_filter_inodes(current, current->audit_context);
262 275
276 /* updating ino will likely change which audit_hash_list we
277 * are on so we need a new watch for the new list */
263 nwatch = audit_dupe_watch(owatch); 278 nwatch = audit_dupe_watch(owatch);
264 if (IS_ERR(nwatch)) { 279 if (IS_ERR(nwatch)) {
265 mutex_unlock(&audit_filter_mutex); 280 mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
275 list_del(&oentry->rule.rlist); 290 list_del(&oentry->rule.rlist);
276 list_del_rcu(&oentry->list); 291 list_del_rcu(&oentry->list);
277 292
278 nentry = audit_dupe_rule(&oentry->rule, nwatch); 293 nentry = audit_dupe_rule(&oentry->rule);
279 if (IS_ERR(nentry)) { 294 if (IS_ERR(nentry)) {
280 list_del(&oentry->rule.list); 295 list_del(&oentry->rule.list);
281 audit_panic("error updating watch, removing"); 296 audit_panic("error updating watch, removing");
282 } else { 297 } else {
283 int h = audit_hash_ino((u32)ino); 298 int h = audit_hash_ino((u32)ino);
299
300 /*
301 * nentry->rule.watch == oentry->rule.watch so
302 * we must drop that reference and set it to our
303 * new watch.
304 */
305 audit_put_watch(nentry->rule.watch);
306 audit_get_watch(nwatch);
307 nentry->rule.watch = nwatch;
284 list_add(&nentry->rule.rlist, &nwatch->rules); 308 list_add(&nentry->rule.rlist, &nwatch->rules);
285 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 309 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
286 list_replace(&oentry->rule.list, 310 list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
312 struct audit_entry *e; 336 struct audit_entry *e;
313 337
314 mutex_lock(&audit_filter_mutex); 338 mutex_lock(&audit_filter_mutex);
315 parent->flags |= AUDIT_PARENT_INVALID;
316 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 339 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
317 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 340 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
318 e = container_of(r, struct audit_entry, rule); 341 e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
325 audit_remove_watch(w); 348 audit_remove_watch(w);
326 } 349 }
327 mutex_unlock(&audit_filter_mutex); 350 mutex_unlock(&audit_filter_mutex);
328}
329
330/* Unregister inotify watches for parents on in_list.
331 * Generates an IN_IGNORED event. */
332void audit_inotify_unregister(struct list_head *in_list)
333{
334 struct audit_parent *p, *n;
335 351
336 list_for_each_entry_safe(p, n, in_list, ilist) { 352 fsnotify_destroy_mark(&parent->mark);
337 list_del(&p->ilist);
338 inotify_rm_watch(audit_ih, &p->wdata);
339 /* the unpin matching the pin in audit_do_del_rule() */
340 unpin_inotify_watch(&p->wdata);
341 }
342} 353}
343 354
344/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
389 } 400 }
390} 401}
391 402
392/* Associate the given rule with an existing parent inotify_watch. 403/* Associate the given rule with an existing parent.
393 * Caller must hold audit_filter_mutex. */ 404 * Caller must hold audit_filter_mutex. */
394static void audit_add_to_parent(struct audit_krule *krule, 405static void audit_add_to_parent(struct audit_krule *krule,
395 struct audit_parent *parent) 406 struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
397 struct audit_watch *w, *watch = krule->watch; 408 struct audit_watch *w, *watch = krule->watch;
398 int watch_found = 0; 409 int watch_found = 0;
399 410
411 BUG_ON(!mutex_is_locked(&audit_filter_mutex));
412
400 list_for_each_entry(w, &parent->watches, wlist) { 413 list_for_each_entry(w, &parent->watches, wlist) {
401 if (strcmp(watch->path, w->path)) 414 if (strcmp(watch->path, w->path))
402 continue; 415 continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
413 } 426 }
414 427
415 if (!watch_found) { 428 if (!watch_found) {
416 get_inotify_watch(&parent->wdata); 429 audit_get_parent(parent);
417 watch->parent = parent; 430 watch->parent = parent;
418 431
419 list_add(&watch->wlist, &parent->watches); 432 list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule,
423 436
424/* Find a matching watch entry, or add this one. 437/* Find a matching watch entry, or add this one.
425 * Caller must hold audit_filter_mutex. */ 438 * Caller must hold audit_filter_mutex. */
426int audit_add_watch(struct audit_krule *krule) 439int audit_add_watch(struct audit_krule *krule, struct list_head **list)
427{ 440{
428 struct audit_watch *watch = krule->watch; 441 struct audit_watch *watch = krule->watch;
429 struct inotify_watch *i_watch;
430 struct audit_parent *parent; 442 struct audit_parent *parent;
431 struct nameidata *ndp = NULL, *ndw = NULL; 443 struct nameidata *ndp = NULL, *ndw = NULL;
432 int ret = 0; 444 int h, ret = 0;
433 445
434 mutex_unlock(&audit_filter_mutex); 446 mutex_unlock(&audit_filter_mutex);
435 447
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
441 goto error; 453 goto error;
442 } 454 }
443 455
456 mutex_lock(&audit_filter_mutex);
457
444 /* update watch filter fields */ 458 /* update watch filter fields */
445 if (ndw) { 459 if (ndw) {
446 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; 460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
447 watch->ino = ndw->path.dentry->d_inode->i_ino; 461 watch->ino = ndw->path.dentry->d_inode->i_ino;
448 } 462 }
449 463
450 /* The audit_filter_mutex must not be held during inotify calls because 464 /* either find an old parent or attach a new one */
451 * we hold it during inotify event callback processing. If an existing 465 parent = audit_find_parent(ndp->path.dentry->d_inode);
452 * inotify watch is found, inotify_find_watch() grabs a reference before 466 if (!parent) {
453 * returning.
454 */
455 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
456 &i_watch) < 0) {
457 parent = audit_init_parent(ndp); 467 parent = audit_init_parent(ndp);
458 if (IS_ERR(parent)) { 468 if (IS_ERR(parent)) {
459 /* caller expects mutex locked */
460 mutex_lock(&audit_filter_mutex);
461 ret = PTR_ERR(parent); 469 ret = PTR_ERR(parent);
462 goto error; 470 goto error;
463 } 471 }
464 } else 472 }
465 parent = container_of(i_watch, struct audit_parent, wdata);
466
467 mutex_lock(&audit_filter_mutex);
468 473
469 /* parent was moved before we took audit_filter_mutex */ 474 audit_add_to_parent(krule, parent);
470 if (parent->flags & AUDIT_PARENT_INVALID)
471 ret = -ENOENT;
472 else
473 audit_add_to_parent(krule, parent);
474 475
475 /* match get in audit_init_parent or inotify_find_watch */ 476 /* match get in audit_find_parent or audit_init_parent */
476 put_inotify_watch(&parent->wdata); 477 audit_put_parent(parent);
477 478
479 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h];
478error: 481error:
479 audit_put_nd(ndp, ndw); /* NULL args OK */ 482 audit_put_nd(ndp, ndw); /* NULL args OK */
480 return ret; 483 return ret;
481 484
482} 485}
483 486
484void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) 487void audit_remove_watch_rule(struct audit_krule *krule)
485{ 488{
486 struct audit_watch *watch = krule->watch; 489 struct audit_watch *watch = krule->watch;
487 struct audit_parent *parent = watch->parent; 490 struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
492 audit_remove_watch(watch); 495 audit_remove_watch(watch);
493 496
494 if (list_empty(&parent->watches)) { 497 if (list_empty(&parent->watches)) {
495 /* Put parent on the inotify un-registration 498 audit_get_parent(parent);
496 * list. Grab a reference before releasing 499 fsnotify_destroy_mark(&parent->mark);
497 * audit_filter_mutex, to be released in 500 audit_put_parent(parent);
498 * audit_inotify_unregister().
499 * If filesystem is going away, just leave
500 * the sucker alone, eviction will take
501 * care of it. */
502 if (pin_inotify_watch(&parent->wdata))
503 list_add(&parent->ilist, list);
504 } 501 }
505 } 502 }
506} 503}
507 504
508/* Update watch data in audit rules based on inotify events. */ 505static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
509static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, 506 struct fsnotify_mark *inode_mark,
510 u32 cookie, const char *dname, struct inode *inode) 507 struct fsnotify_mark *vfsmount_mark,
508 __u32 mask, void *data, int data_type)
509{
510 return true;
511}
512
513/* Update watch data in audit rules based on fsnotify events. */
514static int audit_watch_handle_event(struct fsnotify_group *group,
515 struct fsnotify_mark *inode_mark,
516 struct fsnotify_mark *vfsmount_mark,
517 struct fsnotify_event *event)
511{ 518{
519 struct inode *inode;
520 __u32 mask = event->mask;
521 const char *dname = event->file_name;
512 struct audit_parent *parent; 522 struct audit_parent *parent;
513 523
514 parent = container_of(i_watch, struct audit_parent, wdata); 524 parent = container_of(inode_mark, struct audit_parent, mark);
515 525
516 if (mask & (IN_CREATE|IN_MOVED_TO) && inode) 526 BUG_ON(group != audit_watch_group);
517 audit_update_watch(parent, dname, inode->i_sb->s_dev, 527
518 inode->i_ino, 0); 528 switch (event->data_type) {
519 else if (mask & (IN_DELETE|IN_MOVED_FROM)) 529 case (FSNOTIFY_EVENT_PATH):
530 inode = event->path.dentry->d_inode;
531 break;
532 case (FSNOTIFY_EVENT_INODE):
533 inode = event->inode;
534 break;
535 default:
536 BUG();
537 inode = NULL;
538 break;
539 };
540
541 if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
542 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
543 else if (mask & (FS_DELETE|FS_MOVED_FROM))
520 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); 544 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
521 /* inotify automatically removes the watch and sends IN_IGNORED */ 545 else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
522 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
523 audit_remove_parent_watches(parent);
524 /* inotify does not remove the watch, so remove it manually */
525 else if(mask & IN_MOVE_SELF) {
526 audit_remove_parent_watches(parent); 546 audit_remove_parent_watches(parent);
527 inotify_remove_watch_locked(audit_ih, i_watch); 547
528 } else if (mask & IN_IGNORED) 548 return 0;
529 put_inotify_watch(i_watch);
530} 549}
531 550
532static const struct inotify_operations audit_inotify_ops = { 551static const struct fsnotify_ops audit_watch_fsnotify_ops = {
533 .handle_event = audit_handle_ievent, 552 .should_send_event = audit_watch_should_send_event,
534 .destroy_watch = audit_free_parent, 553 .handle_event = audit_watch_handle_event,
554 .free_group_priv = NULL,
555 .freeing_mark = NULL,
556 .free_event_priv = NULL,
535}; 557};
536 558
537static int __init audit_watch_init(void) 559static int __init audit_watch_init(void)
538{ 560{
539 audit_ih = inotify_init(&audit_inotify_ops); 561 audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
540 if (IS_ERR(audit_ih)) 562 if (IS_ERR(audit_watch_group)) {
541 audit_panic("cannot initialize inotify handle"); 563 audit_watch_group = NULL;
564 audit_panic("cannot create audit fsnotify group");
565 }
542 return 0; 566 return 0;
543} 567}
544subsys_initcall(audit_watch_init); 568device_initcall(audit_watch_init);
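
The converted handler above recovers its struct audit_parent from the embedded fsnotify_mark with container_of(). The following is a minimal userspace sketch of that embed-and-recover pattern; the struct names and fields are hypothetical stand-ins, not the kernel definitions.

#include <stddef.h>
#include <stdio.h>

/* Same idea as the kernel's container_of(): step back from a pointer to an
 * embedded member to the structure that contains it. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct mark {                       /* stand-in for struct fsnotify_mark */
    unsigned int flags;
};

struct watch_parent {               /* stand-in for struct audit_parent */
    const char *path;
    struct mark mark;               /* embedded, like audit_parent.mark */
};

static void handle_event(struct mark *m)
{
    /* Recover the containing object from the member pointer, as
     * audit_watch_handle_event() does with inode_mark. */
    struct watch_parent *p = container_of(m, struct watch_parent, mark);

    printf("event on parent watching %s\n", p->path);
}

int main(void)
{
    struct watch_parent p = { .path = "/etc", .mark = { .flags = 0 } };

    handle_event(&p.mark);
    return 0;
}
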
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce08041f578d..eb7675499fb5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e)
71{ 71{
72 int i; 72 int i;
73 struct audit_krule *erule = &e->rule; 73 struct audit_krule *erule = &e->rule;
74
74 /* some rules don't have associated watches */ 75 /* some rules don't have associated watches */
75 if (erule->watch) 76 if (erule->watch)
76 audit_put_watch(erule->watch); 77 audit_put_watch(erule->watch);
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
746 * rule with the new rule in the filterlist, then free the old rule. 747 * rule with the new rule in the filterlist, then free the old rule.
747 * The rlist element is undefined; list manipulations are handled apart from 748 * The rlist element is undefined; list manipulations are handled apart from
748 * the initial copy. */ 749 * the initial copy. */
749struct audit_entry *audit_dupe_rule(struct audit_krule *old, 750struct audit_entry *audit_dupe_rule(struct audit_krule *old)
750 struct audit_watch *watch)
751{ 751{
752 u32 fcount = old->field_count; 752 u32 fcount = old->field_count;
753 struct audit_entry *entry; 753 struct audit_entry *entry;
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
769 new->prio = old->prio; 769 new->prio = old->prio;
770 new->buflen = old->buflen; 770 new->buflen = old->buflen;
771 new->inode_f = old->inode_f; 771 new->inode_f = old->inode_f;
772 new->watch = NULL;
773 new->field_count = old->field_count; 772 new->field_count = old->field_count;
773
774 /* 774 /*
775 * note that we are OK with not refcounting here; audit_match_tree() 775 * note that we are OK with not refcounting here; audit_match_tree()
776 * never dereferences tree and we can't get false positives there 776 * never dereferences tree and we can't get false positives there
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
811 } 811 }
812 } 812 }
813 813
814 if (watch) { 814 if (old->watch) {
815 audit_get_watch(watch); 815 audit_get_watch(old->watch);
816 new->watch = watch; 816 new->watch = old->watch;
817 } 817 }
818 818
819 return entry; 819 return entry;
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
866 struct audit_watch *watch = entry->rule.watch; 866 struct audit_watch *watch = entry->rule.watch;
867 struct audit_tree *tree = entry->rule.tree; 867 struct audit_tree *tree = entry->rule.tree;
868 struct list_head *list; 868 struct list_head *list;
869 int h, err; 869 int err;
870#ifdef CONFIG_AUDITSYSCALL 870#ifdef CONFIG_AUDITSYSCALL
871 int dont_count = 0; 871 int dont_count = 0;
872 872
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry)
889 889
890 if (watch) { 890 if (watch) {
891 /* audit_filter_mutex is dropped and re-taken during this call */ 891 /* audit_filter_mutex is dropped and re-taken during this call */
892 err = audit_add_watch(&entry->rule); 892 err = audit_add_watch(&entry->rule, &list);
893 if (err) { 893 if (err) {
894 mutex_unlock(&audit_filter_mutex); 894 mutex_unlock(&audit_filter_mutex);
895 goto error; 895 goto error;
896 } 896 }
897 /* entry->rule.watch may have changed during audit_add_watch() */
898 watch = entry->rule.watch;
899 h = audit_hash_ino((u32)audit_watch_inode(watch));
900 list = &audit_inode_hash[h];
901 } 897 }
902 if (tree) { 898 if (tree) {
903 err = audit_add_tree_rule(&entry->rule); 899 err = audit_add_tree_rule(&entry->rule);
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
949 struct audit_watch *watch = entry->rule.watch; 945 struct audit_watch *watch = entry->rule.watch;
950 struct audit_tree *tree = entry->rule.tree; 946 struct audit_tree *tree = entry->rule.tree;
951 struct list_head *list; 947 struct list_head *list;
952 LIST_HEAD(inotify_list);
953 int ret = 0; 948 int ret = 0;
954#ifdef CONFIG_AUDITSYSCALL 949#ifdef CONFIG_AUDITSYSCALL
955 int dont_count = 0; 950 int dont_count = 0;
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
969 } 964 }
970 965
971 if (e->rule.watch) 966 if (e->rule.watch)
972 audit_remove_watch_rule(&e->rule, &inotify_list); 967 audit_remove_watch_rule(&e->rule);
973 968
974 if (e->rule.tree) 969 if (e->rule.tree)
975 audit_remove_tree_rule(&e->rule); 970 audit_remove_tree_rule(&e->rule);
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
987#endif 982#endif
988 mutex_unlock(&audit_filter_mutex); 983 mutex_unlock(&audit_filter_mutex);
989 984
990 if (!list_empty(&inotify_list))
991 audit_inotify_unregister(&inotify_list);
992
993out: 985out:
994 if (watch) 986 if (watch)
995 audit_put_watch(watch); /* match initial get */ 987 audit_put_watch(watch); /* match initial get */
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r)
1323{ 1315{
1324 struct audit_entry *entry = container_of(r, struct audit_entry, rule); 1316 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1325 struct audit_entry *nentry; 1317 struct audit_entry *nentry;
1326 struct audit_watch *watch;
1327 struct audit_tree *tree;
1328 int err = 0; 1318 int err = 0;
1329 1319
1330 if (!security_audit_rule_known(r)) 1320 if (!security_audit_rule_known(r))
1331 return 0; 1321 return 0;
1332 1322
1333 watch = r->watch; 1323 nentry = audit_dupe_rule(r);
1334 tree = r->tree;
1335 nentry = audit_dupe_rule(r, watch);
1336 if (IS_ERR(nentry)) { 1324 if (IS_ERR(nentry)) {
1337 /* save the first error encountered for the 1325 /* save the first error encountered for the
1338 * return value */ 1326 * return value */
1339 err = PTR_ERR(nentry); 1327 err = PTR_ERR(nentry);
1340 audit_panic("error updating LSM filters"); 1328 audit_panic("error updating LSM filters");
1341 if (watch) 1329 if (r->watch)
1342 list_del(&r->rlist); 1330 list_del(&r->rlist);
1343 list_del_rcu(&entry->list); 1331 list_del_rcu(&entry->list);
1344 list_del(&r->list); 1332 list_del(&r->list);
1345 } else { 1333 } else {
1346 if (watch) { 1334 if (r->watch || r->tree)
1347 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1348 list_del(&r->rlist);
1349 } else if (tree)
1350 list_replace_init(&r->rlist, &nentry->rule.rlist); 1335 list_replace_init(&r->rlist, &nentry->rule.rlist);
1351 list_replace_rcu(&entry->list, &nentry->list); 1336 list_replace_rcu(&entry->list, &nentry->list);
1352 list_replace(&r->list, &nentry->rule.list); 1337 list_replace(&r->list, &nentry->rule.list);
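
With the inode hash moved into audit_add_watch(), audit_add_rule() now receives its target bucket through a struct list_head ** out-parameter instead of recomputing it from the watch. Below is a small standalone sketch of that status-plus-out-parameter shape; the hash formula and table size are assumptions, not the kernel's audit_hash_ino() or audit_inode_hash[].

#include <stdio.h>

#define BUCKETS 16                  /* assumed table size */

static int rule_count[BUCKETS];     /* stand-in for audit_inode_hash[] */

/* Hypothetical analogue of audit_hash_ino(): fold an inode number into an
 * index for a power-of-two sized table. */
static unsigned int hash_ino(unsigned long ino)
{
    return (unsigned int)(ino & (BUCKETS - 1));
}

/* Shaped like the reworked audit_add_watch(): return 0 on success and hand
 * the caller the bucket it should link the rule into. */
static int add_watch(unsigned long ino, int **list)
{
    *list = &rule_count[hash_ino(ino)];
    return 0;
}

int main(void)
{
    int *list;

    if (add_watch(4096, &list) == 0) {
        (*list)++;                  /* the caller does the actual linking */
        printf("bucket %ld now holds %d rule(s)\n",
               (long)(list - rule_count), *list);
    }
    return 0;
}
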
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3828ad5fb8f1..1b31c130d034 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
65#include <linux/binfmts.h> 65#include <linux/binfmts.h>
66#include <linux/highmem.h> 66#include <linux/highmem.h>
67#include <linux/syscalls.h> 67#include <linux/syscalls.h>
68#include <linux/inotify.h>
69#include <linux/capability.h> 68#include <linux/capability.h>
70#include <linux/fs_struct.h> 69#include <linux/fs_struct.h>
71 70
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk,
549 } 548 }
550 break; 549 break;
551 case AUDIT_WATCH: 550 case AUDIT_WATCH:
552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) 551 if (name)
553 result = (name->dev == audit_watch_dev(rule->watch) && 552 result = audit_watch_compare(rule->watch, name->ino, name->dev);
554 name->ino == audit_watch_inode(rule->watch));
555 break; 553 break;
556 case AUDIT_DIR: 554 case AUDIT_DIR:
557 if (ctx) 555 if (ctx)
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
1726 struct audit_tree_refs *p; 1724 struct audit_tree_refs *p;
1727 struct audit_chunk *chunk; 1725 struct audit_chunk *chunk;
1728 int count; 1726 int count;
1729 if (likely(list_empty(&inode->inotify_watches))) 1727 if (likely(hlist_empty(&inode->i_fsnotify_marks)))
1730 return; 1728 return;
1731 context = current->audit_context; 1729 context = current->audit_context;
1732 p = context->trees; 1730 p = context->trees;
@@ -1769,7 +1767,7 @@ retry:
1769 seq = read_seqbegin(&rename_lock); 1767 seq = read_seqbegin(&rename_lock);
1770 for(;;) { 1768 for(;;) {
1771 struct inode *inode = d->d_inode; 1769 struct inode *inode = d->d_inode;
1772 if (inode && unlikely(!list_empty(&inode->inotify_watches))) { 1770 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
1773 struct audit_chunk *chunk; 1771 struct audit_chunk *chunk;
1774 chunk = audit_tree_lookup(inode); 1772 chunk = audit_tree_lookup(inode);
1775 if (chunk) { 1773 if (chunk) {
@@ -1837,13 +1835,8 @@ void __audit_getname(const char *name)
1837 context->names[context->name_count].ino = (unsigned long)-1; 1835 context->names[context->name_count].ino = (unsigned long)-1;
1838 context->names[context->name_count].osid = 0; 1836 context->names[context->name_count].osid = 0;
1839 ++context->name_count; 1837 ++context->name_count;
1840 if (!context->pwd.dentry) { 1838 if (!context->pwd.dentry)
1841 read_lock(&current->fs->lock); 1839 get_fs_pwd(current->fs, &context->pwd);
1842 context->pwd = current->fs->pwd;
1843 path_get(&current->fs->pwd);
1844 read_unlock(&current->fs->lock);
1845 }
1846
1847} 1840}
1848 1841
1849/* audit_putname - intercept a putname request 1842/* audit_putname - intercept a putname request
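
audit_filter_rules() now delegates the dev/inode match to audit_watch_compare(), which also hides the (unsigned long)-1 "inode not yet resolved" sentinel that the old open-coded check tested. A standalone sketch of such an accessor-style comparison, with a hypothetical type standing in for struct audit_watch:

#include <stdbool.h>
#include <stdio.h>

struct watch {                      /* stand-in for struct audit_watch */
    unsigned int dev;
    unsigned long ino;
};

/* In the spirit of audit_watch_compare(): no match while the watch's inode
 * is still the (unsigned long)-1 "unresolved" sentinel, otherwise compare
 * both device and inode. */
static bool watch_compare(const struct watch *w, unsigned long ino,
                          unsigned int dev)
{
    if (w->ino == (unsigned long)-1)
        return false;
    return w->dev == dev && w->ino == ino;
}

int main(void)
{
    struct watch w = { .dev = 8, .ino = 1234 };

    printf("same dev/ino: %d\n", watch_compare(&w, 1234, 8));  /* 1 */
    printf("other device: %d\n", watch_compare(&w, 1234, 9));  /* 0 */
    return 0;
}
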
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b276..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "cred-internals.h"
19 18
20/* 19/*
21 * Leveraged for setting/resetting capabilities 20 * Leveraged for setting/resetting capabilities
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6d870f2d1228..c9483d8f6140 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1102,7 +1102,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1102 if (opts->release_agent) 1102 if (opts->release_agent)
1103 return -EINVAL; 1103 return -EINVAL;
1104 opts->release_agent = 1104 opts->release_agent =
1105 kstrndup(token + 14, PATH_MAX, GFP_KERNEL); 1105 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1106 if (!opts->release_agent)
1107 return -ENOMEM; 1107 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
@@ -1123,7 +1123,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1123 if (opts->name) 1123 if (opts->name)
1124 return -EINVAL; 1124 return -EINVAL;
1125 opts->name = kstrndup(name, 1125 opts->name = kstrndup(name,
1126 MAX_CGROUP_ROOT_NAMELEN, 1126 MAX_CGROUP_ROOT_NAMELEN - 1,
1127 GFP_KERNEL); 1127 GFP_KERNEL);
1128 if (!opts->name) 1128 if (!opts->name)
1129 return -ENOMEM; 1129 return -ENOMEM;
@@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = {
1623 .kill_sb = cgroup_kill_sb, 1623 .kill_sb = cgroup_kill_sb,
1624}; 1624};
1625 1625
1626static struct kobject *cgroup_kobj;
1627
1626static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1628static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1627{ 1629{
1628 return dentry->d_fsdata; 1630 return dentry->d_fsdata;
@@ -1788,6 +1790,30 @@ out:
1788 return retval; 1790 return retval;
1789} 1791}
1790 1792
1793/**
1794 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
1795 * @from: attach to all cgroups of a given task
1796 * @tsk: the task to be attached
1797 */
1798int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1799{
1800 struct cgroupfs_root *root;
1801 int retval = 0;
1802
1803 cgroup_lock();
1804 for_each_active_root(root) {
1805 struct cgroup *from_cg = task_cgroup_from_root(from, root);
1806
1807 retval = cgroup_attach_task(from_cg, tsk);
1808 if (retval)
1809 break;
1810 }
1811 cgroup_unlock();
1812
1813 return retval;
1814}
1815EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1816
1791/* 1817/*
1792 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1818 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1793 * held. May take task_lock of task 1819 * held. May take task_lock of task
@@ -2994,7 +3020,6 @@ static void cgroup_event_remove(struct work_struct *work)
2994 remove); 3020 remove);
2995 struct cgroup *cgrp = event->cgrp; 3021 struct cgroup *cgrp = event->cgrp;
2996 3022
2997 /* TODO: check return code */
2998 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3023 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2999 3024
3000 eventfd_ctx_put(event->eventfd); 3025 eventfd_ctx_put(event->eventfd);
@@ -3016,7 +3041,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3016 unsigned long flags = (unsigned long)key; 3041 unsigned long flags = (unsigned long)key;
3017 3042
3018 if (flags & POLLHUP) { 3043 if (flags & POLLHUP) {
3019 remove_wait_queue_locked(event->wqh, &event->wait); 3044 __remove_wait_queue(event->wqh, &event->wait);
3020 spin_lock(&cgrp->event_list_lock); 3045 spin_lock(&cgrp->event_list_lock);
3021 list_del(&event->list); 3046 list_del(&event->list);
3022 spin_unlock(&cgrp->event_list_lock); 3047 spin_unlock(&cgrp->event_list_lock);
@@ -3615,7 +3640,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3615 * @ss: the subsystem to load 3640 * @ss: the subsystem to load
3616 * 3641 *
3617 * This function should be called in a modular subsystem's initcall. If the 3642 * This function should be called in a modular subsystem's initcall. If the
3618 * subsytem is built as a module, it will be assigned a new subsys_id and set 3643 * subsystem is built as a module, it will be assigned a new subsys_id and set
3619 * up for use. If the subsystem is built-in anyway, work is delegated to the 3644 * up for use. If the subsystem is built-in anyway, work is delegated to the
3620 * simpler cgroup_init_subsys. 3645 * simpler cgroup_init_subsys.
3621 */ 3646 */
@@ -3872,9 +3897,18 @@ int __init cgroup_init(void)
3872 hhead = css_set_hash(init_css_set.subsys); 3897 hhead = css_set_hash(init_css_set.subsys);
3873 hlist_add_head(&init_css_set.hlist, hhead); 3898 hlist_add_head(&init_css_set.hlist, hhead);
3874 BUG_ON(!init_root_id(&rootnode)); 3899 BUG_ON(!init_root_id(&rootnode));
3900
3901 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
3902 if (!cgroup_kobj) {
3903 err = -ENOMEM;
3904 goto out;
3905 }
3906
3875 err = register_filesystem(&cgroup_fs_type); 3907 err = register_filesystem(&cgroup_fs_type);
3876 if (err < 0) 3908 if (err < 0) {
3909 kobject_put(cgroup_kobj);
3877 goto out; 3910 goto out;
3911 }
3878 3912
3879 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 3913 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
3880 3914
@@ -4599,7 +4633,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
4599 parent_css = parent->subsys[subsys_id]; 4633 parent_css = parent->subsys[subsys_id];
4600 child_css = child->subsys[subsys_id]; 4634 child_css = child->subsys[subsys_id];
4601 parent_id = parent_css->id; 4635 parent_id = parent_css->id;
4602 depth = parent_id->depth; 4636 depth = parent_id->depth + 1;
4603 4637
4604 child_id = get_new_cssid(ss, depth); 4638 child_id = get_new_cssid(ss, depth);
4605 if (IS_ERR(child_id)) 4639 if (IS_ERR(child_id))
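
The parse_cgroupfs_options() hunk shrinks each kstrndup() bound by one so that the duplicated string plus its terminating NUL never exceeds PATH_MAX or MAX_CGROUP_ROOT_NAMELEN. The userspace sketch below replays the arithmetic with POSIX strndup(), which shares the at-most-n-characters-plus-NUL behaviour; the limit value is made up for the example.

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NAME_LIMIT 8    /* made-up stand-in for PATH_MAX / MAX_CGROUP_ROOT_NAMELEN */

int main(void)
{
    const char *input = "0123456789";           /* longer than the limit */

    /* Bound the copy by NAME_LIMIT - 1 so the duplicate, including its
     * terminating NUL, fits in NAME_LIMIT bytes -- the same off-by-one the
     * cgroup option parser fixes. */
    char *dup = strndup(input, NAME_LIMIT - 1);
    if (!dup)
        return 1;

    printf("copied %zu chars + NUL = %d bytes total\n", strlen(dup), NAME_LIMIT);
    free(dup);
    return 0;
}
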
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e5c0244962b0..ce71ed53e88f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -89,10 +89,10 @@ struct cgroup_subsys freezer_subsys;
89 89
90/* Locks taken and their ordering 90/* Locks taken and their ordering
91 * ------------------------------ 91 * ------------------------------
92 * css_set_lock
93 * cgroup_mutex (AKA cgroup_lock) 92 * cgroup_mutex (AKA cgroup_lock)
94 * task->alloc_lock (AKA task_lock)
95 * freezer->lock 93 * freezer->lock
94 * css_set_lock
95 * task->alloc_lock (AKA task_lock)
96 * task->sighand->siglock 96 * task->sighand->siglock
97 * 97 *
98 * cgroup code forces css_set_lock to be taken before task->alloc_lock 98 * cgroup code forces css_set_lock to be taken before task->alloc_lock
@@ -100,33 +100,38 @@ struct cgroup_subsys freezer_subsys;
100 * freezer_create(), freezer_destroy(): 100 * freezer_create(), freezer_destroy():
101 * cgroup_mutex [ by cgroup core ] 101 * cgroup_mutex [ by cgroup core ]
102 * 102 *
103 * can_attach(): 103 * freezer_can_attach():
104 * cgroup_mutex 104 * cgroup_mutex (held by caller of can_attach)
105 * 105 *
106 * cgroup_frozen(): 106 * cgroup_freezing_or_frozen():
107 * task->alloc_lock (to get task's cgroup) 107 * task->alloc_lock (to get task's cgroup)
108 * 108 *
109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
110 * task->alloc_lock (to get task's cgroup)
111 * freezer->lock 110 * freezer->lock
112 * sighand->siglock (if the cgroup is freezing) 111 * sighand->siglock (if the cgroup is freezing)
113 * 112 *
114 * freezer_read(): 113 * freezer_read():
115 * cgroup_mutex 114 * cgroup_mutex
116 * freezer->lock 115 * freezer->lock
116 * write_lock css_set_lock (cgroup iterator start)
117 * task->alloc_lock
117 * read_lock css_set_lock (cgroup iterator start) 118 * read_lock css_set_lock (cgroup iterator start)
118 * 119 *
119 * freezer_write() (freeze): 120 * freezer_write() (freeze):
120 * cgroup_mutex 121 * cgroup_mutex
121 * freezer->lock 122 * freezer->lock
123 * write_lock css_set_lock (cgroup iterator start)
124 * task->alloc_lock
122 * read_lock css_set_lock (cgroup iterator start) 125 * read_lock css_set_lock (cgroup iterator start)
123 * sighand->siglock 126 * sighand->siglock (fake signal delivery inside freeze_task())
124 * 127 *
125 * freezer_write() (unfreeze): 128 * freezer_write() (unfreeze):
126 * cgroup_mutex 129 * cgroup_mutex
127 * freezer->lock 130 * freezer->lock
131 * write_lock css_set_lock (cgroup iterator start)
132 * task->alloc_lock
128 * read_lock css_set_lock (cgroup iterator start) 133 * read_lock css_set_lock (cgroup iterator start)
129 * task->alloc_lock (to prevent races with freeze_task()) 134 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
130 * sighand->siglock 135 * sighand->siglock
131 */ 136 */
132static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 137static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
diff --git a/kernel/compat.c b/kernel/compat.c
index 7f40e9275fd9..c9e2ec0b34a8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
279 struct compat_rlimit __user *rlim) 279 struct compat_rlimit __user *rlim)
280{ 280{
281 struct rlimit r; 281 struct rlimit r;
282 int ret;
283 mm_segment_t old_fs = get_fs ();
284
285 if (resource >= RLIM_NLIMITS)
286 return -EINVAL;
287 282
288 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || 283 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
289 __get_user(r.rlim_cur, &rlim->rlim_cur) || 284 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
294 r.rlim_cur = RLIM_INFINITY; 289 r.rlim_cur = RLIM_INFINITY;
295 if (r.rlim_max == COMPAT_RLIM_INFINITY) 290 if (r.rlim_max == COMPAT_RLIM_INFINITY)
296 r.rlim_max = RLIM_INFINITY; 291 r.rlim_max = RLIM_INFINITY;
297 set_fs(KERNEL_DS); 292 return do_prlimit(current, resource, &r, NULL);
298 ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
299 set_fs(old_fs);
300 return ret;
301} 293}
302 294
303#ifdef COMPAT_RLIM_OLD_INFINITY 295#ifdef COMPAT_RLIM_OLD_INFINITY
@@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
329 321
330#endif 322#endif
331 323
332asmlinkage long compat_sys_getrlimit (unsigned int resource, 324asmlinkage long compat_sys_getrlimit(unsigned int resource,
333 struct compat_rlimit __user *rlim) 325 struct compat_rlimit __user *rlim)
334{ 326{
335 struct rlimit r; 327 struct rlimit r;
336 int ret; 328 int ret;
337 mm_segment_t old_fs = get_fs();
338 329
339 set_fs(KERNEL_DS); 330 ret = do_prlimit(current, resource, NULL, &r);
340 ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
341 set_fs(old_fs);
342 if (!ret) { 331 if (!ret) {
343 if (r.rlim_cur > COMPAT_RLIM_INFINITY) 332 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
344 r.rlim_cur = COMPAT_RLIM_INFINITY; 333 r.rlim_cur = COMPAT_RLIM_INFINITY;
@@ -495,29 +484,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
495{ 484{
496 int ret; 485 int ret;
497 cpumask_var_t mask; 486 cpumask_var_t mask;
498 unsigned long *k;
499 unsigned int min_length = cpumask_size();
500
501 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
502 min_length = sizeof(compat_ulong_t);
503 487
504 if (len < min_length) 488 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
489 return -EINVAL;
490 if (len & (sizeof(compat_ulong_t)-1))
505 return -EINVAL; 491 return -EINVAL;
506 492
507 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 493 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
508 return -ENOMEM; 494 return -ENOMEM;
509 495
510 ret = sched_getaffinity(pid, mask); 496 ret = sched_getaffinity(pid, mask);
511 if (ret < 0) 497 if (ret == 0) {
512 goto out; 498 size_t retlen = min_t(size_t, len, cpumask_size());
513 499
514 k = cpumask_bits(mask); 500 if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
515 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 501 ret = -EFAULT;
516 if (ret == 0) 502 else
517 ret = min_length; 503 ret = retlen;
518 504 }
519out:
520 free_cpumask_var(mask); 505 free_cpumask_var(mask);
506
521 return ret; 507 return ret;
522} 508}
523 509
@@ -1140,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1140 1126
1141 return 0; 1127 return 0;
1142} 1128}
1129
1130/*
1131 * Allocate user-space memory for the duration of a single system call,
1132 * in order to marshall parameters inside a compat thunk.
1133 */
1134void __user *compat_alloc_user_space(unsigned long len)
1135{
1136 void __user *ptr;
1137
1138 /* If len would occupy more than half of the entire compat space... */
1139 if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
1140 return NULL;
1141
1142 ptr = arch_compat_alloc_user_space(len);
1143
1144 if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
1145 return NULL;
1146
1147 return ptr;
1148}
1149EXPORT_SYMBOL_GPL(compat_alloc_user_space);
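
The reworked compat_sys_sched_getaffinity() validates the user buffer in two steps, large enough to hold a bit for every possible CPU and a whole number of compat words, before clamping the copy to the kernel's own mask size. A standalone sketch of that validation follows; the CPU count and mask size are assumed values for illustration, not anything the kernel exports this way.

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE 8

typedef uint32_t compat_ulong_t;                /* 32-bit word seen by compat tasks */

static const unsigned int nr_cpu_ids = 72;      /* assumed possible-CPU count */
static const size_t cpumask_bytes = 1024 / 8;   /* assumed cpumask_size() */

/* Mirror of the new length checks: the buffer must cover nr_cpu_ids bits and
 * be a whole number of compat words; the copy is then clamped to the kernel's
 * own mask size, as the retlen computation does. */
static long check_affinity_len(size_t len, size_t *retlen)
{
    if (len * BITS_PER_BYTE < nr_cpu_ids)
        return -EINVAL;
    if (len & (sizeof(compat_ulong_t) - 1))
        return -EINVAL;
    *retlen = len < cpumask_bytes ? len : cpumask_bytes;
    return 0;
}

int main(void)
{
    size_t retlen;

    printf("len=4  -> %ld\n", check_affinity_len(4, &retlen));  /* too small */
    printf("len=13 -> %ld\n", check_affinity_len(13, &retlen)); /* unaligned */
    if (check_affinity_len(16, &retlen) == 0)
        printf("len=16 -> ok, copy %zu bytes\n", retlen);
    return 0;
}
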
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 25bba73b1be3..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,13 +20,29 @@
20/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 20/* Serializes the updates to cpu_online_mask, cpu_present_mask */
21static DEFINE_MUTEX(cpu_add_remove_lock); 21static DEFINE_MUTEX(cpu_add_remove_lock);
22 22
23static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 23/*
24 * The following two API's must be used when attempting
25 * to serialize the updates to cpu_online_mask, cpu_present_mask.
26 */
27void cpu_maps_update_begin(void)
28{
29 mutex_lock(&cpu_add_remove_lock);
30}
31
32void cpu_maps_update_done(void)
33{
34 mutex_unlock(&cpu_add_remove_lock);
35}
36
37static RAW_NOTIFIER_HEAD(cpu_chain);
24 38
25/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. 39/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
26 * Should always be manipulated under cpu_add_remove_lock 40 * Should always be manipulated under cpu_add_remove_lock
27 */ 41 */
28static int cpu_hotplug_disabled; 42static int cpu_hotplug_disabled;
29 43
44#ifdef CONFIG_HOTPLUG_CPU
45
30static struct { 46static struct {
31 struct task_struct *active_writer; 47 struct task_struct *active_writer;
32 struct mutex lock; /* Synchronizes accesses to refcount, */ 48 struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -41,8 +57,6 @@ static struct {
41 .refcount = 0, 57 .refcount = 0,
42}; 58};
43 59
44#ifdef CONFIG_HOTPLUG_CPU
45
46void get_online_cpus(void) 60void get_online_cpus(void)
47{ 61{
48 might_sleep(); 62 might_sleep();
@@ -67,22 +81,6 @@ void put_online_cpus(void)
67} 81}
68EXPORT_SYMBOL_GPL(put_online_cpus); 82EXPORT_SYMBOL_GPL(put_online_cpus);
69 83
70#endif /* CONFIG_HOTPLUG_CPU */
71
72/*
73 * The following two API's must be used when attempting
74 * to serialize the updates to cpu_online_mask, cpu_present_mask.
75 */
76void cpu_maps_update_begin(void)
77{
78 mutex_lock(&cpu_add_remove_lock);
79}
80
81void cpu_maps_update_done(void)
82{
83 mutex_unlock(&cpu_add_remove_lock);
84}
85
86/* 84/*
87 * This ensures that the hotplug operation can begin only when the 85 * This ensures that the hotplug operation can begin only when the
88 * refcount goes to zero. 86 * refcount goes to zero.
@@ -124,6 +122,12 @@ static void cpu_hotplug_done(void)
124 cpu_hotplug.active_writer = NULL; 122 cpu_hotplug.active_writer = NULL;
125 mutex_unlock(&cpu_hotplug.lock); 123 mutex_unlock(&cpu_hotplug.lock);
126} 124}
125
126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {}
129#endif /* #else #if CONFIG_HOTPLUG_CPU */
130
127/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
128int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
129{ 133{
@@ -134,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
134 return ret; 138 return ret;
135} 139}
136 140
141static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
142 int *nr_calls)
143{
144 int ret;
145
146 ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
147 nr_calls);
148
149 return notifier_to_errno(ret);
150}
151
152static int cpu_notify(unsigned long val, void *v)
153{
154 return __cpu_notify(val, v, -1, NULL);
155}
156
137#ifdef CONFIG_HOTPLUG_CPU 157#ifdef CONFIG_HOTPLUG_CPU
138 158
159static void cpu_notify_nofail(unsigned long val, void *v)
160{
161 BUG_ON(cpu_notify(val, v));
162}
163
139EXPORT_SYMBOL(register_cpu_notifier); 164EXPORT_SYMBOL(register_cpu_notifier);
140 165
141void __ref unregister_cpu_notifier(struct notifier_block *nb) 166void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -164,6 +189,7 @@ static inline void check_for_tasks(int cpu)
164} 189}
165 190
166struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
167 unsigned long mod; 193 unsigned long mod;
168 void *hcpu; 194 void *hcpu;
169}; 195};
@@ -172,6 +198,7 @@ struct take_cpu_down_param {
172static int __ref take_cpu_down(void *_param) 198static int __ref take_cpu_down(void *_param)
173{ 199{
174 struct take_cpu_down_param *param = _param; 200 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
175 int err; 202 int err;
176 203
177 /* Ensure this CPU doesn't handle any more interrupts. */ 204 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -179,9 +206,10 @@ static int __ref take_cpu_down(void *_param)
179 if (err < 0) 206 if (err < 0)
180 return err; 207 return err;
181 208
182 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, 209 cpu_notify(CPU_DYING | param->mod, param->hcpu);
183 param->hcpu);
184 210
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
185 /* Force idle task to run as soon as we yield: it should 213 /* Force idle task to run as soon as we yield: it should
186 immediately notice cpu is offline and die quickly. */ 214 immediately notice cpu is offline and die quickly. */
187 sched_idle_next(); 215 sched_idle_next();
@@ -192,10 +220,10 @@ static int __ref take_cpu_down(void *_param)
192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 220static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
193{ 221{
194 int err, nr_calls = 0; 222 int err, nr_calls = 0;
195 cpumask_var_t old_allowed;
196 void *hcpu = (void *)(long)cpu; 223 void *hcpu = (void *)(long)cpu;
197 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
198 struct take_cpu_down_param tcd_param = { 225 struct take_cpu_down_param tcd_param = {
226 .caller = current,
199 .mod = mod, 227 .mod = mod,
200 .hcpu = hcpu, 228 .hcpu = hcpu,
201 }; 229 };
@@ -206,38 +234,22 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
206 if (!cpu_online(cpu)) 234 if (!cpu_online(cpu))
207 return -EINVAL; 235 return -EINVAL;
208 236
209 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
210 return -ENOMEM;
211
212 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
213 set_cpu_active(cpu, false); 238 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
214 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 239 if (err) {
215 hcpu, -1, &nr_calls);
216 if (err == NOTIFY_BAD) {
217 set_cpu_active(cpu, true);
218
219 nr_calls--; 240 nr_calls--;
220 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 241 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
221 hcpu, nr_calls, NULL);
222 printk("%s: attempt to take down CPU %u failed\n", 242 printk("%s: attempt to take down CPU %u failed\n",
223 __func__, cpu); 243 __func__, cpu);
224 err = -EINVAL;
225 goto out_release; 244 goto out_release;
226 } 245 }
227 246
228 /* Ensure that we are not runnable on dying cpu */
229 cpumask_copy(old_allowed, &current->cpus_allowed);
230 set_cpus_allowed_ptr(current, cpu_active_mask);
231
232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 247 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
233 if (err) { 248 if (err) {
234 set_cpu_active(cpu, true);
235 /* CPU didn't die: tell everyone. Can't complain. */ 249 /* CPU didn't die: tell everyone. Can't complain. */
236 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 250 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
237 hcpu) == NOTIFY_BAD)
238 BUG();
239 251
240 goto out_allowed; 252 goto out_release;
241 } 253 }
242 BUG_ON(cpu_online(cpu)); 254 BUG_ON(cpu_online(cpu));
243 255
@@ -249,22 +261,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
249 __cpu_die(cpu); 261 __cpu_die(cpu);
250 262
251 /* CPU is completely dead: tell everyone. Too late to complain. */ 263 /* CPU is completely dead: tell everyone. Too late to complain. */
252 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, 264 cpu_notify_nofail(CPU_DEAD | mod, hcpu);
253 hcpu) == NOTIFY_BAD)
254 BUG();
255 265
256 check_for_tasks(cpu); 266 check_for_tasks(cpu);
257 267
258out_allowed:
259 set_cpus_allowed_ptr(current, old_allowed);
260out_release: 268out_release:
261 cpu_hotplug_done(); 269 cpu_hotplug_done();
262 if (!err) { 270 if (!err)
263 if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, 271 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
264 hcpu) == NOTIFY_BAD)
265 BUG();
266 }
267 free_cpumask_var(old_allowed);
268 return err; 272 return err;
269} 273}
270 274
@@ -272,9 +276,6 @@ int __ref cpu_down(unsigned int cpu)
272{ 276{
273 int err; 277 int err;
274 278
275 err = stop_machine_create();
276 if (err)
277 return err;
278 cpu_maps_update_begin(); 279 cpu_maps_update_begin();
279 280
280 if (cpu_hotplug_disabled) { 281 if (cpu_hotplug_disabled) {
@@ -286,7 +287,6 @@ int __ref cpu_down(unsigned int cpu)
286 287
287out: 288out:
288 cpu_maps_update_done(); 289 cpu_maps_update_done();
289 stop_machine_destroy();
290 return err; 290 return err;
291} 291}
292EXPORT_SYMBOL(cpu_down); 292EXPORT_SYMBOL(cpu_down);
@@ -303,13 +303,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
303 return -EINVAL; 303 return -EINVAL;
304 304
305 cpu_hotplug_begin(); 305 cpu_hotplug_begin();
306 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 306 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
307 -1, &nr_calls); 307 if (ret) {
308 if (ret == NOTIFY_BAD) {
309 nr_calls--; 308 nr_calls--;
310 printk("%s: attempt to bring up CPU %u failed\n", 309 printk("%s: attempt to bring up CPU %u failed\n",
311 __func__, cpu); 310 __func__, cpu);
312 ret = -EINVAL;
313 goto out_notify; 311 goto out_notify;
314 } 312 }
315 313
@@ -319,15 +317,12 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
319 goto out_notify; 317 goto out_notify;
320 BUG_ON(!cpu_online(cpu)); 318 BUG_ON(!cpu_online(cpu));
321 319
322 set_cpu_active(cpu, true);
323
324 /* Now call notifier in preparation. */ 320 /* Now call notifier in preparation. */
325 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 321 cpu_notify(CPU_ONLINE | mod, hcpu);
326 322
327out_notify: 323out_notify:
328 if (ret != 0) 324 if (ret != 0)
329 __raw_notifier_call_chain(&cpu_chain, 325 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
330 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
331 cpu_hotplug_done(); 326 cpu_hotplug_done();
332 327
333 return ret; 328 return ret;
@@ -336,6 +331,12 @@ out_notify:
336int __cpuinit cpu_up(unsigned int cpu) 331int __cpuinit cpu_up(unsigned int cpu)
337{ 332{
338 int err = 0; 333 int err = 0;
334
335#ifdef CONFIG_MEMORY_HOTPLUG
336 int nid;
337 pg_data_t *pgdat;
338#endif
339
339 if (!cpu_possible(cpu)) { 340 if (!cpu_possible(cpu)) {
340 printk(KERN_ERR "can't online cpu %d because it is not " 341 printk(KERN_ERR "can't online cpu %d because it is not "
341 "configured as may-hotadd at boot time\n", cpu); 342 "configured as may-hotadd at boot time\n", cpu);
@@ -346,6 +347,28 @@ int __cpuinit cpu_up(unsigned int cpu)
346 return -EINVAL; 347 return -EINVAL;
347 } 348 }
348 349
350#ifdef CONFIG_MEMORY_HOTPLUG
351 nid = cpu_to_node(cpu);
352 if (!node_online(nid)) {
353 err = mem_online_node(nid);
354 if (err)
355 return err;
356 }
357
358 pgdat = NODE_DATA(nid);
359 if (!pgdat) {
360 printk(KERN_ERR
361 "Can't online cpu %d due to NULL pgdat\n", cpu);
362 return -ENOMEM;
363 }
364
365 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
366 mutex_lock(&zonelists_mutex);
367 build_all_zonelists(NULL);
368 mutex_unlock(&zonelists_mutex);
369 }
370#endif
371
349 cpu_maps_update_begin(); 372 cpu_maps_update_begin();
350 373
351 if (cpu_hotplug_disabled) { 374 if (cpu_hotplug_disabled) {
@@ -365,11 +388,8 @@ static cpumask_var_t frozen_cpus;
365 388
366int disable_nonboot_cpus(void) 389int disable_nonboot_cpus(void)
367{ 390{
368 int cpu, first_cpu, error; 391 int cpu, first_cpu, error = 0;
369 392
370 error = stop_machine_create();
371 if (error)
372 return error;
373 cpu_maps_update_begin(); 393 cpu_maps_update_begin();
374 first_cpu = cpumask_first(cpu_online_mask); 394 first_cpu = cpumask_first(cpu_online_mask);
375 /* 395 /*
@@ -400,7 +420,6 @@ int disable_nonboot_cpus(void)
400 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 420 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
401 } 421 }
402 cpu_maps_update_done(); 422 cpu_maps_update_done();
403 stop_machine_destroy();
404 return error; 423 return error;
405} 424}
406 425
@@ -467,7 +486,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
467 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) 486 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
468 val = CPU_STARTING_FROZEN; 487 val = CPU_STARTING_FROZEN;
469#endif /* CONFIG_PM_SLEEP_SMP */ 488#endif /* CONFIG_PM_SLEEP_SMP */
470 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 489 cpu_notify(val, (void *)(long)cpu);
471} 490}
472 491
473#endif /* CONFIG_SMP */ 492#endif /* CONFIG_SMP */
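
take_cpu_down() now learns which task initiated the offline through the .caller field of take_cpu_down_param instead of _cpu_down() saving and restoring cpus_allowed around the stop-machine call. The sketch below shows the general pattern of threading caller context through a single void * callback argument; the names are hypothetical, not the kernel structures.

#include <stdio.h>

struct down_param {                 /* stand-in for take_cpu_down_param */
    const char *caller;             /* who requested the offline */
    unsigned int cpu;               /* which CPU is going away */
};

/* Callback with the stop_machine()-style shape: all context arrives packed
 * behind a single void pointer. */
static int take_down(void *arg)
{
    struct down_param *p = arg;

    printf("CPU %u going down on behalf of %s\n", p->cpu, p->caller);
    return 0;
}

/* A real __stop_machine() would run fn on the target CPU with interrupts
 * off; here it is just a direct call. */
static int run_on_cpu(int (*fn)(void *), void *arg)
{
    return fn(arg);
}

int main(void)
{
    struct down_param param = { .caller = "cpu_down", .cpu = 3 };

    return run_on_cpu(take_down, &param);
}
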
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec2..b23c0979bbe7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
105 /* for custom sched domain */ 105 /* for custom sched domain */
106 int relax_domain_level; 106 int relax_domain_level;
107 107
108 /* used for walking a cpuset heirarchy */ 108 /* used for walking a cpuset hierarchy */
109 struct list_head stack_list; 109 struct list_head stack_list;
110}; 110};
111 111
@@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
946 * In order to avoid seeing no nodes if the old and new nodes are disjoint, 946 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
947 * we structure updates as setting all new allowed nodes, then clearing newly 947 * we structure updates as setting all new allowed nodes, then clearing newly
948 * disallowed ones. 948 * disallowed ones.
949 *
950 * Called with task's alloc_lock held
951 */ 949 */
952static void cpuset_change_task_nodemask(struct task_struct *tsk, 950static void cpuset_change_task_nodemask(struct task_struct *tsk,
953 nodemask_t *newmems) 951 nodemask_t *newmems)
954{ 952{
953repeat:
954 /*
955 * Allow tasks that have access to memory reserves because they have
956 * been OOM killed to get memory anywhere.
957 */
958 if (unlikely(test_thread_flag(TIF_MEMDIE)))
959 return;
960 if (current->flags & PF_EXITING) /* Let dying task have memory */
961 return;
962
963 task_lock(tsk);
955 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 964 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
956 mpol_rebind_task(tsk, &tsk->mems_allowed); 965 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
957 mpol_rebind_task(tsk, newmems); 966
967
968 /*
969 * ensure checking ->mems_allowed_change_disable after setting all new
970 * allowed nodes.
971 *
972 * the read-side task can see an nodemask with new allowed nodes and
973 * old allowed nodes. and if it allocates page when cpuset clears newly
974 * disallowed ones continuous, it can see the new allowed bits.
975 *
976 * And if setting all new allowed nodes is after the checking, setting
977 * all new allowed nodes and clearing newly disallowed ones will be done
978 * continuous, and the read-side task may find no node to alloc page.
979 */
980 smp_mb();
981
982 /*
 983 * Memory allocation is very fast, so we need not sleep while waiting
 984 * for the read-side.
985 */
986 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
987 task_unlock(tsk);
988 if (!task_curr(tsk))
989 yield();
990 goto repeat;
991 }
992
993 /*
994 * ensure checking ->mems_allowed_change_disable before clearing all new
995 * disallowed nodes.
996 *
997 * if clearing newly disallowed bits before the checking, the read-side
998 * task may find no node to alloc page.
999 */
1000 smp_mb();
1001
1002 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
958 tsk->mems_allowed = *newmems; 1003 tsk->mems_allowed = *newmems;
1004 task_unlock(tsk);
959} 1005}
960 1006
961/* 1007/*
@@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
978 cs = cgroup_cs(scan->cg); 1024 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, newmems); 1025 guarantee_online_mems(cs, newmems);
980 1026
981 task_lock(p);
982 cpuset_change_task_nodemask(p, newmems); 1027 cpuset_change_task_nodemask(p, newmems);
983 task_unlock(p);
984 1028
985 NODEMASK_FREE(newmems); 1029 NODEMASK_FREE(newmems);
986 1030
@@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1383 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1427 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1384 WARN_ON_ONCE(err); 1428 WARN_ON_ONCE(err);
1385 1429
1386 task_lock(tsk);
1387 cpuset_change_task_nodemask(tsk, to); 1430 cpuset_change_task_nodemask(tsk, to);
1388 task_unlock(tsk);
1389 cpuset_update_task_spread_flag(cs, tsk); 1431 cpuset_update_task_spread_flag(cs, tsk);
1390 1432
1391} 1433}
@@ -2071,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2071 * but making no active use of cpusets. 2113 * but making no active use of cpusets.
2072 * 2114 *
2073 * This routine ensures that top_cpuset.cpus_allowed tracks 2115 * This routine ensures that top_cpuset.cpus_allowed tracks
2074 * cpu_online_map on each CPU hotplug (cpuhp) event. 2116 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2075 * 2117 *
2076 * Called within get_online_cpus(). Needs to call cgroup_lock() 2118 * Called within get_online_cpus(). Needs to call cgroup_lock()
2077 * before calling generate_sched_domains(). 2119 * before calling generate_sched_domains().
2078 */ 2120 */
2079static int cpuset_track_online_cpus(struct notifier_block *unused_nb, 2121void cpuset_update_active_cpus(void)
2080 unsigned long phase, void *unused_cpu)
2081{ 2122{
2082 struct sched_domain_attr *attr; 2123 struct sched_domain_attr *attr;
2083 cpumask_var_t *doms; 2124 cpumask_var_t *doms;
2084 int ndoms; 2125 int ndoms;
2085 2126
2086 switch (phase) {
2087 case CPU_ONLINE:
2088 case CPU_ONLINE_FROZEN:
2089 case CPU_DOWN_PREPARE:
2090 case CPU_DOWN_PREPARE_FROZEN:
2091 case CPU_DOWN_FAILED:
2092 case CPU_DOWN_FAILED_FROZEN:
2093 break;
2094
2095 default:
2096 return NOTIFY_DONE;
2097 }
2098
2099 cgroup_lock(); 2127 cgroup_lock();
2100 mutex_lock(&callback_mutex); 2128 mutex_lock(&callback_mutex);
2101 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2129 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2106,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2106 2134
2107 /* Have scheduler rebuild the domains */ 2135 /* Have scheduler rebuild the domains */
2108 partition_sched_domains(ndoms, doms, attr); 2136 partition_sched_domains(ndoms, doms, attr);
2109
2110 return NOTIFY_OK;
2111} 2137}
2112 2138
2113#ifdef CONFIG_MEMORY_HOTPLUG 2139#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2161,7 +2187,6 @@ void __init cpuset_init_smp(void)
2161 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2187 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2162 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2188 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2163 2189
2164 hotcpu_notifier(cpuset_track_online_cpus, 0);
2165 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2190 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2166 2191
2167 cpuset_wq = create_singlethread_workqueue("cpuset"); 2192 cpuset_wq = create_singlethread_workqueue("cpuset");
@@ -2182,19 +2207,52 @@ void __init cpuset_init_smp(void)
2182void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2207void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2183{ 2208{
2184 mutex_lock(&callback_mutex); 2209 mutex_lock(&callback_mutex);
2185 cpuset_cpus_allowed_locked(tsk, pmask); 2210 task_lock(tsk);
2211 guarantee_online_cpus(task_cs(tsk), pmask);
2212 task_unlock(tsk);
2186 mutex_unlock(&callback_mutex); 2213 mutex_unlock(&callback_mutex);
2187} 2214}
2188 2215
2189/** 2216int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2190 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2191 * Must be called with callback_mutex held.
2192 **/
2193void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2194{ 2217{
2195 task_lock(tsk); 2218 const struct cpuset *cs;
2196 guarantee_online_cpus(task_cs(tsk), pmask); 2219 int cpu;
2197 task_unlock(tsk); 2220
2221 rcu_read_lock();
2222 cs = task_cs(tsk);
2223 if (cs)
2224 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2225 rcu_read_unlock();
2226
2227 /*
2228 * We own tsk->cpus_allowed, nobody can change it under us.
2229 *
2230 * But we used cs && cs->cpus_allowed lockless and thus can
2231 * race with cgroup_attach_task() or update_cpumask() and get
2232 * the wrong tsk->cpus_allowed. However, both cases imply the
2233 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2234 * which takes task_rq_lock().
2235 *
2236 * If we are called after it dropped the lock we must see all
 2237 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily
 2238 * set any mask even if it is not right from the task_cs() point of
 2239 * view; the pending set_cpus_allowed_ptr() will fix things.
2240 */
2241
2242 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2243 if (cpu >= nr_cpu_ids) {
2244 /*
2245 * Either tsk->cpus_allowed is wrong (see above) or it
2246 * is actually empty. The latter case is only possible
2247 * if we are racing with remove_tasks_in_empty_cpuset().
 2248 * As above, we can temporarily set any mask and rely on
 2249 * set_cpus_allowed_ptr() as the synchronization point.
2250 */
2251 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2252 cpu = cpumask_any(cpu_active_mask);
2253 }
2254
2255 return cpu;
2198} 2256}
2199 2257
2200void cpuset_init_current_mems_allowed(void) 2258void cpuset_init_current_mems_allowed(void)
@@ -2383,22 +2441,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2383} 2441}
2384 2442
2385/** 2443/**
2386 * cpuset_lock - lock out any changes to cpuset structures
2387 *
2388 * The out of memory (oom) code needs to mutex_lock cpusets
2389 * from being changed while it scans the tasklist looking for a
2390 * task in an overlapping cpuset. Expose callback_mutex via this
2391 * cpuset_lock() routine, so the oom code can lock it, before
2392 * locking the task list. The tasklist_lock is a spinlock, so
2393 * must be taken inside callback_mutex.
2394 */
2395
2396void cpuset_lock(void)
2397{
2398 mutex_lock(&callback_mutex);
2399}
2400
2401/**
2402 * cpuset_unlock - release lock on cpuset changes 2444 * cpuset_unlock - release lock on cpuset changes
2403 * 2445 *
2404 * Undo the lock taken in a previous cpuset_lock() call. 2446 * Undo the lock taken in a previous cpuset_lock() call.
@@ -2410,7 +2452,8 @@ void cpuset_unlock(void)
2410} 2452}
2411 2453
2412/** 2454/**
2413 * cpuset_mem_spread_node() - On which node to begin search for a page 2455 * cpuset_mem_spread_node() - On which node to begin search for a file page
2456 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2414 * 2457 *
2415 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 2458 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2416 * tasks in a cpuset with is_spread_page or is_spread_slab set), 2459 * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2435,16 +2478,27 @@ void cpuset_unlock(void)
2435 * See kmem_cache_alloc_node(). 2478 * See kmem_cache_alloc_node().
2436 */ 2479 */
2437 2480
2438int cpuset_mem_spread_node(void) 2481static int cpuset_spread_node(int *rotor)
2439{ 2482{
2440 int node; 2483 int node;
2441 2484
2442 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); 2485 node = next_node(*rotor, current->mems_allowed);
2443 if (node == MAX_NUMNODES) 2486 if (node == MAX_NUMNODES)
2444 node = first_node(current->mems_allowed); 2487 node = first_node(current->mems_allowed);
2445 current->cpuset_mem_spread_rotor = node; 2488 *rotor = node;
2446 return node; 2489 return node;
2447} 2490}
2491
2492int cpuset_mem_spread_node(void)
2493{
2494 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2495}
2496
2497int cpuset_slab_spread_node(void)
2498{
2499 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2500}
2501
2448EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); 2502EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2449 2503
2450/** 2504/**
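
cpuset_mem_spread_node() and the new cpuset_slab_spread_node() are now thin wrappers around one rotor-based helper that walks the task's allowed nodes round-robin. A userspace sketch of that selection follows; the bitmask helpers are simplified stand-ins for the kernel's next_node()/first_node(), and the node count is arbitrary.

#include <stdio.h>

#define MAX_NODES 8

/* Simplified next_node(): return the lowest set bit strictly above 'start',
 * or MAX_NODES when there is none. */
static int next_node_after(int start, unsigned int mask)
{
    for (int n = start + 1; n < MAX_NODES; n++)
        if (mask & (1u << n))
            return n;
    return MAX_NODES;
}

/* Simplified first_node(). */
static int first_node_in(unsigned int mask)
{
    return next_node_after(-1, mask);
}

/* Analogue of the new cpuset_spread_node(): advance the caller's rotor to
 * the next allowed node, wrapping around at the end of the mask. */
static int spread_node(int *rotor, unsigned int allowed)
{
    int node = next_node_after(*rotor, allowed);

    if (node == MAX_NODES)
        node = first_node_in(allowed);
    *rotor = node;
    return node;
}

int main(void)
{
    unsigned int allowed = (1u << 1) | (1u << 3) | (1u << 6);
    int mem_rotor = 0, slab_rotor = 0;  /* separate rotors, as in task_struct */

    for (int i = 0; i < 5; i++)
        printf("page alloc %d -> node %d\n", i, spread_node(&mem_rotor, allowed));
    printf("slab alloc   -> node %d\n", spread_node(&slab_rotor, allowed));
    return 0;
}
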
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
index 62af1816c235..9a3e22641fe7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -17,16 +17,11 @@
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
20#include "cred-internals.h"
21 20
22#if 0 21#if 0
23#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
24 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
25#else 24#else
26static inline __attribute__((format(printf, 1, 2)))
27void no_printk(const char *fmt, ...)
28{
29}
30#define kdebug(FMT, ...) \ 25#define kdebug(FMT, ...) \
31 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 26 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
32#endif 27#endif
@@ -210,6 +205,31 @@ void exit_creds(struct task_struct *tsk)
210 } 205 }
211} 206}
212 207
208/**
209 * get_task_cred - Get another task's objective credentials
210 * @task: The task to query
211 *
212 * Get the objective credentials of a task, pinning them so that they can't go
213 * away. Accessing a task's credentials directly is not permitted.
214 *
215 * The caller must also make sure task doesn't get deleted, either by holding a
216 * ref on task or by holding tasklist_lock to prevent it from being unlinked.
217 */
218const struct cred *get_task_cred(struct task_struct *task)
219{
220 const struct cred *cred;
221
222 rcu_read_lock();
223
224 do {
225 cred = __task_cred((task));
226 BUG_ON(!cred);
227 } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
228
229 rcu_read_unlock();
230 return cred;
231}
232
213/* 233/*
214 * Allocate blank credentials, such that the credentials can be filled in at a 234 * Allocate blank credentials, such that the credentials can be filled in at a
215 * later date without risk of ENOMEM. 235 * later date without risk of ENOMEM.
@@ -348,66 +368,6 @@ struct cred *prepare_exec_creds(void)
348} 368}
349 369
350/* 370/*
351 * prepare new credentials for the usermode helper dispatcher
352 */
353struct cred *prepare_usermodehelper_creds(void)
354{
355#ifdef CONFIG_KEYS
356 struct thread_group_cred *tgcred = NULL;
357#endif
358 struct cred *new;
359
360#ifdef CONFIG_KEYS
361 tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
362 if (!tgcred)
363 return NULL;
364#endif
365
366 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
367 if (!new)
368 goto free_tgcred;
369
370 kdebug("prepare_usermodehelper_creds() alloc %p", new);
371
372 memcpy(new, &init_cred, sizeof(struct cred));
373
374 atomic_set(&new->usage, 1);
375 set_cred_subscribers(new, 0);
376 get_group_info(new->group_info);
377 get_uid(new->user);
378
379#ifdef CONFIG_KEYS
380 new->thread_keyring = NULL;
381 new->request_key_auth = NULL;
382 new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
383
384 atomic_set(&tgcred->usage, 1);
385 spin_lock_init(&tgcred->lock);
386 new->tgcred = tgcred;
387#endif
388
389#ifdef CONFIG_SECURITY
390 new->security = NULL;
391#endif
392 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
393 goto error;
394 validate_creds(new);
395
396 BUG_ON(atomic_read(&new->usage) != 1);
397 return new;
398
399error:
400 put_cred(new);
401 return NULL;
402
403free_tgcred:
404#ifdef CONFIG_KEYS
405 kfree(tgcred);
406#endif
407 return NULL;
408}
409
410/*
411 * Copy credentials for the new process created by fork() 371 * Copy credentials for the new process created by fork()
412 * 372 *
413 * We share if we can, but under some circumstances we have to generate a new 373 * We share if we can, but under some circumstances we have to generate a new
@@ -523,8 +483,6 @@ int commit_creds(struct cred *new)
523#endif 483#endif
524 BUG_ON(atomic_read(&new->usage) < 1); 484 BUG_ON(atomic_read(&new->usage) < 1);
525 485
526 security_commit_creds(new, old);
527
528 get_cred(new); /* we will require a ref for the subj creds too */ 486 get_cred(new); /* we will require a ref for the subj creds too */
529 487
530 /* dumpability changes */ 488 /* dumpability changes */
@@ -560,8 +518,6 @@ int commit_creds(struct cred *new)
560 atomic_dec(&old->user->processes); 518 atomic_dec(&old->user->processes);
561 alter_cred_subscribers(old, -2); 519 alter_cred_subscribers(old, -2);
562 520
563 sched_switch_user(task);
564
565 521 	/* send notifications */
566 522 	if (new->uid != old->uid ||
567 523 	    new->euid != old->euid ||
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 000000000000..a85edc339985
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
1#
2# Makefile for the linux kernel debugger
3#
4
5obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
6obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 000000000000..de407c78178d
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,985 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/string.h>
41#include <linux/delay.h>
42#include <linux/sched.h>
43#include <linux/sysrq.h>
44#include <linux/init.h>
45#include <linux/kgdb.h>
46#include <linux/kdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55
56#include "debug_core.h"
57
58static int kgdb_break_asap;
59
60struct debuggerinfo_struct kgdb_info[NR_CPUS];
61
62/**
63 * kgdb_connected - Is a host GDB connected to us?
64 */
65int kgdb_connected;
66EXPORT_SYMBOL_GPL(kgdb_connected);
67
68/* All the KGDB handlers are installed */
69int kgdb_io_module_registered;
70
71/* Guard for recursive entry */
72static int exception_level;
73
74struct kgdb_io *dbg_io_ops;
75static DEFINE_SPINLOCK(kgdb_registration_lock);
76
77/* kgdb console driver is loaded */
78static int kgdb_con_registered;
79/* determine if kgdb console output should be used */
80static int kgdb_use_con;
81/* Flag for alternate operations for early debugging */
82bool dbg_is_early = true;
83/* Next cpu to become the master debug core */
84int dbg_switch_cpu;
85
86/* Use kdb or gdbserver mode */
87int dbg_kdb_mode = 1;
88
89static int __init opt_kgdb_con(char *str)
90{
91 kgdb_use_con = 1;
92 return 0;
93}
94
95early_param("kgdbcon", opt_kgdb_con);
96
97module_param(kgdb_use_con, int, 0644);
98
99/*
100 * Holds information about breakpoints in a kernel. These breakpoints are
101 * added and removed by gdb.
102 */
103static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
104 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
105};
106
107/*
108 * The CPU# of the active CPU, or -1 if none:
109 */
110atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active);
112
113/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet):
116 */
117static atomic_t passive_cpu_wait[NR_CPUS];
118static atomic_t cpu_in_kgdb[NR_CPUS];
119static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint;
121
122struct task_struct *kgdb_usethread;
123struct task_struct *kgdb_contthread;
124
125int kgdb_single_step;
126static pid_t kgdb_sstep_pid;
127
128/* to keep track of the CPU which is doing the single stepping*/
129atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
130
131/*
132 * If you are debugging a problem where roundup (the collection of
133 * all other CPUs) is a problem [this should be extremely rare],
134 * then use the nokgdbroundup option to avoid roundup. In that case
135 * the other CPUs might interfere with your debugging context, so
136 * use this with care:
137 */
138static int kgdb_do_roundup = 1;
139
140static int __init opt_nokgdbroundup(char *str)
141{
142 kgdb_do_roundup = 0;
143
144 return 0;
145}
146
147early_param("nokgdbroundup", opt_nokgdbroundup);
148
149/*
150 * Finally, some KGDB code :-)
151 */
152
153/*
154 * Weak aliases for breakpoint management,
155 * can be overridden by architectures when needed:
156 */
157int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
158{
159 int err;
160
161 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
162 if (err)
163 return err;
164
165 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
166 BREAK_INSTR_SIZE);
167}
168
169int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
170{
171 return probe_kernel_write((char *)addr,
172 (char *)bundle, BREAK_INSTR_SIZE);
173}
174
175int __weak kgdb_validate_break_address(unsigned long addr)
176{
177 char tmp_variable[BREAK_INSTR_SIZE];
178 int err;
179 /* Validate setting the breakpoint and then removing it. If the
180 * remove fails, the kernel needs to emit a bad message because we
181 * are in deep trouble, not being able to put things back the way we
182 * found them.
183 */
184 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
185 if (err)
186 return err;
187 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
188 if (err)
189 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
190 "memory destroyed at: %lx", addr);
191 return err;
192}
193
194unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
195{
196 return instruction_pointer(regs);
197}
198
199int __weak kgdb_arch_init(void)
200{
201 return 0;
202}
203
204int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
205{
206 return 0;
207}
208
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling an exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/*
222 * Some architectures need cache flushes when we set/clear a
223 * breakpoint:
224 */
225static void kgdb_flush_swbreak_addr(unsigned long addr)
226{
227 if (!CACHE_FLUSH_IS_SAFE)
228 return;
229
230 if (current->mm && current->mm->mmap_cache) {
231 flush_cache_range(current->mm->mmap_cache,
232 addr, addr + BREAK_INSTR_SIZE);
233 }
234 /* Force flush instruction cache if it was outside the mm */
235 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
236}
237
238/*
239 * SW breakpoint management:
240 */
241int dbg_activate_sw_breakpoints(void)
242{
243 unsigned long addr;
244 int error;
245 int ret = 0;
246 int i;
247
248 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
249 if (kgdb_break[i].state != BP_SET)
250 continue;
251
252 addr = kgdb_break[i].bpt_addr;
253 error = kgdb_arch_set_breakpoint(addr,
254 kgdb_break[i].saved_instr);
255 if (error) {
256 ret = error;
257 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
258 continue;
259 }
260
261 kgdb_flush_swbreak_addr(addr);
262 kgdb_break[i].state = BP_ACTIVE;
263 }
264 return ret;
265}
266
267int dbg_set_sw_break(unsigned long addr)
268{
269 int err = kgdb_validate_break_address(addr);
270 int breakno = -1;
271 int i;
272
273 if (err)
274 return err;
275
276 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
277 if ((kgdb_break[i].state == BP_SET) &&
278 (kgdb_break[i].bpt_addr == addr))
279 return -EEXIST;
280 }
281 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
282 if (kgdb_break[i].state == BP_REMOVED &&
283 kgdb_break[i].bpt_addr == addr) {
284 breakno = i;
285 break;
286 }
287 }
288
289 if (breakno == -1) {
290 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
291 if (kgdb_break[i].state == BP_UNDEFINED) {
292 breakno = i;
293 break;
294 }
295 }
296 }
297
298 if (breakno == -1)
299 return -E2BIG;
300
301 kgdb_break[breakno].state = BP_SET;
302 kgdb_break[breakno].type = BP_BREAKPOINT;
303 kgdb_break[breakno].bpt_addr = addr;
304
305 return 0;
306}
307
308int dbg_deactivate_sw_breakpoints(void)
309{
310 unsigned long addr;
311 int error;
312 int ret = 0;
313 int i;
314
315 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
316 if (kgdb_break[i].state != BP_ACTIVE)
317 continue;
318 addr = kgdb_break[i].bpt_addr;
319 error = kgdb_arch_remove_breakpoint(addr,
320 kgdb_break[i].saved_instr);
321 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
323 ret = error;
324 }
325
326 kgdb_flush_swbreak_addr(addr);
327 kgdb_break[i].state = BP_SET;
328 }
329 return ret;
330}
331
332int dbg_remove_sw_break(unsigned long addr)
333{
334 int i;
335
336 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
337 if ((kgdb_break[i].state == BP_SET) &&
338 (kgdb_break[i].bpt_addr == addr)) {
339 kgdb_break[i].state = BP_REMOVED;
340 return 0;
341 }
342 }
343 return -ENOENT;
344}
345
346int kgdb_isremovedbreak(unsigned long addr)
347{
348 int i;
349
350 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
351 if ((kgdb_break[i].state == BP_REMOVED) &&
352 (kgdb_break[i].bpt_addr == addr))
353 return 1;
354 }
355 return 0;
356}
357
358int dbg_remove_all_break(void)
359{
360 unsigned long addr;
361 int error;
362 int i;
363
364 /* Clear memory breakpoints. */
365 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
366 if (kgdb_break[i].state != BP_ACTIVE)
367 goto setundefined;
368 addr = kgdb_break[i].bpt_addr;
369 error = kgdb_arch_remove_breakpoint(addr,
370 kgdb_break[i].saved_instr);
371 if (error)
372 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
373 addr);
374setundefined:
375 kgdb_break[i].state = BP_UNDEFINED;
376 }
377
378 /* Clear hardware breakpoints. */
379 if (arch_kgdb_ops.remove_all_hw_break)
380 arch_kgdb_ops.remove_all_hw_break();
381
382 return 0;
383}
384
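The software-breakpoint table above is a per-slot state machine: BP_UNDEFINED becomes BP_SET when a breakpoint is requested, BP_SET becomes BP_ACTIVE when the trap instruction is actually written, falls back to BP_SET on deactivate, and BP_REMOVED marks a slot whose address may be reused. The userspace sketch below models only those transitions so the flow of dbg_set_sw_break()/dbg_activate_sw_breakpoints() is easier to follow; the enum, array and function names are illustrative stand-ins, not the kernel structures.

#include <stdio.h>

enum bp_state { BP_UNDEF, BP_SET, BP_ACTIVE, BP_REMOVED };

#define MAX_BP 4

static struct { enum bp_state state; unsigned long addr; } bp[MAX_BP];

/* Reserve a slot for addr, preferring a REMOVED slot that already holds it. */
static int set_break(unsigned long addr)
{
	int i, slot = -1;

	for (i = 0; i < MAX_BP; i++) {
		if (bp[i].state == BP_SET && bp[i].addr == addr)
			return -1;			/* already requested */
		if (bp[i].state == BP_REMOVED && bp[i].addr == addr)
			slot = i;			/* reuse the old slot */
		else if (slot < 0 && bp[i].state == BP_UNDEF)
			slot = i;
	}
	if (slot < 0)
		return -1;				/* table full */
	bp[slot].state = BP_SET;
	bp[slot].addr = addr;
	return 0;
}

/* "Patch" every requested breakpoint; in the kernel this writes the trap insn. */
static void activate_all(void)
{
	for (int i = 0; i < MAX_BP; i++)
		if (bp[i].state == BP_SET)
			bp[i].state = BP_ACTIVE;
}

int main(void)
{
	set_break(0xc0def00d);
	activate_all();
	printf("slot 0: state=%d addr=%#lx\n", bp[0].state, bp[0].addr);
	return 0;
}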
385/*
386 * Return true if there is a valid kgdb I/O module. Also if no
387 * debugger is attached a message can be printed to the console about
388 * waiting for the debugger to attach.
389 *
390 * The print_wait argument is only to be true when called from inside
391 * the core kgdb_handle_exception, because it will wait for the
392 * debugger to attach.
393 */
394static int kgdb_io_ready(int print_wait)
395{
396 if (!dbg_io_ops)
397 return 0;
398 if (kgdb_connected)
399 return 1;
400 if (atomic_read(&kgdb_setting_breakpoint))
401 return 1;
402 if (print_wait) {
403#ifdef CONFIG_KGDB_KDB
404 if (!dbg_kdb_mode)
405 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
406#else
407 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
408#endif
409 }
410 return 1;
411}
412
413static int kgdb_reenter_check(struct kgdb_state *ks)
414{
415 unsigned long addr;
416
417 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
418 return 0;
419
420 /* Panic on recursive debugger calls: */
421 exception_level++;
422 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
423 dbg_deactivate_sw_breakpoints();
424
425 /*
426 * If the breakpoint was removed successfully at the place the
427 * exception occurred, try to recover and print a warning to the end
428 * user, because the user planted a breakpoint in a place that
429 * KGDB needs in order to function.
430 */
431 if (dbg_remove_sw_break(addr) == 0) {
432 exception_level = 0;
433 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
434 dbg_activate_sw_breakpoints();
435 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
436 addr);
437 WARN_ON_ONCE(1);
438
439 return 1;
440 }
441 dbg_remove_all_break();
442 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
443
444 if (exception_level > 1) {
445 dump_stack();
446 panic("Recursive entry to debugger");
447 }
448
449 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
450#ifdef CONFIG_KGDB_KDB
451 /* Allow kdb to debug itself one level */
452 return 0;
453#endif
454 dump_stack();
455 panic("Recursive entry to debugger");
456
457 return 1;
458}
459
460static void dbg_cpu_switch(int cpu, int next_cpu)
461{
462 /* Mark the cpu we are switching away from as a slave when it
463 * holds the kgdb_active token. This must be done so that all
464 * the cpus waiting in the debug core will not enter again
465 * as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471}
472
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
474{
475 unsigned long flags;
476 int sstep_tries = 100;
477 int error;
478 int i, cpu;
479 int trace_on = 0;
480acquirelock:
481 /*
482 * Interrupts will be restored by the 'trap return' code, except when
483 * single stepping.
484 */
485 local_irq_save(flags);
486
487 cpu = ks->cpu;
488 kgdb_info[cpu].debuggerinfo = regs;
489 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497
498 if (exception_level == 1)
499 goto cpu_master_loop;
500
501 /*
502 * CPU will loop if it is a slave or request to become a kgdb
503 * master cpu and acquire the kgdb_active lock:
504 */
505 while (1) {
506cpu_loop:
507 if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
512 break;
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu]))
515 goto return_normal;
516 } else {
517return_normal:
518 /* Return to normal operation by executing any
519 * hw breakpoint fixup.
520 */
521 if (arch_kgdb_ops.correct_hw_break)
522 arch_kgdb_ops.correct_hw_break();
523 if (trace_on)
524 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]);
526 touch_softlockup_watchdog_sync();
527 clocksource_touch_watchdog();
528 local_irq_restore(flags);
529 return 0;
530 }
531 cpu_relax();
532 }
533
534 /*
535 * For single stepping, try to only enter on the processor
536 * that was single stepping. To guard against a deadlock, the
537 * kernel will only try for the value of sstep_tries before
538 * giving up and continuing on.
539 */
540 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
541 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync();
545 clocksource_touch_watchdog();
546 local_irq_restore(flags);
547
548 goto acquirelock;
549 }
550
551 if (!kgdb_io_ready(1)) {
552 kgdb_info[cpu].ret_state = 1;
553 goto kgdb_restore; /* No I/O connection, resume the system */
554 }
555
556 /*
557 * Don't enter if we have hit a removed breakpoint.
558 */
559 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
560 goto kgdb_restore;
561
562 /* Call the I/O driver's pre_exception routine */
563 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception();
565
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /*
569 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active
571 */
572 if (!kgdb_single_step) {
573 for (i = 0; i < NR_CPUS; i++)
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576
577#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */
579 if ((!kgdb_single_step) && kgdb_do_roundup)
580 kgdb_roundup_cpus(flags);
581#endif
582
583 /*
584 * Wait for the other CPUs to be notified and be waiting for us:
585 */
586 for_each_online_cpu(i) {
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
588 cpu_relax();
589 }
590
591 /*
592 * At this point the primary processor is completely
593 * in the debugger and all secondary CPUs are quiescent
594 */
595 dbg_deactivate_sw_breakpoints();
596 kgdb_single_step = 0;
597 kgdb_contthread = current;
598 exception_level = 0;
599 trace_on = tracing_is_on();
600 if (trace_on)
601 tracing_off();
602
603 while (1) {
604cpu_master_loop:
605 if (dbg_kdb_mode) {
606 kgdb_connected = 1;
607 error = kdb_stub(ks);
608 if (error == -1)
609 continue;
610 kgdb_connected = 0;
611 } else {
612 error = gdb_serial_stub(ks);
613 }
614
615 if (error == DBG_PASS_EVENT) {
616 dbg_kdb_mode = !dbg_kdb_mode;
617 } else if (error == DBG_SWITCH_CPU_EVENT) {
618 dbg_cpu_switch(cpu, dbg_switch_cpu);
619 goto cpu_loop;
620 } else {
621 kgdb_info[cpu].ret_state = error;
622 break;
623 }
624 }
625
626 /* Call the I/O driver's post_exception routine */
627 if (dbg_io_ops->post_exception)
628 dbg_io_ops->post_exception();
629
630 atomic_dec(&cpu_in_kgdb[ks->cpu]);
631
632 if (!kgdb_single_step) {
633 for (i = NR_CPUS-1; i >= 0; i--)
634 atomic_dec(&passive_cpu_wait[i]);
635 /*
636 * Wait till all the CPUs have quit from the debugger,
637 * but allow a CPU that hit an exception and is
638 * waiting to become the master to remain in the debug
639 * core.
640 */
641 for_each_online_cpu(i) {
642 while (kgdb_do_roundup &&
643 atomic_read(&cpu_in_kgdb[i]) &&
644 !(kgdb_info[i].exception_state &
645 DCPU_WANT_MASTER))
646 cpu_relax();
647 }
648 }
649
650kgdb_restore:
651 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
652 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
653 if (kgdb_info[sstep_cpu].task)
654 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
655 else
656 kgdb_sstep_pid = 0;
657 }
658 if (trace_on)
659 tracing_on();
660 /* Free kgdb_active */
661 atomic_set(&kgdb_active, -1);
662 touch_softlockup_watchdog_sync();
663 clocksource_touch_watchdog();
664 local_irq_restore(flags);
665
666 return kgdb_info[cpu].ret_state;
667}
668
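kgdb_cpu_enter() elects exactly one master CPU by racing on kgdb_active (initialised to -1) with an atomic compare-and-exchange, while the remaining CPUs park as slaves until the master drops the token. The pthread sketch below models only that election with C11 atomics, threads standing in for CPUs; names such as enter_debugger and NCPUS are invented, and the real round-up, single-step and restore paths are omitted.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int active = -1;	/* -1: no master, otherwise the master's id */

static void *enter_debugger(void *arg)
{
	int cpu = (int)(long)arg;
	int expected = -1;

	/* Try to become the master; losers spin as "slaves". */
	if (atomic_compare_exchange_strong(&active, &expected, cpu)) {
		printf("cpu %d is master\n", cpu);
		atomic_store(&active, -1);	/* release the token on exit */
	} else {
		while (atomic_load(&active) != -1)
			;			/* passive wait, cf. passive_cpu_wait[] */
		printf("cpu %d resumed\n", cpu);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];

	for (long i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, enter_debugger, (void *)i);
	for (int i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	return 0;
}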
669/*
670 * kgdb_handle_exception() - main entry point from a kernel exception
671 *
672 * Locking hierarchy:
673 * interface locks, if any (begin_session)
674 * kgdb lock (kgdb_active)
675 */
676int
677kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
678{
679 struct kgdb_state kgdb_var;
680 struct kgdb_state *ks = &kgdb_var;
681 int ret;
682
683 ks->cpu = raw_smp_processor_id();
684 ks->ex_vector = evector;
685 ks->signo = signo;
686 ks->err_code = ecode;
687 ks->kgdb_usethreadid = 0;
688 ks->linux_regs = regs;
689
690 if (kgdb_reenter_check(ks))
691 return 0; /* Ouch, double exception ! */
692 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
693 ret = kgdb_cpu_enter(ks, regs);
694 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
695 DCPU_IS_SLAVE);
696 return ret;
697}
698
699int kgdb_nmicallback(int cpu, void *regs)
700{
701#ifdef CONFIG_SMP
702 struct kgdb_state kgdb_var;
703 struct kgdb_state *ks = &kgdb_var;
704
705 memset(ks, 0, sizeof(struct kgdb_state));
706 ks->cpu = cpu;
707 ks->linux_regs = regs;
708
709 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
710 atomic_read(&kgdb_active) != -1 &&
711 atomic_read(&kgdb_active) != cpu) {
712 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
713 kgdb_cpu_enter(ks, regs);
714 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
715 return 0;
716 }
717#endif
718 return 1;
719}
720
721static void kgdb_console_write(struct console *co, const char *s,
722 unsigned count)
723{
724 unsigned long flags;
725
726 /* If we're debugging, or KGDB has not connected, don't try
727 * and print. */
728 if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
729 return;
730
731 local_irq_save(flags);
732 gdbstub_msg_write(s, count);
733 local_irq_restore(flags);
734}
735
736static struct console kgdbcons = {
737 .name = "kgdb",
738 .write = kgdb_console_write,
739 .flags = CON_PRINTBUFFER | CON_ENABLED,
740 .index = -1,
741};
742
743#ifdef CONFIG_MAGIC_SYSRQ
744static void sysrq_handle_dbg(int key)
745{
746 if (!dbg_io_ops) {
747 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
748 return;
749 }
750 if (!kgdb_connected) {
751#ifdef CONFIG_KGDB_KDB
752 if (!dbg_kdb_mode)
753 printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
754#else
755 printk(KERN_CRIT "Entering KGDB\n");
756#endif
757 }
758
759 kgdb_breakpoint();
760}
761
762static struct sysrq_key_op sysrq_dbg_op = {
763 .handler = sysrq_handle_dbg,
764 .help_msg = "debug(G)",
765 .action_msg = "DEBUG",
766};
767#endif
768
769static int kgdb_panic_event(struct notifier_block *self,
770 unsigned long val,
771 void *data)
772{
773 if (dbg_kdb_mode)
774 kdb_printf("PANIC: %s\n", (char *)data);
775 kgdb_breakpoint();
776 return NOTIFY_DONE;
777}
778
779static struct notifier_block kgdb_panic_event_nb = {
780 .notifier_call = kgdb_panic_event,
781 .priority = INT_MAX,
782};
783
784void __weak kgdb_arch_late(void)
785{
786}
787
788void __init dbg_late_init(void)
789{
790 dbg_is_early = false;
791 if (kgdb_io_module_registered)
792 kgdb_arch_late();
793 kdb_init(KDB_INIT_FULL);
794}
795
796static void kgdb_register_callbacks(void)
797{
798 if (!kgdb_io_module_registered) {
799 kgdb_io_module_registered = 1;
800 kgdb_arch_init();
801 if (!dbg_is_early)
802 kgdb_arch_late();
803 atomic_notifier_chain_register(&panic_notifier_list,
804 &kgdb_panic_event_nb);
805#ifdef CONFIG_MAGIC_SYSRQ
806 register_sysrq_key('g', &sysrq_dbg_op);
807#endif
808 if (kgdb_use_con && !kgdb_con_registered) {
809 register_console(&kgdbcons);
810 kgdb_con_registered = 1;
811 }
812 }
813}
814
815static void kgdb_unregister_callbacks(void)
816{
817 /*
818 * When this routine is called KGDB should unregister from the
819 * panic handler and clean up, making sure it is not handling any
820 * break exceptions at the time.
821 */
822 if (kgdb_io_module_registered) {
823 kgdb_io_module_registered = 0;
824 atomic_notifier_chain_unregister(&panic_notifier_list,
825 &kgdb_panic_event_nb);
826 kgdb_arch_exit();
827#ifdef CONFIG_MAGIC_SYSRQ
828 unregister_sysrq_key('g', &sysrq_dbg_op);
829#endif
830 if (kgdb_con_registered) {
831 unregister_console(&kgdbcons);
832 kgdb_con_registered = 0;
833 }
834 }
835}
836
837/*
838 * There are times a tasklet needs to be used instead of a compiled-in
839 * breakpoint so as to cause an exception outside a kgdb I/O module,
840 * such as is the case with kgdboe, where calling a breakpoint in the
841 * I/O driver itself would be fatal.
842 */
843static void kgdb_tasklet_bpt(unsigned long ing)
844{
845 kgdb_breakpoint();
846 atomic_set(&kgdb_break_tasklet_var, 0);
847}
848
849static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
850
851void kgdb_schedule_breakpoint(void)
852{
853 if (atomic_read(&kgdb_break_tasklet_var) ||
854 atomic_read(&kgdb_active) != -1 ||
855 atomic_read(&kgdb_setting_breakpoint))
856 return;
857 atomic_inc(&kgdb_break_tasklet_var);
858 tasklet_schedule(&kgdb_tasklet_breakpoint);
859}
860EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
861
862static void kgdb_initial_breakpoint(void)
863{
864 kgdb_break_asap = 0;
865
866 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
867 kgdb_breakpoint();
868}
869
870/**
871 * kgdb_register_io_module - register KGDB IO module
872 * @new_dbg_io_ops: the io ops vector
873 *
874 * Register it with the KGDB core.
875 */
876int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
877{
878 int err;
879
880 spin_lock(&kgdb_registration_lock);
881
882 if (dbg_io_ops) {
883 spin_unlock(&kgdb_registration_lock);
884
885 printk(KERN_ERR "kgdb: Another I/O driver is already "
886 "registered with KGDB.\n");
887 return -EBUSY;
888 }
889
890 if (new_dbg_io_ops->init) {
891 err = new_dbg_io_ops->init();
892 if (err) {
893 spin_unlock(&kgdb_registration_lock);
894 return err;
895 }
896 }
897
898 dbg_io_ops = new_dbg_io_ops;
899
900 spin_unlock(&kgdb_registration_lock);
901
902 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
903 new_dbg_io_ops->name);
904
905 /* Arm KGDB now. */
906 kgdb_register_callbacks();
907
908 if (kgdb_break_asap)
909 kgdb_initial_breakpoint();
910
911 return 0;
912}
913EXPORT_SYMBOL_GPL(kgdb_register_io_module);
914
915/**
916 * kgdb_unregister_io_module - unregister KGDB IO module
917 * @old_dbg_io_ops: the io ops vector
918 *
919 * Unregister it with the KGDB core.
920 */
921void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
922{
923 BUG_ON(kgdb_connected);
924
925 /*
926 * KGDB is no longer able to communicate out, so
927 * unregister our callbacks and reset state.
928 */
929 kgdb_unregister_callbacks();
930
931 spin_lock(&kgdb_registration_lock);
932
933 WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
934 dbg_io_ops = NULL;
935
936 spin_unlock(&kgdb_registration_lock);
937
938 printk(KERN_INFO
939 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
940 old_dbg_io_ops->name);
941}
942EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
943
944int dbg_io_get_char(void)
945{
946 int ret = dbg_io_ops->read_char();
947 if (ret == NO_POLL_CHAR)
948 return -1;
949 if (!dbg_kdb_mode)
950 return ret;
951 if (ret == 127)
952 return 8;
953 return ret;
954}
955
956/**
957 * kgdb_breakpoint - generate breakpoint exception
958 *
959 * This function will generate a breakpoint exception. It is used at the
960 * beginning of a program to sync up with a debugger and can be used
961 * otherwise as a quick means to stop program execution and "break" into
962 * the debugger.
963 */
964void kgdb_breakpoint(void)
965{
966 atomic_inc(&kgdb_setting_breakpoint);
967 wmb(); /* Sync point before breakpoint */
968 arch_kgdb_breakpoint();
969 wmb(); /* Sync point after breakpoint */
970 atomic_dec(&kgdb_setting_breakpoint);
971}
972EXPORT_SYMBOL_GPL(kgdb_breakpoint);
973
974static int __init opt_kgdb_wait(char *str)
975{
976 kgdb_break_asap = 1;
977
978 kdb_init(KDB_INIT_EARLY);
979 if (kgdb_io_module_registered)
980 kgdb_initial_breakpoint();
981
982 return 0;
983}
984
985early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 000000000000..c5d753d80f67
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,81 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#ifndef _DEBUG_CORE_H_
12#define _DEBUG_CORE_H_
13/*
14 * These are the private implementation headers between the kernel
15 * debugger core and the debugger front end code.
16 */
17
18/* kernel debug core data structures */
19struct kgdb_state {
20 int ex_vector;
21 int signo;
22 int err_code;
23 int cpu;
24 int pass_exception;
25 unsigned long thr_query;
26 unsigned long threadid;
27 long kgdb_usethreadid;
28 struct pt_regs *linux_regs;
29};
30
31/* Exception state values */
32#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
33#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
34#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
35#define DCPU_SSTEP 0x8 /* CPU is single stepping */
36
37struct debuggerinfo_struct {
38 void *debuggerinfo;
39 struct task_struct *task;
40 int exception_state;
41 int ret_state;
42 int irq_depth;
43};
44
45extern struct debuggerinfo_struct kgdb_info[];
46
47/* kernel debug core break point routines */
48extern int dbg_remove_all_break(void);
49extern int dbg_set_sw_break(unsigned long addr);
50extern int dbg_remove_sw_break(unsigned long addr);
51extern int dbg_activate_sw_breakpoints(void);
52extern int dbg_deactivate_sw_breakpoints(void);
53
54/* polled character access to i/o module */
55extern int dbg_io_get_char(void);
56
57/* stub return value for switching between the gdbstub and kdb */
58#define DBG_PASS_EVENT -12345
59/* Switch from one cpu to another */
60#define DBG_SWITCH_CPU_EVENT -123456
61extern int dbg_switch_cpu;
62
63/* gdbstub interface functions */
64extern int gdb_serial_stub(struct kgdb_state *ks);
65extern void gdbstub_msg_write(const char *s, int len);
66
67/* gdbstub functions used for kdb <-> gdbstub transition */
68extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
69extern int dbg_kdb_mode;
70
71#ifdef CONFIG_KGDB_KDB
72extern int kdb_stub(struct kgdb_state *ks);
73extern int kdb_parse(const char *cmdstr);
74#else /* ! CONFIG_KGDB_KDB */
75static inline int kdb_stub(struct kgdb_state *ks)
76{
77 return DBG_PASS_EVENT;
78}
79#endif /* CONFIG_KGDB_KDB */
80
81#endif /* _DEBUG_CORE_H_ */
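The DCPU_* values are independent bits OR-ed into kgdb_info[cpu].exception_state, so a single CPU can, for instance, be single stepping while also asking to become the master. A tiny illustration of how such a field is typically tested and stripped (the local variable name here is invented for the example):

#include <stdio.h>

#define DCPU_WANT_MASTER 0x1
#define DCPU_NEXT_MASTER 0x2
#define DCPU_IS_SLAVE    0x4
#define DCPU_SSTEP       0x8

int main(void)
{
	int exception_state = DCPU_WANT_MASTER | DCPU_SSTEP;

	if (exception_state & DCPU_WANT_MASTER)
		puts("this cpu is racing to become the master");

	/* On the way out the role bits are cleared; single-step state may remain. */
	exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
	printf("remaining state: %#x\n", exception_state);	/* 0x8 */
	return 0;
}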
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 000000000000..481a7bd2dfe7
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1095 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30
31#include <linux/kernel.h>
32#include <linux/kgdb.h>
33#include <linux/kdb.h>
34#include <linux/reboot.h>
35#include <linux/uaccess.h>
36#include <asm/cacheflush.h>
37#include <asm/unaligned.h>
38#include "debug_core.h"
39
40#define KGDB_MAX_THREAD_QUERY 17
41
42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX];
45
46/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES +
48 sizeof(unsigned long) - 1) /
49 sizeof(unsigned long)];
50
51/*
52 * GDB remote protocol parser:
53 */
54
55#ifdef CONFIG_KGDB_KDB
56static int gdbstub_read_wait(void)
57{
58 int ret = -1;
59 int i;
60
61 /* poll any additional I/O interfaces that are defined */
62 while (ret < 0)
63 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
64 ret = kdb_poll_funcs[i]();
65 if (ret > 0)
66 break;
67 }
68 return ret;
69}
70#else
71static int gdbstub_read_wait(void)
72{
73 int ret = dbg_io_ops->read_char();
74 while (ret == NO_POLL_CHAR)
75 ret = dbg_io_ops->read_char();
76 return ret;
77}
78#endif
79/* scan for the sequence $<data>#<checksum> */
80static void get_packet(char *buffer)
81{
82 unsigned char checksum;
83 unsigned char xmitcsum;
84 int count;
85 char ch;
86
87 do {
88 /*
89 * Spin and wait around for the start character, ignore all
90 * other characters:
91 */
92 while ((ch = (gdbstub_read_wait())) != '$')
93 /* nothing */;
94
95 kgdb_connected = 1;
96 checksum = 0;
97 xmitcsum = -1;
98
99 count = 0;
100
101 /*
102 * now, read until a # or end of buffer is found:
103 */
104 while (count < (BUFMAX - 1)) {
105 ch = gdbstub_read_wait();
106 if (ch == '#')
107 break;
108 checksum = checksum + ch;
109 buffer[count] = ch;
110 count = count + 1;
111 }
112 buffer[count] = 0;
113
114 if (ch == '#') {
115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
116 xmitcsum += hex_to_bin(gdbstub_read_wait());
117
118 if (checksum != xmitcsum)
119 /* failed checksum */
120 dbg_io_ops->write_char('-');
121 else
122 /* successful transfer */
123 dbg_io_ops->write_char('+');
124 if (dbg_io_ops->flush)
125 dbg_io_ops->flush();
126 }
127 } while (checksum != xmitcsum);
128}
129
130/*
131 * Send the packet in buffer.
132 * Check for gdb connection if asked for.
133 */
134static void put_packet(char *buffer)
135{
136 unsigned char checksum;
137 int count;
138 char ch;
139
140 /*
141 * $<packet info>#<checksum>.
142 */
143 while (1) {
144 dbg_io_ops->write_char('$');
145 checksum = 0;
146 count = 0;
147
148 while ((ch = buffer[count])) {
149 dbg_io_ops->write_char(ch);
150 checksum += ch;
151 count++;
152 }
153
154 dbg_io_ops->write_char('#');
155 dbg_io_ops->write_char(hex_asc_hi(checksum));
156 dbg_io_ops->write_char(hex_asc_lo(checksum));
157 if (dbg_io_ops->flush)
158 dbg_io_ops->flush();
159
160 /* Now see what we get in reply. */
161 ch = gdbstub_read_wait();
162
163 if (ch == 3)
164 ch = gdbstub_read_wait();
165
166 /* If we get an ACK, we are done. */
167 if (ch == '+')
168 return;
169
170 /*
171 * If we get the start of another packet, this means
172 * that GDB is attempting to reconnect. We will NAK
173 * the packet being sent, and stop trying to send this
174 * packet.
175 */
176 if (ch == '$') {
177 dbg_io_ops->write_char('-');
178 if (dbg_io_ops->flush)
179 dbg_io_ops->flush();
180 return;
181 }
182 }
183}
184
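get_packet()/put_packet() above implement the gdb remote serial framing: a payload wrapped as $<data>#<checksum>, where the checksum is the modulo-256 sum of the payload bytes rendered as two hex digits, acknowledged with '+' or NAKed with '-'. The following host-side sketch frames a packet the same way (the buffer size and the make_packet name are invented for the example):

#include <stdio.h>
#include <string.h>

/* Frame "data" as a gdb remote serial packet: $<data>#<two hex digit checksum>. */
static void make_packet(const char *data, char *out, size_t outlen)
{
	unsigned char csum = 0;

	for (const char *p = data; *p; p++)
		csum += (unsigned char)*p;	/* modulo-256 running sum */
	snprintf(out, outlen, "$%s#%02x", data, csum);
}

int main(void)
{
	char pkt[128];

	make_packet("qSupported", pkt, sizeof(pkt));
	puts(pkt);		/* prints $qSupported#37 */
	return 0;
}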
185static char gdbmsgbuf[BUFMAX + 1];
186
187void gdbstub_msg_write(const char *s, int len)
188{
189 char *bufptr;
190 int wcount;
191 int i;
192
193 if (len == 0)
194 len = strlen(s);
195
196 /* 'O'utput */
197 gdbmsgbuf[0] = 'O';
198
199 /* Fill and send buffers... */
200 while (len > 0) {
201 bufptr = gdbmsgbuf + 1;
202
203 /* Calculate how many this time */
204 if ((len << 1) > (BUFMAX - 2))
205 wcount = (BUFMAX - 2) >> 1;
206 else
207 wcount = len;
208
209 /* Pack in hex chars */
210 for (i = 0; i < wcount; i++)
211 bufptr = pack_hex_byte(bufptr, s[i]);
212 *bufptr = '\0';
213
214 /* Move up */
215 s += wcount;
216 len -= wcount;
217
218 /* Write packet */
219 put_packet(gdbmsgbuf);
220 }
221}
222
223/*
224 * Convert the memory pointed to by mem into hex, placing result in
225 * buf. Return a pointer to the last char put in buf (null). May
226 * return an error.
227 */
228char *kgdb_mem2hex(char *mem, char *buf, int count)
229{
230 char *tmp;
231 int err;
232
233 /*
234 * We use the upper half of buf as an intermediate buffer for the
235 * raw memory copy. Hex conversion will work against this one.
236 */
237 tmp = buf + count;
238
239 err = probe_kernel_read(tmp, mem, count);
240 if (err)
241 return NULL;
242 while (count > 0) {
243 buf = pack_hex_byte(buf, *tmp);
244 tmp++;
245 count--;
246 }
247 *buf = 0;
248
249 return buf;
250}
251
252/*
253 * Convert the hex array pointed to by buf into binary to be placed in
254 * mem. Return a pointer to the character AFTER the last byte
255 * written. May return an error.
256 */
257int kgdb_hex2mem(char *buf, char *mem, int count)
258{
259 char *tmp_raw;
260 char *tmp_hex;
261
262 /*
263 * We use the upper half of buf as an intermediate buffer for the
264 * raw memory that is converted from hex.
265 */
266 tmp_raw = buf + count * 2;
267
268 tmp_hex = tmp_raw - 1;
269 while (tmp_hex >= buf) {
270 tmp_raw--;
271 *tmp_raw = hex_to_bin(*tmp_hex--);
272 *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
273 }
274
275 return probe_kernel_write(mem, tmp_raw, count);
276}
277
278/*
279 * While we find nice hex chars, build a long_val.
280 * Return number of chars processed.
281 */
282int kgdb_hex2long(char **ptr, unsigned long *long_val)
283{
284 int hex_val;
285 int num = 0;
286 int negate = 0;
287
288 *long_val = 0;
289
290 if (**ptr == '-') {
291 negate = 1;
292 (*ptr)++;
293 }
294 while (**ptr) {
295 hex_val = hex_to_bin(**ptr);
296 if (hex_val < 0)
297 break;
298
299 *long_val = (*long_val << 4) | hex_val;
300 num++;
301 (*ptr)++;
302 }
303
304 if (negate)
305 *long_val = -*long_val;
306
307 return num;
308}
309
310/*
311 * Copy the binary array pointed to by buf into mem. Fix $, #, and
312 * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
313 * The input buf is overwritten with the result to write to mem.
314 */
315static int kgdb_ebin2mem(char *buf, char *mem, int count)
316{
317 int size = 0;
318 char *c = buf;
319
320 while (count-- > 0) {
321 c[size] = *buf++;
322 if (c[size] == 0x7d)
323 c[size] = *buf++ ^ 0x20;
324 size++;
325 }
326
327 return probe_kernel_write(mem, c, size);
328}
329
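kgdb_mem2hex()/kgdb_hex2mem() move raw memory over the wire as pairs of hex digits, and kgdb_ebin2mem() undoes the binary escaping used by 'X' packets, where 0x7d marks a following byte that was XORed with 0x20. A host-side sketch of the two conversions, assuming small fixed buffers (the simplified mem2hex/ebin2mem below are illustrative, not the kernel helpers):

#include <stdio.h>
#include <string.h>

/* Encode len raw bytes as 2*len hex characters plus a NUL. */
static void mem2hex(const unsigned char *mem, char *hex, size_t len)
{
	static const char digits[] = "0123456789abcdef";

	for (size_t i = 0; i < len; i++) {
		hex[2 * i]     = digits[mem[i] >> 4];
		hex[2 * i + 1] = digits[mem[i] & 0xf];
	}
	hex[2 * len] = '\0';
}

/* Undo the 'X' packet escaping: 0x7d prefixes a byte that was XORed with 0x20. */
static size_t ebin2mem(const unsigned char *in, size_t inlen, unsigned char *out)
{
	size_t n = 0;

	for (size_t i = 0; i < inlen; i++) {
		unsigned char c = in[i];

		if (c == 0x7d && i + 1 < inlen)
			c = in[++i] ^ 0x20;
		out[n++] = c;
	}
	return n;
}

int main(void)
{
	unsigned char raw[] = { 0xde, 0xad, 0x7d, 0x5d };	/* 0x7d 0x5d encodes 0x7d */
	unsigned char decoded[8];
	char hex[16];
	size_t n = ebin2mem(raw, sizeof(raw), decoded);

	mem2hex(decoded, hex, n);
	puts(hex);		/* prints dead7d */
	return 0;
}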
330#if DBG_MAX_REG_NUM > 0
331void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
332{
333 int i;
334 int idx = 0;
335 char *ptr = (char *)gdb_regs;
336
337 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
338 dbg_get_reg(i, ptr + idx, regs);
339 idx += dbg_reg_def[i].size;
340 }
341}
342
343void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
344{
345 int i;
346 int idx = 0;
347 char *ptr = (char *)gdb_regs;
348
349 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
350 dbg_set_reg(i, ptr + idx, regs);
351 idx += dbg_reg_def[i].size;
352 }
353}
354#endif /* DBG_MAX_REG_NUM > 0 */
355
356/* Write memory due to an 'M' or 'X' packet. */
357static int write_mem_msg(int binary)
358{
359 char *ptr = &remcom_in_buffer[1];
360 unsigned long addr;
361 unsigned long length;
362 int err;
363
364 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
365 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
366 if (binary)
367 err = kgdb_ebin2mem(ptr, (char *)addr, length);
368 else
369 err = kgdb_hex2mem(ptr, (char *)addr, length);
370 if (err)
371 return err;
372 if (CACHE_FLUSH_IS_SAFE)
373 flush_icache_range(addr, addr + length);
374 return 0;
375 }
376
377 return -EINVAL;
378}
379
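write_mem_msg() expects a header of the form <addr>,<length>: in front of the payload, both numbers in hex, so a memory-write packet body looks like Mc0001234,4:deadbeef. A minimal host-side parse of that header, using sscanf() in place of kgdb_hex2long(), might look like this (the packet contents are made up for the example):

#include <stdio.h>

int main(void)
{
	const char *pkt = "Mc0001234,4:deadbeef";	/* hypothetical write packet */
	unsigned long addr, len;
	int consumed;

	/* Skip the command byte, then read "<hex addr>,<hex len>:". */
	if (sscanf(pkt + 1, "%lx,%lx:%n", &addr, &len, &consumed) == 2) {
		const char *payload = pkt + 1 + consumed;

		printf("write %lu byte(s) at %#lx, payload \"%s\"\n",
		       len, addr, payload);
	}
	return 0;
}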
380static void error_packet(char *pkt, int error)
381{
382 error = -error;
383 pkt[0] = 'E';
384 pkt[1] = hex_asc[(error / 10)];
385 pkt[2] = hex_asc[(error % 10)];
386 pkt[3] = '\0';
387}
388
389/*
390 * Thread ID accessors. We represent a flat TID space to GDB, where
391 * the per CPU idle threads (which under Linux all have PID 0) are
392 * remapped to negative TIDs.
393 */
394
395#define BUF_THREAD_ID_SIZE 8
396
397static char *pack_threadid(char *pkt, unsigned char *id)
398{
399 unsigned char *limit;
400 int lzero = 1;
401
402 limit = id + (BUF_THREAD_ID_SIZE / 2);
403 while (id < limit) {
404 if (!lzero || *id != 0) {
405 pkt = pack_hex_byte(pkt, *id);
406 lzero = 0;
407 }
408 id++;
409 }
410
411 if (lzero)
412 pkt = pack_hex_byte(pkt, 0);
413
414 return pkt;
415}
416
417static void int_to_threadref(unsigned char *id, int value)
418{
419 put_unaligned_be32(value, id);
420}
421
422static struct task_struct *getthread(struct pt_regs *regs, int tid)
423{
424 /*
425 * Non-positive TIDs are remapped to the cpu shadow information
426 */
427 if (tid == 0 || tid == -1)
428 tid = -atomic_read(&kgdb_active) - 2;
429 if (tid < -1 && tid > -NR_CPUS - 2) {
430 if (kgdb_info[-tid - 2].task)
431 return kgdb_info[-tid - 2].task;
432 else
433 return idle_task(-tid - 2);
434 }
435 if (tid <= 0) {
436 printk(KERN_ERR "KGDB: Internal thread select error\n");
437 dump_stack();
438 return NULL;
439 }
440
441 /*
442 * find_task_by_pid_ns() does not take the tasklist lock anymore
443 * but is nicely RCU locked - hence is a pretty resilient
444 * thing to use:
445 */
446 return find_task_by_pid_ns(tid, &init_pid_ns);
447}
448
449
450/*
451 * Remap normal tasks to their real PID,
452 * CPU shadow threads are mapped to -CPU - 2
453 */
454static inline int shadow_pid(int realpid)
455{
456 if (realpid)
457 return realpid;
458
459 return -raw_smp_processor_id() - 2;
460}
461
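The thread-ID helpers above present a flat TID space to gdb: real tasks keep their PID, the per-CPU idle/shadow threads (which all have PID 0) are remapped to -cpu - 2, and IDs travel on the wire as big-endian hex with leading zero bytes suppressed. A small host-side illustration of that mapping and packing, assuming 4-byte thread refs as int_to_threadref() uses (shadow_tid and pack_tid are invented names):

#include <stdio.h>

/* Map PID 0 (the per-CPU idle task) to the negative shadow TID for that CPU. */
static int shadow_tid(int pid, int cpu)
{
	return pid ? pid : -cpu - 2;
}

/* Pack a 32-bit TID as big-endian hex, dropping leading zero bytes. */
static void pack_tid(unsigned int tid, char *out)
{
	int started = 0;

	for (int shift = 24; shift >= 0; shift -= 8) {
		unsigned int byte = (tid >> shift) & 0xff;

		if (byte || started || shift == 0) {
			sprintf(out, "%02x", byte);
			out += 2;
			started = 1;
		}
	}
}

int main(void)
{
	char buf[16];

	pack_tid((unsigned int)shadow_tid(0, 1), buf);	/* idle task on CPU 1 -> -3 */
	printf("shadow thread ref: %s\n", buf);		/* fffffffd */
	pack_tid((unsigned int)shadow_tid(1234, 1), buf);
	printf("task 1234 ref: %s\n", buf);		/* 04d2 */
	return 0;
}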
462/*
463 * All the functions that start with gdb_cmd are the various
464 * operations to implement the handlers for the gdbserial protocol
465 * where KGDB is communicating with an external debugger
466 */
467
468/* Handle the '?' status packets */
469static void gdb_cmd_status(struct kgdb_state *ks)
470{
471 /*
472 * We know that this packet is only sent
473 * during initial connect. So to be safe,
474 * we clear out our breakpoints now in case
475 * GDB is reconnecting.
476 */
477 dbg_remove_all_break();
478
479 remcom_out_buffer[0] = 'S';
480 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
481}
482
483static void gdb_get_regs_helper(struct kgdb_state *ks)
484{
485 struct task_struct *thread;
486 void *local_debuggerinfo;
487 int i;
488
489 thread = kgdb_usethread;
490 if (!thread) {
491 thread = kgdb_info[ks->cpu].task;
492 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
493 } else {
494 local_debuggerinfo = NULL;
495 for_each_online_cpu(i) {
496 /*
497 * Try to find the task on some other
498 * or possibly this node. If we do not
499 * find the matching task then we try
500 * to approximate the results.
501 */
502 if (thread == kgdb_info[i].task)
503 local_debuggerinfo = kgdb_info[i].debuggerinfo;
504 }
505 }
506
507 /*
508 * All threads that don't have debuggerinfo should be
509 * in schedule() sleeping, since all other CPUs
510 * are in kgdb_wait, and thus have debuggerinfo.
511 */
512 if (local_debuggerinfo) {
513 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
514 } else {
515 /*
516 * Pull stuff saved during switch_to; nothing
517 * else is accessible (or even particularly
518 * relevant).
519 *
520 * This should be enough for a stack trace.
521 */
522 sleeping_thread_to_gdb_regs(gdb_regs, thread);
523 }
524}
525
526/* Handle the 'g' get registers request */
527static void gdb_cmd_getregs(struct kgdb_state *ks)
528{
529 gdb_get_regs_helper(ks);
530 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
531}
532
533/* Handle the 'G' set registers request */
534static void gdb_cmd_setregs(struct kgdb_state *ks)
535{
536 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
537
538 if (kgdb_usethread && kgdb_usethread != current) {
539 error_packet(remcom_out_buffer, -EINVAL);
540 } else {
541 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
542 strcpy(remcom_out_buffer, "OK");
543 }
544}
545
546/* Handle the 'm' memory read bytes */
547static void gdb_cmd_memread(struct kgdb_state *ks)
548{
549 char *ptr = &remcom_in_buffer[1];
550 unsigned long length;
551 unsigned long addr;
552 char *err;
553
554 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
555 kgdb_hex2long(&ptr, &length) > 0) {
556 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
557 if (!err)
558 error_packet(remcom_out_buffer, -EINVAL);
559 } else {
560 error_packet(remcom_out_buffer, -EINVAL);
561 }
562}
563
564/* Handle the 'M' memory write bytes */
565static void gdb_cmd_memwrite(struct kgdb_state *ks)
566{
567 int err = write_mem_msg(0);
568
569 if (err)
570 error_packet(remcom_out_buffer, err);
571 else
572 strcpy(remcom_out_buffer, "OK");
573}
574
575#if DBG_MAX_REG_NUM > 0
576static char *gdb_hex_reg_helper(int regnum, char *out)
577{
578 int i;
579 int offset = 0;
580
581 for (i = 0; i < regnum; i++)
582 offset += dbg_reg_def[i].size;
583 return kgdb_mem2hex((char *)gdb_regs + offset, out,
584 dbg_reg_def[i].size);
585}
586
587/* Handle the 'p' individual register get */
588static void gdb_cmd_reg_get(struct kgdb_state *ks)
589{
590 unsigned long regnum;
591 char *ptr = &remcom_in_buffer[1];
592
593 kgdb_hex2long(&ptr, &regnum);
594 if (regnum >= DBG_MAX_REG_NUM) {
595 error_packet(remcom_out_buffer, -EINVAL);
596 return;
597 }
598 gdb_get_regs_helper(ks);
599 gdb_hex_reg_helper(regnum, remcom_out_buffer);
600}
601
602/* Handle the 'P' individual register set */
603static void gdb_cmd_reg_set(struct kgdb_state *ks)
604{
605 unsigned long regnum;
606 char *ptr = &remcom_in_buffer[1];
607 int i = 0;
608
609 kgdb_hex2long(&ptr, &regnum);
610 if (*ptr++ != '=' ||
611 !(!kgdb_usethread || kgdb_usethread == current) ||
612 !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
613 error_packet(remcom_out_buffer, -EINVAL);
614 return;
615 }
616 memset(gdb_regs, 0, sizeof(gdb_regs));
617 while (i < sizeof(gdb_regs) * 2)
618 if (hex_to_bin(ptr[i]) >= 0)
619 i++;
620 else
621 break;
622 i = i / 2;
623 kgdb_hex2mem(ptr, (char *)gdb_regs, i);
624 dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
625 strcpy(remcom_out_buffer, "OK");
626}
627#endif /* DBG_MAX_REG_NUM > 0 */
628
629/* Handle the 'X' memory binary write bytes */
630static void gdb_cmd_binwrite(struct kgdb_state *ks)
631{
632 int err = write_mem_msg(1);
633
634 if (err)
635 error_packet(remcom_out_buffer, err);
636 else
637 strcpy(remcom_out_buffer, "OK");
638}
639
640/* Handle the 'D' or 'k', detach or kill packets */
641static void gdb_cmd_detachkill(struct kgdb_state *ks)
642{
643 int error;
644
645 /* The detach case */
646 if (remcom_in_buffer[0] == 'D') {
647 error = dbg_remove_all_break();
648 if (error < 0) {
649 error_packet(remcom_out_buffer, error);
650 } else {
651 strcpy(remcom_out_buffer, "OK");
652 kgdb_connected = 0;
653 }
654 put_packet(remcom_out_buffer);
655 } else {
656 /*
657 * Assume the kill case, with no exit code checking,
658 * trying to force detach the debugger:
659 */
660 dbg_remove_all_break();
661 kgdb_connected = 0;
662 }
663}
664
665/* Handle the 'R' reboot packets */
666static int gdb_cmd_reboot(struct kgdb_state *ks)
667{
668 /* For now, only honor R0 */
669 if (strcmp(remcom_in_buffer, "R0") == 0) {
670 printk(KERN_CRIT "Executing emergency reboot\n");
671 strcpy(remcom_out_buffer, "OK");
672 put_packet(remcom_out_buffer);
673
674 /*
675 * Execution should not return from
676 * machine_emergency_restart()
677 */
678 machine_emergency_restart();
679 kgdb_connected = 0;
680
681 return 1;
682 }
683 return 0;
684}
685
686/* Handle the 'q' query packets */
687static void gdb_cmd_query(struct kgdb_state *ks)
688{
689 struct task_struct *g;
690 struct task_struct *p;
691 unsigned char thref[BUF_THREAD_ID_SIZE];
692 char *ptr;
693 int i;
694 int cpu;
695 int finished = 0;
696
697 switch (remcom_in_buffer[1]) {
698 case 's':
699 case 'f':
700 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
701 break;
702
703 i = 0;
704 remcom_out_buffer[0] = 'm';
705 ptr = remcom_out_buffer + 1;
706 if (remcom_in_buffer[1] == 'f') {
707 /* Each cpu is a shadow thread */
708 for_each_online_cpu(cpu) {
709 ks->thr_query = 0;
710 int_to_threadref(thref, -cpu - 2);
711 ptr = pack_threadid(ptr, thref);
712 *(ptr++) = ',';
713 i++;
714 }
715 }
716
717 do_each_thread(g, p) {
718 if (i >= ks->thr_query && !finished) {
719 int_to_threadref(thref, p->pid);
720 ptr = pack_threadid(ptr, thref);
721 *(ptr++) = ',';
722 ks->thr_query++;
723 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
724 finished = 1;
725 }
726 i++;
727 } while_each_thread(g, p);
728
729 *(--ptr) = '\0';
730 break;
731
732 case 'C':
733 /* Current thread id */
734 strcpy(remcom_out_buffer, "QC");
735 ks->threadid = shadow_pid(current->pid);
736 int_to_threadref(thref, ks->threadid);
737 pack_threadid(remcom_out_buffer + 2, thref);
738 break;
739 case 'T':
740 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
741 break;
742
743 ks->threadid = 0;
744 ptr = remcom_in_buffer + 17;
745 kgdb_hex2long(&ptr, &ks->threadid);
746 if (!getthread(ks->linux_regs, ks->threadid)) {
747 error_packet(remcom_out_buffer, -EINVAL);
748 break;
749 }
750 if ((int)ks->threadid > 0) {
751 kgdb_mem2hex(getthread(ks->linux_regs,
752 ks->threadid)->comm,
753 remcom_out_buffer, 16);
754 } else {
755 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
756
757 sprintf(tmpstr, "shadowCPU%d",
758 (int)(-ks->threadid - 2));
759 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
760 }
761 break;
762#ifdef CONFIG_KGDB_KDB
763 case 'R':
764 if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
765 int len = strlen(remcom_in_buffer + 6);
766
767 if ((len % 2) != 0) {
768 strcpy(remcom_out_buffer, "E01");
769 break;
770 }
771 kgdb_hex2mem(remcom_in_buffer + 6,
772 remcom_out_buffer, len);
773 len = len / 2;
774 remcom_out_buffer[len++] = 0;
775
776 kdb_parse(remcom_out_buffer);
777 strcpy(remcom_out_buffer, "OK");
778 }
779 break;
780#endif
781 }
782}
783
784/* Handle the 'H' task query packets */
785static void gdb_cmd_task(struct kgdb_state *ks)
786{
787 struct task_struct *thread;
788 char *ptr;
789
790 switch (remcom_in_buffer[1]) {
791 case 'g':
792 ptr = &remcom_in_buffer[2];
793 kgdb_hex2long(&ptr, &ks->threadid);
794 thread = getthread(ks->linux_regs, ks->threadid);
795 if (!thread && ks->threadid > 0) {
796 error_packet(remcom_out_buffer, -EINVAL);
797 break;
798 }
799 kgdb_usethread = thread;
800 ks->kgdb_usethreadid = ks->threadid;
801 strcpy(remcom_out_buffer, "OK");
802 break;
803 case 'c':
804 ptr = &remcom_in_buffer[2];
805 kgdb_hex2long(&ptr, &ks->threadid);
806 if (!ks->threadid) {
807 kgdb_contthread = NULL;
808 } else {
809 thread = getthread(ks->linux_regs, ks->threadid);
810 if (!thread && ks->threadid > 0) {
811 error_packet(remcom_out_buffer, -EINVAL);
812 break;
813 }
814 kgdb_contthread = thread;
815 }
816 strcpy(remcom_out_buffer, "OK");
817 break;
818 }
819}
820
821/* Handle the 'T' thread query packets */
822static void gdb_cmd_thread(struct kgdb_state *ks)
823{
824 char *ptr = &remcom_in_buffer[1];
825 struct task_struct *thread;
826
827 kgdb_hex2long(&ptr, &ks->threadid);
828 thread = getthread(ks->linux_regs, ks->threadid);
829 if (thread)
830 strcpy(remcom_out_buffer, "OK");
831 else
832 error_packet(remcom_out_buffer, -EINVAL);
833}
834
835/* Handle the 'z' or 'Z' breakpoint remove or set packets */
836static void gdb_cmd_break(struct kgdb_state *ks)
837{
838 /*
839 * Since GDB-5.3, it's been drafted that '0' is a software
840 * breakpoint, '1' is a hardware breakpoint, so let's do that.
841 */
842 char *bpt_type = &remcom_in_buffer[1];
843 char *ptr = &remcom_in_buffer[2];
844 unsigned long addr;
845 unsigned long length;
846 int error = 0;
847
848 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
849 /* Unsupported */
850 if (*bpt_type > '4')
851 return;
852 } else {
853 if (*bpt_type != '0' && *bpt_type != '1')
854 /* Unsupported. */
855 return;
856 }
857
858 /*
859 * Test if this is a hardware breakpoint, and
860 * if we support it:
861 */
862 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
863 /* Unsupported. */
864 return;
865
866 if (*(ptr++) != ',') {
867 error_packet(remcom_out_buffer, -EINVAL);
868 return;
869 }
870 if (!kgdb_hex2long(&ptr, &addr)) {
871 error_packet(remcom_out_buffer, -EINVAL);
872 return;
873 }
874 if (*(ptr++) != ',' ||
875 !kgdb_hex2long(&ptr, &length)) {
876 error_packet(remcom_out_buffer, -EINVAL);
877 return;
878 }
879
880 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
881 error = dbg_set_sw_break(addr);
882 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
883 error = dbg_remove_sw_break(addr);
884 else if (remcom_in_buffer[0] == 'Z')
885 error = arch_kgdb_ops.set_hw_breakpoint(addr,
886 (int)length, *bpt_type - '0');
887 else if (remcom_in_buffer[0] == 'z')
888 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
889 (int) length, *bpt_type - '0');
890
891 if (error == 0)
892 strcpy(remcom_out_buffer, "OK");
893 else
894 error_packet(remcom_out_buffer, error);
895}
896
897/* Handle the 'C' signal / exception passing packets */
898static int gdb_cmd_exception_pass(struct kgdb_state *ks)
899{
900 /* C09 == pass exception
901 * C15 == detach kgdb, pass exception
902 */
903 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
904
905 ks->pass_exception = 1;
906 remcom_in_buffer[0] = 'c';
907
908 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
909
910 ks->pass_exception = 1;
911 remcom_in_buffer[0] = 'D';
912 dbg_remove_all_break();
913 kgdb_connected = 0;
914 return 1;
915
916 } else {
917 gdbstub_msg_write("KGDB only knows signal 9 (pass)"
918 " and 15 (pass and disconnect)\n"
919 "Executing a continue without signal passing\n", 0);
920 remcom_in_buffer[0] = 'c';
921 }
922
923 /* Indicate fall through */
924 return -1;
925}
926
927/*
928 * This function performs all gdbserial command processing
929 */
930int gdb_serial_stub(struct kgdb_state *ks)
931{
932 int error = 0;
933 int tmp;
934
935 /* Initialize comm buffer and globals. */
936 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
937 kgdb_usethread = kgdb_info[ks->cpu].task;
938 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
939 ks->pass_exception = 0;
940
941 if (kgdb_connected) {
942 unsigned char thref[BUF_THREAD_ID_SIZE];
943 char *ptr;
944
945 /* Reply to host that an exception has occurred */
946 ptr = remcom_out_buffer;
947 *ptr++ = 'T';
948 ptr = pack_hex_byte(ptr, ks->signo);
949 ptr += strlen(strcpy(ptr, "thread:"));
950 int_to_threadref(thref, shadow_pid(current->pid));
951 ptr = pack_threadid(ptr, thref);
952 *ptr++ = ';';
953 put_packet(remcom_out_buffer);
954 }
955
956 while (1) {
957 error = 0;
958
959 /* Clear the out buffer. */
960 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
961
962 get_packet(remcom_in_buffer);
963
964 switch (remcom_in_buffer[0]) {
965 case '?': /* gdbserial status */
966 gdb_cmd_status(ks);
967 break;
968 case 'g': /* return the value of the CPU registers */
969 gdb_cmd_getregs(ks);
970 break;
971 case 'G': /* set the value of the CPU registers - return OK */
972 gdb_cmd_setregs(ks);
973 break;
974 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
975 gdb_cmd_memread(ks);
976 break;
977 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
978 gdb_cmd_memwrite(ks);
979 break;
980#if DBG_MAX_REG_NUM > 0
981 case 'p': /* pXX Return gdb register XX (in hex) */
982 gdb_cmd_reg_get(ks);
983 break;
984 case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
985 gdb_cmd_reg_set(ks);
986 break;
987#endif /* DBG_MAX_REG_NUM > 0 */
988 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
989 gdb_cmd_binwrite(ks);
990 break;
991 /* kill or detach. KGDB should treat this like a
992 * continue.
993 */
994 case 'D': /* Debugger detach */
995 case 'k': /* Debugger detach via kill */
996 gdb_cmd_detachkill(ks);
997 goto default_handle;
998 case 'R': /* Reboot */
999 if (gdb_cmd_reboot(ks))
1000 goto default_handle;
1001 break;
1002 case 'q': /* query command */
1003 gdb_cmd_query(ks);
1004 break;
1005 case 'H': /* task related */
1006 gdb_cmd_task(ks);
1007 break;
1008 case 'T': /* Query thread status */
1009 gdb_cmd_thread(ks);
1010 break;
1011 case 'z': /* Break point remove */
1012 case 'Z': /* Break point set */
1013 gdb_cmd_break(ks);
1014 break;
1015#ifdef CONFIG_KGDB_KDB
1016 case '3': /* Escape back into kdb */
1017 if (remcom_in_buffer[1] == '\0') {
1018 gdb_cmd_detachkill(ks);
1019 return DBG_PASS_EVENT;
1020 }
1021#endif
1022 case 'C': /* Exception passing */
1023 tmp = gdb_cmd_exception_pass(ks);
1024 if (tmp > 0)
1025 goto default_handle;
1026 if (tmp == 0)
1027 break;
1028 /* Fall through on tmp < 0 */
1029 case 'c': /* Continue packet */
1030 case 's': /* Single step packet */
1031 if (kgdb_contthread && kgdb_contthread != current) {
1032 /* Can't switch threads in kgdb */
1033 error_packet(remcom_out_buffer, -EINVAL);
1034 break;
1035 }
1036 dbg_activate_sw_breakpoints();
1037 /* Fall through to default processing */
1038 default:
1039default_handle:
1040 error = kgdb_arch_handle_exception(ks->ex_vector,
1041 ks->signo,
1042 ks->err_code,
1043 remcom_in_buffer,
1044 remcom_out_buffer,
1045 ks->linux_regs);
1046 /*
1047 * Leave cmd processing on error, detach,
1048 * kill, continue, or single step.
1049 */
1050 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
1051 remcom_in_buffer[0] == 'k') {
1052 error = 0;
1053 goto kgdb_exit;
1054 }
1055
1056 }
1057
1058 /* reply to the request */
1059 put_packet(remcom_out_buffer);
1060 }
1061
1062kgdb_exit:
1063 if (ks->pass_exception)
1064 error = 1;
1065 return error;
1066}
1067
1068int gdbstub_state(struct kgdb_state *ks, char *cmd)
1069{
1070 int error;
1071
1072 switch (cmd[0]) {
1073 case 'e':
1074 error = kgdb_arch_handle_exception(ks->ex_vector,
1075 ks->signo,
1076 ks->err_code,
1077 remcom_in_buffer,
1078 remcom_out_buffer,
1079 ks->linux_regs);
1080 return error;
1081 case 's':
1082 case 'c':
1083 strcpy(remcom_in_buffer, cmd);
1084 return 0;
1085 case '?':
1086 gdb_cmd_status(ks);
1087 break;
1088 case '\0':
1089 strcpy(remcom_out_buffer, "");
1090 break;
1091 }
1092 dbg_io_ops->write_char('+');
1093 put_packet(remcom_out_buffer);
1094 return 0;
1095}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 000000000000..396d12eda9e8
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 000000000000..d4fc58f4b88d
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
1# This file is subject to the terms and conditions of the GNU General Public
2# License. See the file "COPYING" in the main directory of this archive
3# for more details.
4#
5# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
7#
8
9CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
10obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
11obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
12
13clean-files := gen-kdb_cmds.c
14
15quiet_cmd_gen-kdb = GENKDB $@
16 cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
17 /^\#/{next} \
18 /^[ \t]*$$/{next} \
19 {gsub(/"/, "\\\"", $$0); \
20 print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
21 END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
22 $(filter-out %/Makefile,$^) > $@#
23
24$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
25 $(call cmd,gen-kdb)
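
The awk rule above turns every non-comment, non-blank line of kdb_cmds into an __initdata string and collects the strings into a NULL-terminated kdb_cmds[] array, which kdb replays at kdb_init() time. A rough sketch of the generated gen-kdb_cmds.c for the first two command lines of the kdb_cmds file further down in this patch (illustrative only; the real output covers every line):

#include <linux/stddef.h>
#include <linux/init.h>
static __initdata char kdb_cmd0[] = "defcmd dumpcommon \"\" \"Common kdb debugging\"\n";
static __initdata char kdb_cmd1[] = "  set BTAPROMPT 0\n";
/* ... one kdb_cmdN[] per remaining line of kdb_cmds ... */
extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {
  kdb_cmd0,
  kdb_cmd1,
  NULL
};
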
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 000000000000..20059ef4459a
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,562 @@
1/*
2 * Kernel Debugger Architecture Independent Breakpoint Handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include <linux/init.h>
15#include <linux/kdb.h>
16#include <linux/kgdb.h>
17#include <linux/smp.h>
18#include <linux/sched.h>
19#include <linux/interrupt.h>
20#include "kdb_private.h"
21
22/*
23 * Table of kdb_breakpoints
24 */
25kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
26
27static void kdb_setsinglestep(struct pt_regs *regs)
28{
29 KDB_STATE_SET(DOING_SS);
30}
31
32static char *kdb_rwtypes[] = {
33 "Instruction(i)",
34 "Instruction(Register)",
35 "Data Write",
36 "I/O",
37 "Data Access"
38};
39
40static char *kdb_bptype(kdb_bp_t *bp)
41{
42 if (bp->bp_type < 0 || bp->bp_type > 4)
43 return "";
44
45 return kdb_rwtypes[bp->bp_type];
46}
47
48static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
49{
50 int nextarg = *nextargp;
51 int diag;
52
53 bp->bph_length = 1;
54 if ((argc + 1) != nextarg) {
55 if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
56 bp->bp_type = BP_ACCESS_WATCHPOINT;
57 else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
58 bp->bp_type = BP_WRITE_WATCHPOINT;
59 else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
60 bp->bp_type = BP_HARDWARE_BREAKPOINT;
61 else
62 return KDB_ARGCOUNT;
63
64 bp->bph_length = 1;
65
66 nextarg++;
67
68 if ((argc + 1) != nextarg) {
69 unsigned long len;
70
71 diag = kdbgetularg((char *)argv[nextarg],
72 &len);
73 if (diag)
74 return diag;
75
76
77 if (len > 8)
78 return KDB_BADLENGTH;
79
80 bp->bph_length = len;
81 nextarg++;
82 }
83
84 if ((argc + 1) != nextarg)
85 return KDB_ARGCOUNT;
86 }
87
88 *nextargp = nextarg;
89 return 0;
90}
91
92static int _kdb_bp_remove(kdb_bp_t *bp)
93{
94 int ret = 1;
95 if (!bp->bp_installed)
96 return ret;
97 if (!bp->bp_type)
98 ret = dbg_remove_sw_break(bp->bp_addr);
99 else
100 ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
101 bp->bph_length,
102 bp->bp_type);
103 if (ret == 0)
104 bp->bp_installed = 0;
105 return ret;
106}
107
108static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
109{
110 if (KDB_DEBUG(BP))
111 kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
112
113 /*
114 * Setup single step
115 */
116 kdb_setsinglestep(regs);
117
118 /*
119 * Reset delay attribute
120 */
121 bp->bp_delay = 0;
122 bp->bp_delayed = 1;
123}
124
125static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
126{
127 int ret;
128 /*
129 * Install the breakpoint, if it is not already installed.
130 */
131
132 if (KDB_DEBUG(BP))
133 kdb_printf("%s: bp_installed %d\n",
134 __func__, bp->bp_installed);
135 if (!KDB_STATE(SSBPT))
136 bp->bp_delay = 0;
137 if (bp->bp_installed)
138 return 1;
139 if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
140 if (KDB_DEBUG(BP))
141 kdb_printf("%s: delayed bp\n", __func__);
142 kdb_handle_bp(regs, bp);
143 return 0;
144 }
145 if (!bp->bp_type)
146 ret = dbg_set_sw_break(bp->bp_addr);
147 else
148 ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
149 bp->bph_length,
150 bp->bp_type);
151 if (ret == 0) {
152 bp->bp_installed = 1;
153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr);
156 return 1;
157 }
158 return 0;
159}
160
161/*
162 * kdb_bp_install
163 *
164 * Install kdb_breakpoints prior to returning from the
165 * kernel debugger. This allows the kdb_breakpoints to be set
166 * upon functions that are used internally by kdb, such as
167 * printk(). This function is only called once per kdb session.
168 */
169void kdb_bp_install(struct pt_regs *regs)
170{
171 int i;
172
173 for (i = 0; i < KDB_MAXBPT; i++) {
174 kdb_bp_t *bp = &kdb_breakpoints[i];
175
176 if (KDB_DEBUG(BP)) {
177 kdb_printf("%s: bp %d bp_enabled %d\n",
178 __func__, i, bp->bp_enabled);
179 }
180 if (bp->bp_enabled)
181 _kdb_bp_install(regs, bp);
182 }
183}
184
185/*
186 * kdb_bp_remove
187 *
188 * Remove kdb_breakpoints upon entry to the kernel debugger.
189 *
190 * Parameters:
191 * None.
192 * Outputs:
193 * None.
194 * Returns:
195 * None.
196 * Locking:
197 * None.
198 * Remarks:
199 */
200void kdb_bp_remove(void)
201{
202 int i;
203
204 for (i = KDB_MAXBPT - 1; i >= 0; i--) {
205 kdb_bp_t *bp = &kdb_breakpoints[i];
206
207 if (KDB_DEBUG(BP)) {
208 kdb_printf("%s: bp %d bp_enabled %d\n",
209 __func__, i, bp->bp_enabled);
210 }
211 if (bp->bp_enabled)
212 _kdb_bp_remove(bp);
213 }
214}
215
216
217/*
218 * kdb_printbp
219 *
220 * Internal function to format and print a breakpoint entry.
221 *
222 * Parameters:
223 * None.
224 * Outputs:
225 * None.
226 * Returns:
227 * None.
228 * Locking:
229 * None.
230 * Remarks:
231 */
232
233static void kdb_printbp(kdb_bp_t *bp, int i)
234{
235 kdb_printf("%s ", kdb_bptype(bp));
236 kdb_printf("BP #%d at ", i);
237 kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
238
239 if (bp->bp_enabled)
240 kdb_printf("\n is enabled");
241 else
242 kdb_printf("\n is disabled");
243
244 kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
245 bp->bp_addr, bp->bp_type, bp->bp_installed);
246
247 kdb_printf("\n");
248}
249
250/*
251 * kdb_bp
252 *
253 * Handle the bp commands.
254 *
255 * [bp|bph] <addr-expression> [DATAR|DATAW]
256 *
257 * Parameters:
258 * argc Count of arguments in argv
259 * argv Space delimited command line arguments
260 * Outputs:
261 * None.
262 * Returns:
263 * Zero for success, a kdb diagnostic if failure.
264 * Locking:
265 * None.
266 * Remarks:
267 *
268 * bp	Set breakpoint on all cpus.  Only use hardware assist if needed.
269 * bph	Set breakpoint on all cpus.  Force use of a hardware register.
270 */
271
272static int kdb_bp(int argc, const char **argv)
273{
274 int i, bpno;
275 kdb_bp_t *bp, *bp_check;
276 int diag;
277 char *symname = NULL;
278 long offset = 0ul;
279 int nextarg;
280 kdb_bp_t template = {0};
281
282 if (argc == 0) {
283 /*
284 * Display breakpoint table
285 */
286 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
287 bpno++, bp++) {
288 if (bp->bp_free)
289 continue;
290 kdb_printbp(bp, bpno);
291 }
292
293 return 0;
294 }
295
296 nextarg = 1;
297 diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
298 &offset, &symname);
299 if (diag)
300 return diag;
301 if (!template.bp_addr)
302 return KDB_BADINT;
303
304 /*
305 * Find an empty bp structure to allocate
306 */
307 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
308 if (bp->bp_free)
309 break;
310 }
311
312 if (bpno == KDB_MAXBPT)
313 return KDB_TOOMANYBPT;
314
315 if (strcmp(argv[0], "bph") == 0) {
316 template.bp_type = BP_HARDWARE_BREAKPOINT;
317 diag = kdb_parsebp(argc, argv, &nextarg, &template);
318 if (diag)
319 return diag;
320 } else {
321 template.bp_type = BP_BREAKPOINT;
322 }
323
324 /*
325 * Check for clashing breakpoints.
326 *
327 * Note, in this design we can't have hardware breakpoints
328 * enabled for both read and write on the same address.
329 */
330 for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
331 i++, bp_check++) {
332 if (!bp_check->bp_free &&
333 bp_check->bp_addr == template.bp_addr) {
334 kdb_printf("You already have a breakpoint at "
335 kdb_bfd_vma_fmt0 "\n", template.bp_addr);
336 return KDB_DUPBPT;
337 }
338 }
339
340 template.bp_enabled = 1;
341
342 /*
343 * Actually allocate the breakpoint found earlier
344 */
345 *bp = template;
346 bp->bp_free = 0;
347
348 kdb_printbp(bp, bpno);
349
350 return 0;
351}
352
353/*
354 * kdb_bc
355 *
356 * Handles the 'bc', 'be', and 'bd' commands
357 *
358 * [bd|bc|be] <breakpoint-number>
359 * [bd|bc|be] *
360 *
361 * Parameters:
362 * argc Count of arguments in argv
363 * argv Space delimited command line arguments
364 * Outputs:
365 * None.
366 * Returns:
367 * Zero for success, a kdb diagnostic for failure
368 * Locking:
369 * None.
370 * Remarks:
371 */
372static int kdb_bc(int argc, const char **argv)
373{
374 unsigned long addr;
375 kdb_bp_t *bp = NULL;
376 int lowbp = KDB_MAXBPT;
377 int highbp = 0;
378 int done = 0;
379 int i;
380 int diag = 0;
381
382 int cmd; /* KDBCMD_B? */
383#define KDBCMD_BC 0
384#define KDBCMD_BE 1
385#define KDBCMD_BD 2
386
387 if (strcmp(argv[0], "be") == 0)
388 cmd = KDBCMD_BE;
389 else if (strcmp(argv[0], "bd") == 0)
390 cmd = KDBCMD_BD;
391 else
392 cmd = KDBCMD_BC;
393
394 if (argc != 1)
395 return KDB_ARGCOUNT;
396
397 if (strcmp(argv[1], "*") == 0) {
398 lowbp = 0;
399 highbp = KDB_MAXBPT;
400 } else {
401 diag = kdbgetularg(argv[1], &addr);
402 if (diag)
403 return diag;
404
405 /*
406 * For addresses less than the maximum breakpoint number,
407 * assume that the breakpoint number is desired.
408 */
409 if (addr < KDB_MAXBPT) {
410 bp = &kdb_breakpoints[addr];
411 lowbp = highbp = addr;
412 highbp++;
413 } else {
414 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
415 i++, bp++) {
416 if (bp->bp_addr == addr) {
417 lowbp = highbp = i;
418 highbp++;
419 break;
420 }
421 }
422 }
423 }
424
425 /*
426 * Now operate on the set of breakpoints matching the input
427 * criteria (either '*' for all, or an individual breakpoint).
428 */
429 for (bp = &kdb_breakpoints[lowbp], i = lowbp;
430 i < highbp;
431 i++, bp++) {
432 if (bp->bp_free)
433 continue;
434
435 done++;
436
437 switch (cmd) {
438 case KDBCMD_BC:
439 bp->bp_enabled = 0;
440
441 kdb_printf("Breakpoint %d at "
442 kdb_bfd_vma_fmt " cleared\n",
443 i, bp->bp_addr);
444
445 bp->bp_addr = 0;
446 bp->bp_free = 1;
447
448 break;
449 case KDBCMD_BE:
450 bp->bp_enabled = 1;
451
452 kdb_printf("Breakpoint %d at "
453 kdb_bfd_vma_fmt " enabled",
454 i, bp->bp_addr);
455
456 kdb_printf("\n");
457 break;
458 case KDBCMD_BD:
459 if (!bp->bp_enabled)
460 break;
461
462 bp->bp_enabled = 0;
463
464 kdb_printf("Breakpoint %d at "
465 kdb_bfd_vma_fmt " disabled\n",
466 i, bp->bp_addr);
467
468 break;
469 }
470 if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
471 bp->bp_delay = 0;
472 KDB_STATE_CLEAR(SSBPT);
473 }
474 }
475
476 return (!done) ? KDB_BPTNOTFOUND : 0;
477}
478
479/*
480 * kdb_ss
481 *
482 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
483 * commands.
484 *
485 * ss
486 * ssb
487 *
488 * Parameters:
489 * argc Argument count
490 * argv Argument vector
491 * Outputs:
492 * None.
493 * Returns:
494 * KDB_CMD_SS[B] for success, a kdb error if failure.
495 * Locking:
496 * None.
497 * Remarks:
498 *
499 * Set the arch specific option to trigger a debug trap after the next
500 * instruction.
501 *
502 * For 'ssb', set the trace flag in the debug trap handler
503 * after printing the current insn and return directly without
504 * invoking the kdb command processor, until a branch instruction
505 * is encountered.
506 */
507
508static int kdb_ss(int argc, const char **argv)
509{
510 int ssb = 0;
511
512 ssb = (strcmp(argv[0], "ssb") == 0);
513 if (argc != 0)
514 return KDB_ARGCOUNT;
515 /*
516 * Set trace flag and go.
517 */
518 KDB_STATE_SET(DOING_SS);
519 if (ssb) {
520 KDB_STATE_SET(DOING_SSB);
521 return KDB_CMD_SSB;
522 }
523 return KDB_CMD_SS;
524}
525
526/* Initialize the breakpoint table and register breakpoint commands. */
527
528void __init kdb_initbptab(void)
529{
530 int i;
531 kdb_bp_t *bp;
532
533 /*
534 * First time initialization.
535 */
536 memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
537
538 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
539 bp->bp_free = 1;
540
541 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
542 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
543 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
544 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
545 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
546 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
547 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
548 kdb_register_repeat("bc", kdb_bc, "<bpnum>",
549 "Clear Breakpoint", 0, KDB_REPEAT_NONE);
550 kdb_register_repeat("be", kdb_bc, "<bpnum>",
551 "Enable Breakpoint", 0, KDB_REPEAT_NONE);
552 kdb_register_repeat("bd", kdb_bc, "<bpnum>",
553 "Disable Breakpoint", 0, KDB_REPEAT_NONE);
554
555 kdb_register_repeat("ss", kdb_ss, "",
556 "Single Step", 1, KDB_REPEAT_NO_ARGS);
557 kdb_register_repeat("ssb", kdb_ss, "",
558 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
559 /*
560 * Architecture dependent initialization.
561 */
562}
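
kdb_register_repeat() as used above is the same interface other kernel code (or a module, assuming the symbol is exported to it) can use to add commands: the handler receives (argc, argv) with argv[0] being the command name, and returns 0 or a KDB_* diagnostic. A minimal hedged sketch; the "greet" command and this init stub are hypothetical, not part of this patch:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kdb.h>

/* Hypothetical example command: echoes its single argument. */
static int kdb_greet(int argc, const char **argv)
{
	if (argc != 1)
		return KDB_ARGCOUNT;
	kdb_printf("hello, %s\n", argv[1]);
	return 0;
}

static int __init greet_init(void)
{
	return kdb_register_repeat("greet", kdb_greet, "<name>",
				   "Print a greeting", 0, KDB_REPEAT_NONE);
}
module_init(greet_init);
MODULE_LICENSE("GPL");
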
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 000000000000..2f62fe85f16a
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
1/*
2 * Kernel Debugger Architecture Independent Stack Traceback
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/ctype.h>
13#include <linux/string.h>
14#include <linux/kernel.h>
15#include <linux/sched.h>
16#include <linux/kdb.h>
17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h"
20
21
22static void kdb_show_stack(struct task_struct *p, void *addr)
23{
24 int old_lvl = console_loglevel;
25 console_loglevel = 15;
26 kdb_trap_printk++;
27 kdb_set_current_task(p);
28 if (addr) {
29 show_stack((struct task_struct *)p, addr);
30 } else if (kdb_current_regs) {
31#ifdef CONFIG_X86
32 show_stack(p, &kdb_current_regs->sp);
33#else
34 show_stack(p, NULL);
35#endif
36 } else {
37 show_stack(p, NULL);
38 }
39 console_loglevel = old_lvl;
40 kdb_trap_printk--;
41}
42
43/*
44 * kdb_bt
45 *
46 * This function implements the 'bt' command. Print a stack
47 * traceback.
48 *
49 * bt [<address-expression>] (addr-exp is for alternate stacks)
50 * btp <pid> Kernel stack for <pid>
51 * btt <address-expression> Kernel stack for task structure at
52 * <address-expression>
53 * bta [DRSTCZEUIMA] All useful processes, optionally
54 * filtered by state
55 * btc [<cpu>] The current process on one cpu,
56 * default is all cpus
57 *
58 * bt <address-expression> refers to an address on the stack; that location
59 * is assumed to contain a return address.
60 *
61 * btt <address-expression> refers to the address of a struct task.
62 *
63 * Inputs:
64 * argc argument count
65 * argv argument vector
66 * Outputs:
67 * None.
68 * Returns:
69 * zero for success, a kdb diagnostic if error
70 * Locking:
71 * none.
72 * Remarks:
73 *	Backtrace works best when the code uses frame pointers.  But even
74 * without frame pointers we should get a reasonable trace.
75 *
76 * mds comes in handy when examining the stack to do a manual traceback or
77 * to get a starting point for bt <address-expression>.
78 */
79
80static int
81kdb_bt1(struct task_struct *p, unsigned long mask,
82 int argcount, int btaprompt)
83{
84 char buffer[2];
85 if (kdb_getarea(buffer[0], (unsigned long)p) ||
86 kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
87 return KDB_BADADDR;
88 if (!kdb_task_state(p, mask))
89 return 0;
90 kdb_printf("Stack traceback for pid %d\n", p->pid);
91 kdb_ps1(p);
92 kdb_show_stack(p, NULL);
93 if (btaprompt) {
94 kdb_getstr(buffer, sizeof(buffer),
95 "Enter <q> to end, <cr> to continue:");
96 if (buffer[0] == 'q') {
97 kdb_printf("\n");
98 return 1;
99 }
100 }
101 touch_nmi_watchdog();
102 return 0;
103}
104
105int
106kdb_bt(int argc, const char **argv)
107{
108 int diag;
109 int argcount = 5;
110 int btaprompt = 1;
111 int nextarg;
112 unsigned long addr;
113 long offset;
114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each
117 * proc in bta */
118
119 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p;
121 unsigned long cpu;
122 unsigned long mask = kdb_task_state_string(argc ? argv[1] :
123 NULL);
124 if (argc == 0)
125 kdb_ps_suppressed();
126 /* Run the active tasks first */
127 for_each_online_cpu(cpu) {
128 p = kdb_curr_task(cpu);
129 if (kdb_bt1(p, mask, argcount, btaprompt))
130 return 0;
131 }
132 /* Now the inactive tasks */
133 kdb_do_each_thread(g, p) {
134 if (task_curr(p))
135 continue;
136 if (kdb_bt1(p, mask, argcount, btaprompt))
137 return 0;
138 } kdb_while_each_thread(g, p);
139 } else if (strcmp(argv[0], "btp") == 0) {
140 struct task_struct *p;
141 unsigned long pid;
142 if (argc != 1)
143 return KDB_ARGCOUNT;
144 diag = kdbgetularg((char *)argv[1], &pid);
145 if (diag)
146 return diag;
147 p = find_task_by_pid_ns(pid, &init_pid_ns);
148 if (p) {
149 kdb_set_current_task(p);
150 return kdb_bt1(p, ~0UL, argcount, 0);
151 }
152 kdb_printf("No process with pid == %ld found\n", pid);
153 return 0;
154 } else if (strcmp(argv[0], "btt") == 0) {
155 if (argc != 1)
156 return KDB_ARGCOUNT;
157 diag = kdbgetularg((char *)argv[1], &addr);
158 if (diag)
159 return diag;
160 kdb_set_current_task((struct task_struct *)addr);
161 return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
162 } else if (strcmp(argv[0], "btc") == 0) {
163 unsigned long cpu = ~0;
164 struct task_struct *save_current_task = kdb_current_task;
165 char buf[80];
166 if (argc > 1)
167 return KDB_ARGCOUNT;
168 if (argc == 1) {
169 diag = kdbgetularg((char *)argv[1], &cpu);
170 if (diag)
171 return diag;
172 }
173 /* Recursive use of kdb_parse, do not use argv after
174 * this point */
175 argv = NULL;
176 if (cpu != ~0) {
177 if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
178 kdb_printf("no process for cpu %ld\n", cpu);
179 return 0;
180 }
181 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
182 kdb_parse(buf);
183 return 0;
184 }
185 kdb_printf("btc: cpu status: ");
186 kdb_parse("cpu\n");
187 for_each_online_cpu(cpu) {
188 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
189 kdb_parse(buf);
190 touch_nmi_watchdog();
191 }
192 kdb_set_current_task(save_current_task);
193 return 0;
194 } else {
195 if (argc) {
196 nextarg = 1;
197 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
198 &offset, NULL);
199 if (diag)
200 return diag;
201 kdb_show_stack(kdb_current_task, (void *)addr);
202 return 0;
203 } else {
204 return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
205 }
206 }
207
208 /* NOTREACHED */
209 return 0;
210}
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 000000000000..56c88e4db309
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,35 @@
1# Initial commands for kdb, alter to suit your needs.
2# These commands are executed in kdb_init() context, no SMP, no
3# processes. Commands that require process data (including stack or
4# registers) are not reliable this early. set and bp commands should
5# be safe. Global breakpoint commands affect each cpu as it is booted.
6
7# Standard debugging information for first level support, just type archkdb
8# or archkdbcpu or archkdbshort at the kdb prompt.
9
10defcmd dumpcommon "" "Common kdb debugging"
11 set BTAPROMPT 0
12 set LINES 10000
13 -summary
14 -cpu
15 -ps
16 -dmesg 600
17 -bt
18endefcmd
19
20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R
24 -dumpcommon
25 -bta
26endefcmd
27
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R
32 -dumpcommon
33 -btc
34endefcmd
35
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 000000000000..bf6e8270e957
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,169 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#include <linux/kgdb.h>
12#include <linux/kdb.h>
13#include <linux/kdebug.h>
14#include "kdb_private.h"
15#include "../debug_core.h"
16
17/*
18 * KDB interface to KGDB internals
19 */
20get_char_func kdb_poll_funcs[] = {
21 dbg_io_get_char,
22 NULL,
23 NULL,
24 NULL,
25 NULL,
26 NULL,
27};
28EXPORT_SYMBOL_GPL(kdb_poll_funcs);
29
30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32
33int kdb_stub(struct kgdb_state *ks)
34{
35 int error = 0;
36 kdb_bp_t *bp;
37 unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
38 kdb_reason_t reason = KDB_REASON_OOPS;
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i;
41
42 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY);
45 addr = instruction_pointer(ks->linux_regs);
46 }
47 ks->pass_exception = 0;
48 if (atomic_read(&kgdb_setting_breakpoint))
49 reason = KDB_REASON_KEYBOARD;
50
51 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
52 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
53 reason = KDB_REASON_BREAK;
54 db_result = KDB_DB_BPT;
55 if (addr != instruction_pointer(ks->linux_regs))
56 kgdb_arch_set_pc(ks->linux_regs, addr);
57 break;
58 }
59 }
60 if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
61 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
62 if (bp->bp_free)
63 continue;
64 if (bp->bp_addr == addr) {
65 bp->bp_delay = 1;
66 bp->bp_delayed = 1;
67 /*
68 * SSBPT is set when the kernel debugger must single step a
69 * task in order to re-establish an instruction breakpoint
70 * which uses the instruction replacement mechanism. It is
71 * cleared by any action that removes the need to single-step
72 * the breakpoint.
73 */
74 reason = KDB_REASON_BREAK;
75 db_result = KDB_DB_BPT;
76 KDB_STATE_SET(SSBPT);
77 break;
78 }
79 }
80 }
81
82 if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
83 ks->signo == SIGTRAP) {
84 reason = KDB_REASON_SSTEP;
85 db_result = KDB_DB_BPT;
86 }
87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu;
90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */
93 kdb_bp_remove();
94 KDB_STATE_CLEAR(DOING_SS);
95 KDB_STATE_CLEAR(DOING_SSB);
96 KDB_STATE_SET(PAGER);
97 /* zero out any offline cpu data */
98 for_each_present_cpu(i) {
99 if (!cpu_online(i)) {
100 kgdb_info[i].debuggerinfo = NULL;
101 kgdb_info[i].task = NULL;
102 }
103 }
104 if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC);
107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS);
112 } else {
113 /* Start kdb main loop */
114 error = kdb_main_loop(KDB_REASON_ENTER, reason,
115 ks->err_code, db_result, ks->linux_regs);
116 }
117 /*
118 * Upon exit from the kdb main loop setup break points and restart
119 * the system based on the requested continue state
120 */
121 kdb_initial_cpu = -1;
122 kdb_current_task = NULL;
123 kdb_current_regs = NULL;
124 KDB_STATE_CLEAR(PAGER);
125 kdbnearsym_cleanup();
126 if (error == KDB_CMD_KGDB) {
127 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) {
128 /*
129			 * This is interface glue which allows kdb to transition into
130			 * the gdb stub.  In order to do this the '?' or '' gdb serial
131			 * packet response is processed here, and then control is
132			 * passed to the gdbstub.
133 */
134 if (KDB_STATE(DOING_KGDB))
135 gdbstub_state(ks, "?");
136 else
137 gdbstub_state(ks, "");
138 KDB_STATE_CLEAR(DOING_KGDB);
139 KDB_STATE_CLEAR(DOING_KGDB2);
140 }
141 return DBG_PASS_EVENT;
142 }
143 kdb_bp_install(ks->linux_regs);
144 dbg_activate_sw_breakpoints();
145 /* Set the exit state to a single step or a continue */
146 if (KDB_STATE(DOING_SS))
147 gdbstub_state(ks, "s");
148 else
149 gdbstub_state(ks, "c");
150
151 KDB_FLAG_CLEAR(CATASTROPHIC);
152
153 /* Invoke arch specific exception handling prior to system resume */
154 kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
155 if (ks->pass_exception)
156 kgdb_info[ks->cpu].ret_state = 1;
157 if (error == KDB_CMD_CPU) {
158 KDB_STATE_SET(REENTRY);
159 /*
160 * Force clear the single step bit because kdb emulates this
161 * differently vs the gdbstub
162 */
163 kgdb_single_step = 0;
164 dbg_deactivate_sw_breakpoints();
165 return DBG_SWITCH_CPU_EVENT;
166 }
167 return kgdb_info[ks->cpu].ret_state;
168}
169
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 000000000000..c9b7f4f90bba
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,826 @@
1/*
2 * Kernel Debugger Architecture Independent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/ctype.h>
15#include <linux/kernel.h>
16#include <linux/init.h>
17#include <linux/kdev_t.h>
18#include <linux/console.h>
19#include <linux/string.h>
20#include <linux/sched.h>
21#include <linux/smp.h>
22#include <linux/nmi.h>
23#include <linux/delay.h>
24#include <linux/kgdb.h>
25#include <linux/kdb.h>
26#include <linux/kallsyms.h>
27#include "kdb_private.h"
28
29#define CMD_BUFLEN 256
30char kdb_prompt_str[CMD_BUFLEN];
31
32int kdb_trap_printk;
33
34static void kgdb_transition_check(char *buffer)
35{
36 int slen = strlen(buffer);
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer);
42 }
43}
44
45static int kdb_read_get_key(char *buffer, size_t bufsize)
46{
47#define ESCAPE_UDELAY 1000
48#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
49 char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
50 char *ped = escape_data;
51 int escape_delay = 0;
52 get_char_func *f, *f_escape = NULL;
53 int key;
54
55 for (f = &kdb_poll_funcs[0]; ; ++f) {
56 if (*f == NULL) {
57 /* Reset NMI watchdog once per poll loop */
58 touch_nmi_watchdog();
59 f = &kdb_poll_funcs[0];
60 }
61 if (escape_delay == 2) {
62 *ped = '\0';
63 ped = escape_data;
64 --escape_delay;
65 }
66 if (escape_delay == 1) {
67 key = *ped++;
68 if (!*ped)
69 --escape_delay;
70 break;
71 }
72 key = (*f)();
73 if (key == -1) {
74 if (escape_delay) {
75 udelay(ESCAPE_UDELAY);
76 --escape_delay;
77 }
78 continue;
79 }
80 if (bufsize <= 2) {
81 if (key == '\r')
82 key = '\n';
83 *buffer++ = key;
84 *buffer = '\0';
85 return -1;
86 }
87 if (escape_delay == 0 && key == '\e') {
88 escape_delay = ESCAPE_DELAY;
89 ped = escape_data;
90 f_escape = f;
91 }
92 if (escape_delay) {
93 *ped++ = key;
94 if (f_escape != f) {
95 escape_delay = 2;
96 continue;
97 }
98 if (ped - escape_data == 1) {
99 /* \e */
100 continue;
101 } else if (ped - escape_data == 2) {
102 /* \e<something> */
103 if (key != '[')
104 escape_delay = 2;
105 continue;
106 } else if (ped - escape_data == 3) {
107 /* \e[<something> */
108 int mapkey = 0;
109 switch (key) {
110 case 'A': /* \e[A, up arrow */
111 mapkey = 16;
112 break;
113 case 'B': /* \e[B, down arrow */
114 mapkey = 14;
115 break;
116 case 'C': /* \e[C, right arrow */
117 mapkey = 6;
118 break;
119 case 'D': /* \e[D, left arrow */
120 mapkey = 2;
121 break;
122 case '1': /* dropthrough */
123 case '3': /* dropthrough */
124 /* \e[<1,3,4>], may be home, del, end */
125 case '4':
126 mapkey = -1;
127 break;
128 }
129 if (mapkey != -1) {
130 if (mapkey > 0) {
131 escape_data[0] = mapkey;
132 escape_data[1] = '\0';
133 }
134 escape_delay = 2;
135 }
136 continue;
137 } else if (ped - escape_data == 4) {
138 /* \e[<1,3,4><something> */
139 int mapkey = 0;
140 if (key == '~') {
141 switch (escape_data[2]) {
142 case '1': /* \e[1~, home */
143 mapkey = 1;
144 break;
145 case '3': /* \e[3~, del */
146 mapkey = 4;
147 break;
148 case '4': /* \e[4~, end */
149 mapkey = 5;
150 break;
151 }
152 }
153 if (mapkey > 0) {
154 escape_data[0] = mapkey;
155 escape_data[1] = '\0';
156 }
157 escape_delay = 2;
158 continue;
159 }
160 }
161 break; /* A key to process */
162 }
163 return key;
164}
165
166/*
167 * kdb_read
168 *
169 * This function reads a string of characters, terminated by
170 * a newline, or by reaching the end of the supplied buffer,
171 * from the current kernel debugger console device.
172 * Parameters:
173 * buffer - Address of character buffer to receive input characters.
174 * bufsize - size, in bytes, of the character buffer
175 * Returns:
176 * Returns a pointer to the buffer containing the received
177 * character string. This string will be terminated by a
178 * newline character.
179 * Locking:
180 * No locks are required to be held upon entry to this
181 * function. It is not reentrant - it relies on the fact
182 *	that kdb runs on only one "master debug" cpu.
183 * Remarks:
184 *
185 * The buffer size must be >= 2. A buffer size of 2 means that the caller only
186 * wants a single key.
187 *
188 * An escape key could be the start of a vt100 control sequence such as \e[D
189 * (left arrow) or it could be a character in its own right. The standard
190 * method for detecting the difference is to wait for 2 seconds to see if there
191 * are any other characters. kdb is complicated by the lack of a timer service
192 * (interrupts are off), by multiple input sources and by the need to sometimes
193 * return after just one key. Escape sequence processing has to be done as
194 * states in the polling loop.
195 */
196
197static char *kdb_read(char *buffer, size_t bufsize)
198{
199 char *cp = buffer;
200 char *bufend = buffer+bufsize-2; /* Reserve space for newline
201 * and null byte */
202 char *lastchar;
203 char *p_tmp;
204 char tmp;
205 static char tmpbuffer[CMD_BUFLEN];
206 int len = strlen(buffer);
207 int len_tmp;
208 int tab = 0;
209 int count;
210 int i;
211 int diag, dtab_count;
212 int key;
213
214
215 diag = kdbgetintenv("DTABCOUNT", &dtab_count);
216 if (diag)
217 dtab_count = 30;
218
219 if (len > 0) {
220 cp += len;
221 if (*(buffer+len-1) == '\n')
222 cp--;
223 }
224
225 lastchar = cp;
226 *cp = '\0';
227 kdb_printf("%s", buffer);
228poll_again:
229 key = kdb_read_get_key(buffer, bufsize);
230 if (key == -1)
231 return buffer;
232 if (key != 9)
233 tab = 0;
234 switch (key) {
235 case 8: /* backspace */
236 if (cp > buffer) {
237 if (cp < lastchar) {
238 memcpy(tmpbuffer, cp, lastchar - cp);
239 memcpy(cp-1, tmpbuffer, lastchar - cp);
240 }
241 *(--lastchar) = '\0';
242 --cp;
243 kdb_printf("\b%s \r", cp);
244 tmp = *cp;
245 *cp = '\0';
246 kdb_printf(kdb_prompt_str);
247 kdb_printf("%s", buffer);
248 *cp = tmp;
249 }
250 break;
251 case 13: /* enter */
252 *lastchar++ = '\n';
253 *lastchar++ = '\0';
254 kdb_printf("\n");
255 return buffer;
256 case 4: /* Del */
257 if (cp < lastchar) {
258 memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
259 memcpy(cp, tmpbuffer, lastchar - cp - 1);
260 *(--lastchar) = '\0';
261 kdb_printf("%s \r", cp);
262 tmp = *cp;
263 *cp = '\0';
264 kdb_printf(kdb_prompt_str);
265 kdb_printf("%s", buffer);
266 *cp = tmp;
267 }
268 break;
269 case 1: /* Home */
270 if (cp > buffer) {
271 kdb_printf("\r");
272 kdb_printf(kdb_prompt_str);
273 cp = buffer;
274 }
275 break;
276 case 5: /* End */
277 if (cp < lastchar) {
278 kdb_printf("%s", cp);
279 cp = lastchar;
280 }
281 break;
282 case 2: /* Left */
283 if (cp > buffer) {
284 kdb_printf("\b");
285 --cp;
286 }
287 break;
288 case 14: /* Down */
289 memset(tmpbuffer, ' ',
290 strlen(kdb_prompt_str) + (lastchar-buffer));
291 *(tmpbuffer+strlen(kdb_prompt_str) +
292 (lastchar-buffer)) = '\0';
293 kdb_printf("\r%s\r", tmpbuffer);
294 *lastchar = (char)key;
295 *(lastchar+1) = '\0';
296 return lastchar;
297 case 6: /* Right */
298 if (cp < lastchar) {
299 kdb_printf("%c", *cp);
300 ++cp;
301 }
302 break;
303 case 16: /* Up */
304 memset(tmpbuffer, ' ',
305 strlen(kdb_prompt_str) + (lastchar-buffer));
306 *(tmpbuffer+strlen(kdb_prompt_str) +
307 (lastchar-buffer)) = '\0';
308 kdb_printf("\r%s\r", tmpbuffer);
309 *lastchar = (char)key;
310 *(lastchar+1) = '\0';
311 return lastchar;
312 case 9: /* Tab */
313 if (tab < 2)
314 ++tab;
315 p_tmp = buffer;
316 while (*p_tmp == ' ')
317 p_tmp++;
318 if (p_tmp > cp)
319 break;
320 memcpy(tmpbuffer, p_tmp, cp-p_tmp);
321 *(tmpbuffer + (cp-p_tmp)) = '\0';
322 p_tmp = strrchr(tmpbuffer, ' ');
323 if (p_tmp)
324 ++p_tmp;
325 else
326 p_tmp = tmpbuffer;
327 len = strlen(p_tmp);
328 count = kallsyms_symbol_complete(p_tmp,
329 sizeof(tmpbuffer) -
330 (p_tmp - tmpbuffer));
331 if (tab == 2 && count > 0) {
332 kdb_printf("\n%d symbols are found.", count);
333 if (count > dtab_count) {
334 count = dtab_count;
335 kdb_printf(" But only first %d symbols will"
336 " be printed.\nYou can change the"
337 " environment variable DTABCOUNT.",
338 count);
339 }
340 kdb_printf("\n");
341 for (i = 0; i < count; i++) {
342 if (kallsyms_symbol_next(p_tmp, i) < 0)
343 break;
344 kdb_printf("%s ", p_tmp);
345 *(p_tmp + len) = '\0';
346 }
347 if (i >= dtab_count)
348 kdb_printf("...");
349 kdb_printf("\n");
350 kdb_printf(kdb_prompt_str);
351 kdb_printf("%s", buffer);
352 } else if (tab != 2 && count > 0) {
353 len_tmp = strlen(p_tmp);
354 strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
355 len_tmp = strlen(p_tmp);
356 strncpy(cp, p_tmp+len, len_tmp-len + 1);
357 len = len_tmp - len;
358 kdb_printf("%s", cp);
359 cp += len;
360 lastchar += len;
361 }
362 kdb_nextline = 1; /* reset output line number */
363 break;
364 default:
365 if (key >= 32 && lastchar < bufend) {
366 if (cp < lastchar) {
367 memcpy(tmpbuffer, cp, lastchar - cp);
368 memcpy(cp+1, tmpbuffer, lastchar - cp);
369 *++lastchar = '\0';
370 *cp = key;
371 kdb_printf("%s\r", cp);
372 ++cp;
373 tmp = *cp;
374 *cp = '\0';
375 kdb_printf(kdb_prompt_str);
376 kdb_printf("%s", buffer);
377 *cp = tmp;
378 } else {
379 *++lastchar = '\0';
380 *cp++ = key;
381 /* The kgdb transition check will hide
382 * printed characters if we think that
383 * kgdb is connecting, until the check
384 * fails */
385 if (!KDB_STATE(KGDB_TRANS))
386 kgdb_transition_check(buffer);
387 else
388 kdb_printf("%c", key);
389 }
390 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) {
393 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB);
395 return buffer;
396 }
397 if (lastchar - buffer >= 14 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) {
399 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2);
401 return buffer;
402 }
403 }
404 break;
405 }
406 goto poll_again;
407}
408
409/*
410 * kdb_getstr
411 *
412 * Print the prompt string and read a command from the
413 * input device.
414 *
415 * Parameters:
416 * buffer Address of buffer to receive command
417 * bufsize Size of buffer in bytes
418 * prompt Pointer to string to use as prompt string
419 * Returns:
420 * Pointer to command buffer.
421 * Locking:
422 * None.
423 * Remarks:
424 * For SMP kernels, the processor number will be
425 * substituted for %d, %x or %o in the prompt.
426 */
427
428char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
429{
430 if (prompt && kdb_prompt_str != prompt)
431 strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
432 kdb_printf(kdb_prompt_str);
433 kdb_nextline = 1; /* Prompt and input resets line number */
434 return kdb_read(buffer, bufsize);
435}
436
437/*
438 * kdb_input_flush
439 *
440 * Get rid of any buffered console input.
441 *
442 * Parameters:
443 * none
444 * Returns:
445 * nothing
446 * Locking:
447 * none
448 * Remarks:
449 * Call this function whenever you want to flush input. If there is any
450 * outstanding input, it ignores all characters until there has been no
451 * data for approximately 1ms.
452 */
453
454static void kdb_input_flush(void)
455{
456 get_char_func *f;
457 int res;
458 int flush_delay = 1;
459 while (flush_delay) {
460 flush_delay--;
461empty:
462 touch_nmi_watchdog();
463 for (f = &kdb_poll_funcs[0]; *f; ++f) {
464 res = (*f)();
465 if (res != -1) {
466 flush_delay = 1;
467 goto empty;
468 }
469 }
470 if (flush_delay)
471 mdelay(1);
472 }
473}
474
475/*
476 * kdb_printf
477 *
478 * Print a string to the output device(s).
479 *
480 * Parameters:
481 * printf-like format and optional args.
482 * Returns:
483 * 0
484 * Locking:
485 * None.
486 * Remarks:
487 * use 'kdbcons->write()' to avoid polluting 'log_buf' with
488 * kdb output.
489 *
490 * If the user is doing a cmd args | grep srch
491 * then kdb_grepping_flag is set.
492 * In that case we need to accumulate full lines (ending in \n) before
493 * searching for the pattern.
494 */
495
496static char kdb_buffer[256]; /* A bit too big to go on stack */
497static char *next_avail = kdb_buffer;
498static int size_avail;
499static int suspend_grep;
500
501/*
502 * search arg1 to see if it contains arg2
503 * (kdmain.c provides flags for ^pat and pat$)
504 *
505 * return 1 for found, 0 for not found
506 */
507static int kdb_search_string(char *searched, char *searchfor)
508{
509 char firstchar, *cp;
510 int len1, len2;
511
512 /* not counting the newline at the end of "searched" */
513 len1 = strlen(searched)-1;
514 len2 = strlen(searchfor);
515 if (len1 < len2)
516 return 0;
517 if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
518 return 0;
519 if (kdb_grep_leading) {
520 if (!strncmp(searched, searchfor, len2))
521 return 1;
522 } else if (kdb_grep_trailing) {
523 if (!strncmp(searched+len1-len2, searchfor, len2))
524 return 1;
525 } else {
526 firstchar = *searchfor;
527 cp = searched;
528 while ((cp = strchr(cp, firstchar))) {
529 if (!strncmp(cp, searchfor, len2))
530 return 1;
531 cp++;
532 }
533 }
534 return 0;
535}
536
537int vkdb_printf(const char *fmt, va_list ap)
538{
539 int diag;
540 int linecount;
541 int logging, saved_loglevel = 0;
542 int saved_trap_printk;
543 int got_printf_lock = 0;
544 int retlen = 0;
545 int fnd, len;
546 char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
547 char *moreprompt = "more> ";
548 struct console *c = console_drivers;
549 static DEFINE_SPINLOCK(kdb_printf_lock);
550 unsigned long uninitialized_var(flags);
551
552 preempt_disable();
553 saved_trap_printk = kdb_trap_printk;
554 kdb_trap_printk = 0;
555
556 /* Serialize kdb_printf if multiple cpus try to write at once.
557 * But if any cpu goes recursive in kdb, just print the output,
558 * even if it is interleaved with any other text.
559 */
560 if (!KDB_STATE(PRINTF_LOCK)) {
561 KDB_STATE_SET(PRINTF_LOCK);
562 spin_lock_irqsave(&kdb_printf_lock, flags);
563 got_printf_lock = 1;
564 atomic_inc(&kdb_event);
565 } else {
566 __acquire(kdb_printf_lock);
567 }
568
569 diag = kdbgetintenv("LINES", &linecount);
570 if (diag || linecount <= 1)
571 linecount = 24;
572
573 diag = kdbgetintenv("LOGGING", &logging);
574 if (diag)
575 logging = 0;
576
577 if (!kdb_grepping_flag || suspend_grep) {
578 /* normally, every vsnprintf starts a new buffer */
579 next_avail = kdb_buffer;
580 size_avail = sizeof(kdb_buffer);
581 }
582 vsnprintf(next_avail, size_avail, fmt, ap);
583
584 /*
585 * If kdb_parse() found that the command was cmd xxx | grep yyy
586 * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
587 *
588 * Accumulate the print data up to a newline before searching it.
589 * (vsnprintf does null-terminate the string that it generates)
590 */
591
592 /* skip the search if prints are temporarily unconditional */
593 if (!suspend_grep && kdb_grepping_flag) {
594 cp = strchr(kdb_buffer, '\n');
595 if (!cp) {
596 /*
597 * Special cases that don't end with newlines
598 * but should be written without one:
599 * The "[nn]kdb> " prompt should
600 * appear at the front of the buffer.
601 *
602			 * The "[nn]more> " prompt (MOREPROMPT ->
603			 * moreprompt) should also be written, but
604			 * we print that ourselves, so we set the
605			 * suspend_grep flag to make it
606			 * unconditional.
607 *
608 */
609 if (next_avail == kdb_buffer) {
610 /*
611 * these should occur after a newline,
612 * so they will be at the front of the
613 * buffer
614 */
615 cp2 = kdb_buffer;
616 len = strlen(kdb_prompt_str);
617 if (!strncmp(cp2, kdb_prompt_str, len)) {
618 /*
619 * We're about to start a new
620 * command, so we can go back
621 * to normal mode.
622 */
623 kdb_grepping_flag = 0;
624 goto kdb_printit;
625 }
626 }
627 /* no newline; don't search/write the buffer
628 until one is there */
629 len = strlen(kdb_buffer);
630 next_avail = kdb_buffer + len;
631 size_avail = sizeof(kdb_buffer) - len;
632 goto kdb_print_out;
633 }
634
635 /*
636 * The newline is present; print through it or discard
637 * it, depending on the results of the search.
638 */
639 cp++; /* to byte after the newline */
640 replaced_byte = *cp; /* remember what/where it was */
641 cphold = cp;
642 *cp = '\0'; /* end the string for our search */
643
644 /*
645 * We now have a newline at the end of the string
646 * Only continue with this output if it contains the
647 * search string.
648 */
649 fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
650 if (!fnd) {
651 /*
652 * At this point the complete line at the start
653 * of kdb_buffer can be discarded, as it does
654 * not contain what the user is looking for.
655 * Shift the buffer left.
656 */
657 *cphold = replaced_byte;
658 strcpy(kdb_buffer, cphold);
659 len = strlen(kdb_buffer);
660 next_avail = kdb_buffer + len;
661 size_avail = sizeof(kdb_buffer) - len;
662 goto kdb_print_out;
663 }
664 /*
665 * at this point the string is a full line and
666 * should be printed, up to the null.
667 */
668 }
669kdb_printit:
670
671 /*
672 * Write to all consoles.
673 */
674 retlen = strlen(kdb_buffer);
675 if (!dbg_kdb_mode && kgdb_connected) {
676 gdbstub_msg_write(kdb_buffer, retlen);
677 } else {
678 if (!dbg_io_ops->is_console) {
679 len = strlen(kdb_buffer);
680 cp = kdb_buffer;
681 while (len--) {
682 dbg_io_ops->write_char(*cp);
683 cp++;
684 }
685 }
686 while (c) {
687 c->write(c, kdb_buffer, retlen);
688 touch_nmi_watchdog();
689 c = c->next;
690 }
691 }
692 if (logging) {
693 saved_loglevel = console_loglevel;
694 console_loglevel = 0;
695 printk(KERN_INFO "%s", kdb_buffer);
696 }
697
698 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
699 kdb_nextline++;
700
701 /* check for having reached the LINES number of printed lines */
702 if (kdb_nextline == linecount) {
703 char buf1[16] = "";
704#if defined(CONFIG_SMP)
705 char buf2[32];
706#endif
707
708 /* Watch out for recursion here. Any routine that calls
709 * kdb_printf will come back through here. And kdb_read
710 * uses kdb_printf to echo on serial consoles ...
711 */
712 kdb_nextline = 1; /* In case of recursion */
713
714 /*
715 * Pause until cr.
716 */
717 moreprompt = kdbgetenv("MOREPROMPT");
718 if (moreprompt == NULL)
719 moreprompt = "more> ";
720
721#if defined(CONFIG_SMP)
722 if (strchr(moreprompt, '%')) {
723 sprintf(buf2, moreprompt, get_cpu());
724 put_cpu();
725 moreprompt = buf2;
726 }
727#endif
728
729 kdb_input_flush();
730 c = console_drivers;
731
732 if (!dbg_io_ops->is_console) {
733 len = strlen(moreprompt);
734 cp = moreprompt;
735 while (len--) {
736 dbg_io_ops->write_char(*cp);
737 cp++;
738 }
739 }
740 while (c) {
741 c->write(c, moreprompt, strlen(moreprompt));
742 touch_nmi_watchdog();
743 c = c->next;
744 }
745
746 if (logging)
747 printk("%s", moreprompt);
748
749 kdb_read(buf1, 2); /* '2' indicates to return
750 * immediately after getting one key. */
751 kdb_nextline = 1; /* Really set output line 1 */
752
753 /* empty and reset the buffer: */
754 kdb_buffer[0] = '\0';
755 next_avail = kdb_buffer;
756 size_avail = sizeof(kdb_buffer);
757 if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
758 /* user hit q or Q */
759 KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
760 KDB_STATE_CLEAR(PAGER);
761 /* end of command output; back to normal mode */
762 kdb_grepping_flag = 0;
763 kdb_printf("\n");
764 } else if (buf1[0] == ' ') {
765 kdb_printf("\n");
766 suspend_grep = 1; /* for this recursion */
767 } else if (buf1[0] == '\n') {
768 kdb_nextline = linecount - 1;
769 kdb_printf("\r");
770 suspend_grep = 1; /* for this recursion */
771 } else if (buf1[0] && buf1[0] != '\n') {
772 /* user hit something other than enter */
773 suspend_grep = 1; /* for this recursion */
774 kdb_printf("\nOnly 'q' or 'Q' are processed at more "
775 "prompt, input ignored\n");
776 } else if (kdb_grepping_flag) {
777 /* user hit enter */
778 suspend_grep = 1; /* for this recursion */
779 kdb_printf("\n");
780 }
781 kdb_input_flush();
782 }
783
784 /*
785 * For grep searches, shift the printed string left.
786 * replaced_byte contains the character that was overwritten with
787 * the terminating null, and cphold points to the null.
788 * Then adjust the notion of available space in the buffer.
789 */
790 if (kdb_grepping_flag && !suspend_grep) {
791 *cphold = replaced_byte;
792 strcpy(kdb_buffer, cphold);
793 len = strlen(kdb_buffer);
794 next_avail = kdb_buffer + len;
795 size_avail = sizeof(kdb_buffer) - len;
796 }
797
798kdb_print_out:
799 suspend_grep = 0; /* end of what may have been a recursive call */
800 if (logging)
801 console_loglevel = saved_loglevel;
802 if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
803 got_printf_lock = 0;
804 spin_unlock_irqrestore(&kdb_printf_lock, flags);
805 KDB_STATE_CLEAR(PRINTF_LOCK);
806 atomic_dec(&kdb_event);
807 } else {
808 __release(kdb_printf_lock);
809 }
810 kdb_trap_printk = saved_trap_printk;
811 preempt_enable();
812 return retlen;
813}
814
815int kdb_printf(const char *fmt, ...)
816{
817 va_list ap;
818 int r;
819
820 va_start(ap, fmt);
821 r = vkdb_printf(fmt, ap);
822 va_end(ap);
823
824 return r;
825}
826
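
kdb_trap_printk, declared near the top of this file, is how kdb code captures printk output: while it is non-zero the printk path hands its output to vkdb_printf(), so the text obeys the pager and the | grep handling above. A hedged sketch of the calling pattern, modelled on kdb_show_stack() in kdb_bt.c earlier in this patch; the helper below is hypothetical:

#include <linux/kdb.h>
#include <linux/sched.h>

/* Hypothetical helper: run a printk-producing routine while the
 * debugger owns the console so its output is routed through
 * vkdb_printf() (pager, grep, logging) instead of the normal log.
 */
static void kdb_dump_task_state(struct task_struct *p)
{
	kdb_trap_printk++;	/* printk() now feeds vkdb_printf() */
	sched_show_task(p);	/* any routine that printk()s will do */
	kdb_trap_printk--;	/* restore normal printk behaviour */
}
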
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 000000000000..4bca634975c0
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,212 @@
1/*
2 * Kernel Debugger Architecture Dependent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License.
6 *
7 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
8 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
9 */
10
11#include <linux/kdb.h>
12#include <linux/keyboard.h>
13#include <linux/ctype.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17/* Keyboard Controller Registers on normal PCs. */
18
19#define KBD_STATUS_REG 0x64 /* Status register (R) */
20#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
21
22/* Status Register Bits */
23
24#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26
27static int kbd_exists;
28
29/*
30 * Check if the keyboard controller has a keypress for us.
31 * Some parts (Enter Release, LED change) are still polled here in a
32 * blocking fashion, but hopefully they are all short.
33 */
34int kdb_get_kbd_char(void)
35{
36 int scancode, scanstatus;
37 static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
38 static int shift_key; /* Shift next keypress */
39 static int ctrl_key;
40 u_short keychar;
41
42 if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
43 (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
44 kbd_exists = 0;
45 return -1;
46 }
47 kbd_exists = 1;
48
49 if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
50 return -1;
51
52 /*
53 * Fetch the scancode
54 */
55 scancode = inb(KBD_DATA_REG);
56 scanstatus = inb(KBD_STATUS_REG);
57
58 /*
59 * Ignore mouse events.
60 */
61 if (scanstatus & KBD_STAT_MOUSE_OBF)
62 return -1;
63
64 /*
65 * Ignore release, trigger on make
66 * (except for shift keys, where we want to
67 * keep the shift state so long as the key is
68 * held down).
69 */
70
71 if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
72 /*
73 * Next key may use shift table
74 */
75 if ((scancode & 0x80) == 0)
76 shift_key = 1;
77 else
78 shift_key = 0;
79 return -1;
80 }
81
82 if ((scancode&0x7f) == 0x1d) {
83 /*
84 * Left ctrl key
85 */
86 if ((scancode & 0x80) == 0)
87 ctrl_key = 1;
88 else
89 ctrl_key = 0;
90 return -1;
91 }
92
93 if ((scancode & 0x80) != 0)
94 return -1;
95
96 scancode &= 0x7f;
97
98 /*
99 * Translate scancode
100 */
101
102 if (scancode == 0x3a) {
103 /*
104 * Toggle caps lock
105 */
106 shift_lock ^= 1;
107
108#ifdef KDB_BLINK_LED
109 kdb_toggleled(0x4);
110#endif
111 return -1;
112 }
113
114 if (scancode == 0x0e) {
115 /*
116 * Backspace
117 */
118 return 8;
119 }
120
121 /* Special Key */
122 switch (scancode) {
123 case 0xF: /* Tab */
124 return 9;
125 case 0x53: /* Del */
126 return 4;
127 case 0x47: /* Home */
128 return 1;
129 case 0x4F: /* End */
130 return 5;
131 case 0x4B: /* Left */
132 return 2;
133 case 0x48: /* Up */
134 return 16;
135 case 0x50: /* Down */
136 return 14;
137 case 0x4D: /* Right */
138 return 6;
139 }
140
141 if (scancode == 0xe0)
142 return -1;
143
144 /*
145 * For Japanese 86/106 keyboards
146 * See comment in drivers/char/pc_keyb.c.
147 * - Masahiro Adegawa
148 */
149 if (scancode == 0x73)
150 scancode = 0x59;
151 else if (scancode == 0x7d)
152 scancode = 0x7c;
153
154 if (!shift_lock && !shift_key && !ctrl_key) {
155 keychar = plain_map[scancode];
156 } else if ((shift_lock || shift_key) && key_maps[1]) {
157 keychar = key_maps[1][scancode];
158 } else if (ctrl_key && key_maps[4]) {
159 keychar = key_maps[4][scancode];
160 } else {
161 keychar = 0x0020;
162 kdb_printf("Unknown state/scancode (%d)\n", scancode);
163 }
164 keychar &= 0x0fff;
165 if (keychar == '\t')
166 keychar = ' ';
167 switch (KTYP(keychar)) {
168 case KT_LETTER:
169 case KT_LATIN:
170 if (isprint(keychar))
171 break; /* printable characters */
172 /* drop through */
173 case KT_SPEC:
174 if (keychar == K_ENTER)
175 break;
176 /* drop through */
177 default:
178 return -1; /* ignore unprintables */
179 }
180
181 if ((scancode & 0x7f) == 0x1c) {
182 /*
183 * enter key. All done. Absorb the release scancode.
184 */
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ;
187
188 /*
189 * Fetch the scancode
190 */
191 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG);
193
194 while (scanstatus & KBD_STAT_MOUSE_OBF) {
195 scancode = inb(KBD_DATA_REG);
196 scanstatus = inb(KBD_STATUS_REG);
197 }
198
199 if (scancode != 0x9c) {
200 /*
201 * Wasn't an enter-release, why not?
202 */
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
204 scancode, scanstatus);
205 }
206
207 return 13;
208 }
209
210 return keychar & 0xff;
211}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
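
kdb_get_kbd_char() is exported so the console glue can add it to the kdb_poll_funcs[] array (populated in kdb_debugger.c earlier in this patch), which kdb_read_get_key() cycles through while polling for input. A hedged sketch of that wiring; the registration helper is hypothetical, the declarations are assumed to come from <linux/kdb.h>, and the real hook-up lives in the kgdboc driver:

#include <linux/kdb.h>

extern int kdb_get_kbd_char(void);	/* from kdb_keyboard.c above */

/* Hypothetical sketch: append a polled keyboard source to the table
 * of input poll routines.  Slot 0 is the serial dbg_io_get_char entry
 * installed by kdb_debugger.c; kdb_poll_idx tracks the next free slot.
 */
static void example_register_kbd_poll(void)
{
	kdb_poll_funcs[kdb_poll_idx] = kdb_get_kbd_char;
	kdb_poll_idx++;
}
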
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 000000000000..caf057a3de0e
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2956 @@
1/*
2 * Kernel Debugger Architecture Independent Main Code
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
10 * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
11 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
12 */
13
14#include <linux/ctype.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/reboot.h>
18#include <linux/sched.h>
19#include <linux/sysrq.h>
20#include <linux/smp.h>
21#include <linux/utsname.h>
22#include <linux/vmalloc.h>
23#include <linux/module.h>
24#include <linux/mm.h>
25#include <linux/init.h>
26#include <linux/kallsyms.h>
27#include <linux/kgdb.h>
28#include <linux/kdb.h>
29#include <linux/notifier.h>
30#include <linux/interrupt.h>
31#include <linux/delay.h>
32#include <linux/nmi.h>
33#include <linux/time.h>
34#include <linux/ptrace.h>
35#include <linux/sysctl.h>
36#include <linux/cpu.h>
37#include <linux/kdebug.h>
38#include <linux/proc_fs.h>
39#include <linux/uaccess.h>
40#include <linux/slab.h>
41#include "kdb_private.h"
42
43#define GREP_LEN 256
44char kdb_grep_string[GREP_LEN];
45int kdb_grepping_flag;
46EXPORT_SYMBOL(kdb_grepping_flag);
47int kdb_grep_leading;
48int kdb_grep_trailing;
49
50/*
51 * Kernel debugger state flags
52 */
53int kdb_flags;
54atomic_t kdb_event;
55
56/*
57 * kdb_lock protects updates to kdb_initial_cpu. Used to
58 * single thread processors through the kernel debugger.
59 */
60int kdb_initial_cpu = -1; /* cpu number that owns kdb */
61int kdb_nextline = 1;
62int kdb_state; /* General KDB state */
63
64struct task_struct *kdb_current_task;
65EXPORT_SYMBOL(kdb_current_task);
66struct pt_regs *kdb_current_regs;
67
68const char *kdb_diemsg;
69static int kdb_go_count;
70#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
71static unsigned int kdb_continue_catastrophic =
72 CONFIG_KDB_CONTINUE_CATASTROPHIC;
73#else
74static unsigned int kdb_continue_catastrophic;
75#endif
76
77/* kdb_commands describes the available commands. */
78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++)
86
87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */
89 char *km_msg; /* Corresponding message text */
90} kdbmsg_t;
91
92#define KDBMSG(msgnum, text) \
93 { KDB_##msgnum, text }
94
95static kdbmsg_t kdbmsgs[] = {
96 KDBMSG(NOTFOUND, "Command Not Found"),
97 KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
98 KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
99 "8 is only allowed on 64 bit systems"),
100 KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
101 KDBMSG(NOTENV, "Cannot find environment variable"),
102 KDBMSG(NOENVVALUE, "Environment variable should have value"),
103 KDBMSG(NOTIMP, "Command not implemented"),
104 KDBMSG(ENVFULL, "Environment full"),
105 KDBMSG(ENVBUFFULL, "Environment buffer full"),
106 KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
107#ifdef CONFIG_CPU_XSCALE
108 KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
109#else
110 KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
111#endif
112 KDBMSG(DUPBPT, "Duplicate breakpoint address"),
113 KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
114 KDBMSG(BADMODE, "Invalid IDMODE"),
115 KDBMSG(BADINT, "Illegal numeric value"),
116 KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
117 KDBMSG(BADREG, "Invalid register name"),
118 KDBMSG(BADCPUNUM, "Invalid cpu number"),
119 KDBMSG(BADLENGTH, "Invalid length field"),
120 KDBMSG(NOBP, "No Breakpoint exists"),
121 KDBMSG(BADADDR, "Invalid address"),
122};
123#undef KDBMSG
124
125static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
126
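
The kdbmsgs[] table above pairs each KDB_* diagnostic with its message text; the command loop elsewhere in this file looks the code up to print an error after a command fails. A hedged sketch of that kind of lookup, written against the static table above (illustrative only; the real reporting function is not shown in this hunk):

/* Illustrative only: map a kdb diagnostic code to its message text. */
static const char *example_diag_to_msg(int diag)
{
	int i;

	for (i = 0; i < __nkdb_err; i++) {
		if (kdbmsgs[i].km_diag == diag)
			return kdbmsgs[i].km_msg;
	}
	return "Unknown diagnostic";
}
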
127
128/*
129 * Initial environment. This is all kept static and local to
130 * this file. We don't want to rely on the memory allocation
131 * mechanisms in the kernel, so we use a very limited allocate-only
132 * heap for new and altered environment variables. The entire
133 * environment is limited to a fixed number of entries (add more
134 * to __env[] if required) and a fixed amount of heap (add more to
135 * KDB_ENVBUFSIZE if required).
136 */
137
138static char *__env[] = {
139#if defined(CONFIG_SMP)
140 "PROMPT=[%d]kdb> ",
141 "MOREPROMPT=[%d]more> ",
142#else
143 "PROMPT=kdb> ",
144 "MOREPROMPT=more> ",
145#endif
146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30",
151 "NOSECT=1",
152 (char *)0,
153 (char *)0,
154 (char *)0,
155 (char *)0,
156 (char *)0,
157 (char *)0,
158 (char *)0,
159 (char *)0,
160 (char *)0,
161 (char *)0,
162 (char *)0,
163 (char *)0,
164 (char *)0,
165 (char *)0,
166 (char *)0,
167 (char *)0,
168 (char *)0,
169 (char *)0,
170 (char *)0,
171 (char *)0,
172 (char *)0,
173 (char *)0,
174 (char *)0,
175};
176
177static const int __nenv = (sizeof(__env) / sizeof(char *));
178
179struct task_struct *kdb_curr_task(int cpu)
180{
181 struct task_struct *p = curr_task(cpu);
182#ifdef _TIF_MCA_INIT
183 if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
184 p = krp->p;
185#endif
186 return p;
187}
188
189/*
190 * kdbgetenv - This function will return the character string value of
191 * an environment variable.
192 * Parameters:
193 * match A character string representing an environment variable.
194 * Returns:
195 * NULL No environment variable matches 'match'
196 * char* Pointer to string value of environment variable.
197 */
198char *kdbgetenv(const char *match)
199{
200 char **ep = __env;
201 int matchlen = strlen(match);
202 int i;
203
204 for (i = 0; i < __nenv; i++) {
205 char *e = *ep++;
206
207 if (!e)
208 continue;
209
210 if ((strncmp(match, e, matchlen) == 0)
211 && ((e[matchlen] == '\0')
212 || (e[matchlen] == '='))) {
213 char *cp = strchr(e, '=');
214 return cp ? ++cp : "";
215 }
216 }
217 return NULL;
218}
219
220/*
221 * kdballocenv - This function is used to allocate bytes for
222 * environment entries.
223 * Parameters:
224 * bytes The number of bytes to allocate from the static buffer.
225 * Returns:
226 * A pointer to the allocated space, or NULL if there is not
227 * enough room left in the static environment buffer.
228 *
229 * Remarks:
230 * We use a static environment buffer (envbuffer) to hold the values
231 * of dynamically generated environment variables (see kdb_set). Buffer
232 * space once allocated is never free'd, so over time, the amount of space
233 * (currently 512 bytes) will be exhausted if env variables are changed
234 * frequently.
235 */
236static char *kdballocenv(size_t bytes)
237{
238#define KDB_ENVBUFSIZE 512
239 static char envbuffer[KDB_ENVBUFSIZE];
240 static int envbufsize;
241 char *ep = NULL;
242
243 if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
244 ep = &envbuffer[envbufsize];
245 envbufsize += bytes;
246 }
247 return ep;
248}
249
250/*
251 * kdbgetulenv - This function will return the value of an unsigned
252 * long-valued environment variable.
253 * Parameters:
254 * match A character string representing a numeric value
255 * Outputs:
256 * *value the unsigned long representation of the env variable 'match'
257 * Returns:
258 * Zero on success, a kdb diagnostic on failure.
259 */
260static int kdbgetulenv(const char *match, unsigned long *value)
261{
262 char *ep;
263
264 ep = kdbgetenv(match);
265 if (!ep)
266 return KDB_NOTENV;
267 if (strlen(ep) == 0)
268 return KDB_NOENVVALUE;
269
270 *value = simple_strtoul(ep, NULL, 0);
271
272 return 0;
273}
274
275/*
276 * kdbgetintenv - This function will return the value of an
277 * integer-valued environment variable.
278 * Parameters:
279 * match A character string representing an integer-valued env variable
280 * Outputs:
281 * *value the integer representation of the environment variable 'match'
282 * Returns:
283 * Zero on success, a kdb diagnostic on failure.
284 */
285int kdbgetintenv(const char *match, int *value)
286{
287 unsigned long val;
288 int diag;
289
290 diag = kdbgetulenv(match, &val);
291 if (!diag)
292 *value = (int) val;
293 return diag;
294}
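/*
 * Illustrative sketch of a typical caller: seed a default first, since
 * *value is only written when the variable exists and has a value:
 *
 *	int radix = 16;
 *	kdbgetintenv("RADIX", &radix);	 radix is unchanged if RADIX is unset
 */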
295
296/*
297 * kdbgetularg - This function will convert a numeric string into an
298 * unsigned long value.
299 * Parameters:
300 * arg A character string representing a numeric value
301 * Outputs:
302 * *value the unsigned long representation of arg.
303 * Returns:
304 * Zero on success, a kdb diagnostic on failure.
305 */
306int kdbgetularg(const char *arg, unsigned long *value)
307{
308 char *endp;
309 unsigned long val;
310
311 val = simple_strtoul(arg, &endp, 0);
312
313 if (endp == arg) {
314 /*
315 * Also try base 16, for us folks too lazy to type the
316 * leading 0x...
317 */
318 val = simple_strtoul(arg, &endp, 16);
319 if (endp == arg)
320 return KDB_BADINT;
321 }
322
323 *value = val;
324
325 return 0;
326}
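/*
 * Illustrative sketch: because of the base-16 fallback above, both of
 * these parse to the same value:
 *
 *	unsigned long v;
 *	kdbgetularg("0xdeadbeef", &v);	 explicit hex prefix
 *	kdbgetularg("deadbeef", &v);	 same result via the base-16 retry
 */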
327
328int kdbgetu64arg(const char *arg, u64 *value)
329{
330 char *endp;
331 u64 val;
332
333 val = simple_strtoull(arg, &endp, 0);
334
335 if (endp == arg) {
336
337 val = simple_strtoull(arg, &endp, 16);
338 if (endp == arg)
339 return KDB_BADINT;
340 }
341
342 *value = val;
343
344 return 0;
345}
346
347/*
348 * kdb_set - This function implements the 'set' command. Alter an
349 * existing environment variable or create a new one.
350 */
351int kdb_set(int argc, const char **argv)
352{
353 int i;
354 char *ep;
355 size_t varlen, vallen;
356
357 /*
358 * we can be invoked two ways:
359 * set var=value argv[1]="var", argv[2]="value"
360 * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
361 * - if the latter, shift 'em down.
362 */
363 if (argc == 3) {
364 argv[2] = argv[3];
365 argc--;
366 }
367
368 if (argc != 2)
369 return KDB_ARGCOUNT;
370
371 /*
372 * Check for internal variables
373 */
374 if (strcmp(argv[1], "KDBDEBUG") == 0) {
375 unsigned int debugflags;
376 char *cp;
377
378 debugflags = simple_strtoul(argv[2], &cp, 0);
379 if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
380 kdb_printf("kdb: illegal debug flags '%s'\n",
381 argv[2]);
382 return 0;
383 }
384 kdb_flags = (kdb_flags &
385 ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
386 | (debugflags << KDB_DEBUG_FLAG_SHIFT);
387
388 return 0;
389 }
390
391 /*
392 * Tokenizer squashed the '=' sign. argv[1] is variable
393 * name, argv[2] = value.
394 */
395 varlen = strlen(argv[1]);
396 vallen = strlen(argv[2]);
397 ep = kdballocenv(varlen + vallen + 2);
398 if (ep == (char *)0)
399 return KDB_ENVBUFFULL;
400
401 sprintf(ep, "%s=%s", argv[1], argv[2]);
402
403 ep[varlen+vallen+1] = '\0';
404
405 for (i = 0; i < __nenv; i++) {
406 if (__env[i]
407 && ((strncmp(__env[i], argv[1], varlen) == 0)
408 && ((__env[i][varlen] == '\0')
409 || (__env[i][varlen] == '=')))) {
410 __env[i] = ep;
411 return 0;
412 }
413 }
414
415 /*
416 * Not an existing variable; place it in an empty slot.
417 */
418 for (i = 0; i < __nenv-1; i++) {
419 if (__env[i] == (char *)0) {
420 __env[i] = ep;
421 return 0;
422 }
423 }
424
425 return KDB_ENVFULL;
426}
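/*
 * Illustrative kdb prompt usage; both spellings are accepted because of
 * the argument shifting above:
 *
 *	kdb> set BTARGS=5
 *	kdb> set BTARGS = 5
 *
 * KDBDEBUG is treated as an internal variable and updates kdb_flags
 * rather than the environment, e.g. "set KDBDEBUG=0x1" (valid bits are
 * limited by KDB_DEBUG_FLAG_MASK).
 */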
427
428static int kdb_check_regs(void)
429{
430 if (!kdb_current_regs) {
431 kdb_printf("No current kdb registers."
432 " You may need to select another task\n");
433 return KDB_BADREG;
434 }
435 return 0;
436}
437
438/*
439 * kdbgetaddrarg - This function is responsible for parsing an
440 * address-expression and returning the value of the expression,
441 * symbol name, and offset to the caller.
442 *
443 * The argument may consist of a numeric value (decimal or
444 * hexadecimal), a symbol name, a register name (preceded by the
445 * percent sign), an environment variable with a numeric value
446 * (preceded by a dollar sign) or a simple arithmetic expression
447 * consisting of a symbol name, +/-, and a numeric constant value
448 * (offset).
449 * Parameters:
450 * argc - count of arguments in argv
451 * argv - argument vector
452 * *nextarg - index to next unparsed argument in argv[]
453 * regs - Register state at time of KDB entry
454 * Outputs:
455 * *value - receives the value of the address-expression
456 * *offset - receives the offset specified, if any
457 * *name - receives the symbol name, if any
458 * *nextarg - index to next unparsed argument in argv[]
459 * Returns:
460 * zero is returned on success, a kdb diagnostic code is
461 * returned on error.
462 */
463int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
464 unsigned long *value, long *offset,
465 char **name)
466{
467 unsigned long addr;
468 unsigned long off = 0;
469 int positive;
470 int diag;
471 int found = 0;
472 char *symname;
473 char symbol = '\0';
474 char *cp;
475 kdb_symtab_t symtab;
476
477 /*
478 * Process arguments which follow the following syntax:
479 *
480 * symbol | numeric-address [+/- numeric-offset]
481 * %register
482 * $environment-variable
483 */
484
485 if (*nextarg > argc)
486 return KDB_ARGCOUNT;
487
488 symname = (char *)argv[*nextarg];
489
490 /*
491 * If there is no whitespace between the symbol
492 * or address and the '+' or '-' symbols, we
493 * remember the character and replace it with a
494 * null so the symbol/value can be properly parsed
495 */
496 cp = strpbrk(symname, "+-");
497 if (cp != NULL) {
498 symbol = *cp;
499 *cp++ = '\0';
500 }
501
502 if (symname[0] == '$') {
503 diag = kdbgetulenv(&symname[1], &addr);
504 if (diag)
505 return diag;
506 } else if (symname[0] == '%') {
507 diag = kdb_check_regs();
508 if (diag)
509 return diag;
510 /* Implement register values with % at a later time as it is
511 * arch optional.
512 */
513 return KDB_NOTIMP;
514 } else {
515 found = kdbgetsymval(symname, &symtab);
516 if (found) {
517 addr = symtab.sym_start;
518 } else {
519 diag = kdbgetularg(argv[*nextarg], &addr);
520 if (diag)
521 return diag;
522 }
523 }
524
525 if (!found)
526 found = kdbnearsym(addr, &symtab);
527
528 (*nextarg)++;
529
530 if (name)
531 *name = symname;
532 if (value)
533 *value = addr;
534 if (offset && name && *name)
535 *offset = addr - symtab.sym_start;
536
537 if ((*nextarg > argc)
538 && (symbol == '\0'))
539 return 0;
540
541 /*
542 * check for +/- and offset
543 */
544
545 if (symbol == '\0') {
546 if ((argv[*nextarg][0] != '+')
547 && (argv[*nextarg][0] != '-')) {
548 /*
549 * Not our argument. Return.
550 */
551 return 0;
552 } else {
553 positive = (argv[*nextarg][0] == '+');
554 (*nextarg)++;
555 }
556 } else
557 positive = (symbol == '+');
558
559 /*
560 * Now there must be an offset!
561 */
562 if ((*nextarg > argc)
563 && (symbol == '\0')) {
564 return KDB_INVADDRFMT;
565 }
566
567 if (!symbol) {
568 cp = (char *)argv[*nextarg];
569 (*nextarg)++;
570 }
571
572 diag = kdbgetularg(cp, &off);
573 if (diag)
574 return diag;
575
576 if (!positive)
577 off = -off;
578
579 if (offset)
580 *offset += off;
581
582 if (value)
583 *value += off;
584
585 return 0;
586}
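/*
 * Illustrative address-expressions accepted by this parser (the symbol
 * and numeric values are only examples):
 *
 *	md 0xc0254010		plain numeric address
 *	md schedule+0x10	symbol plus offset
 *	md $BTARGS		numeric environment variable
 *	md %ax			register form; currently returns KDB_NOTIMP
 */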
587
588static void kdb_cmderror(int diag)
589{
590 int i;
591
592 if (diag >= 0) {
593 kdb_printf("no error detected (diagnostic is %d)\n", diag);
594 return;
595 }
596
597 for (i = 0; i < __nkdb_err; i++) {
598 if (kdbmsgs[i].km_diag == diag) {
599 kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
600 return;
601 }
602 }
603
604 kdb_printf("Unknown diag %d\n", -diag);
605}
606
607/*
608 * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd'
609 * command which defines one command as a set of other commands,
610 * terminated by endefcmd. kdb_defcmd processes the initial
611 * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
612 * the following commands until 'endefcmd'.
613 * Inputs:
614 * argc argument count
615 * argv argument vector
616 * Returns:
617 * zero for success, a kdb diagnostic if error
618 */
619struct defcmd_set {
620 int count;
621 int usable;
622 char *name;
623 char *usage;
624 char *help;
625 char **command;
626};
627static struct defcmd_set *defcmd_set;
628static int defcmd_set_count;
629static int defcmd_in_progress;
630
631/* Forward references */
632static int kdb_exec_defcmd(int argc, const char **argv);
633
634static int kdb_defcmd2(const char *cmdstr, const char *argv0)
635{
636 struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
637 char **save_command = s->command;
638 if (strcmp(argv0, "endefcmd") == 0) {
639 defcmd_in_progress = 0;
640 if (!s->count)
641 s->usable = 0;
642 if (s->usable)
643 kdb_register(s->name, kdb_exec_defcmd,
644 s->usage, s->help, 0);
645 return 0;
646 }
647 if (!s->usable)
648 return KDB_NOTIMP;
649 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
650 if (!s->command) {
651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
652 cmdstr);
653 s->usable = 0;
654 return KDB_NOTIMP;
655 }
656 memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
657 s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
658 kfree(save_command);
659 return 0;
660}
661
662static int kdb_defcmd(int argc, const char **argv)
663{
664 struct defcmd_set *save_defcmd_set = defcmd_set, *s;
665 if (defcmd_in_progress) {
666 kdb_printf("kdb: nested defcmd detected, assuming missing "
667 "endefcmd\n");
668 kdb_defcmd2("endefcmd", "endefcmd");
669 }
670 if (argc == 0) {
671 int i;
672 for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
673 kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
674 s->usage, s->help);
675 for (i = 0; i < s->count; ++i)
676 kdb_printf("%s", s->command[i]);
677 kdb_printf("endefcmd\n");
678 }
679 return 0;
680 }
681 if (argc != 3)
682 return KDB_ARGCOUNT;
683 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
684 GFP_KDB);
685 if (!defcmd_set) {
686 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
687 argv[1]);
688 defcmd_set = save_defcmd_set;
689 return KDB_NOTIMP;
690 }
691 memcpy(defcmd_set, save_defcmd_set,
692 defcmd_set_count * sizeof(*defcmd_set));
693 kfree(save_defcmd_set);
694 s = defcmd_set + defcmd_set_count;
695 memset(s, 0, sizeof(*s));
696 s->usable = 1;
697 s->name = kdb_strdup(argv[1], GFP_KDB);
698 s->usage = kdb_strdup(argv[2], GFP_KDB);
699 s->help = kdb_strdup(argv[3], GFP_KDB);
700 if (s->usage[0] == '"') {
701 strcpy(s->usage, s->usage+1);
702 s->usage[strlen(s->usage)-1] = '\0';
703 }
704 if (s->help[0] == '"') {
705 strcpy(s->help, s->help+1);
706 s->help[strlen(s->help)-1] = '\0';
707 }
708 ++defcmd_set_count;
709 defcmd_in_progress = 1;
710 return 0;
711}
712
713/*
714 * kdb_exec_defcmd - Execute the set of commands associated with this
715 * defcmd name.
716 * Inputs:
717 * argc argument count
718 * argv argument vector
719 * Returns:
720 * zero for success, a kdb diagnostic if error
721 */
722static int kdb_exec_defcmd(int argc, const char **argv)
723{
724 int i, ret;
725 struct defcmd_set *s;
726 if (argc != 0)
727 return KDB_ARGCOUNT;
728 for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
729 if (strcmp(s->name, argv[0]) == 0)
730 break;
731 }
732 if (i == defcmd_set_count) {
733 kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
734 argv[0]);
735 return KDB_NOTIMP;
736 }
737 for (i = 0; i < s->count; ++i) {
738 /* Recursive use of kdb_parse, do not use argv after
739 * this point */
740 argv = NULL;
741 kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
742 ret = kdb_parse(s->command[i]);
743 if (ret)
744 return ret;
745 }
746 return 0;
747}
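/*
 * Illustrative defcmd session; the body may contain any registered kdb
 * commands and is only replayed when the new name is typed:
 *
 *	kdb> defcmd diag "" "dump some state"
 *	kdb> ps
 *	kdb> summary
 *	kdb> endefcmd
 *	kdb> diag
 *
 * While recording, the prompt carries a "[defcmd]" marker.
 */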
748
749/* Command history */
750#define KDB_CMD_HISTORY_COUNT 32
751#define CMD_BUFLEN 200 /* kdb_printf: max printline
752 * size == 256 */
753static unsigned int cmd_head, cmd_tail;
754static unsigned int cmdptr;
755static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
756static char cmd_cur[CMD_BUFLEN];
757
758/*
759 * The "str" argument may point to something like | grep xyz
760 */
761static void parse_grep(const char *str)
762{
763 int len;
764 char *cp = (char *)str, *cp2;
765
766 /* sanity check: we should have been called with the '|' first */
767 if (*cp != '|')
768 return;
769 cp++;
770 while (isspace(*cp))
771 cp++;
772 if (strncmp(cp, "grep ", 5)) {
773 kdb_printf("invalid 'pipe', see grephelp\n");
774 return;
775 }
776 cp += 5;
777 while (isspace(*cp))
778 cp++;
779 cp2 = strchr(cp, '\n');
780 if (cp2)
781 *cp2 = '\0'; /* remove the trailing newline */
782 len = strlen(cp);
783 if (len == 0) {
784 kdb_printf("invalid 'pipe', see grephelp\n");
785 return;
786 }
787 /* now cp points to a nonzero length search string */
788 if (*cp == '"') {
789 /* allow it to be "x y z" by removing the "'s - there must
790 be two of them */
791 cp++;
792 cp2 = strchr(cp, '"');
793 if (!cp2) {
794 kdb_printf("invalid quoted string, see grephelp\n");
795 return;
796 }
797 *cp2 = '\0'; /* end the string where the 2nd " was */
798 }
799 kdb_grep_leading = 0;
800 if (*cp == '^') {
801 kdb_grep_leading = 1;
802 cp++;
803 }
804 len = strlen(cp);
805 kdb_grep_trailing = 0;
806 if (*(cp+len-1) == '$') {
807 kdb_grep_trailing = 1;
808 *(cp+len-1) = '\0';
809 }
810 len = strlen(cp);
811 if (!len)
812 return;
813 if (len >= GREP_LEN) {
814 kdb_printf("search string too long\n");
815 return;
816 }
817 strcpy(kdb_grep_string, cp);
818 kdb_grepping_flag++;
819 return;
820}
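/*
 * Illustrative pipe usage understood by parse_grep; only this single
 * "| grep" form is supported, not a general pipeline:
 *
 *	kdb> dmesg | grep Oops
 *	kdb> ps | grep ^bash$			^ and $ anchor the match
 *	kdb> md 0xc0254010 | grep "dead beef"	quotes allow spaces
 */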
821
822/*
823 * kdb_parse - Parse the command line, search the command table for a
824 * matching command and invoke the command function. This
825 * function may be called recursively, if it is, the second call
826 * will overwrite argv and cbuf. It is the caller's
827 * responsibility to save their argv if they recursively call
828 * kdb_parse().
829 * Parameters:
830 * cmdstr The input command line to be parsed.
831 * regs The registers at the time kdb was entered.
832 * Returns:
833 * Zero for success, a kdb diagnostic if failure.
834 * Remarks:
835 * Limited to 20 tokens.
836 *
837 * Real rudimentary tokenization. Basically only whitespace
838 * is considered a token delimiter (but special consideration
839 * is taken of the '=' sign as used by the 'set' command).
840 *
841 * The algorithm used to tokenize the input string relies on
842 * there being at least one whitespace (or otherwise useless)
843 * character between tokens as the character immediately following
844 * the token is altered in-place to a null-byte to terminate the
845 * token string.
846 */
847
848#define MAXARGC 20
849
850int kdb_parse(const char *cmdstr)
851{
852 static char *argv[MAXARGC];
853 static int argc;
854 static char cbuf[CMD_BUFLEN+2];
855 char *cp;
856 char *cpp, quoted;
857 kdbtab_t *tp;
858 int i, escaped, ignore_errors = 0, check_grep;
859
860 /*
861 * First tokenize the command string.
862 */
863 cp = (char *)cmdstr;
864 kdb_grepping_flag = check_grep = 0;
865
866 if (KDB_FLAG(CMD_INTERRUPT)) {
867 /* Previous command was interrupted, newline must not
868 * repeat the command */
869 KDB_FLAG_CLEAR(CMD_INTERRUPT);
870 KDB_STATE_SET(PAGER);
871 argc = 0; /* no repeat */
872 }
873
874 if (*cp != '\n' && *cp != '\0') {
875 argc = 0;
876 cpp = cbuf;
877 while (*cp) {
878 /* skip whitespace */
879 while (isspace(*cp))
880 cp++;
881 if ((*cp == '\0') || (*cp == '\n') ||
882 (*cp == '#' && !defcmd_in_progress))
883 break;
884 /* special case: check for | grep pattern */
885 if (*cp == '|') {
886 check_grep++;
887 break;
888 }
889 if (cpp >= cbuf + CMD_BUFLEN) {
890 kdb_printf("kdb_parse: command buffer "
891 "overflow, command ignored\n%s\n",
892 cmdstr);
893 return KDB_NOTFOUND;
894 }
895 if (argc >= MAXARGC - 1) {
896 kdb_printf("kdb_parse: too many arguments, "
897 "command ignored\n%s\n", cmdstr);
898 return KDB_NOTFOUND;
899 }
900 argv[argc++] = cpp;
901 escaped = 0;
902 quoted = '\0';
903 /* Copy to next unquoted and unescaped
904 * whitespace or '=' */
905 while (*cp && *cp != '\n' &&
906 (escaped || quoted || !isspace(*cp))) {
907 if (cpp >= cbuf + CMD_BUFLEN)
908 break;
909 if (escaped) {
910 escaped = 0;
911 *cpp++ = *cp++;
912 continue;
913 }
914 if (*cp == '\\') {
915 escaped = 1;
916 ++cp;
917 continue;
918 }
919 if (*cp == quoted)
920 quoted = '\0';
921 else if (*cp == '\'' || *cp == '"')
922 quoted = *cp;
923 *cpp = *cp++;
924 if (*cpp == '=' && !quoted)
925 break;
926 ++cpp;
927 }
928 *cpp++ = '\0'; /* Squash a ws or '=' character */
929 }
930 }
931 if (!argc)
932 return 0;
933 if (check_grep)
934 parse_grep(cp);
935 if (defcmd_in_progress) {
936 int result = kdb_defcmd2(cmdstr, argv[0]);
937 if (!defcmd_in_progress) {
938 argc = 0; /* avoid repeat on endefcmd */
939 *(argv[0]) = '\0';
940 }
941 return result;
942 }
943 if (argv[0][0] == '-' && argv[0][1] &&
944 (argv[0][1] < '0' || argv[0][1] > '9')) {
945 ignore_errors = 1;
946 ++argv[0];
947 }
948
949 for_each_kdbcmd(tp, i) {
950 if (tp->cmd_name) {
951 /*
952 * If this command is allowed to be abbreviated,
953 * check to see if this is it.
954 */
955
956 if (tp->cmd_minlen
957 && (strlen(argv[0]) <= tp->cmd_minlen)) {
958 if (strncmp(argv[0],
959 tp->cmd_name,
960 tp->cmd_minlen) == 0) {
961 break;
962 }
963 }
964
965 if (strcmp(argv[0], tp->cmd_name) == 0)
966 break;
967 }
968 }
969
970 /*
971 * If we don't find a command by this name, see if the first
972 * few characters of this match any of the known commands.
973 * e.g., md1c20 should match md.
974 */
975 if (i == kdb_max_commands) {
976 for_each_kdbcmd(tp, i) {
977 if (tp->cmd_name) {
978 if (strncmp(argv[0],
979 tp->cmd_name,
980 strlen(tp->cmd_name)) == 0) {
981 break;
982 }
983 }
984 }
985 }
986
987 if (i < kdb_max_commands) {
988 int result;
989 KDB_STATE_SET(CMD);
990 result = (*tp->cmd_func)(argc-1, (const char **)argv);
991 if (result && ignore_errors && result > KDB_CMD_GO)
992 result = 0;
993 KDB_STATE_CLEAR(CMD);
994 switch (tp->cmd_repeat) {
995 case KDB_REPEAT_NONE:
996 argc = 0;
997 if (argv[0])
998 *(argv[0]) = '\0';
999 break;
1000 case KDB_REPEAT_NO_ARGS:
1001 argc = 1;
1002 if (argv[1])
1003 *(argv[1]) = '\0';
1004 break;
1005 case KDB_REPEAT_WITH_ARGS:
1006 break;
1007 }
1008 return result;
1009 }
1010
1011 /*
1012 * If the input with which we were presented does not
1013 * map to an existing command, attempt to parse it as an
1014 * address argument and display the result. Useful for
1015 * obtaining the address of a variable, or the nearest symbol
1016 * to an address contained in a register.
1017 */
1018 {
1019 unsigned long value;
1020 char *name = NULL;
1021 long offset;
1022 int nextarg = 0;
1023
1024 if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
1025 &value, &offset, &name)) {
1026 return KDB_NOTFOUND;
1027 }
1028
1029 kdb_printf("%s = ", argv[0]);
1030 kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
1031 kdb_printf("\n");
1032 return 0;
1033 }
1034}
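/*
 * Illustrative consequences of the matching rules above ("cmd" stands in
 * for any registered command name):
 *
 *	kdb> md1c20 0xc0254010	prefix-matches the "md" handler, which then
 *				decodes the width/count suffix itself
 *	kdb> -cmd args		a leading '-' makes errors from cmd non-fatal
 *	kdb> schedule		no command match, so the token is treated as
 *				an address-expression and printed as a symbol
 */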
1035
1036
1037static int handle_ctrl_cmd(char *cmd)
1038{
1039#define CTRL_P 16
1040#define CTRL_N 14
1041
1042 /* initial situation */
1043 if (cmd_head == cmd_tail)
1044 return 0;
1045 switch (*cmd) {
1046 case CTRL_P:
1047 if (cmdptr != cmd_tail)
1048 cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
1049 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1050 return 1;
1051 case CTRL_N:
1052 if (cmdptr != cmd_head)
1053 cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
1054 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1055 return 1;
1056 }
1057 return 0;
1058}
1059
1060/*
1061 * kdb_reboot - This function implements the 'reboot' command. Reboot
1062 * the system immediately, or loop forever on failure.
1063 */
1064static int kdb_reboot(int argc, const char **argv)
1065{
1066 emergency_restart();
1067 kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
1068 while (1)
1069 cpu_relax();
1070 /* NOTREACHED */
1071 return 0;
1072}
1073
1074static void kdb_dumpregs(struct pt_regs *regs)
1075{
1076 int old_lvl = console_loglevel;
1077 console_loglevel = 15;
1078 kdb_trap_printk++;
1079 show_regs(regs);
1080 kdb_trap_printk--;
1081 kdb_printf("\n");
1082 console_loglevel = old_lvl;
1083}
1084
1085void kdb_set_current_task(struct task_struct *p)
1086{
1087 kdb_current_task = p;
1088
1089 if (kdb_task_has_cpu(p)) {
1090 kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
1091 return;
1092 }
1093 kdb_current_regs = NULL;
1094}
1095
1096/*
1097 * kdb_local - The main code for kdb. This routine is invoked on a
1098 * specific processor; it is not global. The main kdb() routine
1099 * ensures that only one processor at a time is in this routine.
1100 * This code is called with the real reason code on the first
1101 * entry to a kdb session, thereafter it is called with reason
1102 * SWITCH, even if the user goes back to the original cpu.
1103 * Inputs:
1104 * reason The reason KDB was invoked
1105 * error The hardware-defined error code
1106 * regs The exception frame at time of fault/breakpoint.
1107 * db_result Result code from the break or debug point.
1108 * Returns:
1109 * 0 KDB was invoked for an event for which it was not responsible
1110 * 1 KDB handled the event for which it was invoked.
1111 * KDB_CMD_GO User typed 'go'.
1112 * KDB_CMD_CPU User switched to another cpu.
1113 * KDB_CMD_SS Single step.
1114 * KDB_CMD_SSB Single step until branch.
1115 */
1116static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1117 kdb_dbtrap_t db_result)
1118{
1119 char *cmdbuf;
1120 int diag;
1121 struct task_struct *kdb_current =
1122 kdb_curr_task(raw_smp_processor_id());
1123
1124 KDB_DEBUG_STATE("kdb_local 1", reason);
1125 kdb_go_count = 0;
1126 if (reason == KDB_REASON_DEBUG) {
1127 /* special case below */
1128 } else {
1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1130 kdb_current, kdb_current->pid);
1131#if defined(CONFIG_SMP)
1132 kdb_printf("on processor %d ", raw_smp_processor_id());
1133#endif
1134 }
1135
1136 switch (reason) {
1137 case KDB_REASON_DEBUG:
1138 {
1139 /*
1140 * If re-entering kdb after a single step
1141 * command, don't print the message.
1142 */
1143 switch (db_result) {
1144 case KDB_DB_BPT:
1145 kdb_printf("\nEntering kdb (0x%p, pid %d) ",
1146 kdb_current, kdb_current->pid);
1147#if defined(CONFIG_SMP)
1148 kdb_printf("on processor %d ", raw_smp_processor_id());
1149#endif
1150 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1151 instruction_pointer(regs));
1152 break;
1153 case KDB_DB_SSB:
1154 /*
1155 * In the midst of ssb command. Just return.
1156 */
1157 KDB_DEBUG_STATE("kdb_local 3", reason);
1158 return KDB_CMD_SSB; /* Continue with SSB command */
1159
1160 break;
1161 case KDB_DB_SS:
1162 break;
1163 case KDB_DB_SSBPT:
1164 KDB_DEBUG_STATE("kdb_local 4", reason);
1165 return 1; /* kdba_db_trap did the work */
1166 default:
1167 kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
1168 db_result);
1169 break;
1170 }
1171
1172 }
1173 break;
1174 case KDB_REASON_ENTER:
1175 if (KDB_STATE(KEYBOARD))
1176 kdb_printf("due to Keyboard Entry\n");
1177 else
1178 kdb_printf("due to KDB_ENTER()\n");
1179 break;
1180 case KDB_REASON_KEYBOARD:
1181 KDB_STATE_SET(KEYBOARD);
1182 kdb_printf("due to Keyboard Entry\n");
1183 break;
1184 case KDB_REASON_ENTER_SLAVE:
1185 /* drop through, slaves only get released via cpu switch */
1186 case KDB_REASON_SWITCH:
1187 kdb_printf("due to cpu switch\n");
1188 break;
1189 case KDB_REASON_OOPS:
1190 kdb_printf("Oops: %s\n", kdb_diemsg);
1191 kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
1192 instruction_pointer(regs));
1193 kdb_dumpregs(regs);
1194 break;
1195 case KDB_REASON_NMI:
1196 kdb_printf("due to NonMaskable Interrupt @ "
1197 kdb_machreg_fmt "\n",
1198 instruction_pointer(regs));
1199 kdb_dumpregs(regs);
1200 break;
1201 case KDB_REASON_SSTEP:
1202 case KDB_REASON_BREAK:
1203 kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
1204 reason == KDB_REASON_BREAK ?
1205 "Breakpoint" : "SS trap", instruction_pointer(regs));
1206 /*
1207 * Determine if this breakpoint is one that we
1208 * are interested in.
1209 */
1210 if (db_result != KDB_DB_BPT) {
1211 kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
1212 db_result);
1213 KDB_DEBUG_STATE("kdb_local 6", reason);
1214 return 0; /* Not for us, dismiss it */
1215 }
1216 break;
1217 case KDB_REASON_RECURSE:
1218 kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
1219 instruction_pointer(regs));
1220 break;
1221 default:
1222 kdb_printf("kdb: unexpected reason code: %d\n", reason);
1223 KDB_DEBUG_STATE("kdb_local 8", reason);
1224 return 0; /* Not for us, dismiss it */
1225 }
1226
1227 while (1) {
1228 /*
1229 * Initialize pager context.
1230 */
1231 kdb_nextline = 1;
1232 KDB_STATE_CLEAR(SUPPRESS);
1233
1234 cmdbuf = cmd_cur;
1235 *cmdbuf = '\0';
1236 *(cmd_hist[cmd_head]) = '\0';
1237
1238 if (KDB_FLAG(ONLY_DO_DUMP)) {
1239 /* kdb is off but a catastrophic error requires a dump.
1240 * Take the dump and reboot.
1241 * Turn on logging so the kdb output appears in the log
1242 * buffer in the dump.
1243 */
1244 const char *setargs[] = { "set", "LOGGING", "1" };
1245 kdb_set(2, setargs);
1246 kdb_reboot(0, NULL);
1247 /*NOTREACHED*/
1248 }
1249
1250do_full_getstr:
1251#if defined(CONFIG_SMP)
1252 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
1253 raw_smp_processor_id());
1254#else
1255 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
1256#endif
1257 if (defcmd_in_progress)
1258 strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
1259
1260 /*
1261 * Fetch command from keyboard
1262 */
1263 cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
1264 if (*cmdbuf != '\n') {
1265 if (*cmdbuf < 32) {
1266 if (cmdptr == cmd_head) {
1267 strncpy(cmd_hist[cmd_head], cmd_cur,
1268 CMD_BUFLEN);
1269 *(cmd_hist[cmd_head] +
1270 strlen(cmd_hist[cmd_head])-1) = '\0';
1271 }
1272 if (!handle_ctrl_cmd(cmdbuf))
1273 *(cmd_cur+strlen(cmd_cur)-1) = '\0';
1274 cmdbuf = cmd_cur;
1275 goto do_full_getstr;
1276 } else {
1277 strncpy(cmd_hist[cmd_head], cmd_cur,
1278 CMD_BUFLEN);
1279 }
1280
1281 cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
1282 if (cmd_head == cmd_tail)
1283 cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
1284 }
1285
1286 cmdptr = cmd_head;
1287 diag = kdb_parse(cmdbuf);
1288 if (diag == KDB_NOTFOUND) {
1289 kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
1290 diag = 0;
1291 }
1292 if (diag == KDB_CMD_GO
1293 || diag == KDB_CMD_CPU
1294 || diag == KDB_CMD_SS
1295 || diag == KDB_CMD_SSB
1296 || diag == KDB_CMD_KGDB)
1297 break;
1298
1299 if (diag)
1300 kdb_cmderror(diag);
1301 }
1302 KDB_DEBUG_STATE("kdb_local 9", diag);
1303 return diag;
1304}
1305
1306
1307/*
1308 * kdb_print_state - Print the state data for the current processor
1309 * for debugging.
1310 * Inputs:
1311 * text Identifies the debug point
1312 * value Any integer value to be printed, e.g. reason code.
1313 */
1314void kdb_print_state(const char *text, int value)
1315{
1316 kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
1317 text, raw_smp_processor_id(), value, kdb_initial_cpu,
1318 kdb_state);
1319}
1320
1321/*
1322 * kdb_main_loop - After initial setup and assignment of the
1323 * controlling cpu, all cpus are in this loop. One cpu is in
1324 * control and will issue the kdb prompt, the others will spin
1325 * until 'go' or cpu switch.
1326 *
1327 * To get a consistent view of the kernel stacks for all
1328 * processes, this routine is invoked from the main kdb code via
1329 * an architecture specific routine. kdba_main_loop is
1330 * responsible for making the kernel stacks consistent for all
1331 * processes, there should be no difference between a blocked
1332 * process and a running process as far as kdb is concerned.
1333 * Inputs:
1334 * reason The reason KDB was invoked
1335 * error The hardware-defined error code
1336 * reason2 kdb's current reason code.
1337 * Initially error but can change
1338 * according to kdb state.
1339 * db_result Result code from break or debug point.
1340 * regs The exception frame at time of fault/breakpoint.
1341 * should always be valid.
1342 * Returns:
1343 * 0 KDB was invoked for an event for which it was not responsible
1344 * 1 KDB handled the event for which it was invoked.
1345 */
1346int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1347 kdb_dbtrap_t db_result, struct pt_regs *regs)
1348{
1349 int result = 1;
1350 /* Stay in kdb() until 'go', 'ss[b]' or an error */
1351 while (1) {
1352 /*
1353 * All processors except the one that is in control
1354 * will spin here.
1355 */
1356 KDB_DEBUG_STATE("kdb_main_loop 1", reason);
1357 while (KDB_STATE(HOLD_CPU)) {
1358 /* state KDB is turned off by kdb_cpu to see if the
1359 * other cpus are still live, each cpu in this loop
1360 * turns it back on.
1361 */
1362 if (!KDB_STATE(KDB))
1363 KDB_STATE_SET(KDB);
1364 }
1365
1366 KDB_STATE_CLEAR(SUPPRESS);
1367 KDB_DEBUG_STATE("kdb_main_loop 2", reason);
1368 if (KDB_STATE(LEAVING))
1369 break; /* Another cpu said 'go' */
1370 /* Still using kdb, this processor is in control */
1371 result = kdb_local(reason2, error, regs, db_result);
1372 KDB_DEBUG_STATE("kdb_main_loop 3", result);
1373
1374 if (result == KDB_CMD_CPU)
1375 break;
1376
1377 if (result == KDB_CMD_SS) {
1378 KDB_STATE_SET(DOING_SS);
1379 break;
1380 }
1381
1382 if (result == KDB_CMD_SSB) {
1383 KDB_STATE_SET(DOING_SS);
1384 KDB_STATE_SET(DOING_SSB);
1385 break;
1386 }
1387
1388 if (result == KDB_CMD_KGDB) {
1389 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)))
1390 kdb_printf("Entering kgdb; please attach a debugger "
1391 "or use $D#44+ or $3#33\n");
1392 break;
1393 }
1394 if (result && result != 1 && result != KDB_CMD_GO)
1395 kdb_printf("\nUnexpected kdb_local return code %d\n",
1396 result);
1397 KDB_DEBUG_STATE("kdb_main_loop 4", reason);
1398 break;
1399 }
1400 if (KDB_STATE(DOING_SS))
1401 KDB_STATE_CLEAR(SSBPT);
1402
1403 return result;
1404}
1405
1406/*
1407 * kdb_mdr - This function implements the guts of the 'mdr', memory
1408 * read command.
1409 * mdr <addr arg> <byte count>
1410 * Inputs:
1411 * addr Start address
1412 * count Number of bytes
1413 * Returns:
1414 * Always 0. Any errors are detected and printed by kdb_getarea.
1415 */
1416static int kdb_mdr(unsigned long addr, unsigned int count)
1417{
1418 unsigned char c;
1419 while (count--) {
1420 if (kdb_getarea(c, addr))
1421 return 0;
1422 kdb_printf("%02x", c);
1423 addr++;
1424 }
1425 kdb_printf("\n");
1426 return 0;
1427}
1428
1429/*
1430 * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
1431 * 'md8' 'mdr' and 'mds' commands.
1432 *
1433 * md|mds [<addr arg> [<line count> [<radix>]]]
1434 * mdWcN [<addr arg> [<line count> [<radix>]]]
1435 * where W is the width (1, 2, 4 or 8) and N is the count.
1436 * e.g., md1c20 reads 20 bytes, 1 at a time.
1437 * mdr <addr arg> <byte count>
1438 */
1439static void kdb_md_line(const char *fmtstr, unsigned long addr,
1440 int symbolic, int nosect, int bytesperword,
1441 int num, int repeat, int phys)
1442{
1443 /* print just one line of data */
1444 kdb_symtab_t symtab;
1445 char cbuf[32];
1446 char *c = cbuf;
1447 int i;
1448 unsigned long word;
1449
1450 memset(cbuf, '\0', sizeof(cbuf));
1451 if (phys)
1452 kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
1453 else
1454 kdb_printf(kdb_machreg_fmt0 " ", addr);
1455
1456 for (i = 0; i < num && repeat--; i++) {
1457 if (phys) {
1458 if (kdb_getphysword(&word, addr, bytesperword))
1459 break;
1460 } else if (kdb_getword(&word, addr, bytesperword))
1461 break;
1462 kdb_printf(fmtstr, word);
1463 if (symbolic)
1464 kdbnearsym(word, &symtab);
1465 else
1466 memset(&symtab, 0, sizeof(symtab));
1467 if (symtab.sym_name) {
1468 kdb_symbol_print(word, &symtab, 0);
1469 if (!nosect) {
1470 kdb_printf("\n");
1471 kdb_printf(" %s %s "
1472 kdb_machreg_fmt " "
1473 kdb_machreg_fmt " "
1474 kdb_machreg_fmt, symtab.mod_name,
1475 symtab.sec_name, symtab.sec_start,
1476 symtab.sym_start, symtab.sym_end);
1477 }
1478 addr += bytesperword;
1479 } else {
1480 union {
1481 u64 word;
1482 unsigned char c[8];
1483 } wc;
1484 unsigned char *cp;
1485#ifdef __BIG_ENDIAN
1486 cp = wc.c + 8 - bytesperword;
1487#else
1488 cp = wc.c;
1489#endif
1490 wc.word = word;
1491#define printable_char(c) \
1492 ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
1493 switch (bytesperword) {
1494 case 8:
1495 *c++ = printable_char(*cp++);
1496 *c++ = printable_char(*cp++);
1497 *c++ = printable_char(*cp++);
1498 *c++ = printable_char(*cp++);
1499 addr += 4;
1500 case 4:
1501 *c++ = printable_char(*cp++);
1502 *c++ = printable_char(*cp++);
1503 addr += 2;
1504 case 2:
1505 *c++ = printable_char(*cp++);
1506 addr++;
1507 case 1:
1508 *c++ = printable_char(*cp++);
1509 addr++;
1510 break;
1511 }
1512#undef printable_char
1513 }
1514 }
1515 kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
1516 " ", cbuf);
1517}
1518
1519static int kdb_md(int argc, const char **argv)
1520{
1521 static unsigned long last_addr;
1522 static int last_radix, last_bytesperword, last_repeat;
1523 int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
1524 int nosect = 0;
1525 char fmtchar, fmtstr[64];
1526 unsigned long addr;
1527 unsigned long word;
1528 long offset = 0;
1529 int symbolic = 0;
1530 int valid = 0;
1531 int phys = 0;
1532
1533 kdbgetintenv("MDCOUNT", &mdcount);
1534 kdbgetintenv("RADIX", &radix);
1535 kdbgetintenv("BYTESPERWORD", &bytesperword);
1536
1537 /* Assume 'md <addr>' and start with environment values */
1538 repeat = mdcount * 16 / bytesperword;
1539
1540 if (strcmp(argv[0], "mdr") == 0) {
1541 if (argc != 2)
1542 return KDB_ARGCOUNT;
1543 valid = 1;
1544 } else if (isdigit(argv[0][2])) {
1545 bytesperword = (int)(argv[0][2] - '0');
1546 if (bytesperword == 0) {
1547 bytesperword = last_bytesperword;
1548 if (bytesperword == 0)
1549 bytesperword = 4;
1550 }
1551 last_bytesperword = bytesperword;
1552 repeat = mdcount * 16 / bytesperword;
1553 if (!argv[0][3])
1554 valid = 1;
1555 else if (argv[0][3] == 'c' && argv[0][4]) {
1556 char *p;
1557 repeat = simple_strtoul(argv[0] + 4, &p, 10);
1558 mdcount = ((repeat * bytesperword) + 15) / 16;
1559 valid = !*p;
1560 }
1561 last_repeat = repeat;
1562 } else if (strcmp(argv[0], "md") == 0)
1563 valid = 1;
1564 else if (strcmp(argv[0], "mds") == 0)
1565 valid = 1;
1566 else if (strcmp(argv[0], "mdp") == 0) {
1567 phys = valid = 1;
1568 }
1569 if (!valid)
1570 return KDB_NOTFOUND;
1571
1572 if (argc == 0) {
1573 if (last_addr == 0)
1574 return KDB_ARGCOUNT;
1575 addr = last_addr;
1576 radix = last_radix;
1577 bytesperword = last_bytesperword;
1578 repeat = last_repeat;
1579 mdcount = ((repeat * bytesperword) + 15) / 16;
1580 }
1581
1582 if (argc) {
1583 unsigned long val;
1584 int diag, nextarg = 1;
1585 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
1586 &offset, NULL);
1587 if (diag)
1588 return diag;
1589 if (argc > nextarg+2)
1590 return KDB_ARGCOUNT;
1591
1592 if (argc >= nextarg) {
1593 diag = kdbgetularg(argv[nextarg], &val);
1594 if (!diag) {
1595 mdcount = (int) val;
1596 repeat = mdcount * 16 / bytesperword;
1597 }
1598 }
1599 if (argc >= nextarg+1) {
1600 diag = kdbgetularg(argv[nextarg+1], &val);
1601 if (!diag)
1602 radix = (int) val;
1603 }
1604 }
1605
1606 if (strcmp(argv[0], "mdr") == 0)
1607 return kdb_mdr(addr, mdcount);
1608
1609 switch (radix) {
1610 case 10:
1611 fmtchar = 'd';
1612 break;
1613 case 16:
1614 fmtchar = 'x';
1615 break;
1616 case 8:
1617 fmtchar = 'o';
1618 break;
1619 default:
1620 return KDB_BADRADIX;
1621 }
1622
1623 last_radix = radix;
1624
1625 if (bytesperword > KDB_WORD_SIZE)
1626 return KDB_BADWIDTH;
1627
1628 switch (bytesperword) {
1629 case 8:
1630 sprintf(fmtstr, "%%16.16l%c ", fmtchar);
1631 break;
1632 case 4:
1633 sprintf(fmtstr, "%%8.8l%c ", fmtchar);
1634 break;
1635 case 2:
1636 sprintf(fmtstr, "%%4.4l%c ", fmtchar);
1637 break;
1638 case 1:
1639 sprintf(fmtstr, "%%2.2l%c ", fmtchar);
1640 break;
1641 default:
1642 return KDB_BADWIDTH;
1643 }
1644
1645 last_repeat = repeat;
1646 last_bytesperword = bytesperword;
1647
1648 if (strcmp(argv[0], "mds") == 0) {
1649 symbolic = 1;
1650 /* Do not save these changes as last_*, they are temporary mds
1651 * overrides.
1652 */
1653 bytesperword = KDB_WORD_SIZE;
1654 repeat = mdcount;
1655 kdbgetintenv("NOSECT", &nosect);
1656 }
1657
1658 /* Round address down modulo BYTESPERWORD */
1659
1660 addr &= ~(bytesperword-1);
1661
1662 while (repeat > 0) {
1663 unsigned long a;
1664 int n, z, num = (symbolic ? 1 : (16 / bytesperword));
1665
1666 if (KDB_FLAG(CMD_INTERRUPT))
1667 return 0;
1668 for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
1669 if (phys) {
1670 if (kdb_getphysword(&word, a, bytesperword)
1671 || word)
1672 break;
1673 } else if (kdb_getword(&word, a, bytesperword) || word)
1674 break;
1675 }
1676 n = min(num, repeat);
1677 kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
1678 num, repeat, phys);
1679 addr += bytesperword * n;
1680 repeat -= n;
1681 z = (z + num - 1) / num;
1682 if (z > 2) {
1683 int s = num * (z-2);
1684 kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
1685 " zero suppressed\n",
1686 addr, addr + bytesperword * s - 1);
1687 addr += bytesperword * s;
1688 repeat -= s;
1689 }
1690 }
1691 last_addr = addr;
1692
1693 return 0;
1694}
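/*
 * Illustrative md family usage (addresses are only examples):
 *
 *	kdb> md 0xc0254010	MDCOUNT lines at the environment defaults
 *	kdb> md4c16 0xc0254010	16 items, 4 bytes each
 *	kdb> mds 0xc0254010	one word per line with symbolic decoding
 *	kdb> mdr 0xc0254010 64	64 raw bytes, no formatting
 *	kdb> mdp 0x10000 2	two lines at a physical address
 */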
1695
1696/*
1697 * kdb_mm - This function implements the 'mm' command.
1698 * mm address-expression new-value
1699 * Remarks:
1700 * mm works on machine words, mmW works on bytes.
1701 */
1702static int kdb_mm(int argc, const char **argv)
1703{
1704 int diag;
1705 unsigned long addr;
1706 long offset = 0;
1707 unsigned long contents;
1708 int nextarg;
1709 int width;
1710
1711 if (argv[0][2] && !isdigit(argv[0][2]))
1712 return KDB_NOTFOUND;
1713
1714 if (argc < 2)
1715 return KDB_ARGCOUNT;
1716
1717 nextarg = 1;
1718 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1719 if (diag)
1720 return diag;
1721
1722 if (nextarg > argc)
1723 return KDB_ARGCOUNT;
1724 diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
1725 if (diag)
1726 return diag;
1727
1728 if (nextarg != argc + 1)
1729 return KDB_ARGCOUNT;
1730
1731 width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
1732 diag = kdb_putword(addr, contents, width);
1733 if (diag)
1734 return diag;
1735
1736 kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
1737
1738 return 0;
1739}
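/*
 * Illustrative mm usage (address and value are only examples):
 *
 *	kdb> mm 0xc0254010 0x0		write one machine word
 *	kdb> mm1 0xc0254010 0xff	write a single byte
 */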
1740
1741/*
1742 * kdb_go - This function implements the 'go' command.
1743 * go [address-expression]
1744 */
1745static int kdb_go(int argc, const char **argv)
1746{
1747 unsigned long addr;
1748 int diag;
1749 int nextarg;
1750 long offset;
1751
1752 if (argc == 1) {
1753 if (raw_smp_processor_id() != kdb_initial_cpu) {
1754 kdb_printf("go <address> must be issued from the "
1755 "initial cpu, do cpu %d first\n",
1756 kdb_initial_cpu);
1757 return KDB_ARGCOUNT;
1758 }
1759 nextarg = 1;
1760 diag = kdbgetaddrarg(argc, argv, &nextarg,
1761 &addr, &offset, NULL);
1762 if (diag)
1763 return diag;
1764 } else if (argc) {
1765 return KDB_ARGCOUNT;
1766 }
1767
1768 diag = KDB_CMD_GO;
1769 if (KDB_FLAG(CATASTROPHIC)) {
1770 kdb_printf("Catastrophic error detected\n");
1771 kdb_printf("kdb_continue_catastrophic=%d, ",
1772 kdb_continue_catastrophic);
1773 if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
1774 kdb_printf("type go a second time if you really want "
1775 "to continue\n");
1776 return 0;
1777 }
1778 if (kdb_continue_catastrophic == 2) {
1779 kdb_printf("forcing reboot\n");
1780 kdb_reboot(0, NULL);
1781 }
1782 kdb_printf("attempting to continue\n");
1783 }
1784 return diag;
1785}
1786
1787/*
1788 * kdb_rd - This function implements the 'rd' command.
1789 */
1790static int kdb_rd(int argc, const char **argv)
1791{
1792 int len = kdb_check_regs();
1793#if DBG_MAX_REG_NUM > 0
1794 int i;
1795 char *rname;
1796 int rsize;
1797 u64 reg64;
1798 u32 reg32;
1799 u16 reg16;
1800 u8 reg8;
1801
1802 if (len)
1803 return len;
1804
1805 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1806 rsize = dbg_reg_def[i].size * 2;
1807 if (rsize > 16)
1808 rsize = 2;
1809 if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
1810 len = 0;
1811 kdb_printf("\n");
1812 }
1813 if (len)
1814 len += kdb_printf(" ");
1815 switch(dbg_reg_def[i].size * 8) {
1816 case 8:
1817 rname = dbg_get_reg(i, &reg8, kdb_current_regs);
1818 if (!rname)
1819 break;
1820 len += kdb_printf("%s: %02x", rname, reg8);
1821 break;
1822 case 16:
1823 rname = dbg_get_reg(i, &reg16, kdb_current_regs);
1824 if (!rname)
1825 break;
1826 len += kdb_printf("%s: %04x", rname, reg16);
1827 break;
1828 case 32:
1829 rname = dbg_get_reg(i, &reg32, kdb_current_regs);
1830 if (!rname)
1831 break;
1832 len += kdb_printf("%s: %08x", rname, reg32);
1833 break;
1834 case 64:
1835 rname = dbg_get_reg(i, &reg64, kdb_current_regs);
1836 if (!rname)
1837 break;
1838 len += kdb_printf("%s: %016llx", rname, reg64);
1839 break;
1840 default:
1841 len += kdb_printf("%s: ??", dbg_reg_def[i].name);
1842 }
1843 }
1844 kdb_printf("\n");
1845#else
1846 if (len)
1847 return len;
1848
1849 kdb_dumpregs(kdb_current_regs);
1850#endif
1851 return 0;
1852}
1853
1854/*
1855 * kdb_rm - This function implements the 'rm' (register modify) command.
1856 * rm register-name new-contents
1857 * Remarks:
1858 * Allows register modification with the same restrictions as gdb
1859 */
1860static int kdb_rm(int argc, const char **argv)
1861{
1862#if DBG_MAX_REG_NUM > 0
1863 int diag;
1864 const char *rname;
1865 int i;
1866 u64 reg64;
1867 u32 reg32;
1868 u16 reg16;
1869 u8 reg8;
1870
1871 if (argc != 2)
1872 return KDB_ARGCOUNT;
1873 /*
1874 * Allow presence or absence of leading '%' symbol.
1875 */
1876 rname = argv[1];
1877 if (*rname == '%')
1878 rname++;
1879
1880 diag = kdbgetu64arg(argv[2], &reg64);
1881 if (diag)
1882 return diag;
1883
1884 diag = kdb_check_regs();
1885 if (diag)
1886 return diag;
1887
1888 diag = KDB_BADREG;
1889 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1890 if (strcmp(rname, dbg_reg_def[i].name) == 0) {
1891 diag = 0;
1892 break;
1893 }
1894 }
1895 if (!diag) {
1896 switch(dbg_reg_def[i].size * 8) {
1897 case 8:
1898 reg8 = reg64;
1899 dbg_set_reg(i, &reg8, kdb_current_regs);
1900 break;
1901 case 16:
1902 reg16 = reg64;
1903 dbg_set_reg(i, &reg16, kdb_current_regs);
1904 break;
1905 case 32:
1906 reg32 = reg64;
1907 dbg_set_reg(i, &reg32, kdb_current_regs);
1908 break;
1909 case 64:
1910 dbg_set_reg(i, &reg64, kdb_current_regs);
1911 break;
1912 }
1913 }
1914 return diag;
1915#else
1916 kdb_printf("ERROR: Register set currently not implemented\n");
1917 return 0;
1918#endif
1919}
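/*
 * Illustrative register access; the names come from the per-arch
 * dbg_reg_def[] table, so they are architecture specific:
 *
 *	kdb> rd			dump the whole register set
 *	kdb> rm %ax 0x1		modify one register, leading '%' optional
 */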
1920
1921#if defined(CONFIG_MAGIC_SYSRQ)
1922/*
1923 * kdb_sr - This function implements the 'sr' (SYSRQ key) command
1924 * which interfaces to the soi-disant MAGIC SYSRQ functionality.
1925 * sr <magic-sysrq-code>
1926 */
1927static int kdb_sr(int argc, const char **argv)
1928{
1929 if (argc != 1)
1930 return KDB_ARGCOUNT;
1931 kdb_trap_printk++;
1932 __handle_sysrq(*argv[1], false);
1933 kdb_trap_printk--;
1934
1935 return 0;
1936}
1937#endif /* CONFIG_MAGIC_SYSRQ */
1938
1939/*
1940 * kdb_ef - This function implements the 'regs' (display exception
1941 * frame) command. This command takes an address and expects to
1942 * find an exception frame at that address, formats and prints
1943 * it.
1944 * regs address-expression
1945 * Remarks:
1946 * Not done yet.
1947 */
1948static int kdb_ef(int argc, const char **argv)
1949{
1950 int diag;
1951 unsigned long addr;
1952 long offset;
1953 int nextarg;
1954
1955 if (argc != 1)
1956 return KDB_ARGCOUNT;
1957
1958 nextarg = 1;
1959 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1960 if (diag)
1961 return diag;
1962 show_regs((struct pt_regs *)addr);
1963 return 0;
1964}
1965
1966#if defined(CONFIG_MODULES)
1967/*
1968 * kdb_lsmod - This function implements the 'lsmod' command. Lists
1969 * currently loaded kernel modules.
1970 * Mostly taken from userland lsmod.
1971 */
1972static int kdb_lsmod(int argc, const char **argv)
1973{
1974 struct module *mod;
1975
1976 if (argc != 0)
1977 return KDB_ARGCOUNT;
1978
1979 kdb_printf("Module Size modstruct Used by\n");
1980 list_for_each_entry(mod, kdb_modules, list) {
1981
1982 kdb_printf("%-20s%8u 0x%p ", mod->name,
1983 mod->core_size, (void *)mod);
1984#ifdef CONFIG_MODULE_UNLOAD
1985 kdb_printf("%4d ", module_refcount(mod));
1986#endif
1987 if (mod->state == MODULE_STATE_GOING)
1988 kdb_printf(" (Unloading)");
1989 else if (mod->state == MODULE_STATE_COMING)
1990 kdb_printf(" (Loading)");
1991 else
1992 kdb_printf(" (Live)");
1993 kdb_printf(" 0x%p", mod->module_core);
1994
1995#ifdef CONFIG_MODULE_UNLOAD
1996 {
1997 struct module_use *use;
1998 kdb_printf(" [ ");
1999 list_for_each_entry(use, &mod->source_list,
2000 source_list)
2001 kdb_printf("%s ", use->target->name);
2002 kdb_printf("]\n");
2003 }
2004#endif
2005 }
2006
2007 return 0;
2008}
2009
2010#endif /* CONFIG_MODULES */
2011
2012/*
2013 * kdb_env - This function implements the 'env' command. Display the
2014 * current environment variables.
2015 */
2016
2017static int kdb_env(int argc, const char **argv)
2018{
2019 int i;
2020
2021 for (i = 0; i < __nenv; i++) {
2022 if (__env[i])
2023 kdb_printf("%s\n", __env[i]);
2024 }
2025
2026 if (KDB_DEBUG(MASK))
2027 kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
2028
2029 return 0;
2030}
2031
2032#ifdef CONFIG_PRINTK
2033/*
2034 * kdb_dmesg - This function implements the 'dmesg' command to display
2035 * the contents of the syslog buffer.
2036 * dmesg [lines] [adjust]
2037 */
2038static int kdb_dmesg(int argc, const char **argv)
2039{
2040 char *syslog_data[4], *start, *end, c = '\0', *p;
2041 int diag, logging, logsize, lines = 0, adjust = 0, n;
2042
2043 if (argc > 2)
2044 return KDB_ARGCOUNT;
2045 if (argc) {
2046 char *cp;
2047 lines = simple_strtol(argv[1], &cp, 0);
2048 if (*cp)
2049 lines = 0;
2050 if (argc > 1) {
2051 adjust = simple_strtoul(argv[2], &cp, 0);
2052 if (*cp || adjust < 0)
2053 adjust = 0;
2054 }
2055 }
2056
2057 /* disable LOGGING if set */
2058 diag = kdbgetintenv("LOGGING", &logging);
2059 if (!diag && logging) {
2060 const char *setargs[] = { "set", "LOGGING", "0" };
2061 kdb_set(2, setargs);
2062 }
2063
2064 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
2065 * logical start, end+1. */
2066 kdb_syslog_data(syslog_data);
2067 if (syslog_data[2] == syslog_data[3])
2068 return 0;
2069 logsize = syslog_data[1] - syslog_data[0];
2070 start = syslog_data[2];
2071 end = syslog_data[3];
2072#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
2073 for (n = 0, p = start; p < end; ++p) {
2074 c = *KDB_WRAP(p);
2075 if (c == '\n')
2076 ++n;
2077 }
2078 if (c != '\n')
2079 ++n;
2080 if (lines < 0) {
2081 if (adjust >= n)
2082 kdb_printf("buffer only contains %d lines, nothing "
2083 "printed\n", n);
2084 else if (adjust - lines >= n)
2085 kdb_printf("buffer only contains %d lines, last %d "
2086 "lines printed\n", n, n - adjust);
2087 if (adjust) {
2088 for (; start < end && adjust; ++start) {
2089 if (*KDB_WRAP(start) == '\n')
2090 --adjust;
2091 }
2092 if (start < end)
2093 ++start;
2094 }
2095 for (p = start; p < end && lines; ++p) {
2096 if (*KDB_WRAP(p) == '\n')
2097 ++lines;
2098 }
2099 end = p;
2100 } else if (lines > 0) {
2101 int skip = n - (adjust + lines);
2102 if (adjust >= n) {
2103 kdb_printf("buffer only contains %d lines, "
2104 "nothing printed\n", n);
2105 skip = n;
2106 } else if (skip < 0) {
2107 lines += skip;
2108 skip = 0;
2109 kdb_printf("buffer only contains %d lines, first "
2110 "%d lines printed\n", n, lines);
2111 }
2112 for (; start < end && skip; ++start) {
2113 if (*KDB_WRAP(start) == '\n')
2114 --skip;
2115 }
2116 for (p = start; p < end && lines; ++p) {
2117 if (*KDB_WRAP(p) == '\n')
2118 --lines;
2119 }
2120 end = p;
2121 }
2122 /* Do a line at a time (max 200 chars) to reduce protocol overhead */
2123 c = '\n';
2124 while (start != end) {
2125 char buf[201];
2126 p = buf;
2127 if (KDB_FLAG(CMD_INTERRUPT))
2128 return 0;
2129 while (start < end && (c = *KDB_WRAP(start)) &&
2130 (p - buf) < sizeof(buf)-1) {
2131 ++start;
2132 *p++ = c;
2133 if (c == '\n')
2134 break;
2135 }
2136 *p = '\0';
2137 kdb_printf("%s", buf);
2138 }
2139 if (c != '\n')
2140 kdb_printf("\n");
2141
2142 return 0;
2143}
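/*
 * Illustrative dmesg usage: "dmesg 15" prints the newest 15 lines of the
 * syslog buffer, "dmesg -15" the oldest 15, and a second argument skips
 * that many lines from the chosen end, e.g. "dmesg 15 100".
 */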
2144#endif /* CONFIG_PRINTK */
2145/*
2146 * kdb_cpu - This function implements the 'cpu' command.
2147 * cpu [<cpunum>]
2148 * Returns:
2149 * KDB_CMD_CPU for success, a kdb diagnostic if error
2150 */
2151static void kdb_cpu_status(void)
2152{
2153 int i, start_cpu, first_print = 1;
2154 char state, prev_state = '?';
2155
2156 kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
2157 kdb_printf("Available cpus: ");
2158 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2159 if (!cpu_online(i)) {
2160 state = 'F'; /* cpu is offline */
2161 } else {
2162 state = ' '; /* cpu is responding to kdb */
2163 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
2164 state = 'I'; /* idle task */
2165 }
2166 if (state != prev_state) {
2167 if (prev_state != '?') {
2168 if (!first_print)
2169 kdb_printf(", ");
2170 first_print = 0;
2171 kdb_printf("%d", start_cpu);
2172 if (start_cpu < i-1)
2173 kdb_printf("-%d", i-1);
2174 if (prev_state != ' ')
2175 kdb_printf("(%c)", prev_state);
2176 }
2177 prev_state = state;
2178 start_cpu = i;
2179 }
2180 }
2181 /* print the trailing cpus, ignoring them if they are all offline */
2182 if (prev_state != 'F') {
2183 if (!first_print)
2184 kdb_printf(", ");
2185 kdb_printf("%d", start_cpu);
2186 if (start_cpu < i-1)
2187 kdb_printf("-%d", i-1);
2188 if (prev_state != ' ')
2189 kdb_printf("(%c)", prev_state);
2190 }
2191 kdb_printf("\n");
2192}
2193
2194static int kdb_cpu(int argc, const char **argv)
2195{
2196 unsigned long cpunum;
2197 int diag;
2198
2199 if (argc == 0) {
2200 kdb_cpu_status();
2201 return 0;
2202 }
2203
2204 if (argc != 1)
2205 return KDB_ARGCOUNT;
2206
2207 diag = kdbgetularg(argv[1], &cpunum);
2208 if (diag)
2209 return diag;
2210
2211 /*
2212 * Validate cpunum
2213 */
2214 if ((cpunum >= NR_CPUS) || !cpu_online(cpunum))
2215 return KDB_BADCPUNUM;
2216
2217 dbg_switch_cpu = cpunum;
2218
2219 /*
2220 * Switch to other cpu
2221 */
2222 return KDB_CMD_CPU;
2223}
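/*
 * Illustrative cpu usage: "cpu" alone prints the status map produced by
 * kdb_cpu_status(), while "cpu 2" requests a switch so that cpu 2 owns
 * the kdb prompt.
 */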
2224
2225/* The user may not realize that ps/bta with no parameters does not print idle
2226 * or sleeping system daemon processes, so tell them how many were suppressed.
2227 */
2228void kdb_ps_suppressed(void)
2229{
2230 int idle = 0, daemon = 0;
2231 unsigned long mask_I = kdb_task_state_string("I"),
2232 mask_M = kdb_task_state_string("M");
2233 unsigned long cpu;
2234 const struct task_struct *p, *g;
2235 for_each_online_cpu(cpu) {
2236 p = kdb_curr_task(cpu);
2237 if (kdb_task_state(p, mask_I))
2238 ++idle;
2239 }
2240 kdb_do_each_thread(g, p) {
2241 if (kdb_task_state(p, mask_M))
2242 ++daemon;
2243 } kdb_while_each_thread(g, p);
2244 if (idle || daemon) {
2245 if (idle)
2246 kdb_printf("%d idle process%s (state I)%s\n",
2247 idle, idle == 1 ? "" : "es",
2248 daemon ? " and " : "");
2249 if (daemon)
2250 kdb_printf("%d sleeping system daemon (state M) "
2251 "process%s", daemon,
2252 daemon == 1 ? "" : "es");
2253 kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
2254 }
2255}
2256
2257/*
2258 * kdb_ps - This function implements the 'ps' command which shows a
2259 * list of the active processes.
2260 * ps [DRSTCZEUIMA] All processes, optionally filtered by state
2261 */
2262void kdb_ps1(const struct task_struct *p)
2263{
2264 int cpu;
2265 unsigned long tmp;
2266
2267 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
2268 return;
2269
2270 cpu = kdb_process_cpu(p);
2271 kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
2272 (void *)p, p->pid, p->parent->pid,
2273 kdb_task_has_cpu(p), kdb_process_cpu(p),
2274 kdb_task_state_char(p),
2275 (void *)(&p->thread),
2276 p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
2277 p->comm);
2278 if (kdb_task_has_cpu(p)) {
2279 if (!KDB_TSK(cpu)) {
2280 kdb_printf(" Error: no saved data for this cpu\n");
2281 } else {
2282 if (KDB_TSK(cpu) != p)
2283 kdb_printf(" Error: does not match running "
2284 "process table (0x%p)\n", KDB_TSK(cpu));
2285 }
2286 }
2287}
2288
2289static int kdb_ps(int argc, const char **argv)
2290{
2291 struct task_struct *g, *p;
2292 unsigned long mask, cpu;
2293
2294 if (argc == 0)
2295 kdb_ps_suppressed();
2296 kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
2297 (int)(2*sizeof(void *))+2, "Task Addr",
2298 (int)(2*sizeof(void *))+2, "Thread");
2299 mask = kdb_task_state_string(argc ? argv[1] : NULL);
2300 /* Run the active tasks first */
2301 for_each_online_cpu(cpu) {
2302 if (KDB_FLAG(CMD_INTERRUPT))
2303 return 0;
2304 p = kdb_curr_task(cpu);
2305 if (kdb_task_state(p, mask))
2306 kdb_ps1(p);
2307 }
2308 kdb_printf("\n");
2309 /* Now the real tasks */
2310 kdb_do_each_thread(g, p) {
2311 if (KDB_FLAG(CMD_INTERRUPT))
2312 return 0;
2313 if (kdb_task_state(p, mask))
2314 kdb_ps1(p);
2315 } kdb_while_each_thread(g, p);
2316
2317 return 0;
2318}
2319
2320/*
2321 * kdb_pid - This function implements the 'pid' command which switches
2322 * the currently active process.
2323 * pid [<pid> | R]
2324 */
2325static int kdb_pid(int argc, const char **argv)
2326{
2327 struct task_struct *p;
2328 unsigned long val;
2329 int diag;
2330
2331 if (argc > 1)
2332 return KDB_ARGCOUNT;
2333
2334 if (argc) {
2335 if (strcmp(argv[1], "R") == 0) {
2336 p = KDB_TSK(kdb_initial_cpu);
2337 } else {
2338 diag = kdbgetularg(argv[1], &val);
2339 if (diag)
2340 return KDB_BADINT;
2341
2342 p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
2343 if (!p) {
2344 kdb_printf("No task with pid=%d\n", (pid_t)val);
2345 return 0;
2346 }
2347 }
2348 kdb_set_current_task(p);
2349 }
2350 kdb_printf("KDB current process is %s(pid=%d)\n",
2351 kdb_current_task->comm,
2352 kdb_current_task->pid);
2353
2354 return 0;
2355}
2356
2357/*
2358 * kdb_ll - This function implements the 'll' command which follows a
2359 * linked list and executes an arbitrary command for each
2360 * element.
2361 */
2362static int kdb_ll(int argc, const char **argv)
2363{
2364 int diag;
2365 unsigned long addr;
2366 long offset = 0;
2367 unsigned long va;
2368 unsigned long linkoffset;
2369 int nextarg;
2370 const char *command;
2371
2372 if (argc != 3)
2373 return KDB_ARGCOUNT;
2374
2375 nextarg = 1;
2376 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2377 if (diag)
2378 return diag;
2379
2380 diag = kdbgetularg(argv[2], &linkoffset);
2381 if (diag)
2382 return diag;
2383
2384 /*
2385	 * Use the starting address as the first element of the list,
2386	 * and assume that the list is terminated by a NULL link
2387	 * pointer.
2388 */
2389
2390 va = addr;
2391 command = kdb_strdup(argv[3], GFP_KDB);
2392 if (!command) {
2393 kdb_printf("%s: cannot duplicate command\n", __func__);
2394 return 0;
2395 }
2396 /* Recursive use of kdb_parse, do not use argv after this point */
2397 argv = NULL;
2398
2399 while (va) {
2400 char buf[80];
2401
2402 if (KDB_FLAG(CMD_INTERRUPT))
2403 return 0;
2404
2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2406 diag = kdb_parse(buf);
2407 if (diag)
2408 return diag;
2409
2410 addr = va + linkoffset;
2411 if (kdb_getword(&va, addr, sizeof(va)))
2412 return 0;
2413 }
2414 kfree(command);
2415
2416 return 0;
2417}
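The loop above is plain pointer chasing: each iteration hands the current address to kdb_parse() and then reads the machine word at va + linkoffset to find the next element, stopping at NULL. A user-space sketch of the same walk (illustrative only, not part of this patch; the struct and names are invented):

	/*
	 * Illustrative user-space sketch: walk a NULL-terminated singly
	 * linked list the way 'll' does, given only a start address and
	 * the byte offset of the link field.
	 */
	#include <stddef.h>
	#include <stdio.h>

	struct item {
		int value;
		struct item *next;	/* its offset is the 'linkoffset' */
	};

	static void walk(void *start, size_t linkoffset)
	{
		void *va = start;

		while (va) {
			/* here kdb would run the user-supplied command on 'va' */
			printf("element at %p, value %d\n",
			       va, ((struct item *)va)->value);
			/* read the word stored at va + linkoffset */
			va = *(void **)((char *)va + linkoffset);
		}
	}

	int main(void)
	{
		struct item c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

		walk(&a, offsetof(struct item, next));
		return 0;
	}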
2418
2419static int kdb_kgdb(int argc, const char **argv)
2420{
2421 return KDB_CMD_KGDB;
2422}
2423
2424/*
2425 * kdb_help - This function implements the 'help' and '?' commands.
2426 */
2427static int kdb_help(int argc, const char **argv)
2428{
2429 kdbtab_t *kt;
2430 int i;
2431
2432 kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
2433 kdb_printf("-----------------------------"
2434 "-----------------------------\n");
2435 for_each_kdbcmd(kt, i) {
2436 if (kt->cmd_name)
2437 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2438 kt->cmd_usage, kt->cmd_help);
2439 if (KDB_FLAG(CMD_INTERRUPT))
2440 return 0;
2441 }
2442 return 0;
2443}
2444
2445/*
2446 * kdb_kill - This function implements the 'kill' commands.
2447 */
2448static int kdb_kill(int argc, const char **argv)
2449{
2450 long sig, pid;
2451 char *endp;
2452 struct task_struct *p;
2453 struct siginfo info;
2454
2455 if (argc != 2)
2456 return KDB_ARGCOUNT;
2457
2458 sig = simple_strtol(argv[1], &endp, 0);
2459 if (*endp)
2460 return KDB_BADINT;
2461 if (sig >= 0) {
2462		kdb_printf("Invalid signal parameter; expected <-signal>\n");
2463 return 0;
2464 }
2465 sig = -sig;
2466
2467 pid = simple_strtol(argv[2], &endp, 0);
2468 if (*endp)
2469 return KDB_BADINT;
2470 if (pid <= 0) {
2471		kdb_printf("Process ID must be larger than 0.\n");
2472 return 0;
2473 }
2474
2475 /* Find the process. */
2476 p = find_task_by_pid_ns(pid, &init_pid_ns);
2477 if (!p) {
2478		kdb_printf("The specified process was not found.\n");
2479 return 0;
2480 }
2481 p = p->group_leader;
2482 info.si_signo = sig;
2483 info.si_errno = 0;
2484 info.si_code = SI_USER;
2485 info.si_pid = pid; /* same capabilities as process being signalled */
2486 info.si_uid = 0; /* kdb has root authority */
2487 kdb_send_sig_info(p, &info);
2488 return 0;
2489}
2490
2491struct kdb_tm {
2492 int tm_sec; /* seconds */
2493 int tm_min; /* minutes */
2494 int tm_hour; /* hours */
2495 int tm_mday; /* day of the month */
2496 int tm_mon; /* month */
2497 int tm_year; /* year */
2498};
2499
2500static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
2501{
2502 /* This will work from 1970-2099, 2100 is not a leap year */
2503 static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
2504 31, 30, 31, 30, 31 };
2505 memset(tm, 0, sizeof(*tm));
2506 tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
2507 tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
2508 (2 * 365 + 1); /* shift base from 1970 to 1968 */
2509 tm->tm_min = tm->tm_sec / 60 % 60;
2510 tm->tm_hour = tm->tm_sec / 60 / 60;
2511 tm->tm_sec = tm->tm_sec % 60;
2512 tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
2513 tm->tm_mday %= (4*365+1);
2514 mon_day[1] = 29;
2515 while (tm->tm_mday >= mon_day[tm->tm_mon]) {
2516 tm->tm_mday -= mon_day[tm->tm_mon];
2517 if (++tm->tm_mon == 12) {
2518 tm->tm_mon = 0;
2519 ++tm->tm_year;
2520 mon_day[1] = 28;
2521 }
2522 }
2523 ++tm->tm_mday;
2524}
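The conversion above shifts the epoch back from 1970 to 1968 so that every 1461-day block starts with a leap year, peels off whole 4-year blocks, and then subtracts month lengths, resetting February to 28 once the leading leap year of the block has been consumed. A stand-alone sketch of the same arithmetic, checked against the Unix epoch (illustrative only, not part of this patch):

	#include <stdio.h>

	int main(void)
	{
		long tv_sec = 0;	/* 1970-01-01 00:00:00 UTC */
		int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
				  31, 30, 31, 30, 31 };
		int sec = tv_sec % (24 * 60 * 60);
		long mday = tv_sec / (24 * 60 * 60) + (2 * 365 + 1); /* 1968 base */
		int min = sec / 60 % 60;
		int hour = sec / 60 / 60;
		int year, mon = 0;

		sec %= 60;
		year = 68 + 4 * (int)(mday / (4 * 365 + 1)); /* whole 4-year blocks */
		mday %= (4 * 365 + 1);
		while (mday >= mon_day[mon]) {
			mday -= mon_day[mon];
			if (++mon == 12) {
				mon = 0;
				++year;
				mon_day[1] = 28; /* only year 0 of a block is leap */
			}
		}
		++mday;
		printf("%04d-%02d-%02ld %02d:%02d:%02d\n",
		       1900 + year, mon + 1, mday, hour, min, sec);
		return 0;	/* prints 1970-01-01 00:00:00 */
	}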
2525
2526/*
2527 * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
2528 * I cannot call that code directly from kdb, it has an unconditional
2529 * cli()/sti() and calls routines that take locks which can stop the debugger.
2530 */
2531static void kdb_sysinfo(struct sysinfo *val)
2532{
2533 struct timespec uptime;
2534 do_posix_clock_monotonic_gettime(&uptime);
2535 memset(val, 0, sizeof(*val));
2536 val->uptime = uptime.tv_sec;
2537 val->loads[0] = avenrun[0];
2538 val->loads[1] = avenrun[1];
2539 val->loads[2] = avenrun[2];
2540 val->procs = nr_threads-1;
2541 si_meminfo(val);
2542
2543 return;
2544}
2545
2546/*
2547 * kdb_summary - This function implements the 'summary' command.
2548 */
2549static int kdb_summary(int argc, const char **argv)
2550{
2551 struct timespec now;
2552 struct kdb_tm tm;
2553 struct sysinfo val;
2554
2555 if (argc)
2556 return KDB_ARGCOUNT;
2557
2558 kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
2559 kdb_printf("release %s\n", init_uts_ns.name.release);
2560 kdb_printf("version %s\n", init_uts_ns.name.version);
2561 kdb_printf("machine %s\n", init_uts_ns.name.machine);
2562 kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
2563 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2564 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2565
2566 now = __current_kernel_time();
2567 kdb_gmtime(&now, &tm);
2568 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2569 "tz_minuteswest %d\n",
2570 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
2571 tm.tm_hour, tm.tm_min, tm.tm_sec,
2572 sys_tz.tz_minuteswest);
2573
2574 kdb_sysinfo(&val);
2575 kdb_printf("uptime ");
2576 if (val.uptime > (24*60*60)) {
2577 int days = val.uptime / (24*60*60);
2578 val.uptime %= (24*60*60);
2579 kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
2580 }
2581 kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
2582
2583 /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
2584
2585#define LOAD_INT(x) ((x) >> FSHIFT)
2586#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
2587 kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
2588 LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
2589 LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
2590 LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
2591#undef LOAD_INT
2592#undef LOAD_FRAC
2593 /* Display in kilobytes */
2594#define K(x) ((x) << (PAGE_SHIFT - 10))
2595 kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
2596 "Buffers: %8lu kB\n",
2597		   K(val.totalram), K(val.freeram), K(val.bufferram));
2598 return 0;
2599}
2600
2601/*
2602 * kdb_per_cpu - This function implements the 'per_cpu' command.
2603 */
2604static int kdb_per_cpu(int argc, const char **argv)
2605{
2606 char buf[256], fmtstr[64];
2607 kdb_symtab_t symtab;
2608 cpumask_t suppress = CPU_MASK_NONE;
2609 int cpu, diag;
2610 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2611
2612 if (argc < 1 || argc > 3)
2613 return KDB_ARGCOUNT;
2614
2615 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]);
2616 if (!kdbgetsymval(buf, &symtab)) {
2617 kdb_printf("%s is not a per_cpu variable\n", argv[1]);
2618 return KDB_BADADDR;
2619 }
2620 if (argc >= 2) {
2621 diag = kdbgetularg(argv[2], &bytesperword);
2622 if (diag)
2623 return diag;
2624 }
2625 if (!bytesperword)
2626 bytesperword = KDB_WORD_SIZE;
2627 else if (bytesperword > KDB_WORD_SIZE)
2628 return KDB_BADWIDTH;
2629 sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
2630 if (argc >= 3) {
2631 diag = kdbgetularg(argv[3], &whichcpu);
2632 if (diag)
2633 return diag;
2634 if (!cpu_online(whichcpu)) {
2635 kdb_printf("cpu %ld is not online\n", whichcpu);
2636 return KDB_BADCPUNUM;
2637 }
2638 }
2639
2640	/* Most architectures use __per_cpu_offset[cpu], some use
2641	 * __per_cpu_offset(cpu); non-SMP builds have no __per_cpu_offset.
2642 */
2643#ifdef __per_cpu_offset
2644#define KDB_PCU(cpu) __per_cpu_offset(cpu)
2645#else
2646#ifdef CONFIG_SMP
2647#define KDB_PCU(cpu) __per_cpu_offset[cpu]
2648#else
2649#define KDB_PCU(cpu) 0
2650#endif
2651#endif
2652
2653 for_each_online_cpu(cpu) {
2654 if (whichcpu != ~0UL && whichcpu != cpu)
2655 continue;
2656 addr = symtab.sym_start + KDB_PCU(cpu);
2657 diag = kdb_getword(&val, addr, bytesperword);
2658 if (diag) {
2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2660 "read, diag=%d\n", cpu, addr, diag);
2661 continue;
2662 }
2663#ifdef CONFIG_SMP
2664 if (!val) {
2665 cpu_set(cpu, suppress);
2666 continue;
2667 }
2668#endif /* CONFIG_SMP */
2669 kdb_printf("%5d ", cpu);
2670 kdb_md_line(fmtstr, addr,
2671 bytesperword == KDB_WORD_SIZE,
2672 1, bytesperword, 1, 1, 0);
2673 }
2674 if (cpus_weight(suppress) == 0)
2675 return 0;
2676 kdb_printf("Zero suppressed cpu(s):");
2677 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2678 cpu = next_cpu(cpu, suppress)) {
2679 kdb_printf(" %d", cpu);
2680 if (cpu == num_possible_cpus() - 1 ||
2681 next_cpu(cpu, suppress) != cpu + 1)
2682 continue;
2683 while (cpu < num_possible_cpus() &&
2684 next_cpu(cpu, suppress) == cpu + 1)
2685 ++cpu;
2686 kdb_printf("-%d", cpu);
2687 }
2688 kdb_printf("\n");
2689
2690#undef KDB_PCU
2691
2692 return 0;
2693}
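The zero-suppressed cpu list is printed as compressed ranges: each suppressed cpu is printed, and when its neighbours are also suppressed the run is extended and closed with "-last". The same idea in a stand-alone sketch (illustrative only, not part of this patch):

	#include <stdio.h>

	#define NMAX 16

	static int in_set(const int *set, int i)
	{
		return i >= 0 && i < NMAX && set[i];
	}

	int main(void)
	{
		/* members 0-3, 5, 7-8 */
		int set[NMAX] = { 1, 1, 1, 1, 0, 1, 0, 1, 1 };
		int i;

		for (i = 0; i < NMAX; i++) {
			if (!set[i])
				continue;
			printf(" %d", i);
			if (!in_set(set, i + 1))
				continue;
			while (in_set(set, i + 1))	/* extend a contiguous run */
				++i;
			printf("-%d", i);
		}
		printf("\n");	/* prints " 0-3 5 7-8" */
		return 0;
	}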
2694
2695/*
2696 * display help for the use of cmd | grep pattern
2697 */
2698static int kdb_grep_help(int argc, const char **argv)
2699{
2700 kdb_printf("Usage of cmd args | grep pattern:\n");
2701 kdb_printf(" Any command's output may be filtered through an ");
2702 kdb_printf("emulated 'pipe'.\n");
2703	kdb_printf("  'grep' is just a keyword.\n");
2704 kdb_printf(" The pattern may include a very limited set of "
2705 "metacharacters:\n");
2706 kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
2707 kdb_printf(" And if there are spaces in the pattern, you may "
2708 "quote it:\n");
2709 kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
2710 " or \"^pat tern$\"\n");
2711 return 0;
2712}
2713
2714/*
2715 * kdb_register_repeat - This function is used to register a kernel
2716 * debugger command.
2717 * Inputs:
2718 * cmd Command name
2719 * func Function to execute the command
2720 * usage A simple usage string showing arguments
2721 * help A simple help string describing command
2722 * repeat Does the command auto repeat on enter?
2723 * Returns:
2724 * zero for success, one if a duplicate command.
2725 */
2726#define kdb_command_extend 50 /* arbitrary */
2727int kdb_register_repeat(char *cmd,
2728 kdb_func_t func,
2729 char *usage,
2730 char *help,
2731 short minlen,
2732 kdb_repeat_t repeat)
2733{
2734 int i;
2735 kdbtab_t *kp;
2736
2737 /*
2738 * Brute force method to determine duplicates
2739 */
2740 for_each_kdbcmd(kp, i) {
2741 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2742 kdb_printf("Duplicate kdb command registered: "
2743 "%s, func %p help %s\n", cmd, func, help);
2744 return 1;
2745 }
2746 }
2747
2748 /*
2749 * Insert command into first available location in table
2750 */
2751 for_each_kdbcmd(kp, i) {
2752 if (kp->cmd_name == NULL)
2753 break;
2754 }
2755
2756 if (i >= kdb_max_commands) {
2757 kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
2758 kdb_command_extend) * sizeof(*new), GFP_KDB);
2759 if (!new) {
2760 kdb_printf("Could not allocate new kdb_command "
2761 "table\n");
2762 return 1;
2763 }
2764 if (kdb_commands) {
2765 memcpy(new, kdb_commands,
2766 kdb_max_commands * sizeof(*new));
2767 kfree(kdb_commands);
2768 }
2769 memset(new + kdb_max_commands, 0,
2770 kdb_command_extend * sizeof(*new));
2771 kdb_commands = new;
2772 kp = kdb_commands + kdb_max_commands;
2773 kdb_max_commands += kdb_command_extend;
2774 }
2775
2776 kp->cmd_name = cmd;
2777 kp->cmd_func = func;
2778 kp->cmd_usage = usage;
2779 kp->cmd_help = help;
2780 kp->cmd_flags = 0;
2781 kp->cmd_minlen = minlen;
2782 kp->cmd_repeat = repeat;
2783
2784 return 0;
2785}
2786
2787/*
2788 * kdb_register - Compatibility register function for commands that do
2789 * not need to specify a repeat state. Equivalent to
2790 * kdb_register_repeat with KDB_REPEAT_NONE.
2791 * Inputs:
2792 * cmd Command name
2793 * func Function to execute the command
2794 * usage A simple usage string showing arguments
2795 * help A simple help string describing command
2796 * Returns:
2797 * zero for success, one if a duplicate command.
2798 */
2799int kdb_register(char *cmd,
2800 kdb_func_t func,
2801 char *usage,
2802 char *help,
2803 short minlen)
2804{
2805 return kdb_register_repeat(cmd, func, usage, help, minlen,
2806 KDB_REPEAT_NONE);
2807}
2808
2809/*
2810 * kdb_unregister - This function is used to unregister a kernel
2811 * debugger command. It is generally called when a module which
2812 * implements kdb commands is unloaded.
2813 * Inputs:
2814 * cmd Command name
2815 * Returns:
2816 * zero for success, one if the command was not registered.
2817 */
2818int kdb_unregister(char *cmd)
2819{
2820 int i;
2821 kdbtab_t *kp;
2822
2823 /*
2824 * find the command.
2825 */
2826 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) {
2827 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2828 kp->cmd_name = NULL;
2829 return 0;
2830 }
2831 }
2832
2833 /* Couldn't find it. */
2834 return 1;
2835}
2836
2837/* Initialize the kdb command table. */
2838static void __init kdb_inittab(void)
2839{
2840 int i;
2841 kdbtab_t *kp;
2842
2843 for_each_kdbcmd(kp, i)
2844 kp->cmd_name = NULL;
2845
2846 kdb_register_repeat("md", kdb_md, "<vaddr>",
2847 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2848 KDB_REPEAT_NO_ARGS);
2849 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
2850 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
2851 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
2852 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
2853 kdb_register_repeat("mds", kdb_md, "<vaddr>",
2854 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
2855 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
2856 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
2857 kdb_register_repeat("go", kdb_go, "[<vaddr>]",
2858 "Continue Execution", 1, KDB_REPEAT_NONE);
2859 kdb_register_repeat("rd", kdb_rd, "",
2860 "Display Registers", 0, KDB_REPEAT_NONE);
2861 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
2862 "Modify Registers", 0, KDB_REPEAT_NONE);
2863 kdb_register_repeat("ef", kdb_ef, "<vaddr>",
2864 "Display exception frame", 0, KDB_REPEAT_NONE);
2865 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
2866 "Stack traceback", 1, KDB_REPEAT_NONE);
2867 kdb_register_repeat("btp", kdb_bt, "<pid>",
2868 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2869 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
2870 "Display stack all processes", 0, KDB_REPEAT_NONE);
2871 kdb_register_repeat("btc", kdb_bt, "",
2872 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2873 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2874 "Backtrace process given its struct task address", 0,
2875 KDB_REPEAT_NONE);
2876 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2877 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2878 kdb_register_repeat("env", kdb_env, "",
2879 "Show environment variables", 0, KDB_REPEAT_NONE);
2880 kdb_register_repeat("set", kdb_set, "",
2881 "Set environment variables", 0, KDB_REPEAT_NONE);
2882 kdb_register_repeat("help", kdb_help, "",
2883 "Display Help Message", 1, KDB_REPEAT_NONE);
2884 kdb_register_repeat("?", kdb_help, "",
2885 "Display Help Message", 0, KDB_REPEAT_NONE);
2886 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
2887 "Switch to new cpu", 0, KDB_REPEAT_NONE);
2888 kdb_register_repeat("kgdb", kdb_kgdb, "",
2889 "Enter kgdb mode", 0, KDB_REPEAT_NONE);
2890 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
2891 "Display active task list", 0, KDB_REPEAT_NONE);
2892 kdb_register_repeat("pid", kdb_pid, "<pidnum>",
2893 "Switch to another task", 0, KDB_REPEAT_NONE);
2894 kdb_register_repeat("reboot", kdb_reboot, "",
2895 "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
2896#if defined(CONFIG_MODULES)
2897 kdb_register_repeat("lsmod", kdb_lsmod, "",
2898 "List loaded kernel modules", 0, KDB_REPEAT_NONE);
2899#endif
2900#if defined(CONFIG_MAGIC_SYSRQ)
2901 kdb_register_repeat("sr", kdb_sr, "<key>",
2902 "Magic SysRq key", 0, KDB_REPEAT_NONE);
2903#endif
2904#if defined(CONFIG_PRINTK)
2905 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
2906 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2907#endif
2908 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2909 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2910 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
2911 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2912 kdb_register_repeat("summary", kdb_summary, "",
2913 "Summarize the system", 4, KDB_REPEAT_NONE);
2914 kdb_register_repeat("per_cpu", kdb_per_cpu, "",
2915 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2916 kdb_register_repeat("grephelp", kdb_grep_help, "",
2917 "Display help on | grep", 0, KDB_REPEAT_NONE);
2918}
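The same registration path is intended for loadable modules (kdb_private.h lists kdb_register() and friends under "Exported Symbols for kernel loadable modules to use"). A hypothetical module sketch, not part of this patch; it assumes a CONFIG_KGDB_KDB kernel and that kdb_register(), kdb_unregister() and kdb_printf() are visible to the module, e.g. via <linux/kdb.h>:

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/errno.h>
	#include <linux/kdb.h>

	/* Handler: argc counts arguments after the command name itself. */
	static int kdb_hello(int argc, const char **argv)
	{
		kdb_printf("Hello from kdb (%d argument%s)\n",
			   argc, argc == 1 ? "" : "s");
		return 0;
	}

	static int __init kdb_hello_init(void)
	{
		/* cmd, handler, usage, help, minimum abbreviation (0 = none) */
		if (kdb_register("hello", kdb_hello, "[arg]",
				 "Print a greeting from kdb", 0))
			return -EEXIST;	/* kdb_register() returns 1 on a duplicate */
		return 0;
	}

	static void __exit kdb_hello_exit(void)
	{
		kdb_unregister("hello");
	}

	module_init(kdb_hello_init);
	module_exit(kdb_hello_exit);
	MODULE_LICENSE("GPL");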
2919
2920/* Execute any commands defined in kdb_cmds. */
2921static void __init kdb_cmd_init(void)
2922{
2923 int i, diag;
2924 for (i = 0; kdb_cmds[i]; ++i) {
2925 diag = kdb_parse(kdb_cmds[i]);
2926 if (diag)
2927 kdb_printf("kdb command %s failed, kdb diag %d\n",
2928 kdb_cmds[i], diag);
2929 }
2930 if (defcmd_in_progress) {
2931 kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
2932 kdb_parse("endefcmd");
2933 }
2934}
2935
2936/* Initialize kdb_printf, breakpoint tables and kdb state */
2937void __init kdb_init(int lvl)
2938{
2939 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
2940 int i;
2941
2942 if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
2943 return;
2944 for (i = kdb_init_lvl; i < lvl; i++) {
2945 switch (i) {
2946 case KDB_NOT_INITIALIZED:
2947 kdb_inittab(); /* Initialize Command Table */
2948 kdb_initbptab(); /* Initialize Breakpoints */
2949 break;
2950 case KDB_INIT_EARLY:
2951 kdb_cmd_init(); /* Build kdb_cmds tables */
2952 break;
2953 }
2954 }
2955 kdb_init_lvl = lvl;
2956}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 000000000000..be775f7e81e0
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,305 @@
1#ifndef _KDBPRIVATE_H
2#define _KDBPRIVATE_H
3
4/*
5 * Kernel Debugger Architecture Independent Private Headers
6 *
7 * This file is subject to the terms and conditions of the GNU General Public
8 * License. See the file "COPYING" in the main directory of this archive
9 * for more details.
10 *
11 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
12 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
13 */
14
15#include <linux/kgdb.h>
16#include "../debug_core.h"
17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002)
44#define KDB_CMD_SS (-1003)
45#define KDB_CMD_SSB (-1004)
46#define KDB_CMD_KGDB (-1005)
47#define KDB_CMD_KGDB2 (-1006)
48
49/* Internal debug flags */
50#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
51#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
52#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
53#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
54#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
55#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
56#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
57#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
58
59#define KDB_DEBUG(flag) (kdb_flags & \
60 (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
61#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
62 kdb_print_state(text, value)
63
64#if BITS_PER_LONG == 32
65
66#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
67
68#define kdb_machreg_fmt "0x%lx"
69#define kdb_machreg_fmt0 "0x%08lx"
70#define kdb_bfd_vma_fmt "0x%lx"
71#define kdb_bfd_vma_fmt0 "0x%08lx"
72#define kdb_elfw_addr_fmt "0x%x"
73#define kdb_elfw_addr_fmt0 "0x%08x"
74#define kdb_f_count_fmt "%d"
75
76#elif BITS_PER_LONG == 64
77
78#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
79
80#define kdb_machreg_fmt "0x%lx"
81#define kdb_machreg_fmt0 "0x%016lx"
82#define kdb_bfd_vma_fmt "0x%lx"
83#define kdb_bfd_vma_fmt0 "0x%016lx"
84#define kdb_elfw_addr_fmt "0x%x"
85#define kdb_elfw_addr_fmt0 "0x%016x"
86#define kdb_f_count_fmt "%ld"
87
88#endif
89
90/*
91 * KDB_MAXBPT describes the total number of breakpoints
92 * supported by this architecture.
93 */
94#define KDB_MAXBPT 16
95
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */
110 const char *mod_name; /* Module containing symbol or
111 * "kernel" */
112 unsigned long mod_start;
113 unsigned long mod_end;
114 const char *sec_name; /* Section containing symbol */
115 unsigned long sec_start;
116 unsigned long sec_end;
117 const char *sym_name; /* Full symbol name, including
118 * any version */
119 unsigned long sym_start;
120 unsigned long sym_end;
121 } kdb_symtab_t;
122extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124
125/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t);
133
134/*
135 * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
136 * names, not pointers. The underlying *_size functions take pointers.
137 */
138#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
139#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
140
141extern int kdb_getphysword(unsigned long *word,
142 unsigned long addr, size_t size);
143extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t);
145
146extern int kdbgetularg(const char *, unsigned long *);
147extern char *kdbgetenv(const char *);
148extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
149 long *, char **);
150extern int kdbgetsymval(const char *, kdb_symtab_t *);
151extern int kdbnearsym(unsigned long, kdb_symtab_t *);
152extern void kdbnearsym_cleanup(void);
153extern char *kdb_strdup(const char *str, gfp_t type);
154extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
155
156/* Routine for debugging the debugger state. */
157extern void kdb_print_state(const char *, int);
158
159extern int kdb_state;
160#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
161#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
162#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
163#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
164 * kdb control */
165#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
166#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
167#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
168 * DOING_SS is also set */
169#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
170 * after one ss, independent of
171 * DOING_SS */
172#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
173#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
174#define KDB_STATE_PAGER 0x00000400 /* pager is available */
175#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
176 * back to initial cpu */
177#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
178#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
179#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
180#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
181 * adjusted */
182#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
183#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
184 * keyboard on this cpu */
185#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
186#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
187#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
188#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
189#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
190 * specific use */
191
192#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
193#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
194#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
195
196extern int kdb_nextline; /* Current number of lines displayed */
197
198typedef struct _kdb_bp {
199 unsigned long bp_addr; /* Address breakpoint is present at */
200 unsigned int bp_free:1; /* This entry is available */
201 unsigned int bp_enabled:1; /* Breakpoint is active in register */
202 unsigned int bp_type:4; /* Uses hardware register */
203 unsigned int bp_installed:1; /* Breakpoint is installed */
204 unsigned int bp_delay:1; /* Do delayed bp handling */
205 unsigned int bp_delayed:1; /* Delayed breakpoint */
206 unsigned int bph_length; /* HW break length */
207} kdb_bp_t;
208
209#ifdef CONFIG_KGDB_KDB
210extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
211
212/* The KDB shell command table */
213typedef struct _kdbtab {
214 char *cmd_name; /* Command name */
215 kdb_func_t cmd_func; /* Function to execute command */
216 char *cmd_usage; /* Usage String for this command */
217 char *cmd_help; /* Help message for this command */
218 short cmd_flags; /* Parsing flags */
219 short cmd_minlen; /* Minimum legal # command
220 * chars required */
221 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
222} kdbtab_t;
223
224extern int kdb_bt(int, const char **); /* KDB display back trace */
225
226/* KDB breakpoint management functions */
227extern void kdb_initbptab(void);
228extern void kdb_bp_install(struct pt_regs *);
229extern void kdb_bp_remove(void);
230
231typedef enum {
232 KDB_DB_BPT, /* Breakpoint */
233 KDB_DB_SS, /* Single-step trap */
234 KDB_DB_SSB, /* Single step to branch */
235 KDB_DB_SSBPT, /* Single step over breakpoint */
236 KDB_DB_NOBPT /* Spurious breakpoint */
237} kdb_dbtrap_t;
238
239extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
240 int, kdb_dbtrap_t, struct pt_regs *);
241
242/* Miscellaneous functions and data areas */
243extern int kdb_grepping_flag;
244extern char kdb_grep_string[];
245extern int kdb_grep_leading;
246extern int kdb_grep_trailing;
247extern char *kdb_cmds[];
248extern void kdb_syslog_data(char *syslog_data[]);
249extern unsigned long kdb_task_state_string(const char *);
250extern char kdb_task_state_char (const struct task_struct *);
251extern unsigned long kdb_task_state(const struct task_struct *p,
252 unsigned long mask);
253extern void kdb_ps_suppressed(void);
254extern void kdb_ps1(const struct task_struct *p);
255extern void kdb_print_nameval(const char *name, unsigned long val);
256extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
257extern void kdb_meminfo_proc_show(void);
258#ifdef CONFIG_KALLSYMS
259extern const char *kdb_walk_kallsyms(loff_t *pos);
260#else /* ! CONFIG_KALLSYMS */
261static inline const char *kdb_walk_kallsyms(loff_t *pos)
262{
263 return NULL;
264}
265#endif /* ! CONFIG_KALLSYMS */
266extern char *kdb_getstr(char *, size_t, char *);
267
268/* Defines for kdb_symbol_print */
269#define KDB_SP_SPACEB 0x0001 /* Space before string */
270#define KDB_SP_SPACEA 0x0002 /* Space after string */
271#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
272#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
273#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
274#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
275#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
276
277#define KDB_TSK(cpu) kgdb_info[cpu].task
278#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
279
280extern struct task_struct *kdb_curr_task(int);
281
282#define kdb_task_has_cpu(p) (task_curr(p))
283
284/* Simplify coexistence with NPTL */
285#define kdb_do_each_thread(g, p) do_each_thread(g, p)
286#define kdb_while_each_thread(g, p) while_each_thread(g, p)
287
288#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
289
290extern void *debug_kmalloc(size_t size, gfp_t flags);
291extern void debug_kfree(void *);
292extern void debug_kusage(void);
293
294extern void kdb_set_current_task(struct task_struct *);
295extern struct task_struct *kdb_current_task;
296#ifdef CONFIG_MODULES
297extern struct list_head *kdb_modules;
298#endif /* CONFIG_MODULES */
299
300extern char kdb_prompt_str[];
301
302#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
303
304#endif /* CONFIG_KGDB_KDB */
305#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 000000000000..6b2485dcb050
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
1/*
2 * Kernel Debugger Architecture Independent Support Functions
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
11 */
12
13#include <stdarg.h>
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/kallsyms.h>
18#include <linux/stddef.h>
19#include <linux/vmalloc.h>
20#include <linux/ptrace.h>
21#include <linux/module.h>
22#include <linux/highmem.h>
23#include <linux/hardirq.h>
24#include <linux/delay.h>
25#include <linux/uaccess.h>
26#include <linux/kdb.h>
27#include <linux/slab.h>
28#include "kdb_private.h"
29
30/*
31 * kdbgetsymval - Return the address of the given symbol.
32 *
33 * Parameters:
34 * symname Character string containing symbol name
35 * symtab Structure to receive results
36 * Returns:
37 * 0 Symbol not found, symtab zero filled
38 * 1 Symbol mapped to module/symbol/section, data in symtab
39 */
40int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
41{
42 if (KDB_DEBUG(AR))
43 kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
44 symtab);
45 memset(symtab, 0, sizeof(*symtab));
46 symtab->sym_start = kallsyms_lookup_name(symname);
47 if (symtab->sym_start) {
48 if (KDB_DEBUG(AR))
49 kdb_printf("kdbgetsymval: returns 1, "
50 "symtab->sym_start=0x%lx\n",
51 symtab->sym_start);
52 return 1;
53 }
54 if (KDB_DEBUG(AR))
55 kdb_printf("kdbgetsymval: returns 0\n");
56 return 0;
57}
58EXPORT_SYMBOL(kdbgetsymval);
59
60static char *kdb_name_table[100]; /* arbitrary size */
61
62/*
63 * kdbnearsym - Return the name of the symbol with the nearest address
64 * less than 'addr'.
65 *
66 * Parameters:
67 * addr Address to check for symbol near
68 * symtab Structure to receive results
69 * Returns:
70 * 0 No sections contain this address, symtab zero filled
71 * 1 Address mapped to module/symbol/section, data in symtab
72 * Remarks:
73 * 2.6 kallsyms has a "feature" where it unpacks the name into a
74 * string. If that string is reused before the caller expects it
75 * then the caller sees its string change without warning. To
76 * avoid cluttering up the main kdb code with lots of kdb_strdup,
77 * tests and kfree calls, kdbnearsym maintains an LRU list of the
78 * last few unique strings. The list is sized large enough to
79 * hold active strings, no kdb caller of kdbnearsym makes more
80 * than ~20 later calls before using a saved value.
81 */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{
84 int ret = 0;
85 unsigned long symbolsize = 0;
86 unsigned long offset = 0;
87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL;
89
90 if (KDB_DEBUG(AR))
91 kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
92 memset(symtab, 0, sizeof(*symtab));
93
94 if (addr < 4096)
95 goto out;
96 knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
97 if (!knt1) {
98 kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
99 addr);
100 goto out;
101 }
102	symtab->sym_name = kallsyms_lookup(addr, &symbolsize, &offset,
103 (char **)(&symtab->mod_name), knt1);
104 if (offset > 8*1024*1024) {
105 symtab->sym_name = NULL;
106 addr = offset = symbolsize = 0;
107 }
108 symtab->sym_start = addr - offset;
109 symtab->sym_end = symtab->sym_start + symbolsize;
110 ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
111
112 if (ret) {
113 int i;
114 /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
115 * set but the buffer passed into kallsyms_lookup is not used,
116 * so it contains garbage. The caller has to work out which
117 * buffer needs to be saved.
118 *
119 * What was Rusty smoking when he wrote that code?
120 */
121 if (symtab->sym_name != knt1) {
122 strncpy(knt1, symtab->sym_name, knt1_size);
123 knt1[knt1_size-1] = '\0';
124 }
125 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
126 if (kdb_name_table[i] &&
127 strcmp(kdb_name_table[i], knt1) == 0)
128 break;
129 }
130 if (i >= ARRAY_SIZE(kdb_name_table)) {
131 debug_kfree(kdb_name_table[0]);
132 memcpy(kdb_name_table, kdb_name_table+1,
133 sizeof(kdb_name_table[0]) *
134 (ARRAY_SIZE(kdb_name_table)-1));
135 } else {
136 debug_kfree(knt1);
137 knt1 = kdb_name_table[i];
138 memcpy(kdb_name_table+i, kdb_name_table+i+1,
139 sizeof(kdb_name_table[0]) *
140 (ARRAY_SIZE(kdb_name_table)-i-1));
141 }
142 i = ARRAY_SIZE(kdb_name_table) - 1;
143 kdb_name_table[i] = knt1;
144 symtab->sym_name = kdb_name_table[i];
145 knt1 = NULL;
146 }
147
148 if (symtab->mod_name == NULL)
149 symtab->mod_name = "kernel";
150 if (KDB_DEBUG(AR))
151 kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
152 "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
153 symtab->sym_start, symtab->mod_name, symtab->sym_name,
154 symtab->sym_name);
155
156out:
157 debug_kfree(knt1);
158 return ret;
159}
160
161void kdbnearsym_cleanup(void)
162{
163 int i;
164 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
165 if (kdb_name_table[i]) {
166 debug_kfree(kdb_name_table[i]);
167 kdb_name_table[i] = NULL;
168 }
169 }
170}
171
172static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
173
174/*
175 * kallsyms_symbol_complete
176 *
177 * Parameters:
178 * prefix_name prefix of a symbol name to lookup
179 * max_len maximum length that can be returned
180 * Returns:
181 * Number of symbols which match the given prefix.
182 * Notes:
183 * prefix_name is changed to contain the longest unique prefix that
184 * starts with this prefix (tab completion).
185 */
186int kallsyms_symbol_complete(char *prefix_name, int max_len)
187{
188 loff_t pos = 0;
189 int prefix_len = strlen(prefix_name), prev_len = 0;
190 int i, number = 0;
191 const char *name;
192
193 while ((name = kdb_walk_kallsyms(&pos))) {
194 if (strncmp(name, prefix_name, prefix_len) == 0) {
195 strcpy(ks_namebuf, name);
196 /* Work out the longest name that matches the prefix */
197 if (++number == 1) {
198 prev_len = min_t(int, max_len-1,
199 strlen(ks_namebuf));
200 memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
201 ks_namebuf_prev[prev_len] = '\0';
202 continue;
203 }
204 for (i = 0; i < prev_len; i++) {
205 if (ks_namebuf[i] != ks_namebuf_prev[i]) {
206 prev_len = i;
207 ks_namebuf_prev[i] = '\0';
208 break;
209 }
210 }
211 }
212 }
213 if (prev_len > prefix_len)
214 memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
215 return number;
216}
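Tab completion therefore seeds the candidate with the first matching symbol and truncates it at the first character where any later match disagrees; the caller's prefix is overwritten only if the common part is strictly longer. A stand-alone sketch with a fixed name list standing in for kdb_walk_kallsyms() (illustrative only, not part of this patch):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *names[] = { "kdb_printf", "kdb_print_state", "kdb_parse" };
		char prefix[64] = "kdb_pr";	/* what the user typed before TAB */
		char common[64];
		size_t plen = strlen(prefix), clen = 0;
		int i, matches = 0;

		for (i = 0; i < 3; i++) {
			size_t j;

			if (strncmp(names[i], prefix, plen) != 0)
				continue;
			if (++matches == 1) {	/* first match seeds the candidate */
				strcpy(common, names[i]);
				clen = strlen(common);
				continue;
			}
			for (j = 0; j < clen; j++) {	/* cut at first disagreement */
				if (common[j] != names[i][j]) {
					clen = j;
					common[j] = '\0';
					break;
				}
			}
		}
		if (clen > plen)
			strcpy(prefix, common);	/* extend only if strictly longer */
		printf("%d match(es), completion \"%s\"\n", matches, prefix);
		return 0;	/* prints: 2 match(es), completion "kdb_print" */
	}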
217
218/*
219 * kallsyms_symbol_next
220 *
221 * Parameters:
222 * prefix_name prefix of a symbol name to lookup
223 * flag 0 means search from the head, 1 means continue search.
224 * Returns:
225 * 1 if a symbol matches the given prefix.
226 * 0 if no string found
227 */
228int kallsyms_symbol_next(char *prefix_name, int flag)
229{
230 int prefix_len = strlen(prefix_name);
231 static loff_t pos;
232 const char *name;
233
234 if (!flag)
235 pos = 0;
236
237 while ((name = kdb_walk_kallsyms(&pos))) {
238 if (strncmp(name, prefix_name, prefix_len) == 0) {
239 strncpy(prefix_name, name, strlen(name)+1);
240 return 1;
241 }
242 }
243 return 0;
244}
245
246/*
247 * kdb_symbol_print - Standard method for printing a symbol name and offset.
248 * Inputs:
249 * addr Address to be printed.
250 * symtab Address of symbol data, if NULL this routine does its
251 * own lookup.
252 * punc Punctuation for string, bit field.
253 * Remarks:
254 * The string and its punctuation is only printed if the address
255 * is inside the kernel, except that the value is always printed
256 * when requested.
257 */
258void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
259 unsigned int punc)
260{
261 kdb_symtab_t symtab, *symtab_p2;
262 if (symtab_p) {
263 symtab_p2 = (kdb_symtab_t *)symtab_p;
264 } else {
265 symtab_p2 = &symtab;
266 kdbnearsym(addr, symtab_p2);
267 }
268 if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
269 return;
270 if (punc & KDB_SP_SPACEB)
271 kdb_printf(" ");
272 if (punc & KDB_SP_VALUE)
273 kdb_printf(kdb_machreg_fmt0, addr);
274 if (symtab_p2->sym_name) {
275 if (punc & KDB_SP_VALUE)
276 kdb_printf(" ");
277 if (punc & KDB_SP_PAREN)
278 kdb_printf("(");
279 if (strcmp(symtab_p2->mod_name, "kernel"))
280 kdb_printf("[%s]", symtab_p2->mod_name);
281 kdb_printf("%s", symtab_p2->sym_name);
282 if (addr != symtab_p2->sym_start)
283 kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
284 if (punc & KDB_SP_SYMSIZE)
285 kdb_printf("/0x%lx",
286 symtab_p2->sym_end - symtab_p2->sym_start);
287 if (punc & KDB_SP_PAREN)
288 kdb_printf(")");
289 }
290 if (punc & KDB_SP_SPACEA)
291 kdb_printf(" ");
292 if (punc & KDB_SP_NEWLINE)
293 kdb_printf("\n");
294}
295
296/*
297 * kdb_strdup - kdb equivalent of strdup, for disasm code.
298 * Inputs:
299 * str The string to duplicate.
300 * type Flags to kmalloc for the new string.
301 * Returns:
302 * Address of the new string, NULL if storage could not be allocated.
303 * Remarks:
304 * This is not in lib/string.c because it uses kmalloc which is not
305 * available when string.o is used in boot loaders.
306 */
307char *kdb_strdup(const char *str, gfp_t type)
308{
309 int n = strlen(str)+1;
310 char *s = kmalloc(n, type);
311 if (!s)
312 return NULL;
313 return strcpy(s, str);
314}
315
316/*
317 * kdb_getarea_size - Read an area of data. The kdb equivalent of
318 * copy_from_user, with kdb messages for invalid addresses.
319 * Inputs:
320 * res Pointer to the area to receive the result.
321 * addr Address of the area to copy.
322 * size Size of the area.
323 * Returns:
324 * 0 for success, < 0 for error.
325 */
326int kdb_getarea_size(void *res, unsigned long addr, size_t size)
327{
328 int ret = probe_kernel_read((char *)res, (char *)addr, size);
329 if (ret) {
330 if (!KDB_STATE(SUPPRESS)) {
331 kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
332 KDB_STATE_SET(SUPPRESS);
333 }
334 ret = KDB_BADADDR;
335 } else {
336 KDB_STATE_CLEAR(SUPPRESS);
337 }
338 return ret;
339}
340
341/*
342 * kdb_putarea_size - Write an area of data. The kdb equivalent of
343 * copy_to_user, with kdb messages for invalid addresses.
344 * Inputs:
345 * addr Address of the area to write to.
346 * res Pointer to the area holding the data.
347 * size Size of the area.
348 * Returns:
349 * 0 for success, < 0 for error.
350 */
351int kdb_putarea_size(unsigned long addr, void *res, size_t size)
352{
353 int ret = probe_kernel_read((char *)addr, (char *)res, size);
354 if (ret) {
355 if (!KDB_STATE(SUPPRESS)) {
356 kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
357 KDB_STATE_SET(SUPPRESS);
358 }
359 ret = KDB_BADADDR;
360 } else {
361 KDB_STATE_CLEAR(SUPPRESS);
362 }
363 return ret;
364}
365
366/*
367 * kdb_getphys - Read data from a physical address. Validate the
368 * address is in range, use kmap_atomic() to get data
369 * similar to kdb_getarea() - but for phys addresses
370 * Inputs:
371 * res Pointer to the word to receive the result
372 * addr Physical address of the area to copy
373 * size Size of the area
374 * Returns:
375 * 0 for success, < 0 for error.
376 */
377static int kdb_getphys(void *res, unsigned long addr, size_t size)
378{
379 unsigned long pfn;
380 void *vaddr;
381 struct page *page;
382
383 pfn = (addr >> PAGE_SHIFT);
384 if (!pfn_valid(pfn))
385 return 1;
386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB);
390
391 return 0;
392}
393
394/*
395 * kdb_getphysword
396 * Inputs:
397 * word Pointer to the word to receive the result.
398 * addr Address of the area to copy.
399 * size Size of the area.
400 * Returns:
401 * 0 for success, < 0 for error.
402 */
403int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
404{
405 int diag;
406 __u8 w1;
407 __u16 w2;
408 __u32 w4;
409 __u64 w8;
410 *word = 0; /* Default value if addr or size is invalid */
411
412 switch (size) {
413 case 1:
414 diag = kdb_getphys(&w1, addr, sizeof(w1));
415 if (!diag)
416 *word = w1;
417 break;
418 case 2:
419 diag = kdb_getphys(&w2, addr, sizeof(w2));
420 if (!diag)
421 *word = w2;
422 break;
423 case 4:
424 diag = kdb_getphys(&w4, addr, sizeof(w4));
425 if (!diag)
426 *word = w4;
427 break;
428 case 8:
429 if (size <= sizeof(*word)) {
430 diag = kdb_getphys(&w8, addr, sizeof(w8));
431 if (!diag)
432 *word = w8;
433 break;
434 }
435 /* drop through */
436 default:
437 diag = KDB_BADWIDTH;
438 kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
439 }
440 return diag;
441}
442
443/*
444 * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
445 * data as numbers.
446 * Inputs:
447 * word Pointer to the word to receive the result.
448 * addr Address of the area to copy.
449 * size Size of the area.
450 * Returns:
451 * 0 for success, < 0 for error.
452 */
453int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
454{
455 int diag;
456 __u8 w1;
457 __u16 w2;
458 __u32 w4;
459 __u64 w8;
460 *word = 0; /* Default value if addr or size is invalid */
461 switch (size) {
462 case 1:
463 diag = kdb_getarea(w1, addr);
464 if (!diag)
465 *word = w1;
466 break;
467 case 2:
468 diag = kdb_getarea(w2, addr);
469 if (!diag)
470 *word = w2;
471 break;
472 case 4:
473 diag = kdb_getarea(w4, addr);
474 if (!diag)
475 *word = w4;
476 break;
477 case 8:
478 if (size <= sizeof(*word)) {
479 diag = kdb_getarea(w8, addr);
480 if (!diag)
481 *word = w8;
482 break;
483 }
484 /* drop through */
485 default:
486 diag = KDB_BADWIDTH;
487 kdb_printf("kdb_getword: bad width %ld\n", (long) size);
488 }
489 return diag;
490}
491
492/*
493 * kdb_putword - Write a binary value. Unlike kdb_putarea, this
494 * treats data as numbers.
495 * Inputs:
496 * addr Address of the area to write to.
497 * word The value to set.
498 * size Size of the area.
499 * Returns:
500 * 0 for success, < 0 for error.
501 */
502int kdb_putword(unsigned long addr, unsigned long word, size_t size)
503{
504 int diag;
505 __u8 w1;
506 __u16 w2;
507 __u32 w4;
508 __u64 w8;
509 switch (size) {
510 case 1:
511 w1 = word;
512 diag = kdb_putarea(addr, w1);
513 break;
514 case 2:
515 w2 = word;
516 diag = kdb_putarea(addr, w2);
517 break;
518 case 4:
519 w4 = word;
520 diag = kdb_putarea(addr, w4);
521 break;
522 case 8:
523 if (size <= sizeof(word)) {
524 w8 = word;
525 diag = kdb_putarea(addr, w8);
526 break;
527 }
528 /* drop through */
529 default:
530 diag = KDB_BADWIDTH;
531 kdb_printf("kdb_putword: bad width %ld\n", (long) size);
532 }
533 return diag;
534}
535
536/*
537 * kdb_task_state_string - Convert a string containing any of the
538 * letters DRSTCZEUIMA to a mask for the process state field and
539 * return the value. If no argument is supplied, return the mask
540 * that corresponds to environment variable PS, DRSTCZEU by
541 * default.
542 * Inputs:
543 * s String to convert
544 * Returns:
545 * Mask for process state.
546 * Notes:
547 * The mask folds data from several sources into a single long value, so
548 * be careful not to overlap the bits. TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
552 * the mask.
553 */
554
555/* unrunnable is < 0 */
556#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
557#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
558#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
559#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
560
561unsigned long kdb_task_state_string(const char *s)
562{
563 long res = 0;
564 if (!s) {
565 s = kdbgetenv("PS");
566 if (!s)
567 s = "DRSTCZEU"; /* default value for ps */
568 }
569 while (*s) {
570 switch (*s) {
571 case 'D':
572 res |= TASK_UNINTERRUPTIBLE;
573 break;
574 case 'R':
575 res |= RUNNING;
576 break;
577 case 'S':
578 res |= TASK_INTERRUPTIBLE;
579 break;
580 case 'T':
581 res |= TASK_STOPPED;
582 break;
583 case 'C':
584 res |= TASK_TRACED;
585 break;
586 case 'Z':
587 res |= EXIT_ZOMBIE << 16;
588 break;
589 case 'E':
590 res |= EXIT_DEAD << 16;
591 break;
592 case 'U':
593 res |= UNRUNNABLE;
594 break;
595 case 'I':
596 res |= IDLE;
597 break;
598 case 'M':
599 res |= DAEMON;
600 break;
601 case 'A':
602 res = ~0UL;
603 break;
604 default:
605 kdb_printf("%s: unknown flag '%c' ignored\n",
606 __func__, *s);
607 break;
608 }
609 ++s;
610 }
611 return res;
612}
613
614/*
615 * kdb_task_state_char - Return the character that represents the task state.
616 * Inputs:
617 * p struct task for the process
618 * Returns:
619 * One character to represent the task state.
620 */
621char kdb_task_state_char (const struct task_struct *p)
622{
623 int cpu;
624 char state;
625 unsigned long tmp;
626
627 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
628 return 'E';
629
630 cpu = kdb_process_cpu(p);
631 state = (p->state == 0) ? 'R' :
632 (p->state < 0) ? 'U' :
633 (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
634 (p->state & TASK_STOPPED) ? 'T' :
635 (p->state & TASK_TRACED) ? 'C' :
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) {
640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
643 if (cpu != kdb_initial_cpu)
644 state = 'I'; /* idle task */
645 }
646 } else if (!p->mm && state == 'S') {
647 state = 'M'; /* sleeping system daemon */
648 }
649 return state;
650}
651
652/*
653 * kdb_task_state - Return true if a process has the desired state
654 * given by the mask.
655 * Inputs:
656 * p struct task for the process
657 * mask mask from kdb_task_state_string to select processes
658 * Returns:
659 * True if the process matches at least one criteria defined by the mask.
660 */
661unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
662{
663 char state[] = { kdb_task_state_char(p), '\0' };
664 return (mask & kdb_task_state_string(state)) != 0;
665}
666
667/*
668 * kdb_print_nameval - Print a name and its value, converting the
669 * value to a symbol lookup if possible.
670 * Inputs:
671 * name field name to print
672 * val value of field
673 */
674void kdb_print_nameval(const char *name, unsigned long val)
675{
676 kdb_symtab_t symtab;
677 kdb_printf(" %-11.11s ", name);
678 if (kdbnearsym(val, &symtab))
679 kdb_symbol_print(val, &symtab,
680 KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
681 else
682 kdb_printf("0x%lx\n", val);
683}
684
685/* Last ditch allocator for debugging, so we can still debug even when
686 * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
687 * for space usage, not for speed. One smallish memory pool, the free
688 * chain is always in ascending address order to allow coalescing,
689 * allocations are done in brute force best fit.
690 */
691
692struct debug_alloc_header {
693 u32 next; /* offset of next header from start of pool */
694 u32 size;
695 void *caller;
696};
697
698/* The memory returned by this allocator must be aligned, which means
699 * so must the header size. Do not assume that sizeof(struct
700 * debug_alloc_header) is a multiple of the alignment, explicitly
701 * calculate the overhead of this header, including the alignment.
702 * The rest of this code must not use sizeof() on any header or
703 * pointer to a header.
704 */
705#define dah_align 8
706#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
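The point of dah_overhead is that sizeof(struct debug_alloc_header) need not be a multiple of the 8-byte alignment (on 32-bit it is 12), so the payload offset has to be rounded up separately rather than taken from sizeof(). A stand-alone sketch of the same calculation (illustrative only, not part of this patch; ALIGN_UP mirrors the kernel's ALIGN macro):

	#include <stdio.h>
	#include <stdint.h>

	#define DAH_ALIGN	8
	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

	struct header {			/* mirrors struct debug_alloc_header */
		uint32_t next;
		uint32_t size;
		void *caller;
	};

	int main(void)
	{
		size_t overhead = ALIGN_UP(sizeof(struct header), DAH_ALIGN);

		/* On 32-bit, sizeof() is 12 but the payload starts 16 bytes in. */
		printf("sizeof = %zu, aligned overhead = %zu\n",
		       sizeof(struct header), overhead);
		return 0;
	}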
707
708static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
709static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
710static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
711
712/* Locking is awkward. The debug code is called from all contexts,
713 * including non maskable interrupts. A normal spinlock is not safe
714 * in NMI context. Try to get the debug allocator lock, if it cannot
715 * be obtained after a second then give up. If the lock could not be
716 * previously obtained on this cpu then only try once.
717 *
718 * sparse has no annotation for "this function _sometimes_ acquires a
719 * lock", so fudge the acquire/release notation.
720 */
721static DEFINE_SPINLOCK(dap_lock);
722static int get_dap_lock(void)
723 __acquires(dap_lock)
724{
725 static int dap_locked = -1;
726 int count;
727 if (dap_locked == smp_processor_id())
728 count = 1;
729 else
730 count = 1000;
731 while (1) {
732 if (spin_trylock(&dap_lock)) {
733 dap_locked = -1;
734 return 1;
735 }
736 if (!count--)
737 break;
738 udelay(1000);
739 }
740 dap_locked = smp_processor_id();
741 __acquire(dap_lock);
742 return 0;
743}
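get_dap_lock() is a bounded trylock: spin on spin_trylock() for roughly a second, give up if the lock never arrives, and remember which cpu timed out so that later attempts on that cpu only try once. A stand-alone sketch of the bounded-retry part (illustrative only, not part of this patch; it omits the per-cpu "only try once next time" state and uses C11 atomics plus POSIX usleep()):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	static atomic_flag lock = ATOMIC_FLAG_INIT;

	static bool get_lock_bounded(int attempts)
	{
		while (1) {
			if (!atomic_flag_test_and_set(&lock))
				return true;	/* lock acquired */
			if (attempts-- <= 0)
				return false;	/* give up; caller must cope */
			usleep(1000);		/* ~1ms back-off, like udelay(1000) */
		}
	}

	int main(void)
	{
		if (get_lock_bounded(1000))	/* ~1 second worth of retries */
			printf("got the lock\n");
		else
			printf("timed out without the lock\n");
		atomic_flag_clear(&lock);
		return 0;
	}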
744
745void *debug_kmalloc(size_t size, gfp_t flags)
746{
747 unsigned int rem, h_offset;
748 struct debug_alloc_header *best, *bestprev, *prev, *h;
749 void *p = NULL;
750 if (!get_dap_lock()) {
751 __release(dap_lock); /* we never actually got it */
752 return NULL;
753 }
754 h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
755 if (dah_first_call) {
756 h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
757 dah_first_call = 0;
758 }
759 size = ALIGN(size, dah_align);
760 prev = best = bestprev = NULL;
761 while (1) {
762 if (h->size >= size && (!best || h->size < best->size)) {
763 best = h;
764 bestprev = prev;
765 if (h->size == size)
766 break;
767 }
768 if (!h->next)
769 break;
770 prev = h;
771 h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
772 }
773 if (!best)
774 goto out;
775 rem = best->size - size;
776 /* The pool must always contain at least one header */
777 if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
778 goto out;
779 if (rem >= dah_overhead) {
780 best->size = size;
781 h_offset = ((char *)best - debug_alloc_pool) +
782 dah_overhead + best->size;
783 h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
784 h->size = rem - dah_overhead;
785 h->next = best->next;
786 } else
787 h_offset = best->next;
788 best->caller = __builtin_return_address(0);
789 dah_used += best->size;
790 dah_used_max = max(dah_used, dah_used_max);
791 if (bestprev)
792 bestprev->next = h_offset;
793 else
794 dah_first = h_offset;
795 p = (char *)best + dah_overhead;
796 memset(p, POISON_INUSE, best->size - 1);
797 *((char *)p + best->size - 1) = POISON_END;
798out:
799 spin_unlock(&dap_lock);
800 return p;
801}
802
803void debug_kfree(void *p)
804{
805 struct debug_alloc_header *h;
806 unsigned int h_offset;
807 if (!p)
808 return;
809 if ((char *)p < debug_alloc_pool ||
810 (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
811 kfree(p);
812 return;
813 }
814 if (!get_dap_lock()) {
815 __release(dap_lock); /* we never actually got it */
816 return; /* memory leak, cannot be helped */
817 }
818 h = (struct debug_alloc_header *)((char *)p - dah_overhead);
819 memset(p, POISON_FREE, h->size - 1);
820 *((char *)p + h->size - 1) = POISON_END;
821 h->caller = NULL;
822 dah_used -= h->size;
823 h_offset = (char *)h - debug_alloc_pool;
824 if (h_offset < dah_first) {
825 h->next = dah_first;
826 dah_first = h_offset;
827 } else {
828 struct debug_alloc_header *prev;
829 unsigned int prev_offset;
830 prev = (struct debug_alloc_header *)(debug_alloc_pool +
831 dah_first);
832 while (1) {
833 if (!prev->next || prev->next > h_offset)
834 break;
835 prev = (struct debug_alloc_header *)
836 (debug_alloc_pool + prev->next);
837 }
838 prev_offset = (char *)prev - debug_alloc_pool;
839 if (prev_offset + dah_overhead + prev->size == h_offset) {
840 prev->size += dah_overhead + h->size;
841 memset(h, POISON_FREE, dah_overhead - 1);
842 *((char *)h + dah_overhead - 1) = POISON_END;
843 h = prev;
844 h_offset = prev_offset;
845 } else {
846 h->next = prev->next;
847 prev->next = h_offset;
848 }
849 }
850 if (h_offset + dah_overhead + h->size == h->next) {
851 struct debug_alloc_header *next;
852 next = (struct debug_alloc_header *)
853 (debug_alloc_pool + h->next);
854 h->size += dah_overhead + next->size;
855 h->next = next->next;
856 memset(next, POISON_FREE, dah_overhead - 1);
857 *((char *)next + dah_overhead - 1) = POISON_END;
858 }
859 spin_unlock(&dap_lock);
860}
861
862void debug_kusage(void)
863{
864 struct debug_alloc_header *h_free, *h_used;
865#ifdef CONFIG_IA64
866 /* FIXME: using dah for ia64 unwind always results in a memory leak.
867 * Fix that memory leak first, then set debug_kusage_one_time = 1 for
868 * all architectures.
869 */
870 static int debug_kusage_one_time;
871#else
872 static int debug_kusage_one_time = 1;
873#endif
874 if (!get_dap_lock()) {
875 __release(dap_lock); /* we never actually got it */
876 return;
877 }
878 h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
879 if (dah_first == 0 &&
880 (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
881 dah_first_call))
882 goto out;
883 if (!debug_kusage_one_time)
884 goto out;
885 debug_kusage_one_time = 0;
886 kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
887 __func__, dah_first);
888 if (dah_first) {
889 h_used = (struct debug_alloc_header *)debug_alloc_pool;
890 kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
891 h_used->size);
892 }
893 do {
894 h_used = (struct debug_alloc_header *)
895 ((char *)h_free + dah_overhead + h_free->size);
896 kdb_printf("%s: h_used %p size %d caller %p\n",
897 __func__, h_used, h_used->size, h_used->caller);
898 h_free = (struct debug_alloc_header *)
899 (debug_alloc_pool + h_free->next);
900 } while (h_free->next);
901 h_used = (struct debug_alloc_header *)
902 ((char *)h_free + dah_overhead + h_free->size);
903 if ((char *)h_used - debug_alloc_pool !=
904 sizeof(debug_alloc_pool_aligned))
905 kdb_printf("%s: h_used %p size %d caller %p\n",
906 __func__, h_used, h_used->size, h_used->caller);
907out:
908 spin_unlock(&dap_lock);
909}
910
911/* Maintain a small stack of kdb_flags to allow recursion without disturbing
912 * the global kdb state.
913 */
914
915static int kdb_flags_stack[4], kdb_flags_index;
916
917void kdb_save_flags(void)
918{
919 BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
920 kdb_flags_stack[kdb_flags_index++] = kdb_flags;
921}
922
923void kdb_restore_flags(void)
924{
925 BUG_ON(kdb_flags_index <= 0);
926 kdb_flags = kdb_flags_stack[--kdb_flags_index];
927}
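The debug_kfree() path above keeps its free list as offsets into a static pool and merges a freed block with its neighbours when they are adjacent. The following stand-alone sketch (not part of the commit; pool, hdr, OVERHEAD and pool_free are illustrative names) shows the same address-ordered insert-and-coalesce logic in plain C, assuming the list is never empty, which the kdb pool guarantees by always keeping its tail block free.

#define POOL_SIZE	4096u
struct hdr { unsigned int next; unsigned int size; };	/* header kept in front of each block */
#define OVERHEAD	sizeof(struct hdr)

static char pool[POOL_SIZE];
static unsigned int first_free;		/* offset of the first free block */

static void pool_free(void *p)
{
	struct hdr *h = (struct hdr *)((char *)p - OVERHEAD);
	unsigned int off = (char *)h - pool;

	if (off < first_free) {			/* new head of the address-ordered list */
		h->next = first_free;
		first_free = off;
	} else {				/* walk to the predecessor and insert */
		struct hdr *prev = (struct hdr *)(pool + first_free);

		while (prev->next && prev->next < off)
			prev = (struct hdr *)(pool + prev->next);
		if ((char *)prev - pool + OVERHEAD + prev->size == off) {
			prev->size += OVERHEAD + h->size;	/* merge with predecessor */
			h = prev;
			off = (char *)prev - pool;
		} else {
			h->next = prev->next;
			prev->next = off;
		}
	}
	if (off + OVERHEAD + h->size == h->next) {	/* merge with successor */
		struct hdr *next = (struct hdr *)(pool + h->next);

		h->size += OVERHEAD + next->size;
		h->next = next->next;
	}
}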
diff --git a/kernel/early_res.c b/kernel/early_res.c
index 31aa9332ef3f..7bfae887f211 100644
--- a/kernel/early_res.c
+++ b/kernel/early_res.c
@@ -7,6 +7,8 @@
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/early_res.h> 9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
10 12
11/* 13/*
12 * Early reserved memory areas. 14 * Early reserved memory areas.
@@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end)
319 struct early_res *r; 321 struct early_res *r;
320 int i; 322 int i;
321 323
324 kmemleak_free_part(__va(start), end - start);
325
322 i = find_overlapped_early(start, end); 326 i = find_overlapped_early(start, end);
323 r = &early_res[i]; 327 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start) 328 if (i >= max_early_res || r->end != end || r->start != start)
@@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end)
333 struct early_res *r; 337 struct early_res *r;
334 int i; 338 int i;
335 339
340 kmemleak_free_part(__va(start), end - start);
341
336 if (start == end) 342 if (start == end)
337 return; 343 return;
338 344
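The two kmemleak_free_part() calls added above follow the usual pattern for memory handed out by an allocator kmemleak does not know about: announce the region once, then drop sub-ranges from kmemleak's view as they are released. A hedged sketch of that pattern; my_pool_register and my_pool_release_part are made-up names, only the kmemleak calls are real API.

#include <linux/gfp.h>
#include <linux/kmemleak.h>

static void my_pool_register(void *base, size_t size)
{
	/* let kmemleak track and scan this region like an ordinary allocation */
	kmemleak_alloc(base, size, 1, GFP_KERNEL);
}

static void my_pool_release_part(void *addr, size_t size)
{
	/* forget only the released sub-range, as free_early() now does above */
	kmemleak_free_part(addr, size);
}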
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c35452cadded..0dbeae374225 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain;
27static DEFINE_RWLOCK(exec_domains_lock); 27static DEFINE_RWLOCK(exec_domains_lock);
28 28
29 29
30static u_long ident_map[32] = { 30static unsigned long ident_map[32] = {
31 0, 1, 2, 3, 4, 5, 6, 7, 31 0, 1, 2, 3, 4, 5, 6, 7,
32 8, 9, 10, 11, 12, 13, 14, 15, 32 8, 9, 10, 11, 12, 13, 14, 15,
33 16, 17, 18, 19, 20, 21, 22, 23, 33 16, 17, 18, 19, 20, 21, 22, 23,
@@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp)
56} 56}
57 57
58static struct exec_domain * 58static struct exec_domain *
59lookup_exec_domain(u_long personality) 59lookup_exec_domain(unsigned int personality)
60{ 60{
61 struct exec_domain * ep; 61 unsigned int pers = personality(personality);
62 u_long pers = personality(personality); 62 struct exec_domain *ep;
63 63
64 read_lock(&exec_domains_lock); 64 read_lock(&exec_domains_lock);
65 for (ep = exec_domains; ep; ep = ep->next) { 65 for (ep = exec_domains; ep; ep = ep->next) {
@@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality)
70 70
71#ifdef CONFIG_MODULES 71#ifdef CONFIG_MODULES
72 read_unlock(&exec_domains_lock); 72 read_unlock(&exec_domains_lock);
73 request_module("personality-%ld", pers); 73 request_module("personality-%d", pers);
74 read_lock(&exec_domains_lock); 74 read_lock(&exec_domains_lock);
75 75
76 for (ep = exec_domains; ep; ep = ep->next) { 76 for (ep = exec_domains; ep; ep = ep->next) {
@@ -134,23 +134,14 @@ unregister:
134 return 0; 134 return 0;
135} 135}
136 136
137int 137int __set_personality(unsigned int personality)
138__set_personality(u_long personality)
139{ 138{
140 struct exec_domain *ep, *oep; 139 struct exec_domain *oep = current_thread_info()->exec_domain;
141
142 ep = lookup_exec_domain(personality);
143 if (ep == current_thread_info()->exec_domain) {
144 current->personality = personality;
145 module_put(ep->module);
146 return 0;
147 }
148 140
141 current_thread_info()->exec_domain = lookup_exec_domain(personality);
149 current->personality = personality; 142 current->personality = personality;
150 oep = current_thread_info()->exec_domain;
151 current_thread_info()->exec_domain = ep;
152
153 module_put(oep->module); 143 module_put(oep->module);
144
154 return 0; 145 return 0;
155} 146}
156 147
@@ -188,17 +179,14 @@ static int __init proc_execdomains_init(void)
188module_init(proc_execdomains_init); 179module_init(proc_execdomains_init);
189#endif 180#endif
190 181
191SYSCALL_DEFINE1(personality, u_long, personality) 182SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 183{
193 u_long old = current->personality; 184 unsigned int old = current->personality;
194 185
195 if (personality != 0xffffffff) { 186 if (personality != 0xffffffff)
196 set_personality(personality); 187 set_personality(personality);
197 if (current->personality != personality)
198 return -EINVAL;
199 }
200 188
201 return (long)old; 189 return old;
202} 190}
203 191
204 192
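After this change sys_personality() simply stores the new value and returns the previous one; the old -EINVAL path for unknown personalities is gone. A small user-space usage sketch (illustrative only) of the resulting query/set/restore idiom:

#include <stdio.h>
#include <sys/personality.h>

int main(void)
{
	unsigned int old = personality(0xffffffff);	/* query without changing anything */

	printf("current personality: %#x\n", old);
	personality(old | ADDR_NO_RANDOMIZE);		/* set one flag ... */
	personality(old);				/* ... and put the old value back */
	return 0;
}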
diff --git a/kernel/exit.c b/kernel/exit.c
index 256ce8c2ebc8..b9d3bc6c21ec 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,17 +55,16 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60extern void exit_od_table(struct task_struct *t); 59extern void exit_od_table(struct task_struct *t);
61 60
62static void exit_mm(struct task_struct * tsk); 61static void exit_mm(struct task_struct * tsk);
63 62
64static void __unhash_process(struct task_struct *p) 63static void __unhash_process(struct task_struct *p, bool group_dead)
65{ 64{
66 nr_threads--; 65 nr_threads--;
67 detach_pid(p, PIDTYPE_PID); 66 detach_pid(p, PIDTYPE_PID);
68 if (thread_group_leader(p)) { 67 if (group_dead) {
69 detach_pid(p, PIDTYPE_PGID); 68 detach_pid(p, PIDTYPE_PGID);
70 detach_pid(p, PIDTYPE_SID); 69 detach_pid(p, PIDTYPE_SID);
71 70
@@ -82,10 +81,9 @@ static void __unhash_process(struct task_struct *p)
82static void __exit_signal(struct task_struct *tsk) 81static void __exit_signal(struct task_struct *tsk)
83{ 82{
84 struct signal_struct *sig = tsk->signal; 83 struct signal_struct *sig = tsk->signal;
84 bool group_dead = thread_group_leader(tsk);
85 struct sighand_struct *sighand; 85 struct sighand_struct *sighand;
86 86 struct tty_struct *uninitialized_var(tty);
87 BUG_ON(!sig);
88 BUG_ON(!atomic_read(&sig->count));
89 87
90 sighand = rcu_dereference_check(tsk->sighand, 88 sighand = rcu_dereference_check(tsk->sighand,
91 rcu_read_lock_held() || 89 rcu_read_lock_held() ||
@@ -93,14 +91,16 @@ static void __exit_signal(struct task_struct *tsk)
93 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
94 92
95 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
96 if (atomic_dec_and_test(&sig->count)) 94 if (group_dead) {
97 posix_cpu_timers_exit_group(tsk); 95 posix_cpu_timers_exit_group(tsk);
98 else { 96 tty = sig->tty;
97 sig->tty = NULL;
98 } else {
99 /* 99 /*
100 * If there is any task waiting for the group exit 100 * If there is any task waiting for the group exit
101 * then notify it: 101 * then notify it:
102 */ 102 */
103 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) 103 if (sig->notify_count > 0 && !--sig->notify_count)
104 wake_up_process(sig->group_exit_task); 104 wake_up_process(sig->group_exit_task);
105 105
106 if (tsk == sig->curr_target) 106 if (tsk == sig->curr_target)
@@ -126,32 +126,24 @@ static void __exit_signal(struct task_struct *tsk)
126 sig->oublock += task_io_get_oublock(tsk); 126 sig->oublock += task_io_get_oublock(tsk);
127 task_io_accounting_add(&sig->ioac, &tsk->ioac); 127 task_io_accounting_add(&sig->ioac, &tsk->ioac);
128 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 128 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
129 sig = NULL; /* Marker for below. */
130 } 129 }
131 130
132 __unhash_process(tsk); 131 sig->nr_threads--;
132 __unhash_process(tsk, group_dead);
133 133
134 /* 134 /*
135 * Do this under ->siglock, we can race with another thread 135 * Do this under ->siglock, we can race with another thread
136 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 136 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
137 */ 137 */
138 flush_sigqueue(&tsk->pending); 138 flush_sigqueue(&tsk->pending);
139
140 tsk->signal = NULL;
141 tsk->sighand = NULL; 139 tsk->sighand = NULL;
142 spin_unlock(&sighand->siglock); 140 spin_unlock(&sighand->siglock);
143 141
144 __cleanup_sighand(sighand); 142 __cleanup_sighand(sighand);
145 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 143 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
146 if (sig) { 144 if (group_dead) {
147 flush_sigqueue(&sig->shared_pending); 145 flush_sigqueue(&sig->shared_pending);
148 taskstats_tgid_free(sig); 146 tty_kref_put(tty);
149 /*
150 * Make sure ->signal can't go away under rq->lock,
151 * see account_group_exec_runtime().
152 */
153 task_rq_unlock_wait(tsk);
154 __cleanup_signal(sig);
155 } 147 }
156} 148}
157 149
@@ -781,9 +773,12 @@ static void forget_original_parent(struct task_struct *father)
781 struct task_struct *p, *n, *reaper; 773 struct task_struct *p, *n, *reaper;
782 LIST_HEAD(dead_children); 774 LIST_HEAD(dead_children);
783 775
784 exit_ptrace(father);
785
786 write_lock_irq(&tasklist_lock); 776 write_lock_irq(&tasklist_lock);
777 /*
778 * Note that exit_ptrace() and find_new_reaper() might
779 * drop tasklist_lock and reacquire it.
780 */
781 exit_ptrace(father);
787 reaper = find_new_reaper(father); 782 reaper = find_new_reaper(father);
788 783
789 list_for_each_entry_safe(p, n, &father->children, sibling) { 784 list_for_each_entry_safe(p, n, &father->children, sibling) {
@@ -859,12 +854,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
859 854
860 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 855 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
861 856
862 /* mt-exec, de_thread() is waiting for us */ 857 /* mt-exec, de_thread() is waiting for group leader */
863 if (thread_group_leader(tsk) && 858 if (unlikely(tsk->signal->notify_count < 0))
864 tsk->signal->group_exit_task &&
865 tsk->signal->notify_count < 0)
866 wake_up_process(tsk->signal->group_exit_task); 859 wake_up_process(tsk->signal->group_exit_task);
867
868 write_unlock_irq(&tasklist_lock); 860 write_unlock_irq(&tasklist_lock);
869 861
870 tracehook_report_death(tsk, signal, cookie, group_dead); 862 tracehook_report_death(tsk, signal, cookie, group_dead);
@@ -1007,8 +999,10 @@ NORET_TYPE void do_exit(long code)
1007 999
1008 exit_notify(tsk, group_dead); 1000 exit_notify(tsk, group_dead);
1009#ifdef CONFIG_NUMA 1001#ifdef CONFIG_NUMA
1002 task_lock(tsk);
1010 mpol_put(tsk->mempolicy); 1003 mpol_put(tsk->mempolicy);
1011 tsk->mempolicy = NULL; 1004 tsk->mempolicy = NULL;
1005 task_unlock(tsk);
1012#endif 1006#endif
1013#ifdef CONFIG_FUTEX 1007#ifdef CONFIG_FUTEX
1014 if (unlikely(current->pi_state_cache)) 1008 if (unlikely(current->pi_state_cache))
@@ -1396,8 +1390,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1396 if (!unlikely(wo->wo_flags & WNOWAIT)) 1390 if (!unlikely(wo->wo_flags & WNOWAIT))
1397 *p_code = 0; 1391 *p_code = 0;
1398 1392
1399 /* don't need the RCU readlock here as we're holding a spinlock */ 1393 uid = task_uid(p);
1400 uid = __task_cred(p)->uid;
1401unlock_sig: 1394unlock_sig:
1402 spin_unlock_irq(&p->sighand->siglock); 1395 spin_unlock_irq(&p->sighand->siglock);
1403 if (!exit_code) 1396 if (!exit_code)
@@ -1470,7 +1463,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1470 } 1463 }
1471 if (!unlikely(wo->wo_flags & WNOWAIT)) 1464 if (!unlikely(wo->wo_flags & WNOWAIT))
1472 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1465 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1473 uid = __task_cred(p)->uid; 1466 uid = task_uid(p);
1474 spin_unlock_irq(&p->sighand->siglock); 1467 spin_unlock_irq(&p->sighand->siglock);
1475 1468
1476 pid = task_pid_vnr(p); 1469 pid = task_pid_vnr(p);
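One pattern worth noting in the __exit_signal() hunk above: the controlling tty reference is detached under siglock but released only after the lock is dropped. A minimal hedged sketch of that idiom using an invented struct session; only spin_lock/spin_unlock and tty_kref_put are real kernel API.

#include <linux/spinlock.h>
#include <linux/tty.h>

struct session {
	spinlock_t		lock;
	struct tty_struct	*tty;
};

static void session_clear_tty(struct session *s)
{
	struct tty_struct *tty;

	spin_lock(&s->lock);
	tty = s->tty;			/* take over the reference under the lock */
	s->tty = NULL;
	spin_unlock(&s->lock);

	tty_kref_put(tty);		/* tty_kref_put(NULL) is a no-op, so no check needed */
}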
diff --git a/kernel/fork.c b/kernel/fork.c
index 166eb780dd7d..ab7f29d906c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -168,6 +168,18 @@ void free_task(struct task_struct *tsk)
168} 168}
169EXPORT_SYMBOL(free_task); 169EXPORT_SYMBOL(free_task);
170 170
171static inline void free_signal_struct(struct signal_struct *sig)
172{
173 taskstats_tgid_free(sig);
174 kmem_cache_free(signal_cachep, sig);
175}
176
177static inline void put_signal_struct(struct signal_struct *sig)
178{
179 if (atomic_dec_and_test(&sig->sigcnt))
180 free_signal_struct(sig);
181}
182
171void __put_task_struct(struct task_struct *tsk) 183void __put_task_struct(struct task_struct *tsk)
172{ 184{
173 WARN_ON(!tsk->exit_state); 185 WARN_ON(!tsk->exit_state);
@@ -177,6 +189,7 @@ void __put_task_struct(struct task_struct *tsk)
177 exit_litmus(tsk); 189 exit_litmus(tsk);
178 exit_creds(tsk); 190 exit_creds(tsk);
179 delayacct_tsk_free(tsk); 191 delayacct_tsk_free(tsk);
192 put_signal_struct(tsk->signal);
180 193
181 if (!profile_handoff_task(tsk)) 194 if (!profile_handoff_task(tsk))
182 free_task(tsk); 195 free_task(tsk);
@@ -294,7 +307,7 @@ out:
294#ifdef CONFIG_MMU 307#ifdef CONFIG_MMU
295static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 308static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
296{ 309{
297 struct vm_area_struct *mpnt, *tmp, **pprev; 310 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
298 struct rb_node **rb_link, *rb_parent; 311 struct rb_node **rb_link, *rb_parent;
299 int retval; 312 int retval;
300 unsigned long charge; 313 unsigned long charge;
@@ -322,6 +335,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
322 if (retval) 335 if (retval)
323 goto out; 336 goto out;
324 337
338 prev = NULL;
325 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
326 struct file *file; 340 struct file *file;
327 341
@@ -349,11 +363,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
349 if (IS_ERR(pol)) 363 if (IS_ERR(pol))
350 goto fail_nomem_policy; 364 goto fail_nomem_policy;
351 vma_set_policy(tmp, pol); 365 vma_set_policy(tmp, pol);
366 tmp->vm_mm = mm;
352 if (anon_vma_fork(tmp, mpnt)) 367 if (anon_vma_fork(tmp, mpnt))
353 goto fail_nomem_anon_vma_fork; 368 goto fail_nomem_anon_vma_fork;
354 tmp->vm_flags &= ~VM_LOCKED; 369 tmp->vm_flags &= ~VM_LOCKED;
355 tmp->vm_mm = mm; 370 tmp->vm_next = tmp->vm_prev = NULL;
356 tmp->vm_next = NULL;
357 file = tmp->vm_file; 371 file = tmp->vm_file;
358 if (file) { 372 if (file) {
359 struct inode *inode = file->f_path.dentry->d_inode; 373 struct inode *inode = file->f_path.dentry->d_inode;
@@ -386,6 +400,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
386 */ 400 */
387 *pprev = tmp; 401 *pprev = tmp;
388 pprev = &tmp->vm_next; 402 pprev = &tmp->vm_next;
403 tmp->vm_prev = prev;
404 prev = tmp;
389 405
390 __vma_link_rb(mm, tmp, rb_link, rb_parent); 406 __vma_link_rb(mm, tmp, rb_link, rb_parent);
391 rb_link = &tmp->vm_rb.rb_right; 407 rb_link = &tmp->vm_rb.rb_right;
@@ -746,13 +762,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
746 struct fs_struct *fs = current->fs; 762 struct fs_struct *fs = current->fs;
747 if (clone_flags & CLONE_FS) { 763 if (clone_flags & CLONE_FS) {
748 /* tsk->fs is already what we want */ 764 /* tsk->fs is already what we want */
749 write_lock(&fs->lock); 765 spin_lock(&fs->lock);
750 if (fs->in_exec) { 766 if (fs->in_exec) {
751 write_unlock(&fs->lock); 767 spin_unlock(&fs->lock);
752 return -EAGAIN; 768 return -EAGAIN;
753 } 769 }
754 fs->users++; 770 fs->users++;
755 write_unlock(&fs->lock); 771 spin_unlock(&fs->lock);
756 return 0; 772 return 0;
757 } 773 }
758 tsk->fs = copy_fs_struct(fs); 774 tsk->fs = copy_fs_struct(fs);
@@ -871,8 +887,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
871 if (!sig) 887 if (!sig)
872 return -ENOMEM; 888 return -ENOMEM;
873 889
874 atomic_set(&sig->count, 1); 890 sig->nr_threads = 1;
875 atomic_set(&sig->live, 1); 891 atomic_set(&sig->live, 1);
892 atomic_set(&sig->sigcnt, 1);
876 init_waitqueue_head(&sig->wait_chldexit); 893 init_waitqueue_head(&sig->wait_chldexit);
877 if (clone_flags & CLONE_NEWPID) 894 if (clone_flags & CLONE_NEWPID)
878 sig->flags |= SIGNAL_UNKILLABLE; 895 sig->flags |= SIGNAL_UNKILLABLE;
@@ -892,22 +909,16 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
892 tty_audit_fork(sig); 909 tty_audit_fork(sig);
893 910
894 sig->oom_adj = current->signal->oom_adj; 911 sig->oom_adj = current->signal->oom_adj;
912 sig->oom_score_adj = current->signal->oom_score_adj;
895 913
896 return 0; 914 return 0;
897} 915}
898 916
899void __cleanup_signal(struct signal_struct *sig)
900{
901 thread_group_cputime_free(sig);
902 tty_kref_put(sig->tty);
903 kmem_cache_free(signal_cachep, sig);
904}
905
906static void copy_flags(unsigned long clone_flags, struct task_struct *p) 917static void copy_flags(unsigned long clone_flags, struct task_struct *p)
907{ 918{
908 unsigned long new_flags = p->flags; 919 unsigned long new_flags = p->flags;
909 920
910 new_flags &= ~PF_SUPERPRIV; 921 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
911 new_flags |= PF_FORKNOEXEC; 922 new_flags |= PF_FORKNOEXEC;
912 new_flags |= PF_STARTING; 923 new_flags |= PF_STARTING;
913 p->flags = new_flags; 924 p->flags = new_flags;
@@ -1119,8 +1130,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1119 p->memcg_batch.memcg = NULL; 1130 p->memcg_batch.memcg = NULL;
1120#endif 1131#endif
1121 1132
1122 p->bts = NULL;
1123
1124 /* Perform scheduler related setup. Assign this task to a CPU. */ 1133 /* Perform scheduler related setup. Assign this task to a CPU. */
1125 sched_fork(p, clone_flags); 1134 sched_fork(p, clone_flags);
1126 1135
@@ -1254,8 +1263,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1254 } 1263 }
1255 1264
1256 if (clone_flags & CLONE_THREAD) { 1265 if (clone_flags & CLONE_THREAD) {
1257 atomic_inc(&current->signal->count); 1266 current->signal->nr_threads++;
1258 atomic_inc(&current->signal->live); 1267 atomic_inc(&current->signal->live);
1268 atomic_inc(&current->signal->sigcnt);
1259 p->group_leader = current->group_leader; 1269 p->group_leader = current->group_leader;
1260 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1270 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1261 } 1271 }
@@ -1268,7 +1278,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1268 p->nsproxy->pid_ns->child_reaper = p; 1278 p->nsproxy->pid_ns->child_reaper = p;
1269 1279
1270 p->signal->leader_pid = pid; 1280 p->signal->leader_pid = pid;
1271 tty_kref_put(p->signal->tty);
1272 p->signal->tty = tty_kref_get(current->signal->tty); 1281 p->signal->tty = tty_kref_get(current->signal->tty);
1273 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1282 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1274 attach_pid(p, PIDTYPE_SID, task_session(current)); 1283 attach_pid(p, PIDTYPE_SID, task_session(current));
@@ -1301,7 +1310,7 @@ bad_fork_cleanup_mm:
1301 mmput(p->mm); 1310 mmput(p->mm);
1302bad_fork_cleanup_signal: 1311bad_fork_cleanup_signal:
1303 if (!(clone_flags & CLONE_THREAD)) 1312 if (!(clone_flags & CLONE_THREAD))
1304 __cleanup_signal(p->signal); 1313 free_signal_struct(p->signal);
1305bad_fork_cleanup_sighand: 1314bad_fork_cleanup_sighand:
1306 __cleanup_sighand(p->sighand); 1315 __cleanup_sighand(p->sighand);
1307bad_fork_cleanup_fs: 1316bad_fork_cleanup_fs:
@@ -1336,6 +1345,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re
1336 return regs; 1345 return regs;
1337} 1346}
1338 1347
1348static inline void init_idle_pids(struct pid_link *links)
1349{
1350 enum pid_type type;
1351
1352 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
1353 INIT_HLIST_NODE(&links[type].node); /* not really needed */
1354 links[type].pid = &init_struct_pid;
1355 }
1356}
1357
1339struct task_struct * __cpuinit fork_idle(int cpu) 1358struct task_struct * __cpuinit fork_idle(int cpu)
1340{ 1359{
1341 struct task_struct *task; 1360 struct task_struct *task;
@@ -1343,8 +1362,10 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1343 1362
1344 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, 1363 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1345 &init_struct_pid, 0); 1364 &init_struct_pid, 0);
1346 if (!IS_ERR(task)) 1365 if (!IS_ERR(task)) {
1366 init_idle_pids(task->pids);
1347 init_idle(task, cpu); 1367 init_idle(task, cpu);
1368 }
1348 1369
1349 return task; 1370 return task;
1350} 1371}
@@ -1516,14 +1537,6 @@ static void check_unshare_flags(unsigned long *flags_ptr)
1516 *flags_ptr |= CLONE_SIGHAND; 1537 *flags_ptr |= CLONE_SIGHAND;
1517 1538
1518 /* 1539 /*
1519 * If unsharing signal handlers and the task was created
1520 * using CLONE_THREAD, then must unshare the thread
1521 */
1522 if ((*flags_ptr & CLONE_SIGHAND) &&
1523 (atomic_read(&current->signal->count) > 1))
1524 *flags_ptr |= CLONE_THREAD;
1525
1526 /*
1527 * If unsharing namespace, must also unshare filesystem information. 1540 * If unsharing namespace, must also unshare filesystem information.
1528 */ 1541 */
1529 if (*flags_ptr & CLONE_NEWNS) 1542 if (*flags_ptr & CLONE_NEWNS)
@@ -1673,13 +1686,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1673 1686
1674 if (new_fs) { 1687 if (new_fs) {
1675 fs = current->fs; 1688 fs = current->fs;
1676 write_lock(&fs->lock); 1689 spin_lock(&fs->lock);
1677 current->fs = new_fs; 1690 current->fs = new_fs;
1678 if (--fs->users) 1691 if (--fs->users)
1679 new_fs = NULL; 1692 new_fs = NULL;
1680 else 1693 else
1681 new_fs = fs; 1694 new_fs = fs;
1682 write_unlock(&fs->lock); 1695 spin_unlock(&fs->lock);
1683 } 1696 }
1684 1697
1685 if (new_mm) { 1698 if (new_mm) {
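dup_mmap() now threads a back pointer (vm_prev) through the copied VMA chain while it builds the forward links. The generic sketch below (struct node, copy_list and dup are illustrative, not kernel API) shows the same two-pointer bookkeeping in isolation:

struct node {
	struct node *next;
	struct node *prev;
	int payload;
};

static void copy_list(struct node *src, struct node **dst_head,
		      struct node *(*dup)(struct node *))
{
	struct node **pprev = dst_head;	/* where the next copy gets hooked in */
	struct node *prev = NULL;	/* last copy we produced */
	struct node *tmp;

	for (; src; src = src->next) {
		tmp = dup(src);
		tmp->next = NULL;
		tmp->prev = prev;	/* back link, like tmp->vm_prev = prev */
		*pprev = tmp;		/* forward link from the previous copy */
		pprev = &tmp->next;
		prev = tmp;
	}
}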
diff --git a/kernel/futex.c b/kernel/futex.c
index e7a35f1039e7..6a3a5fa1526d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
429static struct task_struct * futex_find_get_task(pid_t pid) 429static struct task_struct * futex_find_get_task(pid_t pid)
430{ 430{
431 struct task_struct *p; 431 struct task_struct *p;
432 const struct cred *cred = current_cred(), *pcred;
433 432
434 rcu_read_lock(); 433 rcu_read_lock();
435 p = find_task_by_vpid(pid); 434 p = find_task_by_vpid(pid);
436 if (!p) { 435 if (p)
437 p = ERR_PTR(-ESRCH); 436 get_task_struct(p);
438 } else {
439 pcred = __task_cred(p);
440 if (cred->euid != pcred->euid &&
441 cred->euid != pcred->uid)
442 p = ERR_PTR(-ESRCH);
443 else
444 get_task_struct(p);
445 }
446 437
447 rcu_read_unlock(); 438 rcu_read_unlock();
448 439
@@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
564 if (!pid) 555 if (!pid)
565 return -ESRCH; 556 return -ESRCH;
566 p = futex_find_get_task(pid); 557 p = futex_find_get_task(pid);
567 if (IS_ERR(p)) 558 if (!p)
568 return PTR_ERR(p); 559 return -ESRCH;
569 560
570 /* 561 /*
571 * We need to look at the task state flags to figure out, 562 * We need to look at the task state flags to figure out,
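With the euid check gone, futex_find_get_task() reduces to the standard pid-to-task lookup under RCU. A hedged restatement of that idiom as a generic helper; get_task_by_vpid is an invented name, the calls it makes are real kernel API:

#include <linux/pid.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static struct task_struct *get_task_by_vpid(pid_t pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(pid);	/* only valid while rcu_read_lock() is held */
	if (p)
		get_task_struct(p);	/* pin it before leaving the RCU section */
	rcu_read_unlock();

	return p;			/* caller does put_task_struct() when done */
}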
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index ef3c3f88a7a3..f83972b16564 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -33,10 +33,11 @@
33 * @children: child nodes 33 * @children: child nodes
34 * @all: list head for list of all nodes 34 * @all: list head for list of all nodes
35 * @parent: parent node 35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory 36 * @loaded_info: array of pointers to profiling data sets for loaded object
37 * @ghost: when an object file containing profiling data is unloaded we keep a 37 * files.
38 * copy of the profiling data here to allow collecting coverage data 38 * @num_loaded: number of profiling data sets for loaded object files.
39 * for cleanup code. Such a node is called a "ghost". 39 * @unloaded_info: accumulated copy of profiling data sets for unloaded
40 * object files. Used only when gcov_persist=1.
40 * @dentry: main debugfs entry, either a directory or data file 41 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links 42 * @links: associated symbolic links
42 * @name: data file basename 43 * @name: data file basename
@@ -51,10 +52,11 @@ struct gcov_node {
51 struct list_head children; 52 struct list_head children;
52 struct list_head all; 53 struct list_head all;
53 struct gcov_node *parent; 54 struct gcov_node *parent;
54 struct gcov_info *info; 55 struct gcov_info **loaded_info;
55 struct gcov_info *ghost; 56 struct gcov_info *unloaded_info;
56 struct dentry *dentry; 57 struct dentry *dentry;
57 struct dentry **links; 58 struct dentry **links;
59 int num_loaded;
58 char name[0]; 60 char name[0];
59}; 61};
60 62
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = {
136}; 138};
137 139
138/* 140/*
139 * Return the profiling data set for a given node. This can either be the 141 * Return a profiling data set associated with the given node. This is
140 * original profiling data structure or a duplicate (also called "ghost") 142 * either a data set for a loaded object file or a data set copy in case
141 * in case the associated object file has been unloaded. 143 * all associated object files have been unloaded.
142 */ 144 */
143static struct gcov_info *get_node_info(struct gcov_node *node) 145static struct gcov_info *get_node_info(struct gcov_node *node)
144{ 146{
145 if (node->info) 147 if (node->num_loaded > 0)
146 return node->info; 148 return node->loaded_info[0];
147 149
148 return node->ghost; 150 return node->unloaded_info;
151}
152
153/*
154 * Return a newly allocated profiling data set which contains the sum of
155 * all profiling data associated with the given node.
156 */
157static struct gcov_info *get_accumulated_info(struct gcov_node *node)
158{
159 struct gcov_info *info;
160 int i = 0;
161
162 if (node->unloaded_info)
163 info = gcov_info_dup(node->unloaded_info);
164 else
165 info = gcov_info_dup(node->loaded_info[i++]);
166 if (!info)
167 return NULL;
168 for (; i < node->num_loaded; i++)
169 gcov_info_add(info, node->loaded_info[i]);
170
171 return info;
149} 172}
150 173
151/* 174/*
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file)
163 mutex_lock(&node_lock); 186 mutex_lock(&node_lock);
164 /* 187 /*
165 * Read from a profiling data copy to minimize reference tracking 188 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access. 189 * complexity and concurrent access and to keep accumulating multiple
190 * profiling data sets associated with one node simple.
167 */ 191 */
168 info = gcov_info_dup(get_node_info(node)); 192 info = get_accumulated_info(node);
169 if (!info) 193 if (!info)
170 goto out_unlock; 194 goto out_unlock;
171 iter = gcov_iter_new(info); 195 iter = gcov_iter_new(info);
@@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name)
225 return NULL; 249 return NULL;
226} 250}
227 251
252/*
253 * Reset all profiling data associated with the specified node.
254 */
255static void reset_node(struct gcov_node *node)
256{
257 int i;
258
259 if (node->unloaded_info)
260 gcov_info_reset(node->unloaded_info);
261 for (i = 0; i < node->num_loaded; i++)
262 gcov_info_reset(node->loaded_info[i]);
263}
264
228static void remove_node(struct gcov_node *node); 265static void remove_node(struct gcov_node *node);
229 266
230/* 267/*
231 * write() implementation for gcov data files. Reset profiling data for the 268 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is 269 * corresponding file. If all associated object files have been unloaded,
233 * a "ghost" node), remove the debug fs node as well. 270 * remove the debug fs node as well.
234 */ 271 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr, 272static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos) 273 size_t len, loff_t *pos)
@@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
245 node = get_node_by_name(info->filename); 282 node = get_node_by_name(info->filename);
246 if (node) { 283 if (node) {
247 /* Reset counts or remove node for unloaded modules. */ 284 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost) 285 if (node->num_loaded == 0)
249 remove_node(node); 286 remove_node(node);
250 else 287 else
251 gcov_info_reset(node->info); 288 reset_node(node);
252 } 289 }
253 /* Reset counts for open file. */ 290 /* Reset counts for open file. */
254 gcov_info_reset(info); 291 gcov_info_reset(info);
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info,
378 INIT_LIST_HEAD(&node->list); 415 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children); 416 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all); 417 INIT_LIST_HEAD(&node->all);
381 node->info = info; 418 if (node->loaded_info) {
419 node->loaded_info[0] = info;
420 node->num_loaded = 1;
421 }
382 node->parent = parent; 422 node->parent = parent;
383 if (name) 423 if (name)
384 strcpy(node->name, name); 424 strcpy(node->name, name);
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent,
394 struct gcov_node *node; 434 struct gcov_node *node;
395 435
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); 436 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) { 437 if (!node)
398 pr_warning("out of memory\n"); 438 goto err_nomem;
399 return NULL; 439 if (info) {
440 node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
441 GFP_KERNEL);
442 if (!node->loaded_info)
443 goto err_nomem;
400 } 444 }
401 init_node(node, info, name, parent); 445 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */ 446 /* Differentiate between gcov data file nodes and directory nodes. */
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent,
416 list_add(&node->all, &all_head); 460 list_add(&node->all, &all_head);
417 461
418 return node; 462 return node;
463
464err_nomem:
465 kfree(node);
466 pr_warning("out of memory\n");
467 return NULL;
419} 468}
420 469
421/* Remove symbolic links associated with node. */ 470/* Remove symbolic links associated with node. */
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node)
441 list_del(&node->all); 490 list_del(&node->all);
442 debugfs_remove(node->dentry); 491 debugfs_remove(node->dentry);
443 remove_links(node); 492 remove_links(node);
444 if (node->ghost) 493 kfree(node->loaded_info);
445 gcov_info_free(node->ghost); 494 if (node->unloaded_info)
495 gcov_info_free(node->unloaded_info);
446 kfree(node); 496 kfree(node);
447} 497}
448 498
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent,
477 527
478/* 528/*
479 * write() implementation for reset file. Reset all profiling data to zero 529 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes. 530 * and remove nodes for which all associated object files are unloaded.
481 */ 531 */
482static ssize_t reset_write(struct file *file, const char __user *addr, 532static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos) 533 size_t len, loff_t *pos)
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr,
487 mutex_lock(&node_lock); 537 mutex_lock(&node_lock);
488restart: 538restart:
489 list_for_each_entry(node, &all_head, all) { 539 list_for_each_entry(node, &all_head, all) {
490 if (node->info) 540 if (node->num_loaded > 0)
491 gcov_info_reset(node->info); 541 reset_node(node);
492 else if (list_empty(&node->children)) { 542 else if (list_empty(&node->children)) {
493 remove_node(node); 543 remove_node(node);
494 /* Several nodes may have gone - restart loop. */ 544 /* Several nodes may have gone - restart loop. */
@@ -564,37 +614,115 @@ err_remove:
564} 614}
565 615
566/* 616/*
567 * The profiling data set associated with this node is being unloaded. Store a 617 * Associate a profiling data set with an existing node. Needs to be called
568 * copy of the profiling data and turn this node into a "ghost". 618 * with node_lock held.
569 */ 619 */
570static int ghost_node(struct gcov_node *node) 620static void add_info(struct gcov_node *node, struct gcov_info *info)
571{ 621{
572 node->ghost = gcov_info_dup(node->info); 622 struct gcov_info **loaded_info;
573 if (!node->ghost) { 623 int num = node->num_loaded;
574 pr_warning("could not save data for '%s' (out of memory)\n", 624
575 node->info->filename); 625 /*
576 return -ENOMEM; 626 * Prepare new array. This is done first to simplify cleanup in
627 * case the new data set is incompatible, the node only contains
628 * unloaded data sets and there's not enough memory for the array.
629 */
630 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
631 if (!loaded_info) {
632 pr_warning("could not add '%s' (out of memory)\n",
633 info->filename);
634 return;
635 }
636 memcpy(loaded_info, node->loaded_info,
637 num * sizeof(struct gcov_info *));
638 loaded_info[num] = info;
639 /* Check if the new data set is compatible. */
640 if (num == 0) {
641 /*
642 * A module was unloaded, modified and reloaded. The new
643 * data set replaces the copy of the last one.
644 */
645 if (!gcov_info_is_compatible(node->unloaded_info, info)) {
646 pr_warning("discarding saved data for %s "
647 "(incompatible version)\n", info->filename);
648 gcov_info_free(node->unloaded_info);
649 node->unloaded_info = NULL;
650 }
651 } else {
652 /*
653 * Two different versions of the same object file are loaded.
654 * The initial one takes precedence.
655 */
656 if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
657 pr_warning("could not add '%s' (incompatible "
658 "version)\n", info->filename);
659 kfree(loaded_info);
660 return;
661 }
577 } 662 }
578 node->info = NULL; 663 /* Overwrite previous array. */
664 kfree(node->loaded_info);
665 node->loaded_info = loaded_info;
666 node->num_loaded = num + 1;
667}
579 668
580 return 0; 669/*
670 * Return the index of a profiling data set associated with a node.
671 */
672static int get_info_index(struct gcov_node *node, struct gcov_info *info)
673{
674 int i;
675
676 for (i = 0; i < node->num_loaded; i++) {
677 if (node->loaded_info[i] == info)
678 return i;
679 }
680 return -ENOENT;
581} 681}
582 682
583/* 683/*
584 * Profiling data for this node has been loaded again. Add profiling data 684 * Save the data of a profiling data set which is being unloaded.
585 * from previous instantiation and turn this node into a regular node.
586 */ 685 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info) 686static void save_info(struct gcov_node *node, struct gcov_info *info)
588{ 687{
589 if (gcov_info_is_compatible(node->ghost, info)) 688 if (node->unloaded_info)
590 gcov_info_add(info, node->ghost); 689 gcov_info_add(node->unloaded_info, info);
591 else { 690 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n", 691 node->unloaded_info = gcov_info_dup(info);
692 if (!node->unloaded_info) {
693 pr_warning("could not save data for '%s' "
694 "(out of memory)\n", info->filename);
695 }
696 }
697}
698
699/*
700 * Disassociate a profiling data set from a node. Needs to be called with
701 * node_lock held.
702 */
703static void remove_info(struct gcov_node *node, struct gcov_info *info)
704{
705 int i;
706
707 i = get_info_index(node, info);
708 if (i < 0) {
709 pr_warning("could not remove '%s' (not found)\n",
593 info->filename); 710 info->filename);
711 return;
594 } 712 }
595 gcov_info_free(node->ghost); 713 if (gcov_persist)
596 node->ghost = NULL; 714 save_info(node, info);
597 node->info = info; 715 /* Shrink array. */
716 node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
717 node->num_loaded--;
718 if (node->num_loaded > 0)
719 return;
720 /* Last loaded data set was removed. */
721 kfree(node->loaded_info);
722 node->loaded_info = NULL;
723 node->num_loaded = 0;
724 if (!node->unloaded_info)
725 remove_node(node);
598} 726}
599 727
600/* 728/*
@@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
609 node = get_node_by_name(info->filename); 737 node = get_node_by_name(info->filename);
610 switch (action) { 738 switch (action) {
611 case GCOV_ADD: 739 case GCOV_ADD:
612 /* Add new node or revive ghost. */ 740 if (node)
613 if (!node) { 741 add_info(node, info);
742 else
614 add_node(info); 743 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break; 744 break;
624 case GCOV_REMOVE: 745 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */ 746 if (node)
626 if (!node) { 747 remove_info(node, info);
748 else {
627 pr_warning("could not remove '%s' (not found)\n", 749 pr_warning("could not remove '%s' (not found)\n",
628 info->filename); 750 info->filename);
629 break;
630 } 751 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break; 752 break;
637 } 753 }
638 mutex_unlock(&node_lock); 754 mutex_unlock(&node_lock);
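add_info() above grows the loaded_info array by allocating the larger array first, copying, and only then swapping it in, so an allocation failure leaves the node untouched. A generic sketch of that grow-then-swap pattern (ptr_array_append is an illustrative name):

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

static int ptr_array_append(void ***arr, int *num, void *item)
{
	void **grown;

	grown = kcalloc(*num + 1, sizeof(void *), GFP_KERNEL);
	if (!grown)
		return -ENOMEM;			/* old array is left untouched */
	if (*num)
		memcpy(grown, *arr, *num * sizeof(void *));
	grown[*num] = item;

	kfree(*arr);				/* swap in the new array only now */
	*arr = grown;
	(*num)++;
	return 0;
}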
diff --git a/kernel/groups.c b/kernel/groups.c
index 2b45b2ee3964..253dc0f35cf4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid); 146 if (grp > GROUP_AT(group_info, mid))
147 if (cmp > 0)
148 left = mid + 1; 147 left = mid + 1;
149 else if (cmp < 0) 148 else if (grp < GROUP_AT(group_info, mid))
150 right = mid; 149 right = mid;
151 else 150 else
152 return 1; 151 return 1;
@@ -164,12 +163,6 @@ int groups_search(const struct group_info *group_info, gid_t grp)
164 */ 163 */
165int set_groups(struct cred *new, struct group_info *group_info) 164int set_groups(struct cred *new, struct group_info *group_info)
166{ 165{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info); 166 put_group_info(new->group_info);
174 groups_sort(group_info); 167 groups_sort(group_info);
175 get_group_info(group_info); 168 get_group_info(group_info);
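The groups_search() change above replaces a subtraction-based comparison with explicit greater/less tests because gid_t is unsigned and the difference can wrap. A short user-space demonstration of the failure mode, assuming a 32-bit unsigned gid:

#include <stdio.h>

int main(void)
{
	unsigned int a = 1, b = 0xfffffffe;	/* e.g. gid 1 vs gid 4294967294 */
	int cmp = a - b;			/* wraps around to +3: "a > b", which is wrong */

	printf("subtraction says %s\n", cmp > 0 ? "a > b" : "a <= b");
	printf("comparison  says %s\n", a > b ? "a > b" : "a <= b");
	return 0;
}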
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index fdf95968e517..cb49883b64e5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -91,8 +91,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
91 91
92 do { 92 do {
93 seq = read_seqbegin(&xtime_lock); 93 seq = read_seqbegin(&xtime_lock);
94 xts = current_kernel_time(); 94 xts = __current_kernel_time();
95 tom = wall_to_monotonic; 95 tom = __get_wall_to_monotonic();
96 } while (read_seqretry(&xtime_lock, seq)); 96 } while (read_seqretry(&xtime_lock, seq));
97 97
98 xtim = timespec_to_ktime(xts); 98 xtim = timespec_to_ktime(xts);
@@ -146,12 +146,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
146static int hrtimer_get_target(int this_cpu, int pinned) 146static int hrtimer_get_target(int this_cpu, int pinned)
147{ 147{
148#ifdef CONFIG_NO_HZ 148#ifdef CONFIG_NO_HZ
149 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { 149 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
150 int preferred_cpu = get_nohz_load_balancer(); 150 return get_nohz_timer_target();
151
152 if (preferred_cpu >= 0)
153 return preferred_cpu;
154 }
155#endif 151#endif
156 return this_cpu; 152 return this_cpu;
157} 153}
@@ -614,7 +610,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
614static void retrigger_next_event(void *arg) 610static void retrigger_next_event(void *arg)
615{ 611{
616 struct hrtimer_cpu_base *base; 612 struct hrtimer_cpu_base *base;
617 struct timespec realtime_offset; 613 struct timespec realtime_offset, wtm;
618 unsigned long seq; 614 unsigned long seq;
619 615
620 if (!hrtimer_hres_active()) 616 if (!hrtimer_hres_active())
@@ -622,10 +618,9 @@ static void retrigger_next_event(void *arg)
622 618
623 do { 619 do {
624 seq = read_seqbegin(&xtime_lock); 620 seq = read_seqbegin(&xtime_lock);
625 set_normalized_timespec(&realtime_offset, 621 wtm = __get_wall_to_monotonic();
626 -wall_to_monotonic.tv_sec,
627 -wall_to_monotonic.tv_nsec);
628 } while (read_seqretry(&xtime_lock, seq)); 622 } while (read_seqretry(&xtime_lock, seq));
623 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
629 624
630 base = &__get_cpu_var(hrtimer_bases); 625 base = &__get_cpu_var(hrtimer_bases);
631 626
@@ -938,6 +933,7 @@ static inline int
938remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) 933remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
939{ 934{
940 if (hrtimer_is_queued(timer)) { 935 if (hrtimer_is_queued(timer)) {
936 unsigned long state;
941 int reprogram; 937 int reprogram;
942 938
943 /* 939 /*
@@ -951,8 +947,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
951 debug_deactivate(timer); 947 debug_deactivate(timer);
952 timer_stats_hrtimer_clear_start_info(timer); 948 timer_stats_hrtimer_clear_start_info(timer);
953 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 949 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
954 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 950 /*
955 reprogram); 951 * We must preserve the CALLBACK state flag here,
952 * otherwise we could move the timer base in
953 * switch_hrtimer_base.
954 */
955 state = timer->state & HRTIMER_STATE_CALLBACK;
956 __remove_hrtimer(timer, base, state, reprogram);
956 return 1; 957 return 1;
957 } 958 }
958 return 0; 959 return 0;
@@ -1190,11 +1191,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
1190 */ 1191 */
1191ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 1192ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1192{ 1193{
1193 struct hrtimer_clock_base *base;
1194 unsigned long flags; 1194 unsigned long flags;
1195 ktime_t rem; 1195 ktime_t rem;
1196 1196
1197 base = lock_hrtimer_base(timer, &flags); 1197 lock_hrtimer_base(timer, &flags);
1198 rem = hrtimer_expires_remaining(timer); 1198 rem = hrtimer_expires_remaining(timer);
1199 unlock_hrtimer_base(timer, &flags); 1199 unlock_hrtimer_base(timer, &flags);
1200 1200
@@ -1331,6 +1331,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1331 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1331 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1332 enqueue_hrtimer(timer, base); 1332 enqueue_hrtimer(timer, base);
1333 } 1333 }
1334
1335 WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
1336
1334 timer->state &= ~HRTIMER_STATE_CALLBACK; 1337 timer->state &= ~HRTIMER_STATE_CALLBACK;
1335} 1338}
1336 1339
@@ -1844,35 +1847,15 @@ void __init hrtimers_init(void)
1844} 1847}
1845 1848
1846/** 1849/**
1847 * schedule_hrtimeout_range - sleep until timeout 1850 * schedule_hrtimeout_range_clock - sleep until timeout
1848 * @expires: timeout value (ktime_t) 1851 * @expires: timeout value (ktime_t)
1849 * @delta: slack in expires timeout (ktime_t) 1852 * @delta: slack in expires timeout (ktime_t)
1850 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL 1853 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1851 * 1854 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1852 * Make the current task sleep until the given expiry time has
1853 * elapsed. The routine will return immediately unless
1854 * the current task state has been set (see set_current_state()).
1855 *
1856 * The @delta argument gives the kernel the freedom to schedule the
1857 * actual wakeup to a time that is both power and performance friendly.
1858 * The kernel give the normal best effort behavior for "@expires+@delta",
1859 * but may decide to fire the timer earlier, but no earlier than @expires.
1860 *
1861 * You can set the task state as follows -
1862 *
1863 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1864 * pass before the routine returns.
1865 *
1866 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1867 * delivered to the current task.
1868 *
1869 * The current task state is guaranteed to be TASK_RUNNING when this
1870 * routine returns.
1871 *
1872 * Returns 0 when the timer has expired otherwise -EINTR
1873 */ 1855 */
1874int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, 1856int __sched
1875 const enum hrtimer_mode mode) 1857schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1858 const enum hrtimer_mode mode, int clock)
1876{ 1859{
1877 struct hrtimer_sleeper t; 1860 struct hrtimer_sleeper t;
1878 1861
@@ -1894,7 +1877,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1894 return -EINTR; 1877 return -EINTR;
1895 } 1878 }
1896 1879
1897 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); 1880 hrtimer_init_on_stack(&t.timer, clock, mode);
1898 hrtimer_set_expires_range_ns(&t.timer, *expires, delta); 1881 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1899 1882
1900 hrtimer_init_sleeper(&t, current); 1883 hrtimer_init_sleeper(&t, current);
@@ -1913,6 +1896,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1913 1896
1914 return !t.task ? 0 : -EINTR; 1897 return !t.task ? 0 : -EINTR;
1915} 1898}
1899
1900/**
1901 * schedule_hrtimeout_range - sleep until timeout
1902 * @expires: timeout value (ktime_t)
1903 * @delta: slack in expires timeout (ktime_t)
1904 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1905 *
1906 * Make the current task sleep until the given expiry time has
1907 * elapsed. The routine will return immediately unless
1908 * the current task state has been set (see set_current_state()).
1909 *
1910 * The @delta argument gives the kernel the freedom to schedule the
1911 * actual wakeup to a time that is both power and performance friendly.
1912 * The kernel give the normal best effort behavior for "@expires+@delta",
1913 * but may decide to fire the timer earlier, but no earlier than @expires.
1914 *
1915 * You can set the task state as follows -
1916 *
1917 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1918 * pass before the routine returns.
1919 *
1920 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1921 * delivered to the current task.
1922 *
1923 * The current task state is guaranteed to be TASK_RUNNING when this
1924 * routine returns.
1925 *
1926 * Returns 0 when the timer has expired otherwise -EINTR
1927 */
1928int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1929 const enum hrtimer_mode mode)
1930{
1931 return schedule_hrtimeout_range_clock(expires, delta, mode,
1932 CLOCK_MONOTONIC);
1933}
1916EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); 1934EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1917 1935
1918/** 1936/**
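The kerneldoc moved above describes schedule_hrtimeout_range(), which now forwards to schedule_hrtimeout_range_clock() with CLOCK_MONOTONIC. A hedged usage sketch (sleep_about_10ms is an invented helper) of the documented pattern: set the task state first, then sleep with slack.

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>

static int sleep_about_10ms(void)
{
	ktime_t expires = ktime_set(0, 10 * NSEC_PER_MSEC);	/* relative 10 ms */

	set_current_state(TASK_INTERRUPTIBLE);
	/* 0 if the timer expired, -EINTR if a signal woke us up first */
	return schedule_hrtimeout_range(&expires, NSEC_PER_MSEC,
					HRTIMER_MODE_REL);
}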
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 03808ed342a6..c7c2aed9e2dc 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,23 +40,33 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h>
44#include <linux/list.h>
43#include <linux/cpu.h> 45#include <linux/cpu.h>
44#include <linux/smp.h> 46#include <linux/smp.h>
45 47
46#include <linux/hw_breakpoint.h> 48#include <linux/hw_breakpoint.h>
47 49
50
48/* 51/*
49 * Constraints data 52 * Constraints data
50 */ 53 */
51 54
52/* Number of pinned cpu breakpoints in a cpu */ 55/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 56static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
54 57
55/* Number of pinned task breakpoints in a cpu */ 58/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); 59static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
57 60
58/* Number of non-pinned cpu/task breakpoints in a cpu */ 61/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 62static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
63
64static int nr_slots[TYPE_MAX];
65
66/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head);
68
69static int constraints_initialized;
60 70
61/* Gather the number of total pinned and un-pinned bp in a cpuset */ 71/* Gather the number of total pinned and un-pinned bp in a cpuset */
62struct bp_busy_slots { 72struct bp_busy_slots {
@@ -67,16 +77,29 @@ struct bp_busy_slots {
67/* Serialize accesses to the above constraints */ 77/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex); 78static DEFINE_MUTEX(nr_bp_mutex);
69 79
80__weak int hw_breakpoint_weight(struct perf_event *bp)
81{
82 return 1;
83}
84
85static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
86{
87 if (bp->attr.bp_type & HW_BREAKPOINT_RW)
88 return TYPE_DATA;
89
90 return TYPE_INST;
91}
92
70/* 93/*
 71 * Report the maximum number of pinned breakpoints a task 94 * Report the maximum number of pinned breakpoints a task
72 * have in this cpu 95 * have in this cpu
73 */ 96 */
74static unsigned int max_task_bp_pinned(int cpu) 97static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
75{ 98{
76 int i; 99 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 100 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
78 101
79 for (i = HBP_NUM -1; i >= 0; i--) { 102 for (i = nr_slots[type] - 1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0) 103 if (tsk_pinned[i] > 0)
81 return i + 1; 104 return i + 1;
82 } 105 }
@@ -84,32 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu)
84 return 0; 107 return 0;
85} 108}
86 109
87static int task_bp_pinned(struct task_struct *tsk) 110/*
111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list.
113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
88{ 115{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp; 116 struct perf_event_context *ctx = bp->ctx;
90 struct list_head *list; 117 struct perf_event *iter;
91 struct perf_event *bp;
92 unsigned long flags;
93 int count = 0; 118 int count = 0;
94 119
95 if (WARN_ONCE(!ctx, "No perf context for this task")) 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
96 return 0; 121 if (iter->ctx == ctx && find_slot_idx(iter) == type)
97 122 count += hw_breakpoint_weight(iter);
98 list = &ctx->event_list;
99
100 raw_spin_lock_irqsave(&ctx->lock, flags);
101
102 /*
103 * The current breakpoint counter is not included in the list
104 * at the open() callback time
105 */
106 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++;
109 } 123 }
110 124
111 raw_spin_unlock_irqrestore(&ctx->lock, flags);
112
113 return count; 125 return count;
114} 126}
115 127
@@ -118,18 +130,19 @@ static int task_bp_pinned(struct task_struct *tsk)
118 * a given cpu (cpu > -1) or in all of them (cpu = -1). 130 * a given cpu (cpu > -1) or in all of them (cpu = -1).
119 */ 131 */
120static void 132static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) 133fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
134 enum bp_type_idx type)
122{ 135{
123 int cpu = bp->cpu; 136 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task; 137 struct task_struct *tsk = bp->ctx->task;
125 138
126 if (cpu >= 0) { 139 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
128 if (!tsk) 141 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu); 142 slots->pinned += max_task_bp_pinned(cpu, type);
130 else 143 else
131 slots->pinned += task_bp_pinned(tsk); 144 slots->pinned += task_bp_pinned(bp, type);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu); 145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
133 146
134 return; 147 return;
135 } 148 }
@@ -137,16 +150,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
137 for_each_online_cpu(cpu) { 150 for_each_online_cpu(cpu) {
138 unsigned int nr; 151 unsigned int nr;
139 152
140 nr = per_cpu(nr_cpu_bp_pinned, cpu); 153 nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
141 if (!tsk) 154 if (!tsk)
142 nr += max_task_bp_pinned(cpu); 155 nr += max_task_bp_pinned(cpu, type);
143 else 156 else
144 nr += task_bp_pinned(tsk); 157 nr += task_bp_pinned(bp, type);
145 158
146 if (nr > slots->pinned) 159 if (nr > slots->pinned)
147 slots->pinned = nr; 160 slots->pinned = nr;
148 161
149 nr = per_cpu(nr_bp_flexible, cpu); 162 nr = per_cpu(nr_bp_flexible[type], cpu);
150 163
151 if (nr > slots->flexible) 164 if (nr > slots->flexible)
152 slots->flexible = nr; 165 slots->flexible = nr;
@@ -154,52 +167,89 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
154} 167}
155 168
156/* 169/*
170 * For now, continue to consider flexible as pinned, until we can
171 * ensure no flexible event can ever be scheduled before a pinned event
172 * in a same cpu.
173 */
174static void
175fetch_this_slot(struct bp_busy_slots *slots, int weight)
176{
177 slots->pinned += weight;
178}
179
180/*
157 * Add a pinned breakpoint for the given task in our constraint table 181 * Add a pinned breakpoint for the given task in our constraint table
158 */ 182 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 183static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
184 enum bp_type_idx type, int weight)
160{ 185{
161 unsigned int *tsk_pinned; 186 unsigned int *tsk_pinned;
162 int count = 0; 187 int old_count = 0;
188 int old_idx = 0;
189 int idx = 0;
163 190
164 count = task_bp_pinned(tsk); 191 old_count = task_bp_pinned(bp, type);
192 old_idx = old_count - 1;
193 idx = old_idx + weight;
165 194
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 195 /* tsk_pinned[n] is the number of tasks having n breakpoints */
196 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
167 if (enable) { 197 if (enable) {
168 tsk_pinned[count]++; 198 tsk_pinned[idx]++;
169 if (count > 0) 199 if (old_count > 0)
170 tsk_pinned[count-1]--; 200 tsk_pinned[old_idx]--;
171 } else { 201 } else {
172 tsk_pinned[count]--; 202 tsk_pinned[idx]--;
173 if (count > 0) 203 if (old_count > 0)
174 tsk_pinned[count-1]++; 204 tsk_pinned[old_idx]++;
175 } 205 }
176} 206}
177 207
178/* 208/*
179 * Add/remove the given breakpoint in our constraint table 209 * Add/remove the given breakpoint in our constraint table
180 */ 210 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable) 211static void
212toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
213 int weight)
182{ 214{
183 int cpu = bp->cpu; 215 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->ctx->task;
185 217
218 /* Pinned counter cpu profiling */
219 if (!tsk) {
220
221 if (enable)
222 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
223 else
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
225 return;
226 }
227
186 /* Pinned counter task profiling */ 228 /* Pinned counter task profiling */
187 if (tsk) {
188 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable);
190 return;
191 }
192 229
230 if (!enable)
231 list_del(&bp->hw.bp_list);
232
233 if (cpu >= 0) {
234 toggle_bp_task_slot(bp, cpu, enable, type, weight);
235 } else {
193 for_each_online_cpu(cpu) 236 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable); 237 toggle_bp_task_slot(bp, cpu, enable, type, weight);
195 return;
196 } 238 }
197 239
198 /* Pinned counter cpu profiling */
199 if (enable) 240 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++; 241 list_add_tail(&bp->hw.bp_list, &bp_task_head);
201 else 242}
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--; 243
244/*
245 * Function to perform processor-specific cleanup during unregistration
246 */
247__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
248{
249 /*
250 * A weak stub function here for those archs that don't define
251 * it inside arch/.../kernel/hw_breakpoint.c
252 */
203} 253}
204 254
205/* 255/*
@@ -246,14 +296,33 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
246static int __reserve_bp_slot(struct perf_event *bp) 296static int __reserve_bp_slot(struct perf_event *bp)
247{ 297{
248 struct bp_busy_slots slots = {0}; 298 struct bp_busy_slots slots = {0};
299 enum bp_type_idx type;
300 int weight;
301
302 /* We couldn't initialize breakpoint constraints on boot */
303 if (!constraints_initialized)
304 return -ENOMEM;
249 305
250 fetch_bp_busy_slots(&slots, bp); 306 /* Basic checks */
307 if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
308 bp->attr.bp_type == HW_BREAKPOINT_INVALID)
309 return -EINVAL;
310
311 type = find_slot_idx(bp);
312 weight = hw_breakpoint_weight(bp);
313
314 fetch_bp_busy_slots(&slots, bp, type);
315 /*
316 * Simulate the addition of this breakpoint to the constraints
317 * and see the result.
318 */
319 fetch_this_slot(&slots, weight);
251 320
252 /* Flexible counters need to keep at least one slot */ 321 /* Flexible counters need to keep at least one slot */
253 if (slots.pinned + (!!slots.flexible) == HBP_NUM) 322 if (slots.pinned + (!!slots.flexible) > nr_slots[type])
254 return -ENOSPC; 323 return -ENOSPC;
255 324
256 toggle_bp_slot(bp, true); 325 toggle_bp_slot(bp, true, type, weight);
257 326
258 return 0; 327 return 0;
259} 328}
@@ -273,13 +342,19 @@ int reserve_bp_slot(struct perf_event *bp)
273 342
274static void __release_bp_slot(struct perf_event *bp) 343static void __release_bp_slot(struct perf_event *bp)
275{ 344{
276 toggle_bp_slot(bp, false); 345 enum bp_type_idx type;
346 int weight;
347
348 type = find_slot_idx(bp);
349 weight = hw_breakpoint_weight(bp);
350 toggle_bp_slot(bp, false, type, weight);
277} 351}
278 352
279void release_bp_slot(struct perf_event *bp) 353void release_bp_slot(struct perf_event *bp)
280{ 354{
281 mutex_lock(&nr_bp_mutex); 355 mutex_lock(&nr_bp_mutex);
282 356
357 arch_unregister_hw_breakpoint(bp);
283 __release_bp_slot(bp); 358 __release_bp_slot(bp);
284 359
285 mutex_unlock(&nr_bp_mutex); 360 mutex_unlock(&nr_bp_mutex);
@@ -308,6 +383,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
308 return 0; 383 return 0;
309} 384}
310 385
386static int validate_hw_breakpoint(struct perf_event *bp)
387{
388 int ret;
389
390 ret = arch_validate_hwbkpt_settings(bp);
391 if (ret)
392 return ret;
393
394 if (arch_check_bp_in_kernelspace(bp)) {
395 if (bp->attr.exclude_kernel)
396 return -EINVAL;
397 /*
398 * Don't let unprivileged users set a breakpoint in the trap
399 * path to avoid trap recursion attacks.
400 */
401 if (!capable(CAP_SYS_ADMIN))
402 return -EPERM;
403 }
404
405 return 0;
406}
407
311int register_perf_hw_breakpoint(struct perf_event *bp) 408int register_perf_hw_breakpoint(struct perf_event *bp)
312{ 409{
313 int ret; 410 int ret;
@@ -316,17 +413,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
316 if (ret) 413 if (ret)
317 return ret; 414 return ret;
318 415
319 /* 416 ret = validate_hw_breakpoint(bp);
320 * Ptrace breakpoints can be temporary perf events only
321 * meant to reserve a slot. In this case, it is created disabled and
322 * we don't want to check the params right now (as we put a null addr)
323 * But perf tools create events as disabled and we want to check
324 * the params for them.
325 * This is a quick hack that will be removed soon, once we remove
326 * the tmp breakpoints from ptrace
327 */
328 if (!bp->attr.disabled || !bp->overflow_handler)
329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
330 417
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */ 418 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret) 419 if (ret)
@@ -346,7 +433,8 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
346 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
347 struct task_struct *tsk) 434 struct task_struct *tsk)
348{ 435{
349 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 436 return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
437 triggered);
350} 438}
351EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 439EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
352 440
@@ -373,7 +461,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
373 if (attr->disabled) 461 if (attr->disabled)
374 goto end; 462 goto end;
375 463
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 464 err = validate_hw_breakpoint(bp);
377 if (!err) 465 if (!err)
378 perf_event_enable(bp); 466 perf_event_enable(bp);
379 467
@@ -480,7 +568,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
480 568
481static int __init init_hw_breakpoint(void) 569static int __init init_hw_breakpoint(void)
482{ 570{
571 unsigned int **task_bp_pinned;
572 int cpu, err_cpu;
573 int i;
574
575 for (i = 0; i < TYPE_MAX; i++)
576 nr_slots[i] = hw_breakpoint_slots(i);
577
578 for_each_possible_cpu(cpu) {
579 for (i = 0; i < TYPE_MAX; i++) {
580 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
581 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
582 GFP_KERNEL);
583 if (!*task_bp_pinned)
584 goto err_alloc;
585 }
586 }
587
588 constraints_initialized = 1;
589
483 return register_die_notifier(&hw_breakpoint_exceptions_nb); 590 return register_die_notifier(&hw_breakpoint_exceptions_nb);
591
592 err_alloc:
593 for_each_possible_cpu(err_cpu) {
594 if (err_cpu == cpu)
595 break;
596 for (i = 0; i < TYPE_MAX; i++)
597 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
598 }
599
600 return -ENOMEM;
484} 601}
485core_initcall(init_hw_breakpoint); 602core_initcall(init_hw_breakpoint);
486 603
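
For illustration, a minimal sketch of how kernel code would register a per-task breakpoint through this constraint machinery via register_user_hw_breakpoint(); the handler body, symbol names, and watched address are assumptions, and the overflow-handler signature is the one used by this kernel release.

#include <linux/kernel.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/sched.h>

static void sample_hbp_handler(struct perf_event *bp, int nmi,
			       struct perf_sample_data *data,
			       struct pt_regs *regs)
{
	pr_info("write breakpoint hit at 0x%llx\n",
		(unsigned long long)bp->attr.bp_addr);
}

static struct perf_event *install_watch(struct task_struct *tsk,
					unsigned long addr)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = addr;			/* address to watch */
	attr.bp_len  = HW_BREAKPOINT_LEN_4;	/* 4-byte accesses */
	attr.bp_type = HW_BREAKPOINT_W;		/* trigger on writes */

	/* goes through validate_hw_breakpoint() and __reserve_bp_slot() */
	return register_user_hw_breakpoint(&attr, sample_hbp_handler, tsk);
}

The returned pointer should be checked with IS_ERR(); an -ENOSPC from __reserve_bp_slot() means the per-type slot table above is already full.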
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 76d5a671bfe1..27e5c6911223 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -370,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
370 irqreturn_t ret, retval = IRQ_NONE; 370 irqreturn_t ret, retval = IRQ_NONE;
371 unsigned int status = 0; 371 unsigned int status = 0;
372 372
373 if (!(action->flags & IRQF_DISABLED))
374 local_irq_enable_in_hardirq();
375
376 do { 373 do {
377 trace_irq_handler_entry(irq, action); 374 trace_irq_handler_entry(irq, action);
378 ret = action->handler(irq, action->dev_id); 375 ret = action->handler(irq, action->dev_id);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 704e488730a5..c3003e9d91a3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
138 return 0; 138 return 0;
139} 139}
140 140
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{
143 struct irq_desc *desc = irq_to_desc(irq);
144 unsigned long flags;
145
146 if (!desc)
147 return -EINVAL;
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m;
151 raw_spin_unlock_irqrestore(&desc->lock, flags);
152
153 return 0;
154}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
156
141#ifndef CONFIG_AUTO_IRQ_AFFINITY 157#ifndef CONFIG_AUTO_IRQ_AFFINITY
142/* 158/*
143 * Generic version of the affinity autoselector. 159 * Generic version of the affinity autoselector.
@@ -200,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
200void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) 216void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
201{ 217{
202 if (suspend) { 218 if (suspend) {
203 if (!desc->action || (desc->action->flags & IRQF_TIMER)) 219 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
204 return; 220 return;
205 desc->status |= IRQ_SUSPENDED; 221 desc->status |= IRQ_SUSPENDED;
206 } 222 }
@@ -440,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
440 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 456 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
441 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
442 desc->status |= flags; 458 desc->status |= flags;
459
460 if (chip != desc->chip)
461 irq_chip_set_defaults(desc->chip);
443 } 462 }
444 463
445 return ret; 464 return ret;
@@ -757,16 +776,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
757 if (new->flags & IRQF_ONESHOT) 776 if (new->flags & IRQF_ONESHOT)
758 desc->status |= IRQ_ONESHOT; 777 desc->status |= IRQ_ONESHOT;
759 778
760 /*
761 * Force MSI interrupts to run with interrupts
762 * disabled. The multi vector cards can cause stack
763 * overflows due to nested interrupts when enough of
764 * them are directed to a core and fire at the same
765 * time.
766 */
767 if (desc->msi_desc)
768 new->flags |= IRQF_DISABLED;
769
770 if (!(desc->status & IRQ_NOAUTOEN)) { 779 if (!(desc->status & IRQ_NOAUTOEN)) {
771 desc->depth = 0; 780 desc->depth = 0;
772 desc->status &= ~IRQ_DISABLED; 781 desc->status &= ~IRQ_DISABLED;
@@ -916,6 +925,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
916 desc->chip->disable(irq); 925 desc->chip->disable(irq);
917 } 926 }
918 927
928#ifdef CONFIG_SMP
929 /* make sure affinity_hint is cleaned up */
930 if (WARN_ON_ONCE(desc->affinity_hint))
931 desc->affinity_hint = NULL;
932#endif
933
919 raw_spin_unlock_irqrestore(&desc->lock, flags); 934 raw_spin_unlock_irqrestore(&desc->lock, flags);
920 935
921 unregister_handler_proc(irq, action); 936 unregister_handler_proc(irq, action);
@@ -1027,7 +1042,6 @@ EXPORT_SYMBOL(free_irq);
1027 * Flags: 1042 * Flags:
1028 * 1043 *
1029 * IRQF_SHARED Interrupt is shared 1044 * IRQF_SHARED Interrupt is shared
1030 * IRQF_DISABLED Disable local interrupts while processing
1031 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 1045 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1032 * IRQF_TRIGGER_* Specify active edge(s) or level 1046 * IRQF_TRIGGER_* Specify active edge(s) or level
1033 * 1047 *
@@ -1041,25 +1055,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1041 int retval; 1055 int retval;
1042 1056
1043 /* 1057 /*
1044 * handle_IRQ_event() always ignores IRQF_DISABLED except for
1045 * the _first_ irqaction (sigh). That can cause oopsing, but
1046 * the behavior is classified as "will not fix" so we need to
1047 * start nudging drivers away from using that idiom.
1048 */
1049 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
1050 (IRQF_SHARED|IRQF_DISABLED)) {
1051 pr_warning(
1052 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
1053 irq, devname);
1054 }
1055
1056#ifdef CONFIG_LOCKDEP
1057 /*
1058 * Lockdep wants atomic interrupt handlers:
1059 */
1060 irqflags |= IRQF_DISABLED;
1061#endif
1062 /*
1063 * Sanity-check: shared interrupts must pass in a real dev-ID, 1058 * Sanity-check: shared interrupts must pass in a real dev-ID,
1064 * otherwise we'll have trouble later trying to figure out 1059 * otherwise we'll have trouble later trying to figure out
1065 * which interrupt is which (messes up the interrupt freeing 1060 * which interrupt is which (messes up the interrupt freeing
@@ -1120,3 +1115,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1120 return retval; 1115 return retval;
1121} 1116}
1122EXPORT_SYMBOL(request_threaded_irq); 1117EXPORT_SYMBOL(request_threaded_irq);
1118
1119/**
1120 * request_any_context_irq - allocate an interrupt line
1121 * @irq: Interrupt line to allocate
1122 * @handler: Function to be called when the IRQ occurs.
1123 * Threaded handler for threaded interrupts.
1124 * @flags: Interrupt type flags
1125 * @name: An ascii name for the claiming device
1126 * @dev_id: A cookie passed back to the handler function
1127 *
1128 * This call allocates interrupt resources and enables the
1129 * interrupt line and IRQ handling. It selects either a
1130 * hardirq or threaded handling method depending on the
1131 * context.
1132 *
1133 * On failure, it returns a negative value. On success,
1134 * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
1135 */
1136int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1137 unsigned long flags, const char *name, void *dev_id)
1138{
1139 struct irq_desc *desc = irq_to_desc(irq);
1140 int ret;
1141
1142 if (!desc)
1143 return -EINVAL;
1144
1145 if (desc->status & IRQ_NESTED_THREAD) {
1146 ret = request_threaded_irq(irq, NULL, handler,
1147 flags, name, dev_id);
1148 return !ret ? IRQC_IS_NESTED : ret;
1149 }
1150
1151 ret = request_irq(irq, handler, flags, name, dev_id);
1152 return !ret ? IRQC_IS_HARDIRQ : ret;
1153}
1154EXPORT_SYMBOL_GPL(request_any_context_irq);
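
A short driver-side sketch of the two interfaces added above, request_any_context_irq() and irq_set_affinity_hint(); the device name, handler body, and CPU choice are illustrative assumptions.

#include <linux/interrupt.h>
#include <linux/cpumask.h>

static irqreturn_t mydev_irq(int irq, void *dev_id)
{
	/* may run in hardirq or nested-thread context, see above */
	return IRQ_HANDLED;
}

static int mydev_setup_irq(unsigned int irq, void *dev)
{
	int ret;

	ret = request_any_context_irq(irq, mydev_irq, 0, "mydev", dev);
	if (ret < 0)
		return ret;
	/* on success ret is IRQC_IS_HARDIRQ or IRQC_IS_NESTED */

	/* preferred CPU, exported through /proc/irq/<irq>/affinity_hint */
	irq_set_affinity_hint(irq, cpumask_of(0));
	return 0;
}

Note that the hint has to be dropped with irq_set_affinity_hint(irq, NULL) before free_irq(), since __free_irq() now warns if it is still set.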
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7a6eb04ef6b5..09a2ee540bd2 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -32,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
32 return 0; 32 return 0;
33} 33}
34 34
35static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
36{
37 struct irq_desc *desc = irq_to_desc((long)m->private);
38 unsigned long flags;
39 cpumask_var_t mask;
40
41 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
42 return -ENOMEM;
43
44 raw_spin_lock_irqsave(&desc->lock, flags);
45 if (desc->affinity_hint)
46 cpumask_copy(mask, desc->affinity_hint);
47 raw_spin_unlock_irqrestore(&desc->lock, flags);
48
49 seq_cpumask(m, mask);
50 seq_putc(m, '\n');
51 free_cpumask_var(mask);
52
53 return 0;
54}
55
35#ifndef is_affinity_mask_valid 56#ifndef is_affinity_mask_valid
36#define is_affinity_mask_valid(val) 1 57#define is_affinity_mask_valid(val) 1
37#endif 58#endif
@@ -84,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
84 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
85} 106}
86 107
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
111}
112
87static const struct file_operations irq_affinity_proc_fops = { 113static const struct file_operations irq_affinity_proc_fops = {
88 .open = irq_affinity_proc_open, 114 .open = irq_affinity_proc_open,
89 .read = seq_read, 115 .read = seq_read,
@@ -92,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = {
92 .write = irq_affinity_proc_write, 118 .write = irq_affinity_proc_write,
93}; 119};
94 120
121static const struct file_operations irq_affinity_hint_proc_fops = {
122 .open = irq_affinity_hint_proc_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = single_release,
126};
127
95static int default_affinity_show(struct seq_file *m, void *v) 128static int default_affinity_show(struct seq_file *m, void *v)
96{ 129{
97 seq_cpumask(m, irq_default_affinity); 130 seq_cpumask(m, irq_default_affinity);
@@ -147,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = {
147 .release = single_release, 180 .release = single_release,
148 .write = default_affinity_write, 181 .write = default_affinity_write,
149}; 182};
183
184static int irq_node_proc_show(struct seq_file *m, void *v)
185{
186 struct irq_desc *desc = irq_to_desc((long) m->private);
187
188 seq_printf(m, "%d\n", desc->node);
189 return 0;
190}
191
192static int irq_node_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, irq_node_proc_show, PDE(inode)->data);
195}
196
197static const struct file_operations irq_node_proc_fops = {
198 .open = irq_node_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
150#endif 203#endif
151 204
152static int irq_spurious_proc_show(struct seq_file *m, void *v) 205static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -231,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
231 /* create /proc/irq/<irq>/smp_affinity */ 284 /* create /proc/irq/<irq>/smp_affinity */
232 proc_create_data("smp_affinity", 0600, desc->dir, 285 proc_create_data("smp_affinity", 0600, desc->dir,
233 &irq_affinity_proc_fops, (void *)(long)irq); 286 &irq_affinity_proc_fops, (void *)(long)irq);
287
288 /* create /proc/irq/<irq>/affinity_hint */
289 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291
292 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq);
234#endif 294#endif
235 295
236 proc_create_data("spurious", 0444, desc->dir, 296 proc_create_data("spurious", 0444, desc->dir,
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 13aff293f4de..6f6d091b5757 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kdb.h>
19#include <linux/err.h> 20#include <linux/err.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 22#include <linux/sched.h> /* for cond_resched */
@@ -516,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file)
516 return ret; 517 return ret;
517} 518}
518 519
520#ifdef CONFIG_KGDB_KDB
521const char *kdb_walk_kallsyms(loff_t *pos)
522{
523 static struct kallsym_iter kdb_walk_kallsyms_iter;
524 if (*pos == 0) {
525 memset(&kdb_walk_kallsyms_iter, 0,
526 sizeof(kdb_walk_kallsyms_iter));
527 reset_iter(&kdb_walk_kallsyms_iter, 0);
528 }
529 while (1) {
530 if (!update_iter(&kdb_walk_kallsyms_iter, *pos))
531 return NULL;
532 ++*pos;
533 /* Some debugging symbols have no name. Ignore them. */
534 if (kdb_walk_kallsyms_iter.name[0])
535 return kdb_walk_kallsyms_iter.name;
536 }
537}
538#endif /* CONFIG_KGDB_KDB */
539
519static const struct file_operations kallsyms_operations = { 540static const struct file_operations kallsyms_operations = {
520 .open = kallsyms_open, 541 .open = kallsyms_open,
521 .read = seq_read, 542 .read = seq_read,
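
A sketch of how a kdb command might consume the walker added above; the loop shape follows from the signature shown, while the command body and the printing are assumptions (the real prototype lives in kdb's private header).

#include <linux/types.h>
#include <linux/kdb.h>

/* defined in kernel/kallsyms.c under CONFIG_KGDB_KDB, see the hunk above */
extern const char *kdb_walk_kallsyms(loff_t *pos);

static void kdb_dump_symbol_names(void)
{
	const char *name;
	loff_t pos = 0;

	while ((name = kdb_walk_kallsyms(&pos)) != NULL)
		kdb_printf("%s\n", name);	/* one symbol name per line */
}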
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 474a84715eac..c0613f7d6730 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
151 image->nr_segments = nr_segments; 151 image->nr_segments = nr_segments;
152 segment_bytes = nr_segments * sizeof(*segments); 152 segment_bytes = nr_segments * sizeof(*segments);
153 result = copy_from_user(image->segment, segments, segment_bytes); 153 result = copy_from_user(image->segment, segments, segment_bytes);
154 if (result) 154 if (result) {
155 result = -EFAULT;
155 goto out; 156 goto out;
157 }
156 158
157 /* 159 /*
158 * Verify we have good destination addresses. The caller is 160 * Verify we have good destination addresses. The caller is
@@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image,
827 result = copy_from_user(ptr, buf, uchunk); 829 result = copy_from_user(ptr, buf, uchunk);
828 kunmap(page); 830 kunmap(page);
829 if (result) { 831 if (result) {
830 result = (result < 0) ? result : -EIO; 832 result = -EFAULT;
831 goto out; 833 goto out;
832 } 834 }
833 ubytes -= uchunk; 835 ubytes -= uchunk;
@@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image,
882 kexec_flush_icache_page(page); 884 kexec_flush_icache_page(page);
883 kunmap(page); 885 kunmap(page);
884 if (result) { 886 if (result) {
885 result = (result < 0) ? result : -EIO; 887 result = -EFAULT;
886 goto out; 888 goto out;
887 } 889 }
888 ubytes -= uchunk; 890 ubytes -= uchunk;
@@ -1089,9 +1091,10 @@ void crash_kexec(struct pt_regs *regs)
1089 1091
1090size_t crash_get_memory_size(void) 1092size_t crash_get_memory_size(void)
1091{ 1093{
1092 size_t size; 1094 size_t size = 0;
1093 mutex_lock(&kexec_mutex); 1095 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1; 1096 if (crashk_res.end != crashk_res.start)
1097 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex); 1098 mutex_unlock(&kexec_mutex);
1096 return size; 1099 return size;
1097} 1100}
@@ -1134,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size)
1134 1137
1135 free_reserved_phys_range(end, crashk_res.end); 1138 free_reserved_phys_range(end, crashk_res.end);
1136 1139
1137 if (start == end) 1140 if ((start == end) && (crashk_res.parent != NULL))
1138 release_resource(&crashk_res); 1141 release_resource(&crashk_res);
1139 crashk_res.end = end - 1; 1142 crashk_res.end = end - 1;
1140 1143
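
The kexec hunks above map any short copy_from_user() result to a plain -EFAULT; a minimal sketch of that convention (the helper name is illustrative):

#include <linux/errno.h>
#include <linux/uaccess.h>

/* copy_from_user() returns the number of bytes it could NOT copy */
static int fetch_from_user(void *dst, const void __user *src, unsigned long len)
{
	if (copy_from_user(dst, src, len))
		return -EFAULT;		/* partial copy is treated as a fault */
	return 0;
}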
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 35edbe22e9a9..01a0700e873f 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,8 +1,7 @@
1/* 1/*
2 * A generic kernel FIFO implementation. 2 * A generic kernel FIFO implementation
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> 4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
6 * 5 *
7 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -11,7 +10,7 @@
11 * 10 *
12 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details. 14 * GNU General Public License for more details.
16 * 15 *
17 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
@@ -24,422 +23,586 @@
24#include <linux/module.h> 23#include <linux/module.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/err.h> 25#include <linux/err.h>
27#include <linux/kfifo.h>
28#include <linux/log2.h> 26#include <linux/log2.h>
29#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
30 29
31static void _kfifo_init(struct kfifo *fifo, void *buffer, 30/*
32 unsigned int size) 31 * internal helper to calculate the unused elements in a fifo
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
39
40/**
41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
43 * @buffer: the preallocated buffer to be used.
44 * @size: the size of the internal buffer, this has to be a power of 2.
45 *
46 */ 32 */
47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) 33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
48{ 34{
49 /* size must be a power of 2 */ 35 return (fifo->mask + 1) - (fifo->in - fifo->out);
50 BUG_ON(!is_power_of_2(size));
51
52 _kfifo_init(fifo, buffer, size);
53} 36}
54EXPORT_SYMBOL(kfifo_init);
55 37
56/** 38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
57 * kfifo_alloc - allocates a new FIFO internal buffer 39 size_t esize, gfp_t gfp_mask)
58 * @fifo: the fifo to assign then new buffer
59 * @size: the size of the buffer to be allocated, this have to be a power of 2.
60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
61 *
62 * This function dynamically allocates a new fifo internal buffer
63 *
64 * The size will be rounded-up to a power of 2.
65 * The buffer will be release with kfifo_free().
66 * Return 0 if no error, otherwise the an error code
67 */
68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 40{
70 unsigned char *buffer;
71
72 /* 41 /*
73 * round up to the next power of 2, since our 'let the indices 42 * round down to the next power of 2, since our 'let the indices
74 * wrap' technique works only in this case. 43 * wrap' technique works only in this case.
75 */ 44 */
76 if (!is_power_of_2(size)) { 45 if (!is_power_of_2(size))
77 BUG_ON(size > 0x80000000); 46 size = rounddown_pow_of_two(size);
78 size = roundup_pow_of_two(size); 47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
79 } 56 }
80 57
81 buffer = kmalloc(size, gfp_mask); 58 fifo->data = kmalloc(size * esize, gfp_mask);
82 if (!buffer) { 59
83 _kfifo_init(fifo, NULL, 0); 60 if (!fifo->data) {
61 fifo->mask = 0;
84 return -ENOMEM; 62 return -ENOMEM;
85 } 63 }
86 64 fifo->mask = size - 1;
87 _kfifo_init(fifo, buffer, size);
88 65
89 return 0; 66 return 0;
90} 67}
91EXPORT_SYMBOL(kfifo_alloc); 68EXPORT_SYMBOL(__kfifo_alloc);
92 69
93/** 70void __kfifo_free(struct __kfifo *fifo)
94 * kfifo_free - frees the FIFO internal buffer
95 * @fifo: the fifo to be freed.
96 */
97void kfifo_free(struct kfifo *fifo)
98{ 71{
99 kfree(fifo->buffer); 72 kfree(fifo->data);
100 _kfifo_init(fifo, NULL, 0); 73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
101} 78}
102EXPORT_SYMBOL(kfifo_free); 79EXPORT_SYMBOL(__kfifo_free);
103 80
104/** 81int __kfifo_init(struct __kfifo *fifo, void *buffer,
105 * kfifo_skip - skip output data 82 unsigned int size, size_t esize)
106 * @fifo: the fifo to be used.
107 * @len: number of bytes to skip
108 */
109void kfifo_skip(struct kfifo *fifo, unsigned int len)
110{ 83{
111 if (len < kfifo_len(fifo)) { 84 size /= esize;
112 __kfifo_add_out(fifo, len); 85
113 return; 86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
114 } 97 }
115 kfifo_reset_out(fifo); 98 fifo->mask = size - 1;
99
100 return 0;
116} 101}
117EXPORT_SYMBOL(kfifo_skip); 102EXPORT_SYMBOL(__kfifo_init);
118 103
119static inline void __kfifo_in_data(struct kfifo *fifo, 104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
120 const void *from, unsigned int len, unsigned int off) 105 unsigned int len, unsigned int off)
121{ 106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
122 unsigned int l; 109 unsigned int l;
123 110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
124 /* 121 /*
125 * Ensure that we sample the fifo->out index -before- we 122 * make sure that the data in the fifo is up to date before
126 * start putting bytes into the kfifo. 123 * incrementing the fifo->in index counter
127 */ 124 */
125 smp_wmb();
126}
128 127
129 smp_mb(); 128unsigned int __kfifo_in(struct __kfifo *fifo,
130 129 const void *buf, unsigned int len)
131 off = __kfifo_off(fifo, fifo->in + off); 130{
131 unsigned int l;
132 132
133 /* first put the data starting from fifo->in to buffer end */ 133 l = kfifo_unused(fifo);
134 l = min(len, fifo->size - off); 134 if (len > l)
135 memcpy(fifo->buffer + off, from, l); 135 len = l;
136 136
137 /* then put the rest (if any) at the beginning of the buffer */ 137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 memcpy(fifo->buffer, from + l, len - l); 138 fifo->in += len;
139 return len;
139} 140}
141EXPORT_SYMBOL(__kfifo_in);
140 142
141static inline void __kfifo_out_data(struct kfifo *fifo, 143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
142 void *to, unsigned int len, unsigned int off) 144 unsigned int len, unsigned int off)
143{ 145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
144 unsigned int l; 148 unsigned int l;
145 149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
146 /* 160 /*
147 * Ensure that we sample the fifo->in index -before- we 161 * make sure that the data is copied before
148 * start removing bytes from the kfifo. 162 * incrementing the fifo->out index counter
149 */ 163 */
164 smp_wmb();
165}
150 166
151 smp_rmb(); 167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
152 171
153 off = __kfifo_off(fifo, fifo->out + off); 172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
154 175
155 /* first get the data from fifo->out until the end of the buffer */ 176 kfifo_copy_out(fifo, buf, len, fifo->out);
156 l = min(len, fifo->size - off); 177 return len;
157 memcpy(to, fifo->buffer + off, l); 178}
179EXPORT_SYMBOL(__kfifo_out_peek);
158 180
159 /* then get the rest (if any) from the beginning of the buffer */ 181unsigned int __kfifo_out(struct __kfifo *fifo,
160 memcpy(to + l, fifo->buffer, len - l); 182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
161} 187}
188EXPORT_SYMBOL(__kfifo_out);
162 189
163static inline int __kfifo_from_user_data(struct kfifo *fifo, 190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off, 191 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout) 192 unsigned int *copied)
166{ 193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
167 unsigned int l; 196 unsigned int l;
168 int ret; 197 unsigned long ret;
169 198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
170 /* 215 /*
171 * Ensure that we sample the fifo->out index -before- we 216 * make sure that the data in the fifo is up to date before
172 * start putting bytes into the kfifo. 217 * incrementing the fifo->in index counter
173 */ 218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
174 224
175 smp_mb(); 225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
176 232
177 off = __kfifo_off(fifo, fifo->in + off); 233 if (esize != 1)
234 len /= esize;
178 235
179 /* first put the data starting from fifo->in to buffer end */ 236 l = kfifo_unused(fifo);
180 l = min(len, fifo->size - off); 237 if (len > l)
181 ret = copy_from_user(fifo->buffer + off, from, l); 238 len = l;
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
187 239
188 /* then put the rest (if any) at the beginning of the buffer */ 240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
189 ret = copy_from_user(fifo->buffer, from + l, len - l); 241 if (unlikely(ret)) {
190 *lenout += ret ? ret : len - l; 242 len -= ret;
191 return ret ? -EFAULT : 0; 243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
192} 248}
249EXPORT_SYMBOL(__kfifo_from_user);
193 250
194static inline int __kfifo_to_user_data(struct kfifo *fifo, 251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout) 252 unsigned int len, unsigned int off, unsigned int *copied)
196{ 253{
197 unsigned int l; 254 unsigned int l;
198 int ret; 255 unsigned long ret;
199 256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
200 /* 275 /*
201 * Ensure that we sample the fifo->in index -before- we 276 * make sure that the data is copied before
202 * start removing bytes from the kfifo. 277 * incrementing the fifo->out index counter
203 */ 278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
204 284
205 smp_rmb(); 285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
206 292
207 off = __kfifo_off(fifo, fifo->out + off); 293 if (esize != 1)
294 len /= esize;
208 295
209 /* first get the data from fifo->out until the end of the buffer */ 296 l = fifo->in - fifo->out;
210 l = min(len, fifo->size - off); 297 if (len > l)
211 ret = copy_to_user(to, fifo->buffer + off, l); 298 len = l;
212 *lenout = l; 299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
213 if (unlikely(ret)) { 300 if (unlikely(ret)) {
214 *lenout -= ret; 301 len -= ret;
215 return -EFAULT; 302 err = -EFAULT;
216 } 303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
217 309
218 /* then get the rest (if any) from the beginning of the buffer */ 310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
219 len -= l; 311 int nents, unsigned int len)
220 ret = copy_to_user(to + l, fifo->buffer, len); 312{
221 if (unlikely(ret)) { 313 int n;
222 *lenout += len - ret; 314 unsigned int l;
223 return -EFAULT; 315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
224 } 344 }
225 *lenout += len; 345 sg_set_page(sgl, page, len, off);
226 return 0; 346 return n + 1;
227} 347}
228 348
229unsigned int __kfifo_in_n(struct kfifo *fifo, 349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
230 const void *from, unsigned int len, unsigned int recsize) 350 int nents, unsigned int len, unsigned int off)
231{ 351{
232 if (kfifo_avail(fifo) < len + recsize) 352 unsigned int size = fifo->mask + 1;
233 return len + 1; 353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
234 356
235 __kfifo_in_data(fifo, from, len, recsize); 357 off &= fifo->mask;
236 return 0; 358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 return n;
237} 369}
238EXPORT_SYMBOL(__kfifo_in_n);
239 370
240/** 371unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
241 * kfifo_in - puts some data into the FIFO 372 struct scatterlist *sgl, int nents, unsigned int len)
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{ 373{
256 len = min(kfifo_avail(fifo), len); 374 unsigned int l;
257 375
258 __kfifo_in_data(fifo, from, len, 0); 376 l = kfifo_unused(fifo);
259 __kfifo_add_in(fifo, len); 377 if (len > l)
260 return len; 378 len = l;
379
380 return setup_sgl(fifo, sgl, nents, len, fifo->in);
261} 381}
262EXPORT_SYMBOL(kfifo_in); 382EXPORT_SYMBOL(__kfifo_dma_in_prepare);
263 383
264unsigned int __kfifo_in_generic(struct kfifo *fifo, 384unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize) 385 struct scatterlist *sgl, int nents, unsigned int len)
266{ 386{
267 return __kfifo_in_rec(fifo, from, len, recsize); 387 unsigned int l;
388
389 l = fifo->in - fifo->out;
390 if (len > l)
391 len = l;
392
393 return setup_sgl(fifo, sgl, nents, len, fifo->out);
268} 394}
269EXPORT_SYMBOL(__kfifo_in_generic); 395EXPORT_SYMBOL(__kfifo_dma_out_prepare);
270 396
271unsigned int __kfifo_out_n(struct kfifo *fifo, 397unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
272 void *to, unsigned int len, unsigned int recsize)
273{ 398{
274 if (kfifo_len(fifo) < len + recsize) 399 unsigned int max = (1 << (recsize << 3)) - 1;
275 return len;
276 400
277 __kfifo_out_data(fifo, to, len, recsize); 401 if (len > max)
278 __kfifo_add_out(fifo, len + recsize); 402 return max;
279 return 0; 403 return len;
280} 404}
281EXPORT_SYMBOL(__kfifo_out_n);
282 405
283/** 406#define __KFIFO_PEEK(data, out, mask) \
284 * kfifo_out - gets some data from the FIFO 407 ((data)[(out) & (mask)])
285 * @fifo: the fifo to be used. 408/*
 286 * @to: where the data must be copied. 409 * __kfifo_peek_n internal helper function to determine the length of
287 * @len: the size of the destination buffer. 410 * the next record in the fifo
288 *
289 * This function copies at most @len bytes from the FIFO into the
290 * @to buffer and returns the number of copied bytes.
291 *
292 * Note that with only one concurrent reader and one concurrent
293 * writer, you don't need extra locking to use these functions.
294 */ 411 */
295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) 412static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
296{ 413{
297 len = min(kfifo_len(fifo), len); 414 unsigned int l;
415 unsigned int mask = fifo->mask;
416 unsigned char *data = fifo->data;
298 417
299 __kfifo_out_data(fifo, to, len, 0); 418 l = __KFIFO_PEEK(data, fifo->out, mask);
300 __kfifo_add_out(fifo, len);
301 419
302 return len; 420 if (--recsize)
421 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
422
423 return l;
303} 424}
304EXPORT_SYMBOL(kfifo_out);
305 425
306/** 426#define __KFIFO_POKE(data, in, mask, val) \
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it 427 ( \
308 * @fifo: the fifo to be used. 428 (data)[(in) & (mask)] = (unsigned char)(val) \
309 * @to: where the data must be copied. 429 )
310 * @len: the size of the destination buffer. 430
311 * @offset: offset into the fifo 431/*
 312 * 432 * __kfifo_poke_n internal helper function for storing the length of
313 * This function copies at most @len bytes at @offset from the FIFO 433 * the record into the fifo
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */ 434 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, 435static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
318 unsigned offset)
319{ 436{
320 len = min(kfifo_len(fifo), len + offset); 437 unsigned int mask = fifo->mask;
438 unsigned char *data = fifo->data;
321 439
322 __kfifo_out_data(fifo, to, len, offset); 440 __KFIFO_POKE(data, fifo->in, mask, n);
323 return len; 441
442 if (recsize > 1)
443 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
324} 444}
325EXPORT_SYMBOL(kfifo_out_peek);
326 445
327unsigned int __kfifo_out_generic(struct kfifo *fifo, 446unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{ 447{
331 return __kfifo_out_rec(fifo, to, len, recsize, total); 448 return __kfifo_peek_n(fifo, recsize);
332} 449}
333EXPORT_SYMBOL(__kfifo_out_generic); 450EXPORT_SYMBOL(__kfifo_len_r);
334 451
335unsigned int __kfifo_from_user_n(struct kfifo *fifo, 452unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
336 const void __user *from, unsigned int len, unsigned int recsize) 453 unsigned int len, size_t recsize)
337{ 454{
338 unsigned total; 455 if (len + recsize > kfifo_unused(fifo))
456 return 0;
339 457
340 if (kfifo_avail(fifo) < len + recsize) 458 __kfifo_poke_n(fifo, len, recsize);
341 return len + 1;
342 459
343 __kfifo_from_user_data(fifo, from, len, recsize, &total); 460 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
344 return total; 461 fifo->in += len + recsize;
462 return len;
345} 463}
346EXPORT_SYMBOL(__kfifo_from_user_n); 464EXPORT_SYMBOL(__kfifo_in_r);
347 465
348/** 466static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
349 * kfifo_from_user - puts some data from user space into the FIFO 467 void *buf, unsigned int len, size_t recsize, unsigned int *n)
350 * @fifo: the fifo to be used. 468{
351 * @from: pointer to the data to be added. 469 *n = __kfifo_peek_n(fifo, recsize);
352 * @len: the length of the data to be added. 470
353 * @total: the actual returned data length. 471 if (len > *n)
354 * 472 len = *n;
355 * This function copies at most @len bytes from the @from into the 473
356 * FIFO depending and returns -EFAULT/0. 474 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
357 * 475 return len;
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
371} 476}
372EXPORT_SYMBOL(kfifo_from_user);
373 477
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo, 478unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
375 const void __user *from, unsigned int len, unsigned int recsize) 479 unsigned int len, size_t recsize)
376{ 480{
377 return __kfifo_from_user_rec(fifo, from, len, recsize); 481 unsigned int n;
482
483 if (fifo->in == fifo->out)
484 return 0;
485
486 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
378} 487}
379EXPORT_SYMBOL(__kfifo_from_user_generic); 488EXPORT_SYMBOL(__kfifo_out_peek_r);
380 489
381unsigned int __kfifo_to_user_n(struct kfifo *fifo, 490unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
382 void __user *to, unsigned int len, unsigned int reclen, 491 unsigned int len, size_t recsize)
383 unsigned int recsize)
384{ 492{
385 unsigned int ret, total; 493 unsigned int n;
386 494
387 if (kfifo_len(fifo) < reclen + recsize) 495 if (fifo->in == fifo->out)
388 return len; 496 return 0;
389 497
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); 498 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
499 fifo->out += n + recsize;
500 return len;
501}
502EXPORT_SYMBOL(__kfifo_out_r);
391 503
392 if (likely(ret == 0)) 504void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
393 __kfifo_add_out(fifo, reclen + recsize); 505{
506 unsigned int n;
394 507
395 return total; 508 n = __kfifo_peek_n(fifo, recsize);
509 fifo->out += n + recsize;
396} 510}
397EXPORT_SYMBOL(__kfifo_to_user_n); 511EXPORT_SYMBOL(__kfifo_skip_r);
398 512
399/** 513int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
400 * kfifo_to_user - gets data from the FIFO and write it to user space 514 unsigned long len, unsigned int *copied, size_t recsize)
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{ 515{
415 int ret; 516 unsigned long ret;
416 len = min(kfifo_len(fifo), len); 517
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); 518 len = __kfifo_max_r(len, recsize);
418 __kfifo_add_out(fifo, *lenout); 519
419 return ret; 520 if (len + recsize > kfifo_unused(fifo)) {
521 *copied = 0;
522 return 0;
523 }
524
525 __kfifo_poke_n(fifo, len, recsize);
526
527 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
528 if (unlikely(ret)) {
529 *copied = 0;
530 return -EFAULT;
531 }
532 fifo->in += len + recsize;
533 return 0;
420} 534}
421EXPORT_SYMBOL(kfifo_to_user); 535EXPORT_SYMBOL(__kfifo_from_user_r);
422 536
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo, 537int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
424 void __user *to, unsigned int len, unsigned int recsize, 538 unsigned long len, unsigned int *copied, size_t recsize)
425 unsigned int *total)
426{ 539{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total); 540 unsigned long ret;
541 unsigned int n;
542
543 if (fifo->in == fifo->out) {
544 *copied = 0;
545 return 0;
546 }
547
548 n = __kfifo_peek_n(fifo, recsize);
549 if (len > n)
550 len = n;
551
552 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
553 if (unlikely(ret)) {
554 *copied = 0;
555 return -EFAULT;
556 }
557 fifo->out += n + recsize;
558 return 0;
428} 559}
429EXPORT_SYMBOL(__kfifo_to_user_generic); 560EXPORT_SYMBOL(__kfifo_to_user_r);
430 561
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) 562unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
563 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
432{ 564{
433 if (recsize == 0) 565 if (!nents)
434 return kfifo_avail(fifo); 566 BUG();
435 567
436 return __kfifo_peek_n(fifo, recsize); 568 len = __kfifo_max_r(len, recsize);
569
570 if (len + recsize > kfifo_unused(fifo))
571 return 0;
572
573 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
437} 574}
438EXPORT_SYMBOL(__kfifo_peek_generic); 575EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
439 576
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) 577void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
578 unsigned int len, size_t recsize)
441{ 579{
442 __kfifo_skip_rec(fifo, recsize); 580 len = __kfifo_max_r(len, recsize);
581 __kfifo_poke_n(fifo, len, recsize);
582 fifo->in += len + recsize;
443} 583}
444EXPORT_SYMBOL(__kfifo_skip_generic); 584EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
445 585
586unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
587 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
588{
589 if (!nents)
590 BUG();
591
592 len = __kfifo_max_r(len, recsize);
593
594 if (len + recsize > fifo->in - fifo->out)
595 return 0;
596
597 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
598}
599EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
600
601void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
602{
603 unsigned int len;
604
605 len = __kfifo_peek_n(fifo, recsize);
606 fifo->out += len + recsize;
607}
608EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
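
The new __kfifo_*() helpers are normally reached through the kfifo_*() macro layer in <linux/kfifo.h>; a minimal byte-stream sketch under that assumption, with sizes and data chosen only for illustration:

#include <linux/kernel.h>
#include <linux/kfifo.h>
#include <linux/slab.h>

static int kfifo_demo(void)
{
	struct kfifo fifo;
	unsigned char in[4] = { 1, 2, 3, 4 };
	unsigned char out[4];
	unsigned int copied;
	int ret;

	/* 16-byte buffer; non-power-of-2 sizes are rounded down, see above */
	ret = kfifo_alloc(&fifo, 16, GFP_KERNEL);
	if (ret)
		return ret;

	copied = kfifo_in(&fifo, in, sizeof(in));	/* ends up in __kfifo_in() */
	copied = kfifo_out(&fifo, out, copied);		/* ends up in __kfifo_out() */
	pr_info("kfifo demo moved %u bytes\n", copied);

	kfifo_free(&fifo);				/* ends up in __kfifo_free() */
	return 0;
}

As the removed kerneldoc noted, a single concurrent reader and a single concurrent writer need no extra locking around these calls.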
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
deleted file mode 100644
index 11f3515ca83f..000000000000
--- a/kernel/kgdb.c
+++ /dev/null
@@ -1,1764 +0,0 @@
1/*
2 * KGDB stub.
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2008 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/reboot.h>
41#include <linux/string.h>
42#include <linux/delay.h>
43#include <linux/sched.h>
44#include <linux/sysrq.h>
45#include <linux/init.h>
46#include <linux/kgdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55#include <asm/unaligned.h>
56
57static int kgdb_break_asap;
58
59#define KGDB_MAX_THREAD_QUERY 17
60struct kgdb_state {
61 int ex_vector;
62 int signo;
63 int err_code;
64 int cpu;
65 int pass_exception;
66 unsigned long thr_query;
67 unsigned long threadid;
68 long kgdb_usethreadid;
69 struct pt_regs *linux_regs;
70};
71
72/* Exception state values */
73#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
74#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
75#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
76#define DCPU_SSTEP 0x8 /* CPU is single stepping */
77
78static struct debuggerinfo_struct {
79 void *debuggerinfo;
80 struct task_struct *task;
81 int exception_state;
82} kgdb_info[NR_CPUS];
83
84/**
85 * kgdb_connected - Is a host GDB connected to us?
86 */
87int kgdb_connected;
88EXPORT_SYMBOL_GPL(kgdb_connected);
89
90/* All the KGDB handlers are installed */
91static int kgdb_io_module_registered;
92
93/* Guard for recursive entry */
94static int exception_level;
95
96static struct kgdb_io *kgdb_io_ops;
97static DEFINE_SPINLOCK(kgdb_registration_lock);
98
99/* kgdb console driver is loaded */
100static int kgdb_con_registered;
101/* determine if kgdb console output should be used */
102static int kgdb_use_con;
103
104static int __init opt_kgdb_con(char *str)
105{
106 kgdb_use_con = 1;
107 return 0;
108}
109
110early_param("kgdbcon", opt_kgdb_con);
111
112module_param(kgdb_use_con, int, 0644);
113
114/*
115 * Holds information about breakpoints in a kernel. These breakpoints are
116 * added and removed by gdb.
117 */
118static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
119 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
120};
121
122/*
123 * The CPU# of the active CPU, or -1 if none:
124 */
125atomic_t kgdb_active = ATOMIC_INIT(-1);
126
127/*
128 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
129 * bootup code (which might not have percpu set up yet):
130 */
131static atomic_t passive_cpu_wait[NR_CPUS];
132static atomic_t cpu_in_kgdb[NR_CPUS];
133atomic_t kgdb_setting_breakpoint;
134
135struct task_struct *kgdb_usethread;
136struct task_struct *kgdb_contthread;
137
138int kgdb_single_step;
139pid_t kgdb_sstep_pid;
140
141/* Our I/O buffers. */
142static char remcom_in_buffer[BUFMAX];
143static char remcom_out_buffer[BUFMAX];
144
145/* Storage for the registers, in GDB format. */
146static unsigned long gdb_regs[(NUMREGBYTES +
147 sizeof(unsigned long) - 1) /
148 sizeof(unsigned long)];
149
150/* to keep track of the CPU which is doing the single stepping*/
151atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
152
153/*
154 * If you are debugging a problem where roundup (the collection of
155 * all other CPUs) is a problem [this should be extremely rare],
156 * then use the nokgdbroundup option to avoid roundup. In that case
157 * the other CPUs might interfere with your debugging context, so
158 * use this with care:
159 */
160static int kgdb_do_roundup = 1;
161
162static int __init opt_nokgdbroundup(char *str)
163{
164 kgdb_do_roundup = 0;
165
166 return 0;
167}
168
169early_param("nokgdbroundup", opt_nokgdbroundup);
170
171/*
172 * Finally, some KGDB code :-)
173 */
174
175/*
176 * Weak aliases for breakpoint management,
 177 * can be overridden by architectures when needed:
178 */
179int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
180{
181 int err;
182
183 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
184 if (err)
185 return err;
186
187 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
188 BREAK_INSTR_SIZE);
189}
190
191int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
192{
193 return probe_kernel_write((char *)addr,
194 (char *)bundle, BREAK_INSTR_SIZE);
195}
196
197int __weak kgdb_validate_break_address(unsigned long addr)
198{
199 char tmp_variable[BREAK_INSTR_SIZE];
200 int err;
 201 /* Validate setting the breakpoint and then removing it. If the
 202 * remove fails, the kernel needs to emit a bad message because we
 203 * are in deep trouble not being able to put things back the way we
204 * found them.
205 */
206 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
207 if (err)
208 return err;
209 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
210 if (err)
211 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
212 "memory destroyed at: %lx", addr);
213 return err;
214}
215
216unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
217{
218 return instruction_pointer(regs);
219}
220
221int __weak kgdb_arch_init(void)
222{
223 return 0;
224}
225
226int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
227{
228 return 0;
229}
230
231void __weak
232kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
233{
234 return;
235}
236
237/**
238 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
239 * @regs: Current &struct pt_regs.
240 *
241 * This function will be called if the particular architecture must
242 * disable hardware debugging while it is processing gdb packets or
243 * handling exception.
244 */
245void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
246{
247}
248
249/*
250 * GDB remote protocol parser:
251 */
252
253static int hex(char ch)
254{
255 if ((ch >= 'a') && (ch <= 'f'))
256 return ch - 'a' + 10;
257 if ((ch >= '0') && (ch <= '9'))
258 return ch - '0';
259 if ((ch >= 'A') && (ch <= 'F'))
260 return ch - 'A' + 10;
261 return -1;
262}
263
264/* scan for the sequence $<data>#<checksum> */
265static void get_packet(char *buffer)
266{
267 unsigned char checksum;
268 unsigned char xmitcsum;
269 int count;
270 char ch;
271
272 do {
273 /*
274 * Spin and wait around for the start character, ignore all
275 * other characters:
276 */
277 while ((ch = (kgdb_io_ops->read_char())) != '$')
278 /* nothing */;
279
280 kgdb_connected = 1;
281 checksum = 0;
282 xmitcsum = -1;
283
284 count = 0;
285
286 /*
287 * now, read until a # or end of buffer is found:
288 */
289 while (count < (BUFMAX - 1)) {
290 ch = kgdb_io_ops->read_char();
291 if (ch == '#')
292 break;
293 checksum = checksum + ch;
294 buffer[count] = ch;
295 count = count + 1;
296 }
297 buffer[count] = 0;
298
299 if (ch == '#') {
300 xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
301 xmitcsum += hex(kgdb_io_ops->read_char());
302
303 if (checksum != xmitcsum)
304 /* failed checksum */
305 kgdb_io_ops->write_char('-');
306 else
307 /* successful transfer */
308 kgdb_io_ops->write_char('+');
309 if (kgdb_io_ops->flush)
310 kgdb_io_ops->flush();
311 }
312 } while (checksum != xmitcsum);
313}
314
315/*
316 * Send the packet in buffer.
317 * Check for gdb connection if asked for.
318 */
319static void put_packet(char *buffer)
320{
321 unsigned char checksum;
322 int count;
323 char ch;
324
325 /*
326 * $<packet info>#<checksum>.
327 */
328 while (1) {
329 kgdb_io_ops->write_char('$');
330 checksum = 0;
331 count = 0;
332
333 while ((ch = buffer[count])) {
334 kgdb_io_ops->write_char(ch);
335 checksum += ch;
336 count++;
337 }
338
339 kgdb_io_ops->write_char('#');
340 kgdb_io_ops->write_char(hex_asc_hi(checksum));
341 kgdb_io_ops->write_char(hex_asc_lo(checksum));
342 if (kgdb_io_ops->flush)
343 kgdb_io_ops->flush();
344
345 /* Now see what we get in reply. */
346 ch = kgdb_io_ops->read_char();
347
348 if (ch == 3)
349 ch = kgdb_io_ops->read_char();
350
351 /* If we get an ACK, we are done. */
352 if (ch == '+')
353 return;
354
355 /*
356 * If we get the start of another packet, this means
357 * that GDB is attempting to reconnect. We will NAK
358 * the packet being sent, and stop trying to send this
359 * packet.
360 */
361 if (ch == '$') {
362 kgdb_io_ops->write_char('-');
363 if (kgdb_io_ops->flush)
364 kgdb_io_ops->flush();
365 return;
366 }
367 }
368}
369
370/*
371 * Convert the memory pointed to by mem into hex, placing result in buf.
 372 * Return 0 on success, or a negative error code if the memory cannot be read.
373 */
374int kgdb_mem2hex(char *mem, char *buf, int count)
375{
376 char *tmp;
377 int err;
378
379 /*
380 * We use the upper half of buf as an intermediate buffer for the
381 * raw memory copy. Hex conversion will work against this one.
382 */
383 tmp = buf + count;
384
385 err = probe_kernel_read(tmp, mem, count);
386 if (!err) {
387 while (count > 0) {
388 buf = pack_hex_byte(buf, *tmp);
389 tmp++;
390 count--;
391 }
392
393 *buf = 0;
394 }
395
396 return err;
397}
398
399/*
 400 * Copy the binary array pointed to by buf into mem, un-escaping the $, #,
 401 * and 0x7d characters (which arrive escaped with 0x7d). Return -EFAULT on
 402 * failure or 0 on success. The input buf is overwritten with the result.
403 */
404static int kgdb_ebin2mem(char *buf, char *mem, int count)
405{
406 int size = 0;
407 char *c = buf;
408
409 while (count-- > 0) {
410 c[size] = *buf++;
411 if (c[size] == 0x7d)
412 c[size] = *buf++ ^ 0x20;
413 size++;
414 }
415
416 return probe_kernel_write(mem, c, size);
417}
418
419/*
420 * Convert the hex array pointed to by buf into binary to be placed in mem.
 421 * Return 0 on success, or a negative error code if the memory cannot
 422 * be written.
423 */
424int kgdb_hex2mem(char *buf, char *mem, int count)
425{
426 char *tmp_raw;
427 char *tmp_hex;
428
429 /*
430 * We use the upper half of buf as an intermediate buffer for the
431 * raw memory that is converted from hex.
432 */
433 tmp_raw = buf + count * 2;
434
435 tmp_hex = tmp_raw - 1;
436 while (tmp_hex >= buf) {
437 tmp_raw--;
438 *tmp_raw = hex(*tmp_hex--);
439 *tmp_raw |= hex(*tmp_hex--) << 4;
440 }
441
442 return probe_kernel_write(mem, tmp_raw, count);
443}
444
445/*
446 * While we find nice hex chars, build a long_val.
447 * Return number of chars processed.
448 */
449int kgdb_hex2long(char **ptr, unsigned long *long_val)
450{
451 int hex_val;
452 int num = 0;
453 int negate = 0;
454
455 *long_val = 0;
456
457 if (**ptr == '-') {
458 negate = 1;
459 (*ptr)++;
460 }
461 while (**ptr) {
462 hex_val = hex(**ptr);
463 if (hex_val < 0)
464 break;
465
466 *long_val = (*long_val << 4) | hex_val;
467 num++;
468 (*ptr)++;
469 }
470
471 if (negate)
472 *long_val = -*long_val;
473
474 return num;
475}
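
A hedged usage sketch of kgdb_hex2long() (the buffer contents are made up for illustration): the helper consumes leading hex digits, advances the caller's cursor, and reports how many digits it ate, which is how the packet parsers below detect the ',' and ':' separators.

static int example_hex2long_usage(void)
{
        char buf[] = "1f4,";
        char *ptr = buf;
        unsigned long val = 0;
        int ndigits = kgdb_hex2long(&ptr, &val);

        /* ndigits == 3, val == 0x1f4 (500), and *ptr now points at ',' */
        return ndigits == 3 && val == 0x1f4 && *ptr == ',';
}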
476
477/* Write memory due to an 'M' or 'X' packet. */
478static int write_mem_msg(int binary)
479{
480 char *ptr = &remcom_in_buffer[1];
481 unsigned long addr;
482 unsigned long length;
483 int err;
484
485 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
486 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
487 if (binary)
488 err = kgdb_ebin2mem(ptr, (char *)addr, length);
489 else
490 err = kgdb_hex2mem(ptr, (char *)addr, length);
491 if (err)
492 return err;
493 if (CACHE_FLUSH_IS_SAFE)
494 flush_icache_range(addr, addr + length);
495 return 0;
496 }
497
498 return -EINVAL;
499}
500
501static void error_packet(char *pkt, int error)
502{
503 error = -error;
504 pkt[0] = 'E';
505 pkt[1] = hex_asc[(error / 10)];
506 pkt[2] = hex_asc[(error % 10)];
507 pkt[3] = '\0';
508}
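
Worked example (illustrative): error_packet() renders the positive errno as two decimal digits, using hex_asc[] only as a digit table, so passing -EINVAL (22 on Linux) produces the reply "E22".

static void example_error_packet(void)
{
        char pkt[4];

        error_packet(pkt, -EINVAL);     /* EINVAL is 22, so pkt becomes "E22" */
}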
509
510/*
511 * Thread ID accessors. We represent a flat TID space to GDB, where
512 * the per CPU idle threads (which under Linux all have PID 0) are
513 * remapped to negative TIDs.
514 */
515
516#define BUF_THREAD_ID_SIZE 16
517
518static char *pack_threadid(char *pkt, unsigned char *id)
519{
520 char *limit;
521
522 limit = pkt + BUF_THREAD_ID_SIZE;
523 while (pkt < limit)
524 pkt = pack_hex_byte(pkt, *id++);
525
526 return pkt;
527}
528
529static void int_to_threadref(unsigned char *id, int value)
530{
531 unsigned char *scan;
532 int i = 4;
533
534 scan = (unsigned char *)id;
535 while (i--)
536 *scan++ = 0;
537 put_unaligned_be32(value, scan);
538}
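
Worked example (illustrative): int_to_threadref() stores the TID as an 8-byte big-endian value and pack_threadid() hex-encodes those 8 bytes into BUF_THREAD_ID_SIZE characters, so TID 5 is sent on the wire as "0000000000000005".

static void example_threadref(void)
{
        unsigned char thref[8];
        char buf[BUF_THREAD_ID_SIZE + 1];

        int_to_threadref(thref, 5);             /* bytes 00 00 00 00 00 00 00 05 */
        *pack_threadid(buf, thref) = '\0';      /* buf is now "0000000000000005" */
}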
539
540static struct task_struct *getthread(struct pt_regs *regs, int tid)
541{
542 /*
543 * Non-positive TIDs are remapped to the cpu shadow information
544 */
545 if (tid == 0 || tid == -1)
546 tid = -atomic_read(&kgdb_active) - 2;
547 if (tid < -1 && tid > -NR_CPUS - 2) {
548 if (kgdb_info[-tid - 2].task)
549 return kgdb_info[-tid - 2].task;
550 else
551 return idle_task(-tid - 2);
552 }
553 if (tid <= 0) {
554 printk(KERN_ERR "KGDB: Internal thread select error\n");
555 dump_stack();
556 return NULL;
557 }
558
559 /*
560 * find_task_by_pid_ns() does not take the tasklist lock anymore
561 * but is nicely RCU locked - hence is a pretty resilient
562 * thing to use:
563 */
564 return find_task_by_pid_ns(tid, &init_pid_ns);
565}
566
567/*
568 * Some architectures need cache flushes when we set/clear a
569 * breakpoint:
570 */
571static void kgdb_flush_swbreak_addr(unsigned long addr)
572{
573 if (!CACHE_FLUSH_IS_SAFE)
574 return;
575
576 if (current->mm && current->mm->mmap_cache) {
577 flush_cache_range(current->mm->mmap_cache,
578 addr, addr + BREAK_INSTR_SIZE);
579 }
580 /* Force flush instruction cache if it was outside the mm */
581 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
582}
583
584/*
585 * SW breakpoint management:
586 */
587static int kgdb_activate_sw_breakpoints(void)
588{
589 unsigned long addr;
590 int error;
591 int ret = 0;
592 int i;
593
594 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
595 if (kgdb_break[i].state != BP_SET)
596 continue;
597
598 addr = kgdb_break[i].bpt_addr;
599 error = kgdb_arch_set_breakpoint(addr,
600 kgdb_break[i].saved_instr);
601 if (error) {
602 ret = error;
603 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
604 continue;
605 }
606
607 kgdb_flush_swbreak_addr(addr);
608 kgdb_break[i].state = BP_ACTIVE;
609 }
610 return ret;
611}
612
613static int kgdb_set_sw_break(unsigned long addr)
614{
615 int err = kgdb_validate_break_address(addr);
616 int breakno = -1;
617 int i;
618
619 if (err)
620 return err;
621
622 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
623 if ((kgdb_break[i].state == BP_SET) &&
624 (kgdb_break[i].bpt_addr == addr))
625 return -EEXIST;
626 }
627 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
628 if (kgdb_break[i].state == BP_REMOVED &&
629 kgdb_break[i].bpt_addr == addr) {
630 breakno = i;
631 break;
632 }
633 }
634
635 if (breakno == -1) {
636 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
637 if (kgdb_break[i].state == BP_UNDEFINED) {
638 breakno = i;
639 break;
640 }
641 }
642 }
643
644 if (breakno == -1)
645 return -E2BIG;
646
647 kgdb_break[breakno].state = BP_SET;
648 kgdb_break[breakno].type = BP_BREAKPOINT;
649 kgdb_break[breakno].bpt_addr = addr;
650
651 return 0;
652}
653
654static int kgdb_deactivate_sw_breakpoints(void)
655{
656 unsigned long addr;
657 int error;
658 int ret = 0;
659 int i;
660
661 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
662 if (kgdb_break[i].state != BP_ACTIVE)
663 continue;
664 addr = kgdb_break[i].bpt_addr;
665 error = kgdb_arch_remove_breakpoint(addr,
666 kgdb_break[i].saved_instr);
667 if (error) {
668 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
669 ret = error;
670 }
671
672 kgdb_flush_swbreak_addr(addr);
673 kgdb_break[i].state = BP_SET;
674 }
675 return ret;
676}
677
678static int kgdb_remove_sw_break(unsigned long addr)
679{
680 int i;
681
682 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
683 if ((kgdb_break[i].state == BP_SET) &&
684 (kgdb_break[i].bpt_addr == addr)) {
685 kgdb_break[i].state = BP_REMOVED;
686 return 0;
687 }
688 }
689 return -ENOENT;
690}
691
692int kgdb_isremovedbreak(unsigned long addr)
693{
694 int i;
695
696 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
697 if ((kgdb_break[i].state == BP_REMOVED) &&
698 (kgdb_break[i].bpt_addr == addr))
699 return 1;
700 }
701 return 0;
702}
703
704static int remove_all_break(void)
705{
706 unsigned long addr;
707 int error;
708 int i;
709
710 /* Clear memory breakpoints. */
711 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
712 if (kgdb_break[i].state != BP_ACTIVE)
713 goto setundefined;
714 addr = kgdb_break[i].bpt_addr;
715 error = kgdb_arch_remove_breakpoint(addr,
716 kgdb_break[i].saved_instr);
717 if (error)
718 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
719 addr);
720setundefined:
721 kgdb_break[i].state = BP_UNDEFINED;
722 }
723
724 /* Clear hardware breakpoints. */
725 if (arch_kgdb_ops.remove_all_hw_break)
726 arch_kgdb_ops.remove_all_hw_break();
727
728 return 0;
729}
730
731/*
732 * Remap normal tasks to their real PID,
733 * CPU shadow threads are mapped to -CPU - 2
734 */
735static inline int shadow_pid(int realpid)
736{
737 if (realpid)
738 return realpid;
739
740 return -raw_smp_processor_id() - 2;
741}
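
A small sketch of the remapping (illustrative; PID 1234 is made up): ordinary tasks keep their PID, while the per-CPU idle/shadow threads become -CPU - 2, which getthread() above inverts with -tid - 2.

static void example_shadow_pid(void)
{
        int real = shadow_pid(1234);    /* == 1234: real PIDs pass through      */
        int idle = shadow_pid(0);       /* == -raw_smp_processor_id() - 2:      */
                                        /*    CPU 0 -> -2, CPU 1 -> -3, ...     */
        (void)real;
        (void)idle;
}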
742
743static char gdbmsgbuf[BUFMAX + 1];
744
745static void kgdb_msg_write(const char *s, int len)
746{
747 char *bufptr;
748 int wcount;
749 int i;
750
751 /* 'O'utput */
752 gdbmsgbuf[0] = 'O';
753
754 /* Fill and send buffers... */
755 while (len > 0) {
756 bufptr = gdbmsgbuf + 1;
757
758 /* Calculate how many this time */
759 if ((len << 1) > (BUFMAX - 2))
760 wcount = (BUFMAX - 2) >> 1;
761 else
762 wcount = len;
763
764 /* Pack in hex chars */
765 for (i = 0; i < wcount; i++)
766 bufptr = pack_hex_byte(bufptr, s[i]);
767 *bufptr = '\0';
768
769 /* Move up */
770 s += wcount;
771 len -= wcount;
772
773 /* Write packet */
774 put_packet(gdbmsgbuf);
775 }
776}
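
Worked example (illustrative): kgdb_msg_write() hex-encodes the text and wraps it in an 'O' (console output) packet, so writing "Hi" ('H' = 0x48, 'i' = 0x69) puts "$O4869#2a" on the wire.

static void example_msg_write(void)
{
        kgdb_msg_write("Hi", 2);        /* transmitted as "$O4869#2a" */
}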
777
778/*
779 * Return true if there is a valid kgdb I/O module. Also if no
780 * debugger is attached a message can be printed to the console about
781 * waiting for the debugger to attach.
782 *
 783 * The print_wait argument should only be true when called from inside
784 * the core kgdb_handle_exception, because it will wait for the
785 * debugger to attach.
786 */
787static int kgdb_io_ready(int print_wait)
788{
789 if (!kgdb_io_ops)
790 return 0;
791 if (kgdb_connected)
792 return 1;
793 if (atomic_read(&kgdb_setting_breakpoint))
794 return 1;
795 if (print_wait)
796 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
797 return 1;
798}
799
800/*
801 * All the functions that start with gdb_cmd are the various
802 * operations to implement the handlers for the gdbserial protocol
803 * where KGDB is communicating with an external debugger
804 */
805
806/* Handle the '?' status packets */
807static void gdb_cmd_status(struct kgdb_state *ks)
808{
809 /*
810 * We know that this packet is only sent
811 * during initial connect. So to be safe,
812 * we clear out our breakpoints now in case
813 * GDB is reconnecting.
814 */
815 remove_all_break();
816
817 remcom_out_buffer[0] = 'S';
818 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
819}
820
821/* Handle the 'g' get registers request */
822static void gdb_cmd_getregs(struct kgdb_state *ks)
823{
824 struct task_struct *thread;
825 void *local_debuggerinfo;
826 int i;
827
828 thread = kgdb_usethread;
829 if (!thread) {
830 thread = kgdb_info[ks->cpu].task;
831 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
832 } else {
833 local_debuggerinfo = NULL;
834 for_each_online_cpu(i) {
835 /*
 836 * Try to find the task on some other
 837 * CPU (or possibly this one). If we
 838 * do not find a matching task, we
 839 * approximate the results.
840 */
841 if (thread == kgdb_info[i].task)
842 local_debuggerinfo = kgdb_info[i].debuggerinfo;
843 }
844 }
845
846 /*
847 * All threads that don't have debuggerinfo should be
848 * in schedule() sleeping, since all other CPUs
849 * are in kgdb_wait, and thus have debuggerinfo.
850 */
851 if (local_debuggerinfo) {
852 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
853 } else {
854 /*
855 * Pull stuff saved during switch_to; nothing
856 * else is accessible (or even particularly
857 * relevant).
858 *
859 * This should be enough for a stack trace.
860 */
861 sleeping_thread_to_gdb_regs(gdb_regs, thread);
862 }
863 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
864}
865
866/* Handle the 'G' set registers request */
867static void gdb_cmd_setregs(struct kgdb_state *ks)
868{
869 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
870
871 if (kgdb_usethread && kgdb_usethread != current) {
872 error_packet(remcom_out_buffer, -EINVAL);
873 } else {
874 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
875 strcpy(remcom_out_buffer, "OK");
876 }
877}
878
879/* Handle the 'm' memory read bytes */
880static void gdb_cmd_memread(struct kgdb_state *ks)
881{
882 char *ptr = &remcom_in_buffer[1];
883 unsigned long length;
884 unsigned long addr;
885 int err;
886
887 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
888 kgdb_hex2long(&ptr, &length) > 0) {
889 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
890 if (err)
891 error_packet(remcom_out_buffer, err);
892 } else {
893 error_packet(remcom_out_buffer, -EINVAL);
894 }
895}
896
897/* Handle the 'M' memory write bytes */
898static void gdb_cmd_memwrite(struct kgdb_state *ks)
899{
900 int err = write_mem_msg(0);
901
902 if (err)
903 error_packet(remcom_out_buffer, err);
904 else
905 strcpy(remcom_out_buffer, "OK");
906}
907
908/* Handle the 'X' memory binary write bytes */
909static void gdb_cmd_binwrite(struct kgdb_state *ks)
910{
911 int err = write_mem_msg(1);
912
913 if (err)
914 error_packet(remcom_out_buffer, err);
915 else
916 strcpy(remcom_out_buffer, "OK");
917}
918
919/* Handle the 'D' or 'k', detach or kill packets */
920static void gdb_cmd_detachkill(struct kgdb_state *ks)
921{
922 int error;
923
924 /* The detach case */
925 if (remcom_in_buffer[0] == 'D') {
926 error = remove_all_break();
927 if (error < 0) {
928 error_packet(remcom_out_buffer, error);
929 } else {
930 strcpy(remcom_out_buffer, "OK");
931 kgdb_connected = 0;
932 }
933 put_packet(remcom_out_buffer);
934 } else {
935 /*
936 * Assume the kill case, with no exit code checking,
937 * trying to force detach the debugger:
938 */
939 remove_all_break();
940 kgdb_connected = 0;
941 }
942}
943
944/* Handle the 'R' reboot packets */
945static int gdb_cmd_reboot(struct kgdb_state *ks)
946{
947 /* For now, only honor R0 */
948 if (strcmp(remcom_in_buffer, "R0") == 0) {
949 printk(KERN_CRIT "Executing emergency reboot\n");
950 strcpy(remcom_out_buffer, "OK");
951 put_packet(remcom_out_buffer);
952
953 /*
954 * Execution should not return from
955 * machine_emergency_restart()
956 */
957 machine_emergency_restart();
958 kgdb_connected = 0;
959
960 return 1;
961 }
962 return 0;
963}
964
965/* Handle the 'q' query packets */
966static void gdb_cmd_query(struct kgdb_state *ks)
967{
968 struct task_struct *g;
969 struct task_struct *p;
970 unsigned char thref[8];
971 char *ptr;
972 int i;
973 int cpu;
974 int finished = 0;
975
976 switch (remcom_in_buffer[1]) {
977 case 's':
978 case 'f':
979 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
980 error_packet(remcom_out_buffer, -EINVAL);
981 break;
982 }
983
984 i = 0;
985 remcom_out_buffer[0] = 'm';
986 ptr = remcom_out_buffer + 1;
987 if (remcom_in_buffer[1] == 'f') {
988 /* Each cpu is a shadow thread */
989 for_each_online_cpu(cpu) {
990 ks->thr_query = 0;
991 int_to_threadref(thref, -cpu - 2);
992 pack_threadid(ptr, thref);
993 ptr += BUF_THREAD_ID_SIZE;
994 *(ptr++) = ',';
995 i++;
996 }
997 }
998
999 do_each_thread(g, p) {
1000 if (i >= ks->thr_query && !finished) {
1001 int_to_threadref(thref, p->pid);
1002 pack_threadid(ptr, thref);
1003 ptr += BUF_THREAD_ID_SIZE;
1004 *(ptr++) = ',';
1005 ks->thr_query++;
1006 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
1007 finished = 1;
1008 }
1009 i++;
1010 } while_each_thread(g, p);
1011
1012 *(--ptr) = '\0';
1013 break;
1014
1015 case 'C':
1016 /* Current thread id */
1017 strcpy(remcom_out_buffer, "QC");
1018 ks->threadid = shadow_pid(current->pid);
1019 int_to_threadref(thref, ks->threadid);
1020 pack_threadid(remcom_out_buffer + 2, thref);
1021 break;
1022 case 'T':
1023 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
1024 error_packet(remcom_out_buffer, -EINVAL);
1025 break;
1026 }
1027 ks->threadid = 0;
1028 ptr = remcom_in_buffer + 17;
1029 kgdb_hex2long(&ptr, &ks->threadid);
1030 if (!getthread(ks->linux_regs, ks->threadid)) {
1031 error_packet(remcom_out_buffer, -EINVAL);
1032 break;
1033 }
1034 if ((int)ks->threadid > 0) {
1035 kgdb_mem2hex(getthread(ks->linux_regs,
1036 ks->threadid)->comm,
1037 remcom_out_buffer, 16);
1038 } else {
1039 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1040
1041 sprintf(tmpstr, "shadowCPU%d",
1042 (int)(-ks->threadid - 2));
1043 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1044 }
1045 break;
1046 }
1047}
1048
1049/* Handle the 'H' task query packets */
1050static void gdb_cmd_task(struct kgdb_state *ks)
1051{
1052 struct task_struct *thread;
1053 char *ptr;
1054
1055 switch (remcom_in_buffer[1]) {
1056 case 'g':
1057 ptr = &remcom_in_buffer[2];
1058 kgdb_hex2long(&ptr, &ks->threadid);
1059 thread = getthread(ks->linux_regs, ks->threadid);
1060 if (!thread && ks->threadid > 0) {
1061 error_packet(remcom_out_buffer, -EINVAL);
1062 break;
1063 }
1064 kgdb_usethread = thread;
1065 ks->kgdb_usethreadid = ks->threadid;
1066 strcpy(remcom_out_buffer, "OK");
1067 break;
1068 case 'c':
1069 ptr = &remcom_in_buffer[2];
1070 kgdb_hex2long(&ptr, &ks->threadid);
1071 if (!ks->threadid) {
1072 kgdb_contthread = NULL;
1073 } else {
1074 thread = getthread(ks->linux_regs, ks->threadid);
1075 if (!thread && ks->threadid > 0) {
1076 error_packet(remcom_out_buffer, -EINVAL);
1077 break;
1078 }
1079 kgdb_contthread = thread;
1080 }
1081 strcpy(remcom_out_buffer, "OK");
1082 break;
1083 }
1084}
1085
1086/* Handle the 'T' thread query packets */
1087static void gdb_cmd_thread(struct kgdb_state *ks)
1088{
1089 char *ptr = &remcom_in_buffer[1];
1090 struct task_struct *thread;
1091
1092 kgdb_hex2long(&ptr, &ks->threadid);
1093 thread = getthread(ks->linux_regs, ks->threadid);
1094 if (thread)
1095 strcpy(remcom_out_buffer, "OK");
1096 else
1097 error_packet(remcom_out_buffer, -EINVAL);
1098}
1099
1100/* Handle the 'z' or 'Z' breakpoint remove or set packets */
1101static void gdb_cmd_break(struct kgdb_state *ks)
1102{
1103 /*
 1104 * Since GDB 5.3 the remote protocol has defined '0' as a software
 1105 * breakpoint and '1' as a hardware breakpoint, so follow that.
1106 */
1107 char *bpt_type = &remcom_in_buffer[1];
1108 char *ptr = &remcom_in_buffer[2];
1109 unsigned long addr;
1110 unsigned long length;
1111 int error = 0;
1112
1113 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
1114 /* Unsupported */
1115 if (*bpt_type > '4')
1116 return;
1117 } else {
1118 if (*bpt_type != '0' && *bpt_type != '1')
1119 /* Unsupported. */
1120 return;
1121 }
1122
1123 /*
1124 * Test if this is a hardware breakpoint, and
1125 * if we support it:
1126 */
1127 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
1128 /* Unsupported. */
1129 return;
1130
1131 if (*(ptr++) != ',') {
1132 error_packet(remcom_out_buffer, -EINVAL);
1133 return;
1134 }
1135 if (!kgdb_hex2long(&ptr, &addr)) {
1136 error_packet(remcom_out_buffer, -EINVAL);
1137 return;
1138 }
1139 if (*(ptr++) != ',' ||
1140 !kgdb_hex2long(&ptr, &length)) {
1141 error_packet(remcom_out_buffer, -EINVAL);
1142 return;
1143 }
1144
1145 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
1146 error = kgdb_set_sw_break(addr);
1147 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
1148 error = kgdb_remove_sw_break(addr);
1149 else if (remcom_in_buffer[0] == 'Z')
1150 error = arch_kgdb_ops.set_hw_breakpoint(addr,
1151 (int)length, *bpt_type - '0');
1152 else if (remcom_in_buffer[0] == 'z')
1153 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
1154 (int) length, *bpt_type - '0');
1155
1156 if (error == 0)
1157 strcpy(remcom_out_buffer, "OK");
1158 else
1159 error_packet(remcom_out_buffer, error);
1160}
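
A hedged sketch of the packet shapes this handler accepts (the address below is hypothetical): "Z0,<addr>,<len>" plants a software breakpoint via kgdb_set_sw_break(), "z0,..." removes it, and types '1'..'4' are routed to the arch hardware-breakpoint hooks when available.

static void example_gdb_cmd_break(struct kgdb_state *ks)
{
        strcpy(remcom_in_buffer, "Z0,c01a2b3c,4"); /* hypothetical address */
        gdb_cmd_break(ks);                         /* replies "OK" or "Exx" */
}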
1161
1162/* Handle the 'C' signal / exception passing packets */
1163static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1164{
1165 /* C09 == pass exception
1166 * C15 == detach kgdb, pass exception
1167 */
1168 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
1169
1170 ks->pass_exception = 1;
1171 remcom_in_buffer[0] = 'c';
1172
1173 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
1174
1175 ks->pass_exception = 1;
1176 remcom_in_buffer[0] = 'D';
1177 remove_all_break();
1178 kgdb_connected = 0;
1179 return 1;
1180
1181 } else {
1182 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1183 " and 15 (pass and disconnect)\n"
1184 "Executing a continue without signal passing\n", 0);
1185 remcom_in_buffer[0] = 'c';
1186 }
1187
1188 /* Indicate fall through */
1189 return -1;
1190}
1191
1192/*
 1193 * This function performs all gdbserial command processing
1194 */
1195static int gdb_serial_stub(struct kgdb_state *ks)
1196{
1197 int error = 0;
1198 int tmp;
1199
1200 /* Clear the out buffer. */
1201 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1202
1203 if (kgdb_connected) {
1204 unsigned char thref[8];
1205 char *ptr;
1206
1207 /* Reply to host that an exception has occurred */
1208 ptr = remcom_out_buffer;
1209 *ptr++ = 'T';
1210 ptr = pack_hex_byte(ptr, ks->signo);
1211 ptr += strlen(strcpy(ptr, "thread:"));
1212 int_to_threadref(thref, shadow_pid(current->pid));
1213 ptr = pack_threadid(ptr, thref);
1214 *ptr++ = ';';
1215 put_packet(remcom_out_buffer);
1216 }
1217
1218 kgdb_usethread = kgdb_info[ks->cpu].task;
1219 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
1220 ks->pass_exception = 0;
1221
1222 while (1) {
1223 error = 0;
1224
1225 /* Clear the out buffer. */
1226 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1227
1228 get_packet(remcom_in_buffer);
1229
1230 switch (remcom_in_buffer[0]) {
1231 case '?': /* gdbserial status */
1232 gdb_cmd_status(ks);
1233 break;
1234 case 'g': /* return the value of the CPU registers */
1235 gdb_cmd_getregs(ks);
1236 break;
1237 case 'G': /* set the value of the CPU registers - return OK */
1238 gdb_cmd_setregs(ks);
1239 break;
1240 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
1241 gdb_cmd_memread(ks);
1242 break;
1243 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1244 gdb_cmd_memwrite(ks);
1245 break;
1246 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1247 gdb_cmd_binwrite(ks);
1248 break;
1249 /* kill or detach. KGDB should treat this like a
1250 * continue.
1251 */
1252 case 'D': /* Debugger detach */
1253 case 'k': /* Debugger detach via kill */
1254 gdb_cmd_detachkill(ks);
1255 goto default_handle;
1256 case 'R': /* Reboot */
1257 if (gdb_cmd_reboot(ks))
1258 goto default_handle;
1259 break;
1260 case 'q': /* query command */
1261 gdb_cmd_query(ks);
1262 break;
1263 case 'H': /* task related */
1264 gdb_cmd_task(ks);
1265 break;
1266 case 'T': /* Query thread status */
1267 gdb_cmd_thread(ks);
1268 break;
1269 case 'z': /* Break point remove */
1270 case 'Z': /* Break point set */
1271 gdb_cmd_break(ks);
1272 break;
1273 case 'C': /* Exception passing */
1274 tmp = gdb_cmd_exception_pass(ks);
1275 if (tmp > 0)
1276 goto default_handle;
1277 if (tmp == 0)
1278 break;
1279 /* Fall through on tmp < 0 */
1280 case 'c': /* Continue packet */
1281 case 's': /* Single step packet */
1282 if (kgdb_contthread && kgdb_contthread != current) {
1283 /* Can't switch threads in kgdb */
1284 error_packet(remcom_out_buffer, -EINVAL);
1285 break;
1286 }
1287 kgdb_activate_sw_breakpoints();
1288 /* Fall through to default processing */
1289 default:
1290default_handle:
1291 error = kgdb_arch_handle_exception(ks->ex_vector,
1292 ks->signo,
1293 ks->err_code,
1294 remcom_in_buffer,
1295 remcom_out_buffer,
1296 ks->linux_regs);
1297 /*
1298 * Leave cmd processing on error, detach,
1299 * kill, continue, or single step.
1300 */
1301 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
1302 remcom_in_buffer[0] == 'k') {
1303 error = 0;
1304 goto kgdb_exit;
1305 }
1306
1307 }
1308
1309 /* reply to the request */
1310 put_packet(remcom_out_buffer);
1311 }
1312
1313kgdb_exit:
1314 if (ks->pass_exception)
1315 error = 1;
1316 return error;
1317}
1318
1319static int kgdb_reenter_check(struct kgdb_state *ks)
1320{
1321 unsigned long addr;
1322
1323 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
1324 return 0;
1325
1326 /* Panic on recursive debugger calls: */
1327 exception_level++;
1328 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
1329 kgdb_deactivate_sw_breakpoints();
1330
1331 /*
 1332 * If the breakpoint at the place where the exception occurred was
 1333 * removed OK, try to recover and print a warning to the end user,
 1334 * because the user planted a breakpoint in a place that KGDB needs
 1335 * in order to function.
1336 */
1337 if (kgdb_remove_sw_break(addr) == 0) {
1338 exception_level = 0;
1339 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1340 kgdb_activate_sw_breakpoints();
1341 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
1342 addr);
1343 WARN_ON_ONCE(1);
1344
1345 return 1;
1346 }
1347 remove_all_break();
1348 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1349
1350 if (exception_level > 1) {
1351 dump_stack();
1352 panic("Recursive entry to debugger");
1353 }
1354
1355 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
1356 dump_stack();
1357 panic("Recursive entry to debugger");
1358
1359 return 1;
1360}
1361
1362static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
1363{
1364 unsigned long flags;
1365 int sstep_tries = 100;
1366 int error = 0;
1367 int i, cpu;
1368 int trace_on = 0;
1369acquirelock:
1370 /*
1371 * Interrupts will be restored by the 'trap return' code, except when
1372 * single stepping.
1373 */
1374 local_irq_save(flags);
1375
1376 cpu = ks->cpu;
1377 kgdb_info[cpu].debuggerinfo = regs;
1378 kgdb_info[cpu].task = current;
1379 /*
1380 * Make sure the above info reaches the primary CPU before
1381 * our cpu_in_kgdb[] flag setting does:
1382 */
1383 atomic_inc(&cpu_in_kgdb[cpu]);
1384
1385 /*
 1386 * The CPU will loop if it is a slave, or until it can become the
 1387 * kgdb master cpu and acquire the kgdb_active lock:
1388 */
1389 while (1) {
1390 if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
1391 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
1392 break;
1393 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
1394 if (!atomic_read(&passive_cpu_wait[cpu]))
1395 goto return_normal;
1396 } else {
1397return_normal:
1398 /* Return to normal operation by executing any
1399 * hw breakpoint fixup.
1400 */
1401 if (arch_kgdb_ops.correct_hw_break)
1402 arch_kgdb_ops.correct_hw_break();
1403 if (trace_on)
1404 tracing_on();
1405 atomic_dec(&cpu_in_kgdb[cpu]);
1406 touch_softlockup_watchdog_sync();
1407 clocksource_touch_watchdog();
1408 local_irq_restore(flags);
1409 return 0;
1410 }
1411 cpu_relax();
1412 }
1413
1414 /*
1415 * For single stepping, try to only enter on the processor
 1416 * that was single stepping.  To guard against a deadlock, the
1417 * kernel will only try for the value of sstep_tries before
1418 * giving up and continuing on.
1419 */
1420 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1421 (kgdb_info[cpu].task &&
1422 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1423 atomic_set(&kgdb_active, -1);
1424 touch_softlockup_watchdog_sync();
1425 clocksource_touch_watchdog();
1426 local_irq_restore(flags);
1427
1428 goto acquirelock;
1429 }
1430
1431 if (!kgdb_io_ready(1)) {
1432 error = 1;
1433 goto kgdb_restore; /* No I/O connection, so resume the system */
1434 }
1435
1436 /*
1437 * Don't enter if we have hit a removed breakpoint.
1438 */
1439 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
1440 goto kgdb_restore;
1441
1442 /* Call the I/O driver's pre_exception routine */
1443 if (kgdb_io_ops->pre_exception)
1444 kgdb_io_ops->pre_exception();
1445
1446 kgdb_disable_hw_debug(ks->linux_regs);
1447
1448 /*
1449 * Get the passive CPU lock which will hold all the non-primary
 1450 * CPUs in a spin state while the debugger is active
1451 */
1452 if (!kgdb_single_step) {
1453 for (i = 0; i < NR_CPUS; i++)
1454 atomic_inc(&passive_cpu_wait[i]);
1455 }
1456
1457#ifdef CONFIG_SMP
1458 /* Signal the other CPUs to enter kgdb_wait() */
1459 if ((!kgdb_single_step) && kgdb_do_roundup)
1460 kgdb_roundup_cpus(flags);
1461#endif
1462
1463 /*
1464 * Wait for the other CPUs to be notified and be waiting for us:
1465 */
1466 for_each_online_cpu(i) {
1467 while (!atomic_read(&cpu_in_kgdb[i]))
1468 cpu_relax();
1469 }
1470
1471 /*
1472 * At this point the primary processor is completely
1473 * in the debugger and all secondary CPUs are quiescent
1474 */
1475 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1476 kgdb_deactivate_sw_breakpoints();
1477 kgdb_single_step = 0;
1478 kgdb_contthread = current;
1479 exception_level = 0;
1480 trace_on = tracing_is_on();
1481 if (trace_on)
1482 tracing_off();
1483
1484 /* Talk to debugger with gdbserial protocol */
1485 error = gdb_serial_stub(ks);
1486
1487 /* Call the I/O driver's post_exception routine */
1488 if (kgdb_io_ops->post_exception)
1489 kgdb_io_ops->post_exception();
1490
1491 atomic_dec(&cpu_in_kgdb[ks->cpu]);
1492
1493 if (!kgdb_single_step) {
1494 for (i = NR_CPUS-1; i >= 0; i--)
1495 atomic_dec(&passive_cpu_wait[i]);
1496 /*
1497 * Wait till all the CPUs have quit
1498 * from the debugger.
1499 */
1500 for_each_online_cpu(i) {
1501 while (atomic_read(&cpu_in_kgdb[i]))
1502 cpu_relax();
1503 }
1504 }
1505
1506kgdb_restore:
1507 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1508 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1509 if (kgdb_info[sstep_cpu].task)
1510 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1511 else
1512 kgdb_sstep_pid = 0;
1513 }
1514 if (trace_on)
1515 tracing_on();
1516 /* Free kgdb_active */
1517 atomic_set(&kgdb_active, -1);
1518 touch_softlockup_watchdog_sync();
1519 clocksource_touch_watchdog();
1520 local_irq_restore(flags);
1521
1522 return error;
1523}
1524
1525/*
1526 * kgdb_handle_exception() - main entry point from a kernel exception
1527 *
1528 * Locking hierarchy:
1529 * interface locks, if any (begin_session)
1530 * kgdb lock (kgdb_active)
1531 */
1532int
1533kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1534{
1535 struct kgdb_state kgdb_var;
1536 struct kgdb_state *ks = &kgdb_var;
1537 int ret;
1538
1539 ks->cpu = raw_smp_processor_id();
1540 ks->ex_vector = evector;
1541 ks->signo = signo;
1542 ks->ex_vector = evector;
1543 ks->err_code = ecode;
1544 ks->kgdb_usethreadid = 0;
1545 ks->linux_regs = regs;
1546
1547 if (kgdb_reenter_check(ks))
1548 return 0; /* Ouch, double exception ! */
1549 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
1550 ret = kgdb_cpu_enter(ks, regs);
1551 kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
1552 return ret;
1553}
1554
1555int kgdb_nmicallback(int cpu, void *regs)
1556{
1557#ifdef CONFIG_SMP
1558 struct kgdb_state kgdb_var;
1559 struct kgdb_state *ks = &kgdb_var;
1560
1561 memset(ks, 0, sizeof(struct kgdb_state));
1562 ks->cpu = cpu;
1563 ks->linux_regs = regs;
1564
1565 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1566 atomic_read(&kgdb_active) != -1 &&
1567 atomic_read(&kgdb_active) != cpu) {
1568 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
1569 kgdb_cpu_enter(ks, regs);
1570 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
1571 return 0;
1572 }
1573#endif
1574 return 1;
1575}
1576
1577static void kgdb_console_write(struct console *co, const char *s,
1578 unsigned count)
1579{
1580 unsigned long flags;
1581
 1582	/* If we're debugging, or KGDB has not connected, don't try
 1583	 * to print. */
1584 if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
1585 return;
1586
1587 local_irq_save(flags);
1588 kgdb_msg_write(s, count);
1589 local_irq_restore(flags);
1590}
1591
1592static struct console kgdbcons = {
1593 .name = "kgdb",
1594 .write = kgdb_console_write,
1595 .flags = CON_PRINTBUFFER | CON_ENABLED,
1596 .index = -1,
1597};
1598
1599#ifdef CONFIG_MAGIC_SYSRQ
1600static void sysrq_handle_gdb(int key, struct tty_struct *tty)
1601{
1602 if (!kgdb_io_ops) {
1603 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
1604 return;
1605 }
1606 if (!kgdb_connected)
1607 printk(KERN_CRIT "Entering KGDB\n");
1608
1609 kgdb_breakpoint();
1610}
1611
1612static struct sysrq_key_op sysrq_gdb_op = {
1613 .handler = sysrq_handle_gdb,
1614 .help_msg = "debug(G)",
1615 .action_msg = "DEBUG",
1616};
1617#endif
1618
1619static void kgdb_register_callbacks(void)
1620{
1621 if (!kgdb_io_module_registered) {
1622 kgdb_io_module_registered = 1;
1623 kgdb_arch_init();
1624#ifdef CONFIG_MAGIC_SYSRQ
1625 register_sysrq_key('g', &sysrq_gdb_op);
1626#endif
1627 if (kgdb_use_con && !kgdb_con_registered) {
1628 register_console(&kgdbcons);
1629 kgdb_con_registered = 1;
1630 }
1631 }
1632}
1633
1634static void kgdb_unregister_callbacks(void)
1635{
1636 /*
1637 * When this routine is called KGDB should unregister from the
1638 * panic handler and clean up, making sure it is not handling any
1639 * break exceptions at the time.
1640 */
1641 if (kgdb_io_module_registered) {
1642 kgdb_io_module_registered = 0;
1643 kgdb_arch_exit();
1644#ifdef CONFIG_MAGIC_SYSRQ
1645 unregister_sysrq_key('g', &sysrq_gdb_op);
1646#endif
1647 if (kgdb_con_registered) {
1648 unregister_console(&kgdbcons);
1649 kgdb_con_registered = 0;
1650 }
1651 }
1652}
1653
1654static void kgdb_initial_breakpoint(void)
1655{
1656 kgdb_break_asap = 0;
1657
1658 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
1659 kgdb_breakpoint();
1660}
1661
1662/**
1663 * kgdb_register_io_module - register KGDB IO module
1664 * @new_kgdb_io_ops: the io ops vector
1665 *
1666 * Register it with the KGDB core.
1667 */
1668int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
1669{
1670 int err;
1671
1672 spin_lock(&kgdb_registration_lock);
1673
1674 if (kgdb_io_ops) {
1675 spin_unlock(&kgdb_registration_lock);
1676
1677 printk(KERN_ERR "kgdb: Another I/O driver is already "
1678 "registered with KGDB.\n");
1679 return -EBUSY;
1680 }
1681
1682 if (new_kgdb_io_ops->init) {
1683 err = new_kgdb_io_ops->init();
1684 if (err) {
1685 spin_unlock(&kgdb_registration_lock);
1686 return err;
1687 }
1688 }
1689
1690 kgdb_io_ops = new_kgdb_io_ops;
1691
1692 spin_unlock(&kgdb_registration_lock);
1693
1694 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
1695 new_kgdb_io_ops->name);
1696
1697 /* Arm KGDB now. */
1698 kgdb_register_callbacks();
1699
1700 if (kgdb_break_asap)
1701 kgdb_initial_breakpoint();
1702
1703 return 0;
1704}
1705EXPORT_SYMBOL_GPL(kgdb_register_io_module);
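
Registration sketch (hypothetical driver, assuming the struct kgdb_io layout used elsewhere in this file): a minimal polled I/O driver only needs a name plus read_char/write_char; init, flush and the pre/post_exception hooks are optional. my_uart_poll_getchar()/my_uart_poll_putchar() are placeholders for real hardware accessors.

static int example_read_char(void)
{
        return my_uart_poll_getchar();          /* hypothetical UART accessor */
}

static void example_write_char(u8 c)
{
        my_uart_poll_putchar(c);                /* hypothetical UART accessor */
}

static struct kgdb_io example_kgdb_io_ops = {
        .name           = "example_kgdb_io",
        .read_char      = example_read_char,
        .write_char     = example_write_char,
};

static int __init example_kgdb_io_init(void)
{
        return kgdb_register_io_module(&example_kgdb_io_ops);
}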
1706
1707/**
 1708 * kgdb_unregister_io_module - unregister KGDB IO module
1709 * @old_kgdb_io_ops: the io ops vector
1710 *
1711 * Unregister it with the KGDB core.
1712 */
1713void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
1714{
1715 BUG_ON(kgdb_connected);
1716
1717 /*
1718 * KGDB is no longer able to communicate out, so
1719 * unregister our callbacks and reset state.
1720 */
1721 kgdb_unregister_callbacks();
1722
1723 spin_lock(&kgdb_registration_lock);
1724
1725 WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
1726 kgdb_io_ops = NULL;
1727
1728 spin_unlock(&kgdb_registration_lock);
1729
1730 printk(KERN_INFO
1731 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1732 old_kgdb_io_ops->name);
1733}
1734EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1735
1736/**
1737 * kgdb_breakpoint - generate breakpoint exception
1738 *
1739 * This function will generate a breakpoint exception. It is used at the
1740 * beginning of a program to sync up with a debugger and can be used
1741 * otherwise as a quick means to stop program execution and "break" into
1742 * the debugger.
1743 */
1744void kgdb_breakpoint(void)
1745{
1746 atomic_inc(&kgdb_setting_breakpoint);
1747 wmb(); /* Sync point before breakpoint */
1748 arch_kgdb_breakpoint();
1749 wmb(); /* Sync point after breakpoint */
1750 atomic_dec(&kgdb_setting_breakpoint);
1751}
1752EXPORT_SYMBOL_GPL(kgdb_breakpoint);
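
Usage sketch (hypothetical condition check): code that wants to hand control to an attached gdb at a precise point can simply call kgdb_breakpoint().

static void example_debug_trap(bool saw_corruption)
{
        if (saw_corruption)
                kgdb_breakpoint();      /* stop here and talk to gdb */
}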
1753
1754static int __init opt_kgdb_wait(char *str)
1755{
1756 kgdb_break_asap = 1;
1757
1758 if (kgdb_io_module_registered)
1759 kgdb_initial_breakpoint();
1760
1761 return 0;
1762}
1763
1764early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bf0e231d9702..9cd0591c96a2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...)
116 116
117 trace_module_request(module_name, wait, _RET_IP_); 117 trace_module_request(module_name, wait, _RET_IP_);
118 118
119 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper_fns(modprobe_path, argv, envp,
120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
121 NULL, NULL, NULL);
122
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return ret; 124 return ret;
123} 125}
124EXPORT_SYMBOL(__request_module); 126EXPORT_SYMBOL(__request_module);
125#endif /* CONFIG_MODULES */ 127#endif /* CONFIG_MODULES */
126 128
127struct subprocess_info {
128 struct work_struct work;
129 struct completion *complete;
130 struct cred *cred;
131 char *path;
132 char **argv;
133 char **envp;
134 enum umh_wait wait;
135 int retval;
136 struct file *stdin;
137 void (*cleanup)(char **argv, char **envp);
138};
139
140/* 129/*
141 * This is the task which runs the usermode application 130 * This is the task which runs the usermode application
142 */ 131 */
@@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data)
145 struct subprocess_info *sub_info = data; 134 struct subprocess_info *sub_info = data;
146 int retval; 135 int retval;
147 136
148 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
149
150 /* Unblock all signals */
151 spin_lock_irq(&current->sighand->siglock); 137 spin_lock_irq(&current->sighand->siglock);
152 flush_signal_handlers(current, 1); 138 flush_signal_handlers(current, 1);
153 sigemptyset(&current->blocked);
154 recalc_sigpending();
155 spin_unlock_irq(&current->sighand->siglock); 139 spin_unlock_irq(&current->sighand->siglock);
156 140
157 /* Install the credentials */
158 commit_creds(sub_info->cred);
159 sub_info->cred = NULL;
160
161 /* Install input pipe when needed */
162 if (sub_info->stdin) {
163 struct files_struct *f = current->files;
164 struct fdtable *fdt;
165 /* no races because files should be private here */
166 sys_close(0);
167 fd_install(0, sub_info->stdin);
168 spin_lock(&f->file_lock);
169 fdt = files_fdtable(f);
170 FD_SET(0, fdt->open_fds);
171 FD_CLR(0, fdt->close_on_exec);
172 spin_unlock(&f->file_lock);
173
174 /* and disallow core files too */
175 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
176 }
177
178 /* We can run anywhere, unlike our parent keventd(). */ 141 /* We can run anywhere, unlike our parent keventd(). */
179 set_cpus_allowed_ptr(current, cpu_all_mask); 142 set_cpus_allowed_ptr(current, cpu_all_mask);
180 143
@@ -184,9 +147,18 @@ static int ____call_usermodehelper(void *data)
184 */ 147 */
185 set_user_nice(current, 0); 148 set_user_nice(current, 0);
186 149
187 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 150 if (sub_info->init) {
151 retval = sub_info->init(sub_info);
152 if (retval)
153 goto fail;
154 }
155
156 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp);
188 159
189 /* Exec failed? */ 160 /* Exec failed? */
161fail:
190 sub_info->retval = retval; 162 sub_info->retval = retval;
191 do_exit(0); 163 do_exit(0);
192} 164}
@@ -194,9 +166,7 @@ static int ____call_usermodehelper(void *data)
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 166void call_usermodehelper_freeinfo(struct subprocess_info *info)
195{ 167{
196 if (info->cleanup) 168 if (info->cleanup)
197 (*info->cleanup)(info->argv, info->envp); 169 (*info->cleanup)(info);
198 if (info->cred)
199 put_cred(info->cred);
200 kfree(info); 170 kfree(info);
201} 171}
202EXPORT_SYMBOL(call_usermodehelper_freeinfo); 172EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -207,16 +177,16 @@ static int wait_for_helper(void *data)
207 struct subprocess_info *sub_info = data; 177 struct subprocess_info *sub_info = data;
208 pid_t pid; 178 pid_t pid;
209 179
210 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't 180 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
211 * populate the status, but will return -ECHILD. */ 181 spin_lock_irq(&current->sighand->siglock);
212 allow_signal(SIGCHLD); 182 current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
183 spin_unlock_irq(&current->sighand->siglock);
213 184
214 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 185 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
215 if (pid < 0) { 186 if (pid < 0) {
216 sub_info->retval = pid; 187 sub_info->retval = pid;
217 } else { 188 } else {
218 int ret; 189 int ret = -ECHILD;
219
220 /* 190 /*
221 * Normally it is bogus to call wait4() from in-kernel because 191 * Normally it is bogus to call wait4() from in-kernel because
222 * wait4() wants to write the exit code to a userspace address. 192 * wait4() wants to write the exit code to a userspace address.
@@ -237,10 +207,7 @@ static int wait_for_helper(void *data)
237 sub_info->retval = ret; 207 sub_info->retval = ret;
238 } 208 }
239 209
240 if (sub_info->wait == UMH_NO_WAIT) 210 complete(sub_info->complete);
241 call_usermodehelper_freeinfo(sub_info);
242 else
243 complete(sub_info->complete);
244 return 0; 211 return 0;
245} 212}
246 213
@@ -249,15 +216,13 @@ static void __call_usermodehelper(struct work_struct *work)
249{ 216{
250 struct subprocess_info *sub_info = 217 struct subprocess_info *sub_info =
251 container_of(work, struct subprocess_info, work); 218 container_of(work, struct subprocess_info, work);
252 pid_t pid;
253 enum umh_wait wait = sub_info->wait; 219 enum umh_wait wait = sub_info->wait;
254 220 pid_t pid;
255 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
256 221
257 /* CLONE_VFORK: wait until the usermode helper has execve'd 222 /* CLONE_VFORK: wait until the usermode helper has execve'd
258 * successfully We need the data structures to stay around 223 * successfully We need the data structures to stay around
259 * until that is done. */ 224 * until that is done. */
260 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) 225 if (wait == UMH_WAIT_PROC)
261 pid = kernel_thread(wait_for_helper, sub_info, 226 pid = kernel_thread(wait_for_helper, sub_info,
262 CLONE_FS | CLONE_FILES | SIGCHLD); 227 CLONE_FS | CLONE_FILES | SIGCHLD);
263 else 228 else
@@ -266,15 +231,16 @@ static void __call_usermodehelper(struct work_struct *work)
266 231
267 switch (wait) { 232 switch (wait) {
268 case UMH_NO_WAIT: 233 case UMH_NO_WAIT:
234 call_usermodehelper_freeinfo(sub_info);
269 break; 235 break;
270 236
271 case UMH_WAIT_PROC: 237 case UMH_WAIT_PROC:
272 if (pid > 0) 238 if (pid > 0)
273 break; 239 break;
274 sub_info->retval = pid;
275 /* FALLTHROUGH */ 240 /* FALLTHROUGH */
276
277 case UMH_WAIT_EXEC: 241 case UMH_WAIT_EXEC:
242 if (pid < 0)
243 sub_info->retval = pid;
278 complete(sub_info->complete); 244 complete(sub_info->complete);
279 } 245 }
280} 246}
@@ -376,80 +342,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
376 sub_info->path = path; 342 sub_info->path = path;
377 sub_info->argv = argv; 343 sub_info->argv = argv;
378 sub_info->envp = envp; 344 sub_info->envp = envp;
379 sub_info->cred = prepare_usermodehelper_creds();
380 if (!sub_info->cred) {
381 kfree(sub_info);
382 return NULL;
383 }
384
385 out: 345 out:
386 return sub_info; 346 return sub_info;
387} 347}
388EXPORT_SYMBOL(call_usermodehelper_setup); 348EXPORT_SYMBOL(call_usermodehelper_setup);
389 349
390/** 350/**
391 * call_usermodehelper_setkeys - set the session keys for usermode helper 351 * call_usermodehelper_setfns - set a cleanup/init function
392 * @info: a subprocess_info returned by call_usermodehelper_setup
393 * @session_keyring: the session keyring for the process
394 */
395void call_usermodehelper_setkeys(struct subprocess_info *info,
396 struct key *session_keyring)
397{
398#ifdef CONFIG_KEYS
399 struct thread_group_cred *tgcred = info->cred->tgcred;
400 key_put(tgcred->session_keyring);
401 tgcred->session_keyring = key_get(session_keyring);
402#else
403 BUG();
404#endif
405}
406EXPORT_SYMBOL(call_usermodehelper_setkeys);
407
408/**
409 * call_usermodehelper_setcleanup - set a cleanup function
410 * @info: a subprocess_info returned by call_usermodehelper_setup 352 * @info: a subprocess_info returned by call_usermodehelper_setup
411 * @cleanup: a cleanup function 353 * @cleanup: a cleanup function
354 * @init: an init function
355 * @data: arbitrary context sensitive data
412 * 356 *
413 * The cleanup function is just befor ethe subprocess_info is about to 357 * The init function is used to customize the helper process prior to
358 * exec. A non-zero return code causes the process to error out, exit,
359 * and return the failure to the calling process
360 *
 361 * The cleanup function is called just before the subprocess_info is about to
414 * be freed. This can be used for freeing the argv and envp. The 362 * be freed. This can be used for freeing the argv and envp. The
415 * Function must be runnable in either a process context or the 363 * Function must be runnable in either a process context or the
416 * context in which call_usermodehelper_exec is called. 364 * context in which call_usermodehelper_exec is called.
417 */ 365 */
418void call_usermodehelper_setcleanup(struct subprocess_info *info, 366void call_usermodehelper_setfns(struct subprocess_info *info,
419 void (*cleanup)(char **argv, char **envp)) 367 int (*init)(struct subprocess_info *info),
368 void (*cleanup)(struct subprocess_info *info),
369 void *data)
420{ 370{
421 info->cleanup = cleanup; 371 info->cleanup = cleanup;
372 info->init = init;
373 info->data = data;
422} 374}
423EXPORT_SYMBOL(call_usermodehelper_setcleanup); 375EXPORT_SYMBOL(call_usermodehelper_setfns);
424
425/**
426 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
427 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
428 * @filp: set to the write-end of a pipe
429 *
430 * This constructs a pipe, and sets the read end to be the stdin of the
431 * subprocess, and returns the write-end in *@filp.
432 */
433int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
434 struct file **filp)
435{
436 struct file *f;
437
438 f = create_write_pipe(0);
439 if (IS_ERR(f))
440 return PTR_ERR(f);
441 *filp = f;
442
443 f = create_read_pipe(f, 0);
444 if (IS_ERR(f)) {
445 free_write_pipe(*filp);
446 return PTR_ERR(f);
447 }
448 sub_info->stdin = f;
449
450 return 0;
451}
452EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
453 376
454/** 377/**
455 * call_usermodehelper_exec - start a usermode application 378 * call_usermodehelper_exec - start a usermode application
@@ -469,9 +392,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
469 DECLARE_COMPLETION_ONSTACK(done); 392 DECLARE_COMPLETION_ONSTACK(done);
470 int retval = 0; 393 int retval = 0;
471 394
472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
474
475 helper_lock(); 395 helper_lock();
476 if (sub_info->path[0] == '\0') 396 if (sub_info->path[0] == '\0')
477 goto out; 397 goto out;
@@ -498,41 +418,6 @@ unlock:
498} 418}
499EXPORT_SYMBOL(call_usermodehelper_exec); 419EXPORT_SYMBOL(call_usermodehelper_exec);
500 420
501/**
502 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
503 * @path: path to usermode executable
504 * @argv: arg vector for process
505 * @envp: environment for process
506 * @filp: set to the write-end of a pipe
507 *
508 * This is a simple wrapper which executes a usermode-helper function
509 * with a pipe as stdin. It is implemented entirely in terms of
510 * lower-level call_usermodehelper_* functions.
511 */
512int call_usermodehelper_pipe(char *path, char **argv, char **envp,
513 struct file **filp)
514{
515 struct subprocess_info *sub_info;
516 int ret;
517
518 sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
519 if (sub_info == NULL)
520 return -ENOMEM;
521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) {
524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
527
528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
531
532 return ret;
533}
534EXPORT_SYMBOL(call_usermodehelper_pipe);
535
536void __init usermodehelper_init(void) 421void __init usermodehelper_init(void)
537{ 422{
538 khelper_wq = create_singlethread_workqueue("khelper"); 423 khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0ed46f3e51e9..282035f3ae96 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1588 arch_remove_kprobe(p); 1588 arch_remove_kprobe(p);
1589} 1589}
1590 1590
1591/* Disable one kprobe */
1592int __kprobes disable_kprobe(struct kprobe *kp)
1593{
1594 int ret = 0;
1595 struct kprobe *p;
1596
1597 mutex_lock(&kprobe_mutex);
1598
1599 /* Check whether specified probe is valid. */
1600 p = __get_valid_kprobe(kp);
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL;
1603 goto out;
1604 }
1605
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex);
1619 return ret;
1620}
1621EXPORT_SYMBOL_GPL(disable_kprobe);
1622
1623/* Enable one kprobe */
1624int __kprobes enable_kprobe(struct kprobe *kp)
1625{
1626 int ret = 0;
1627 struct kprobe *p;
1628
1629 mutex_lock(&kprobe_mutex);
1630
1631 /* Check whether specified probe is valid. */
1632 p = __get_valid_kprobe(kp);
1633 if (unlikely(p == NULL)) {
1634 ret = -EINVAL;
1635 goto out;
1636 }
1637
1638 if (kprobe_gone(kp)) {
1639 /* This kprobe has gone, we couldn't enable it. */
1640 ret = -EINVAL;
1641 goto out;
1642 }
1643
1644 if (p != kp)
1645 kp->flags &= ~KPROBE_FLAG_DISABLED;
1646
1647 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1648 p->flags &= ~KPROBE_FLAG_DISABLED;
1649 arm_kprobe(p);
1650 }
1651out:
1652 mutex_unlock(&kprobe_mutex);
1653 return ret;
1654}
1655EXPORT_SYMBOL_GPL(enable_kprobe);
1656
1591void __kprobes dump_kprobe(struct kprobe *kp) 1657void __kprobes dump_kprobe(struct kprobe *kp)
1592{ 1658{
1593 printk(KERN_WARNING "Dumping kprobe:\n"); 1659 printk(KERN_WARNING "Dumping kprobe:\n");
@@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
1805 .release = seq_release, 1871 .release = seq_release,
1806}; 1872};
1807 1873
1808/* Disable one kprobe */
1809int __kprobes disable_kprobe(struct kprobe *kp)
1810{
1811 int ret = 0;
1812 struct kprobe *p;
1813
1814 mutex_lock(&kprobe_mutex);
1815
1816 /* Check whether specified probe is valid. */
1817 p = __get_valid_kprobe(kp);
1818 if (unlikely(p == NULL)) {
1819 ret = -EINVAL;
1820 goto out;
1821 }
1822
1823 /* If the probe is already disabled (or gone), just return */
1824 if (kprobe_disabled(kp))
1825 goto out;
1826
1827 kp->flags |= KPROBE_FLAG_DISABLED;
1828 if (p != kp)
1829 /* When kp != p, p is always enabled. */
1830 try_to_disable_aggr_kprobe(p);
1831
1832 if (!kprobes_all_disarmed && kprobe_disabled(p))
1833 disarm_kprobe(p);
1834out:
1835 mutex_unlock(&kprobe_mutex);
1836 return ret;
1837}
1838EXPORT_SYMBOL_GPL(disable_kprobe);
1839
1840/* Enable one kprobe */
1841int __kprobes enable_kprobe(struct kprobe *kp)
1842{
1843 int ret = 0;
1844 struct kprobe *p;
1845
1846 mutex_lock(&kprobe_mutex);
1847
1848 /* Check whether specified probe is valid. */
1849 p = __get_valid_kprobe(kp);
1850 if (unlikely(p == NULL)) {
1851 ret = -EINVAL;
1852 goto out;
1853 }
1854
1855 if (kprobe_gone(kp)) {
1856 /* This kprobe has gone, we couldn't enable it. */
1857 ret = -EINVAL;
1858 goto out;
1859 }
1860
1861 if (p != kp)
1862 kp->flags &= ~KPROBE_FLAG_DISABLED;
1863
1864 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1865 p->flags &= ~KPROBE_FLAG_DISABLED;
1866 arm_kprobe(p);
1867 }
1868out:
1869 mutex_unlock(&kprobe_mutex);
1870 return ret;
1871}
1872EXPORT_SYMBOL_GPL(enable_kprobe);
1873
1874static void __kprobes arm_all_kprobes(void) 1874static void __kprobes arm_all_kprobes(void)
1875{ 1875{
1876 struct hlist_head *head; 1876 struct hlist_head *head;
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 21fe3c426948..0b624e791805 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak));
138extern const void __stop_notes __attribute__((weak)); 138extern const void __stop_notes __attribute__((weak));
139#define notes_size (&__stop_notes - &__start_notes) 139#define notes_size (&__stop_notes - &__start_notes)
140 140
141static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, 141static ssize_t notes_read(struct file *filp, struct kobject *kobj,
142 struct bin_attribute *bin_attr,
142 char *buf, loff_t off, size_t count) 143 char *buf, loff_t off, size_t count)
143{ 144{
144 memcpy(buf, &__start_notes + off, count); 145 memcpy(buf, &__start_notes + off, count);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c780175..2dc3786349d1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,8 @@
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
18#include <linux/freezer.h>
17#include <trace/events/sched.h> 19#include <trace/events/sched.h>
18 20
19static DEFINE_SPINLOCK(kthread_create_lock); 21static DEFINE_SPINLOCK(kthread_create_lock);
@@ -35,6 +37,7 @@ struct kthread_create_info
35 37
36struct kthread { 38struct kthread {
37 int should_stop; 39 int should_stop;
40 void *data;
38 struct completion exited; 41 struct completion exited;
39}; 42};
40 43
@@ -54,6 +57,19 @@ int kthread_should_stop(void)
54} 57}
55EXPORT_SYMBOL(kthread_should_stop); 58EXPORT_SYMBOL(kthread_should_stop);
56 59
60/**
61 * kthread_data - return data value specified on kthread creation
62 * @task: kthread task in question
63 *
64 * Return the data value specified when kthread @task was created.
65 * The caller is responsible for ensuring the validity of @task when
66 * calling this function.
67 */
68void *kthread_data(struct task_struct *task)
69{
70 return to_kthread(task)->data;
71}
72
57static int kthread(void *_create) 73static int kthread(void *_create)
58{ 74{
59 /* Copy data: it's on kthread's stack */ 75 /* Copy data: it's on kthread's stack */
@@ -64,6 +80,7 @@ static int kthread(void *_create)
64 int ret; 80 int ret;
65 81
66 self.should_stop = 0; 82 self.should_stop = 0;
83 self.data = data;
67 init_completion(&self.exited); 84 init_completion(&self.exited);
68 current->vfork_done = &self.exited; 85 current->vfork_done = &self.exited;
69 86
@@ -247,3 +264,150 @@ int kthreadd(void *unused)
247 264
248 return 0; 265 return 0;
249} 266}
267
268/**
269 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker
271 *
272 * This function can be used as @threadfn to kthread_create() or
273 * kthread_run() with @worker_ptr argument pointing to an initialized
274 * kthread_worker. The started kthread will process work_list until
275 * the it is stopped with kthread_stop(). A kthread can also call
276 * this function directly after extra initialization.
277 *
278 * Different kthreads can be used for the same kthread_worker as long
279 * as there's only one kthread attached to it at any given time. A
280 * kthread_worker without an attached kthread simply collects queued
281 * kthread_works.
282 */
283int kthread_worker_fn(void *worker_ptr)
284{
285 struct kthread_worker *worker = worker_ptr;
286 struct kthread_work *work;
287
288 WARN_ON(worker->task);
289 worker->task = current;
290repeat:
291 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
292
293 if (kthread_should_stop()) {
294 __set_current_state(TASK_RUNNING);
295 spin_lock_irq(&worker->lock);
296 worker->task = NULL;
297 spin_unlock_irq(&worker->lock);
298 return 0;
299 }
300
301 work = NULL;
302 spin_lock_irq(&worker->lock);
303 if (!list_empty(&worker->work_list)) {
304 work = list_first_entry(&worker->work_list,
305 struct kthread_work, node);
306 list_del_init(&work->node);
307 }
308 spin_unlock_irq(&worker->lock);
309
310 if (work) {
311 __set_current_state(TASK_RUNNING);
312 work->func(work);
313 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
314 work->done_seq = work->queue_seq;
315 smp_mb(); /* mb worker-b1 paired with flush-b0 */
316 if (atomic_read(&work->flushing))
317 wake_up_all(&work->done);
318 } else if (!freezing(current))
319 schedule();
320
321 try_to_freeze();
322 goto repeat;
323}
324EXPORT_SYMBOL_GPL(kthread_worker_fn);
325
326/**
327 * queue_kthread_work - queue a kthread_work
328 * @worker: target kthread_worker
329 * @work: kthread_work to queue
330 *
331 * Queue @work on worker @worker for async execution. @worker
332 * must have been initialized (see kthread_worker_fn()). Returns %true
333 * if @work was successfully queued, %false if it was already pending.
334 */
335bool queue_kthread_work(struct kthread_worker *worker,
336 struct kthread_work *work)
337{
338 bool ret = false;
339 unsigned long flags;
340
341 spin_lock_irqsave(&worker->lock, flags);
342 if (list_empty(&work->node)) {
343 list_add_tail(&work->node, &worker->work_list);
344 work->queue_seq++;
345 if (likely(worker->task))
346 wake_up_process(worker->task);
347 ret = true;
348 }
349 spin_unlock_irqrestore(&worker->lock, flags);
350 return ret;
351}
352EXPORT_SYMBOL_GPL(queue_kthread_work);
353
354/**
355 * flush_kthread_work - flush a kthread_work
356 * @work: work to flush
357 *
358 * If @work is queued or executing, wait for it to finish execution.
359 */
360void flush_kthread_work(struct kthread_work *work)
361{
362 int seq = work->queue_seq;
363
364 atomic_inc(&work->flushing);
365
366 /*
367 * mb flush-b0 paired with worker-b1, to make sure either
368 * worker sees the above increment or we see done_seq update.
369 */
370 smp_mb__after_atomic_inc();
371
372 /* A - B <= 0 tests whether B is in front of A regardless of overflow */
373 wait_event(work->done, seq - work->done_seq <= 0);
374 atomic_dec(&work->flushing);
375
376 /*
377 * rmb flush-b1 paired with worker-b0, to make sure our caller
378 * sees every change made by work->func().
379 */
380 smp_mb__after_atomic_dec();
381}
382EXPORT_SYMBOL_GPL(flush_kthread_work);
383
384struct kthread_flush_work {
385 struct kthread_work work;
386 struct completion done;
387};
388
389static void kthread_flush_work_fn(struct kthread_work *work)
390{
391 struct kthread_flush_work *fwork =
392 container_of(work, struct kthread_flush_work, work);
393 complete(&fwork->done);
394}
395
396/**
397 * flush_kthread_worker - flush all current works on a kthread_worker
398 * @worker: worker to flush
399 *
400 * Wait until all currently executing or pending works on @worker are
401 * finished.
402 */
403void flush_kthread_worker(struct kthread_worker *worker)
404{
405 struct kthread_flush_work fwork = {
406 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
407 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
408 };
409
410 queue_kthread_work(worker, &fwork.work);
411 wait_for_completion(&fwork.done);
412}
413EXPORT_SYMBOL_GPL(flush_kthread_worker);
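
Taken together, kthread_worker_fn(), queue_kthread_work() and flush_kthread_worker() form the small dedicated-kthread work queue added in this release. A usage sketch, assuming the companion DEFINE_KTHREAD_WORKER/init_kthread_work helpers from the matching <linux/kthread.h>; every my_* name is illustrative:

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static DEFINE_KTHREAD_WORKER(my_worker);
static struct kthread_work my_work;

static void my_work_fn(struct kthread_work *work)
{
        pr_info("my_work ran on the dedicated kthread\n");
}

static int my_setup(void)
{
        struct task_struct *task;

        init_kthread_work(&my_work, my_work_fn);

        /* Attach a kthread; it loops inside kthread_worker_fn() above. */
        task = kthread_run(kthread_worker_fn, &my_worker, "my_worker");
        if (IS_ERR(task))
                return PTR_ERR(task);

        queue_kthread_work(&my_worker, &my_work);

        /* Returns once everything queued so far has finished. */
        flush_kthread_worker(&my_worker);

        kthread_stop(task);     /* the worker detaches itself on stop */
        return 0;
}
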
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2594e1ce41cb..f2852a510232 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
146 146
147static inline u64 lockstat_clock(void) 147static inline u64 lockstat_clock(void)
148{ 148{
149 return cpu_clock(smp_processor_id()); 149 return local_clock();
150} 150}
151 151
152static int lock_point(unsigned long points[], unsigned long ip) 152static int lock_point(unsigned long points[], unsigned long ip)
@@ -431,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
431/* 431/*
432 * Various lockdep statistics: 432 * Various lockdep statistics:
433 */ 433 */
434atomic_t chain_lookup_hits; 434DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
435atomic_t chain_lookup_misses;
436atomic_t hardirqs_on_events;
437atomic_t hardirqs_off_events;
438atomic_t redundant_hardirqs_on;
439atomic_t redundant_hardirqs_off;
440atomic_t softirqs_on_events;
441atomic_t softirqs_off_events;
442atomic_t redundant_softirqs_on;
443atomic_t redundant_softirqs_off;
444atomic_t nr_unused_locks;
445atomic_t nr_cyclic_checks;
446atomic_t nr_find_usage_forwards_checks;
447atomic_t nr_find_usage_backwards_checks;
448#endif 435#endif
449 436
450/* 437/*
@@ -748,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
748 return NULL; 735 return NULL;
749 } 736 }
750 class = lock_classes + nr_lock_classes++; 737 class = lock_classes + nr_lock_classes++;
751 debug_atomic_inc(&nr_unused_locks); 738 debug_atomic_inc(nr_unused_locks);
752 class->key = key; 739 class->key = key;
753 class->name = lock->name; 740 class->name = lock->name;
754 class->subclass = subclass; 741 class->subclass = subclass;
@@ -818,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
818 * Add a new dependency to the head of the list: 805 * Add a new dependency to the head of the list:
819 */ 806 */
820static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 807static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
821 struct list_head *head, unsigned long ip, int distance) 808 struct list_head *head, unsigned long ip,
809 int distance, struct stack_trace *trace)
822{ 810{
823 struct lock_list *entry; 811 struct lock_list *entry;
824 /* 812 /*
@@ -829,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
829 if (!entry) 817 if (!entry)
830 return 0; 818 return 0;
831 819
832 if (!save_trace(&entry->trace))
833 return 0;
834
835 entry->class = this; 820 entry->class = this;
836 entry->distance = distance; 821 entry->distance = distance;
822 entry->trace = *trace;
837 /* 823 /*
838 * Since we never remove from the dependency list, the list can 824 * Since we never remove from the dependency list, the list can
839 * be walked lockless by other CPUs, it's only allocation 825 * be walked lockless by other CPUs, it's only allocation
@@ -1205,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1205{ 1191{
1206 int result; 1192 int result;
1207 1193
1208 debug_atomic_inc(&nr_cyclic_checks); 1194 debug_atomic_inc(nr_cyclic_checks);
1209 1195
1210 result = __bfs_forwards(root, target, class_equal, target_entry); 1196 result = __bfs_forwards(root, target, class_equal, target_entry);
1211 1197
@@ -1242,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1242{ 1228{
1243 int result; 1229 int result;
1244 1230
1245 debug_atomic_inc(&nr_find_usage_forwards_checks); 1231 debug_atomic_inc(nr_find_usage_forwards_checks);
1246 1232
1247 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1233 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1248 1234
@@ -1265,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1265{ 1251{
1266 int result; 1252 int result;
1267 1253
1268 debug_atomic_inc(&nr_find_usage_backwards_checks); 1254 debug_atomic_inc(nr_find_usage_backwards_checks);
1269 1255
1270 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1256 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1271 1257
@@ -1635,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1635 */ 1621 */
1636static int 1622static int
1637check_prev_add(struct task_struct *curr, struct held_lock *prev, 1623check_prev_add(struct task_struct *curr, struct held_lock *prev,
1638 struct held_lock *next, int distance) 1624 struct held_lock *next, int distance, int trylock_loop)
1639{ 1625{
1640 struct lock_list *entry; 1626 struct lock_list *entry;
1641 int ret; 1627 int ret;
1642 struct lock_list this; 1628 struct lock_list this;
1643 struct lock_list *uninitialized_var(target_entry); 1629 struct lock_list *uninitialized_var(target_entry);
1630 /*
1631 * Static variable, serialized by the graph_lock().
1632 *
1633 * We use this static variable to save the stack trace in case
1634 * we call into this function multiple times due to encountering
1635 * trylocks in the held lock stack.
1636 */
1637 static struct stack_trace trace;
1644 1638
1645 /* 1639 /*
1646 * Prove that the new <prev> -> <next> dependency would not 1640 * Prove that the new <prev> -> <next> dependency would not
@@ -1688,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1688 } 1682 }
1689 } 1683 }
1690 1684
1685 if (!trylock_loop && !save_trace(&trace))
1686 return 0;
1687
1691 /* 1688 /*
1692 * Ok, all validations passed, add the new lock 1689 * Ok, all validations passed, add the new lock
1693 * to the previous lock's dependency list: 1690 * to the previous lock's dependency list:
1694 */ 1691 */
1695 ret = add_lock_to_list(hlock_class(prev), hlock_class(next), 1692 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1696 &hlock_class(prev)->locks_after, 1693 &hlock_class(prev)->locks_after,
1697 next->acquire_ip, distance); 1694 next->acquire_ip, distance, &trace);
1698 1695
1699 if (!ret) 1696 if (!ret)
1700 return 0; 1697 return 0;
1701 1698
1702 ret = add_lock_to_list(hlock_class(next), hlock_class(prev), 1699 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1703 &hlock_class(next)->locks_before, 1700 &hlock_class(next)->locks_before,
1704 next->acquire_ip, distance); 1701 next->acquire_ip, distance, &trace);
1705 if (!ret) 1702 if (!ret)
1706 return 0; 1703 return 0;
1707 1704
@@ -1731,6 +1728,7 @@ static int
1731check_prevs_add(struct task_struct *curr, struct held_lock *next) 1728check_prevs_add(struct task_struct *curr, struct held_lock *next)
1732{ 1729{
1733 int depth = curr->lockdep_depth; 1730 int depth = curr->lockdep_depth;
1731 int trylock_loop = 0;
1734 struct held_lock *hlock; 1732 struct held_lock *hlock;
1735 1733
1736 /* 1734 /*
@@ -1756,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1756 * added: 1754 * added:
1757 */ 1755 */
1758 if (hlock->read != 2) { 1756 if (hlock->read != 2) {
1759 if (!check_prev_add(curr, hlock, next, distance)) 1757 if (!check_prev_add(curr, hlock, next,
1758 distance, trylock_loop))
1760 return 0; 1759 return 0;
1761 /* 1760 /*
1762 * Stop after the first non-trylock entry, 1761 * Stop after the first non-trylock entry,
@@ -1779,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1779 if (curr->held_locks[depth].irq_context != 1778 if (curr->held_locks[depth].irq_context !=
1780 curr->held_locks[depth-1].irq_context) 1779 curr->held_locks[depth-1].irq_context)
1781 break; 1780 break;
1781 trylock_loop = 1;
1782 } 1782 }
1783 return 1; 1783 return 1;
1784out_bug: 1784out_bug:
@@ -1825,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1825 list_for_each_entry(chain, hash_head, entry) { 1825 list_for_each_entry(chain, hash_head, entry) {
1826 if (chain->chain_key == chain_key) { 1826 if (chain->chain_key == chain_key) {
1827cache_hit: 1827cache_hit:
1828 debug_atomic_inc(&chain_lookup_hits); 1828 debug_atomic_inc(chain_lookup_hits);
1829 if (very_verbose(class)) 1829 if (very_verbose(class))
1830 printk("\nhash chain already cached, key: " 1830 printk("\nhash chain already cached, key: "
1831 "%016Lx tail class: [%p] %s\n", 1831 "%016Lx tail class: [%p] %s\n",
@@ -1890,7 +1890,7 @@ cache_hit:
1890 chain_hlocks[chain->base + j] = class - lock_classes; 1890 chain_hlocks[chain->base + j] = class - lock_classes;
1891 } 1891 }
1892 list_add_tail_rcu(&chain->entry, hash_head); 1892 list_add_tail_rcu(&chain->entry, hash_head);
1893 debug_atomic_inc(&chain_lookup_misses); 1893 debug_atomic_inc(chain_lookup_misses);
1894 inc_chains(); 1894 inc_chains();
1895 1895
1896 return 1; 1896 return 1;
@@ -2311,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2311 return; 2311 return;
2312 2312
2313 if (unlikely(curr->hardirqs_enabled)) { 2313 if (unlikely(curr->hardirqs_enabled)) {
2314 debug_atomic_inc(&redundant_hardirqs_on); 2314 /*
2315 * Neither irq nor preemption are disabled here
2316 * so this is racy by nature but losing one hit
2317 * in a stat is not a big deal.
2318 */
2319 __debug_atomic_inc(redundant_hardirqs_on);
2315 return; 2320 return;
2316 } 2321 }
2317 /* we'll do an OFF -> ON transition: */ 2322 /* we'll do an OFF -> ON transition: */
@@ -2338,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2338 2343
2339 curr->hardirq_enable_ip = ip; 2344 curr->hardirq_enable_ip = ip;
2340 curr->hardirq_enable_event = ++curr->irq_events; 2345 curr->hardirq_enable_event = ++curr->irq_events;
2341 debug_atomic_inc(&hardirqs_on_events); 2346 debug_atomic_inc(hardirqs_on_events);
2342} 2347}
2343EXPORT_SYMBOL(trace_hardirqs_on_caller); 2348EXPORT_SYMBOL(trace_hardirqs_on_caller);
2344 2349
@@ -2370,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
2370 curr->hardirqs_enabled = 0; 2375 curr->hardirqs_enabled = 0;
2371 curr->hardirq_disable_ip = ip; 2376 curr->hardirq_disable_ip = ip;
2372 curr->hardirq_disable_event = ++curr->irq_events; 2377 curr->hardirq_disable_event = ++curr->irq_events;
2373 debug_atomic_inc(&hardirqs_off_events); 2378 debug_atomic_inc(hardirqs_off_events);
2374 } else 2379 } else
2375 debug_atomic_inc(&redundant_hardirqs_off); 2380 debug_atomic_inc(redundant_hardirqs_off);
2376} 2381}
2377EXPORT_SYMBOL(trace_hardirqs_off_caller); 2382EXPORT_SYMBOL(trace_hardirqs_off_caller);
2378 2383
@@ -2396,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
2396 return; 2401 return;
2397 2402
2398 if (curr->softirqs_enabled) { 2403 if (curr->softirqs_enabled) {
2399 debug_atomic_inc(&redundant_softirqs_on); 2404 debug_atomic_inc(redundant_softirqs_on);
2400 return; 2405 return;
2401 } 2406 }
2402 2407
@@ -2406,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
2406 curr->softirqs_enabled = 1; 2411 curr->softirqs_enabled = 1;
2407 curr->softirq_enable_ip = ip; 2412 curr->softirq_enable_ip = ip;
2408 curr->softirq_enable_event = ++curr->irq_events; 2413 curr->softirq_enable_event = ++curr->irq_events;
2409 debug_atomic_inc(&softirqs_on_events); 2414 debug_atomic_inc(softirqs_on_events);
2410 /* 2415 /*
2411 * We are going to turn softirqs on, so set the 2416 * We are going to turn softirqs on, so set the
2412 * usage bit for all held locks, if hardirqs are 2417 * usage bit for all held locks, if hardirqs are
@@ -2436,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
2436 curr->softirqs_enabled = 0; 2441 curr->softirqs_enabled = 0;
2437 curr->softirq_disable_ip = ip; 2442 curr->softirq_disable_ip = ip;
2438 curr->softirq_disable_event = ++curr->irq_events; 2443 curr->softirq_disable_event = ++curr->irq_events;
2439 debug_atomic_inc(&softirqs_off_events); 2444 debug_atomic_inc(softirqs_off_events);
2440 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2445 DEBUG_LOCKS_WARN_ON(!softirq_count());
2441 } else 2446 } else
2442 debug_atomic_inc(&redundant_softirqs_off); 2447 debug_atomic_inc(redundant_softirqs_off);
2443} 2448}
2444 2449
2445static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) 2450static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2644,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2644 return 0; 2649 return 0;
2645 break; 2650 break;
2646 case LOCK_USED: 2651 case LOCK_USED:
2647 debug_atomic_dec(&nr_unused_locks); 2652 debug_atomic_dec(nr_unused_locks);
2648 break; 2653 break;
2649 default: 2654 default:
2650 if (!debug_locks_off_graph_unlock()) 2655 if (!debug_locks_off_graph_unlock())
@@ -2706,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2706} 2711}
2707EXPORT_SYMBOL_GPL(lockdep_init_map); 2712EXPORT_SYMBOL_GPL(lockdep_init_map);
2708 2713
2714struct lock_class_key __lockdep_no_validate__;
2715
2709/* 2716/*
2710 * This gets called for every mutex_lock*()/spin_lock*() operation. 2717 * This gets called for every mutex_lock*()/spin_lock*() operation.
2711 * We maintain the dependency maps and validate the locking attempt: 2718 * We maintain the dependency maps and validate the locking attempt:
@@ -2740,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2740 return 0; 2747 return 0;
2741 } 2748 }
2742 2749
2750 if (lock->key == &__lockdep_no_validate__)
2751 check = 1;
2752
2743 if (!subclass) 2753 if (!subclass)
2744 class = lock->class_cache; 2754 class = lock->class_cache;
2745 /* 2755 /*
@@ -2750,7 +2760,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2750 if (!class) 2760 if (!class)
2751 return 0; 2761 return 0;
2752 } 2762 }
2753 debug_atomic_inc((atomic_t *)&class->ops); 2763 atomic_inc((atomic_t *)&class->ops);
2754 if (very_verbose(class)) { 2764 if (very_verbose(class)) {
2755 printk("\nacquire class [%p] %s", class->key, class->name); 2765 printk("\nacquire class [%p] %s", class->key, class->name);
2756 if (class->name_version > 1) 2766 if (class->name_version > 1)
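
The new __lockdep_no_validate__ key above makes __lock_acquire() drop to basic checking (check = 1) for any lock bound to that class, which is how per-device locks with driver-defined ordering can opt out of full dependency validation. A hedged sketch using lockdep_set_novalidate_class(), the helper added alongside this key in <linux/lockdep.h>; the mutex itself is hypothetical:

#include <linux/lockdep.h>
#include <linux/mutex.h>

/* Hypothetical per-device lock whose ordering lockdep should not validate. */
static DEFINE_MUTEX(my_dev_mutex);

static void my_dev_mutex_init(void)
{
        /*
         * Re-key the mutex to the no-validate class: lockdep still sees
         * the acquisitions but skips full dependency checking for them.
         */
        lockdep_set_novalidate_class(&my_dev_mutex);
}
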
@@ -3227,7 +3237,7 @@ void lock_release(struct lockdep_map *lock, int nested,
3227 raw_local_irq_save(flags); 3237 raw_local_irq_save(flags);
3228 check_flags(flags); 3238 check_flags(flags);
3229 current->lockdep_recursion = 1; 3239 current->lockdep_recursion = 1;
3230 trace_lock_release(lock, nested, ip); 3240 trace_lock_release(lock, ip);
3231 __lock_release(lock, nested, ip); 3241 __lock_release(lock, nested, ip);
3232 current->lockdep_recursion = 0; 3242 current->lockdep_recursion = 0;
3233 raw_local_irq_restore(flags); 3243 raw_local_irq_restore(flags);
@@ -3380,7 +3390,7 @@ found_it:
3380 hlock->holdtime_stamp = now; 3390 hlock->holdtime_stamp = now;
3381 } 3391 }
3382 3392
3383 trace_lock_acquired(lock, ip, waittime); 3393 trace_lock_acquired(lock, ip);
3384 3394
3385 stats = get_lock_stats(hlock_class(hlock)); 3395 stats = get_lock_stats(hlock_class(hlock));
3386 if (waittime) { 3396 if (waittime) {
@@ -3801,8 +3811,11 @@ void lockdep_rcu_dereference(const char *file, const int line)
3801{ 3811{
3802 struct task_struct *curr = current; 3812 struct task_struct *curr = current;
3803 3813
3814#ifndef CONFIG_PROVE_RCU_REPEATEDLY
3804 if (!debug_locks_off()) 3815 if (!debug_locks_off())
3805 return; 3816 return;
3817#endif /* #ifndef CONFIG_PROVE_RCU_REPEATEDLY */
3818 /* Note: the following can be executed concurrently, so be careful. */
3806 printk("\n===================================================\n"); 3819 printk("\n===================================================\n");
3807 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3808 printk( "---------------------------------------------------\n"); 3821 printk( "---------------------------------------------------\n");
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad1313..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
110#endif 110#endif
111 111
112#ifdef CONFIG_DEBUG_LOCKDEP 112#ifdef CONFIG_DEBUG_LOCKDEP
113
114#include <asm/local.h>
113/* 115/*
114 * Various lockdep statistics: 116 * Various lockdep statistics.
117 * We want them per cpu as they are often accessed in fast path
118 * and we want to avoid too much cache bouncing.
115 */ 119 */
116extern atomic_t chain_lookup_hits; 120struct lockdep_stats {
117extern atomic_t chain_lookup_misses; 121 int chain_lookup_hits;
118extern atomic_t hardirqs_on_events; 122 int chain_lookup_misses;
119extern atomic_t hardirqs_off_events; 123 int hardirqs_on_events;
120extern atomic_t redundant_hardirqs_on; 124 int hardirqs_off_events;
121extern atomic_t redundant_hardirqs_off; 125 int redundant_hardirqs_on;
122extern atomic_t softirqs_on_events; 126 int redundant_hardirqs_off;
123extern atomic_t softirqs_off_events; 127 int softirqs_on_events;
124extern atomic_t redundant_softirqs_on; 128 int softirqs_off_events;
125extern atomic_t redundant_softirqs_off; 129 int redundant_softirqs_on;
126extern atomic_t nr_unused_locks; 130 int redundant_softirqs_off;
127extern atomic_t nr_cyclic_checks; 131 int nr_unused_locks;
128extern atomic_t nr_cyclic_check_recursions; 132 int nr_cyclic_checks;
129extern atomic_t nr_find_usage_forwards_checks; 133 int nr_cyclic_check_recursions;
130extern atomic_t nr_find_usage_forwards_recursions; 134 int nr_find_usage_forwards_checks;
131extern atomic_t nr_find_usage_backwards_checks; 135 int nr_find_usage_forwards_recursions;
132extern atomic_t nr_find_usage_backwards_recursions; 136 int nr_find_usage_backwards_checks;
133# define debug_atomic_inc(ptr) atomic_inc(ptr) 137 int nr_find_usage_backwards_recursions;
134# define debug_atomic_dec(ptr) atomic_dec(ptr) 138};
135# define debug_atomic_read(ptr) atomic_read(ptr) 139
140DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
141
142#define __debug_atomic_inc(ptr) \
143 this_cpu_inc(lockdep_stats.ptr);
144
145#define debug_atomic_inc(ptr) { \
146 WARN_ON_ONCE(!irqs_disabled()); \
147 __this_cpu_inc(lockdep_stats.ptr); \
148}
149
150#define debug_atomic_dec(ptr) { \
151 WARN_ON_ONCE(!irqs_disabled()); \
152 __this_cpu_dec(lockdep_stats.ptr); \
153}
154
155#define debug_atomic_read(ptr) ({ \
156 struct lockdep_stats *__cpu_lockdep_stats; \
157 unsigned long long __total = 0; \
158 int __cpu; \
159 for_each_possible_cpu(__cpu) { \
160 __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
161 __total += __cpu_lockdep_stats->ptr; \
162 } \
163 __total; \
164})
136#else 165#else
166# define __debug_atomic_inc(ptr) do { } while (0)
137# define debug_atomic_inc(ptr) do { } while (0) 167# define debug_atomic_inc(ptr) do { } while (0)
138# define debug_atomic_dec(ptr) do { } while (0) 168# define debug_atomic_dec(ptr) do { } while (0)
139# define debug_atomic_read(ptr) 0 169# define debug_atomic_read(ptr) 0
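
The rework above trades the global atomic_t counters for a per-cpu struct lockdep_stats: the fast path bumps only the local CPU's copy, and readers fold all copies together. The same pattern in isolation, as a hedged sketch with illustrative my_* names:

#include <linux/percpu.h>

struct my_stats {
        int events;
};

static DEFINE_PER_CPU(struct my_stats, my_stats);

/* Fast path: bump the local CPU's counter (caller has IRQs disabled). */
static inline void my_stats_inc(void)
{
        __this_cpu_inc(my_stats.events);
}

/* Slow path: fold all per-cpu counters into one total for reporting. */
static unsigned long long my_stats_read(void)
{
        unsigned long long total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu(my_stats, cpu).events;
        return total;
}
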
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584c..59b76c8ce9d7 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
184static void lockdep_stats_debug_show(struct seq_file *m) 184static void lockdep_stats_debug_show(struct seq_file *m)
185{ 185{
186#ifdef CONFIG_DEBUG_LOCKDEP 186#ifdef CONFIG_DEBUG_LOCKDEP
187 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), 187 unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
188 hi2 = debug_atomic_read(&hardirqs_off_events), 188 hi2 = debug_atomic_read(hardirqs_off_events),
189 hr1 = debug_atomic_read(&redundant_hardirqs_on), 189 hr1 = debug_atomic_read(redundant_hardirqs_on),
190 hr2 = debug_atomic_read(&redundant_hardirqs_off), 190 hr2 = debug_atomic_read(redundant_hardirqs_off),
191 si1 = debug_atomic_read(&softirqs_on_events), 191 si1 = debug_atomic_read(softirqs_on_events),
192 si2 = debug_atomic_read(&softirqs_off_events), 192 si2 = debug_atomic_read(softirqs_off_events),
193 sr1 = debug_atomic_read(&redundant_softirqs_on), 193 sr1 = debug_atomic_read(redundant_softirqs_on),
194 sr2 = debug_atomic_read(&redundant_softirqs_off); 194 sr2 = debug_atomic_read(redundant_softirqs_off);
195 195
196 seq_printf(m, " chain lookup misses: %11u\n", 196 seq_printf(m, " chain lookup misses: %11llu\n",
197 debug_atomic_read(&chain_lookup_misses)); 197 debug_atomic_read(chain_lookup_misses));
198 seq_printf(m, " chain lookup hits: %11u\n", 198 seq_printf(m, " chain lookup hits: %11llu\n",
199 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(chain_lookup_hits));
200 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11llu\n",
201 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(nr_cyclic_checks));
202 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11llu\n",
203 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(nr_find_usage_forwards_checks));
204 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11llu\n",
205 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(nr_find_usage_backwards_checks));
206 206
207 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11llu\n", hi1);
208 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11llu\n", hi2);
209 seq_printf(m, " redundant hardirq ons: %11u\n", hr1); 209 seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
210 seq_printf(m, " redundant hardirq offs: %11u\n", hr2); 210 seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
211 seq_printf(m, " softirq on events: %11u\n", si1); 211 seq_printf(m, " softirq on events: %11llu\n", si1);
212 seq_printf(m, " softirq off events: %11u\n", si2); 212 seq_printf(m, " softirq off events: %11llu\n", si2);
213 seq_printf(m, " redundant softirq ons: %11u\n", sr1); 213 seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
214 seq_printf(m, " redundant softirq offs: %11u\n", sr2); 214 seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
215#endif 215#endif
216} 216}
217 217
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
263#endif 263#endif
264 } 264 }
265#ifdef CONFIG_DEBUG_LOCKDEP 265#ifdef CONFIG_DEBUG_LOCKDEP
266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
267#endif 267#endif
268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
269 nr_lock_classes, MAX_LOCKDEP_KEYS); 269 nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/module.c b/kernel/module.c
index 1016b75b026a..ccd641991842 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,6 +1,6 @@
1/* 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU General Public License as published by
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -74,11 +72,19 @@ EXPORT_TRACEPOINT_SYMBOL(module_get);
74/* If this is set, the section belongs in the init part of the module */ 72/* If this is set, the section belongs in the init part of the module */
75#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 73#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
76 74
77/* List of modules, protected by module_mutex or preempt_disable 75/*
76 * Mutex protects:
77 * 1) List of modules (also safely readable with preempt_disable),
78 * 2) module_use links,
79 * 3) module_addr_min/module_addr_max.
78 * (delete uses stop_machine/add uses RCU list operations). */ 80 * (delete uses stop_machine/add uses RCU list operations). */
79DEFINE_MUTEX(module_mutex); 81DEFINE_MUTEX(module_mutex);
80EXPORT_SYMBOL_GPL(module_mutex); 82EXPORT_SYMBOL_GPL(module_mutex);
81static LIST_HEAD(modules); 83static LIST_HEAD(modules);
84#ifdef CONFIG_KGDB_KDB
85struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
86#endif /* CONFIG_KGDB_KDB */
87
82 88
83/* Block module loading/unloading? */ 89/* Block module loading/unloading? */
84int modules_disabled = 0; 90int modules_disabled = 0;
@@ -88,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
88 94
89static BLOCKING_NOTIFIER_HEAD(module_notify_list); 95static BLOCKING_NOTIFIER_HEAD(module_notify_list);
90 96
91/* Bounds of module allocation, for speeding __module_address */ 97/* Bounds of module allocation, for speeding __module_address.
98 * Protected by module_mutex. */
92static unsigned long module_addr_min = -1UL, module_addr_max = 0; 99static unsigned long module_addr_min = -1UL, module_addr_max = 0;
93 100
94int register_module_notifier(struct notifier_block * nb) 101int register_module_notifier(struct notifier_block * nb)
@@ -103,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb)
103} 110}
104EXPORT_SYMBOL(unregister_module_notifier); 111EXPORT_SYMBOL(unregister_module_notifier);
105 112
113struct load_info {
114 Elf_Ehdr *hdr;
115 unsigned long len;
116 Elf_Shdr *sechdrs;
117 char *secstrings, *strtab;
118 unsigned long *strmap;
119 unsigned long symoffs, stroffs;
120 struct _ddebug *debug;
121 unsigned int num_debug;
122 struct {
123 unsigned int sym, str, mod, vers, info, pcpu;
124 } index;
125};
126
106/* We require a truly strong try_module_get(): 0 means failure due to 127/* We require a truly strong try_module_get(): 0 means failure due to
107 ongoing or failed initialization etc. */ 128 ongoing or failed initialization etc. */
108static inline int strong_try_module_get(struct module *mod) 129static inline int strong_try_module_get(struct module *mod)
@@ -133,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code)
133EXPORT_SYMBOL(__module_put_and_exit); 154EXPORT_SYMBOL(__module_put_and_exit);
134 155
135/* Find a module section: 0 means not found. */ 156/* Find a module section: 0 means not found. */
136static unsigned int find_sec(Elf_Ehdr *hdr, 157static unsigned int find_sec(const struct load_info *info, const char *name)
137 Elf_Shdr *sechdrs,
138 const char *secstrings,
139 const char *name)
140{ 158{
141 unsigned int i; 159 unsigned int i;
142 160
143 for (i = 1; i < hdr->e_shnum; i++) 161 for (i = 1; i < info->hdr->e_shnum; i++) {
162 Elf_Shdr *shdr = &info->sechdrs[i];
144 /* Alloc bit cleared means "ignore it." */ 163 /* Alloc bit cleared means "ignore it." */
145 if ((sechdrs[i].sh_flags & SHF_ALLOC) 164 if ((shdr->sh_flags & SHF_ALLOC)
146 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) 165 && strcmp(info->secstrings + shdr->sh_name, name) == 0)
147 return i; 166 return i;
167 }
148 return 0; 168 return 0;
149} 169}
150 170
151/* Find a module section, or NULL. */ 171/* Find a module section, or NULL. */
152static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, 172static void *section_addr(const struct load_info *info, const char *name)
153 const char *secstrings, const char *name)
154{ 173{
155 /* Section 0 has sh_addr 0. */ 174 /* Section 0 has sh_addr 0. */
156 return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; 175 return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
157} 176}
158 177
159/* Find a module section, or NULL. Fill in number of "objects" in section. */ 178/* Find a module section, or NULL. Fill in number of "objects" in section. */
160static void *section_objs(Elf_Ehdr *hdr, 179static void *section_objs(const struct load_info *info,
161 Elf_Shdr *sechdrs,
162 const char *secstrings,
163 const char *name, 180 const char *name,
164 size_t object_size, 181 size_t object_size,
165 unsigned int *num) 182 unsigned int *num)
166{ 183{
167 unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); 184 unsigned int sec = find_sec(info, name);
168 185
169 /* Section 0 has sh_addr 0 and sh_size 0. */ 186 /* Section 0 has sh_addr 0 and sh_size 0. */
170 *num = sechdrs[sec].sh_size / object_size; 187 *num = info->sechdrs[sec].sh_size / object_size;
171 return (void *)sechdrs[sec].sh_addr; 188 return (void *)info->sechdrs[sec].sh_addr;
172} 189}
173 190
174/* Provided by the linker */ 191/* Provided by the linker */
@@ -178,8 +195,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
178extern const struct kernel_symbol __stop___ksymtab_gpl[]; 195extern const struct kernel_symbol __stop___ksymtab_gpl[];
179extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 196extern const struct kernel_symbol __start___ksymtab_gpl_future[];
180extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 197extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
181extern const struct kernel_symbol __start___ksymtab_gpl_future[];
182extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
183extern const unsigned long __start___kcrctab[]; 198extern const unsigned long __start___kcrctab[];
184extern const unsigned long __start___kcrctab_gpl[]; 199extern const unsigned long __start___kcrctab_gpl[];
185extern const unsigned long __start___kcrctab_gpl_future[]; 200extern const unsigned long __start___kcrctab_gpl_future[];
@@ -222,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
222 unsigned int symnum, void *data), void *data) 237 unsigned int symnum, void *data), void *data)
223{ 238{
224 struct module *mod; 239 struct module *mod;
225 const struct symsearch arr[] = { 240 static const struct symsearch arr[] = {
226 { __start___ksymtab, __stop___ksymtab, __start___kcrctab, 241 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
227 NOT_GPL_ONLY, false }, 242 NOT_GPL_ONLY, false },
228 { __start___ksymtab_gpl, __stop___ksymtab_gpl, 243 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
@@ -329,7 +344,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
329} 344}
330 345
331/* Find a symbol and return it, along with (optional) crc and 346/* Find a symbol and return it, along with (optional) crc and
332 * (optional) module which owns it */ 347 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
333const struct kernel_symbol *find_symbol(const char *name, 348const struct kernel_symbol *find_symbol(const char *name,
334 struct module **owner, 349 struct module **owner,
335 const unsigned long **crc, 350 const unsigned long **crc,
@@ -387,7 +402,8 @@ static int percpu_modalloc(struct module *mod,
387 mod->percpu = __alloc_reserved_percpu(size, align); 402 mod->percpu = __alloc_reserved_percpu(size, align);
388 if (!mod->percpu) { 403 if (!mod->percpu) {
389 printk(KERN_WARNING 404 printk(KERN_WARNING
390 "Could not allocate %lu bytes percpu data\n", size); 405 "%s: Could not allocate %lu bytes percpu data\n",
406 mod->name, size);
391 return -ENOMEM; 407 return -ENOMEM;
392 } 408 }
393 mod->percpu_size = size; 409 mod->percpu_size = size;
@@ -399,11 +415,9 @@ static void percpu_modfree(struct module *mod)
399 free_percpu(mod->percpu); 415 free_percpu(mod->percpu);
400} 416}
401 417
402static unsigned int find_pcpusec(Elf_Ehdr *hdr, 418static unsigned int find_pcpusec(struct load_info *info)
403 Elf_Shdr *sechdrs,
404 const char *secstrings)
405{ 419{
406 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 420 return find_sec(info, ".data..percpu");
407} 421}
408 422
409static void percpu_modcopy(struct module *mod, 423static void percpu_modcopy(struct module *mod,
@@ -463,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod,
463static inline void percpu_modfree(struct module *mod) 477static inline void percpu_modfree(struct module *mod)
464{ 478{
465} 479}
466static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 480static unsigned int find_pcpusec(struct load_info *info)
467 Elf_Shdr *sechdrs,
468 const char *secstrings)
469{ 481{
470 return 0; 482 return 0;
471} 483}
@@ -515,37 +527,34 @@ MODINFO_ATTR(srcversion);
515static char last_unloaded_module[MODULE_NAME_LEN+1]; 527static char last_unloaded_module[MODULE_NAME_LEN+1];
516 528
517#ifdef CONFIG_MODULE_UNLOAD 529#ifdef CONFIG_MODULE_UNLOAD
530
531EXPORT_TRACEPOINT_SYMBOL(module_get);
532
518/* Init the unload section of the module. */ 533/* Init the unload section of the module. */
519static void module_unload_init(struct module *mod) 534static int module_unload_init(struct module *mod)
520{ 535{
521 int cpu; 536 mod->refptr = alloc_percpu(struct module_ref);
537 if (!mod->refptr)
538 return -ENOMEM;
522 539
523 INIT_LIST_HEAD(&mod->modules_which_use_me); 540 INIT_LIST_HEAD(&mod->source_list);
524 for_each_possible_cpu(cpu) { 541 INIT_LIST_HEAD(&mod->target_list);
525 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
526 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
527 }
528 542
529 /* Hold reference count during initialization. */ 543 /* Hold reference count during initialization. */
530 __this_cpu_write(mod->refptr->incs, 1); 544 __this_cpu_write(mod->refptr->incs, 1);
531 /* Backwards compatibility macros put refcount during init. */ 545 /* Backwards compatibility macros put refcount during init. */
532 mod->waiter = current; 546 mod->waiter = current;
533}
534 547
535/* modules using other modules */ 548 return 0;
536struct module_use 549}
537{
538 struct list_head list;
539 struct module *module_which_uses;
540};
541 550
542/* Does a already use b? */ 551/* Does a already use b? */
543static int already_uses(struct module *a, struct module *b) 552static int already_uses(struct module *a, struct module *b)
544{ 553{
545 struct module_use *use; 554 struct module_use *use;
546 555
547 list_for_each_entry(use, &b->modules_which_use_me, list) { 556 list_for_each_entry(use, &b->source_list, source_list) {
548 if (use->module_which_uses == a) { 557 if (use->source == a) {
549 DEBUGP("%s uses %s!\n", a->name, b->name); 558 DEBUGP("%s uses %s!\n", a->name, b->name);
550 return 1; 559 return 1;
551 } 560 }
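
module_unload_init() now lets alloc_percpu() allocate and zero the per-cpu reference counters instead of clearing every CPU's slot by hand. The allocation and single-CPU write pattern in isolation, as a hedged sketch with illustrative names:

#include <linux/errno.h>
#include <linux/percpu.h>

/* Hypothetical per-cpu counter pair, shaped like struct module_ref. */
struct my_counter {
        unsigned int incs;
        unsigned int decs;
};

static struct my_counter __percpu *ctr;

static int my_counter_init(void)
{
        ctr = alloc_percpu(struct my_counter);  /* zeroed on every CPU */
        if (!ctr)
                return -ENOMEM;

        /* Touch only this CPU's copy, as module_unload_init() does. */
        __this_cpu_write(ctr->incs, 1);
        return 0;
}

static void my_counter_exit(void)
{
        free_percpu(ctr);
}
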
@@ -554,62 +563,70 @@ static int already_uses(struct module *a, struct module *b)
554 return 0; 563 return 0;
555} 564}
556 565
557/* Module a uses b */ 566/*
558int use_module(struct module *a, struct module *b) 567 * Module a uses b
568 * - we add 'a' as a "source", 'b' as a "target" of module use
569 * - the module_use is added to the list of 'b' sources (so
570 * 'b' can walk the list to see who sourced them), and of 'a'
571 * targets (so 'a' can see what modules it targets).
572 */
573static int add_module_usage(struct module *a, struct module *b)
559{ 574{
560 struct module_use *use; 575 struct module_use *use;
561 int no_warn, err;
562 576
563 if (b == NULL || already_uses(a, b)) return 1; 577 DEBUGP("Allocating new usage for %s.\n", a->name);
578 use = kmalloc(sizeof(*use), GFP_ATOMIC);
579 if (!use) {
580 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
581 return -ENOMEM;
582 }
583
584 use->source = a;
585 use->target = b;
586 list_add(&use->source_list, &b->source_list);
587 list_add(&use->target_list, &a->target_list);
588 return 0;
589}
590
591/* Module a uses b: caller must hold module_mutex */
592int ref_module(struct module *a, struct module *b)
593{
594 int err;
564 595
565 /* If we're interrupted or time out, we fail. */ 596 if (b == NULL || already_uses(a, b))
566 if (wait_event_interruptible_timeout(
567 module_wq, (err = strong_try_module_get(b)) != -EBUSY,
568 30 * HZ) <= 0) {
569 printk("%s: gave up waiting for init of module %s.\n",
570 a->name, b->name);
571 return 0; 597 return 0;
572 }
573 598
574 /* If strong_try_module_get() returned a different error, we fail. */ 599 /* If module isn't available, we fail. */
600 err = strong_try_module_get(b);
575 if (err) 601 if (err)
576 return 0; 602 return err;
577 603
578 DEBUGP("Allocating new usage for %s.\n", a->name); 604 err = add_module_usage(a, b);
579 use = kmalloc(sizeof(*use), GFP_ATOMIC); 605 if (err) {
580 if (!use) {
581 printk("%s: out of memory loading\n", a->name);
582 module_put(b); 606 module_put(b);
583 return 0; 607 return err;
584 } 608 }
585 609 return 0;
586 use->module_which_uses = a;
587 list_add(&use->list, &b->modules_which_use_me);
588 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
589 return 1;
590} 610}
591EXPORT_SYMBOL_GPL(use_module); 611EXPORT_SYMBOL_GPL(ref_module);
592 612
593/* Clear the unload stuff of the module. */ 613/* Clear the unload stuff of the module. */
594static void module_unload_free(struct module *mod) 614static void module_unload_free(struct module *mod)
595{ 615{
596 struct module *i; 616 struct module_use *use, *tmp;
597 617
598 list_for_each_entry(i, &modules, list) { 618 mutex_lock(&module_mutex);
599 struct module_use *use; 619 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
600 620 struct module *i = use->target;
601 list_for_each_entry(use, &i->modules_which_use_me, list) { 621 DEBUGP("%s unusing %s\n", mod->name, i->name);
602 if (use->module_which_uses == mod) { 622 module_put(i);
603 DEBUGP("%s unusing %s\n", mod->name, i->name); 623 list_del(&use->source_list);
604 module_put(i); 624 list_del(&use->target_list);
605 list_del(&use->list); 625 kfree(use);
606 kfree(use);
607 sysfs_remove_link(i->holders_dir, mod->name);
608 /* There can be at most one match. */
609 break;
610 }
611 }
612 } 626 }
627 mutex_unlock(&module_mutex);
628
629 free_percpu(mod->refptr);
613} 630}
614 631
615#ifdef CONFIG_MODULE_FORCE_UNLOAD 632#ifdef CONFIG_MODULE_FORCE_UNLOAD
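
The rewrite above replaces the one-sided modules_which_use_me list with a module_use node linked into both modules: "a uses b" puts the node on b's source_list and on a's target_list. A hedged sketch of walking both lists under module_mutex, the way print_unload_info() and module_unload_free() do; it assumes CONFIG_MODULE_UNLOAD and the struct module_use now exposed in <linux/module.h>, and dump_module_usage is illustrative:

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>

static void dump_module_usage(struct module *mod)
{
        struct module_use *use;

        mutex_lock(&module_mutex);

        /* Entries on mod->source_list name the modules that use "mod". */
        list_for_each_entry(use, &mod->source_list, source_list)
                pr_info("%s is used by %s\n", mod->name, use->source->name);

        /* Entries on mod->target_list name the modules that "mod" uses. */
        list_for_each_entry(use, &mod->target_list, target_list)
                pr_info("%s uses %s\n", mod->name, use->target->name);

        mutex_unlock(&module_mutex);
}
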
@@ -723,16 +740,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
723 return -EFAULT; 740 return -EFAULT;
724 name[MODULE_NAME_LEN-1] = '\0'; 741 name[MODULE_NAME_LEN-1] = '\0';
725 742
726 /* Create stop_machine threads since free_module relies on 743 if (mutex_lock_interruptible(&module_mutex) != 0)
727 * a non-failing stop_machine call. */ 744 return -EINTR;
728 ret = stop_machine_create();
729 if (ret)
730 return ret;
731
732 if (mutex_lock_interruptible(&module_mutex) != 0) {
733 ret = -EINTR;
734 goto out_stop;
735 }
736 745
737 mod = find_module(name); 746 mod = find_module(name);
738 if (!mod) { 747 if (!mod) {
@@ -740,7 +749,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
740 goto out; 749 goto out;
741 } 750 }
742 751
743 if (!list_empty(&mod->modules_which_use_me)) { 752 if (!list_empty(&mod->source_list)) {
744 /* Other modules depend on us: get rid of them first. */ 753 /* Other modules depend on us: get rid of them first. */
745 ret = -EWOULDBLOCK; 754 ret = -EWOULDBLOCK;
746 goto out; 755 goto out;
@@ -784,16 +793,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
784 blocking_notifier_call_chain(&module_notify_list, 793 blocking_notifier_call_chain(&module_notify_list,
785 MODULE_STATE_GOING, mod); 794 MODULE_STATE_GOING, mod);
786 async_synchronize_full(); 795 async_synchronize_full();
787 mutex_lock(&module_mutex); 796
788 /* Store the name of the last unloaded module for diagnostic purposes */ 797 /* Store the name of the last unloaded module for diagnostic purposes */
789 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 798 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
790 ddebug_remove_module(mod->name);
791 free_module(mod);
792 799
793 out: 800 free_module(mod);
801 return 0;
802out:
794 mutex_unlock(&module_mutex); 803 mutex_unlock(&module_mutex);
795out_stop:
796 stop_machine_destroy();
797 return ret; 804 return ret;
798} 805}
799 806
@@ -806,9 +813,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
806 813
807 /* Always include a trailing , so userspace can differentiate 814 /* Always include a trailing , so userspace can differentiate
808 between this and the old multi-field proc format. */ 815 between this and the old multi-field proc format. */
809 list_for_each_entry(use, &mod->modules_which_use_me, list) { 816 list_for_each_entry(use, &mod->source_list, source_list) {
810 printed_something = 1; 817 printed_something = 1;
811 seq_printf(m, "%s,", use->module_which_uses->name); 818 seq_printf(m, "%s,", use->source->name);
812 } 819 }
813 820
814 if (mod->init != NULL && mod->exit == NULL) { 821 if (mod->init != NULL && mod->exit == NULL) {
@@ -867,8 +874,7 @@ void module_put(struct module *module)
867 smp_wmb(); /* see comment in module_refcount */ 874 smp_wmb(); /* see comment in module_refcount */
868 __this_cpu_inc(module->refptr->decs); 875 __this_cpu_inc(module->refptr->decs);
869 876
870 trace_module_put(module, _RET_IP_, 877 trace_module_put(module, _RET_IP_);
871 __this_cpu_read(module->refptr->decs));
872 /* Maybe they're waiting for us to drop reference? */ 878 /* Maybe they're waiting for us to drop reference? */
873 if (unlikely(!module_is_live(module))) 879 if (unlikely(!module_is_live(module)))
874 wake_up_process(module->waiter); 880 wake_up_process(module->waiter);
@@ -888,14 +894,15 @@ static inline void module_unload_free(struct module *mod)
888{ 894{
889} 895}
890 896
891int use_module(struct module *a, struct module *b) 897int ref_module(struct module *a, struct module *b)
892{ 898{
893 return strong_try_module_get(b) == 0; 899 return strong_try_module_get(b);
894} 900}
895EXPORT_SYMBOL_GPL(use_module); 901EXPORT_SYMBOL_GPL(ref_module);
896 902
897static inline void module_unload_init(struct module *mod) 903static inline int module_unload_init(struct module *mod)
898{ 904{
905 return 0;
899} 906}
900#endif /* CONFIG_MODULE_UNLOAD */ 907#endif /* CONFIG_MODULE_UNLOAD */
901 908
@@ -1009,6 +1016,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1009{ 1016{
1010 const unsigned long *crc; 1017 const unsigned long *crc;
1011 1018
1019 /* Since this should be found in kernel (which can't be removed),
1020 * no locking is necessary. */
1012 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 1021 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1013 &crc, true, false)) 1022 &crc, true, false))
1014 BUG(); 1023 BUG();
@@ -1051,35 +1060,68 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1051} 1060}
1052#endif /* CONFIG_MODVERSIONS */ 1061#endif /* CONFIG_MODVERSIONS */
1053 1062
1054/* Resolve a symbol for this module. I.e. if we find one, record usage. 1063/* Resolve a symbol for this module. I.e. if we find one, record usage. */
1055 Must be holding module_mutex. */ 1064static const struct kernel_symbol *resolve_symbol(struct module *mod,
1056static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1065 const struct load_info *info,
1057 unsigned int versindex,
1058 const char *name, 1066 const char *name,
1059 struct module *mod) 1067 char ownername[])
1060{ 1068{
1061 struct module *owner; 1069 struct module *owner;
1062 const struct kernel_symbol *sym; 1070 const struct kernel_symbol *sym;
1063 const unsigned long *crc; 1071 const unsigned long *crc;
1072 int err;
1064 1073
1074 mutex_lock(&module_mutex);
1065 sym = find_symbol(name, &owner, &crc, 1075 sym = find_symbol(name, &owner, &crc,
1066 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1076 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1067 /* use_module can fail due to OOM, 1077 if (!sym)
1068 or module initialization or unloading */ 1078 goto unlock;
1069 if (sym) { 1079
1070 if (!check_version(sechdrs, versindex, name, mod, crc, owner) 1080 if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
1071 || !use_module(mod, owner)) 1081 owner)) {
1072 sym = NULL; 1082 sym = ERR_PTR(-EINVAL);
1083 goto getname;
1084 }
1085
1086 err = ref_module(mod, owner);
1087 if (err) {
1088 sym = ERR_PTR(err);
1089 goto getname;
1073 } 1090 }
1091
1092getname:
1093 /* We must make copy under the lock if we failed to get ref. */
1094 strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
1095unlock:
1096 mutex_unlock(&module_mutex);
1074 return sym; 1097 return sym;
1075} 1098}
1076 1099
1100static const struct kernel_symbol *
1101resolve_symbol_wait(struct module *mod,
1102 const struct load_info *info,
1103 const char *name)
1104{
1105 const struct kernel_symbol *ksym;
1106 char owner[MODULE_NAME_LEN];
1107
1108 if (wait_event_interruptible_timeout(module_wq,
1109 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
1110 || PTR_ERR(ksym) != -EBUSY,
1111 30 * HZ) <= 0) {
1112 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1113 mod->name, owner);
1114 }
1115 return ksym;
1116}
1117
1077/* 1118/*
1078 * /sys/module/foo/sections stuff 1119 * /sys/module/foo/sections stuff
1079 * J. Corbet <corbet@lwn.net> 1120 * J. Corbet <corbet@lwn.net>
1080 */ 1121 */
1081#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1122#ifdef CONFIG_SYSFS
1082 1123
1124#ifdef CONFIG_KALLSYMS
1083static inline bool sect_empty(const Elf_Shdr *sect) 1125static inline bool sect_empty(const Elf_Shdr *sect)
1084{ 1126{
1085 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1127 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
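
resolve_symbol() now reports failures as ERR_PTR() codes, and resolve_symbol_wait() keeps retrying only while that code is -EBUSY (the owning module is still initializing), bounded by a 30 second timeout. The same wait-on-ERR_PTR pattern as a standalone hedged sketch; my_lookup is a stub, my_obj is a made-up type, and whoever finishes initialization is assumed to wake my_wq:

#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>

struct my_obj {
        int id;
};

static DECLARE_WAIT_QUEUE_HEAD(my_wq);

/* Hypothetical lookup: a valid pointer, ERR_PTR(-EBUSY) while the object
 * is still initializing, or another ERR_PTR() code on hard failure. */
static struct my_obj *my_lookup(const char *name)
{
        return ERR_PTR(-ENODEV);        /* stub so the sketch stands alone */
}

static struct my_obj *my_lookup_wait(const char *name)
{
        struct my_obj *obj;

        /*
         * Sleep until the lookup stops reporting -EBUSY, giving up after
         * 30 seconds as the module loader does.
         */
        if (wait_event_interruptible_timeout(my_wq,
                        !IS_ERR(obj = my_lookup(name)) ||
                        PTR_ERR(obj) != -EBUSY,
                        30 * HZ) <= 0)
                printk(KERN_WARNING "gave up waiting for %s\n", name);

        return obj;
}
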
@@ -1116,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1116 kfree(sect_attrs); 1158 kfree(sect_attrs);
1117} 1159}
1118 1160
1119static void add_sect_attrs(struct module *mod, unsigned int nsect, 1161static void add_sect_attrs(struct module *mod, const struct load_info *info)
1120 char *secstrings, Elf_Shdr *sechdrs)
1121{ 1162{
1122 unsigned int nloaded = 0, i, size[2]; 1163 unsigned int nloaded = 0, i, size[2];
1123 struct module_sect_attrs *sect_attrs; 1164 struct module_sect_attrs *sect_attrs;
@@ -1125,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1125 struct attribute **gattr; 1166 struct attribute **gattr;
1126 1167
1127 /* Count loaded sections and allocate structures */ 1168 /* Count loaded sections and allocate structures */
1128 for (i = 0; i < nsect; i++) 1169 for (i = 0; i < info->hdr->e_shnum; i++)
1129 if (!sect_empty(&sechdrs[i])) 1170 if (!sect_empty(&info->sechdrs[i]))
1130 nloaded++; 1171 nloaded++;
1131 size[0] = ALIGN(sizeof(*sect_attrs) 1172 size[0] = ALIGN(sizeof(*sect_attrs)
1132 + nloaded * sizeof(sect_attrs->attrs[0]), 1173 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1143,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1143 sect_attrs->nsections = 0; 1184 sect_attrs->nsections = 0;
1144 sattr = &sect_attrs->attrs[0]; 1185 sattr = &sect_attrs->attrs[0];
1145 gattr = &sect_attrs->grp.attrs[0]; 1186 gattr = &sect_attrs->grp.attrs[0];
1146 for (i = 0; i < nsect; i++) { 1187 for (i = 0; i < info->hdr->e_shnum; i++) {
1147 if (sect_empty(&sechdrs[i])) 1188 Elf_Shdr *sec = &info->sechdrs[i];
1189 if (sect_empty(sec))
1148 continue; 1190 continue;
1149 sattr->address = sechdrs[i].sh_addr; 1191 sattr->address = sec->sh_addr;
1150 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1192 sattr->name = kstrdup(info->secstrings + sec->sh_name,
1151 GFP_KERNEL); 1193 GFP_KERNEL);
1152 if (sattr->name == NULL) 1194 if (sattr->name == NULL)
1153 goto out; 1195 goto out;
@@ -1192,7 +1234,7 @@ struct module_notes_attrs {
1192 struct bin_attribute attrs[0]; 1234 struct bin_attribute attrs[0];
1193}; 1235};
1194 1236
1195static ssize_t module_notes_read(struct kobject *kobj, 1237static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
1196 struct bin_attribute *bin_attr, 1238 struct bin_attribute *bin_attr,
1197 char *buf, loff_t pos, size_t count) 1239 char *buf, loff_t pos, size_t count)
1198{ 1240{
@@ -1215,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1215 kfree(notes_attrs); 1257 kfree(notes_attrs);
1216} 1258}
1217 1259
1218static void add_notes_attrs(struct module *mod, unsigned int nsect, 1260static void add_notes_attrs(struct module *mod, const struct load_info *info)
1219 char *secstrings, Elf_Shdr *sechdrs)
1220{ 1261{
1221 unsigned int notes, loaded, i; 1262 unsigned int notes, loaded, i;
1222 struct module_notes_attrs *notes_attrs; 1263 struct module_notes_attrs *notes_attrs;
@@ -1228,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1228 1269
1229 /* Count notes sections and allocate structures. */ 1270 /* Count notes sections and allocate structures. */
1230 notes = 0; 1271 notes = 0;
1231 for (i = 0; i < nsect; i++) 1272 for (i = 0; i < info->hdr->e_shnum; i++)
1232 if (!sect_empty(&sechdrs[i]) && 1273 if (!sect_empty(&info->sechdrs[i]) &&
1233 (sechdrs[i].sh_type == SHT_NOTE)) 1274 (info->sechdrs[i].sh_type == SHT_NOTE))
1234 ++notes; 1275 ++notes;
1235 1276
1236 if (notes == 0) 1277 if (notes == 0)
@@ -1244,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1244 1285
1245 notes_attrs->notes = notes; 1286 notes_attrs->notes = notes;
1246 nattr = &notes_attrs->attrs[0]; 1287 nattr = &notes_attrs->attrs[0];
1247 for (loaded = i = 0; i < nsect; ++i) { 1288 for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
1248 if (sect_empty(&sechdrs[i])) 1289 if (sect_empty(&info->sechdrs[i]))
1249 continue; 1290 continue;
1250 if (sechdrs[i].sh_type == SHT_NOTE) { 1291 if (info->sechdrs[i].sh_type == SHT_NOTE) {
1251 sysfs_bin_attr_init(nattr); 1292 sysfs_bin_attr_init(nattr);
1252 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1293 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1253 nattr->attr.mode = S_IRUGO; 1294 nattr->attr.mode = S_IRUGO;
1254 nattr->size = sechdrs[i].sh_size; 1295 nattr->size = info->sechdrs[i].sh_size;
1255 nattr->private = (void *) sechdrs[i].sh_addr; 1296 nattr->private = (void *) info->sechdrs[i].sh_addr;
1256 nattr->read = module_notes_read; 1297 nattr->read = module_notes_read;
1257 ++nattr; 1298 ++nattr;
1258 } 1299 }
@@ -1283,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod)
1283 1324
1284#else 1325#else
1285 1326
1286static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1327static inline void add_sect_attrs(struct module *mod,
1287 char *sectstrings, Elf_Shdr *sechdrs) 1328 const struct load_info *info)
1288{ 1329{
1289} 1330}
1290 1331
@@ -1292,18 +1333,44 @@ static inline void remove_sect_attrs(struct module *mod)
1292{ 1333{
1293} 1334}
1294 1335
1295static inline void add_notes_attrs(struct module *mod, unsigned int nsect, 1336static inline void add_notes_attrs(struct module *mod,
1296 char *sectstrings, Elf_Shdr *sechdrs) 1337 const struct load_info *info)
1297{ 1338{
1298} 1339}
1299 1340
1300static inline void remove_notes_attrs(struct module *mod) 1341static inline void remove_notes_attrs(struct module *mod)
1301{ 1342{
1302} 1343}
1344#endif /* CONFIG_KALLSYMS */
1345
1346static void add_usage_links(struct module *mod)
1347{
1348#ifdef CONFIG_MODULE_UNLOAD
1349 struct module_use *use;
1350 int nowarn;
1351
1352 mutex_lock(&module_mutex);
1353 list_for_each_entry(use, &mod->target_list, target_list) {
1354 nowarn = sysfs_create_link(use->target->holders_dir,
1355 &mod->mkobj.kobj, mod->name);
1356 }
1357 mutex_unlock(&module_mutex);
1303#endif 1358#endif
1359}
1304 1360
1305#ifdef CONFIG_SYSFS 1361static void del_usage_links(struct module *mod)
1306int module_add_modinfo_attrs(struct module *mod) 1362{
1363#ifdef CONFIG_MODULE_UNLOAD
1364 struct module_use *use;
1365
1366 mutex_lock(&module_mutex);
1367 list_for_each_entry(use, &mod->target_list, target_list)
1368 sysfs_remove_link(use->target->holders_dir, mod->name);
1369 mutex_unlock(&module_mutex);
1370#endif
1371}
1372
1373static int module_add_modinfo_attrs(struct module *mod)
1307{ 1374{
1308 struct module_attribute *attr; 1375 struct module_attribute *attr;
1309 struct module_attribute *temp_attr; 1376 struct module_attribute *temp_attr;
@@ -1329,7 +1396,7 @@ int module_add_modinfo_attrs(struct module *mod)
1329 return error; 1396 return error;
1330} 1397}
1331 1398
1332void module_remove_modinfo_attrs(struct module *mod) 1399static void module_remove_modinfo_attrs(struct module *mod)
1333{ 1400{
1334 struct module_attribute *attr; 1401 struct module_attribute *attr;
1335 int i; 1402 int i;
@@ -1345,7 +1412,7 @@ void module_remove_modinfo_attrs(struct module *mod)
1345 kfree(mod->modinfo_attrs); 1412 kfree(mod->modinfo_attrs);
1346} 1413}
1347 1414
1348int mod_sysfs_init(struct module *mod) 1415static int mod_sysfs_init(struct module *mod)
1349{ 1416{
1350 int err; 1417 int err;
1351 struct kobject *kobj; 1418 struct kobject *kobj;
@@ -1379,12 +1446,17 @@ out:
1379 return err; 1446 return err;
1380} 1447}
1381 1448
1382int mod_sysfs_setup(struct module *mod, 1449static int mod_sysfs_setup(struct module *mod,
1450 const struct load_info *info,
1383 struct kernel_param *kparam, 1451 struct kernel_param *kparam,
1384 unsigned int num_params) 1452 unsigned int num_params)
1385{ 1453{
1386 int err; 1454 int err;
1387 1455
1456 err = mod_sysfs_init(mod);
1457 if (err)
1458 goto out;
1459
1388 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); 1460 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
1389 if (!mod->holders_dir) { 1461 if (!mod->holders_dir) {
1390 err = -ENOMEM; 1462 err = -ENOMEM;
@@ -1399,6 +1471,10 @@ int mod_sysfs_setup(struct module *mod,
1399 if (err) 1471 if (err)
1400 goto out_unreg_param; 1472 goto out_unreg_param;
1401 1473
1474 add_usage_links(mod);
1475 add_sect_attrs(mod, info);
1476 add_notes_attrs(mod, info);
1477
1402 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1478 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1403 return 0; 1479 return 0;
1404 1480
@@ -1408,24 +1484,44 @@ out_unreg_holders:
1408 kobject_put(mod->holders_dir); 1484 kobject_put(mod->holders_dir);
1409out_unreg: 1485out_unreg:
1410 kobject_put(&mod->mkobj.kobj); 1486 kobject_put(&mod->mkobj.kobj);
1487out:
1411 return err; 1488 return err;
1412} 1489}
1413 1490
1414static void mod_sysfs_fini(struct module *mod) 1491static void mod_sysfs_fini(struct module *mod)
1415{ 1492{
1493 remove_notes_attrs(mod);
1494 remove_sect_attrs(mod);
1416 kobject_put(&mod->mkobj.kobj); 1495 kobject_put(&mod->mkobj.kobj);
1417} 1496}
1418 1497
1419#else /* CONFIG_SYSFS */ 1498#else /* !CONFIG_SYSFS */
1499
1500static int mod_sysfs_setup(struct module *mod,
1501 const struct load_info *info,
1502 struct kernel_param *kparam,
1503 unsigned int num_params)
1504{
1505 return 0;
1506}
1420 1507
1421static void mod_sysfs_fini(struct module *mod) 1508static void mod_sysfs_fini(struct module *mod)
1422{ 1509{
1423} 1510}
1424 1511
1512static void module_remove_modinfo_attrs(struct module *mod)
1513{
1514}
1515
1516static void del_usage_links(struct module *mod)
1517{
1518}
1519
1425#endif /* CONFIG_SYSFS */ 1520#endif /* CONFIG_SYSFS */
1426 1521
1427static void mod_kobject_remove(struct module *mod) 1522static void mod_sysfs_teardown(struct module *mod)
1428{ 1523{
1524 del_usage_links(mod);
1429 module_remove_modinfo_attrs(mod); 1525 module_remove_modinfo_attrs(mod);
1430 module_param_sysfs_remove(mod); 1526 module_param_sysfs_remove(mod);
1431 kobject_put(mod->mkobj.drivers_dir); 1527 kobject_put(mod->mkobj.drivers_dir);
@@ -1441,19 +1537,23 @@ static int __unlink_module(void *_mod)
1441{ 1537{
1442 struct module *mod = _mod; 1538 struct module *mod = _mod;
1443 list_del(&mod->list); 1539 list_del(&mod->list);
1540 module_bug_cleanup(mod);
1444 return 0; 1541 return 0;
1445} 1542}
1446 1543
1447/* Free a module, remove from lists, etc (must hold module_mutex). */ 1544/* Free a module, remove from lists, etc. */
1448static void free_module(struct module *mod) 1545static void free_module(struct module *mod)
1449{ 1546{
1450 trace_module_free(mod); 1547 trace_module_free(mod);
1451 1548
1452 /* Delete from various lists */ 1549 /* Delete from various lists */
1550 mutex_lock(&module_mutex);
1453 stop_machine(__unlink_module, mod, NULL); 1551 stop_machine(__unlink_module, mod, NULL);
1454 remove_notes_attrs(mod); 1552 mutex_unlock(&module_mutex);
1455 remove_sect_attrs(mod); 1553 mod_sysfs_teardown(mod);
1456 mod_kobject_remove(mod); 1554
1555 /* Remove dynamic debug info */
1556 ddebug_remove_module(mod->name);
1457 1557
1458 /* Arch-specific cleanup. */ 1558 /* Arch-specific cleanup. */
1459 module_arch_cleanup(mod); 1559 module_arch_cleanup(mod);
@@ -1468,10 +1568,7 @@ static void free_module(struct module *mod)
1468 module_free(mod, mod->module_init); 1568 module_free(mod, mod->module_init);
1469 kfree(mod->args); 1569 kfree(mod->args);
1470 percpu_modfree(mod); 1570 percpu_modfree(mod);
1471#if defined(CONFIG_MODULE_UNLOAD) 1571
1472 if (mod->refptr)
1473 free_percpu(mod->refptr);
1474#endif
1475 /* Free lock-classes: */ 1572 /* Free lock-classes: */
1476 lockdep_free_key_range(mod->module_core, mod->core_size); 1573 lockdep_free_key_range(mod->module_core, mod->core_size);
1477 1574
@@ -1501,6 +1598,8 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1501/* 1598/*
1502 * Ensure that an exported symbol [global namespace] does not already exist 1599 * Ensure that an exported symbol [global namespace] does not already exist
1503 * in the kernel or in some other module's exported symbol table. 1600 * in the kernel or in some other module's exported symbol table.
1601 *
1602 * You must hold the module_mutex.
1504 */ 1603 */
1505static int verify_export_symbols(struct module *mod) 1604static int verify_export_symbols(struct module *mod)
1506{ 1605{
@@ -1535,25 +1634,23 @@ static int verify_export_symbols(struct module *mod)
1535} 1634}
1536 1635
1537/* Change all symbols so that st_value encodes the pointer directly. */ 1636/* Change all symbols so that st_value encodes the pointer directly. */
1538static int simplify_symbols(Elf_Shdr *sechdrs, 1637static int simplify_symbols(struct module *mod, const struct load_info *info)
1539 unsigned int symindex, 1638{
1540 const char *strtab, 1639 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
1541 unsigned int versindex, 1640 Elf_Sym *sym = (void *)symsec->sh_addr;
1542 unsigned int pcpuindex,
1543 struct module *mod)
1544{
1545 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1546 unsigned long secbase; 1641 unsigned long secbase;
1547 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1642 unsigned int i;
1548 int ret = 0; 1643 int ret = 0;
1549 const struct kernel_symbol *ksym; 1644 const struct kernel_symbol *ksym;
1550 1645
1551 for (i = 1; i < n; i++) { 1646 for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
1647 const char *name = info->strtab + sym[i].st_name;
1648
1552 switch (sym[i].st_shndx) { 1649 switch (sym[i].st_shndx) {
1553 case SHN_COMMON: 1650 case SHN_COMMON:
1554 /* We compiled with -fno-common. These are not 1651 /* We compiled with -fno-common. These are not
1555 supposed to happen. */ 1652 supposed to happen. */
1556 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); 1653 DEBUGP("Common symbol: %s\n", name);
1557 printk("%s: please compile with -fno-common\n", 1654 printk("%s: please compile with -fno-common\n",
1558 mod->name); 1655 mod->name);
1559 ret = -ENOEXEC; 1656 ret = -ENOEXEC;
@@ -1566,29 +1663,28 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1566 break; 1663 break;
1567 1664
1568 case SHN_UNDEF: 1665 case SHN_UNDEF:
1569 ksym = resolve_symbol(sechdrs, versindex, 1666 ksym = resolve_symbol_wait(mod, info, name);
1570 strtab + sym[i].st_name, mod);
1571 /* Ok if resolved. */ 1667 /* Ok if resolved. */
1572 if (ksym) { 1668 if (ksym && !IS_ERR(ksym)) {
1573 sym[i].st_value = ksym->value; 1669 sym[i].st_value = ksym->value;
1574 break; 1670 break;
1575 } 1671 }
1576 1672
1577 /* Ok if weak. */ 1673 /* Ok if weak. */
1578 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1674 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1579 break; 1675 break;
1580 1676
1581 printk(KERN_WARNING "%s: Unknown symbol %s\n", 1677 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1582 mod->name, strtab + sym[i].st_name); 1678 mod->name, name, PTR_ERR(ksym));
1583 ret = -ENOENT; 1679 ret = PTR_ERR(ksym) ?: -ENOENT;
1584 break; 1680 break;
1585 1681
1586 default: 1682 default:
1587 /* Divert to percpu allocation if a percpu var. */ 1683 /* Divert to percpu allocation if a percpu var. */
1588 if (sym[i].st_shndx == pcpuindex) 1684 if (sym[i].st_shndx == info->index.pcpu)
1589 secbase = (unsigned long)mod_percpu(mod); 1685 secbase = (unsigned long)mod_percpu(mod);
1590 else 1686 else
1591 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1687 secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
1592 sym[i].st_value += secbase; 1688 sym[i].st_value += secbase;
1593 break; 1689 break;
1594 } 1690 }
@@ -1597,6 +1693,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1597 return ret; 1693 return ret;
1598} 1694}
1599 1695
1696static int apply_relocations(struct module *mod, const struct load_info *info)
1697{
1698 unsigned int i;
1699 int err = 0;
1700
1701 /* Now do relocations. */
1702 for (i = 1; i < info->hdr->e_shnum; i++) {
1703 unsigned int infosec = info->sechdrs[i].sh_info;
1704
1705 /* Not a valid relocation section? */
1706 if (infosec >= info->hdr->e_shnum)
1707 continue;
1708
1709 /* Don't bother with non-allocated sections */
1710 if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
1711 continue;
1712
1713 if (info->sechdrs[i].sh_type == SHT_REL)
1714 err = apply_relocate(info->sechdrs, info->strtab,
1715 info->index.sym, i, mod);
1716 else if (info->sechdrs[i].sh_type == SHT_RELA)
1717 err = apply_relocate_add(info->sechdrs, info->strtab,
1718 info->index.sym, i, mod);
1719 if (err < 0)
1720 break;
1721 }
1722 return err;
1723}
1724
1600/* Additional bytes needed by arch in front of individual sections */ 1725/* Additional bytes needed by arch in front of individual sections */
1601unsigned int __weak arch_mod_section_prepend(struct module *mod, 1726unsigned int __weak arch_mod_section_prepend(struct module *mod,
1602 unsigned int section) 1727 unsigned int section)
@@ -1621,10 +1746,7 @@ static long get_offset(struct module *mod, unsigned int *size,
1621 might -- code, read-only data, read-write data, small data. Tally 1746 might -- code, read-only data, read-write data, small data. Tally
1622 sizes, and place the offsets into sh_entsize fields: high bit means it 1747 sizes, and place the offsets into sh_entsize fields: high bit means it
1623 belongs in init. */ 1748 belongs in init. */
1624static void layout_sections(struct module *mod, 1749static void layout_sections(struct module *mod, struct load_info *info)
1625 const Elf_Ehdr *hdr,
1626 Elf_Shdr *sechdrs,
1627 const char *secstrings)
1628{ 1750{
1629 static unsigned long const masks[][2] = { 1751 static unsigned long const masks[][2] = {
1630 /* NOTE: all executable code must be the first section 1752 /* NOTE: all executable code must be the first section
@@ -1637,21 +1759,22 @@ static void layout_sections(struct module *mod,
1637 }; 1759 };
1638 unsigned int m, i; 1760 unsigned int m, i;
1639 1761
1640 for (i = 0; i < hdr->e_shnum; i++) 1762 for (i = 0; i < info->hdr->e_shnum; i++)
1641 sechdrs[i].sh_entsize = ~0UL; 1763 info->sechdrs[i].sh_entsize = ~0UL;
1642 1764
1643 DEBUGP("Core section allocation order:\n"); 1765 DEBUGP("Core section allocation order:\n");
1644 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1766 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1645 for (i = 0; i < hdr->e_shnum; ++i) { 1767 for (i = 0; i < info->hdr->e_shnum; ++i) {
1646 Elf_Shdr *s = &sechdrs[i]; 1768 Elf_Shdr *s = &info->sechdrs[i];
1769 const char *sname = info->secstrings + s->sh_name;
1647 1770
1648 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1771 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1649 || (s->sh_flags & masks[m][1]) 1772 || (s->sh_flags & masks[m][1])
1650 || s->sh_entsize != ~0UL 1773 || s->sh_entsize != ~0UL
1651 || strstarts(secstrings + s->sh_name, ".init")) 1774 || strstarts(sname, ".init"))
1652 continue; 1775 continue;
1653 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1776 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1654 DEBUGP("\t%s\n", secstrings + s->sh_name); 1777 DEBUGP("\t%s\n", sname);
1655 } 1778 }
1656 if (m == 0) 1779 if (m == 0)
1657 mod->core_text_size = mod->core_size; 1780 mod->core_text_size = mod->core_size;
@@ -1659,17 +1782,18 @@ static void layout_sections(struct module *mod,
1659 1782
1660 DEBUGP("Init section allocation order:\n"); 1783 DEBUGP("Init section allocation order:\n");
1661 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1784 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1662 for (i = 0; i < hdr->e_shnum; ++i) { 1785 for (i = 0; i < info->hdr->e_shnum; ++i) {
1663 Elf_Shdr *s = &sechdrs[i]; 1786 Elf_Shdr *s = &info->sechdrs[i];
1787 const char *sname = info->secstrings + s->sh_name;
1664 1788
1665 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1789 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1666 || (s->sh_flags & masks[m][1]) 1790 || (s->sh_flags & masks[m][1])
1667 || s->sh_entsize != ~0UL 1791 || s->sh_entsize != ~0UL
1668 || !strstarts(secstrings + s->sh_name, ".init")) 1792 || !strstarts(sname, ".init"))
1669 continue; 1793 continue;
1670 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1794 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1671 | INIT_OFFSET_MASK); 1795 | INIT_OFFSET_MASK);
1672 DEBUGP("\t%s\n", secstrings + s->sh_name); 1796 DEBUGP("\t%s\n", sname);
1673 } 1797 }
1674 if (m == 0) 1798 if (m == 0)
1675 mod->init_text_size = mod->init_size; 1799 mod->init_text_size = mod->init_size;
@@ -1708,33 +1832,28 @@ static char *next_string(char *string, unsigned long *secsize)
1708 return string; 1832 return string;
1709} 1833}
1710 1834
1711static char *get_modinfo(Elf_Shdr *sechdrs, 1835static char *get_modinfo(struct load_info *info, const char *tag)
1712 unsigned int info,
1713 const char *tag)
1714{ 1836{
1715 char *p; 1837 char *p;
1716 unsigned int taglen = strlen(tag); 1838 unsigned int taglen = strlen(tag);
1717 unsigned long size = sechdrs[info].sh_size; 1839 Elf_Shdr *infosec = &info->sechdrs[info->index.info];
1840 unsigned long size = infosec->sh_size;
1718 1841
1719 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { 1842 for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
1720 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') 1843 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1721 return p + taglen + 1; 1844 return p + taglen + 1;
1722 } 1845 }
1723 return NULL; 1846 return NULL;
1724} 1847}
1725 1848
1726static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, 1849static void setup_modinfo(struct module *mod, struct load_info *info)
1727 unsigned int infoindex)
1728{ 1850{
1729 struct module_attribute *attr; 1851 struct module_attribute *attr;
1730 int i; 1852 int i;
1731 1853
1732 for (i = 0; (attr = modinfo_attrs[i]); i++) { 1854 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1733 if (attr->setup) 1855 if (attr->setup)
1734 attr->setup(mod, 1856 attr->setup(mod, get_modinfo(info, attr->attr.name));
1735 get_modinfo(sechdrs,
1736 infoindex,
1737 attr->attr.name));
1738 } 1857 }
1739} 1858}
1740 1859
@@ -1775,11 +1894,10 @@ static int is_exported(const char *name, unsigned long value,
1775} 1894}
1776 1895
1777/* As per nm */ 1896/* As per nm */
1778static char elf_type(const Elf_Sym *sym, 1897static char elf_type(const Elf_Sym *sym, const struct load_info *info)
1779 Elf_Shdr *sechdrs,
1780 const char *secstrings,
1781 struct module *mod)
1782{ 1898{
1899 const Elf_Shdr *sechdrs = info->sechdrs;
1900
1783 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { 1901 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1784 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) 1902 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1785 return 'v'; 1903 return 'v';
@@ -1809,8 +1927,10 @@ static char elf_type(const Elf_Sym *sym,
1809 else 1927 else
1810 return 'b'; 1928 return 'b';
1811 } 1929 }
1812 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) 1930 if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
1931 ".debug")) {
1813 return 'n'; 1932 return 'n';
1933 }
1814 return '?'; 1934 return '?';
1815} 1935}
1816 1936
@@ -1835,127 +1955,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1835 return true; 1955 return true;
1836} 1956}
1837 1957
1838static unsigned long layout_symtab(struct module *mod, 1958static void layout_symtab(struct module *mod, struct load_info *info)
1839 Elf_Shdr *sechdrs,
1840 unsigned int symindex,
1841 unsigned int strindex,
1842 const Elf_Ehdr *hdr,
1843 const char *secstrings,
1844 unsigned long *pstroffs,
1845 unsigned long *strmap)
1846{ 1959{
1847 unsigned long symoffs; 1960 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
1848 Elf_Shdr *symsect = sechdrs + symindex; 1961 Elf_Shdr *strsect = info->sechdrs + info->index.str;
1849 Elf_Shdr *strsect = sechdrs + strindex;
1850 const Elf_Sym *src; 1962 const Elf_Sym *src;
1851 const char *strtab;
1852 unsigned int i, nsrc, ndst; 1963 unsigned int i, nsrc, ndst;
1853 1964
1854 /* Put symbol section at end of init part of module. */ 1965 /* Put symbol section at end of init part of module. */
1855 symsect->sh_flags |= SHF_ALLOC; 1966 symsect->sh_flags |= SHF_ALLOC;
1856 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 1967 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1857 symindex) | INIT_OFFSET_MASK; 1968 info->index.sym) | INIT_OFFSET_MASK;
1858 DEBUGP("\t%s\n", secstrings + symsect->sh_name); 1969 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
1859 1970
1860 src = (void *)hdr + symsect->sh_offset; 1971 src = (void *)info->hdr + symsect->sh_offset;
1861 nsrc = symsect->sh_size / sizeof(*src); 1972 nsrc = symsect->sh_size / sizeof(*src);
1862 strtab = (void *)hdr + strsect->sh_offset;
1863 for (ndst = i = 1; i < nsrc; ++i, ++src) 1973 for (ndst = i = 1; i < nsrc; ++i, ++src)
1864 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { 1974 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
1865 unsigned int j = src->st_name; 1975 unsigned int j = src->st_name;
1866 1976
1867 while(!__test_and_set_bit(j, strmap) && strtab[j]) 1977 while (!__test_and_set_bit(j, info->strmap)
1978 && info->strtab[j])
1868 ++j; 1979 ++j;
1869 ++ndst; 1980 ++ndst;
1870 } 1981 }
1871 1982
1872 /* Append room for core symbols at end of core part. */ 1983 /* Append room for core symbols at end of core part. */
1873 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 1984 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1874 mod->core_size = symoffs + ndst * sizeof(Elf_Sym); 1985 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
1875 1986
1876 /* Put string table section at end of init part of module. */ 1987 /* Put string table section at end of init part of module. */
1877 strsect->sh_flags |= SHF_ALLOC; 1988 strsect->sh_flags |= SHF_ALLOC;
1878 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 1989 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1879 strindex) | INIT_OFFSET_MASK; 1990 info->index.str) | INIT_OFFSET_MASK;
1880 DEBUGP("\t%s\n", secstrings + strsect->sh_name); 1991 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
1881 1992
1882 /* Append room for core symbols' strings at end of core part. */ 1993 /* Append room for core symbols' strings at end of core part. */
1883 *pstroffs = mod->core_size; 1994 info->stroffs = mod->core_size;
1884 __set_bit(0, strmap); 1995 __set_bit(0, info->strmap);
1885 mod->core_size += bitmap_weight(strmap, strsect->sh_size); 1996 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
1886
1887 return symoffs;
1888} 1997}
1889 1998
1890static void add_kallsyms(struct module *mod, 1999static void add_kallsyms(struct module *mod, const struct load_info *info)
1891 Elf_Shdr *sechdrs,
1892 unsigned int shnum,
1893 unsigned int symindex,
1894 unsigned int strindex,
1895 unsigned long symoffs,
1896 unsigned long stroffs,
1897 const char *secstrings,
1898 unsigned long *strmap)
1899{ 2000{
1900 unsigned int i, ndst; 2001 unsigned int i, ndst;
1901 const Elf_Sym *src; 2002 const Elf_Sym *src;
1902 Elf_Sym *dst; 2003 Elf_Sym *dst;
1903 char *s; 2004 char *s;
2005 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
1904 2006
1905 mod->symtab = (void *)sechdrs[symindex].sh_addr; 2007 mod->symtab = (void *)symsec->sh_addr;
1906 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 2008 mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
1907 mod->strtab = (void *)sechdrs[strindex].sh_addr; 2009 /* Make sure we get permanent strtab: don't use info->strtab. */
2010 mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
1908 2011
1909 /* Set types up while we still have access to sections. */ 2012 /* Set types up while we still have access to sections. */
1910 for (i = 0; i < mod->num_symtab; i++) 2013 for (i = 0; i < mod->num_symtab; i++)
1911 mod->symtab[i].st_info 2014 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
1912 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1913 2015
1914 mod->core_symtab = dst = mod->module_core + symoffs; 2016 mod->core_symtab = dst = mod->module_core + info->symoffs;
1915 src = mod->symtab; 2017 src = mod->symtab;
1916 *dst = *src; 2018 *dst = *src;
1917 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2019 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
1918 if (!is_core_symbol(src, sechdrs, shnum)) 2020 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
1919 continue; 2021 continue;
1920 dst[ndst] = *src; 2022 dst[ndst] = *src;
1921 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); 2023 dst[ndst].st_name = bitmap_weight(info->strmap,
2024 dst[ndst].st_name);
1922 ++ndst; 2025 ++ndst;
1923 } 2026 }
1924 mod->core_num_syms = ndst; 2027 mod->core_num_syms = ndst;
1925 2028
1926 mod->core_strtab = s = mod->module_core + stroffs; 2029 mod->core_strtab = s = mod->module_core + info->stroffs;
1927 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) 2030 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
1928 if (test_bit(i, strmap)) 2031 if (test_bit(i, info->strmap))
1929 *++s = mod->strtab[i]; 2032 *++s = mod->strtab[i];
1930} 2033}
1931#else 2034#else
1932static inline unsigned long layout_symtab(struct module *mod, 2035static inline void layout_symtab(struct module *mod, struct load_info *info)
1933 Elf_Shdr *sechdrs,
1934 unsigned int symindex,
1935 unsigned int strindex,
1936 const Elf_Ehdr *hdr,
1937 const char *secstrings,
1938 unsigned long *pstroffs,
1939 unsigned long *strmap)
1940{ 2036{
1941 return 0;
1942} 2037}
1943 2038
1944static inline void add_kallsyms(struct module *mod, 2039static void add_kallsyms(struct module *mod, struct load_info *info)
1945 Elf_Shdr *sechdrs,
1946 unsigned int shnum,
1947 unsigned int symindex,
1948 unsigned int strindex,
1949 unsigned long symoffs,
1950 unsigned long stroffs,
1951 const char *secstrings,
1952 const unsigned long *strmap)
1953{ 2040{
1954} 2041}
1955#endif /* CONFIG_KALLSYMS */ 2042#endif /* CONFIG_KALLSYMS */
1956 2043
1957static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) 2044static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
1958{ 2045{
2046 if (!debug)
2047 return;
1959#ifdef CONFIG_DYNAMIC_DEBUG 2048#ifdef CONFIG_DYNAMIC_DEBUG
1960 if (ddebug_add_module(debug, num, debug->modname)) 2049 if (ddebug_add_module(debug, num, debug->modname))
1961 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2050 printk(KERN_ERR "dynamic debug error adding module: %s\n",
@@ -1963,77 +2052,70 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
1963#endif 2052#endif
1964} 2053}
1965 2054
2055static void dynamic_debug_remove(struct _ddebug *debug)
2056{
2057 if (debug)
2058 ddebug_remove_module(debug->modname);
2059}
2060
1966static void *module_alloc_update_bounds(unsigned long size) 2061static void *module_alloc_update_bounds(unsigned long size)
1967{ 2062{
1968 void *ret = module_alloc(size); 2063 void *ret = module_alloc(size);
1969 2064
1970 if (ret) { 2065 if (ret) {
2066 mutex_lock(&module_mutex);
1971 /* Update module bounds. */ 2067 /* Update module bounds. */
1972 if ((unsigned long)ret < module_addr_min) 2068 if ((unsigned long)ret < module_addr_min)
1973 module_addr_min = (unsigned long)ret; 2069 module_addr_min = (unsigned long)ret;
1974 if ((unsigned long)ret + size > module_addr_max) 2070 if ((unsigned long)ret + size > module_addr_max)
1975 module_addr_max = (unsigned long)ret + size; 2071 module_addr_max = (unsigned long)ret + size;
2072 mutex_unlock(&module_mutex);
1976 } 2073 }
1977 return ret; 2074 return ret;
1978} 2075}
1979 2076
1980#ifdef CONFIG_DEBUG_KMEMLEAK 2077#ifdef CONFIG_DEBUG_KMEMLEAK
1981static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2078static void kmemleak_load_module(const struct module *mod,
1982 Elf_Shdr *sechdrs, char *secstrings) 2079 const struct load_info *info)
1983{ 2080{
1984 unsigned int i; 2081 unsigned int i;
1985 2082
1986 /* only scan the sections containing data */ 2083 /* only scan the sections containing data */
1987 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); 2084 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
1988 2085
1989 for (i = 1; i < hdr->e_shnum; i++) { 2086 for (i = 1; i < info->hdr->e_shnum; i++) {
1990 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2087 const char *name = info->secstrings + info->sechdrs[i].sh_name;
2088 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
1991 continue; 2089 continue;
1992 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 2090 if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
1993 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
1994 continue; 2091 continue;
1995 2092
1996 kmemleak_scan_area((void *)sechdrs[i].sh_addr, 2093 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
1997 sechdrs[i].sh_size, GFP_KERNEL); 2094 info->sechdrs[i].sh_size, GFP_KERNEL);
1998 } 2095 }
1999} 2096}
2000#else 2097#else
2001static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2098static inline void kmemleak_load_module(const struct module *mod,
2002 Elf_Shdr *sechdrs, char *secstrings) 2099 const struct load_info *info)
2003{ 2100{
2004} 2101}
2005#endif 2102#endif
2006 2103
2007/* Allocate and load the module: note that size of section 0 is always 2104/* Sets info->hdr and info->len. */
2008 zero, and we rely on this for optional sections. */ 2105static int copy_and_check(struct load_info *info,
2009static noinline struct module *load_module(void __user *umod, 2106 const void __user *umod, unsigned long len,
2010 unsigned long len, 2107 const char __user *uargs)
2011 const char __user *uargs)
2012{ 2108{
2109 int err;
2013 Elf_Ehdr *hdr; 2110 Elf_Ehdr *hdr;
2014 Elf_Shdr *sechdrs;
2015 char *secstrings, *args, *modmagic, *strtab = NULL;
2016 char *staging;
2017 unsigned int i;
2018 unsigned int symindex = 0;
2019 unsigned int strindex = 0;
2020 unsigned int modindex, versindex, infoindex, pcpuindex;
2021 struct module *mod;
2022 long err = 0;
2023 void *ptr = NULL; /* Stops spurious gcc warning */
2024 unsigned long symoffs, stroffs, *strmap;
2025
2026 mm_segment_t old_fs;
2027 2111
2028 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2029 umod, len, uargs);
2030 if (len < sizeof(*hdr)) 2112 if (len < sizeof(*hdr))
2031 return ERR_PTR(-ENOEXEC); 2113 return -ENOEXEC;
2032 2114
2033 /* Suck in entire file: we'll want most of it. */ 2115 /* Suck in entire file: we'll want most of it. */
2034 /* vmalloc barfs on "unusual" numbers. Check here */ 2116 /* vmalloc barfs on "unusual" numbers. Check here */
2035 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 2117 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2036 return ERR_PTR(-ENOMEM); 2118 return -ENOMEM;
2037 2119
2038 if (copy_from_user(hdr, umod, len) != 0) { 2120 if (copy_from_user(hdr, umod, len) != 0) {
2039 err = -EFAULT; 2121 err = -EFAULT;
@@ -2041,138 +2123,225 @@ static noinline struct module *load_module(void __user *umod,
2041 } 2123 }
2042 2124
2043 /* Sanity checks against insmoding binaries or wrong arch, 2125 /* Sanity checks against insmoding binaries or wrong arch,
2044 weird elf version */ 2126 weird elf version */
2045 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2127 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
2046 || hdr->e_type != ET_REL 2128 || hdr->e_type != ET_REL
2047 || !elf_check_arch(hdr) 2129 || !elf_check_arch(hdr)
2048 || hdr->e_shentsize != sizeof(*sechdrs)) { 2130 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2049 err = -ENOEXEC; 2131 err = -ENOEXEC;
2050 goto free_hdr; 2132 goto free_hdr;
2051 } 2133 }
2052 2134
2053 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) 2135 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
2054 goto truncated; 2136 err = -ENOEXEC;
2137 goto free_hdr;
2138 }
2139
2140 info->hdr = hdr;
2141 info->len = len;
2142 return 0;
2143
2144free_hdr:
2145 vfree(hdr);
2146 return err;
2147}
2148
2149static void free_copy(struct load_info *info)
2150{
2151 vfree(info->hdr);
2152}
2055 2153
2056 /* Convenience variables */ 2154static int rewrite_section_headers(struct load_info *info)
2057 sechdrs = (void *)hdr + hdr->e_shoff; 2155{
2058 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 2156 unsigned int i;
2059 sechdrs[0].sh_addr = 0;
2060 2157
2061 for (i = 1; i < hdr->e_shnum; i++) { 2158 /* This should always be true, but let's be sure. */
2062 if (sechdrs[i].sh_type != SHT_NOBITS 2159 info->sechdrs[0].sh_addr = 0;
2063 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) 2160
2064 goto truncated; 2161 for (i = 1; i < info->hdr->e_shnum; i++) {
2162 Elf_Shdr *shdr = &info->sechdrs[i];
2163 if (shdr->sh_type != SHT_NOBITS
2164 && info->len < shdr->sh_offset + shdr->sh_size) {
2165 printk(KERN_ERR "Module len %lu truncated\n",
2166 info->len);
2167 return -ENOEXEC;
2168 }
2065 2169
2066 /* Mark all sections sh_addr with their address in the 2170 /* Mark all sections sh_addr with their address in the
2067 temporary image. */ 2171 temporary image. */
2068 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; 2172 shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
2069 2173
2070 /* Internal symbols and strings. */
2071 if (sechdrs[i].sh_type == SHT_SYMTAB) {
2072 symindex = i;
2073 strindex = sechdrs[i].sh_link;
2074 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
2075 }
2076#ifndef CONFIG_MODULE_UNLOAD 2174#ifndef CONFIG_MODULE_UNLOAD
2077 /* Don't load .exit sections */ 2175 /* Don't load .exit sections */
2078 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) 2176 if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
2079 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 2177 shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
2080#endif 2178#endif
2081 } 2179 }
2082 2180
2083 modindex = find_sec(hdr, sechdrs, secstrings, 2181 /* Track but don't keep modinfo and version sections. */
2084 ".gnu.linkonce.this_module"); 2182 info->index.vers = find_sec(info, "__versions");
2085 if (!modindex) { 2183 info->index.info = find_sec(info, ".modinfo");
2184 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2185 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
2186 return 0;
2187}
2188
2189/*
2190 * Set up our basic convenience variables (pointers to section headers,
2191 * search for module section index etc), and do some basic section
2192 * verification.
2193 *
2194 * Return the temporary module pointer (we'll replace it with the final
2195 * one when we move the module sections around).
2196 */
2197static struct module *setup_load_info(struct load_info *info)
2198{
2199 unsigned int i;
2200 int err;
2201 struct module *mod;
2202
2203 /* Set up the convenience variables */
2204 info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
2205 info->secstrings = (void *)info->hdr
2206 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2207
2208 err = rewrite_section_headers(info);
2209 if (err)
2210 return ERR_PTR(err);
2211
2212 /* Find internal symbols and strings. */
2213 for (i = 1; i < info->hdr->e_shnum; i++) {
2214 if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
2215 info->index.sym = i;
2216 info->index.str = info->sechdrs[i].sh_link;
2217 info->strtab = (char *)info->hdr
2218 + info->sechdrs[info->index.str].sh_offset;
2219 break;
2220 }
2221 }
2222
2223 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2224 if (!info->index.mod) {
2086 printk(KERN_WARNING "No module found in object\n"); 2225 printk(KERN_WARNING "No module found in object\n");
2087 err = -ENOEXEC; 2226 return ERR_PTR(-ENOEXEC);
2088 goto free_hdr;
2089 } 2227 }
2090 /* This is temporary: point mod into copy of data. */ 2228 /* This is temporary: point mod into copy of data. */
2091 mod = (void *)sechdrs[modindex].sh_addr; 2229 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2092 2230
2093 if (symindex == 0) { 2231 if (info->index.sym == 0) {
2094 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2232 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
2095 mod->name); 2233 mod->name);
2096 err = -ENOEXEC; 2234 return ERR_PTR(-ENOEXEC);
2097 goto free_hdr;
2098 } 2235 }
2099 2236
2100 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 2237 info->index.pcpu = find_pcpusec(info);
2101 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
2102 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
2103
2104 /* Don't keep modinfo and version sections. */
2105 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2106 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2107 2238
2108 /* Check module struct version now, before we try to use module. */ 2239 /* Check module struct version now, before we try to use module. */
2109 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2240 if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
2110 err = -ENOEXEC; 2241 return ERR_PTR(-ENOEXEC);
2111 goto free_hdr; 2242
2112 } 2243 return mod;
2244}
2245
2246static int check_modinfo(struct module *mod, struct load_info *info)
2247{
2248 const char *modmagic = get_modinfo(info, "vermagic");
2249 int err;
2113 2250
2114 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2115 /* This is allowed: modprobe --force will invalidate it. */ 2251 /* This is allowed: modprobe --force will invalidate it. */
2116 if (!modmagic) { 2252 if (!modmagic) {
2117 err = try_to_force_load(mod, "bad vermagic"); 2253 err = try_to_force_load(mod, "bad vermagic");
2118 if (err) 2254 if (err)
2119 goto free_hdr; 2255 return err;
2120 } else if (!same_magic(modmagic, vermagic, versindex)) { 2256 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2121 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2257 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
2122 mod->name, modmagic, vermagic); 2258 mod->name, modmagic, vermagic);
2123 err = -ENOEXEC; 2259 return -ENOEXEC;
2124 goto free_hdr;
2125 } 2260 }
2126 2261
2127 staging = get_modinfo(sechdrs, infoindex, "staging"); 2262 if (get_modinfo(info, "staging")) {
2128 if (staging) {
2129 add_taint_module(mod, TAINT_CRAP); 2263 add_taint_module(mod, TAINT_CRAP);
2130 printk(KERN_WARNING "%s: module is from the staging directory," 2264 printk(KERN_WARNING "%s: module is from the staging directory,"
2131 " the quality is unknown, you have been warned.\n", 2265 " the quality is unknown, you have been warned.\n",
2132 mod->name); 2266 mod->name);
2133 } 2267 }
2134 2268
2135 /* Now copy in args */ 2269 /* Set up license info based on the info section */
2136 args = strndup_user(uargs, ~0UL >> 1); 2270 set_license(mod, get_modinfo(info, "license"));
2137 if (IS_ERR(args)) {
2138 err = PTR_ERR(args);
2139 goto free_hdr;
2140 }
2141 2271
2142 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) 2272 return 0;
2143 * sizeof(long), GFP_KERNEL); 2273}
2144 if (!strmap) {
2145 err = -ENOMEM;
2146 goto free_mod;
2147 }
2148 2274
2149 if (find_module(mod->name)) { 2275static void find_module_sections(struct module *mod, struct load_info *info)
2150 err = -EEXIST; 2276{
2151 goto free_mod; 2277 mod->kp = section_objs(info, "__param",
2152 } 2278 sizeof(*mod->kp), &mod->num_kp);
2279 mod->syms = section_objs(info, "__ksymtab",
2280 sizeof(*mod->syms), &mod->num_syms);
2281 mod->crcs = section_addr(info, "__kcrctab");
2282 mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
2283 sizeof(*mod->gpl_syms),
2284 &mod->num_gpl_syms);
2285 mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
2286 mod->gpl_future_syms = section_objs(info,
2287 "__ksymtab_gpl_future",
2288 sizeof(*mod->gpl_future_syms),
2289 &mod->num_gpl_future_syms);
2290 mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
2153 2291
2154 mod->state = MODULE_STATE_COMING; 2292#ifdef CONFIG_UNUSED_SYMBOLS
2293 mod->unused_syms = section_objs(info, "__ksymtab_unused",
2294 sizeof(*mod->unused_syms),
2295 &mod->num_unused_syms);
2296 mod->unused_crcs = section_addr(info, "__kcrctab_unused");
2297 mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
2298 sizeof(*mod->unused_gpl_syms),
2299 &mod->num_unused_gpl_syms);
2300 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2301#endif
2302#ifdef CONFIG_CONSTRUCTORS
2303 mod->ctors = section_objs(info, ".ctors",
2304 sizeof(*mod->ctors), &mod->num_ctors);
2305#endif
2155 2306
2156 /* Allow arches to frob section contents and sizes. */ 2307#ifdef CONFIG_TRACEPOINTS
2157 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); 2308 mod->tracepoints = section_objs(info, "__tracepoints",
2158 if (err < 0) 2309 sizeof(*mod->tracepoints),
2159 goto free_mod; 2310 &mod->num_tracepoints);
2311#endif
2312#ifdef CONFIG_EVENT_TRACING
2313 mod->trace_events = section_objs(info, "_ftrace_events",
2314 sizeof(*mod->trace_events),
2315 &mod->num_trace_events);
2316 /*
2317 * This section contains pointers to allocated objects in the trace
2318 * code and not scanning it leads to false positives.
2319 */
2320 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2321 mod->num_trace_events, GFP_KERNEL);
2322#endif
2323#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2324 /* sechdrs[0].sh_size is always zero */
2325 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
2326 sizeof(*mod->ftrace_callsites),
2327 &mod->num_ftrace_callsites);
2328#endif
2160 2329
2161 if (pcpuindex) { 2330 mod->extable = section_objs(info, "__ex_table",
2162 /* We have a special allocation for this section. */ 2331 sizeof(*mod->extable), &mod->num_exentries);
2163 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
2164 sechdrs[pcpuindex].sh_addralign);
2165 if (err)
2166 goto free_mod;
2167 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2168 }
2169 2332
2170 /* Determine total sizes, and put offsets in sh_entsize. For now 2333 if (section_addr(info, "__obsparm"))
2171 this is done generically; there doesn't appear to be any 2334 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2172 special cases for the architectures. */ 2335 mod->name);
2173 layout_sections(mod, hdr, sechdrs, secstrings); 2336
2174 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, 2337 info->debug = section_objs(info, "__verbose",
2175 secstrings, &stroffs, strmap); 2338 sizeof(*info->debug), &info->num_debug);
2339}
2340
2341static int move_module(struct module *mod, struct load_info *info)
2342{
2343 int i;
2344 void *ptr;
2176 2345
2177 /* Do the allocs. */ 2346 /* Do the allocs. */
2178 ptr = module_alloc_update_bounds(mod->core_size); 2347 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2182,10 +2351,9 @@ static noinline struct module *load_module(void __user *umod,
2182 * leak. 2351 * leak.
2183 */ 2352 */
2184 kmemleak_not_leak(ptr); 2353 kmemleak_not_leak(ptr);
2185 if (!ptr) { 2354 if (!ptr)
2186 err = -ENOMEM; 2355 return -ENOMEM;
2187 goto free_percpu; 2356
2188 }
2189 memset(ptr, 0, mod->core_size); 2357 memset(ptr, 0, mod->core_size);
2190 mod->module_core = ptr; 2358 mod->module_core = ptr;
2191 2359
@@ -2198,55 +2366,40 @@ static noinline struct module *load_module(void __user *umod,
2198 */ 2366 */
2199 kmemleak_ignore(ptr); 2367 kmemleak_ignore(ptr);
2200 if (!ptr && mod->init_size) { 2368 if (!ptr && mod->init_size) {
2201 err = -ENOMEM; 2369 module_free(mod, mod->module_core);
2202 goto free_core; 2370 return -ENOMEM;
2203 } 2371 }
2204 memset(ptr, 0, mod->init_size); 2372 memset(ptr, 0, mod->init_size);
2205 mod->module_init = ptr; 2373 mod->module_init = ptr;
2206 2374
2207 /* Transfer each section which specifies SHF_ALLOC */ 2375 /* Transfer each section which specifies SHF_ALLOC */
2208 DEBUGP("final section addresses:\n"); 2376 DEBUGP("final section addresses:\n");
2209 for (i = 0; i < hdr->e_shnum; i++) { 2377 for (i = 0; i < info->hdr->e_shnum; i++) {
2210 void *dest; 2378 void *dest;
2379 Elf_Shdr *shdr = &info->sechdrs[i];
2211 2380
2212 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2381 if (!(shdr->sh_flags & SHF_ALLOC))
2213 continue; 2382 continue;
2214 2383
2215 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) 2384 if (shdr->sh_entsize & INIT_OFFSET_MASK)
2216 dest = mod->module_init 2385 dest = mod->module_init
2217 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); 2386 + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
2218 else 2387 else
2219 dest = mod->module_core + sechdrs[i].sh_entsize; 2388 dest = mod->module_core + shdr->sh_entsize;
2220 2389
2221 if (sechdrs[i].sh_type != SHT_NOBITS) 2390 if (shdr->sh_type != SHT_NOBITS)
2222 memcpy(dest, (void *)sechdrs[i].sh_addr, 2391 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2223 sechdrs[i].sh_size);
2224 /* Update sh_addr to point to copy in image. */ 2392 /* Update sh_addr to point to copy in image. */
2225 sechdrs[i].sh_addr = (unsigned long)dest; 2393 shdr->sh_addr = (unsigned long)dest;
2226 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); 2394 DEBUGP("\t0x%lx %s\n",
2227 } 2395 shdr->sh_addr, info->secstrings + shdr->sh_name);
2228 /* Module has been moved. */
2229 mod = (void *)sechdrs[modindex].sh_addr;
2230 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2231
2232#if defined(CONFIG_MODULE_UNLOAD)
2233 mod->refptr = alloc_percpu(struct module_ref);
2234 if (!mod->refptr) {
2235 err = -ENOMEM;
2236 goto free_init;
2237 } 2396 }
2238#endif
2239 /* Now we've moved module, initialize linked lists, etc. */
2240 module_unload_init(mod);
2241
2242 /* add kobject, so we can reference it. */
2243 err = mod_sysfs_init(mod);
2244 if (err)
2245 goto free_unload;
2246 2397
2247 /* Set up license info based on the info section */ 2398 return 0;
2248 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2399}
2249 2400
2401static int check_module_license_and_versions(struct module *mod)
2402{
2250 /* 2403 /*
2251 * ndiswrapper is under GPL by itself, but loads proprietary modules. 2404 * ndiswrapper is under GPL by itself, but loads proprietary modules.
2252 * Don't use add_taint_module(), as it would prevent ndiswrapper from 2405 * Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2259,77 +2412,6 @@ static noinline struct module *load_module(void __user *umod,
2259 if (strcmp(mod->name, "driverloader") == 0) 2412 if (strcmp(mod->name, "driverloader") == 0)
2260 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2413 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2261 2414
2262 /* Set up MODINFO_ATTR fields */
2263 setup_modinfo(mod, sechdrs, infoindex);
2264
2265 /* Fix up syms, so that st_value is a pointer to location. */
2266 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
2267 mod);
2268 if (err < 0)
2269 goto cleanup;
2270
2271 /* Now we've got everything in the final locations, we can
2272 * find optional sections. */
2273 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2274 sizeof(*mod->kp), &mod->num_kp);
2275 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2276 sizeof(*mod->syms), &mod->num_syms);
2277 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2278 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2279 sizeof(*mod->gpl_syms),
2280 &mod->num_gpl_syms);
2281 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2282 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2283 "__ksymtab_gpl_future",
2284 sizeof(*mod->gpl_future_syms),
2285 &mod->num_gpl_future_syms);
2286 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2287 "__kcrctab_gpl_future");
2288
2289#ifdef CONFIG_UNUSED_SYMBOLS
2290 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2291 "__ksymtab_unused",
2292 sizeof(*mod->unused_syms),
2293 &mod->num_unused_syms);
2294 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2295 "__kcrctab_unused");
2296 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2297 "__ksymtab_unused_gpl",
2298 sizeof(*mod->unused_gpl_syms),
2299 &mod->num_unused_gpl_syms);
2300 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2301 "__kcrctab_unused_gpl");
2302#endif
2303#ifdef CONFIG_CONSTRUCTORS
2304 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2305 sizeof(*mod->ctors), &mod->num_ctors);
2306#endif
2307
2308#ifdef CONFIG_TRACEPOINTS
2309 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2310 "__tracepoints",
2311 sizeof(*mod->tracepoints),
2312 &mod->num_tracepoints);
2313#endif
2314#ifdef CONFIG_EVENT_TRACING
2315 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2316 "_ftrace_events",
2317 sizeof(*mod->trace_events),
2318 &mod->num_trace_events);
2319 /*
2320 * This section contains pointers to allocated objects in the trace
2321 * code and not scanning it leads to false positives.
2322 */
2323 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2324 mod->num_trace_events, GFP_KERNEL);
2325#endif
2326#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2327 /* sechdrs[0].sh_size is always zero */
2328 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2329 "__mcount_loc",
2330 sizeof(*mod->ftrace_callsites),
2331 &mod->num_ftrace_callsites);
2332#endif
2333#ifdef CONFIG_MODVERSIONS 2415#ifdef CONFIG_MODVERSIONS
2334 if ((mod->num_syms && !mod->crcs) 2416 if ((mod->num_syms && !mod->crcs)
2335 || (mod->num_gpl_syms && !mod->gpl_crcs) 2417 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2339,67 +2421,16 @@ static noinline struct module *load_module(void __user *umod,
2339 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2421 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2340#endif 2422#endif
2341 ) { 2423 ) {
2342 err = try_to_force_load(mod, 2424 return try_to_force_load(mod,
2343 "no versions for exported symbols"); 2425 "no versions for exported symbols");
2344 if (err)
2345 goto cleanup;
2346 } 2426 }
2347#endif 2427#endif
2428 return 0;
2429}
2348 2430
2349 /* Now do relocations. */ 2431static void flush_module_icache(const struct module *mod)
2350 for (i = 1; i < hdr->e_shnum; i++) { 2432{
2351 const char *strtab = (char *)sechdrs[strindex].sh_addr; 2433 mm_segment_t old_fs;
2352 unsigned int info = sechdrs[i].sh_info;
2353
2354 /* Not a valid relocation section? */
2355 if (info >= hdr->e_shnum)
2356 continue;
2357
2358 /* Don't bother with non-allocated sections */
2359 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
2360 continue;
2361
2362 if (sechdrs[i].sh_type == SHT_REL)
2363 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
2364 else if (sechdrs[i].sh_type == SHT_RELA)
2365 err = apply_relocate_add(sechdrs, strtab, symindex, i,
2366 mod);
2367 if (err < 0)
2368 goto cleanup;
2369 }
2370
2371 /* Find duplicate symbols */
2372 err = verify_export_symbols(mod);
2373 if (err < 0)
2374 goto cleanup;
2375
2376 /* Set up and sort exception table */
2377 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2378 sizeof(*mod->extable), &mod->num_exentries);
2379 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2380
2381 /* Finally, copy percpu area over. */
2382 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2383 sechdrs[pcpuindex].sh_size);
2384
2385 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2386 symoffs, stroffs, secstrings, strmap);
2387 kfree(strmap);
2388 strmap = NULL;
2389
2390 if (!mod->taints) {
2391 struct _ddebug *debug;
2392 unsigned int num_debug;
2393
2394 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2395 sizeof(*debug), &num_debug);
2396 if (debug)
2397 dynamic_debug_setup(debug, num_debug);
2398 }
2399
2400 err = module_finalize(hdr, sechdrs, mod);
2401 if (err < 0)
2402 goto cleanup;
2403 2434
2404 /* flush the icache in correct context */ 2435 /* flush the icache in correct context */
2405 old_fs = get_fs(); 2436 old_fs = get_fs();
@@ -2418,11 +2449,160 @@ static noinline struct module *load_module(void __user *umod,
2418 (unsigned long)mod->module_core + mod->core_size); 2449 (unsigned long)mod->module_core + mod->core_size);
2419 2450
2420 set_fs(old_fs); 2451 set_fs(old_fs);
2452}
2421 2453
2422 mod->args = args; 2454static struct module *layout_and_allocate(struct load_info *info)
2423 if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) 2455{
2424 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2456 /* Module within temporary copy. */
2425 mod->name); 2457 struct module *mod;
2458 Elf_Shdr *pcpusec;
2459 int err;
2460
2461 mod = setup_load_info(info);
2462 if (IS_ERR(mod))
2463 return mod;
2464
2465 err = check_modinfo(mod, info);
2466 if (err)
2467 return ERR_PTR(err);
2468
2469 /* Allow arches to frob section contents and sizes. */
2470 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2471 info->secstrings, mod);
2472 if (err < 0)
2473 goto out;
2474
2475 pcpusec = &info->sechdrs[info->index.pcpu];
2476 if (pcpusec->sh_size) {
2477 /* We have a special allocation for this section. */
2478 err = percpu_modalloc(mod,
2479 pcpusec->sh_size, pcpusec->sh_addralign);
2480 if (err)
2481 goto out;
2482 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2483 }
2484
2485 /* Determine total sizes, and put offsets in sh_entsize. For now
2486 this is done generically; there doesn't appear to be any
2487 special cases for the architectures. */
2488 layout_sections(mod, info);
2489
2490 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2491 * sizeof(long), GFP_KERNEL);
2492 if (!info->strmap) {
2493 err = -ENOMEM;
2494 goto free_percpu;
2495 }
2496 layout_symtab(mod, info);
2497
2498 /* Allocate and move to the final place */
2499 err = move_module(mod, info);
2500 if (err)
2501 goto free_strmap;
2502
2503 /* Module has been copied to its final place now: return it. */
2504 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2505 kmemleak_load_module(mod, info);
2506 return mod;
2507
2508free_strmap:
2509 kfree(info->strmap);
2510free_percpu:
2511 percpu_modfree(mod);
2512out:
2513 return ERR_PTR(err);
2514}
2515
2516/* mod is no longer valid after this! */
2517static void module_deallocate(struct module *mod, struct load_info *info)
2518{
2519 kfree(info->strmap);
2520 percpu_modfree(mod);
2521 module_free(mod, mod->module_init);
2522 module_free(mod, mod->module_core);
2523}
2524
2525static int post_relocation(struct module *mod, const struct load_info *info)
2526{
2527 /* Sort exception table now relocations are done. */
2528 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2529
2530 /* Copy relocated percpu area over. */
2531 percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
2532 info->sechdrs[info->index.pcpu].sh_size);
2533
2534 /* Setup kallsyms-specific fields. */
2535 add_kallsyms(mod, info);
2536
2537 /* Arch-specific module finalizing. */
2538 return module_finalize(info->hdr, info->sechdrs, mod);
2539}
2540
2541/* Allocate and load the module: note that size of section 0 is always
2542 zero, and we rely on this for optional sections. */
2543static struct module *load_module(void __user *umod,
2544 unsigned long len,
2545 const char __user *uargs)
2546{
2547 struct load_info info = { NULL, };
2548 struct module *mod;
2549 long err;
2550
2551 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2552 umod, len, uargs);
2553
2554 /* Copy in the blobs from userspace, check they are vaguely sane. */
2555 err = copy_and_check(&info, umod, len, uargs);
2556 if (err)
2557 return ERR_PTR(err);
2558
2559 /* Figure out module layout, and allocate all the memory. */
2560 mod = layout_and_allocate(&info);
2561 if (IS_ERR(mod)) {
2562 err = PTR_ERR(mod);
2563 goto free_copy;
2564 }
2565
2566 /* Now module is in final location, initialize linked lists, etc. */
2567 err = module_unload_init(mod);
2568 if (err)
2569 goto free_module;
2570
2571 /* Now we've got everything in the final locations, we can
2572 * find optional sections. */
2573 find_module_sections(mod, &info);
2574
2575 err = check_module_license_and_versions(mod);
2576 if (err)
2577 goto free_unload;
2578
2579 /* Set up MODINFO_ATTR fields */
2580 setup_modinfo(mod, &info);
2581
2582 /* Fix up syms, so that st_value is a pointer to location. */
2583 err = simplify_symbols(mod, &info);
2584 if (err < 0)
2585 goto free_modinfo;
2586
2587 err = apply_relocations(mod, &info);
2588 if (err < 0)
2589 goto free_modinfo;
2590
2591 err = post_relocation(mod, &info);
2592 if (err < 0)
2593 goto free_modinfo;
2594
2595 flush_module_icache(mod);
2596
2597 /* Now copy in args */
2598 mod->args = strndup_user(uargs, ~0UL >> 1);
2599 if (IS_ERR(mod->args)) {
2600 err = PTR_ERR(mod->args);
2601 goto free_arch_cleanup;
2602 }
2603
2604 /* Mark state as coming so strong_try_module_get() ignores us. */
2605 mod->state = MODULE_STATE_COMING;
2426 2606
2427 /* Now sew it into the lists so we can get lockdep and oops 2607 /* Now sew it into the lists so we can get lockdep and oops
2428 * info during argument parsing. No one should access us, since 2608
@@ -2431,58 +2611,67 @@ static noinline struct module *load_module(void __user *umod,
2431 * function to insert in a way safe to concurrent readers. 2611 * function to insert in a way safe to concurrent readers.
2432 * The mutex protects against concurrent writers. 2612 * The mutex protects against concurrent writers.
2433 */ 2613 */
2614 mutex_lock(&module_mutex);
2615 if (find_module(mod->name)) {
2616 err = -EEXIST;
2617 goto unlock;
2618 }
2619
2620 /* This has to be done once we're sure module name is unique. */
2621 if (!mod->taints)
2622 dynamic_debug_setup(info.debug, info.num_debug);
2623
2624 /* Find duplicate symbols */
2625 err = verify_export_symbols(mod);
2626 if (err < 0)
2627 goto ddebug;
2628
2629 module_bug_finalize(info.hdr, info.sechdrs, mod);
2434 list_add_rcu(&mod->list, &modules); 2630 list_add_rcu(&mod->list, &modules);
2631 mutex_unlock(&module_mutex);
2435 2632
2633 /* Module is ready to execute: parsing args may do that. */
2436 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2634 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2437 if (err < 0) 2635 if (err < 0)
2438 goto unlink; 2636 goto unlink;
2439 2637
2440 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2638 /* Link in to sysfs. */
2639 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
2441 if (err < 0) 2640 if (err < 0)
2442 goto unlink; 2641 goto unlink;
2443 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2444 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2445
2446 /* Get rid of temporary copy */
2447 vfree(hdr);
2448 2642
2449 trace_module_load(mod); 2643 /* Get rid of temporary copy and strmap. */
2644 kfree(info.strmap);
2645 free_copy(&info);
2450 2646
2451 /* Done! */ 2647 /* Done! */
2648 trace_module_load(mod);
2452 return mod; 2649 return mod;
2453 2650
2454 unlink: 2651 unlink:
2652 mutex_lock(&module_mutex);
2455 /* Unlink carefully: kallsyms could be walking list. */ 2653 /* Unlink carefully: kallsyms could be walking list. */
2456 list_del_rcu(&mod->list); 2654 list_del_rcu(&mod->list);
2655 module_bug_cleanup(mod);
2656
2657 ddebug:
2658 if (!mod->taints)
2659 dynamic_debug_remove(info.debug);
2660 unlock:
2661 mutex_unlock(&module_mutex);
2457 synchronize_sched(); 2662 synchronize_sched();
2663 kfree(mod->args);
2664 free_arch_cleanup:
2458 module_arch_cleanup(mod); 2665 module_arch_cleanup(mod);
2459 cleanup: 2666 free_modinfo:
2460 free_modinfo(mod); 2667 free_modinfo(mod);
2461 kobject_del(&mod->mkobj.kobj);
2462 kobject_put(&mod->mkobj.kobj);
2463 free_unload: 2668 free_unload:
2464 module_unload_free(mod); 2669 module_unload_free(mod);
2465#if defined(CONFIG_MODULE_UNLOAD) 2670 free_module:
2466 free_percpu(mod->refptr); 2671 module_deallocate(mod, &info);
2467 free_init: 2672 free_copy:
2468#endif 2673 free_copy(&info);
2469 module_free(mod, mod->module_init);
2470 free_core:
2471 module_free(mod, mod->module_core);
2472 /* mod will be freed with core. Don't access it beyond this line! */
2473 free_percpu:
2474 percpu_modfree(mod);
2475 free_mod:
2476 kfree(args);
2477 kfree(strmap);
2478 free_hdr:
2479 vfree(hdr);
2480 return ERR_PTR(err); 2674 return ERR_PTR(err);
2481
2482 truncated:
2483 printk(KERN_ERR "Module len %lu truncated\n", len);
2484 err = -ENOEXEC;
2485 goto free_hdr;
2486} 2675}
2487 2676
2488/* Call module constructors. */ 2677/* Call module constructors. */
@@ -2507,19 +2696,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2507 if (!capable(CAP_SYS_MODULE) || modules_disabled) 2696 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2508 return -EPERM; 2697 return -EPERM;
2509 2698
2510 /* Only one module load at a time, please */
2511 if (mutex_lock_interruptible(&module_mutex) != 0)
2512 return -EINTR;
2513
2514 /* Do all the hard work */ 2699 /* Do all the hard work */
2515 mod = load_module(umod, len, uargs); 2700 mod = load_module(umod, len, uargs);
2516 if (IS_ERR(mod)) { 2701 if (IS_ERR(mod))
2517 mutex_unlock(&module_mutex);
2518 return PTR_ERR(mod); 2702 return PTR_ERR(mod);
2519 }
2520
2521 /* Drop lock so they can recurse */
2522 mutex_unlock(&module_mutex);
2523 2703
2524 blocking_notifier_call_chain(&module_notify_list, 2704 blocking_notifier_call_chain(&module_notify_list,
2525 MODULE_STATE_COMING, mod); 2705 MODULE_STATE_COMING, mod);
@@ -2536,9 +2716,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2536 module_put(mod); 2716 module_put(mod);
2537 blocking_notifier_call_chain(&module_notify_list, 2717 blocking_notifier_call_chain(&module_notify_list,
2538 MODULE_STATE_GOING, mod); 2718 MODULE_STATE_GOING, mod);
2539 mutex_lock(&module_mutex);
2540 free_module(mod); 2719 free_module(mod);
2541 mutex_unlock(&module_mutex);
2542 wake_up(&module_wq); 2720 wake_up(&module_wq);
2543 return ret; 2721 return ret;
2544 } 2722 }
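The error path rewritten above follows the usual kernel idiom: one label per acquired resource, unwound in reverse order of setup, so a failure at any stage only releases what has already been taken. A minimal, self-contained sketch of that idiom (illustrative only, not code from kernel/module.c; all names are made up):

/* Illustrative goto-unwind sketch. */
#include <linux/slab.h>
#include <linux/errno.h>

struct two_bufs {
	void *a;
	void *b;
};

static int setup_two_bufs(struct two_bufs *t, size_t len)
{
	int err = -ENOMEM;

	t->a = kmalloc(len, GFP_KERNEL);
	if (!t->a)
		goto out;

	t->b = kmalloc(len, GFP_KERNEL);
	if (!t->b)
		goto free_a;

	return 0;		/* success: nothing to unwind */

free_a:				/* labels unwind in reverse setup order */
	kfree(t->a);
out:
	return err;
}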
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 632f04c57d82..200407c1502f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -36,15 +36,6 @@
36# include <asm/mutex.h> 36# include <asm/mutex.h>
37#endif 37#endif
38 38
39/***
40 * mutex_init - initialize the mutex
41 * @lock: the mutex to be initialized
42 * @key: the lock_class_key for the class; used by mutex lock debugging
43 *
44 * Initialize the mutex to unlocked state.
45 *
46 * It is not allowed to initialize an already locked mutex.
47 */
48void 39void
49__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 40__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
50{ 41{
@@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
68static __used noinline void __sched 59static __used noinline void __sched
69__mutex_lock_slowpath(atomic_t *lock_count); 60__mutex_lock_slowpath(atomic_t *lock_count);
70 61
71/*** 62/**
72 * mutex_lock - acquire the mutex 63 * mutex_lock - acquire the mutex
73 * @lock: the mutex to be acquired 64 * @lock: the mutex to be acquired
74 * 65 *
@@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock);
105 96
106static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 97static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
107 98
108/*** 99/**
109 * mutex_unlock - release the mutex 100 * mutex_unlock - release the mutex
110 * @lock: the mutex to be released 101 * @lock: the mutex to be released
111 * 102 *
@@ -172,6 +163,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
172 struct thread_info *owner; 163 struct thread_info *owner;
173 164
174 /* 165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171
172 /*
175 * If there's an owner, wait for it to either 173 * If there's an owner, wait for it to either
176 * release the lock or go to sleep. 174 * release the lock or go to sleep.
177 */ 175 */
@@ -357,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count);
357static noinline int __sched 355static noinline int __sched
358__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 356__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
359 357
360/*** 358/**
361 * mutex_lock_interruptible - acquire the mutex, interruptable 359 * mutex_lock_interruptible - acquire the mutex, interruptible
362 * @lock: the mutex to be acquired 360 * @lock: the mutex to be acquired
363 * 361 *
364 * Lock the mutex like mutex_lock(), and return 0 if the mutex has 362 * Lock the mutex like mutex_lock(), and return 0 if the mutex has
@@ -449,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
449 return prev == 1; 447 return prev == 1;
450} 448}
451 449
452/*** 450/**
453 * mutex_trylock - try acquire the mutex, without waiting 451 * mutex_trylock - try to acquire the mutex, without waiting
454 * @lock: the mutex to be acquired 452 * @lock: the mutex to be acquired
455 * 453 *
456 * Try to acquire the mutex atomically. Returns 1 if the mutex 454 * Try to acquire the mutex atomically. Returns 1 if the mutex
457 * has been acquired successfully, and 0 on contention. 455 * has been acquired successfully, and 0 on contention.
458 * 456 *
459 * NOTE: this function follows the spin_trylock() convention, so 457 * NOTE: this function follows the spin_trylock() convention, so
460 * it is negated to the down_trylock() return values! Be careful 458 * it is negated from the down_trylock() return values! Be careful
461 * about this when converting semaphore users to mutexes. 459 * about this when converting semaphore users to mutexes.
462 * 460 *
463 * This function must not be used in interrupt context. The 461 * This function must not be used in interrupt context. The
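As the corrected kerneldoc above spells out, mutex_trylock() returns 1 when the lock was taken and 0 on contention, the opposite of down_trylock(). A short sketch of a caller honouring that convention (hypothetical code, not part of this diff):

#include <linux/mutex.h>

static DEFINE_MUTEX(stats_lock);
static unsigned long stats_counter;

/* Opportunistically bump a counter; back off if someone else holds the lock. */
static void stats_try_bump(void)
{
	if (!mutex_trylock(&stats_lock))	/* 0 means contended, not an error code */
		return;

	stats_counter++;
	mutex_unlock(&stats_lock);
}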
diff --git a/kernel/padata.c b/kernel/padata.c
index fd03513c7327..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -26,18 +26,19 @@
26#include <linux/mutex.h> 26#include <linux/mutex.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sysfs.h>
29#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
30 31
31#define MAX_SEQ_NR INT_MAX - NR_CPUS 32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
32#define MAX_OBJ_NUM 10000 * NR_CPUS 33#define MAX_OBJ_NUM 1000
33 34
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{ 36{
36 int cpu, target_cpu; 37 int cpu, target_cpu;
37 38
38 target_cpu = cpumask_first(pd->cpumask); 39 target_cpu = cpumask_first(pd->cpumask.pcpu);
39 for (cpu = 0; cpu < cpu_index; cpu++) 40 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask); 41 target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
41 42
42 return target_cpu; 43 return target_cpu;
43} 44}
@@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
53 * Hash the sequence numbers to the cpus by taking 54 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use. 55 * seq_nr mod. number of cpus in use.
55 */ 56 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); 57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
57 58
58 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
59} 60}
60 61
61static void padata_parallel_worker(struct work_struct *work) 62static void padata_parallel_worker(struct work_struct *parallel_work)
62{ 63{
63 struct padata_queue *queue; 64 struct padata_parallel_queue *pqueue;
64 struct parallel_data *pd; 65 struct parallel_data *pd;
65 struct padata_instance *pinst; 66 struct padata_instance *pinst;
66 LIST_HEAD(local_list); 67 LIST_HEAD(local_list);
67 68
68 local_bh_disable(); 69 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork); 70 pqueue = container_of(parallel_work,
70 pd = queue->pd; 71 struct padata_parallel_queue, work);
72 pd = pqueue->pd;
71 pinst = pd->pinst; 73 pinst = pd->pinst;
72 74
73 spin_lock(&queue->parallel.lock); 75 spin_lock(&pqueue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list); 76 list_replace_init(&pqueue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock); 77 spin_unlock(&pqueue->parallel.lock);
76 78
77 while (!list_empty(&local_list)) { 79 while (!list_empty(&local_list)) {
78 struct padata_priv *padata; 80 struct padata_priv *padata;
@@ -88,13 +90,13 @@ static void padata_parallel_worker(struct work_struct *work)
88 local_bh_enable(); 90 local_bh_enable();
89} 91}
90 92
91/* 93/**
92 * padata_do_parallel - padata parallelization function 94 * padata_do_parallel - padata parallelization function
93 * 95 *
94 * @pinst: padata instance 96 * @pinst: padata instance
95 * @padata: object to be parallelized 97 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on, 98 * @cb_cpu: cpu the serialization callback function will run on,
97 * must be in the cpumask of padata. 99 * must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
98 * 100 *
99 * The parallelization callback function will run with BHs off. 101 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel 102 * Note: Every object which is parallelized by padata_do_parallel
@@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu) 106 struct padata_priv *padata, int cb_cpu)
105{ 107{
106 int target_cpu, err; 108 int target_cpu, err;
107 struct padata_queue *queue; 109 struct padata_parallel_queue *queue;
108 struct parallel_data *pd; 110 struct parallel_data *pd;
109 111
110 rcu_read_lock_bh(); 112 rcu_read_lock_bh();
111 113
112 pd = rcu_dereference(pinst->pd); 114 pd = rcu_dereference(pinst->pd);
113 115
114 err = 0; 116 err = -EINVAL;
115 if (!(pinst->flags & PADATA_INIT)) 117 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
118 goto out;
119
120 if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
116 goto out; 121 goto out;
117 122
118 err = -EBUSY; 123 err = -EBUSY;
@@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst,
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) 127 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out; 128 goto out;
124 129
125 err = -EINVAL; 130 err = 0;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt); 131 atomic_inc(&pd->refcnt);
131 padata->pd = pd; 132 padata->pd = pd;
132 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
@@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst,
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr); 138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138 139
139 target_cpu = padata_cpu_hash(padata); 140 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu); 141 queue = per_cpu_ptr(pd->pqueue, target_cpu);
141 142
142 spin_lock(&queue->parallel.lock); 143 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list); 144 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock); 145 spin_unlock(&queue->parallel.lock);
145 146
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork); 147 queue_work_on(target_cpu, pinst->wq, &queue->work);
147 148
148out: 149out:
149 rcu_read_unlock_bh(); 150 rcu_read_unlock_bh();
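For reference, this is roughly how a client drives the interface after the cpumask split: it embeds a struct padata_priv, supplies parallel and serial callbacks, and submits with a callback cpu taken from the serial cpumask. The request structure and callbacks below are hypothetical; struct padata_priv and its parallel/serial hooks come from include/linux/padata.h, and the only calls used are padata_do_parallel() and padata_do_serial():

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/padata.h>
#include <linux/slab.h>

struct my_request {
	struct padata_priv padata;
	void *data;
};

static void my_parallel(struct padata_priv *padata)
{
	/* Heavy, parallelizable work on the enclosing request; BHs are off here. */

	padata_do_serial(padata);	/* every parallelized object must end up here */
}

static void my_serial(struct padata_priv *padata)
{
	struct my_request *req = container_of(padata, struct my_request, padata);

	/* In-order completion handling, running on the chosen callback cpu. */
	kfree(req);
}

/* cb_cpu must be in pinst's serial cpumask (cpumask.cbcpu). */
static int my_submit(struct padata_instance *pinst, void *data, int cb_cpu)
{
	struct my_request *req;
	int err;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	req->data = data;
	req->padata.parallel = my_parallel;
	req->padata.serial = my_serial;

	err = padata_do_parallel(pinst, &req->padata, cb_cpu);
	if (err)
		kfree(req);

	return err;
}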
@@ -152,86 +153,72 @@ out:
152} 153}
153EXPORT_SYMBOL(padata_do_parallel); 154EXPORT_SYMBOL(padata_do_parallel);
154 155
156/*
157 * padata_get_next - Get the next object that needs serialization.
158 *
159 * Return values are:
160 *
161 * A pointer to the control struct of the next object that needs
162 * serialization, if present in one of the percpu reorder queues.
163 *
164 * NULL, if all percpu reorder queues are empty.
165 *
166 * -EINPROGRESS, if the next object that needs serialization will
167 * be parallel processed by another cpu and is not yet present in
168 * the cpu's reorder queue.
169 *
170 * -ENODATA, if this cpu has to do the parallel processing for
171 * the next object.
172 */
155static struct padata_priv *padata_get_next(struct parallel_data *pd) 173static struct padata_priv *padata_get_next(struct parallel_data *pd)
156{ 174{
157 int cpu, num_cpus, empty, calc_seq_nr; 175 int cpu, num_cpus;
158 int seq_nr, next_nr, overrun, next_overrun; 176 int next_nr, next_index;
159 struct padata_queue *queue, *next_queue; 177 struct padata_parallel_queue *queue, *next_queue;
160 struct padata_priv *padata; 178 struct padata_priv *padata;
161 struct padata_list *reorder; 179 struct padata_list *reorder;
162 180
163 empty = 0; 181 num_cpus = cpumask_weight(pd->cpumask.pcpu);
164 next_nr = -1;
165 next_overrun = 0;
166 next_queue = NULL;
167
168 num_cpus = cpumask_weight(pd->cpumask);
169
170 for_each_cpu(cpu, pd->cpumask) {
171 queue = per_cpu_ptr(pd->queue, cpu);
172 reorder = &queue->reorder;
173
174 /*
175 * Calculate the seq_nr of the object that should be
176 * next in this queue.
177 */
178 overrun = 0;
179 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
180 + queue->cpu_index;
181
182 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
183 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
184 overrun = 1;
185 }
186
187 if (!list_empty(&reorder->list)) {
188 padata = list_entry(reorder->list.next,
189 struct padata_priv, list);
190
191 seq_nr = padata->seq_nr;
192 BUG_ON(calc_seq_nr != seq_nr);
193 } else {
194 seq_nr = calc_seq_nr;
195 empty++;
196 }
197 182
198 if (next_nr < 0 || seq_nr < next_nr 183 /*
199 || (next_overrun && !overrun)) { 184 * Calculate the percpu reorder queue and the sequence
200 next_nr = seq_nr; 185 * number of the next object.
201 next_overrun = overrun; 186 */
202 next_queue = queue; 187 next_nr = pd->processed;
203 } 188 next_index = next_nr % num_cpus;
189 cpu = padata_index_to_cpu(pd, next_index);
190 next_queue = per_cpu_ptr(pd->pqueue, cpu);
191
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
204 } 198 }
205 199
206 padata = NULL; 200 padata = NULL;
207 201
208 if (empty == num_cpus)
209 goto out;
210
211 reorder = &next_queue->reorder; 202 reorder = &next_queue->reorder;
212 203
213 if (!list_empty(&reorder->list)) { 204 if (!list_empty(&reorder->list)) {
214 padata = list_entry(reorder->list.next, 205 padata = list_entry(reorder->list.next,
215 struct padata_priv, list); 206 struct padata_priv, list);
216 207
217 if (unlikely(next_overrun)) { 208 BUG_ON(next_nr != padata->seq_nr);
218 for_each_cpu(cpu, pd->cpumask) {
219 queue = per_cpu_ptr(pd->queue, cpu);
220 atomic_set(&queue->num_obj, 0);
221 }
222 }
223 209
224 spin_lock(&reorder->lock); 210 spin_lock(&reorder->lock);
225 list_del_init(&padata->list); 211 list_del_init(&padata->list);
226 atomic_dec(&pd->reorder_objects); 212 atomic_dec(&pd->reorder_objects);
227 spin_unlock(&reorder->lock); 213 spin_unlock(&reorder->lock);
228 214
229 atomic_inc(&next_queue->num_obj); 215 pd->processed++;
230 216
231 goto out; 217 goto out;
232 } 218 }
233 219
234 if (next_nr % num_cpus == next_queue->cpu_index) { 220 queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
221 if (queue->cpu_index == next_queue->cpu_index) {
235 padata = ERR_PTR(-ENODATA); 222 padata = ERR_PTR(-ENODATA);
236 goto out; 223 goto out;
237 } 224 }
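The lookup above replaces the old scan over every reorder queue with plain arithmetic: pd->processed is the sequence number expected next, taken modulo the number of parallel cpus, and that index is walked into the parallel cpumask. A toy user-space model of the mapping, assuming a parallel cpumask of {1, 4, 5}:

/* Toy model of the round-robin seq_nr -> reorder-queue mapping. */
#include <stdio.h>

int main(void)
{
	int pcpus[] = { 1, 4, 5 };	/* cpus in cpumask.pcpu, in mask order */
	int num_cpus = 3;
	int processed;

	for (processed = 0; processed < 7; processed++)
		printf("seq_nr %d is serialized from the reorder queue of cpu %d\n",
		       processed, pcpus[processed % num_cpus]);

	return 0;
}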
@@ -244,55 +231,90 @@ out:
244static void padata_reorder(struct parallel_data *pd) 231static void padata_reorder(struct parallel_data *pd)
245{ 232{
246 struct padata_priv *padata; 233 struct padata_priv *padata;
247 struct padata_queue *queue; 234 struct padata_serial_queue *squeue;
248 struct padata_instance *pinst = pd->pinst; 235 struct padata_instance *pinst = pd->pinst;
249 236
250try_again: 237 /*
238 * We need to ensure that only one cpu can work on dequeueing of
239 * the reorder queue at a time. Calculating in which percpu reorder
240 * queue the next object will arrive takes some time. A spinlock
241 * would be highly contended. Also it is not clear in which order
242 * the objects arrive at the reorder queues. So a cpu could wait to
243 * get the lock just to notice that there is nothing to do at the
244 * moment. Therefore we use a trylock and let the holder of the lock
245 * care for all the objects enqueued during the hold time of the lock.
246 */
251 if (!spin_trylock_bh(&pd->lock)) 247 if (!spin_trylock_bh(&pd->lock))
252 goto out; 248 return;
253 249
254 while (1) { 250 while (1) {
255 padata = padata_get_next(pd); 251 padata = padata_get_next(pd);
256 252
253 /*
254 * All reorder queues are empty, or the next object that needs
255 * serialization is parallel processed by another cpu and is
256 * still on its way to the cpu's reorder queue, nothing to
257 * do for now.
258 */
257 if (!padata || PTR_ERR(padata) == -EINPROGRESS) 259 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
258 break; 260 break;
259 261
262 /*
263 * This cpu has to do the parallel processing of the next
264 * object. It's waiting in the cpu's parallelization queue,
265 * so exit immediately.
266 */
260 if (PTR_ERR(padata) == -ENODATA) { 267 if (PTR_ERR(padata) == -ENODATA) {
268 del_timer(&pd->timer);
261 spin_unlock_bh(&pd->lock); 269 spin_unlock_bh(&pd->lock);
262 goto out; 270 return;
263 } 271 }
264 272
265 queue = per_cpu_ptr(pd->queue, padata->cb_cpu); 273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
266 274
267 spin_lock(&queue->serial.lock); 275 spin_lock(&squeue->serial.lock);
268 list_add_tail(&padata->list, &queue->serial.list); 276 list_add_tail(&padata->list, &squeue->serial.list);
269 spin_unlock(&queue->serial.lock); 277 spin_unlock(&squeue->serial.lock);
270 278
271 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); 279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
272 } 280 }
273 281
274 spin_unlock_bh(&pd->lock); 282 spin_unlock_bh(&pd->lock);
275 283
276 if (atomic_read(&pd->reorder_objects)) 284 /*
277 goto try_again; 285 * The next object that needs serialization might have arrived at
286 * the reorder queues in the meantime; we will be called again
287 * from the timer function if no one else cares for it.
288 */
289 if (atomic_read(&pd->reorder_objects)
290 && !(pinst->flags & PADATA_RESET))
291 mod_timer(&pd->timer, jiffies + HZ);
292 else
293 del_timer(&pd->timer);
278 294
279out:
280 return; 295 return;
281} 296}
282 297
283static void padata_serial_worker(struct work_struct *work) 298static void padata_reorder_timer(unsigned long arg)
299{
300 struct parallel_data *pd = (struct parallel_data *)arg;
301
302 padata_reorder(pd);
303}
304
305static void padata_serial_worker(struct work_struct *serial_work)
284{ 306{
285 struct padata_queue *queue; 307 struct padata_serial_queue *squeue;
286 struct parallel_data *pd; 308 struct parallel_data *pd;
287 LIST_HEAD(local_list); 309 LIST_HEAD(local_list);
288 310
289 local_bh_disable(); 311 local_bh_disable();
290 queue = container_of(work, struct padata_queue, swork); 312 squeue = container_of(serial_work, struct padata_serial_queue, work);
291 pd = queue->pd; 313 pd = squeue->pd;
292 314
293 spin_lock(&queue->serial.lock); 315 spin_lock(&squeue->serial.lock);
294 list_replace_init(&queue->serial.list, &local_list); 316 list_replace_init(&squeue->serial.list, &local_list);
295 spin_unlock(&queue->serial.lock); 317 spin_unlock(&squeue->serial.lock);
296 318
297 while (!list_empty(&local_list)) { 319 while (!list_empty(&local_list)) {
298 struct padata_priv *padata; 320 struct padata_priv *padata;
@@ -308,7 +330,7 @@ static void padata_serial_worker(struct work_struct *work)
308 local_bh_enable(); 330 local_bh_enable();
309} 331}
310 332
311/* 333/**
312 * padata_do_serial - padata serialization function 334 * padata_do_serial - padata serialization function
313 * 335 *
314 * @padata: object to be serialized. 336 * @padata: object to be serialized.
@@ -319,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work)
319void padata_do_serial(struct padata_priv *padata) 341void padata_do_serial(struct padata_priv *padata)
320{ 342{
321 int cpu; 343 int cpu;
322 struct padata_queue *queue; 344 struct padata_parallel_queue *pqueue;
323 struct parallel_data *pd; 345 struct parallel_data *pd;
324 346
325 pd = padata->pd; 347 pd = padata->pd;
326 348
327 cpu = get_cpu(); 349 cpu = get_cpu();
328 queue = per_cpu_ptr(pd->queue, cpu); 350 pqueue = per_cpu_ptr(pd->pqueue, cpu);
329 351
330 spin_lock(&queue->reorder.lock); 352 spin_lock(&pqueue->reorder.lock);
331 atomic_inc(&pd->reorder_objects); 353 atomic_inc(&pd->reorder_objects);
332 list_add_tail(&padata->list, &queue->reorder.list); 354 list_add_tail(&padata->list, &pqueue->reorder.list);
333 spin_unlock(&queue->reorder.lock); 355 spin_unlock(&pqueue->reorder.lock);
334 356
335 put_cpu(); 357 put_cpu();
336 358
@@ -338,55 +360,90 @@ void padata_do_serial(struct padata_priv *padata)
338} 360}
339EXPORT_SYMBOL(padata_do_serial); 361EXPORT_SYMBOL(padata_do_serial);
340 362
341static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, 363static int padata_setup_cpumasks(struct parallel_data *pd,
342 const struct cpumask *cpumask) 364 const struct cpumask *pcpumask,
365 const struct cpumask *cbcpumask)
343{ 366{
344 int cpu, cpu_index, num_cpus; 367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
345 struct padata_queue *queue; 368 return -ENOMEM;
346 struct parallel_data *pd;
347 369
348 cpu_index = 0; 370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
372 free_cpumask_var(pd->cpumask.pcpu);
373 return -ENOMEM;
374 }
349 375
350 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); 376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
351 if (!pd) 377 return 0;
352 goto err; 378}
353 379
354 pd->queue = alloc_percpu(struct padata_queue); 380static void __padata_list_init(struct padata_list *pd_list)
355 if (!pd->queue) 381{
356 goto err_free_pd; 382 INIT_LIST_HEAD(&pd_list->list);
383 spin_lock_init(&pd_list->lock);
384}
357 385
358 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) 386/* Initialize all percpu queues used by serial workers */
359 goto err_free_queue; 387static void padata_init_squeues(struct parallel_data *pd)
388{
389 int cpu;
390 struct padata_serial_queue *squeue;
391
392 for_each_cpu(cpu, pd->cpumask.cbcpu) {
393 squeue = per_cpu_ptr(pd->squeue, cpu);
394 squeue->pd = pd;
395 __padata_list_init(&squeue->serial);
396 INIT_WORK(&squeue->work, padata_serial_worker);
397 }
398}
360 399
361 for_each_possible_cpu(cpu) { 400/* Initialize all percpu queues used by parallel workers */
362 queue = per_cpu_ptr(pd->queue, cpu); 401static void padata_init_pqueues(struct parallel_data *pd)
402{
403 int cpu_index, num_cpus, cpu;
404 struct padata_parallel_queue *pqueue;
363 405
364 queue->pd = pd; 406 cpu_index = 0;
407 for_each_cpu(cpu, pd->cpumask.pcpu) {
408 pqueue = per_cpu_ptr(pd->pqueue, cpu);
409 pqueue->pd = pd;
410 pqueue->cpu_index = cpu_index;
411 cpu_index++;
412
413 __padata_list_init(&pqueue->reorder);
414 __padata_list_init(&pqueue->parallel);
415 INIT_WORK(&pqueue->work, padata_parallel_worker);
416 atomic_set(&pqueue->num_obj, 0);
417 }
365 418
366 if (cpumask_test_cpu(cpu, cpumask) 419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
367 && cpumask_test_cpu(cpu, cpu_active_mask)) { 420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
368 queue->cpu_index = cpu_index; 421}
369 cpu_index++;
370 } else
371 queue->cpu_index = -1;
372 422
373 INIT_LIST_HEAD(&queue->reorder.list); 423/* Allocate and initialize the internal cpumask-dependent resources. */
374 INIT_LIST_HEAD(&queue->parallel.list); 424static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
375 INIT_LIST_HEAD(&queue->serial.list); 425 const struct cpumask *pcpumask,
376 spin_lock_init(&queue->reorder.lock); 426 const struct cpumask *cbcpumask)
377 spin_lock_init(&queue->parallel.lock); 427{
378 spin_lock_init(&queue->serial.lock); 428 struct parallel_data *pd;
379 429
380 INIT_WORK(&queue->pwork, padata_parallel_worker); 430 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
381 INIT_WORK(&queue->swork, padata_serial_worker); 431 if (!pd)
382 atomic_set(&queue->num_obj, 0); 432 goto err;
383 }
384 433
385 cpumask_and(pd->cpumask, cpumask, cpu_active_mask); 434 pd->pqueue = alloc_percpu(struct padata_parallel_queue);
435 if (!pd->pqueue)
436 goto err_free_pd;
386 437
387 num_cpus = cpumask_weight(pd->cpumask); 438 pd->squeue = alloc_percpu(struct padata_serial_queue);
388 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; 439 if (!pd->squeue)
440 goto err_free_pqueue;
441 if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
442 goto err_free_squeue;
389 443
444 padata_init_pqueues(pd);
445 padata_init_squeues(pd);
446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
390 atomic_set(&pd->seq_nr, -1); 447 atomic_set(&pd->seq_nr, -1);
391 atomic_set(&pd->reorder_objects, 0); 448 atomic_set(&pd->reorder_objects, 0);
392 atomic_set(&pd->refcnt, 0); 449 atomic_set(&pd->refcnt, 0);
@@ -395,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
395 452
396 return pd; 453 return pd;
397 454
398err_free_queue: 455err_free_squeue:
399 free_percpu(pd->queue); 456 free_percpu(pd->squeue);
457err_free_pqueue:
458 free_percpu(pd->pqueue);
400err_free_pd: 459err_free_pd:
401 kfree(pd); 460 kfree(pd);
402err: 461err:
@@ -405,15 +464,63 @@ err:
405 464
406static void padata_free_pd(struct parallel_data *pd) 465static void padata_free_pd(struct parallel_data *pd)
407{ 466{
408 free_cpumask_var(pd->cpumask); 467 free_cpumask_var(pd->cpumask.pcpu);
409 free_percpu(pd->queue); 468 free_cpumask_var(pd->cpumask.cbcpu);
469 free_percpu(pd->pqueue);
470 free_percpu(pd->squeue);
410 kfree(pd); 471 kfree(pd);
411} 472}
412 473
474/* Flush all objects out of the padata queues. */
475static void padata_flush_queues(struct parallel_data *pd)
476{
477 int cpu;
478 struct padata_parallel_queue *pqueue;
479 struct padata_serial_queue *squeue;
480
481 for_each_cpu(cpu, pd->cpumask.pcpu) {
482 pqueue = per_cpu_ptr(pd->pqueue, cpu);
483 flush_work(&pqueue->work);
484 }
485
486 del_timer_sync(&pd->timer);
487
488 if (atomic_read(&pd->reorder_objects))
489 padata_reorder(pd);
490
491 for_each_cpu(cpu, pd->cpumask.cbcpu) {
492 squeue = per_cpu_ptr(pd->squeue, cpu);
493 flush_work(&squeue->work);
494 }
495
496 BUG_ON(atomic_read(&pd->refcnt) != 0);
497}
498
499static void __padata_start(struct padata_instance *pinst)
500{
501 pinst->flags |= PADATA_INIT;
502}
503
504static void __padata_stop(struct padata_instance *pinst)
505{
506 if (!(pinst->flags & PADATA_INIT))
507 return;
508
509 pinst->flags &= ~PADATA_INIT;
510
511 synchronize_rcu();
512
513 get_online_cpus();
514 padata_flush_queues(pinst->pd);
515 put_online_cpus();
516}
517
518/* Replace the internal control structure with a new one. */
413static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
414 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
415{ 521{
416 struct parallel_data *pd_old = pinst->pd; 522 struct parallel_data *pd_old = pinst->pd;
523 int notification_mask = 0;
417 524
418 pinst->flags |= PADATA_RESET; 525 pinst->flags |= PADATA_RESET;
419 526
@@ -421,43 +528,162 @@ static void padata_replace(struct padata_instance *pinst,
421 528
422 synchronize_rcu(); 529 synchronize_rcu();
423 530
424 while (atomic_read(&pd_old->refcnt) != 0) 531 if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
425 yield(); 532 notification_mask |= PADATA_CPU_PARALLEL;
426 533 if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
427 flush_workqueue(pinst->wq); 534 notification_mask |= PADATA_CPU_SERIAL;
428 535
536 padata_flush_queues(pd_old);
429 padata_free_pd(pd_old); 537 padata_free_pd(pd_old);
430 538
539 if (notification_mask)
540 blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
541 notification_mask,
542 &pd_new->cpumask);
543
431 pinst->flags &= ~PADATA_RESET; 544 pinst->flags &= ~PADATA_RESET;
432} 545}
433 546
434/* 547/**
435 * padata_set_cpumask - set the cpumask that padata should use 548 * padata_register_cpumask_notifier - Registers a notifier that will be called
549 * if either pcpu or cbcpu or both cpumasks change.
436 * 550 *
437 * @pinst: padata instance 551 * @pinst: A pointer to the padata instance
438 * @cpumask: the cpumask to use 552 * @nblock: A pointer to notifier block.
439 */ 553 */
440int padata_set_cpumask(struct padata_instance *pinst, 554int padata_register_cpumask_notifier(struct padata_instance *pinst,
441 cpumask_var_t cpumask) 555 struct notifier_block *nblock)
442{ 556{
443 struct parallel_data *pd; 557 return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
444 int err = 0; 558 nblock);
559}
560EXPORT_SYMBOL(padata_register_cpumask_notifier);
445 561
446 might_sleep(); 562/**
563 * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
564 * registered earlier using padata_register_cpumask_notifier
565 *
566 * @pinst: A pointer to the padata instance.
567 * @nblock: A pointer to the notifier block.
568 */
569int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
570 struct notifier_block *nblock)
571{
572 return blocking_notifier_chain_unregister(
573 &pinst->cpumask_change_notifier,
574 nblock);
575}
576EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
447 577
448 mutex_lock(&pinst->lock);
449 578
450 pd = padata_alloc_pd(pinst, cpumask); 579/* If cpumask contains no active cpu, we mark the instance as invalid. */
451 if (!pd) { 580static bool padata_validate_cpumask(struct padata_instance *pinst,
452 err = -ENOMEM; 581 const struct cpumask *cpumask)
453 goto out; 582{
583 if (!cpumask_intersects(cpumask, cpu_active_mask)) {
584 pinst->flags |= PADATA_INVALID;
585 return false;
586 }
587
588 pinst->flags &= ~PADATA_INVALID;
589 return true;
590}
591
592static int __padata_set_cpumasks(struct padata_instance *pinst,
593 cpumask_var_t pcpumask,
594 cpumask_var_t cbcpumask)
595{
596 int valid;
597 struct parallel_data *pd;
598
599 valid = padata_validate_cpumask(pinst, pcpumask);
600 if (!valid) {
601 __padata_stop(pinst);
602 goto out_replace;
454 } 603 }
455 604
456 cpumask_copy(pinst->cpumask, cpumask); 605 valid = padata_validate_cpumask(pinst, cbcpumask);
606 if (!valid)
607 __padata_stop(pinst);
608
609out_replace:
610 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
611 if (!pd)
612 return -ENOMEM;
613
614 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
615 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
457 616
458 padata_replace(pinst, pd); 617 padata_replace(pinst, pd);
459 618
619 if (valid)
620 __padata_start(pinst);
621
622 return 0;
623}
624
625/**
626 * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
627 * one is used by parallel workers and the second one
628 * by the workers doing serialization.
629 *
630 * @pinst: padata instance
631 * @pcpumask: the cpumask to use for parallel workers
632 * @cbcpumask: the cpumask to use for serial workers
633 */
634int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
635 cpumask_var_t cbcpumask)
636{
637 int err;
638
639 mutex_lock(&pinst->lock);
640 get_online_cpus();
641
642 err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
643
644 put_online_cpus();
645 mutex_unlock(&pinst->lock);
646
647 return err;
648
649}
650EXPORT_SYMBOL(padata_set_cpumasks);
651
652/**
653 * padata_set_cpumask: Sets the cpumask specified by @cpumask_type to the
654 * value of @cpumask.
655 *
656 * @pinst: padata instance
657 * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL, selecting
658 * the serial or the parallel cpumask respectively.
659 * @cpumask: the cpumask to use
660 */
661int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
662 cpumask_var_t cpumask)
663{
664 struct cpumask *serial_mask, *parallel_mask;
665 int err = -EINVAL;
666
667 mutex_lock(&pinst->lock);
668 get_online_cpus();
669
670 switch (cpumask_type) {
671 case PADATA_CPU_PARALLEL:
672 serial_mask = pinst->cpumask.cbcpu;
673 parallel_mask = cpumask;
674 break;
675 case PADATA_CPU_SERIAL:
676 parallel_mask = pinst->cpumask.pcpu;
677 serial_mask = cpumask;
678 break;
679 default:
680 goto out;
681 }
682
683 err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
684
460out: 685out:
686 put_online_cpus();
461 mutex_unlock(&pinst->lock); 687 mutex_unlock(&pinst->lock);
462 688
463 return err; 689 return err;
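A sketch of how a user follows the reworked interface: register for cpumask-change notifications, then swap the parallel mask with padata_set_cpumask(). Only the padata calls introduced above are assumed; the notifier body and the choice of cpu_online_mask are illustrative:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/notifier.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/padata.h>

static int my_cpumask_change(struct notifier_block *nb,
			     unsigned long mask_type, void *data)
{
	/* mask_type carries PADATA_CPU_PARALLEL and/or PADATA_CPU_SERIAL. */
	pr_info("padata cpumask changed, type 0x%lx\n", mask_type);
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call = my_cpumask_change,
};

static int reconfigure(struct padata_instance *pinst)
{
	cpumask_var_t new_mask;
	int err;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	err = padata_register_cpumask_notifier(pinst, &my_nb);
	if (err)
		goto out_free;

	cpumask_copy(new_mask, cpu_online_mask);
	err = padata_set_cpumask(pinst, PADATA_CPU_PARALLEL, new_mask);

	padata_unregister_cpumask_notifier(pinst, &my_nb);
out_free:
	free_cpumask_var(new_mask);
	return err;
}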
@@ -469,32 +695,50 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
469 struct parallel_data *pd; 695 struct parallel_data *pd;
470 696
471 if (cpumask_test_cpu(cpu, cpu_active_mask)) { 697 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
472 pd = padata_alloc_pd(pinst, pinst->cpumask); 698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
699 pinst->cpumask.cbcpu);
473 if (!pd) 700 if (!pd)
474 return -ENOMEM; 701 return -ENOMEM;
475 702
476 padata_replace(pinst, pd); 703 padata_replace(pinst, pd);
704
705 if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
706 padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
707 __padata_start(pinst);
477 } 708 }
478 709
479 return 0; 710 return 0;
480} 711}
481 712
482/* 713 /**
483 * padata_add_cpu - add a cpu to the padata cpumask 714 * padata_add_cpu - add a cpu to one or both (parallel and serial)
715 * padata cpumasks.
484 * 716 *
485 * @pinst: padata instance 717 * @pinst: padata instance
486 * @cpu: cpu to add 718 * @cpu: cpu to add
719 * @mask: bitmask of flags specifying to which cpumask @cpu should be added.
720 * The @mask may be any combination of the following flags:
721 * PADATA_CPU_SERIAL - serial cpumask
722 * PADATA_CPU_PARALLEL - parallel cpumask
487 */ 723 */
488int padata_add_cpu(struct padata_instance *pinst, int cpu) 724
725int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
489{ 726{
490 int err; 727 int err;
491 728
492 might_sleep(); 729 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
730 return -EINVAL;
493 731
494 mutex_lock(&pinst->lock); 732 mutex_lock(&pinst->lock);
495 733
496 cpumask_set_cpu(cpu, pinst->cpumask); 734 get_online_cpus();
735 if (mask & PADATA_CPU_SERIAL)
736 cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
737 if (mask & PADATA_CPU_PARALLEL)
738 cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
739
497 err = __padata_add_cpu(pinst, cpu); 740 err = __padata_add_cpu(pinst, cpu);
741 put_online_cpus();
498 742
499 mutex_unlock(&pinst->lock); 743 mutex_unlock(&pinst->lock);
500 744
@@ -504,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu);
504 748
505static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) 749static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
506{ 750{
507 struct parallel_data *pd; 751 struct parallel_data *pd = NULL;
508 752
509 if (cpumask_test_cpu(cpu, cpu_online_mask)) { 753 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
510 pd = padata_alloc_pd(pinst, pinst->cpumask); 754
755 if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
756 !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
757 __padata_stop(pinst);
758
759 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
760 pinst->cpumask.cbcpu);
511 if (!pd) 761 if (!pd)
512 return -ENOMEM; 762 return -ENOMEM;
513 763
@@ -517,22 +767,34 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
517 return 0; 767 return 0;
518} 768}
519 769
520/* 770 /**
521 * padata_remove_cpu - remove a cpu from the padata cpumask 771 * padata_remove_cpu - remove a cpu from one or both (serial and parallel)
772 * padata cpumasks.
522 * 773 *
523 * @pinst: padata instance 774 * @pinst: padata instance
524 * @cpu: cpu to remove 775 * @cpu: cpu to remove
776 * @mask: bitmask specifying from which cpumask @cpu should be removed
777 * The @mask may be any combination of the following flags:
778 * PADATA_CPU_SERIAL - serial cpumask
779 * PADATA_CPU_PARALLEL - parallel cpumask
525 */ 780 */
526int padata_remove_cpu(struct padata_instance *pinst, int cpu) 781int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
527{ 782{
528 int err; 783 int err;
529 784
530 might_sleep(); 785 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
786 return -EINVAL;
531 787
532 mutex_lock(&pinst->lock); 788 mutex_lock(&pinst->lock);
533 789
534 cpumask_clear_cpu(cpu, pinst->cpumask); 790 get_online_cpus();
791 if (mask & PADATA_CPU_SERIAL)
792 cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
793 if (mask & PADATA_CPU_PARALLEL)
794 cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
795
535 err = __padata_remove_cpu(pinst, cpu); 796 err = __padata_remove_cpu(pinst, cpu);
797 put_online_cpus();
536 798
537 mutex_unlock(&pinst->lock); 799 mutex_unlock(&pinst->lock);
538 800
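Since both hotplug helpers now take a flag mask, a single call can touch one cpumask or both. A brief sketch (hypothetical caller; the return codes are those documented above):

#include <linux/padata.h>

/* Give cpu 3 both roles, then later retire it from parallel work only. */
static int reshuffle_cpu3(struct padata_instance *pinst)
{
	int err;

	err = padata_add_cpu(pinst, 3, PADATA_CPU_PARALLEL | PADATA_CPU_SERIAL);
	if (err)
		return err;

	return padata_remove_cpu(pinst, 3, PADATA_CPU_PARALLEL);
}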
@@ -540,38 +802,52 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
540} 802}
541EXPORT_SYMBOL(padata_remove_cpu); 803EXPORT_SYMBOL(padata_remove_cpu);
542 804
543/* 805/**
544 * padata_start - start the parallel processing 806 * padata_start - start the parallel processing
545 * 807 *
546 * @pinst: padata instance to start 808 * @pinst: padata instance to start
547 */ 809 */
548void padata_start(struct padata_instance *pinst) 810int padata_start(struct padata_instance *pinst)
549{ 811{
550 might_sleep(); 812 int err = 0;
551 813
552 mutex_lock(&pinst->lock); 814 mutex_lock(&pinst->lock);
553 pinst->flags |= PADATA_INIT; 815
816 if (pinst->flags & PADATA_INVALID)
817 err = -EINVAL;
818
819 __padata_start(pinst);
820
554 mutex_unlock(&pinst->lock); 821 mutex_unlock(&pinst->lock);
822
823 return err;
555} 824}
556EXPORT_SYMBOL(padata_start); 825EXPORT_SYMBOL(padata_start);
557 826
558/* 827/**
559 * padata_stop - stop the parallel processing 828 * padata_stop - stop the parallel processing
560 * 829 *
561 * @pinst: padata instance to stop 830 * @pinst: padata instance to stop
562 */ 831 */
563void padata_stop(struct padata_instance *pinst) 832void padata_stop(struct padata_instance *pinst)
564{ 833{
565 might_sleep();
566
567 mutex_lock(&pinst->lock); 834 mutex_lock(&pinst->lock);
568 pinst->flags &= ~PADATA_INIT; 835 __padata_stop(pinst);
569 mutex_unlock(&pinst->lock); 836 mutex_unlock(&pinst->lock);
570} 837}
571EXPORT_SYMBOL(padata_stop); 838EXPORT_SYMBOL(padata_stop);
572 839
573static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, 840#ifdef CONFIG_HOTPLUG_CPU
574 unsigned long action, void *hcpu) 841
842static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
843{
844 return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
845 cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
846}
847
848
849static int padata_cpu_callback(struct notifier_block *nfb,
850 unsigned long action, void *hcpu)
575{ 851{
576 int err; 852 int err;
577 struct padata_instance *pinst; 853 struct padata_instance *pinst;
@@ -582,29 +858,29 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
582 switch (action) { 858 switch (action) {
583 case CPU_ONLINE: 859 case CPU_ONLINE:
584 case CPU_ONLINE_FROZEN: 860 case CPU_ONLINE_FROZEN:
585 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 861 if (!pinst_has_cpu(pinst, cpu))
586 break; 862 break;
587 mutex_lock(&pinst->lock); 863 mutex_lock(&pinst->lock);
588 err = __padata_add_cpu(pinst, cpu); 864 err = __padata_add_cpu(pinst, cpu);
589 mutex_unlock(&pinst->lock); 865 mutex_unlock(&pinst->lock);
590 if (err) 866 if (err)
591 return NOTIFY_BAD; 867 return notifier_from_errno(err);
592 break; 868 break;
593 869
594 case CPU_DOWN_PREPARE: 870 case CPU_DOWN_PREPARE:
595 case CPU_DOWN_PREPARE_FROZEN: 871 case CPU_DOWN_PREPARE_FROZEN:
596 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 872 if (!pinst_has_cpu(pinst, cpu))
597 break; 873 break;
598 mutex_lock(&pinst->lock); 874 mutex_lock(&pinst->lock);
599 err = __padata_remove_cpu(pinst, cpu); 875 err = __padata_remove_cpu(pinst, cpu);
600 mutex_unlock(&pinst->lock); 876 mutex_unlock(&pinst->lock);
601 if (err) 877 if (err)
602 return NOTIFY_BAD; 878 return notifier_from_errno(err);
603 break; 879 break;
604 880
605 case CPU_UP_CANCELED: 881 case CPU_UP_CANCELED:
606 case CPU_UP_CANCELED_FROZEN: 882 case CPU_UP_CANCELED_FROZEN:
607 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 883 if (!pinst_has_cpu(pinst, cpu))
608 break; 884 break;
609 mutex_lock(&pinst->lock); 885 mutex_lock(&pinst->lock);
610 __padata_remove_cpu(pinst, cpu); 886 __padata_remove_cpu(pinst, cpu);
@@ -612,7 +888,7 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
612 888
613 case CPU_DOWN_FAILED: 889 case CPU_DOWN_FAILED:
614 case CPU_DOWN_FAILED_FROZEN: 890 case CPU_DOWN_FAILED_FROZEN:
615 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 891 if (!pinst_has_cpu(pinst, cpu))
616 break; 892 break;
617 mutex_lock(&pinst->lock); 893 mutex_lock(&pinst->lock);
618 __padata_add_cpu(pinst, cpu); 894 __padata_add_cpu(pinst, cpu);
@@ -621,77 +897,239 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
621 897
622 return NOTIFY_OK; 898 return NOTIFY_OK;
623} 899}
900#endif
901
902static void __padata_free(struct padata_instance *pinst)
903{
904#ifdef CONFIG_HOTPLUG_CPU
905 unregister_hotcpu_notifier(&pinst->cpu_notifier);
906#endif
907
908 padata_stop(pinst);
909 padata_free_pd(pinst->pd);
910 free_cpumask_var(pinst->cpumask.pcpu);
911 free_cpumask_var(pinst->cpumask.cbcpu);
912 kfree(pinst);
913}
914
915#define kobj2pinst(_kobj) \
916 container_of(_kobj, struct padata_instance, kobj)
917#define attr2pentry(_attr) \
918 container_of(_attr, struct padata_sysfs_entry, attr)
919
920static void padata_sysfs_release(struct kobject *kobj)
921{
922 struct padata_instance *pinst = kobj2pinst(kobj);
923 __padata_free(pinst);
924}
925
926struct padata_sysfs_entry {
927 struct attribute attr;
928 ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
929 ssize_t (*store)(struct padata_instance *, struct attribute *,
930 const char *, size_t);
931};
932
933static ssize_t show_cpumask(struct padata_instance *pinst,
934 struct attribute *attr, char *buf)
935{
936 struct cpumask *cpumask;
937 ssize_t len;
938
939 mutex_lock(&pinst->lock);
940 if (!strcmp(attr->name, "serial_cpumask"))
941 cpumask = pinst->cpumask.cbcpu;
942 else
943 cpumask = pinst->cpumask.pcpu;
944
945 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
946 nr_cpu_ids);
947 if (PAGE_SIZE - len < 2)
948 len = -EINVAL;
949 else
950 len += sprintf(buf + len, "\n");
951
952 mutex_unlock(&pinst->lock);
953 return len;
954}
955
956static ssize_t store_cpumask(struct padata_instance *pinst,
957 struct attribute *attr,
958 const char *buf, size_t count)
959{
960 cpumask_var_t new_cpumask;
961 ssize_t ret;
962 int mask_type;
963
964 if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
965 return -ENOMEM;
966
967 ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
968 nr_cpumask_bits);
969 if (ret < 0)
970 goto out;
971
972 mask_type = !strcmp(attr->name, "serial_cpumask") ?
973 PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
974 ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
975 if (!ret)
976 ret = count;
977
978out:
979 free_cpumask_var(new_cpumask);
980 return ret;
981}
982
983#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
984 static struct padata_sysfs_entry _name##_attr = \
985 __ATTR(_name, 0644, _show_name, _store_name)
986#define PADATA_ATTR_RO(_name, _show_name) \
987 static struct padata_sysfs_entry _name##_attr = \
988 __ATTR(_name, 0400, _show_name, NULL)
989
990PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
991PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
624 992
625/* 993/*
626 * padata_alloc - allocate and initialize a padata instance 994 * Padata sysfs provides the following objects:
995 * serial_cpumask [RW] - cpumask for serial workers
996 * parallel_cpumask [RW] - cpumask for parallel workers
997 */
998static struct attribute *padata_default_attrs[] = {
999 &serial_cpumask_attr.attr,
1000 &parallel_cpumask_attr.attr,
1001 NULL,
1002};
1003
1004static ssize_t padata_sysfs_show(struct kobject *kobj,
1005 struct attribute *attr, char *buf)
1006{
1007 struct padata_instance *pinst;
1008 struct padata_sysfs_entry *pentry;
1009 ssize_t ret = -EIO;
1010
1011 pinst = kobj2pinst(kobj);
1012 pentry = attr2pentry(attr);
1013 if (pentry->show)
1014 ret = pentry->show(pinst, attr, buf);
1015
1016 return ret;
1017}
1018
1019static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
1020 const char *buf, size_t count)
1021{
1022 struct padata_instance *pinst;
1023 struct padata_sysfs_entry *pentry;
1024 ssize_t ret = -EIO;
1025
1026 pinst = kobj2pinst(kobj);
1027 pentry = attr2pentry(attr);
1028 if (pentry->store)
1029 ret = pentry->store(pinst, attr, buf, count);
1030
1031 return ret;
1032}
1033
1034static const struct sysfs_ops padata_sysfs_ops = {
1035 .show = padata_sysfs_show,
1036 .store = padata_sysfs_store,
1037};
1038
1039static struct kobj_type padata_attr_type = {
1040 .sysfs_ops = &padata_sysfs_ops,
1041 .default_attrs = padata_default_attrs,
1042 .release = padata_sysfs_release,
1043};
1044
1045/**
1046 * padata_alloc_possible - Allocate and initialize padata instance.
1047 * Use the cpu_possible_mask for serial and
1048 * parallel workers.
627 * 1049 *
628 * @cpumask: cpumask that padata uses for parallelization
629 * @wq: workqueue to use for the allocated padata instance 1050 * @wq: workqueue to use for the allocated padata instance
630 */ 1051 */
631struct padata_instance *padata_alloc(const struct cpumask *cpumask, 1052struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
632 struct workqueue_struct *wq) 1053{
1054 return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
1055}
1056EXPORT_SYMBOL(padata_alloc_possible);
1057
1058/**
1059 * padata_alloc - allocate and initialize a padata instance and specify
1060 * cpumasks for serial and parallel workers.
1061 *
1062 * @wq: workqueue to use for the allocated padata instance
1063 * @pcpumask: cpumask that will be used for padata parallelization
1064 * @cbcpumask: cpumask that will be used for padata serialization
1065 */
1066struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1067 const struct cpumask *pcpumask,
1068 const struct cpumask *cbcpumask)
633{ 1069{
634 int err;
635 struct padata_instance *pinst; 1070 struct padata_instance *pinst;
636 struct parallel_data *pd; 1071 struct parallel_data *pd = NULL;
637 1072
638 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); 1073 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
639 if (!pinst) 1074 if (!pinst)
640 goto err; 1075 goto err;
641 1076
642 pd = padata_alloc_pd(pinst, cpumask); 1077 get_online_cpus();
643 if (!pd) 1078 if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
1079 goto err_free_inst;
1080 if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
1081 free_cpumask_var(pinst->cpumask.pcpu);
644 goto err_free_inst; 1082 goto err_free_inst;
1083 }
1084 if (!padata_validate_cpumask(pinst, pcpumask) ||
1085 !padata_validate_cpumask(pinst, cbcpumask))
1086 goto err_free_masks;
645 1087
646 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) 1088 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
647 goto err_free_pd; 1089 if (!pd)
1090 goto err_free_masks;
648 1091
649 rcu_assign_pointer(pinst->pd, pd); 1092 rcu_assign_pointer(pinst->pd, pd);
650 1093
651 pinst->wq = wq; 1094 pinst->wq = wq;
652 1095
653 cpumask_copy(pinst->cpumask, cpumask); 1096 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
1097 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
654 1098
655 pinst->flags = 0; 1099 pinst->flags = 0;
656 1100
1101#ifdef CONFIG_HOTPLUG_CPU
657 pinst->cpu_notifier.notifier_call = padata_cpu_callback; 1102 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
658 pinst->cpu_notifier.priority = 0; 1103 pinst->cpu_notifier.priority = 0;
659 err = register_hotcpu_notifier(&pinst->cpu_notifier); 1104 register_hotcpu_notifier(&pinst->cpu_notifier);
660 if (err) 1105#endif
661 goto err_free_cpumask; 1106
1107 put_online_cpus();
662 1108
1109 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1110 kobject_init(&pinst->kobj, &padata_attr_type);
663 mutex_init(&pinst->lock); 1111 mutex_init(&pinst->lock);
664 1112
665 return pinst; 1113 return pinst;
666 1114
667err_free_cpumask: 1115err_free_masks:
668 free_cpumask_var(pinst->cpumask); 1116 free_cpumask_var(pinst->cpumask.pcpu);
669err_free_pd: 1117 free_cpumask_var(pinst->cpumask.cbcpu);
670 padata_free_pd(pd);
671err_free_inst: 1118err_free_inst:
672 kfree(pinst); 1119 kfree(pinst);
1120 put_online_cpus();
673err: 1121err:
674 return NULL; 1122 return NULL;
675} 1123}
676EXPORT_SYMBOL(padata_alloc); 1124EXPORT_SYMBOL(padata_alloc);
677 1125
678/* 1126/**
679 * padata_free - free a padata instance 1127 * padata_free - free a padata instance
680 * 1128 *
681 * @ padata_inst: padata instance to free 1129 * @padata_inst: padata instance to free
682 */ 1130 */
683void padata_free(struct padata_instance *pinst) 1131void padata_free(struct padata_instance *pinst)
684{ 1132{
685 padata_stop(pinst); 1133 kobject_put(&pinst->kobj);
686
687 synchronize_rcu();
688
689 while (atomic_read(&pinst->pd->refcnt) != 0)
690 yield();
691
692 unregister_hotcpu_notifier(&pinst->cpu_notifier);
693 padata_free_pd(pinst->pd);
694 free_cpumask_var(pinst->cpumask);
695 kfree(pinst);
696} 1134}
697EXPORT_SYMBOL(padata_free); 1135EXPORT_SYMBOL(padata_free);
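Taken together, an instance's lifecycle under the new API looks roughly like the following (hypothetical module; the workqueue name and error handling are illustrative, and padata_free() now only drops the sysfs kobject, whose release callback does the actual teardown):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/padata.h>

static struct workqueue_struct *my_wq;
static struct padata_instance *my_pinst;

static int __init my_init(void)
{
	int err;

	my_wq = create_workqueue("my_padata_wq");
	if (!my_wq)
		return -ENOMEM;

	/* Parallel and serial workers both start on cpu_possible_mask. */
	my_pinst = padata_alloc_possible(my_wq);
	if (!my_pinst) {
		err = -ENOMEM;
		goto err_destroy_wq;
	}

	err = padata_start(my_pinst);	/* -EINVAL if the cpumasks are invalid */
	if (err)
		goto err_free_padata;

	return 0;

err_free_padata:
	padata_free(my_pinst);
err_destroy_wq:
	destroy_workqueue(my_wq);
	return err;
}

static void __exit my_exit(void)
{
	padata_stop(my_pinst);
	padata_free(my_pinst);
	destroy_workqueue(my_wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");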
diff --git a/kernel/panic.c b/kernel/panic.c
index 13d966b4c14a..4c13b1a88ebb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,9 @@
24#include <linux/nmi.h> 24#include <linux/nmi.h>
25#include <linux/dmi.h> 25#include <linux/dmi.h>
26 26
27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18
29
27int panic_on_oops; 30int panic_on_oops;
28static unsigned long tainted_mask; 31static unsigned long tainted_mask;
29static int pause_on_oops; 32static int pause_on_oops;
@@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 39
37EXPORT_SYMBOL(panic_notifier_list); 40EXPORT_SYMBOL(panic_notifier_list);
38 41
39/* Returns how long it waited in ms */ 42static long no_blink(int state)
40long (*panic_blink)(long time);
41EXPORT_SYMBOL(panic_blink);
42
43static void panic_blink_one_second(void)
44{ 43{
45 static long i = 0, end; 44 return 0;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67} 45}
68 46
47/* Returns how long it waited in ms */
48long (*panic_blink)(int state);
49EXPORT_SYMBOL(panic_blink);
50
69/** 51/**
70 * panic - halt the system 52 * panic - halt the system
71 * @fmt: The text string to print 53 * @fmt: The text string to print
@@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...)
78{ 60{
79 static char buf[1024]; 61 static char buf[1024];
80 va_list args; 62 va_list args;
81 long i; 63 long i, i_next = 0;
64 int state = 0;
82 65
83 /* 66 /*
84 * It's possible to come here directly from a panic-assertion and 67 * It's possible to come here directly from a panic-assertion and
@@ -87,6 +70,7 @@ NORET_TYPE void panic(const char * fmt, ...)
87 */ 70 */
88 preempt_disable(); 71 preempt_disable();
89 72
73 console_verbose();
90 bust_spinlocks(1); 74 bust_spinlocks(1);
91 va_start(args, fmt); 75 va_start(args, fmt);
92 vsnprintf(buf, sizeof(buf), fmt, args); 76 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -116,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...)
116 100
117 bust_spinlocks(0); 101 bust_spinlocks(0);
118 102
103 if (!panic_blink)
104 panic_blink = no_blink;
105
119 if (panic_timeout > 0) { 106 if (panic_timeout > 0) {
120 /* 107 /*
121 * Delay timeout seconds before rebooting the machine. 108 * Delay timeout seconds before rebooting the machine.
@@ -123,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...)
123 */ 110 */
124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 111 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
125 112
126 for (i = 0; i < panic_timeout; i++) { 113 for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
127 touch_nmi_watchdog(); 114 touch_nmi_watchdog();
128 panic_blink_one_second(); 115 if (i >= i_next) {
116 i += panic_blink(state ^= 1);
117 i_next = i + 3600 / PANIC_BLINK_SPD;
118 }
119 mdelay(PANIC_TIMER_STEP);
129 } 120 }
130 /* 121 /*
131 * This will not be a clean reboot, with everything 122 * This will not be a clean reboot, with everything
@@ -151,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...)
151 } 142 }
152#endif 143#endif
153 local_irq_enable(); 144 local_irq_enable();
154 while (1) { 145 for (i = 0; ; i += PANIC_TIMER_STEP) {
155 touch_softlockup_watchdog(); 146 touch_softlockup_watchdog();
156 panic_blink_one_second(); 147 if (i >= i_next) {
148 i += panic_blink(state ^= 1);
149 i_next = i + 3600 / PANIC_BLINK_SPD;
150 }
151 mdelay(PANIC_TIMER_STEP);
157 } 152 }
158} 153}
159 154
@@ -178,6 +173,7 @@ static const struct tnt tnts[] = {
178 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, 173 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
179 { TAINT_WARN, 'W', ' ' }, 174 { TAINT_WARN, 'W', ' ' },
180 { TAINT_CRAP, 'C', ' ' }, 175 { TAINT_CRAP, 'C', ' ' },
176 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
181}; 177};
182 178
183/** 179/**
@@ -194,6 +190,7 @@ static const struct tnt tnts[] = {
194 * 'A' - ACPI table overridden. 190 * 'A' - ACPI table overridden.
195 * 'W' - Taint on warning. 191 * 'W' - Taint on warning.
196 * 'C' - modules from drivers/staging are loaded. 192 * 'C' - modules from drivers/staging are loaded.
193 * 'I' - Working around severe firmware bug.
197 * 194 *
198 * The string is overwritten by the next call to print_tainted(). 195 * The string is overwritten by the next call to print_tainted().
199 */ 196 */
@@ -341,7 +338,7 @@ static int init_oops_id(void)
341} 338}
342late_initcall(init_oops_id); 339late_initcall(init_oops_id);
343 340
344static void print_oops_end_marker(void) 341void print_oops_end_marker(void)
345{ 342{
346 init_oops_id(); 343 init_oops_id();
347 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 344 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
@@ -365,7 +362,8 @@ struct slowpath_args {
365 va_list args; 362 va_list args;
366}; 363};
367 364
368static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) 365static void warn_slowpath_common(const char *file, int line, void *caller,
366 unsigned taint, struct slowpath_args *args)
369{ 367{
370 const char *board; 368 const char *board;
371 369
@@ -381,7 +379,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc
381 print_modules(); 379 print_modules();
382 dump_stack(); 380 dump_stack();
383 print_oops_end_marker(); 381 print_oops_end_marker();
384 add_taint(TAINT_WARN); 382 add_taint(taint);
385} 383}
386 384
387void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 385void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
@@ -390,14 +388,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
390 388
391 args.fmt = fmt; 389 args.fmt = fmt;
392 va_start(args.args, fmt); 390 va_start(args.args, fmt);
393 warn_slowpath_common(file, line, __builtin_return_address(0), &args); 391 warn_slowpath_common(file, line, __builtin_return_address(0),
392 TAINT_WARN, &args);
394 va_end(args.args); 393 va_end(args.args);
395} 394}
396EXPORT_SYMBOL(warn_slowpath_fmt); 395EXPORT_SYMBOL(warn_slowpath_fmt);
397 396
397void warn_slowpath_fmt_taint(const char *file, int line,
398 unsigned taint, const char *fmt, ...)
399{
400 struct slowpath_args args;
401
402 args.fmt = fmt;
403 va_start(args.args, fmt);
404 warn_slowpath_common(file, line, __builtin_return_address(0),
405 taint, &args);
406 va_end(args.args);
407}
408EXPORT_SYMBOL(warn_slowpath_fmt_taint);
409
398void warn_slowpath_null(const char *file, int line) 410void warn_slowpath_null(const char *file, int line)
399{ 411{
400 warn_slowpath_common(file, line, __builtin_return_address(0), NULL); 412 warn_slowpath_common(file, line, __builtin_return_address(0),
413 TAINT_WARN, NULL);
401} 414}
402EXPORT_SYMBOL(warn_slowpath_null); 415EXPORT_SYMBOL(warn_slowpath_null);
403#endif 416#endif
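With the reworked loop, the blink hook no longer drives its own timing: it is handed an alternating on/off state and returns how many milliseconds it spent, while panic() itself paces the loop in PANIC_TIMER_STEP chunks. A sketch of wiring one up (the LED helper is a made-up placeholder; only the panic_blink pointer comes from this code):

#include <linux/kernel.h>
#include <linux/init.h>

/* Placeholder for a real board-specific LED helper. */
static void my_board_set_led(int on)
{
	(void)on;
}

/* Called with state alternating between 0 and 1; return the ms spent here. */
static long my_panic_blink(int state)
{
	my_board_set_led(state);
	return 0;	/* the panic loop handles the mdelay() pacing itself */
}

static int __init my_blink_init(void)
{
	panic_blink = my_panic_blink;
	return 0;
}
late_initcall(my_blink_init);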
diff --git a/kernel/params.c b/kernel/params.c
index 0b30ecd53a52..08107d181758 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,6 +31,42 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock);
36
37/* This just allows us to keep track of which parameters are kmalloced. */
38struct kmalloced_param {
39 struct list_head list;
40 char val[];
41};
42static LIST_HEAD(kmalloced_params);
43
44static void *kmalloc_parameter(unsigned int size)
45{
46 struct kmalloced_param *p;
47
48 p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
49 if (!p)
50 return NULL;
51
52 list_add(&p->list, &kmalloced_params);
53 return p->val;
54}
55
56/* Does nothing if parameter wasn't kmalloced above. */
57static void maybe_kfree_parameter(void *param)
58{
59 struct kmalloced_param *p;
60
61 list_for_each_entry(p, &kmalloced_params, list) {
62 if (p->val == param) {
63 list_del(&p->list);
64 kfree(p);
65 break;
66 }
67 }
68}
69
34static inline char dash2underscore(char c) 70static inline char dash2underscore(char c)
35{ 71{
36 if (c == '-') 72 if (c == '-')
@@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname)
49 85
50static int parse_one(char *param, 86static int parse_one(char *param,
51 char *val, 87 char *val,
52 struct kernel_param *params, 88 const struct kernel_param *params,
53 unsigned num_params, 89 unsigned num_params,
54 int (*handle_unknown)(char *param, char *val)) 90 int (*handle_unknown)(char *param, char *val))
55{ 91{
56 unsigned int i; 92 unsigned int i;
93 int err;
57 94
58 /* Find parameter */ 95 /* Find parameter */
59 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
60 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* Noone handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL;
61 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
62 params[i].set); 102 params[i].ops->set);
63 return params[i].set(val, &params[i]); 103 mutex_lock(&param_lock);
104 err = params[i].ops->set(val, &params[i]);
105 mutex_unlock(&param_lock);
106 return err;
64 } 107 }
65 } 108 }
66 109
@@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val)
128/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 171/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
129int parse_args(const char *name, 172int parse_args(const char *name,
130 char *args, 173 char *args,
131 struct kernel_param *params, 174 const struct kernel_param *params,
132 unsigned num, 175 unsigned num,
133 int (*unknown)(char *param, char *val)) 176 int (*unknown)(char *param, char *val))
134{ 177{
@@ -176,22 +219,29 @@ int parse_args(const char *name,
176 219
177/* Lazy bastard, eh? */ 220/* Lazy bastard, eh? */
178#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 221#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
179 int param_set_##name(const char *val, struct kernel_param *kp) \ 222 int param_set_##name(const char *val, const struct kernel_param *kp) \
180 { \ 223 { \
181 tmptype l; \ 224 tmptype l; \
182 int ret; \ 225 int ret; \
183 \ 226 \
184 if (!val) return -EINVAL; \
185 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
186 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret == -EINVAL || ((type)l != l)) \
187 return -EINVAL; \ 229 return -EINVAL; \
188 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
189 return 0; \ 231 return 0; \
190 } \ 232 } \
191 int param_get_##name(char *buffer, struct kernel_param *kp) \ 233 int param_get_##name(char *buffer, const struct kernel_param *kp) \
192 { \ 234 { \
193 return sprintf(buffer, format, *((type *)kp->arg)); \ 235 return sprintf(buffer, format, *((type *)kp->arg)); \
194 } 236 } \
237 struct kernel_param_ops param_ops_##name = { \
238 .set = param_set_##name, \
239 .get = param_get_##name, \
240 }; \
241 EXPORT_SYMBOL(param_set_##name); \
242 EXPORT_SYMBOL(param_get_##name); \
243 EXPORT_SYMBOL(param_ops_##name)
244
195 245
196STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); 246STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
197STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 247STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
@@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
201STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 251STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
202STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 252STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
203 253
204int param_set_charp(const char *val, struct kernel_param *kp) 254int param_set_charp(const char *val, const struct kernel_param *kp)
205{ 255{
206 if (!val) {
207 printk(KERN_ERR "%s: string parameter expected\n",
208 kp->name);
209 return -EINVAL;
210 }
211
212 if (strlen(val) > 1024) { 256 if (strlen(val) > 1024) {
213 printk(KERN_ERR "%s: string parameter too long\n", 257 printk(KERN_ERR "%s: string parameter too long\n",
214 kp->name); 258 kp->name);
215 return -ENOSPC; 259 return -ENOSPC;
216 } 260 }
217 261
218 /* This is a hack. We can't need to strdup in early boot, and we 262 maybe_kfree_parameter(*(char **)kp->arg);
263
264 /* This is a hack. We can't kmalloc in early boot, and we
219 * don't need to; this mangled commandline is preserved. */ 265 * don't need to; this mangled commandline is preserved. */
220 if (slab_is_available()) { 266 if (slab_is_available()) {
221 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 267 *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
222 if (!*(char **)kp->arg) 268 if (!*(char **)kp->arg)
223 return -ENOMEM; 269 return -ENOMEM;
270 strcpy(*(char **)kp->arg, val);
224 } else 271 } else
225 *(const char **)kp->arg = val; 272 *(const char **)kp->arg = val;
226 273
227 return 0; 274 return 0;
228} 275}
276EXPORT_SYMBOL(param_set_charp);
229 277
230int param_get_charp(char *buffer, struct kernel_param *kp) 278int param_get_charp(char *buffer, const struct kernel_param *kp)
231{ 279{
232 return sprintf(buffer, "%s", *((char **)kp->arg)); 280 return sprintf(buffer, "%s", *((char **)kp->arg));
233} 281}
282EXPORT_SYMBOL(param_get_charp);
283
284static void param_free_charp(void *arg)
285{
286 maybe_kfree_parameter(*((char **)arg));
287}
288
289struct kernel_param_ops param_ops_charp = {
290 .set = param_set_charp,
291 .get = param_get_charp,
292 .free = param_free_charp,
293};
294EXPORT_SYMBOL(param_ops_charp);
234 295
235/* Actually could be a bool or an int, for historical reasons. */ 296/* Actually could be a bool or an int, for historical reasons. */
236int param_set_bool(const char *val, struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
237{ 298{
238 bool v; 299 bool v;
239 300
@@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp)
258 *(int *)kp->arg = v; 319 *(int *)kp->arg = v;
259 return 0; 320 return 0;
260} 321}
322EXPORT_SYMBOL(param_set_bool);
261 323
262int param_get_bool(char *buffer, struct kernel_param *kp) 324int param_get_bool(char *buffer, const struct kernel_param *kp)
263{ 325{
264 bool val; 326 bool val;
265 if (kp->flags & KPARAM_ISBOOL) 327 if (kp->flags & KPARAM_ISBOOL)
@@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp)
270 /* Y and N chosen as being relatively non-coder friendly */ 332 /* Y and N chosen as being relatively non-coder friendly */
271 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 333 return sprintf(buffer, "%c", val ? 'Y' : 'N');
272} 334}
335EXPORT_SYMBOL(param_get_bool);
336
337struct kernel_param_ops param_ops_bool = {
338 .set = param_set_bool,
339 .get = param_get_bool,
340};
341EXPORT_SYMBOL(param_ops_bool);
273 342
274/* This one must be bool. */ 343/* This one must be bool. */
275int param_set_invbool(const char *val, struct kernel_param *kp) 344int param_set_invbool(const char *val, const struct kernel_param *kp)
276{ 345{
277 int ret; 346 int ret;
278 bool boolval; 347 bool boolval;
@@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp)
285 *(bool *)kp->arg = !boolval; 354 *(bool *)kp->arg = !boolval;
286 return ret; 355 return ret;
287} 356}
357EXPORT_SYMBOL(param_set_invbool);
288 358
289int param_get_invbool(char *buffer, struct kernel_param *kp) 359int param_get_invbool(char *buffer, const struct kernel_param *kp)
290{ 360{
291 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); 361 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
292} 362}
363EXPORT_SYMBOL(param_get_invbool);
364
365struct kernel_param_ops param_ops_invbool = {
366 .set = param_set_invbool,
367 .get = param_get_invbool,
368};
369EXPORT_SYMBOL(param_ops_invbool);
293 370
294/* We break the rule and mangle the string. */ 371/* We break the rule and mangle the string. */
295static int param_array(const char *name, 372static int param_array(const char *name,
296 const char *val, 373 const char *val,
297 unsigned int min, unsigned int max, 374 unsigned int min, unsigned int max,
298 void *elem, int elemsize, 375 void *elem, int elemsize,
299 int (*set)(const char *, struct kernel_param *kp), 376 int (*set)(const char *, const struct kernel_param *kp),
300 u16 flags, 377 u16 flags,
301 unsigned int *num) 378 unsigned int *num)
302{ 379{
@@ -309,12 +386,6 @@ static int param_array(const char *name,
309 kp.arg = elem; 386 kp.arg = elem;
310 kp.flags = flags; 387 kp.flags = flags;
311 388
312 /* No equals sign? */
313 if (!val) {
314 printk(KERN_ERR "%s: expects arguments\n", name);
315 return -EINVAL;
316 }
317
318 *num = 0; 389 *num = 0;
319 /* We expect a comma-separated list of values. */ 390 /* We expect a comma-separated list of values. */
320 do { 391 do {
@@ -330,6 +401,7 @@ static int param_array(const char *name,
330 /* nul-terminate and parse */ 401 /* nul-terminate and parse */
331 save = val[len]; 402 save = val[len];
332 ((char *)val)[len] = '\0'; 403 ((char *)val)[len] = '\0';
404 BUG_ON(!mutex_is_locked(&param_lock));
333 ret = set(val, &kp); 405 ret = set(val, &kp);
334 406
335 if (ret != 0) 407 if (ret != 0)
@@ -347,17 +419,17 @@ static int param_array(const char *name,
347 return 0; 419 return 0;
348} 420}
349 421
350int param_array_set(const char *val, struct kernel_param *kp) 422static int param_array_set(const char *val, const struct kernel_param *kp)
351{ 423{
352 const struct kparam_array *arr = kp->arr; 424 const struct kparam_array *arr = kp->arr;
353 unsigned int temp_num; 425 unsigned int temp_num;
354 426
355 return param_array(kp->name, val, 1, arr->max, arr->elem, 427 return param_array(kp->name, val, 1, arr->max, arr->elem,
356 arr->elemsize, arr->set, kp->flags, 428 arr->elemsize, arr->ops->set, kp->flags,
357 arr->num ?: &temp_num); 429 arr->num ?: &temp_num);
358} 430}
359 431
360int param_array_get(char *buffer, struct kernel_param *kp) 432static int param_array_get(char *buffer, const struct kernel_param *kp)
361{ 433{
362 int i, off, ret; 434 int i, off, ret;
363 const struct kparam_array *arr = kp->arr; 435 const struct kparam_array *arr = kp->arr;
@@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp)
368 if (i) 440 if (i)
369 buffer[off++] = ','; 441 buffer[off++] = ',';
370 p.arg = arr->elem + arr->elemsize * i; 442 p.arg = arr->elem + arr->elemsize * i;
371 ret = arr->get(buffer + off, &p); 443 BUG_ON(!mutex_is_locked(&param_lock));
444 ret = arr->ops->get(buffer + off, &p);
372 if (ret < 0) 445 if (ret < 0)
373 return ret; 446 return ret;
374 off += ret; 447 off += ret;
@@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp)
377 return off; 450 return off;
378} 451}
379 452
380int param_set_copystring(const char *val, struct kernel_param *kp) 453static void param_array_free(void *arg)
454{
455 unsigned int i;
456 const struct kparam_array *arr = arg;
457
458 if (arr->ops->free)
459 for (i = 0; i < (arr->num ? *arr->num : arr->max); i++)
460 arr->ops->free(arr->elem + arr->elemsize * i);
461}
462
463struct kernel_param_ops param_array_ops = {
464 .set = param_array_set,
465 .get = param_array_get,
466 .free = param_array_free,
467};
468EXPORT_SYMBOL(param_array_ops);
469
470int param_set_copystring(const char *val, const struct kernel_param *kp)
381{ 471{
382 const struct kparam_string *kps = kp->str; 472 const struct kparam_string *kps = kp->str;
383 473
384 if (!val) {
385 printk(KERN_ERR "%s: missing param set value\n", kp->name);
386 return -EINVAL;
387 }
388 if (strlen(val)+1 > kps->maxlen) { 474 if (strlen(val)+1 > kps->maxlen) {
389 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", 475 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
390 kp->name, kps->maxlen-1); 476 kp->name, kps->maxlen-1);
@@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
393 strcpy(kps->string, val); 479 strcpy(kps->string, val);
394 return 0; 480 return 0;
395} 481}
482EXPORT_SYMBOL(param_set_copystring);
396 483
397int param_get_string(char *buffer, struct kernel_param *kp) 484int param_get_string(char *buffer, const struct kernel_param *kp)
398{ 485{
399 const struct kparam_string *kps = kp->str; 486 const struct kparam_string *kps = kp->str;
400 return strlcpy(buffer, kps->string, kps->maxlen); 487 return strlcpy(buffer, kps->string, kps->maxlen);
401} 488}
489EXPORT_SYMBOL(param_get_string);
490
491struct kernel_param_ops param_ops_string = {
492 .set = param_set_copystring,
493 .get = param_get_string,
494};
495EXPORT_SYMBOL(param_ops_string);
402 496
403/* sysfs output in /sys/modules/XYZ/parameters/ */ 497/* sysfs output in /sys/modules/XYZ/parameters/ */
404#define to_module_attr(n) container_of(n, struct module_attribute, attr) 498#define to_module_attr(n) container_of(n, struct module_attribute, attr)
@@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[];
409struct param_attribute 503struct param_attribute
410{ 504{
411 struct module_attribute mattr; 505 struct module_attribute mattr;
412 struct kernel_param *param; 506 const struct kernel_param *param;
413}; 507};
414 508
415struct module_param_attrs 509struct module_param_attrs
@@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
428 int count; 522 int count;
429 struct param_attribute *attribute = to_param_attr(mattr); 523 struct param_attribute *attribute = to_param_attr(mattr);
430 524
431 if (!attribute->param->get) 525 if (!attribute->param->ops->get)
432 return -EPERM; 526 return -EPERM;
433 527
434 count = attribute->param->get(buf, attribute->param); 528 mutex_lock(&param_lock);
529 count = attribute->param->ops->get(buf, attribute->param);
530 mutex_unlock(&param_lock);
435 if (count > 0) { 531 if (count > 0) {
436 strcat(buf, "\n"); 532 strcat(buf, "\n");
437 ++count; 533 ++count;
@@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
447 int err; 543 int err;
448 struct param_attribute *attribute = to_param_attr(mattr); 544 struct param_attribute *attribute = to_param_attr(mattr);
449 545
450 if (!attribute->param->set) 546 if (!attribute->param->ops->set)
451 return -EPERM; 547 return -EPERM;
452 548
453 err = attribute->param->set(buf, attribute->param); 549 mutex_lock(&param_lock);
550 err = attribute->param->ops->set(buf, attribute->param);
551 mutex_unlock(&param_lock);
454 if (!err) 552 if (!err)
455 return len; 553 return len;
456 return err; 554 return err;
@@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
464#endif 562#endif
465 563
466#ifdef CONFIG_SYSFS 564#ifdef CONFIG_SYSFS
565void __kernel_param_lock(void)
566{
567 mutex_lock(&param_lock);
568}
569EXPORT_SYMBOL(__kernel_param_lock);
570
571void __kernel_param_unlock(void)
572{
573 mutex_unlock(&param_lock);
574}
575EXPORT_SYMBOL(__kernel_param_unlock);
576
467/* 577/*
468 * add_sysfs_param - add a parameter to sysfs 578 * add_sysfs_param - add a parameter to sysfs
469 * @mk: struct module_kobject 579 * @mk: struct module_kobject
@@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
475 * if there's an error. 585 * if there's an error.
476 */ 586 */
477static __modinit int add_sysfs_param(struct module_kobject *mk, 587static __modinit int add_sysfs_param(struct module_kobject *mk,
478 struct kernel_param *kp, 588 const struct kernel_param *kp,
479 const char *name) 589 const char *name)
480{ 590{
481 struct module_param_attrs *new; 591 struct module_param_attrs *new;
@@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk)
557 * /sys/module/[mod->name]/parameters/ 667 * /sys/module/[mod->name]/parameters/
558 */ 668 */
559int module_param_sysfs_setup(struct module *mod, 669int module_param_sysfs_setup(struct module *mod,
560 struct kernel_param *kparam, 670 const struct kernel_param *kparam,
561 unsigned int num_params) 671 unsigned int num_params)
562{ 672{
563 int i, err; 673 int i, err;
@@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod)
602 712
603void destroy_params(const struct kernel_param *params, unsigned num) 713void destroy_params(const struct kernel_param *params, unsigned num)
604{ 714{
605 /* FIXME: This should free kmalloced charp parameters. It doesn't. */ 715 unsigned int i;
716
717 for (i = 0; i < num; i++)
718 if (params[i].ops->free)
719 params[i].ops->free(params[i].arg);
606} 720}
607 721
608static void __init kernel_add_sysfs_param(const char *name, 722static void __init kernel_add_sysfs_param(const char *name,
@@ -768,28 +882,3 @@ static int __init param_sysfs_init(void)
768subsys_initcall(param_sysfs_init); 882subsys_initcall(param_sysfs_init);
769 883
770#endif /* CONFIG_SYSFS */ 884#endif /* CONFIG_SYSFS */
771
772EXPORT_SYMBOL(param_set_byte);
773EXPORT_SYMBOL(param_get_byte);
774EXPORT_SYMBOL(param_set_short);
775EXPORT_SYMBOL(param_get_short);
776EXPORT_SYMBOL(param_set_ushort);
777EXPORT_SYMBOL(param_get_ushort);
778EXPORT_SYMBOL(param_set_int);
779EXPORT_SYMBOL(param_get_int);
780EXPORT_SYMBOL(param_set_uint);
781EXPORT_SYMBOL(param_get_uint);
782EXPORT_SYMBOL(param_set_long);
783EXPORT_SYMBOL(param_get_long);
784EXPORT_SYMBOL(param_set_ulong);
785EXPORT_SYMBOL(param_get_ulong);
786EXPORT_SYMBOL(param_set_charp);
787EXPORT_SYMBOL(param_get_charp);
788EXPORT_SYMBOL(param_set_bool);
789EXPORT_SYMBOL(param_get_bool);
790EXPORT_SYMBOL(param_set_invbool);
791EXPORT_SYMBOL(param_get_invbool);
792EXPORT_SYMBOL(param_array_set);
793EXPORT_SYMBOL(param_array_get);
794EXPORT_SYMBOL(param_set_copystring);
795EXPORT_SYMBOL(param_get_string);
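Taken together, the params.c changes replace the per-parameter set/get function pointers with a struct kernel_param_ops (set, get and an optional free hook), serialize every handler under param_lock, track kmalloc'ed charp values so destroy_params() can release them, and move the NULL-value check into parse_one(). A rough sketch of how a module-side parameter might plug into the new ops scheme is below; module_param_cb() and the variable names are assumptions for illustration, while struct kernel_param_ops, param_set_uint() and param_get_uint() are taken from this diff.

/*
 * Sketch: a custom writable parameter built on the new kernel_param_ops.
 * module_param_cb() is assumed to be the registration helper that goes
 * with this rework; the clamp logic is purely illustrative.
 */
#include <linux/kernel.h>
#include <linux/moduleparam.h>

static unsigned int poll_ms = 100;

static int poll_ms_set(const char *val, const struct kernel_param *kp)
{
	int ret = param_set_uint(val, kp);	/* stock uint parser */

	if (!ret && *(unsigned int *)kp->arg > 10000)
		*(unsigned int *)kp->arg = 10000;	/* clamp to 10s */
	return ret;
}

static struct kernel_param_ops poll_ms_ops = {
	.set = poll_ms_set,
	.get = param_get_uint,	/* standard getter exported above */
};
module_param_cb(poll_ms, &poll_ms_ops, &poll_ms, 0644);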
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d1552d3c12b..b98bed3d8182 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/hash.h>
19#include <linux/sysfs.h> 20#include <linux/sysfs.h>
20#include <linux/dcache.h> 21#include <linux/dcache.h>
21#include <linux/percpu.h> 22#include <linux/percpu.h>
@@ -82,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
82void __weak hw_perf_disable(void) { barrier(); } 83void __weak hw_perf_disable(void) { barrier(); }
83void __weak hw_perf_enable(void) { barrier(); } 84void __weak hw_perf_enable(void) { barrier(); }
84 85
85int __weak
86hw_perf_group_sched_in(struct perf_event *group_leader,
87 struct perf_cpu_context *cpuctx,
88 struct perf_event_context *ctx)
89{
90 return 0;
91}
92
93void __weak perf_event_print_debug(void) { } 86void __weak perf_event_print_debug(void) { }
94 87
95static DEFINE_PER_CPU(int, perf_disable_count); 88static DEFINE_PER_CPU(int, perf_disable_count);
@@ -221,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
221 214
222static inline u64 perf_clock(void) 215static inline u64 perf_clock(void)
223{ 216{
224 return cpu_clock(raw_smp_processor_id()); 217 return local_clock();
225} 218}
226 219
227/* 220/*
@@ -262,6 +255,18 @@ static void update_event_times(struct perf_event *event)
262 event->total_time_running = run_end - event->tstamp_running; 255 event->total_time_running = run_end - event->tstamp_running;
263} 256}
264 257
258/*
259 * Update total_time_enabled and total_time_running for all events in a group.
260 */
261static void update_group_times(struct perf_event *leader)
262{
263 struct perf_event *event;
264
265 update_event_times(leader);
266 list_for_each_entry(event, &leader->sibling_list, group_entry)
267 update_event_times(event);
268}
269
265static struct list_head * 270static struct list_head *
266ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) 271ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
267{ 272{
@@ -278,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
278static void 283static void
279list_add_event(struct perf_event *event, struct perf_event_context *ctx) 284list_add_event(struct perf_event *event, struct perf_event_context *ctx)
280{ 285{
281 struct perf_event *group_leader = event->group_leader; 286 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
287 event->attach_state |= PERF_ATTACH_CONTEXT;
282 288
283 /* 289 /*
284 * Depending on whether it is a standalone or sibling event, 290 * If we're a stand alone event or group leader, we go to the context
285 * add it straight to the context's event list, or to the group 291 * list, group events are kept attached to the group so that
286 * leader's sibling list: 292 * perf_group_detach can, at all times, locate all siblings.
287 */ 293 */
288 if (group_leader == event) { 294 if (event->group_leader == event) {
289 struct list_head *list; 295 struct list_head *list;
290 296
291 if (is_software_event(event)) 297 if (is_software_event(event))
@@ -293,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
293 299
294 list = ctx_group_list(event, ctx); 300 list = ctx_group_list(event, ctx);
295 list_add_tail(&event->group_entry, list); 301 list_add_tail(&event->group_entry, list);
296 } else {
297 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
298 !is_software_event(event))
299 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
300
301 list_add_tail(&event->group_entry, &group_leader->sibling_list);
302 group_leader->nr_siblings++;
303 } 302 }
304 303
305 list_add_rcu(&event->event_entry, &ctx->event_list); 304 list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -308,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
308 ctx->nr_stat++; 307 ctx->nr_stat++;
309} 308}
310 309
310static void perf_group_attach(struct perf_event *event)
311{
312 struct perf_event *group_leader = event->group_leader;
313
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
315 event->attach_state |= PERF_ATTACH_GROUP;
316
317 if (group_leader == event)
318 return;
319
320 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
321 !is_software_event(event))
322 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
323
324 list_add_tail(&event->group_entry, &group_leader->sibling_list);
325 group_leader->nr_siblings++;
326}
327
311/* 328/*
312 * Remove a event from the lists for its context. 329 * Remove a event from the lists for its context.
313 * Must be called with ctx->mutex and ctx->lock held. 330 * Must be called with ctx->mutex and ctx->lock held.
@@ -315,21 +332,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
315static void 332static void
316list_del_event(struct perf_event *event, struct perf_event_context *ctx) 333list_del_event(struct perf_event *event, struct perf_event_context *ctx)
317{ 334{
318 struct perf_event *sibling, *tmp; 335 /*
319 336 * We can have double detach due to exit/hot-unplug + close.
320 if (list_empty(&event->group_entry)) 337 */
338 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
321 return; 339 return;
340
341 event->attach_state &= ~PERF_ATTACH_CONTEXT;
342
322 ctx->nr_events--; 343 ctx->nr_events--;
323 if (event->attr.inherit_stat) 344 if (event->attr.inherit_stat)
324 ctx->nr_stat--; 345 ctx->nr_stat--;
325 346
326 list_del_init(&event->group_entry);
327 list_del_rcu(&event->event_entry); 347 list_del_rcu(&event->event_entry);
328 348
329 if (event->group_leader != event) 349 if (event->group_leader == event)
330 event->group_leader->nr_siblings--; 350 list_del_init(&event->group_entry);
331 351
332 update_event_times(event); 352 update_group_times(event);
333 353
334 /* 354 /*
335 * If event was in error state, then keep it 355 * If event was in error state, then keep it
@@ -340,17 +360,41 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
340 */ 360 */
341 if (event->state > PERF_EVENT_STATE_OFF) 361 if (event->state > PERF_EVENT_STATE_OFF)
342 event->state = PERF_EVENT_STATE_OFF; 362 event->state = PERF_EVENT_STATE_OFF;
363}
364
365static void perf_group_detach(struct perf_event *event)
366{
367 struct perf_event *sibling, *tmp;
368 struct list_head *list = NULL;
369
370 /*
371 * We can have double detach due to exit/hot-unplug + close.
372 */
373 if (!(event->attach_state & PERF_ATTACH_GROUP))
374 return;
375
376 event->attach_state &= ~PERF_ATTACH_GROUP;
377
378 /*
379 * If this is a sibling, remove it from its group.
380 */
381 if (event->group_leader != event) {
382 list_del_init(&event->group_entry);
383 event->group_leader->nr_siblings--;
384 return;
385 }
386
387 if (!list_empty(&event->group_entry))
388 list = &event->group_entry;
343 389
344 /* 390 /*
345 * If this was a group event with sibling events then 391 * If this was a group event with sibling events then
346 * upgrade the siblings to singleton events by adding them 392 * upgrade the siblings to singleton events by adding them
347 * to the context list directly: 393 * to whatever list we are on.
348 */ 394 */
349 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 395 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
350 struct list_head *list; 396 if (list)
351 397 list_move_tail(&sibling->group_entry, list);
352 list = ctx_group_list(event, ctx);
353 list_move_tail(&sibling->group_entry, list);
354 sibling->group_leader = sibling; 398 sibling->group_leader = sibling;
355 399
356 /* Inherit group flags from the previous leader */ 400 /* Inherit group flags from the previous leader */
@@ -358,11 +402,31 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
358 } 402 }
359} 403}
360 404
405static inline int
406event_filter_match(struct perf_event *event)
407{
408 return event->cpu == -1 || event->cpu == smp_processor_id();
409}
410
361static void 411static void
362event_sched_out(struct perf_event *event, 412event_sched_out(struct perf_event *event,
363 struct perf_cpu_context *cpuctx, 413 struct perf_cpu_context *cpuctx,
364 struct perf_event_context *ctx) 414 struct perf_event_context *ctx)
365{ 415{
416 u64 delta;
417 /*
418 * An event which could not be activated because of
419 * filter mismatch still needs to have its timings
420 * maintained, otherwise bogus information is return
421 * via read() for time_enabled, time_running:
422 */
423 if (event->state == PERF_EVENT_STATE_INACTIVE
424 && !event_filter_match(event)) {
425 delta = ctx->time - event->tstamp_stopped;
426 event->tstamp_running += delta;
427 event->tstamp_stopped = ctx->time;
428 }
429
366 if (event->state != PERF_EVENT_STATE_ACTIVE) 430 if (event->state != PERF_EVENT_STATE_ACTIVE)
367 return; 431 return;
368 432
@@ -388,9 +452,7 @@ group_sched_out(struct perf_event *group_event,
388 struct perf_event_context *ctx) 452 struct perf_event_context *ctx)
389{ 453{
390 struct perf_event *event; 454 struct perf_event *event;
391 455 int state = group_event->state;
392 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
393 return;
394 456
395 event_sched_out(group_event, cpuctx, ctx); 457 event_sched_out(group_event, cpuctx, ctx);
396 458
@@ -400,7 +462,7 @@ group_sched_out(struct perf_event *group_event,
400 list_for_each_entry(event, &group_event->sibling_list, group_entry) 462 list_for_each_entry(event, &group_event->sibling_list, group_entry)
401 event_sched_out(event, cpuctx, ctx); 463 event_sched_out(event, cpuctx, ctx);
402 464
403 if (group_event->attr.exclusive) 465 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
404 cpuctx->exclusive = 0; 466 cpuctx->exclusive = 0;
405} 467}
406 468
@@ -505,18 +567,6 @@ retry:
505} 567}
506 568
507/* 569/*
508 * Update total_time_enabled and total_time_running for all events in a group.
509 */
510static void update_group_times(struct perf_event *leader)
511{
512 struct perf_event *event;
513
514 update_event_times(leader);
515 list_for_each_entry(event, &leader->sibling_list, group_entry)
516 update_event_times(event);
517}
518
519/*
520 * Cross CPU call to disable a performance event 570 * Cross CPU call to disable a performance event
521 */ 571 */
522static void __perf_event_disable(void *info) 572static void __perf_event_disable(void *info)
@@ -640,18 +690,25 @@ group_sched_in(struct perf_event *group_event,
640 struct perf_cpu_context *cpuctx, 690 struct perf_cpu_context *cpuctx,
641 struct perf_event_context *ctx) 691 struct perf_event_context *ctx)
642{ 692{
643 struct perf_event *event, *partial_group; 693 struct perf_event *event, *partial_group = NULL;
644 int ret; 694 const struct pmu *pmu = group_event->pmu;
695 bool txn = false;
645 696
646 if (group_event->state == PERF_EVENT_STATE_OFF) 697 if (group_event->state == PERF_EVENT_STATE_OFF)
647 return 0; 698 return 0;
648 699
649 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); 700 /* Check if group transaction availabe */
650 if (ret) 701 if (pmu->start_txn)
651 return ret < 0 ? ret : 0; 702 txn = true;
652 703
653 if (event_sched_in(group_event, cpuctx, ctx)) 704 if (txn)
705 pmu->start_txn(pmu);
706
707 if (event_sched_in(group_event, cpuctx, ctx)) {
708 if (txn)
709 pmu->cancel_txn(pmu);
654 return -EAGAIN; 710 return -EAGAIN;
711 }
655 712
656 /* 713 /*
657 * Schedule in siblings as one group (if any): 714 * Schedule in siblings as one group (if any):
@@ -663,7 +720,8 @@ group_sched_in(struct perf_event *group_event,
663 } 720 }
664 } 721 }
665 722
666 return 0; 723 if (!txn || !pmu->commit_txn(pmu))
724 return 0;
667 725
668group_error: 726group_error:
669 /* 727 /*
@@ -677,6 +735,9 @@ group_error:
677 } 735 }
678 event_sched_out(group_event, cpuctx, ctx); 736 event_sched_out(group_event, cpuctx, ctx);
679 737
738 if (txn)
739 pmu->cancel_txn(pmu);
740
680 return -EAGAIN; 741 return -EAGAIN;
681} 742}
682 743
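With the weak hw_perf_group_sched_in() gone, group_sched_in() now brackets group scheduling with the PMU's optional transaction hooks: start_txn() opens the transaction, commit_txn() must return 0 if the whole group fits on the hardware, and cancel_txn() rolls back on the -EAGAIN paths. A sketch of the shape a PMU backend would supply is below; the mypmu_* names are hypothetical, and the prototypes simply mirror the way the hooks are invoked above.

/*
 * Sketch of the transaction callbacks a PMU backend may provide; they
 * would be wired into the backend's struct pmu as .start_txn,
 * .commit_txn and .cancel_txn. Names here are illustrative.
 */
#include <linux/perf_event.h>

static void mypmu_start_txn(const struct pmu *pmu)
{
	/* Defer per-event hardware constraint checks until commit. */
}

static void mypmu_cancel_txn(const struct pmu *pmu)
{
	/* Drop whatever was queued since start_txn(). */
}

static int mypmu_commit_txn(const struct pmu *pmu)
{
	/*
	 * Verify the queued group can be scheduled as a whole; a
	 * non-zero return makes group_sched_in() fall into group_error
	 * and cancel the transaction.
	 */
	return 0;
}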
@@ -715,6 +776,7 @@ static void add_event_to_ctx(struct perf_event *event,
715 struct perf_event_context *ctx) 776 struct perf_event_context *ctx)
716{ 777{
717 list_add_event(event, ctx); 778 list_add_event(event, ctx);
779 perf_group_attach(event);
718 event->tstamp_enabled = ctx->time; 780 event->tstamp_enabled = ctx->time;
719 event->tstamp_running = ctx->time; 781 event->tstamp_running = ctx->time;
720 event->tstamp_stopped = ctx->time; 782 event->tstamp_stopped = ctx->time;
@@ -1104,9 +1166,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
1104 * In order to keep per-task stats reliable we need to flip the event 1166 * In order to keep per-task stats reliable we need to flip the event
1105 * values when we flip the contexts. 1167 * values when we flip the contexts.
1106 */ 1168 */
1107 value = atomic64_read(&next_event->count); 1169 value = local64_read(&next_event->count);
1108 value = atomic64_xchg(&event->count, value); 1170 value = local64_xchg(&event->count, value);
1109 atomic64_set(&next_event->count, value); 1171 local64_set(&next_event->count, value);
1110 1172
1111 swap(event->total_time_enabled, next_event->total_time_enabled); 1173 swap(event->total_time_enabled, next_event->total_time_enabled);
1112 swap(event->total_time_running, next_event->total_time_running); 1174 swap(event->total_time_running, next_event->total_time_running);
@@ -1367,6 +1429,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1367 if (cpuctx->task_ctx == ctx) 1429 if (cpuctx->task_ctx == ctx)
1368 return; 1430 return;
1369 1431
1432 perf_disable();
1433
1370 /* 1434 /*
1371 * We want to keep the following priority order: 1435 * We want to keep the following priority order:
1372 * cpu pinned (that don't need to move), task pinned, 1436 * cpu pinned (that don't need to move), task pinned,
@@ -1379,6 +1443,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1379 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 1443 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1380 1444
1381 cpuctx->task_ctx = ctx; 1445 cpuctx->task_ctx = ctx;
1446
1447 perf_enable();
1382} 1448}
1383 1449
1384#define MAX_INTERRUPTS (~0ULL) 1450#define MAX_INTERRUPTS (~0ULL)
@@ -1452,6 +1518,9 @@ do { \
1452 divisor = nsec * frequency; 1518 divisor = nsec * frequency;
1453 } 1519 }
1454 1520
1521 if (!divisor)
1522 return dividend;
1523
1455 return div64_u64(dividend, divisor); 1524 return div64_u64(dividend, divisor);
1456} 1525}
1457 1526
@@ -1474,7 +1543,7 @@ static int perf_event_start(struct perf_event *event)
1474static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1543static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1475{ 1544{
1476 struct hw_perf_event *hwc = &event->hw; 1545 struct hw_perf_event *hwc = &event->hw;
1477 u64 period, sample_period; 1546 s64 period, sample_period;
1478 s64 delta; 1547 s64 delta;
1479 1548
1480 period = perf_calculate_period(event, nsec, count); 1549 period = perf_calculate_period(event, nsec, count);
@@ -1489,10 +1558,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1489 1558
1490 hwc->sample_period = sample_period; 1559 hwc->sample_period = sample_period;
1491 1560
1492 if (atomic64_read(&hwc->period_left) > 8*sample_period) { 1561 if (local64_read(&hwc->period_left) > 8*sample_period) {
1493 perf_disable(); 1562 perf_disable();
1494 perf_event_stop(event); 1563 perf_event_stop(event);
1495 atomic64_set(&hwc->period_left, 0); 1564 local64_set(&hwc->period_left, 0);
1496 perf_event_start(event); 1565 perf_event_start(event);
1497 perf_enable(); 1566 perf_enable();
1498 } 1567 }
@@ -1533,7 +1602,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1533 1602
1534 perf_disable(); 1603 perf_disable();
1535 event->pmu->read(event); 1604 event->pmu->read(event);
1536 now = atomic64_read(&event->count); 1605 now = local64_read(&event->count);
1537 delta = now - hwc->freq_count_stamp; 1606 delta = now - hwc->freq_count_stamp;
1538 hwc->freq_count_stamp = now; 1607 hwc->freq_count_stamp = now;
1539 1608
@@ -1685,6 +1754,11 @@ static void __perf_event_read(void *info)
1685 event->pmu->read(event); 1754 event->pmu->read(event);
1686} 1755}
1687 1756
1757static inline u64 perf_event_count(struct perf_event *event)
1758{
1759 return local64_read(&event->count) + atomic64_read(&event->child_count);
1760}
1761
1688static u64 perf_event_read(struct perf_event *event) 1762static u64 perf_event_read(struct perf_event *event)
1689{ 1763{
1690 /* 1764 /*
@@ -1704,7 +1778,7 @@ static u64 perf_event_read(struct perf_event *event)
1704 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1778 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1705 } 1779 }
1706 1780
1707 return atomic64_read(&event->count); 1781 return perf_event_count(event);
1708} 1782}
1709 1783
1710/* 1784/*
@@ -1825,6 +1899,7 @@ static void free_event_rcu(struct rcu_head *head)
1825} 1899}
1826 1900
1827static void perf_pending_sync(struct perf_event *event); 1901static void perf_pending_sync(struct perf_event *event);
1902static void perf_buffer_put(struct perf_buffer *buffer);
1828 1903
1829static void free_event(struct perf_event *event) 1904static void free_event(struct perf_event *event)
1830{ 1905{
@@ -1832,7 +1907,7 @@ static void free_event(struct perf_event *event)
1832 1907
1833 if (!event->parent) { 1908 if (!event->parent) {
1834 atomic_dec(&nr_events); 1909 atomic_dec(&nr_events);
1835 if (event->attr.mmap) 1910 if (event->attr.mmap || event->attr.mmap_data)
1836 atomic_dec(&nr_mmap_events); 1911 atomic_dec(&nr_mmap_events);
1837 if (event->attr.comm) 1912 if (event->attr.comm)
1838 atomic_dec(&nr_comm_events); 1913 atomic_dec(&nr_comm_events);
@@ -1840,9 +1915,9 @@ static void free_event(struct perf_event *event)
1840 atomic_dec(&nr_task_events); 1915 atomic_dec(&nr_task_events);
1841 } 1916 }
1842 1917
1843 if (event->output) { 1918 if (event->buffer) {
1844 fput(event->output->filp); 1919 perf_buffer_put(event->buffer);
1845 event->output = NULL; 1920 event->buffer = NULL;
1846 } 1921 }
1847 1922
1848 if (event->destroy) 1923 if (event->destroy)
@@ -1856,9 +1931,30 @@ int perf_event_release_kernel(struct perf_event *event)
1856{ 1931{
1857 struct perf_event_context *ctx = event->ctx; 1932 struct perf_event_context *ctx = event->ctx;
1858 1933
1934 /*
1935 * Remove from the PMU, can't get re-enabled since we got
1936 * here because the last ref went.
1937 */
1938 perf_event_disable(event);
1939
1859 WARN_ON_ONCE(ctx->parent_ctx); 1940 WARN_ON_ONCE(ctx->parent_ctx);
1860 mutex_lock(&ctx->mutex); 1941 /*
1861 perf_event_remove_from_context(event); 1942 * There are two ways this annotation is useful:
1943 *
1944 * 1) there is a lock recursion from perf_event_exit_task
1945 * see the comment there.
1946 *
1947 * 2) there is a lock-inversion with mmap_sem through
1948 * perf_event_read_group(), which takes faults while
1949 * holding ctx->mutex, however this is called after
1950 * the last filedesc died, so there is no possibility
1951 * to trigger the AB-BA case.
1952 */
1953 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1954 raw_spin_lock_irq(&ctx->lock);
1955 perf_group_detach(event);
1956 list_del_event(event, ctx);
1957 raw_spin_unlock_irq(&ctx->lock);
1862 mutex_unlock(&ctx->mutex); 1958 mutex_unlock(&ctx->mutex);
1863 1959
1864 mutex_lock(&event->owner->perf_event_mutex); 1960 mutex_lock(&event->owner->perf_event_mutex);
@@ -2046,13 +2142,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2046static unsigned int perf_poll(struct file *file, poll_table *wait) 2142static unsigned int perf_poll(struct file *file, poll_table *wait)
2047{ 2143{
2048 struct perf_event *event = file->private_data; 2144 struct perf_event *event = file->private_data;
2049 struct perf_mmap_data *data; 2145 struct perf_buffer *buffer;
2050 unsigned int events = POLL_HUP; 2146 unsigned int events = POLL_HUP;
2051 2147
2052 rcu_read_lock(); 2148 rcu_read_lock();
2053 data = rcu_dereference(event->data); 2149 buffer = rcu_dereference(event->buffer);
2054 if (data) 2150 if (buffer)
2055 events = atomic_xchg(&data->poll, 0); 2151 events = atomic_xchg(&buffer->poll, 0);
2056 rcu_read_unlock(); 2152 rcu_read_unlock();
2057 2153
2058 poll_wait(file, &event->waitq, wait); 2154 poll_wait(file, &event->waitq, wait);
@@ -2063,7 +2159,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
2063static void perf_event_reset(struct perf_event *event) 2159static void perf_event_reset(struct perf_event *event)
2064{ 2160{
2065 (void)perf_event_read(event); 2161 (void)perf_event_read(event);
2066 atomic64_set(&event->count, 0); 2162 local64_set(&event->count, 0);
2067 perf_event_update_userpage(event); 2163 perf_event_update_userpage(event);
2068} 2164}
2069 2165
@@ -2106,15 +2202,13 @@ static void perf_event_for_each(struct perf_event *event,
2106static int perf_event_period(struct perf_event *event, u64 __user *arg) 2202static int perf_event_period(struct perf_event *event, u64 __user *arg)
2107{ 2203{
2108 struct perf_event_context *ctx = event->ctx; 2204 struct perf_event_context *ctx = event->ctx;
2109 unsigned long size;
2110 int ret = 0; 2205 int ret = 0;
2111 u64 value; 2206 u64 value;
2112 2207
2113 if (!event->attr.sample_period) 2208 if (!event->attr.sample_period)
2114 return -EINVAL; 2209 return -EINVAL;
2115 2210
2116 size = copy_from_user(&value, arg, sizeof(value)); 2211 if (copy_from_user(&value, arg, sizeof(value)))
2117 if (size != sizeof(value))
2118 return -EFAULT; 2212 return -EFAULT;
2119 2213
2120 if (!value) 2214 if (!value)
@@ -2138,7 +2232,27 @@ unlock:
2138 return ret; 2232 return ret;
2139} 2233}
2140 2234
2141static int perf_event_set_output(struct perf_event *event, int output_fd); 2235static const struct file_operations perf_fops;
2236
2237static struct perf_event *perf_fget_light(int fd, int *fput_needed)
2238{
2239 struct file *file;
2240
2241 file = fget_light(fd, fput_needed);
2242 if (!file)
2243 return ERR_PTR(-EBADF);
2244
2245 if (file->f_op != &perf_fops) {
2246 fput_light(file, *fput_needed);
2247 *fput_needed = 0;
2248 return ERR_PTR(-EBADF);
2249 }
2250
2251 return file->private_data;
2252}
2253
2254static int perf_event_set_output(struct perf_event *event,
2255 struct perf_event *output_event);
2142static int perf_event_set_filter(struct perf_event *event, void __user *arg); 2256static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2143 2257
2144static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2258static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2165,7 +2279,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2165 return perf_event_period(event, (u64 __user *)arg); 2279 return perf_event_period(event, (u64 __user *)arg);
2166 2280
2167 case PERF_EVENT_IOC_SET_OUTPUT: 2281 case PERF_EVENT_IOC_SET_OUTPUT:
2168 return perf_event_set_output(event, arg); 2282 {
2283 struct perf_event *output_event = NULL;
2284 int fput_needed = 0;
2285 int ret;
2286
2287 if (arg != -1) {
2288 output_event = perf_fget_light(arg, &fput_needed);
2289 if (IS_ERR(output_event))
2290 return PTR_ERR(output_event);
2291 }
2292
2293 ret = perf_event_set_output(event, output_event);
2294 if (output_event)
2295 fput_light(output_event->filp, fput_needed);
2296
2297 return ret;
2298 }
2169 2299
2170 case PERF_EVENT_IOC_SET_FILTER: 2300 case PERF_EVENT_IOC_SET_FILTER:
2171 return perf_event_set_filter(event, (void __user *)arg); 2301 return perf_event_set_filter(event, (void __user *)arg);
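PERF_EVENT_IOC_SET_OUTPUT now resolves its file-descriptor argument through perf_fget_light(), rejecting descriptors that are not perf events, and treats -1 as a request to clear the redirection. From user space the call keeps its existing form; a small sketch follows, with the wrapper names being illustrative only.

/*
 * Sketch (user space): route event_fd's samples into the ring buffer
 * already mmap()ed for target_fd, and undo the redirection with -1.
 */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int redirect_output(int event_fd, int target_fd)
{
	return ioctl(event_fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
}

static int clear_output(int event_fd)
{
	return ioctl(event_fd, PERF_EVENT_IOC_SET_OUTPUT, -1);
}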
@@ -2226,14 +2356,14 @@ static int perf_event_index(struct perf_event *event)
2226void perf_event_update_userpage(struct perf_event *event) 2356void perf_event_update_userpage(struct perf_event *event)
2227{ 2357{
2228 struct perf_event_mmap_page *userpg; 2358 struct perf_event_mmap_page *userpg;
2229 struct perf_mmap_data *data; 2359 struct perf_buffer *buffer;
2230 2360
2231 rcu_read_lock(); 2361 rcu_read_lock();
2232 data = rcu_dereference(event->data); 2362 buffer = rcu_dereference(event->buffer);
2233 if (!data) 2363 if (!buffer)
2234 goto unlock; 2364 goto unlock;
2235 2365
2236 userpg = data->user_page; 2366 userpg = buffer->user_page;
2237 2367
2238 /* 2368 /*
2239 * Disable preemption so as to not let the corresponding user-space 2369 * Disable preemption so as to not let the corresponding user-space
@@ -2243,9 +2373,9 @@ void perf_event_update_userpage(struct perf_event *event)
2243 ++userpg->lock; 2373 ++userpg->lock;
2244 barrier(); 2374 barrier();
2245 userpg->index = perf_event_index(event); 2375 userpg->index = perf_event_index(event);
2246 userpg->offset = atomic64_read(&event->count); 2376 userpg->offset = perf_event_count(event);
2247 if (event->state == PERF_EVENT_STATE_ACTIVE) 2377 if (event->state == PERF_EVENT_STATE_ACTIVE)
2248 userpg->offset -= atomic64_read(&event->hw.prev_count); 2378 userpg->offset -= local64_read(&event->hw.prev_count);
2249 2379
2250 userpg->time_enabled = event->total_time_enabled + 2380 userpg->time_enabled = event->total_time_enabled +
2251 atomic64_read(&event->child_total_time_enabled); 2381 atomic64_read(&event->child_total_time_enabled);
@@ -2260,9 +2390,23 @@ unlock:
2260 rcu_read_unlock(); 2390 rcu_read_unlock();
2261} 2391}
2262 2392
2263static unsigned long perf_data_size(struct perf_mmap_data *data) 2393static unsigned long perf_data_size(struct perf_buffer *buffer);
2394
2395static void
2396perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
2264{ 2397{
2265 return data->nr_pages << (PAGE_SHIFT + data->data_order); 2398 long max_size = perf_data_size(buffer);
2399
2400 if (watermark)
2401 buffer->watermark = min(max_size, watermark);
2402
2403 if (!buffer->watermark)
2404 buffer->watermark = max_size / 2;
2405
2406 if (flags & PERF_BUFFER_WRITABLE)
2407 buffer->writable = 1;
2408
2409 atomic_set(&buffer->refcount, 1);
2266} 2410}
2267 2411
2268#ifndef CONFIG_PERF_USE_VMALLOC 2412#ifndef CONFIG_PERF_USE_VMALLOC
@@ -2272,56 +2416,68 @@ static unsigned long perf_data_size(struct perf_mmap_data *data)
2272 */ 2416 */
2273 2417
2274static struct page * 2418static struct page *
2275perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2419perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2276{ 2420{
2277 if (pgoff > data->nr_pages) 2421 if (pgoff > buffer->nr_pages)
2278 return NULL; 2422 return NULL;
2279 2423
2280 if (pgoff == 0) 2424 if (pgoff == 0)
2281 return virt_to_page(data->user_page); 2425 return virt_to_page(buffer->user_page);
2426
2427 return virt_to_page(buffer->data_pages[pgoff - 1]);
2428}
2429
2430static void *perf_mmap_alloc_page(int cpu)
2431{
2432 struct page *page;
2433 int node;
2434
2435 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
2436 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2437 if (!page)
2438 return NULL;
2282 2439
2283 return virt_to_page(data->data_pages[pgoff - 1]); 2440 return page_address(page);
2284} 2441}
2285 2442
2286static struct perf_mmap_data * 2443static struct perf_buffer *
2287perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2444perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2288{ 2445{
2289 struct perf_mmap_data *data; 2446 struct perf_buffer *buffer;
2290 unsigned long size; 2447 unsigned long size;
2291 int i; 2448 int i;
2292 2449
2293 WARN_ON(atomic_read(&event->mmap_count)); 2450 size = sizeof(struct perf_buffer);
2294
2295 size = sizeof(struct perf_mmap_data);
2296 size += nr_pages * sizeof(void *); 2451 size += nr_pages * sizeof(void *);
2297 2452
2298 data = kzalloc(size, GFP_KERNEL); 2453 buffer = kzalloc(size, GFP_KERNEL);
2299 if (!data) 2454 if (!buffer)
2300 goto fail; 2455 goto fail;
2301 2456
2302 data->user_page = (void *)get_zeroed_page(GFP_KERNEL); 2457 buffer->user_page = perf_mmap_alloc_page(cpu);
2303 if (!data->user_page) 2458 if (!buffer->user_page)
2304 goto fail_user_page; 2459 goto fail_user_page;
2305 2460
2306 for (i = 0; i < nr_pages; i++) { 2461 for (i = 0; i < nr_pages; i++) {
2307 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); 2462 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2308 if (!data->data_pages[i]) 2463 if (!buffer->data_pages[i])
2309 goto fail_data_pages; 2464 goto fail_data_pages;
2310 } 2465 }
2311 2466
2312 data->data_order = 0; 2467 buffer->nr_pages = nr_pages;
2313 data->nr_pages = nr_pages;
2314 2468
2315 return data; 2469 perf_buffer_init(buffer, watermark, flags);
2470
2471 return buffer;
2316 2472
2317fail_data_pages: 2473fail_data_pages:
2318 for (i--; i >= 0; i--) 2474 for (i--; i >= 0; i--)
2319 free_page((unsigned long)data->data_pages[i]); 2475 free_page((unsigned long)buffer->data_pages[i]);
2320 2476
2321 free_page((unsigned long)data->user_page); 2477 free_page((unsigned long)buffer->user_page);
2322 2478
2323fail_user_page: 2479fail_user_page:
2324 kfree(data); 2480 kfree(buffer);
2325 2481
2326fail: 2482fail:
2327 return NULL; 2483 return NULL;
@@ -2335,14 +2491,19 @@ static void perf_mmap_free_page(unsigned long addr)
2335 __free_page(page); 2491 __free_page(page);
2336} 2492}
2337 2493
2338static void perf_mmap_data_free(struct perf_mmap_data *data) 2494static void perf_buffer_free(struct perf_buffer *buffer)
2339{ 2495{
2340 int i; 2496 int i;
2341 2497
2342 perf_mmap_free_page((unsigned long)data->user_page); 2498 perf_mmap_free_page((unsigned long)buffer->user_page);
2343 for (i = 0; i < data->nr_pages; i++) 2499 for (i = 0; i < buffer->nr_pages; i++)
2344 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2500 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
2345 kfree(data); 2501 kfree(buffer);
2502}
2503
2504static inline int page_order(struct perf_buffer *buffer)
2505{
2506 return 0;
2346} 2507}
2347 2508
2348#else 2509#else
@@ -2353,13 +2514,18 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2353 * Required for architectures that have d-cache aliasing issues. 2514 * Required for architectures that have d-cache aliasing issues.
2354 */ 2515 */
2355 2516
2517static inline int page_order(struct perf_buffer *buffer)
2518{
2519 return buffer->page_order;
2520}
2521
2356static struct page * 2522static struct page *
2357perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2523perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2358{ 2524{
2359 if (pgoff > (1UL << data->data_order)) 2525 if (pgoff > (1UL << page_order(buffer)))
2360 return NULL; 2526 return NULL;
2361 2527
2362 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2528 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2363} 2529}
2364 2530
2365static void perf_mmap_unmark_page(void *addr) 2531static void perf_mmap_unmark_page(void *addr)
@@ -2369,59 +2535,59 @@ static void perf_mmap_unmark_page(void *addr)
2369 page->mapping = NULL; 2535 page->mapping = NULL;
2370} 2536}
2371 2537
2372static void perf_mmap_data_free_work(struct work_struct *work) 2538static void perf_buffer_free_work(struct work_struct *work)
2373{ 2539{
2374 struct perf_mmap_data *data; 2540 struct perf_buffer *buffer;
2375 void *base; 2541 void *base;
2376 int i, nr; 2542 int i, nr;
2377 2543
2378 data = container_of(work, struct perf_mmap_data, work); 2544 buffer = container_of(work, struct perf_buffer, work);
2379 nr = 1 << data->data_order; 2545 nr = 1 << page_order(buffer);
2380 2546
2381 base = data->user_page; 2547 base = buffer->user_page;
2382 for (i = 0; i < nr + 1; i++) 2548 for (i = 0; i < nr + 1; i++)
2383 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2549 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2384 2550
2385 vfree(base); 2551 vfree(base);
2386 kfree(data); 2552 kfree(buffer);
2387} 2553}
2388 2554
2389static void perf_mmap_data_free(struct perf_mmap_data *data) 2555static void perf_buffer_free(struct perf_buffer *buffer)
2390{ 2556{
2391 schedule_work(&data->work); 2557 schedule_work(&buffer->work);
2392} 2558}
2393 2559
2394static struct perf_mmap_data * 2560static struct perf_buffer *
2395perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2561perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2396{ 2562{
2397 struct perf_mmap_data *data; 2563 struct perf_buffer *buffer;
2398 unsigned long size; 2564 unsigned long size;
2399 void *all_buf; 2565 void *all_buf;
2400 2566
2401 WARN_ON(atomic_read(&event->mmap_count)); 2567 size = sizeof(struct perf_buffer);
2402
2403 size = sizeof(struct perf_mmap_data);
2404 size += sizeof(void *); 2568 size += sizeof(void *);
2405 2569
2406 data = kzalloc(size, GFP_KERNEL); 2570 buffer = kzalloc(size, GFP_KERNEL);
2407 if (!data) 2571 if (!buffer)
2408 goto fail; 2572 goto fail;
2409 2573
2410 INIT_WORK(&data->work, perf_mmap_data_free_work); 2574 INIT_WORK(&buffer->work, perf_buffer_free_work);
2411 2575
2412 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 2576 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2413 if (!all_buf) 2577 if (!all_buf)
2414 goto fail_all_buf; 2578 goto fail_all_buf;
2415 2579
2416 data->user_page = all_buf; 2580 buffer->user_page = all_buf;
2417 data->data_pages[0] = all_buf + PAGE_SIZE; 2581 buffer->data_pages[0] = all_buf + PAGE_SIZE;
2418 data->data_order = ilog2(nr_pages); 2582 buffer->page_order = ilog2(nr_pages);
2419 data->nr_pages = 1; 2583 buffer->nr_pages = 1;
2584
2585 perf_buffer_init(buffer, watermark, flags);
2420 2586
2421 return data; 2587 return buffer;
2422 2588
2423fail_all_buf: 2589fail_all_buf:
2424 kfree(data); 2590 kfree(buffer);
2425 2591
2426fail: 2592fail:
2427 return NULL; 2593 return NULL;
@@ -2429,10 +2595,15 @@ fail:
2429 2595
2430#endif 2596#endif
2431 2597
2598static unsigned long perf_data_size(struct perf_buffer *buffer)
2599{
2600 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2601}
2602
2432static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2603static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2433{ 2604{
2434 struct perf_event *event = vma->vm_file->private_data; 2605 struct perf_event *event = vma->vm_file->private_data;
2435 struct perf_mmap_data *data; 2606 struct perf_buffer *buffer;
2436 int ret = VM_FAULT_SIGBUS; 2607 int ret = VM_FAULT_SIGBUS;
2437 2608
2438 if (vmf->flags & FAULT_FLAG_MKWRITE) { 2609 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -2442,14 +2613,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2442 } 2613 }
2443 2614
2444 rcu_read_lock(); 2615 rcu_read_lock();
2445 data = rcu_dereference(event->data); 2616 buffer = rcu_dereference(event->buffer);
2446 if (!data) 2617 if (!buffer)
2447 goto unlock; 2618 goto unlock;
2448 2619
2449 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 2620 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2450 goto unlock; 2621 goto unlock;
2451 2622
2452 vmf->page = perf_mmap_to_page(data, vmf->pgoff); 2623 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
2453 if (!vmf->page) 2624 if (!vmf->page)
2454 goto unlock; 2625 goto unlock;
2455 2626
@@ -2464,41 +2635,35 @@ unlock:
2464 return ret; 2635 return ret;
2465} 2636}
2466 2637
2467static void 2638static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
2468perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2469{ 2639{
2470 long max_size = perf_data_size(data); 2640 struct perf_buffer *buffer;
2471 2641
2472 atomic_set(&data->lock, -1); 2642 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
2473 2643 perf_buffer_free(buffer);
2474 if (event->attr.watermark) {
2475 data->watermark = min_t(long, max_size,
2476 event->attr.wakeup_watermark);
2477 }
2478
2479 if (!data->watermark)
2480 data->watermark = max_size / 2;
2481
2482
2483 rcu_assign_pointer(event->data, data);
2484} 2644}
2485 2645
2486static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) 2646static struct perf_buffer *perf_buffer_get(struct perf_event *event)
2487{ 2647{
2488 struct perf_mmap_data *data; 2648 struct perf_buffer *buffer;
2489 2649
2490 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2650 rcu_read_lock();
2491 perf_mmap_data_free(data); 2651 buffer = rcu_dereference(event->buffer);
2652 if (buffer) {
2653 if (!atomic_inc_not_zero(&buffer->refcount))
2654 buffer = NULL;
2655 }
2656 rcu_read_unlock();
2657
2658 return buffer;
2492} 2659}
2493 2660
2494static void perf_mmap_data_release(struct perf_event *event) 2661static void perf_buffer_put(struct perf_buffer *buffer)
2495{ 2662{
2496 struct perf_mmap_data *data = event->data; 2663 if (!atomic_dec_and_test(&buffer->refcount))
2497 2664 return;
2498 WARN_ON(atomic_read(&event->mmap_count));
2499 2665
2500 rcu_assign_pointer(event->data, NULL); 2666 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
2501 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2502} 2667}
2503 2668
2504static void perf_mmap_open(struct vm_area_struct *vma) 2669static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2512,15 +2677,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2512{ 2677{
2513 struct perf_event *event = vma->vm_file->private_data; 2678 struct perf_event *event = vma->vm_file->private_data;
2514 2679
2515 WARN_ON_ONCE(event->ctx->parent_ctx);
2516 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2680 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2517 unsigned long size = perf_data_size(event->data); 2681 unsigned long size = perf_data_size(event->buffer);
2518 struct user_struct *user = current_user(); 2682 struct user_struct *user = event->mmap_user;
2683 struct perf_buffer *buffer = event->buffer;
2519 2684
2520 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2685 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2521 vma->vm_mm->locked_vm -= event->data->nr_locked; 2686 vma->vm_mm->locked_vm -= event->mmap_locked;
2522 perf_mmap_data_release(event); 2687 rcu_assign_pointer(event->buffer, NULL);
2523 mutex_unlock(&event->mmap_mutex); 2688 mutex_unlock(&event->mmap_mutex);
2689
2690 perf_buffer_put(buffer);
2691 free_uid(user);
2524 } 2692 }
2525} 2693}
2526 2694
@@ -2537,11 +2705,19 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2537 unsigned long user_locked, user_lock_limit; 2705 unsigned long user_locked, user_lock_limit;
2538 struct user_struct *user = current_user(); 2706 struct user_struct *user = current_user();
2539 unsigned long locked, lock_limit; 2707 unsigned long locked, lock_limit;
2540 struct perf_mmap_data *data; 2708 struct perf_buffer *buffer;
2541 unsigned long vma_size; 2709 unsigned long vma_size;
2542 unsigned long nr_pages; 2710 unsigned long nr_pages;
2543 long user_extra, extra; 2711 long user_extra, extra;
2544 int ret = 0; 2712 int ret = 0, flags = 0;
2713
2714 /*
2715 * Don't allow mmap() of inherited per-task counters. This would
2716 * create a performance issue due to all children writing to the
2717 * same buffer.
2718 */
2719 if (event->cpu == -1 && event->attr.inherit)
2720 return -EINVAL;
2545 2721
2546 if (!(vma->vm_flags & VM_SHARED)) 2722 if (!(vma->vm_flags & VM_SHARED))
2547 return -EINVAL; 2723 return -EINVAL;
@@ -2550,7 +2726,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2550 nr_pages = (vma_size / PAGE_SIZE) - 1; 2726 nr_pages = (vma_size / PAGE_SIZE) - 1;
2551 2727
2552 /* 2728 /*
2553 * If we have data pages ensure they're a power-of-two number, so we 2729 * If we have buffer pages ensure they're a power-of-two number, so we
2554 * can do bitmasks instead of modulo. 2730 * can do bitmasks instead of modulo.
2555 */ 2731 */
2556 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 2732 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -2564,13 +2740,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2564 2740
2565 WARN_ON_ONCE(event->ctx->parent_ctx); 2741 WARN_ON_ONCE(event->ctx->parent_ctx);
2566 mutex_lock(&event->mmap_mutex); 2742 mutex_lock(&event->mmap_mutex);
2567 if (event->output) { 2743 if (event->buffer) {
2568 ret = -EINVAL; 2744 if (event->buffer->nr_pages == nr_pages)
2569 goto unlock; 2745 atomic_inc(&event->buffer->refcount);
2570 } 2746 else
2571
2572 if (atomic_inc_not_zero(&event->mmap_count)) {
2573 if (nr_pages != event->data->nr_pages)
2574 ret = -EINVAL; 2747 ret = -EINVAL;
2575 goto unlock; 2748 goto unlock;
2576 } 2749 }
@@ -2599,24 +2772,27 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2599 goto unlock; 2772 goto unlock;
2600 } 2773 }
2601 2774
2602 WARN_ON(event->data); 2775 WARN_ON(event->buffer);
2603 2776
2604 data = perf_mmap_data_alloc(event, nr_pages); 2777 if (vma->vm_flags & VM_WRITE)
2605 ret = -ENOMEM; 2778 flags |= PERF_BUFFER_WRITABLE;
2606 if (!data)
2607 goto unlock;
2608 2779
2609 ret = 0; 2780 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
2610 perf_mmap_data_init(event, data); 2781 event->cpu, flags);
2782 if (!buffer) {
2783 ret = -ENOMEM;
2784 goto unlock;
2785 }
2786 rcu_assign_pointer(event->buffer, buffer);
2611 2787
2612 atomic_set(&event->mmap_count, 1);
2613 atomic_long_add(user_extra, &user->locked_vm); 2788 atomic_long_add(user_extra, &user->locked_vm);
2614 vma->vm_mm->locked_vm += extra; 2789 event->mmap_locked = extra;
2615 event->data->nr_locked = extra; 2790 event->mmap_user = get_current_user();
2616 if (vma->vm_flags & VM_WRITE) 2791 vma->vm_mm->locked_vm += event->mmap_locked;
2617 event->data->writable = 1;
2618 2792
2619unlock: 2793unlock:
2794 if (!ret)
2795 atomic_inc(&event->mmap_count);
2620 mutex_unlock(&event->mmap_mutex); 2796 mutex_unlock(&event->mmap_mutex);
2621 2797
2622 vma->vm_flags |= VM_RESERVED; 2798 vma->vm_flags |= VM_RESERVED;
@@ -2642,6 +2818,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
2642} 2818}
2643 2819
2644static const struct file_operations perf_fops = { 2820static const struct file_operations perf_fops = {
2821 .llseek = no_llseek,
2645 .release = perf_release, 2822 .release = perf_release,
2646 .read = perf_read, 2823 .read = perf_read,
2647 .poll = perf_poll, 2824 .poll = perf_poll,
@@ -2785,24 +2962,40 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2785 return NULL; 2962 return NULL;
2786} 2963}
2787 2964
2788__weak 2965
2789void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) 2966/*
2967 * We assume there is only KVM supporting the callbacks.
2968 * Later on, we might change it to a list if there is
2969 * another virtualization implementation supporting the callbacks.
2970 */
2971struct perf_guest_info_callbacks *perf_guest_cbs;
2972
2973int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2790{ 2974{
2975 perf_guest_cbs = cbs;
2976 return 0;
2791} 2977}
2978EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
2792 2979
2980int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2981{
2982 perf_guest_cbs = NULL;
2983 return 0;
2984}
2985EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2793 2986
2794/* 2987/*
2795 * Output 2988 * Output
2796 */ 2989 */
2797static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 2990static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
2798 unsigned long offset, unsigned long head) 2991 unsigned long offset, unsigned long head)
2799{ 2992{
2800 unsigned long mask; 2993 unsigned long mask;
2801 2994
2802 if (!data->writable) 2995 if (!buffer->writable)
2803 return true; 2996 return true;
2804 2997
2805 mask = perf_data_size(data) - 1; 2998 mask = perf_data_size(buffer) - 1;
2806 2999
2807 offset = (offset - tail) & mask; 3000 offset = (offset - tail) & mask;
2808 head = (head - tail) & mask; 3001 head = (head - tail) & mask;
@@ -2815,7 +3008,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2815 3008
2816static void perf_output_wakeup(struct perf_output_handle *handle) 3009static void perf_output_wakeup(struct perf_output_handle *handle)
2817{ 3010{
2818 atomic_set(&handle->data->poll, POLL_IN); 3011 atomic_set(&handle->buffer->poll, POLL_IN);
2819 3012
2820 if (handle->nmi) { 3013 if (handle->nmi) {
2821 handle->event->pending_wakeup = 1; 3014 handle->event->pending_wakeup = 1;
@@ -2826,128 +3019,88 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2826} 3019}
2827 3020
2828/* 3021/*
2829 * Curious locking construct.
2830 *
2831 * We need to ensure a later event_id doesn't publish a head when a former 3022 * We need to ensure a later event_id doesn't publish a head when a former
2832 * event_id isn't done writing. However, since we need to deal with NMIs we 3023 * event isn't done writing. However, since we need to deal with NMIs we
2833 * cannot fully serialize things. 3024 * cannot fully serialize things.
2834 * 3025 *
2835 * What we do is serialize between CPUs so we only have to deal with NMI
2836 * nesting on a single CPU.
2837 *
2838 * We only publish the head (and generate a wakeup) when the outer-most 3026 * We only publish the head (and generate a wakeup) when the outer-most
2839 * event_id completes. 3027 * event completes.
2840 */ 3028 */
2841static void perf_output_lock(struct perf_output_handle *handle) 3029static void perf_output_get_handle(struct perf_output_handle *handle)
2842{ 3030{
2843 struct perf_mmap_data *data = handle->data; 3031 struct perf_buffer *buffer = handle->buffer;
2844 int cur, cpu = get_cpu();
2845
2846 handle->locked = 0;
2847
2848 for (;;) {
2849 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2850 if (cur == -1) {
2851 handle->locked = 1;
2852 break;
2853 }
2854 if (cur == cpu)
2855 break;
2856 3032
2857 cpu_relax(); 3033 preempt_disable();
2858 } 3034 local_inc(&buffer->nest);
3035 handle->wakeup = local_read(&buffer->wakeup);
2859} 3036}
2860 3037
2861static void perf_output_unlock(struct perf_output_handle *handle) 3038static void perf_output_put_handle(struct perf_output_handle *handle)
2862{ 3039{
2863 struct perf_mmap_data *data = handle->data; 3040 struct perf_buffer *buffer = handle->buffer;
2864 unsigned long head; 3041 unsigned long head;
2865 int cpu;
2866
2867 data->done_head = data->head;
2868
2869 if (!handle->locked)
2870 goto out;
2871 3042
2872again: 3043again:
2873 /* 3044 head = local_read(&buffer->head);
2874 * The xchg implies a full barrier that ensures all writes are done
2875 * before we publish the new head, matched by a rmb() in userspace when
2876 * reading this position.
2877 */
2878 while ((head = atomic_long_xchg(&data->done_head, 0)))
2879 data->user_page->data_head = head;
2880 3045
2881 /* 3046 /*
2882 * NMI can happen here, which means we can miss a done_head update. 3047 * IRQ/NMI can happen here, which means we can miss a head update.
2883 */ 3048 */
2884 3049
2885 cpu = atomic_xchg(&data->lock, -1); 3050 if (!local_dec_and_test(&buffer->nest))
2886 WARN_ON_ONCE(cpu != smp_processor_id()); 3051 goto out;
2887 3052
2888 /* 3053 /*
2889 * Therefore we have to validate we did not indeed do so. 3054 * Publish the known good head. Rely on the full barrier implied
3055 * by atomic_dec_and_test() to order the buffer->head read and this
3056 * write.
2890 */ 3057 */
2891 if (unlikely(atomic_long_read(&data->done_head))) { 3058 buffer->user_page->data_head = head;
2892 /*
2893 * Since we had it locked, we can lock it again.
2894 */
2895 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2896 cpu_relax();
2897 3059
3060 /*
3061 * Now check if we missed an update; rely on the (compiler)
3062 * barrier in atomic_dec_and_test() to re-read buffer->head.
3063 */
3064 if (unlikely(head != local_read(&buffer->head))) {
3065 local_inc(&buffer->nest);
2898 goto again; 3066 goto again;
2899 } 3067 }
2900 3068
2901 if (atomic_xchg(&data->wakeup, 0)) 3069 if (handle->wakeup != local_read(&buffer->wakeup))
2902 perf_output_wakeup(handle); 3070 perf_output_wakeup(handle);
2903out: 3071
2904 put_cpu(); 3072 out:
3073 preempt_enable();
2905} 3074}
2906 3075
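
As an aside on the new scheme above: a minimal userspace sketch of the nest-counter publication done by perf_output_get_handle()/perf_output_put_handle(), assuming a single CPU where writers can nest (NMI-like) but never run concurrently. The ring/ring_write names are invented for the sketch, not kernel API.

#include <stdio.h>

struct ring {
	unsigned long head;       /* private write cursor                 */
	unsigned long user_head;  /* position published to the reader     */
	int nest;                 /* nesting depth of in-flight writers   */
};

static void ring_write(struct ring *r, unsigned long bytes)
{
	unsigned long head;

	__atomic_add_fetch(&r->nest, 1, __ATOMIC_ACQUIRE);
	r->head += bytes;               /* reserve space; data copy elided */
again:
	head = r->head;
	if (__atomic_sub_fetch(&r->nest, 1, __ATOMIC_RELEASE))
		return;                 /* an outer writer will publish     */

	/* outermost writer: publish the last head we observed */
	__atomic_store_n(&r->user_head, head, __ATOMIC_RELEASE);

	/* a nested writer may have advanced head after we read it */
	if (head != __atomic_load_n(&r->head, __ATOMIC_ACQUIRE)) {
		__atomic_add_fetch(&r->nest, 1, __ATOMIC_ACQUIRE);
		goto again;
	}
}

int main(void)
{
	struct ring r = { 0, 0, 0 };

	ring_write(&r, 16);
	ring_write(&r, 32);
	printf("published head = %lu\n", r.user_head);   /* 48 */
	return 0;
}

The point of the design is that nested writers never publish; only the outermost writer does, and the re-check of head catches updates made by writers that nested in after it took its snapshot.
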
2907void perf_output_copy(struct perf_output_handle *handle, 3076__always_inline void perf_output_copy(struct perf_output_handle *handle,
2908 const void *buf, unsigned int len) 3077 const void *buf, unsigned int len)
2909{ 3078{
2910 unsigned int pages_mask;
2911 unsigned long offset;
2912 unsigned int size;
2913 void **pages;
2914
2915 offset = handle->offset;
2916 pages_mask = handle->data->nr_pages - 1;
2917 pages = handle->data->data_pages;
2918
2919 do { 3079 do {
2920 unsigned long page_offset; 3080 unsigned long size = min_t(unsigned long, handle->size, len);
2921 unsigned long page_size;
2922 int nr;
2923 3081
2924 nr = (offset >> PAGE_SHIFT) & pages_mask; 3082 memcpy(handle->addr, buf, size);
2925 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2926 page_offset = offset & (page_size - 1);
2927 size = min_t(unsigned int, page_size - page_offset, len);
2928 3083
2929 memcpy(pages[nr] + page_offset, buf, size); 3084 len -= size;
3085 handle->addr += size;
3086 buf += size;
3087 handle->size -= size;
3088 if (!handle->size) {
3089 struct perf_buffer *buffer = handle->buffer;
2930 3090
2931 len -= size; 3091 handle->page++;
2932 buf += size; 3092 handle->page &= buffer->nr_pages - 1;
2933 offset += size; 3093 handle->addr = buffer->data_pages[handle->page];
3094 handle->size = PAGE_SIZE << page_order(buffer);
3095 }
2934 } while (len); 3096 } while (len);
2935
2936 handle->offset = offset;
2937
2938 /*
2939 * Check we didn't copy past our reservation window, taking the
2940 * possible unsigned int wrap into account.
2941 */
2942 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2943} 3097}
2944 3098
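
The rewritten perf_output_copy() above walks the sampled bytes across the buffer's data pages, wrapping the page index with a mask because nr_pages is a power of two. A rough userspace equivalent of that chunking loop follows; the struct fields, out_copy name, and PAGE_SZ value are assumptions of the sketch, not the kernel structures.

#include <stdlib.h>
#include <string.h>

#define PAGE_SZ 4096

struct out_handle {
	void **pages;            /* nr_pages separate data pages          */
	unsigned int nr_pages;   /* power of two, so wrap with a mask     */
	unsigned int page;       /* index of the page being written       */
	char *addr;              /* write cursor inside the current page  */
	size_t room;             /* bytes left in the current page        */
};

static void out_copy(struct out_handle *h, const void *buf, size_t len)
{
	while (len) {
		size_t chunk = len < h->room ? len : h->room;

		memcpy(h->addr, buf, chunk);
		buf = (const char *)buf + chunk;
		len -= chunk;
		h->addr += chunk;
		h->room -= chunk;

		if (!h->room) {      /* page full: step to the next page */
			h->page = (h->page + 1) & (h->nr_pages - 1);
			h->addr = h->pages[h->page];
			h->room = PAGE_SZ;
		}
	}
}

int main(void)
{
	void *pages[4];
	struct out_handle h = { pages, 4, 0, NULL, PAGE_SZ };
	char record[6000] = { 0 };

	for (int i = 0; i < 4; i++)
		pages[i] = calloc(1, PAGE_SZ);
	h.addr = pages[0];

	out_copy(&h, record, sizeof(record));   /* spans two pages */
	return 0;
}
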
2945int perf_output_begin(struct perf_output_handle *handle, 3099int perf_output_begin(struct perf_output_handle *handle,
2946 struct perf_event *event, unsigned int size, 3100 struct perf_event *event, unsigned int size,
2947 int nmi, int sample) 3101 int nmi, int sample)
2948{ 3102{
2949 struct perf_event *output_event; 3103 struct perf_buffer *buffer;
2950 struct perf_mmap_data *data;
2951 unsigned long tail, offset, head; 3104 unsigned long tail, offset, head;
2952 int have_lost; 3105 int have_lost;
2953 struct { 3106 struct {
@@ -2963,27 +3116,23 @@ int perf_output_begin(struct perf_output_handle *handle,
2963 if (event->parent) 3116 if (event->parent)
2964 event = event->parent; 3117 event = event->parent;
2965 3118
2966 output_event = rcu_dereference(event->output); 3119 buffer = rcu_dereference(event->buffer);
2967 if (output_event) 3120 if (!buffer)
2968 event = output_event;
2969
2970 data = rcu_dereference(event->data);
2971 if (!data)
2972 goto out; 3121 goto out;
2973 3122
2974 handle->data = data; 3123 handle->buffer = buffer;
2975 handle->event = event; 3124 handle->event = event;
2976 handle->nmi = nmi; 3125 handle->nmi = nmi;
2977 handle->sample = sample; 3126 handle->sample = sample;
2978 3127
2979 if (!data->nr_pages) 3128 if (!buffer->nr_pages)
2980 goto fail; 3129 goto out;
2981 3130
2982 have_lost = atomic_read(&data->lost); 3131 have_lost = local_read(&buffer->lost);
2983 if (have_lost) 3132 if (have_lost)
2984 size += sizeof(lost_event); 3133 size += sizeof(lost_event);
2985 3134
2986 perf_output_lock(handle); 3135 perf_output_get_handle(handle);
2987 3136
2988 do { 3137 do {
2989 /* 3138 /*
@@ -2991,26 +3140,30 @@ int perf_output_begin(struct perf_output_handle *handle,
2991 * tail pointer. So that all reads will be completed before the 3140 * tail pointer. So that all reads will be completed before the
2992 * write is issued. 3141 * write is issued.
2993 */ 3142 */
2994 tail = ACCESS_ONCE(data->user_page->data_tail); 3143 tail = ACCESS_ONCE(buffer->user_page->data_tail);
2995 smp_rmb(); 3144 smp_rmb();
2996 offset = head = atomic_long_read(&data->head); 3145 offset = head = local_read(&buffer->head);
2997 head += size; 3146 head += size;
2998 if (unlikely(!perf_output_space(data, tail, offset, head))) 3147 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
2999 goto fail; 3148 goto fail;
3000 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 3149 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
3001 3150
3002 handle->offset = offset; 3151 if (head - local_read(&buffer->wakeup) > buffer->watermark)
3003 handle->head = head; 3152 local_add(buffer->watermark, &buffer->wakeup);
3004 3153
3005 if (head - tail > data->watermark) 3154 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
3006 atomic_set(&data->wakeup, 1); 3155 handle->page &= buffer->nr_pages - 1;
3156 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
3157 handle->addr = buffer->data_pages[handle->page];
3158 handle->addr += handle->size;
3159 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3007 3160
3008 if (have_lost) { 3161 if (have_lost) {
3009 lost_event.header.type = PERF_RECORD_LOST; 3162 lost_event.header.type = PERF_RECORD_LOST;
3010 lost_event.header.misc = 0; 3163 lost_event.header.misc = 0;
3011 lost_event.header.size = sizeof(lost_event); 3164 lost_event.header.size = sizeof(lost_event);
3012 lost_event.id = event->id; 3165 lost_event.id = event->id;
3013 lost_event.lost = atomic_xchg(&data->lost, 0); 3166 lost_event.lost = local_xchg(&buffer->lost, 0);
3014 3167
3015 perf_output_put(handle, lost_event); 3168 perf_output_put(handle, lost_event);
3016 } 3169 }
@@ -3018,8 +3171,8 @@ int perf_output_begin(struct perf_output_handle *handle,
3018 return 0; 3171 return 0;
3019 3172
3020fail: 3173fail:
3021 atomic_inc(&data->lost); 3174 local_inc(&buffer->lost);
3022 perf_output_unlock(handle); 3175 perf_output_put_handle(handle);
3023out: 3176out:
3024 rcu_read_unlock(); 3177 rcu_read_unlock();
3025 3178
@@ -3029,19 +3182,19 @@ out:
3029void perf_output_end(struct perf_output_handle *handle) 3182void perf_output_end(struct perf_output_handle *handle)
3030{ 3183{
3031 struct perf_event *event = handle->event; 3184 struct perf_event *event = handle->event;
3032 struct perf_mmap_data *data = handle->data; 3185 struct perf_buffer *buffer = handle->buffer;
3033 3186
3034 int wakeup_events = event->attr.wakeup_events; 3187 int wakeup_events = event->attr.wakeup_events;
3035 3188
3036 if (handle->sample && wakeup_events) { 3189 if (handle->sample && wakeup_events) {
3037 int events = atomic_inc_return(&data->events); 3190 int events = local_inc_return(&buffer->events);
3038 if (events >= wakeup_events) { 3191 if (events >= wakeup_events) {
3039 atomic_sub(wakeup_events, &data->events); 3192 local_sub(wakeup_events, &buffer->events);
3040 atomic_set(&data->wakeup, 1); 3193 local_inc(&buffer->wakeup);
3041 } 3194 }
3042 } 3195 }
3043 3196
3044 perf_output_unlock(handle); 3197 perf_output_put_handle(handle);
3045 rcu_read_unlock(); 3198 rcu_read_unlock();
3046} 3199}
3047 3200
@@ -3074,7 +3227,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3074 u64 values[4]; 3227 u64 values[4];
3075 int n = 0; 3228 int n = 0;
3076 3229
3077 values[n++] = atomic64_read(&event->count); 3230 values[n++] = perf_event_count(event);
3078 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3231 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3079 values[n++] = event->total_time_enabled + 3232 values[n++] = event->total_time_enabled +
3080 atomic64_read(&event->child_total_time_enabled); 3233 atomic64_read(&event->child_total_time_enabled);
@@ -3111,7 +3264,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3111 if (leader != event) 3264 if (leader != event)
3112 leader->pmu->read(leader); 3265 leader->pmu->read(leader);
3113 3266
3114 values[n++] = atomic64_read(&leader->count); 3267 values[n++] = perf_event_count(leader);
3115 if (read_format & PERF_FORMAT_ID) 3268 if (read_format & PERF_FORMAT_ID)
3116 values[n++] = primary_event_id(leader); 3269 values[n++] = primary_event_id(leader);
3117 3270
@@ -3123,7 +3276,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3123 if (sub != event) 3276 if (sub != event)
3124 sub->pmu->read(sub); 3277 sub->pmu->read(sub);
3125 3278
3126 values[n++] = atomic64_read(&sub->count); 3279 values[n++] = perf_event_count(sub);
3127 if (read_format & PERF_FORMAT_ID) 3280 if (read_format & PERF_FORMAT_ID)
3128 values[n++] = primary_event_id(sub); 3281 values[n++] = primary_event_id(sub);
3129 3282
@@ -3354,7 +3507,7 @@ perf_event_read_event(struct perf_event *event,
3354/* 3507/*
3355 * task tracking -- fork/exit 3508 * task tracking -- fork/exit
3356 * 3509 *
3357 * enabled by: attr.comm | attr.mmap | attr.task 3510 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
3358 */ 3511 */
3359 3512
3360struct perf_task_event { 3513struct perf_task_event {
@@ -3377,22 +3530,13 @@ static void perf_event_task_output(struct perf_event *event,
3377{ 3530{
3378 struct perf_output_handle handle; 3531 struct perf_output_handle handle;
3379 struct task_struct *task = task_event->task; 3532 struct task_struct *task = task_event->task;
3380 unsigned long flags;
3381 int size, ret; 3533 int size, ret;
3382 3534
3383 /*
3384 * If this CPU attempts to acquire an rq lock held by a CPU spinning
3385 * in perf_output_lock() from interrupt context, it's game over.
3386 */
3387 local_irq_save(flags);
3388
3389 size = task_event->event_id.header.size; 3535 size = task_event->event_id.header.size;
3390 ret = perf_output_begin(&handle, event, size, 0, 0); 3536 ret = perf_output_begin(&handle, event, size, 0, 0);
3391 3537
3392 if (ret) { 3538 if (ret)
3393 local_irq_restore(flags);
3394 return; 3539 return;
3395 }
3396 3540
3397 task_event->event_id.pid = perf_event_pid(event, task); 3541 task_event->event_id.pid = perf_event_pid(event, task);
3398 task_event->event_id.ppid = perf_event_pid(event, current); 3542 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3403,7 +3547,6 @@ static void perf_event_task_output(struct perf_event *event,
3403 perf_output_put(&handle, task_event->event_id); 3547 perf_output_put(&handle, task_event->event_id);
3404 3548
3405 perf_output_end(&handle); 3549 perf_output_end(&handle);
3406 local_irq_restore(flags);
3407} 3550}
3408 3551
3409static int perf_event_task_match(struct perf_event *event) 3552static int perf_event_task_match(struct perf_event *event)
@@ -3414,7 +3557,8 @@ static int perf_event_task_match(struct perf_event *event)
3414 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3557 if (event->cpu != -1 && event->cpu != smp_processor_id())
3415 return 0; 3558 return 0;
3416 3559
3417 if (event->attr.comm || event->attr.mmap || event->attr.task) 3560 if (event->attr.comm || event->attr.mmap ||
3561 event->attr.mmap_data || event->attr.task)
3418 return 1; 3562 return 1;
3419 3563
3420 return 0; 3564 return 0;
@@ -3639,7 +3783,8 @@ static void perf_event_mmap_output(struct perf_event *event,
3639} 3783}
3640 3784
3641static int perf_event_mmap_match(struct perf_event *event, 3785static int perf_event_mmap_match(struct perf_event *event,
3642 struct perf_mmap_event *mmap_event) 3786 struct perf_mmap_event *mmap_event,
3787 int executable)
3643{ 3788{
3644 if (event->state < PERF_EVENT_STATE_INACTIVE) 3789 if (event->state < PERF_EVENT_STATE_INACTIVE)
3645 return 0; 3790 return 0;
@@ -3647,19 +3792,21 @@ static int perf_event_mmap_match(struct perf_event *event,
3647 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3792 if (event->cpu != -1 && event->cpu != smp_processor_id())
3648 return 0; 3793 return 0;
3649 3794
3650 if (event->attr.mmap) 3795 if ((!executable && event->attr.mmap_data) ||
3796 (executable && event->attr.mmap))
3651 return 1; 3797 return 1;
3652 3798
3653 return 0; 3799 return 0;
3654} 3800}
3655 3801
3656static void perf_event_mmap_ctx(struct perf_event_context *ctx, 3802static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3657 struct perf_mmap_event *mmap_event) 3803 struct perf_mmap_event *mmap_event,
3804 int executable)
3658{ 3805{
3659 struct perf_event *event; 3806 struct perf_event *event;
3660 3807
3661 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3808 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3662 if (perf_event_mmap_match(event, mmap_event)) 3809 if (perf_event_mmap_match(event, mmap_event, executable))
3663 perf_event_mmap_output(event, mmap_event); 3810 perf_event_mmap_output(event, mmap_event);
3664 } 3811 }
3665} 3812}
@@ -3703,6 +3850,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3703 if (!vma->vm_mm) { 3850 if (!vma->vm_mm) {
3704 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3851 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3705 goto got_name; 3852 goto got_name;
3853 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
3854 vma->vm_end >= vma->vm_mm->brk) {
3855 name = strncpy(tmp, "[heap]", sizeof(tmp));
3856 goto got_name;
3857 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
3858 vma->vm_end >= vma->vm_mm->start_stack) {
3859 name = strncpy(tmp, "[stack]", sizeof(tmp));
3860 goto got_name;
3706 } 3861 }
3707 3862
3708 name = strncpy(tmp, "//anon", sizeof(tmp)); 3863 name = strncpy(tmp, "//anon", sizeof(tmp));
@@ -3719,17 +3874,17 @@ got_name:
3719 3874
3720 rcu_read_lock(); 3875 rcu_read_lock();
3721 cpuctx = &get_cpu_var(perf_cpu_context); 3876 cpuctx = &get_cpu_var(perf_cpu_context);
3722 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3877 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
3723 ctx = rcu_dereference(current->perf_event_ctxp); 3878 ctx = rcu_dereference(current->perf_event_ctxp);
3724 if (ctx) 3879 if (ctx)
3725 perf_event_mmap_ctx(ctx, mmap_event); 3880 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
3726 put_cpu_var(perf_cpu_context); 3881 put_cpu_var(perf_cpu_context);
3727 rcu_read_unlock(); 3882 rcu_read_unlock();
3728 3883
3729 kfree(buf); 3884 kfree(buf);
3730} 3885}
3731 3886
3732void __perf_event_mmap(struct vm_area_struct *vma) 3887void perf_event_mmap(struct vm_area_struct *vma)
3733{ 3888{
3734 struct perf_mmap_event mmap_event; 3889 struct perf_mmap_event mmap_event;
3735 3890
@@ -3743,7 +3898,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3743 .event_id = { 3898 .event_id = {
3744 .header = { 3899 .header = {
3745 .type = PERF_RECORD_MMAP, 3900 .type = PERF_RECORD_MMAP,
3746 .misc = 0, 3901 .misc = PERF_RECORD_MISC_USER,
3747 /* .size */ 3902 /* .size */
3748 }, 3903 },
3749 /* .pid */ 3904 /* .pid */
@@ -3891,14 +4046,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
3891 hwc->last_period = hwc->sample_period; 4046 hwc->last_period = hwc->sample_period;
3892 4047
3893again: 4048again:
3894 old = val = atomic64_read(&hwc->period_left); 4049 old = val = local64_read(&hwc->period_left);
3895 if (val < 0) 4050 if (val < 0)
3896 return 0; 4051 return 0;
3897 4052
3898 nr = div64_u64(period + val, period); 4053 nr = div64_u64(period + val, period);
3899 offset = nr * period; 4054 offset = nr * period;
3900 val -= offset; 4055 val -= offset;
3901 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) 4056 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
3902 goto again; 4057 goto again;
3903 4058
3904 return nr; 4059 return nr;
@@ -3931,20 +4086,13 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3931 } 4086 }
3932} 4087}
3933 4088
3934static void perf_swevent_unthrottle(struct perf_event *event)
3935{
3936 /*
3937 * Nothing to do, we already reset hwc->interrupts.
3938 */
3939}
3940
3941static void perf_swevent_add(struct perf_event *event, u64 nr, 4089static void perf_swevent_add(struct perf_event *event, u64 nr,
3942 int nmi, struct perf_sample_data *data, 4090 int nmi, struct perf_sample_data *data,
3943 struct pt_regs *regs) 4091 struct pt_regs *regs)
3944{ 4092{
3945 struct hw_perf_event *hwc = &event->hw; 4093 struct hw_perf_event *hwc = &event->hw;
3946 4094
3947 atomic64_add(nr, &event->count); 4095 local64_add(nr, &event->count);
3948 4096
3949 if (!regs) 4097 if (!regs)
3950 return; 4098 return;
@@ -3955,45 +4103,12 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3955 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4103 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3956 return perf_swevent_overflow(event, 1, nmi, data, regs); 4104 return perf_swevent_overflow(event, 1, nmi, data, regs);
3957 4105
3958 if (atomic64_add_negative(nr, &hwc->period_left)) 4106 if (local64_add_negative(nr, &hwc->period_left))
3959 return; 4107 return;
3960 4108
3961 perf_swevent_overflow(event, 0, nmi, data, regs); 4109 perf_swevent_overflow(event, 0, nmi, data, regs);
3962} 4110}
3963 4111
3964static int perf_swevent_is_counting(struct perf_event *event)
3965{
3966 /*
3967 * The event is active, we're good!
3968 */
3969 if (event->state == PERF_EVENT_STATE_ACTIVE)
3970 return 1;
3971
3972 /*
3973 * The event is off/error, not counting.
3974 */
3975 if (event->state != PERF_EVENT_STATE_INACTIVE)
3976 return 0;
3977
3978 /*
3979 * The event is inactive, if the context is active
3980 * we're part of a group that didn't make it on the 'pmu',
3981 * not counting.
3982 */
3983 if (event->ctx->is_active)
3984 return 0;
3985
3986 /*
3987 * We're inactive and the context is too, this means the
3988 * task is scheduled out, we're counting events that happen
3989 * to us, like migration events.
3990 */
3991 return 1;
3992}
3993
3994static int perf_tp_event_match(struct perf_event *event,
3995 struct perf_sample_data *data);
3996
3997static int perf_exclude_event(struct perf_event *event, 4112static int perf_exclude_event(struct perf_event *event,
3998 struct pt_regs *regs) 4113 struct pt_regs *regs)
3999{ 4114{
@@ -4014,12 +4129,6 @@ static int perf_swevent_match(struct perf_event *event,
4014 struct perf_sample_data *data, 4129 struct perf_sample_data *data,
4015 struct pt_regs *regs) 4130 struct pt_regs *regs)
4016{ 4131{
4017 if (event->cpu != -1 && event->cpu != smp_processor_id())
4018 return 0;
4019
4020 if (!perf_swevent_is_counting(event))
4021 return 0;
4022
4023 if (event->attr.type != type) 4132 if (event->attr.type != type)
4024 return 0; 4133 return 0;
4025 4134
@@ -4029,30 +4138,88 @@ static int perf_swevent_match(struct perf_event *event,
4029 if (perf_exclude_event(event, regs)) 4138 if (perf_exclude_event(event, regs))
4030 return 0; 4139 return 0;
4031 4140
4032 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
4033 !perf_tp_event_match(event, data))
4034 return 0;
4035
4036 return 1; 4141 return 1;
4037} 4142}
4038 4143
4039static void perf_swevent_ctx_event(struct perf_event_context *ctx, 4144static inline u64 swevent_hash(u64 type, u32 event_id)
4040 enum perf_type_id type, 4145{
4041 u32 event_id, u64 nr, int nmi, 4146 u64 val = event_id | (type << 32);
4042 struct perf_sample_data *data, 4147
4043 struct pt_regs *regs) 4148 return hash_64(val, SWEVENT_HLIST_BITS);
4149}
4150
4151static inline struct hlist_head *
4152__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4044{ 4153{
4154 u64 hash = swevent_hash(type, event_id);
4155
4156 return &hlist->heads[hash];
4157}
4158
4159/* For the read side: events when they trigger */
4160static inline struct hlist_head *
4161find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4162{
4163 struct swevent_hlist *hlist;
4164
4165 hlist = rcu_dereference(ctx->swevent_hlist);
4166 if (!hlist)
4167 return NULL;
4168
4169 return __find_swevent_head(hlist, type, event_id);
4170}
4171
4172/* For the event head insertion and removal in the hlist */
4173static inline struct hlist_head *
4174find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4175{
4176 struct swevent_hlist *hlist;
4177 u32 event_id = event->attr.config;
4178 u64 type = event->attr.type;
4179
4180 /*
4181 * Event scheduling is always serialized against hlist allocation
4182 * and release, which makes the protected version suitable here.
4183 * The context lock guarantees that.
4184 */
4185 hlist = rcu_dereference_protected(ctx->swevent_hlist,
4186 lockdep_is_held(&event->ctx->lock));
4187 if (!hlist)
4188 return NULL;
4189
4190 return __find_swevent_head(hlist, type, event_id);
4191}
4192
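
For context, the bucket selection in swevent_hash()/__find_swevent_head() above packs the event type and config into one 64-bit key and hashes it down to a small bucket index. A rough standalone illustration is below; the table size and multiplier are assumptions standing in for SWEVENT_HLIST_BITS and the kernel's hash_64().

#include <stdint.h>
#include <stdio.h>

#define HLIST_BITS 8   /* assumed table size: 256 buckets */

static unsigned int swevent_bucket(uint64_t type, uint32_t event_id)
{
	uint64_t key = (uint64_t)event_id | (type << 32);

	/* multiplicative hash with a golden-ratio-style constant */
	return (unsigned int)((key * 0x61C8864680B583EBull) >> (64 - HLIST_BITS));
}

int main(void)
{
	/* e.g. a software event: type 1, config 2 */
	printf("bucket = %u\n", swevent_bucket(1, 2));
	return 0;
}
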
4193static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4194 u64 nr, int nmi,
4195 struct perf_sample_data *data,
4196 struct pt_regs *regs)
4197{
4198 struct perf_cpu_context *cpuctx;
4045 struct perf_event *event; 4199 struct perf_event *event;
4200 struct hlist_node *node;
4201 struct hlist_head *head;
4046 4202
4047 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4203 cpuctx = &__get_cpu_var(perf_cpu_context);
4204
4205 rcu_read_lock();
4206
4207 head = find_swevent_head_rcu(cpuctx, type, event_id);
4208
4209 if (!head)
4210 goto end;
4211
4212 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4048 if (perf_swevent_match(event, type, event_id, data, regs)) 4213 if (perf_swevent_match(event, type, event_id, data, regs))
4049 perf_swevent_add(event, nr, nmi, data, regs); 4214 perf_swevent_add(event, nr, nmi, data, regs);
4050 } 4215 }
4216end:
4217 rcu_read_unlock();
4051} 4218}
4052 4219
4053int perf_swevent_get_recursion_context(void) 4220int perf_swevent_get_recursion_context(void)
4054{ 4221{
4055 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 4222 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4056 int rctx; 4223 int rctx;
4057 4224
4058 if (in_nmi()) 4225 if (in_nmi())
@@ -4064,10 +4231,8 @@ int perf_swevent_get_recursion_context(void)
4064 else 4231 else
4065 rctx = 0; 4232 rctx = 0;
4066 4233
4067 if (cpuctx->recursion[rctx]) { 4234 if (cpuctx->recursion[rctx])
4068 put_cpu_var(perf_cpu_context);
4069 return -1; 4235 return -1;
4070 }
4071 4236
4072 cpuctx->recursion[rctx]++; 4237 cpuctx->recursion[rctx]++;
4073 barrier(); 4238 barrier();
@@ -4076,35 +4241,11 @@ int perf_swevent_get_recursion_context(void)
4076} 4241}
4077EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4242EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4078 4243
4079void perf_swevent_put_recursion_context(int rctx) 4244void inline perf_swevent_put_recursion_context(int rctx)
4080{ 4245{
4081 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4246 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4082 barrier(); 4247 barrier();
4083 cpuctx->recursion[rctx]--; 4248 cpuctx->recursion[rctx]--;
4084 put_cpu_var(perf_cpu_context);
4085}
4086EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4087
4088static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4089 u64 nr, int nmi,
4090 struct perf_sample_data *data,
4091 struct pt_regs *regs)
4092{
4093 struct perf_cpu_context *cpuctx;
4094 struct perf_event_context *ctx;
4095
4096 cpuctx = &__get_cpu_var(perf_cpu_context);
4097 rcu_read_lock();
4098 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4099 nr, nmi, data, regs);
4100 /*
4101 * doesn't really matter which of the child contexts the
4102 * events ends up in.
4103 */
4104 ctx = rcu_dereference(current->perf_event_ctxp);
4105 if (ctx)
4106 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4107 rcu_read_unlock();
4108} 4249}
4109 4250
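
The recursion helpers above (perf_swevent_get_recursion_context()/perf_swevent_put_recursion_context()) keep one counter per interrupt level, so a software event fired from inside another software event at the same level is dropped rather than recursing. A toy model of that guard; the levels and storage are simplified, and the kernel keys the level off in_nmi()/in_irq()/in_softirq() with per-CPU counters.

#include <stdio.h>

enum ctx_level { CTX_TASK, CTX_SOFTIRQ, CTX_IRQ, CTX_NMI, CTX_MAX };

static int recursion[CTX_MAX];   /* per CPU in the kernel; global here */

static int get_recursion_context(enum ctx_level level)
{
	if (recursion[level])
		return -1;           /* already inside a swevent at this level */
	recursion[level]++;
	return level;
}

static void put_recursion_context(int rctx)
{
	recursion[rctx]--;
}

int main(void)
{
	int rctx = get_recursion_context(CTX_TASK);

	printf("first  = %d\n", rctx);                             /* 0: acquired */
	printf("nested = %d\n", get_recursion_context(CTX_TASK));  /* -1: refused */
	put_recursion_context(rctx);
	return 0;
}
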
4110void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4251void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4113,6 +4254,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4113 struct perf_sample_data data; 4254 struct perf_sample_data data;
4114 int rctx; 4255 int rctx;
4115 4256
4257 preempt_disable_notrace();
4116 rctx = perf_swevent_get_recursion_context(); 4258 rctx = perf_swevent_get_recursion_context();
4117 if (rctx < 0) 4259 if (rctx < 0)
4118 return; 4260 return;
@@ -4122,6 +4264,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4122 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4264 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4123 4265
4124 perf_swevent_put_recursion_context(rctx); 4266 perf_swevent_put_recursion_context(rctx);
4267 preempt_enable_notrace();
4125} 4268}
4126 4269
4127static void perf_swevent_read(struct perf_event *event) 4270static void perf_swevent_read(struct perf_event *event)
@@ -4131,23 +4274,46 @@ static void perf_swevent_read(struct perf_event *event)
4131static int perf_swevent_enable(struct perf_event *event) 4274static int perf_swevent_enable(struct perf_event *event)
4132{ 4275{
4133 struct hw_perf_event *hwc = &event->hw; 4276 struct hw_perf_event *hwc = &event->hw;
4277 struct perf_cpu_context *cpuctx;
4278 struct hlist_head *head;
4279
4280 cpuctx = &__get_cpu_var(perf_cpu_context);
4134 4281
4135 if (hwc->sample_period) { 4282 if (hwc->sample_period) {
4136 hwc->last_period = hwc->sample_period; 4283 hwc->last_period = hwc->sample_period;
4137 perf_swevent_set_period(event); 4284 perf_swevent_set_period(event);
4138 } 4285 }
4286
4287 head = find_swevent_head(cpuctx, event);
4288 if (WARN_ON_ONCE(!head))
4289 return -EINVAL;
4290
4291 hlist_add_head_rcu(&event->hlist_entry, head);
4292
4139 return 0; 4293 return 0;
4140} 4294}
4141 4295
4142static void perf_swevent_disable(struct perf_event *event) 4296static void perf_swevent_disable(struct perf_event *event)
4143{ 4297{
4298 hlist_del_rcu(&event->hlist_entry);
4299}
4300
4301static void perf_swevent_void(struct perf_event *event)
4302{
4303}
4304
4305static int perf_swevent_int(struct perf_event *event)
4306{
4307 return 0;
4144} 4308}
4145 4309
4146static const struct pmu perf_ops_generic = { 4310static const struct pmu perf_ops_generic = {
4147 .enable = perf_swevent_enable, 4311 .enable = perf_swevent_enable,
4148 .disable = perf_swevent_disable, 4312 .disable = perf_swevent_disable,
4313 .start = perf_swevent_int,
4314 .stop = perf_swevent_void,
4149 .read = perf_swevent_read, 4315 .read = perf_swevent_read,
4150 .unthrottle = perf_swevent_unthrottle, 4316 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4151}; 4317};
4152 4318
4153/* 4319/*
@@ -4168,15 +4334,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4168 perf_sample_data_init(&data, 0); 4334 perf_sample_data_init(&data, 0);
4169 data.period = event->hw.last_period; 4335 data.period = event->hw.last_period;
4170 regs = get_irq_regs(); 4336 regs = get_irq_regs();
4171 /*
4172 * In case we exclude kernel IPs or are somehow not in interrupt
4173 * context, provide the next best thing, the user IP.
4174 */
4175 if ((event->attr.exclude_kernel || !regs) &&
4176 !event->attr.exclude_user)
4177 regs = task_pt_regs(current);
4178 4337
4179 if (regs) { 4338 if (regs && !perf_exclude_event(event, regs)) {
4180 if (!(event->attr.exclude_idle && current->pid == 0)) 4339 if (!(event->attr.exclude_idle && current->pid == 0))
4181 if (perf_event_overflow(event, 0, &data, regs)) 4340 if (perf_event_overflow(event, 0, &data, regs))
4182 ret = HRTIMER_NORESTART; 4341 ret = HRTIMER_NORESTART;
@@ -4235,8 +4394,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4235 u64 now; 4394 u64 now;
4236 4395
4237 now = cpu_clock(cpu); 4396 now = cpu_clock(cpu);
4238 prev = atomic64_xchg(&event->hw.prev_count, now); 4397 prev = local64_xchg(&event->hw.prev_count, now);
4239 atomic64_add(now - prev, &event->count); 4398 local64_add(now - prev, &event->count);
4240} 4399}
4241 4400
4242static int cpu_clock_perf_event_enable(struct perf_event *event) 4401static int cpu_clock_perf_event_enable(struct perf_event *event)
@@ -4244,7 +4403,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
4244 struct hw_perf_event *hwc = &event->hw; 4403 struct hw_perf_event *hwc = &event->hw;
4245 int cpu = raw_smp_processor_id(); 4404 int cpu = raw_smp_processor_id();
4246 4405
4247 atomic64_set(&hwc->prev_count, cpu_clock(cpu)); 4406 local64_set(&hwc->prev_count, cpu_clock(cpu));
4248 perf_swevent_start_hrtimer(event); 4407 perf_swevent_start_hrtimer(event);
4249 4408
4250 return 0; 4409 return 0;
@@ -4276,9 +4435,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4276 u64 prev; 4435 u64 prev;
4277 s64 delta; 4436 s64 delta;
4278 4437
4279 prev = atomic64_xchg(&event->hw.prev_count, now); 4438 prev = local64_xchg(&event->hw.prev_count, now);
4280 delta = now - prev; 4439 delta = now - prev;
4281 atomic64_add(delta, &event->count); 4440 local64_add(delta, &event->count);
4282} 4441}
4283 4442
4284static int task_clock_perf_event_enable(struct perf_event *event) 4443static int task_clock_perf_event_enable(struct perf_event *event)
@@ -4288,7 +4447,7 @@ static int task_clock_perf_event_enable(struct perf_event *event)
4288 4447
4289 now = event->ctx->time; 4448 now = event->ctx->time;
4290 4449
4291 atomic64_set(&hwc->prev_count, now); 4450 local64_set(&hwc->prev_count, now);
4292 4451
4293 perf_swevent_start_hrtimer(event); 4452 perf_swevent_start_hrtimer(event);
4294 4453
@@ -4324,27 +4483,124 @@ static const struct pmu perf_ops_task_clock = {
4324 .read = task_clock_perf_event_read, 4483 .read = task_clock_perf_event_read,
4325}; 4484};
4326 4485
4327#ifdef CONFIG_EVENT_TRACING 4486/* Deref the hlist from the update side */
4487static inline struct swevent_hlist *
4488swevent_hlist_deref(struct perf_cpu_context *cpuctx)
4489{
4490 return rcu_dereference_protected(cpuctx->swevent_hlist,
4491 lockdep_is_held(&cpuctx->hlist_mutex));
4492}
4328 4493
4329void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4494static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4330 int entry_size, struct pt_regs *regs)
4331{ 4495{
4332 struct perf_sample_data data; 4496 struct swevent_hlist *hlist;
4333 struct perf_raw_record raw = {
4334 .size = entry_size,
4335 .data = record,
4336 };
4337 4497
4338 perf_sample_data_init(&data, addr); 4498 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4339 data.raw = &raw; 4499 kfree(hlist);
4500}
4501
4502static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4503{
4504 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
4340 4505
4341 /* Trace events already protected against recursion */ 4506 if (!hlist)
4342 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4507 return;
4343 &data, regs); 4508
4509 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4510 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4344} 4511}
4345EXPORT_SYMBOL_GPL(perf_tp_event);
4346 4512
4347static int perf_tp_event_match(struct perf_event *event, 4513static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4514{
4515 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4516
4517 mutex_lock(&cpuctx->hlist_mutex);
4518
4519 if (!--cpuctx->hlist_refcount)
4520 swevent_hlist_release(cpuctx);
4521
4522 mutex_unlock(&cpuctx->hlist_mutex);
4523}
4524
4525static void swevent_hlist_put(struct perf_event *event)
4526{
4527 int cpu;
4528
4529 if (event->cpu != -1) {
4530 swevent_hlist_put_cpu(event, event->cpu);
4531 return;
4532 }
4533
4534 for_each_possible_cpu(cpu)
4535 swevent_hlist_put_cpu(event, cpu);
4536}
4537
4538static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4539{
4540 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4541 int err = 0;
4542
4543 mutex_lock(&cpuctx->hlist_mutex);
4544
4545 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
4546 struct swevent_hlist *hlist;
4547
4548 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4549 if (!hlist) {
4550 err = -ENOMEM;
4551 goto exit;
4552 }
4553 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4554 }
4555 cpuctx->hlist_refcount++;
4556 exit:
4557 mutex_unlock(&cpuctx->hlist_mutex);
4558
4559 return err;
4560}
4561
4562static int swevent_hlist_get(struct perf_event *event)
4563{
4564 int err;
4565 int cpu, failed_cpu;
4566
4567 if (event->cpu != -1)
4568 return swevent_hlist_get_cpu(event, event->cpu);
4569
4570 get_online_cpus();
4571 for_each_possible_cpu(cpu) {
4572 err = swevent_hlist_get_cpu(event, cpu);
4573 if (err) {
4574 failed_cpu = cpu;
4575 goto fail;
4576 }
4577 }
4578 put_online_cpus();
4579
4580 return 0;
4581 fail:
4582 for_each_possible_cpu(cpu) {
4583 if (cpu == failed_cpu)
4584 break;
4585 swevent_hlist_put_cpu(event, cpu);
4586 }
4587
4588 put_online_cpus();
4589 return err;
4590}
4591
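
swevent_hlist_get() above takes a per-CPU reference on every possible CPU and, if any allocation fails, releases the references it already took. A condensed model of that acquire-all-or-roll-back pattern; the CPU count and the simulated failure are invented for the sketch.

#include <errno.h>
#include <stdio.h>

#define NCPUS 4

static int refcount[NCPUS];

static int get_cpu_res(int cpu)
{
	if (cpu == 2)            /* pretend the allocation fails on CPU 2 */
		return -ENOMEM;
	refcount[cpu]++;
	return 0;
}

static void put_cpu_res(int cpu)
{
	refcount[cpu]--;
}

static int get_all(void)
{
	int cpu, failed, err;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		err = get_cpu_res(cpu);
		if (err) {
			failed = cpu;
			goto fail;
		}
	}
	return 0;
fail:
	for (cpu = 0; cpu < failed; cpu++)   /* undo the CPUs already done */
		put_cpu_res(cpu);
	return err;
}

int main(void)
{
	printf("get_all() = %d\n", get_all());   /* -ENOMEM, refcounts back to 0 */
	return 0;
}
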
4592#ifdef CONFIG_EVENT_TRACING
4593
4594static const struct pmu perf_ops_tracepoint = {
4595 .enable = perf_trace_enable,
4596 .disable = perf_trace_disable,
4597 .start = perf_swevent_int,
4598 .stop = perf_swevent_void,
4599 .read = perf_swevent_read,
4600 .unthrottle = perf_swevent_void,
4601};
4602
4603static int perf_tp_filter_match(struct perf_event *event,
4348 struct perf_sample_data *data) 4604 struct perf_sample_data *data)
4349{ 4605{
4350 void *record = data->raw->data; 4606 void *record = data->raw->data;
@@ -4354,13 +4610,55 @@ static int perf_tp_event_match(struct perf_event *event,
4354 return 0; 4610 return 0;
4355} 4611}
4356 4612
4613static int perf_tp_event_match(struct perf_event *event,
4614 struct perf_sample_data *data,
4615 struct pt_regs *regs)
4616{
4617 /*
4618 * All tracepoints are from kernel-space.
4619 */
4620 if (event->attr.exclude_kernel)
4621 return 0;
4622
4623 if (!perf_tp_filter_match(event, data))
4624 return 0;
4625
4626 return 1;
4627}
4628
4629void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4630 struct pt_regs *regs, struct hlist_head *head, int rctx)
4631{
4632 struct perf_sample_data data;
4633 struct perf_event *event;
4634 struct hlist_node *node;
4635
4636 struct perf_raw_record raw = {
4637 .size = entry_size,
4638 .data = record,
4639 };
4640
4641 perf_sample_data_init(&data, addr);
4642 data.raw = &raw;
4643
4644 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4645 if (perf_tp_event_match(event, &data, regs))
4646 perf_swevent_add(event, count, 1, &data, regs);
4647 }
4648
4649 perf_swevent_put_recursion_context(rctx);
4650}
4651EXPORT_SYMBOL_GPL(perf_tp_event);
4652
4357static void tp_perf_event_destroy(struct perf_event *event) 4653static void tp_perf_event_destroy(struct perf_event *event)
4358{ 4654{
4359 perf_trace_disable(event->attr.config); 4655 perf_trace_destroy(event);
4360} 4656}
4361 4657
4362static const struct pmu *tp_perf_event_init(struct perf_event *event) 4658static const struct pmu *tp_perf_event_init(struct perf_event *event)
4363{ 4659{
4660 int err;
4661
4364 /* 4662 /*
4365 * Raw tracepoint data is a severe data leak, only allow root to 4663 * Raw tracepoint data is a severe data leak, only allow root to
4366 * have these. 4664 * have these.
@@ -4370,12 +4668,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4370 !capable(CAP_SYS_ADMIN)) 4668 !capable(CAP_SYS_ADMIN))
4371 return ERR_PTR(-EPERM); 4669 return ERR_PTR(-EPERM);
4372 4670
4373 if (perf_trace_enable(event->attr.config)) 4671 err = perf_trace_init(event);
4672 if (err)
4374 return NULL; 4673 return NULL;
4375 4674
4376 event->destroy = tp_perf_event_destroy; 4675 event->destroy = tp_perf_event_destroy;
4377 4676
4378 return &perf_ops_generic; 4677 return &perf_ops_tracepoint;
4379} 4678}
4380 4679
4381static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4680static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4403,12 +4702,6 @@ static void perf_event_free_filter(struct perf_event *event)
4403 4702
4404#else 4703#else
4405 4704
4406static int perf_tp_event_match(struct perf_event *event,
4407 struct perf_sample_data *data)
4408{
4409 return 1;
4410}
4411
4412static const struct pmu *tp_perf_event_init(struct perf_event *event) 4705static const struct pmu *tp_perf_event_init(struct perf_event *event)
4413{ 4706{
4414 return NULL; 4707 return NULL;
@@ -4474,6 +4767,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4474 WARN_ON(event->parent); 4767 WARN_ON(event->parent);
4475 4768
4476 atomic_dec(&perf_swevent_enabled[event_id]); 4769 atomic_dec(&perf_swevent_enabled[event_id]);
4770 swevent_hlist_put(event);
4477} 4771}
4478 4772
4479static const struct pmu *sw_perf_event_init(struct perf_event *event) 4773static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4512,6 +4806,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4512 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4806 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4513 case PERF_COUNT_SW_EMULATION_FAULTS: 4807 case PERF_COUNT_SW_EMULATION_FAULTS:
4514 if (!event->parent) { 4808 if (!event->parent) {
4809 int err;
4810
4811 err = swevent_hlist_get(event);
4812 if (err)
4813 return ERR_PTR(err);
4814
4515 atomic_inc(&perf_swevent_enabled[event_id]); 4815 atomic_inc(&perf_swevent_enabled[event_id]);
4516 event->destroy = sw_perf_event_destroy; 4816 event->destroy = sw_perf_event_destroy;
4517 } 4817 }
@@ -4590,7 +4890,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4590 hwc->sample_period = 1; 4890 hwc->sample_period = 1;
4591 hwc->last_period = hwc->sample_period; 4891 hwc->last_period = hwc->sample_period;
4592 4892
4593 atomic64_set(&hwc->period_left, hwc->sample_period); 4893 local64_set(&hwc->period_left, hwc->sample_period);
4594 4894
4595 /* 4895 /*
4596 * we currently do not support PERF_FORMAT_GROUP on inherited events 4896 * we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -4639,7 +4939,7 @@ done:
4639 4939
4640 if (!event->parent) { 4940 if (!event->parent) {
4641 atomic_inc(&nr_events); 4941 atomic_inc(&nr_events);
4642 if (event->attr.mmap) 4942 if (event->attr.mmap || event->attr.mmap_data)
4643 atomic_inc(&nr_mmap_events); 4943 atomic_inc(&nr_mmap_events);
4644 if (event->attr.comm) 4944 if (event->attr.comm)
4645 atomic_inc(&nr_comm_events); 4945 atomic_inc(&nr_comm_events);
@@ -4730,54 +5030,53 @@ err_size:
4730 goto out; 5030 goto out;
4731} 5031}
4732 5032
4733static int perf_event_set_output(struct perf_event *event, int output_fd) 5033static int
5034perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
4734{ 5035{
4735 struct perf_event *output_event = NULL; 5036 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
4736 struct file *output_file = NULL;
4737 struct perf_event *old_output;
4738 int fput_needed = 0;
4739 int ret = -EINVAL; 5037 int ret = -EINVAL;
4740 5038
4741 if (!output_fd) 5039 if (!output_event)
4742 goto set; 5040 goto set;
4743 5041
4744 output_file = fget_light(output_fd, &fput_needed); 5042 /* don't allow circular references */
4745 if (!output_file) 5043 if (event == output_event)
4746 return -EBADF;
4747
4748 if (output_file->f_op != &perf_fops)
4749 goto out; 5044 goto out;
4750 5045
4751 output_event = output_file->private_data; 5046 /*
4752 5047 * Don't allow cross-cpu buffers
4753 /* Don't chain output fds */ 5048 */
4754 if (output_event->output) 5049 if (output_event->cpu != event->cpu)
4755 goto out; 5050 goto out;
4756 5051
4757 /* Don't set an output fd when we already have an output channel */ 5052 /*
4758 if (event->data) 5053 * If it's not a per-cpu buffer, it must be the same task.
5054 */
5055 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
4759 goto out; 5056 goto out;
4760 5057
4761 atomic_long_inc(&output_file->f_count);
4762
4763set: 5058set:
4764 mutex_lock(&event->mmap_mutex); 5059 mutex_lock(&event->mmap_mutex);
4765 old_output = event->output; 5060 /* Can't redirect output if we've got an active mmap() */
4766 rcu_assign_pointer(event->output, output_event); 5061 if (atomic_read(&event->mmap_count))
4767 mutex_unlock(&event->mmap_mutex); 5062 goto unlock;
4768 5063
4769 if (old_output) { 5064 if (output_event) {
4770 /* 5065 /* get the buffer we want to redirect to */
4771 * we need to make sure no existing perf_output_*() 5066 buffer = perf_buffer_get(output_event);
4772 * is still referencing this event. 5067 if (!buffer)
4773 */ 5068 goto unlock;
4774 synchronize_rcu();
4775 fput(old_output->filp);
4776 } 5069 }
4777 5070
5071 old_buffer = event->buffer;
5072 rcu_assign_pointer(event->buffer, buffer);
4778 ret = 0; 5073 ret = 0;
5074unlock:
5075 mutex_unlock(&event->mmap_mutex);
5076
5077 if (old_buffer)
5078 perf_buffer_put(old_buffer);
4779out: 5079out:
4780 fput_light(output_file, fput_needed);
4781 return ret; 5080 return ret;
4782} 5081}
4783 5082
@@ -4793,13 +5092,13 @@ SYSCALL_DEFINE5(perf_event_open,
4793 struct perf_event_attr __user *, attr_uptr, 5092 struct perf_event_attr __user *, attr_uptr,
4794 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5093 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4795{ 5094{
4796 struct perf_event *event, *group_leader; 5095 struct perf_event *event, *group_leader = NULL, *output_event = NULL;
4797 struct perf_event_attr attr; 5096 struct perf_event_attr attr;
4798 struct perf_event_context *ctx; 5097 struct perf_event_context *ctx;
4799 struct file *event_file = NULL; 5098 struct file *event_file = NULL;
4800 struct file *group_file = NULL; 5099 struct file *group_file = NULL;
5100 int event_fd;
4801 int fput_needed = 0; 5101 int fput_needed = 0;
4802 int fput_needed2 = 0;
4803 int err; 5102 int err;
4804 5103
4805 /* for future expandability... */ 5104 /* for future expandability... */
@@ -4820,26 +5119,38 @@ SYSCALL_DEFINE5(perf_event_open,
4820 return -EINVAL; 5119 return -EINVAL;
4821 } 5120 }
4822 5121
5122 event_fd = get_unused_fd_flags(O_RDWR);
5123 if (event_fd < 0)
5124 return event_fd;
5125
4823 /* 5126 /*
4824 * Get the target context (task or percpu): 5127 * Get the target context (task or percpu):
4825 */ 5128 */
4826 ctx = find_get_context(pid, cpu); 5129 ctx = find_get_context(pid, cpu);
4827 if (IS_ERR(ctx)) 5130 if (IS_ERR(ctx)) {
4828 return PTR_ERR(ctx); 5131 err = PTR_ERR(ctx);
5132 goto err_fd;
5133 }
5134
5135 if (group_fd != -1) {
5136 group_leader = perf_fget_light(group_fd, &fput_needed);
5137 if (IS_ERR(group_leader)) {
5138 err = PTR_ERR(group_leader);
5139 goto err_put_context;
5140 }
5141 group_file = group_leader->filp;
5142 if (flags & PERF_FLAG_FD_OUTPUT)
5143 output_event = group_leader;
5144 if (flags & PERF_FLAG_FD_NO_GROUP)
5145 group_leader = NULL;
5146 }
4829 5147
4830 /* 5148 /*
4831 * Look up the group leader (we will attach this event to it): 5149 * Look up the group leader (we will attach this event to it):
4832 */ 5150 */
4833 group_leader = NULL; 5151 if (group_leader) {
4834 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4835 err = -EINVAL; 5152 err = -EINVAL;
4836 group_file = fget_light(group_fd, &fput_needed);
4837 if (!group_file)
4838 goto err_put_context;
4839 if (group_file->f_op != &perf_fops)
4840 goto err_put_context;
4841 5153
4842 group_leader = group_file->private_data;
4843 /* 5154 /*
4844 * Do not allow a recursive hierarchy (this new sibling 5155 * Do not allow a recursive hierarchy (this new sibling
4845 * becoming part of another group-sibling): 5156 * becoming part of another group-sibling):
@@ -4861,22 +5172,21 @@ SYSCALL_DEFINE5(perf_event_open,
4861 5172
4862 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 5173 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4863 NULL, NULL, GFP_KERNEL); 5174 NULL, NULL, GFP_KERNEL);
4864 err = PTR_ERR(event); 5175 if (IS_ERR(event)) {
4865 if (IS_ERR(event)) 5176 err = PTR_ERR(event);
4866 goto err_put_context; 5177 goto err_put_context;
5178 }
4867 5179
4868 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); 5180 if (output_event) {
4869 if (err < 0) 5181 err = perf_event_set_output(event, output_event);
4870 goto err_free_put_context; 5182 if (err)
5183 goto err_free_put_context;
5184 }
4871 5185
4872 event_file = fget_light(err, &fput_needed2); 5186 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
4873 if (!event_file) 5187 if (IS_ERR(event_file)) {
5188 err = PTR_ERR(event_file);
4874 goto err_free_put_context; 5189 goto err_free_put_context;
4875
4876 if (flags & PERF_FLAG_FD_OUTPUT) {
4877 err = perf_event_set_output(event, group_fd);
4878 if (err)
4879 goto err_fput_free_put_context;
4880 } 5190 }
4881 5191
4882 event->filp = event_file; 5192 event->filp = event_file;
@@ -4892,19 +5202,23 @@ SYSCALL_DEFINE5(perf_event_open,
4892 list_add_tail(&event->owner_entry, &current->perf_event_list); 5202 list_add_tail(&event->owner_entry, &current->perf_event_list);
4893 mutex_unlock(&current->perf_event_mutex); 5203 mutex_unlock(&current->perf_event_mutex);
4894 5204
4895err_fput_free_put_context: 5205 /*
4896 fput_light(event_file, fput_needed2); 5206 * Drop the reference on the group_event after placing the
5207 * new event on the sibling_list. This ensures destruction
5208 * of the group leader will find the pointer to itself in
5209 * perf_group_detach().
5210 */
5211 fput_light(group_file, fput_needed);
5212 fd_install(event_fd, event_file);
5213 return event_fd;
4897 5214
4898err_free_put_context: 5215err_free_put_context:
4899 if (err < 0) 5216 free_event(event);
4900 free_event(event);
4901
4902err_put_context: 5217err_put_context:
4903 if (err < 0)
4904 put_ctx(ctx);
4905
4906 fput_light(group_file, fput_needed); 5218 fput_light(group_file, fput_needed);
4907 5219 put_ctx(ctx);
5220err_fd:
5221 put_unused_fd(event_fd);
4908 return err; 5222 return err;
4909} 5223}
4910 5224
@@ -5010,7 +5324,7 @@ inherit_event(struct perf_event *parent_event,
5010 hwc->sample_period = sample_period; 5324 hwc->sample_period = sample_period;
5011 hwc->last_period = sample_period; 5325 hwc->last_period = sample_period;
5012 5326
5013 atomic64_set(&hwc->period_left, sample_period); 5327 local64_set(&hwc->period_left, sample_period);
5014 } 5328 }
5015 5329
5016 child_event->overflow_handler = parent_event->overflow_handler; 5330 child_event->overflow_handler = parent_event->overflow_handler;
@@ -5071,12 +5385,12 @@ static void sync_child_event(struct perf_event *child_event,
5071 if (child_event->attr.inherit_stat) 5385 if (child_event->attr.inherit_stat)
5072 perf_event_read_event(child_event, child); 5386 perf_event_read_event(child_event, child);
5073 5387
5074 child_val = atomic64_read(&child_event->count); 5388 child_val = perf_event_count(child_event);
5075 5389
5076 /* 5390 /*
5077 * Add back the child's count to the parent's count: 5391 * Add back the child's count to the parent's count:
5078 */ 5392 */
5079 atomic64_add(child_val, &parent_event->count); 5393 atomic64_add(child_val, &parent_event->child_count);
5080 atomic64_add(child_event->total_time_enabled, 5394 atomic64_add(child_event->total_time_enabled,
5081 &parent_event->child_total_time_enabled); 5395 &parent_event->child_total_time_enabled);
5082 atomic64_add(child_event->total_time_running, 5396 atomic64_add(child_event->total_time_running,
@@ -5176,7 +5490,7 @@ void perf_event_exit_task(struct task_struct *child)
5176 * 5490 *
5177 * But since its the parent context it won't be the same instance. 5491 * But since its the parent context it won't be the same instance.
5178 */ 5492 */
5179 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5493 mutex_lock(&child_ctx->mutex);
5180 5494
5181again: 5495again:
5182 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, 5496 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
@@ -5215,6 +5529,7 @@ static void perf_free_event(struct perf_event *event,
5215 5529
5216 fput(parent->filp); 5530 fput(parent->filp);
5217 5531
5532 perf_group_detach(event);
5218 list_del_event(event, ctx); 5533 list_del_event(event, ctx);
5219 free_event(event); 5534 free_event(event);
5220} 5535}
@@ -5384,6 +5699,7 @@ static void __init perf_event_init_all_cpus(void)
5384 5699
5385 for_each_possible_cpu(cpu) { 5700 for_each_possible_cpu(cpu) {
5386 cpuctx = &per_cpu(perf_cpu_context, cpu); 5701 cpuctx = &per_cpu(perf_cpu_context, cpu);
5702 mutex_init(&cpuctx->hlist_mutex);
5387 __perf_event_init_context(&cpuctx->ctx, NULL); 5703 __perf_event_init_context(&cpuctx->ctx, NULL);
5388 } 5704 }
5389} 5705}
@@ -5397,6 +5713,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
5397 spin_lock(&perf_resource_lock); 5713 spin_lock(&perf_resource_lock);
5398 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5714 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5399 spin_unlock(&perf_resource_lock); 5715 spin_unlock(&perf_resource_lock);
5716
5717 mutex_lock(&cpuctx->hlist_mutex);
5718 if (cpuctx->hlist_refcount > 0) {
5719 struct swevent_hlist *hlist;
5720
5721 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5722 WARN_ON_ONCE(!hlist);
5723 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5724 }
5725 mutex_unlock(&cpuctx->hlist_mutex);
5400} 5726}
5401 5727
5402#ifdef CONFIG_HOTPLUG_CPU 5728#ifdef CONFIG_HOTPLUG_CPU
@@ -5416,6 +5742,10 @@ static void perf_event_exit_cpu(int cpu)
5416 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5742 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5417 struct perf_event_context *ctx = &cpuctx->ctx; 5743 struct perf_event_context *ctx = &cpuctx->ctx;
5418 5744
5745 mutex_lock(&cpuctx->hlist_mutex);
5746 swevent_hlist_release(cpuctx);
5747 mutex_unlock(&cpuctx->hlist_mutex);
5748
5419 mutex_lock(&ctx->mutex); 5749 mutex_lock(&ctx->mutex);
5420 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5750 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5421 mutex_unlock(&ctx->mutex); 5751 mutex_unlock(&ctx->mutex);
@@ -5429,15 +5759,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5429{ 5759{
5430 unsigned int cpu = (long)hcpu; 5760 unsigned int cpu = (long)hcpu;
5431 5761
5432 switch (action) { 5762 switch (action & ~CPU_TASKS_FROZEN) {
5433 5763
5434 case CPU_UP_PREPARE: 5764 case CPU_UP_PREPARE:
5435 case CPU_UP_PREPARE_FROZEN: 5765 case CPU_DOWN_FAILED:
5436 perf_event_init_cpu(cpu); 5766 perf_event_init_cpu(cpu);
5437 break; 5767 break;
5438 5768
5769 case CPU_UP_CANCELED:
5439 case CPU_DOWN_PREPARE: 5770 case CPU_DOWN_PREPARE:
5440 case CPU_DOWN_PREPARE_FROZEN:
5441 perf_event_exit_cpu(cpu); 5771 perf_event_exit_cpu(cpu);
5442 break; 5772 break;
5443 5773
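
Annotation (not part of the patch): in the hunk above, the *_FROZEN notifier cases disappear because CPU_UP_PREPARE_FROZEN and friends are simply the base action with the CPU_TASKS_FROZEN bit set; masking that bit lets one case label cover both the normal and the suspend/resume variants. A minimal sketch of the idiom, with a hypothetical notifier name (kernel context assumed):

        #include <linux/cpu.h>
        #include <linux/notifier.h>

        /* Sketch only: strip CPU_TASKS_FROZEN so frozen (suspend/resume)
         * hotplug events fall through to the same handlers as normal ones. */
        static int example_cpu_notify(struct notifier_block *self,
                                      unsigned long action, void *hcpu)
        {
                switch (action & ~CPU_TASKS_FROZEN) {
                case CPU_UP_PREPARE:
                case CPU_DOWN_FAILED:
                        /* (re)initialize per-cpu state here */
                        break;
                case CPU_UP_CANCELED:
                case CPU_DOWN_PREPARE:
                        /* tear down per-cpu state here */
                        break;
                }
                return NOTIFY_OK;
        }
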
diff --git a/kernel/pid.c b/kernel/pid.c
index aebb30d9c233..d55c6fb8d087 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid)
122 atomic_inc(&map->nr_free); 122 atomic_inc(&map->nr_free);
123} 123}
124 124
125/*
126 * If we started walking pids at 'base', is 'a' seen before 'b'?
127 */
128static int pid_before(int base, int a, int b)
129{
130 /*
131 * This is the same as saying
132 *
133 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
134 * and that mapping orders 'a' and 'b' with respect to 'base'.
135 */
136 return (unsigned)(a - base) < (unsigned)(b - base);
137}
138
139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid.
141 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately.
143 *
144 * Since pids rollover, it is not sufficient to just pick the bigger
145 * value. We have to consider where we started counting from.
146 *
147 * 'base' is the value of pid_ns->last_pid that we observed when
148 * we started looking for a pid.
149 *
150 * 'pid' is the pid that we eventually found.
151 */
152static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
153{
154 int prev;
155 int last_write = base;
156 do {
157 prev = last_write;
158 last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
159 } while ((prev != last_write) && (pid_before(base, last_write, pid)));
160}
161
125static int alloc_pidmap(struct pid_namespace *pid_ns) 162static int alloc_pidmap(struct pid_namespace *pid_ns)
126{ 163{
127 int i, offset, max_scan, pid, last = pid_ns->last_pid; 164 int i, offset, max_scan, pid, last = pid_ns->last_pid;
@@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
132 pid = RESERVED_PIDS; 169 pid = RESERVED_PIDS;
133 offset = pid & BITS_PER_PAGE_MASK; 170 offset = pid & BITS_PER_PAGE_MASK;
134 map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; 171 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
135 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 172 /*
173 * If last_pid points into the middle of the map->page we
174 * want to scan this bitmap block twice, the second time
175 * we start with offset == 0 (or RESERVED_PIDS).
176 */
177 max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
136 for (i = 0; i <= max_scan; ++i) { 178 for (i = 0; i <= max_scan; ++i) {
137 if (unlikely(!map->page)) { 179 if (unlikely(!map->page)) {
138 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); 180 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
@@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
154 do { 196 do {
155 if (!test_and_set_bit(offset, map->page)) { 197 if (!test_and_set_bit(offset, map->page)) {
156 atomic_dec(&map->nr_free); 198 atomic_dec(&map->nr_free);
157 pid_ns->last_pid = pid; 199 set_last_pid(pid_ns, last, pid);
158 return pid; 200 return pid;
159 } 201 }
160 offset = find_next_offset(map, offset); 202 offset = find_next_offset(map, offset);
161 pid = mk_pid(pid_ns, map, offset); 203 pid = mk_pid(pid_ns, map, offset);
162 /* 204 } while (offset < BITS_PER_PAGE && pid < pid_max);
163 * find_next_offset() found a bit, the pid from it
164 * is in-bounds, and if we fell back to the last
165 * bitmap block and the final block was the same
166 * as the starting point, pid is before last_pid.
167 */
168 } while (offset < BITS_PER_PAGE && pid < pid_max &&
169 (i != max_scan || pid < last ||
170 !((last+1) & BITS_PER_PAGE_MASK)));
171 } 205 }
172 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 206 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
173 ++map; 207 ++map;
@@ -513,6 +547,13 @@ void __init pidhash_init(void)
513 547
514void __init pidmap_init(void) 548void __init pidmap_init(void)
515{ 549{
550 /* bump default and minimum pid_max based on number of cpus */
551 pid_max = min(pid_max_max, max_t(int, pid_max,
552 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
553 pid_max_min = max_t(int, pid_max_min,
554 PIDS_PER_CPU_MIN * num_possible_cpus());
555 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
556
516 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 557 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
517 /* Reserve PID 0. We never call free_pidmap(0) */ 558 /* Reserve PID 0. We never call free_pidmap(0) */
518 set_bit(0, init_pid_ns.pidmap[0].page); 559 set_bit(0, init_pid_ns.pidmap[0].page);
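
Annotation (not part of the patch): the pid_before()/set_last_pid() pair added above makes the last_pid update lockless and wrap-around aware — the cmpxchg loop lets the "later" allocation win even after pid values have rolled over past pid_max. A stand-alone sketch of the same circular comparison, with hypothetical names and values:

        #include <stdio.h>

        /* Sketch: (unsigned)(a - base) orders values by how soon they are
         * reached when counting up from 'base', wrapping modulo 2^32. */
        static int seen_before(unsigned int base, unsigned int a, unsigned int b)
        {
                return (a - base) < (b - base);
        }

        int main(void)
        {
                unsigned int base = 0xfffffff0u;  /* where the walk started */
                unsigned int pre  = 0xfffffff8u;  /* found before wrapping  */
                unsigned int post = 0x00000010u;  /* found after wrapping   */

                /* prints 1: 'pre' comes before 'post' when walking from base,
                 * even though it is numerically larger */
                printf("%d\n", seen_before(base, pre, post));
                return 0;
        }
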
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 3db49b9ca374..645e541a45f6 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -2,7 +2,7 @@
2 * This module exposes the interface to kernel space for specifying 2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 3 * QoS dependencies. It provides infrastructure for registration of:
4 * 4 *
5 * Dependents on a QoS value : register requirements 5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes 6 * Watchers of QoS value : get notified when target QoS value changes
7 * 7 *
8 * This QoS design is best effort based. Dependents register their QoS needs. 8 * This QoS design is best effort based. Dependents register their QoS needs.
@@ -14,19 +14,21 @@
14 * timeout: usec <-- currently not used. 14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec) 15 * throughput: kbs (kilo byte / sec)
16 * 16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers 17 * There are lists of pm_qos_objects each one wrapping requests, notifiers
18 * 18 *
19 * User mode requirements on a QOS parameter register themselves to the 19 * User mode requests on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to 20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the 21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode 22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the 23 * request is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file 24 * request that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * Mark Gross <mgross@linux.intel.com> 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30/*#define DEBUG*/
31
30#include <linux/pm_qos_params.h> 32#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 33#include <linux/sched.h>
32#include <linux/spinlock.h> 34#include <linux/spinlock.h>
@@ -42,64 +44,53 @@
42#include <linux/uaccess.h> 44#include <linux/uaccess.h>
43 45
44/* 46/*
45 * locking rule: all changes to requirements or notifiers lists 47 * locking rule: all changes to requests or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
48 */ 50 */
49struct requirement_list { 51enum pm_qos_type {
50 struct list_head list; 52 PM_QOS_MAX, /* return the largest value */
51 union { 53 PM_QOS_MIN /* return the smallest value */
52 s32 value;
53 s32 usec;
54 s32 kbps;
55 };
56 char *name;
57}; 54};
58 55
59static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2);
61
62struct pm_qos_object { 56struct pm_qos_object {
63 struct requirement_list requirements; 57 struct plist_head requests;
64 struct blocking_notifier_head *notifiers; 58 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 59 struct miscdevice pm_qos_power_miscdev;
66 char *name; 60 char *name;
67 s32 default_value; 61 s32 default_value;
68 atomic_t target_value; 62 enum pm_qos_type type;
69 s32 (*comparitor)(s32, s32);
70}; 63};
71 64
65static DEFINE_SPINLOCK(pm_qos_lock);
66
72static struct pm_qos_object null_pm_qos; 67static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 68static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = { 69static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, 70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
76 .notifiers = &cpu_dma_lat_notifier, 71 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 72 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 73 .default_value = 2000 * USEC_PER_SEC,
79 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 74 .type = PM_QOS_MIN,
80 .comparitor = min_compare
81}; 75};
82 76
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 77static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = { 78static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, 79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
86 .notifiers = &network_lat_notifier, 80 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 81 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 82 .default_value = 2000 * USEC_PER_SEC,
89 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 83 .type = PM_QOS_MIN
90 .comparitor = min_compare
91}; 84};
92 85
93 86
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 87static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = { 88static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements = 89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier, 90 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 91 .name = "network_throughput",
100 .default_value = 0, 92 .default_value = 0,
101 .target_value = ATOMIC_INIT(0), 93 .type = PM_QOS_MAX,
102 .comparitor = max_compare
103}; 94};
104 95
105 96
@@ -110,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
110 &network_throughput_pm_qos 101 &network_throughput_pm_qos
111}; 102};
112 103
113static DEFINE_SPINLOCK(pm_qos_lock);
114
115static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
116 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
117static int pm_qos_power_open(struct inode *inode, struct file *filp); 106static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -123,43 +112,55 @@ static const struct file_operations pm_qos_power_fops = {
123 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
124}; 113};
125 114
126/* static helper functions */ 115/* unlocked internal variant */
127static s32 max_compare(s32 v1, s32 v2) 116static inline int pm_qos_get_value(struct pm_qos_object *o)
128{ 117{
129 return max(v1, v2); 118 if (plist_head_empty(&o->requests))
130} 119 return o->default_value;
131 120
132static s32 min_compare(s32 v1, s32 v2) 121 switch (o->type) {
133{ 122 case PM_QOS_MIN:
134 return min(v1, v2); 123 return plist_last(&o->requests)->prio;
135}
136 124
125 case PM_QOS_MAX:
126 return plist_first(&o->requests)->prio;
137 127
138static void update_target(int target) 128 default:
129 /* runtime check for not using enum */
130 BUG();
131 }
132}
133
134static void update_target(struct pm_qos_object *o, struct plist_node *node,
135 int del, int value)
139{ 136{
140 s32 extreme_value;
141 struct requirement_list *node;
142 unsigned long flags; 137 unsigned long flags;
143 int call_notifier = 0; 138 int prev_value, curr_value;
144 139
145 spin_lock_irqsave(&pm_qos_lock, flags); 140 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value; 141 prev_value = pm_qos_get_value(o);
147 list_for_each_entry(node, 142 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
148 &pm_qos_array[target]->requirements.list, list) { 143 if (value != PM_QOS_DEFAULT_VALUE) {
149 extreme_value = pm_qos_array[target]->comparitor( 144 /*
150 extreme_value, node->value); 145 * to change the list, we atomically remove, reinit
151 } 146 * with new value and add, then see if the extremal
152 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { 147 * changed
153 call_notifier = 1; 148 */
154 atomic_set(&pm_qos_array[target]->target_value, extreme_value); 149 plist_del(node, &o->requests);
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 150 plist_node_init(node, value);
156 atomic_read(&pm_qos_array[target]->target_value)); 151 plist_add(node, &o->requests);
152 } else if (del) {
153 plist_del(node, &o->requests);
154 } else {
155 plist_add(node, &o->requests);
157 } 156 }
157 curr_value = pm_qos_get_value(o);
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 158 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 159
160 if (call_notifier) 160 if (prev_value != curr_value)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers, 161 blocking_notifier_call_chain(o->notifiers,
162 (unsigned long) extreme_value, NULL); 162 (unsigned long)curr_value,
163 NULL);
163} 164}
164 165
165static int register_pm_qos_misc(struct pm_qos_object *qos) 166static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -185,125 +186,125 @@ static int find_pm_qos_object_by_minor(int minor)
185} 186}
186 187
187/** 188/**
188 * pm_qos_requirement - returns current system wide qos expectation 189 * pm_qos_request - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested 190 * @pm_qos_class: identification of which qos value is requested
190 * 191 *
191 * This function returns the current target value in an atomic manner. 192 * This function returns the current target value in an atomic manner.
192 */ 193 */
193int pm_qos_requirement(int pm_qos_class) 194int pm_qos_request(int pm_qos_class)
194{ 195{
195 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 196 unsigned long flags;
197 int value;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return value;
196} 204}
197EXPORT_SYMBOL_GPL(pm_qos_requirement); 205EXPORT_SYMBOL_GPL(pm_qos_request);
206
207int pm_qos_request_active(struct pm_qos_request_list *req)
208{
209 return req->pm_qos_class != 0;
210}
211EXPORT_SYMBOL_GPL(pm_qos_request_active);
198 212
199/** 213/**
200 * pm_qos_add_requirement - inserts new qos request into the list 214 * pm_qos_add_request - inserts new qos request into the list
201 * @pm_qos_class: identifies which list of qos request to us 215 * @dep: pointer to a preallocated handle
202 * @name: identifies the request 216 * @pm_qos_class: identifies which list of qos request to use
203 * @value: defines the qos request 217 * @value: defines the qos request
204 * 218 *
205 * This function inserts a new entry in the pm_qos_class list of requested qos 219 * This function inserts a new entry in the pm_qos_class list of requested qos
206 * performance characteristics. It recomputes the aggregate QoS expectations 220 * performance characteristics. It recomputes the aggregate QoS expectations
207 * for the pm_qos_class of parameters. 221 * for the pm_qos_class of parameters and initializes the pm_qos_request_list
222 * handle. Caller needs to save this handle for later use in updates and
223 * removal.
208 */ 224 */
209int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 225
226void pm_qos_add_request(struct pm_qos_request_list *dep,
227 int pm_qos_class, s32 value)
210{ 228{
211 struct requirement_list *dep; 229 struct pm_qos_object *o = pm_qos_array[pm_qos_class];
212 unsigned long flags; 230 int new_value;
213 231
214 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); 232 if (pm_qos_request_active(dep)) {
215 if (dep) { 233 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
216 if (value == PM_QOS_DEFAULT_VALUE) 234 return;
217 dep->value = pm_qos_array[pm_qos_class]->default_value;
218 else
219 dep->value = value;
220 dep->name = kstrdup(name, GFP_KERNEL);
221 if (!dep->name)
222 goto cleanup;
223
224 spin_lock_irqsave(&pm_qos_lock, flags);
225 list_add(&dep->list,
226 &pm_qos_array[pm_qos_class]->requirements.list);
227 spin_unlock_irqrestore(&pm_qos_lock, flags);
228 update_target(pm_qos_class);
229
230 return 0;
231 } 235 }
232 236 if (value == PM_QOS_DEFAULT_VALUE)
233cleanup: 237 new_value = o->default_value;
234 kfree(dep); 238 else
235 return -ENOMEM; 239 new_value = value;
240 plist_node_init(&dep->list, new_value);
241 dep->pm_qos_class = pm_qos_class;
242 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
236} 243}
237EXPORT_SYMBOL_GPL(pm_qos_add_requirement); 244EXPORT_SYMBOL_GPL(pm_qos_add_request);
238 245
239/** 246/**
240 * pm_qos_update_requirement - modifies an existing qos request 247 * pm_qos_update_request - modifies an existing qos request
241 * @pm_qos_class: identifies which list of qos request to us 248 * @pm_qos_req : handle to list element holding a pm_qos request to use
242 * @name: identifies the request
243 * @value: defines the qos request 249 * @value: defines the qos request
244 * 250 *
245 * Updates an existing qos requirement for the pm_qos_class of parameters along 251 * Updates an existing qos request for the pm_qos_class of parameters along
246 * with updating the target pm_qos_class value. 252 * with updating the target pm_qos_class value.
247 * 253 *
248 * If the named request isn't in the list then no change is made. 254 * Attempts are made to make this code callable on hot code paths.
249 */ 255 */
250int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 256void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
257 s32 new_value)
251{ 258{
252 unsigned long flags; 259 s32 temp;
253 struct requirement_list *node; 260 struct pm_qos_object *o;
254 int pending_update = 0;
255 261
256 spin_lock_irqsave(&pm_qos_lock, flags); 262 if (!pm_qos_req) /*guard against callers passing in null */
257 list_for_each_entry(node, 263 return;
258 &pm_qos_array[pm_qos_class]->requirements.list, list) { 264
259 if (strcmp(node->name, name) == 0) { 265 if (!pm_qos_request_active(pm_qos_req)) {
260 if (new_value == PM_QOS_DEFAULT_VALUE) 266 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
261 node->value = 267 return;
262 pm_qos_array[pm_qos_class]->default_value;
263 else
264 node->value = new_value;
265 pending_update = 1;
266 break;
267 }
268 } 268 }
269 spin_unlock_irqrestore(&pm_qos_lock, flags);
270 if (pending_update)
271 update_target(pm_qos_class);
272 269
273 return 0; 270 o = pm_qos_array[pm_qos_req->pm_qos_class];
271
272 if (new_value == PM_QOS_DEFAULT_VALUE)
273 temp = o->default_value;
274 else
275 temp = new_value;
276
277 if (temp != pm_qos_req->list.prio)
278 update_target(o, &pm_qos_req->list, 0, temp);
274} 279}
275EXPORT_SYMBOL_GPL(pm_qos_update_requirement); 280EXPORT_SYMBOL_GPL(pm_qos_update_request);
276 281
277/** 282/**
278 * pm_qos_remove_requirement - modifies an existing qos request 283 * pm_qos_remove_request - modifies an existing qos request
279 * @pm_qos_class: identifies which list of qos request to us 284 * @pm_qos_req: handle to request list element
280 * @name: identifies the request
281 * 285 *
282 * Will remove named qos request from pm_qos_class list of parameters and 286 * Will remove pm qos request from the list of requests and
283 * recompute the current target value for the pm_qos_class. 287 * recompute the current target value for the pm_qos_class. Call this
288 * on slow code paths.
284 */ 289 */
285void pm_qos_remove_requirement(int pm_qos_class, char *name) 290void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
286{ 291{
287 unsigned long flags; 292 struct pm_qos_object *o;
288 struct requirement_list *node;
289 int pending_update = 0;
290 293
291 spin_lock_irqsave(&pm_qos_lock, flags); 294 if (pm_qos_req == NULL)
292 list_for_each_entry(node, 295 return;
293 &pm_qos_array[pm_qos_class]->requirements.list, list) { 296 /* silent return to keep pcm code cleaner */
294 if (strcmp(node->name, name) == 0) { 297
295 kfree(node->name); 298 if (!pm_qos_request_active(pm_qos_req)) {
296 list_del(&node->list); 299 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
297 kfree(node); 300 return;
298 pending_update = 1;
299 break;
300 }
301 } 301 }
302 spin_unlock_irqrestore(&pm_qos_lock, flags); 302
303 if (pending_update) 303 o = pm_qos_array[pm_qos_req->pm_qos_class];
304 update_target(pm_qos_class); 304 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
305 memset(pm_qos_req, 0, sizeof(*pm_qos_req));
305} 306}
306EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); 307EXPORT_SYMBOL_GPL(pm_qos_remove_request);
307 308
308/** 309/**
309 * pm_qos_add_notifier - sets notification entry for changes to target value 310 * pm_qos_add_notifier - sets notification entry for changes to target value
@@ -313,7 +314,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
313 * will register the notifier into a notification chain that gets called 314 * will register the notifier into a notification chain that gets called
314 * upon changes to the pm_qos_class target value. 315 * upon changes to the pm_qos_class target value.
315 */ 316 */
316 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 317int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
317{ 318{
318 int retval; 319 int retval;
319 320
@@ -343,21 +344,20 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343} 344}
344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
345 346
346#define PID_NAME_LEN 32
347
348static int pm_qos_power_open(struct inode *inode, struct file *filp) 347static int pm_qos_power_open(struct inode *inode, struct file *filp)
349{ 348{
350 int ret;
351 long pm_qos_class; 349 long pm_qos_class;
352 char name[PID_NAME_LEN];
353 350
354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 351 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
355 if (pm_qos_class >= 0) { 352 if (pm_qos_class >= 0) {
356 filp->private_data = (void *)pm_qos_class; 353 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 354 if (!req)
358 ret = pm_qos_add_requirement(pm_qos_class, name, 355 return -ENOMEM;
359 PM_QOS_DEFAULT_VALUE); 356
360 if (ret >= 0) 357 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
358 filp->private_data = req;
359
360 if (filp->private_data)
361 return 0; 361 return 0;
362 } 362 }
363 return -EPERM; 363 return -EPERM;
@@ -365,32 +365,43 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
365 365
366static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
367{ 367{
368 int pm_qos_class; 368 struct pm_qos_request_list *req;
369 char name[PID_NAME_LEN];
370 369
371 pm_qos_class = (long)filp->private_data; 370 req = filp->private_data;
372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 371 pm_qos_remove_request(req);
373 pm_qos_remove_requirement(pm_qos_class, name); 372 kfree(req);
374 373
375 return 0; 374 return 0;
376} 375}
377 376
377
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 379 size_t count, loff_t *f_pos)
380{ 380{
381 s32 value; 381 s32 value;
382 int pm_qos_class; 382 int x;
383 char name[PID_NAME_LEN]; 383 char ascii_value[11];
384 384 struct pm_qos_request_list *pm_qos_req;
385 pm_qos_class = (long)filp->private_data; 385
386 if (count != sizeof(s32)) 386 if (count == sizeof(s32)) {
387 if (copy_from_user(&value, buf, sizeof(s32)))
388 return -EFAULT;
389 } else if (count == 11) { /* len('0x12345678/0') */
390 if (copy_from_user(ascii_value, buf, 11))
391 return -EFAULT;
392 if (strlen(ascii_value) != 10)
393 return -EINVAL;
394 x = sscanf(ascii_value, "%x", &value);
395 if (x != 1)
396 return -EINVAL;
397 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
398 } else
387 return -EINVAL; 399 return -EINVAL;
388 if (copy_from_user(&value, buf, sizeof(s32)))
389 return -EFAULT;
390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
391 pm_qos_update_requirement(pm_qos_class, name, value);
392 400
393 return sizeof(s32); 401 pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
402 pm_qos_update_request(pm_qos_req, value);
403
404 return count;
394} 405}
395 406
396 407
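
Annotation (not part of the patch): with this change a QoS request is identified by a caller-owned struct pm_qos_request_list handle instead of a (class, name) string pair, and requests sit on a sorted plist so the aggregate target is read from one end of the list rather than recomputed with a comparator callback. A hedged sketch of how an in-kernel caller would use the new interface — the driver hooks and latency values are made up; only the pm_qos_* calls and PM_QOS_CPU_DMA_LATENCY come from the API above:

        #include <linux/pm_qos_params.h>

        /* Sketch only: keep a cpu_dma_latency request alive while a
         * hypothetical device is in use, via the handle-based API. */
        static struct pm_qos_request_list example_qos_req;

        static void example_device_open(void)
        {
                /* ask for at most 50 usec of DMA latency (illustrative value) */
                pm_qos_add_request(&example_qos_req, PM_QOS_CPU_DMA_LATENCY, 50);
        }

        static void example_device_busy(void)
        {
                /* tighten the request on a hot path; updates are meant to be cheap */
                pm_qos_update_request(&example_qos_req, 20);
        }

        static void example_device_close(void)
        {
                /* drop the request; the handle is zeroed and may be reused */
                pm_qos_remove_request(&example_qos_req);
        }
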
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bc7704b3a443..6842eeba5879 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -11,19 +11,18 @@
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12 12
13/* 13/*
14 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to run cpu timer and update
15 * tsk->signal->cputime_expires expiration cache if necessary. Needs
16 * siglock protection since other code may update expiration cache as
17 * well.
15 */ 18 */
16void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
17{ 20{
18 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
20 22
21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || 23 spin_lock_irq(&task->sighand->siglock);
22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { 24 set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
23 spin_lock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&task->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock);
26 }
27} 26}
28 27
29static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -233,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
233 232
234void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
235{ 234{
236 struct sighand_struct *sighand; 235 struct signal_struct *sig = tsk->signal;
237 struct signal_struct *sig;
238 struct task_struct *t; 236 struct task_struct *t;
239 237
240 *times = INIT_CPUTIME; 238 times->utime = sig->utime;
239 times->stime = sig->stime;
240 times->sum_exec_runtime = sig->sum_sched_runtime;
241 241
242 rcu_read_lock(); 242 rcu_read_lock();
243 sighand = rcu_dereference(tsk->sighand); 243 /* make sure we can trust tsk->thread_group list */
244 if (!sighand) 244 if (!likely(pid_alive(tsk)))
245 goto out; 245 goto out;
246 246
247 sig = tsk->signal;
248
249 t = tsk; 247 t = tsk;
250 do { 248 do {
251 times->utime = cputime_add(times->utime, t->utime); 249 times->utime = cputime_add(times->utime, t->utime);
252 times->stime = cputime_add(times->stime, t->stime); 250 times->stime = cputime_add(times->stime, t->stime);
253 times->sum_exec_runtime += t->se.sum_exec_runtime; 251 times->sum_exec_runtime += t->se.sum_exec_runtime;
254 252 } while_each_thread(tsk, t);
255 t = next_thread(t);
256 } while (t != tsk);
257
258 times->utime = cputime_add(times->utime, sig->utime);
259 times->stime = cputime_add(times->stime, sig->stime);
260 times->sum_exec_runtime += sig->sum_sched_runtime;
261out: 253out:
262 rcu_read_unlock(); 254 rcu_read_unlock();
263} 255}
@@ -364,7 +356,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
364 } 356 }
365 } else { 357 } else {
366 read_lock(&tasklist_lock); 358 read_lock(&tasklist_lock);
367 if (thread_group_leader(p) && p->signal) { 359 if (thread_group_leader(p) && p->sighand) {
368 error = 360 error =
369 cpu_clock_sample_group(which_clock, 361 cpu_clock_sample_group(which_clock,
370 p, &rtn); 362 p, &rtn);
@@ -440,7 +432,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
440 432
441 if (likely(p != NULL)) { 433 if (likely(p != NULL)) {
442 read_lock(&tasklist_lock); 434 read_lock(&tasklist_lock);
443 if (unlikely(p->signal == NULL)) { 435 if (unlikely(p->sighand == NULL)) {
444 /* 436 /*
445 * We raced with the reaping of the task. 437 * We raced with the reaping of the task.
446 * The deletion should have cleared us off the list. 438 * The deletion should have cleared us off the list.
@@ -548,111 +540,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
548 cputime_gt(expires, new_exp); 540 cputime_gt(expires, new_exp);
549} 541}
550 542
551static inline int expires_le(cputime_t expires, cputime_t new_exp)
552{
553 return !cputime_eq(expires, cputime_zero) &&
554 cputime_le(expires, new_exp);
555}
556/* 543/*
557 * Insert the timer on the appropriate list before any timers that 544 * Insert the timer on the appropriate list before any timers that
558 * expire later. This must be called with the tasklist_lock held 545 * expire later. This must be called with the tasklist_lock held
559 * for reading, and interrupts disabled. 546 * for reading, interrupts disabled and p->sighand->siglock taken.
560 */ 547 */
561static void arm_timer(struct k_itimer *timer, union cpu_time_count now) 548static void arm_timer(struct k_itimer *timer)
562{ 549{
563 struct task_struct *p = timer->it.cpu.task; 550 struct task_struct *p = timer->it.cpu.task;
564 struct list_head *head, *listpos; 551 struct list_head *head, *listpos;
552 struct task_cputime *cputime_expires;
565 struct cpu_timer_list *const nt = &timer->it.cpu; 553 struct cpu_timer_list *const nt = &timer->it.cpu;
566 struct cpu_timer_list *next; 554 struct cpu_timer_list *next;
567 unsigned long i;
568 555
569 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 556 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
570 p->cpu_timers : p->signal->cpu_timers); 557 head = p->cpu_timers;
558 cputime_expires = &p->cputime_expires;
559 } else {
560 head = p->signal->cpu_timers;
561 cputime_expires = &p->signal->cputime_expires;
562 }
571 head += CPUCLOCK_WHICH(timer->it_clock); 563 head += CPUCLOCK_WHICH(timer->it_clock);
572 564
573 BUG_ON(!irqs_disabled());
574 spin_lock(&p->sighand->siglock);
575
576 listpos = head; 565 listpos = head;
577 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 566 list_for_each_entry(next, head, entry) {
578 list_for_each_entry(next, head, entry) { 567 if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
579 if (next->expires.sched > nt->expires.sched) 568 break;
580 break; 569 listpos = &next->entry;
581 listpos = &next->entry;
582 }
583 } else {
584 list_for_each_entry(next, head, entry) {
585 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
586 break;
587 listpos = &next->entry;
588 }
589 } 570 }
590 list_add(&nt->entry, listpos); 571 list_add(&nt->entry, listpos);
591 572
592 if (listpos == head) { 573 if (listpos == head) {
574 union cpu_time_count *exp = &nt->expires;
575
593 /* 576 /*
594 * We are the new earliest-expiring timer. 577 * We are the new earliest-expiring POSIX 1.b timer, hence
595 * If we are a thread timer, there can always 578 * need to update expiration cache. Take into account that
596 * be a process timer telling us to stop earlier. 579 * for process timers we share expiration cache with itimers
580 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
597 */ 581 */
598 582
599 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 583 switch (CPUCLOCK_WHICH(timer->it_clock)) {
600 union cpu_time_count *exp = &nt->expires; 584 case CPUCLOCK_PROF:
601 585 if (expires_gt(cputime_expires->prof_exp, exp->cpu))
602 switch (CPUCLOCK_WHICH(timer->it_clock)) { 586 cputime_expires->prof_exp = exp->cpu;
603 default: 587 break;
604 BUG(); 588 case CPUCLOCK_VIRT:
605 case CPUCLOCK_PROF: 589 if (expires_gt(cputime_expires->virt_exp, exp->cpu))
606 if (expires_gt(p->cputime_expires.prof_exp, 590 cputime_expires->virt_exp = exp->cpu;
607 exp->cpu)) 591 break;
608 p->cputime_expires.prof_exp = exp->cpu; 592 case CPUCLOCK_SCHED:
609 break; 593 if (cputime_expires->sched_exp == 0 ||
610 case CPUCLOCK_VIRT: 594 cputime_expires->sched_exp > exp->sched)
611 if (expires_gt(p->cputime_expires.virt_exp, 595 cputime_expires->sched_exp = exp->sched;
612 exp->cpu)) 596 break;
613 p->cputime_expires.virt_exp = exp->cpu;
614 break;
615 case CPUCLOCK_SCHED:
616 if (p->cputime_expires.sched_exp == 0 ||
617 p->cputime_expires.sched_exp > exp->sched)
618 p->cputime_expires.sched_exp =
619 exp->sched;
620 break;
621 }
622 } else {
623 struct signal_struct *const sig = p->signal;
624 union cpu_time_count *exp = &timer->it.cpu.expires;
625
626 /*
627 * For a process timer, set the cached expiration time.
628 */
629 switch (CPUCLOCK_WHICH(timer->it_clock)) {
630 default:
631 BUG();
632 case CPUCLOCK_VIRT:
633 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
634 exp->cpu))
635 break;
636 sig->cputime_expires.virt_exp = exp->cpu;
637 break;
638 case CPUCLOCK_PROF:
639 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
640 exp->cpu))
641 break;
642 i = sig->rlim[RLIMIT_CPU].rlim_cur;
643 if (i != RLIM_INFINITY &&
644 i <= cputime_to_secs(exp->cpu))
645 break;
646 sig->cputime_expires.prof_exp = exp->cpu;
647 break;
648 case CPUCLOCK_SCHED:
649 sig->cputime_expires.sched_exp = exp->sched;
650 break;
651 }
652 } 597 }
653 } 598 }
654
655 spin_unlock(&p->sighand->siglock);
656} 599}
657 600
658/* 601/*
@@ -660,7 +603,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
660 */ 603 */
661static void cpu_timer_fire(struct k_itimer *timer) 604static void cpu_timer_fire(struct k_itimer *timer)
662{ 605{
663 if (unlikely(timer->sigq == NULL)) { 606 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
607 /*
608 * User don't want any signal.
609 */
610 timer->it.cpu.expires.sched = 0;
611 } else if (unlikely(timer->sigq == NULL)) {
664 /* 612 /*
665 * This a special case for clock_nanosleep, 613 * This a special case for clock_nanosleep,
666 * not a normal timer from sys_timer_create. 614 * not a normal timer from sys_timer_create.
@@ -721,7 +669,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
721 struct itimerspec *new, struct itimerspec *old) 669 struct itimerspec *new, struct itimerspec *old)
722{ 670{
723 struct task_struct *p = timer->it.cpu.task; 671 struct task_struct *p = timer->it.cpu.task;
724 union cpu_time_count old_expires, new_expires, val; 672 union cpu_time_count old_expires, new_expires, old_incr, val;
725 int ret; 673 int ret;
726 674
727 if (unlikely(p == NULL)) { 675 if (unlikely(p == NULL)) {
@@ -736,10 +684,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
736 read_lock(&tasklist_lock); 684 read_lock(&tasklist_lock);
737 /* 685 /*
738 * We need the tasklist_lock to protect against reaping that 686 * We need the tasklist_lock to protect against reaping that
739 * clears p->signal. If p has just been reaped, we can no 687 * clears p->sighand. If p has just been reaped, we can no
740 * longer get any information about it at all. 688 * longer get any information about it at all.
741 */ 689 */
742 if (unlikely(p->signal == NULL)) { 690 if (unlikely(p->sighand == NULL)) {
743 read_unlock(&tasklist_lock); 691 read_unlock(&tasklist_lock);
744 put_task_struct(p); 692 put_task_struct(p);
745 timer->it.cpu.task = NULL; 693 timer->it.cpu.task = NULL;
@@ -752,6 +700,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
752 BUG_ON(!irqs_disabled()); 700 BUG_ON(!irqs_disabled());
753 701
754 ret = 0; 702 ret = 0;
703 old_incr = timer->it.cpu.incr;
755 spin_lock(&p->sighand->siglock); 704 spin_lock(&p->sighand->siglock);
756 old_expires = timer->it.cpu.expires; 705 old_expires = timer->it.cpu.expires;
757 if (unlikely(timer->it.cpu.firing)) { 706 if (unlikely(timer->it.cpu.firing)) {
@@ -759,7 +708,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
759 ret = TIMER_RETRY; 708 ret = TIMER_RETRY;
760 } else 709 } else
761 list_del_init(&timer->it.cpu.entry); 710 list_del_init(&timer->it.cpu.entry);
762 spin_unlock(&p->sighand->siglock);
763 711
764 /* 712 /*
765 * We need to sample the current value to convert the new 713 * We need to sample the current value to convert the new
@@ -813,6 +761,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
813 * disable this firing since we are already reporting 761 * disable this firing since we are already reporting
814 * it as an overrun (thanks to bump_cpu_timer above). 762 * it as an overrun (thanks to bump_cpu_timer above).
815 */ 763 */
764 spin_unlock(&p->sighand->siglock);
816 read_unlock(&tasklist_lock); 765 read_unlock(&tasklist_lock);
817 goto out; 766 goto out;
818 } 767 }
@@ -828,11 +777,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
828 */ 777 */
829 timer->it.cpu.expires = new_expires; 778 timer->it.cpu.expires = new_expires;
830 if (new_expires.sched != 0 && 779 if (new_expires.sched != 0 &&
831 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
832 cpu_time_before(timer->it_clock, val, new_expires)) { 780 cpu_time_before(timer->it_clock, val, new_expires)) {
833 arm_timer(timer, val); 781 arm_timer(timer);
834 } 782 }
835 783
784 spin_unlock(&p->sighand->siglock);
836 read_unlock(&tasklist_lock); 785 read_unlock(&tasklist_lock);
837 786
838 /* 787 /*
@@ -853,7 +802,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
853 timer->it_overrun = -1; 802 timer->it_overrun = -1;
854 803
855 if (new_expires.sched != 0 && 804 if (new_expires.sched != 0 &&
856 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
857 !cpu_time_before(timer->it_clock, val, new_expires)) { 805 !cpu_time_before(timer->it_clock, val, new_expires)) {
858 /* 806 /*
859 * The designated time already passed, so we notify 807 * The designated time already passed, so we notify
@@ -867,7 +815,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
867 out: 815 out:
868 if (old) { 816 if (old) {
869 sample_to_timespec(timer->it_clock, 817 sample_to_timespec(timer->it_clock,
870 timer->it.cpu.incr, &old->it_interval); 818 old_incr, &old->it_interval);
871 } 819 }
872 return ret; 820 return ret;
873} 821}
@@ -908,7 +856,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
908 clear_dead = p->exit_state; 856 clear_dead = p->exit_state;
909 } else { 857 } else {
910 read_lock(&tasklist_lock); 858 read_lock(&tasklist_lock);
911 if (unlikely(p->signal == NULL)) { 859 if (unlikely(p->sighand == NULL)) {
912 /* 860 /*
913 * The process has been reaped. 861 * The process has been reaped.
914 * We can't even collect a sample any more. 862 * We can't even collect a sample any more.
@@ -927,25 +875,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 read_unlock(&tasklist_lock); 875 read_unlock(&tasklist_lock);
928 } 876 }
929 877
930 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
931 if (timer->it.cpu.incr.sched == 0 &&
932 cpu_time_before(timer->it_clock,
933 timer->it.cpu.expires, now)) {
934 /*
935 * Do-nothing timer expired and has no reload,
936 * so it's as if it was never set.
937 */
938 timer->it.cpu.expires.sched = 0;
939 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
940 return;
941 }
942 /*
943 * Account for any expirations and reloads that should
944 * have happened.
945 */
946 bump_cpu_timer(timer, now);
947 }
948
949 if (unlikely(clear_dead)) { 878 if (unlikely(clear_dead)) {
950 /* 879 /*
951 * We've noticed that the thread is dead, but 880 * We've noticed that the thread is dead, but
@@ -1066,16 +995,9 @@ static void stop_process_timers(struct signal_struct *sig)
1066 struct thread_group_cputimer *cputimer = &sig->cputimer; 995 struct thread_group_cputimer *cputimer = &sig->cputimer;
1067 unsigned long flags; 996 unsigned long flags;
1068 997
1069 if (!cputimer->running)
1070 return;
1071
1072 spin_lock_irqsave(&cputimer->lock, flags); 998 spin_lock_irqsave(&cputimer->lock, flags);
1073 cputimer->running = 0; 999 cputimer->running = 0;
1074 spin_unlock_irqrestore(&cputimer->lock, flags); 1000 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1079} 1001}
1080 1002
1081static u32 onecputick; 1003static u32 onecputick;
@@ -1112,6 +1034,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1112 } 1034 }
1113} 1035}
1114 1036
1037/**
1038 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1039 *
1040 * @cputime: The struct to compare.
1041 *
1042 * Checks @cputime to see if all fields are zero. Returns true if all fields
1043 * are zero, false if any field is nonzero.
1044 */
1045static inline int task_cputime_zero(const struct task_cputime *cputime)
1046{
1047 if (cputime_eq(cputime->utime, cputime_zero) &&
1048 cputime_eq(cputime->stime, cputime_zero) &&
1049 cputime->sum_exec_runtime == 0)
1050 return 1;
1051 return 0;
1052}
1053
1115/* 1054/*
1116 * Check for any per-thread CPU timers that have fired and move them 1055 * Check for any per-thread CPU timers that have fired and move them
1117 * off the tsk->*_timers list onto the firing list. Per-thread timers 1056 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1129,19 +1068,6 @@ static void check_process_timers(struct task_struct *tsk,
1129 unsigned long soft; 1068 unsigned long soft;
1130 1069
1131 /* 1070 /*
1132 * Don't sample the current process CPU clocks if there are no timers.
1133 */
1134 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1135 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1136 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1140 stop_process_timers(sig);
1141 return;
1142 }
1143
1144 /*
1145 * Collect the current process totals. 1071 * Collect the current process totals.
1146 */ 1072 */
1147 thread_group_cputimer(tsk, &cputime); 1073 thread_group_cputimer(tsk, &cputime);
@@ -1230,18 +1156,11 @@ static void check_process_timers(struct task_struct *tsk,
1230 } 1156 }
1231 } 1157 }
1232 1158
1233 if (!cputime_eq(prof_expires, cputime_zero) && 1159 sig->cputime_expires.prof_exp = prof_expires;
1234 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || 1160 sig->cputime_expires.virt_exp = virt_expires;
1235 cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) 1161 sig->cputime_expires.sched_exp = sched_expires;
1236 sig->cputime_expires.prof_exp = prof_expires; 1162 if (task_cputime_zero(&sig->cputime_expires))
1237 if (!cputime_eq(virt_expires, cputime_zero) && 1163 stop_process_timers(sig);
1238 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1239 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1240 sig->cputime_expires.virt_exp = virt_expires;
1241 if (sched_expires != 0 &&
1242 (sig->cputime_expires.sched_exp == 0 ||
1243 sig->cputime_expires.sched_exp > sched_expires))
1244 sig->cputime_expires.sched_exp = sched_expires;
1245} 1164}
1246 1165
1247/* 1166/*
@@ -1270,9 +1189,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1270 goto out; 1189 goto out;
1271 } 1190 }
1272 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1191 read_lock(&tasklist_lock); /* arm_timer needs it. */
1192 spin_lock(&p->sighand->siglock);
1273 } else { 1193 } else {
1274 read_lock(&tasklist_lock); 1194 read_lock(&tasklist_lock);
1275 if (unlikely(p->signal == NULL)) { 1195 if (unlikely(p->sighand == NULL)) {
1276 /* 1196 /*
1277 * The process has been reaped. 1197 * The process has been reaped.
1278 * We can't even collect a sample any more. 1198 * We can't even collect a sample any more.
@@ -1290,6 +1210,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1290 clear_dead_task(timer, now); 1210 clear_dead_task(timer, now);
1291 goto out_unlock; 1211 goto out_unlock;
1292 } 1212 }
1213 spin_lock(&p->sighand->siglock);
1293 cpu_timer_sample_group(timer->it_clock, p, &now); 1214 cpu_timer_sample_group(timer->it_clock, p, &now);
1294 bump_cpu_timer(timer, now); 1215 bump_cpu_timer(timer, now);
1295 /* Leave the tasklist_lock locked for the call below. */ 1216 /* Leave the tasklist_lock locked for the call below. */
@@ -1298,7 +1219,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1298 /* 1219 /*
1299 * Now re-arm for the new expiry time. 1220 * Now re-arm for the new expiry time.
1300 */ 1221 */
1301 arm_timer(timer, now); 1222 BUG_ON(!irqs_disabled());
1223 arm_timer(timer);
1224 spin_unlock(&p->sighand->siglock);
1302 1225
1303out_unlock: 1226out_unlock:
1304 read_unlock(&tasklist_lock); 1227 read_unlock(&tasklist_lock);
@@ -1310,23 +1233,6 @@ out:
1310} 1233}
1311 1234
1312/** 1235/**
1313 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1314 *
1315 * @cputime: The struct to compare.
1316 *
1317 * Checks @cputime to see if all fields are zero. Returns true if all fields
1318 * are zero, false if any field is nonzero.
1319 */
1320static inline int task_cputime_zero(const struct task_cputime *cputime)
1321{
1322 if (cputime_eq(cputime->utime, cputime_zero) &&
1323 cputime_eq(cputime->stime, cputime_zero) &&
1324 cputime->sum_exec_runtime == 0)
1325 return 1;
1326 return 0;
1327}
1328
1329/**
1330 * task_cputime_expired - Compare two task_cputime entities. 1236 * task_cputime_expired - Compare two task_cputime entities.
1331 * 1237 *
1332 * @sample: The task_cputime structure to be checked for expiration. 1238 * @sample: The task_cputime structure to be checked for expiration.
@@ -1366,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1366{ 1272{
1367 struct signal_struct *sig; 1273 struct signal_struct *sig;
1368 1274
1369 /* tsk == current, ensure it is safe to use ->signal/sighand */
1370 if (unlikely(tsk->exit_state))
1371 return 0;
1372
1373 if (!task_cputime_zero(&tsk->cputime_expires)) { 1275 if (!task_cputime_zero(&tsk->cputime_expires)) {
1374 struct task_cputime task_sample = { 1276 struct task_cputime task_sample = {
1375 .utime = tsk->utime, 1277 .utime = tsk->utime,
@@ -1382,15 +1284,18 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1382 } 1284 }
1383 1285
1384 sig = tsk->signal; 1286 sig = tsk->signal;
1385 if (!task_cputime_zero(&sig->cputime_expires)) { 1287 if (sig->cputimer.running) {
1386 struct task_cputime group_sample; 1288 struct task_cputime group_sample;
1387 1289
1388 thread_group_cputimer(tsk, &group_sample); 1290 spin_lock(&sig->cputimer.lock);
1291 group_sample = sig->cputimer.cputime;
1292 spin_unlock(&sig->cputimer.lock);
1293
1389 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1294 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1390 return 1; 1295 return 1;
1391 } 1296 }
1392 1297
1393 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; 1298 return 0;
1394} 1299}
1395 1300
1396/* 1301/*
@@ -1402,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1402{ 1307{
1403 LIST_HEAD(firing); 1308 LIST_HEAD(firing);
1404 struct k_itimer *timer, *next; 1309 struct k_itimer *timer, *next;
1310 unsigned long flags;
1405 1311
1406 BUG_ON(!irqs_disabled()); 1312 BUG_ON(!irqs_disabled());
1407 1313
@@ -1412,14 +1318,20 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1412 if (!fastpath_timer_check(tsk)) 1318 if (!fastpath_timer_check(tsk))
1413 return; 1319 return;
1414 1320
1415 spin_lock(&tsk->sighand->siglock); 1321 if (!lock_task_sighand(tsk, &flags))
1322 return;
1416 /* 1323 /*
1417 * Here we take off tsk->signal->cpu_timers[N] and 1324 * Here we take off tsk->signal->cpu_timers[N] and
1418 * tsk->cpu_timers[N] all the timers that are firing, and 1325 * tsk->cpu_timers[N] all the timers that are firing, and
1419 * put them on the firing list. 1326 * put them on the firing list.
1420 */ 1327 */
1421 check_thread_timers(tsk, &firing); 1328 check_thread_timers(tsk, &firing);
1422 check_process_timers(tsk, &firing); 1329 /*
1330 * If there are any active process wide timers (POSIX 1.b, itimers,
1331 * RLIMIT_CPU) cputimer must be running.
1332 */
1333 if (tsk->signal->cputimer.running)
1334 check_process_timers(tsk, &firing);
1423 1335
1424 /* 1336 /*
1425 * We must release these locks before taking any timer's lock. 1337 * We must release these locks before taking any timer's lock.
@@ -1429,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1429 * that gets the timer lock before we do will give it up and 1341 * that gets the timer lock before we do will give it up and
1430 * spin until we've taken care of that timer below. 1342 * spin until we've taken care of that timer below.
1431 */ 1343 */
1432 spin_unlock(&tsk->sighand->siglock); 1344 unlock_task_sighand(tsk, &flags);
1433 1345
1434 /* 1346 /*
1435 * Now that all the timers on our list have the firing flag, 1347 * Now that all the timers on our list have the firing flag,
@@ -1456,21 +1368,23 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1456} 1368}
1457 1369
1458/* 1370/*
1459 * Set one of the process-wide special case CPU timers. 1371 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1460 * The tsk->sighand->siglock must be held by the caller. 1372 * The tsk->sighand->siglock must be held by the caller.
1461 * The *newval argument is relative and we update it to be absolute, *oldval
1462 * is absolute and we update it to be relative.
1463 */ 1373 */
1464void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1374void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1465 cputime_t *newval, cputime_t *oldval) 1375 cputime_t *newval, cputime_t *oldval)
1466{ 1376{
1467 union cpu_time_count now; 1377 union cpu_time_count now;
1468 struct list_head *head;
1469 1378
1470 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1379 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1471 cpu_timer_sample_group(clock_idx, tsk, &now); 1380 cpu_timer_sample_group(clock_idx, tsk, &now);
1472 1381
1473 if (oldval) { 1382 if (oldval) {
1383 /*
1384 * We are setting itimer. The *oldval is absolute and we update
1385 * it to be relative, *newval argument is relative and we update
1386 * it to be absolute.
1387 */
1474 if (!cputime_eq(*oldval, cputime_zero)) { 1388 if (!cputime_eq(*oldval, cputime_zero)) {
1475 if (cputime_le(*oldval, now.cpu)) { 1389 if (cputime_le(*oldval, now.cpu)) {
1476 /* Just about to fire. */ 1390 /* Just about to fire. */
@@ -1483,33 +1397,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1483 if (cputime_eq(*newval, cputime_zero)) 1397 if (cputime_eq(*newval, cputime_zero))
1484 return; 1398 return;
1485 *newval = cputime_add(*newval, now.cpu); 1399 *newval = cputime_add(*newval, now.cpu);
1486
1487 /*
1488 * If the RLIMIT_CPU timer will expire before the
1489 * ITIMER_PROF timer, we have nothing else to do.
1490 */
1491 if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
1492 < cputime_to_secs(*newval))
1493 return;
1494 } 1400 }
1495 1401
1496 /* 1402 /*
1497 * Check whether there are any process timers already set to fire 1403 * Update expiration cache if we are the earliest timer, or eventually
1498 * before this one. If so, we don't have anything more to do. 1404 * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
1499 */ 1405 */
1500 head = &tsk->signal->cpu_timers[clock_idx]; 1406 switch (clock_idx) {
1501 if (list_empty(head) || 1407 case CPUCLOCK_PROF:
1502 cputime_ge(list_first_entry(head, 1408 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1503 struct cpu_timer_list, entry)->expires.cpu,
1504 *newval)) {
1505 switch (clock_idx) {
1506 case CPUCLOCK_PROF:
1507 tsk->signal->cputime_expires.prof_exp = *newval; 1409 tsk->signal->cputime_expires.prof_exp = *newval;
1508 break; 1410 break;
1509 case CPUCLOCK_VIRT: 1411 case CPUCLOCK_VIRT:
1412 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1510 tsk->signal->cputime_expires.virt_exp = *newval; 1413 tsk->signal->cputime_expires.virt_exp = *newval;
1511 break; 1414 break;
1512 }
1513 } 1415 }
1514} 1416}
1515 1417
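
Annotation (not part of the patch): the rework above leans on the expiration caches — tsk->cputime_expires for per-thread timers, and tsk->signal->cputime_expires plus the cputimer.running flag for process-wide ones — so that run_posix_cpu_timers() can bail out with a handful of compares per tick and only walks the timer lists once something cached has actually expired. A rough sketch of the per-thread half of that check, paraphrasing the file-static helpers used by fastpath_timer_check() (helper name hypothetical):

        /* Sketch: sample the current thread's clocks cheaply and compare them
         * against the cached earliest expirations; no timer lists are touched. */
        static int example_thread_timers_due(struct task_struct *tsk)
        {
                struct task_cputime sample;

                if (task_cputime_zero(&tsk->cputime_expires))
                        return 0;               /* no per-thread timers armed */

                sample.utime = tsk->utime;
                sample.stime = tsk->stime;
                sample.sum_exec_runtime = tsk->se.sum_exec_runtime;

                return task_cputime_expired(&sample, &tsk->cputime_expires);
        }
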
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 00d1fda58ab6..9ca4973f736d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -559,19 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
559 new_timer->it_id = (timer_t) new_timer_id; 559 new_timer->it_id = (timer_t) new_timer_id;
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
563 if (error)
564 goto out;
565 562
566 /*
567 * return the timer_id now. The next step is hard to
568 * back out if there is an error.
569 */
570 if (copy_to_user(created_timer_id,
571 &new_timer_id, sizeof (new_timer_id))) {
572 error = -EFAULT;
573 goto out;
574 }
575 if (timer_event_spec) { 563 if (timer_event_spec) {
576 if (copy_from_user(&event, timer_event_spec, sizeof (event))) { 564 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
577 error = -EFAULT; 565 error = -EFAULT;
@@ -597,6 +585,16 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 new_timer->sigq->info.si_tid = new_timer->it_id; 585 new_timer->sigq->info.si_tid = new_timer->it_id;
598 new_timer->sigq->info.si_code = SI_TIMER; 586 new_timer->sigq->info.si_code = SI_TIMER;
599 587
588 if (copy_to_user(created_timer_id,
589 &new_timer_id, sizeof (new_timer_id))) {
590 error = -EFAULT;
591 goto out;
592 }
593
594 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
595 if (error)
596 goto out;
597
600 spin_lock_irq(&current->sighand->siglock); 598 spin_lock_irq(&current->sighand->siglock);
601 new_timer->it_signal = current->signal; 599 new_timer->it_signal = current->signal;
602 list_add(&new_timer->list, &current->signal->posix_timers); 600 list_add(&new_timer->list, &current->signal->posix_timers);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5c36ea9d55d2..ca6066a6952e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG
99 depends on PM_ADVANCED_DEBUG 99 depends on PM_ADVANCED_DEBUG
100 default n 100 default n
101 101
102config SUSPEND_NVS
103 bool
104
102config SUSPEND 105config SUSPEND
103 bool "Suspend to RAM and standby" 106 bool "Suspend to RAM and standby"
104 depends on PM && ARCH_SUSPEND_POSSIBLE 107 depends on PM && ARCH_SUSPEND_POSSIBLE
108 select SUSPEND_NVS if HAS_IOMEM
105 default y 109 default y
106 ---help--- 110 ---help---
107 Allow the system to enter sleep states in which main memory is 111 Allow the system to enter sleep states in which main memory is
@@ -130,13 +134,10 @@ config SUSPEND_FREEZER
130 134
131 Turning OFF this setting is NOT recommended! If in doubt, say Y. 135 Turning OFF this setting is NOT recommended! If in doubt, say Y.
132 136
133config HIBERNATION_NVS
134 bool
135
136config HIBERNATION 137config HIBERNATION
137 bool "Hibernation (aka 'suspend to disk')" 138 bool "Hibernation (aka 'suspend to disk')"
138 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
139 select HIBERNATION_NVS if HAS_IOMEM 140 select SUSPEND_NVS if HAS_IOMEM
140 ---help--- 141 ---help---
141 Enable the suspend to disk (STD) functionality, which is usually 142 Enable the suspend to disk (STD) functionality, which is usually
142 called "hibernation" in user interfaces. STD checkpoints the 143 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 43191815f874..f9063c6b185d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 12 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
13 14
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000000..83bbc7c02df9
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
1/*
2 * This file provides functions for block I/O operations on swap/file.
3 *
4 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
5 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/bio.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14
15#include "power.h"
16
17/**
18 * submit - submit BIO request.
19 * @rw: READ or WRITE.
20 * @off physical offset of page.
21 * @page: page we're reading or writing.
 22 * @bio_chain: list of pending bios (for async reading)
23 *
24 * Straight from the textbook - allocate and initialize the bio.
25 * If we're reading, make sure the page is marked as dirty.
26 * Then submit it and, if @bio_chain == NULL, wait.
27 */
28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain)
30{
31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
32 struct bio *bio;
33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector;
36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read;
38
39 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
40 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
41 (unsigned long long)sector);
42 bio_put(bio);
43 return -EFAULT;
44 }
45
46 lock_page(page);
47 bio_get(bio);
48
49 if (bio_chain == NULL) {
50 submit_bio(bio_rw, bio);
51 wait_on_page_locked(page);
52 if (rw == READ)
53 bio_set_pages_dirty(bio);
54 bio_put(bio);
55 } else {
56 if (rw == READ)
57 get_page(page); /* These pages are freed later */
58 bio->bi_private = *bio_chain;
59 *bio_chain = bio;
60 submit_bio(bio_rw, bio);
61 }
62 return 0;
63}
64
65int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
66{
67 return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
68 virt_to_page(addr), bio_chain);
69}
70
71int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
72{
73 return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
74 virt_to_page(addr), bio_chain);
75}
76
77int hib_wait_on_bio_chain(struct bio **bio_chain)
78{
79 struct bio *bio;
80 struct bio *next_bio;
81 int ret = 0;
82
83 if (bio_chain == NULL)
84 return 0;
85
86 bio = *bio_chain;
87 if (bio == NULL)
88 return 0;
89 while (bio) {
90 struct page *page;
91
92 next_bio = bio->bi_private;
93 page = bio->bi_io_vec[0].bv_page;
94 wait_on_page_locked(page);
95 if (!PageUptodate(page) || PageError(page))
96 ret = -EIO;
97 put_page(page);
98 bio_put(bio);
99 bio = next_bio;
100 }
101 *bio_chain = NULL;
102 return ret;
103}
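The hib_bio_* helpers above keep the old submit() semantics: with a NULL bio chain the call is synchronous, otherwise the bio is linked into the chain and completion is collected later with hib_wait_on_bio_chain(). A minimal usage sketch (error handling trimmed; the page offsets 0 and 1 are arbitrary, the buffers are assumed page-sized, and hib_resume_bdev is assumed already open):

static int example_read_pages(void *sync_buf, void *async_buf)
{
	struct bio *bio_chain = NULL;
	int error;

	/* Synchronous read: submit() waits on the page lock itself. */
	error = hib_bio_read_page(0, sync_buf, NULL);
	if (error)
		return error;

	/* Asynchronous read: queued on the chain and submitted immediately. */
	error = hib_bio_read_page(1, async_buf, &bio_chain);
	if (error)
		return error;

	/* Wait for every bio on the chain and collect any I/O errors. */
	return hib_wait_on_bio_chain(&bio_chain);
}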
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..8dc31e02ae12 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
277 goto Enable_irqs; 277 goto Enable_irqs;
278 } 278 }
279 279
280 if (hibernation_test(TEST_CORE)) 280 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
281 goto Power_up; 281 goto Power_up;
282 282
283 in_suspend = 1; 283 in_suspend = 1;
@@ -288,8 +288,10 @@ static int create_image(int platform_mode)
288 error); 288 error);
289 /* Restore control flow magically appears here */ 289 /* Restore control flow magically appears here */
290 restore_processor_state(); 290 restore_processor_state();
291 if (!in_suspend) 291 if (!in_suspend) {
292 events_check_enabled = false;
292 platform_leave(platform_mode); 293 platform_leave(platform_mode);
294 }
293 295
294 Power_up: 296 Power_up:
295 sysdev_resume(); 297 sysdev_resume();
@@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode)
328 330
329 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
330 if (error) 332 if (error)
331 return error; 333 goto Close;
332 334
333 /* Preallocate image memory before shutting down devices. */ 335 /* Preallocate image memory before shutting down devices. */
334 error = hibernate_preallocate_memory(); 336 error = hibernate_preallocate_memory();
@@ -511,18 +513,24 @@ int hibernation_platform_enter(void)
511 513
512 local_irq_disable(); 514 local_irq_disable();
513 sysdev_suspend(PMSG_HIBERNATE); 515 sysdev_suspend(PMSG_HIBERNATE);
516 if (!pm_check_wakeup_events()) {
517 error = -EAGAIN;
518 goto Power_up;
519 }
520
514 hibernation_ops->enter(); 521 hibernation_ops->enter();
515 /* We should never get here */ 522 /* We should never get here */
516 while (1); 523 while (1);
517 524
518 /* 525 Power_up:
519 * We don't need to reenable the nonboot CPUs or resume consoles, since 526 sysdev_resume();
520 * the system is going to be halted anyway. 527 local_irq_enable();
521 */ 528 enable_nonboot_cpus();
529
522 Platform_finish: 530 Platform_finish:
523 hibernation_ops->finish(); 531 hibernation_ops->finish();
524 532
525 dpm_suspend_noirq(PMSG_RESTORE); 533 dpm_resume_noirq(PMSG_RESTORE);
526 534
527 Resume_devices: 535 Resume_devices:
528 entering_platform_hibernation = false; 536 entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..62b0bc6e4983 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
204 204
205power_attr(state); 205power_attr(state);
206 206
207#ifdef CONFIG_PM_SLEEP
208/*
209 * The 'wakeup_count' attribute, along with the functions defined in
210 * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
211 * handled in a non-racy way.
212 *
213 * If a wakeup event occurs when the system is in a sleep state, it simply is
214 * woken up. In turn, if an event that would wake the system up from a sleep
215 * state occurs when it is undergoing a transition to that sleep state, the
216 * transition should be aborted. Moreover, if such an event occurs when the
217 * system is in the working state, an attempt to start a transition to the
 218 * given sleep state should fail for a certain period after the detection of
219 * the event. Using the 'state' attribute alone is not sufficient to satisfy
220 * these requirements, because a wakeup event may occur exactly when 'state'
221 * is being written to and may be delivered to user space right before it is
222 * frozen, so the event will remain only partially processed until the system is
223 * woken up by another event. In particular, it won't cause the transition to
224 * a sleep state to be aborted.
225 *
226 * This difficulty may be overcome if user space uses 'wakeup_count' before
227 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to
 230 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to.
234 */
235
236static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr,
238 char *buf)
239{
240 unsigned long val;
241
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
243}
244
245static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr,
247 const char *buf, size_t n)
248{
249 unsigned long val;
250
251 if (sscanf(buf, "%lu", &val) == 1) {
252 if (pm_save_wakeup_count(val))
253 return n;
254 }
255 return -EINVAL;
256}
257
258power_attr(wakeup_count);
259#endif /* CONFIG_PM_SLEEP */
260
207#ifdef CONFIG_PM_TRACE 261#ifdef CONFIG_PM_TRACE
208int pm_trace_enabled; 262int pm_trace_enabled;
209 263
@@ -236,6 +290,7 @@ static struct attribute * g[] = {
236#endif 290#endif
237#ifdef CONFIG_PM_SLEEP 291#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr, 292 &pm_async_attr.attr,
293 &wakeup_count_attr.attr,
239#ifdef CONFIG_PM_DEBUG 294#ifdef CONFIG_PM_DEBUG
240 &pm_test_attr.attr, 295 &pm_test_attr.attr,
241#endif 296#endif
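The handshake described in the wakeup_count comment above can be followed from user space with plain sysfs I/O. A self-contained sketch of that protocol (the sysfs paths and the "mem" state are the standard ones; any user-space pre-suspend work goes where indicated):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Returns 0 if the suspend was attempted, -1 if it was aborted because a
 * wakeup event arrived between reading and writing wakeup_count.
 */
int suspend_with_wakeup_check(void)
{
	char buf[32];
	ssize_t n;
	int fd;

	fd = open("/sys/power/wakeup_count", O_RDWR);
	if (fd < 0)
		return -1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n <= 0) {
		close(fd);
		return -1;
	}
	buf[n] = '\0';

	/* ... user-space preparations for the transition go here ... */

	if (write(fd, buf, strlen(buf)) < 0) {
		close(fd);		/* wakeup event detected, do not suspend */
		return -1;
	}
	close(fd);

	fd = open("/sys/power/state", O_WRONLY);
	if (fd < 0)
		return -1;
	n = write(fd, "mem", 3);
	close(fd);
	return n == 3 ? 0 : -1;
}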
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c
index fdcad9ed5a7b..1836db60bbb6 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/nvs.c
@@ -15,7 +15,7 @@
15 15
16/* 16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during 17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * hibernation and to restore the contents of this memory during the subsequent 18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that. 19 * resume. The code below implements a mechanism allowing us to do that.
20 */ 20 */
21 21
@@ -30,7 +30,7 @@ struct nvs_page {
30static LIST_HEAD(nvs_list); 30static LIST_HEAD(nvs_list);
31 31
32/** 32/**
33 * hibernate_nvs_register - register platform NVS memory region to save 33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region 34 * @start - physical address of the region
35 * @size - size of the region 35 * @size - size of the region
36 * 36 *
@@ -38,7 +38,7 @@ static LIST_HEAD(nvs_list);
38 * things so that the data from page-aligned addresses in this region will 38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages. 39 * be copied into separate RAM pages.
40 */ 40 */
41int hibernate_nvs_register(unsigned long start, unsigned long size) 41int suspend_nvs_register(unsigned long start, unsigned long size)
42{ 42{
43 struct nvs_page *entry, *next; 43 struct nvs_page *entry, *next;
44 44
@@ -68,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size)
68} 68}
69 69
70/** 70/**
71 * hibernate_nvs_free - free data pages allocated for saving NVS regions 71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */ 72 */
73void hibernate_nvs_free(void) 73void suspend_nvs_free(void)
74{ 74{
75 struct nvs_page *entry; 75 struct nvs_page *entry;
76 76
@@ -86,16 +86,16 @@ void hibernate_nvs_free(void)
86} 86}
87 87
88/** 88/**
89 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions 89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */ 90 */
91int hibernate_nvs_alloc(void) 91int suspend_nvs_alloc(void)
92{ 92{
93 struct nvs_page *entry; 93 struct nvs_page *entry;
94 94
95 list_for_each_entry(entry, &nvs_list, node) { 95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL); 96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) { 97 if (!entry->data) {
98 hibernate_nvs_free(); 98 suspend_nvs_free();
99 return -ENOMEM; 99 return -ENOMEM;
100 } 100 }
101 } 101 }
@@ -103,9 +103,9 @@ int hibernate_nvs_alloc(void)
103} 103}
104 104
105/** 105/**
106 * hibernate_nvs_save - save NVS memory regions 106 * suspend_nvs_save - save NVS memory regions
107 */ 107 */
108void hibernate_nvs_save(void) 108void suspend_nvs_save(void)
109{ 109{
110 struct nvs_page *entry; 110 struct nvs_page *entry;
111 111
@@ -119,12 +119,12 @@ void hibernate_nvs_save(void)
119} 119}
120 120
121/** 121/**
122 * hibernate_nvs_restore - restore NVS memory regions 122 * suspend_nvs_restore - restore NVS memory regions
123 * 123 *
124 * This function is going to be called with interrupts disabled, so it 124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region. 125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */ 126 */
127void hibernate_nvs_restore(void) 127void suspend_nvs_restore(void)
128{ 128{
129 struct nvs_page *entry; 129 struct nvs_page *entry;
130 130
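With the rename the NVS helpers are usable from ordinary suspend as well as hibernation. A rough sketch of the intended call order for a platform with NVS memory, modelled on the function comments above (the platform_nvs_* wrappers are hypothetical, the declarations are assumed to come from <linux/suspend.h>, and error handling is trimmed):

#include <linux/suspend.h>

/* Once at boot: tell the core which physical region must be preserved. */
static int __init platform_nvs_init(unsigned long start, unsigned long size)
{
	return suspend_nvs_register(start, size);
}

/* Before entering the sleep state. */
static int platform_nvs_prepare(void)
{
	int error = suspend_nvs_alloc();	/* backing RAM pages, may -ENOMEM */

	if (!error)
		suspend_nvs_save();		/* copy NVS contents into RAM */
	return error;
}

/* Early in resume, while interrupts are still disabled (see the comment on
 * suspend_nvs_restore() above).
 */
static void platform_nvs_resume(void)
{
	suspend_nvs_restore();
	suspend_nvs_free();
}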
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46c5a26630a3..006270fe382d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void);
97 */ 97 */
98 98
99struct snapshot_handle { 99struct snapshot_handle {
100 loff_t offset; /* number of the last byte ready for reading
101 * or writing in the sequence
102 */
103 unsigned int cur; /* number of the block of PAGE_SIZE bytes the 100 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
104 * next operation will refer to (ie. current) 101 * next operation will refer to (ie. current)
105 */ 102 */
106 unsigned int cur_offset; /* offset with respect to the current
107 * block (for the next operation)
108 */
109 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
110 * was the current one previously
111 */
112 void *buffer; /* address of the block to read from 103 void *buffer; /* address of the block to read from
113 * or write to 104 * or write to
114 */ 105 */
115 unsigned int buf_offset; /* location to read from or write to,
116 * given as a displacement from 'buffer'
117 */
118 int sync_read; /* Set to one to notify the caller of 106 int sync_read; /* Set to one to notify the caller of
119 * snapshot_write_next() that it may 107 * snapshot_write_next() that it may
120 * need to call wait_on_bio_chain() 108 * need to call wait_on_bio_chain()
@@ -125,12 +113,12 @@ struct snapshot_handle {
125 * snapshot_read_next()/snapshot_write_next() is allowed to 113 * snapshot_read_next()/snapshot_write_next() is allowed to
126 * read/write data after the function returns 114 * read/write data after the function returns
127 */ 115 */
128#define data_of(handle) ((handle).buffer + (handle).buf_offset) 116#define data_of(handle) ((handle).buffer)
129 117
130extern unsigned int snapshot_additional_pages(struct zone *zone); 118extern unsigned int snapshot_additional_pages(struct zone *zone);
131extern unsigned long snapshot_get_image_size(void); 119extern unsigned long snapshot_get_image_size(void);
132extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 120extern int snapshot_read_next(struct snapshot_handle *handle);
133extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 121extern int snapshot_write_next(struct snapshot_handle *handle);
134extern void snapshot_write_finalize(struct snapshot_handle *handle); 122extern void snapshot_write_finalize(struct snapshot_handle *handle);
135extern int snapshot_image_loaded(struct snapshot_handle *handle); 123extern int snapshot_image_loaded(struct snapshot_handle *handle);
136 124
@@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p);
154extern int swsusp_write(unsigned int flags); 142extern int swsusp_write(unsigned int flags);
155extern void swsusp_close(fmode_t); 143extern void swsusp_close(fmode_t);
156 144
145/* kernel/power/block_io.c */
146extern struct block_device *hib_resume_bdev;
147
148extern int hib_bio_read_page(pgoff_t page_off, void *addr,
149 struct bio **bio_chain);
150extern int hib_bio_write_page(pgoff_t page_off, void *addr,
151 struct bio **bio_chain);
152extern int hib_wait_on_bio_chain(struct bio **bio_chain);
153
157struct timeval; 154struct timeval;
158/* kernel/power/swsusp.c */ 155/* kernel/power/swsusp.c */
159extern void swsusp_show_speed(struct timeval *, struct timeval *, 156extern void swsusp_show_speed(struct timeval *, struct timeval *,
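With the byte-offset fields removed from snapshot_handle, snapshot_read_next()/snapshot_write_next() hand out exactly one page per successful call and data_of() simply returns handle->buffer. A sketch of the resulting consumer loop (store_page() is a hypothetical stand-in for whatever the caller does with each page, e.g. swap_write_page() in save_image()):

static int drain_snapshot(struct snapshot_handle *handle)
{
	int ret;

	memset(handle, 0, sizeof(*handle));
	/* Each call returns PAGE_SIZE, 0 at end of image, or a negative error. */
	while ((ret = snapshot_read_next(handle)) > 0) {
		int error = store_page(data_of(*handle));	/* hypothetical */

		if (error)
			return error;
	}
	return ret;
}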
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index e8b337006276..d52359374e85 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy)
24 24
25static DECLARE_WORK(poweroff_work, do_poweroff); 25static DECLARE_WORK(poweroff_work, do_poweroff);
26 26
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 71ae29052ab6..028a99598f49 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,7 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h>
18 19
19/* 20/*
20 * Timeout for stopping processes 21 * Timeout for stopping processes
@@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only)
35 struct task_struct *g, *p; 36 struct task_struct *g, *p;
36 unsigned long end_time; 37 unsigned long end_time;
37 unsigned int todo; 38 unsigned int todo;
39 bool wq_busy = false;
38 struct timeval start, end; 40 struct timeval start, end;
39 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
40 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
@@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only)
42 do_gettimeofday(&start); 44 do_gettimeofday(&start);
43 45
44 end_time = jiffies + TIMEOUT; 46 end_time = jiffies + TIMEOUT;
47
48 if (!sig_only)
49 freeze_workqueues_begin();
50
45 while (true) { 51 while (true) {
46 todo = 0; 52 todo = 0;
47 read_lock(&tasklist_lock); 53 read_lock(&tasklist_lock);
@@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only)
63 todo++; 69 todo++;
64 } while_each_thread(g, p); 70 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 71 read_unlock(&tasklist_lock);
72
73 if (!sig_only) {
74 wq_busy = freeze_workqueues_busy();
75 todo += wq_busy;
76 }
77
66 if (!todo || time_after(jiffies, end_time)) 78 if (!todo || time_after(jiffies, end_time))
67 break; 79 break;
68 80
@@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only)
86 */ 98 */
87 printk("\n"); 99 printk("\n");
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 100 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 101 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 102 elapsed_csecs / 100, elapsed_csecs % 100,
103 todo - wq_busy, wq_busy);
104
105 thaw_workqueues();
106
91 read_lock(&tasklist_lock); 107 read_lock(&tasklist_lock);
92 do_each_thread(g, p) { 108 do_each_thread(g, p) {
93 task_lock(p); 109 task_lock(p);
@@ -157,6 +173,7 @@ void thaw_processes(void)
157 oom_killer_enable(); 173 oom_killer_enable();
158 174
159 printk("Restarting tasks ... "); 175 printk("Restarting tasks ... ");
176 thaw_workqueues();
160 thaw_tasks(true); 177 thaw_tasks(true);
161 thaw_tasks(false); 178 thaw_tasks(false);
162 schedule(); 179 schedule();
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index be861c26dda7..d3f795f01bbc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * This file provides system snapshot/restore functionality for swsusp. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -1121,9 +1121,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1121 return nr_alloc; 1121 return nr_alloc;
1122} 1122}
1123 1123
1124static unsigned long preallocate_image_memory(unsigned long nr_pages) 1124static unsigned long preallocate_image_memory(unsigned long nr_pages,
1125 unsigned long avail_normal)
1125{ 1126{
1126 return preallocate_image_pages(nr_pages, GFP_IMAGE); 1127 unsigned long alloc;
1128
1129 if (avail_normal <= alloc_normal)
1130 return 0;
1131
1132 alloc = avail_normal - alloc_normal;
1133 if (nr_pages < alloc)
1134 alloc = nr_pages;
1135
1136 return preallocate_image_pages(alloc, GFP_IMAGE);
1127} 1137}
1128 1138
1129#ifdef CONFIG_HIGHMEM 1139#ifdef CONFIG_HIGHMEM
@@ -1169,15 +1179,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1169 */ 1179 */
1170static void free_unnecessary_pages(void) 1180static void free_unnecessary_pages(void)
1171{ 1181{
1172 unsigned long save_highmem, to_free_normal, to_free_highmem; 1182 unsigned long save, to_free_normal, to_free_highmem;
1173 1183
1174 to_free_normal = alloc_normal - count_data_pages(); 1184 save = count_data_pages();
1175 save_highmem = count_highmem_pages(); 1185 if (alloc_normal >= save) {
1176 if (alloc_highmem > save_highmem) { 1186 to_free_normal = alloc_normal - save;
1177 to_free_highmem = alloc_highmem - save_highmem; 1187 save = 0;
1188 } else {
1189 to_free_normal = 0;
1190 save -= alloc_normal;
1191 }
1192 save += count_highmem_pages();
1193 if (alloc_highmem >= save) {
1194 to_free_highmem = alloc_highmem - save;
1178 } else { 1195 } else {
1179 to_free_highmem = 0; 1196 to_free_highmem = 0;
1180 to_free_normal -= save_highmem - alloc_highmem; 1197 to_free_normal -= save - alloc_highmem;
1181 } 1198 }
1182 1199
1183 memory_bm_position_reset(&copy_bm); 1200 memory_bm_position_reset(&copy_bm);
@@ -1258,7 +1275,7 @@ int hibernate_preallocate_memory(void)
1258{ 1275{
1259 struct zone *zone; 1276 struct zone *zone;
1260 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1277 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1261 unsigned long alloc, save_highmem, pages_highmem; 1278 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1262 struct timeval start, stop; 1279 struct timeval start, stop;
1263 int error; 1280 int error;
1264 1281
@@ -1295,6 +1312,7 @@ int hibernate_preallocate_memory(void)
1295 else 1312 else
1296 count += zone_page_state(zone, NR_FREE_PAGES); 1313 count += zone_page_state(zone, NR_FREE_PAGES);
1297 } 1314 }
1315 avail_normal = count;
1298 count += highmem; 1316 count += highmem;
1299 count -= totalreserve_pages; 1317 count -= totalreserve_pages;
1300 1318
@@ -1309,12 +1327,21 @@ int hibernate_preallocate_memory(void)
1309 */ 1327 */
1310 if (size >= saveable) { 1328 if (size >= saveable) {
1311 pages = preallocate_image_highmem(save_highmem); 1329 pages = preallocate_image_highmem(save_highmem);
1312 pages += preallocate_image_memory(saveable - pages); 1330 pages += preallocate_image_memory(saveable - pages, avail_normal);
1313 goto out; 1331 goto out;
1314 } 1332 }
1315 1333
1316 /* Estimate the minimum size of the image. */ 1334 /* Estimate the minimum size of the image. */
1317 pages = minimum_image_size(saveable); 1335 pages = minimum_image_size(saveable);
1336 /*
1337 * To avoid excessive pressure on the normal zone, leave room in it to
1338 * accommodate an image of the minimum size (unless it's already too
1339 * small, in which case don't preallocate pages from it at all).
1340 */
1341 if (avail_normal > pages)
1342 avail_normal -= pages;
1343 else
1344 avail_normal = 0;
1318 if (size < pages) 1345 if (size < pages)
1319 size = min_t(unsigned long, pages, max_size); 1346 size = min_t(unsigned long, pages, max_size);
1320 1347
@@ -1335,16 +1362,34 @@ int hibernate_preallocate_memory(void)
1335 */ 1362 */
1336 pages_highmem = preallocate_image_highmem(highmem / 2); 1363 pages_highmem = preallocate_image_highmem(highmem / 2);
1337 alloc = (count - max_size) - pages_highmem; 1364 alloc = (count - max_size) - pages_highmem;
1338 pages = preallocate_image_memory(alloc); 1365 pages = preallocate_image_memory(alloc, avail_normal);
1339 if (pages < alloc) 1366 if (pages < alloc) {
1340 goto err_out; 1367 /* We have exhausted non-highmem pages, try highmem. */
1341 size = max_size - size; 1368 alloc -= pages;
1342 alloc = size; 1369 pages += pages_highmem;
1343 size = preallocate_highmem_fraction(size, highmem, count); 1370 pages_highmem = preallocate_image_highmem(alloc);
1344 pages_highmem += size; 1371 if (pages_highmem < alloc)
1345 alloc -= size; 1372 goto err_out;
1346 pages += preallocate_image_memory(alloc); 1373 pages += pages_highmem;
1347 pages += pages_highmem; 1374 /*
1375 * size is the desired number of saveable pages to leave in
1376 * memory, so try to preallocate (all memory - size) pages.
1377 */
1378 alloc = (count - pages) - size;
1379 pages += preallocate_image_highmem(alloc);
1380 } else {
1381 /*
1382 * There are approximately max_size saveable pages at this point
1383 * and we want to reduce this number down to size.
1384 */
1385 alloc = max_size - size;
1386 size = preallocate_highmem_fraction(alloc, highmem, count);
1387 pages_highmem += size;
1388 alloc -= size;
1389 size = preallocate_image_memory(alloc, avail_normal);
1390 pages_highmem += preallocate_image_highmem(alloc - size);
1391 pages += pages_highmem + size;
1392 }
1348 1393
1349 /* 1394 /*
1350 * We only need as many page frames for the image as there are saveable 1395 * We only need as many page frames for the image as there are saveable
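Worked example for the preallocation hunks above (numbers are made up purely for illustration, highmem-free machine): with count = avail_normal = 100000 usable pages, saveable = 80000, a target image size of size = 50000, max_size = 60000 and minimum_image_size() = 30000, avail_normal is first trimmed to 100000 - 30000 = 70000 so the normal zone keeps room for a minimal image. The first pass then asks for count - max_size = 40000 pages and gets all of them, so the else branch runs and asks for max_size - size = 10000 more; 50000 pages end up preallocated, leaving about size = 50000 pages for the saveable data, which is the balance the function is aiming for.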
@@ -1604,14 +1649,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1604 * snapshot_handle structure. The structure gets updated and a pointer 1649 * snapshot_handle structure. The structure gets updated and a pointer
1605 * to it should be passed to this function every next time. 1650 * to it should be passed to this function every next time.
1606 * 1651 *
1607 * The @count parameter should contain the number of bytes the caller
1608 * wants to read from the snapshot. It must not be zero.
1609 *
1610 * On success the function returns a positive number. Then, the caller 1652 * On success the function returns a positive number. Then, the caller
1611 * is allowed to read up to the returned number of bytes from the memory 1653 * is allowed to read up to the returned number of bytes from the memory
1612 * location computed by the data_of() macro. The number returned 1654 * location computed by the data_of() macro.
1613 * may be smaller than @count, but this only happens if the read would
1614 * cross a page boundary otherwise.
1615 * 1655 *
1616 * The function returns 0 to indicate the end of data stream condition, 1656 * The function returns 0 to indicate the end of data stream condition,
1617 * and a negative number is returned on error. In such cases the 1657 * and a negative number is returned on error. In such cases the
@@ -1619,7 +1659,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1619 * any more. 1659 * any more.
1620 */ 1660 */
1621 1661
1622int snapshot_read_next(struct snapshot_handle *handle, size_t count) 1662int snapshot_read_next(struct snapshot_handle *handle)
1623{ 1663{
1624 if (handle->cur > nr_meta_pages + nr_copy_pages) 1664 if (handle->cur > nr_meta_pages + nr_copy_pages)
1625 return 0; 1665 return 0;
@@ -1630,7 +1670,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1630 if (!buffer) 1670 if (!buffer)
1631 return -ENOMEM; 1671 return -ENOMEM;
1632 } 1672 }
1633 if (!handle->offset) { 1673 if (!handle->cur) {
1634 int error; 1674 int error;
1635 1675
1636 error = init_header((struct swsusp_info *)buffer); 1676 error = init_header((struct swsusp_info *)buffer);
@@ -1639,42 +1679,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1639 handle->buffer = buffer; 1679 handle->buffer = buffer;
1640 memory_bm_position_reset(&orig_bm); 1680 memory_bm_position_reset(&orig_bm);
1641 memory_bm_position_reset(&copy_bm); 1681 memory_bm_position_reset(&copy_bm);
1642 } 1682 } else if (handle->cur <= nr_meta_pages) {
1643 if (handle->prev < handle->cur) { 1683 memset(buffer, 0, PAGE_SIZE);
1644 if (handle->cur <= nr_meta_pages) { 1684 pack_pfns(buffer, &orig_bm);
1645 memset(buffer, 0, PAGE_SIZE); 1685 } else {
1646 pack_pfns(buffer, &orig_bm); 1686 struct page *page;
1647 } else {
1648 struct page *page;
1649 1687
1650 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 1688 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1651 if (PageHighMem(page)) { 1689 if (PageHighMem(page)) {
1652 /* Highmem pages are copied to the buffer, 1690 /* Highmem pages are copied to the buffer,
1653 * because we can't return with a kmapped 1691 * because we can't return with a kmapped
1654 * highmem page (we may not be called again). 1692 * highmem page (we may not be called again).
1655 */ 1693 */
1656 void *kaddr; 1694 void *kaddr;
1657 1695
1658 kaddr = kmap_atomic(page, KM_USER0); 1696 kaddr = kmap_atomic(page, KM_USER0);
1659 memcpy(buffer, kaddr, PAGE_SIZE); 1697 memcpy(buffer, kaddr, PAGE_SIZE);
1660 kunmap_atomic(kaddr, KM_USER0); 1698 kunmap_atomic(kaddr, KM_USER0);
1661 handle->buffer = buffer; 1699 handle->buffer = buffer;
1662 } else { 1700 } else {
1663 handle->buffer = page_address(page); 1701 handle->buffer = page_address(page);
1664 }
1665 } 1702 }
1666 handle->prev = handle->cur;
1667 } 1703 }
1668 handle->buf_offset = handle->cur_offset; 1704 handle->cur++;
1669 if (handle->cur_offset + count >= PAGE_SIZE) { 1705 return PAGE_SIZE;
1670 count = PAGE_SIZE - handle->cur_offset;
1671 handle->cur_offset = 0;
1672 handle->cur++;
1673 } else {
1674 handle->cur_offset += count;
1675 }
1676 handle->offset += count;
1677 return count;
1678} 1706}
1679 1707
1680/** 1708/**
@@ -2133,14 +2161,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2133 * snapshot_handle structure. The structure gets updated and a pointer 2161 * snapshot_handle structure. The structure gets updated and a pointer
2134 * to it should be passed to this function every next time. 2162 * to it should be passed to this function every next time.
2135 * 2163 *
2136 * The @count parameter should contain the number of bytes the caller
2137 * wants to write to the image. It must not be zero.
2138 *
2139 * On success the function returns a positive number. Then, the caller 2164 * On success the function returns a positive number. Then, the caller
2140 * is allowed to write up to the returned number of bytes to the memory 2165 * is allowed to write up to the returned number of bytes to the memory
2141 * location computed by the data_of() macro. The number returned 2166 * location computed by the data_of() macro.
2142 * may be smaller than @count, but this only happens if the write would
2143 * cross a page boundary otherwise.
2144 * 2167 *
2145 * The function returns 0 to indicate the "end of file" condition, 2168 * The function returns 0 to indicate the "end of file" condition,
2146 * and a negative number is returned on error. In such cases the 2169 * and a negative number is returned on error. In such cases the
@@ -2148,16 +2171,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2148 * any more. 2171 * any more.
2149 */ 2172 */
2150 2173
2151int snapshot_write_next(struct snapshot_handle *handle, size_t count) 2174int snapshot_write_next(struct snapshot_handle *handle)
2152{ 2175{
2153 static struct chain_allocator ca; 2176 static struct chain_allocator ca;
2154 int error = 0; 2177 int error = 0;
2155 2178
2156 /* Check if we have already loaded the entire image */ 2179 /* Check if we have already loaded the entire image */
2157 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 2180 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2158 return 0; 2181 return 0;
2159 2182
2160 if (handle->offset == 0) { 2183 handle->sync_read = 1;
2184
2185 if (!handle->cur) {
2161 if (!buffer) 2186 if (!buffer)
2162 /* This makes the buffer be freed by swsusp_free() */ 2187 /* This makes the buffer be freed by swsusp_free() */
2163 buffer = get_image_page(GFP_ATOMIC, PG_ANY); 2188 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
@@ -2166,56 +2191,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
2166 return -ENOMEM; 2191 return -ENOMEM;
2167 2192
2168 handle->buffer = buffer; 2193 handle->buffer = buffer;
2169 } 2194 } else if (handle->cur == 1) {
2170 handle->sync_read = 1; 2195 error = load_header(buffer);
2171 if (handle->prev < handle->cur) { 2196 if (error)
2172 if (handle->prev == 0) { 2197 return error;
2173 error = load_header(buffer);
2174 if (error)
2175 return error;
2176 2198
2177 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2199 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2178 if (error) 2200 if (error)
2179 return error; 2201 return error;
2180 2202
2181 } else if (handle->prev <= nr_meta_pages) { 2203 } else if (handle->cur <= nr_meta_pages + 1) {
2182 error = unpack_orig_pfns(buffer, &copy_bm); 2204 error = unpack_orig_pfns(buffer, &copy_bm);
2205 if (error)
2206 return error;
2207
2208 if (handle->cur == nr_meta_pages + 1) {
2209 error = prepare_image(&orig_bm, &copy_bm);
2183 if (error) 2210 if (error)
2184 return error; 2211 return error;
2185 2212
2186 if (handle->prev == nr_meta_pages) { 2213 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2187 error = prepare_image(&orig_bm, &copy_bm); 2214 memory_bm_position_reset(&orig_bm);
2188 if (error) 2215 restore_pblist = NULL;
2189 return error;
2190
2191 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2192 memory_bm_position_reset(&orig_bm);
2193 restore_pblist = NULL;
2194 handle->buffer = get_buffer(&orig_bm, &ca);
2195 handle->sync_read = 0;
2196 if (IS_ERR(handle->buffer))
2197 return PTR_ERR(handle->buffer);
2198 }
2199 } else {
2200 copy_last_highmem_page();
2201 handle->buffer = get_buffer(&orig_bm, &ca); 2216 handle->buffer = get_buffer(&orig_bm, &ca);
2217 handle->sync_read = 0;
2202 if (IS_ERR(handle->buffer)) 2218 if (IS_ERR(handle->buffer))
2203 return PTR_ERR(handle->buffer); 2219 return PTR_ERR(handle->buffer);
2204 if (handle->buffer != buffer)
2205 handle->sync_read = 0;
2206 } 2220 }
2207 handle->prev = handle->cur;
2208 }
2209 handle->buf_offset = handle->cur_offset;
2210 if (handle->cur_offset + count >= PAGE_SIZE) {
2211 count = PAGE_SIZE - handle->cur_offset;
2212 handle->cur_offset = 0;
2213 handle->cur++;
2214 } else { 2221 } else {
2215 handle->cur_offset += count; 2222 copy_last_highmem_page();
2223 handle->buffer = get_buffer(&orig_bm, &ca);
2224 if (IS_ERR(handle->buffer))
2225 return PTR_ERR(handle->buffer);
2226 if (handle->buffer != buffer)
2227 handle->sync_read = 0;
2216 } 2228 }
2217 handle->offset += count; 2229 handle->cur++;
2218 return count; 2230 return PAGE_SIZE;
2219} 2231}
2220 2232
2221/** 2233/**
@@ -2230,7 +2242,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
2230{ 2242{
2231 copy_last_highmem_page(); 2243 copy_last_highmem_page();
2232 /* Free only if we have loaded the image entirely */ 2244 /* Free only if we have loaded the image entirely */
2233 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { 2245 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2234 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2246 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2235 free_highmem_data(); 2247 free_highmem_data();
2236 } 2248 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 56e7dbb8b996..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -16,6 +16,12 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/gfp.h> 18#include <linux/gfp.h>
19#include <linux/io.h>
20#include <linux/kernel.h>
21#include <linux/list.h>
22#include <linux/mm.h>
23#include <linux/slab.h>
24#include <linux/suspend.h>
19 25
20#include "power.h" 26#include "power.h"
21 27
@@ -130,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
130 if (suspend_ops->prepare) { 136 if (suspend_ops->prepare) {
131 error = suspend_ops->prepare(); 137 error = suspend_ops->prepare();
132 if (error) 138 if (error)
133 return error; 139 goto Platform_finish;
134 } 140 }
135 141
136 error = dpm_suspend_noirq(PMSG_SUSPEND); 142 error = dpm_suspend_noirq(PMSG_SUSPEND);
137 if (error) { 143 if (error) {
138 printk(KERN_ERR "PM: Some devices failed to power down\n"); 144 printk(KERN_ERR "PM: Some devices failed to power down\n");
139 goto Platfrom_finish; 145 goto Platform_finish;
140 } 146 }
141 147
142 if (suspend_ops->prepare_late) { 148 if (suspend_ops->prepare_late) {
143 error = suspend_ops->prepare_late(); 149 error = suspend_ops->prepare_late();
144 if (error) 150 if (error)
145 goto Power_up_devices; 151 goto Platform_wake;
146 } 152 }
147 153
148 if (suspend_test(TEST_PLATFORM)) 154 if (suspend_test(TEST_PLATFORM))
@@ -157,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
157 163
158 error = sysdev_suspend(PMSG_SUSPEND); 164 error = sysdev_suspend(PMSG_SUSPEND);
159 if (!error) { 165 if (!error) {
160 if (!suspend_test(TEST_CORE)) 166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
161 error = suspend_ops->enter(state); 167 error = suspend_ops->enter(state);
168 events_check_enabled = false;
169 }
162 sysdev_resume(); 170 sysdev_resume();
163 } 171 }
164 172
@@ -172,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
172 if (suspend_ops->wake) 180 if (suspend_ops->wake)
173 suspend_ops->wake(); 181 suspend_ops->wake();
174 182
175 Power_up_devices:
176 dpm_resume_noirq(PMSG_RESUME); 183 dpm_resume_noirq(PMSG_RESUME);
177 184
178 Platfrom_finish: 185 Platform_finish:
179 if (suspend_ops->finish) 186 if (suspend_ops->finish)
180 suspend_ops->finish(); 187 suspend_ops->finish();
181 188
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 66824d71983a..e6a5bdf61a37 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
4 * This file provides functions for reading the suspend image from 4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition. 5 * and writing it to a swap partition.
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * 9 *
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
@@ -29,6 +29,40 @@
29 29
30#define SWSUSP_SIG "S1SUSPEND" 30#define SWSUSP_SIG "S1SUSPEND"
31 31
32/*
33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page
35 * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member.
38 *
39 * The swap map is created during suspend. The swap map pages are
40 * allocated and populated one at a time, so we only need one memory
41 * page to set up the entire structure.
42 *
43 * During resume we also only need to use one swap_map_page structure
44 * at a time.
45 */
46
47#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
48
49struct swap_map_page {
50 sector_t entries[MAP_PAGE_ENTRIES];
51 sector_t next_swap;
52};
53
54/**
55 * The swap_map_handle structure is used for handling swap in
56 * a file-alike way
57 */
58
59struct swap_map_handle {
60 struct swap_map_page *cur;
61 sector_t cur_swap;
62 sector_t first_sector;
63 unsigned int k;
64};
65
32struct swsusp_header { 66struct swsusp_header {
33 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 67 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
34 sector_t image; 68 sector_t image;
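For scale: with 4 KiB pages and an 8-byte sector_t (the common case), MAP_PAGE_ENTRIES is 4096/8 - 1 = 511, so each swap_map_page records the swap locations of 511 image pages plus the location of the next map page. A minimal sketch of how a reader follows the chain (synchronous I/O, no error handling; walk_swap_map() is hypothetical, buf is a page-sized destination and handle->cur already points at a free page):

static void walk_swap_map(struct swap_map_handle *handle, void *buf)
{
	sector_t next = swsusp_header->image;	/* first map page on swap */
	int i;

	while (next) {
		/* Load one map page, then the data pages it points at. */
		hib_bio_read_page(next, handle->cur, NULL);
		for (i = 0; i < MAP_PAGE_ENTRIES && handle->cur->entries[i]; i++)
			hib_bio_read_page(handle->cur->entries[i], buf, NULL);
		next = handle->cur->next_swap;
	}
}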
@@ -114,7 +148,7 @@ sector_t alloc_swapdev_block(int swap)
114 148
115/** 149/**
116 * free_all_swap_pages - free swap pages allocated for saving image data. 150 * free_all_swap_pages - free swap pages allocated for saving image data.
117 * It also frees the extents used to register which swap entres had been 151 * It also frees the extents used to register which swap entries had been
118 * allocated. 152 * allocated.
119 */ 153 */
120 154
@@ -145,110 +179,24 @@ int swsusp_swap_in_use(void)
145 */ 179 */
146 180
147static unsigned short root_swap = 0xffff; 181static unsigned short root_swap = 0xffff;
148static struct block_device *resume_bdev; 182struct block_device *hib_resume_bdev;
149
150/**
151 * submit - submit BIO request.
152 * @rw: READ or WRITE.
153 * @off physical offset of page.
154 * @page: page we're reading or writing.
155 * @bio_chain: list of pending biod (for async reading)
156 *
157 * Straight from the textbook - allocate and initialize the bio.
158 * If we're reading, make sure the page is marked as dirty.
159 * Then submit it and, if @bio_chain == NULL, wait.
160 */
161static int submit(int rw, pgoff_t page_off, struct page *page,
162 struct bio **bio_chain)
163{
164 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
165 struct bio *bio;
166
167 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
168 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
169 bio->bi_bdev = resume_bdev;
170 bio->bi_end_io = end_swap_bio_read;
171
172 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
173 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
174 page_off);
175 bio_put(bio);
176 return -EFAULT;
177 }
178
179 lock_page(page);
180 bio_get(bio);
181
182 if (bio_chain == NULL) {
183 submit_bio(bio_rw, bio);
184 wait_on_page_locked(page);
185 if (rw == READ)
186 bio_set_pages_dirty(bio);
187 bio_put(bio);
188 } else {
189 if (rw == READ)
190 get_page(page); /* These pages are freed later */
191 bio->bi_private = *bio_chain;
192 *bio_chain = bio;
193 submit_bio(bio_rw, bio);
194 }
195 return 0;
196}
197
198static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
199{
200 return submit(READ, page_off, virt_to_page(addr), bio_chain);
201}
202
203static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
204{
205 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
206}
207
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235 183
236/* 184/*
237 * Saving part 185 * Saving part
238 */ 186 */
239 187
240static int mark_swapfiles(sector_t start, unsigned int flags) 188static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
241{ 189{
242 int error; 190 int error;
243 191
244 bio_read_page(swsusp_resume_block, swsusp_header, NULL); 192 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
245 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
246 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
247 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
248 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
249 swsusp_header->image = start; 197 swsusp_header->image = handle->first_sector;
250 swsusp_header->flags = flags; 198 swsusp_header->flags = flags;
251 error = bio_write_page(swsusp_resume_block, 199 error = hib_bio_write_page(swsusp_resume_block,
252 swsusp_header, NULL); 200 swsusp_header, NULL);
253 } else { 201 } else {
254 printk(KERN_ERR "PM: Swap header not found!\n"); 202 printk(KERN_ERR "PM: Swap header not found!\n");
@@ -260,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
260/** 208/**
261 * swsusp_swap_check - check if the resume device is a swap device 209 * swsusp_swap_check - check if the resume device is a swap device
262 * and get its index (if so) 210 * and get its index (if so)
211 *
212 * This is called before saving image
263 */ 213 */
264 214static int swsusp_swap_check(void)
265static int swsusp_swap_check(void) /* This is called before saving image */
266{ 215{
267 int res; 216 int res;
268 217
269 res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 218 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
270 &resume_bdev); 219 &hib_resume_bdev);
271 if (res < 0) 220 if (res < 0)
272 return res; 221 return res;
273 222
274 root_swap = res; 223 root_swap = res;
275 res = blkdev_get(resume_bdev, FMODE_WRITE); 224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
276 if (res) 225 if (res)
277 return res; 226 return res;
278 227
279 res = set_blocksize(resume_bdev, PAGE_SIZE); 228 res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
280 if (res < 0) 229 if (res < 0)
281 blkdev_put(resume_bdev, FMODE_WRITE); 230 blkdev_put(hib_resume_bdev, FMODE_WRITE);
282 231
283 return res; 232 return res;
284} 233}
@@ -309,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
309 } else { 258 } else {
310 src = buf; 259 src = buf;
311 } 260 }
312 return bio_write_page(offset, src, bio_chain); 261 return hib_bio_write_page(offset, src, bio_chain);
313} 262}
314 263
315/*
316 * The swap map is a data structure used for keeping track of each page
317 * written to a swap partition. It consists of many swap_map_page
318 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
319 * These structures are stored on the swap and linked together with the
320 * help of the .next_swap member.
321 *
322 * The swap map is created during suspend. The swap map pages are
323 * allocated and populated one at a time, so we only need one memory
324 * page to set up the entire structure.
325 *
326 * During resume we also only need to use one swap_map_page structure
327 * at a time.
328 */
329
330#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
331
332struct swap_map_page {
333 sector_t entries[MAP_PAGE_ENTRIES];
334 sector_t next_swap;
335};
336
337/**
338 * The swap_map_handle structure is used for handling swap in
339 * a file-alike way
340 */
341
342struct swap_map_handle {
343 struct swap_map_page *cur;
344 sector_t cur_swap;
345 unsigned int k;
346};
347
348static void release_swap_writer(struct swap_map_handle *handle) 264static void release_swap_writer(struct swap_map_handle *handle)
349{ 265{
350 if (handle->cur) 266 if (handle->cur)
@@ -354,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle)
354 270
355static int get_swap_writer(struct swap_map_handle *handle) 271static int get_swap_writer(struct swap_map_handle *handle)
356{ 272{
273 int ret;
274
275 ret = swsusp_swap_check();
276 if (ret) {
277 if (ret != -ENOSPC)
278 printk(KERN_ERR "PM: Cannot find swap device, try "
279 "swapon -a.\n");
280 return ret;
281 }
357 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 282 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
358 if (!handle->cur) 283 if (!handle->cur) {
359 return -ENOMEM; 284 ret = -ENOMEM;
285 goto err_close;
286 }
360 handle->cur_swap = alloc_swapdev_block(root_swap); 287 handle->cur_swap = alloc_swapdev_block(root_swap);
361 if (!handle->cur_swap) { 288 if (!handle->cur_swap) {
362 release_swap_writer(handle); 289 ret = -ENOSPC;
363 return -ENOSPC; 290 goto err_rel;
364 } 291 }
365 handle->k = 0; 292 handle->k = 0;
293 handle->first_sector = handle->cur_swap;
366 return 0; 294 return 0;
295err_rel:
296 release_swap_writer(handle);
297err_close:
298 swsusp_close(FMODE_WRITE);
299 return ret;
367} 300}
368 301
369static int swap_write_page(struct swap_map_handle *handle, void *buf, 302static int swap_write_page(struct swap_map_handle *handle, void *buf,
@@ -380,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
380 return error; 313 return error;
381 handle->cur->entries[handle->k++] = offset; 314 handle->cur->entries[handle->k++] = offset;
382 if (handle->k >= MAP_PAGE_ENTRIES) { 315 if (handle->k >= MAP_PAGE_ENTRIES) {
383 error = wait_on_bio_chain(bio_chain); 316 error = hib_wait_on_bio_chain(bio_chain);
384 if (error) 317 if (error)
385 goto out; 318 goto out;
386 offset = alloc_swapdev_block(root_swap); 319 offset = alloc_swapdev_block(root_swap);
@@ -406,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle)
406 return -EINVAL; 339 return -EINVAL;
407} 340}
408 341
342static int swap_writer_finish(struct swap_map_handle *handle,
343 unsigned int flags, int error)
344{
345 if (!error) {
346 flush_swap_writer(handle);
347 printk(KERN_INFO "PM: S");
348 error = mark_swapfiles(handle, flags);
349 printk("|\n");
350 }
351
352 if (error)
353 free_all_swap_pages(root_swap);
354 release_swap_writer(handle);
355 swsusp_close(FMODE_WRITE);
356
357 return error;
358}
359
409/** 360/**
410 * save_image - save the suspend image data 361 * save_image - save the suspend image data
411 */ 362 */
@@ -431,7 +382,7 @@ static int save_image(struct swap_map_handle *handle,
431 bio = NULL; 382 bio = NULL;
432 do_gettimeofday(&start); 383 do_gettimeofday(&start);
433 while (1) { 384 while (1) {
434 ret = snapshot_read_next(snapshot, PAGE_SIZE); 385 ret = snapshot_read_next(snapshot);
435 if (ret <= 0) 386 if (ret <= 0)
436 break; 387 break;
437 ret = swap_write_page(handle, data_of(*snapshot), &bio); 388 ret = swap_write_page(handle, data_of(*snapshot), &bio);
@@ -441,7 +392,7 @@ static int save_image(struct swap_map_handle *handle,
441 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 392 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
442 nr_pages++; 393 nr_pages++;
443 } 394 }
444 err2 = wait_on_bio_chain(&bio); 395 err2 = hib_wait_on_bio_chain(&bio);
445 do_gettimeofday(&stop); 396 do_gettimeofday(&stop);
446 if (!ret) 397 if (!ret)
447 ret = err2; 398 ret = err2;
@@ -483,50 +434,34 @@ int swsusp_write(unsigned int flags)
483 struct swap_map_handle handle; 434 struct swap_map_handle handle;
484 struct snapshot_handle snapshot; 435 struct snapshot_handle snapshot;
485 struct swsusp_info *header; 436 struct swsusp_info *header;
437 unsigned long pages;
486 int error; 438 int error;
487 439
488 error = swsusp_swap_check(); 440 pages = snapshot_get_image_size();
441 error = get_swap_writer(&handle);
489 if (error) { 442 if (error) {
490 printk(KERN_ERR "PM: Cannot find swap device, try " 443 printk(KERN_ERR "PM: Cannot get swap writer\n");
491 "swapon -a.\n");
492 return error; 444 return error;
493 } 445 }
446 if (!enough_swap(pages)) {
447 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC;
449 goto out_finish;
450 }
494 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 451 memset(&snapshot, 0, sizeof(struct snapshot_handle));
495 error = snapshot_read_next(&snapshot, PAGE_SIZE); 452 error = snapshot_read_next(&snapshot);
496 if (error < PAGE_SIZE) { 453 if (error < PAGE_SIZE) {
497 if (error >= 0) 454 if (error >= 0)
498 error = -EFAULT; 455 error = -EFAULT;
499 456
500 goto out; 457 goto out_finish;
501 } 458 }
502 header = (struct swsusp_info *)data_of(snapshot); 459 header = (struct swsusp_info *)data_of(snapshot);
503 if (!enough_swap(header->pages)) { 460 error = swap_write_page(&handle, header, NULL);
504 printk(KERN_ERR "PM: Not enough free swap\n"); 461 if (!error)
505 error = -ENOSPC; 462 error = save_image(&handle, &snapshot, pages - 1);
506 goto out; 463out_finish:
507 } 464 error = swap_writer_finish(&handle, flags, error);
508 error = get_swap_writer(&handle);
509 if (!error) {
510 sector_t start = handle.cur_swap;
511
512 error = swap_write_page(&handle, header, NULL);
513 if (!error)
514 error = save_image(&handle, &snapshot,
515 header->pages - 1);
516
517 if (!error) {
518 flush_swap_writer(&handle);
519 printk(KERN_INFO "PM: S");
520 error = mark_swapfiles(start, flags);
521 printk("|\n");
522 }
523 }
524 if (error)
525 free_all_swap_pages(root_swap);
526
527 release_swap_writer(&handle);
528 out:
529 swsusp_close(FMODE_WRITE);
530 return error; 465 return error;
531} 466}
532 467
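The rewrite above concentrates the writer lifecycle in three calls: swap_writer_finish() now owns the cleanup that used to be spread across swsusp_write(), freeing the allocated swap pages when an error is passed in and closing the device in every case. A condensed sketch of the new flow (header_page is a hypothetical stand-in for the swsusp_info page, flags as in swsusp_write()):

static int write_image_sketch(void *header_page, unsigned int flags)
{
	struct swap_map_handle handle;
	int error;

	error = get_swap_writer(&handle);	/* checks and opens the swap device */
	if (error)
		return error;

	error = swap_write_page(&handle, header_page, NULL);
	/* ... then save_image() streams the remaining snapshot pages ... */

	/* Flushes and marks the swap on success, frees swap pages on error. */
	return swap_writer_finish(&handle, flags, error);
}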
@@ -542,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle)
542 handle->cur = NULL; 477 handle->cur = NULL;
543} 478}
544 479
545static int get_swap_reader(struct swap_map_handle *handle, sector_t start) 480static int get_swap_reader(struct swap_map_handle *handle,
481 unsigned int *flags_p)
546{ 482{
547 int error; 483 int error;
548 484
549 if (!start) 485 *flags_p = swsusp_header->flags;
486
487 if (!swsusp_header->image) /* how can this happen? */
550 return -EINVAL; 488 return -EINVAL;
551 489
552 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 490 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
553 if (!handle->cur) 491 if (!handle->cur)
554 return -ENOMEM; 492 return -ENOMEM;
555 493
556 error = bio_read_page(start, handle->cur, NULL); 494 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
557 if (error) { 495 if (error) {
558 release_swap_reader(handle); 496 release_swap_reader(handle);
559 return error; 497 return error;
@@ -573,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
573 offset = handle->cur->entries[handle->k]; 511 offset = handle->cur->entries[handle->k];
574 if (!offset) 512 if (!offset)
575 return -EFAULT; 513 return -EFAULT;
576 error = bio_read_page(offset, buf, bio_chain); 514 error = hib_bio_read_page(offset, buf, bio_chain);
577 if (error) 515 if (error)
578 return error; 516 return error;
579 if (++handle->k >= MAP_PAGE_ENTRIES) { 517 if (++handle->k >= MAP_PAGE_ENTRIES) {
580 error = wait_on_bio_chain(bio_chain); 518 error = hib_wait_on_bio_chain(bio_chain);
581 handle->k = 0; 519 handle->k = 0;
582 offset = handle->cur->next_swap; 520 offset = handle->cur->next_swap;
583 if (!offset) 521 if (!offset)
584 release_swap_reader(handle); 522 release_swap_reader(handle);
585 else if (!error) 523 else if (!error)
586 error = bio_read_page(offset, handle->cur, NULL); 524 error = hib_bio_read_page(offset, handle->cur, NULL);
587 } 525 }
588 return error; 526 return error;
589} 527}
590 528
529static int swap_reader_finish(struct swap_map_handle *handle)
530{
531 release_swap_reader(handle);
532
533 return 0;
534}
535
591/** 536/**
592 * load_image - load the image using the swap map handle 537 * load_image - load the image using the swap map handle
593 * @handle and the snapshot handle @snapshot 538 * @handle and the snapshot handle @snapshot
@@ -615,21 +560,21 @@ static int load_image(struct swap_map_handle *handle,
615 bio = NULL; 560 bio = NULL;
616 do_gettimeofday(&start); 561 do_gettimeofday(&start);
617 for ( ; ; ) { 562 for ( ; ; ) {
618 error = snapshot_write_next(snapshot, PAGE_SIZE); 563 error = snapshot_write_next(snapshot);
619 if (error <= 0) 564 if (error <= 0)
620 break; 565 break;
621 error = swap_read_page(handle, data_of(*snapshot), &bio); 566 error = swap_read_page(handle, data_of(*snapshot), &bio);
622 if (error) 567 if (error)
623 break; 568 break;
624 if (snapshot->sync_read) 569 if (snapshot->sync_read)
625 error = wait_on_bio_chain(&bio); 570 error = hib_wait_on_bio_chain(&bio);
626 if (error) 571 if (error)
627 break; 572 break;
628 if (!(nr_pages % m)) 573 if (!(nr_pages % m))
629 printk("\b\b\b\b%3d%%", nr_pages / m); 574 printk("\b\b\b\b%3d%%", nr_pages / m);
630 nr_pages++; 575 nr_pages++;
631 } 576 }
632 err2 = wait_on_bio_chain(&bio); 577 err2 = hib_wait_on_bio_chain(&bio);
633 do_gettimeofday(&stop); 578 do_gettimeofday(&stop);
634 if (!error) 579 if (!error)
635 error = err2; 580 error = err2;
@@ -657,20 +602,20 @@ int swsusp_read(unsigned int *flags_p)
657 struct snapshot_handle snapshot; 602 struct snapshot_handle snapshot;
658 struct swsusp_info *header; 603 struct swsusp_info *header;
659 604
660 *flags_p = swsusp_header->flags;
661
662 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 605 memset(&snapshot, 0, sizeof(struct snapshot_handle));
663 error = snapshot_write_next(&snapshot, PAGE_SIZE); 606 error = snapshot_write_next(&snapshot);
664 if (error < PAGE_SIZE) 607 if (error < PAGE_SIZE)
665 return error < 0 ? error : -EFAULT; 608 return error < 0 ? error : -EFAULT;
666 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
667 error = get_swap_reader(&handle, swsusp_header->image); 610 error = get_swap_reader(&handle, flags_p);
611 if (error)
612 goto end;
668 if (!error) 613 if (!error)
669 error = swap_read_page(&handle, header, NULL); 614 error = swap_read_page(&handle, header, NULL);
670 if (!error) 615 if (!error)
671 error = load_image(&handle, &snapshot, header->pages - 1); 616 error = load_image(&handle, &snapshot, header->pages - 1);
672 release_swap_reader(&handle); 617 swap_reader_finish(&handle);
673 618end:
674 if (!error) 619 if (!error)
675 pr_debug("PM: Image successfully loaded\n"); 620 pr_debug("PM: Image successfully loaded\n");
676 else 621 else
@@ -686,11 +631,11 @@ int swsusp_check(void)
686{ 631{
687 int error; 632 int error;
688 633
689 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
690 if (!IS_ERR(resume_bdev)) { 635 if (!IS_ERR(hib_resume_bdev)) {
691 set_blocksize(resume_bdev, PAGE_SIZE); 636 set_blocksize(hib_resume_bdev, PAGE_SIZE);
692 memset(swsusp_header, 0, PAGE_SIZE); 637 memset(swsusp_header, 0, PAGE_SIZE);
693 error = bio_read_page(swsusp_resume_block, 638 error = hib_bio_read_page(swsusp_resume_block,
694 swsusp_header, NULL); 639 swsusp_header, NULL);
695 if (error) 640 if (error)
696 goto put; 641 goto put;
@@ -698,7 +643,7 @@ int swsusp_check(void)
698 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
699 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
700 /* Reset swap signature now */ 645 /* Reset swap signature now */
701 error = bio_write_page(swsusp_resume_block, 646 error = hib_bio_write_page(swsusp_resume_block,
702 swsusp_header, NULL); 647 swsusp_header, NULL);
703 } else { 648 } else {
704 error = -EINVAL; 649 error = -EINVAL;
@@ -706,11 +651,11 @@ int swsusp_check(void)
706 651
707put: 652put:
708 if (error) 653 if (error)
709 blkdev_put(resume_bdev, FMODE_READ); 654 blkdev_put(hib_resume_bdev, FMODE_READ);
710 else 655 else
711 pr_debug("PM: Signature found, resuming\n"); 656 pr_debug("PM: Signature found, resuming\n");
712 } else { 657 } else {
713 error = PTR_ERR(resume_bdev); 658 error = PTR_ERR(hib_resume_bdev);
714 } 659 }
715 660
716 if (error) 661 if (error)
@@ -725,12 +670,12 @@ put:
725 670
726void swsusp_close(fmode_t mode) 671void swsusp_close(fmode_t mode)
727{ 672{
728 if (IS_ERR(resume_bdev)) { 673 if (IS_ERR(hib_resume_bdev)) {
729 pr_debug("PM: Image device not initialised\n"); 674 pr_debug("PM: Image device not initialised\n");
730 return; 675 return;
731 } 676 }
732 677
733 blkdev_put(resume_bdev, mode); 678 blkdev_put(hib_resume_bdev, mode);
734} 679}
735 680
736static int swsusp_header_init(void) 681static int swsusp_header_init(void)
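
The reader refactored above walks an on-disk chain of swap map pages: each map page holds MAP_PAGE_ENTRIES data sectors plus a next_swap link to the following map page, and handle->k indexes the current entry. The walk in swap_read_page() can be modelled as a small standalone C program; the MAP_PAGE_ENTRIES value and the array-backed "disk" below are stand-ins for the real PAGE_SIZE-derived constant and for hib_bio_read_page().

#include <stdio.h>
#include <string.h>

#define MAP_PAGE_ENTRIES 4   /* illustrative; the kernel derives this from PAGE_SIZE */
#define DISK_PAGES       8

struct swap_map_page {
    unsigned long entries[MAP_PAGE_ENTRIES];  /* data "sectors", 0 terminates */
    unsigned long next_swap;                  /* "sector" of next map page, 0 = end */
};

struct swap_map_handle {
    struct swap_map_page *cur;   /* current map page */
    unsigned int k;              /* index into cur->entries[] */
};

/* Stand-in for hib_bio_read_page(): our "disk" is just an array of map pages. */
static struct swap_map_page disk[DISK_PAGES];

static int read_map_page(unsigned long sector, struct swap_map_page *dst)
{
    if (!sector || sector >= DISK_PAGES)
        return -1;
    memcpy(dst, &disk[sector], sizeof(*dst));
    return 0;
}

/* Hand back the next data sector, following next_swap when a map page is used up. */
static long next_data_sector(struct swap_map_handle *handle)
{
    unsigned long offset;

    if (!handle->cur)
        return -1;
    offset = handle->cur->entries[handle->k];
    if (!offset)
        return -1;                      /* the kernel returns -EFAULT here */
    if (++handle->k >= MAP_PAGE_ENTRIES) {
        unsigned long next = handle->cur->next_swap;

        handle->k = 0;
        if (!next || read_map_page(next, handle->cur))
            handle->cur = NULL;         /* end of chain: mirrors release_swap_reader() */
    }
    return (long)offset;
}

int main(void)
{
    struct swap_map_page page;
    struct swap_map_handle h;
    long s;
    int i;

    /* Two chained map pages holding data sectors 10..17 on our fake disk. */
    for (i = 0; i < MAP_PAGE_ENTRIES; i++) {
        disk[1].entries[i] = 10 + i;
        disk[2].entries[i] = 14 + i;
    }
    disk[1].next_swap = 2;              /* map page 1 chains to map page 2 */

    if (read_map_page(1, &page))        /* like get_swap_reader() priming handle->cur */
        return 1;
    h.cur = &page;
    h.k = 0;
    while ((s = next_data_sector(&h)) >= 0)
        printf("data sector %ld\n", s);
    return 0;
}

Keeping the chain in page-sized blocks means the reader never needs more than one map page in memory at a time, which is why get_swap_reader() above allocates exactly one page for handle->cur.
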
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a8c96212bc1b..e819e17877ca 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
151{ 151{
152 struct snapshot_data *data; 152 struct snapshot_data *data;
153 ssize_t res; 153 ssize_t res;
154 loff_t pg_offp = *offp & ~PAGE_MASK;
154 155
155 mutex_lock(&pm_mutex); 156 mutex_lock(&pm_mutex);
156 157
@@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
159 res = -ENODATA; 160 res = -ENODATA;
160 goto Unlock; 161 goto Unlock;
161 } 162 }
162 res = snapshot_read_next(&data->handle, count); 163 if (!pg_offp) { /* on page boundary? */
163 if (res > 0) { 164 res = snapshot_read_next(&data->handle);
164 if (copy_to_user(buf, data_of(data->handle), res)) 165 if (res <= 0)
165 res = -EFAULT; 166 goto Unlock;
166 else 167 } else {
167 *offp = data->handle.offset; 168 res = PAGE_SIZE - pg_offp;
168 } 169 }
169 170
171 res = simple_read_from_buffer(buf, count, &pg_offp,
172 data_of(data->handle), res);
173 if (res > 0)
174 *offp += res;
175
170 Unlock: 176 Unlock:
171 mutex_unlock(&pm_mutex); 177 mutex_unlock(&pm_mutex);
172 178
@@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
178{ 184{
179 struct snapshot_data *data; 185 struct snapshot_data *data;
180 ssize_t res; 186 ssize_t res;
187 loff_t pg_offp = *offp & ~PAGE_MASK;
181 188
182 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
183 190
184 data = filp->private_data; 191 data = filp->private_data;
185 res = snapshot_write_next(&data->handle, count); 192
186 if (res > 0) { 193 if (!pg_offp) {
187 if (copy_from_user(data_of(data->handle), buf, res)) 194 res = snapshot_write_next(&data->handle);
188 res = -EFAULT; 195 if (res <= 0)
189 else 196 goto unlock;
190 *offp = data->handle.offset; 197 } else {
198 res = PAGE_SIZE - pg_offp;
191 } 199 }
192 200
201 res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
202 buf, count);
203 if (res > 0)
204 *offp += res;
205unlock:
193 mutex_unlock(&pm_mutex); 206 mutex_unlock(&pm_mutex);
194 207
195 return res; 208 return res;
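
The snapshot_read()/snapshot_write() rework above calls snapshot_read_next()/snapshot_write_next() only when *offp sits on a page boundary; otherwise it serves the remaining PAGE_SIZE - pg_offp bytes of the current staging page through simple_read_from_buffer()/simple_write_to_buffer(). The boundary arithmetic can be checked in isolation with a userspace sketch; PAGE_SIZE here is a stand-in constant and bytes_in_page() is an illustrative helper, not a kernel function.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/*
 * Given the current file offset, decide whether the next read/write must first
 * produce a fresh staging page (offset on a page boundary) and how many bytes
 * of the current page are still available.
 */
static unsigned long bytes_in_page(unsigned long offp, int *need_new_page)
{
    unsigned long pg_offp = offp & ~PAGE_MASK;   /* offset inside the page */

    *need_new_page = (pg_offp == 0);
    return PAGE_SIZE - pg_offp;
}

int main(void)
{
    unsigned long offsets[] = { 0, 100, 4095, 4096, 8191 };
    int need, i;

    for (i = 0; i < 5; i++) {
        unsigned long avail = bytes_in_page(offsets[i], &need);

        printf("offset %5lu: %s, %lu byte(s) left in page\n",
               offsets[i], need ? "fetch next page" : "reuse page", avail);
    }
    return 0;
}

The practical effect of the hunk is that userspace is no longer forced to read or write the snapshot image in exact PAGE_SIZE chunks; short transfers simply continue within the current staging page.
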
diff --git a/kernel/printk.c b/kernel/printk.c
index ee54355cfdf1..9dc8ea140426 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,9 +33,12 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/kdb.h>
36#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
38#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h>
41#include <linux/notifier.h>
39 42
40#include <asm/uaccess.h> 43#include <asm/uaccess.h>
41 44
@@ -420,6 +423,22 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
420 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 423 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 424}
422 425
426#ifdef CONFIG_KGDB_KDB
427/* kdb dmesg command needs access to the syslog buffer. do_syslog()
428 * uses locks so it cannot be used during debugging. Just tell kdb
429 * where the start and end of the physical and logical logs are. This
430 * is equivalent to do_syslog(3).
431 */
432void kdb_syslog_data(char *syslog_data[4])
433{
434 syslog_data[0] = log_buf;
435 syslog_data[1] = log_buf + log_buf_len;
436 syslog_data[2] = log_buf + log_end -
437 (logged_chars < log_buf_len ? logged_chars : log_buf_len);
438 syslog_data[3] = log_buf + log_end;
439}
440#endif /* CONFIG_KGDB_KDB */
441
423/* 442/*
424 * Call the console drivers on a range of log_buf 443 * Call the console drivers on a range of log_buf
425 */ 444 */
@@ -593,6 +612,14 @@ asmlinkage int printk(const char *fmt, ...)
593 va_list args; 612 va_list args;
594 int r; 613 int r;
595 614
615#ifdef CONFIG_KGDB_KDB
616 if (unlikely(kdb_trap_printk)) {
617 va_start(args, fmt);
618 r = vkdb_printf(fmt, args);
619 va_end(args);
620 return r;
621 }
622#endif
596 va_start(args, fmt); 623 va_start(args, fmt);
597 r = vprintk(fmt, args); 624 r = vprintk(fmt, args);
598 va_end(args); 625 va_end(args);
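
The kdb hook added to printk() above is the standard varargs-forwarding pattern: build a va_list once and hand it to either vkdb_printf() or vprintk(). The same shape in plain C, with vfprintf() standing in for the kernel's v* functions and debug_redirect standing in for kdb_trap_printk:

#include <stdarg.h>
#include <stdio.h>

static int debug_redirect;   /* stand-in for kdb_trap_printk */

/* printf-style wrapper that forwards its arguments as a va_list. */
static int log_printf(const char *fmt, ...)
{
    va_list args;
    int r;

    va_start(args, fmt);
    if (debug_redirect)
        r = vfprintf(stderr, fmt, args);   /* like vkdb_printf() */
    else
        r = vfprintf(stdout, fmt, args);   /* like vprintk() */
    va_end(args);
    return r;
}

int main(void)
{
    log_printf("hello %s, %d\n", "world", 42);
    debug_redirect = 1;
    log_printf("redirected %s\n", "output");
    return 0;
}
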
@@ -970,6 +997,32 @@ void resume_console(void)
970} 997}
971 998
972/** 999/**
1000 * console_cpu_notify - print deferred console messages after CPU hotplug
1001 * @self: notifier struct
1002 * @action: CPU hotplug event
1003 * @hcpu: unused
1004 *
1005 * If printk() is called from a CPU that is not online yet, the messages
1006 * will be spooled but will not show up on the console. This function is
1007 * called when a new CPU comes online (or fails to come up), and ensures
1008 * that any such output gets printed.
1009 */
1010static int __cpuinit console_cpu_notify(struct notifier_block *self,
1011 unsigned long action, void *hcpu)
1012{
1013 switch (action) {
1014 case CPU_ONLINE:
1015 case CPU_DEAD:
1016 case CPU_DYING:
1017 case CPU_DOWN_FAILED:
1018 case CPU_UP_CANCELED:
1019 acquire_console_sem();
1020 release_console_sem();
1021 }
1022 return NOTIFY_OK;
1023}
1024
1025/**
973 * acquire_console_sem - lock the console system for exclusive use. 1026 * acquire_console_sem - lock the console system for exclusive use.
974 * 1027 *
975 * Acquires a semaphore which guarantees that the caller has 1028 * Acquires a semaphore which guarantees that the caller has
@@ -1356,7 +1409,7 @@ int unregister_console(struct console *console)
1356} 1409}
1357EXPORT_SYMBOL(unregister_console); 1410EXPORT_SYMBOL(unregister_console);
1358 1411
1359static int __init disable_boot_consoles(void) 1412static int __init printk_late_init(void)
1360{ 1413{
1361 struct console *con; 1414 struct console *con;
1362 1415
@@ -1367,9 +1420,10 @@ static int __init disable_boot_consoles(void)
1367 unregister_console(con); 1420 unregister_console(con);
1368 } 1421 }
1369 } 1422 }
1423 hotcpu_notifier(console_cpu_notify, 0);
1370 return 0; 1424 return 0;
1371} 1425}
1372late_initcall(disable_boot_consoles); 1426late_initcall(printk_late_init);
1373 1427
1374#if defined CONFIG_PRINTK 1428#if defined CONFIG_PRINTK
1375 1429
@@ -1505,9 +1559,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1505 chars = logged_chars; 1559 chars = logged_chars;
1506 spin_unlock_irqrestore(&logbuf_lock, flags); 1560 spin_unlock_irqrestore(&logbuf_lock, flags);
1507 1561
1508 if (logged_chars > end) { 1562 if (chars > end) {
1509 s1 = log_buf + log_buf_len - logged_chars + end; 1563 s1 = log_buf + log_buf_len - chars + end;
1510 l1 = logged_chars - end; 1564 l1 = chars - end;
1511 1565
1512 s2 = log_buf; 1566 s2 = log_buf;
1513 l2 = end; 1567 l2 = end;
@@ -1515,8 +1569,8 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1515 s1 = ""; 1569 s1 = "";
1516 l1 = 0; 1570 l1 = 0;
1517 1571
1518 s2 = log_buf + end - logged_chars; 1572 s2 = log_buf + end - chars;
1519 l2 = logged_chars; 1573 l2 = chars;
1520 } 1574 }
1521 1575
1522 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1576 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
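
The kmsg_dump() fix above matters because logged_chars is re-read after logbuf_lock has been dropped; the snapshot taken under the lock ("chars") must drive the copy instead. The copy itself expresses the last chars bytes of the circular log as at most two contiguous segments, and that computation can be lifted into a standalone model (last_chars() is an illustrative name):

#include <stdio.h>
#include <string.h>

/*
 * Return the last `chars` bytes of a circular buffer as up to two segments,
 * mirroring the (s1,l1)/(s2,l2) computation in kmsg_dump(). `end` is the
 * logical end index (log_end) reduced modulo the buffer size.
 */
static void last_chars(const char *buf, size_t buf_len, size_t end, size_t chars,
                       const char **s1, size_t *l1, const char **s2, size_t *l2)
{
    if (chars > end) {
        /* wrapped: tail of the buffer first, then the beginning */
        *s1 = buf + buf_len - chars + end;
        *l1 = chars - end;
        *s2 = buf;
        *l2 = end;
    } else {
        *s1 = "";
        *l1 = 0;
        *s2 = buf + end - chars;
        *l2 = chars;
    }
}

int main(void)
{
    char log[8];
    const char *s1, *s2;
    size_t l1, l2;

    /* Pretend "ABCDEFGH" was logged and then wrapped by "123". */
    memcpy(log, "123DEFGH", 8);

    /* end = 3 (three bytes past the wrap point), last 6 chars wanted. */
    last_chars(log, sizeof(log), 3, 6, &s1, &l1, &s2, &l2);
    printf("segment 1: '%.*s'  segment 2: '%.*s'\n", (int)l1, s1, (int)l2, s2);
    return 0;
}

With end = 3 and chars = 6 the example prints segment 1 "FGH" and segment 2 "123", i.e. the six most recently logged bytes in order.
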
diff --git a/kernel/profile.c b/kernel/profile.c
index dfadc5b729f1..b22a899934cc 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -365,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
365 switch (action) { 365 switch (action) {
366 case CPU_UP_PREPARE: 366 case CPU_UP_PREPARE:
367 case CPU_UP_PREPARE_FROZEN: 367 case CPU_UP_PREPARE_FROZEN:
368 node = cpu_to_node(cpu); 368 node = cpu_to_mem(cpu);
369 per_cpu(cpu_profile_flip, cpu) = 0; 369 per_cpu(cpu_profile_flip, cpu) = 0;
370 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 370 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
371 page = alloc_pages_exact_node(node, 371 page = alloc_pages_exact_node(node,
372 GFP_KERNEL | __GFP_ZERO, 372 GFP_KERNEL | __GFP_ZERO,
373 0); 373 0);
374 if (!page) 374 if (!page)
375 return NOTIFY_BAD; 375 return notifier_from_errno(-ENOMEM);
376 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 376 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
377 } 377 }
378 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 378 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
@@ -388,7 +388,7 @@ out_free:
388 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 388 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
389 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 389 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
390 __free_page(page); 390 __free_page(page);
391 return NOTIFY_BAD; 391 return notifier_from_errno(-ENOMEM);
392 case CPU_ONLINE: 392 case CPU_ONLINE:
393 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
394 if (prof_cpu_mask != NULL) 394 if (prof_cpu_mask != NULL)
@@ -567,7 +567,7 @@ static int create_hash_tables(void)
567 int cpu; 567 int cpu;
568 568
569 for_each_online_cpu(cpu) { 569 for_each_online_cpu(cpu) {
570 int node = cpu_to_node(cpu); 570 int node = cpu_to_mem(cpu);
571 struct page *page; 571 struct page *page;
572 572
573 page = alloc_pages_exact_node(node, 573 page = alloc_pages_exact_node(node,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a0..f34d798ef4a2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/ptrace.h> 17#include <linux/ptrace.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/signal.h> 19#include <linux/signal.h>
@@ -76,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child)
76 child->parent = child->real_parent; 75 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 76 list_del_init(&child->ptrace_entry);
78 77
79 arch_ptrace_untrace(child);
80 if (task_is_traced(child)) 78 if (task_is_traced(child))
81 ptrace_untrace(child); 79 ptrace_untrace(child);
82} 80}
@@ -326,26 +324,32 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
326} 324}
327 325
328/* 326/*
329 * Detach all tasks we were using ptrace on. 327 * Detach all tasks we were using ptrace on. Called with tasklist held
328 * for writing, and returns with it held too. But note it can release
329 * and reacquire the lock.
330 */ 330 */
331void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
332{ 332{
333 struct task_struct *p, *n; 333 struct task_struct *p, *n;
334 LIST_HEAD(ptrace_dead); 334 LIST_HEAD(ptrace_dead);
335 335
336 write_lock_irq(&tasklist_lock); 336 if (likely(list_empty(&tracer->ptraced)))
337 return;
338
337 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 339 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
338 if (__ptrace_detach(tracer, p)) 340 if (__ptrace_detach(tracer, p))
339 list_add(&p->ptrace_entry, &ptrace_dead); 341 list_add(&p->ptrace_entry, &ptrace_dead);
340 } 342 }
341 write_unlock_irq(&tasklist_lock);
342 343
344 write_unlock_irq(&tasklist_lock);
343 BUG_ON(!list_empty(&tracer->ptraced)); 345 BUG_ON(!list_empty(&tracer->ptraced));
344 346
345 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { 347 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
346 list_del_init(&p->ptrace_entry); 348 list_del_init(&p->ptrace_entry);
347 release_task(p); 349 release_task(p);
348 } 350 }
351
352 write_lock_irq(&tasklist_lock);
349} 353}
350 354
351int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 355int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
@@ -596,6 +600,32 @@ int ptrace_request(struct task_struct *child, long request,
596 ret = ptrace_detach(child, data); 600 ret = ptrace_detach(child, data);
597 break; 601 break;
598 602
603#ifdef CONFIG_BINFMT_ELF_FDPIC
604 case PTRACE_GETFDPIC: {
605 struct mm_struct *mm = get_task_mm(child);
606 unsigned long tmp = 0;
607
608 ret = -ESRCH;
609 if (!mm)
610 break;
611
612 switch (addr) {
613 case PTRACE_GETFDPIC_EXEC:
614 tmp = mm->context.exec_fdpic_loadmap;
615 break;
616 case PTRACE_GETFDPIC_INTERP:
617 tmp = mm->context.interp_fdpic_loadmap;
618 break;
619 default:
620 break;
621 }
622 mmput(mm);
623
624 ret = put_user(tmp, (unsigned long __user *) data);
625 break;
626 }
627#endif
628
599#ifdef PTRACE_SINGLESTEP 629#ifdef PTRACE_SINGLESTEP
600 case PTRACE_SINGLESTEP: 630 case PTRACE_SINGLESTEP:
601#endif 631#endif
@@ -666,10 +696,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
666 struct task_struct *child; 696 struct task_struct *child;
667 long ret; 697 long ret;
668 698
669 /*
670 * This lock_kernel fixes a subtle race with suid exec
671 */
672 lock_kernel();
673 if (request == PTRACE_TRACEME) { 699 if (request == PTRACE_TRACEME) {
674 ret = ptrace_traceme(); 700 ret = ptrace_traceme();
675 if (!ret) 701 if (!ret)
@@ -703,7 +729,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
703 out_put_task_struct: 729 out_put_task_struct:
704 put_task_struct(child); 730 put_task_struct(child);
705 out: 731 out:
706 unlock_kernel();
707 return ret; 732 return ret;
708} 733}
709 734
@@ -813,10 +838,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
813 struct task_struct *child; 838 struct task_struct *child;
814 long ret; 839 long ret;
815 840
816 /*
817 * This lock_kernel fixes a subtle race with suid exec
818 */
819 lock_kernel();
820 if (request == PTRACE_TRACEME) { 841 if (request == PTRACE_TRACEME) {
821 ret = ptrace_traceme(); 842 ret = ptrace_traceme();
822 goto out; 843 goto out;
@@ -846,7 +867,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
846 out_put_task_struct: 867 out_put_task_struct:
847 put_task_struct(child); 868 put_task_struct(child);
848 out: 869 out:
849 unlock_kernel();
850 return ret; 870 return ret;
851} 871}
852#endif /* CONFIG_COMPAT */ 872#endif /* CONFIG_COMPAT */
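
The lock_kernel()/unlock_kernel() removal above changes only the entry path of sys_ptrace(); the userspace contract is untouched. For reference, the minimal tracer/tracee handshake that exercises the PTRACE_TRACEME path looks like this on Linux (error handling trimmed):

#include <stdio.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
    pid_t child = fork();

    if (child == 0) {
        /* Tracee: ask to be traced, then exec; the exec stops the child. */
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        execlp("echo", "echo", "traced child ran", (char *)NULL);
        _exit(127);
    }

    /* Tracer: wait for the stop at exec, then let the child continue. */
    int status;

    waitpid(child, &status, 0);
    if (WIFSTOPPED(status)) {
        printf("child %d stopped by signal %d, resuming\n",
               child, WSTOPSIG(status));
        ptrace(PTRACE_CONT, child, NULL, NULL);
    }
    waitpid(child, &status, 0);
    return 0;
}
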
diff --git a/kernel/range.c b/kernel/range.c
index 74e2e6114927..471b66acabb5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -7,10 +7,6 @@
7 7
8#include <linux/range.h> 8#include <linux/range.h>
9 9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) 10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{ 11{
16 if (start >= end) 12 if (start >= end)
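
The range.c hunk drops a private ARRAY_SIZE fallback in favour of the definition already provided by <linux/kernel.h>. The macro is the usual element-count idiom and works only on true arrays, not on pointers; a userspace illustration:

#include <stdio.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

struct range { unsigned long long start, end; };

int main(void)
{
    struct range ranges[16];
    int values[] = { 1, 2, 3, 4, 5 };

    printf("ranges holds %zu entries, values holds %zu\n",
           ARRAY_SIZE(ranges), ARRAY_SIZE(values));
    return 0;
}
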
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 49d808e833b0..4d169835fb36 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48#include <linux/hardirq.h> 47#include <linux/hardirq.h>
49 48
50#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -64,9 +63,6 @@ struct lockdep_map rcu_sched_lock_map =
64EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
65#endif 64#endif
66 65
67int rcu_scheduler_active __read_mostly;
68EXPORT_SYMBOL_GPL(rcu_scheduler_active);
69
70#ifdef CONFIG_DEBUG_LOCK_ALLOC 66#ifdef CONFIG_DEBUG_LOCK_ALLOC
71 67
72int debug_lockdep_rcu_enabled(void) 68int debug_lockdep_rcu_enabled(void)
@@ -97,21 +93,6 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
97#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 93#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
98 94
99/* 95/*
100 * This function is invoked towards the end of the scheduler's initialization
101 * process. Before this is called, the idle task might contain
102 * RCU read-side critical sections (during which time, this idle
103 * task is booting the system). After this function is called, the
104 * idle tasks are prohibited from containing RCU read-side critical
105 * sections.
106 */
107void rcu_scheduler_starting(void)
108{
109 WARN_ON(num_online_cpus() != 1);
110 WARN_ON(nr_context_switches() > 0);
111 rcu_scheduler_active = 1;
112}
113
114/*
115 * Awaken the corresponding synchronize_rcu() instance now that a 96 * Awaken the corresponding synchronize_rcu() instance now that a
116 * grace period has elapsed. 97 * grace period has elapsed.
117 */ 98 */
@@ -133,3 +114,163 @@ int rcu_my_thread_group_empty(void)
133} 114}
134EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); 115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
135#endif /* #ifdef CONFIG_PROVE_RCU */ 116#endif /* #ifdef CONFIG_PROVE_RCU */
117
118#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
119static inline void debug_init_rcu_head(struct rcu_head *head)
120{
121 debug_object_init(head, &rcuhead_debug_descr);
122}
123
124static inline void debug_rcu_head_free(struct rcu_head *head)
125{
126 debug_object_free(head, &rcuhead_debug_descr);
127}
128
129/*
130 * fixup_init is called when:
131 * - an active object is initialized
132 */
133static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
134{
135 struct rcu_head *head = addr;
136
137 switch (state) {
138 case ODEBUG_STATE_ACTIVE:
139 /*
140 * Ensure that queued callbacks are all executed.
141 * If we detect that we are nested in a RCU read-side critical
142 * section, we should simply fail, otherwise we would deadlock.
143 */
144 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
145 irqs_disabled()) {
146 WARN_ON(1);
147 return 0;
148 }
149 rcu_barrier();
150 rcu_barrier_sched();
151 rcu_barrier_bh();
152 debug_object_init(head, &rcuhead_debug_descr);
153 return 1;
154 default:
155 return 0;
156 }
157}
158
159/*
160 * fixup_activate is called when:
161 * - an active object is activated
162 * - an unknown object is activated (might be a statically initialized object)
163 * Activation is performed internally by call_rcu().
164 */
165static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
166{
167 struct rcu_head *head = addr;
168
169 switch (state) {
170
171 case ODEBUG_STATE_NOTAVAILABLE:
172 /*
173 * This is not really a fixup. We just make sure that it is
174 * tracked in the object tracker.
175 */
176 debug_object_init(head, &rcuhead_debug_descr);
177 debug_object_activate(head, &rcuhead_debug_descr);
178 return 0;
179
180 case ODEBUG_STATE_ACTIVE:
181 /*
182 * Ensure that queued callbacks are all executed.
183 * If we detect that we are nested in a RCU read-side critical
184 * section, we should simply fail, otherwise we would deadlock.
185 */
186 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
187 irqs_disabled()) {
188 WARN_ON(1);
189 return 0;
190 }
191 rcu_barrier();
192 rcu_barrier_sched();
193 rcu_barrier_bh();
194 debug_object_activate(head, &rcuhead_debug_descr);
195 return 1;
196 default:
197 return 0;
198 }
199}
200
201/*
202 * fixup_free is called when:
203 * - an active object is freed
204 */
205static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
206{
207 struct rcu_head *head = addr;
208
209 switch (state) {
210 case ODEBUG_STATE_ACTIVE:
211 /*
212 * Ensure that queued callbacks are all executed.
213 * If we detect that we are nested in a RCU read-side critical
214 * section, we should simply fail, otherwise we would deadlock.
215 */
216#ifndef CONFIG_PREEMPT
217 WARN_ON(1);
218 return 0;
219#else
220 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
221 irqs_disabled()) {
222 WARN_ON(1);
223 return 0;
224 }
225 rcu_barrier();
226 rcu_barrier_sched();
227 rcu_barrier_bh();
228 debug_object_free(head, &rcuhead_debug_descr);
229 return 1;
230#endif
231 default:
232 return 0;
233 }
234}
235
236/**
237 * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
238 * @head: pointer to rcu_head structure to be initialized
239 *
240 * This function informs debugobjects of a new rcu_head structure that
241 * has been allocated as an auto variable on the stack. This function
242 * is not required for rcu_head structures that are statically defined or
243 * that are dynamically allocated on the heap. This function has no
244 * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
245 */
246void init_rcu_head_on_stack(struct rcu_head *head)
247{
248 debug_object_init_on_stack(head, &rcuhead_debug_descr);
249}
250EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
251
252/**
253 * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
254 * @head: pointer to rcu_head structure to be initialized
255 *
256 * This function informs debugobjects that an on-stack rcu_head structure
257 * is about to go out of scope. As with init_rcu_head_on_stack(), this
258 * function is not required for rcu_head structures that are statically
259 * defined or that are dynamically allocated on the heap. Also as with
260 * init_rcu_head_on_stack(), this function has no effect for
261 * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
262 */
263void destroy_rcu_head_on_stack(struct rcu_head *head)
264{
265 debug_object_free(head, &rcuhead_debug_descr);
266}
267EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
268
269struct debug_obj_descr rcuhead_debug_descr = {
270 .name = "rcu_head",
271 .fixup_init = rcuhead_fixup_init,
272 .fixup_activate = rcuhead_fixup_activate,
273 .fixup_free = rcuhead_fixup_free,
274};
275EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
276#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
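
The new init_rcu_head_on_stack()/destroy_rcu_head_on_stack() helpers are only needed by callers that put an rcu_head in an automatic variable; the rcu_barrier() and synchronize_*() hunks later in this patch show the intended bracketing. Below is a caller-side sketch of that pattern, not part of this patch: it assumes kernel context and CONFIG_DEBUG_OBJECTS_RCU_HEAD, and my_wait, my_cb() and wait_one_grace_period() are hypothetical names.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/completion.h>

struct my_wait {
	struct rcu_head head;
	struct completion done;
};

static void my_cb(struct rcu_head *head)
{
	struct my_wait *w = container_of(head, struct my_wait, head);

	complete(&w->done);
}

static void wait_one_grace_period(void)
{
	struct my_wait w;

	init_rcu_head_on_stack(&w.head);	/* tell debugobjects about the on-stack head */
	init_completion(&w.done);
	call_rcu(&w.head, my_cb);
	wait_for_completion(&w.done);
	destroy_rcu_head_on_stack(&w.head);	/* head is about to go out of scope */
}

Without the two helpers, debugobjects would have to treat the on-stack rcu_head as an unknown object when call_rcu() activates it (the ODEBUG_STATE_NOTAVAILABLE case handled above), and it could not catch the head going out of scope while still queued.
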
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572c..196ec02f8be0 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
44}; 44};
45 45
46/* Definition for rcupdate control block. */ 46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = { 47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist, 48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist, 49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50}; 50};
51 51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = { 52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
54 .curtail = &rcu_bh_ctrlblk.rcucblist, 54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55}; 55};
56 56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61
57#ifdef CONFIG_NO_HZ 62#ifdef CONFIG_NO_HZ
58 63
59static long rcu_dynticks_nesting = 1; 64static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
108 */ 113 */
109void rcu_sched_qs(int cpu) 114void rcu_sched_qs(int cpu)
110{ 115{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) 116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ); 118 raise_softirq(RCU_SOFTIRQ);
113} 119}
114 120
@@ -163,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
163 while (list) { 169 while (list) {
164 next = list->next; 170 next = list->next;
165 prefetch(next); 171 prefetch(next);
172 debug_rcu_head_unqueue(list);
166 list->func(list); 173 list->func(list);
167 list = next; 174 list = next;
168 } 175 }
@@ -173,7 +180,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
173 */ 180 */
174static void rcu_process_callbacks(struct softirq_action *unused) 181static void rcu_process_callbacks(struct softirq_action *unused)
175{ 182{
176 __rcu_process_callbacks(&rcu_ctrlblk); 183 __rcu_process_callbacks(&rcu_sched_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk); 184 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178} 185}
179 186
@@ -187,7 +194,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
187 * 194 *
188 * Cool, huh? (Due to Josh Triplett.) 195 * Cool, huh? (Due to Josh Triplett.)
189 * 196 *
190 * But we want to make this a static inline later. 197 * But we want to make this a static inline later. The cond_resched()
198 * currently makes this problematic.
191 */ 199 */
192void synchronize_sched(void) 200void synchronize_sched(void)
193{ 201{
@@ -195,12 +203,6 @@ void synchronize_sched(void)
195} 203}
196EXPORT_SYMBOL_GPL(synchronize_sched); 204EXPORT_SYMBOL_GPL(synchronize_sched);
197 205
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/* 206/*
205 * Helper function for call_rcu() and call_rcu_bh(). 207 * Helper function for call_rcu() and call_rcu_bh().
206 */ 208 */
@@ -210,6 +212,7 @@ static void __call_rcu(struct rcu_head *head,
210{ 212{
211 unsigned long flags; 213 unsigned long flags;
212 214
215 debug_rcu_head_queue(head);
213 head->func = func; 216 head->func = func;
214 head->next = NULL; 217 head->next = NULL;
215 218
@@ -226,7 +229,7 @@ static void __call_rcu(struct rcu_head *head,
226 */ 229 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 230void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{ 231{
229 __call_rcu(head, func, &rcu_ctrlblk); 232 __call_rcu(head, func, &rcu_sched_ctrlblk);
230} 233}
231EXPORT_SYMBOL_GPL(call_rcu); 234EXPORT_SYMBOL_GPL(call_rcu);
232 235
@@ -244,11 +247,13 @@ void rcu_barrier(void)
244{ 247{
245 struct rcu_synchronize rcu; 248 struct rcu_synchronize rcu;
246 249
250 init_rcu_head_on_stack(&rcu.head);
247 init_completion(&rcu.completion); 251 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */ 252 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu); 253 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */ 254 /* Wait for it. */
251 wait_for_completion(&rcu.completion); 255 wait_for_completion(&rcu.completion);
256 destroy_rcu_head_on_stack(&rcu.head);
252} 257}
253EXPORT_SYMBOL_GPL(rcu_barrier); 258EXPORT_SYMBOL_GPL(rcu_barrier);
254 259
@@ -256,11 +261,13 @@ void rcu_barrier_bh(void)
256{ 261{
257 struct rcu_synchronize rcu; 262 struct rcu_synchronize rcu;
258 263
264 init_rcu_head_on_stack(&rcu.head);
259 init_completion(&rcu.completion); 265 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */ 266 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu); 267 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */ 268 /* Wait for it. */
263 wait_for_completion(&rcu.completion); 269 wait_for_completion(&rcu.completion);
270 destroy_rcu_head_on_stack(&rcu.head);
264} 271}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh); 272EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266 273
@@ -268,11 +275,13 @@ void rcu_barrier_sched(void)
268{ 275{
269 struct rcu_synchronize rcu; 276 struct rcu_synchronize rcu;
270 277
278 init_rcu_head_on_stack(&rcu.head);
271 init_completion(&rcu.completion); 279 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */ 280 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu); 281 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */ 282 /* Wait for it. */
275 wait_for_completion(&rcu.completion); 283 wait_for_completion(&rcu.completion);
284 destroy_rcu_head_on_stack(&rcu.head);
276} 285}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched); 286EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278 287
@@ -280,3 +289,5 @@ void __init rcu_init(void)
280{ 289{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282} 291}
292
293#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 000000000000..d223a92bc742
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright IBM Corporation, 2009
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#ifdef CONFIG_DEBUG_LOCK_ALLOC
26
27#include <linux/kernel_stat.h>
28
29/*
30 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously.
32 */
33void rcu_scheduler_starting(void)
34{
35 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1;
37}
38
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83ed..2e2726d790b9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -239,8 +239,7 @@ static unsigned long
239rcu_random(struct rcu_random_state *rrsp) 239rcu_random(struct rcu_random_state *rrsp)
240{ 240{
241 if (--rrsp->rrs_count < 0) { 241 if (--rrsp->rrs_count < 0) {
242 rrsp->rrs_state += 242 rrsp->rrs_state += (unsigned long)local_clock();
243 (unsigned long)cpu_clock(raw_smp_processor_id());
244 rrsp->rrs_count = RCU_RANDOM_REFRESH; 243 rrsp->rrs_count = RCU_RANDOM_REFRESH;
245 } 244 }
246 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 245 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
@@ -464,9 +463,11 @@ static void rcu_bh_torture_synchronize(void)
464{ 463{
465 struct rcu_bh_torture_synchronize rcu; 464 struct rcu_bh_torture_synchronize rcu;
466 465
466 init_rcu_head_on_stack(&rcu.head);
467 init_completion(&rcu.completion); 467 init_completion(&rcu.completion);
468 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 468 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
469 wait_for_completion(&rcu.completion); 469 wait_for_completion(&rcu.completion);
470 destroy_rcu_head_on_stack(&rcu.head);
470} 471}
471 472
472static struct rcu_torture_ops rcu_bh_ops = { 473static struct rcu_torture_ops rcu_bh_ops = {
@@ -669,7 +670,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
669 .sync = synchronize_sched_expedited, 670 .sync = synchronize_sched_expedited,
670 .cb_barrier = NULL, 671 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state, 672 .fqs = rcu_sched_force_quiescent_state,
672 .stats = rcu_expedited_torture_stats, 673 .stats = NULL,
673 .irq_capable = 1, 674 .irq_capable = 1,
674 .name = "sched_expedited" 675 .name = "sched_expedited"
675}; 676};
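
The rcutorture hunk swaps cpu_clock(raw_smp_processor_id()) for local_clock() when reseeding its private generator; the generator itself is a plain linear congruential sequence that is periodically perturbed by a clock. A userspace analogue follows; the multiplier, increment and refresh interval are arbitrary illustrative values, not the kernel's RCU_RANDOM_* constants.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <time.h>

#define RANDOM_MULT    69069UL    /* arbitrary LCG constants for illustration */
#define RANDOM_ADD     2472719UL
#define RANDOM_REFRESH 10000      /* reseed from the clock this often */

struct random_state {
    unsigned long state;
    long count;
};

static unsigned long clock_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long)ts.tv_sec * 1000000000UL + (unsigned long)ts.tv_nsec;
}

static unsigned long next_random(struct random_state *rs)
{
    if (--rs->count < 0) {
        rs->state += clock_ns();     /* like "rrs_state += local_clock()" above */
        rs->count = RANDOM_REFRESH;
    }
    rs->state = rs->state * RANDOM_MULT + RANDOM_ADD;
    return rs->state;
}

int main(void)
{
    struct random_state rs = { 0, 0 };

    for (int i = 0; i < 5; i++)
        printf("%lu\n", next_random(&rs) & 0xffff);
    return 0;
}
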
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3ec8160fc75f..d5bc43976c5a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
@@ -53,8 +54,8 @@
53 54
54static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55 56
56#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(structname) { \
57 .level = { &name.node[0] }, \ 58 .level = { &structname.node[0] }, \
58 .levelcnt = { \ 59 .levelcnt = { \
59 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
60 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
@@ -65,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
65 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
66 .gpnum = -300, \ 67 .gpnum = -300, \
67 .completed = -300, \ 68 .completed = -300, \
68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
69 .orphan_cbs_list = NULL, \ 70 .orphan_cbs_list = NULL, \
70 .orphan_cbs_tail = &name.orphan_cbs_list, \ 71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
71 .orphan_qlen = 0, \ 72 .orphan_qlen = 0, \
72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
73 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
74 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
76 .name = #structname, \
75} 77}
76 78
77struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -80,6 +82,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
82 84
85int rcu_scheduler_active __read_mostly;
86EXPORT_SYMBOL_GPL(rcu_scheduler_active);
87
83/* 88/*
84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
85 * permit this function to be invoked without holding the root rcu_node 90 * permit this function to be invoked without holding the root rcu_node
@@ -97,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
97 */ 102 */
98void rcu_sched_qs(int cpu) 103void rcu_sched_qs(int cpu)
99{ 104{
100 struct rcu_data *rdp; 105 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
101 106
102 rdp = &per_cpu(rcu_sched_data, cpu);
103 rdp->passed_quiesc_completed = rdp->gpnum - 1; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
104 barrier(); 108 barrier();
105 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
106 rcu_preempt_note_context_switch(cpu);
107} 110}
108 111
109void rcu_bh_qs(int cpu) 112void rcu_bh_qs(int cpu)
110{ 113{
111 struct rcu_data *rdp; 114 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
112 115
113 rdp = &per_cpu(rcu_bh_data, cpu);
114 rdp->passed_quiesc_completed = rdp->gpnum - 1; 116 rdp->passed_quiesc_completed = rdp->gpnum - 1;
115 barrier(); 117 barrier();
116 rdp->passed_quiesc = 1; 118 rdp->passed_quiesc = 1;
117} 119}
118 120
121/*
122 * Note a context switch. This is a quiescent state for RCU-sched,
123 * and requires special handling for preemptible RCU.
124 */
125void rcu_note_context_switch(int cpu)
126{
127 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu);
129}
130
119#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ
120DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
121 .dynticks_nesting = 1, 133 .dynticks_nesting = 1,
@@ -438,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
438 450
439#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
440 452
453int rcu_cpu_stall_panicking __read_mostly;
454
441static void record_gp_stall_check_time(struct rcu_state *rsp) 455static void record_gp_stall_check_time(struct rcu_state *rsp)
442{ 456{
443 rsp->gp_start = jiffies; 457 rsp->gp_start = jiffies;
@@ -470,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
470 484
471 /* OK, time to rat on our buddy... */ 485 /* OK, time to rat on our buddy... */
472 486
473 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name);
474 rcu_for_each_leaf_node(rsp, rnp) { 489 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags); 490 raw_spin_lock_irqsave(&rnp->lock, flags);
476 rcu_print_task_stall(rnp); 491 rcu_print_task_stall(rnp);
@@ -481,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
481 if (rnp->qsmask & (1UL << cpu)) 496 if (rnp->qsmask & (1UL << cpu))
482 printk(" %d", rnp->grplo + cpu); 497 printk(" %d", rnp->grplo + cpu);
483 } 498 }
484 printk(" (detected by %d, t=%ld jiffies)\n", 499 printk("} (detected by %d, t=%ld jiffies)\n",
485 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 500 smp_processor_id(), (long)(jiffies - rsp->gp_start));
486 trigger_all_cpu_backtrace(); 501 trigger_all_cpu_backtrace();
487 502
@@ -497,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
497 unsigned long flags; 512 unsigned long flags;
498 struct rcu_node *rnp = rcu_get_root(rsp); 513 struct rcu_node *rnp = rcu_get_root(rsp);
499 514
500 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
501 smp_processor_id(), jiffies - rsp->gp_start); 516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
502 trigger_all_cpu_backtrace(); 517 trigger_all_cpu_backtrace();
503 518
504 raw_spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -515,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
515 long delta; 530 long delta;
516 struct rcu_node *rnp; 531 struct rcu_node *rnp;
517 532
533 if (rcu_cpu_stall_panicking)
534 return;
518 delta = jiffies - rsp->jiffies_stall; 535 delta = jiffies - rsp->jiffies_stall;
519 rnp = rdp->mynode; 536 rnp = rdp->mynode;
520 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -529,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
529 } 546 }
530} 547}
531 548
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{
551 rcu_cpu_stall_panicking = 1;
552 return NOTIFY_DONE;
553}
554
555static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic,
557};
558
559static void __init check_cpu_stall_init(void)
560{
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562}
563
532#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
533 565
534static void record_gp_stall_check_time(struct rcu_state *rsp) 566static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -539,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 571{
540} 572}
541 573
574static void __init check_cpu_stall_init(void)
575{
576}
577
542#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
543 579
544/* 580/*
@@ -1076,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1076 while (list) { 1112 while (list) {
1077 next = list->next; 1113 next = list->next;
1078 prefetch(next); 1114 prefetch(next);
1115 debug_rcu_head_unqueue(list);
1079 list->func(list); 1116 list->func(list);
1080 list = next; 1117 list = next;
1081 if (++count >= rdp->blimit) 1118 if (++count >= rdp->blimit)
@@ -1125,8 +1162,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1125 */ 1162 */
1126void rcu_check_callbacks(int cpu, int user) 1163void rcu_check_callbacks(int cpu, int user)
1127{ 1164{
1128 if (!rcu_pending(cpu))
1129 return; /* if nothing for RCU to do. */
1130 if (user || 1165 if (user ||
1131 (idle_cpu(cpu) && rcu_scheduler_active && 1166 (idle_cpu(cpu) && rcu_scheduler_active &&
1132 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1167 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1158,7 +1193,8 @@ void rcu_check_callbacks(int cpu, int user)
1158 rcu_bh_qs(cpu); 1193 rcu_bh_qs(cpu);
1159 } 1194 }
1160 rcu_preempt_check_callbacks(cpu); 1195 rcu_preempt_check_callbacks(cpu);
1161 raise_softirq(RCU_SOFTIRQ); 1196 if (rcu_pending(cpu))
1197 raise_softirq(RCU_SOFTIRQ);
1162} 1198}
1163 1199
1164#ifdef CONFIG_SMP 1200#ifdef CONFIG_SMP
@@ -1236,11 +1272,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1236 break; /* grace period idle or initializing, ignore. */ 1272 break; /* grace period idle or initializing, ignore. */
1237 1273
1238 case RCU_SAVE_DYNTICK: 1274 case RCU_SAVE_DYNTICK:
1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1275 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1242 break; /* So gcc recognizes the dead code. */ 1276 break; /* So gcc recognizes the dead code. */
1243 1277
1278 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1279
1244 /* Record dyntick-idle state. */ 1280 /* Record dyntick-idle state. */
1245 force_qs_rnp(rsp, dyntick_save_progress_counter); 1281 force_qs_rnp(rsp, dyntick_save_progress_counter);
1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1282 raw_spin_lock(&rnp->lock); /* irqs already disabled */
@@ -1353,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1353 unsigned long flags; 1389 unsigned long flags;
1354 struct rcu_data *rdp; 1390 struct rcu_data *rdp;
1355 1391
1392 debug_rcu_head_queue(head);
1356 head->func = func; 1393 head->func = func;
1357 head->next = NULL; 1394 head->next = NULL;
1358 1395
@@ -1449,11 +1486,13 @@ void synchronize_sched(void)
1449 if (rcu_blocking_is_gp()) 1486 if (rcu_blocking_is_gp())
1450 return; 1487 return;
1451 1488
1489 init_rcu_head_on_stack(&rcu.head);
1452 init_completion(&rcu.completion); 1490 init_completion(&rcu.completion);
1453 /* Will wake me after RCU finished. */ 1491 /* Will wake me after RCU finished. */
1454 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1492 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1455 /* Wait for it. */ 1493 /* Wait for it. */
1456 wait_for_completion(&rcu.completion); 1494 wait_for_completion(&rcu.completion);
1495 destroy_rcu_head_on_stack(&rcu.head);
1457} 1496}
1458EXPORT_SYMBOL_GPL(synchronize_sched); 1497EXPORT_SYMBOL_GPL(synchronize_sched);
1459 1498
@@ -1473,11 +1512,13 @@ void synchronize_rcu_bh(void)
1473 if (rcu_blocking_is_gp()) 1512 if (rcu_blocking_is_gp())
1474 return; 1513 return;
1475 1514
1515 init_rcu_head_on_stack(&rcu.head);
1476 init_completion(&rcu.completion); 1516 init_completion(&rcu.completion);
1477 /* Will wake me after RCU finished. */ 1517 /* Will wake me after RCU finished. */
1478 call_rcu_bh(&rcu.head, wakeme_after_rcu); 1518 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1479 /* Wait for it. */ 1519 /* Wait for it. */
1480 wait_for_completion(&rcu.completion); 1520 wait_for_completion(&rcu.completion);
1521 destroy_rcu_head_on_stack(&rcu.head);
1481} 1522}
1482EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1523EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1483 1524
@@ -1498,8 +1539,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1498 check_cpu_stall(rsp, rdp); 1539 check_cpu_stall(rsp, rdp);
1499 1540
1500 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1541 /* Is the RCU core waiting for a quiescent state from this CPU? */
1501 if (rdp->qs_pending) { 1542 if (rdp->qs_pending && !rdp->passed_quiesc) {
1543
1544 /*
1545 * If force_quiescent_state() coming soon and this CPU
1546 * needs a quiescent state, and this is either RCU-sched
1547 * or RCU-bh, force a local reschedule.
1548 */
1502 rdp->n_rp_qs_pending++; 1549 rdp->n_rp_qs_pending++;
1550 if (!rdp->preemptable &&
1551 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1552 jiffies))
1553 set_need_resched();
1554 } else if (rdp->qs_pending && rdp->passed_quiesc) {
1555 rdp->n_rp_report_qs++;
1503 return 1; 1556 return 1;
1504 } 1557 }
1505 1558
@@ -1767,6 +1820,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1767} 1820}
1768 1821
1769/* 1822/*
1823 * This function is invoked towards the end of the scheduler's initialization
1824 * process. Before this is called, the idle task might contain
1825 * RCU read-side critical sections (during which time, this idle
1826 * task is booting the system). After this function is called, the
1827 * idle tasks are prohibited from containing RCU read-side critical
1828 * sections. This function also enables RCU lockdep checking.
1829 */
1830void rcu_scheduler_starting(void)
1831{
1832 WARN_ON(num_online_cpus() != 1);
1833 WARN_ON(nr_context_switches() > 0);
1834 rcu_scheduler_active = 1;
1835}
1836
1837/*
1770 * Compute the per-level fanout, either using the exact fanout specified 1838 * Compute the per-level fanout, either using the exact fanout specified
1771 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 1839 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1772 */ 1840 */
@@ -1849,6 +1917,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]); 1917 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1850 } 1918 }
1851 } 1919 }
1920
1921 rnp = rsp->level[NUM_RCU_LVLS - 1];
1922 for_each_possible_cpu(i) {
1923 while (i > rnp->grphi)
1924 rnp++;
1925 rsp->rda[i]->mynode = rnp;
1926 rcu_boot_init_percpu_data(i, rsp);
1927 }
1852} 1928}
1853 1929
1854/* 1930/*
@@ -1859,19 +1935,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1935#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1860do { \ 1936do { \
1861 int i; \ 1937 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \ 1938 \
1865 rcu_init_one(rsp); \
1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1867 j = 0; \
1868 for_each_possible_cpu(i) { \ 1939 for_each_possible_cpu(i) { \
1869 if (i > rnp[j].grphi) \
1870 j++; \
1871 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1872 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1940 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1873 rcu_boot_init_percpu_data(i, rsp); \
1874 } \ 1941 } \
1942 rcu_init_one(rsp); \
1875} while (0) 1943} while (0)
1876 1944
1877void __init rcu_init(void) 1945void __init rcu_init(void)
@@ -1879,12 +1947,6 @@ void __init rcu_init(void)
1879 int cpu; 1947 int cpu;
1880 1948
1881 rcu_bootup_announce(); 1949 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1950 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1951 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1890 __rcu_init_preempt(); 1952 __rcu_init_preempt();
@@ -1898,6 +1960,7 @@ void __init rcu_init(void)
1898 cpu_notifier(rcu_cpu_notify, 0); 1960 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(cpu) 1961 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 1962 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1963 check_cpu_stall_init();
1901} 1964}
1902 1965
1903#include "rcutree_plugin.h" 1966#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a525a30e08e..14c040b18ed0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -223,6 +223,7 @@ struct rcu_data {
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 unsigned long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 unsigned long n_rp_report_qs;
226 unsigned long n_rp_cb_ready; 227 unsigned long n_rp_cb_ready;
227 unsigned long n_rp_cpu_needs_gp; 228 unsigned long n_rp_cpu_needs_gp;
228 unsigned long n_rp_gp_completed; 229 unsigned long n_rp_gp_completed;
@@ -326,6 +327,7 @@ struct rcu_state {
326 unsigned long jiffies_stall; /* Time at which to check */ 327 unsigned long jiffies_stall; /* Time at which to check */
327 /* for CPU stalls. */ 328 /* for CPU stalls. */
328#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
330 char *name; /* Name of structure. */
329}; 331};
330 332
331/* Return values for rcu_preempt_offline_tasks(). */ 333/* Return values for rcu_preempt_offline_tasks(). */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 79b53bda8943..0e4f420245d9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28 28
29/*
30 * Check the RCU kernel configuration parameters and print informative
31 * messages about anything out of the ordinary. If you like #ifdef, you
32 * will love this function.
33 */
34static void __init rcu_bootup_announce_oddness(void)
35{
36#ifdef CONFIG_RCU_TRACE
37 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
38#endif
39#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
40 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
41 CONFIG_RCU_FANOUT);
42#endif
43#ifdef CONFIG_RCU_FANOUT_EXACT
44 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
45#endif
46#ifdef CONFIG_RCU_FAST_NO_HZ
47 printk(KERN_INFO
48 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
49#endif
50#ifdef CONFIG_PROVE_RCU
51 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
52#endif
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif
63#if NUM_RCU_LVL_4 != 0
64 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
65#endif
66}
67
29#ifdef CONFIG_TREE_PREEMPT_RCU 68#ifdef CONFIG_TREE_PREEMPT_RCU
30 69
31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
38 */ 77 */
39static void __init rcu_bootup_announce(void) 78static void __init rcu_bootup_announce(void)
40{ 79{
41 printk(KERN_INFO 80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
42 "Experimental preemptable hierarchical RCU implementation.\n"); 81 rcu_bootup_announce_oddness();
43} 82}
44 83
45/* 84/*
@@ -75,13 +114,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
75 * that this just means that the task currently running on the CPU is 114 * that this just means that the task currently running on the CPU is
76 * not in a quiescent state. There might be any number of tasks blocked 115 * not in a quiescent state. There might be any number of tasks blocked
77 * while in an RCU read-side critical section. 116 * while in an RCU read-side critical section.
117 *
118 * Unlike the other rcu_*_qs() functions, callers to this function
119 * must disable irqs in order to protect the assignment to
120 * ->rcu_read_unlock_special.
78 */ 121 */
79static void rcu_preempt_qs(int cpu) 122static void rcu_preempt_qs(int cpu)
80{ 123{
81 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 124 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
125
82 rdp->passed_quiesc_completed = rdp->gpnum - 1; 126 rdp->passed_quiesc_completed = rdp->gpnum - 1;
83 barrier(); 127 barrier();
84 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
129 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
85} 130}
86 131
87/* 132/*
@@ -144,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
144 * grace period, then the fact that the task has been enqueued 189 * grace period, then the fact that the task has been enqueued
145 * means that we continue to block the current grace period. 190 * means that we continue to block the current grace period.
146 */ 191 */
147 rcu_preempt_qs(cpu);
148 local_irq_save(flags); 192 local_irq_save(flags);
149 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 193 rcu_preempt_qs(cpu);
150 local_irq_restore(flags); 194 local_irq_restore(flags);
151} 195}
152 196
@@ -236,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
236 */ 280 */
237 special = t->rcu_read_unlock_special; 281 special = t->rcu_read_unlock_special;
238 if (special & RCU_READ_UNLOCK_NEED_QS) { 282 if (special & RCU_READ_UNLOCK_NEED_QS) {
239 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
240 rcu_preempt_qs(smp_processor_id()); 283 rcu_preempt_qs(smp_processor_id());
241 } 284 }
242 285
@@ -473,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
473 struct task_struct *t = current; 516 struct task_struct *t = current;
474 517
475 if (t->rcu_read_lock_nesting == 0) { 518 if (t->rcu_read_lock_nesting == 0) {
476 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
477 rcu_preempt_qs(cpu); 519 rcu_preempt_qs(cpu);
478 return; 520 return;
479 } 521 }
@@ -515,11 +557,13 @@ void synchronize_rcu(void)
515 if (!rcu_scheduler_active) 557 if (!rcu_scheduler_active)
516 return; 558 return;
517 559
560 init_rcu_head_on_stack(&rcu.head);
518 init_completion(&rcu.completion); 561 init_completion(&rcu.completion);
519 /* Will wake me after RCU finished. */ 562 /* Will wake me after RCU finished. */
520 call_rcu(&rcu.head, wakeme_after_rcu); 563 call_rcu(&rcu.head, wakeme_after_rcu);
521 /* Wait for it. */ 564 /* Wait for it. */
522 wait_for_completion(&rcu.completion); 565 wait_for_completion(&rcu.completion);
566 destroy_rcu_head_on_stack(&rcu.head);
523} 567}
524EXPORT_SYMBOL_GPL(synchronize_rcu); 568EXPORT_SYMBOL_GPL(synchronize_rcu);
525 569
@@ -754,6 +798,7 @@ void exit_rcu(void)
754static void __init rcu_bootup_announce(void) 798static void __init rcu_bootup_announce(void)
755{ 799{
756 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 800 printk(KERN_INFO "Hierarchical RCU implementation.\n");
801 rcu_bootup_announce_oddness();
757} 802}
758 803
759/* 804/*
@@ -1008,6 +1053,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1008int rcu_needs_cpu(int cpu) 1053int rcu_needs_cpu(int cpu)
1009{ 1054{
1010 int c = 0; 1055 int c = 0;
1056 int snap;
1057 int snap_nmi;
1011 int thatcpu; 1058 int thatcpu;
1012 1059
1013 /* Check for being in the holdoff period. */ 1060 /* Check for being in the holdoff period. */
@@ -1015,12 +1062,18 @@ int rcu_needs_cpu(int cpu)
1015 return rcu_needs_cpu_quick_check(cpu); 1062 return rcu_needs_cpu_quick_check(cpu);
1016 1063
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 1064 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask) 1065 for_each_online_cpu(thatcpu) {
1019 if (thatcpu != cpu) { 1066 if (thatcpu == cpu)
1067 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
1070 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0; 1072 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu); 1074 return rcu_needs_cpu_quick_check(cpu);
1023 } 1075 }
1076 }
1024 1077
1025 /* Check and update the rcu_dyntick_drain sequencing. */ 1078 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 1079 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
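The rewritten rcu_needs_cpu() loop above samples each other CPU's dynticks and dynticks_nmi counters and tests their low bit; in this kernel's convention an odd count means the CPU is currently outside dyntick-idle, so finding any such CPU lets the caller fall back to the quick check. A minimal standalone sketch of that parity test, with made-up snapshot values standing in for the per-CPU counters:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical snapshot of one CPU's dyntick counters (the kernel keeps
 * these per CPU in struct rcu_dynticks). An odd value means the CPU is
 * between an exit from and the next entry into dyntick-idle mode. */
struct dyntick_snap {
	int dynticks;
	int dynticks_nmi;
};

static bool cpu_is_non_idle(const struct dyntick_snap *s)
{
	/* Same test as the patched loop: odd in either counter. */
	return ((s->dynticks & 0x1) != 0) || ((s->dynticks_nmi & 0x1) != 0);
}

int main(void)
{
	struct dyntick_snap idle = { .dynticks = 4, .dynticks_nmi = 2 };
	struct dyntick_snap busy = { .dynticks = 5, .dynticks_nmi = 2 };

	printf("idle sample non-idle? %d\n", cpu_is_non_idle(&idle)); /* 0 */
	printf("busy sample non-idle? %d\n", cpu_is_non_idle(&busy)); /* 1 */
	return 0;
}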
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d45db2e35d27..36c95b45738e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
242{ 242{
243 seq_printf(m, "%3d%cnp=%ld " 243 seq_printf(m, "%3d%cnp=%ld "
244 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", 244 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
245 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
245 rdp->cpu, 246 rdp->cpu,
246 cpu_is_offline(rdp->cpu) ? '!' : ' ', 247 cpu_is_offline(rdp->cpu) ? '!' : ' ',
247 rdp->n_rcu_pending, 248 rdp->n_rcu_pending,
248 rdp->n_rp_qs_pending, 249 rdp->n_rp_qs_pending,
250 rdp->n_rp_report_qs,
249 rdp->n_rp_cb_ready, 251 rdp->n_rp_cb_ready,
250 rdp->n_rp_cpu_needs_gp, 252 rdp->n_rp_cpu_needs_gp,
251 rdp->n_rp_gp_completed, 253 rdp->n_rp_gp_completed,
diff --git a/kernel/relay.c b/kernel/relay.c
index 3d97f2821611..c7cf397fb929 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
539 "relay_hotcpu_callback: cpu %d buffer " 539 "relay_hotcpu_callback: cpu %d buffer "
540 "creation failed\n", hotcpu); 540 "creation failed\n", hotcpu);
541 mutex_unlock(&relay_channels_mutex); 541 mutex_unlock(&relay_channels_mutex);
542 return NOTIFY_BAD; 542 return notifier_from_errno(-ENOMEM);
543 } 543 }
544 } 544 }
545 mutex_unlock(&relay_channels_mutex); 545 mutex_unlock(&relay_channels_mutex);
@@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1231 size_t read_subbuf = read_start / subbuf_size; 1231 size_t read_subbuf = read_start / subbuf_size;
1232 size_t padding = rbuf->padding[read_subbuf]; 1232 size_t padding = rbuf->padding[read_subbuf];
1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; 1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1234 struct page *pages[PIPE_BUFFERS]; 1234 struct page *pages[PIPE_DEF_BUFFERS];
1235 struct partial_page partial[PIPE_BUFFERS]; 1235 struct partial_page partial[PIPE_DEF_BUFFERS];
1236 struct splice_pipe_desc spd = { 1236 struct splice_pipe_desc spd = {
1237 .pages = pages, 1237 .pages = pages,
1238 .nr_pages = 0, 1238 .nr_pages = 0,
@@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1245 1245
1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1247 return 0; 1247 return 0;
1248 if (splice_grow_spd(pipe, &spd))
1249 return -ENOMEM;
1248 1250
1249 /* 1251 /*
1250 * Adjust read len, if longer than what is available 1252 * Adjust read len, if longer than what is available
@@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
1255 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1257 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1256 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1258 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1257 poff = read_start & ~PAGE_MASK; 1259 poff = read_start & ~PAGE_MASK;
1258 nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); 1260 nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
1259 1261
1260 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { 1262 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
1261 unsigned int this_len, this_end, private; 1263 unsigned int this_len, this_end, private;
@@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in,
1289 } 1291 }
1290 } 1292 }
1291 1293
1294 ret = 0;
1292 if (!spd.nr_pages) 1295 if (!spd.nr_pages)
1293 return 0; 1296 goto out;
1294 1297
1295 ret = *nonpad_ret = splice_to_pipe(pipe, &spd); 1298 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1296 if (ret < 0 || ret < total_len) 1299 if (ret < 0 || ret < total_len)
1297 return ret; 1300 goto out;
1298 1301
1299 if (read_start + ret == nonpad_end) 1302 if (read_start + ret == nonpad_end)
1300 ret += padding; 1303 ret += padding;
1301 1304
1305out:
1306 splice_shrink_spd(pipe, &spd);
1302 return ret; 1307 return ret;
1303} 1308}
1304 1309
diff --git a/kernel/resource.c b/kernel/resource.c
index 9c358e263534..7b36976e5dea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -15,6 +15,7 @@
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/sched.h>
18#include <linux/seq_file.h> 19#include <linux/seq_file.h>
19#include <linux/device.h> 20#include <linux/device.h>
20#include <linux/pfn.h> 21#include <linux/pfn.h>
@@ -681,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res)
681 * release_region releases a matching busy region. 682 * release_region releases a matching busy region.
682 */ 683 */
683 684
685static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
686
684/** 687/**
685 * __request_region - create a new busy resource region 688 * __request_region - create a new busy resource region
686 * @parent: parent resource descriptor 689 * @parent: parent resource descriptor
@@ -693,6 +696,7 @@ struct resource * __request_region(struct resource *parent,
693 resource_size_t start, resource_size_t n, 696 resource_size_t start, resource_size_t n,
694 const char *name, int flags) 697 const char *name, int flags)
695{ 698{
699 DECLARE_WAITQUEUE(wait, current);
696 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 700 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
697 701
698 if (!res) 702 if (!res)
@@ -717,7 +721,15 @@ struct resource * __request_region(struct resource *parent,
717 if (!(conflict->flags & IORESOURCE_BUSY)) 721 if (!(conflict->flags & IORESOURCE_BUSY))
718 continue; 722 continue;
719 } 723 }
720 724 if (conflict->flags & flags & IORESOURCE_MUXED) {
725 add_wait_queue(&muxed_resource_wait, &wait);
726 write_unlock(&resource_lock);
727 set_current_state(TASK_UNINTERRUPTIBLE);
728 schedule();
729 remove_wait_queue(&muxed_resource_wait, &wait);
730 write_lock(&resource_lock);
731 continue;
732 }
721 /* Uhhuh, that didn't work out.. */ 733 /* Uhhuh, that didn't work out.. */
722 kfree(res); 734 kfree(res);
723 res = NULL; 735 res = NULL;
@@ -791,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start,
791 break; 803 break;
792 *p = res->sibling; 804 *p = res->sibling;
793 write_unlock(&resource_lock); 805 write_unlock(&resource_lock);
806 if (res->flags & IORESOURCE_MUXED)
807 wake_up(&muxed_resource_wait);
794 kfree(res); 808 kfree(res);
795 return; 809 return;
796 } 810 }
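The IORESOURCE_MUXED hunks above make __request_region() sleep on muxed_resource_wait when it collides with a busy region that is also muxed, and __release_region() wakes the sleepers so they can retry the claim. The same release-and-retry pattern, reduced to a single busy flag and written as a userspace analogue with a pthread mutex and condition variable rather than the kernel wait-queue API:

#include <pthread.h>
#include <stdbool.h>

/* Toy stand-in for one muxed region: a busy flag guarded by a lock,
 * with a condition variable playing the role of muxed_resource_wait. */
static pthread_mutex_t region_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t region_released = PTHREAD_COND_INITIALIZER;
static bool region_busy;

/* Analogue of __request_region() hitting a muxed conflict: sleep until
 * the current owner releases, then claim the region and return. */
void request_region_muxed(void)
{
	pthread_mutex_lock(&region_lock);
	while (region_busy)	/* the kernel loop's "continue" and re-check */
		pthread_cond_wait(&region_released, &region_lock);
	region_busy = true;
	pthread_mutex_unlock(&region_lock);
}

/* Analogue of __release_region() on a muxed region: free it and wake
 * every waiter so each can retry its claim. */
void release_region_muxed(void)
{
	pthread_mutex_lock(&region_lock);
	region_busy = false;
	pthread_cond_broadcast(&region_released);
	pthread_mutex_unlock(&region_lock);
}

int main(void)
{
	request_region_muxed();		/* uncontended: claims immediately */
	release_region_muxed();
	return 0;
}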
diff --git a/kernel/sched.c b/kernel/sched.c
index 5e3c509e0efe..6777dc7942a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#include <litmus/sched_trace.h> 82#include <litmus/sched_trace.h>
82#include <litmus/trace.h> 83#include <litmus/trace.h>
@@ -309,52 +310,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
309 */ 310 */
310struct task_group init_task_group; 311struct task_group init_task_group;
311 312
312/* return group to which a task belongs */
313static inline struct task_group *task_group(struct task_struct *p)
314{
315 struct task_group *tg;
316
317#ifdef CONFIG_CGROUP_SCHED
318 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
319 struct task_group, css);
320#else
321 tg = &init_task_group;
322#endif
323 return tg;
324}
325
326/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
327static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
328{
329 /*
330 * Strictly speaking this rcu_read_lock() is not needed since the
331 * task_group is tied to the cgroup, which in turn can never go away
332 * as long as there are tasks attached to it.
333 *
334 * However since task_group() uses task_subsys_state() which is an
335 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
336 */
337 rcu_read_lock();
338#ifdef CONFIG_FAIR_GROUP_SCHED
339 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
340 p->se.parent = task_group(p)->se[cpu];
341#endif
342
343#ifdef CONFIG_RT_GROUP_SCHED
344 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
345 p->rt.parent = task_group(p)->rt_se[cpu];
346#endif
347 rcu_read_unlock();
348}
349
350#else
351
352static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
353static inline struct task_group *task_group(struct task_struct *p)
354{
355 return NULL;
356}
357
358#endif /* CONFIG_CGROUP_SCHED */ 313#endif /* CONFIG_CGROUP_SCHED */
359 314
360/* CFS-related fields in a runqueue */ 315/* CFS-related fields in a runqueue */
@@ -511,9 +466,13 @@ struct rq {
511 unsigned long nr_running; 466 unsigned long nr_running;
512 #define CPU_LOAD_IDX_MAX 5 467 #define CPU_LOAD_IDX_MAX 5
513 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 468 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
469 unsigned long last_load_update_tick;
514#ifdef CONFIG_NO_HZ 470#ifdef CONFIG_NO_HZ
515 unsigned char in_nohz_recently; 471 u64 nohz_stamp;
472 unsigned char nohz_balance_kick;
516#endif 473#endif
474 unsigned int skip_clock_update;
475
517 /* capture load from *all* tasks on this cpu: */ 476 /* capture load from *all* tasks on this cpu: */
518 struct load_weight load; 477 struct load_weight load;
519 unsigned long nr_load_updates; 478 unsigned long nr_load_updates;
@@ -551,20 +510,20 @@ struct rq {
551 struct root_domain *rd; 510 struct root_domain *rd;
552 struct sched_domain *sd; 511 struct sched_domain *sd;
553 512
513 unsigned long cpu_power;
514
554 unsigned char idle_at_tick; 515 unsigned char idle_at_tick;
555 /* For active balancing */ 516 /* For active balancing */
556 int post_schedule; 517 int post_schedule;
557 int active_balance; 518 int active_balance;
558 int push_cpu; 519 int push_cpu;
520 struct cpu_stop_work active_balance_work;
559 /* cpu of this runqueue: */ 521 /* cpu of this runqueue: */
560 int cpu; 522 int cpu;
561 int online; 523 int online;
562 524
563 unsigned long avg_load_per_task; 525 unsigned long avg_load_per_task;
564 526
565 struct task_struct *migration_thread;
566 struct list_head migration_queue;
567
568 u64 rt_avg; 527 u64 rt_avg;
569 u64 age_stamp; 528 u64 age_stamp;
570 u64 idle_stamp; 529 u64 idle_stamp;
@@ -612,6 +571,13 @@ static inline
612void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 571void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
613{ 572{
614 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 573 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
574
575 /*
576 * A queue event has occurred, and we're going to schedule. In
577 * this case, we can save a useless back to back clock update.
578 */
579 if (test_tsk_need_resched(p))
580 rq->skip_clock_update = 1;
615} 581}
616 582
617static inline int cpu_of(struct rq *rq) 583static inline int cpu_of(struct rq *rq)
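The skip_clock_update flag added above is a small gate: once check_preempt_curr() knows the running task is already marked for rescheduling, the clock refresh that would normally follow is redundant, so update_rq_clock() skips it until schedule() clears the flag. A toy illustration of that gating; the clock source and its cost model here are invented for the example:

#include <stdio.h>

/* Toy runqueue clock with the skip_clock_update idea: when a reschedule
 * is already known to be imminent, mark the clock as good enough and
 * skip the refresh until the flag is cleared again. */
struct toy_rq {
	unsigned long long clock;
	int skip_clock_update;
};

static unsigned long long read_hw_clock(void)
{
	static unsigned long long t;
	return t += 100;		/* pretend each read is expensive */
}

static void update_rq_clock(struct toy_rq *rq)
{
	if (!rq->skip_clock_update)
		rq->clock = read_hw_clock();
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };

	update_rq_clock(&rq);		/* normal refresh */
	rq.skip_clock_update = 1;	/* a wakeup already forced a resched */
	update_rq_clock(&rq);		/* back-to-back call is skipped */
	printf("clock=%llu\n", rq.clock);	/* still the first reading */
	rq.skip_clock_update = 0;	/* schedule() re-enables updates */
	return 0;
}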
@@ -644,9 +610,53 @@ static inline int cpu_of(struct rq *rq)
644#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 610#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
645#define raw_rq() (&__raw_get_cpu_var(runqueues)) 611#define raw_rq() (&__raw_get_cpu_var(runqueues))
646 612
613#ifdef CONFIG_CGROUP_SCHED
614
615/*
 616 * Return the group to which this task belongs.
617 *
618 * We use task_subsys_state_check() and extend the RCU verification
619 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
620 * holds that lock for each task it moves into the cgroup. Therefore
621 * by holding that lock, we pin the task to the current cgroup.
622 */
623static inline struct task_group *task_group(struct task_struct *p)
624{
625 struct cgroup_subsys_state *css;
626
627 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
628 lockdep_is_held(&task_rq(p)->lock));
629 return container_of(css, struct task_group, css);
630}
631
632/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
633static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
634{
635#ifdef CONFIG_FAIR_GROUP_SCHED
636 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
637 p->se.parent = task_group(p)->se[cpu];
638#endif
639
640#ifdef CONFIG_RT_GROUP_SCHED
641 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
642 p->rt.parent = task_group(p)->rt_se[cpu];
643#endif
644}
645
646#else /* CONFIG_CGROUP_SCHED */
647
648static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
649static inline struct task_group *task_group(struct task_struct *p)
650{
651 return NULL;
652}
653
654#endif /* CONFIG_CGROUP_SCHED */
655
647inline void update_rq_clock(struct rq *rq) 656inline void update_rq_clock(struct rq *rq)
648{ 657{
649 rq->clock = sched_clock_cpu(cpu_of(rq)); 658 if (!rq->skip_clock_update)
659 rq->clock = sched_clock_cpu(cpu_of(rq));
650} 660}
651 661
652/* 662/*
@@ -924,16 +934,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
924#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 934#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
925 935
926/* 936/*
927 * Check whether the task is waking, we use this to synchronize against 937 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
928 * ttwu() so that task_cpu() reports a stable number. 938 * against ttwu().
929 *
930 * We need to make an exception for PF_STARTING tasks because the fork
931 * path might require task_rq_lock() to work, eg. it can call
932 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
933 */ 939 */
934static inline int task_is_waking(struct task_struct *p) 940static inline int task_is_waking(struct task_struct *p)
935{ 941{
936 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 942 return unlikely(p->state == TASK_WAKING);
937} 943}
938 944
939/* 945/*
@@ -946,11 +952,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
946 struct rq *rq; 952 struct rq *rq;
947 953
948 for (;;) { 954 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
951 rq = task_rq(p); 955 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 956 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p) && !task_is_waking(p))) 957 if (likely(rq == task_rq(p)))
954 return rq; 958 return rq;
955 raw_spin_unlock(&rq->lock); 959 raw_spin_unlock(&rq->lock);
956 } 960 }
@@ -967,25 +971,15 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 971 struct rq *rq;
968 972
969 for (;;) { 973 for (;;) {
970 while (task_is_waking(p))
971 cpu_relax();
972 local_irq_save(*flags); 974 local_irq_save(*flags);
973 rq = task_rq(p); 975 rq = task_rq(p);
974 raw_spin_lock(&rq->lock); 976 raw_spin_lock(&rq->lock);
975 if (likely(rq == task_rq(p) && !task_is_waking(p))) 977 if (likely(rq == task_rq(p)))
976 return rq; 978 return rq;
977 raw_spin_unlock_irqrestore(&rq->lock, *flags); 979 raw_spin_unlock_irqrestore(&rq->lock, *flags);
978 } 980 }
979} 981}
980 982
981void task_rq_unlock_wait(struct task_struct *p)
982{
983 struct rq *rq = task_rq(p);
984
985 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
986 raw_spin_unlock_wait(&rq->lock);
987}
988
989static void __task_rq_unlock(struct rq *rq) 983static void __task_rq_unlock(struct rq *rq)
990 __releases(rq->lock) 984 __releases(rq->lock)
991{ 985{
@@ -1211,6 +1205,27 @@ static void resched_cpu(int cpu)
1211 1205
1212#ifdef CONFIG_NO_HZ 1206#ifdef CONFIG_NO_HZ
1213/* 1207/*
1208 * In the semi idle case, use the nearest busy cpu for migrating timers
1209 * from an idle cpu. This is good for power-savings.
1210 *
 1211 * We don't do a similar optimization for a completely idle system, as
1212 * selecting an idle cpu will add more delays to the timers than intended
1213 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1214 */
1215int get_nohz_timer_target(void)
1216{
1217 int cpu = smp_processor_id();
1218 int i;
1219 struct sched_domain *sd;
1220
1221 for_each_domain(cpu, sd) {
1222 for_each_cpu(i, sched_domain_span(sd))
1223 if (!idle_cpu(i))
1224 return i;
1225 }
1226 return cpu;
1227}
1228/*
1214 * When add_timer_on() enqueues a timer into the timer wheel of an 1229 * When add_timer_on() enqueues a timer into the timer wheel of an
1215 * idle CPU then this timer might expire before the next timer event 1230 * idle CPU then this timer might expire before the next timer event
1216 * which is scheduled to wake up that CPU. In case of a completely 1231 * which is scheduled to wake up that CPU. In case of a completely
@@ -1249,6 +1264,7 @@ void wake_up_idle_cpu(int cpu)
1249 if (!tsk_is_polling(rq->idle)) 1264 if (!tsk_is_polling(rq->idle))
1250 smp_send_reschedule(cpu); 1265 smp_send_reschedule(cpu);
1251} 1266}
1267
1252#endif /* CONFIG_NO_HZ */ 1268#endif /* CONFIG_NO_HZ */
1253 1269
1254static u64 sched_avg_period(void) 1270static u64 sched_avg_period(void)
@@ -1261,6 +1277,12 @@ static void sched_avg_update(struct rq *rq)
1261 s64 period = sched_avg_period(); 1277 s64 period = sched_avg_period();
1262 1278
1263 while ((s64)(rq->clock - rq->age_stamp) > period) { 1279 while ((s64)(rq->clock - rq->age_stamp) > period) {
1280 /*
1281 * Inline assembly required to prevent the compiler
1282 * optimising this loop into a divmod call.
1283 * See __iter_div_u64_rem() for another example of this.
1284 */
1285 asm("" : "+rm" (rq->age_stamp));
1264 rq->age_stamp += period; 1286 rq->age_stamp += period;
1265 rq->rt_avg /= 2; 1287 rq->rt_avg /= 2;
1266 } 1288 }
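The empty asm statement added to sched_avg_update() above uses the GNU extended-asm form asm("" : "+rm" (var)) purely as an optimization barrier: the compiler has to assume the variable is read and modified through an unknown operand, so it cannot collapse the catch-up loop into a divide/modulo. A standalone sketch of the same loop shape and barrier, with arbitrary clock and period values:

#include <stdint.h>
#include <stdio.h>

/* Same loop shape as sched_avg_update(): advance the stamp one period
 * at a time and halve the running average per period. The empty GNU
 * asm is what keeps the repeated additions from being optimised into
 * a single division. */
static uint64_t catch_up(uint64_t clock, uint64_t stamp, uint64_t period,
			 uint64_t *avg)
{
	while (clock - stamp > period) {
		asm("" : "+rm" (stamp));	/* optimization barrier */
		stamp += period;
		*avg /= 2;
	}
	return stamp;
}

int main(void)
{
	uint64_t avg = 1024;
	uint64_t stamp = catch_up(10000, 0, 1000, &avg);

	printf("stamp=%llu avg=%llu\n",
	       (unsigned long long)stamp, (unsigned long long)avg);
	return 0;
}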
@@ -1282,6 +1304,10 @@ static void resched_task(struct task_struct *p)
1282static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1304static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1283{ 1305{
1284} 1306}
1307
1308static void sched_avg_update(struct rq *rq)
1309{
1310}
1285#endif /* CONFIG_SMP */ 1311#endif /* CONFIG_SMP */
1286 1312
1287#if BITS_PER_LONG == 32 1313#if BITS_PER_LONG == 32
@@ -1505,24 +1531,9 @@ static unsigned long target_load(int cpu, int type)
1505 return max(rq->cpu_load[type-1], total); 1531 return max(rq->cpu_load[type-1], total);
1506} 1532}
1507 1533
1508static struct sched_group *group_of(int cpu)
1509{
1510 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1511
1512 if (!sd)
1513 return NULL;
1514
1515 return sd->groups;
1516}
1517
1518static unsigned long power_of(int cpu) 1534static unsigned long power_of(int cpu)
1519{ 1535{
1520 struct sched_group *group = group_of(cpu); 1536 return cpu_rq(cpu)->cpu_power;
1521
1522 if (!group)
1523 return SCHED_LOAD_SCALE;
1524
1525 return group->cpu_power;
1526} 1537}
1527 1538
1528static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1539static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1668,7 +1679,7 @@ static void update_shares(struct sched_domain *sd)
1668 if (root_task_group_empty()) 1679 if (root_task_group_empty())
1669 return; 1680 return;
1670 1681
1671 now = cpu_clock(raw_smp_processor_id()); 1682 now = local_clock();
1672 elapsed = now - sd->last_update; 1683 elapsed = now - sd->last_update;
1673 1684
1674 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1685 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1679,9 +1690,6 @@ static void update_shares(struct sched_domain *sd)
1679 1690
1680static void update_h_load(long cpu) 1691static void update_h_load(long cpu)
1681{ 1692{
1682 if (root_task_group_empty())
1683 return;
1684
1685 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1693 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1686} 1694}
1687 1695
@@ -1791,8 +1799,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1791 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1799 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1792 } 1800 }
1793 } 1801 }
1794 update_rq_clock(rq1);
1795 update_rq_clock(rq2);
1796} 1802}
1797 1803
1798/* 1804/*
@@ -1823,9 +1829,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1823} 1829}
1824#endif 1830#endif
1825 1831
1826static void calc_load_account_active(struct rq *this_rq); 1832static void calc_load_account_idle(struct rq *this_rq);
1827static void update_sysctl(void); 1833static void update_sysctl(void);
1828static int get_update_sysctl_factor(void); 1834static int get_update_sysctl_factor(void);
1835static void update_cpu_load(struct rq *this_rq);
1829 1836
1830static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1837static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1831{ 1838{
@@ -1862,8 +1869,8 @@ static void dec_nr_running(struct rq *rq)
1862static void set_load_weight(struct task_struct *p) 1869static void set_load_weight(struct task_struct *p)
1863{ 1870{
1864 if (task_has_rt_policy(p)) { 1871 if (task_has_rt_policy(p)) {
1865 p->se.load.weight = prio_to_weight[0] * 2; 1872 p->se.load.weight = 0;
1866 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1873 p->se.load.inv_weight = WMULT_CONST;
1867 return; 1874 return;
1868 } 1875 }
1869 1876
@@ -1880,62 +1887,43 @@ static void set_load_weight(struct task_struct *p)
1880 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1887 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1881} 1888}
1882 1889
1883static void update_avg(u64 *avg, u64 sample) 1890static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1884{
1885 s64 diff = sample - *avg;
1886 *avg += diff >> 3;
1887}
1888
1889static void
1890enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1891{ 1891{
1892 if (wakeup) 1892 update_rq_clock(rq);
1893 p->se.start_runtime = p->se.sum_exec_runtime;
1894
1895 sched_info_queued(p); 1893 sched_info_queued(p);
1896 p->sched_class->enqueue_task(rq, p, wakeup, head); 1894 p->sched_class->enqueue_task(rq, p, flags);
1897 p->se.on_rq = 1; 1895 p->se.on_rq = 1;
1898} 1896}
1899 1897
1900static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1898static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1901{ 1899{
1902 if (sleep) { 1900 update_rq_clock(rq);
1903 if (p->se.last_wakeup) {
1904 update_avg(&p->se.avg_overlap,
1905 p->se.sum_exec_runtime - p->se.last_wakeup);
1906 p->se.last_wakeup = 0;
1907 } else {
1908 update_avg(&p->se.avg_wakeup,
1909 sysctl_sched_wakeup_granularity);
1910 }
1911 }
1912
1913 sched_info_dequeued(p); 1901 sched_info_dequeued(p);
1914 p->sched_class->dequeue_task(rq, p, sleep); 1902 p->sched_class->dequeue_task(rq, p, flags);
1915 p->se.on_rq = 0; 1903 p->se.on_rq = 0;
1916} 1904}
1917 1905
1918/* 1906/*
1919 * activate_task - move a task to the runqueue. 1907 * activate_task - move a task to the runqueue.
1920 */ 1908 */
1921static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1909static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1922{ 1910{
1923 if (task_contributes_to_load(p)) 1911 if (task_contributes_to_load(p))
1924 rq->nr_uninterruptible--; 1912 rq->nr_uninterruptible--;
1925 1913
1926 enqueue_task(rq, p, wakeup, false); 1914 enqueue_task(rq, p, flags);
1927 inc_nr_running(rq); 1915 inc_nr_running(rq);
1928} 1916}
1929 1917
1930/* 1918/*
1931 * deactivate_task - remove a task from the runqueue. 1919 * deactivate_task - remove a task from the runqueue.
1932 */ 1920 */
1933static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1921static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1934{ 1922{
1935 if (task_contributes_to_load(p)) 1923 if (task_contributes_to_load(p))
1936 rq->nr_uninterruptible++; 1924 rq->nr_uninterruptible++;
1937 1925
1938 dequeue_task(rq, p, sleep); 1926 dequeue_task(rq, p, flags);
1939 dec_nr_running(rq); 1927 dec_nr_running(rq);
1940} 1928}
1941 1929
@@ -2065,21 +2053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2065 __set_task_cpu(p, new_cpu); 2053 __set_task_cpu(p, new_cpu);
2066} 2054}
2067 2055
2068struct migration_req { 2056struct migration_arg {
2069 struct list_head list;
2070
2071 struct task_struct *task; 2057 struct task_struct *task;
2072 int dest_cpu; 2058 int dest_cpu;
2073
2074 struct completion done;
2075}; 2059};
2076 2060
2061static int migration_cpu_stop(void *data);
2062
2077/* 2063/*
2078 * The task's runqueue lock must be held. 2064 * The task's runqueue lock must be held.
2079 * Returns true if you have to wait for migration thread. 2065 * Returns true if you have to wait for migration thread.
2080 */ 2066 */
2081static int 2067static bool migrate_task(struct task_struct *p, int dest_cpu)
2082migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2083{ 2068{
2084 struct rq *rq = task_rq(p); 2069 struct rq *rq = task_rq(p);
2085 2070
@@ -2087,58 +2072,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2087 * If the task is not on a runqueue (and not running), then 2072 * If the task is not on a runqueue (and not running), then
2088 * the next wake-up will properly place the task. 2073 * the next wake-up will properly place the task.
2089 */ 2074 */
2090 if (!p->se.on_rq && !task_running(rq, p)) 2075 return p->se.on_rq || task_running(rq, p);
2091 return 0;
2092
2093 init_completion(&req->done);
2094 req->task = p;
2095 req->dest_cpu = dest_cpu;
2096 list_add(&req->list, &rq->migration_queue);
2097
2098 return 1;
2099}
2100
2101/*
2102 * wait_task_context_switch - wait for a thread to complete at least one
2103 * context switch.
2104 *
2105 * @p must not be current.
2106 */
2107void wait_task_context_switch(struct task_struct *p)
2108{
2109 unsigned long nvcsw, nivcsw, flags;
2110 int running;
2111 struct rq *rq;
2112
2113 nvcsw = p->nvcsw;
2114 nivcsw = p->nivcsw;
2115 for (;;) {
2116 /*
2117 * The runqueue is assigned before the actual context
2118 * switch. We need to take the runqueue lock.
2119 *
2120 * We could check initially without the lock but it is
2121 * very likely that we need to take the lock in every
2122 * iteration.
2123 */
2124 rq = task_rq_lock(p, &flags);
2125 running = task_running(rq, p);
2126 task_rq_unlock(rq, &flags);
2127
2128 if (likely(!running))
2129 break;
2130 /*
2131 * The switch count is incremented before the actual
2132 * context switch. We thus wait for two switches to be
2133 * sure at least one completed.
2134 */
2135 if ((p->nvcsw - nvcsw) > 1)
2136 break;
2137 if ((p->nivcsw - nivcsw) > 1)
2138 break;
2139
2140 cpu_relax();
2141 }
2142} 2076}
2143 2077
2144/* 2078/*
@@ -2196,7 +2130,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2196 * just go back and repeat. 2130 * just go back and repeat.
2197 */ 2131 */
2198 rq = task_rq_lock(p, &flags); 2132 rq = task_rq_lock(p, &flags);
2199 trace_sched_wait_task(rq, p); 2133 trace_sched_wait_task(p);
2200 running = task_running(rq, p); 2134 running = task_running(rq, p);
2201 on_rq = p->se.on_rq; 2135 on_rq = p->se.on_rq;
2202 ncsw = 0; 2136 ncsw = 0;
@@ -2294,6 +2228,9 @@ void task_oncpu_function_call(struct task_struct *p,
2294} 2228}
2295 2229
2296#ifdef CONFIG_SMP 2230#ifdef CONFIG_SMP
2231/*
2232 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2233 */
2297static int select_fallback_rq(int cpu, struct task_struct *p) 2234static int select_fallback_rq(int cpu, struct task_struct *p)
2298{ 2235{
2299 int dest_cpu; 2236 int dest_cpu;
@@ -2310,12 +2247,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2310 return dest_cpu; 2247 return dest_cpu;
2311 2248
2312 /* No more Mr. Nice Guy. */ 2249 /* No more Mr. Nice Guy. */
2313 if (dest_cpu >= nr_cpu_ids) { 2250 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2314 rcu_read_lock(); 2251 dest_cpu = cpuset_cpus_allowed_fallback(p);
2315 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2316 rcu_read_unlock();
2317 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2318
2319 /* 2252 /*
2320 * Don't tell them about moving exiting tasks or 2253 * Don't tell them about moving exiting tasks or
2321 * kernel threads (both mm NULL), since they never 2254 * kernel threads (both mm NULL), since they never
@@ -2332,17 +2265,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2332} 2265}
2333 2266
2334/* 2267/*
2335 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2268 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2336 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2337 * by:
2338 *
2339 * exec: is unstable, retry loop
2340 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2341 */ 2269 */
2342static inline 2270static inline
2343int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2271int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2344{ 2272{
2345 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2273 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2346 2274
2347 /* 2275 /*
2348 * In order not to call set_task_cpu() on a blocking task we need 2276 * In order not to call set_task_cpu() on a blocking task we need
@@ -2360,13 +2288,63 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2360 2288
2361 return cpu; 2289 return cpu;
2362} 2290}
2291
2292static void update_avg(u64 *avg, u64 sample)
2293{
2294 s64 diff = sample - *avg;
2295 *avg += diff >> 3;
2296}
2363#endif 2297#endif
2364 2298
2365/*** 2299static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2300 bool is_sync, bool is_migrate, bool is_local,
2301 unsigned long en_flags)
2302{
2303 schedstat_inc(p, se.statistics.nr_wakeups);
2304 if (is_sync)
2305 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2306 if (is_migrate)
2307 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2308 if (is_local)
2309 schedstat_inc(p, se.statistics.nr_wakeups_local);
2310 else
2311 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2312
2313 activate_task(rq, p, en_flags);
2314}
2315
2316static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2317 int wake_flags, bool success)
2318{
2319 trace_sched_wakeup(p, success);
2320 check_preempt_curr(rq, p, wake_flags);
2321
2322 p->state = TASK_RUNNING;
2323#ifdef CONFIG_SMP
2324 if (p->sched_class->task_woken)
2325 p->sched_class->task_woken(rq, p);
2326
2327 if (unlikely(rq->idle_stamp)) {
2328 u64 delta = rq->clock - rq->idle_stamp;
2329 u64 max = 2*sysctl_sched_migration_cost;
2330
2331 if (delta > max)
2332 rq->avg_idle = max;
2333 else
2334 update_avg(&rq->avg_idle, delta);
2335 rq->idle_stamp = 0;
2336 }
2337#endif
2338 /* if a worker is waking up, notify workqueue */
2339 if ((p->flags & PF_WQ_WORKER) && success)
2340 wq_worker_waking_up(p, cpu_of(rq));
2341}
2342
2343/**
2366 * try_to_wake_up - wake up a thread 2344 * try_to_wake_up - wake up a thread
2367 * @p: the to-be-woken-up thread 2345 * @p: the thread to be awakened
2368 * @state: the mask of task states that can be woken 2346 * @state: the mask of task states that can be woken
2369 * @sync: do a synchronous wakeup? 2347 * @wake_flags: wake modifier flags (WF_*)
2370 * 2348 *
2371 * Put it on the run-queue if it's not already there. The "current" 2349 * Put it on the run-queue if it's not already there. The "current"
2372 * thread is always on the run-queue (except when the actual 2350 * thread is always on the run-queue (except when the actual
@@ -2374,26 +2352,24 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2374 * the simpler "current->state = TASK_RUNNING" to mark yourself 2352 * the simpler "current->state = TASK_RUNNING" to mark yourself
2375 * runnable without the overhead of this. 2353 * runnable without the overhead of this.
2376 * 2354 *
2377 * returns failure only if the task is already active. 2355 * Returns %true if @p was woken up, %false if it was already running
2356 * or @state didn't match @p's state.
2378 */ 2357 */
2379static int try_to_wake_up(struct task_struct *p, unsigned int state, 2358static int try_to_wake_up(struct task_struct *p, unsigned int state,
2380 int wake_flags) 2359 int wake_flags)
2381{ 2360{
2382 int cpu, orig_cpu, this_cpu, success = 0; 2361 int cpu, orig_cpu, this_cpu, success = 0;
2383 unsigned long flags; 2362 unsigned long flags;
2363 unsigned long en_flags = ENQUEUE_WAKEUP;
2384 struct rq *rq; 2364 struct rq *rq;
2385 2365
2386 if (is_realtime(p)) 2366 if (is_realtime(p))
2387 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); 2367 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
2388 2368
2389 if (!sched_feat(SYNC_WAKEUPS))
2390 wake_flags &= ~WF_SYNC;
2391
2392 this_cpu = get_cpu(); 2369 this_cpu = get_cpu();
2393 2370
2394 smp_wmb(); 2371 smp_wmb();
2395 rq = task_rq_lock(p, &flags); 2372 rq = task_rq_lock(p, &flags);
2396 update_rq_clock(rq);
2397 if (!(p->state & state)) 2373 if (!(p->state & state))
2398 goto out; 2374 goto out;
2399 2375
@@ -2413,28 +2389,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2413 * 2389 *
2414 * First fix up the nr_uninterruptible count: 2390 * First fix up the nr_uninterruptible count:
2415 */ 2391 */
2416 if (task_contributes_to_load(p)) 2392 if (task_contributes_to_load(p)) {
2417 rq->nr_uninterruptible--; 2393 if (likely(cpu_online(orig_cpu)))
2394 rq->nr_uninterruptible--;
2395 else
2396 this_rq()->nr_uninterruptible--;
2397 }
2418 p->state = TASK_WAKING; 2398 p->state = TASK_WAKING;
2419 2399
2420 if (p->sched_class->task_waking) 2400 if (p->sched_class->task_waking) {
2421 p->sched_class->task_waking(rq, p); 2401 p->sched_class->task_waking(rq, p);
2402 en_flags |= ENQUEUE_WAKING;
2403 }
2422 2404
2423 __task_rq_unlock(rq); 2405 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2424 2406 if (cpu != orig_cpu)
2425 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2426 if (cpu != orig_cpu) {
2427 /*
2428 * Since we migrate the task without holding any rq->lock,
2429 * we need to be careful with task_rq_lock(), since that
2430 * might end up locking an invalid rq.
2431 */
2432 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2433 } 2408 __task_rq_unlock(rq);
2434 2409
2435 rq = cpu_rq(cpu); 2410 rq = cpu_rq(cpu);
2436 raw_spin_lock(&rq->lock); 2411 raw_spin_lock(&rq->lock);
2437 update_rq_clock(rq);
2438 2412
2439 /* 2413 /*
2440 * We migrated the task without holding either rq->lock, however 2414 * We migrated the task without holding either rq->lock, however
@@ -2462,54 +2436,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2462 2436
2463out_activate: 2437out_activate:
2464#endif /* CONFIG_SMP */ 2438#endif /* CONFIG_SMP */
2465 schedstat_inc(p, se.nr_wakeups); 2439 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2466 if (wake_flags & WF_SYNC) 2440 cpu == this_cpu, en_flags);
2467 schedstat_inc(p, se.nr_wakeups_sync);
2468 if (orig_cpu != cpu)
2469 schedstat_inc(p, se.nr_wakeups_migrate);
2470 if (cpu == this_cpu)
2471 schedstat_inc(p, se.nr_wakeups_local);
2472 else
2473 schedstat_inc(p, se.nr_wakeups_remote);
2474 activate_task(rq, p, 1);
2475 success = 1; 2441 success = 1;
2476
2477 /*
2478 * Only attribute actual wakeups done by this task.
2479 */
2480 if (!in_interrupt()) {
2481 struct sched_entity *se = &current->se;
2482 u64 sample = se->sum_exec_runtime;
2483
2484 if (se->last_wakeup)
2485 sample -= se->last_wakeup;
2486 else
2487 sample -= se->start_runtime;
2488 update_avg(&se->avg_wakeup, sample);
2489
2490 se->last_wakeup = se->sum_exec_runtime;
2491 }
2492
2493out_running: 2442out_running:
2494 trace_sched_wakeup(rq, p, success); 2443 ttwu_post_activation(p, rq, wake_flags, success);
2495 check_preempt_curr(rq, p, wake_flags);
2496
2497 p->state = TASK_RUNNING;
2498#ifdef CONFIG_SMP
2499 if (p->sched_class->task_woken)
2500 p->sched_class->task_woken(rq, p);
2501
2502 if (unlikely(rq->idle_stamp)) {
2503 u64 delta = rq->clock - rq->idle_stamp;
2504 u64 max = 2*sysctl_sched_migration_cost;
2505
2506 if (delta > max)
2507 rq->avg_idle = max;
2508 else
2509 update_avg(&rq->avg_idle, delta);
2510 rq->idle_stamp = 0;
2511 }
2512#endif
2513out: 2444out:
2514 if (is_realtime(p)) 2445 if (is_realtime(p))
2515 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); 2446 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
@@ -2520,6 +2451,37 @@ out:
2520} 2451}
2521 2452
2522/** 2453/**
2454 * try_to_wake_up_local - try to wake up a local task with rq lock held
2455 * @p: the thread to be awakened
2456 *
 2457 * Put @p on the run-queue if it's not already there. The caller must
2458 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2459 * the current task. this_rq() stays locked over invocation.
2460 */
2461static void try_to_wake_up_local(struct task_struct *p)
2462{
2463 struct rq *rq = task_rq(p);
2464 bool success = false;
2465
2466 BUG_ON(rq != this_rq());
2467 BUG_ON(p == current);
2468 lockdep_assert_held(&rq->lock);
2469
2470 if (!(p->state & TASK_NORMAL))
2471 return;
2472
2473 if (!p->se.on_rq) {
2474 if (likely(!task_running(rq, p))) {
2475 schedstat_inc(rq, ttwu_count);
2476 schedstat_inc(rq, ttwu_local);
2477 }
2478 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2479 success = true;
2480 }
2481 ttwu_post_activation(p, rq, 0, success);
2482}
2483
2484/**
2523 * wake_up_process - Wake up a specific process 2485 * wake_up_process - Wake up a specific process
2524 * @p: The process to be woken up. 2486 * @p: The process to be woken up.
2525 * 2487 *
@@ -2553,42 +2515,9 @@ static void __sched_fork(struct task_struct *p)
2553 p->se.sum_exec_runtime = 0; 2515 p->se.sum_exec_runtime = 0;
2554 p->se.prev_sum_exec_runtime = 0; 2516 p->se.prev_sum_exec_runtime = 0;
2555 p->se.nr_migrations = 0; 2517 p->se.nr_migrations = 0;
2556 p->se.last_wakeup = 0;
2557 p->se.avg_overlap = 0;
2558 p->se.start_runtime = 0;
2559 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2560 2518
2561#ifdef CONFIG_SCHEDSTATS 2519#ifdef CONFIG_SCHEDSTATS
2562 p->se.wait_start = 0; 2520 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2563 p->se.wait_max = 0;
2564 p->se.wait_count = 0;
2565 p->se.wait_sum = 0;
2566
2567 p->se.sleep_start = 0;
2568 p->se.sleep_max = 0;
2569 p->se.sum_sleep_runtime = 0;
2570
2571 p->se.block_start = 0;
2572 p->se.block_max = 0;
2573 p->se.exec_max = 0;
2574 p->se.slice_max = 0;
2575
2576 p->se.nr_migrations_cold = 0;
2577 p->se.nr_failed_migrations_affine = 0;
2578 p->se.nr_failed_migrations_running = 0;
2579 p->se.nr_failed_migrations_hot = 0;
2580 p->se.nr_forced_migrations = 0;
2581
2582 p->se.nr_wakeups = 0;
2583 p->se.nr_wakeups_sync = 0;
2584 p->se.nr_wakeups_migrate = 0;
2585 p->se.nr_wakeups_local = 0;
2586 p->se.nr_wakeups_remote = 0;
2587 p->se.nr_wakeups_affine = 0;
2588 p->se.nr_wakeups_affine_attempts = 0;
2589 p->se.nr_wakeups_passive = 0;
2590 p->se.nr_wakeups_idle = 0;
2591
2592#endif 2521#endif
2593 2522
2594 INIT_LIST_HEAD(&p->rt.run_list); 2523 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2609,11 +2538,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2609 2538
2610 __sched_fork(p); 2539 __sched_fork(p);
2611 /* 2540 /*
2612 * We mark the process as waking here. This guarantees that 2541 * We mark the process as running here. This guarantees that
2613 * nobody will actually run it, and a signal or other external 2542 * nobody will actually run it, and a signal or other external
2614 * event cannot wake it up and insert it on the runqueue either. 2543 * event cannot wake it up and insert it on the runqueue either.
2615 */ 2544 */
2616 p->state = TASK_WAKING; 2545 p->state = TASK_RUNNING;
2617 2546
2618 /* 2547 /*
2619 * Revert to default priority/policy on fork if requested. 2548 * Revert to default priority/policy on fork if requested.
@@ -2648,7 +2577,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2648 if (p->sched_class->task_fork) 2577 if (p->sched_class->task_fork)
2649 p->sched_class->task_fork(p); 2578 p->sched_class->task_fork(p);
2650 2579
2580 /*
2581 * The child is not yet in the pid-hash so no cgroup attach races,
 2582 * and the cgroup is pinned to this child because cgroup_fork()
 2583 * is run before sched_fork().
2584 *
2585 * Silence PROVE_RCU.
2586 */
2587 rcu_read_lock();
2651 set_task_cpu(p, cpu); 2588 set_task_cpu(p, cpu);
2589 rcu_read_unlock();
2652 2590
2653#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2591#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2654 if (likely(sched_info_on())) 2592 if (likely(sched_info_on()))
@@ -2680,31 +2618,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2680 int cpu __maybe_unused = get_cpu(); 2618 int cpu __maybe_unused = get_cpu();
2681 2619
2682#ifdef CONFIG_SMP 2620#ifdef CONFIG_SMP
2621 rq = task_rq_lock(p, &flags);
2622 p->state = TASK_WAKING;
2623
2683 /* 2624 /*
2684 * Fork balancing, do it here and not earlier because: 2625 * Fork balancing, do it here and not earlier because:
2685 * - cpus_allowed can change in the fork path 2626 * - cpus_allowed can change in the fork path
2686 * - any previously selected cpu might disappear through hotplug 2627 * - any previously selected cpu might disappear through hotplug
2687 * 2628 *
2688 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2629 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2689 * ->cpus_allowed is stable, we have preemption disabled, meaning 2630 * without people poking at ->cpus_allowed.
2690 * cpu_online_mask is stable.
2691 */ 2631 */
2692 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2632 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2693 set_task_cpu(p, cpu); 2633 set_task_cpu(p, cpu);
2694#endif
2695 2634
2696 /*
2697 * Since the task is not on the rq and we still have TASK_WAKING set
2698 * nobody else will migrate this task.
2699 */
2700 rq = cpu_rq(cpu);
2701 raw_spin_lock_irqsave(&rq->lock, flags);
2702
2703 BUG_ON(p->state != TASK_WAKING);
2704 p->state = TASK_RUNNING; 2635 p->state = TASK_RUNNING;
2705 update_rq_clock(rq); 2636 task_rq_unlock(rq, &flags);
2637#endif
2638
2639 rq = task_rq_lock(p, &flags);
2706 activate_task(rq, p, 0); 2640 activate_task(rq, p, 0);
2707 trace_sched_wakeup_new(rq, p, 1); 2641 trace_sched_wakeup_new(p, 1);
2708 check_preempt_curr(rq, p, WF_FORK); 2642 check_preempt_curr(rq, p, WF_FORK);
2709#ifdef CONFIG_SMP 2643#ifdef CONFIG_SMP
2710 if (p->sched_class->task_woken) 2644 if (p->sched_class->task_woken)
@@ -2935,7 +2869,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2935 struct mm_struct *mm, *oldmm; 2869 struct mm_struct *mm, *oldmm;
2936 2870
2937 prepare_task_switch(rq, prev, next); 2871 prepare_task_switch(rq, prev, next);
2938 trace_sched_switch(rq, prev, next); 2872 trace_sched_switch(prev, next);
2939 mm = next->mm; 2873 mm = next->mm;
2940 oldmm = prev->active_mm; 2874 oldmm = prev->active_mm;
2941 /* 2875 /*
@@ -3033,9 +2967,9 @@ unsigned long nr_iowait(void)
3033 return sum; 2967 return sum;
3034} 2968}
3035 2969
3036unsigned long nr_iowait_cpu(void) 2970unsigned long nr_iowait_cpu(int cpu)
3037{ 2971{
3038 struct rq *this = this_rq(); 2972 struct rq *this = cpu_rq(cpu);
3039 return atomic_read(&this->nr_iowait); 2973 return atomic_read(&this->nr_iowait);
3040} 2974}
3041 2975
@@ -3052,6 +2986,61 @@ static unsigned long calc_load_update;
3052unsigned long avenrun[3]; 2986unsigned long avenrun[3];
3053EXPORT_SYMBOL(avenrun); 2987EXPORT_SYMBOL(avenrun);
3054 2988
2989static long calc_load_fold_active(struct rq *this_rq)
2990{
2991 long nr_active, delta = 0;
2992
2993 nr_active = this_rq->nr_running;
2994 nr_active += (long) this_rq->nr_uninterruptible;
2995
2996 if (nr_active != this_rq->calc_load_active) {
2997 delta = nr_active - this_rq->calc_load_active;
2998 this_rq->calc_load_active = nr_active;
2999 }
3000
3001 return delta;
3002}
3003
3004#ifdef CONFIG_NO_HZ
3005/*
3006 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3007 *
3008 * When making the ILB scale, we should try to pull this in as well.
3009 */
3010static atomic_long_t calc_load_tasks_idle;
3011
3012static void calc_load_account_idle(struct rq *this_rq)
3013{
3014 long delta;
3015
3016 delta = calc_load_fold_active(this_rq);
3017 if (delta)
3018 atomic_long_add(delta, &calc_load_tasks_idle);
3019}
3020
3021static long calc_load_fold_idle(void)
3022{
3023 long delta = 0;
3024
3025 /*
 3026 * It's got a race; we don't care...
3027 */
3028 if (atomic_long_read(&calc_load_tasks_idle))
3029 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3030
3031 return delta;
3032}
3033#else
3034static void calc_load_account_idle(struct rq *this_rq)
3035{
3036}
3037
3038static inline long calc_load_fold_idle(void)
3039{
3040 return 0;
3041}
3042#endif
3043
3055/** 3044/**
3056 * get_avenrun - get the load average array 3045 * get_avenrun - get the load average array
3057 * @loads: pointer to dest load array 3046 * @loads: pointer to dest load array
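The calc_load rework above separates the per-runqueue delta (calc_load_fold_active()) from the global accumulation: a CPU entering tickless idle parks its delta in calc_load_tasks_idle, and the next busy CPU folds that parked value into calc_load_tasks at its LOAD_FREQ update. A reduced sketch of that fold-on-next-update scheme using C11 atomics, with plain structs standing in for runqueues:

#include <stdatomic.h>
#include <stdio.h>

/* Stand-ins for calc_load_tasks and calc_load_tasks_idle. */
static atomic_long global_tasks;
static atomic_long idle_tasks;

/* Per-"runqueue" state: the current active count and the last value
 * that was accounted globally (calc_load_active in the kernel). */
struct rq_sketch {
	long nr_active;
	long accounted;
};

/* calc_load_fold_active(): report only the change since last time. */
static long fold_active(struct rq_sketch *rq)
{
	long delta = rq->nr_active - rq->accounted;

	rq->accounted = rq->nr_active;
	return delta;
}

/* CPU going idle: park its delta instead of touching the global sum. */
static void account_idle(struct rq_sketch *rq)
{
	long delta = fold_active(rq);

	if (delta)
		atomic_fetch_add(&idle_tasks, delta);
}

/* Periodic update on a busy CPU: fold its own delta plus any parked
 * idle deltas into the global count (calc_load_account_active()). */
static void account_active(struct rq_sketch *rq)
{
	long delta = fold_active(rq);

	delta += atomic_exchange(&idle_tasks, 0);	/* calc_load_fold_idle() */
	if (delta)
		atomic_fetch_add(&global_tasks, delta);
}

int main(void)
{
	struct rq_sketch cpu0 = { .nr_active = 3 };
	struct rq_sketch cpu1 = { .nr_active = 2 };

	account_idle(&cpu1);	/* cpu1 goes tickless with 2 tasks pending */
	account_active(&cpu0);	/* cpu0's tick folds both contributions */
	printf("global tasks: %ld\n", atomic_load(&global_tasks)); /* 5 */
	return 0;
}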
@@ -3098,40 +3087,121 @@ void calc_global_load(void)
3098} 3087}
3099 3088
3100/* 3089/*
3101 * Either called from update_cpu_load() or from a cpu going idle 3090 * Called from update_cpu_load() to periodically update this CPU's
3091 * active count.
3102 */ 3092 */
3103static void calc_load_account_active(struct rq *this_rq) 3093static void calc_load_account_active(struct rq *this_rq)
3104{ 3094{
3105 long nr_active, delta; 3095 long delta;
3106 3096
3107 nr_active = this_rq->nr_running; 3097 if (time_before(jiffies, this_rq->calc_load_update))
3108 nr_active += (long) this_rq->nr_uninterruptible; 3098 return;
3109 3099
3110 if (nr_active != this_rq->calc_load_active) { 3100 delta = calc_load_fold_active(this_rq);
3111 delta = nr_active - this_rq->calc_load_active; 3101 delta += calc_load_fold_idle();
3112 this_rq->calc_load_active = nr_active; 3102 if (delta)
3113 atomic_long_add(delta, &calc_load_tasks); 3103 atomic_long_add(delta, &calc_load_tasks);
3104
3105 this_rq->calc_load_update += LOAD_FREQ;
3106}
3107
3108/*
3109 * The exact cpuload at various idx values, calculated at every tick would be
3110 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3111 *
3112 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3113 * on nth tick when cpu may be busy, then we have:
3114 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 3115 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3116 *
3117 * decay_load_missed() below does efficient calculation of
3118 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3119 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3120 *
3121 * The calculation is approximated on a 128 point scale.
3122 * degrade_zero_ticks is the number of ticks after which load at any
3123 * particular idx is approximated to be zero.
3124 * degrade_factor is a precomputed table, a row for each load idx.
3125 * Each column corresponds to degradation factor for a power of two ticks,
3126 * based on 128 point scale.
3127 * Example:
3128 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3129 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3130 *
3131 * With this power of 2 load factors, we can degrade the load n times
3132 * by looking at 1 bits in n and doing as many mult/shift instead of
3133 * n mult/shifts needed by the exact degradation.
3134 */
3135#define DEGRADE_SHIFT 7
3136static const unsigned char
3137 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3138static const unsigned char
3139 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3140 {0, 0, 0, 0, 0, 0, 0, 0},
3141 {64, 32, 8, 0, 0, 0, 0, 0},
3142 {96, 72, 40, 12, 1, 0, 0},
3143 {112, 98, 75, 43, 15, 1, 0},
3144 {120, 112, 98, 76, 45, 16, 2} };
3145
3146/*
3147 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3148 * would be when CPU is idle and so we just decay the old load without
3149 * adding any new load.
3150 */
3151static unsigned long
3152decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3153{
3154 int j = 0;
3155
3156 if (!missed_updates)
3157 return load;
3158
3159 if (missed_updates >= degrade_zero_ticks[idx])
3160 return 0;
3161
3162 if (idx == 1)
3163 return load >> missed_updates;
3164
3165 while (missed_updates) {
3166 if (missed_updates % 2)
3167 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3168
3169 missed_updates >>= 1;
3170 j++;
3114 } 3171 }
3172 return load;
3115} 3173}
3116 3174
3117/* 3175/*
3118 * Update rq->cpu_load[] statistics. This function is usually called every 3176 * Update rq->cpu_load[] statistics. This function is usually called every
3119 * scheduler tick (TICK_NSEC). 3177 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3178 * every tick. We fix it up based on jiffies.
3120 */ 3179 */
3121static void update_cpu_load(struct rq *this_rq) 3180static void update_cpu_load(struct rq *this_rq)
3122{ 3181{
3123 unsigned long this_load = this_rq->load.weight; 3182 unsigned long this_load = this_rq->load.weight;
3183 unsigned long curr_jiffies = jiffies;
3184 unsigned long pending_updates;
3124 int i, scale; 3185 int i, scale;
3125 3186
3126 this_rq->nr_load_updates++; 3187 this_rq->nr_load_updates++;
3127 3188
3189 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3190 if (curr_jiffies == this_rq->last_load_update_tick)
3191 return;
3192
3193 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3194 this_rq->last_load_update_tick = curr_jiffies;
3195
3128 /* Update our load: */ 3196 /* Update our load: */
3129 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3197 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3198 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3130 unsigned long old_load, new_load; 3199 unsigned long old_load, new_load;
3131 3200
3132 /* scale is effectively 1 << i now, and >> i divides by scale */ 3201 /* scale is effectively 1 << i now, and >> i divides by scale */
3133 3202
3134 old_load = this_rq->cpu_load[i]; 3203 old_load = this_rq->cpu_load[i];
3204 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3135 new_load = this_load; 3205 new_load = this_load;
3136 /* 3206 /*
3137 * Round up the averaging division if load is increasing. This 3207 * Round up the averaging division if load is increasing. This
@@ -3139,14 +3209,19 @@ static void update_cpu_load(struct rq *this_rq)
3139 * example. 3209 * example.
3140 */ 3210 */
3141 if (new_load > old_load) 3211 if (new_load > old_load)
3142 new_load += scale-1; 3212 new_load += scale - 1;
3143 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3144 }
3145 3213
3146 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3214 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3147 this_rq->calc_load_update += LOAD_FREQ;
3148 calc_load_account_active(this_rq);
3149 } 3215 }
3216
3217 sched_avg_update(this_rq);
3218}
3219
3220static void update_cpu_load_active(struct rq *this_rq)
3221{
3222 update_cpu_load(this_rq);
3223
3224 calc_load_account_active(this_rq);
3150} 3225}
3151 3226
3152#ifdef CONFIG_SMP 3227#ifdef CONFIG_SMP
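decay_load_missed() above applies ((2^idx - 1)/2^idx)^n without looping n times: it walks the set bits of the missed-tick count and multiplies by the matching 128-point factor from degrade_factor[]. The routine is self-contained enough to lift out; the brute-force comparison below reproduces the example from the comment (load index 2, 8 missed ticks, factor 12/128):

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5
#define DEGRADE_SHIFT 7

/* Same tables as the patch: factor rows are on a 128-point scale,
 * column j covering 2^j missed ticks (trailing zeros written out). */
static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
	{ 0, 8, 32, 64, 128 };
static const unsigned char degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{   0,   0,  0,  0,  0,  0, 0, 0 },
	{  64,  32,  8,  0,  0,  0, 0, 0 },
	{  96,  72, 40, 12,  1,  0, 0, 0 },
	{ 112,  98, 75, 43, 15,  1, 0, 0 },
	{ 120, 112, 98, 76, 45, 16, 2, 0 },
};

/* Per-set-bit decay, as in the patched decay_load_missed(). */
static unsigned long decay_load_missed(unsigned long load,
				       unsigned long missed, int idx)
{
	int j = 0;

	if (!missed)
		return load;
	if (missed >= degrade_zero_ticks[idx])
		return 0;
	if (idx == 1)
		return load >> missed;	/* factor is exactly 1/2 per tick */

	while (missed) {
		if (missed % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

/* Exact per-tick decay for comparison: load *= (2^idx - 1) / 2^idx. */
static double exact_decay(double load, unsigned long missed, int idx)
{
	while (missed--)
		load *= (double)((1 << idx) - 1) / (double)(1 << idx);
	return load;
}

int main(void)
{
	printf("table: %lu  exact: %.1f\n",
	       decay_load_missed(1280, 8, 2),	/* 1280 * 12/128 = 120 */
	       exact_decay(1280.0, 8, 2));	/* 1280 * (3/4)^8 ~ 128.1 */
	return 0;
}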
@@ -3158,44 +3233,27 @@ static void update_cpu_load(struct rq *this_rq)
3158void sched_exec(void) 3233void sched_exec(void)
3159{ 3234{
3160 struct task_struct *p = current; 3235 struct task_struct *p = current;
3161 struct migration_req req;
3162 int dest_cpu, this_cpu;
3163 unsigned long flags; 3236 unsigned long flags;
3164 struct rq *rq; 3237 struct rq *rq;
3165 3238 int dest_cpu;
3166again:
3167 this_cpu = get_cpu();
3168 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3169 if (dest_cpu == this_cpu) {
3170 put_cpu();
3171 return;
3172 }
3173 3239
3174 rq = task_rq_lock(p, &flags); 3240 rq = task_rq_lock(p, &flags);
3175 put_cpu(); 3241 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3242 if (dest_cpu == smp_processor_id())
3243 goto unlock;
3176 3244
3177 /* 3245 /*
3178 * select_task_rq() can race against ->cpus_allowed 3246 * select_task_rq() can race against ->cpus_allowed
3179 */ 3247 */
3180 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3248 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3181 || unlikely(!cpu_active(dest_cpu))) { 3249 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3182 task_rq_unlock(rq, &flags); 3250 struct migration_arg arg = { p, dest_cpu };
3183 goto again;
3184 }
3185
3186 /* force the process onto the specified CPU */
3187 if (migrate_task(p, dest_cpu, &req)) {
3188 /* Need to wait for migration thread (might exit: take ref). */
3189 struct task_struct *mt = rq->migration_thread;
3190 3251
3191 get_task_struct(mt);
3192 task_rq_unlock(rq, &flags); 3252 task_rq_unlock(rq, &flags);
3193 wake_up_process(mt); 3253 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3194 put_task_struct(mt);
3195 wait_for_completion(&req.done);
3196
3197 return; 3254 return;
3198 } 3255 }
3256unlock:
3199 task_rq_unlock(rq, &flags); 3257 task_rq_unlock(rq, &flags);
3200} 3258}
3201 3259
@@ -3482,9 +3540,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3482 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3540 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3483 3541
3484 if (total) { 3542 if (total) {
3485 u64 temp; 3543 u64 temp = rtime;
3486 3544
3487 temp = (u64)(rtime * utime); 3545 temp *= utime;
3488 do_div(temp, total); 3546 do_div(temp, total);
3489 utime = (cputime_t)temp; 3547 utime = (cputime_t)temp;
3490 } else 3548 } else
@@ -3515,9 +3573,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3515 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3573 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3516 3574
3517 if (total) { 3575 if (total) {
3518 u64 temp; 3576 u64 temp = rtime;
3519 3577
3520 temp = (u64)(rtime * cputime.utime); 3578 temp *= cputime.utime;
3521 do_div(temp, total); 3579 do_div(temp, total);
3522 utime = (cputime_t)temp; 3580 utime = (cputime_t)temp;
3523 } else 3581 } else
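The two hunks above widen the intermediate product before do_div(): with a 32-bit cputime_t, rtime * utime can wrap before the cast to u64, silently corrupting the scaled utime. A minimal userspace sketch of the difference, with made-up tick counts:

#include <stdint.h>
#include <stdio.h>

/* With a 32-bit cputime_t, (rtime * utime) wraps before the cast to u64.
 * Widening first, as "temp = rtime; temp *= utime;" now does, keeps the
 * product exact. The tick counts below are made up.
 */
int main(void)
{
	uint32_t rtime = 90000, utime = 60000, total = 100000;

	uint64_t truncated = (uint64_t)(rtime * utime) / total;	/* 32-bit multiply wraps */
	uint64_t widened   = ((uint64_t)rtime * utime) / total;	/* widen, then multiply */

	printf("truncated: %llu, widened: %llu\n",
	       (unsigned long long)truncated, (unsigned long long)widened);
	return 0;
}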
@@ -3551,7 +3609,7 @@ void scheduler_tick(void)
3551 3609
3552 raw_spin_lock(&rq->lock); 3610 raw_spin_lock(&rq->lock);
3553 update_rq_clock(rq); 3611 update_rq_clock(rq);
3554 update_cpu_load(rq); 3612 update_cpu_load_active(rq);
3555 curr->sched_class->task_tick(rq, curr, 0); 3613 curr->sched_class->task_tick(rq, curr, 0);
3556 3614
3557 /* litmus_tick may force current to resched */ 3615 /* litmus_tick may force current to resched */
@@ -3675,23 +3733,9 @@ static inline void schedule_debug(struct task_struct *prev)
3675 3733
3676static void put_prev_task(struct rq *rq, struct task_struct *prev) 3734static void put_prev_task(struct rq *rq, struct task_struct *prev)
3677{ 3735{
3678 if (prev->state == TASK_RUNNING) { 3736 if (prev->se.on_rq)
3679 u64 runtime = prev->se.sum_exec_runtime; 3737 update_rq_clock(rq);
3680 3738 rq->skip_clock_update = 0;
3681 runtime -= prev->se.prev_sum_exec_runtime;
3682 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3683
3684 /*
3685 * In order to avoid avg_overlap growing stale when we are
3686 * indeed overlapping and hence not getting put to sleep, grow
3687 * the avg_overlap on preemption.
3688 *
3689 * We use the average preemption runtime because that
3690 * correlates to the amount of cache footprint a task can
3691 * build up.
3692 */
3693 update_avg(&prev->se.avg_overlap, runtime);
3694 }
3695 prev->sched_class->put_prev_task(rq, prev); 3739 prev->sched_class->put_prev_task(rq, prev);
3696} 3740}
3697 3741
@@ -3749,9 +3793,8 @@ need_resched:
3749 preempt_disable(); 3793 preempt_disable();
3750 cpu = smp_processor_id(); 3794 cpu = smp_processor_id();
3751 rq = cpu_rq(cpu); 3795 rq = cpu_rq(cpu);
3752 rcu_sched_qs(cpu); 3796 rcu_note_context_switch(cpu);
3753 prev = rq->curr; 3797 prev = rq->curr;
3754 switch_count = &prev->nivcsw;
3755 3798
3756 release_kernel_lock(prev); 3799 release_kernel_lock(prev);
3757need_resched_nonpreemptible: 3800need_resched_nonpreemptible:
@@ -3764,14 +3807,28 @@ need_resched_nonpreemptible:
3764 hrtick_clear(rq); 3807 hrtick_clear(rq);
3765 3808
3766 raw_spin_lock_irq(&rq->lock); 3809 raw_spin_lock_irq(&rq->lock);
3767 update_rq_clock(rq);
3768 clear_tsk_need_resched(prev); 3810 clear_tsk_need_resched(prev);
3769 3811
3812 switch_count = &prev->nivcsw;
3770 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3813 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3771 if (unlikely(signal_pending_state(prev->state, prev))) 3814 if (unlikely(signal_pending_state(prev->state, prev))) {
3772 prev->state = TASK_RUNNING; 3815 prev->state = TASK_RUNNING;
3773 else 3816 } else {
3774 deactivate_task(rq, prev, 1); 3817 /*
3818 * If a worker is going to sleep, notify and
3819 * ask workqueue whether it wants to wake up a
3820 * task to maintain concurrency. If so, wake
3821 * up the task.
3822 */
3823 if (prev->flags & PF_WQ_WORKER) {
3824 struct task_struct *to_wakeup;
3825
3826 to_wakeup = wq_worker_sleeping(prev, cpu);
3827 if (to_wakeup)
3828 try_to_wake_up_local(to_wakeup);
3829 }
3830 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3831 }
3775 switch_count = &prev->nvcsw; 3832 switch_count = &prev->nvcsw;
3776 } 3833 }
3777 3834
@@ -3796,8 +3853,10 @@ need_resched_nonpreemptible:
3796 context_switch(rq, prev, next); /* unlocks the rq */ 3853 context_switch(rq, prev, next); /* unlocks the rq */
3797 TS_CXS_END(current); 3854 TS_CXS_END(current);
3798 /* 3855 /*
 3799 * the context switch might have flipped the stack from under 3856 * The context switch may have flipped the stack from under us
3800 * us, hence refresh the local variables. 3857 * and restored the local variables which were saved when
3858 * this task called schedule() in the past. prev == current
3859 * is still correct, but it can be moved to another cpu/rq.
3801 */ 3860 */
3802 cpu = smp_processor_id(); 3861 cpu = smp_processor_id();
3803 rq = cpu_rq(cpu); 3862 rq = cpu_rq(cpu);
@@ -3810,11 +3869,8 @@ need_resched_nonpreemptible:
3810 3869
3811 post_schedule(rq); 3870 post_schedule(rq);
3812 3871
3813 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3872 if (unlikely(reacquire_kernel_lock(prev)))
3814 prev = rq->curr;
3815 switch_count = &prev->nivcsw;
3816 goto need_resched_nonpreemptible; 3873 goto need_resched_nonpreemptible;
3817 }
3818 3874
3819 preempt_enable_no_resched(); 3875 preempt_enable_no_resched();
3820 if (need_resched()) 3876 if (need_resched())
@@ -3870,8 +3926,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3870 /* 3926 /*
3871 * Owner changed, break to re-assess state. 3927 * Owner changed, break to re-assess state.
3872 */ 3928 */
3873 if (lock->owner != owner) 3929 if (lock->owner != owner) {
3930 /*
3931 * If the lock has switched to a different owner,
3932 * we likely have heavy contention. Return 0 to quit
3933 * optimistic spinning and not contend further:
3934 */
3935 if (lock->owner)
3936 return 0;
3874 break; 3937 break;
3938 }
3875 3939
3876 /* 3940 /*
3877 * Is that owner really running on that cpu? 3941 * Is that owner really running on that cpu?
@@ -3892,7 +3956,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3892 * off of preempt_enable. Kernel preemptions off return from interrupt 3956 * off of preempt_enable. Kernel preemptions off return from interrupt
3893 * occur there and call schedule directly. 3957 * occur there and call schedule directly.
3894 */ 3958 */
3895asmlinkage void __sched preempt_schedule(void) 3959asmlinkage void __sched notrace preempt_schedule(void)
3896{ 3960{
3897 struct thread_info *ti = current_thread_info(); 3961 struct thread_info *ti = current_thread_info();
3898 3962
@@ -3904,9 +3968,9 @@ asmlinkage void __sched preempt_schedule(void)
3904 return; 3968 return;
3905 3969
3906 do { 3970 do {
3907 add_preempt_count(PREEMPT_ACTIVE); 3971 add_preempt_count_notrace(PREEMPT_ACTIVE);
3908 schedule(); 3972 schedule();
3909 sub_preempt_count(PREEMPT_ACTIVE); 3973 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3910 3974
3911 /* 3975 /*
3912 * Check again in case we missed a preemption opportunity 3976 * Check again in case we missed a preemption opportunity
@@ -4005,6 +4069,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4005{ 4069{
4006 __wake_up_common(q, mode, 1, 0, NULL); 4070 __wake_up_common(q, mode, 1, 0, NULL);
4007} 4071}
4072EXPORT_SYMBOL_GPL(__wake_up_locked);
4008 4073
4009void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 4074void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4010{ 4075{
@@ -4115,8 +4180,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4115 if (!x->done) { 4180 if (!x->done) {
4116 DECLARE_WAITQUEUE(wait, current); 4181 DECLARE_WAITQUEUE(wait, current);
4117 4182
4118 wait.flags |= WQ_FLAG_EXCLUSIVE; 4183 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4119 __add_wait_queue_tail(&x->wait, &wait);
4120 do { 4184 do {
4121 if (signal_pending_state(state, current)) { 4185 if (signal_pending_state(state, current)) {
4122 timeout = -ERESTARTSYS; 4186 timeout = -ERESTARTSYS;
@@ -4227,6 +4291,23 @@ int __sched wait_for_completion_killable(struct completion *x)
4227EXPORT_SYMBOL(wait_for_completion_killable); 4291EXPORT_SYMBOL(wait_for_completion_killable);
4228 4292
4229/** 4293/**
4294 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4295 * @x: holds the state of this particular completion
4296 * @timeout: timeout value in jiffies
4297 *
4298 * This waits for either a completion of a specific task to be
4299 * signaled or for a specified timeout to expire. It can be
4300 * interrupted by a kill signal. The timeout is in jiffies.
4301 */
4302unsigned long __sched
4303wait_for_completion_killable_timeout(struct completion *x,
4304 unsigned long timeout)
4305{
4306 return wait_for_common(x, timeout, TASK_KILLABLE);
4307}
4308EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4309
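A minimal, hypothetical caller of the helper added above might look like the sketch below (wait_for_my_work() and done are invented). It assumes the usual timeout-helper return convention implemented by wait_for_common(): 0 on timeout, a negative error if a fatal signal arrived, otherwise the jiffies remaining.

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static int wait_for_my_work(struct completion *done)
{
	long left;

	/* Wait up to one second; a fatal signal aborts the wait early. */
	left = wait_for_completion_killable_timeout(done, HZ);
	if (left < 0)
		return left;		/* killed while waiting */
	if (left == 0)
		return -ETIMEDOUT;	/* timeout expired before completion */
	return 0;			/* completed, 'left' jiffies remained */
}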
4310/**
4230 * try_wait_for_completion - try to decrement a completion without blocking 4311 * try_wait_for_completion - try to decrement a completion without blocking
4231 * @x: completion structure 4312 * @x: completion structure
4232 * 4313 *
@@ -4342,7 +4423,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4342 BUG_ON(prio < 0 || prio > MAX_PRIO); 4423 BUG_ON(prio < 0 || prio > MAX_PRIO);
4343 4424
4344 rq = task_rq_lock(p, &flags); 4425 rq = task_rq_lock(p, &flags);
4345 update_rq_clock(rq);
4346 4426
4347 oldprio = p->prio; 4427 oldprio = p->prio;
4348 prev_class = p->sched_class; 4428 prev_class = p->sched_class;
@@ -4363,7 +4443,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4363 if (running) 4443 if (running)
4364 p->sched_class->set_curr_task(rq); 4444 p->sched_class->set_curr_task(rq);
4365 if (on_rq) { 4445 if (on_rq) {
4366 enqueue_task(rq, p, 0, oldprio < prio); 4446 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4367 4447
4368 check_class_changed(rq, p, prev_class, oldprio, running); 4448 check_class_changed(rq, p, prev_class, oldprio, running);
4369 } 4449 }
@@ -4385,7 +4465,6 @@ void set_user_nice(struct task_struct *p, long nice)
4385 * the task might be in the middle of scheduling on another CPU. 4465 * the task might be in the middle of scheduling on another CPU.
4386 */ 4466 */
4387 rq = task_rq_lock(p, &flags); 4467 rq = task_rq_lock(p, &flags);
4388 update_rq_clock(rq);
4389 /* 4468 /*
4390 * The RT priorities are set via sched_setscheduler(), but we still 4469 * The RT priorities are set via sched_setscheduler(), but we still
4391 * allow the 'normal' nice value to be set - but as expected 4470 * allow the 'normal' nice value to be set - but as expected
@@ -4407,7 +4486,7 @@ void set_user_nice(struct task_struct *p, long nice)
4407 delta = p->prio - old_prio; 4486 delta = p->prio - old_prio;
4408 4487
4409 if (on_rq) { 4488 if (on_rq) {
4410 enqueue_task(rq, p, 0, false); 4489 enqueue_task(rq, p, 0);
4411 /* 4490 /*
4412 * If the task increased its priority or is running and 4491 * If the task increased its priority or is running and
4413 * lowered its priority, then reschedule its CPU: 4492 * lowered its priority, then reschedule its CPU:
@@ -4607,12 +4686,8 @@ recheck:
4607 */ 4686 */
4608 if (user && !capable(CAP_SYS_NICE)) { 4687 if (user && !capable(CAP_SYS_NICE)) {
4609 if (rt_policy(policy)) { 4688 if (rt_policy(policy)) {
4610 unsigned long rlim_rtprio; 4689 unsigned long rlim_rtprio =
4611 4690 task_rlimit(p, RLIMIT_RTPRIO);
4612 if (!lock_task_sighand(p, &flags))
4613 return -ESRCH;
4614 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4615 unlock_task_sighand(p, &flags);
4616 4691
4617 /* can't set/change the rt policy */ 4692 /* can't set/change the rt policy */
4618 if (policy != p->policy && !rlim_rtprio) 4693 if (policy != p->policy && !rlim_rtprio)
@@ -4640,16 +4715,6 @@ recheck:
4640 } 4715 }
4641 4716
4642 if (user) { 4717 if (user) {
4643#ifdef CONFIG_RT_GROUP_SCHED
4644 /*
4645 * Do not allow realtime tasks into groups that have no runtime
4646 * assigned.
4647 */
4648 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4649 task_group(p)->rt_bandwidth.rt_runtime == 0)
4650 return -EPERM;
4651#endif
4652
4653 retval = security_task_setscheduler(p, policy, param); 4718 retval = security_task_setscheduler(p, policy, param);
4654 if (retval) 4719 if (retval)
4655 return retval; 4720 return retval;
@@ -4671,6 +4736,22 @@ recheck:
4671 * runqueue lock must be held. 4736 * runqueue lock must be held.
4672 */ 4737 */
4673 rq = __task_rq_lock(p); 4738 rq = __task_rq_lock(p);
4739
4740#ifdef CONFIG_RT_GROUP_SCHED
4741 if (user) {
4742 /*
4743 * Do not allow realtime tasks into groups that have no runtime
4744 * assigned.
4745 */
4746 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4747 task_group(p)->rt_bandwidth.rt_runtime == 0) {
4748 __task_rq_unlock(rq);
4749 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4750 return -EPERM;
4751 }
4752 }
4753#endif
4754
4674 /* recheck policy now with rq lock held */ 4755 /* recheck policy now with rq lock held */
4675 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4756 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4676 policy = oldpolicy = -1; 4757 policy = oldpolicy = -1;
@@ -4678,7 +4759,6 @@ recheck:
4678 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4759 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4679 goto recheck; 4760 goto recheck;
4680 } 4761 }
4681 update_rq_clock(rq);
4682 on_rq = p->se.on_rq; 4762 on_rq = p->se.on_rq;
4683 running = task_current(rq, p); 4763 running = task_current(rq, p);
4684 if (on_rq) 4764 if (on_rq)
@@ -5425,17 +5505,15 @@ static inline void sched_init_granularity(void)
5425/* 5505/*
5426 * This is how migration works: 5506 * This is how migration works:
5427 * 5507 *
5428 * 1) we queue a struct migration_req structure in the source CPU's 5508 * 1) we invoke migration_cpu_stop() on the target CPU using
5429 * runqueue and wake up that CPU's migration thread. 5509 * stop_one_cpu().
5430 * 2) we down() the locked semaphore => thread blocks. 5510 * 2) stopper starts to run (implicitly forcing the migrated thread
5431 * 3) migration thread wakes up (implicitly it forces the migrated 5511 * off the CPU)
5432 * thread off the CPU) 5512 * 3) it checks whether the migrated task is still in the wrong runqueue.
5433 * 4) it gets the migration request and checks whether the migrated 5513 * 4) if it's in the wrong runqueue then the migration thread removes
5434 * task is still in the wrong runqueue.
5435 * 5) if it's in the wrong runqueue then the migration thread removes
5436 * it and puts it into the right queue. 5514 * it and puts it into the right queue.
5437 * 6) migration thread up()s the semaphore. 5515 * 5) stopper completes and stop_one_cpu() returns and the migration
5438 * 7) we wake up and the migration is done. 5516 * is done.
5439 */ 5517 */
5440 5518
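The cpu_stop machinery this comment refers to is generic, not scheduler-private. A hedged sketch of the same pattern with invented names: the callback runs in the stopper thread of the chosen CPU, which is exactly how migration_cpu_stop() below gets the task off its old runqueue.

#include <linux/stop_machine.h>
#include <linux/smp.h>
#include <linux/kernel.h>

static int say_hello_cpu_stop(void *arg)
{
	/* Runs in the stopper thread of the target CPU, with whatever was
	 * running there preempted off, just as migration_cpu_stop() is. */
	pr_info("stopper running on cpu %d\n", raw_smp_processor_id());
	return 0;
}

static void run_on_cpu_example(unsigned int cpu)
{
	/* Sleeps until the callback has finished on 'cpu'; must not be
	 * called with rq->lock (or any other spinlock) held. */
	stop_one_cpu(cpu, say_hello_cpu_stop, NULL);
}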
5441/* 5519/*
@@ -5449,12 +5527,23 @@ static inline void sched_init_granularity(void)
5449 */ 5527 */
5450int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5528int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5451{ 5529{
5452 struct migration_req req;
5453 unsigned long flags; 5530 unsigned long flags;
5454 struct rq *rq; 5531 struct rq *rq;
5532 unsigned int dest_cpu;
5455 int ret = 0; 5533 int ret = 0;
5456 5534
5535 /*
5536 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5537 * drop the rq->lock and still rely on ->cpus_allowed.
5538 */
5539again:
5540 while (task_is_waking(p))
5541 cpu_relax();
5457 rq = task_rq_lock(p, &flags); 5542 rq = task_rq_lock(p, &flags);
5543 if (task_is_waking(p)) {
5544 task_rq_unlock(rq, &flags);
5545 goto again;
5546 }
5458 5547
5459 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5548 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5460 ret = -EINVAL; 5549 ret = -EINVAL;
@@ -5478,15 +5567,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5478 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5567 if (cpumask_test_cpu(task_cpu(p), new_mask))
5479 goto out; 5568 goto out;
5480 5569
5481 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5570 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5571 if (migrate_task(p, dest_cpu)) {
5572 struct migration_arg arg = { p, dest_cpu };
5482 /* Need help from migration thread: drop lock and wait. */ 5573 /* Need help from migration thread: drop lock and wait. */
5483 struct task_struct *mt = rq->migration_thread;
5484
5485 get_task_struct(mt);
5486 task_rq_unlock(rq, &flags); 5574 task_rq_unlock(rq, &flags);
5487 wake_up_process(mt); 5575 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5488 put_task_struct(mt);
5489 wait_for_completion(&req.done);
5490 tlb_migrate_finish(p->mm); 5576 tlb_migrate_finish(p->mm);
5491 return 0; 5577 return 0;
5492 } 5578 }
@@ -5544,98 +5630,49 @@ fail:
5544 return ret; 5630 return ret;
5545} 5631}
5546 5632
5547#define RCU_MIGRATION_IDLE 0
5548#define RCU_MIGRATION_NEED_QS 1
5549#define RCU_MIGRATION_GOT_QS 2
5550#define RCU_MIGRATION_MUST_SYNC 3
5551
5552/* 5633/*
5553 * migration_thread - this is a highprio system thread that performs 5634 * migration_cpu_stop - this will be executed by a highprio stopper thread
5554 * thread migration by bumping thread off CPU then 'pushing' onto 5635 * and performs thread migration by bumping thread off CPU then
5555 * another runqueue. 5636 * 'pushing' onto another runqueue.
5556 */ 5637 */
5557static int migration_thread(void *data) 5638static int migration_cpu_stop(void *data)
5558{
5559 int badcpu;
5560 int cpu = (long)data;
5561 struct rq *rq;
5562
5563 rq = cpu_rq(cpu);
5564 BUG_ON(rq->migration_thread != current);
5565
5566 set_current_state(TASK_INTERRUPTIBLE);
5567 while (!kthread_should_stop()) {
5568 struct migration_req *req;
5569 struct list_head *head;
5570
5571 raw_spin_lock_irq(&rq->lock);
5572
5573 if (cpu_is_offline(cpu)) {
5574 raw_spin_unlock_irq(&rq->lock);
5575 break;
5576 }
5577
5578 if (rq->active_balance) {
5579 active_load_balance(rq, cpu);
5580 rq->active_balance = 0;
5581 }
5582
5583 head = &rq->migration_queue;
5584
5585 if (list_empty(head)) {
5586 raw_spin_unlock_irq(&rq->lock);
5587 schedule();
5588 set_current_state(TASK_INTERRUPTIBLE);
5589 continue;
5590 }
5591 req = list_entry(head->next, struct migration_req, list);
5592 list_del_init(head->next);
5593
5594 if (req->task != NULL) {
5595 raw_spin_unlock(&rq->lock);
5596 __migrate_task(req->task, cpu, req->dest_cpu);
5597 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5598 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5599 raw_spin_unlock(&rq->lock);
5600 } else {
5601 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5602 raw_spin_unlock(&rq->lock);
5603 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5604 }
5605 local_irq_enable();
5606
5607 complete(&req->done);
5608 }
5609 __set_current_state(TASK_RUNNING);
5610
5611 return 0;
5612}
5613
5614#ifdef CONFIG_HOTPLUG_CPU
5615
5616static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5617{ 5639{
5618 int ret; 5640 struct migration_arg *arg = data;
5619 5641
5642 /*
5643 * The original target cpu might have gone down and we might
5644 * be on another cpu but it doesn't matter.
5645 */
5620 local_irq_disable(); 5646 local_irq_disable();
5621 ret = __migrate_task(p, src_cpu, dest_cpu); 5647 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5622 local_irq_enable(); 5648 local_irq_enable();
5623 return ret; 5649 return 0;
5624} 5650}
5625 5651
5652#ifdef CONFIG_HOTPLUG_CPU
5626/* 5653/*
5627 * Figure out where task on dead CPU should go, use force if necessary. 5654 * Figure out where task on dead CPU should go, use force if necessary.
5628 */ 5655 */
5629static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5656void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5630{ 5657{
5631 int dest_cpu; 5658 struct rq *rq = cpu_rq(dead_cpu);
5659 int needs_cpu, uninitialized_var(dest_cpu);
5660 unsigned long flags;
5632 5661
5633again: 5662 local_irq_save(flags);
5634 dest_cpu = select_fallback_rq(dead_cpu, p);
5635 5663
5636 /* It can have affinity changed while we were choosing. */ 5664 raw_spin_lock(&rq->lock);
5637 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5665 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5638 goto again; 5666 if (needs_cpu)
5667 dest_cpu = select_fallback_rq(dead_cpu, p);
5668 raw_spin_unlock(&rq->lock);
5669 /*
5670 * It can only fail if we race with set_cpus_allowed(),
5671 * in the racer should migrate the task anyway.
5672 */
5673 if (needs_cpu)
5674 __migrate_task(p, dead_cpu, dest_cpu);
5675 local_irq_restore(flags);
5639} 5676}
5640 5677
5641/* 5678/*
@@ -5699,7 +5736,6 @@ void sched_idle_next(void)
5699 5736
5700 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5737 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5701 5738
5702 update_rq_clock(rq);
5703 activate_task(rq, p, 0); 5739 activate_task(rq, p, 0);
5704 5740
5705 raw_spin_unlock_irqrestore(&rq->lock, flags); 5741 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5754,7 +5790,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5754 for ( ; ; ) { 5790 for ( ; ; ) {
5755 if (!rq->nr_running) 5791 if (!rq->nr_running)
5756 break; 5792 break;
5757 update_rq_clock(rq);
5758 next = pick_next_task(rq); 5793 next = pick_next_task(rq);
5759 if (!next) 5794 if (!next)
5760 break; 5795 break;
@@ -5977,35 +6012,20 @@ static void set_rq_offline(struct rq *rq)
5977static int __cpuinit 6012static int __cpuinit
5978migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 6013migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5979{ 6014{
5980 struct task_struct *p;
5981 int cpu = (long)hcpu; 6015 int cpu = (long)hcpu;
5982 unsigned long flags; 6016 unsigned long flags;
5983 struct rq *rq; 6017 struct rq *rq = cpu_rq(cpu);
5984 6018
5985 switch (action) { 6019 switch (action) {
5986 6020
5987 case CPU_UP_PREPARE: 6021 case CPU_UP_PREPARE:
5988 case CPU_UP_PREPARE_FROZEN: 6022 case CPU_UP_PREPARE_FROZEN:
5989 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5990 if (IS_ERR(p))
5991 return NOTIFY_BAD;
5992 kthread_bind(p, cpu);
5993 /* Must be high prio: stop_machine expects to yield to it. */
5994 rq = task_rq_lock(p, &flags);
5995 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5996 task_rq_unlock(rq, &flags);
5997 get_task_struct(p);
5998 cpu_rq(cpu)->migration_thread = p;
5999 rq->calc_load_update = calc_load_update; 6023 rq->calc_load_update = calc_load_update;
6000 break; 6024 break;
6001 6025
6002 case CPU_ONLINE: 6026 case CPU_ONLINE:
6003 case CPU_ONLINE_FROZEN: 6027 case CPU_ONLINE_FROZEN:
6004 /* Strictly unnecessary, as first user will wake it. */
6005 wake_up_process(cpu_rq(cpu)->migration_thread);
6006
6007 /* Update our root-domain */ 6028 /* Update our root-domain */
6008 rq = cpu_rq(cpu);
6009 raw_spin_lock_irqsave(&rq->lock, flags); 6029 raw_spin_lock_irqsave(&rq->lock, flags);
6010 if (rq->rd) { 6030 if (rq->rd) {
6011 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6031 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6016,61 +6036,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6016 break; 6036 break;
6017 6037
6018#ifdef CONFIG_HOTPLUG_CPU 6038#ifdef CONFIG_HOTPLUG_CPU
6019 case CPU_UP_CANCELED:
6020 case CPU_UP_CANCELED_FROZEN:
6021 if (!cpu_rq(cpu)->migration_thread)
6022 break;
6023 /* Unbind it from offline cpu so it can run. Fall thru. */
6024 kthread_bind(cpu_rq(cpu)->migration_thread,
6025 cpumask_any(cpu_online_mask));
6026 kthread_stop(cpu_rq(cpu)->migration_thread);
6027 put_task_struct(cpu_rq(cpu)->migration_thread);
6028 cpu_rq(cpu)->migration_thread = NULL;
6029 break;
6030
6031 case CPU_DEAD: 6039 case CPU_DEAD:
6032 case CPU_DEAD_FROZEN: 6040 case CPU_DEAD_FROZEN:
6033 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
6034 migrate_live_tasks(cpu); 6041 migrate_live_tasks(cpu);
6035 rq = cpu_rq(cpu);
6036 kthread_stop(rq->migration_thread);
6037 put_task_struct(rq->migration_thread);
6038 rq->migration_thread = NULL;
6039 /* Idle task back to normal (off runqueue, low prio) */ 6042 /* Idle task back to normal (off runqueue, low prio) */
6040 raw_spin_lock_irq(&rq->lock); 6043 raw_spin_lock_irq(&rq->lock);
6041 update_rq_clock(rq);
6042 deactivate_task(rq, rq->idle, 0); 6044 deactivate_task(rq, rq->idle, 0);
6043 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 6045 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6044 rq->idle->sched_class = &idle_sched_class; 6046 rq->idle->sched_class = &idle_sched_class;
6045 migrate_dead_tasks(cpu); 6047 migrate_dead_tasks(cpu);
6046 raw_spin_unlock_irq(&rq->lock); 6048 raw_spin_unlock_irq(&rq->lock);
6047 cpuset_unlock();
6048 migrate_nr_uninterruptible(rq); 6049 migrate_nr_uninterruptible(rq);
6049 BUG_ON(rq->nr_running != 0); 6050 BUG_ON(rq->nr_running != 0);
6050 calc_global_load_remove(rq); 6051 calc_global_load_remove(rq);
6051 /*
6052 * No need to migrate the tasks: it was best-effort if
6053 * they didn't take sched_hotcpu_mutex. Just wake up
6054 * the requestors.
6055 */
6056 raw_spin_lock_irq(&rq->lock);
6057 while (!list_empty(&rq->migration_queue)) {
6058 struct migration_req *req;
6059
6060 req = list_entry(rq->migration_queue.next,
6061 struct migration_req, list);
6062 list_del_init(&req->list);
6063 raw_spin_unlock_irq(&rq->lock);
6064 complete(&req->done);
6065 raw_spin_lock_irq(&rq->lock);
6066 }
6067 raw_spin_unlock_irq(&rq->lock);
6068 break; 6052 break;
6069 6053
6070 case CPU_DYING: 6054 case CPU_DYING:
6071 case CPU_DYING_FROZEN: 6055 case CPU_DYING_FROZEN:
6072 /* Update our root-domain */ 6056 /* Update our root-domain */
6073 rq = cpu_rq(cpu);
6074 raw_spin_lock_irqsave(&rq->lock, flags); 6057 raw_spin_lock_irqsave(&rq->lock, flags);
6075 if (rq->rd) { 6058 if (rq->rd) {
6076 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6059 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6090,20 +6073,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6090 */ 6073 */
6091static struct notifier_block __cpuinitdata migration_notifier = { 6074static struct notifier_block __cpuinitdata migration_notifier = {
6092 .notifier_call = migration_call, 6075 .notifier_call = migration_call,
6093 .priority = 10 6076 .priority = CPU_PRI_MIGRATION,
6094}; 6077};
6095 6078
6079static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6080 unsigned long action, void *hcpu)
6081{
6082 switch (action & ~CPU_TASKS_FROZEN) {
6083 case CPU_ONLINE:
6084 case CPU_DOWN_FAILED:
6085 set_cpu_active((long)hcpu, true);
6086 return NOTIFY_OK;
6087 default:
6088 return NOTIFY_DONE;
6089 }
6090}
6091
6092static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6093 unsigned long action, void *hcpu)
6094{
6095 switch (action & ~CPU_TASKS_FROZEN) {
6096 case CPU_DOWN_PREPARE:
6097 set_cpu_active((long)hcpu, false);
6098 return NOTIFY_OK;
6099 default:
6100 return NOTIFY_DONE;
6101 }
6102}
6103
6096static int __init migration_init(void) 6104static int __init migration_init(void)
6097{ 6105{
6098 void *cpu = (void *)(long)smp_processor_id(); 6106 void *cpu = (void *)(long)smp_processor_id();
6099 int err; 6107 int err;
6100 6108
6101 /* Start one for the boot CPU: */ 6109 /* Initialize migration for the boot CPU */
6102 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6110 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6103 BUG_ON(err == NOTIFY_BAD); 6111 BUG_ON(err == NOTIFY_BAD);
6104 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6112 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6105 register_cpu_notifier(&migration_notifier); 6113 register_cpu_notifier(&migration_notifier);
6106 6114
6115 /* Register cpu active notifiers */
6116 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6117 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6118
6107 return 0; 6119 return 0;
6108} 6120}
6109early_initcall(migration_init); 6121early_initcall(migration_init);
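The sched_cpu_active()/sched_cpu_inactive() notifiers above follow a common shape: mask off CPU_TASKS_FROZEN and react only to the transitions of interest. A hypothetical minimal notifier in the same style (all names invented, default priority 0 rather than the CPU_PRI_* values the scheduler needs):

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int __cpuinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		pr_info("cpu %u is now online\n", cpu);
		return NOTIFY_OK;
	case CPU_DOWN_PREPARE:
		pr_info("cpu %u is about to go down\n", cpu);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int __init example_notifier_init(void)
{
	cpu_notifier(example_cpu_callback, 0);	/* 0 = default priority */
	return 0;
}
early_initcall(example_notifier_init);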
@@ -6338,23 +6350,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6338 free_rootdomain(old_rd); 6350 free_rootdomain(old_rd);
6339} 6351}
6340 6352
6341static int init_rootdomain(struct root_domain *rd, bool bootmem) 6353static int init_rootdomain(struct root_domain *rd)
6342{ 6354{
6343 gfp_t gfp = GFP_KERNEL;
6344
6345 memset(rd, 0, sizeof(*rd)); 6355 memset(rd, 0, sizeof(*rd));
6346 6356
6347 if (bootmem) 6357 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6348 gfp = GFP_NOWAIT;
6349
6350 if (!alloc_cpumask_var(&rd->span, gfp))
6351 goto out; 6358 goto out;
6352 if (!alloc_cpumask_var(&rd->online, gfp)) 6359 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6353 goto free_span; 6360 goto free_span;
6354 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6361 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6355 goto free_online; 6362 goto free_online;
6356 6363
6357 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6364 if (cpupri_init(&rd->cpupri) != 0)
6358 goto free_rto_mask; 6365 goto free_rto_mask;
6359 return 0; 6366 return 0;
6360 6367
@@ -6370,7 +6377,7 @@ out:
6370 6377
6371static void init_defrootdomain(void) 6378static void init_defrootdomain(void)
6372{ 6379{
6373 init_rootdomain(&def_root_domain, true); 6380 init_rootdomain(&def_root_domain);
6374 6381
6375 atomic_set(&def_root_domain.refcount, 1); 6382 atomic_set(&def_root_domain.refcount, 1);
6376} 6383}
@@ -6383,7 +6390,7 @@ static struct root_domain *alloc_rootdomain(void)
6383 if (!rd) 6390 if (!rd)
6384 return NULL; 6391 return NULL;
6385 6392
6386 if (init_rootdomain(rd, false) != 0) { 6393 if (init_rootdomain(rd) != 0) {
6387 kfree(rd); 6394 kfree(rd);
6388 return NULL; 6395 return NULL;
6389 } 6396 }
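Now that root-domain setup always runs in a context that may sleep, plain GFP_KERNEL cpumask allocations with goto-style unwinding suffice. A small hypothetical helper in the same style (struct two_masks is invented):

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/errno.h>

struct two_masks {
	cpumask_var_t a;
	cpumask_var_t b;
};

static int two_masks_init(struct two_masks *m)
{
	if (!zalloc_cpumask_var(&m->a, GFP_KERNEL))
		goto out;
	if (!zalloc_cpumask_var(&m->b, GFP_KERNEL))
		goto free_a;
	return 0;

free_a:
	free_cpumask_var(m->a);	/* unwind in reverse allocation order */
out:
	return -ENOMEM;
}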
@@ -6401,6 +6408,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6401 struct rq *rq = cpu_rq(cpu); 6408 struct rq *rq = cpu_rq(cpu);
6402 struct sched_domain *tmp; 6409 struct sched_domain *tmp;
6403 6410
6411 for (tmp = sd; tmp; tmp = tmp->parent)
6412 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6413
6404 /* Remove the sched domains which do not contribute to scheduling. */ 6414 /* Remove the sched domains which do not contribute to scheduling. */
6405 for (tmp = sd; tmp; ) { 6415 for (tmp = sd; tmp; ) {
6406 struct sched_domain *parent = tmp->parent; 6416 struct sched_domain *parent = tmp->parent;
@@ -7559,29 +7569,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7559} 7569}
7560#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7570#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7561 7571
7562#ifndef CONFIG_CPUSETS
7563/* 7572/*
7564 * Add online and remove offline CPUs from the scheduler domains. 7573 * Update cpusets according to cpu_active mask. If cpusets are
7565 * When cpusets are enabled they take over this function. 7574 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7575 * around partition_sched_domains().
7566 */ 7576 */
7567static int update_sched_domains(struct notifier_block *nfb, 7577static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7568 unsigned long action, void *hcpu) 7578 void *hcpu)
7569{ 7579{
7570 switch (action) { 7580 switch (action & ~CPU_TASKS_FROZEN) {
7571 case CPU_ONLINE: 7581 case CPU_ONLINE:
7572 case CPU_ONLINE_FROZEN:
7573 case CPU_DOWN_PREPARE:
7574 case CPU_DOWN_PREPARE_FROZEN:
7575 case CPU_DOWN_FAILED: 7582 case CPU_DOWN_FAILED:
7576 case CPU_DOWN_FAILED_FROZEN: 7583 cpuset_update_active_cpus();
7577 partition_sched_domains(1, NULL, NULL);
7578 return NOTIFY_OK; 7584 return NOTIFY_OK;
7585 default:
7586 return NOTIFY_DONE;
7587 }
7588}
7579 7589
7590static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7591 void *hcpu)
7592{
7593 switch (action & ~CPU_TASKS_FROZEN) {
7594 case CPU_DOWN_PREPARE:
7595 cpuset_update_active_cpus();
7596 return NOTIFY_OK;
7580 default: 7597 default:
7581 return NOTIFY_DONE; 7598 return NOTIFY_DONE;
7582 } 7599 }
7583} 7600}
7584#endif
7585 7601
7586static int update_runtime(struct notifier_block *nfb, 7602static int update_runtime(struct notifier_block *nfb,
7587 unsigned long action, void *hcpu) 7603 unsigned long action, void *hcpu)
@@ -7627,10 +7643,8 @@ void __init sched_init_smp(void)
7627 mutex_unlock(&sched_domains_mutex); 7643 mutex_unlock(&sched_domains_mutex);
7628 put_online_cpus(); 7644 put_online_cpus();
7629 7645
7630#ifndef CONFIG_CPUSETS 7646 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7631 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7647 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7632 hotcpu_notifier(update_sched_domains, 0);
7633#endif
7634 7648
7635 /* RT runtime code needs to handle some hotplug events */ 7649 /* RT runtime code needs to handle some hotplug events */
7636 hotcpu_notifier(update_runtime, 0); 7650 hotcpu_notifier(update_runtime, 0);
@@ -7875,20 +7889,26 @@ void __init sched_init(void)
7875 7889
7876 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7890 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7877 rq->cpu_load[j] = 0; 7891 rq->cpu_load[j] = 0;
7892
7893 rq->last_load_update_tick = jiffies;
7894
7878#ifdef CONFIG_SMP 7895#ifdef CONFIG_SMP
7879 rq->sd = NULL; 7896 rq->sd = NULL;
7880 rq->rd = NULL; 7897 rq->rd = NULL;
7898 rq->cpu_power = SCHED_LOAD_SCALE;
7881 rq->post_schedule = 0; 7899 rq->post_schedule = 0;
7882 rq->active_balance = 0; 7900 rq->active_balance = 0;
7883 rq->next_balance = jiffies; 7901 rq->next_balance = jiffies;
7884 rq->push_cpu = 0; 7902 rq->push_cpu = 0;
7885 rq->cpu = i; 7903 rq->cpu = i;
7886 rq->online = 0; 7904 rq->online = 0;
7887 rq->migration_thread = NULL;
7888 rq->idle_stamp = 0; 7905 rq->idle_stamp = 0;
7889 rq->avg_idle = 2*sysctl_sched_migration_cost; 7906 rq->avg_idle = 2*sysctl_sched_migration_cost;
7890 INIT_LIST_HEAD(&rq->migration_queue);
7891 rq_attach_root(rq, &def_root_domain); 7907 rq_attach_root(rq, &def_root_domain);
7908#ifdef CONFIG_NO_HZ
7909 rq->nohz_balance_kick = 0;
7910 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7911#endif
7892#endif 7912#endif
7893 init_rq_hrtick(rq); 7913 init_rq_hrtick(rq);
7894 atomic_set(&rq->nr_iowait, 0); 7914 atomic_set(&rq->nr_iowait, 0);
@@ -7933,8 +7953,11 @@ void __init sched_init(void)
7933 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7953 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7934#ifdef CONFIG_SMP 7954#ifdef CONFIG_SMP
7935#ifdef CONFIG_NO_HZ 7955#ifdef CONFIG_NO_HZ
7936 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7956 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7937 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7957 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
7958 atomic_set(&nohz.load_balancer, nr_cpu_ids);
7959 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
7960 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7938#endif 7961#endif
7939 /* May be allocated at isolcpus cmdline parse time */ 7962 /* May be allocated at isolcpus cmdline parse time */
7940 if (cpu_isolated_map == NULL) 7963 if (cpu_isolated_map == NULL)
@@ -7988,7 +8011,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7988{ 8011{
7989 int on_rq; 8012 int on_rq;
7990 8013
7991 update_rq_clock(rq);
7992 on_rq = p->se.on_rq; 8014 on_rq = p->se.on_rq;
7993 if (on_rq) 8015 if (on_rq)
7994 deactivate_task(rq, p, 0); 8016 deactivate_task(rq, p, 0);
@@ -8015,9 +8037,9 @@ void normalize_rt_tasks(void)
8015 8037
8016 p->se.exec_start = 0; 8038 p->se.exec_start = 0;
8017#ifdef CONFIG_SCHEDSTATS 8039#ifdef CONFIG_SCHEDSTATS
8018 p->se.wait_start = 0; 8040 p->se.statistics.wait_start = 0;
8019 p->se.sleep_start = 0; 8041 p->se.statistics.sleep_start = 0;
8020 p->se.block_start = 0; 8042 p->se.statistics.block_start = 0;
8021#endif 8043#endif
8022 8044
8023 if (!rt_task(p)) { 8045 if (!rt_task(p)) {
@@ -8044,9 +8066,9 @@ void normalize_rt_tasks(void)
8044 8066
8045#endif /* CONFIG_MAGIC_SYSRQ */ 8067#endif /* CONFIG_MAGIC_SYSRQ */
8046 8068
8047#ifdef CONFIG_IA64 8069#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8048/* 8070/*
8049 * These functions are only useful for the IA64 MCA handling. 8071 * These functions are only useful for the IA64 MCA handling, or kdb.
8050 * 8072 *
8051 * They can only be called when the whole system has been 8073 * They can only be called when the whole system has been
8052 * stopped - every CPU needs to be quiescent, and no scheduling 8074 * stopped - every CPU needs to be quiescent, and no scheduling
@@ -8066,6 +8088,9 @@ struct task_struct *curr_task(int cpu)
8066 return cpu_curr(cpu); 8088 return cpu_curr(cpu);
8067} 8089}
8068 8090
8091#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8092
8093#ifdef CONFIG_IA64
8069/** 8094/**
8070 * set_curr_task - set the current task for a given cpu. 8095 * set_curr_task - set the current task for a given cpu.
8071 * @cpu: the processor in question. 8096 * @cpu: the processor in question.
@@ -8350,8 +8375,6 @@ void sched_move_task(struct task_struct *tsk)
8350 8375
8351 rq = task_rq_lock(tsk, &flags); 8376 rq = task_rq_lock(tsk, &flags);
8352 8377
8353 update_rq_clock(rq);
8354
8355 running = task_current(rq, tsk); 8378 running = task_current(rq, tsk);
8356 on_rq = tsk->se.on_rq; 8379 on_rq = tsk->se.on_rq;
8357 8380
@@ -8370,7 +8393,7 @@ void sched_move_task(struct task_struct *tsk)
8370 if (unlikely(running)) 8393 if (unlikely(running))
8371 tsk->sched_class->set_curr_task(rq); 8394 tsk->sched_class->set_curr_task(rq);
8372 if (on_rq) 8395 if (on_rq)
8373 enqueue_task(rq, tsk, 0, false); 8396 enqueue_task(rq, tsk, 0);
8374 8397
8375 task_rq_unlock(rq, &flags); 8398 task_rq_unlock(rq, &flags);
8376} 8399}
@@ -9184,43 +9207,32 @@ struct cgroup_subsys cpuacct_subsys = {
9184 9207
9185#ifndef CONFIG_SMP 9208#ifndef CONFIG_SMP
9186 9209
9187int rcu_expedited_torture_stats(char *page)
9188{
9189 return 0;
9190}
9191EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9192
9193void synchronize_sched_expedited(void) 9210void synchronize_sched_expedited(void)
9194{ 9211{
9212 barrier();
9195} 9213}
9196EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9214EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9197 9215
9198#else /* #ifndef CONFIG_SMP */ 9216#else /* #ifndef CONFIG_SMP */
9199 9217
9200static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 9218static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9201static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9202
9203#define RCU_EXPEDITED_STATE_POST -2
9204#define RCU_EXPEDITED_STATE_IDLE -1
9205 9219
9206static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9220static int synchronize_sched_expedited_cpu_stop(void *data)
9207
9208int rcu_expedited_torture_stats(char *page)
9209{ 9221{
9210 int cnt = 0; 9222 /*
9211 int cpu; 9223 * There must be a full memory barrier on each affected CPU
9212 9224 * between the time that try_stop_cpus() is called and the
9213 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 9225 * time that it returns.
9214 for_each_online_cpu(cpu) { 9226 *
9215 cnt += sprintf(&page[cnt], " %d:%d", 9227 * In the current initial implementation of cpu_stop, the
9216 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 9228 * above condition is already met when the control reaches
9217 } 9229 * this point and the following smp_mb() is not strictly
9218 cnt += sprintf(&page[cnt], "\n"); 9230 * necessary. Do smp_mb() anyway for documentation and
9219 return cnt; 9231 * robustness against future implementation changes.
9232 */
9233 smp_mb(); /* See above comment block. */
9234 return 0;
9220} 9235}
9221EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9222
9223static long synchronize_sched_expedited_count;
9224 9236
9225/* 9237/*
9226 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 9238 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9234,18 +9246,14 @@ static long synchronize_sched_expedited_count;
9234 */ 9246 */
9235void synchronize_sched_expedited(void) 9247void synchronize_sched_expedited(void)
9236{ 9248{
9237 int cpu; 9249 int snap, trycount = 0;
9238 unsigned long flags;
9239 bool need_full_sync = 0;
9240 struct rq *rq;
9241 struct migration_req *req;
9242 long snap;
9243 int trycount = 0;
9244 9250
9245 smp_mb(); /* ensure prior mod happens before capturing snap. */ 9251 smp_mb(); /* ensure prior mod happens before capturing snap. */
9246 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 9252 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9247 get_online_cpus(); 9253 get_online_cpus();
9248 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 9254 while (try_stop_cpus(cpu_online_mask,
9255 synchronize_sched_expedited_cpu_stop,
9256 NULL) == -EAGAIN) {
9249 put_online_cpus(); 9257 put_online_cpus();
9250 if (trycount++ < 10) 9258 if (trycount++ < 10)
9251 udelay(trycount * num_online_cpus()); 9259 udelay(trycount * num_online_cpus());
@@ -9253,41 +9261,15 @@ void synchronize_sched_expedited(void)
9253 synchronize_sched(); 9261 synchronize_sched();
9254 return; 9262 return;
9255 } 9263 }
9256 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 9264 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9257 smp_mb(); /* ensure test happens before caller kfree */ 9265 smp_mb(); /* ensure test happens before caller kfree */
9258 return; 9266 return;
9259 } 9267 }
9260 get_online_cpus(); 9268 get_online_cpus();
9261 } 9269 }
9262 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 9270 atomic_inc(&synchronize_sched_expedited_count);
9263 for_each_online_cpu(cpu) { 9271 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9264 rq = cpu_rq(cpu);
9265 req = &per_cpu(rcu_migration_req, cpu);
9266 init_completion(&req->done);
9267 req->task = NULL;
9268 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9269 raw_spin_lock_irqsave(&rq->lock, flags);
9270 list_add(&req->list, &rq->migration_queue);
9271 raw_spin_unlock_irqrestore(&rq->lock, flags);
9272 wake_up_process(rq->migration_thread);
9273 }
9274 for_each_online_cpu(cpu) {
9275 rcu_expedited_state = cpu;
9276 req = &per_cpu(rcu_migration_req, cpu);
9277 rq = cpu_rq(cpu);
9278 wait_for_completion(&req->done);
9279 raw_spin_lock_irqsave(&rq->lock, flags);
9280 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9281 need_full_sync = 1;
9282 req->dest_cpu = RCU_MIGRATION_IDLE;
9283 raw_spin_unlock_irqrestore(&rq->lock, flags);
9284 }
9285 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9286 synchronize_sched_expedited_count++;
9287 mutex_unlock(&rcu_sched_expedited_mutex);
9288 put_online_cpus(); 9272 put_online_cpus();
9289 if (need_full_sync)
9290 synchronize_sched();
9291} 9273}
9292EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9274EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9293 9275
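A hedged sketch of how a caller might use the expedited primitive rewritten above: publish a new RCU-protected object and wait for a forced grace period before freeing the old one. The struct, pointer, and function names are invented; readers are assumed to use rcu_read_lock_sched().

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct conf {
	int value;
};

static struct conf *active_conf;	/* rcu-sched protected, hypothetical */

static void replace_conf(struct conf *new_conf)
{
	struct conf *old = active_conf;

	/* Publish the new version; readers bracket their accesses with
	 * rcu_read_lock_sched()/rcu_read_unlock_sched(). */
	rcu_assign_pointer(active_conf, new_conf);

	/* Forced grace period: returns once every online CPU has been
	 * kicked through a context switch by its stopper thread. */
	synchronize_sched_expedited();

	kfree(old);	/* no rcu-sched reader can still see the old copy */
}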
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
10 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 *
14 * - gtod 14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
 42 * - GTOD (clock monotonic)
15 * - sched_clock() 43 * - sched_clock()
16 * - explicit idle events 44 * - explicit idle events
17 * 45 *
18 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
19 * making it monotonic and keeping it within an expected window. 47 * deltas are filtered to provide monotonicity and keeping it within an
48 * expected window.
20 * 49 *
21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
22 * that is otherwise invisible (TSC gets stopped). 51 * that is otherwise invisible (TSC gets stopped).
23 * 52 *
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 53 *
25 * consistent between cpus (never more than 2 jiffies difference). 54 * Notes:
55 *
 56 * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
 59 * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
26 */ 62 */
27#include <linux/spinlock.h> 63#include <linux/spinlock.h>
28#include <linux/hardirq.h> 64#include <linux/hardirq.h>
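Going by the interface documented above, a typical in-kernel measurement with local_clock() could look like the hypothetical sketch below; both timestamps are taken on one CPU, matching the warning about cross-CPU comparisons, and the timed callback must not sleep.

#include <linux/sched.h>
#include <linux/kernel.h>

static void time_some_work(void (*work)(void))
{
	u64 start, delta;

	/* Both timestamps come from the same CPU, so the delta is meaningful;
	 * the callback must not sleep while preemption is disabled. */
	preempt_disable();
	start = local_clock();
	work();
	delta = local_clock() - start;
	preempt_enable();

	pr_info("work took %llu ns\n", (unsigned long long)delta);
}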
@@ -41,6 +77,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
41 return (unsigned long long)(jiffies - INITIAL_JIFFIES) 77 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ); 78 * (NSEC_PER_SEC / HZ);
43} 79}
80EXPORT_SYMBOL_GPL(sched_clock);
44 81
45static __read_mostly int sched_clock_running; 82static __read_mostly int sched_clock_running;
46 83
@@ -169,6 +206,11 @@ again:
169 return val; 206 return val;
170} 207}
171 208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
172u64 sched_clock_cpu(int cpu) 214u64 sched_clock_cpu(int cpu)
173{ 215{
174 struct sched_clock_data *scd; 216 struct sched_clock_data *scd;
@@ -236,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
236} 278}
237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
238 280
239unsigned long long cpu_clock(int cpu) 281/*
282 * As outlined at the top, provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
240{ 292{
241 unsigned long long clock; 293 u64 clock;
242 unsigned long flags; 294 unsigned long flags;
243 295
244 local_irq_save(flags); 296 local_irq_save(flags);
@@ -248,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
248 return clock; 300 return clock;
249} 301}
250 302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
 305 * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
321
251#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
252 323
253void sched_clock_init(void) 324void sched_clock_init(void)
@@ -263,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
263 return sched_clock(); 334 return sched_clock();
264} 335}
265 336
266 337u64 cpu_clock(int cpu)
267unsigned long long cpu_clock(int cpu)
268{ 338{
269 return sched_clock_cpu(cpu); 339 return sched_clock_cpu(cpu);
270} 340}
271 341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
272#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
273 348
274EXPORT_SYMBOL_GPL(cpu_clock); 349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
166 * 166 *
167 * Returns: -ENOMEM if memory fails. 167 * Returns: -ENOMEM if memory fails.
168 */ 168 */
169int cpupri_init(struct cpupri *cp, bool bootmem) 169int cpupri_init(struct cpupri *cp)
170{ 170{
171 gfp_t gfp = GFP_KERNEL;
172 int i; 171 int i;
173 172
174 if (bootmem)
175 gfp = GFP_NOWAIT;
176
177 memset(cp, 0, sizeof(*cp)); 173 memset(cp, 0, sizeof(*cp));
178 174
179 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
181 177
182 raw_spin_lock_init(&vec->lock); 178 raw_spin_lock_init(&vec->lock);
183 vec->count = 0; 179 vec->count = 0;
184 if (!zalloc_cpumask_var(&vec->mask, gfp)) 180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
185 goto cleanup; 181 goto cleanup;
186 } 182 }
187 183
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, struct cpumask *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
32#else 32#else
33#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 19be00ba6123..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -175,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
175 task_group_path(tg, path, sizeof(path)); 175 task_group_path(tg, path, sizeof(path));
176 176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
179 {
180 uid_t uid = cfs_rq->tg->uid;
181 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
182 }
183#else 178#else
184 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
185#endif 180#endif
@@ -337,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
337 PN(sysctl_sched_latency); 332 PN(sysctl_sched_latency);
338 PN(sysctl_sched_min_granularity); 333 PN(sysctl_sched_min_granularity);
339 PN(sysctl_sched_wakeup_granularity); 334 PN(sysctl_sched_wakeup_granularity);
340 PN(sysctl_sched_child_runs_first); 335 P(sysctl_sched_child_runs_first);
341 P(sysctl_sched_features); 336 P(sysctl_sched_features);
342#undef PN 337#undef PN
343#undef P 338#undef P
@@ -386,15 +381,9 @@ __initcall(init_sched_debug_procfs);
386void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 381void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
387{ 382{
388 unsigned long nr_switches; 383 unsigned long nr_switches;
389 unsigned long flags;
390 int num_threads = 1;
391
392 if (lock_task_sighand(p, &flags)) {
393 num_threads = atomic_read(&p->signal->count);
394 unlock_task_sighand(p, &flags);
395 }
396 384
397 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 385 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
386 get_nr_threads(p));
398 SEQ_printf(m, 387 SEQ_printf(m,
399 "---------------------------------------------------------\n"); 388 "---------------------------------------------------------\n");
400#define __P(F) \ 389#define __P(F) \
@@ -409,40 +398,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.exec_start); 398 PN(se.exec_start);
410 PN(se.vruntime); 399 PN(se.vruntime);
411 PN(se.sum_exec_runtime); 400 PN(se.sum_exec_runtime);
412 PN(se.avg_overlap);
413 PN(se.avg_wakeup);
414 401
415 nr_switches = p->nvcsw + p->nivcsw; 402 nr_switches = p->nvcsw + p->nivcsw;
416 403
417#ifdef CONFIG_SCHEDSTATS 404#ifdef CONFIG_SCHEDSTATS
418 PN(se.wait_start); 405 PN(se.statistics.wait_start);
419 PN(se.sleep_start); 406 PN(se.statistics.sleep_start);
420 PN(se.block_start); 407 PN(se.statistics.block_start);
421 PN(se.sleep_max); 408 PN(se.statistics.sleep_max);
422 PN(se.block_max); 409 PN(se.statistics.block_max);
423 PN(se.exec_max); 410 PN(se.statistics.exec_max);
424 PN(se.slice_max); 411 PN(se.statistics.slice_max);
425 PN(se.wait_max); 412 PN(se.statistics.wait_max);
426 PN(se.wait_sum); 413 PN(se.statistics.wait_sum);
427 P(se.wait_count); 414 P(se.statistics.wait_count);
428 PN(se.iowait_sum); 415 PN(se.statistics.iowait_sum);
429 P(se.iowait_count); 416 P(se.statistics.iowait_count);
430 P(sched_info.bkl_count); 417 P(sched_info.bkl_count);
431 P(se.nr_migrations); 418 P(se.nr_migrations);
432 P(se.nr_migrations_cold); 419 P(se.statistics.nr_migrations_cold);
433 P(se.nr_failed_migrations_affine); 420 P(se.statistics.nr_failed_migrations_affine);
434 P(se.nr_failed_migrations_running); 421 P(se.statistics.nr_failed_migrations_running);
435 P(se.nr_failed_migrations_hot); 422 P(se.statistics.nr_failed_migrations_hot);
436 P(se.nr_forced_migrations); 423 P(se.statistics.nr_forced_migrations);
437 P(se.nr_wakeups); 424 P(se.statistics.nr_wakeups);
438 P(se.nr_wakeups_sync); 425 P(se.statistics.nr_wakeups_sync);
439 P(se.nr_wakeups_migrate); 426 P(se.statistics.nr_wakeups_migrate);
440 P(se.nr_wakeups_local); 427 P(se.statistics.nr_wakeups_local);
441 P(se.nr_wakeups_remote); 428 P(se.statistics.nr_wakeups_remote);
442 P(se.nr_wakeups_affine); 429 P(se.statistics.nr_wakeups_affine);
443 P(se.nr_wakeups_affine_attempts); 430 P(se.statistics.nr_wakeups_affine_attempts);
444 P(se.nr_wakeups_passive); 431 P(se.statistics.nr_wakeups_passive);
445 P(se.nr_wakeups_idle); 432 P(se.statistics.nr_wakeups_idle);
446 433
447 { 434 {
448 u64 avg_atom, avg_per_cpu; 435 u64 avg_atom, avg_per_cpu;
@@ -493,31 +480,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
493void proc_sched_set_task(struct task_struct *p) 480void proc_sched_set_task(struct task_struct *p)
494{ 481{
495#ifdef CONFIG_SCHEDSTATS 482#ifdef CONFIG_SCHEDSTATS
496 p->se.wait_max = 0; 483 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
497 p->se.wait_sum = 0;
498 p->se.wait_count = 0;
499 p->se.iowait_sum = 0;
500 p->se.iowait_count = 0;
501 p->se.sleep_max = 0;
502 p->se.sum_sleep_runtime = 0;
503 p->se.block_max = 0;
504 p->se.exec_max = 0;
505 p->se.slice_max = 0;
506 p->se.nr_migrations = 0;
507 p->se.nr_migrations_cold = 0;
508 p->se.nr_failed_migrations_affine = 0;
509 p->se.nr_failed_migrations_running = 0;
510 p->se.nr_failed_migrations_hot = 0;
511 p->se.nr_forced_migrations = 0;
512 p->se.nr_wakeups = 0;
513 p->se.nr_wakeups_sync = 0;
514 p->se.nr_wakeups_migrate = 0;
515 p->se.nr_wakeups_local = 0;
516 p->se.nr_wakeups_remote = 0;
517 p->se.nr_wakeups_affine = 0;
518 p->se.nr_wakeups_affine_attempts = 0;
519 p->se.nr_wakeups_passive = 0;
520 p->se.nr_wakeups_idle = 0;
521 p->sched_info.bkl_count = 0;
522#endif 484#endif
523} 485}
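The sched_debug.c hunks above reflect the move of the per-entity schedstat fields into an embedded p->se.statistics struct, which is why proc_sched_set_task() shrinks to a single memset(). A minimal standalone C sketch of that pattern (struct and field names here are illustrative, not the kernel's actual layout):

/* Standalone sketch (not kernel code): once the per-entity schedstats live in
 * one embedded struct, resetting them is a single memset() instead of a long
 * list of individual zero assignments. */
#include <stdio.h>
#include <string.h>

struct sched_statistics_sketch {
    unsigned long long wait_max, wait_sum, sleep_max, block_max, exec_max;
    unsigned long nr_wakeups, nr_failed_migrations_affine;
};

struct sched_entity_sketch {
    unsigned long long vruntime, sum_exec_runtime;   /* never reset */
    struct sched_statistics_sketch statistics;       /* reset as one block */
};

int main(void)
{
    struct sched_entity_sketch se = { .vruntime = 42,
                                      .statistics = { .wait_max = 7 } };

    memset(&se.statistics, 0, sizeof(se.statistics));
    printf("wait_max=%llu vruntime=%llu\n",
           se.statistics.wait_max, se.vruntime);
    return 0;
}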
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b1af6d42c024..e0e8d5ca3c98 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 8;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
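The retuned defaults above keep the documented invariant that sched_nr_latency is sysctl_sched_latency / sysctl_sched_min_granularity; with the new initializers (6000000ULL and 750000ULL, i.e. 6 ms and 0.75 ms) that ratio is 8, matching the new value. A standalone arithmetic check:

/* Standalone arithmetic (not kernel code): the slice count kept in
 * sched_nr_latency follows directly from the two retuned tunables. */
#include <stdio.h>

int main(void)
{
    unsigned long long latency_ns = 6000000ULL;          /* new default */
    unsigned long long min_granularity_ns = 750000ULL;   /* new default */

    printf("sched_nr_latency = %llu\n", latency_ns / min_granularity_ns);
    return 0;
}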
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
777 * through calling update_curr(). 765 * through calling update_curr().

778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void 1044static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) 1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1058{ 1046{
1059 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1060 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1067 1049
1068 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1069 if (se->on_rq) 1051 if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1081 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1082 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1083 */ 1065 */
1084static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1085{ 1067{
1086 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1087 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1088 1070
1089 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1090 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1091 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1092 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1093 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1094 break; 1076 break;
1095 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1096 } 1078 }
1097 1079
1098 hrtick_update(rq); 1080 hrtick_update(rq);
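The enqueue/dequeue hunks above replace the separate 'wakeup'/'sleep' boolean arguments with a single flags bitmask (ENQUEUE_WAKEUP, ENQUEUE_WAKING, DEQUEUE_SLEEP), so callers can combine reasons and the fair class tests them with '&'. A standalone sketch of that plumbing, with illustrative flag values:

/* Standalone sketch (not kernel code): a flags word instead of booleans. */
#include <stdio.h>

#define ENQUEUE_WAKEUP  0x01
#define ENQUEUE_WAKING  0x04
#define DEQUEUE_SLEEP   0x01

static void dequeue_sketch(int flags)
{
    if (flags & DEQUEUE_SLEEP)
        printf("record sleep/block start timestamps\n");
    else
        printf("normalize vruntime for a migration or priority change\n");
}

int main(void)
{
    int enq_flags = ENQUEUE_WAKEUP | ENQUEUE_WAKING;

    printf("enqueue is a wakeup: %s\n",
           (enq_flags & ENQUEUE_WAKEUP) ? "yes" : "no");
    dequeue_sketch(DEQUEUE_SLEEP);
    dequeue_sketch(0);
    return 0;
}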
@@ -1240,11 +1222,9 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1240 1222
1241static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1242{ 1224{
1243 struct task_struct *curr = current;
1244 unsigned long this_load, load; 1225 unsigned long this_load, load;
1245 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1246 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
1247 unsigned int imbalance;
1248 struct task_group *tg; 1228 struct task_group *tg;
1249 unsigned long weight; 1229 unsigned long weight;
1250 int balanced; 1230 int balanced;
@@ -1255,23 +1235,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1255 load = source_load(prev_cpu, idx); 1235 load = source_load(prev_cpu, idx);
1256 this_load = target_load(this_cpu, idx); 1236 this_load = target_load(this_cpu, idx);
1257 1237
1258 if (sync) {
1259 if (sched_feat(SYNC_LESS) &&
1260 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1261 p->se.avg_overlap > sysctl_sched_migration_cost))
1262 sync = 0;
1263 } else {
1264 if (sched_feat(SYNC_MORE) &&
1265 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1266 p->se.avg_overlap < sysctl_sched_migration_cost))
1267 sync = 1;
1268 }
1269
1270 /* 1238 /*
1271 * If sync wakeup then subtract the (maximum possible) 1239 * If sync wakeup then subtract the (maximum possible)
1272 * effect of the currently running task from the load 1240 * effect of the currently running task from the load
1273 * of the current CPU: 1241 * of the current CPU:
1274 */ 1242 */
1243 rcu_read_lock();
1275 if (sync) { 1244 if (sync) {
1276 tg = task_group(current); 1245 tg = task_group(current);
1277 weight = current->se.load.weight; 1246 weight = current->se.load.weight;
@@ -1283,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1283 tg = task_group(p); 1252 tg = task_group(p);
1284 weight = p->se.load.weight; 1253 weight = p->se.load.weight;
1285 1254
1286 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1287
1288 /* 1255 /*
1289 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1256 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1290 * due to the sync cause above having dropped this_load to 0, we'll 1257 * due to the sync cause above having dropped this_load to 0, we'll
@@ -1294,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1294 * Otherwise check if either cpus are near enough in load to allow this 1261 * Otherwise check if either cpus are near enough in load to allow this
1295 * task to be woken on this_cpu. 1262 * task to be woken on this_cpu.
1296 */ 1263 */
1297 balanced = !this_load || 1264 if (this_load) {
1298 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= 1265 unsigned long this_eff_load, prev_eff_load;
1299 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1266
1267 this_eff_load = 100;
1268 this_eff_load *= power_of(prev_cpu);
1269 this_eff_load *= this_load +
1270 effective_load(tg, this_cpu, weight, weight);
1271
1272 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
1273 prev_eff_load *= power_of(this_cpu);
1274 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
1275
1276 balanced = this_eff_load <= prev_eff_load;
1277 } else
1278 balanced = true;
1279 rcu_read_unlock();
1300 1280
1301 /* 1281 /*
1302 * If the currently running task will sleep within 1282 * If the currently running task will sleep within
@@ -1306,7 +1286,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1306 if (sync && balanced) 1286 if (sync && balanced)
1307 return 1; 1287 return 1;
1308 1288
1309 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1289 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1310 tl_per_task = cpu_avg_load_per_task(this_cpu); 1290 tl_per_task = cpu_avg_load_per_task(this_cpu);
1311 1291
1312 if (balanced || 1292 if (balanced ||
@@ -1318,7 +1298,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1318 * there is no bad imbalance. 1298 * there is no bad imbalance.
1319 */ 1299 */
1320 schedstat_inc(sd, ttwu_move_affine); 1300 schedstat_inc(sd, ttwu_move_affine);
1321 schedstat_inc(p, se.nr_wakeups_affine); 1301 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1322 1302
1323 return 1; 1303 return 1;
1324 } 1304 }
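The wake_affine() hunks above drop the avg_overlap heuristics and the single imbalance factor in favour of comparing two effective loads, each scaled by the other cpu's power and by the domain's imbalance percentage, under rcu_read_lock(). A standalone sketch of that comparison with made-up loads and a simplified stand-in for effective_load():

/* Standalone sketch (not kernel code) of the balance test: wake onto this
 * cpu only if its effective load does not exceed prev_cpu's. */
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
    unsigned long this_load = 800, prev_load = 1200;   /* assumed loads */
    unsigned long power_this = 1024, power_prev = 589; /* assumed cpu_power */
    unsigned long imbalance_pct = 125;                 /* typical sd value */
    unsigned long eff_this = 256, eff_prev = 0;        /* effective_load() stand-ins */
    bool balanced;

    if (this_load) {
        unsigned long this_eff_load = 100;
        unsigned long prev_eff_load = 100 + (imbalance_pct - 100) / 2;

        this_eff_load *= power_prev;
        this_eff_load *= this_load + eff_this;

        prev_eff_load *= power_this;
        prev_eff_load *= prev_load + eff_prev;

        balanced = this_eff_load <= prev_eff_load;
    } else {
        balanced = true;
    }
    printf("wake onto this cpu considered balanced: %s\n",
           balanced ? "yes" : "no");
    return 0;
}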
@@ -1333,7 +1313,7 @@ static struct sched_group *
1333find_idlest_group(struct sched_domain *sd, struct task_struct *p, 1313find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1334 int this_cpu, int load_idx) 1314 int this_cpu, int load_idx)
1335{ 1315{
1336 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1316 struct sched_group *idlest = NULL, *group = sd->groups;
1337 unsigned long min_load = ULONG_MAX, this_load = 0; 1317 unsigned long min_load = ULONG_MAX, this_load = 0;
1338 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1318 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1339 1319
@@ -1368,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1368 1348
1369 if (local_group) { 1349 if (local_group) {
1370 this_load = avg_load; 1350 this_load = avg_load;
1371 this = group;
1372 } else if (avg_load < min_load) { 1351 } else if (avg_load < min_load) {
1373 min_load = avg_load; 1352 min_load = avg_load;
1374 idlest = group; 1353 idlest = group;
@@ -1406,29 +1385,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1406/* 1385/*
1407 * Try and locate an idle CPU in the sched_domain. 1386 * Try and locate an idle CPU in the sched_domain.
1408 */ 1387 */
1409static int 1388static int select_idle_sibling(struct task_struct *p, int target)
1410select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1411{ 1389{
1412 int cpu = smp_processor_id(); 1390 int cpu = smp_processor_id();
1413 int prev_cpu = task_cpu(p); 1391 int prev_cpu = task_cpu(p);
1392 struct sched_domain *sd;
1414 int i; 1393 int i;
1415 1394
1416 /* 1395 /*
1417 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1396 * If the task is going to be woken-up on this cpu and if it is
1418 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1397 * already idle, then it is the right target.
1419 * always a better target than the current cpu. 1398 */
1399 if (target == cpu && idle_cpu(cpu))
1400 return cpu;
1401
1402 /*
1403 * If the task is going to be woken-up on the cpu where it previously
1404 * ran and if it is currently idle, then it the right target.
1420 */ 1405 */
1421 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1406 if (target == prev_cpu && idle_cpu(prev_cpu))
1422 return prev_cpu; 1407 return prev_cpu;
1423 1408
1424 /* 1409 /*
1425 * Otherwise, iterate the domain and find an eligible idle cpu. 1410 * Otherwise, iterate the domains and find an eligible idle cpu.
1426 */ 1411 */
1427 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1412 for_each_domain(target, sd) {
1428 if (!cpu_rq(i)->cfs.nr_running) { 1413 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1429 target = i;
1430 break; 1414 break;
1415
1416 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1417 if (idle_cpu(i)) {
1418 target = i;
1419 break;
1420 }
1431 } 1421 }
1422
1423 /*
1424 * Let's stop looking for an idle sibling when we have reached
1425 * the domain that spans the current cpu and prev_cpu.
1426 */
1427 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1428 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1429 break;
1432 } 1430 }
1433 1431
1434 return target; 1432 return target;
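select_idle_sibling() above now walks the wakeup cpu's domains itself, scanning cache-sharing levels for an idle cpu and stopping at the first level that either lacks SD_SHARE_PKG_RESOURCES or spans both cpu and prev_cpu. A small standalone simulation of that walk (the domain layout and idle states below are made up):

/* Standalone sketch (not kernel code): walk progressively larger "domains"
 * looking for an idle cpu, stopping once package resources are no longer
 * shared or once the domain spans both the waking cpu and prev_cpu. */
#include <stdio.h>
#include <stdbool.h>

#define NCPUS 8

struct domain {
    bool share_pkg_resources;   /* analogous to SD_SHARE_PKG_RESOURCES */
    unsigned mask;              /* bitmask of cpus spanned */
};

static bool idle[NCPUS] = { false, false, true, false, true, false, false, false };

static int select_idle_sibling_sketch(int target, int cpu, int prev_cpu,
                                      const struct domain *doms, int ndoms)
{
    if (target == cpu && idle[cpu])
        return cpu;
    if (target == prev_cpu && idle[prev_cpu])
        return prev_cpu;

    for (int d = 0; d < ndoms; d++) {
        if (!doms[d].share_pkg_resources)
            break;
        for (int i = 0; i < NCPUS; i++) {
            if ((doms[d].mask & (1u << i)) && idle[i])
                return i;
        }
        /* stop once the domain spans both cpu and prev_cpu */
        if ((doms[d].mask & (1u << cpu)) && (doms[d].mask & (1u << prev_cpu)))
            break;
    }
    return target;
}

int main(void)
{
    struct domain doms[] = {
        { true,  0x03 },   /* SMT siblings of cpu 0: cpus 0-1 */
        { true,  0x0f },   /* shared cache: cpus 0-3 */
        { false, 0xff },   /* whole node: no shared package resources */
    };

    printf("picked cpu %d\n",
           select_idle_sibling_sketch(0, 0, 5, doms, 3));
    return 0;
}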
@@ -1445,7 +1443,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1445 * 1443 *
1446 * preempt must be disabled. 1444 * preempt must be disabled.
1447 */ 1445 */
1448static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1446static int
1447select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1449{ 1448{
1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1449 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1451 int cpu = smp_processor_id(); 1450 int cpu = smp_processor_id();
@@ -1456,8 +1455,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1456 int sync = wake_flags & WF_SYNC; 1455 int sync = wake_flags & WF_SYNC;
1457 1456
1458 if (sd_flag & SD_BALANCE_WAKE) { 1457 if (sd_flag & SD_BALANCE_WAKE) {
1459 if (sched_feat(AFFINE_WAKEUPS) && 1458 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 cpumask_test_cpu(cpu, &p->cpus_allowed))
1461 want_affine = 1; 1459 want_affine = 1;
1462 new_cpu = prev_cpu; 1460 new_cpu = prev_cpu;
1463 } 1461 }
@@ -1491,34 +1489,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1491 } 1489 }
1492 1490
1493 /* 1491 /*
1494 * While iterating the domains looking for a spanning 1492 * If both cpu and prev_cpu are part of this domain,
1495 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1493 * cpu is a valid SD_WAKE_AFFINE target.
1496 * in cache sharing domains along the way.
1497 */ 1494 */
1498 if (want_affine) { 1495 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1499 int target = -1; 1496 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1500 1497 affine_sd = tmp;
1501 /* 1498 want_affine = 0;
1502 * If both cpu and prev_cpu are part of this domain,
1503 * cpu is a valid SD_WAKE_AFFINE target.
1504 */
1505 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1506 target = cpu;
1507
1508 /*
1509 * If there's an idle sibling in this domain, make that
1510 * the wake_affine target instead of the current cpu.
1511 */
1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1513 target = select_idle_sibling(p, tmp, target);
1514
1515 if (target >= 0) {
1516 if (tmp->flags & SD_WAKE_AFFINE) {
1517 affine_sd = tmp;
1518 want_affine = 0;
1519 }
1520 cpu = target;
1521 }
1522 } 1499 }
1523 1500
1524 if (!want_sd && !want_affine) 1501 if (!want_sd && !want_affine)
@@ -1531,22 +1508,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1531 sd = tmp; 1508 sd = tmp;
1532 } 1509 }
1533 1510
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1534 if (sched_feat(LB_SHARES_UPDATE)) { 1512 if (sched_feat(LB_SHARES_UPDATE)) {
1535 /* 1513 /*
1536 * Pick the largest domain to update shares over 1514 * Pick the largest domain to update shares over
1537 */ 1515 */
1538 tmp = sd; 1516 tmp = sd;
1539 if (affine_sd && (!tmp || 1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1540 cpumask_weight(sched_domain_span(affine_sd)) >
1541 cpumask_weight(sched_domain_span(sd))))
1542 tmp = affine_sd; 1518 tmp = affine_sd;
1543 1519
1544 if (tmp) 1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1545 update_shares(tmp); 1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1546 } 1525 }
1526#endif
1547 1527
1548 if (affine_sd && wake_affine(affine_sd, p, sync)) 1528 if (affine_sd) {
1549 return cpu; 1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu);
1531 else
1532 return select_idle_sibling(p, prev_cpu);
1533 }
1550 1534
1551 while (sd) { 1535 while (sd) {
1552 int load_idx = sd->forkexec_idx; 1536 int load_idx = sd->forkexec_idx;
@@ -1576,10 +1560,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1576 1560
1577 /* Now try balancing at a lower domain level of new_cpu */ 1561 /* Now try balancing at a lower domain level of new_cpu */
1578 cpu = new_cpu; 1562 cpu = new_cpu;
1579 weight = cpumask_weight(sched_domain_span(sd)); 1563 weight = sd->span_weight;
1580 sd = NULL; 1564 sd = NULL;
1581 for_each_domain(cpu, tmp) { 1565 for_each_domain(cpu, tmp) {
1582 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1566 if (weight <= tmp->span_weight)
1583 break; 1567 break;
1584 if (tmp->flags & sd_flag) 1568 if (tmp->flags & sd_flag)
1585 sd = tmp; 1569 sd = tmp;
@@ -1591,63 +1575,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1591} 1575}
1592#endif /* CONFIG_SMP */ 1576#endif /* CONFIG_SMP */
1593 1577
1594/*
1595 * Adaptive granularity
1596 *
1597 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1598 * with the limit of wakeup_gran -- when it never does a wakeup.
1599 *
1600 * So the smaller avg_wakeup is the faster we want this task to preempt,
1601 * but we don't want to treat the preemptee unfairly and therefore allow it
1602 * to run for at least the amount of time we'd like to run.
1603 *
1604 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1605 *
1606 * NOTE: we use *nr_running to scale with load, this nicely matches the
1607 * degrading latency on load.
1608 */
1609static unsigned long
1610adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1611{
1612 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1613 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1614 u64 gran = 0;
1615
1616 if (this_run < expected_wakeup)
1617 gran = expected_wakeup - this_run;
1618
1619 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1620}
1621
1622static unsigned long 1578static unsigned long
1623wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1579wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1624{ 1580{
1625 unsigned long gran = sysctl_sched_wakeup_granularity; 1581 unsigned long gran = sysctl_sched_wakeup_granularity;
1626 1582
1627 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1628 gran = adaptive_gran(curr, se);
1629
1630 /* 1583 /*
1631 * Since it's curr running now, convert the gran from real-time 1584 * Since it's curr running now, convert the gran from real-time
1632 * to virtual-time in its units. 1585 * to virtual-time in its units.
1586 *
1587 * By using 'se' instead of 'curr' we penalize light tasks, so
1588 * they get preempted easier. That is, if 'se' < 'curr' then
1589 * the resulting gran will be larger, therefore penalizing the
1590 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1591 * be smaller, again penalizing the lighter task.
1592 *
1593 * This is especially important for buddies when the leftmost
1594 * task is higher priority than the buddy.
1633 */ 1595 */
1634 if (sched_feat(ASYM_GRAN)) { 1596 if (unlikely(se->load.weight != NICE_0_LOAD))
1635 /* 1597 gran = calc_delta_fair(gran, se);
1636 * By using 'se' instead of 'curr' we penalize light tasks, so
1637 * they get preempted easier. That is, if 'se' < 'curr' then
1638 * the resulting gran will be larger, therefore penalizing the
1639 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1640 * be smaller, again penalizing the lighter task.
1641 *
1642 * This is especially important for buddies when the leftmost
1643 * task is higher priority than the buddy.
1644 */
1645 if (unlikely(se->load.weight != NICE_0_LOAD))
1646 gran = calc_delta_fair(gran, se);
1647 } else {
1648 if (unlikely(curr->load.weight != NICE_0_LOAD))
1649 gran = calc_delta_fair(gran, curr);
1650 }
1651 1598
1652 return gran; 1599 return gran;
1653} 1600}
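With the adaptive and ASYM_GRAN variants gone, wakeup_gran() above always rescales the granularity by the waking entity's weight via calc_delta_fair(), which is roughly gran * NICE_0_LOAD / weight. A standalone arithmetic sketch, assuming a NICE_0_LOAD of 1024 and made-up weights:

/* Standalone arithmetic (not kernel code): the same real-time granularity
 * becomes a larger virtual-time gap for a light entity and a smaller one for
 * a heavy entity. */
#include <stdio.h>

#define NICE_0_LOAD 1024UL

int main(void)
{
    unsigned long gran = 1000000;                       /* 1 ms in ns */
    unsigned long weights[] = { 512, 1024, 2048 };      /* light, nice-0, heavy */

    for (int i = 0; i < 3; i++)
        printf("weight %4lu -> virtual gran %lu ns\n", weights[i],
               gran * NICE_0_LOAD / weights[i]);
    return 0;
}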
@@ -1705,7 +1652,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1705 struct task_struct *curr = rq->curr; 1652 struct task_struct *curr = rq->curr;
1706 struct sched_entity *se = &curr->se, *pse = &p->se; 1653 struct sched_entity *se = &curr->se, *pse = &p->se;
1707 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1708 int sync = wake_flags & WF_SYNC;
1709 int scale = cfs_rq->nr_running >= sched_nr_latency; 1655 int scale = cfs_rq->nr_running >= sched_nr_latency;
1710 1656
1711 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) 1657 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
@@ -1738,14 +1684,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1738 if (unlikely(curr->policy == SCHED_IDLE)) 1684 if (unlikely(curr->policy == SCHED_IDLE))
1739 goto preempt; 1685 goto preempt;
1740 1686
1741 if (sched_feat(WAKEUP_SYNC) && sync)
1742 goto preempt;
1743
1744 if (sched_feat(WAKEUP_OVERLAP) &&
1745 se->avg_overlap < sysctl_sched_migration_cost &&
1746 pse->avg_overlap < sysctl_sched_migration_cost)
1747 goto preempt;
1748
1749 if (!sched_feat(WAKEUP_PREEMPT)) 1687 if (!sched_feat(WAKEUP_PREEMPT))
1750 return; 1688 return;
1751 1689
@@ -1844,13 +1782,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1844 * 3) are cache-hot on their current CPU. 1782 * 3) are cache-hot on their current CPU.
1845 */ 1783 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 1784 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine); 1785 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1848 return 0; 1786 return 0;
1849 } 1787 }
1850 *all_pinned = 0; 1788 *all_pinned = 0;
1851 1789
1852 if (task_running(rq, p)) { 1790 if (task_running(rq, p)) {
1853 schedstat_inc(p, se.nr_failed_migrations_running); 1791 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1854 return 0; 1792 return 0;
1855 } 1793 }
1856 1794
@@ -1866,14 +1804,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1866#ifdef CONFIG_SCHEDSTATS 1804#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) { 1805 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]); 1806 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations); 1807 schedstat_inc(p, se.statistics.nr_forced_migrations);
1870 } 1808 }
1871#endif 1809#endif
1872 return 1; 1810 return 1;
1873 } 1811 }
1874 1812
1875 if (tsk_cache_hot) { 1813 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot); 1814 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1877 return 0; 1815 return 0;
1878 } 1816 }
1879 return 1; 1817 return 1;
@@ -2311,7 +2249,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2311 2249
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 2250unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{ 2251{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2252 unsigned long weight = sd->span_weight;
2315 unsigned long smt_gain = sd->smt_gain; 2253 unsigned long smt_gain = sd->smt_gain;
2316 2254
2317 smt_gain /= weight; 2255 smt_gain /= weight;
@@ -2329,8 +2267,6 @@ unsigned long scale_rt_power(int cpu)
2329 struct rq *rq = cpu_rq(cpu); 2267 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available; 2268 u64 total, available;
2331 2269
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2270 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg; 2271 available = total - rq->rt_avg;
2336 2272
@@ -2344,17 +2280,10 @@ unsigned long scale_rt_power(int cpu)
2344 2280
2345static void update_cpu_power(struct sched_domain *sd, int cpu) 2281static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{ 2282{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2283 unsigned long weight = sd->span_weight;
2348 unsigned long power = SCHED_LOAD_SCALE; 2284 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups; 2285 struct sched_group *sdg = sd->groups;
2350 2286
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2287 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER)) 2288 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu); 2289 power *= arch_scale_smt_power(sd, cpu);
@@ -2364,12 +2293,22 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2364 power >>= SCHED_LOAD_SHIFT; 2293 power >>= SCHED_LOAD_SHIFT;
2365 } 2294 }
2366 2295
2296 sdg->cpu_power_orig = power;
2297
2298 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_freq_power(sd, cpu);
2300 else
2301 power *= default_scale_freq_power(sd, cpu);
2302
2303 power >>= SCHED_LOAD_SHIFT;
2304
2367 power *= scale_rt_power(cpu); 2305 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT; 2306 power >>= SCHED_LOAD_SHIFT;
2369 2307
2370 if (!power) 2308 if (!power)
2371 power = 1; 2309 power = 1;
2372 2310
2311 cpu_rq(cpu)->cpu_power = power;
2373 sdg->cpu_power = power; 2312 sdg->cpu_power = power;
2374} 2313}
2375 2314
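update_cpu_power() above now records cpu_power_orig after the SMT scaling but before the frequency and RT scaling, so fix_small_capacity() can later compare the fully scaled power against it. A standalone arithmetic sketch of that ordering with made-up scale factors:

/* Standalone arithmetic sketch (not kernel code): cpu_power starts at
 * SCHED_LOAD_SCALE and is scaled down in turn; cpu_power_orig is the value
 * before the frequency/RT scaling. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define SCHED_LOAD_SHIFT 10

int main(void)
{
    unsigned long power = SCHED_LOAD_SCALE;
    unsigned long smt_factor  = 589;   /* e.g. smt_gain / weight (assumed) */
    unsigned long freq_factor = 1024;  /* full frequency (assumed) */
    unsigned long rt_factor   = 972;   /* ~5% taken by RT tasks (assumed) */

    power = (power * smt_factor) >> SCHED_LOAD_SHIFT;
    unsigned long cpu_power_orig = power;          /* recorded before freq/RT */
    power = (power * freq_factor) >> SCHED_LOAD_SHIFT;
    power = (power * rt_factor) >> SCHED_LOAD_SHIFT;
    if (!power)
        power = 1;

    printf("cpu_power_orig=%lu cpu_power=%lu\n", cpu_power_orig, power);
    return 0;
}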
@@ -2395,6 +2334,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2395 sdg->cpu_power = power; 2334 sdg->cpu_power = power;
2396} 2335}
2397 2336
2337/*
2338 * Try and fix up capacity for tiny siblings, this is needed when
2339 * things like SD_ASYM_PACKING need f_b_g to select another sibling
2340 * which on its own isn't powerful enough.
2341 *
2342 * See update_sd_pick_busiest() and check_asym_packing().
2343 */
2344static inline int
2345fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2346{
2347 /*
2348 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2349 */
2350 if (sd->level != SD_LV_SIBLING)
2351 return 0;
2352
2353 /*
2354 * If ~90% of the cpu_power is still there, we're good.
2355 */
2356 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2357 return 1;
2358
2359 return 0;
2360}
2361
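The fix_small_capacity() helper added above keeps a sibling group usable when cpu_power * 32 > cpu_power_orig * 29, i.e. when more than roughly 29/32 = 90.6% of the original power remains. A standalone check of where that threshold falls, assuming a SCHED_LOAD_SCALE of 1024:

/* Standalone arithmetic check (not kernel code) of the "~90%" test. */
#include <stdio.h>

int main(void)
{
    unsigned long cpu_power_orig = 1024;          /* assumed SCHED_LOAD_SCALE */

    for (unsigned long cpu_power = 900; cpu_power <= 940; cpu_power += 10)
        printf("power %4lu -> capacity kept: %s\n", cpu_power,
               cpu_power * 32 > cpu_power_orig * 29 ? "yes" : "no");
    return 0;
}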
2398/** 2362/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 2363 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated. 2364 * @sd: The sched_domain whose statistics are to be updated.
@@ -2460,14 +2424,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2460 * domains. In the newly idle case, we will allow all the cpu's 2424 * domains. In the newly idle case, we will allow all the cpu's
2461 * to do the newly idle load balance. 2425 * to do the newly idle load balance.
2462 */ 2426 */
2463 if (idle != CPU_NEWLY_IDLE && local_group && 2427 if (idle != CPU_NEWLY_IDLE && local_group) {
2464 balance_cpu != this_cpu) { 2428 if (balance_cpu != this_cpu) {
2465 *balance = 0; 2429 *balance = 0;
2466 return; 2430 return;
2431 }
2432 update_group_power(sd, this_cpu);
2467 } 2433 }
2468 2434
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */ 2435 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2436 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473 2437
@@ -2488,6 +2452,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2488 2452
2489 sgs->group_capacity = 2453 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2454 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2455 if (!sgs->group_capacity)
2456 sgs->group_capacity = fix_small_capacity(sd, group);
2457}
2458
2459/**
2460 * update_sd_pick_busiest - return 1 on busiest group
2461 * @sd: sched_domain whose statistics are to be checked
2462 * @sds: sched_domain statistics
2463 * @sg: sched_group candidate to be checked for being the busiest
2464 * @sgs: sched_group statistics
2465 * @this_cpu: the current cpu
2466 *
2467 * Determine if @sg is a busier group than the previously selected
2468 * busiest group.
2469 */
2470static bool update_sd_pick_busiest(struct sched_domain *sd,
2471 struct sd_lb_stats *sds,
2472 struct sched_group *sg,
2473 struct sg_lb_stats *sgs,
2474 int this_cpu)
2475{
2476 if (sgs->avg_load <= sds->max_load)
2477 return false;
2478
2479 if (sgs->sum_nr_running > sgs->group_capacity)
2480 return true;
2481
2482 if (sgs->group_imb)
2483 return true;
2484
2485 /*
2486 * ASYM_PACKING needs to move all the work to the lowest
2487 * numbered CPUs in the group, therefore mark all groups
2488 * higher than ourself as busy.
2489 */
2490 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2491 this_cpu < group_first_cpu(sg)) {
2492 if (!sds->busiest)
2493 return true;
2494
2495 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2496 return true;
2497 }
2498
2499 return false;
2491} 2500}
2492 2501
2493/** 2502/**
@@ -2495,7 +2504,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2495 * @sd: sched_domain whose statistics are to be updated. 2504 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed. 2505 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu 2506 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group. 2507 * @sd_idle: Idle status of the sched_domain containing sg.
2499 * @cpus: Set of cpus considered for load balancing. 2508 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance. 2509 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain. 2510 * @sds: variable to hold the statistics for this sched_domain.
@@ -2506,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2506 struct sd_lb_stats *sds) 2515 struct sd_lb_stats *sds)
2507{ 2516{
2508 struct sched_domain *child = sd->child; 2517 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups; 2518 struct sched_group *sg = sd->groups;
2510 struct sg_lb_stats sgs; 2519 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0; 2520 int load_idx, prefer_sibling = 0;
2512 2521
@@ -2519,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2519 do { 2528 do {
2520 int local_group; 2529 int local_group;
2521 2530
2522 local_group = cpumask_test_cpu(this_cpu, 2531 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs)); 2532 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 2533 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs); 2534 local_group, cpus, balance, &sgs);
2527 2535
2528 if (local_group && !(*balance)) 2536 if (local_group && !(*balance))
2529 return; 2537 return;
2530 2538
2531 sds->total_load += sgs.group_load; 2539 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power; 2540 sds->total_pwr += sg->cpu_power;
2533 2541
2534 /* 2542 /*
2535 * In case the child domain prefers tasks go to siblings 2543 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try 2544 * first, lower the sg capacity to one so that we'll try
2537 * and move all the excess tasks away. 2545 * and move all the excess tasks away.
2538 */ 2546 */
2539 if (prefer_sibling) 2547 if (prefer_sibling)
@@ -2541,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2541 2549
2542 if (local_group) { 2550 if (local_group) {
2543 sds->this_load = sgs.avg_load; 2551 sds->this_load = sgs.avg_load;
2544 sds->this = group; 2552 sds->this = sg;
2545 sds->this_nr_running = sgs.sum_nr_running; 2553 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load; 2554 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load && 2555 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load; 2556 sds->max_load = sgs.avg_load;
2551 sds->busiest = group; 2557 sds->busiest = sg;
2552 sds->busiest_nr_running = sgs.sum_nr_running; 2558 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity; 2559 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load; 2560 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb; 2561 sds->group_imb = sgs.group_imb;
2556 } 2562 }
2557 2563
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs); 2564 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2559 group = group->next; 2565 sg = sg->next;
2560 } while (group != sd->groups); 2566 } while (sg != sd->groups);
2567}
2568
2569int __weak arch_sd_sibling_asym_packing(void)
2570{
2571 return 0*SD_ASYM_PACKING;
2572}
2573
2574/**
2575 * check_asym_packing - Check to see if the group is packed into the
2576 * sched domain.
2577 *
2578 * This is primarily intended to be used at the sibling level. Some
2579 * cores like POWER7 prefer to use lower numbered SMT threads. In the
2580 * case of POWER7, it can move to lower SMT modes only when higher
2581 * threads are idle. When in lower SMT modes, the threads will
2582 * perform better since they share less core resources. Hence when we
2583 * have idle threads, we want them to be the higher ones.
2584 *
2585 * This packing function is run on idle threads. It checks to see if
2586 * the busiest CPU in this domain (core in the P7 case) has a higher
2587 * CPU number than the packing function is being run on. Here we are
2588 * assuming a lower CPU number will be equivalent to a lower SMT thread
2589 * number.
2590 *
2591 * Returns 1 when packing is required and a task should be moved to
2592 * this CPU. The amount of the imbalance is returned in *imbalance.
2593 *
2594 * @sd: The sched_domain whose packing is to be checked.
2595 * @sds: Statistics of the sched_domain which is to be packed
2596 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2597 * @imbalance: returns amount of imbalance due to packing.
2598 */
2599static int check_asym_packing(struct sched_domain *sd,
2600 struct sd_lb_stats *sds,
2601 int this_cpu, unsigned long *imbalance)
2602{
2603 int busiest_cpu;
2604
2605 if (!(sd->flags & SD_ASYM_PACKING))
2606 return 0;
2607
2608 if (!sds->busiest)
2609 return 0;
2610
2611 busiest_cpu = group_first_cpu(sds->busiest);
2612 if (this_cpu > busiest_cpu)
2613 return 0;
2614
2615 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2616 SCHED_LOAD_SCALE);
2617 return 1;
2561} 2618}
2562 2619
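check_asym_packing() above converts the busiest group's average load back into task-load units by scaling it with that group's cpu_power and rounding to the closest integer. A standalone arithmetic example with made-up values:

/* Standalone arithmetic (not kernel code): the packing imbalance is max_load
 * scaled by the busiest group's cpu_power over SCHED_LOAD_SCALE. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
    unsigned long max_load = 1536;   /* per-group average load (assumed) */
    unsigned long cpu_power = 589;   /* busiest group's cpu_power (assumed) */
    unsigned long imbalance =
        DIV_ROUND_CLOSEST(max_load * cpu_power, SCHED_LOAD_SCALE);

    printf("imbalance = %lu\n", imbalance);   /* 1536 * 589 / 1024 = 884 */
    return 0;
}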
2563/** 2620/**
@@ -2752,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2752 if (!(*balance)) 2809 if (!(*balance))
2753 goto ret; 2810 goto ret;
2754 2811
2812 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
2813 check_asym_packing(sd, &sds, this_cpu, imbalance))
2814 return sds.busiest;
2815
2755 if (!sds.busiest || sds.busiest_nr_running == 0) 2816 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced; 2817 goto out_balanced;
2757 2818
@@ -2786,8 +2847,9 @@ ret:
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2847 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */ 2848 */
2788static struct rq * 2849static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2850find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2790 unsigned long imbalance, const struct cpumask *cpus) 2851 enum cpu_idle_type idle, unsigned long imbalance,
2852 const struct cpumask *cpus)
2791{ 2853{
2792 struct rq *busiest = NULL, *rq; 2854 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0; 2855 unsigned long max_load = 0;
@@ -2798,6 +2860,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 2860 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl; 2861 unsigned long wl;
2800 2862
2863 if (!capacity)
2864 capacity = fix_small_capacity(sd, group);
2865
2801 if (!cpumask_test_cpu(i, cpus)) 2866 if (!cpumask_test_cpu(i, cpus))
2802 continue; 2867 continue;
2803 2868
@@ -2837,9 +2902,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2837/* Working cpumask for load_balance and load_balance_newidle. */ 2902/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 2903static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839 2904
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) 2905static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2906 int busiest_cpu, int this_cpu)
2841{ 2907{
2842 if (idle == CPU_NEWLY_IDLE) { 2908 if (idle == CPU_NEWLY_IDLE) {
2909
2910 /*
2911 * ASYM_PACKING needs to force migrate tasks from busy but
2912 * higher numbered CPUs in order to pack all tasks in the
2913 * lowest numbered CPUs.
2914 */
2915 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
2916 return 1;
2917
2843 /* 2918 /*
2844 * The only task running in a non-idle cpu can be moved to this 2919 * The only task running in a non-idle cpu can be moved to this
2845 * cpu in an attempt to completely freeup the other CPU 2920 * cpu in an attempt to completely freeup the other CPU
@@ -2870,6 +2945,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 2945 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871} 2946}
2872 2947
2948static int active_load_balance_cpu_stop(void *data);
2949
2873/* 2950/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2951 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance. 2952 * tasks if there is an imbalance.
@@ -2912,7 +2989,7 @@ redo:
2912 goto out_balanced; 2989 goto out_balanced;
2913 } 2990 }
2914 2991
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus); 2992 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
2916 if (!busiest) { 2993 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]); 2994 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced; 2995 goto out_balanced;
@@ -2956,11 +3033,13 @@ redo:
2956 schedstat_inc(sd, lb_failed[idle]); 3033 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++; 3034 sd->nr_balance_failed++;
2958 3035
2959 if (need_active_balance(sd, sd_idle, idle)) { 3036 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3037 this_cpu)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags); 3038 raw_spin_lock_irqsave(&busiest->lock, flags);
2961 3039
2962 /* don't kick the migration_thread, if the curr 3040 /* don't kick the active_load_balance_cpu_stop,
2963 * task on busiest cpu can't be moved to this_cpu 3041 * if the curr task on busiest cpu can't be
3042 * moved to this_cpu
2964 */ 3043 */
2965 if (!cpumask_test_cpu(this_cpu, 3044 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) { 3045 &busiest->curr->cpus_allowed)) {
@@ -2970,14 +3049,22 @@ redo:
2970 goto out_one_pinned; 3049 goto out_one_pinned;
2971 } 3050 }
2972 3051
3052 /*
3053 * ->active_balance synchronizes accesses to
3054 * ->active_balance_work. Once set, it's cleared
3055 * only after active load balance is finished.
3056 */
2973 if (!busiest->active_balance) { 3057 if (!busiest->active_balance) {
2974 busiest->active_balance = 1; 3058 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu; 3059 busiest->push_cpu = this_cpu;
2976 active_balance = 1; 3060 active_balance = 1;
2977 } 3061 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags); 3062 raw_spin_unlock_irqrestore(&busiest->lock, flags);
3063
2979 if (active_balance) 3064 if (active_balance)
2980 wake_up_process(busiest->migration_thread); 3065 stop_one_cpu_nowait(cpu_of(busiest),
3066 active_load_balance_cpu_stop, busiest,
3067 &busiest->active_balance_work);
2981 3068
2982 /* 3069 /*
2983 * We've kicked active balancing, reset the failure 3070 * We've kicked active balancing, reset the failure
@@ -3084,24 +3171,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3084} 3171}
3085 3172
3086/* 3173/*
3087 * active_load_balance is run by migration threads. It pushes running tasks 3174 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3175 * running tasks off the busiest CPU onto idle CPUs. It requires at
3089 * running on each physical CPU where possible, and avoids physical / 3176 * least 1 task to be running on each physical CPU where possible, and
3090 * logical imbalances. 3177 * avoids physical / logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */ 3178 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3179static int active_load_balance_cpu_stop(void *data)
3095{ 3180{
3181 struct rq *busiest_rq = data;
3182 int busiest_cpu = cpu_of(busiest_rq);
3096 int target_cpu = busiest_rq->push_cpu; 3183 int target_cpu = busiest_rq->push_cpu;
3184 struct rq *target_rq = cpu_rq(target_cpu);
3097 struct sched_domain *sd; 3185 struct sched_domain *sd;
3098 struct rq *target_rq; 3186
3187 raw_spin_lock_irq(&busiest_rq->lock);
3188
3189 /* make sure the requested cpu hasn't gone down in the meantime */
3190 if (unlikely(busiest_cpu != smp_processor_id() ||
3191 !busiest_rq->active_balance))
3192 goto out_unlock;
3099 3193
3100 /* Is there any task to move? */ 3194 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1) 3195 if (busiest_rq->nr_running <= 1)
3102 return; 3196 goto out_unlock;
3103
3104 target_rq = cpu_rq(target_cpu);
3105 3197
3106 /* 3198 /*
3107 * This condition is "impossible", if it occurs 3199 * This condition is "impossible", if it occurs
@@ -3112,8 +3204,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3112 3204
3113 /* move a task from busiest_rq to target_rq */ 3205 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq); 3206 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117 3207
3118 /* Search for an sd spanning us and the target CPU. */ 3208 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) { 3209 for_each_domain(target_cpu, sd) {
@@ -3132,16 +3222,47 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3132 schedstat_inc(sd, alb_failed); 3222 schedstat_inc(sd, alb_failed);
3133 } 3223 }
3134 double_unlock_balance(busiest_rq, target_rq); 3224 double_unlock_balance(busiest_rq, target_rq);
3225out_unlock:
3226 busiest_rq->active_balance = 0;
3227 raw_spin_unlock_irq(&busiest_rq->lock);
3228 return 0;
3135} 3229}
3136 3230
3137#ifdef CONFIG_NO_HZ 3231#ifdef CONFIG_NO_HZ
3232
3233static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3234
3235static void trigger_sched_softirq(void *data)
3236{
3237 raise_softirq_irqoff(SCHED_SOFTIRQ);
3238}
3239
3240static inline void init_sched_softirq_csd(struct call_single_data *csd)
3241{
3242 csd->func = trigger_sched_softirq;
3243 csd->info = NULL;
3244 csd->flags = 0;
3245 csd->priv = 0;
3246}
3247
3248/*
3249 * idle load balancing details
3250 * - One of the idle CPUs nominates itself as idle load_balancer, while
3251 * entering idle.
3252 * - This idle load balancer CPU will also go into tickless mode when
3253 * it is idle, just like all other idle CPUs
3254 * - When one of the busy CPUs notices that idle rebalancing may be
3255 * needed, it will kick the idle load balancer, which then does idle
3256 * load balancing for all the idle CPUs.
3257 */
3138static struct { 3258static struct {
3139 atomic_t load_balancer; 3259 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask; 3260 atomic_t first_pick_cpu;
3141 cpumask_var_t ilb_grp_nohz_mask; 3261 atomic_t second_pick_cpu;
3142} nohz ____cacheline_aligned = { 3262 cpumask_var_t idle_cpus_mask;
3143 .load_balancer = ATOMIC_INIT(-1), 3263 cpumask_var_t grp_idle_mask;
3144}; 3264 unsigned long next_balance; /* in jiffy units */
3265} nohz ____cacheline_aligned;
3145 3266
3146int get_nohz_load_balancer(void) 3267int get_nohz_load_balancer(void)
3147{ 3268{
@@ -3195,17 +3316,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3195 */ 3316 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group) 3317static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{ 3318{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 3319 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3199 sched_group_cpus(ilb_group)); 3320 sched_group_cpus(ilb_group));
3200 3321
3201 /* 3322 /*
3202 * A sched_group is semi-idle when it has at least one busy cpu 3323 * A sched_group is semi-idle when it has at least one busy cpu
3203 * and at least one idle cpu. 3324 * and at least one idle cpu.
3204 */ 3325 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 3326 if (cpumask_empty(nohz.grp_idle_mask))
3206 return 0; 3327 return 0;
3207 3328
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 3329 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3209 return 0; 3330 return 0;
3210 3331
3211 return 1; 3332 return 1;
@@ -3238,7 +3359,7 @@ static int find_new_ilb(int cpu)
3238 * Optimize for the case when we have no idle CPUs or only one 3359 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases 3360 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */ 3361 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2) 3362 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3242 goto out_done; 3363 goto out_done;
3243 3364
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3365 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3246,7 +3367,7 @@ static int find_new_ilb(int cpu)
3246 3367
3247 do { 3368 do {
3248 if (is_semi_idle_group(ilb_group)) 3369 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask); 3370 return cpumask_first(nohz.grp_idle_mask);
3250 3371
3251 ilb_group = ilb_group->next; 3372 ilb_group = ilb_group->next;
3252 3373
@@ -3254,98 +3375,116 @@ static int find_new_ilb(int cpu)
3254 } 3375 }
3255 3376
3256out_done: 3377out_done:
3257 return cpumask_first(nohz.cpu_mask); 3378 return nr_cpu_ids;
3258} 3379}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3380#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu) 3381static inline int find_new_ilb(int call_cpu)
3261{ 3382{
3262 return cpumask_first(nohz.cpu_mask); 3383 return nr_cpu_ids;
3263} 3384}
3264#endif 3385#endif
3265 3386
3266/* 3387/*
3388 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
3389 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
3390 * CPU (if there is one).
3391 */
3392static void nohz_balancer_kick(int cpu)
3393{
3394 int ilb_cpu;
3395
3396 nohz.next_balance++;
3397
3398 ilb_cpu = get_nohz_load_balancer();
3399
3400 if (ilb_cpu >= nr_cpu_ids) {
3401 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3402 if (ilb_cpu >= nr_cpu_ids)
3403 return;
3404 }
3405
3406 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3407 struct call_single_data *cp;
3408
3409 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3410 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3411 __smp_call_function_single(ilb_cpu, cp, 0);
3412 }
3413 return;
3414}
3415
3416/*
3267 * This routine will try to nominate the ilb (idle load balancing) 3417 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 3418 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system 3419 * load balancing on behalf of all those cpus.
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 * 3420 *
3278 * While stopping the tick, this cpu will become the ilb owner if there 3421 * When the ilb owner becomes busy, we will not have new ilb owner until some
3279 * is no other owner. And will be the owner till that cpu becomes busy 3422 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3280 * or if all cpus in the system stop their ticks at which point 3423 * idle load balancing by kicking one of the idle CPUs.
3281 * there is no need for ilb owner.
3282 * 3424 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the 3425 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
3284 * next busy scheduler_tick() 3426 * ilb owner CPU in future (when there is a need for idle load balancing on
3427 * behalf of all idle CPUs).
3285 */ 3428 */
3286int select_nohz_load_balancer(int stop_tick) 3429void select_nohz_load_balancer(int stop_tick)
3287{ 3430{
3288 int cpu = smp_processor_id(); 3431 int cpu = smp_processor_id();
3289 3432
3290 if (stop_tick) { 3433 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) { 3434 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu) 3435 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0; 3436 return;
3296 3437
3297 /* 3438 /*
3298 * If we are going offline and still the leader, 3439 * If we are going offline and still the leader,
3299 * give up! 3440 * give up!
3300 */ 3441 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3442 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3443 nr_cpu_ids) != cpu)
3302 BUG(); 3444 BUG();
3303 3445
3304 return 0; 3446 return;
3305 } 3447 }
3306 3448
3307 cpumask_set_cpu(cpu, nohz.cpu_mask); 3449 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3308 3450
3309 /* time for ilb owner also to sleep */ 3451 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 3452 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3311 if (atomic_read(&nohz.load_balancer) == cpu) 3453 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3312 atomic_set(&nohz.load_balancer, -1); 3454 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3313 return 0;
3314 }
3315 3455
3316 if (atomic_read(&nohz.load_balancer) == -1) { 3456 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb; 3457 int new_ilb;
3322 3458
3323 if (!(sched_smt_power_savings || 3459 /* make me the ilb owner */
3324 sched_mc_power_savings)) 3460 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3325 return 1; 3461 cpu) != nr_cpu_ids)
3462 return;
3463
3326 /* 3464 /*
3327 * Check to see if there is a more power-efficient 3465 * Check to see if there is a more power-efficient
3328 * ilb. 3466 * ilb.
3329 */ 3467 */
3330 new_ilb = find_new_ilb(cpu); 3468 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 3469 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1); 3470 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3333 resched_cpu(new_ilb); 3471 resched_cpu(new_ilb);
3334 return 0; 3472 return;
3335 } 3473 }
3336 return 1; 3474 return;
3337 } 3475 }
3338 } else { 3476 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 3477 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3340 return 0; 3478 return;
3341 3479
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask); 3480 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3343 3481
3344 if (atomic_read(&nohz.load_balancer) == cpu) 3482 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3483 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3484 nr_cpu_ids) != cpu)
3346 BUG(); 3485 BUG();
3347 } 3486 }
3348 return 0; 3487 return;
3349} 3488}
3350#endif 3489#endif
3351 3490
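
Two things change together in the hunk above: select_nohz_load_balancer() now returns void, and the sentinel for "no ilb owner" moves from -1 to nr_cpu_ids, so every atomic_cmpxchg() claim and release uses nr_cpu_ids as the empty value. A user-space analogue of that claim/release handshake, using C11 atomics and NR_CPUS as a stand-in for nr_cpu_ids (a sketch, not the kernel's atomic_t API):

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 8                               /* stand-in for nr_cpu_ids */

static atomic_int load_balancer = NR_CPUS;      /* NR_CPUS == "no ilb owner" */

/* A CPU going idle tries to claim ilb ownership; returns 1 if it became owner. */
static int claim_ilb(int cpu)
{
        int none = NR_CPUS;

        return atomic_compare_exchange_strong(&load_balancer, &none, cpu);
}

/* A CPU going busy (or offline) gives up ownership if it currently holds it. */
static void release_ilb(int cpu)
{
        int me = cpu;

        atomic_compare_exchange_strong(&load_balancer, &me, NR_CPUS);
}

int main(void)
{
        printf("cpu2 claims: %d\n", claim_ilb(2));      /* 1: it is now the owner  */
        printf("cpu5 claims: %d\n", claim_ilb(5));      /* 0: already owned by cpu2 */
        release_ilb(2);
        printf("cpu5 claims: %d\n", claim_ilb(5));      /* 1: slot was free again  */
        return 0;
}
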
@@ -3427,11 +3566,102 @@ out:
3427 rq->next_balance = next_balance; 3566 rq->next_balance = next_balance;
3428} 3567}
3429 3568
3569#ifdef CONFIG_NO_HZ
3430/* 3570/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick. 3571 * In CONFIG_NO_HZ case, the idle balance kickee will do the
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3572 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */ 3573 */
3574static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3575{
3576 struct rq *this_rq = cpu_rq(this_cpu);
3577 struct rq *rq;
3578 int balance_cpu;
3579
3580 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3581 return;
3582
3583 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3584 if (balance_cpu == this_cpu)
3585 continue;
3586
3587 /*
3588 * If this cpu gets work to do, stop the load balancing
3589 * work being done for other cpus. Next load
3590 * balancing owner will pick it up.
3591 */
3592 if (need_resched()) {
3593 this_rq->nohz_balance_kick = 0;
3594 break;
3595 }
3596
3597 raw_spin_lock_irq(&this_rq->lock);
3598 update_rq_clock(this_rq);
3599 update_cpu_load(this_rq);
3600 raw_spin_unlock_irq(&this_rq->lock);
3601
3602 rebalance_domains(balance_cpu, CPU_IDLE);
3603
3604 rq = cpu_rq(balance_cpu);
3605 if (time_after(this_rq->next_balance, rq->next_balance))
3606 this_rq->next_balance = rq->next_balance;
3607 }
3608 nohz.next_balance = this_rq->next_balance;
3609 this_rq->nohz_balance_kick = 0;
3610}
3611
3612/*
3613 * Current heuristic for kicking the idle load balancer
3614 * - first_pick_cpu is one of the busy CPUs. It will kick
3615 * idle load balancer when it has more than one process active. This
3616 * eliminates the need for idle load balancing altogether when we have
3617 * only one running process in the system (common case).
3618 * - If there is more than one busy CPU, idle load balancer may have
3619 * to run for active_load_balance to happen (i.e., two busy CPUs are
3620 * SMT or core siblings and can run better if they move to different
3621 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
3622 * which will kick idle load balancer as soon as it has any load.
3623 */
3624static inline int nohz_kick_needed(struct rq *rq, int cpu)
3625{
3626 unsigned long now = jiffies;
3627 int ret;
3628 int first_pick_cpu, second_pick_cpu;
3629
3630 if (time_before(now, nohz.next_balance))
3631 return 0;
3632
3633 if (rq->idle_at_tick)
3634 return 0;
3635
3636 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3637 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3638
3639 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3640 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3641 return 0;
3642
3643 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3644 if (ret == nr_cpu_ids || ret == cpu) {
3645 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3646 if (rq->nr_running > 1)
3647 return 1;
3648 } else {
3649 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3650 if (ret == nr_cpu_ids || ret == cpu) {
3651 if (rq->nr_running)
3652 return 1;
3653 }
3654 }
3655 return 0;
3656}
3657#else
3658static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3659#endif
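
The nohz_kick_needed() heuristic above amounts to two compare-and-swap "slots": a busy CPU first tries to become (or confirm it already is) first_pick_cpu and kicks only when it has more than one runnable task; if another CPU already holds that slot, it tries second_pick_cpu and kicks as soon as it has any load. A condensed user-space model of that decision, with NR_CPUS standing in for nr_cpu_ids (illustrative only; the time/idle checks and early-exit filter of the real code are omitted):

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 8       /* stand-in for nr_cpu_ids ("no CPU") */

static atomic_int first_pick = NR_CPUS;
static atomic_int second_pick = NR_CPUS;

/* Returns 1 if @cpu with @nr_running runnable tasks should kick the idle balancer. */
static int kick_needed(int cpu, int nr_running)
{
        int expected = NR_CPUS;

        /* Try to claim (or confirm we already hold) the first-pick slot. */
        if (atomic_compare_exchange_strong(&first_pick, &expected, cpu) ||
            expected == cpu) {
                /* First busy CPU: only worth kicking with more than one task. */
                int me = cpu;
                atomic_compare_exchange_strong(&second_pick, &me, NR_CPUS);
                return nr_running > 1;
        }

        /* Somebody else is first pick; try to become the second busy CPU. */
        expected = NR_CPUS;
        if (atomic_compare_exchange_strong(&second_pick, &expected, cpu) ||
            expected == cpu)
                return nr_running > 0;

        return 0;
}

int main(void)
{
        printf("%d\n", kick_needed(0, 1));      /* first pick, single task -> 0 */
        printf("%d\n", kick_needed(0, 3));      /* first pick, 3 tasks     -> 1 */
        printf("%d\n", kick_needed(1, 1));      /* second pick, any load   -> 1 */
        printf("%d\n", kick_needed(2, 5));      /* both slots taken        -> 0 */
        return 0;
}
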
3660
3661/*
3662 * run_rebalance_domains is triggered when needed from the scheduler tick.
3663 * Also triggered for nohz idle balancing (with nohz_balance_kick set).
3664 */
3435static void run_rebalance_domains(struct softirq_action *h) 3665static void run_rebalance_domains(struct softirq_action *h)
3436{ 3666{
3437 int this_cpu = smp_processor_id(); 3667 int this_cpu = smp_processor_id();
@@ -3441,37 +3671,12 @@ static void run_rebalance_domains(struct softirq_action *h)
3441 3671
3442 rebalance_domains(this_cpu, idle); 3672 rebalance_domains(this_cpu, idle);
3443 3673
3444#ifdef CONFIG_NO_HZ
3445 /* 3674 /*
3446 * If this cpu is the owner for idle load balancing, then do the 3675 * If this cpu has a pending nohz_balance_kick, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are 3676 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped. 3677 * stopped.
3449 */ 3678 */
3450 if (this_rq->idle_at_tick && 3679 nohz_idle_balance(this_cpu, idle);
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475} 3680}
3476 3681
3477static inline int on_null_domain(int cpu) 3682static inline int on_null_domain(int cpu)
@@ -3481,57 +3686,17 @@ static inline int on_null_domain(int cpu)
3481 3686
3482/* 3687/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3688 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */ 3689 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu) 3690static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{ 3691{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then no need raise the SCHED_SOFTIRQ
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */ 3692 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) && 3693 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu))) 3694 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ); 3695 raise_softirq(SCHED_SOFTIRQ);
3696#ifdef CONFIG_NO_HZ
3697 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3698 nohz_balancer_kick(cpu);
3699#endif
3535} 3700}
3536 3701
3537static void rq_online_fair(struct rq *rq) 3702static void rq_online_fair(struct rq *rq)
@@ -3584,6 +3749,8 @@ static void task_fork_fair(struct task_struct *p)
3584 3749
3585 raw_spin_lock_irqsave(&rq->lock, flags); 3750 raw_spin_lock_irqsave(&rq->lock, flags);
3586 3751
3752 update_rq_clock(rq);
3753
3587 if (unlikely(task_cpu(p) != this_cpu)) 3754 if (unlikely(task_cpu(p) != this_cpu))
3588 __set_task_cpu(p, this_cpu); 3755 __set_task_cpu(p, this_cpu);
3589 3756
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
84 * wakeup-preemption), since it's likely going to consume data we 29 * wakeup-preemption), since it's likely going to consume data we
85 * touched, increases cache locality. 30 * touched, increases cache locality.
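
For context on how sched_features.h is consumed: each SCHED_FEAT(name, enabled) line is an X-macro entry, and the scheduler includes this header more than once with different definitions of SCHED_FEAT (once to build an enum of feature bits, once to fold the defaults into a feature bitmask). A stand-alone sketch of the same technique, using a list macro instead of a re-included header and made-up feature names:

#include <stdio.h>

/*
 * Stand-alone sketch of the X-macro technique behind sched_features.h
 * (the kernel re-includes the header instead of using a list macro).
 */
#define FEATURE_LIST(F)                 \
        F(GENTLE_SLEEPERS, 1)           \
        F(START_DEBIT,     1)           \
        F(EXPERIMENTAL,    0)

/* Pass 1: one enum constant per feature, giving each a bit index. */
#define MAKE_ENUM(name, enabled)        FEAT_##name,
enum { FEATURE_LIST(MAKE_ENUM) NR_FEATURES };

/* Pass 2: fold the "enabled" defaults into a single bitmask. */
#define MAKE_MASK(name, enabled)        ((enabled) << FEAT_##name) |
static const unsigned int default_features = FEATURE_LIST(MAKE_MASK) 0;

#define feat_enabled(name)      (!!(default_features & (1U << FEAT_##name)))

int main(void)
{
        printf("GENTLE_SLEEPERS: %d\n", feat_enabled(GENTLE_SLEEPERS)); /* 1 */
        printf("EXPERIMENTAL:    %d\n", feat_enabled(EXPERIMENTAL));    /* 0 */
        return 0;
}
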
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
11 return task_cpu(p); /* IDLE tasks as never migrated */ 12 return task_cpu(p); /* IDLE tasks as never migrated */
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c2fbb02c1b54..e40e7fe43170 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
616 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
617 617
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
888 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
889 */ 889 */
890static void 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) 891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
892{ 892{
893 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
894 894
895 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
896 rt_se->timeout = 0; 896 rt_se->timeout = 0;
897 897
898 enqueue_rt_entity(rt_se, head); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
899 899
900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
901 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
902} 902}
903 903
904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
905{ 905{
906 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
907 907
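
The class-hook changes above replace separate boolean parameters (wakeup, head) with a single flags word tested against ENQUEUE_WAKEUP and ENQUEUE_HEAD. The shape of that refactor on a toy interface (hypothetical names, not the scheduler's):

#include <stdio.h>

#define ENQ_WAKEUP      0x01    /* task is being woken, reset its timeout */
#define ENQ_HEAD        0x02    /* queue at the head instead of the tail  */

/* Old style: one bool parameter per option; adding an option touches every caller. */
static void enqueue_old(int wakeup, int head)
{
        printf("old: wakeup=%d head=%d\n", wakeup, head);
}

/* New style: a single flags word; a new option only adds a bit. */
static void enqueue_new(unsigned int flags)
{
        if (flags & ENQ_WAKEUP)
                printf("new: reset timeout\n");
        printf("new: insert at %s\n", (flags & ENQ_HEAD) ? "head" : "tail");
}

int main(void)
{
        enqueue_old(1, 0);
        enqueue_new(ENQ_WAKEUP);                /* wakeup, tail insert */
        enqueue_new(ENQ_WAKEUP | ENQ_HEAD);     /* wakeup, head insert */
        return 0;
}
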
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
948#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
949static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
950 950
951static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
952{ 953{
953 struct rq *rq = task_rq(p);
954
955 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
956 return smp_processor_id(); 955 return smp_processor_id();
957 956
@@ -1664,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1664{ 1663{
1665 unsigned long soft, hard; 1664 unsigned long soft, hard;
1666 1665
1667 if (!p->signal)
1668 return;
1669
1670 /* max may change after cur was read, this will be fixed next tick */ 1666 /* max may change after cur was read, this will be fixed next tick */
1671 soft = task_rlimit(p, RLIMIT_RTTIME); 1667 soft = task_rlimit(p, RLIMIT_RTTIME);
1672 hard = task_rlimit_max(p, RLIMIT_RTTIME); 1668 hard = task_rlimit_max(p, RLIMIT_RTTIME);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
295static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime) 296 cputime_t cputime)
297{ 297{
298 struct thread_group_cputimer *cputimer; 298 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
299
300 /* tsk == current, ensure it is safe to use ->signal */
301 if (unlikely(tsk->exit_state))
302 return;
303
304 cputimer = &tsk->signal->cputimer;
305 299
306 if (!cputimer->running) 300 if (!cputimer->running)
307 return; 301 return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 319static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 320 cputime_t cputime)
327{ 321{
328 struct thread_group_cputimer *cputimer; 322 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
329
330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state))
332 return;
333
334 cputimer = &tsk->signal->cputimer;
335 323
336 if (!cputimer->running) 324 if (!cputimer->running)
337 return; 325 return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
355static inline void account_group_exec_runtime(struct task_struct *tsk, 343static inline void account_group_exec_runtime(struct task_struct *tsk,
356 unsigned long long ns) 344 unsigned long long ns)
357{ 345{
358 struct thread_group_cputimer *cputimer; 346 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
359 struct signal_struct *sig;
360
361 sig = tsk->signal;
362 /* see __exit_signal()->task_rq_unlock_wait() */
363 barrier();
364 if (unlikely(!sig))
365 return;
366
367 cputimer = &sig->cputimer;
368 347
369 if (!cputimer->running) 348 if (!cputimer->running)
370 return; 349 return;
diff --git a/kernel/signal.c b/kernel/signal.c
index dbd7fe073c55..919562c3d6b7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,12 +637,12 @@ static inline bool si_fromuser(const struct siginfo *info)
637 637
638/* 638/*
639 * Bad permissions for sending the signal 639 * Bad permissions for sending the signal
640 * - the caller must hold at least the RCU read lock 640 * - the caller must hold the RCU read lock
641 */ 641 */
642static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 643 struct task_struct *t)
644{ 644{
645 const struct cred *cred = current_cred(), *tcred; 645 const struct cred *cred, *tcred;
646 struct pid *sid; 646 struct pid *sid;
647 int error; 647 int error;
648 648
@@ -656,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 656 if (error)
657 return error; 657 return error;
658 658
659 cred = current_cred();
659 tcred = __task_cred(t); 660 tcred = __task_cred(t);
660 if ((cred->euid ^ tcred->suid) && 661 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) &&
661 (cred->euid ^ tcred->uid) && 663 (cred->euid ^ tcred->uid) &&
662 (cred->uid ^ tcred->suid) && 664 (cred->uid ^ tcred->suid) &&
663 (cred->uid ^ tcred->uid) && 665 (cred->uid ^ tcred->uid) &&
@@ -1083,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1083/* 1085/*
1084 * Nuke all other threads in the group. 1086 * Nuke all other threads in the group.
1085 */ 1087 */
1086void zap_other_threads(struct task_struct *p) 1088int zap_other_threads(struct task_struct *p)
1087{ 1089{
1088 struct task_struct *t; 1090 struct task_struct *t = p;
1091 int count = 0;
1089 1092
1090 p->signal->group_stop_count = 0; 1093 p->signal->group_stop_count = 0;
1091 1094
1092 for (t = next_thread(p); t != p; t = next_thread(t)) { 1095 while_each_thread(p, t) {
1093 /* 1096 count++;
1094 * Don't bother with already dead threads 1097
1095 */ 1098 /* Don't bother with already dead threads */
1096 if (t->exit_state) 1099 if (t->exit_state)
1097 continue; 1100 continue;
1098
1099 /* SIGKILL will be handled before any pending SIGSTOP */
1100 sigaddset(&t->pending.signal, SIGKILL); 1101 sigaddset(&t->pending.signal, SIGKILL);
1101 signal_wake_up(t, 1); 1102 signal_wake_up(t, 1);
1102 } 1103 }
1104
1105 return count;
1103} 1106}
1104 1107
1105struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
@@ -1124,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1124 1127
1125/* 1128/*
1126 * send signal info to all the members of a group 1129 * send signal info to all the members of a group
1127 * - the caller must hold the RCU read lock at least
1128 */ 1130 */
1129int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1131int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1130{ 1132{
1131 int ret = check_kill_permission(sig, info, p); 1133 int ret;
1134
1135 rcu_read_lock();
1136 ret = check_kill_permission(sig, info, p);
1137 rcu_read_unlock();
1132 1138
1133 if (!ret && sig) 1139 if (!ret && sig)
1134 ret = do_send_sig_info(sig, info, p, true); 1140 ret = do_send_sig_info(sig, info, p, true);
@@ -2209,6 +2215,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2209#ifdef __ARCH_SI_TRAPNO 2215#ifdef __ARCH_SI_TRAPNO
2210 err |= __put_user(from->si_trapno, &to->si_trapno); 2216 err |= __put_user(from->si_trapno, &to->si_trapno);
2211#endif 2217#endif
2218#ifdef BUS_MCEERR_AO
2219 /*
2220 * Other callers might not initialize the si_addr_lsb field,
2221 * so check explicitly for the right codes here.
2222 */
2223 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2224 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2225#endif
2212 break; 2226 break;
2213 case __SI_CHLD: 2227 case __SI_CHLD:
2214 err |= __put_user(from->si_pid, &to->si_pid); 2228 err |= __put_user(from->si_pid, &to->si_pid);
@@ -2735,3 +2749,43 @@ void __init signals_init(void)
2735{ 2749{
2736 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); 2750 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
2737} 2751}
2752
2753#ifdef CONFIG_KGDB_KDB
2754#include <linux/kdb.h>
2755/*
2756 * kdb_send_sig_info - Allows kdb to send signals without exposing
2757 * signal internals. This function checks if the required locks are
2758 * available before calling the main signal code, to avoid kdb
2759 * deadlocks.
2760 */
2761void
2762kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
2763{
2764 static struct task_struct *kdb_prev_t;
2765 int sig, new_t;
2766 if (!spin_trylock(&t->sighand->siglock)) {
2767 kdb_printf("Can't do kill command now.\n"
2768 "The sigmask lock is held somewhere else in "
2769 "kernel, try again later\n");
2770 return;
2771 }
2772 spin_unlock(&t->sighand->siglock);
2773 new_t = kdb_prev_t != t;
2774 kdb_prev_t = t;
2775 if (t->state != TASK_RUNNING && new_t) {
2776 kdb_printf("Process is not RUNNING, sending a signal from "
2777 "kdb risks deadlock\n"
2778 "on the run queue locks. "
2779 "The signal has _not_ been sent.\n"
2780 "Reissue the kill command if you want to risk "
2781 "the deadlock.\n");
2782 return;
2783 }
2784 sig = info->si_signo;
2785 if (send_sig_info(sig, info, t))
2786 kdb_printf("Fail to deliver Signal %d to process %d.\n",
2787 sig, t->pid);
2788 else
2789 kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
2790}
2791#endif /* CONFIG_KGDB_KDB */
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
deleted file mode 100644
index e45c43645298..000000000000
--- a/kernel/slow-work-debugfs.c
+++ /dev/null
@@ -1,227 +0,0 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
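
The deleted file above implements a full seq_file iterator (start/next/stop/show) so the slow-work run queue can be walked entry by entry under slow_work_queue_lock. When a debugfs file's whole output is small, the usual shortcut is single_open(), which only needs a show() callback; a minimal sketch of that simpler variant (hypothetical names, kernel-module context assumed):

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

/* Emit the whole file in one go; no iterator needed for small output. */
static int example_show(struct seq_file *m, void *v)
{
        seq_printf(m, "threads: %d\n", 4);      /* placeholder value */
        return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
        .owner          = THIS_MODULE,
        .open           = example_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static struct dentry *example_dentry;

static int __init example_init(void)
{
        /* Create /sys/kernel/debug/slow_work_example (sketch; no error class checks). */
        example_dentry = debugfs_create_file("slow_work_example", 0444,
                                             NULL, NULL, &example_fops);
        return example_dentry ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
        debugfs_remove(example_dentry);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
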
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
deleted file mode 100644
index 7d3f4fa9ef4f..000000000000
--- a/kernel/slow-work.c
+++ /dev/null
@@ -1,1068 +0,0 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19#include <linux/debugfs.h>
20#include "slow-work.h"
21
22static void slow_work_cull_timeout(unsigned long);
23static void slow_work_oom_timeout(unsigned long);
24
25#ifdef CONFIG_SYSCTL
26static int slow_work_min_threads_sysctl(struct ctl_table *, int,
27 void __user *, size_t *, loff_t *);
28
29static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
30 void __user *, size_t *, loff_t *);
31#endif
32
33/*
34 * The pool of threads has at least min threads in it as long as someone is
35 * using the facility, and may have as many as max.
36 *
37 * A portion of the pool may be processing very slow operations.
38 */
39static unsigned slow_work_min_threads = 2;
40static unsigned slow_work_max_threads = 4;
41static unsigned vslow_work_proportion = 50; /* % of threads that may process
42 * very slow work */
43
44#ifdef CONFIG_SYSCTL
45static const int slow_work_min_min_threads = 2;
46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
47static const int slow_work_min_vslow = 1;
48static const int slow_work_max_vslow = 99;
49
50ctl_table slow_work_sysctls[] = {
51 {
52 .procname = "min-threads",
53 .data = &slow_work_min_threads,
54 .maxlen = sizeof(unsigned),
55 .mode = 0644,
56 .proc_handler = slow_work_min_threads_sysctl,
57 .extra1 = (void *) &slow_work_min_min_threads,
58 .extra2 = &slow_work_max_threads,
59 },
60 {
61 .procname = "max-threads",
62 .data = &slow_work_max_threads,
63 .maxlen = sizeof(unsigned),
64 .mode = 0644,
65 .proc_handler = slow_work_max_threads_sysctl,
66 .extra1 = &slow_work_min_threads,
67 .extra2 = (void *) &slow_work_max_max_threads,
68 },
69 {
70 .procname = "vslow-percentage",
71 .data = &vslow_work_proportion,
72 .maxlen = sizeof(unsigned),
73 .mode = 0644,
74 .proc_handler = proc_dointvec_minmax,
75 .extra1 = (void *) &slow_work_min_vslow,
76 .extra2 = (void *) &slow_work_max_vslow,
77 },
78 {}
79};
80#endif
81
82/*
83 * The active state of the thread pool
84 */
85static atomic_t slow_work_thread_count;
86static atomic_t vslow_work_executing_count;
87
88static bool slow_work_may_not_start_new_thread;
89static bool slow_work_cull; /* cull a thread due to lack of activity */
90static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
91static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
92static struct slow_work slow_work_new_thread; /* new thread starter */
93
94/*
95 * slow work ID allocation (use slow_work_queue_lock)
96 */
97static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
98
99/*
100 * Unregistration tracking to prevent put_ref() from disappearing during module
101 * unload
102 */
103#ifdef CONFIG_MODULES
104static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
105static struct module *slow_work_unreg_module;
106static struct slow_work *slow_work_unreg_work_item;
107static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
108static DEFINE_MUTEX(slow_work_unreg_sync_lock);
109
110static void slow_work_set_thread_processing(int id, struct slow_work *work)
111{
112 if (work)
113 slow_work_thread_processing[id] = work->owner;
114}
115static void slow_work_done_thread_processing(int id, struct slow_work *work)
116{
117 struct module *module = slow_work_thread_processing[id];
118
119 slow_work_thread_processing[id] = NULL;
120 smp_mb();
121 if (slow_work_unreg_work_item == work ||
122 slow_work_unreg_module == module)
123 wake_up_all(&slow_work_unreg_wq);
124}
125static void slow_work_clear_thread_processing(int id)
126{
127 slow_work_thread_processing[id] = NULL;
128}
129#else
130static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
131static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
132static void slow_work_clear_thread_processing(int id) {}
133#endif
134
135/*
136 * Data for tracking currently executing items for indication through /proc
137 */
138#ifdef CONFIG_SLOW_WORK_DEBUG
139struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
140pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
141DEFINE_RWLOCK(slow_work_execs_lock);
142#endif
143
144/*
145 * The queues of work items and the lock governing access to them. These are
146 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
147 * as the number of threads bears no relation to the number of CPUs.
148 *
149 * There are two queues of work items: one for slow work items, and one for
150 * very slow work items.
151 */
152LIST_HEAD(slow_work_queue);
153LIST_HEAD(vslow_work_queue);
154DEFINE_SPINLOCK(slow_work_queue_lock);
155
156/*
157 * The following are two wait queues that get pinged when a work item is placed
158 * on an empty queue. These allow work items that are hogging a thread by
159 * sleeping in a way that could be deferred to yield their thread and enqueue
160 * themselves.
161 */
162static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
163static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
164
165/*
166 * The thread controls. A variable used to signal to the threads that they
167 * should exit when the queue is empty, a waitqueue used by the threads to wait
168 * for signals, and a completion set by the last thread to exit.
169 */
170static bool slow_work_threads_should_exit;
171static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
172static DECLARE_COMPLETION(slow_work_last_thread_exited);
173
174/*
175 * The number of users of the thread pool and its lock. Whilst this is zero we
176 * have no threads hanging around, and when this reaches zero, we wait for all
177 * active or queued work items to complete and kill all the threads we do have.
178 */
179static int slow_work_user_count;
180static DEFINE_MUTEX(slow_work_user_lock);
181
182static inline int slow_work_get_ref(struct slow_work *work)
183{
184 if (work->ops->get_ref)
185 return work->ops->get_ref(work);
186
187 return 0;
188}
189
190static inline void slow_work_put_ref(struct slow_work *work)
191{
192 if (work->ops->put_ref)
193 work->ops->put_ref(work);
194}
195
196/*
197 * Calculate the maximum number of active threads in the pool that are
198 * permitted to process very slow work items.
199 *
200 * The answer is rounded up to at least 1, but may not equal or exceed the
201 * maximum number of the threads in the pool. This means we always have at
202 * least one thread that can process slow work items, and we always have at
203 * least one thread that won't get tied up doing so.
204 */
205static unsigned slow_work_calc_vsmax(void)
206{
207 unsigned vsmax;
208
209 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
210 vsmax /= 100;
211 vsmax = max(vsmax, 1U);
212 return min(vsmax, slow_work_max_threads - 1);
213}
214
215/*
216 * Attempt to execute stuff queued on a slow thread. Return true if we managed
217 * it, false if there was nothing to do.
218 */
219static noinline bool slow_work_execute(int id)
220{
221 struct slow_work *work = NULL;
222 unsigned vsmax;
223 bool very_slow;
224
225 vsmax = slow_work_calc_vsmax();
226
227 /* see if we can schedule a new thread to be started if we're not
228 * keeping up with the work */
229 if (!waitqueue_active(&slow_work_thread_wq) &&
230 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
231 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
232 !slow_work_may_not_start_new_thread)
233 slow_work_enqueue(&slow_work_new_thread);
234
235 /* find something to execute */
236 spin_lock_irq(&slow_work_queue_lock);
237 if (!list_empty(&vslow_work_queue) &&
238 atomic_read(&vslow_work_executing_count) < vsmax) {
239 work = list_entry(vslow_work_queue.next,
240 struct slow_work, link);
241 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
242 BUG();
243 list_del_init(&work->link);
244 atomic_inc(&vslow_work_executing_count);
245 very_slow = true;
246 } else if (!list_empty(&slow_work_queue)) {
247 work = list_entry(slow_work_queue.next,
248 struct slow_work, link);
249 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
250 BUG();
251 list_del_init(&work->link);
252 very_slow = false;
253 } else {
254 very_slow = false; /* avoid the compiler warning */
255 }
256
257 slow_work_set_thread_processing(id, work);
258 if (work) {
259 slow_work_mark_time(work);
260 slow_work_begin_exec(id, work);
261 }
262
263 spin_unlock_irq(&slow_work_queue_lock);
264
265 if (!work)
266 return false;
267
268 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
269 BUG();
270
271 /* don't execute if the work is in the process of being cancelled */
272 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
273 work->ops->execute(work);
274
275 if (very_slow)
276 atomic_dec(&vslow_work_executing_count);
277 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
278
279 /* wake up anyone waiting for this work to be complete */
280 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
281
282 slow_work_end_exec(id, work);
283
284 /* if someone tried to enqueue the item whilst we were executing it,
285 * then it'll be left unenqueued to avoid multiple threads trying to
286 * execute it simultaneously
287 *
288 * there is, however, a race between us testing the pending flag and
289 * getting the spinlock, and between the enqueuer setting the pending
290 * flag and getting the spinlock, so we use a deferral bit to tell us
291 * if the enqueuer got there first
292 */
293 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
294 spin_lock_irq(&slow_work_queue_lock);
295
296 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
297 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
298 goto auto_requeue;
299
300 spin_unlock_irq(&slow_work_queue_lock);
301 }
302
303 /* sort out the race between module unloading and put_ref() */
304 slow_work_put_ref(work);
305 slow_work_done_thread_processing(id, work);
306
307 return true;
308
309auto_requeue:
310 /* we must complete the enqueue operation
311 * - we transfer our ref on the item back to the appropriate queue
312 * - don't wake another thread up as we're awake already
313 */
314 slow_work_mark_time(work);
315 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
316 list_add_tail(&work->link, &vslow_work_queue);
317 else
318 list_add_tail(&work->link, &slow_work_queue);
319 spin_unlock_irq(&slow_work_queue_lock);
320 slow_work_clear_thread_processing(id);
321 return true;
322}
323
324/**
325 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
326 * work: The work item under execution that wants to sleep
327 * _timeout: Scheduler sleep timeout
328 *
329 * Allow a requeueable work item to sleep on a slow-work processor thread until
330 * that thread is needed to do some other work or the sleep is interrupted by
331 * some other event.
332 *
333 * The caller must set up a wake up event before calling this and must have set
334 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
335 * condition before calling this function as no test is made here.
336 *
337 * False is returned if there is nothing on the queue; true is returned if the
338 * work item should be requeued
339 */
340bool slow_work_sleep_till_thread_needed(struct slow_work *work,
341 signed long *_timeout)
342{
343 wait_queue_head_t *wfo_wq;
344 struct list_head *queue;
345
346 DEFINE_WAIT(wait);
347
348 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
349 wfo_wq = &vslow_work_queue_waits_for_occupation;
350 queue = &vslow_work_queue;
351 } else {
352 wfo_wq = &slow_work_queue_waits_for_occupation;
353 queue = &slow_work_queue;
354 }
355
356 if (!list_empty(queue))
357 return true;
358
359 add_wait_queue_exclusive(wfo_wq, &wait);
360 if (list_empty(queue))
361 *_timeout = schedule_timeout(*_timeout);
362 finish_wait(wfo_wq, &wait);
363
364 return !list_empty(queue);
365}
366EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
367
368/**
369 * slow_work_enqueue - Schedule a slow work item for processing
370 * @work: The work item to queue
371 *
372 * Schedule a slow work item for processing. If the item is already undergoing
373 * execution, this guarantees not to re-enter the execution routine until the
374 * first execution finishes.
375 *
376 * The item is pinned by this function as it retains a reference to it, managed
377 * through the item operations. The item is unpinned once it has been
378 * executed.
379 *
380 * An item may hog the thread that is running it for a relatively large amount
381 * of time, sufficient, for example, to perform several lookup, mkdir, create
382 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
383 *
384 * Conversely, if a number of items are awaiting processing, it may take some
385 * time before any given item is given attention. The number of threads in the
386 * pool may be increased to deal with demand, but only up to a limit.
387 *
388 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
389 * the very slow queue, from which only a portion of the threads will be
390 * allowed to pick items to execute. This ensures that very slow items won't
391 * overly block ones that are just ordinarily slow.
392 *
393 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if an attempt is
394 * made to queue cancelled work)
395 */
396int slow_work_enqueue(struct slow_work *work)
397{
398 wait_queue_head_t *wfo_wq;
399 struct list_head *queue;
400 unsigned long flags;
401 int ret;
402
403 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
404 return -ECANCELED;
405
406 BUG_ON(slow_work_user_count <= 0);
407 BUG_ON(!work);
408 BUG_ON(!work->ops);
409
410 /* when honouring an enqueue request, we only promise that we will run
411 * the work function in the future; we do not promise to run it once
412 * per enqueue request
413 *
414 * we use the PENDING bit to merge together repeat requests without
415 * having to disable IRQs and take the spinlock, whilst still
416 * maintaining our promise
417 */
418 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
419 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
420 wfo_wq = &vslow_work_queue_waits_for_occupation;
421 queue = &vslow_work_queue;
422 } else {
423 wfo_wq = &slow_work_queue_waits_for_occupation;
424 queue = &slow_work_queue;
425 }
426
427 spin_lock_irqsave(&slow_work_queue_lock, flags);
428
429 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
430 goto cancelled;
431
432 /* we promise that we will not attempt to execute the work
433 * function in more than one thread simultaneously
434 *
435 * this, however, leaves us with a problem if we're asked to
436 * enqueue the work whilst someone is executing the work
437 * function as simply queueing the work immediately means that
438 * another thread may try executing it whilst it is already
439 * under execution
440 *
441 * to deal with this, we set the ENQ_DEFERRED bit instead of
442 * enqueueing, and the thread currently executing the work
443 * function will enqueue the work item when the work function
444 * returns and it has cleared the EXECUTING bit
445 */
446 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
447 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
448 } else {
449 ret = slow_work_get_ref(work);
450 if (ret < 0)
451 goto failed;
452 slow_work_mark_time(work);
453 list_add_tail(&work->link, queue);
454 wake_up(&slow_work_thread_wq);
455
456 /* if someone who could be requeued is sleeping on a
457 * thread, then ask them to yield their thread */
458 if (work->link.prev == queue)
459 wake_up(wfo_wq);
460 }
461
462 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
463 }
464 return 0;
465
466cancelled:
467 ret = -ECANCELED;
468failed:
469 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
470 return ret;
471}
472EXPORT_SYMBOL(slow_work_enqueue);
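
The comments in slow_work_enqueue() describe a small bit protocol: PENDING merges repeated enqueue requests, EXECUTING marks an item currently being run, and ENQ_DEFERRED records an enqueue that raced with execution so the executing thread requeues the item itself when it finishes. A compressed user-space model of that hand-off using C11 atomic bit flags (illustrative only; the real code also takes slow_work_queue_lock and manages references):

#include <stdatomic.h>
#include <stdio.h>

#define WORK_PENDING    0x1     /* queued (or about to be); merges repeat enqueues */
#define WORK_EXECUTING  0x2     /* a worker is running the item right now          */
#define WORK_DEFERRED   0x4     /* enqueue arrived while executing; requeue later  */

struct work { atomic_uint flags; };

/* Caller side: request execution at some point in the future. */
static void enqueue(struct work *w)
{
        unsigned int old = atomic_fetch_or(&w->flags, WORK_PENDING);

        if (old & WORK_PENDING)
                return;                         /* already queued: requests merge */
        if (old & WORK_EXECUTING)
                atomic_fetch_or(&w->flags, WORK_DEFERRED); /* executor will requeue */
        else
                printf("placed on queue\n");    /* stand-in for list_add_tail()   */
}

/* Worker side: run the item, then honour any enqueue that raced with us. */
static void execute(struct work *w)
{
        atomic_fetch_or(&w->flags, WORK_EXECUTING);
        atomic_fetch_and(&w->flags, ~(unsigned int)WORK_PENDING);

        printf("executing item\n");             /* stand-in for work->ops->execute() */

        atomic_fetch_and(&w->flags, ~(unsigned int)WORK_EXECUTING);
        if (atomic_fetch_and(&w->flags, ~(unsigned int)WORK_DEFERRED) & WORK_DEFERRED)
                printf("requeued deferred item\n");
}

int main(void)
{
        struct work w = { 0 };

        enqueue(&w);    /* placed on queue   */
        enqueue(&w);    /* merged, no output */
        execute(&w);
        return 0;
}
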
473
474static int slow_work_wait(void *word)
475{
476 schedule();
477 return 0;
478}
479
480/**
481 * slow_work_cancel - Cancel a slow work item
482 * @work: The work item to cancel
483 *
484 * This function will cancel a previously enqueued work item. If we cannot
485 * cancel the work item, it is guaranteed to have run when this function
486 * returns.
487 */
488void slow_work_cancel(struct slow_work *work)
489{
490 bool wait = true, put = false;
491
492 set_bit(SLOW_WORK_CANCELLING, &work->flags);
493 smp_mb();
494
495 /* if the work item is a delayed work item with an active timer, we
496 * need to wait for the timer to finish _before_ getting the spinlock,
497 * lest we deadlock against the timer routine
498 *
499 * the timer routine will leave DELAYED set if it notices the
500 * CANCELLING flag in time
501 */
502 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
503 struct delayed_slow_work *dwork =
504 container_of(work, struct delayed_slow_work, work);
505 del_timer_sync(&dwork->timer);
506 }
507
508 spin_lock_irq(&slow_work_queue_lock);
509
510 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
511 /* the timer routine aborted or never happened, so we are left
512 * holding the timer's reference on the item and should just
513 * drop the pending flag and wait for any ongoing execution to
514 * finish */
515 struct delayed_slow_work *dwork =
516 container_of(work, struct delayed_slow_work, work);
517
518 BUG_ON(timer_pending(&dwork->timer));
519 BUG_ON(!list_empty(&work->link));
520
521 clear_bit(SLOW_WORK_DELAYED, &work->flags);
522 put = true;
523 clear_bit(SLOW_WORK_PENDING, &work->flags);
524
525 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
526 !list_empty(&work->link)) {
527 /* the link in the pending queue holds a reference on the item
528 * that we will need to release */
529 list_del_init(&work->link);
530 wait = false;
531 put = true;
532 clear_bit(SLOW_WORK_PENDING, &work->flags);
533
534 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
535 /* the executor is holding our only reference on the item, so
536 * we merely need to wait for it to finish executing */
537 clear_bit(SLOW_WORK_PENDING, &work->flags);
538 }
539
540 spin_unlock_irq(&slow_work_queue_lock);
541
542 /* the EXECUTING flag is set by the executor whilst the spinlock is set
543 * and before the item is dequeued - so assuming the above doesn't
544 * actually dequeue it, simply waiting for the EXECUTING flag to be
545 * released here should be sufficient */
546 if (wait)
547 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
548 TASK_UNINTERRUPTIBLE);
549
550 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
551 if (put)
552 slow_work_put_ref(work);
553}
554EXPORT_SYMBOL(slow_work_cancel);
555
556/*
557 * Handle expiry of the delay timer, indicating that a delayed slow work item
558 * should now be queued if not cancelled
559 */
560static void delayed_slow_work_timer(unsigned long data)
561{
562 wait_queue_head_t *wfo_wq;
563 struct list_head *queue;
564 struct slow_work *work = (struct slow_work *) data;
565 unsigned long flags;
566 bool queued = false, put = false, first = false;
567
568 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
569 wfo_wq = &vslow_work_queue_waits_for_occupation;
570 queue = &vslow_work_queue;
571 } else {
572 wfo_wq = &slow_work_queue_waits_for_occupation;
573 queue = &slow_work_queue;
574 }
575
576 spin_lock_irqsave(&slow_work_queue_lock, flags);
577 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
578 clear_bit(SLOW_WORK_DELAYED, &work->flags);
579
580 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
581 /* we discard the reference the timer was holding in
582 * favour of the one the executor holds */
583 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
584 put = true;
585 } else {
586 slow_work_mark_time(work);
587 list_add_tail(&work->link, queue);
588 queued = true;
589 if (work->link.prev == queue)
590 first = true;
591 }
592 }
593
594 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
595 if (put)
596 slow_work_put_ref(work);
597 if (first)
598 wake_up(wfo_wq);
599 if (queued)
600 wake_up(&slow_work_thread_wq);
601}
602
603/**
604 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
605 * @dwork: The delayed work item to queue
606 * @delay: When to start executing the work, in jiffies from now
607 *
608 * This is similar to slow_work_enqueue(), but it adds a delay before the work
609 * is actually queued for processing.
610 *
611 * The item can have delayed processing requested on it whilst it is being
612 * executed. The delay will begin immediately, and if it expires before the
613 * item finishes executing, the item will be placed back on the queue when it
614 * has done executing.
615 */
616int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
617 unsigned long delay)
618{
619 struct slow_work *work = &dwork->work;
620 unsigned long flags;
621 int ret;
622
623 if (delay == 0)
624 return slow_work_enqueue(&dwork->work);
625
626 BUG_ON(slow_work_user_count <= 0);
627 BUG_ON(!work);
628 BUG_ON(!work->ops);
629
630 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
631 return -ECANCELED;
632
633 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
634 spin_lock_irqsave(&slow_work_queue_lock, flags);
635
636 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
637 goto cancelled;
638
639 /* the timer holds a reference whilst it is pending */
640 ret = slow_work_get_ref(work);
641 if (ret < 0)
642 goto cant_get_ref;
643
644 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
645 BUG();
646 dwork->timer.expires = jiffies + delay;
647 dwork->timer.data = (unsigned long) work;
648 dwork->timer.function = delayed_slow_work_timer;
649 add_timer(&dwork->timer);
650
651 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
652 }
653
654 return 0;
655
656cancelled:
657 ret = -ECANCELED;
658cant_get_ref:
659 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
660 return ret;
661}
662EXPORT_SYMBOL(delayed_slow_work_enqueue);
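/*
 * Illustrative sketch, not part of the original file: one way a module
 * might drive the API documented above.  The names my_execute(), my_ops,
 * my_dwork and my_kick_example() are hypothetical, and
 * delayed_slow_work_init() is assumed to be the initialiser provided by
 * <linux/slow-work.h>; get_ref/put_ref are left unset, as the pool's own
 * new-thread item elsewhere in this file does.
 */
#if 0
static void my_execute(struct slow_work *work)
{
	/* long-running, sleepable work goes here */
}

static const struct slow_work_ops my_ops = {
	.owner   = THIS_MODULE,
	.execute = my_execute,
};

static struct delayed_slow_work my_dwork;

static int my_kick_example(void)
{
	delayed_slow_work_init(&my_dwork, &my_ops);
	return delayed_slow_work_enqueue(&my_dwork, HZ);	/* roughly one second */
}
#endif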
663
664/*
665 * Schedule a cull of the thread pool at some time in the near future
666 */
667static void slow_work_schedule_cull(void)
668{
669 mod_timer(&slow_work_cull_timer,
670 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
671}
672
673/*
674 * Worker thread culling algorithm
675 */
676static bool slow_work_cull_thread(void)
677{
678 unsigned long flags;
679 bool do_cull = false;
680
681 spin_lock_irqsave(&slow_work_queue_lock, flags);
682
683 if (slow_work_cull) {
684 slow_work_cull = false;
685
686 if (list_empty(&slow_work_queue) &&
687 list_empty(&vslow_work_queue) &&
688 atomic_read(&slow_work_thread_count) >
689 slow_work_min_threads) {
690 slow_work_schedule_cull();
691 do_cull = true;
692 }
693 }
694
695 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
696 return do_cull;
697}
698
699/*
700 * Determine if there is slow work available for dispatch
701 */
702static inline bool slow_work_available(int vsmax)
703{
704 return !list_empty(&slow_work_queue) ||
705 (!list_empty(&vslow_work_queue) &&
706 atomic_read(&vslow_work_executing_count) < vsmax);
707}
708
709/*
710 * Worker thread dispatcher
711 */
712static int slow_work_thread(void *_data)
713{
714 int vsmax, id;
715
716 DEFINE_WAIT(wait);
717
718 set_freezable();
719 set_user_nice(current, -5);
720
721 /* allocate ourselves an ID */
722 spin_lock_irq(&slow_work_queue_lock);
723 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
724 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
725 __set_bit(id, slow_work_ids);
726 slow_work_set_thread_pid(id, current->pid);
727 spin_unlock_irq(&slow_work_queue_lock);
728
729 sprintf(current->comm, "kslowd%03u", id);
730
731 for (;;) {
732 vsmax = vslow_work_proportion;
733 vsmax *= atomic_read(&slow_work_thread_count);
734 vsmax /= 100;
735
736 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
737 TASK_INTERRUPTIBLE);
738 if (!freezing(current) &&
739 !slow_work_threads_should_exit &&
740 !slow_work_available(vsmax) &&
741 !slow_work_cull)
742 schedule();
743 finish_wait(&slow_work_thread_wq, &wait);
744
745 try_to_freeze();
746
747 vsmax = vslow_work_proportion;
748 vsmax *= atomic_read(&slow_work_thread_count);
749 vsmax /= 100;
750
751 if (slow_work_available(vsmax) && slow_work_execute(id)) {
752 cond_resched();
753 if (list_empty(&slow_work_queue) &&
754 list_empty(&vslow_work_queue) &&
755 atomic_read(&slow_work_thread_count) >
756 slow_work_min_threads)
757 slow_work_schedule_cull();
758 continue;
759 }
760
761 if (slow_work_threads_should_exit)
762 break;
763
764 if (slow_work_cull && slow_work_cull_thread())
765 break;
766 }
767
768 spin_lock_irq(&slow_work_queue_lock);
769 slow_work_set_thread_pid(id, 0);
770 __clear_bit(id, slow_work_ids);
771 spin_unlock_irq(&slow_work_queue_lock);
772
773 if (atomic_dec_and_test(&slow_work_thread_count))
774 complete_and_exit(&slow_work_last_thread_exited, 0);
775 return 0;
776}
777
778/*
779 * Handle thread cull timer expiration
780 */
781static void slow_work_cull_timeout(unsigned long data)
782{
783 slow_work_cull = true;
784 wake_up(&slow_work_thread_wq);
785}
786
787/*
788 * Start a new slow work thread
789 */
790static void slow_work_new_thread_execute(struct slow_work *work)
791{
792 struct task_struct *p;
793
794 if (slow_work_threads_should_exit)
795 return;
796
797 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
798 return;
799
800 if (!mutex_trylock(&slow_work_user_lock))
801 return;
802
803 slow_work_may_not_start_new_thread = true;
804 atomic_inc(&slow_work_thread_count);
805 p = kthread_run(slow_work_thread, NULL, "kslowd");
806 if (IS_ERR(p)) {
807 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
808 if (atomic_dec_and_test(&slow_work_thread_count))
809 BUG(); /* we're running on a slow work thread... */
810 mod_timer(&slow_work_oom_timer,
811 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
812 } else {
813 /* ratelimit the starting of new threads */
814 mod_timer(&slow_work_oom_timer, jiffies + 1);
815 }
816
817 mutex_unlock(&slow_work_user_lock);
818}
819
820static const struct slow_work_ops slow_work_new_thread_ops = {
821 .owner = THIS_MODULE,
822 .execute = slow_work_new_thread_execute,
823#ifdef CONFIG_SLOW_WORK_DEBUG
824 .desc = slow_work_new_thread_desc,
825#endif
826};
827
828/*
829 * post-OOM new thread start suppression expiration
830 */
831static void slow_work_oom_timeout(unsigned long data)
832{
833 slow_work_may_not_start_new_thread = false;
834}
835
836#ifdef CONFIG_SYSCTL
837/*
838 * Handle adjustment of the minimum number of threads
839 */
840static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
841 void __user *buffer,
842 size_t *lenp, loff_t *ppos)
843{
844 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
845 int n;
846
847 if (ret == 0) {
848 mutex_lock(&slow_work_user_lock);
849 if (slow_work_user_count > 0) {
850 /* see if we need to start or stop threads */
851 n = atomic_read(&slow_work_thread_count) -
852 slow_work_min_threads;
853
854 if (n < 0 && !slow_work_may_not_start_new_thread)
855 slow_work_enqueue(&slow_work_new_thread);
856 else if (n > 0)
857 slow_work_schedule_cull();
858 }
859 mutex_unlock(&slow_work_user_lock);
860 }
861
862 return ret;
863}
864
865/*
866 * Handle adjustment of the maximum number of threads
867 */
868static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
869 void __user *buffer,
870 size_t *lenp, loff_t *ppos)
871{
872 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
873 int n;
874
875 if (ret == 0) {
876 mutex_lock(&slow_work_user_lock);
877 if (slow_work_user_count > 0) {
878 /* see if we need to stop threads */
879 n = slow_work_max_threads -
880 atomic_read(&slow_work_thread_count);
881
882 if (n < 0)
883 slow_work_schedule_cull();
884 }
885 mutex_unlock(&slow_work_user_lock);
886 }
887
888 return ret;
889}
890#endif /* CONFIG_SYSCTL */
891
892/**
893 * slow_work_register_user - Register a user of the facility
894 * @module: The module about to make use of the facility
895 *
896 * Register a user of the facility, starting up the initial threads if there
897 * aren't any other users at this point. This will return 0 if successful, or
898 * an error if not.
899 */
900int slow_work_register_user(struct module *module)
901{
902 struct task_struct *p;
903 int loop;
904
905 mutex_lock(&slow_work_user_lock);
906
907 if (slow_work_user_count == 0) {
908 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
909 init_completion(&slow_work_last_thread_exited);
910
911 slow_work_threads_should_exit = false;
912 slow_work_init(&slow_work_new_thread,
913 &slow_work_new_thread_ops);
914 slow_work_may_not_start_new_thread = false;
915 slow_work_cull = false;
916
917 /* start the minimum number of threads */
918 for (loop = 0; loop < slow_work_min_threads; loop++) {
919 atomic_inc(&slow_work_thread_count);
920 p = kthread_run(slow_work_thread, NULL, "kslowd");
921 if (IS_ERR(p))
922 goto error;
923 }
924 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
925 }
926
927 slow_work_user_count++;
928 mutex_unlock(&slow_work_user_lock);
929 return 0;
930
931error:
932 if (atomic_dec_and_test(&slow_work_thread_count))
933 complete(&slow_work_last_thread_exited);
934 if (loop > 0) {
935 printk(KERN_ERR "Slow work thread pool:"
936 " Aborting startup on ENOMEM\n");
937 slow_work_threads_should_exit = true;
938 wake_up_all(&slow_work_thread_wq);
939 wait_for_completion(&slow_work_last_thread_exited);
940 printk(KERN_ERR "Slow work thread pool: Aborted\n");
941 }
942 mutex_unlock(&slow_work_user_lock);
943 return PTR_ERR(p);
944}
945EXPORT_SYMBOL(slow_work_register_user);
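/*
 * Illustrative sketch, not part of the original file: registering as a
 * user of the pool from a module's init routine.  my_module_init() is a
 * hypothetical name; the matching unregistration is sketched after
 * slow_work_unregister_user() below.
 */
#if 0
static int __init my_module_init(void)
{
	int ret = slow_work_register_user(THIS_MODULE);

	if (ret < 0)
		return ret;
	/* ... initialise and enqueue slow work items ... */
	return 0;
}
#endif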
946
947/*
948 * wait for all outstanding items from the calling module to complete
949 * - note that more items may be queued whilst we're waiting
950 */
951static void slow_work_wait_for_items(struct module *module)
952{
953#ifdef CONFIG_MODULES
954 DECLARE_WAITQUEUE(myself, current);
955 struct slow_work *work;
956 int loop;
957
958 mutex_lock(&slow_work_unreg_sync_lock);
959 add_wait_queue(&slow_work_unreg_wq, &myself);
960
961 for (;;) {
962 spin_lock_irq(&slow_work_queue_lock);
963
964 /* first of all, we wait for the last queued item in each list
965 * to be processed */
966 list_for_each_entry_reverse(work, &vslow_work_queue, link) {
967 if (work->owner == module) {
968 set_current_state(TASK_UNINTERRUPTIBLE);
969 slow_work_unreg_work_item = work;
970 goto do_wait;
971 }
972 }
973 list_for_each_entry_reverse(work, &slow_work_queue, link) {
974 if (work->owner == module) {
975 set_current_state(TASK_UNINTERRUPTIBLE);
976 slow_work_unreg_work_item = work;
977 goto do_wait;
978 }
979 }
980
981 /* then we wait for the items being processed to finish */
982 slow_work_unreg_module = module;
983 smp_mb();
984 for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
985 if (slow_work_thread_processing[loop] == module)
986 goto do_wait;
987 }
988 spin_unlock_irq(&slow_work_queue_lock);
989 break; /* okay, we're done */
990
991 do_wait:
992 spin_unlock_irq(&slow_work_queue_lock);
993 schedule();
994 slow_work_unreg_work_item = NULL;
995 slow_work_unreg_module = NULL;
996 }
997
998 remove_wait_queue(&slow_work_unreg_wq, &myself);
999 mutex_unlock(&slow_work_unreg_sync_lock);
1000#endif /* CONFIG_MODULES */
1001}
1002
1003/**
1004 * slow_work_unregister_user - Unregister a user of the facility
1005 * @module: The module whose items should be cleared
1006 *
1007 * Unregister a user of the facility, killing all the threads if this was the
1008 * last one.
1009 *
1010 * This waits for all the work items belonging to the nominated module to go
1011 * away before proceeding.
1012 */
1013void slow_work_unregister_user(struct module *module)
1014{
1015 /* first of all, wait for all outstanding items from the calling module
1016 * to complete */
1017 if (module)
1018 slow_work_wait_for_items(module);
1019
1020 /* then we can actually go about shutting down the facility if need
1021 * be */
1022 mutex_lock(&slow_work_user_lock);
1023
1024 BUG_ON(slow_work_user_count <= 0);
1025
1026 slow_work_user_count--;
1027 if (slow_work_user_count == 0) {
1028 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
1029 slow_work_threads_should_exit = true;
1030 del_timer_sync(&slow_work_cull_timer);
1031 del_timer_sync(&slow_work_oom_timer);
1032 wake_up_all(&slow_work_thread_wq);
1033 wait_for_completion(&slow_work_last_thread_exited);
1034 printk(KERN_NOTICE "Slow work thread pool:"
1035 " Shut down complete\n");
1036 }
1037
1038 mutex_unlock(&slow_work_user_lock);
1039}
1040EXPORT_SYMBOL(slow_work_unregister_user);
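/*
 * Illustrative sketch, not part of the original file: the exit-time
 * counterpart to the registration sketched earlier.  Passing the module
 * makes the call wait for that module's outstanding items before the pool
 * is torn down.  my_module_exit() is a hypothetical name.
 */
#if 0
static void __exit my_module_exit(void)
{
	/* stop queueing new items first, then let the facility drain ours */
	slow_work_unregister_user(THIS_MODULE);
}
#endif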
1041
1042/*
1043 * Initialise the slow work facility
1044 */
1045static int __init init_slow_work(void)
1046{
1047 unsigned nr_cpus = num_possible_cpus();
1048
1049 if (slow_work_max_threads < nr_cpus)
1050 slow_work_max_threads = nr_cpus;
1051#ifdef CONFIG_SYSCTL
1052 if (slow_work_max_max_threads < nr_cpus * 2)
1053 slow_work_max_max_threads = nr_cpus * 2;
1054#endif
1055#ifdef CONFIG_SLOW_WORK_DEBUG
1056 {
1057 struct dentry *dbdir;
1058
1059 dbdir = debugfs_create_dir("slow_work", NULL);
1060 if (dbdir && !IS_ERR(dbdir))
1061 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1062 NULL, &slow_work_runqueue_fops);
1063 }
1064#endif
1065 return 0;
1066}
1067
1068subsys_initcall(init_slow_work);
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
deleted file mode 100644
index a29ebd1ef41d..000000000000
--- a/kernel/slow-work.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid;
48#endif
49}
50
51static inline void slow_work_mark_time(struct slow_work *work)
52{
53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME;
55#endif
56}
57
58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{
60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work;
62#endif
63}
64
65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{
67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock);
71#endif
72}
diff --git a/kernel/smp.c b/kernel/smp.c
index 3fc697336183..ed6aacfcb7ef 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
52 case CPU_UP_PREPARE_FROZEN: 52 case CPU_UP_PREPARE_FROZEN:
53 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 53 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
54 cpu_to_node(cpu))) 54 cpu_to_node(cpu)))
55 return NOTIFY_BAD; 55 return notifier_from_errno(-ENOMEM);
56 break; 56 break;
57 57
58#ifdef CONFIG_HOTPLUG_CPU 58#ifdef CONFIG_HOTPLUG_CPU
@@ -365,9 +365,10 @@ call:
365EXPORT_SYMBOL_GPL(smp_call_function_any); 365EXPORT_SYMBOL_GPL(smp_call_function_any);
366 366
367/** 367/**
368 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on a specific CPU
369 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
370 * @data: Pre-allocated and setup data structure 370 * @data: Pre-allocated and setup data structure
371 * @wait: If true, wait until function has completed on specified CPU.
371 * 372 *
372 * Like smp_call_function_single(), but allow caller to pass in a 373 * Like smp_call_function_single(), but allow caller to pass in a
373 * pre-allocated data structure. Useful for embedding @data inside 374 * pre-allocated data structure. Useful for embedding @data inside
@@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
376void __smp_call_function_single(int cpu, struct call_single_data *data, 377void __smp_call_function_single(int cpu, struct call_single_data *data,
377 int wait) 378 int wait)
378{ 379{
379 csd_lock(data); 380 unsigned int this_cpu;
381 unsigned long flags;
380 382
383 this_cpu = get_cpu();
381 /* 384 /*
382 * Can deadlock when called with interrupts disabled. 385 * Can deadlock when called with interrupts disabled.
383 * We allow cpu's that are not yet online though, as no one else can 386 * We allow cpu's that are not yet online though, as no one else can
@@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
387 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() 390 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
388 && !oops_in_progress); 391 && !oops_in_progress);
389 392
390 generic_exec_single(cpu, data, wait); 393 if (cpu == this_cpu) {
394 local_irq_save(flags);
395 data->func(data->info);
396 local_irq_restore(flags);
397 } else {
398 csd_lock(data);
399 generic_exec_single(cpu, data, wait);
400 }
401 put_cpu();
391} 402}
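/*
 * Illustrative sketch, not part of the patch: using the pre-allocated
 * call_single_data form documented above.  my_ipi_func(), my_csd and
 * my_kick_cpu() are hypothetical names; the .func/.info fields are assumed
 * from <linux/smp.h> of this era.
 */
#if 0
static void my_ipi_func(void *info)
{
	/* runs on the target cpu, typically in interrupt context */
}

static struct call_single_data my_csd = {
	.func = my_ipi_func,
	.info = NULL,
};

static void my_kick_cpu(int cpu)
{
	__smp_call_function_single(cpu, &my_csd, 0);	/* don't wait */
}
#endif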
392 403
393/** 404/**
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7c1a67ef0274..07b4f1b1a73a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
716 preempt_enable_no_resched(); 716 preempt_enable_no_resched();
717 cond_resched(); 717 cond_resched();
718 preempt_disable(); 718 preempt_disable();
719 rcu_sched_qs((long)__bind_cpu); 719 rcu_note_context_switch((long)__bind_cpu);
720 } 720 }
721 preempt_enable(); 721 preempt_enable();
722 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
@@ -808,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
809 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
810 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
811 return NOTIFY_BAD; 811 return notifier_from_errno(PTR_ERR(p));
812 } 812 }
813 kthread_bind(p, hotcpu); 813 kthread_bind(p, hotcpu);
814 per_cpu(ksoftirqd, hotcpu) = p; 814 per_cpu(ksoftirqd, hotcpu) = p;
@@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void)
850 void *cpu = (void *)(long)smp_processor_id(); 850 void *cpu = (void *)(long)smp_processor_id();
851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
852 852
853 BUG_ON(err == NOTIFY_BAD); 853 BUG_ON(err != NOTIFY_OK);
854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
855 register_cpu_notifier(&cpu_nfb); 855 register_cpu_notifier(&cpu_nfb);
856 return 0; 856 return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
deleted file mode 100644
index 4b493f67dcb5..000000000000
--- a/kernel/softlockup.c
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 *
 6 * this code detects soft lockups: incidents where, on a CPU,
7 * the kernel does not reschedule for 10 seconds or more.
8 */
9#include <linux/mm.h>
10#include <linux/cpu.h>
11#include <linux/nmi.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/freezer.h>
15#include <linux/kthread.h>
16#include <linux/lockdep.h>
17#include <linux/notifier.h>
18#include <linux/module.h>
19#include <linux/sysctl.h>
20
21#include <asm/irq_regs.h>
22
23static DEFINE_SPINLOCK(print_lock);
24
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
29
30static int __read_mostly did_panic;
31int __read_mostly softlockup_thresh = 60;
32
33/*
34 * Should we panic (and reboot, if panic_timeout= is set) when a
35 * soft-lockup occurs:
36 */
37unsigned int __read_mostly softlockup_panic =
38 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
39
40static int __init softlockup_panic_setup(char *str)
41{
42 softlockup_panic = simple_strtoul(str, NULL, 0);
43
44 return 1;
45}
46__setup("softlockup_panic=", softlockup_panic_setup);
47
48static int
49softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
50{
51 did_panic = 1;
52
53 return NOTIFY_DONE;
54}
55
56static struct notifier_block panic_block = {
57 .notifier_call = softlock_panic,
58};
59
60/*
61 * Returns seconds, approximately. We don't need nanosecond
62 * resolution, and we don't need to waste time with a big divide when
63 * 2^30ns == 1.074s.
64 */
65static unsigned long get_timestamp(int this_cpu)
66{
67 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
68}
69
70static void __touch_softlockup_watchdog(void)
71{
72 int this_cpu = raw_smp_processor_id();
73
74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
75}
76
77void touch_softlockup_watchdog(void)
78{
79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
80}
81EXPORT_SYMBOL(touch_softlockup_watchdog);
82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
89void touch_all_softlockup_watchdogs(void)
90{
91 int cpu;
92
93 /* Cause each CPU to re-update its timestamp rather than complain */
94 for_each_online_cpu(cpu)
95 per_cpu(softlockup_touch_ts, cpu) = 0;
96}
97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
98
99int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
100 void __user *buffer,
101 size_t *lenp, loff_t *ppos)
102{
103 touch_all_softlockup_watchdogs();
104 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
105}
106
107/*
108 * This callback runs from the timer interrupt, and checks
109 * whether the watchdog thread has hung or not:
110 */
111void softlockup_tick(void)
112{
113 int this_cpu = smp_processor_id();
114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
115 unsigned long print_ts;
116 struct pt_regs *regs = get_irq_regs();
117 unsigned long now;
118
119 /* Is detection switched off? */
120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
121 /* Be sure we don't false trigger if switched back on */
122 if (touch_ts)
123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
124 return;
125 }
126
127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
136 __touch_softlockup_watchdog();
137 return;
138 }
139
140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
141
142 /* report at most once a second */
143 if (print_ts == touch_ts || did_panic)
144 return;
145
146 /* do not print during early bootup: */
147 if (unlikely(system_state != SYSTEM_RUNNING)) {
148 __touch_softlockup_watchdog();
149 return;
150 }
151
152 now = get_timestamp(this_cpu);
153
154 /*
155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan.
157 */
158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160
161 /* Warn about unreasonable delays: */
162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return;
164
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
166
167 spin_lock(&print_lock);
168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
169 this_cpu, now - touch_ts,
170 current->comm, task_pid_nr(current));
171 print_modules();
172 print_irqtrace_events(current);
173 if (regs)
174 show_regs(regs);
175 else
176 dump_stack();
177 spin_unlock(&print_lock);
178
179 if (softlockup_panic)
180 panic("softlockup: hung tasks");
181}
182
183/*
184 * The watchdog thread - runs every second and touches the timestamp.
185 */
186static int watchdog(void *__bind_cpu)
187{
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
189
190 sched_setscheduler(current, SCHED_FIFO, &param);
191
192 /* initialize timestamp */
193 __touch_softlockup_watchdog();
194
195 set_current_state(TASK_INTERRUPTIBLE);
196 /*
197 * Run briefly once per second to reset the softlockup timestamp.
198 * If this gets delayed for more than 60 seconds then the
199 * debug-printout triggers in softlockup_tick().
200 */
201 while (!kthread_should_stop()) {
202 __touch_softlockup_watchdog();
203 schedule();
204
205 if (kthread_should_stop())
206 break;
207
208 set_current_state(TASK_INTERRUPTIBLE);
209 }
210 __set_current_state(TASK_RUNNING);
211
212 return 0;
213}
214
215/*
216 * Create/destroy watchdog threads as CPUs come and go:
217 */
218static int __cpuinit
219cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
220{
221 int hotcpu = (unsigned long)hcpu;
222 struct task_struct *p;
223
224 switch (action) {
225 case CPU_UP_PREPARE:
226 case CPU_UP_PREPARE_FROZEN:
227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
229 if (IS_ERR(p)) {
230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
231 return NOTIFY_BAD;
232 }
233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
234 per_cpu(softlockup_watchdog, hotcpu) = p;
235 kthread_bind(p, hotcpu);
236 break;
237 case CPU_ONLINE:
238 case CPU_ONLINE_FROZEN:
239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
240 break;
241#ifdef CONFIG_HOTPLUG_CPU
242 case CPU_UP_CANCELED:
243 case CPU_UP_CANCELED_FROZEN:
244 if (!per_cpu(softlockup_watchdog, hotcpu))
245 break;
246 /* Unbind so it can run. Fall thru. */
247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
248 cpumask_any(cpu_online_mask));
249 case CPU_DEAD:
250 case CPU_DEAD_FROZEN:
251 p = per_cpu(softlockup_watchdog, hotcpu);
252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
253 kthread_stop(p);
254 break;
255#endif /* CONFIG_HOTPLUG_CPU */
256 }
257 return NOTIFY_OK;
258}
259
260static struct notifier_block __cpuinitdata cpu_nfb = {
261 .notifier_call = cpu_callback
262};
263
264static int __initdata nosoftlockup;
265
266static int __init nosoftlockup_setup(char *str)
267{
268 nosoftlockup = 1;
269 return 1;
270}
271__setup("nosoftlockup", nosoftlockup_setup);
272
273static int __init spawn_softlockup_task(void)
274{
275 void *cpu = (void *)(long)smp_processor_id();
276 int err;
277
278 if (nosoftlockup)
279 return 0;
280
281 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
282 if (err == NOTIFY_BAD) {
283 BUG();
284 return 1;
285 }
286 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
287 register_cpu_notifier(&cpu_nfb);
288
289 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
290
291 return 0;
292}
293early_initcall(spawn_softlockup_task);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79c..4372ccb25127 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,384 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */
39 struct list_head works; /* list of pending works */
40 struct task_struct *thread; /* stopper thread */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
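/*
 * Illustrative sketch, not part of the patch: a minimal caller of
 * stop_one_cpu().  my_fn() and my_poke_cpu() are hypothetical names.
 */
#if 0
static int my_fn(void *arg)
{
	/* runs on the target cpu at the highest priority; must not sleep */
	return 0;
}

static int my_poke_cpu(unsigned int cpu)
{
	/* returns -ENOENT if the cpu was offline, else my_fn()'s result */
	return stop_one_cpu(cpu, my_fn, NULL);
}
#endif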
114
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
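/*
 * Illustrative sketch, not part of the patch: the fire-and-forget variant.
 * The cpu_stop_work buffer must stay untouched until the stopper picks it
 * up, so a static or per-cpu buffer is typical; my_stop_work and
 * my_poke_cpu_async() are hypothetical names.
 */
#if 0
static struct cpu_stop_work my_stop_work;

static void my_poke_cpu_async(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
	stop_one_cpu_nowait(cpu, fn, arg, &my_stop_work);
}
#endif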
134
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
 157	 * to enter @fn, which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
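/*
 * Illustrative sketch, not part of the patch: running a callback on every
 * online cpu via stop_cpus().  my_run_everywhere() is a hypothetical name.
 */
#if 0
static int my_run_everywhere(cpu_stop_fn_t my_fn, void *arg)
{
	/* serialised against other stop_cpus() users; may sleep */
	return stop_cpus(cpu_online_mask, my_fn, arg);
}
#endif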
207
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
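/*
 * Illustrative sketch, not part of the patch: one possible retry pattern
 * around the non-blocking form, backing off while another caller holds the
 * facility.  my_try_run_everywhere() is a hypothetical name and the retry
 * policy is only an assumption, not something the patch prescribes.
 */
#if 0
static int my_try_run_everywhere(cpu_stop_fn_t my_fn, void *arg)
{
	int ret;

	while ((ret = try_stop_cpus(cpu_online_mask, my_fn, arg)) == -EAGAIN)
		schedule_timeout_uninterruptible(1);

	return ret;
}
#endif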
237
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p;
298
299 switch (action & ~CPU_TASKS_FROZEN) {
300 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu);
305 if (IS_ERR(p))
306 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p);
309 stopper->thread = p;
310 break;
311
312 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread);
316 /* mark enabled */
317 spin_lock_irq(&stopper->lock);
318 stopper->enabled = true;
319 spin_unlock_irq(&stopper->lock);
320 break;
321
322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED:
324 case CPU_POST_DEAD:
325 {
326 struct cpu_stop_work *work;
327
328 /* kill the stopper */
329 kthread_stop(stopper->thread);
330 /* drain remaining works */
331 spin_lock_irq(&stopper->lock);
332 list_for_each_entry(work, &stopper->works, list)
333 cpu_stop_signal_done(work->done, false);
334 stopper->enabled = false;
335 spin_unlock_irq(&stopper->lock);
336 /* release the stopper */
337 put_task_struct(stopper->thread);
338 stopper->thread = NULL;
339 break;
340 }
341#endif
342 }
343
344 return NOTIFY_OK;
345}
346
347/*
348 * Give it a higher priority so that cpu stopper is available to other
349 * cpu notifiers. It currently shares the same priority as sched
350 * migration_notifier.
351 */
352static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
353 .notifier_call = cpu_stop_cpu_callback,
354 .priority = 10,
355};
356
357static int __init cpu_stop_init(void)
358{
359 void *bcpu = (void *)(long)smp_processor_id();
360 unsigned int cpu;
361 int err;
362
363 for_each_possible_cpu(cpu) {
364 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
365
366 spin_lock_init(&stopper->lock);
367 INIT_LIST_HEAD(&stopper->works);
368 }
369
370 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu);
373 BUG_ON(err == NOTIFY_BAD);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier);
376
377 return 0;
378}
379early_initcall(cpu_stop_init);
380
381#ifdef CONFIG_STOP_MACHINE
15 382
16/* This controls the threads on each CPU. */ 383/* This controls the threads on each CPU. */
17enum stopmachine_state { 384enum stopmachine_state {
@@ -26,174 +393,94 @@ enum stopmachine_state {
26 /* Exit */ 393 /* Exit */
27 STOPMACHINE_EXIT, 394 STOPMACHINE_EXIT,
28}; 395};
29static enum stopmachine_state state;
30 396
31struct stop_machine_data { 397struct stop_machine_data {
32 int (*fn)(void *); 398 int (*fn)(void *);
33 void *data; 399 void *data;
34 int fnret; 400 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
401 unsigned int num_threads;
402 const struct cpumask *active_cpus;
403
404 enum stopmachine_state state;
405 atomic_t thread_ack;
35}; 406};
36 407
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 408static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 409 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void __percpu *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 410{
52 /* Reset ack counter. */ 411 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 412 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 413 smp_wmb();
55 state = newstate; 414 smdata->state = newstate;
56} 415}
57 416
58/* Last one to ack a state moves to the next state. */ 417/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 418static void ack_state(struct stop_machine_data *smdata)
60{ 419{
61 if (atomic_dec_and_test(&thread_ack)) 420 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 421 set_state(smdata, smdata->state + 1);
63} 422}
64 423
65/* This is the actual function which stops the CPU. It runs 424/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 425static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 426{
427 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 428 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 429 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 430 bool is_active;
72 int err; 431
432 if (!smdata->active_cpus)
433 is_active = cpu == cpumask_first(cpu_online_mask);
434 else
435 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 436
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 437 /* Simple state machine */
82 do { 438 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 439 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 440 cpu_relax();
85 if (state != curstate) { 441 if (smdata->state != curstate) {
86 curstate = state; 442 curstate = smdata->state;
87 switch (curstate) { 443 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 444 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 445 local_irq_disable();
90 hard_irq_disable(); 446 hard_irq_disable();
91 break; 447 break;
92 case STOPMACHINE_RUN: 448 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 449 if (is_active)
94 * is needed to tell that something failed. */ 450 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 451 break;
99 default: 452 default:
100 break; 453 break;
101 } 454 }
102 ack_state(); 455 ack_state(smdata);
103 } 456 }
104 } while (curstate != STOPMACHINE_EXIT); 457 } while (curstate != STOPMACHINE_EXIT);
105 458
106 local_irq_enable(); 459 local_irq_enable();
460 return err;
107} 461}
108 462
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 463int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 464{
154 struct work_struct *sm_work; 465 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 466 .num_threads = num_online_cpus(),
156 467 .active_cpus = cpus };
157 /* Set up initial state. */ 468
158 mutex_lock(&lock); 469 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 470 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 471 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 472}
184 473
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 474int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 475{
187 int ret; 476 int ret;
188 477
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 478 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 479 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 480 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 481 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 482 return ret;
198} 483}
199EXPORT_SYMBOL_GPL(stop_machine); 484EXPORT_SYMBOL_GPL(stop_machine);
485
486#endif /* CONFIG_STOP_MACHINE */
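/*
 * Illustrative sketch, not part of the patch: the classic stop_machine()
 * call pattern that the rewritten implementation above still serves.
 * my_atomic_update() and my_do_update() are hypothetical; the callback runs
 * while every cpu is quiesced with interrupts disabled, so it must not sleep.
 */
#if 0
static int my_atomic_update(void *data)
{
	/* e.g. patch text or flip a global -- no sleeping, no lock taking */
	return 0;
}

static int my_do_update(void *data)
{
	/* NULL cpumask: my_atomic_update() runs on the first online cpu */
	return stop_machine(my_atomic_update, data, NULL);
}
#endif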
diff --git a/kernel/sys.c b/kernel/sys.c
index 7cb426a58965..7f5a0cd296a9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -492,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
492 return -ENOMEM; 492 return -ENOMEM;
493 old = current_cred(); 493 old = current_cred();
494 494
495 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
496 if (retval)
497 goto error;
498
499 retval = -EPERM; 495 retval = -EPERM;
500 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
501 if (old->gid == rgid || 497 if (old->gid == rgid ||
@@ -543,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
543 return -ENOMEM; 539 return -ENOMEM;
544 old = current_cred(); 540 old = current_cred();
545 541
546 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
547 if (retval)
548 goto error;
549
550 retval = -EPERM; 542 retval = -EPERM;
551 if (capable(CAP_SETGID)) 543 if (capable(CAP_SETGID))
552 new->gid = new->egid = new->sgid = new->fsgid = gid; 544 new->gid = new->egid = new->sgid = new->fsgid = gid;
@@ -610,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
610 return -ENOMEM; 602 return -ENOMEM;
611 old = current_cred(); 603 old = current_cred();
612 604
613 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
614 if (retval)
615 goto error;
616
617 retval = -EPERM; 605 retval = -EPERM;
618 if (ruid != (uid_t) -1) { 606 if (ruid != (uid_t) -1) {
619 new->uid = ruid; 607 new->uid = ruid;
@@ -675,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
675 return -ENOMEM; 663 return -ENOMEM;
676 old = current_cred(); 664 old = current_cred();
677 665
678 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
679 if (retval)
680 goto error;
681
682 retval = -EPERM; 666 retval = -EPERM;
683 if (capable(CAP_SETUID)) { 667 if (capable(CAP_SETUID)) {
684 new->suid = new->uid = uid; 668 new->suid = new->uid = uid;
@@ -719,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
719 if (!new) 703 if (!new)
720 return -ENOMEM; 704 return -ENOMEM;
721 705
722 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
723 if (retval)
724 goto error;
725 old = current_cred(); 706 old = current_cred();
726 707
727 retval = -EPERM; 708 retval = -EPERM;
@@ -788,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
788 return -ENOMEM; 769 return -ENOMEM;
789 old = current_cred(); 770 old = current_cred();
790 771
791 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
792 if (retval)
793 goto error;
794
795 retval = -EPERM; 772 retval = -EPERM;
796 if (!capable(CAP_SETGID)) { 773 if (!capable(CAP_SETGID)) {
797 if (rgid != (gid_t) -1 && rgid != old->gid && 774 if (rgid != (gid_t) -1 && rgid != old->gid &&
@@ -851,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
851 old = current_cred(); 828 old = current_cred();
852 old_fsuid = old->fsuid; 829 old_fsuid = old->fsuid;
853 830
854 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
855 goto error;
856
857 if (uid == old->uid || uid == old->euid || 831 if (uid == old->uid || uid == old->euid ||
858 uid == old->suid || uid == old->fsuid || 832 uid == old->suid || uid == old->fsuid ||
859 capable(CAP_SETUID)) { 833 capable(CAP_SETUID)) {
@@ -864,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
864 } 838 }
865 } 839 }
866 840
867error:
868 abort_creds(new); 841 abort_creds(new);
869 return old_fsuid; 842 return old_fsuid;
870 843
@@ -888,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
888 old = current_cred(); 861 old = current_cred();
889 old_fsgid = old->fsgid; 862 old_fsgid = old->fsgid;
890 863
891 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
892 goto error;
893
894 if (gid == old->gid || gid == old->egid || 864 if (gid == old->gid || gid == old->egid ||
895 gid == old->sgid || gid == old->fsgid || 865 gid == old->sgid || gid == old->fsgid ||
896 capable(CAP_SETGID)) { 866 capable(CAP_SETGID)) {
@@ -900,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
900 } 870 }
901 } 871 }
902 872
903error:
904 abort_creds(new); 873 abort_creds(new);
905 return old_fsgid; 874 return old_fsgid;
906 875
@@ -962,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
962 pgid = pid; 931 pgid = pid;
963 if (pgid < 0) 932 if (pgid < 0)
964 return -EINVAL; 933 return -EINVAL;
934 rcu_read_lock();
965 935
966 /* From this point forward we keep holding onto the tasklist lock 936 /* From this point forward we keep holding onto the tasklist lock
967 * so that our parent does not change from under us. -DaveM 937 * so that our parent does not change from under us. -DaveM
@@ -1015,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
1015out: 985out:
1016 /* All paths lead to here, thus we are safe. -DaveM */ 986 /* All paths lead to here, thus we are safe. -DaveM */
1017 write_unlock_irq(&tasklist_lock); 987 write_unlock_irq(&tasklist_lock);
988 rcu_read_unlock();
1018 return err; 989 return err;
1019} 990}
1020 991
@@ -1267,15 +1238,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1267 1238
1268SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1239SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1269{ 1240{
1270 if (resource >= RLIM_NLIMITS) 1241 struct rlimit value;
1271 return -EINVAL; 1242 int ret;
1272 else { 1243
1273 struct rlimit value; 1244 ret = do_prlimit(current, resource, NULL, &value);
1274 task_lock(current->group_leader); 1245 if (!ret)
1275 value = current->signal->rlim[resource]; 1246 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1276 task_unlock(current->group_leader); 1247
1277 return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1248 return ret;
1278 }
1279} 1249}
1280 1250
1281#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1251#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
@@ -1303,44 +1273,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1303 1273
1304#endif 1274#endif
1305 1275
1306SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1276static inline bool rlim64_is_infinity(__u64 rlim64)
1307{ 1277{
1308 struct rlimit new_rlim, *old_rlim; 1278#if BITS_PER_LONG < 64
1309 int retval; 1279 return rlim64 >= ULONG_MAX;
1280#else
1281 return rlim64 == RLIM64_INFINITY;
1282#endif
1283}
1284
1285static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
1286{
1287 if (rlim->rlim_cur == RLIM_INFINITY)
1288 rlim64->rlim_cur = RLIM64_INFINITY;
1289 else
1290 rlim64->rlim_cur = rlim->rlim_cur;
1291 if (rlim->rlim_max == RLIM_INFINITY)
1292 rlim64->rlim_max = RLIM64_INFINITY;
1293 else
1294 rlim64->rlim_max = rlim->rlim_max;
1295}
1296
1297static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
1298{
1299 if (rlim64_is_infinity(rlim64->rlim_cur))
1300 rlim->rlim_cur = RLIM_INFINITY;
1301 else
1302 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
1303 if (rlim64_is_infinity(rlim64->rlim_max))
1304 rlim->rlim_max = RLIM_INFINITY;
1305 else
1306 rlim->rlim_max = (unsigned long)rlim64->rlim_max;
1307}
1308
1309/* make sure you are allowed to change @tsk limits before calling this */
1310int do_prlimit(struct task_struct *tsk, unsigned int resource,
1311 struct rlimit *new_rlim, struct rlimit *old_rlim)
1312{
1313 struct rlimit *rlim;
1314 int retval = 0;
1310 1315
1311 if (resource >= RLIM_NLIMITS) 1316 if (resource >= RLIM_NLIMITS)
1312 return -EINVAL; 1317 return -EINVAL;
1313 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1318 if (new_rlim) {
1314 return -EFAULT; 1319 if (new_rlim->rlim_cur > new_rlim->rlim_max)
1315 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1320 return -EINVAL;
1316 return -EINVAL; 1321 if (resource == RLIMIT_NOFILE &&
1317 old_rlim = current->signal->rlim + resource; 1322 new_rlim->rlim_max > sysctl_nr_open)
1318 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1323 return -EPERM;
1319 !capable(CAP_SYS_RESOURCE))
1320 return -EPERM;
1321 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1322 return -EPERM;
1323
1324 retval = security_task_setrlimit(resource, &new_rlim);
1325 if (retval)
1326 return retval;
1327
1328 if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
1329 /*
1330 * The caller is asking for an immediate RLIMIT_CPU
1331 * expiry. But we use the zero value to mean "it was
1332 * never set". So let's cheat and make it one second
1333 * instead
1334 */
1335 new_rlim.rlim_cur = 1;
1336 } 1324 }
1337 1325
1338 task_lock(current->group_leader); 1326 /* protect tsk->signal and tsk->sighand from disappearing */
1339 *old_rlim = new_rlim; 1327 read_lock(&tasklist_lock);
1340 task_unlock(current->group_leader); 1328 if (!tsk->sighand) {
1341 1329 retval = -ESRCH;
1342 if (resource != RLIMIT_CPU)
1343 goto out; 1330 goto out;
1331 }
1332
1333 rlim = tsk->signal->rlim + resource;
1334 task_lock(tsk->group_leader);
1335 if (new_rlim) {
1336 if (new_rlim->rlim_max > rlim->rlim_max &&
1337 !capable(CAP_SYS_RESOURCE))
1338 retval = -EPERM;
1339 if (!retval)
1340 retval = security_task_setrlimit(tsk->group_leader,
1341 resource, new_rlim);
1342 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1343 /*
1344 * The caller is asking for an immediate RLIMIT_CPU
1345 * expiry. But we use the zero value to mean "it was
1346 * never set". So let's cheat and make it one second
1347 * instead
1348 */
1349 new_rlim->rlim_cur = 1;
1350 }
1351 }
1352 if (!retval) {
1353 if (old_rlim)
1354 *old_rlim = *rlim;
1355 if (new_rlim)
1356 *rlim = *new_rlim;
1357 }
1358 task_unlock(tsk->group_leader);
1344 1359
1345 /* 1360 /*
1346 * RLIMIT_CPU handling. Note that the kernel fails to return an error 1361 * RLIMIT_CPU handling. Note that the kernel fails to return an error
@@ -1348,14 +1363,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1348 * very long-standing error, and fixing it now risks breakage of 1363 * very long-standing error, and fixing it now risks breakage of
1349 * applications, so we live with it 1364 * applications, so we live with it
1350 */ 1365 */
1351 if (new_rlim.rlim_cur == RLIM_INFINITY) 1366 if (!retval && new_rlim && resource == RLIMIT_CPU &&
1352 goto out; 1367 new_rlim->rlim_cur != RLIM_INFINITY)
1353 1368 update_rlimit_cpu(tsk, new_rlim->rlim_cur);
1354 update_rlimit_cpu(new_rlim.rlim_cur);
1355out: 1369out:
1370 read_unlock(&tasklist_lock);
1371 return retval;
1372}
1373
1374/* rcu lock must be held */
1375static int check_prlimit_permission(struct task_struct *task)
1376{
1377 const struct cred *cred = current_cred(), *tcred;
1378
1379 tcred = __task_cred(task);
1380 if ((cred->uid != tcred->euid ||
1381 cred->uid != tcred->suid ||
1382 cred->uid != tcred->uid ||
1383 cred->gid != tcred->egid ||
1384 cred->gid != tcred->sgid ||
1385 cred->gid != tcred->gid) &&
1386 !capable(CAP_SYS_RESOURCE)) {
1387 return -EPERM;
1388 }
1389
1356 return 0; 1390 return 0;
1357} 1391}
1358 1392
1393SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1394 const struct rlimit64 __user *, new_rlim,
1395 struct rlimit64 __user *, old_rlim)
1396{
1397 struct rlimit64 old64, new64;
1398 struct rlimit old, new;
1399 struct task_struct *tsk;
1400 int ret;
1401
1402 if (new_rlim) {
1403 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1404 return -EFAULT;
1405 rlim64_to_rlim(&new64, &new);
1406 }
1407
1408 rcu_read_lock();
1409 tsk = pid ? find_task_by_vpid(pid) : current;
1410 if (!tsk) {
1411 rcu_read_unlock();
1412 return -ESRCH;
1413 }
1414 ret = check_prlimit_permission(tsk);
1415 if (ret) {
1416 rcu_read_unlock();
1417 return ret;
1418 }
1419 get_task_struct(tsk);
1420 rcu_read_unlock();
1421
1422 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
1423 old_rlim ? &old : NULL);
1424
1425 if (!ret && old_rlim) {
1426 rlim_to_rlim64(&old, &old64);
1427 if (copy_to_user(old_rlim, &old64, sizeof(old64)))
1428 ret = -EFAULT;
1429 }
1430
1431 put_task_struct(tsk);
1432 return ret;
1433}
1434
1435SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1436{
1437 struct rlimit new_rlim;
1438
1439 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1440 return -EFAULT;
1441 return do_prlimit(current, resource, &new_rlim, NULL);
1442}
1443
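For reference, a minimal user-space sketch of driving the new prlimit64 syscall directly via syscall(2). It assumes the installed headers define __NR_prlimit64, and the local struct mirrors the kernel's two 64-bit rlimit64 fields rather than relying on a libc wrapper (which may not exist yet for this syscall). Passing pid 0 operates on the calling task, matching the find_task_by_vpid() fallback above.

/* Hedged illustration: raw prlimit64 call; __NR_prlimit64 and the
 * two-field layout of struct rlimit64 are assumptions stated above. */
#include <stdint.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>

struct my_rlimit64 {		/* local stand-in for struct rlimit64 */
	uint64_t rlim_cur;
	uint64_t rlim_max;
};

int main(void)
{
	struct my_rlimit64 new_lim = { .rlim_cur = 1024, .rlim_max = 4096 };
	struct my_rlimit64 old_lim;

	/* pid 0 means "the calling task"; resource constants come from <sys/resource.h> */
	if (syscall(__NR_prlimit64, 0, RLIMIT_NOFILE, &new_lim, &old_lim)) {
		perror("prlimit64");
		return 1;
	}
	printf("old RLIMIT_NOFILE: cur=%llu max=%llu\n",
	       (unsigned long long)old_lim.rlim_cur,
	       (unsigned long long)old_lim.rlim_max);
	return 0;
}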
1359/* 1444/*
1360 * It would make sense to put struct rusage in the task_struct, 1445 * It would make sense to put struct rusage in the task_struct,
1361 * except that would make the task_struct be *really big*. After 1446 * except that would make the task_struct be *really big*. After
@@ -1663,9 +1748,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
1663 1748
1664char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 1749char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
1665 1750
1666static void argv_cleanup(char **argv, char **envp) 1751static void argv_cleanup(struct subprocess_info *info)
1667{ 1752{
1668 argv_free(argv); 1753 argv_free(info->argv);
1669} 1754}
1670 1755
1671/** 1756/**
@@ -1699,7 +1784,7 @@ int orderly_poweroff(bool force)
1699 goto out; 1784 goto out;
1700 } 1785 }
1701 1786
1702 call_usermodehelper_setcleanup(info, argv_cleanup); 1787 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
1703 1788
1704 ret = call_usermodehelper_exec(info, UMH_NO_WAIT); 1789 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
1705 1790
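The hunk above switches orderly_poweroff() to the renamed hook installer; below is a hedged sketch of the full setup/exec pattern with the new call_usermodehelper_setfns(). The helper path "/sbin/example-helper" and the function names are illustrative only; the point is that the cleanup callback now receives the subprocess_info itself, as argv_cleanup() does above.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/kmod.h>
#include <linux/string.h>

/* illustrative cleanup hook, mirroring argv_cleanup() in the hunk above */
static void example_cleanup(struct subprocess_info *info)
{
	argv_free(info->argv);
}

static int example_run_helper(void)
{
	static char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	char **argv = argv_split(GFP_KERNEL, "/sbin/example-helper --now", NULL);
	struct subprocess_info *info;

	if (!argv)
		return -ENOMEM;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
	if (!info) {
		argv_free(argv);
		return -ENOMEM;
	}

	/* init hook and private data are unused here, hence the NULLs */
	call_usermodehelper_setfns(info, NULL, example_cleanup, NULL);
	return call_usermodehelper_exec(info, UMH_NO_WAIT);
}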
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..bad369ec5403 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2);
181 181
182/* performance counters: */ 182/* performance counters: */
183cond_syscall(sys_perf_event_open); 183cond_syscall(sys_perf_event_open);
184
185/* fanotify! */
186cond_syscall(sys_fanotify_init);
187cond_syscall(sys_fanotify_mark);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8686b0f5fc12..3a45c224770f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,21 +37,24 @@
37#include <linux/highuid.h> 37#include <linux/highuid.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/compaction.h>
40#include <linux/hugetlb.h> 41#include <linux/hugetlb.h>
41#include <linux/initrd.h> 42#include <linux/initrd.h>
42#include <linux/key.h> 43#include <linux/key.h>
43#include <linux/times.h> 44#include <linux/times.h>
44#include <linux/limits.h> 45#include <linux/limits.h>
45#include <linux/dcache.h> 46#include <linux/dcache.h>
47#include <linux/dnotify.h>
46#include <linux/syscalls.h> 48#include <linux/syscalls.h>
47#include <linux/vmstat.h> 49#include <linux/vmstat.h>
48#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
49#include <linux/acpi.h> 51#include <linux/acpi.h>
50#include <linux/reboot.h> 52#include <linux/reboot.h>
51#include <linux/ftrace.h> 53#include <linux/ftrace.h>
52#include <linux/slow-work.h>
53#include <linux/perf_event.h> 54#include <linux/perf_event.h>
54#include <linux/kprobes.h> 55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
55 58
56#include <asm/uaccess.h> 59#include <asm/uaccess.h>
57#include <asm/processor.h> 60#include <asm/processor.h>
@@ -74,15 +77,16 @@
74#include <scsi/sg.h> 77#include <scsi/sg.h>
75#endif 78#endif
76 79
80#ifdef CONFIG_LOCKUP_DETECTOR
81#include <linux/nmi.h>
82#endif
83
77 84
78#if defined(CONFIG_SYSCTL) 85#if defined(CONFIG_SYSCTL)
79 86
80/* External variables not in a header file. */ 87/* External variables not in a header file. */
81extern int sysctl_overcommit_memory; 88extern int sysctl_overcommit_memory;
82extern int sysctl_overcommit_ratio; 89extern int sysctl_overcommit_ratio;
83extern int sysctl_panic_on_oom;
84extern int sysctl_oom_kill_allocating_task;
85extern int sysctl_oom_dump_tasks;
86extern int max_threads; 90extern int max_threads;
87extern int core_uses_pid; 91extern int core_uses_pid;
88extern int suid_dumpable; 92extern int suid_dumpable;
@@ -104,7 +108,7 @@ extern int blk_iopoll_enabled;
104#endif 108#endif
105 109
106/* Constants used for minimum and maximum */ 110/* Constants used for minimum and maximum */
107#ifdef CONFIG_DETECT_SOFTLOCKUP 111#ifdef CONFIG_LOCKUP_DETECTOR
108static int sixty = 60; 112static int sixty = 60;
109static int neg_one = -1; 113static int neg_one = -1;
110#endif 114#endif
@@ -128,6 +132,9 @@ static int min_percpu_pagelist_fract = 8;
128 132
129static int ngroups_max = NGROUPS_MAX; 133static int ngroups_max = NGROUPS_MAX;
130 134
135#ifdef CONFIG_INOTIFY_USER
136#include <linux/inotify.h>
137#endif
131#ifdef CONFIG_SPARC 138#ifdef CONFIG_SPARC
132#include <asm/system.h> 139#include <asm/system.h>
133#endif 140#endif
@@ -163,6 +170,27 @@ static int proc_taint(struct ctl_table *table, int write,
163 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
164#endif 171#endif
165 172
173#ifdef CONFIG_MAGIC_SYSRQ
 174static int __sysrq_enabled;	/* Note: sysrq code uses its own private copy */
175
176static int sysrq_sysctl_handler(ctl_table *table, int write,
177 void __user *buffer, size_t *lenp,
178 loff_t *ppos)
179{
180 int error;
181
182 error = proc_dointvec(table, write, buffer, lenp, ppos);
183 if (error)
184 return error;
185
186 if (write)
187 sysrq_toggle_support(__sysrq_enabled);
188
189 return 0;
190}
191
192#endif
193
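With this handler in place, a write to /proc/sys/kernel/sysrq is pushed into sysrq_toggle_support() immediately instead of only updating the sysctl copy. A minimal user-space sketch (needs root; the value "1" enables all SysRq functions):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/kernel/sysrq", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1\n", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}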
166static struct ctl_table root_table[]; 194static struct ctl_table root_table[];
167static struct ctl_table_root sysctl_table_root; 195static struct ctl_table_root sysctl_table_root;
168static struct ctl_table_header root_table_header = { 196static struct ctl_table_header root_table_header = {
@@ -183,9 +211,6 @@ static struct ctl_table fs_table[];
183static struct ctl_table debug_table[]; 211static struct ctl_table debug_table[];
184static struct ctl_table dev_table[]; 212static struct ctl_table dev_table[];
185extern struct ctl_table random_table[]; 213extern struct ctl_table random_table[];
186#ifdef CONFIG_INOTIFY_USER
187extern struct ctl_table inotify_table[];
188#endif
189#ifdef CONFIG_EPOLL 214#ifdef CONFIG_EPOLL
190extern struct ctl_table epoll_table[]; 215extern struct ctl_table epoll_table[];
191#endif 216#endif
@@ -240,6 +265,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */
240static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ 265static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
241#endif 266#endif
242 267
268#ifdef CONFIG_COMPACTION
269static int min_extfrag_threshold;
270static int max_extfrag_threshold = 1000;
271#endif
272
243static struct ctl_table kern_table[] = { 273static struct ctl_table kern_table[] = {
244 { 274 {
245 .procname = "sched_child_runs_first", 275 .procname = "sched_child_runs_first",
@@ -534,7 +564,7 @@ static struct ctl_table kern_table[] = {
534 .extra2 = &one, 564 .extra2 = &one,
535 }, 565 },
536#endif 566#endif
537#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 567#ifdef CONFIG_HOTPLUG
538 { 568 {
539 .procname = "hotplug", 569 .procname = "hotplug",
540 .data = &uevent_helper, 570 .data = &uevent_helper,
@@ -567,7 +597,7 @@ static struct ctl_table kern_table[] = {
567 .data = &__sysrq_enabled, 597 .data = &__sysrq_enabled,
568 .maxlen = sizeof (int), 598 .maxlen = sizeof (int),
569 .mode = 0644, 599 .mode = 0644,
570 .proc_handler = proc_dointvec, 600 .proc_handler = sysrq_sysctl_handler,
571 }, 601 },
572#endif 602#endif
573#ifdef CONFIG_PROC_SYSCTL 603#ifdef CONFIG_PROC_SYSCTL
@@ -621,7 +651,7 @@ static struct ctl_table kern_table[] = {
621#endif 651#endif
622 { 652 {
623 .procname = "userprocess_debug", 653 .procname = "userprocess_debug",
624 .data = &sysctl_userprocess_debug, 654 .data = &show_unhandled_signals,
625 .maxlen = sizeof(int), 655 .maxlen = sizeof(int),
626 .mode = 0644, 656 .mode = 0644,
627 .proc_handler = proc_dointvec, 657 .proc_handler = proc_dointvec,
@@ -682,7 +712,34 @@ static struct ctl_table kern_table[] = {
682 .mode = 0444, 712 .mode = 0444,
683 .proc_handler = proc_dointvec, 713 .proc_handler = proc_dointvec,
684 }, 714 },
685#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 715#if defined(CONFIG_LOCKUP_DETECTOR)
716 {
717 .procname = "watchdog",
718 .data = &watchdog_enabled,
719 .maxlen = sizeof (int),
720 .mode = 0644,
721 .proc_handler = proc_dowatchdog_enabled,
722 },
723 {
724 .procname = "watchdog_thresh",
725 .data = &softlockup_thresh,
726 .maxlen = sizeof(int),
727 .mode = 0644,
728 .proc_handler = proc_dowatchdog_thresh,
729 .extra1 = &neg_one,
730 .extra2 = &sixty,
731 },
732 {
733 .procname = "softlockup_panic",
734 .data = &softlockup_panic,
735 .maxlen = sizeof(int),
736 .mode = 0644,
737 .proc_handler = proc_dointvec_minmax,
738 .extra1 = &zero,
739 .extra2 = &one,
740 },
741#endif
742#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
686 { 743 {
687 .procname = "unknown_nmi_panic", 744 .procname = "unknown_nmi_panic",
688 .data = &unknown_nmi_panic, 745 .data = &unknown_nmi_panic,
@@ -785,26 +842,6 @@ static struct ctl_table kern_table[] = {
785 .proc_handler = proc_dointvec, 842 .proc_handler = proc_dointvec,
786 }, 843 },
787#endif 844#endif
788#ifdef CONFIG_DETECT_SOFTLOCKUP
789 {
790 .procname = "softlockup_panic",
791 .data = &softlockup_panic,
792 .maxlen = sizeof(int),
793 .mode = 0644,
794 .proc_handler = proc_dointvec_minmax,
795 .extra1 = &zero,
796 .extra2 = &one,
797 },
798 {
799 .procname = "softlockup_thresh",
800 .data = &softlockup_thresh,
801 .maxlen = sizeof(int),
802 .mode = 0644,
803 .proc_handler = proc_dosoftlockup_thresh,
804 .extra1 = &neg_one,
805 .extra2 = &sixty,
806 },
807#endif
808#ifdef CONFIG_DETECT_HUNG_TASK 845#ifdef CONFIG_DETECT_HUNG_TASK
809 { 846 {
810 .procname = "hung_task_panic", 847 .procname = "hung_task_panic",
@@ -878,13 +915,6 @@ static struct ctl_table kern_table[] = {
878 .proc_handler = proc_dointvec, 915 .proc_handler = proc_dointvec,
879 }, 916 },
880#endif 917#endif
881#ifdef CONFIG_SLOW_WORK
882 {
883 .procname = "slow-work",
884 .mode = 0555,
885 .child = slow_work_sysctls,
886 },
887#endif
888#ifdef CONFIG_PERF_EVENTS 918#ifdef CONFIG_PERF_EVENTS
889 { 919 {
890 .procname = "perf_event_paranoid", 920 .procname = "perf_event_paranoid",
@@ -1099,6 +1129,25 @@ static struct ctl_table vm_table[] = {
1099 .mode = 0644, 1129 .mode = 0644,
1100 .proc_handler = drop_caches_sysctl_handler, 1130 .proc_handler = drop_caches_sysctl_handler,
1101 }, 1131 },
1132#ifdef CONFIG_COMPACTION
1133 {
1134 .procname = "compact_memory",
1135 .data = &sysctl_compact_memory,
1136 .maxlen = sizeof(int),
1137 .mode = 0200,
1138 .proc_handler = sysctl_compaction_handler,
1139 },
1140 {
1141 .procname = "extfrag_threshold",
1142 .data = &sysctl_extfrag_threshold,
1143 .maxlen = sizeof(int),
1144 .mode = 0644,
1145 .proc_handler = sysctl_extfrag_handler,
1146 .extra1 = &min_extfrag_threshold,
1147 .extra2 = &max_extfrag_threshold,
1148 },
1149
1150#endif /* CONFIG_COMPACTION */
1102 { 1151 {
1103 .procname = "min_free_kbytes", 1152 .procname = "min_free_kbytes",
1104 .data = &min_free_kbytes, 1153 .data = &min_free_kbytes,
@@ -1423,6 +1472,14 @@ static struct ctl_table fs_table[] = {
1423 .child = binfmt_misc_table, 1472 .child = binfmt_misc_table,
1424 }, 1473 },
1425#endif 1474#endif
1475 {
1476 .procname = "pipe-max-size",
1477 .data = &pipe_max_size,
1478 .maxlen = sizeof(int),
1479 .mode = 0644,
1480 .proc_handler = &pipe_proc_fn,
1481 .extra1 = &pipe_min_size,
1482 },
1426/* 1483/*
1427 * NOTE: do not add new entries to this table unless you have read 1484 * NOTE: do not add new entries to this table unless you have read
1428 * Documentation/sysctl/ctl_unnumbered.txt 1485 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1431,7 +1488,8 @@ static struct ctl_table fs_table[] = {
1431}; 1488};
1432 1489
1433static struct ctl_table debug_table[] = { 1490static struct ctl_table debug_table[] = {
1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) 1491#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1492 defined(CONFIG_S390)
1435 { 1493 {
1436 .procname = "exception-trace", 1494 .procname = "exception-trace",
1437 .data = &show_unhandled_signals, 1495 .data = &show_unhandled_signals,
@@ -1655,10 +1713,7 @@ static __init int sysctl_init(void)
1655{ 1713{
1656 sysctl_set_parent(NULL, root_table); 1714 sysctl_set_parent(NULL, root_table);
1657#ifdef CONFIG_SYSCTL_SYSCALL_CHECK 1715#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1658 { 1716 sysctl_check_table(current->nsproxy, root_table);
1659 int err;
1660 err = sysctl_check_table(current->nsproxy, root_table);
1661 }
1662#endif 1717#endif
1663 return 0; 1718 return 0;
1664} 1719}
@@ -2040,8 +2095,132 @@ int proc_dostring(struct ctl_table *table, int write,
2040 buffer, lenp, ppos); 2095 buffer, lenp, ppos);
2041} 2096}
2042 2097
2098static size_t proc_skip_spaces(char **buf)
2099{
2100 size_t ret;
2101 char *tmp = skip_spaces(*buf);
2102 ret = tmp - *buf;
2103 *buf = tmp;
2104 return ret;
2105}
2106
2107static void proc_skip_char(char **buf, size_t *size, const char v)
2108{
2109 while (*size) {
2110 if (**buf != v)
2111 break;
2112 (*size)--;
2113 (*buf)++;
2114 }
2115}
2116
2117#define TMPBUFLEN 22
2118/**
2119 * proc_get_long - reads an ASCII formatted integer from a user buffer
2120 *
2121 * @buf: a kernel buffer
2122 * @size: size of the kernel buffer
2123 * @val: this is where the number will be stored
2124 * @neg: set to %TRUE if number is negative
2125 * @perm_tr: a vector which contains the allowed trailers
2126 * @perm_tr_len: size of the perm_tr vector
2127 * @tr: pointer to store the trailer character
2128 *
2129 * In case of success %0 is returned and @buf and @size are updated with
 2130 * the number of bytes read. If @tr is non-NULL and a trailing
2131 * character exists (size is non-zero after returning from this
2132 * function), @tr is updated with the trailing character.
2133 */
2134static int proc_get_long(char **buf, size_t *size,
2135 unsigned long *val, bool *neg,
2136 const char *perm_tr, unsigned perm_tr_len, char *tr)
2137{
2138 int len;
2139 char *p, tmp[TMPBUFLEN];
2140
2141 if (!*size)
2142 return -EINVAL;
2143
2144 len = *size;
2145 if (len > TMPBUFLEN - 1)
2146 len = TMPBUFLEN - 1;
2147
2148 memcpy(tmp, *buf, len);
2149
2150 tmp[len] = 0;
2151 p = tmp;
2152 if (*p == '-' && *size > 1) {
2153 *neg = true;
2154 p++;
2155 } else
2156 *neg = false;
2157 if (!isdigit(*p))
2158 return -EINVAL;
2159
2160 *val = simple_strtoul(p, &p, 0);
2161
2162 len = p - tmp;
2163
2164 /* We don't know if the next char is whitespace thus we may accept
2165 * invalid integers (e.g. 1234...a) or two integers instead of one
 2166 * (e.g. 123...1). So let's not allow such large numbers. */
2167 if (len == TMPBUFLEN - 1)
2168 return -EINVAL;
2169
2170 if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
2171 return -EINVAL;
2172
2173 if (tr && (len < *size))
2174 *tr = *p;
2175
2176 *buf += len;
2177 *size -= len;
2178
2179 return 0;
2180}
2181
2182/**
2183 * proc_put_long - converts an integer to a decimal ASCII formatted string
2184 *
2185 * @buf: the user buffer
2186 * @size: the size of the user buffer
2187 * @val: the integer to be converted
2188 * @neg: sign of the number, %TRUE for negative
2189 *
2190 * In case of success %0 is returned and @buf and @size are updated with
 2191 * the number of bytes written.
2192 */
2193static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
2194 bool neg)
2195{
2196 int len;
2197 char tmp[TMPBUFLEN], *p = tmp;
2198
2199 sprintf(p, "%s%lu", neg ? "-" : "", val);
2200 len = strlen(tmp);
2201 if (len > *size)
2202 len = *size;
2203 if (copy_to_user(*buf, tmp, len))
2204 return -EFAULT;
2205 *size -= len;
2206 *buf += len;
2207 return 0;
2208}
2209#undef TMPBUFLEN
2043 2210
2044static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 2211static int proc_put_char(void __user **buf, size_t *size, char c)
2212{
2213 if (*size) {
2214 char __user **buffer = (char __user **)buf;
2215 if (put_user(c, *buffer))
2216 return -EFAULT;
2217 (*size)--, (*buffer)++;
2218 *buf = *buffer;
2219 }
2220 return 0;
2221}
2222
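To make the input format these helpers accept concrete, here is a hedged, stand-alone user-space re-implementation of the core proc_get_long() rules: an optional leading '-', digits (base 0, so 0x is accepted), then at most one character from a caller-supplied trailer set. parse_ulong() is an illustrative name, not a kernel function, and the length cap on overlong numbers is omitted for brevity.

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_ulong(const char **buf, size_t *size, unsigned long *val,
		       bool *neg, const char *perm_tr, size_t perm_tr_len)
{
	const char *p = *buf;
	char *end;

	if (!*size)
		return -1;
	*neg = (*p == '-' && *size > 1);
	if (*neg)
		p++;
	if (!isdigit((unsigned char)*p))
		return -1;
	*val = strtoul(p, &end, 0);
	/* reject a disallowed trailer such as "12x", like the kernel code */
	if ((size_t)(end - *buf) < *size && *end &&
	    !memchr(perm_tr, *end, perm_tr_len))
		return -1;
	*size -= end - *buf;
	*buf = end;
	return 0;
}

int main(void)
{
	const char wspace[] = { ' ', '\t', '\n' };	/* like proc_wspace_sep */
	const char *input = "42 -7\n";
	size_t left = strlen(input);
	unsigned long v;
	bool neg;

	while (left && !parse_ulong(&input, &left, &v, &neg,
				    wspace, sizeof(wspace))) {
		printf("%s%lu\n", neg ? "-" : "", v);
		while (left && isspace((unsigned char)*input))
			input++, left--;
	}
	return 0;	/* prints "42" and "-7" */
}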
2223static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
2045 int *valp, 2224 int *valp,
2046 int write, void *data) 2225 int write, void *data)
2047{ 2226{
@@ -2050,33 +2229,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2050 } else { 2229 } else {
2051 int val = *valp; 2230 int val = *valp;
2052 if (val < 0) { 2231 if (val < 0) {
2053 *negp = -1; 2232 *negp = true;
2054 *lvalp = (unsigned long)-val; 2233 *lvalp = (unsigned long)-val;
2055 } else { 2234 } else {
2056 *negp = 0; 2235 *negp = false;
2057 *lvalp = (unsigned long)val; 2236 *lvalp = (unsigned long)val;
2058 } 2237 }
2059 } 2238 }
2060 return 0; 2239 return 0;
2061} 2240}
2062 2241
2242static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
2243
2063static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2244static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2064 int write, void __user *buffer, 2245 int write, void __user *buffer,
2065 size_t *lenp, loff_t *ppos, 2246 size_t *lenp, loff_t *ppos,
2066 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2247 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2067 int write, void *data), 2248 int write, void *data),
2068 void *data) 2249 void *data)
2069{ 2250{
2070#define TMPBUFLEN 21 2251 int *i, vleft, first = 1, err = 0;
2071 int *i, vleft, first = 1, neg; 2252 unsigned long page = 0;
2072 unsigned long lval; 2253 size_t left;
2073 size_t left, len; 2254 char *kbuf;
2074 2255
2075 char buf[TMPBUFLEN], *p; 2256 if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
2076 char __user *s = buffer;
2077
2078 if (!tbl_data || !table->maxlen || !*lenp ||
2079 (*ppos && !write)) {
2080 *lenp = 0; 2257 *lenp = 0;
2081 return 0; 2258 return 0;
2082 } 2259 }
@@ -2088,89 +2265,71 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2088 if (!conv) 2265 if (!conv)
2089 conv = do_proc_dointvec_conv; 2266 conv = do_proc_dointvec_conv;
2090 2267
2268 if (write) {
2269 if (left > PAGE_SIZE - 1)
2270 left = PAGE_SIZE - 1;
2271 page = __get_free_page(GFP_TEMPORARY);
2272 kbuf = (char *) page;
2273 if (!kbuf)
2274 return -ENOMEM;
2275 if (copy_from_user(kbuf, buffer, left)) {
2276 err = -EFAULT;
2277 goto free;
2278 }
2279 kbuf[left] = 0;
2280 }
2281
2091 for (; left && vleft--; i++, first=0) { 2282 for (; left && vleft--; i++, first=0) {
2283 unsigned long lval;
2284 bool neg;
2285
2092 if (write) { 2286 if (write) {
2093 while (left) { 2287 left -= proc_skip_spaces(&kbuf);
2094 char c; 2288
2095 if (get_user(c, s))
2096 return -EFAULT;
2097 if (!isspace(c))
2098 break;
2099 left--;
2100 s++;
2101 }
2102 if (!left) 2289 if (!left)
2103 break; 2290 break;
2104 neg = 0; 2291 err = proc_get_long(&kbuf, &left, &lval, &neg,
2105 len = left; 2292 proc_wspace_sep,
2106 if (len > sizeof(buf) - 1) 2293 sizeof(proc_wspace_sep), NULL);
2107 len = sizeof(buf) - 1; 2294 if (err)
2108 if (copy_from_user(buf, s, len))
2109 return -EFAULT;
2110 buf[len] = 0;
2111 p = buf;
2112 if (*p == '-' && left > 1) {
2113 neg = 1;
2114 p++;
2115 }
2116 if (*p < '0' || *p > '9')
2117 break; 2295 break;
2118 2296 if (conv(&neg, &lval, i, 1, data)) {
2119 lval = simple_strtoul(p, &p, 0); 2297 err = -EINVAL;
2120
2121 len = p-buf;
2122 if ((len < left) && *p && !isspace(*p))
2123 break;
2124 s += len;
2125 left -= len;
2126
2127 if (conv(&neg, &lval, i, 1, data))
2128 break; 2298 break;
2299 }
2129 } else { 2300 } else {
2130 p = buf; 2301 if (conv(&neg, &lval, i, 0, data)) {
2302 err = -EINVAL;
2303 break;
2304 }
2131 if (!first) 2305 if (!first)
2132 *p++ = '\t'; 2306 err = proc_put_char(&buffer, &left, '\t');
2133 2307 if (err)
2134 if (conv(&neg, &lval, i, 0, data)) 2308 break;
2309 err = proc_put_long(&buffer, &left, lval, neg);
2310 if (err)
2135 break; 2311 break;
2136
2137 sprintf(p, "%s%lu", neg ? "-" : "", lval);
2138 len = strlen(buf);
2139 if (len > left)
2140 len = left;
2141 if(copy_to_user(s, buf, len))
2142 return -EFAULT;
2143 left -= len;
2144 s += len;
2145 } 2312 }
2146 } 2313 }
2147 2314
2148 if (!write && !first && left) { 2315 if (!write && !first && left && !err)
2149 if(put_user('\n', s)) 2316 err = proc_put_char(&buffer, &left, '\n');
2150 return -EFAULT; 2317 if (write && !err && left)
2151 left--, s++; 2318 left -= proc_skip_spaces(&kbuf);
2152 } 2319free:
2153 if (write) { 2320 if (write) {
2154 while (left) { 2321 free_page(page);
2155 char c; 2322 if (first)
2156 if (get_user(c, s++)) 2323 return err ? : -EINVAL;
2157 return -EFAULT;
2158 if (!isspace(c))
2159 break;
2160 left--;
2161 }
2162 } 2324 }
2163 if (write && first)
2164 return -EINVAL;
2165 *lenp -= left; 2325 *lenp -= left;
2166 *ppos += *lenp; 2326 *ppos += *lenp;
2167 return 0; 2327 return err;
2168#undef TMPBUFLEN
2169} 2328}
2170 2329
2171static int do_proc_dointvec(struct ctl_table *table, int write, 2330static int do_proc_dointvec(struct ctl_table *table, int write,
2172 void __user *buffer, size_t *lenp, loff_t *ppos, 2331 void __user *buffer, size_t *lenp, loff_t *ppos,
2173 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2332 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2174 int write, void *data), 2333 int write, void *data),
2175 void *data) 2334 void *data)
2176{ 2335{
@@ -2238,8 +2397,8 @@ struct do_proc_dointvec_minmax_conv_param {
2238 int *max; 2397 int *max;
2239}; 2398};
2240 2399
2241static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 2400static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2242 int *valp, 2401 int *valp,
2243 int write, void *data) 2402 int write, void *data)
2244{ 2403{
2245 struct do_proc_dointvec_minmax_conv_param *param = data; 2404 struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2252,10 +2411,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2252 } else { 2411 } else {
2253 int val = *valp; 2412 int val = *valp;
2254 if (val < 0) { 2413 if (val < 0) {
2255 *negp = -1; 2414 *negp = true;
2256 *lvalp = (unsigned long)-val; 2415 *lvalp = (unsigned long)-val;
2257 } else { 2416 } else {
2258 *negp = 0; 2417 *negp = false;
2259 *lvalp = (unsigned long)val; 2418 *lvalp = (unsigned long)val;
2260 } 2419 }
2261 } 2420 }
@@ -2295,102 +2454,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2295 unsigned long convmul, 2454 unsigned long convmul,
2296 unsigned long convdiv) 2455 unsigned long convdiv)
2297{ 2456{
2298#define TMPBUFLEN 21 2457 unsigned long *i, *min, *max;
2299 unsigned long *i, *min, *max, val; 2458 int vleft, first = 1, err = 0;
2300 int vleft, first=1, neg; 2459 unsigned long page = 0;
2301 size_t len, left; 2460 size_t left;
2302 char buf[TMPBUFLEN], *p; 2461 char *kbuf;
2303 char __user *s = buffer; 2462
2304 2463 if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
2305 if (!data || !table->maxlen || !*lenp ||
2306 (*ppos && !write)) {
2307 *lenp = 0; 2464 *lenp = 0;
2308 return 0; 2465 return 0;
2309 } 2466 }
2310 2467
2311 i = (unsigned long *) data; 2468 i = (unsigned long *) data;
2312 min = (unsigned long *) table->extra1; 2469 min = (unsigned long *) table->extra1;
2313 max = (unsigned long *) table->extra2; 2470 max = (unsigned long *) table->extra2;
2314 vleft = table->maxlen / sizeof(unsigned long); 2471 vleft = table->maxlen / sizeof(unsigned long);
2315 left = *lenp; 2472 left = *lenp;
2316 2473
2317 for (; left && vleft--; i++, min++, max++, first=0) { 2474 if (write) {
2475 if (left > PAGE_SIZE - 1)
2476 left = PAGE_SIZE - 1;
2477 page = __get_free_page(GFP_TEMPORARY);
2478 kbuf = (char *) page;
2479 if (!kbuf)
2480 return -ENOMEM;
2481 if (copy_from_user(kbuf, buffer, left)) {
2482 err = -EFAULT;
2483 goto free;
2484 }
2485 kbuf[left] = 0;
2486 }
2487
2488 for (; left && vleft--; i++, first = 0) {
2489 unsigned long val;
2490
2318 if (write) { 2491 if (write) {
2319 while (left) { 2492 bool neg;
2320 char c; 2493
2321 if (get_user(c, s)) 2494 left -= proc_skip_spaces(&kbuf);
2322 return -EFAULT; 2495
2323 if (!isspace(c)) 2496 err = proc_get_long(&kbuf, &left, &val, &neg,
2324 break; 2497 proc_wspace_sep,
2325 left--; 2498 sizeof(proc_wspace_sep), NULL);
2326 s++; 2499 if (err)
2327 }
2328 if (!left)
2329 break;
2330 neg = 0;
2331 len = left;
2332 if (len > TMPBUFLEN-1)
2333 len = TMPBUFLEN-1;
2334 if (copy_from_user(buf, s, len))
2335 return -EFAULT;
2336 buf[len] = 0;
2337 p = buf;
2338 if (*p == '-' && left > 1) {
2339 neg = 1;
2340 p++;
2341 }
2342 if (*p < '0' || *p > '9')
2343 break;
2344 val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
2345 len = p-buf;
2346 if ((len < left) && *p && !isspace(*p))
2347 break; 2500 break;
2348 if (neg) 2501 if (neg)
2349 val = -val;
2350 s += len;
2351 left -= len;
2352
2353 if(neg)
2354 continue; 2502 continue;
2355 if ((min && val < *min) || (max && val > *max)) 2503 if ((min && val < *min) || (max && val > *max))
2356 continue; 2504 continue;
2357 *i = val; 2505 *i = val;
2358 } else { 2506 } else {
2359 p = buf; 2507 val = convdiv * (*i) / convmul;
2360 if (!first) 2508 if (!first)
2361 *p++ = '\t'; 2509 err = proc_put_char(&buffer, &left, '\t');
2362 sprintf(p, "%lu", convdiv * (*i) / convmul); 2510 err = proc_put_long(&buffer, &left, val, false);
2363 len = strlen(buf); 2511 if (err)
2364 if (len > left) 2512 break;
2365 len = left;
2366 if(copy_to_user(s, buf, len))
2367 return -EFAULT;
2368 left -= len;
2369 s += len;
2370 } 2513 }
2371 } 2514 }
2372 2515
2373 if (!write && !first && left) { 2516 if (!write && !first && left && !err)
2374 if(put_user('\n', s)) 2517 err = proc_put_char(&buffer, &left, '\n');
2375 return -EFAULT; 2518 if (write && !err)
2376 left--, s++; 2519 left -= proc_skip_spaces(&kbuf);
2377 } 2520free:
2378 if (write) { 2521 if (write) {
2379 while (left) { 2522 free_page(page);
2380 char c; 2523 if (first)
2381 if (get_user(c, s++)) 2524 return err ? : -EINVAL;
2382 return -EFAULT;
2383 if (!isspace(c))
2384 break;
2385 left--;
2386 }
2387 } 2525 }
2388 if (write && first)
2389 return -EINVAL;
2390 *lenp -= left; 2526 *lenp -= left;
2391 *ppos += *lenp; 2527 *ppos += *lenp;
2392 return 0; 2528 return err;
2393#undef TMPBUFLEN
2394} 2529}
2395 2530
2396static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2531static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2451,7 +2586,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2451} 2586}
2452 2587
2453 2588
2454static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, 2589static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2455 int *valp, 2590 int *valp,
2456 int write, void *data) 2591 int write, void *data)
2457{ 2592{
@@ -2463,10 +2598,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2463 int val = *valp; 2598 int val = *valp;
2464 unsigned long lval; 2599 unsigned long lval;
2465 if (val < 0) { 2600 if (val < 0) {
2466 *negp = -1; 2601 *negp = true;
2467 lval = (unsigned long)-val; 2602 lval = (unsigned long)-val;
2468 } else { 2603 } else {
2469 *negp = 0; 2604 *negp = false;
2470 lval = (unsigned long)val; 2605 lval = (unsigned long)val;
2471 } 2606 }
2472 *lvalp = lval / HZ; 2607 *lvalp = lval / HZ;
@@ -2474,7 +2609,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2474 return 0; 2609 return 0;
2475} 2610}
2476 2611
2477static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, 2612static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
2478 int *valp, 2613 int *valp,
2479 int write, void *data) 2614 int write, void *data)
2480{ 2615{
@@ -2486,10 +2621,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2486 int val = *valp; 2621 int val = *valp;
2487 unsigned long lval; 2622 unsigned long lval;
2488 if (val < 0) { 2623 if (val < 0) {
2489 *negp = -1; 2624 *negp = true;
2490 lval = (unsigned long)-val; 2625 lval = (unsigned long)-val;
2491 } else { 2626 } else {
2492 *negp = 0; 2627 *negp = false;
2493 lval = (unsigned long)val; 2628 lval = (unsigned long)val;
2494 } 2629 }
2495 *lvalp = jiffies_to_clock_t(lval); 2630 *lvalp = jiffies_to_clock_t(lval);
@@ -2497,7 +2632,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2497 return 0; 2632 return 0;
2498} 2633}
2499 2634
2500static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, 2635static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2501 int *valp, 2636 int *valp,
2502 int write, void *data) 2637 int write, void *data)
2503{ 2638{
@@ -2507,10 +2642,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2507 int val = *valp; 2642 int val = *valp;
2508 unsigned long lval; 2643 unsigned long lval;
2509 if (val < 0) { 2644 if (val < 0) {
2510 *negp = -1; 2645 *negp = true;
2511 lval = (unsigned long)-val; 2646 lval = (unsigned long)-val;
2512 } else { 2647 } else {
2513 *negp = 0; 2648 *negp = false;
2514 lval = (unsigned long)val; 2649 lval = (unsigned long)val;
2515 } 2650 }
2516 *lvalp = jiffies_to_msecs(lval); 2651 *lvalp = jiffies_to_msecs(lval);
@@ -2607,6 +2742,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
2607 return 0; 2742 return 0;
2608} 2743}
2609 2744
2745/**
2746 * proc_do_large_bitmap - read/write from/to a large bitmap
2747 * @table: the sysctl table
2748 * @write: %TRUE if this is a write to the sysctl file
2749 * @buffer: the user buffer
2750 * @lenp: the size of the user buffer
2751 * @ppos: file position
2752 *
2753 * The bitmap is stored at table->data and the bitmap length (in bits)
2754 * in table->maxlen.
2755 *
 2756 * We use a comma-separated range format (e.g. 1,3-4,10-10) so that
2757 * large bitmaps may be represented in a compact manner. Writing into
2758 * the file will clear the bitmap then update it with the given input.
2759 *
2760 * Returns 0 on success.
2761 */
2762int proc_do_large_bitmap(struct ctl_table *table, int write,
2763 void __user *buffer, size_t *lenp, loff_t *ppos)
2764{
2765 int err = 0;
2766 bool first = 1;
2767 size_t left = *lenp;
2768 unsigned long bitmap_len = table->maxlen;
2769 unsigned long *bitmap = (unsigned long *) table->data;
2770 unsigned long *tmp_bitmap = NULL;
2771 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2772
2773 if (!bitmap_len || !left || (*ppos && !write)) {
2774 *lenp = 0;
2775 return 0;
2776 }
2777
2778 if (write) {
2779 unsigned long page = 0;
2780 char *kbuf;
2781
2782 if (left > PAGE_SIZE - 1)
2783 left = PAGE_SIZE - 1;
2784
2785 page = __get_free_page(GFP_TEMPORARY);
2786 kbuf = (char *) page;
2787 if (!kbuf)
2788 return -ENOMEM;
2789 if (copy_from_user(kbuf, buffer, left)) {
2790 free_page(page);
2791 return -EFAULT;
2792 }
2793 kbuf[left] = 0;
2794
2795 tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
2796 GFP_KERNEL);
2797 if (!tmp_bitmap) {
2798 free_page(page);
2799 return -ENOMEM;
2800 }
2801 proc_skip_char(&kbuf, &left, '\n');
2802 while (!err && left) {
2803 unsigned long val_a, val_b;
2804 bool neg;
2805
2806 err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
2807 sizeof(tr_a), &c);
2808 if (err)
2809 break;
2810 if (val_a >= bitmap_len || neg) {
2811 err = -EINVAL;
2812 break;
2813 }
2814
2815 val_b = val_a;
2816 if (left) {
2817 kbuf++;
2818 left--;
2819 }
2820
2821 if (c == '-') {
2822 err = proc_get_long(&kbuf, &left, &val_b,
2823 &neg, tr_b, sizeof(tr_b),
2824 &c);
2825 if (err)
2826 break;
2827 if (val_b >= bitmap_len || neg ||
2828 val_a > val_b) {
2829 err = -EINVAL;
2830 break;
2831 }
2832 if (left) {
2833 kbuf++;
2834 left--;
2835 }
2836 }
2837
2838 while (val_a <= val_b)
2839 set_bit(val_a++, tmp_bitmap);
2840
2841 first = 0;
2842 proc_skip_char(&kbuf, &left, '\n');
2843 }
2844 free_page(page);
2845 } else {
2846 unsigned long bit_a, bit_b = 0;
2847
2848 while (left) {
2849 bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
2850 if (bit_a >= bitmap_len)
2851 break;
2852 bit_b = find_next_zero_bit(bitmap, bitmap_len,
2853 bit_a + 1) - 1;
2854
2855 if (!first) {
2856 err = proc_put_char(&buffer, &left, ',');
2857 if (err)
2858 break;
2859 }
2860 err = proc_put_long(&buffer, &left, bit_a, false);
2861 if (err)
2862 break;
2863 if (bit_a != bit_b) {
2864 err = proc_put_char(&buffer, &left, '-');
2865 if (err)
2866 break;
2867 err = proc_put_long(&buffer, &left, bit_b, false);
2868 if (err)
2869 break;
2870 }
2871
2872 first = 0; bit_b++;
2873 }
2874 if (!err)
2875 err = proc_put_char(&buffer, &left, '\n');
2876 }
2877
2878 if (!err) {
2879 if (write) {
2880 if (*ppos)
2881 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2882 else
2883 memcpy(bitmap, tmp_bitmap,
2884 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2885 }
2886 kfree(tmp_bitmap);
2887 *lenp -= left;
2888 *ppos += *lenp;
2889 return 0;
2890 } else {
2891 kfree(tmp_bitmap);
2892 return err;
2893 }
2894}
2895
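The range syntax this handler accepts and emits can be exercised from user space as below. No caller of proc_do_large_bitmap() is visible in this hunk, so the sysctl path is a placeholder; substitute whatever file is actually wired to the handler.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *ranges = "0,3-5,128\n";	/* sets bits 0, 3, 4, 5 and 128 */
	int fd = open("/proc/sys/SOME/bitmap_file", O_WRONLY);	/* placeholder path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, ranges, strlen(ranges)) < 0)
		perror("write");
	close(fd);
	return 0;
}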
2610#else /* CONFIG_PROC_FS */ 2896#else /* CONFIG_PROC_FS */
2611 2897
2612int proc_dostring(struct ctl_table *table, int write, 2898int proc_dostring(struct ctl_table *table, int write,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 59030570f5ca..1357c5786064 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/kernel.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17 18
18#ifdef CONFIG_SYSCTL_SYSCALL 19#ifdef CONFIG_SYSCTL_SYSCALL
@@ -224,7 +225,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
224 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, 225 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, 226 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
226 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, 227 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
227 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
228 {} 228 {}
229}; 229};
230 230
@@ -1125,11 +1125,6 @@ out:
1125 return result; 1125 return result;
1126} 1126}
1127 1127
1128static unsigned hex_value(int ch)
1129{
1130 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1131}
1132
1133static ssize_t bin_uuid(struct file *file, 1128static ssize_t bin_uuid(struct file *file,
1134 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1129 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1135{ 1130{
@@ -1157,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file,
1157 if (!isxdigit(str[0]) || !isxdigit(str[1])) 1152 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1158 goto out; 1153 goto out;
1159 1154
1160 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); 1155 uuid[i] = (hex_to_bin(str[0]) << 4) |
1156 hex_to_bin(str[1]);
1161 str += 2; 1157 str += 2;
1162 if (*str == '-') 1158 if (*str == '-')
1163 str++; 1159 str++;
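The replacement drops the local hex_value() helper in favour of hex_to_bin() from <linux/kernel.h>. Below is a hedged, stand-alone user-space analogue of the two-digit byte assembly; hex_digit() only mimics hex_to_bin() for valid input and is not the kernel helper.

#include <ctype.h>
#include <stdio.h>

static int hex_digit(char ch)
{
	if (isdigit((unsigned char)ch))
		return ch - '0';
	return (tolower((unsigned char)ch) - 'a') + 10;
}

int main(void)
{
	const char *str = "3f";
	unsigned char byte = (hex_digit(str[0]) << 4) | hex_digit(str[1]);

	printf("0x%02x\n", byte);	/* prints 0x3f */
	return 0;
}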
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 04cdcf72c827..10b90d8a03c4 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
143 if (!table->maxlen) 143 if (!table->maxlen)
144 set_fail(&fail, table, "No maxlen"); 144 set_fail(&fail, table, "No maxlen");
145 } 145 }
146 if ((table->proc_handler == proc_doulongvec_minmax) ||
147 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
148 if (table->maxlen > sizeof (unsigned long)) {
149 if (!table->extra1)
150 set_fail(&fail, table, "No min");
151 if (!table->extra2)
152 set_fail(&fail, table, "No max");
153 }
154 }
155#ifdef CONFIG_PROC_SYSCTL 146#ifdef CONFIG_PROC_SYSCTL
156 if (table->procname && !table->proc_handler) 147 if (table->procname && !table->proc_handler)
157 set_fail(&fail, table, "No proc_handler"); 148 set_fail(&fail, table, "No proc_handler");
diff --git a/kernel/time.c b/kernel/time.c
index 656dccfe1cbb..ba9b338d1835 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -132,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
132 */ 132 */
133static inline void warp_clock(void) 133static inline void warp_clock(void)
134{ 134{
135 write_seqlock_irq(&xtime_lock); 135 struct timespec adjust;
136 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 136
137 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 137 adjust = current_kernel_time();
138 update_xtime_cache(0); 138 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
139 write_sequnlock_irq(&xtime_lock); 139 do_settimeofday(&adjust);
140 clock_was_set();
141} 140}
142 141
143/* 142/*
@@ -301,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
301} 300}
302EXPORT_SYMBOL(timespec_trunc); 301EXPORT_SYMBOL(timespec_trunc);
303 302
304#ifndef CONFIG_GENERIC_TIME
305/*
306 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
307 * and therefore only yields usec accuracy
308 */
309void getnstimeofday(struct timespec *tv)
310{
311 struct timeval x;
312
313 do_gettimeofday(&x);
314 tv->tv_sec = x.tv_sec;
315 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
316}
317EXPORT_SYMBOL_GPL(getnstimeofday);
318#endif
319
320/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 303/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
321 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 304 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
322 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. 305 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 95ed42951e0a..f06a8a365648 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -6,7 +6,7 @@ config TICK_ONESHOT
6 6
7config NO_HZ 7config NO_HZ
8 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
9 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 9 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
10 select TICK_ONESHOT 10 select TICK_ONESHOT
11 help 11 help
12 This option enables a tickless system: timer interrupts will 12 This option enables a tickless system: timer interrupts will
@@ -15,7 +15,7 @@ config NO_HZ
15 15
16config HIGH_RES_TIMERS 16config HIGH_RES_TIMERS
17 bool "High Resolution Timer Support" 17 bool "High Resolution Timer Support"
18 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 18 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
19 select TICK_ONESHOT 19 select TICK_ONESHOT
20 help 20 help
21 This option enables high resolution timer support. If your 21 This option enables high resolution timer support. If your
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f5dde637457..c18d7efa1b4b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 return max_nsecs - (max_nsecs >> 5); 531 return max_nsecs - (max_nsecs >> 5);
532} 532}
533 533
534#ifdef CONFIG_GENERIC_TIME 534#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
535 535
536/** 536/**
537 * clocksource_select - Select the best clocksource available 537 * clocksource_select - Select the best clocksource available
@@ -577,7 +577,7 @@ static void clocksource_select(void)
577 } 577 }
578} 578}
579 579
580#else /* CONFIG_GENERIC_TIME */ 580#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
581 581
582static inline void clocksource_select(void) { } 582static inline void clocksource_select(void) { }
583 583
@@ -625,6 +625,73 @@ static void clocksource_enqueue(struct clocksource *cs)
625 list_add(&cs->list, entry); 625 list_add(&cs->list, entry);
626} 626}
627 627
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (ie: clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
641/**
642 * __clocksource_updatefreq_scale - Used update clocksource with new freq
643 * @t: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale
646 *
647 * This should only be called from the clocksource->enable() method.
648 *
649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
651 */
652void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653{
654 /*
655 * Ideally we want to use some of the limits used in
656 * clocksource_max_deferment, to provide a more informed
657 * MAX_UPDATE_LENGTH. But for now this just gets the
658 * register interface working properly.
659 */
660 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
661 NSEC_PER_SEC/scale,
662 MAX_UPDATE_LENGTH*scale);
663 cs->max_idle_ns = clocksource_max_deferment(cs);
664}
665EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
666
667/**
668 * __clocksource_register_scale - Used to install new clocksources
669 * @t: clocksource to be registered
670 * @scale: Scale factor multiplied against freq to get clocksource hz
671 * @freq: clocksource frequency (cycles per second) divided by scale
672 *
673 * Returns -EBUSY if registration fails, zero otherwise.
674 *
675 * This *SHOULD NOT* be called directly! Please use the
676 * clocksource_register_hz() or clocksource_register_khz helper functions.
677 */
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{
680
 681	/* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq);
683
 685	/* Add clocksource to the clocksource list */
685 mutex_lock(&clocksource_mutex);
686 clocksource_enqueue(cs);
687 clocksource_select();
688 clocksource_enqueue_watchdog(cs);
689 mutex_unlock(&clocksource_mutex);
690 return 0;
691}
692EXPORT_SYMBOL_GPL(__clocksource_register_scale);
693
694
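A hedged sketch of how a clocksource driver would use the new registration path: hand the counter frequency to clocksource_register_hz() (which wraps __clocksource_register_scale() above) and let the core compute mult/shift and max_idle_ns. The device name, register offset, base pointer and 24 MHz rate are all illustrative.

#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/io.h>

#define EXAMPLE_TIMER_COUNT	0x04		/* hypothetical counter register */

static void __iomem *example_timer_base;	/* assumed mapped during probe */

static cycle_t example_timer_read(struct clocksource *cs)
{
	return (cycle_t)readl(example_timer_base + EXAMPLE_TIMER_COUNT);
}

static struct clocksource example_clocksource = {
	.name	= "example-timer",
	.rating	= 300,
	.read	= example_timer_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_clocksource_init(void)
{
	/* no manual clocksource_hz2mult()/shift bookkeeping in the driver */
	return clocksource_register_hz(&example_clocksource, 24000000);
}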
628/** 695/**
629 * clocksource_register - Used to install new clocksources 696 * clocksource_register - Used to install new clocksources
630 * @t: clocksource to be registered 697 * @t: clocksource to be registered
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7c0f180d6e9d..c63116863a80 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -69,7 +69,7 @@ static s64 time_freq;
69/* time at last adjustment (secs): */ 69/* time at last adjustment (secs): */
70static long time_reftime; 70static long time_reftime;
71 71
72long time_adjust; 72static long time_adjust;
73 73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 75static s64 ntp_tick_adj;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..48b2761b5668 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
188 /* 188 /*
189 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
190 * periodic mode. We read dev->next_event first and add to it 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event() 191 * when the event already expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really 192 * sets dev->next_event only when the event is really
193 * programmed to the device. 193 * programmed to the device.
194 */ 194 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0adc54bd7c7c..bb2d8b7850a3 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,35 +150,65 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per cpu time idle statistics counters
155 */
156static void
157update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu(cpu) > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(cpu, ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
164} 182}
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(cpu, ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
 203 * Return the cumulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/*
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
 230 * Return the cumulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
250
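A hedged fragment showing the intended call pattern for the pair of accessors above (cpufreq governors are the sort of caller the iowait variant was added for); sampling intervals, storing previous readings and the busy-time arithmetic are left to the caller.

#include <linux/kernel.h>
#include <linux/tick.h>

static void example_sample_cpu(int cpu)
{
	u64 wall_us, idle_us, iowait_us;

	idle_us = get_cpu_idle_time_us(cpu, &wall_us);	/* includes iowait time */
	iowait_us = get_cpu_iowait_time_us(cpu, NULL);

	if (idle_us == (u64)-1)				/* NOHZ not enabled */
		return;

	pr_info("cpu%d: idle=%llu us (iowait=%llu us) at %llu us\n",
		cpu, (unsigned long long)idle_us,
		(unsigned long long)iowait_us,
		(unsigned long long)wall_us);
}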
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -231,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle)
231 */ 284 */
232 ts->inidle = 1; 285 ts->inidle = 1;
233 286
234 now = tick_nohz_start_idle(ts); 287 now = tick_nohz_start_idle(cpu, ts);
235 288
236 /* 289 /*
237 * If this cpu is offline and it is the one which updates 290 * If this cpu is offline and it is the one which updates
@@ -352,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
352 * the scheduler tick in nohz_restart_sched_tick. 405 * the scheduler tick in nohz_restart_sched_tick.
353 */ 406 */
354 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
355 if (select_nohz_load_balancer(1)) { 408 select_nohz_load_balancer(1);
356 /*
357 * sched tick not stopped!
358 */
359 cpumask_clear_cpu(cpu, nohz_cpu_mask);
360 goto out;
361 }
362 409
363 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
364 ts->tick_stopped = 1; 411 ts->tick_stopped = 1;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 39f6177fafac..49010d822f72 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
153 * - wall_to_monotonic is no longer the boot time, getboottime must be 153 * - wall_to_monotonic is no longer the boot time, getboottime must be
154 * used instead. 154 * used instead.
155 */ 155 */
156struct timespec xtime __attribute__ ((aligned (16))); 156static struct timespec xtime __attribute__ ((aligned (16)));
157struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 157static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
158static struct timespec total_sleep_time; 158static struct timespec total_sleep_time;
159 159
160/* 160/*
@@ -165,23 +165,15 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
178 xtime.tv_sec += leapsecond; 171 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond; 172 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 173 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
174 timekeeper.mult);
181} 175}
182 176
183#ifdef CONFIG_GENERIC_TIME
184
185/** 177/**
186 * timekeeping_forward_now - update clock to the current time 178 * timekeeping_forward_now - update clock to the current time
187 * 179 *
@@ -332,12 +324,11 @@ int do_settimeofday(struct timespec *tv)
332 324
333 xtime = *tv; 325 xtime = *tv;
334 326
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 327 timekeeper.ntp_error = 0;
338 ntp_clear(); 328 ntp_clear();
339 329
340 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 330 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
331 timekeeper.mult);
341 332
342 write_sequnlock_irqrestore(&xtime_lock, flags); 333 write_sequnlock_irqrestore(&xtime_lock, flags);
343 334
@@ -385,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock)
385 tick_clock_notify(); 376 tick_clock_notify();
386} 377}
387 378
388#else /* GENERIC_TIME */
389
390static inline void timekeeping_forward_now(void) { }
391
392/**
393 * ktime_get - get the monotonic time in ktime_t format
394 *
395 * returns the time in ktime_t format
396 */
397ktime_t ktime_get(void)
398{
399 struct timespec now;
400
401 ktime_get_ts(&now);
402
403 return timespec_to_ktime(now);
404}
405EXPORT_SYMBOL_GPL(ktime_get);
406
407/**
408 * ktime_get_ts - get the monotonic clock in timespec format
409 * @ts: pointer to timespec variable
410 *
411 * The function calculates the monotonic clock from the realtime
412 * clock and the wall_to_monotonic offset and stores the result
413 * in normalized timespec format in the variable pointed to by @ts.
414 */
415void ktime_get_ts(struct timespec *ts)
416{
417 struct timespec tomono;
418 unsigned long seq;
419
420 do {
421 seq = read_seqbegin(&xtime_lock);
422 getnstimeofday(ts);
423 tomono = wall_to_monotonic;
424
425 } while (read_seqretry(&xtime_lock, seq));
426
427 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
428 ts->tv_nsec + tomono.tv_nsec);
429}
430EXPORT_SYMBOL_GPL(ktime_get_ts);
431
432#endif /* !GENERIC_TIME */
433
434/** 379/**
435 * ktime_get_real - get the real (wall-) time in ktime_t format 380 * ktime_get_real - get the real (wall-) time in ktime_t format
436 * 381 *
@@ -559,7 +504,6 @@ void __init timekeeping_init(void)
559 } 504 }
560 set_normalized_timespec(&wall_to_monotonic, 505 set_normalized_timespec(&wall_to_monotonic,
561 -boot.tv_sec, -boot.tv_nsec); 506 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
563 total_sleep_time.tv_sec = 0; 507 total_sleep_time.tv_sec = 0;
564 total_sleep_time.tv_nsec = 0; 508 total_sleep_time.tv_nsec = 0;
565 write_sequnlock_irqrestore(&xtime_lock, flags); 509 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -589,11 +533,10 @@ static int timekeeping_resume(struct sys_device *dev)
589 533
590 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 534 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
591 ts = timespec_sub(ts, timekeeping_suspend_time); 535 ts = timespec_sub(ts, timekeeping_suspend_time);
592 xtime = timespec_add_safe(xtime, ts); 536 xtime = timespec_add(xtime, ts);
593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 537 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
594 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 538 total_sleep_time = timespec_add(total_sleep_time, ts);
595 } 539 }
596 update_xtime_cache(0);
597 /* re-base the last cycle value */ 540 /* re-base the last cycle value */
598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 541 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
599 timekeeper.ntp_error = 0; 542 timekeeper.ntp_error = 0;
@@ -747,6 +690,7 @@ static void timekeeping_adjust(s64 offset)
747static cycle_t logarithmic_accumulation(cycle_t offset, int shift) 690static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
748{ 691{
749 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 692 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
693 u64 raw_nsecs;
750 694
 751 /* If the offset is smaller than a shifted interval, do nothing */ 695 /* If the offset is smaller than a shifted interval, do nothing */
752 if (offset < timekeeper.cycle_interval<<shift) 696 if (offset < timekeeper.cycle_interval<<shift)
@@ -763,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
763 second_overflow(); 707 second_overflow();
764 } 708 }
765 709
766 /* Accumulate into raw time */ 710 /* Accumulate raw time */
767 raw_time.tv_nsec += timekeeper.raw_interval << shift;; 711 raw_nsecs = timekeeper.raw_interval << shift;
768 while (raw_time.tv_nsec >= NSEC_PER_SEC) { 712 raw_nsecs += raw_time.tv_nsec;
769 raw_time.tv_nsec -= NSEC_PER_SEC; 713 if (raw_nsecs >= NSEC_PER_SEC) {
770 raw_time.tv_sec++; 714 u64 raw_secs = raw_nsecs;
715 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
716 raw_time.tv_sec += raw_secs;
771 } 717 }
718 raw_time.tv_nsec = raw_nsecs;
772 719
773 /* Accumulate error between NTP and clock interval */ 720 /* Accumulate error between NTP and clock interval */
774 timekeeper.ntp_error += tick_length << shift; 721 timekeeper.ntp_error += tick_length << shift;
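The rewritten accumulation replaces the one-second-at-a-time decrement loop with a single division once the shifted raw interval overflows a second. A small user-space illustration of the arithmetic (the numbers are made up):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	/* shifted raw interval plus the previous tv_nsec remainder */
	uint64_t raw_nsecs = 2300000000ULL + 900000000ULL;
	uint64_t secs  = raw_nsecs / 1000000000ULL;	/* do_div() in the kernel */
	uint64_t nsecs = raw_nsecs % 1000000000ULL;

	printf("carry %" PRIu64 " s, remainder %" PRIu64 " ns\n", secs, nsecs);
	return 0;	/* carry 3 s, remainder 200000000 ns */
}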
@@ -788,7 +735,6 @@ void update_wall_time(void)
788{ 735{
789 struct clocksource *clock; 736 struct clocksource *clock;
790 cycle_t offset; 737 cycle_t offset;
791 u64 nsecs;
792 int shift = 0, maxshift; 738 int shift = 0, maxshift;
793 739
794 /* Make sure we're fully resumed: */ 740 /* Make sure we're fully resumed: */
@@ -796,10 +742,11 @@ void update_wall_time(void)
796 return; 742 return;
797 743
798 clock = timekeeper.clock; 744 clock = timekeeper.clock;
799#ifdef CONFIG_GENERIC_TIME 745
800 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 746#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
801#else
802 offset = timekeeper.cycle_interval; 747 offset = timekeeper.cycle_interval;
748#else
749 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
803#endif 750#endif
804 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 751 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
805 752
@@ -847,7 +794,9 @@ void update_wall_time(void)
847 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; 794 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
848 } 795 }
849 796
850 /* store full nanoseconds into xtime after rounding it up and 797
798 /*
799 * Store full nanoseconds into xtime after rounding it up and
851 * add the remainder to the error difference. 800 * add the remainder to the error difference.
852 */ 801 */
853 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 802 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
@@ -855,11 +804,19 @@ void update_wall_time(void)
855 timekeeper.ntp_error += timekeeper.xtime_nsec << 804 timekeeper.ntp_error += timekeeper.xtime_nsec <<
856 timekeeper.ntp_error_shift; 805 timekeeper.ntp_error_shift;
857 806
858 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); 807 /*
859 update_xtime_cache(nsecs); 808 * Finally, make sure that after the rounding
809 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
810 */
811 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
812 xtime.tv_nsec -= NSEC_PER_SEC;
813 xtime.tv_sec++;
814 second_overflow();
815 }
860 816
861 /* check to see if there is a new clocksource to use */ 817 /* check to see if there is a new clocksource to use */
862 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 818 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
819 timekeeper.mult);
863} 820}
864 821
865/** 822/**
@@ -890,19 +847,24 @@ EXPORT_SYMBOL_GPL(getboottime);
890 */ 847 */
891void monotonic_to_bootbased(struct timespec *ts) 848void monotonic_to_bootbased(struct timespec *ts)
892{ 849{
893 *ts = timespec_add_safe(*ts, total_sleep_time); 850 *ts = timespec_add(*ts, total_sleep_time);
894} 851}
895EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 852EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
896 853
897unsigned long get_seconds(void) 854unsigned long get_seconds(void)
898{ 855{
899 return xtime_cache.tv_sec; 856 return xtime.tv_sec;
900} 857}
901EXPORT_SYMBOL(get_seconds); 858EXPORT_SYMBOL(get_seconds);
902 859
903struct timespec __current_kernel_time(void) 860struct timespec __current_kernel_time(void)
904{ 861{
905 return xtime_cache; 862 return xtime;
863}
864
865struct timespec __get_wall_to_monotonic(void)
866{
867 return wall_to_monotonic;
906} 868}
907 869
908struct timespec current_kernel_time(void) 870struct timespec current_kernel_time(void)
@@ -913,7 +875,7 @@ struct timespec current_kernel_time(void)
913 do { 875 do {
914 seq = read_seqbegin(&xtime_lock); 876 seq = read_seqbegin(&xtime_lock);
915 877
916 now = xtime_cache; 878 now = xtime;
917 } while (read_seqretry(&xtime_lock, seq)); 879 } while (read_seqretry(&xtime_lock, seq));
918 880
919 return now; 881 return now;
@@ -928,7 +890,7 @@ struct timespec get_monotonic_coarse(void)
928 do { 890 do {
929 seq = read_seqbegin(&xtime_lock); 891 seq = read_seqbegin(&xtime_lock);
930 892
931 now = xtime_cache; 893 now = xtime;
932 mono = wall_to_monotonic; 894 mono = wall_to_monotonic;
933 } while (read_seqretry(&xtime_lock, seq)); 895 } while (read_seqretry(&xtime_lock, seq));
934 896
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd78777..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
diff --git a/kernel/timer.c b/kernel/timer.c
index aeb6a54f2771..97bf05baade7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/* 91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of 92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB for 93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * the new flag to indicate whether the timer is deferrable 94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
95 */ 100 */
96#define TBASE_DEFERRABLE_FLAG (0x1) 101#define TBASE_DEFERRABLE_FLAG (0x1)
97 102
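The expanded comment above describes deferrable timers. A minimal module sketch of one, assuming the init_timer_deferrable() helper from <linux/timer.h> and using illustrative names (poll_timer, poll_fn, the 10 second period):

    #include <linux/module.h>
    #include <linux/timer.h>
    #include <linux/jiffies.h>

    static struct timer_list poll_timer;

    static void poll_fn(unsigned long data)
    {
            /* Housekeeping that can wait until the CPU is awake anyway. */
            mod_timer(&poll_timer, jiffies + 10 * HZ);
    }

    static int __init poll_init(void)
    {
            init_timer_deferrable(&poll_timer);   /* sets the base-pointer LSB flag */
            poll_timer.function = poll_fn;
            poll_timer.data = 0;
            mod_timer(&poll_timer, jiffies + 10 * HZ);
            return 0;
    }

    static void __exit poll_exit(void)
    {
            del_timer_sync(&poll_timer);
    }

    module_init(poll_init);
    module_exit(poll_exit);
    MODULE_LICENSE("GPL");

An idle CPU will not be woken just to run poll_fn(); the timer is serviced when the CPU next wakes for other work.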
@@ -319,6 +324,25 @@ unsigned long round_jiffies_up_relative(unsigned long j)
319} 324}
320EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 325EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
321 326
327/**
328 * set_timer_slack - set the allowed slack for a timer
329 * @timer: the timer to be modified
330 * @slack_hz: the amount of time (in jiffies) allowed for rounding
331 *
332 * Set the amount of time, in jiffies, that a certain timer has
333 * in terms of slack. By setting this value, the timer subsystem
334 * will schedule the actual timer somewhere between
335 * the time mod_timer() asks for, and that time plus the slack.
336 *
337 * By setting the slack to -1, a percentage of the delay is used
338 * instead.
339 */
340void set_timer_slack(struct timer_list *timer, int slack_hz)
341{
342 timer->slack = slack_hz;
343}
344EXPORT_SYMBOL_GPL(set_timer_slack);
345
322 346
323static inline void set_running_timer(struct tvec_base *base, 347static inline void set_running_timer(struct tvec_base *base,
324 struct timer_list *timer) 348 struct timer_list *timer)
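set_timer_slack(), added above, lets a driver tell the timer core how far past the requested expiry a timer may be coalesced. A hedged fragment showing the intended use (arm_lazy_timeout() and the 100 ms figure are illustrative, not from the patch):

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    /* Illustrative helper: grant up to ~100 ms of slack so the timer core
     * may round the expiry and batch it with other pending timers. */
    static void arm_lazy_timeout(struct timer_list *t, unsigned long expires)
    {
            set_timer_slack(t, msecs_to_jiffies(100));
            mod_timer(t, expires);   /* may fire anywhere in [expires, expires + slack] */
    }

Leaving the slack at -1 (the default set in __init_timer() below) keeps the automatic slack of roughly 0.4% of the remaining delay instead.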
@@ -550,6 +574,7 @@ static void __init_timer(struct timer_list *timer,
550{ 574{
551 timer->entry.next = NULL; 575 timer->entry.next = NULL;
552 timer->base = __raw_get_cpu_var(tvec_bases); 576 timer->base = __raw_get_cpu_var(tvec_bases);
577 timer->slack = -1;
553#ifdef CONFIG_TIMER_STATS 578#ifdef CONFIG_TIMER_STATS
554 timer->start_site = NULL; 579 timer->start_site = NULL;
555 timer->start_pid = -1; 580 timer->start_pid = -1;
@@ -558,6 +583,19 @@ static void __init_timer(struct timer_list *timer,
558 lockdep_init_map(&timer->lockdep_map, name, key, 0); 583 lockdep_init_map(&timer->lockdep_map, name, key, 0);
559} 584}
560 585
586void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
587 const char *name,
588 struct lock_class_key *key,
589 void (*function)(unsigned long),
590 unsigned long data)
591{
592 timer->function = function;
593 timer->data = data;
594 init_timer_on_stack_key(timer, name, key);
595 timer_set_deferrable(timer);
596}
597EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
598
561/** 599/**
562 * init_timer_key - initialize a timer 600 * init_timer_key - initialize a timer
563 * @timer: the timer to be initialized 601 * @timer: the timer to be initialized
@@ -660,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
660 cpu = smp_processor_id(); 698 cpu = smp_processor_id();
661 699
662#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 700#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
663 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { 701 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
664 int preferred_cpu = get_nohz_load_balancer(); 702 cpu = get_nohz_timer_target();
665
666 if (preferred_cpu >= 0)
667 cpu = preferred_cpu;
668 }
669#endif 703#endif
670 new_base = per_cpu(tvec_bases, cpu); 704 new_base = per_cpu(tvec_bases, cpu);
671 705
@@ -715,6 +749,46 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
715} 749}
716EXPORT_SYMBOL(mod_timer_pending); 750EXPORT_SYMBOL(mod_timer_pending);
717 751
752/*
753 * Decide where to put the timer while taking the slack into account
754 *
755 * Algorithm:
756 * 1) calculate the maximum (absolute) time
757 * 2) calculate the highest bit where the expires and new max are different
758 * 3) use this bit to make a mask
759 * 4) use the bitmask to round down the maximum time, so that all last
760 * bits are zeros
761 */
762static inline
763unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
764{
765 unsigned long expires_limit, mask;
766 int bit;
767
768 expires_limit = expires;
769
770 if (timer->slack >= 0) {
771 expires_limit = expires + timer->slack;
772 } else {
773 unsigned long now = jiffies;
774
775 /* No slack, if already expired else auto slack 0.4% */
776 if (time_after(expires, now))
777 expires_limit = expires + (expires - now)/256;
778 }
779 mask = expires ^ expires_limit;
780 if (mask == 0)
781 return expires;
782
783 bit = find_last_bit(&mask, BITS_PER_LONG);
784
785 mask = (1 << bit) - 1;
786
787 expires_limit = expires_limit & ~(mask);
788
789 return expires_limit;
790}
791
718/** 792/**
719 * mod_timer - modify a timer's timeout 793 * mod_timer - modify a timer's timeout
720 * @timer: the timer to be modified 794 * @timer: the timer to be modified
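The rounding in apply_slack() above can be followed with concrete numbers. This small userspace program repeats the same bit arithmetic, using __builtin_clzl() in place of the kernel's find_last_bit(), which returns the index of the highest set bit:

    #include <stdio.h>

    int main(void)
    {
            unsigned long expires = 0x1009, slack = 0x10;
            unsigned long limit = expires + slack;                  /* 0x1019 */
            unsigned long mask = expires ^ limit;                   /* 0x0010 */
            int bit = sizeof(long) * 8 - 1 - __builtin_clzl(mask);  /* highest differing bit: 4 */

            mask = (1UL << bit) - 1;                                /* 0x000f */
            limit &= ~mask;                                         /* 0x1010 */

            printf("timer rounded from %#lx up to %#lx\n", expires, limit);
            return 0;
    }

Clearing the low bits pushes many nearby timers onto the same expiry value, so they can be serviced in a single wakeup.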
@@ -745,6 +819,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
745 if (timer_pending(timer) && timer->expires == expires) 819 if (timer_pending(timer) && timer->expires == expires)
746 return 1; 820 return 1;
747 821
822 expires = apply_slack(timer, expires);
823
748 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 824 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
749} 825}
750EXPORT_SYMBOL(mod_timer); 826EXPORT_SYMBOL(mod_timer);
@@ -955,6 +1031,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
955 return index; 1031 return index;
956} 1032}
957 1033
1034static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1035 unsigned long data)
1036{
1037 int preempt_count = preempt_count();
1038
1039#ifdef CONFIG_LOCKDEP
1040 /*
1041 * It is permissible to free the timer from inside the
1042 * function that is called from it, this we need to take into
1043 * account for lockdep too. To avoid bogus "held lock freed"
1044 * warnings as well as problems when looking into
1045 * timer->lockdep_map, make a copy and use that here.
1046 */
1047 struct lockdep_map lockdep_map = timer->lockdep_map;
1048#endif
1049 /*
1050 * Couple the lock chain with the lock chain at
1051 * del_timer_sync() by acquiring the lock_map around the fn()
1052 * call here and in del_timer_sync().
1053 */
1054 lock_map_acquire(&lockdep_map);
1055
1056 trace_timer_expire_entry(timer);
1057 fn(data);
1058 trace_timer_expire_exit(timer);
1059
1060 lock_map_release(&lockdep_map);
1061
1062 if (preempt_count != preempt_count()) {
1063 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1064 fn, preempt_count, preempt_count());
1065 /*
1066 * Restore the preempt count. That gives us a decent
1067 * chance to survive and extract information. If the
1068 * callback kept a lock held, bad luck, but not worse
1069 * than the BUG() we had.
1070 */
1071 preempt_count() = preempt_count;
1072 }
1073}
1074
958#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1075#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
959 1076
960/** 1077/**
@@ -998,45 +1115,7 @@ static inline void __run_timers(struct tvec_base *base)
998 detach_timer(timer, 1); 1115 detach_timer(timer, 1);
999 1116
1000 spin_unlock_irq(&base->lock); 1117 spin_unlock_irq(&base->lock);
1001 { 1118 call_timer_fn(timer, fn, data);
1002 int preempt_count = preempt_count();
1003
1004#ifdef CONFIG_LOCKDEP
1005 /*
1006 * It is permissible to free the timer from
1007 * inside the function that is called from
1008 * it, this we need to take into account for
1009 * lockdep too. To avoid bogus "held lock
1010 * freed" warnings as well as problems when
1011 * looking into timer->lockdep_map, make a
1012 * copy and use that here.
1013 */
1014 struct lockdep_map lockdep_map =
1015 timer->lockdep_map;
1016#endif
1017 /*
1018 * Couple the lock chain with the lock chain at
1019 * del_timer_sync() by acquiring the lock_map
1020 * around the fn() call here and in
1021 * del_timer_sync().
1022 */
1023 lock_map_acquire(&lockdep_map);
1024
1025 trace_timer_expire_entry(timer);
1026 fn(data);
1027 trace_timer_expire_exit(timer);
1028
1029 lock_map_release(&lockdep_map);
1030
1031 if (preempt_count != preempt_count()) {
1032 printk(KERN_ERR "huh, entered %p "
1033 "with preempt_count %08x, exited"
1034 " with %08x?\n",
1035 fn, preempt_count,
1036 preempt_count());
1037 BUG();
1038 }
1039 }
1040 spin_lock_irq(&base->lock); 1119 spin_lock_irq(&base->lock);
1041 } 1120 }
1042 } 1121 }
@@ -1225,7 +1304,6 @@ void run_local_timers(void)
1225{ 1304{
1226 hrtimer_run_queues(); 1305 hrtimer_run_queues();
1227 raise_softirq(TIMER_SOFTIRQ); 1306 raise_softirq(TIMER_SOFTIRQ);
1228 softlockup_tick();
1229} 1307}
1230 1308
1231/* 1309/*
@@ -1620,11 +1698,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1620 unsigned long action, void *hcpu) 1698 unsigned long action, void *hcpu)
1621{ 1699{
1622 long cpu = (long)hcpu; 1700 long cpu = (long)hcpu;
1701 int err;
1702
1623 switch(action) { 1703 switch(action) {
1624 case CPU_UP_PREPARE: 1704 case CPU_UP_PREPARE:
1625 case CPU_UP_PREPARE_FROZEN: 1705 case CPU_UP_PREPARE_FROZEN:
1626 if (init_timers_cpu(cpu) < 0) 1706 err = init_timers_cpu(cpu);
1627 return NOTIFY_BAD; 1707 if (err < 0)
1708 return notifier_from_errno(err);
1628 break; 1709 break;
1629#ifdef CONFIG_HOTPLUG_CPU 1710#ifdef CONFIG_HOTPLUG_CPU
1630 case CPU_DEAD: 1711 case CPU_DEAD:
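Returning notifier_from_errno() above preserves the real error code instead of collapsing every failure into NOTIFY_BAD. A hedged sketch of the same pattern in an unrelated callback (the names and the -ENOMEM failure are illustrative):

    #include <linux/cpu.h>
    #include <linux/errno.h>
    #include <linux/notifier.h>

    /* Encode the real errno in the notifier return value so the hotplug
     * core can report why the callback failed (see notifier_to_errno()). */
    static int my_cpu_callback(struct notifier_block *nb,
                               unsigned long action, void *hcpu)
    {
            if (action == CPU_UP_PREPARE)
                    return notifier_from_errno(-ENOMEM);   /* illustrative failure */
            return NOTIFY_OK;
    }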
@@ -1650,7 +1731,7 @@ void __init init_timers(void)
1650 1731
1651 init_timer_stats(); 1732 init_timer_stats();
1652 1733
1653 BUG_ON(err == NOTIFY_BAD); 1734 BUG_ON(err != NOTIFY_OK);
1654 register_cpu_notifier(&timers_nb); 1735 register_cpu_notifier(&timers_nb);
1655 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1736 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1656} 1737}
@@ -1683,3 +1764,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
1683} 1764}
1684 1765
1685EXPORT_SYMBOL(msleep_interruptible); 1766EXPORT_SYMBOL(msleep_interruptible);
1767
1768static int __sched do_usleep_range(unsigned long min, unsigned long max)
1769{
1770 ktime_t kmin;
1771 unsigned long delta;
1772
1773 kmin = ktime_set(0, min * NSEC_PER_USEC);
1774 delta = (max - min) * NSEC_PER_USEC;
1775 return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
1776}
1777
1778/**
1779 * usleep_range - Drop-in replacement for udelay where wakeup is flexible
1780 * @min: Minimum time in usecs to sleep
1781 * @max: Maximum time in usecs to sleep
1782 */
1783void usleep_range(unsigned long min, unsigned long max)
1784{
1785 __set_current_state(TASK_UNINTERRUPTIBLE);
1786 do_usleep_range(min, max);
1787}
1788EXPORT_SYMBOL(usleep_range);
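usleep_range(), added above, gives drivers a sleeping alternative to udelay() when the wakeup time is flexible. A small illustrative helper (the 200-400 microsecond window is made up for the example):

    #include <linux/delay.h>

    /* The device needs at least 200 us to settle, but anything up to 400 us
     * is acceptable, so sleep instead of busy-waiting with udelay(200). */
    static void wait_for_device_settle(void)
    {
            usleep_range(200, 400);
    }

Because the wakeup is an hrtimer range, the scheduler can merge it with an interrupt that is due anyway instead of programming a dedicated one.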
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd3..538501c6ea50 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
44 help 44 help
45 See Documentation/trace/ftrace-design.txt 45 See Documentation/trace/ftrace-design.txt
46 46
47config HAVE_HW_BRANCH_TRACER
48 bool
49
50config HAVE_SYSCALL_TRACEPOINTS 47config HAVE_SYSCALL_TRACEPOINTS
51 bool 48 bool
52 help 49 help
@@ -156,7 +153,7 @@ config IRQSOFF_TRACER
156 bool "Interrupts-off Latency Tracer" 153 bool "Interrupts-off Latency Tracer"
157 default n 154 default n
158 depends on TRACE_IRQFLAGS_SUPPORT 155 depends on TRACE_IRQFLAGS_SUPPORT
159 depends on GENERIC_TIME 156 depends on !ARCH_USES_GETTIMEOFFSET
160 select TRACE_IRQFLAGS 157 select TRACE_IRQFLAGS
161 select GENERIC_TRACER 158 select GENERIC_TRACER
162 select TRACER_MAX_TRACE 159 select TRACER_MAX_TRACE
@@ -178,7 +175,7 @@ config IRQSOFF_TRACER
178config PREEMPT_TRACER 175config PREEMPT_TRACER
179 bool "Preemption-off Latency Tracer" 176 bool "Preemption-off Latency Tracer"
180 default n 177 default n
181 depends on GENERIC_TIME 178 depends on !ARCH_USES_GETTIMEOFFSET
182 depends on PREEMPT 179 depends on PREEMPT
183 select GENERIC_TRACER 180 select GENERIC_TRACER
184 select TRACER_MAX_TRACE 181 select TRACER_MAX_TRACE
@@ -197,15 +194,6 @@ config PREEMPT_TRACER
197 enabled. This option and the irqs-off timing option can be 194 enabled. This option and the irqs-off timing option can be
198 used together or separately.) 195 used together or separately.)
199 196
200config SYSPROF_TRACER
201 bool "Sysprof Tracer"
202 depends on X86
203 select GENERIC_TRACER
204 select CONTEXT_SWITCH_TRACER
205 help
206 This tracer provides the trace needed by the 'Sysprof' userspace
207 tool.
208
209config SCHED_TRACER 197config SCHED_TRACER
210 bool "Scheduling Latency Tracer" 198 bool "Scheduling Latency Tracer"
211 select GENERIC_TRACER 199 select GENERIC_TRACER
@@ -232,23 +220,6 @@ config FTRACE_SYSCALLS
232 help 220 help
233 Basic tracer to catch the syscall entry and exit events. 221 Basic tracer to catch the syscall entry and exit events.
234 222
235config BOOT_TRACER
236 bool "Trace boot initcalls"
237 select GENERIC_TRACER
238 select CONTEXT_SWITCH_TRACER
239 help
240 This tracer helps developers to optimize boot times: it records
241 the timings of the initcalls and traces key events and the identity
242 of tasks that can cause boot delays, such as context-switches.
243
244 Its aim is to be parsed by the scripts/bootgraph.pl tool to
245 produce pretty graphics about boot inefficiencies, giving a visual
246 representation of the delays during initcalls - but the raw
247 /debug/tracing/trace text output is readable too.
248
249 You must pass in initcall_debug and ftrace=initcall to the kernel
250 command line to enable this on bootup.
251
252config TRACE_BRANCH_PROFILING 223config TRACE_BRANCH_PROFILING
253 bool 224 bool
254 select GENERIC_TRACER 225 select GENERIC_TRACER
@@ -328,28 +299,6 @@ config BRANCH_TRACER
328 299
329 Say N if unsure. 300 Say N if unsure.
330 301
331config KSYM_TRACER
332 bool "Trace read and write access on kernel memory locations"
333 depends on HAVE_HW_BREAKPOINT
334 select TRACING
335 help
336 This tracer helps find read and write operations on any given kernel
337 symbol i.e. /proc/kallsyms.
338
339config PROFILE_KSYM_TRACER
340 bool "Profile all kernel memory accesses on 'watched' variables"
341 depends on KSYM_TRACER
342 help
343 This tracer profiles kernel accesses on variables watched through the
344 ksym tracer ftrace plugin. Depending upon the hardware, all read
345 and write operations on kernel variables can be monitored for
346 accesses.
347
348 The results will be displayed in:
349 /debugfs/tracing/profile_ksym
350
351 Say N if unsure.
352
353config STACK_TRACER 302config STACK_TRACER
354 bool "Trace max stack" 303 bool "Trace max stack"
355 depends on HAVE_FUNCTION_TRACER 304 depends on HAVE_FUNCTION_TRACER
@@ -374,45 +323,6 @@ config STACK_TRACER
374 323
375 Say N if unsure. 324 Say N if unsure.
376 325
377config HW_BRANCH_TRACER
378 depends on HAVE_HW_BRANCH_TRACER
379 bool "Trace hw branches"
380 select GENERIC_TRACER
381 help
382 This tracer records all branches on the system in a circular
383 buffer, giving access to the last N branches for each cpu.
384
385config KMEMTRACE
386 bool "Trace SLAB allocations"
387 select GENERIC_TRACER
388 help
389 kmemtrace provides tracing for slab allocator functions, such as
390 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
391 data is then fed to the userspace application in order to analyse
392 allocation hotspots, internal fragmentation and so on, making it
393 possible to see how well an allocator performs, as well as debug
394 and profile kernel code.
395
396 This requires an userspace application to use. See
397 Documentation/trace/kmemtrace.txt for more information.
398
399 Saying Y will make the kernel somewhat larger and slower. However,
400 if you disable kmemtrace at run-time or boot-time, the performance
401 impact is minimal (depending on the arch the kernel is built for).
402
403 If unsure, say N.
404
405config WORKQUEUE_TRACER
406 bool "Trace workqueues"
407 select GENERIC_TRACER
408 help
409 The workqueue tracer provides some statistical information
410 about each cpu workqueue thread such as the number of the
411 works inserted and executed since their creation. It can help
412 to evaluate the amount of work each of them has to perform.
413 For example it can help a developer to decide whether he should
414 choose a per-cpu workqueue instead of a singlethreaded one.
415
416config BLK_DEV_IO_TRACE 326config BLK_DEV_IO_TRACE
417 bool "Support for tracing block IO actions" 327 bool "Support for tracing block IO actions"
418 depends on SYSFS 328 depends on SYSFS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 78edc6490038..53f338190b26 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
30obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
31obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
33obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
34obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 33obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
35obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 34obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
36obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 35obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
@@ -38,11 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
38obj-$(CONFIG_NOP_TRACER) += trace_nop.o 37obj-$(CONFIG_NOP_TRACER) += trace_nop.o
39obj-$(CONFIG_STACK_TRACER) += trace_stack.o 38obj-$(CONFIG_STACK_TRACER) += trace_stack.o
40obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 42obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 43obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
48ifeq ($(CONFIG_BLOCK),y) 44ifeq ($(CONFIG_BLOCK),y)
@@ -56,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
56endif 52endif
57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
59obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
60obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_EVENT_TRACING) += power-traces.o
56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif
61 59
62libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3bc91a3f510..959f8d6c8cc1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
170 BLK_TC_ACT(BLK_TC_WRITE) }; 170 BLK_TC_ACT(BLK_TC_WRITE) };
171 171
172#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
173#define BLK_TC_RAHEAD BLK_TC_AHEAD
174
172/* The ilog2() calls fall out because they're constant */ 175/* The ilog2() calls fall out because they're constant */
173#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ 176#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
174 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) 177 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
175 178
176/* 179/*
177 * The worker for the various blk_add_trace*() types. Fills out a 180 * The worker for the various blk_add_trace*() types. Fills out a
@@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
194 return; 197 return;
195 198
196 what |= ddir_act[rw & WRITE]; 199 what |= ddir_act[rw & WRITE];
197 what |= MASK_TC_BIT(rw, BARRIER); 200 what |= MASK_TC_BIT(rw, HARDBARRIER);
198 what |= MASK_TC_BIT(rw, SYNCIO); 201 what |= MASK_TC_BIT(rw, SYNC);
199 what |= MASK_TC_BIT(rw, AHEAD); 202 what |= MASK_TC_BIT(rw, RAHEAD);
200 what |= MASK_TC_BIT(rw, META); 203 what |= MASK_TC_BIT(rw, META);
201 what |= MASK_TC_BIT(rw, DISCARD); 204 what |= MASK_TC_BIT(rw, DISCARD);
202 205
@@ -549,6 +552,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
549} 552}
550EXPORT_SYMBOL_GPL(blk_trace_setup); 553EXPORT_SYMBOL_GPL(blk_trace_setup);
551 554
555#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
556static int compat_blk_trace_setup(struct request_queue *q, char *name,
557 dev_t dev, struct block_device *bdev,
558 char __user *arg)
559{
560 struct blk_user_trace_setup buts;
561 struct compat_blk_user_trace_setup cbuts;
562 int ret;
563
564 if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
565 return -EFAULT;
566
567 buts = (struct blk_user_trace_setup) {
568 .act_mask = cbuts.act_mask,
569 .buf_size = cbuts.buf_size,
570 .buf_nr = cbuts.buf_nr,
571 .start_lba = cbuts.start_lba,
572 .end_lba = cbuts.end_lba,
573 .pid = cbuts.pid,
574 };
575 memcpy(&buts.name, &cbuts.name, 32);
576
577 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
578 if (ret)
579 return ret;
580
581 if (copy_to_user(arg, &buts.name, 32)) {
582 blk_trace_remove(q);
583 return -EFAULT;
584 }
585
586 return 0;
587}
588#endif
589
552int blk_trace_startstop(struct request_queue *q, int start) 590int blk_trace_startstop(struct request_queue *q, int start)
553{ 591{
554 int ret; 592 int ret;
@@ -601,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
601 if (!q) 639 if (!q)
602 return -ENXIO; 640 return -ENXIO;
603 641
642 lock_kernel();
604 mutex_lock(&bdev->bd_mutex); 643 mutex_lock(&bdev->bd_mutex);
605 644
606 switch (cmd) { 645 switch (cmd) {
@@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
608 bdevname(bdev, b); 647 bdevname(bdev, b);
609 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); 648 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
610 break; 649 break;
650#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
651 case BLKTRACESETUP32:
652 bdevname(bdev, b);
653 ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
654 break;
655#endif
611 case BLKTRACESTART: 656 case BLKTRACESTART:
612 start = 1; 657 start = 1;
613 case BLKTRACESTOP: 658 case BLKTRACESTOP:
@@ -622,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
622 } 667 }
623 668
624 mutex_unlock(&bdev->bd_mutex); 669 mutex_unlock(&bdev->bd_mutex);
670 unlock_kernel();
625 return ret; 671 return ret;
626} 672}
627 673
@@ -661,10 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
661 if (likely(!bt)) 707 if (likely(!bt))
662 return; 708 return;
663 709
664 if (blk_discard_rq(rq)) 710 if (rq->cmd_flags & REQ_DISCARD)
665 rw |= (1 << BIO_RW_DISCARD); 711 rw |= REQ_DISCARD;
666 712
667 if (blk_pc_request(rq)) { 713 if (rq->cmd_flags & REQ_SECURE)
714 rw |= REQ_SECURE;
715
716 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
668 what |= BLK_TC_ACT(BLK_TC_PC); 717 what |= BLK_TC_ACT(BLK_TC_PC);
669 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 718 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
670 what, rq->errors, rq->cmd_len, rq->cmd); 719 what, rq->errors, rq->cmd_len, rq->cmd);
@@ -675,28 +724,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
675 } 724 }
676} 725}
677 726
678static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) 727static void blk_add_trace_rq_abort(void *ignore,
728 struct request_queue *q, struct request *rq)
679{ 729{
680 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 730 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
681} 731}
682 732
683static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) 733static void blk_add_trace_rq_insert(void *ignore,
734 struct request_queue *q, struct request *rq)
684{ 735{
685 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 736 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
686} 737}
687 738
688static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) 739static void blk_add_trace_rq_issue(void *ignore,
740 struct request_queue *q, struct request *rq)
689{ 741{
690 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 742 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
691} 743}
692 744
693static void blk_add_trace_rq_requeue(struct request_queue *q, 745static void blk_add_trace_rq_requeue(void *ignore,
746 struct request_queue *q,
694 struct request *rq) 747 struct request *rq)
695{ 748{
696 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 749 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
697} 750}
698 751
699static void blk_add_trace_rq_complete(struct request_queue *q, 752static void blk_add_trace_rq_complete(void *ignore,
753 struct request_queue *q,
700 struct request *rq) 754 struct request *rq)
701{ 755{
702 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 756 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -724,34 +778,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
724 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 778 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
725} 779}
726 780
727static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) 781static void blk_add_trace_bio_bounce(void *ignore,
782 struct request_queue *q, struct bio *bio)
728{ 783{
729 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 784 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
730} 785}
731 786
732static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) 787static void blk_add_trace_bio_complete(void *ignore,
788 struct request_queue *q, struct bio *bio)
733{ 789{
734 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 790 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
735} 791}
736 792
737static void blk_add_trace_bio_backmerge(struct request_queue *q, 793static void blk_add_trace_bio_backmerge(void *ignore,
794 struct request_queue *q,
738 struct bio *bio) 795 struct bio *bio)
739{ 796{
740 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 797 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
741} 798}
742 799
743static void blk_add_trace_bio_frontmerge(struct request_queue *q, 800static void blk_add_trace_bio_frontmerge(void *ignore,
801 struct request_queue *q,
744 struct bio *bio) 802 struct bio *bio)
745{ 803{
746 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 804 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
747} 805}
748 806
749static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) 807static void blk_add_trace_bio_queue(void *ignore,
808 struct request_queue *q, struct bio *bio)
750{ 809{
751 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 810 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
752} 811}
753 812
754static void blk_add_trace_getrq(struct request_queue *q, 813static void blk_add_trace_getrq(void *ignore,
814 struct request_queue *q,
755 struct bio *bio, int rw) 815 struct bio *bio, int rw)
756{ 816{
757 if (bio) 817 if (bio)
@@ -765,7 +825,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
765} 825}
766 826
767 827
768static void blk_add_trace_sleeprq(struct request_queue *q, 828static void blk_add_trace_sleeprq(void *ignore,
829 struct request_queue *q,
769 struct bio *bio, int rw) 830 struct bio *bio, int rw)
770{ 831{
771 if (bio) 832 if (bio)
@@ -779,7 +840,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
779 } 840 }
780} 841}
781 842
782static void blk_add_trace_plug(struct request_queue *q) 843static void blk_add_trace_plug(void *ignore, struct request_queue *q)
783{ 844{
784 struct blk_trace *bt = q->blk_trace; 845 struct blk_trace *bt = q->blk_trace;
785 846
@@ -787,7 +848,7 @@ static void blk_add_trace_plug(struct request_queue *q)
787 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 848 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
788} 849}
789 850
790static void blk_add_trace_unplug_io(struct request_queue *q) 851static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
791{ 852{
792 struct blk_trace *bt = q->blk_trace; 853 struct blk_trace *bt = q->blk_trace;
793 854
@@ -800,7 +861,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
800 } 861 }
801} 862}
802 863
803static void blk_add_trace_unplug_timer(struct request_queue *q) 864static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
804{ 865{
805 struct blk_trace *bt = q->blk_trace; 866 struct blk_trace *bt = q->blk_trace;
806 867
@@ -813,7 +874,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
813 } 874 }
814} 875}
815 876
816static void blk_add_trace_split(struct request_queue *q, struct bio *bio, 877static void blk_add_trace_split(void *ignore,
878 struct request_queue *q, struct bio *bio,
817 unsigned int pdu) 879 unsigned int pdu)
818{ 880{
819 struct blk_trace *bt = q->blk_trace; 881 struct blk_trace *bt = q->blk_trace;
@@ -829,6 +891,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
829 891
830/** 892/**
831 * blk_add_trace_remap - Add a trace for a remap operation 893 * blk_add_trace_remap - Add a trace for a remap operation
894 * @ignore: trace callback data parameter (not used)
832 * @q: queue the io is for 895 * @q: queue the io is for
833 * @bio: the source bio 896 * @bio: the source bio
834 * @dev: target device 897 * @dev: target device
@@ -839,8 +902,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
839 * it spans a stripe (or similar). Add a trace for that action. 902 * it spans a stripe (or similar). Add a trace for that action.
840 * 903 *
841 **/ 904 **/
842static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 905static void blk_add_trace_remap(void *ignore,
843 dev_t dev, sector_t from) 906 struct request_queue *q, struct bio *bio,
907 dev_t dev, sector_t from)
844{ 908{
845 struct blk_trace *bt = q->blk_trace; 909 struct blk_trace *bt = q->blk_trace;
846 struct blk_io_trace_remap r; 910 struct blk_io_trace_remap r;
@@ -859,6 +923,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
859 923
860/** 924/**
861 * blk_add_trace_rq_remap - Add a trace for a request-remap operation 925 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
926 * @ignore: trace callback data parameter (not used)
862 * @q: queue the io is for 927 * @q: queue the io is for
863 * @rq: the source request 928 * @rq: the source request
864 * @dev: target device 929 * @dev: target device
@@ -869,7 +934,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
869 * Add a trace for that action. 934 * Add a trace for that action.
870 * 935 *
871 **/ 936 **/
872static void blk_add_trace_rq_remap(struct request_queue *q, 937static void blk_add_trace_rq_remap(void *ignore,
938 struct request_queue *q,
873 struct request *rq, dev_t dev, 939 struct request *rq, dev_t dev,
874 sector_t from) 940 sector_t from)
875{ 941{
@@ -908,7 +974,7 @@ void blk_add_driver_data(struct request_queue *q,
908 if (likely(!bt)) 974 if (likely(!bt))
909 return; 975 return;
910 976
911 if (blk_pc_request(rq)) 977 if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
912 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 978 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
913 BLK_TA_DRV_DATA, rq->errors, len, data); 979 BLK_TA_DRV_DATA, rq->errors, len, data);
914 else 980 else
@@ -921,64 +987,64 @@ static void blk_register_tracepoints(void)
921{ 987{
922 int ret; 988 int ret;
923 989
924 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); 990 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
925 WARN_ON(ret); 991 WARN_ON(ret);
926 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); 992 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
927 WARN_ON(ret); 993 WARN_ON(ret);
928 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); 994 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
929 WARN_ON(ret); 995 WARN_ON(ret);
930 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); 996 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
931 WARN_ON(ret); 997 WARN_ON(ret);
932 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); 998 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
933 WARN_ON(ret); 999 WARN_ON(ret);
934 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); 1000 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
935 WARN_ON(ret); 1001 WARN_ON(ret);
936 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); 1002 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
937 WARN_ON(ret); 1003 WARN_ON(ret);
938 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 1004 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
939 WARN_ON(ret); 1005 WARN_ON(ret);
940 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 1006 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
941 WARN_ON(ret); 1007 WARN_ON(ret);
942 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); 1008 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
943 WARN_ON(ret); 1009 WARN_ON(ret);
944 ret = register_trace_block_getrq(blk_add_trace_getrq); 1010 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
945 WARN_ON(ret); 1011 WARN_ON(ret);
946 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); 1012 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
947 WARN_ON(ret); 1013 WARN_ON(ret);
948 ret = register_trace_block_plug(blk_add_trace_plug); 1014 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
949 WARN_ON(ret); 1015 WARN_ON(ret);
950 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); 1016 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
951 WARN_ON(ret); 1017 WARN_ON(ret);
952 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); 1018 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
953 WARN_ON(ret); 1019 WARN_ON(ret);
954 ret = register_trace_block_split(blk_add_trace_split); 1020 ret = register_trace_block_split(blk_add_trace_split, NULL);
955 WARN_ON(ret); 1021 WARN_ON(ret);
956 ret = register_trace_block_remap(blk_add_trace_remap); 1022 ret = register_trace_block_remap(blk_add_trace_remap, NULL);
957 WARN_ON(ret); 1023 WARN_ON(ret);
958 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); 1024 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
959 WARN_ON(ret); 1025 WARN_ON(ret);
960} 1026}
961 1027
962static void blk_unregister_tracepoints(void) 1028static void blk_unregister_tracepoints(void)
963{ 1029{
964 unregister_trace_block_rq_remap(blk_add_trace_rq_remap); 1030 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
965 unregister_trace_block_remap(blk_add_trace_remap); 1031 unregister_trace_block_remap(blk_add_trace_remap, NULL);
966 unregister_trace_block_split(blk_add_trace_split); 1032 unregister_trace_block_split(blk_add_trace_split, NULL);
967 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 1033 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
968 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); 1034 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
969 unregister_trace_block_plug(blk_add_trace_plug); 1035 unregister_trace_block_plug(blk_add_trace_plug, NULL);
970 unregister_trace_block_sleeprq(blk_add_trace_sleeprq); 1036 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
971 unregister_trace_block_getrq(blk_add_trace_getrq); 1037 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
972 unregister_trace_block_bio_queue(blk_add_trace_bio_queue); 1038 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
973 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 1039 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
974 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 1040 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
975 unregister_trace_block_bio_complete(blk_add_trace_bio_complete); 1041 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
976 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); 1042 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
977 unregister_trace_block_rq_complete(blk_add_trace_rq_complete); 1043 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
978 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); 1044 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
979 unregister_trace_block_rq_issue(blk_add_trace_rq_issue); 1045 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
980 unregister_trace_block_rq_insert(blk_add_trace_rq_insert); 1046 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
981 unregister_trace_block_rq_abort(blk_add_trace_rq_abort); 1047 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
982 1048
983 tracepoint_synchronize_unregister(); 1049 tracepoint_synchronize_unregister();
984} 1050}
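All of the probes above now take a void * first argument, and registration passes a per-registration cookie (NULL here). A minimal module sketch that hooks the same block_plug tracepoint in the new style (my_plug_probe is an illustrative name):

    #include <linux/module.h>
    #include <trace/events/block.h>

    /* The probe receives the registration cookie as its first argument;
     * NULL is passed here, mirroring blk_register_tracepoints() above. */
    static void my_plug_probe(void *ignore, struct request_queue *q)
    {
            pr_info("queue %p plugged\n", q);
    }

    static int __init plug_probe_init(void)
    {
            return register_trace_block_plug(my_plug_probe, NULL);
    }

    static void __exit plug_probe_exit(void)
    {
            unregister_trace_block_plug(my_plug_probe, NULL);
            tracepoint_synchronize_unregister();
    }

    module_init(plug_probe_init);
    module_exit(plug_probe_exit);
    MODULE_LICENSE("GPL");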
@@ -1321,7 +1387,7 @@ out:
1321} 1387}
1322 1388
1323static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1389static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1324 int flags) 1390 int flags, struct trace_event *event)
1325{ 1391{
1326 return print_one_line(iter, false); 1392 return print_one_line(iter, false);
1327} 1393}
@@ -1343,7 +1409,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1343} 1409}
1344 1410
1345static enum print_line_t 1411static enum print_line_t
1346blk_trace_event_print_binary(struct trace_iterator *iter, int flags) 1412blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1413 struct trace_event *event)
1347{ 1414{
1348 return blk_trace_synthesize_old_trace(iter) ? 1415 return blk_trace_synthesize_old_trace(iter) ?
1349 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1416 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1381,12 +1448,16 @@ static struct tracer blk_tracer __read_mostly = {
1381 .set_flag = blk_tracer_set_flag, 1448 .set_flag = blk_tracer_set_flag,
1382}; 1449};
1383 1450
1384static struct trace_event trace_blk_event = { 1451static struct trace_event_functions trace_blk_event_funcs = {
1385 .type = TRACE_BLK,
1386 .trace = blk_trace_event_print, 1452 .trace = blk_trace_event_print,
1387 .binary = blk_trace_event_print_binary, 1453 .binary = blk_trace_event_print_binary,
1388}; 1454};
1389 1455
1456static struct trace_event trace_blk_event = {
1457 .type = TRACE_BLK,
1458 .funcs = &trace_blk_event_funcs,
1459};
1460
1390static int __init init_blk_tracer(void) 1461static int __init init_blk_tracer(void)
1391{ 1462{
1392 if (!register_ftrace_event(&trace_blk_event)) { 1463 if (!register_ftrace_event(&trace_blk_event)) {
@@ -1708,7 +1779,7 @@ void blk_dump_cmd(char *buf, struct request *rq)
1708 int len = rq->cmd_len; 1779 int len = rq->cmd_len;
1709 unsigned char *cmd = rq->cmd; 1780 unsigned char *cmd = rq->cmd;
1710 1781
1711 if (!blk_pc_request(rq)) { 1782 if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
1712 buf[0] = '\0'; 1783 buf[0] = '\0';
1713 return; 1784 return;
1714 } 1785 }
@@ -1733,21 +1804,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1733 1804
1734 if (rw & WRITE) 1805 if (rw & WRITE)
1735 rwbs[i++] = 'W'; 1806 rwbs[i++] = 'W';
1736 else if (rw & 1 << BIO_RW_DISCARD) 1807 else if (rw & REQ_DISCARD)
1737 rwbs[i++] = 'D'; 1808 rwbs[i++] = 'D';
1738 else if (bytes) 1809 else if (bytes)
1739 rwbs[i++] = 'R'; 1810 rwbs[i++] = 'R';
1740 else 1811 else
1741 rwbs[i++] = 'N'; 1812 rwbs[i++] = 'N';
1742 1813
1743 if (rw & 1 << BIO_RW_AHEAD) 1814 if (rw & REQ_RAHEAD)
1744 rwbs[i++] = 'A'; 1815 rwbs[i++] = 'A';
1745 if (rw & 1 << BIO_RW_BARRIER) 1816 if (rw & REQ_HARDBARRIER)
1746 rwbs[i++] = 'B'; 1817 rwbs[i++] = 'B';
1747 if (rw & 1 << BIO_RW_SYNCIO) 1818 if (rw & REQ_SYNC)
1748 rwbs[i++] = 'S'; 1819 rwbs[i++] = 'S';
1749 if (rw & 1 << BIO_RW_META) 1820 if (rw & REQ_META)
1750 rwbs[i++] = 'M'; 1821 rwbs[i++] = 'M';
1822 if (rw & REQ_SECURE)
1823 rwbs[i++] = 'E';
1751 1824
1752 rwbs[i] = '\0'; 1825 rwbs[i] = '\0';
1753} 1826}
@@ -1757,8 +1830,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1757 int rw = rq->cmd_flags & 0x03; 1830 int rw = rq->cmd_flags & 0x03;
1758 int bytes; 1831 int bytes;
1759 1832
1760 if (blk_discard_rq(rq)) 1833 if (rq->cmd_flags & REQ_DISCARD)
1761 rw |= (1 << BIO_RW_DISCARD); 1834 rw |= REQ_DISCARD;
1835
1836 if (rq->cmd_flags & REQ_SECURE)
1837 rw |= REQ_SECURE;
1762 1838
1763 bytes = blk_rq_bytes(rq); 1839 bytes = blk_rq_bytes(rq);
1764 1840
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2404b59b3097..fa7ece649fe1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -264,6 +264,7 @@ struct ftrace_profile {
264 unsigned long counter; 264 unsigned long counter;
265#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
266 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
267#endif 268#endif
268}; 269};
269 270
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
366{ 367{
367#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
368 seq_printf(m, " Function " 369 seq_printf(m, " Function "
369 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
370 " -------- " 371 " -------- "
371 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
372#else 373#else
373 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
374 " -------- ---\n"); 375 " -------- ---\n");
@@ -380,11 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v)
380{ 381{
381 struct ftrace_profile *rec = v; 382 struct ftrace_profile *rec = v;
382 char str[KSYM_SYMBOL_LEN]; 383 char str[KSYM_SYMBOL_LEN];
384 int ret = 0;
383#ifdef CONFIG_FUNCTION_GRAPH_TRACER 385#ifdef CONFIG_FUNCTION_GRAPH_TRACER
384 static DEFINE_MUTEX(mutex);
385 static struct trace_seq s; 386 static struct trace_seq s;
386 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
387#endif 389#endif
390 mutex_lock(&ftrace_profile_lock);
391
392 /* we raced with function_profile_reset() */
393 if (unlikely(rec->counter == 0)) {
394 ret = -EBUSY;
395 goto out;
396 }
388 397
389 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 398 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
390 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 399 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
@@ -394,17 +403,31 @@ static int function_stat_show(struct seq_file *m, void *v)
394 avg = rec->time; 403 avg = rec->time;
395 do_div(avg, rec->counter); 404 do_div(avg, rec->counter);
396 405
397 mutex_lock(&mutex); 406 /* Sample standard deviation (s^2) */
407 if (rec->counter <= 1)
408 stddev = 0;
409 else {
410 stddev = rec->time_squared - rec->counter * avg * avg;
411 /*
412 * Divide only 1000 for ns^2 -> us^2 conversion.
413 * trace_print_graph_duration will divide 1000 again.
414 */
415 do_div(stddev, (rec->counter - 1) * 1000);
416 }
417
398 trace_seq_init(&s); 418 trace_seq_init(&s);
399 trace_print_graph_duration(rec->time, &s); 419 trace_print_graph_duration(rec->time, &s);
400 trace_seq_puts(&s, " "); 420 trace_seq_puts(&s, " ");
401 trace_print_graph_duration(avg, &s); 421 trace_print_graph_duration(avg, &s);
422 trace_seq_puts(&s, " ");
423 trace_print_graph_duration(stddev, &s);
402 trace_print_seq(m, &s); 424 trace_print_seq(m, &s);
403 mutex_unlock(&mutex);
404#endif 425#endif
405 seq_putc(m, '\n'); 426 seq_putc(m, '\n');
427out:
428 mutex_unlock(&ftrace_profile_lock);
406 429
407 return 0; 430 return ret;
408} 431}
409 432
410static void ftrace_profile_reset(struct ftrace_profile_stat *stat) 433static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
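The new time_squared accumulator lets the profiler derive a sample variance from running sums alone, as s^2 = (sum of t^2 - n*avg^2) / (n - 1); the kernel additionally divides by 1000 to go from ns^2 to us^2 before printing. A small userspace check of the formula with made-up durations:

    #include <stdio.h>

    /* Keeping a running sum of t and of t*t is enough to recover the
     * sample variance without storing the individual samples. */
    int main(void)
    {
            const double t[] = { 120.0, 130.0, 125.0, 128.0 };  /* made-up durations */
            const int n = 4;
            double sum = 0.0, sum_sq = 0.0;

            for (int i = 0; i < n; i++) {
                    sum += t[i];
                    sum_sq += t[i] * t[i];
            }

            double avg = sum / n;
            double var = (sum_sq - n * avg * avg) / (n - 1);

            printf("avg = %.2f, sample variance = %.2f\n", avg, var);
            return 0;
    }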
@@ -650,6 +673,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
650 if (!stat->hash || !ftrace_profile_enabled) 673 if (!stat->hash || !ftrace_profile_enabled)
651 goto out; 674 goto out;
652 675
676 /* If the calltime was zero'd ignore it */
677 if (!trace->calltime)
678 goto out;
679
653 calltime = trace->rettime - trace->calltime; 680 calltime = trace->rettime - trace->calltime;
654 681
655 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 682 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +695,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
668 } 695 }
669 696
670 rec = ftrace_find_profiled_func(stat, trace->func); 697 rec = ftrace_find_profiled_func(stat, trace->func);
671 if (rec) 698 if (rec) {
672 rec->time += calltime; 699 rec->time += calltime;
700 rec->time_squared += calltime * calltime;
701 }
673 702
674 out: 703 out:
675 local_irq_restore(flags); 704 local_irq_restore(flags);
@@ -1481,6 +1510,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1481 if (*pos > 0) 1510 if (*pos > 0)
1482 return t_hash_start(m, pos); 1511 return t_hash_start(m, pos);
1483 iter->flags |= FTRACE_ITER_PRINTALL; 1512 iter->flags |= FTRACE_ITER_PRINTALL;
1513 /* reset in case of seek/pread */
1514 iter->flags &= ~FTRACE_ITER_HASH;
1484 return iter; 1515 return iter;
1485 } 1516 }
1486 1517
@@ -1861,7 +1892,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1861 struct hlist_head *hhd; 1892 struct hlist_head *hhd;
1862 struct hlist_node *n; 1893 struct hlist_node *n;
1863 unsigned long key; 1894 unsigned long key;
1864 int resched;
1865 1895
1866 key = hash_long(ip, FTRACE_HASH_BITS); 1896 key = hash_long(ip, FTRACE_HASH_BITS);
1867 1897
@@ -1875,12 +1905,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1875 * period. This syncs the hash iteration and freeing of items 1905 * period. This syncs the hash iteration and freeing of items
1876 * on the hash. rcu_read_lock is too dangerous here. 1906 * on the hash. rcu_read_lock is too dangerous here.
1877 */ 1907 */
1878 resched = ftrace_preempt_disable(); 1908 preempt_disable_notrace();
1879 hlist_for_each_entry_rcu(entry, n, hhd, node) { 1909 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1880 if (entry->ip == ip) 1910 if (entry->ip == ip)
1881 entry->ops->func(ip, parent_ip, &entry->data); 1911 entry->ops->func(ip, parent_ip, &entry->data);
1882 } 1912 }
1883 ftrace_preempt_enable(resched); 1913 preempt_enable_notrace();
1884} 1914}
1885 1915
1886static struct ftrace_ops trace_probe_ops __read_mostly = 1916static struct ftrace_ops trace_probe_ops __read_mostly =
@@ -2388,7 +2418,7 @@ static const struct file_operations ftrace_filter_fops = {
2388 .open = ftrace_filter_open, 2418 .open = ftrace_filter_open,
2389 .read = seq_read, 2419 .read = seq_read,
2390 .write = ftrace_filter_write, 2420 .write = ftrace_filter_write,
2391 .llseek = ftrace_regex_lseek, 2421 .llseek = no_llseek,
2392 .release = ftrace_filter_release, 2422 .release = ftrace_filter_release,
2393}; 2423};
2394 2424
@@ -3212,8 +3242,8 @@ free:
3212} 3242}
3213 3243
3214static void 3244static void
3215ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3245ftrace_graph_probe_sched_switch(void *ignore,
3216 struct task_struct *next) 3246 struct task_struct *prev, struct task_struct *next)
3217{ 3247{
3218 unsigned long long timestamp; 3248 unsigned long long timestamp;
3219 int index; 3249 int index;
@@ -3267,7 +3297,7 @@ static int start_graph_tracing(void)
3267 } while (ret == -EAGAIN); 3297 } while (ret == -EAGAIN);
3268 3298
3269 if (!ret) { 3299 if (!ret) {
3270 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); 3300 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3271 if (ret) 3301 if (ret)
3272 pr_info("ftrace_graph: Couldn't activate tracepoint" 3302 pr_info("ftrace_graph: Couldn't activate tracepoint"
3273 " probe to kernel_sched_switch\n"); 3303 " probe to kernel_sched_switch\n");
@@ -3339,11 +3369,11 @@ void unregister_ftrace_graph(void)
3339 goto out; 3369 goto out;
3340 3370
3341 ftrace_graph_active--; 3371 ftrace_graph_active--;
3342 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3343 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3372 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3344 ftrace_graph_entry = ftrace_graph_entry_stub; 3373 ftrace_graph_entry = ftrace_graph_entry_stub;
3345 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3374 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3346 unregister_pm_notifier(&ftrace_suspend_notifier); 3375 unregister_pm_notifier(&ftrace_suspend_notifier);
3376 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3347 3377
3348 out: 3378 out:
3349 mutex_unlock(&ftrace_lock); 3379 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index a91da69f153a..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,511 +0,0 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
11#include <linux/debugfs.h>
12#include <linux/dcache.h>
13#include <linux/fs.h>
14
15#include <linux/kmemtrace.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20/* Select an alternative, minimalistic output than the original one */
21#define TRACE_KMEM_OPT_MINIMAL 0x1
22
23static struct tracer_opt kmem_opts[] = {
24 /* Default disable the minimalistic output */
25 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
26 { }
27};
28
29static struct tracer_flags kmem_tracer_flags = {
30 .val = 0,
31 .opts = kmem_opts
32};
33
34static struct trace_array *kmemtrace_array;
35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
46 struct trace_array *tr = kmemtrace_array;
47 struct kmemtrace_alloc_entry *entry;
48 struct ring_buffer_event *event;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
51 if (!event)
52 return;
53
54 entry = ring_buffer_event_data(event);
55 tracing_generic_entry_update(&entry->ent, 0, 0);
56
57 entry->ent.type = TRACE_KMEM_ALLOC;
58 entry->type_id = type_id;
59 entry->call_site = call_site;
60 entry->ptr = ptr;
61 entry->bytes_req = bytes_req;
62 entry->bytes_alloc = bytes_alloc;
63 entry->gfp_flags = gfp_flags;
64 entry->node = node;
65
66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
68
69 trace_wake_up();
70}
71
72static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
73 unsigned long call_site,
74 const void *ptr)
75{
76 struct ftrace_event_call *call = &event_kmem_free;
77 struct trace_array *tr = kmemtrace_array;
78 struct kmemtrace_free_entry *entry;
79 struct ring_buffer_event *event;
80
81 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
82 if (!event)
83 return;
84 entry = ring_buffer_event_data(event);
85 tracing_generic_entry_update(&entry->ent, 0, 0);
86
87 entry->ent.type = TRACE_KMEM_FREE;
88 entry->type_id = type_id;
89 entry->call_site = call_site;
90 entry->ptr = ptr;
91
92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
94
95 trace_wake_up();
96}
97
98static void kmemtrace_kmalloc(unsigned long call_site,
99 const void *ptr,
100 size_t bytes_req,
101 size_t bytes_alloc,
102 gfp_t gfp_flags)
103{
104 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
105 bytes_req, bytes_alloc, gfp_flags, -1);
106}
107
108static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
109 const void *ptr,
110 size_t bytes_req,
111 size_t bytes_alloc,
112 gfp_t gfp_flags)
113{
114 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
115 bytes_req, bytes_alloc, gfp_flags, -1);
116}
117
118static void kmemtrace_kmalloc_node(unsigned long call_site,
119 const void *ptr,
120 size_t bytes_req,
121 size_t bytes_alloc,
122 gfp_t gfp_flags,
123 int node)
124{
125 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
126 bytes_req, bytes_alloc, gfp_flags, node);
127}
128
129static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
130 const void *ptr,
131 size_t bytes_req,
132 size_t bytes_alloc,
133 gfp_t gfp_flags,
134 int node)
135{
136 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
137 bytes_req, bytes_alloc, gfp_flags, node);
138}
139
140static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
141{
142 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
143}
144
145static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
146{
147 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
148}
149
150static int kmemtrace_start_probes(void)
151{
152 int err;
153
154 err = register_trace_kmalloc(kmemtrace_kmalloc);
155 if (err)
156 return err;
157 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
158 if (err)
159 return err;
160 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
161 if (err)
162 return err;
163 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
164 if (err)
165 return err;
166 err = register_trace_kfree(kmemtrace_kfree);
167 if (err)
168 return err;
169 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
170
171 return err;
172}
173
174static void kmemtrace_stop_probes(void)
175{
176 unregister_trace_kmalloc(kmemtrace_kmalloc);
177 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
178 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
179 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
180 unregister_trace_kfree(kmemtrace_kfree);
181 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
182}
183
184static int kmem_trace_init(struct trace_array *tr)
185{
186 kmemtrace_array = tr;
187
188 tracing_reset_online_cpus(tr);
189
190 kmemtrace_start_probes();
191
192 return 0;
193}
194
195static void kmem_trace_reset(struct trace_array *tr)
196{
197 kmemtrace_stop_probes();
198}
199
200static void kmemtrace_headers(struct seq_file *s)
201{
202 /* Don't need headers for the original kmemtrace output */
203 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
204 return;
205
206 seq_printf(s, "#\n");
207 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
208 " POINTER NODE CALLER\n");
209 seq_printf(s, "# FREE | | | | "
210 " | | | |\n");
211 seq_printf(s, "# |\n\n");
212}
213
214/*
215 * The following functions give the original output from kmemtrace,
216 * plus the origin CPU, since reordering occurs in-kernel now.
217 */
218
219#define KMEMTRACE_USER_ALLOC 0
220#define KMEMTRACE_USER_FREE 1
221
222struct kmemtrace_user_event {
223 u8 event_id;
224 u8 type_id;
225 u16 event_size;
226 u32 cpu;
227 u64 timestamp;
228 unsigned long call_site;
229 unsigned long ptr;
230};
231
232struct kmemtrace_user_event_alloc {
233 size_t bytes_req;
234 size_t bytes_alloc;
235 unsigned gfp_flags;
236 int node;
237};
238
239static enum print_line_t
240kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
241{
242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry;
244 int ret;
245
246 trace_assign_type(entry, iter->ent);
247
248 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
249 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
250 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
251 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
252 (unsigned long)entry->gfp_flags, entry->node);
253
254 if (!ret)
255 return TRACE_TYPE_PARTIAL_LINE;
256 return TRACE_TYPE_HANDLED;
257}
258
259static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags)
261{
262 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry;
264 int ret;
265
266 trace_assign_type(entry, iter->ent);
267
268 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
269 entry->type_id, (void *)entry->call_site,
270 (unsigned long)entry->ptr);
271
272 if (!ret)
273 return TRACE_TYPE_PARTIAL_LINE;
274 return TRACE_TYPE_HANDLED;
275}
276
277static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
279{
280 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry;
282 struct kmemtrace_user_event *ev;
283 struct kmemtrace_user_event_alloc *ev_alloc;
284
285 trace_assign_type(entry, iter->ent);
286
287 ev = trace_seq_reserve(s, sizeof(*ev));
288 if (!ev)
289 return TRACE_TYPE_PARTIAL_LINE;
290
291 ev->event_id = KMEMTRACE_USER_ALLOC;
292 ev->type_id = entry->type_id;
293 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
294 ev->cpu = iter->cpu;
295 ev->timestamp = iter->ts;
296 ev->call_site = entry->call_site;
297 ev->ptr = (unsigned long)entry->ptr;
298
299 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
300 if (!ev_alloc)
301 return TRACE_TYPE_PARTIAL_LINE;
302
303 ev_alloc->bytes_req = entry->bytes_req;
304 ev_alloc->bytes_alloc = entry->bytes_alloc;
305 ev_alloc->gfp_flags = entry->gfp_flags;
306 ev_alloc->node = entry->node;
307
308 return TRACE_TYPE_HANDLED;
309}
310
311static enum print_line_t
312kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
313{
314 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry;
316 struct kmemtrace_user_event *ev;
317
318 trace_assign_type(entry, iter->ent);
319
320 ev = trace_seq_reserve(s, sizeof(*ev));
321 if (!ev)
322 return TRACE_TYPE_PARTIAL_LINE;
323
324 ev->event_id = KMEMTRACE_USER_FREE;
325 ev->type_id = entry->type_id;
326 ev->event_size = sizeof(*ev);
327 ev->cpu = iter->cpu;
328 ev->timestamp = iter->ts;
329 ev->call_site = entry->call_site;
330 ev->ptr = (unsigned long)entry->ptr;
331
332 return TRACE_TYPE_HANDLED;
333}
334
335/* The two following functions provide a more minimalistic output */
336static enum print_line_t
337kmemtrace_print_alloc_compress(struct trace_iterator *iter)
338{
339 struct kmemtrace_alloc_entry *entry;
340 struct trace_seq *s = &iter->seq;
341 int ret;
342
343 trace_assign_type(entry, iter->ent);
344
345 /* Alloc entry */
346 ret = trace_seq_printf(s, " + ");
347 if (!ret)
348 return TRACE_TYPE_PARTIAL_LINE;
349
350 /* Type */
351 switch (entry->type_id) {
352 case KMEMTRACE_TYPE_KMALLOC:
353 ret = trace_seq_printf(s, "K ");
354 break;
355 case KMEMTRACE_TYPE_CACHE:
356 ret = trace_seq_printf(s, "C ");
357 break;
358 case KMEMTRACE_TYPE_PAGES:
359 ret = trace_seq_printf(s, "P ");
360 break;
361 default:
362 ret = trace_seq_printf(s, "? ");
363 }
364
365 if (!ret)
366 return TRACE_TYPE_PARTIAL_LINE;
367
368 /* Requested */
369 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
370 if (!ret)
371 return TRACE_TYPE_PARTIAL_LINE;
372
373 /* Allocated */
374 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
375 if (!ret)
376 return TRACE_TYPE_PARTIAL_LINE;
377
378 /* Flags
379 * TODO: would be better to print the names of the GFP flags
380 */
381 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
382 if (!ret)
383 return TRACE_TYPE_PARTIAL_LINE;
384
385 /* Pointer to allocated */
386 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
387 if (!ret)
388 return TRACE_TYPE_PARTIAL_LINE;
389
390	/* Node and call site */
391 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
392 (void *)entry->call_site);
393 if (!ret)
394 return TRACE_TYPE_PARTIAL_LINE;
395
396 return TRACE_TYPE_HANDLED;
397}
398
399static enum print_line_t
400kmemtrace_print_free_compress(struct trace_iterator *iter)
401{
402 struct kmemtrace_free_entry *entry;
403 struct trace_seq *s = &iter->seq;
404 int ret;
405
406 trace_assign_type(entry, iter->ent);
407
408 /* Free entry */
409 ret = trace_seq_printf(s, " - ");
410 if (!ret)
411 return TRACE_TYPE_PARTIAL_LINE;
412
413 /* Type */
414 switch (entry->type_id) {
415 case KMEMTRACE_TYPE_KMALLOC:
416 ret = trace_seq_printf(s, "K ");
417 break;
418 case KMEMTRACE_TYPE_CACHE:
419 ret = trace_seq_printf(s, "C ");
420 break;
421 case KMEMTRACE_TYPE_PAGES:
422 ret = trace_seq_printf(s, "P ");
423 break;
424 default:
425 ret = trace_seq_printf(s, "? ");
426 }
427
428 if (!ret)
429 return TRACE_TYPE_PARTIAL_LINE;
430
431 /* Skip requested/allocated/flags */
432 ret = trace_seq_printf(s, " ");
433 if (!ret)
434 return TRACE_TYPE_PARTIAL_LINE;
435
436 /* Pointer to allocated */
437 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
438 if (!ret)
439 return TRACE_TYPE_PARTIAL_LINE;
440
441	/* Skip node and print call site */
442 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
443 if (!ret)
444 return TRACE_TYPE_PARTIAL_LINE;
445
446 return TRACE_TYPE_HANDLED;
447}
448
449static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
450{
451 struct trace_entry *entry = iter->ent;
452
453 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
454 return TRACE_TYPE_UNHANDLED;
455
456 switch (entry->type) {
457 case TRACE_KMEM_ALLOC:
458 return kmemtrace_print_alloc_compress(iter);
459 case TRACE_KMEM_FREE:
460 return kmemtrace_print_free_compress(iter);
461 default:
462 return TRACE_TYPE_UNHANDLED;
463 }
464}
465
466static struct trace_event kmem_trace_alloc = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user,
470};
471
472static struct trace_event kmem_trace_free = {
473 .type = TRACE_KMEM_FREE,
474 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user,
476};
477
478static struct tracer kmem_tracer __read_mostly = {
479 .name = "kmemtrace",
480 .init = kmem_trace_init,
481 .reset = kmem_trace_reset,
482 .print_line = kmemtrace_print_line,
483 .print_header = kmemtrace_headers,
484 .flags = &kmem_tracer_flags
485};
486
487void kmemtrace_init(void)
488{
489 /* earliest opportunity to start kmem tracing */
490}
491
492static int __init init_kmem_tracer(void)
493{
494 if (!register_ftrace_event(&kmem_trace_alloc)) {
495 pr_warning("Warning: could not register kmem events\n");
496 return 1;
497 }
498
499 if (!register_ftrace_event(&kmem_trace_free)) {
500 pr_warning("Warning: could not register kmem events\n");
501 return 1;
502 }
503
504 if (register_tracer(&kmem_tracer) != 0) {
505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1;
507 }
508
509 return 0;
510}
511device_initcall(init_kmem_tracer);
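
The kmemtrace plugin is removed outright; the allocator tracepoints it hooked (kmalloc, kmem_cache_alloc, kfree and friends) remain available as generic kmem trace events, so an equivalent consumer can attach to them directly. A minimal sketch under the data-passing probe API; my_kmalloc_probe is a hypothetical name and the pr_debug() format is illustrative only:

	#include <linux/kernel.h>
	#include <trace/events/kmem.h>

	/* same argument list the deleted kmemtrace_kmalloc() handled */
	static void my_kmalloc_probe(void *data, unsigned long call_site,
				     const void *ptr, size_t bytes_req,
				     size_t bytes_alloc, gfp_t gfp_flags)
	{
		pr_debug("kmalloc req=%zu alloc=%zu from %pf\n",
			 bytes_req, bytes_alloc, (void *)call_site);
	}

	/* in module init / exit */
	register_trace_kmalloc(my_kmalloc_probe, NULL);
	/* ... */
	unregister_trace_kmalloc(my_kmalloc_probe, NULL);
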
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41ca394feb22..bca96377fd4e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
319#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
320#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
321 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
322struct buffer_data_page { 327struct buffer_data_page {
323 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
324 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
@@ -338,6 +343,7 @@ struct buffer_page {
338 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
339 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
340 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
341 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
342}; 348};
343 349
@@ -399,7 +405,7 @@ static inline int test_time_stamp(u64 delta)
399#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
400 406
401/* Max number of timestamps that can fit on a page */ 407/* Max number of timestamps that can fit on a page */
402#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) 408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
403 409
404int ring_buffer_print_page_header(struct trace_seq *s) 410int ring_buffer_print_page_header(struct trace_seq *s)
405{ 411{
@@ -417,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
417 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
418 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
419 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
420 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
421 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
422 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -431,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
431 */ 443 */
432struct ring_buffer_per_cpu { 444struct ring_buffer_per_cpu {
433 int cpu; 445 int cpu;
446 atomic_t record_disabled;
434 struct ring_buffer *buffer; 447 struct ring_buffer *buffer;
435 spinlock_t reader_lock; /* serialize readers */ 448 spinlock_t reader_lock; /* serialize readers */
436 arch_spinlock_t lock; 449 arch_spinlock_t lock;
@@ -440,6 +453,8 @@ struct ring_buffer_per_cpu {
440 struct buffer_page *tail_page; /* write to tail */ 453 struct buffer_page *tail_page; /* write to tail */
441 struct buffer_page *commit_page; /* committed pages */ 454 struct buffer_page *commit_page; /* committed pages */
442 struct buffer_page *reader_page; 455 struct buffer_page *reader_page;
456 unsigned long lost_events;
457 unsigned long last_overrun;
443 local_t commit_overrun; 458 local_t commit_overrun;
444 local_t overrun; 459 local_t overrun;
445 local_t entries; 460 local_t entries;
@@ -448,7 +463,6 @@ struct ring_buffer_per_cpu {
448 unsigned long read; 463 unsigned long read;
449 u64 write_stamp; 464 u64 write_stamp;
450 u64 read_stamp; 465 u64 read_stamp;
451 atomic_t record_disabled;
452}; 466};
453 467
454struct ring_buffer { 468struct ring_buffer {
@@ -1754,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1754 * must fill the old tail_page with padding. 1768 * must fill the old tail_page with padding.
1755 */ 1769 */
1756 if (tail >= BUF_PAGE_SIZE) { 1770 if (tail >= BUF_PAGE_SIZE) {
1771 /*
1772 * If the page was filled, then we still need
1773 * to update the real_end. Reset it to zero
1774 * and the reader will ignore it.
1775 */
1776 if (tail == BUF_PAGE_SIZE)
1777 tail_page->real_end = 0;
1778
1757 local_sub(length, &tail_page->write); 1779 local_sub(length, &tail_page->write);
1758 return; 1780 return;
1759 } 1781 }
@@ -1762,6 +1784,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1762 kmemcheck_annotate_bitfield(event, bitfield); 1784 kmemcheck_annotate_bitfield(event, bitfield);
1763 1785
1764 /* 1786 /*
1787 * Save the original length to the meta data.
1788 * This will be used by the reader to add lost event
1789 * counter.
1790 */
1791 tail_page->real_end = tail;
1792
1793 /*
1765 * If this event is bigger than the minimum size, then 1794 * If this event is bigger than the minimum size, then
1766 * we need to be careful that we don't subtract the 1795 * we need to be careful that we don't subtract the
1767 * write counter enough to allow another writer to slip 1796 * write counter enough to allow another writer to slip
@@ -1979,17 +2008,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1979 u64 *ts, u64 *delta) 2008 u64 *ts, u64 *delta)
1980{ 2009{
1981 struct ring_buffer_event *event; 2010 struct ring_buffer_event *event;
1982 static int once;
1983 int ret; 2011 int ret;
1984 2012
1985 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2013 WARN_ONCE(*delta > (1ULL << 59),
1986 printk(KERN_WARNING "Delta way too big! %llu" 2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1987 " ts=%llu write stamp = %llu\n", 2015 (unsigned long long)*delta,
1988 (unsigned long long)*delta, 2016 (unsigned long long)*ts,
1989 (unsigned long long)*ts, 2017 (unsigned long long)cpu_buffer->write_stamp);
1990 (unsigned long long)cpu_buffer->write_stamp);
1991 WARN_ON(1);
1992 }
1993 2018
1994 /* 2019 /*
1995 * The delta is too big, we need to add a 2020
@@ -2217,8 +2242,6 @@ static void trace_recursive_unlock(void)
2217 2242
2218#endif 2243#endif
2219 2244
2220static DEFINE_PER_CPU(int, rb_need_resched);
2221
2222/** 2245/**
2223 * ring_buffer_lock_reserve - reserve a part of the buffer 2246 * ring_buffer_lock_reserve - reserve a part of the buffer
2224 * @buffer: the ring buffer to reserve from 2247 * @buffer: the ring buffer to reserve from
@@ -2239,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2239{ 2262{
2240 struct ring_buffer_per_cpu *cpu_buffer; 2263 struct ring_buffer_per_cpu *cpu_buffer;
2241 struct ring_buffer_event *event; 2264 struct ring_buffer_event *event;
2242 int cpu, resched; 2265 int cpu;
2243 2266
2244 if (ring_buffer_flags != RB_BUFFERS_ON) 2267 if (ring_buffer_flags != RB_BUFFERS_ON)
2245 return NULL; 2268 return NULL;
2246 2269
2247 /* If we are tracing schedule, we don't want to recurse */ 2270 /* If we are tracing schedule, we don't want to recurse */
2248 resched = ftrace_preempt_disable(); 2271 preempt_disable_notrace();
2249 2272
2250 if (atomic_read(&buffer->record_disabled)) 2273 if (atomic_read(&buffer->record_disabled))
2251 goto out_nocheck; 2274 goto out_nocheck;
@@ -2270,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2270 if (!event) 2293 if (!event)
2271 goto out; 2294 goto out;
2272 2295
2273 /*
2274 * Need to store resched state on this cpu.
2275 * Only the first needs to.
2276 */
2277
2278 if (preempt_count() == 1)
2279 per_cpu(rb_need_resched, cpu) = resched;
2280
2281 return event; 2296 return event;
2282 2297
2283 out: 2298 out:
2284 trace_recursive_unlock(); 2299 trace_recursive_unlock();
2285 2300
2286 out_nocheck: 2301 out_nocheck:
2287 ftrace_preempt_enable(resched); 2302 preempt_enable_notrace();
2288 return NULL; 2303 return NULL;
2289} 2304}
2290EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2305EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2330,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2330 2345
2331 trace_recursive_unlock(); 2346 trace_recursive_unlock();
2332 2347
2333 /* 2348 preempt_enable_notrace();
2334 * Only the last preempt count needs to restore preemption.
2335 */
2336 if (preempt_count() == 1)
2337 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2338 else
2339 preempt_enable_no_resched_notrace();
2340 2349
2341 return 0; 2350 return 0;
2342} 2351}
@@ -2444,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
2444 2453
2445 trace_recursive_unlock(); 2454 trace_recursive_unlock();
2446 2455
2447 /* 2456 preempt_enable_notrace();
2448 * Only the last preempt count needs to restore preemption.
2449 */
2450 if (preempt_count() == 1)
2451 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2452 else
2453 preempt_enable_no_resched_notrace();
2454 2457
2455} 2458}
2456EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); 2459EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2476,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
2476 struct ring_buffer_event *event; 2479 struct ring_buffer_event *event;
2477 void *body; 2480 void *body;
2478 int ret = -EBUSY; 2481 int ret = -EBUSY;
2479 int cpu, resched; 2482 int cpu;
2480 2483
2481 if (ring_buffer_flags != RB_BUFFERS_ON) 2484 if (ring_buffer_flags != RB_BUFFERS_ON)
2482 return -EBUSY; 2485 return -EBUSY;
2483 2486
2484 resched = ftrace_preempt_disable(); 2487 preempt_disable_notrace();
2485 2488
2486 if (atomic_read(&buffer->record_disabled)) 2489 if (atomic_read(&buffer->record_disabled))
2487 goto out; 2490 goto out;
@@ -2511,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
2511 2514
2512 ret = 0; 2515 ret = 0;
2513 out: 2516 out:
2514 ftrace_preempt_enable(resched); 2517 preempt_enable_notrace();
2515 2518
2516 return ret; 2519 return ret;
2517} 2520}
@@ -2838,6 +2841,7 @@ static struct buffer_page *
2838rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2841rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2839{ 2842{
2840 struct buffer_page *reader = NULL; 2843 struct buffer_page *reader = NULL;
2844 unsigned long overwrite;
2841 unsigned long flags; 2845 unsigned long flags;
2842 int nr_loops = 0; 2846 int nr_loops = 0;
2843 int ret; 2847 int ret;
@@ -2879,6 +2883,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2879 local_set(&cpu_buffer->reader_page->write, 0); 2883 local_set(&cpu_buffer->reader_page->write, 0);
2880 local_set(&cpu_buffer->reader_page->entries, 0); 2884 local_set(&cpu_buffer->reader_page->entries, 0);
2881 local_set(&cpu_buffer->reader_page->page->commit, 0); 2885 local_set(&cpu_buffer->reader_page->page->commit, 0);
2886 cpu_buffer->reader_page->real_end = 0;
2882 2887
2883 spin: 2888 spin:
2884 /* 2889 /*
@@ -2899,6 +2904,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2899 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2904 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2900 2905
2901 /* 2906 /*
2907 * We want to make sure we read the overruns after we set up our
2908 * pointers to the next object. The writer side does a
2909 * cmpxchg to cross pages which acts as the mb on the writer
2910 * side. Note, the reader will constantly fail the swap
2911 * while the writer is updating the pointers, so this
2912 * guarantees that the overwrite recorded here is the one we
2913 * want to compare with the last_overrun.
2914 */
2915 smp_mb();
2916 overwrite = local_read(&(cpu_buffer->overrun));
2917
2918 /*
2902 * Here's the tricky part. 2919 * Here's the tricky part.
2903 * 2920 *
2904 * We need to move the pointer past the header page. 2921 * We need to move the pointer past the header page.
@@ -2929,6 +2946,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2929 cpu_buffer->reader_page = reader; 2946 cpu_buffer->reader_page = reader;
2930 rb_reset_reader_page(cpu_buffer); 2947 rb_reset_reader_page(cpu_buffer);
2931 2948
2949 if (overwrite != cpu_buffer->last_overrun) {
2950 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2951 cpu_buffer->last_overrun = overwrite;
2952 }
2953
2932 goto again; 2954 goto again;
2933 2955
2934 out: 2956 out:
@@ -2963,13 +2985,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2963 2985
2964static void rb_advance_iter(struct ring_buffer_iter *iter) 2986static void rb_advance_iter(struct ring_buffer_iter *iter)
2965{ 2987{
2966 struct ring_buffer *buffer;
2967 struct ring_buffer_per_cpu *cpu_buffer; 2988 struct ring_buffer_per_cpu *cpu_buffer;
2968 struct ring_buffer_event *event; 2989 struct ring_buffer_event *event;
2969 unsigned length; 2990 unsigned length;
2970 2991
2971 cpu_buffer = iter->cpu_buffer; 2992 cpu_buffer = iter->cpu_buffer;
2972 buffer = cpu_buffer->buffer;
2973 2993
2974 /* 2994 /*
2975 * Check if we are at the end of the buffer. 2995 * Check if we are at the end of the buffer.
@@ -3005,8 +3025,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3005 rb_advance_iter(iter); 3025 rb_advance_iter(iter);
3006} 3026}
3007 3027
3028static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3029{
3030 return cpu_buffer->lost_events;
3031}
3032
3008static struct ring_buffer_event * 3033static struct ring_buffer_event *
3009rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3034rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3035 unsigned long *lost_events)
3010{ 3036{
3011 struct ring_buffer_event *event; 3037 struct ring_buffer_event *event;
3012 struct buffer_page *reader; 3038 struct buffer_page *reader;
@@ -3058,6 +3084,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3058 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3084 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3059 cpu_buffer->cpu, ts); 3085 cpu_buffer->cpu, ts);
3060 } 3086 }
3087 if (lost_events)
3088 *lost_events = rb_lost_events(cpu_buffer);
3061 return event; 3089 return event;
3062 3090
3063 default: 3091 default:
@@ -3168,12 +3196,14 @@ static inline int rb_ok_to_lock(void)
3168 * @buffer: The ring buffer to read 3196 * @buffer: The ring buffer to read
3169 * @cpu: The cpu to peek at 3197 * @cpu: The cpu to peek at
3170 * @ts: The timestamp counter of this event. 3198 * @ts: The timestamp counter of this event.
3199 * @lost_events: a variable to store if events were lost (may be NULL)
3171 * 3200 *
3172 * This will return the event that will be read next, but does 3201 * This will return the event that will be read next, but does
3173 * not consume the data. 3202 * not consume the data.
3174 */ 3203 */
3175struct ring_buffer_event * 3204struct ring_buffer_event *
3176ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3205ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3206 unsigned long *lost_events)
3177{ 3207{
3178 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3208 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3179 struct ring_buffer_event *event; 3209 struct ring_buffer_event *event;
@@ -3188,7 +3218,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3188 local_irq_save(flags); 3218 local_irq_save(flags);
3189 if (dolock) 3219 if (dolock)
3190 spin_lock(&cpu_buffer->reader_lock); 3220 spin_lock(&cpu_buffer->reader_lock);
3191 event = rb_buffer_peek(cpu_buffer, ts); 3221 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3192 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3222 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3193 rb_advance_reader(cpu_buffer); 3223 rb_advance_reader(cpu_buffer);
3194 if (dolock) 3224 if (dolock)
@@ -3230,13 +3260,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3230/** 3260/**
3231 * ring_buffer_consume - return an event and consume it 3261 * ring_buffer_consume - return an event and consume it
3232 * @buffer: The ring buffer to get the next event from 3262 * @buffer: The ring buffer to get the next event from
3263 * @cpu: the cpu to read the buffer from
3264 * @ts: a variable to store the timestamp (may be NULL)
3265 * @lost_events: a variable to store if events were lost (may be NULL)
3233 * 3266 *
3234 * Returns the next event in the ring buffer, and that event is consumed. 3267 * Returns the next event in the ring buffer, and that event is consumed.
3235 * Meaning that sequential reads will keep returning a different event, 3268 * Meaning that sequential reads will keep returning a different event,
3236 * and eventually empty the ring buffer if the producer is slower. 3269 * and eventually empty the ring buffer if the producer is slower.
3237 */ 3270 */
3238struct ring_buffer_event * 3271struct ring_buffer_event *
3239ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3272ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3273 unsigned long *lost_events)
3240{ 3274{
3241 struct ring_buffer_per_cpu *cpu_buffer; 3275 struct ring_buffer_per_cpu *cpu_buffer;
3242 struct ring_buffer_event *event = NULL; 3276 struct ring_buffer_event *event = NULL;
@@ -3257,9 +3291,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3257 if (dolock) 3291 if (dolock)
3258 spin_lock(&cpu_buffer->reader_lock); 3292 spin_lock(&cpu_buffer->reader_lock);
3259 3293
3260 event = rb_buffer_peek(cpu_buffer, ts); 3294 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3261 if (event) 3295 if (event) {
3296 cpu_buffer->lost_events = 0;
3262 rb_advance_reader(cpu_buffer); 3297 rb_advance_reader(cpu_buffer);
3298 }
3263 3299
3264 if (dolock) 3300 if (dolock)
3265 spin_unlock(&cpu_buffer->reader_lock); 3301 spin_unlock(&cpu_buffer->reader_lock);
@@ -3276,23 +3312,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3276EXPORT_SYMBOL_GPL(ring_buffer_consume); 3312EXPORT_SYMBOL_GPL(ring_buffer_consume);
3277 3313
3278/** 3314/**
3279 * ring_buffer_read_start - start a non consuming read of the buffer 3315 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3280 * @buffer: The ring buffer to read from 3316 * @buffer: The ring buffer to read from
3281 * @cpu: The cpu buffer to iterate over 3317 * @cpu: The cpu buffer to iterate over
3282 * 3318 *
3283 * This starts up an iteration through the buffer. It also disables 3319 * This performs the initial preparations necessary to iterate
3284 * the recording to the buffer until the reading is finished. 3320 * through the buffer. Memory is allocated, buffer recording
3285 * This prevents the reading from being corrupted. This is not 3321 * is disabled, and the iterator pointer is returned to the caller.
3286 * a consuming read, so a producer is not expected.
3287 * 3322 *
3288 * Must be paired with ring_buffer_finish. 3323 * Disabling buffer recording prevents the reading from being
3324 * corrupted. This is not a consuming read, so a producer is not
3325 * expected.
3326 *
3327 * After a sequence of ring_buffer_read_prepare calls, the user is
3328 * expected to make at least one call to ring_buffer_read_prepare_sync.
3329 * Afterwards, ring_buffer_read_start is invoked to get things going
3330 * for real.
3331 *
3332 * This overall must be paired with ring_buffer_read_finish.
3289 */ 3333 */
3290struct ring_buffer_iter * 3334struct ring_buffer_iter *
3291ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3335ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3292{ 3336{
3293 struct ring_buffer_per_cpu *cpu_buffer; 3337 struct ring_buffer_per_cpu *cpu_buffer;
3294 struct ring_buffer_iter *iter; 3338 struct ring_buffer_iter *iter;
3295 unsigned long flags;
3296 3339
3297 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3340 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3298 return NULL; 3341 return NULL;
@@ -3306,15 +3349,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3306 iter->cpu_buffer = cpu_buffer; 3349 iter->cpu_buffer = cpu_buffer;
3307 3350
3308 atomic_inc(&cpu_buffer->record_disabled); 3351 atomic_inc(&cpu_buffer->record_disabled);
3352
3353 return iter;
3354}
3355EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3356
3357/**
3358 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3359 *
3360 * All previously invoked ring_buffer_read_prepare calls to prepare
3361 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3362 * calls on those iterators are allowed.
3363 */
3364void
3365ring_buffer_read_prepare_sync(void)
3366{
3309 synchronize_sched(); 3367 synchronize_sched();
3368}
3369EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3370
3371/**
3372 * ring_buffer_read_start - start a non consuming read of the buffer
3373 * @iter: The iterator returned by ring_buffer_read_prepare
3374 *
3375 * This finalizes the startup of an iteration through the buffer.
3376 * The iterator comes from a call to ring_buffer_read_prepare and
3377 * an intervening ring_buffer_read_prepare_sync must have been
3378 * performed.
3379 *
3380 * Must be paired with ring_buffer_read_finish.
3381 */
3382void
3383ring_buffer_read_start(struct ring_buffer_iter *iter)
3384{
3385 struct ring_buffer_per_cpu *cpu_buffer;
3386 unsigned long flags;
3387
3388 if (!iter)
3389 return;
3390
3391 cpu_buffer = iter->cpu_buffer;
3310 3392
3311 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3393 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3312 arch_spin_lock(&cpu_buffer->lock); 3394 arch_spin_lock(&cpu_buffer->lock);
3313 rb_iter_reset(iter); 3395 rb_iter_reset(iter);
3314 arch_spin_unlock(&cpu_buffer->lock); 3396 arch_spin_unlock(&cpu_buffer->lock);
3315 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3397 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3316
3317 return iter;
3318} 3398}
3319EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3399EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3320 3400
@@ -3408,6 +3488,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3408 cpu_buffer->write_stamp = 0; 3488 cpu_buffer->write_stamp = 0;
3409 cpu_buffer->read_stamp = 0; 3489 cpu_buffer->read_stamp = 0;
3410 3490
3491 cpu_buffer->lost_events = 0;
3492 cpu_buffer->last_overrun = 0;
3493
3411 rb_head_page_activate(cpu_buffer); 3494 rb_head_page_activate(cpu_buffer);
3412} 3495}
3413 3496
@@ -3683,6 +3766,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3683 struct ring_buffer_event *event; 3766 struct ring_buffer_event *event;
3684 struct buffer_data_page *bpage; 3767 struct buffer_data_page *bpage;
3685 struct buffer_page *reader; 3768 struct buffer_page *reader;
3769 unsigned long missed_events;
3686 unsigned long flags; 3770 unsigned long flags;
3687 unsigned int commit; 3771 unsigned int commit;
3688 unsigned int read; 3772 unsigned int read;
@@ -3719,6 +3803,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3719 read = reader->read; 3803 read = reader->read;
3720 commit = rb_page_commit(reader); 3804 commit = rb_page_commit(reader);
3721 3805
3806 /* Check if any events were dropped */
3807 missed_events = cpu_buffer->lost_events;
3808
3722 /* 3809 /*
3723 * If this page has been partially read or 3810 * If this page has been partially read or
3724 * if len is not big enough to read the rest of the page or 3811 * if len is not big enough to read the rest of the page or
@@ -3757,6 +3844,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3757 rpos = reader->read; 3844 rpos = reader->read;
3758 pos += size; 3845 pos += size;
3759 3846
3847 if (rpos >= commit)
3848 break;
3849
3760 event = rb_reader_event(cpu_buffer); 3850 event = rb_reader_event(cpu_buffer);
3761 size = rb_event_length(event); 3851 size = rb_event_length(event);
3762 } while (len > size); 3852 } while (len > size);
@@ -3779,9 +3869,42 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3779 local_set(&reader->entries, 0); 3869 local_set(&reader->entries, 0);
3780 reader->read = 0; 3870 reader->read = 0;
3781 *data_page = bpage; 3871 *data_page = bpage;
3872
3873 /*
3874	 * Use the real_end for the data size.
3875 * This gives us a chance to store the lost events
3876 * on the page.
3877 */
3878 if (reader->real_end)
3879 local_set(&bpage->commit, reader->real_end);
3782 } 3880 }
3783 ret = read; 3881 ret = read;
3784 3882
3883 cpu_buffer->lost_events = 0;
3884
3885 commit = local_read(&bpage->commit);
3886 /*
3887 * Set a flag in the commit field if we lost events
3888 */
3889 if (missed_events) {
3890 /* If there is room at the end of the page to save the
3891 * missed events, then record it there.
3892 */
3893 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3894 memcpy(&bpage->data[commit], &missed_events,
3895 sizeof(missed_events));
3896 local_add(RB_MISSED_STORED, &bpage->commit);
3897 commit += sizeof(missed_events);
3898 }
3899 local_add(RB_MISSED_EVENTS, &bpage->commit);
3900 }
3901
3902 /*
3903 * This page may be off to user land. Zero it out here.
3904 */
3905 if (commit < BUF_PAGE_SIZE)
3906 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3907
3785 out_unlock: 3908 out_unlock:
3786 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3909 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3910
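
A page filled in by ring_buffer_read_page() now encodes the lost-event information in the high bits of its commit field: RB_MISSED_EVENTS says events were dropped, RB_MISSED_STORED says the count itself was appended right after the committed data (when it fit). A hedged reader-side sketch, assuming bpage points at the returned struct buffer_data_page; the two flag macros are private to ring_buffer.c, so an out-of-file consumer would need its own copies of the bit definitions:

	unsigned long commit = local_read(&bpage->commit);
	unsigned long size = commit & ~(RB_MISSED_EVENTS | RB_MISSED_STORED);
	unsigned long missed = 0;

	if (commit & RB_MISSED_EVENTS) {
		if (commit & RB_MISSED_STORED)
			/* the count sits immediately after the event data */
			memcpy(&missed, &bpage->data[size], sizeof(missed));
		else
			missed = 1;	/* events lost, exact count unknown */
	}
	/* 'size' is the real amount of event data on the page */

The ring_buffer_benchmark change below takes the blunt approach of masking the flag bits off with 0xfffff and ignoring the count.
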
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c7982255..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
81 int *entry; 81 int *entry;
82 u64 ts; 82 u64 ts;
83 83
84 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
85 if (!event) 85 if (!event)
86 return EVENT_DROPPED; 86 return EVENT_DROPPED;
87 87
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
114 if (ret >= 0) { 114 if (ret >= 0) {
115 rpage = bpage; 115 rpage = bpage;
116 commit = local_read(&rpage->commit); 116 /* The commit may have missed event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
117 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
118 119
119 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44f916a04065..9ec59f541156 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
101 preempt_enable(); 101 preempt_enable();
102} 102}
103 103
104static cpumask_var_t __read_mostly tracing_buffer_mask; 104cpumask_var_t __read_mostly tracing_buffer_mask;
105
106#define for_each_tracing_cpu(cpu) \
107 for_each_cpu(cpu, tracing_buffer_mask)
108 105
109/* 106/*
110 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 107 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -117,9 +114,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
117 * 114 *
118 * It is default off, but you can enable it with either specifying 115 * It is default off, but you can enable it with either specifying
119 * "ftrace_dump_on_oops" in the kernel command line, or setting 116 * "ftrace_dump_on_oops" in the kernel command line, or setting
120 * /proc/sys/kernel/ftrace_dump_on_oops to true. 117 * /proc/sys/kernel/ftrace_dump_on_oops
118 * Set 1 if you want to dump buffers of all CPUs
119 * Set 2 if you want to dump the buffer of the CPU that triggered oops
121 */ 120 */
122int ftrace_dump_on_oops; 121
122enum ftrace_dump_mode ftrace_dump_on_oops;
123 123
124static int tracing_set_tracer(const char *buf); 124static int tracing_set_tracer(const char *buf);
125 125
@@ -139,8 +139,17 @@ __setup("ftrace=", set_cmdline_ftrace);
139 139
140static int __init set_ftrace_dump_on_oops(char *str) 140static int __init set_ftrace_dump_on_oops(char *str)
141{ 141{
142 ftrace_dump_on_oops = 1; 142 if (*str++ != '=' || !*str) {
143 return 1; 143 ftrace_dump_on_oops = DUMP_ALL;
144 return 1;
145 }
146
147 if (!strcmp("orig_cpu", str)) {
148 ftrace_dump_on_oops = DUMP_ORIG;
149 return 1;
150 }
151
152 return 0;
144} 153}
145__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 154__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
146 155
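
set_ftrace_dump_on_oops() now accepts an optional value: the bare parameter keeps the old behaviour of dumping every CPU's buffer (DUMP_ALL), "=orig_cpu" restricts the dump to the CPU that triggered the oops (DUMP_ORIG), and anything else is rejected. On the kernel command line:

	ftrace_dump_on_oops		# dump all CPU buffers on an oops
	ftrace_dump_on_oops=orig_cpu	# dump only the oopsing CPU's buffer
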
@@ -332,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
332/* trace_flags holds trace_options default values */ 341/* trace_flags holds trace_options default values */
333unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 342unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
334 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 343 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
335 TRACE_ITER_GRAPH_TIME; 344 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
336 345
337static int trace_stop_count; 346static int trace_stop_count;
338static DEFINE_SPINLOCK(tracing_start_lock); 347static DEFINE_SPINLOCK(tracing_start_lock);
@@ -416,6 +425,7 @@ static const char *trace_options[] = {
416 "latency-format", 425 "latency-format",
417 "sleep-time", 426 "sleep-time",
418 "graph-time", 427 "graph-time",
428 "record-cmd",
419 NULL 429 NULL
420}; 430};
421 431
@@ -647,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
647 return; 657 return;
648 658
649 WARN_ON_ONCE(!irqs_disabled()); 659 WARN_ON_ONCE(!irqs_disabled());
660 if (!current_trace->use_max_tr) {
661 WARN_ON_ONCE(1);
662 return;
663 }
650 arch_spin_lock(&ftrace_max_lock); 664 arch_spin_lock(&ftrace_max_lock);
651 665
652 tr->buffer = max_tr.buffer; 666 tr->buffer = max_tr.buffer;
@@ -673,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
673 return; 687 return;
674 688
675 WARN_ON_ONCE(!irqs_disabled()); 689 WARN_ON_ONCE(!irqs_disabled());
690 if (!current_trace->use_max_tr) {
691 WARN_ON_ONCE(1);
692 return;
693 }
694
676 arch_spin_lock(&ftrace_max_lock); 695 arch_spin_lock(&ftrace_max_lock);
677 696
678 ftrace_disable_cpu(); 697 ftrace_disable_cpu();
@@ -717,18 +736,11 @@ __acquires(kernel_lock)
717 return -1; 736 return -1;
718 } 737 }
719 738
720 if (strlen(type->name) > MAX_TRACER_SIZE) { 739 if (strlen(type->name) >= MAX_TRACER_SIZE) {
721 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); 740 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
722 return -1; 741 return -1;
723 } 742 }
724 743
725 /*
726 * When this gets called we hold the BKL which means that
727 * preemption is disabled. Various trace selftests however
728 * need to disable and enable preemption for successful tests.
729 * So we drop the BKL here and grab it after the tests again.
730 */
731 unlock_kernel();
732 mutex_lock(&trace_types_lock); 744 mutex_lock(&trace_types_lock);
733 745
734 tracing_selftest_running = true; 746 tracing_selftest_running = true;
@@ -810,7 +822,6 @@ __acquires(kernel_lock)
810#endif 822#endif
811 823
812 out_unlock: 824 out_unlock:
813 lock_kernel();
814 return ret; 825 return ret;
815} 826}
816 827
@@ -1319,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1319 1330
1320#endif /* CONFIG_STACKTRACE */ 1331#endif /* CONFIG_STACKTRACE */
1321 1332
1322static void
1323ftrace_trace_special(void *__tr,
1324 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1325 int pc)
1326{
1327 struct ftrace_event_call *call = &event_special;
1328 struct ring_buffer_event *event;
1329 struct trace_array *tr = __tr;
1330 struct ring_buffer *buffer = tr->buffer;
1331 struct special_entry *entry;
1332
1333 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1334 sizeof(*entry), 0, pc);
1335 if (!event)
1336 return;
1337 entry = ring_buffer_event_data(event);
1338 entry->arg1 = arg1;
1339 entry->arg2 = arg2;
1340 entry->arg3 = arg3;
1341
1342 if (!filter_check_discard(call, entry, buffer, event))
1343 trace_buffer_unlock_commit(buffer, event, 0, pc);
1344}
1345
1346void
1347__trace_special(void *__tr, void *__data,
1348 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1349{
1350 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1351}
1352
1353void
1354ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1355{
1356 struct trace_array *tr = &global_trace;
1357 struct trace_array_cpu *data;
1358 unsigned long flags;
1359 int cpu;
1360 int pc;
1361
1362 if (tracing_disabled)
1363 return;
1364
1365 pc = preempt_count();
1366 local_irq_save(flags);
1367 cpu = raw_smp_processor_id();
1368 data = tr->data[cpu];
1369
1370 if (likely(atomic_inc_return(&data->disabled) == 1))
1371 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1372
1373 atomic_dec(&data->disabled);
1374 local_irq_restore(flags);
1375}
1376
1377/** 1333/**
1378 * trace_vbprintk - write binary msg to tracing buffer 1334 * trace_vbprintk - write binary msg to tracing buffer
1379 * 1335 *
@@ -1392,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1392 struct bprint_entry *entry; 1348 struct bprint_entry *entry;
1393 unsigned long flags; 1349 unsigned long flags;
1394 int disable; 1350 int disable;
1395 int resched;
1396 int cpu, len = 0, size, pc; 1351 int cpu, len = 0, size, pc;
1397 1352
1398 if (unlikely(tracing_selftest_running || tracing_disabled)) 1353 if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1402,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1402 pause_graph_tracing(); 1357 pause_graph_tracing();
1403 1358
1404 pc = preempt_count(); 1359 pc = preempt_count();
1405 resched = ftrace_preempt_disable(); 1360 preempt_disable_notrace();
1406 cpu = raw_smp_processor_id(); 1361 cpu = raw_smp_processor_id();
1407 data = tr->data[cpu]; 1362 data = tr->data[cpu];
1408 1363
@@ -1440,7 +1395,7 @@ out_unlock:
1440 1395
1441out: 1396out:
1442 atomic_dec_return(&data->disabled); 1397 atomic_dec_return(&data->disabled);
1443 ftrace_preempt_enable(resched); 1398 preempt_enable_notrace();
1444 unpause_graph_tracing(); 1399 unpause_graph_tracing();
1445 1400
1446 return len; 1401 return len;
@@ -1527,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1527} 1482}
1528EXPORT_SYMBOL_GPL(trace_vprintk); 1483EXPORT_SYMBOL_GPL(trace_vprintk);
1529 1484
1530enum trace_file_type {
1531 TRACE_FILE_LAT_FMT = 1,
1532 TRACE_FILE_ANNOTATE = 2,
1533};
1534
1535static void trace_iterator_increment(struct trace_iterator *iter) 1485static void trace_iterator_increment(struct trace_iterator *iter)
1536{ 1486{
1537 /* Don't allow ftrace to trace into the ring buffers */ 1487 /* Don't allow ftrace to trace into the ring buffers */
@@ -1545,7 +1495,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1545} 1495}
1546 1496
1547static struct trace_entry * 1497static struct trace_entry *
1548peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1498peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1499 unsigned long *lost_events)
1549{ 1500{
1550 struct ring_buffer_event *event; 1501 struct ring_buffer_event *event;
1551 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1502 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1507,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1556 if (buf_iter) 1507 if (buf_iter)
1557 event = ring_buffer_iter_peek(buf_iter, ts); 1508 event = ring_buffer_iter_peek(buf_iter, ts);
1558 else 1509 else
1559 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1510 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1511 lost_events);
1560 1512
1561 ftrace_enable_cpu(); 1513 ftrace_enable_cpu();
1562 1514
@@ -1564,10 +1516,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1564} 1516}
1565 1517
1566static struct trace_entry * 1518static struct trace_entry *
1567__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1519__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1520 unsigned long *missing_events, u64 *ent_ts)
1568{ 1521{
1569 struct ring_buffer *buffer = iter->tr->buffer; 1522 struct ring_buffer *buffer = iter->tr->buffer;
1570 struct trace_entry *ent, *next = NULL; 1523 struct trace_entry *ent, *next = NULL;
1524 unsigned long lost_events = 0, next_lost = 0;
1571 int cpu_file = iter->cpu_file; 1525 int cpu_file = iter->cpu_file;
1572 u64 next_ts = 0, ts; 1526 u64 next_ts = 0, ts;
1573 int next_cpu = -1; 1527 int next_cpu = -1;
@@ -1580,7 +1534,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1580 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1534 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1581 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1535 if (ring_buffer_empty_cpu(buffer, cpu_file))
1582 return NULL; 1536 return NULL;
1583 ent = peek_next_entry(iter, cpu_file, ent_ts); 1537 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1584 if (ent_cpu) 1538 if (ent_cpu)
1585 *ent_cpu = cpu_file; 1539 *ent_cpu = cpu_file;
1586 1540
@@ -1592,7 +1546,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1592 if (ring_buffer_empty_cpu(buffer, cpu)) 1546 if (ring_buffer_empty_cpu(buffer, cpu))
1593 continue; 1547 continue;
1594 1548
1595 ent = peek_next_entry(iter, cpu, &ts); 1549 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1596 1550
1597 /* 1551 /*
1598 * Pick the entry with the smallest timestamp: 1552 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1555,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1601 next = ent; 1555 next = ent;
1602 next_cpu = cpu; 1556 next_cpu = cpu;
1603 next_ts = ts; 1557 next_ts = ts;
1558 next_lost = lost_events;
1604 } 1559 }
1605 } 1560 }
1606 1561
@@ -1610,6 +1565,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1610 if (ent_ts) 1565 if (ent_ts)
1611 *ent_ts = next_ts; 1566 *ent_ts = next_ts;
1612 1567
1568 if (missing_events)
1569 *missing_events = next_lost;
1570
1613 return next; 1571 return next;
1614} 1572}
1615 1573
@@ -1617,13 +1575,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1617struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1575struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1618 int *ent_cpu, u64 *ent_ts) 1576 int *ent_cpu, u64 *ent_ts)
1619{ 1577{
1620 return __find_next_entry(iter, ent_cpu, ent_ts); 1578 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1621} 1579}
1622 1580
1623/* Find the next real entry, and increment the iterator to the next entry */ 1581/* Find the next real entry, and increment the iterator to the next entry */
1624static void *find_next_entry_inc(struct trace_iterator *iter) 1582void *trace_find_next_entry_inc(struct trace_iterator *iter)
1625{ 1583{
1626 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1584 iter->ent = __find_next_entry(iter, &iter->cpu,
1585 &iter->lost_events, &iter->ts);
1627 1586
1628 if (iter->ent) 1587 if (iter->ent)
1629 trace_iterator_increment(iter); 1588 trace_iterator_increment(iter);
@@ -1635,7 +1594,8 @@ static void trace_consume(struct trace_iterator *iter)
1635{ 1594{
1636 /* Don't allow ftrace to trace into the ring buffers */ 1595 /* Don't allow ftrace to trace into the ring buffers */
1637 ftrace_disable_cpu(); 1596 ftrace_disable_cpu();
1638 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1597 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1598 &iter->lost_events);
1639 ftrace_enable_cpu(); 1599 ftrace_enable_cpu();
1640} 1600}
1641 1601
@@ -1654,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1654 return NULL; 1614 return NULL;
1655 1615
1656 if (iter->idx < 0) 1616 if (iter->idx < 0)
1657 ent = find_next_entry_inc(iter); 1617 ent = trace_find_next_entry_inc(iter);
1658 else 1618 else
1659 ent = iter; 1619 ent = iter;
1660 1620
1661 while (ent && iter->idx < i) 1621 while (ent && iter->idx < i)
1662 ent = find_next_entry_inc(iter); 1622 ent = trace_find_next_entry_inc(iter);
1663 1623
1664 iter->pos = *pos; 1624 iter->pos = *pos;
1665 1625
1666 return ent; 1626 return ent;
1667} 1627}
1668 1628
1669static void tracing_iter_reset(struct trace_iterator *iter, int cpu) 1629void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1670{ 1630{
1671 struct trace_array *tr = iter->tr; 1631 struct trace_array *tr = iter->tr;
1672 struct ring_buffer_event *event; 1632 struct ring_buffer_event *event;
@@ -1786,7 +1746,7 @@ static void print_func_help_header(struct seq_file *m)
1786} 1746}
1787 1747
1788 1748
1789static void 1749void
1790print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1750print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1791{ 1751{
1792 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1752 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1914,7 +1874,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1914 } 1874 }
1915 1875
1916 if (event) 1876 if (event)
1917 return event->trace(iter, sym_flags); 1877 return event->funcs->trace(iter, sym_flags, event);
1918 1878
1919 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 1879 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1920 goto partial; 1880 goto partial;
@@ -1940,7 +1900,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1940 1900
1941 event = ftrace_find_event(entry->type); 1901 event = ftrace_find_event(entry->type);
1942 if (event) 1902 if (event)
1943 return event->raw(iter, 0); 1903 return event->funcs->raw(iter, 0, event);
1944 1904
1945 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 1905 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1946 goto partial; 1906 goto partial;
@@ -1967,7 +1927,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1967 1927
1968 event = ftrace_find_event(entry->type); 1928 event = ftrace_find_event(entry->type);
1969 if (event) { 1929 if (event) {
1970 enum print_line_t ret = event->hex(iter, 0); 1930 enum print_line_t ret = event->funcs->hex(iter, 0, event);
1971 if (ret != TRACE_TYPE_HANDLED) 1931 if (ret != TRACE_TYPE_HANDLED)
1972 return ret; 1932 return ret;
1973 } 1933 }
@@ -1992,10 +1952,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1992 } 1952 }
1993 1953
1994 event = ftrace_find_event(entry->type); 1954 event = ftrace_find_event(entry->type);
1995 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 1955 return event ? event->funcs->binary(iter, 0, event) :
1956 TRACE_TYPE_HANDLED;
1996} 1957}
1997 1958
1998static int trace_empty(struct trace_iterator *iter) 1959int trace_empty(struct trace_iterator *iter)
1999{ 1960{
2000 int cpu; 1961 int cpu;
2001 1962
@@ -2026,10 +1987,14 @@ static int trace_empty(struct trace_iterator *iter)
2026} 1987}
2027 1988
2028/* Called with trace_event_read_lock() held. */ 1989/* Called with trace_event_read_lock() held. */
2029static enum print_line_t print_trace_line(struct trace_iterator *iter) 1990enum print_line_t print_trace_line(struct trace_iterator *iter)
2030{ 1991{
2031 enum print_line_t ret; 1992 enum print_line_t ret;
2032 1993
1994 if (iter->lost_events)
1995 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
1996 iter->cpu, iter->lost_events);
1997
2033 if (iter->trace && iter->trace->print_line) { 1998 if (iter->trace && iter->trace->print_line) {
2034 ret = iter->trace->print_line(iter); 1999 ret = iter->trace->print_line(iter);
2035 if (ret != TRACE_TYPE_UNHANDLED) 2000 if (ret != TRACE_TYPE_UNHANDLED)
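
With iter->lost_events filled in by the consuming-read path above, print_trace_line() now prefixes the next entry with an annotation built from the format string shown here; the CPU number and count below are hypothetical values:

	CPU:2 [LOST 153 EVENTS]
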
@@ -2058,6 +2023,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2058 return print_trace_fmt(iter); 2023 return print_trace_fmt(iter);
2059} 2024}
2060 2025
2026void trace_default_header(struct seq_file *m)
2027{
2028 struct trace_iterator *iter = m->private;
2029
2030 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2031 /* print nothing if the buffers are empty */
2032 if (trace_empty(iter))
2033 return;
2034 print_trace_header(m, iter);
2035 if (!(trace_flags & TRACE_ITER_VERBOSE))
2036 print_lat_help_header(m);
2037 } else {
2038 if (!(trace_flags & TRACE_ITER_VERBOSE))
2039 print_func_help_header(m);
2040 }
2041}
2042
2061static int s_show(struct seq_file *m, void *v) 2043static int s_show(struct seq_file *m, void *v)
2062{ 2044{
2063 struct trace_iterator *iter = v; 2045 struct trace_iterator *iter = v;
@@ -2070,17 +2052,9 @@ static int s_show(struct seq_file *m, void *v)
2070 } 2052 }
2071 if (iter->trace && iter->trace->print_header) 2053 if (iter->trace && iter->trace->print_header)
2072 iter->trace->print_header(m); 2054 iter->trace->print_header(m);
2073 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2055 else
2074 /* print nothing if the buffers are empty */ 2056 trace_default_header(m);
2075 if (trace_empty(iter)) 2057
2076 return 0;
2077 print_trace_header(m, iter);
2078 if (!(trace_flags & TRACE_ITER_VERBOSE))
2079 print_lat_help_header(m);
2080 } else {
2081 if (!(trace_flags & TRACE_ITER_VERBOSE))
2082 print_func_help_header(m);
2083 }
2084 } else if (iter->leftover) { 2058 } else if (iter->leftover) {
2085 /* 2059 /*
2086 * If we filled the seq_file buffer earlier, we 2060 * If we filled the seq_file buffer earlier, we
@@ -2166,15 +2140,20 @@ __tracing_open(struct inode *inode, struct file *file)
2166 2140
2167 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2141 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2168 for_each_tracing_cpu(cpu) { 2142 for_each_tracing_cpu(cpu) {
2169
2170 iter->buffer_iter[cpu] = 2143 iter->buffer_iter[cpu] =
2171 ring_buffer_read_start(iter->tr->buffer, cpu); 2144 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2145 }
2146 ring_buffer_read_prepare_sync();
2147 for_each_tracing_cpu(cpu) {
2148 ring_buffer_read_start(iter->buffer_iter[cpu]);
2172 tracing_iter_reset(iter, cpu); 2149 tracing_iter_reset(iter, cpu);
2173 } 2150 }
2174 } else { 2151 } else {
2175 cpu = iter->cpu_file; 2152 cpu = iter->cpu_file;
2176 iter->buffer_iter[cpu] = 2153 iter->buffer_iter[cpu] =
2177 ring_buffer_read_start(iter->tr->buffer, cpu); 2154 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2155 ring_buffer_read_prepare_sync();
2156 ring_buffer_read_start(iter->buffer_iter[cpu]);
2178 tracing_iter_reset(iter, cpu); 2157 tracing_iter_reset(iter, cpu);
2179 } 2158 }
2180 2159
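
The hunk above is the core of the iterator-start rework: instead of each CPU doing its own ring_buffer_read_start() (which used to synchronize internally), every per-CPU iterator is first prepared, one ring_buffer_read_prepare_sync() is issued for all of them, and only then is each iterator started and reset. Restated without the diff columns, the new open sequence looks like the sketch below; error handling for a failed prepare is omitted, as it is in the function itself.

    /* New two-phase iterator setup used by __tracing_open(). */
    if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
            /* Phase 1: prepare one iterator per tracing CPU. */
            for_each_tracing_cpu(cpu)
                    iter->buffer_iter[cpu] =
                            ring_buffer_read_prepare(iter->tr->buffer, cpu);

            /* A single synchronization point covers all prepared iterators. */
            ring_buffer_read_prepare_sync();

            /* Phase 2: start reading and reset the trace iterator per CPU. */
            for_each_tracing_cpu(cpu) {
                    ring_buffer_read_start(iter->buffer_iter[cpu]);
                    tracing_iter_reset(iter, cpu);
            }
    } else {
            cpu = iter->cpu_file;
            iter->buffer_iter[cpu] =
                    ring_buffer_read_prepare(iter->tr->buffer, cpu);
            ring_buffer_read_prepare_sync();
            ring_buffer_read_start(iter->buffer_iter[cpu]);
            tracing_iter_reset(iter, cpu);
    }

The point of the split is that one synchronization can cover all CPUs instead of being paid once per CPU when the trace file is opened.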
@@ -2353,6 +2332,7 @@ static const struct file_operations show_traces_fops = {
2353 .open = show_traces_open, 2332 .open = show_traces_open,
2354 .read = seq_read, 2333 .read = seq_read,
2355 .release = seq_release, 2334 .release = seq_release,
2335 .llseek = seq_lseek,
2356}; 2336};
2357 2337
2358/* 2338/*
@@ -2446,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = {
2446 .open = tracing_open_generic, 2426 .open = tracing_open_generic,
2447 .read = tracing_cpumask_read, 2427 .read = tracing_cpumask_read,
2448 .write = tracing_cpumask_write, 2428 .write = tracing_cpumask_write,
2429 .llseek = generic_file_llseek,
2449}; 2430};
2450 2431
2451static int tracing_trace_options_show(struct seq_file *m, void *v) 2432static int tracing_trace_options_show(struct seq_file *m, void *v)
@@ -2521,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2521 trace_flags |= mask; 2502 trace_flags |= mask;
2522 else 2503 else
2523 trace_flags &= ~mask; 2504 trace_flags &= ~mask;
2505
2506 if (mask == TRACE_ITER_RECORD_CMD)
2507 trace_event_enable_cmd_record(enabled);
2524} 2508}
2525 2509
2526static ssize_t 2510static ssize_t
@@ -2612,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2612static const struct file_operations tracing_readme_fops = { 2596static const struct file_operations tracing_readme_fops = {
2613 .open = tracing_open_generic, 2597 .open = tracing_open_generic,
2614 .read = tracing_readme_read, 2598 .read = tracing_readme_read,
2599 .llseek = generic_file_llseek,
2615}; 2600};
2616 2601
2617static ssize_t 2602static ssize_t
@@ -2662,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2662static const struct file_operations tracing_saved_cmdlines_fops = { 2647static const struct file_operations tracing_saved_cmdlines_fops = {
2663 .open = tracing_open_generic, 2648 .open = tracing_open_generic,
2664 .read = tracing_saved_cmdlines_read, 2649 .read = tracing_saved_cmdlines_read,
2650 .llseek = generic_file_llseek,
2665}; 2651};
2666 2652
2667static ssize_t 2653static ssize_t
@@ -2757,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
2757 if (ret < 0) 2743 if (ret < 0)
2758 return ret; 2744 return ret;
2759 2745
2746 if (!current_trace->use_max_tr)
2747 goto out;
2748
2760 ret = ring_buffer_resize(max_tr.buffer, size); 2749 ret = ring_buffer_resize(max_tr.buffer, size);
2761 if (ret < 0) { 2750 if (ret < 0) {
2762 int r; 2751 int r;
@@ -2784,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
2784 return ret; 2773 return ret;
2785 } 2774 }
2786 2775
2776 max_tr.entries = size;
2777 out:
2787 global_trace.entries = size; 2778 global_trace.entries = size;
2788 2779
2789 return ret; 2780 return ret;
2790} 2781}
2791 2782
2783
2792/** 2784/**
2793 * tracing_update_buffers - used by tracing facility to expand ring buffers 2785 * tracing_update_buffers - used by tracing facility to expand ring buffers
2794 * 2786 *
@@ -2849,12 +2841,26 @@ static int tracing_set_tracer(const char *buf)
2849 trace_branch_disable(); 2841 trace_branch_disable();
2850 if (current_trace && current_trace->reset) 2842 if (current_trace && current_trace->reset)
2851 current_trace->reset(tr); 2843 current_trace->reset(tr);
2852 2844 if (current_trace && current_trace->use_max_tr) {
2845 /*
 2846 * We don't free the ring buffer. Instead, we resize it because
 2847 * the max_tr ring buffer has some state (e.g. ring->clock) and
 2848 * we want to preserve it.
2849 */
2850 ring_buffer_resize(max_tr.buffer, 1);
2851 max_tr.entries = 1;
2852 }
2853 destroy_trace_option_files(topts); 2853 destroy_trace_option_files(topts);
2854 2854
2855 current_trace = t; 2855 current_trace = t;
2856 2856
2857 topts = create_trace_option_files(current_trace); 2857 topts = create_trace_option_files(current_trace);
2858 if (current_trace->use_max_tr) {
2859 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
2860 if (ret < 0)
2861 goto out;
2862 max_tr.entries = global_trace.entries;
2863 }
2858 2864
2859 if (t->init) { 2865 if (t->init) {
2860 ret = tracer_init(t, tr); 2866 ret = tracer_init(t, tr);
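
Taken together with the resize-path change a few hunks earlier and the new use_max_tr field added to struct tracer in trace.h below, this makes the max-latency snapshot buffer lazy: max_tr stays at a single entry until a tracer that needs it is selected, and is shrunk back when that tracer is switched away from. A latency tracer opts in simply by setting the flag; everything in the sketch below other than .use_max_tr is an illustrative placeholder.

    /* Sketch: a tracer that asks tracing_set_tracer() to size max_tr for it. */
    static int  example_lat_init(struct trace_array *tr);   /* hypothetical */
    static void example_lat_reset(struct trace_array *tr);  /* hypothetical */

    static struct tracer example_lat_tracer __read_mostly = {
            .name       = "example_lat",     /* hypothetical name */
            .init       = example_lat_init,
            .reset      = example_lat_reset,
            .print_max  = 1,
            .use_max_tr = 1,  /* max_tr is resized on select, shrunk on deselect */
    };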
@@ -2991,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2991 if (iter->trace->pipe_open) 2997 if (iter->trace->pipe_open)
2992 iter->trace->pipe_open(iter); 2998 iter->trace->pipe_open(iter);
2993 2999
3000 nonseekable_open(inode, filp);
2994out: 3001out:
2995 mutex_unlock(&trace_types_lock); 3002 mutex_unlock(&trace_types_lock);
2996 return ret; 3003 return ret;
@@ -3170,7 +3177,7 @@ waitagain:
3170 3177
3171 trace_event_read_lock(); 3178 trace_event_read_lock();
3172 trace_access_lock(iter->cpu_file); 3179 trace_access_lock(iter->cpu_file);
3173 while (find_next_entry_inc(iter) != NULL) { 3180 while (trace_find_next_entry_inc(iter) != NULL) {
3174 enum print_line_t ret; 3181 enum print_line_t ret;
3175 int len = iter->seq.len; 3182 int len = iter->seq.len;
3176 3183
@@ -3253,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3253 if (ret != TRACE_TYPE_NO_CONSUME) 3260 if (ret != TRACE_TYPE_NO_CONSUME)
3254 trace_consume(iter); 3261 trace_consume(iter);
3255 rem -= count; 3262 rem -= count;
3256 if (!find_next_entry_inc(iter)) { 3263 if (!trace_find_next_entry_inc(iter)) {
3257 rem = 0; 3264 rem = 0;
3258 iter->ent = NULL; 3265 iter->ent = NULL;
3259 break; 3266 break;
@@ -3269,12 +3276,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3269 size_t len, 3276 size_t len,
3270 unsigned int flags) 3277 unsigned int flags)
3271{ 3278{
3272 struct page *pages[PIPE_BUFFERS]; 3279 struct page *pages_def[PIPE_DEF_BUFFERS];
3273 struct partial_page partial[PIPE_BUFFERS]; 3280 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3274 struct trace_iterator *iter = filp->private_data; 3281 struct trace_iterator *iter = filp->private_data;
3275 struct splice_pipe_desc spd = { 3282 struct splice_pipe_desc spd = {
3276 .pages = pages, 3283 .pages = pages_def,
3277 .partial = partial, 3284 .partial = partial_def,
3278 .nr_pages = 0, /* This gets updated below. */ 3285 .nr_pages = 0, /* This gets updated below. */
3279 .flags = flags, 3286 .flags = flags,
3280 .ops = &tracing_pipe_buf_ops, 3287 .ops = &tracing_pipe_buf_ops,
@@ -3285,6 +3292,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3285 size_t rem; 3292 size_t rem;
3286 unsigned int i; 3293 unsigned int i;
3287 3294
3295 if (splice_grow_spd(pipe, &spd))
3296 return -ENOMEM;
3297
3288 /* copy the tracer to avoid using a global lock all around */ 3298 /* copy the tracer to avoid using a global lock all around */
3289 mutex_lock(&trace_types_lock); 3299 mutex_lock(&trace_types_lock);
3290 if (unlikely(old_tracer != current_trace && current_trace)) { 3300 if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3306,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3306 if (ret <= 0) 3316 if (ret <= 0)
3307 goto out_err; 3317 goto out_err;
3308 3318
3309 if (!iter->ent && !find_next_entry_inc(iter)) { 3319 if (!iter->ent && !trace_find_next_entry_inc(iter)) {
3310 ret = -EFAULT; 3320 ret = -EFAULT;
3311 goto out_err; 3321 goto out_err;
3312 } 3322 }
@@ -3315,23 +3325,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3315 trace_access_lock(iter->cpu_file); 3325 trace_access_lock(iter->cpu_file);
3316 3326
3317 /* Fill as many pages as possible. */ 3327 /* Fill as many pages as possible. */
3318 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3328 for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
3319 pages[i] = alloc_page(GFP_KERNEL); 3329 spd.pages[i] = alloc_page(GFP_KERNEL);
3320 if (!pages[i]) 3330 if (!spd.pages[i])
3321 break; 3331 break;
3322 3332
3323 rem = tracing_fill_pipe_page(rem, iter); 3333 rem = tracing_fill_pipe_page(rem, iter);
3324 3334
3325 /* Copy the data into the page, so we can start over. */ 3335 /* Copy the data into the page, so we can start over. */
3326 ret = trace_seq_to_buffer(&iter->seq, 3336 ret = trace_seq_to_buffer(&iter->seq,
3327 page_address(pages[i]), 3337 page_address(spd.pages[i]),
3328 iter->seq.len); 3338 iter->seq.len);
3329 if (ret < 0) { 3339 if (ret < 0) {
3330 __free_page(pages[i]); 3340 __free_page(spd.pages[i]);
3331 break; 3341 break;
3332 } 3342 }
3333 partial[i].offset = 0; 3343 spd.partial[i].offset = 0;
3334 partial[i].len = iter->seq.len; 3344 spd.partial[i].len = iter->seq.len;
3335 3345
3336 trace_seq_init(&iter->seq); 3346 trace_seq_init(&iter->seq);
3337 } 3347 }
@@ -3342,12 +3352,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3342 3352
3343 spd.nr_pages = i; 3353 spd.nr_pages = i;
3344 3354
3345 return splice_to_pipe(pipe, &spd); 3355 ret = splice_to_pipe(pipe, &spd);
3356out:
3357 splice_shrink_spd(pipe, &spd);
3358 return ret;
3346 3359
3347out_err: 3360out_err:
3348 mutex_unlock(&iter->mutex); 3361 mutex_unlock(&iter->mutex);
3349 3362 goto out;
3350 return ret;
3351} 3363}
3352 3364
3353static ssize_t 3365static ssize_t
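
All of the splice hunks above follow the same conversion: the fixed PIPE_BUFFERS-sized on-stack arrays become PIPE_DEF_BUFFERS defaults inside a struct splice_pipe_desc, splice_grow_spd() swaps in larger arrays when the pipe actually has more buffers, the loops bound themselves by pipe->buffers and index through spd.pages/spd.partial, and splice_shrink_spd() releases whatever was grown on every exit path. Reduced to its shape, with the tracing specifics and locking left out, a converted ->splice_read() looks like this:

    /* Pattern used by tracing_splice_read_pipe() after this change. */
    struct page *pages_def[PIPE_DEF_BUFFERS];
    struct partial_page partial_def[PIPE_DEF_BUFFERS];
    struct splice_pipe_desc spd = {
            .pages   = pages_def,
            .partial = partial_def,
            .nr_pages = 0,
            .flags   = flags,
            .ops     = &tracing_pipe_buf_ops,
    };
    ssize_t ret;
    unsigned int i;

    if (splice_grow_spd(pipe, &spd))        /* may replace the _def arrays */
            return -ENOMEM;

    for (i = 0; i < pipe->buffers /* not PIPE_BUFFERS */; i++) {
            spd.pages[i] = alloc_page(GFP_KERNEL);  /* always via spd.pages */
            if (!spd.pages[i])
                    break;
            /* ... fill the page and spd.partial[i] ... */
    }
    spd.nr_pages = i;

    ret = splice_to_pipe(pipe, &spd);
    splice_shrink_spd(pipe, &spd);          /* frees anything grow allocated */
    return ret;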
@@ -3431,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3431 } 3443 }
3432 3444
3433 tracing_start(); 3445 tracing_start();
3434 max_tr.entries = global_trace.entries;
3435 mutex_unlock(&trace_types_lock); 3446 mutex_unlock(&trace_types_lock);
3436 3447
3437 return cnt; 3448 return cnt;
@@ -3452,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3452 size_t cnt, loff_t *fpos) 3463 size_t cnt, loff_t *fpos)
3453{ 3464{
3454 char *buf; 3465 char *buf;
3466 size_t written;
3455 3467
3456 if (tracing_disabled) 3468 if (tracing_disabled)
3457 return -EINVAL; 3469 return -EINVAL;
@@ -3473,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3473 } else 3485 } else
3474 buf[cnt] = '\0'; 3486 buf[cnt] = '\0';
3475 3487
3476 cnt = mark_printk("%s", buf); 3488 written = mark_printk("%s", buf);
3477 kfree(buf); 3489 kfree(buf);
3478 *fpos += cnt; 3490 *fpos += written;
3479 3491
3480 return cnt; 3492 /* don't tell userspace we wrote more - it might confuse them */
3493 if (written > cnt)
3494 written = cnt;
3495
3496 return written;
3481} 3497}
3482 3498
3483static int tracing_clock_show(struct seq_file *m, void *v) 3499static int tracing_clock_show(struct seq_file *m, void *v)
@@ -3544,18 +3560,21 @@ static const struct file_operations tracing_max_lat_fops = {
3544 .open = tracing_open_generic, 3560 .open = tracing_open_generic,
3545 .read = tracing_max_lat_read, 3561 .read = tracing_max_lat_read,
3546 .write = tracing_max_lat_write, 3562 .write = tracing_max_lat_write,
3563 .llseek = generic_file_llseek,
3547}; 3564};
3548 3565
3549static const struct file_operations tracing_ctrl_fops = { 3566static const struct file_operations tracing_ctrl_fops = {
3550 .open = tracing_open_generic, 3567 .open = tracing_open_generic,
3551 .read = tracing_ctrl_read, 3568 .read = tracing_ctrl_read,
3552 .write = tracing_ctrl_write, 3569 .write = tracing_ctrl_write,
3570 .llseek = generic_file_llseek,
3553}; 3571};
3554 3572
3555static const struct file_operations set_tracer_fops = { 3573static const struct file_operations set_tracer_fops = {
3556 .open = tracing_open_generic, 3574 .open = tracing_open_generic,
3557 .read = tracing_set_trace_read, 3575 .read = tracing_set_trace_read,
3558 .write = tracing_set_trace_write, 3576 .write = tracing_set_trace_write,
3577 .llseek = generic_file_llseek,
3559}; 3578};
3560 3579
3561static const struct file_operations tracing_pipe_fops = { 3580static const struct file_operations tracing_pipe_fops = {
@@ -3564,17 +3583,20 @@ static const struct file_operations tracing_pipe_fops = {
3564 .read = tracing_read_pipe, 3583 .read = tracing_read_pipe,
3565 .splice_read = tracing_splice_read_pipe, 3584 .splice_read = tracing_splice_read_pipe,
3566 .release = tracing_release_pipe, 3585 .release = tracing_release_pipe,
3586 .llseek = no_llseek,
3567}; 3587};
3568 3588
3569static const struct file_operations tracing_entries_fops = { 3589static const struct file_operations tracing_entries_fops = {
3570 .open = tracing_open_generic, 3590 .open = tracing_open_generic,
3571 .read = tracing_entries_read, 3591 .read = tracing_entries_read,
3572 .write = tracing_entries_write, 3592 .write = tracing_entries_write,
3593 .llseek = generic_file_llseek,
3573}; 3594};
3574 3595
3575static const struct file_operations tracing_mark_fops = { 3596static const struct file_operations tracing_mark_fops = {
3576 .open = tracing_open_generic, 3597 .open = tracing_open_generic,
3577 .write = tracing_mark_write, 3598 .write = tracing_mark_write,
3599 .llseek = generic_file_llseek,
3578}; 3600};
3579 3601
3580static const struct file_operations trace_clock_fops = { 3602static const struct file_operations trace_clock_fops = {
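
A theme running through this file (and repeated for the stats, dyn_info and option files further down) is that every tracing file_operations now names its .llseek method explicitly: plain attribute files get generic_file_llseek, while trace_pipe is a stream, so it takes no_llseek and its open path calls nonseekable_open() as shown earlier. As a pattern, with the read and open callbacks as hypothetical placeholders:

    /* Seekable attribute file vs. non-seekable stream, as in the hunks above. */
    static const struct file_operations example_attr_fops = {
            .open   = tracing_open_generic,
            .read   = example_attr_read,            /* hypothetical reader */
            .llseek = generic_file_llseek,          /* file position is meaningful */
    };

    static const struct file_operations example_stream_fops = {
            .open   = example_stream_open,          /* should call nonseekable_open() */
            .read   = example_stream_read,          /* hypothetical stream reader */
            .llseek = no_llseek,                    /* seeking makes no sense here */
    };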
@@ -3620,7 +3642,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3620 size_t count, loff_t *ppos) 3642 size_t count, loff_t *ppos)
3621{ 3643{
3622 struct ftrace_buffer_info *info = filp->private_data; 3644 struct ftrace_buffer_info *info = filp->private_data;
3623 unsigned int pos;
3624 ssize_t ret; 3645 ssize_t ret;
3625 size_t size; 3646 size_t size;
3626 3647
@@ -3647,11 +3668,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3647 if (ret < 0) 3668 if (ret < 0)
3648 return 0; 3669 return 0;
3649 3670
3650 pos = ring_buffer_page_len(info->spare);
3651
3652 if (pos < PAGE_SIZE)
3653 memset(info->spare + pos, 0, PAGE_SIZE - pos);
3654
3655read: 3671read:
3656 size = PAGE_SIZE - info->read; 3672 size = PAGE_SIZE - info->read;
3657 if (size > count) 3673 if (size > count)
@@ -3746,11 +3762,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3746 unsigned int flags) 3762 unsigned int flags)
3747{ 3763{
3748 struct ftrace_buffer_info *info = file->private_data; 3764 struct ftrace_buffer_info *info = file->private_data;
3749 struct partial_page partial[PIPE_BUFFERS]; 3765 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3750 struct page *pages[PIPE_BUFFERS]; 3766 struct page *pages_def[PIPE_DEF_BUFFERS];
3751 struct splice_pipe_desc spd = { 3767 struct splice_pipe_desc spd = {
3752 .pages = pages, 3768 .pages = pages_def,
3753 .partial = partial, 3769 .partial = partial_def,
3754 .flags = flags, 3770 .flags = flags,
3755 .ops = &buffer_pipe_buf_ops, 3771 .ops = &buffer_pipe_buf_ops,
3756 .spd_release = buffer_spd_release, 3772 .spd_release = buffer_spd_release,
@@ -3759,22 +3775,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3759 int entries, size, i; 3775 int entries, size, i;
3760 size_t ret; 3776 size_t ret;
3761 3777
3778 if (splice_grow_spd(pipe, &spd))
3779 return -ENOMEM;
3780
3762 if (*ppos & (PAGE_SIZE - 1)) { 3781 if (*ppos & (PAGE_SIZE - 1)) {
3763 WARN_ONCE(1, "Ftrace: previous read must page-align\n"); 3782 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3764 return -EINVAL; 3783 ret = -EINVAL;
3784 goto out;
3765 } 3785 }
3766 3786
3767 if (len & (PAGE_SIZE - 1)) { 3787 if (len & (PAGE_SIZE - 1)) {
3768 WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); 3788 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3769 if (len < PAGE_SIZE) 3789 if (len < PAGE_SIZE) {
3770 return -EINVAL; 3790 ret = -EINVAL;
3791 goto out;
3792 }
3771 len &= PAGE_MASK; 3793 len &= PAGE_MASK;
3772 } 3794 }
3773 3795
3774 trace_access_lock(info->cpu); 3796 trace_access_lock(info->cpu);
3775 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3797 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3776 3798
3777 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3799 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
3778 struct page *page; 3800 struct page *page;
3779 int r; 3801 int r;
3780 3802
@@ -3829,11 +3851,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3829 else 3851 else
3830 ret = 0; 3852 ret = 0;
3831 /* TODO: block */ 3853 /* TODO: block */
3832 return ret; 3854 goto out;
3833 } 3855 }
3834 3856
3835 ret = splice_to_pipe(pipe, &spd); 3857 ret = splice_to_pipe(pipe, &spd);
3836 3858 splice_shrink_spd(pipe, &spd);
3859out:
3837 return ret; 3860 return ret;
3838} 3861}
3839 3862
@@ -3879,6 +3902,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3879static const struct file_operations tracing_stats_fops = { 3902static const struct file_operations tracing_stats_fops = {
3880 .open = tracing_open_generic, 3903 .open = tracing_open_generic,
3881 .read = tracing_stats_read, 3904 .read = tracing_stats_read,
3905 .llseek = generic_file_llseek,
3882}; 3906};
3883 3907
3884#ifdef CONFIG_DYNAMIC_FTRACE 3908#ifdef CONFIG_DYNAMIC_FTRACE
@@ -3915,6 +3939,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3915static const struct file_operations tracing_dyn_info_fops = { 3939static const struct file_operations tracing_dyn_info_fops = {
3916 .open = tracing_open_generic, 3940 .open = tracing_open_generic,
3917 .read = tracing_read_dyn_info, 3941 .read = tracing_read_dyn_info,
3942 .llseek = generic_file_llseek,
3918}; 3943};
3919#endif 3944#endif
3920 3945
@@ -4068,6 +4093,7 @@ static const struct file_operations trace_options_fops = {
4068 .open = tracing_open_generic, 4093 .open = tracing_open_generic,
4069 .read = trace_options_read, 4094 .read = trace_options_read,
4070 .write = trace_options_write, 4095 .write = trace_options_write,
4096 .llseek = generic_file_llseek,
4071}; 4097};
4072 4098
4073static ssize_t 4099static ssize_t
@@ -4119,6 +4145,7 @@ static const struct file_operations trace_options_core_fops = {
4119 .open = tracing_open_generic, 4145 .open = tracing_open_generic,
4120 .read = trace_options_core_read, 4146 .read = trace_options_core_read,
4121 .write = trace_options_core_write, 4147 .write = trace_options_core_write,
4148 .llseek = generic_file_llseek,
4122}; 4149};
4123 4150
4124struct dentry *trace_create_file(const char *name, 4151struct dentry *trace_create_file(const char *name,
@@ -4308,9 +4335,6 @@ static __init int tracer_init_debugfs(void)
4308 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4335 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4309 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4336 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4310#endif 4337#endif
4311#ifdef CONFIG_SYSPROF_TRACER
4312 init_tracer_sysprof_debugfs(d_tracer);
4313#endif
4314 4338
4315 create_trace_options_dir(); 4339 create_trace_options_dir();
4316 4340
@@ -4324,7 +4348,7 @@ static int trace_panic_handler(struct notifier_block *this,
4324 unsigned long event, void *unused) 4348 unsigned long event, void *unused)
4325{ 4349{
4326 if (ftrace_dump_on_oops) 4350 if (ftrace_dump_on_oops)
4327 ftrace_dump(); 4351 ftrace_dump(ftrace_dump_on_oops);
4328 return NOTIFY_OK; 4352 return NOTIFY_OK;
4329} 4353}
4330 4354
@@ -4341,7 +4365,7 @@ static int trace_die_handler(struct notifier_block *self,
4341 switch (val) { 4365 switch (val) {
4342 case DIE_OOPS: 4366 case DIE_OOPS:
4343 if (ftrace_dump_on_oops) 4367 if (ftrace_dump_on_oops)
4344 ftrace_dump(); 4368 ftrace_dump(ftrace_dump_on_oops);
4345 break; 4369 break;
4346 default: 4370 default:
4347 break; 4371 break;
@@ -4367,7 +4391,7 @@ static struct notifier_block trace_die_notifier = {
4367 */ 4391 */
4368#define KERN_TRACE KERN_EMERG 4392#define KERN_TRACE KERN_EMERG
4369 4393
4370static void 4394void
4371trace_printk_seq(struct trace_seq *s) 4395trace_printk_seq(struct trace_seq *s)
4372{ 4396{
4373 /* Probably should print a warning here. */ 4397 /* Probably should print a warning here. */
@@ -4382,7 +4406,15 @@ trace_printk_seq(struct trace_seq *s)
4382 trace_seq_init(s); 4406 trace_seq_init(s);
4383} 4407}
4384 4408
4385static void __ftrace_dump(bool disable_tracing) 4409void trace_init_global_iter(struct trace_iterator *iter)
4410{
4411 iter->tr = &global_trace;
4412 iter->trace = current_trace;
4413 iter->cpu_file = TRACE_PIPE_ALL_CPU;
4414}
4415
4416static void
4417__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4386{ 4418{
4387 static arch_spinlock_t ftrace_dump_lock = 4419 static arch_spinlock_t ftrace_dump_lock =
4388 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4420 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4406,8 +4438,10 @@ static void __ftrace_dump(bool disable_tracing)
4406 if (disable_tracing) 4438 if (disable_tracing)
4407 ftrace_kill(); 4439 ftrace_kill();
4408 4440
4441 trace_init_global_iter(&iter);
4442
4409 for_each_tracing_cpu(cpu) { 4443 for_each_tracing_cpu(cpu) {
4410 atomic_inc(&global_trace.data[cpu]->disabled); 4444 atomic_inc(&iter.tr->data[cpu]->disabled);
4411 } 4445 }
4412 4446
4413 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 4447 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4415,12 +4449,25 @@ static void __ftrace_dump(bool disable_tracing)
4415 /* don't look at user memory in panic mode */ 4449 /* don't look at user memory in panic mode */
4416 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4450 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4417 4451
4418 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4419
4420 /* Simulate the iterator */ 4452 /* Simulate the iterator */
4421 iter.tr = &global_trace; 4453 iter.tr = &global_trace;
4422 iter.trace = current_trace; 4454 iter.trace = current_trace;
4423 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4455
4456 switch (oops_dump_mode) {
4457 case DUMP_ALL:
4458 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4459 break;
4460 case DUMP_ORIG:
4461 iter.cpu_file = raw_smp_processor_id();
4462 break;
4463 case DUMP_NONE:
4464 goto out_enable;
4465 default:
4466 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4467 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4468 }
4469
4470 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4424 4471
4425 /* 4472 /*
4426 * We need to stop all tracing on all CPUS to read the 4473 * We need to stop all tracing on all CPUS to read the
@@ -4443,7 +4490,7 @@ static void __ftrace_dump(bool disable_tracing)
4443 iter.iter_flags |= TRACE_FILE_LAT_FMT; 4490 iter.iter_flags |= TRACE_FILE_LAT_FMT;
4444 iter.pos = -1; 4491 iter.pos = -1;
4445 4492
4446 if (find_next_entry_inc(&iter) != NULL) { 4493 if (trace_find_next_entry_inc(&iter) != NULL) {
4447 int ret; 4494 int ret;
4448 4495
4449 ret = print_trace_line(&iter); 4496 ret = print_trace_line(&iter);
@@ -4459,12 +4506,13 @@ static void __ftrace_dump(bool disable_tracing)
4459 else 4506 else
4460 printk(KERN_TRACE "---------------------------------\n"); 4507 printk(KERN_TRACE "---------------------------------\n");
4461 4508
4509 out_enable:
4462 /* Re-enable tracing if requested */ 4510 /* Re-enable tracing if requested */
4463 if (!disable_tracing) { 4511 if (!disable_tracing) {
4464 trace_flags |= old_userobj; 4512 trace_flags |= old_userobj;
4465 4513
4466 for_each_tracing_cpu(cpu) { 4514 for_each_tracing_cpu(cpu) {
4467 atomic_dec(&global_trace.data[cpu]->disabled); 4515 atomic_dec(&iter.tr->data[cpu]->disabled);
4468 } 4516 }
4469 tracing_on(); 4517 tracing_on();
4470 } 4518 }
@@ -4475,9 +4523,9 @@ static void __ftrace_dump(bool disable_tracing)
4475} 4523}
4476 4524
4477/* By default: disable tracing after the dump */ 4525/* By default: disable tracing after the dump */
4478void ftrace_dump(void) 4526void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4479{ 4527{
4480 __ftrace_dump(true); 4528 __ftrace_dump(true, oops_dump_mode);
4481} 4529}
4482 4530
4483__init static int tracer_alloc_buffers(void) 4531__init static int tracer_alloc_buffers(void)
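
With the signature change above, callers of ftrace_dump() pick how much of the buffer is dumped. The panic and die notifiers earlier in this diff simply forward ftrace_dump_on_oops, so the sysctl value doubles as the mode; a direct caller chooses one of the enum ftrace_dump_mode values handled by the switch in __ftrace_dump():

    /* Choosing a dump scope with the new ftrace_dump() signature. */
    if (ftrace_dump_on_oops)
            ftrace_dump(ftrace_dump_on_oops);  /* sysctl supplies the mode */

    ftrace_dump(DUMP_ALL);   /* every tracing CPU */
    ftrace_dump(DUMP_ORIG);  /* only the CPU this call runs on */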
@@ -4513,16 +4561,14 @@ __init static int tracer_alloc_buffers(void)
4513 4561
4514 4562
4515#ifdef CONFIG_TRACER_MAX_TRACE 4563#ifdef CONFIG_TRACER_MAX_TRACE
4516 max_tr.buffer = ring_buffer_alloc(ring_buf_size, 4564 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
4517 TRACE_BUFFER_FLAGS);
4518 if (!max_tr.buffer) { 4565 if (!max_tr.buffer) {
4519 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4566 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4520 WARN_ON(1); 4567 WARN_ON(1);
4521 ring_buffer_free(global_trace.buffer); 4568 ring_buffer_free(global_trace.buffer);
4522 goto out_free_cpumask; 4569 goto out_free_cpumask;
4523 } 4570 }
4524 max_tr.entries = ring_buffer_size(max_tr.buffer); 4571 max_tr.entries = 1;
4525 WARN_ON(max_tr.entries != global_trace.entries);
4526#endif 4572#endif
4527 4573
4528 /* Allocate the first page for all buffers */ 4574 /* Allocate the first page for all buffers */
@@ -4535,9 +4581,6 @@ __init static int tracer_alloc_buffers(void)
4535 4581
4536 register_tracer(&nop_trace); 4582 register_tracer(&nop_trace);
4537 current_trace = &nop_trace; 4583 current_trace = &nop_trace;
4538#ifdef CONFIG_BOOT_TRACER
4539 register_tracer(&boot_tracer);
4540#endif
4541 /* All seems OK, enable tracing */ 4584 /* All seems OK, enable tracing */
4542 tracing_disabled = 0; 4585 tracing_disabled = 0;
4543 4586
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2825ef2c0b15..d39b3c5454a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,10 +9,7 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h> 10#include <linux/tracepoint.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h>
13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h> 12#include <linux/hw_breakpoint.h>
15
16#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
17#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
18 15
@@ -25,31 +22,17 @@ enum trace_type {
25 TRACE_STACK, 22 TRACE_STACK,
26 TRACE_PRINT, 23 TRACE_PRINT,
27 TRACE_BPRINT, 24 TRACE_BPRINT,
28 TRACE_SPECIAL,
29 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
30 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
31 TRACE_BRANCH, 27 TRACE_BRANCH,
32 TRACE_BOOT_CALL,
33 TRACE_BOOT_RET,
34 TRACE_GRAPH_RET, 28 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 29 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 30 TRACE_USER_STACK,
37 TRACE_HW_BRANCHES,
38 TRACE_KMEM_ALLOC,
39 TRACE_KMEM_FREE,
40 TRACE_BLK, 31 TRACE_BLK,
41 TRACE_KSYM,
42 32
43 __TRACE_LAST_TYPE, 33 __TRACE_LAST_TYPE,
44}; 34};
45 35
46enum kmemtrace_type_id {
47 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
48 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
49 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
50};
51
52extern struct tracer boot_tracer;
53 36
54#undef __field 37#undef __field
55#define __field(type, item) type item; 38#define __field(type, item) type item;
@@ -103,29 +86,17 @@ struct syscall_trace_exit {
103 long ret; 86 long ret;
104}; 87};
105 88
106struct kprobe_trace_entry { 89struct kprobe_trace_entry_head {
107 struct trace_entry ent; 90 struct trace_entry ent;
108 unsigned long ip; 91 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111}; 92};
112 93
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ 94struct kretprobe_trace_entry_head {
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent; 95 struct trace_entry ent;
119 unsigned long func; 96 unsigned long func;
120 unsigned long ret_ip; 97 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123}; 98};
124 99
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
129/* 100/*
130 * trace_flag_type is an enumeration that holds different 101 * trace_flag_type is an enumeration that holds different
131 * states when a trace occurs. These are: 102 * states when a trace occurs. These are:
@@ -217,24 +188,15 @@ extern void __ftrace_bad_type(void);
217 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 188 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
218 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 189 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
219 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 190 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
220 IF_ASSIGN(var, ent, struct special_entry, 0); \
221 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 191 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
222 TRACE_MMIO_RW); \ 192 TRACE_MMIO_RW); \
223 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 193 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
224 TRACE_MMIO_MAP); \ 194 TRACE_MMIO_MAP); \
225 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
226 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
227 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ 195 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
228 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ 196 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
229 TRACE_GRAPH_ENT); \ 197 TRACE_GRAPH_ENT); \
230 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 198 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
231 TRACE_GRAPH_RET); \ 199 TRACE_GRAPH_RET); \
232 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
233 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
234 TRACE_KMEM_ALLOC); \
235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
238 __ftrace_bad_type(); \ 200 __ftrace_bad_type(); \
239 } while (0) 201 } while (0)
240 202
@@ -312,6 +274,7 @@ struct tracer {
312 struct tracer *next; 274 struct tracer *next;
313 int print_max; 275 int print_max;
314 struct tracer_flags *flags; 276 struct tracer_flags *flags;
277 int use_max_tr;
315}; 278};
316 279
317 280
@@ -332,7 +295,6 @@ struct dentry *trace_create_file(const char *name,
332 const struct file_operations *fops); 295 const struct file_operations *fops);
333 296
334struct dentry *tracing_init_dentry(void); 297struct dentry *tracing_init_dentry(void);
335void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
336 298
337struct ring_buffer_event; 299struct ring_buffer_event;
338 300
@@ -352,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
352struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 314struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
353 int *ent_cpu, u64 *ent_ts); 315 int *ent_cpu, u64 *ent_ts);
354 316
317int trace_empty(struct trace_iterator *iter);
318
319void *trace_find_next_entry_inc(struct trace_iterator *iter);
320
321void trace_init_global_iter(struct trace_iterator *iter);
322
323void tracing_iter_reset(struct trace_iterator *iter, int cpu);
324
355void default_wait_pipe(struct trace_iterator *iter); 325void default_wait_pipe(struct trace_iterator *iter);
356void poll_wait_pipe(struct trace_iterator *iter); 326void poll_wait_pipe(struct trace_iterator *iter);
357 327
@@ -369,15 +339,13 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
369 struct task_struct *wakee, 339 struct task_struct *wakee,
370 struct task_struct *cur, 340 struct task_struct *cur,
371 unsigned long flags, int pc); 341 unsigned long flags, int pc);
372void trace_special(struct trace_array *tr,
373 struct trace_array_cpu *data,
374 unsigned long arg1,
375 unsigned long arg2,
376 unsigned long arg3, int pc);
377void trace_function(struct trace_array *tr, 342void trace_function(struct trace_array *tr,
378 unsigned long ip, 343 unsigned long ip,
379 unsigned long parent_ip, 344 unsigned long parent_ip,
380 unsigned long flags, int pc); 345 unsigned long flags, int pc);
346void trace_default_header(struct seq_file *m);
347void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
348int trace_empty(struct trace_iterator *iter);
381 349
382void trace_graph_return(struct ftrace_graph_ret *trace); 350void trace_graph_return(struct ftrace_graph_ret *trace);
383int trace_graph_entry(struct ftrace_graph_ent *trace); 351int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -391,8 +359,15 @@ void tracing_start_sched_switch_record(void);
391int register_tracer(struct tracer *type); 359int register_tracer(struct tracer *type);
392void unregister_tracer(struct tracer *type); 360void unregister_tracer(struct tracer *type);
393int is_tracing_stopped(void); 361int is_tracing_stopped(void);
362enum trace_file_type {
363 TRACE_FILE_LAT_FMT = 1,
364 TRACE_FILE_ANNOTATE = 2,
365};
366
367extern cpumask_var_t __read_mostly tracing_buffer_mask;
394 368
395extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); 369#define for_each_tracing_cpu(cpu) \
370 for_each_cpu(cpu, tracing_buffer_mask)
396 371
397extern unsigned long nsecs_to_usecs(unsigned long nsecs); 372extern unsigned long nsecs_to_usecs(unsigned long nsecs);
398 373
@@ -416,12 +391,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
416void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 391void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
417 int pc); 392 int pc);
418#else 393#else
419static inline void ftrace_trace_stack(struct trace_array *tr, 394static inline void ftrace_trace_stack(struct ring_buffer *buffer,
420 unsigned long flags, int skip, int pc) 395 unsigned long flags, int skip, int pc)
421{ 396{
422} 397}
423 398
424static inline void ftrace_trace_userstack(struct trace_array *tr, 399static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
425 unsigned long flags, int pc) 400 unsigned long flags, int pc)
426{ 401{
427} 402}
@@ -463,14 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
463 struct trace_array *tr); 438 struct trace_array *tr);
464extern int trace_selftest_startup_sched_switch(struct tracer *trace, 439extern int trace_selftest_startup_sched_switch(struct tracer *trace,
465 struct trace_array *tr); 440 struct trace_array *tr);
466extern int trace_selftest_startup_sysprof(struct tracer *trace,
467 struct trace_array *tr);
468extern int trace_selftest_startup_branch(struct tracer *trace, 441extern int trace_selftest_startup_branch(struct tracer *trace,
469 struct trace_array *tr); 442 struct trace_array *tr);
470extern int trace_selftest_startup_hw_branches(struct tracer *trace,
471 struct trace_array *tr);
472extern int trace_selftest_startup_ksym(struct tracer *trace,
473 struct trace_array *tr);
474#endif /* CONFIG_FTRACE_STARTUP_TEST */ 443#endif /* CONFIG_FTRACE_STARTUP_TEST */
475 444
476extern void *head_page(struct trace_array_cpu *data); 445extern void *head_page(struct trace_array_cpu *data);
@@ -484,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr,
484 unsigned long ip, const char *fmt, va_list args); 453 unsigned long ip, const char *fmt, va_list args);
485int trace_array_printk(struct trace_array *tr, 454int trace_array_printk(struct trace_array *tr,
486 unsigned long ip, const char *fmt, ...); 455 unsigned long ip, const char *fmt, ...);
456void trace_printk_seq(struct trace_seq *s);
457enum print_line_t print_trace_line(struct trace_iterator *iter);
487 458
488extern unsigned long trace_flags; 459extern unsigned long trace_flags;
489 460
@@ -491,9 +462,29 @@ extern int trace_clock_id;
491 462
492/* Standard output formatting function used for function return traces */ 463/* Standard output formatting function used for function return traces */
493#ifdef CONFIG_FUNCTION_GRAPH_TRACER 464#ifdef CONFIG_FUNCTION_GRAPH_TRACER
494extern enum print_line_t print_graph_function(struct trace_iterator *iter); 465
466/* Flag options */
467#define TRACE_GRAPH_PRINT_OVERRUN 0x1
468#define TRACE_GRAPH_PRINT_CPU 0x2
469#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
470#define TRACE_GRAPH_PRINT_PROC 0x8
471#define TRACE_GRAPH_PRINT_DURATION 0x10
472#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
473
474extern enum print_line_t
475print_graph_function_flags(struct trace_iterator *iter, u32 flags);
476extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
495extern enum print_line_t 477extern enum print_line_t
496trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 478trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
479extern void graph_trace_open(struct trace_iterator *iter);
480extern void graph_trace_close(struct trace_iterator *iter);
481extern int __trace_graph_entry(struct trace_array *tr,
482 struct ftrace_graph_ent *trace,
483 unsigned long flags, int pc);
484extern void __trace_graph_return(struct trace_array *tr,
485 struct ftrace_graph_ret *trace,
486 unsigned long flags, int pc);
487
497 488
498#ifdef CONFIG_DYNAMIC_FTRACE 489#ifdef CONFIG_DYNAMIC_FTRACE
499/* TODO: make this variable */ 490/* TODO: make this variable */
@@ -524,7 +515,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
524#endif /* CONFIG_DYNAMIC_FTRACE */ 515#endif /* CONFIG_DYNAMIC_FTRACE */
525#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 516#else /* CONFIG_FUNCTION_GRAPH_TRACER */
526static inline enum print_line_t 517static inline enum print_line_t
527print_graph_function(struct trace_iterator *iter) 518print_graph_function_flags(struct trace_iterator *iter, u32 flags)
528{ 519{
529 return TRACE_TYPE_UNHANDLED; 520 return TRACE_TYPE_UNHANDLED;
530} 521}
@@ -610,6 +601,7 @@ enum trace_iterator_flags {
610 TRACE_ITER_LATENCY_FMT = 0x20000, 601 TRACE_ITER_LATENCY_FMT = 0x20000,
611 TRACE_ITER_SLEEP_TIME = 0x40000, 602 TRACE_ITER_SLEEP_TIME = 0x40000,
612 TRACE_ITER_GRAPH_TIME = 0x80000, 603 TRACE_ITER_GRAPH_TIME = 0x80000,
604 TRACE_ITER_RECORD_CMD = 0x100000,
613}; 605};
614 606
615/* 607/*
@@ -621,54 +613,6 @@ enum trace_iterator_flags {
621 613
622extern struct tracer nop_trace; 614extern struct tracer nop_trace;
623 615
624/**
625 * ftrace_preempt_disable - disable preemption scheduler safe
626 *
627 * When tracing can happen inside the scheduler, there exists
628 * cases that the tracing might happen before the need_resched
629 * flag is checked. If this happens and the tracer calls
630 * preempt_enable (after a disable), a schedule might take place
631 * causing an infinite recursion.
632 *
633 * To prevent this, we read the need_resched flag before
634 * disabling preemption. When we want to enable preemption we
635 * check the flag, if it is set, then we call preempt_enable_no_resched.
636 * Otherwise, we call preempt_enable.
637 *
638 * The rational for doing the above is that if need_resched is set
639 * and we have yet to reschedule, we are either in an atomic location
640 * (where we do not need to check for scheduling) or we are inside
641 * the scheduler and do not want to resched.
642 */
643static inline int ftrace_preempt_disable(void)
644{
645 int resched;
646
647 resched = need_resched();
648 preempt_disable_notrace();
649
650 return resched;
651}
652
653/**
654 * ftrace_preempt_enable - enable preemption scheduler safe
655 * @resched: the return value from ftrace_preempt_disable
656 *
657 * This is a scheduler safe way to enable preemption and not miss
658 * any preemption checks. The disabled saved the state of preemption.
659 * If resched is set, then we are either inside an atomic or
660 * are inside the scheduler (we would have already scheduled
661 * otherwise). In this case, we do not want to call normal
662 * preempt_enable, but preempt_enable_no_resched instead.
663 */
664static inline void ftrace_preempt_enable(int resched)
665{
666 if (resched)
667 preempt_enable_no_resched_notrace();
668 else
669 preempt_enable_notrace();
670}
671
672#ifdef CONFIG_BRANCH_TRACER 616#ifdef CONFIG_BRANCH_TRACER
673extern int enable_branch_tracing(struct trace_array *tr); 617extern int enable_branch_tracing(struct trace_array *tr);
674extern void disable_branch_tracing(void); 618extern void disable_branch_tracing(void);
@@ -759,6 +703,8 @@ struct filter_pred {
759 int pop_n; 703 int pop_n;
760}; 704};
761 705
706extern struct list_head ftrace_common_fields;
707
762extern enum regex_type 708extern enum regex_type
763filter_parse_regex(char *buff, int len, char **search, int *not); 709filter_parse_regex(char *buff, int len, char **search, int *not);
764extern void print_event_filter(struct ftrace_event_call *call, 710extern void print_event_filter(struct ftrace_event_call *call,
@@ -771,12 +717,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
771 struct trace_seq *s); 717 struct trace_seq *s);
772extern int filter_assign_type(const char *type); 718extern int filter_assign_type(const char *type);
773 719
720struct list_head *
721trace_get_fields(struct ftrace_event_call *event_call);
722
774static inline int 723static inline int
775filter_check_discard(struct ftrace_event_call *call, void *rec, 724filter_check_discard(struct ftrace_event_call *call, void *rec,
776 struct ring_buffer *buffer, 725 struct ring_buffer *buffer,
777 struct ring_buffer_event *event) 726 struct ring_buffer_event *event)
778{ 727{
779 if (unlikely(call->filter_active) && 728 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
780 !filter_match_preds(call->filter, rec)) { 729 !filter_match_preds(call->filter, rec)) {
781 ring_buffer_discard_commit(buffer, event); 730 ring_buffer_discard_commit(buffer, event);
782 return 1; 731 return 1;
@@ -785,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
785 return 0; 734 return 0;
786} 735}
787 736
737extern void trace_event_enable_cmd_record(bool enable);
738
788extern struct mutex event_mutex; 739extern struct mutex event_mutex;
789extern struct list_head ftrace_events; 740extern struct list_head ftrace_events;
790 741
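
Several of the additions to this header (trace_empty(), trace_find_next_entry_inc(), trace_init_global_iter(), tracing_iter_reset(), print_trace_line(), trace_printk_seq(), plus the now-public TRACE_FILE_* flags and for_each_tracing_cpu()) exist so that code outside trace.c, such as the kdb support merged elsewhere in this commit, can walk the global trace buffer. A minimal sketch of such a walker, modeled on __ftrace_dump() but leaving out its per-CPU disable accounting, locking and return-value handling:

    /* Sketch: dumping the global buffer with the helpers exported above. */
    static void example_walk_trace_buffer(void)
    {
            static struct trace_iterator iter;      /* static: zeroed, little stack use */

            trace_init_global_iter(&iter);          /* tr, trace, cpu_file defaults */
            iter.iter_flags |= TRACE_FILE_LAT_FMT;
            iter.pos = -1;

            while (!trace_empty(&iter)) {
                    if (!trace_find_next_entry_inc(&iter))
                            break;
                    /* Format one entry into iter.seq, then flush it to printk. */
                    print_trace_line(&iter);
                    trace_printk_seq(&iter.seq);    /* also re-initializes iter.seq */
            }
    }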
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index c21d5f3956ad..000000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12#include <linux/time.h>
13
14#include "trace.h"
15#include "trace_output.h"
16
17static struct trace_array *boot_trace;
18static bool pre_initcalls_finished;
19
20/* Tells the boot tracer that the pre_smp_initcalls are finished.
21 * So we are ready .
22 * It doesn't enable sched events tracing however.
23 * You have to call enable_boot_trace to do so.
24 */
25void start_boot_trace(void)
26{
27 pre_initcalls_finished = true;
28}
29
30void enable_boot_trace(void)
31{
32 if (boot_trace && pre_initcalls_finished)
33 tracing_start_sched_switch_record();
34}
35
36void disable_boot_trace(void)
37{
38 if (boot_trace && pre_initcalls_finished)
39 tracing_stop_sched_switch_record();
40}
41
42static int boot_trace_init(struct trace_array *tr)
43{
44 boot_trace = tr;
45
46 if (!tr)
47 return 0;
48
49 tracing_reset_online_cpus(tr);
50
51 tracing_sched_switch_assign_trace(tr);
52 return 0;
53}
54
55static enum print_line_t
56initcall_call_print_line(struct trace_iterator *iter)
57{
58 struct trace_entry *entry = iter->ent;
59 struct trace_seq *s = &iter->seq;
60 struct trace_boot_call *field;
61 struct boot_trace_call *call;
62 u64 ts;
63 unsigned long nsec_rem;
64 int ret;
65
66 trace_assign_type(field, entry);
67 call = &field->boot_call;
68 ts = iter->ts;
69 nsec_rem = do_div(ts, NSEC_PER_SEC);
70
71 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
72 (unsigned long)ts, nsec_rem, call->func, call->caller);
73
74 if (!ret)
75 return TRACE_TYPE_PARTIAL_LINE;
76 else
77 return TRACE_TYPE_HANDLED;
78}
79
80static enum print_line_t
81initcall_ret_print_line(struct trace_iterator *iter)
82{
83 struct trace_entry *entry = iter->ent;
84 struct trace_seq *s = &iter->seq;
85 struct trace_boot_ret *field;
86 struct boot_trace_ret *init_ret;
87 u64 ts;
88 unsigned long nsec_rem;
89 int ret;
90
91 trace_assign_type(field, entry);
92 init_ret = &field->boot_ret;
93 ts = iter->ts;
94 nsec_rem = do_div(ts, NSEC_PER_SEC);
95
96 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
97 "returned %d after %llu msecs\n",
98 (unsigned long) ts,
99 nsec_rem,
100 init_ret->func, init_ret->result, init_ret->duration);
101
102 if (!ret)
103 return TRACE_TYPE_PARTIAL_LINE;
104 else
105 return TRACE_TYPE_HANDLED;
106}
107
108static enum print_line_t initcall_print_line(struct trace_iterator *iter)
109{
110 struct trace_entry *entry = iter->ent;
111
112 switch (entry->type) {
113 case TRACE_BOOT_CALL:
114 return initcall_call_print_line(iter);
115 case TRACE_BOOT_RET:
116 return initcall_ret_print_line(iter);
117 default:
118 return TRACE_TYPE_UNHANDLED;
119 }
120}
121
122struct tracer boot_tracer __read_mostly =
123{
124 .name = "initcall",
125 .init = boot_trace_init,
126 .reset = tracing_reset_online_cpus,
127 .print_line = initcall_print_line,
128};
129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{
132 struct ftrace_event_call *call = &event_boot_call;
133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace;
137
138 if (!tr || !pre_initcalls_finished)
139 return;
140
141 /* Get its name now since this function could
142 * disappear because it is in the .init section.
143 */
144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable();
146
147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
149 sizeof(*entry), 0, 0);
150 if (!event)
151 goto out;
152 entry = ring_buffer_event_data(event);
153 entry->boot_call = *bt;
154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
156 out:
157 preempt_enable();
158}
159
160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
161{
162 struct ftrace_event_call *call = &event_boot_ret;
163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
165 struct trace_boot_ret *entry;
166 struct trace_array *tr = boot_trace;
167
168 if (!tr || !pre_initcalls_finished)
169 return;
170
171 sprint_symbol(bt->func, (unsigned long)fn);
172 preempt_disable();
173
174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
176 sizeof(*entry), 0, 0);
177 if (!event)
178 goto out;
179 entry = ring_buffer_event_data(event);
180 entry->boot_ret = *bt;
181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
183 out:
184 preempt_enable();
185}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index b9bc4d470177..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
143} 143}
144 144
145static enum print_line_t trace_branch_print(struct trace_iterator *iter, 145static enum print_line_t trace_branch_print(struct trace_iterator *iter,
146 int flags) 146 int flags, struct trace_event *event)
147{ 147{
148 struct trace_branch *field; 148 struct trace_branch *field;
149 149
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
167 " |\n"); 167 " |\n");
168} 168}
169 169
170static struct trace_event_functions trace_branch_funcs = {
171 .trace = trace_branch_print,
172};
173
170static struct trace_event trace_branch_event = { 174static struct trace_event trace_branch_event = {
171 .type = TRACE_BRANCH, 175 .type = TRACE_BRANCH,
172 .trace = trace_branch_print, 176 .funcs = &trace_branch_funcs,
173}; 177};
174 178
175static struct tracer branch_trace __read_mostly = 179static struct tracer branch_trace __read_mostly =
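
This is the consumer-side view of the event output rework that also produced the event->funcs->binary(iter, 0, event) call earlier in trace.c: the print callbacks move out of struct trace_event into a struct trace_event_functions, and each callback gains the struct trace_event pointer as a third argument. Registering output for some other event now follows the same two-struct shape; the names below are illustrative and only the .trace callback is wired up.

    /* Sketch: the trace_event / trace_event_functions split used above. */
    static enum print_line_t example_event_trace(struct trace_iterator *iter,
                                                 int flags,
                                                 struct trace_event *event)
    {
            /* format iter->ent into iter->seq here */
            return TRACE_TYPE_HANDLED;
    }

    static struct trace_event_functions example_event_funcs = {
            .trace  = example_event_trace,
            /* .raw, .hex and .binary callbacks take the same three arguments */
    };

    static struct trace_event example_trace_event = {
            .type   = TRACE_BRANCH,   /* placeholder: a real event uses its own type */
            .funcs  = &example_event_funcs,
    };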
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
32u64 notrace trace_clock_local(void) 32u64 notrace trace_clock_local(void)
33{ 33{
34 u64 clock; 34 u64 clock;
35 int resched;
36 35
37 /* 36 /*
38 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
39 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
40 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
41 */ 40 */
42 resched = ftrace_preempt_disable(); 41 preempt_disable_notrace();
43 clock = sched_clock(); 42 clock = sched_clock();
44 ftrace_preempt_enable(resched); 43 preempt_enable_notrace();
45 44
46 return clock; 45 return clock;
47} 46}
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void)
56 */ 55 */
57u64 notrace trace_clock(void) 56u64 notrace trace_clock(void)
58{ 57{
59 return cpu_clock(raw_smp_processor_id()); 58 return local_clock();
60} 59}
61 60
62 61
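
trace_clock_local() no longer needs the resched-aware ftrace_preempt_disable()/ftrace_preempt_enable() pair (whose definitions, and the comment explaining them, are removed from trace.h later in this patch); the plain _notrace preemption helpers are enough. The resulting guard for any path that must not recurse into the tracer is simply:

    /* Non-recursing preemption guard, as now used in trace_clock_local(). */
    u64 clock;

    preempt_disable_notrace();      /* the _notrace variants are not themselves traced */
    clock = sched_clock();
    preempt_enable_notrace();

    return clock;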
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399df..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
151); 151);
152 152
153/* 153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry: 154 * Stack-trace entry:
172 */ 155 */
173 156
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
271 __entry->map_id, __entry->opcode) 254 __entry->map_id, __entry->opcode)
272); 255);
273 256
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301 257
302#define TRACE_FUNC_SIZE 30 258#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20 259#define TRACE_FILE_SIZE 20
@@ -318,65 +274,3 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 274 __entry->func, __entry->file, __entry->correct)
319); 275);
320 276
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334
335 TRACE_KMEM_ALLOC,
336
337 F_STRUCT(
338 __field( enum kmemtrace_type_id, type_id )
339 __field( unsigned long, call_site )
340 __field( const void *, ptr )
341 __field( size_t, bytes_req )
342 __field( size_t, bytes_alloc )
343 __field( gfp_t, gfp_flags )
344 __field( int, node )
345 ),
346
347 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
348 " flags:%x node:%d",
349 __entry->type_id, __entry->call_site, __entry->ptr,
350 __entry->bytes_req, __entry->bytes_alloc,
351 __entry->gfp_flags, __entry->node)
352);
353
354FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
355
356 TRACE_KMEM_FREE,
357
358 F_STRUCT(
359 __field( enum kmemtrace_type_id, type_id )
360 __field( unsigned long, call_site )
361 __field( const void *, ptr )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr)
366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0565bb42566f..31cc4cb0dbf2 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,13 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); 12static char *perf_trace_buf[4];
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
16
17static char *perf_trace_buf;
18static char *perf_trace_buf_nmi;
19 13
20/* 14/*
21 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -27,57 +21,78 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
27/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
28static int total_ref_count; 22static int total_ref_count;
29 23
30static int perf_trace_event_enable(struct ftrace_event_call *event) 24static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
31{ 26{
32 char *buf; 27 struct hlist_head *list;
33 int ret = -ENOMEM; 28 int ret = -ENOMEM;
29 int cpu;
34 30
35 if (event->perf_refcount++ > 0) 31 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0)
36 return 0; 33 return 0;
37 34
38 if (!total_ref_count) { 35 list = alloc_percpu(struct hlist_head);
39 buf = (char *)alloc_percpu(perf_trace_t); 36 if (!list)
40 if (!buf) 37 goto fail;
41 goto fail_buf;
42 38
43 rcu_assign_pointer(perf_trace_buf, buf); 39 for_each_possible_cpu(cpu)
40 INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
44 41
45 buf = (char *)alloc_percpu(perf_trace_t); 42 tp_event->perf_events = list;
46 if (!buf)
47 goto fail_buf_nmi;
48 43
49 rcu_assign_pointer(perf_trace_buf_nmi, buf); 44 if (!total_ref_count) {
50 } 45 char *buf;
46 int i;
51 47
52 ret = event->perf_event_enable(event); 48 for (i = 0; i < 4; i++) {
53 if (!ret) { 49 buf = (char *)alloc_percpu(perf_trace_t);
54 total_ref_count++; 50 if (!buf)
55 return 0; 51 goto fail;
52
53 perf_trace_buf[i] = buf;
54 }
56 } 55 }
57 56
58fail_buf_nmi: 57 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
58 if (ret)
59 goto fail;
60
61 total_ref_count++;
62 return 0;
63
64fail:
59 if (!total_ref_count) { 65 if (!total_ref_count) {
60 free_percpu(perf_trace_buf_nmi); 66 int i;
61 free_percpu(perf_trace_buf); 67
62 perf_trace_buf_nmi = NULL; 68 for (i = 0; i < 4; i++) {
63 perf_trace_buf = NULL; 69 free_percpu(perf_trace_buf[i]);
70 perf_trace_buf[i] = NULL;
71 }
72 }
73
74 if (!--tp_event->perf_refcount) {
75 free_percpu(tp_event->perf_events);
76 tp_event->perf_events = NULL;
64 } 77 }
65fail_buf:
66 event->perf_refcount--;
67 78
68 return ret; 79 return ret;
69} 80}
70 81
71int perf_trace_enable(int event_id) 82int perf_trace_init(struct perf_event *p_event)
72{ 83{
73 struct ftrace_event_call *event; 84 struct ftrace_event_call *tp_event;
85 int event_id = p_event->attr.config;
74 int ret = -EINVAL; 86 int ret = -EINVAL;
75 87
76 mutex_lock(&event_mutex); 88 mutex_lock(&event_mutex);
77 list_for_each_entry(event, &ftrace_events, list) { 89 list_for_each_entry(tp_event, &ftrace_events, list) {
78 if (event->id == event_id && event->perf_event_enable && 90 if (tp_event->event.type == event_id &&
79 try_module_get(event->mod)) { 91 tp_event->class && tp_event->class->reg &&
80 ret = perf_trace_event_enable(event); 92 try_module_get(tp_event->mod)) {
93 ret = perf_trace_event_init(tp_event, p_event);
94 if (ret)
95 module_put(tp_event->mod);
81 break; 96 break;
82 } 97 }
83 } 98 }
@@ -86,90 +101,83 @@ int perf_trace_enable(int event_id)
86 return ret; 101 return ret;
87} 102}
88 103
89static void perf_trace_event_disable(struct ftrace_event_call *event) 104int perf_trace_enable(struct perf_event *p_event)
90{ 105{
91 char *buf, *nmi_buf; 106 struct ftrace_event_call *tp_event = p_event->tp_event;
92 107 struct hlist_head *list;
93 if (--event->perf_refcount > 0)
94 return;
95 108
96 event->perf_event_disable(event); 109 list = tp_event->perf_events;
110 if (WARN_ON_ONCE(!list))
111 return -EINVAL;
97 112
98 if (!--total_ref_count) { 113 list = this_cpu_ptr(list);
99 buf = perf_trace_buf; 114 hlist_add_head_rcu(&p_event->hlist_entry, list);
100 rcu_assign_pointer(perf_trace_buf, NULL);
101
102 nmi_buf = perf_trace_buf_nmi;
103 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
104 115
105 /* 116 return 0;
106 * Ensure every events in profiling have finished before 117}
107 * releasing the buffers
108 */
109 synchronize_sched();
110 118
111 free_percpu(buf); 119void perf_trace_disable(struct perf_event *p_event)
112 free_percpu(nmi_buf); 120{
113 } 121 hlist_del_rcu(&p_event->hlist_entry);
114} 122}
115 123
116void perf_trace_disable(int event_id) 124void perf_trace_destroy(struct perf_event *p_event)
117{ 125{
118 struct ftrace_event_call *event; 126 struct ftrace_event_call *tp_event = p_event->tp_event;
127 int i;
119 128
120 mutex_lock(&event_mutex); 129 mutex_lock(&event_mutex);
121 list_for_each_entry(event, &ftrace_events, list) { 130 if (--tp_event->perf_refcount > 0)
122 if (event->id == event_id) { 131 goto out;
123 perf_trace_event_disable(event); 132
124 module_put(event->mod); 133 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
125 break; 134
135 /*
136 * Ensure our callback won't be called anymore. The buffers
137 * will be freed after that.
138 */
139 tracepoint_synchronize_unregister();
140
141 free_percpu(tp_event->perf_events);
142 tp_event->perf_events = NULL;
143
144 if (!--total_ref_count) {
145 for (i = 0; i < 4; i++) {
146 free_percpu(perf_trace_buf[i]);
147 perf_trace_buf[i] = NULL;
126 } 148 }
127 } 149 }
150out:
151 module_put(tp_event->mod);
128 mutex_unlock(&event_mutex); 152 mutex_unlock(&event_mutex);
129} 153}
130 154
131__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 155__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
132 int *rctxp, unsigned long *irq_flags) 156 struct pt_regs *regs, int *rctxp)
133{ 157{
134 struct trace_entry *entry; 158 struct trace_entry *entry;
135 char *trace_buf, *raw_data; 159 unsigned long flags;
136 int pc, cpu; 160 char *raw_data;
161 int pc;
137 162
138 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); 163 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
139 164
140 pc = preempt_count(); 165 pc = preempt_count();
141 166
142 /* Protect the per cpu buffer, begin the rcu read side */
143 local_irq_save(*irq_flags);
144
145 *rctxp = perf_swevent_get_recursion_context(); 167 *rctxp = perf_swevent_get_recursion_context();
146 if (*rctxp < 0) 168 if (*rctxp < 0)
147 goto err_recursion; 169 return NULL;
148
149 cpu = smp_processor_id();
150
151 if (in_nmi())
152 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
153 else
154 trace_buf = rcu_dereference_sched(perf_trace_buf);
155
156 if (!trace_buf)
157 goto err;
158 170
159 raw_data = per_cpu_ptr(trace_buf, cpu); 171 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
160 172
161 /* zero the dead bytes from align to not leak stack to user */ 173 /* zero the dead bytes from align to not leak stack to user */
162 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); 174 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
163 175
164 entry = (struct trace_entry *)raw_data; 176 entry = (struct trace_entry *)raw_data;
165 tracing_generic_entry_update(entry, *irq_flags, pc); 177 local_save_flags(flags);
178 tracing_generic_entry_update(entry, flags, pc);
166 entry->type = type; 179 entry->type = type;
167 180
168 return raw_data; 181 return raw_data;
169err:
170 perf_swevent_put_recursion_context(*rctxp);
171err_recursion:
172 local_irq_restore(*irq_flags);
173 return NULL;
174} 182}
175EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 183EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
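Note (illustrative, not part of the diff): with the new perf_trace_buf_prepare() signature, callers pass their pt_regs and get the recursion context back through *rctxp instead of juggling saved irq flags; the returned buffer already lives in the per-context per-CPU area. A hedged sketch of the calling pattern -- the payload layout, the example_ names and the use of perf_fetch_caller_regs() are assumptions, not taken from this hunk:

#include <linux/ftrace_event.h>
#include <linux/perf_event.h>

struct example_entry {			/* hypothetical event payload */
	struct trace_entry	ent;
	unsigned long		value;
};

static void example_perf_probe(unsigned long value)
{
	struct example_entry *entry;
	struct pt_regs regs;
	int rctx;
	int size = ALIGN(sizeof(*entry), sizeof(u64));

	perf_fetch_caller_regs(&regs);	/* assumed helper of this era */

	entry = perf_trace_buf_prepare(size, 0 /* hypothetical type id */,
				       &regs, &rctx);
	if (!entry)
		return;			/* recursion detected: nothing to undo */

	entry->value = value;

	/*
	 * Hand the filled buffer to perf and release the recursion
	 * context.  The exact submit helper is not shown in this hunk,
	 * so it is left as a comment rather than guessed at.
	 */
}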
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c697c7043349..4c758f146328 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -28,10 +28,19 @@
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields);
31 32
32int trace_define_field(struct ftrace_event_call *call, const char *type, 33struct list_head *
33 const char *name, int offset, int size, int is_signed, 34trace_get_fields(struct ftrace_event_call *event_call)
34 int filter_type) 35{
36 if (!event_call->class->get_fields)
37 return &event_call->class->fields;
38 return event_call->class->get_fields(event_call);
39}
40
41static int __trace_define_field(struct list_head *head, const char *type,
42 const char *name, int offset, int size,
43 int is_signed, int filter_type)
35{ 44{
36 struct ftrace_event_field *field; 45 struct ftrace_event_field *field;
37 46
@@ -56,7 +65,7 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
56 field->size = size; 65 field->size = size;
57 field->is_signed = is_signed; 66 field->is_signed = is_signed;
58 67
59 list_add(&field->link, &call->fields); 68 list_add(&field->link, head);
60 69
61 return 0; 70 return 0;
62 71
@@ -67,17 +76,32 @@ err:
67 76
68 return -ENOMEM; 77 return -ENOMEM;
69} 78}
79
80int trace_define_field(struct ftrace_event_call *call, const char *type,
81 const char *name, int offset, int size, int is_signed,
82 int filter_type)
83{
84 struct list_head *head;
85
86 if (WARN_ON(!call->class))
87 return 0;
88
89 head = trace_get_fields(call);
90 return __trace_define_field(head, type, name, offset, size,
91 is_signed, filter_type);
92}
70EXPORT_SYMBOL_GPL(trace_define_field); 93EXPORT_SYMBOL_GPL(trace_define_field);
71 94
72#define __common_field(type, item) \ 95#define __common_field(type, item) \
73 ret = trace_define_field(call, #type, "common_" #item, \ 96 ret = __trace_define_field(&ftrace_common_fields, #type, \
74 offsetof(typeof(ent), item), \ 97 "common_" #item, \
75 sizeof(ent.item), \ 98 offsetof(typeof(ent), item), \
76 is_signed_type(type), FILTER_OTHER); \ 99 sizeof(ent.item), \
100 is_signed_type(type), FILTER_OTHER); \
77 if (ret) \ 101 if (ret) \
78 return ret; 102 return ret;
79 103
80static int trace_define_common_fields(struct ftrace_event_call *call) 104static int trace_define_common_fields(void)
81{ 105{
82 int ret; 106 int ret;
83 struct trace_entry ent; 107 struct trace_entry ent;
@@ -94,8 +118,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
94void trace_destroy_fields(struct ftrace_event_call *call) 118void trace_destroy_fields(struct ftrace_event_call *call)
95{ 119{
96 struct ftrace_event_field *field, *next; 120 struct ftrace_event_field *field, *next;
121 struct list_head *head;
97 122
98 list_for_each_entry_safe(field, next, &call->fields, link) { 123 head = trace_get_fields(call);
124 list_for_each_entry_safe(field, next, head, link) {
99 list_del(&field->link); 125 list_del(&field->link);
100 kfree(field->type); 126 kfree(field->type);
101 kfree(field->name); 127 kfree(field->name);
@@ -107,16 +133,63 @@ int trace_event_raw_init(struct ftrace_event_call *call)
107{ 133{
108 int id; 134 int id;
109 135
110 id = register_ftrace_event(call->event); 136 id = register_ftrace_event(&call->event);
111 if (!id) 137 if (!id)
112 return -ENODEV; 138 return -ENODEV;
113 call->id = id;
114 INIT_LIST_HEAD(&call->fields);
115 139
116 return 0; 140 return 0;
117} 141}
118EXPORT_SYMBOL_GPL(trace_event_raw_init); 142EXPORT_SYMBOL_GPL(trace_event_raw_init);
119 143
144int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
145{
146 switch (type) {
147 case TRACE_REG_REGISTER:
148 return tracepoint_probe_register(call->name,
149 call->class->probe,
150 call);
151 case TRACE_REG_UNREGISTER:
152 tracepoint_probe_unregister(call->name,
153 call->class->probe,
154 call);
155 return 0;
156
157#ifdef CONFIG_PERF_EVENTS
158 case TRACE_REG_PERF_REGISTER:
159 return tracepoint_probe_register(call->name,
160 call->class->perf_probe,
161 call);
162 case TRACE_REG_PERF_UNREGISTER:
163 tracepoint_probe_unregister(call->name,
164 call->class->perf_probe,
165 call);
166 return 0;
167#endif
168 }
169 return 0;
170}
171EXPORT_SYMBOL_GPL(ftrace_event_reg);
172
173void trace_event_enable_cmd_record(bool enable)
174{
175 struct ftrace_event_call *call;
176
177 mutex_lock(&event_mutex);
178 list_for_each_entry(call, &ftrace_events, list) {
179 if (!(call->flags & TRACE_EVENT_FL_ENABLED))
180 continue;
181
182 if (enable) {
183 tracing_start_cmdline_record();
184 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
185 } else {
186 tracing_stop_cmdline_record();
187 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
188 }
189 }
190 mutex_unlock(&event_mutex);
191}
192
120static int ftrace_event_enable_disable(struct ftrace_event_call *call, 193static int ftrace_event_enable_disable(struct ftrace_event_call *call,
121 int enable) 194 int enable)
122{ 195{
@@ -124,23 +197,29 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
124 197
125 switch (enable) { 198 switch (enable) {
126 case 0: 199 case 0:
127 if (call->enabled) { 200 if (call->flags & TRACE_EVENT_FL_ENABLED) {
128 call->enabled = 0; 201 call->flags &= ~TRACE_EVENT_FL_ENABLED;
129 tracing_stop_cmdline_record(); 202 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
130 call->unregfunc(call); 203 tracing_stop_cmdline_record();
204 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
205 }
206 call->class->reg(call, TRACE_REG_UNREGISTER);
131 } 207 }
132 break; 208 break;
133 case 1: 209 case 1:
134 if (!call->enabled) { 210 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
135 tracing_start_cmdline_record(); 211 if (trace_flags & TRACE_ITER_RECORD_CMD) {
136 ret = call->regfunc(call); 212 tracing_start_cmdline_record();
213 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
214 }
215 ret = call->class->reg(call, TRACE_REG_REGISTER);
137 if (ret) { 216 if (ret) {
138 tracing_stop_cmdline_record(); 217 tracing_stop_cmdline_record();
139 pr_info("event trace: Could not enable event " 218 pr_info("event trace: Could not enable event "
140 "%s\n", call->name); 219 "%s\n", call->name);
141 break; 220 break;
142 } 221 }
143 call->enabled = 1; 222 call->flags |= TRACE_EVENT_FL_ENABLED;
144 } 223 }
145 break; 224 break;
146 } 225 }
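Note (illustrative, not part of the diff): the hunk above retires the ->enabled integer in favour of bits in call->flags, so the enable state and the per-event cmdline-recording state can sit side by side in one word. The idiom, in miniature (example_ names are hypothetical):

static inline bool example_event_enabled(struct ftrace_event_call *call)
{
	return call->flags & TRACE_EVENT_FL_ENABLED;
}

static inline void example_mark_recorded_cmd(struct ftrace_event_call *call,
					     bool on)
{
	if (on)
		call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
	else
		call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
}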
@@ -171,15 +250,15 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
171 mutex_lock(&event_mutex); 250 mutex_lock(&event_mutex);
172 list_for_each_entry(call, &ftrace_events, list) { 251 list_for_each_entry(call, &ftrace_events, list) {
173 252
174 if (!call->name || !call->regfunc) 253 if (!call->name || !call->class || !call->class->reg)
175 continue; 254 continue;
176 255
177 if (match && 256 if (match &&
178 strcmp(match, call->name) != 0 && 257 strcmp(match, call->name) != 0 &&
179 strcmp(match, call->system) != 0) 258 strcmp(match, call->class->system) != 0)
180 continue; 259 continue;
181 260
182 if (sub && strcmp(sub, call->system) != 0) 261 if (sub && strcmp(sub, call->class->system) != 0)
183 continue; 262 continue;
184 263
185 if (event && strcmp(event, call->name) != 0) 264 if (event && strcmp(event, call->name) != 0)
@@ -297,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
297 * The ftrace subsystem is for showing formats only. 376 * The ftrace subsystem is for showing formats only.
298 * They can not be enabled or disabled via the event files. 377 * They can not be enabled or disabled via the event files.
299 */ 378 */
300 if (call->regfunc) 379 if (call->class && call->class->reg)
301 return call; 380 return call;
302 } 381 }
303 382
@@ -328,7 +407,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
328 (*pos)++; 407 (*pos)++;
329 408
330 list_for_each_entry_continue(call, &ftrace_events, list) { 409 list_for_each_entry_continue(call, &ftrace_events, list) {
331 if (call->enabled) 410 if (call->flags & TRACE_EVENT_FL_ENABLED)
332 return call; 411 return call;
333 } 412 }
334 413
@@ -355,8 +434,8 @@ static int t_show(struct seq_file *m, void *v)
355{ 434{
356 struct ftrace_event_call *call = v; 435 struct ftrace_event_call *call = v;
357 436
358 if (strcmp(call->system, TRACE_SYSTEM) != 0) 437 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
359 seq_printf(m, "%s:", call->system); 438 seq_printf(m, "%s:", call->class->system);
360 seq_printf(m, "%s\n", call->name); 439 seq_printf(m, "%s\n", call->name);
361 440
362 return 0; 441 return 0;
@@ -387,7 +466,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
387 struct ftrace_event_call *call = filp->private_data; 466 struct ftrace_event_call *call = filp->private_data;
388 char *buf; 467 char *buf;
389 468
390 if (call->enabled) 469 if (call->flags & TRACE_EVENT_FL_ENABLED)
391 buf = "1\n"; 470 buf = "1\n";
392 else 471 else
393 buf = "0\n"; 472 buf = "0\n";
@@ -450,10 +529,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
450 529
451 mutex_lock(&event_mutex); 530 mutex_lock(&event_mutex);
452 list_for_each_entry(call, &ftrace_events, list) { 531 list_for_each_entry(call, &ftrace_events, list) {
453 if (!call->name || !call->regfunc) 532 if (!call->name || !call->class || !call->class->reg)
454 continue; 533 continue;
455 534
456 if (system && strcmp(call->system, system) != 0) 535 if (system && strcmp(call->class->system, system) != 0)
457 continue; 536 continue;
458 537
459 /* 538 /*
@@ -461,7 +540,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
461 * or if all events or cleared, or if we have 540
462 * a mixture. 541 * a mixture.
463 */ 542 */
464 set |= (1 << !!call->enabled); 543 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
465 544
466 /* 545 /*
467 * If we have a mixture, no need to look further. 546 * If we have a mixture, no need to look further.
@@ -519,83 +598,165 @@ out:
519 return ret; 598 return ret;
520} 599}
521 600
522static ssize_t 601enum {
523event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 602 FORMAT_HEADER = 1,
524 loff_t *ppos) 603 FORMAT_PRINTFMT = 2,
604};
605
606static void *f_next(struct seq_file *m, void *v, loff_t *pos)
525{ 607{
526 struct ftrace_event_call *call = filp->private_data; 608 struct ftrace_event_call *call = m->private;
527 struct ftrace_event_field *field; 609 struct ftrace_event_field *field;
528 struct trace_seq *s; 610 struct list_head *head;
529 int common_field_count = 5;
530 char *buf;
531 int r = 0;
532
533 if (*ppos)
534 return 0;
535 611
536 s = kmalloc(sizeof(*s), GFP_KERNEL); 612 (*pos)++;
537 if (!s)
538 return -ENOMEM;
539 613
540 trace_seq_init(s); 614 switch ((unsigned long)v) {
615 case FORMAT_HEADER:
616 head = &ftrace_common_fields;
541 617
542 trace_seq_printf(s, "name: %s\n", call->name); 618 if (unlikely(list_empty(head)))
543 trace_seq_printf(s, "ID: %d\n", call->id); 619 return NULL;
544 trace_seq_printf(s, "format:\n");
545 620
546 list_for_each_entry_reverse(field, &call->fields, link) { 621 field = list_entry(head->prev, struct ftrace_event_field, link);
547 /* 622 return field;
548 * Smartly shows the array type(except dynamic array).
549 * Normal:
550 * field:TYPE VAR
551 * If TYPE := TYPE[LEN], it is shown:
552 * field:TYPE VAR[LEN]
553 */
554 const char *array_descriptor = strchr(field->type, '[');
555 623
556 if (!strncmp(field->type, "__data_loc", 10)) 624 case FORMAT_PRINTFMT:
557 array_descriptor = NULL; 625 /* all done */
626 return NULL;
627 }
558 628
559 if (!array_descriptor) { 629 head = trace_get_fields(call);
560 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
561 "\tsize:%u;\tsigned:%d;\n",
562 field->type, field->name, field->offset,
563 field->size, !!field->is_signed);
564 } else {
565 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
566 "\tsize:%u;\tsigned:%d;\n",
567 (int)(array_descriptor - field->type),
568 field->type, field->name,
569 array_descriptor, field->offset,
570 field->size, !!field->is_signed);
571 }
572 630
573 if (--common_field_count == 0) 631 /*
574 r = trace_seq_printf(s, "\n"); 632 * To separate common fields from event fields, the
633 * LSB is set on the first event field. Clear it in case.
634 */
635 v = (void *)((unsigned long)v & ~1L);
575 636
576 if (!r) 637 field = v;
577 break; 638 /*
639 * If this is a common field, and at the end of the list, then
640 * continue with main list.
641 */
642 if (field->link.prev == &ftrace_common_fields) {
643 if (unlikely(list_empty(head)))
644 return NULL;
645 field = list_entry(head->prev, struct ftrace_event_field, link);
646 /* Set the LSB to notify f_show to print an extra newline */
647 field = (struct ftrace_event_field *)
648 ((unsigned long)field | 1);
649 return field;
578 } 650 }
579 651
580 if (r) 652 /* If we are done tell f_show to print the format */
581 r = trace_seq_printf(s, "\nprint fmt: %s\n", 653 if (field->link.prev == head)
582 call->print_fmt); 654 return (void *)FORMAT_PRINTFMT;
583 655
584 if (!r) { 656 field = list_entry(field->link.prev, struct ftrace_event_field, link);
585 /* 657
586 * ug! The format output is bigger than a PAGE!! 658 return field;
587 */ 659}
588 buf = "FORMAT TOO BIG\n"; 660
589 r = simple_read_from_buffer(ubuf, cnt, ppos, 661static void *f_start(struct seq_file *m, loff_t *pos)
590 buf, strlen(buf)); 662{
591 goto out; 663 loff_t l = 0;
664 void *p;
665
666 /* Start by showing the header */
667 if (!*pos)
668 return (void *)FORMAT_HEADER;
669
670 p = (void *)FORMAT_HEADER;
671 do {
672 p = f_next(m, p, &l);
673 } while (p && l < *pos);
674
675 return p;
676}
677
678static int f_show(struct seq_file *m, void *v)
679{
680 struct ftrace_event_call *call = m->private;
681 struct ftrace_event_field *field;
682 const char *array_descriptor;
683
684 switch ((unsigned long)v) {
685 case FORMAT_HEADER:
686 seq_printf(m, "name: %s\n", call->name);
687 seq_printf(m, "ID: %d\n", call->event.type);
688 seq_printf(m, "format:\n");
689 return 0;
690
691 case FORMAT_PRINTFMT:
692 seq_printf(m, "\nprint fmt: %s\n",
693 call->print_fmt);
694 return 0;
592 } 695 }
593 696
594 r = simple_read_from_buffer(ubuf, cnt, ppos, 697 /*
595 s->buffer, s->len); 698 * To separate common fields from event fields, the
596 out: 699 * LSB is set on the first event field. Clear it and
597 kfree(s); 700 * print a newline if it is set.
598 return r; 701 */
702 if ((unsigned long)v & 1) {
703 seq_putc(m, '\n');
704 v = (void *)((unsigned long)v & ~1L);
705 }
706
707 field = v;
708
709 /*
710 * Smartly shows the array type(except dynamic array).
711 * Normal:
712 * field:TYPE VAR
713 * If TYPE := TYPE[LEN], it is shown:
714 * field:TYPE VAR[LEN]
715 */
716 array_descriptor = strchr(field->type, '[');
717
718 if (!strncmp(field->type, "__data_loc", 10))
719 array_descriptor = NULL;
720
721 if (!array_descriptor)
722 seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
723 field->type, field->name, field->offset,
724 field->size, !!field->is_signed);
725 else
726 seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
727 (int)(array_descriptor - field->type),
728 field->type, field->name,
729 array_descriptor, field->offset,
730 field->size, !!field->is_signed);
731
732 return 0;
733}
734
735static void f_stop(struct seq_file *m, void *p)
736{
737}
738
739static const struct seq_operations trace_format_seq_ops = {
740 .start = f_start,
741 .next = f_next,
742 .stop = f_stop,
743 .show = f_show,
744};
745
746static int trace_format_open(struct inode *inode, struct file *file)
747{
748 struct ftrace_event_call *call = inode->i_private;
749 struct seq_file *m;
750 int ret;
751
752 ret = seq_open(file, &trace_format_seq_ops);
753 if (ret < 0)
754 return ret;
755
756 m = file->private_data;
757 m->private = call;
758
759 return 0;
599} 760}
600 761
601static ssize_t 762static ssize_t
@@ -613,7 +774,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
613 return -ENOMEM; 774 return -ENOMEM;
614 775
615 trace_seq_init(s); 776 trace_seq_init(s);
616 trace_seq_printf(s, "%d\n", call->id); 777 trace_seq_printf(s, "%d\n", call->event.type);
617 778
618 r = simple_read_from_buffer(ubuf, cnt, ppos, 779 r = simple_read_from_buffer(ubuf, cnt, ppos,
619 s->buffer, s->len); 780 s->buffer, s->len);
@@ -793,8 +954,10 @@ static const struct file_operations ftrace_enable_fops = {
793}; 954};
794 955
795static const struct file_operations ftrace_event_format_fops = { 956static const struct file_operations ftrace_event_format_fops = {
796 .open = tracing_open_generic, 957 .open = trace_format_open,
797 .read = event_format_read, 958 .read = seq_read,
959 .llseek = seq_lseek,
960 .release = seq_release,
798}; 961};
799 962
800static const struct file_operations ftrace_event_id_fops = { 963static const struct file_operations ftrace_event_id_fops = {
@@ -919,14 +1082,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
919 const struct file_operations *filter, 1082 const struct file_operations *filter,
920 const struct file_operations *format) 1083 const struct file_operations *format)
921{ 1084{
1085 struct list_head *head;
922 int ret; 1086 int ret;
923 1087
924 /* 1088 /*
925 * If the trace point header did not define TRACE_SYSTEM 1089 * If the trace point header did not define TRACE_SYSTEM
926 * then the system would be called "TRACE_SYSTEM". 1090 * then the system would be called "TRACE_SYSTEM".
927 */ 1091 */
928 if (strcmp(call->system, TRACE_SYSTEM) != 0) 1092 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
929 d_events = event_subsystem_dir(call->system, d_events); 1093 d_events = event_subsystem_dir(call->class->system, d_events);
930 1094
931 call->dir = debugfs_create_dir(call->name, d_events); 1095 call->dir = debugfs_create_dir(call->name, d_events);
932 if (!call->dir) { 1096 if (!call->dir) {
@@ -935,26 +1099,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
935 return -1; 1099 return -1;
936 } 1100 }
937 1101
938 if (call->regfunc) 1102 if (call->class->reg)
939 trace_create_file("enable", 0644, call->dir, call, 1103 trace_create_file("enable", 0644, call->dir, call,
940 enable); 1104 enable);
941 1105
942 if (call->id && call->perf_event_enable) 1106#ifdef CONFIG_PERF_EVENTS
1107 if (call->event.type && call->class->reg)
943 trace_create_file("id", 0444, call->dir, call, 1108 trace_create_file("id", 0444, call->dir, call,
944 id); 1109 id);
1110#endif
945 1111
946 if (call->define_fields) { 1112 /*
947 ret = trace_define_common_fields(call); 1113 * Other events may have the same class. Only update
948 if (!ret) 1114 * the fields if they are not already defined.
949 ret = call->define_fields(call); 1115 */
1116 head = trace_get_fields(call);
1117 if (list_empty(head)) {
1118 ret = call->class->define_fields(call);
950 if (ret < 0) { 1119 if (ret < 0) {
951 pr_warning("Could not initialize trace point" 1120 pr_warning("Could not initialize trace point"
952 " events/%s\n", call->name); 1121 " events/%s\n", call->name);
953 return ret; 1122 return ret;
954 } 1123 }
955 trace_create_file("filter", 0644, call->dir, call,
956 filter);
957 } 1124 }
1125 trace_create_file("filter", 0644, call->dir, call,
1126 filter);
958 1127
959 trace_create_file("format", 0444, call->dir, call, 1128 trace_create_file("format", 0444, call->dir, call,
960 format); 1129 format);
@@ -962,20 +1131,26 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
962 return 0; 1131 return 0;
963} 1132}
964 1133
965static int __trace_add_event_call(struct ftrace_event_call *call) 1134static int
1135__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1136 const struct file_operations *id,
1137 const struct file_operations *enable,
1138 const struct file_operations *filter,
1139 const struct file_operations *format)
966{ 1140{
967 struct dentry *d_events; 1141 struct dentry *d_events;
968 int ret; 1142 int ret;
969 1143
1144 /* The linker may leave blanks */
970 if (!call->name) 1145 if (!call->name)
971 return -EINVAL; 1146 return -EINVAL;
972 1147
973 if (call->raw_init) { 1148 if (call->class->raw_init) {
974 ret = call->raw_init(call); 1149 ret = call->class->raw_init(call);
975 if (ret < 0) { 1150 if (ret < 0) {
976 if (ret != -ENOSYS) 1151 if (ret != -ENOSYS)
977 pr_warning("Could not initialize trace " 1152 pr_warning("Could not initialize trace events/%s\n",
978 "events/%s\n", call->name); 1153 call->name);
979 return ret; 1154 return ret;
980 } 1155 }
981 } 1156 }
@@ -984,11 +1159,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
984 if (!d_events) 1159 if (!d_events)
985 return -ENOENT; 1160 return -ENOENT;
986 1161
987 ret = event_create_dir(call, d_events, &ftrace_event_id_fops, 1162 ret = event_create_dir(call, d_events, id, enable, filter, format);
988 &ftrace_enable_fops, &ftrace_event_filter_fops,
989 &ftrace_event_format_fops);
990 if (!ret) 1163 if (!ret)
991 list_add(&call->list, &ftrace_events); 1164 list_add(&call->list, &ftrace_events);
1165 call->mod = mod;
992 1166
993 return ret; 1167 return ret;
994} 1168}
@@ -998,7 +1172,10 @@ int trace_add_event_call(struct ftrace_event_call *call)
998{ 1172{
999 int ret; 1173 int ret;
1000 mutex_lock(&event_mutex); 1174 mutex_lock(&event_mutex);
1001 ret = __trace_add_event_call(call); 1175 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1176 &ftrace_enable_fops,
1177 &ftrace_event_filter_fops,
1178 &ftrace_event_format_fops);
1002 mutex_unlock(&event_mutex); 1179 mutex_unlock(&event_mutex);
1003 return ret; 1180 return ret;
1004} 1181}
@@ -1035,13 +1212,13 @@ static void remove_subsystem_dir(const char *name)
1035static void __trace_remove_event_call(struct ftrace_event_call *call) 1212static void __trace_remove_event_call(struct ftrace_event_call *call)
1036{ 1213{
1037 ftrace_event_enable_disable(call, 0); 1214 ftrace_event_enable_disable(call, 0);
1038 if (call->event) 1215 if (call->event.funcs)
1039 __unregister_ftrace_event(call->event); 1216 __unregister_ftrace_event(&call->event);
1040 debugfs_remove_recursive(call->dir); 1217 debugfs_remove_recursive(call->dir);
1041 list_del(&call->list); 1218 list_del(&call->list);
1042 trace_destroy_fields(call); 1219 trace_destroy_fields(call);
1043 destroy_preds(call); 1220 destroy_preds(call);
1044 remove_subsystem_dir(call->system); 1221 remove_subsystem_dir(call->class->system);
1045} 1222}
1046 1223
1047/* Remove an event_call */ 1224/* Remove an event_call */
@@ -1115,8 +1292,6 @@ static void trace_module_add_events(struct module *mod)
1115{ 1292{
1116 struct ftrace_module_file_ops *file_ops = NULL; 1293 struct ftrace_module_file_ops *file_ops = NULL;
1117 struct ftrace_event_call *call, *start, *end; 1294 struct ftrace_event_call *call, *start, *end;
1118 struct dentry *d_events;
1119 int ret;
1120 1295
1121 start = mod->trace_events; 1296 start = mod->trace_events;
1122 end = mod->trace_events + mod->num_trace_events; 1297 end = mod->trace_events + mod->num_trace_events;
@@ -1124,38 +1299,14 @@ static void trace_module_add_events(struct module *mod)
1124 if (start == end) 1299 if (start == end)
1125 return; 1300 return;
1126 1301
1127 d_events = event_trace_events_dir(); 1302 file_ops = trace_create_file_ops(mod);
1128 if (!d_events) 1303 if (!file_ops)
1129 return; 1304 return;
1130 1305
1131 for_each_event(call, start, end) { 1306 for_each_event(call, start, end) {
1132 /* The linker may leave blanks */ 1307 __trace_add_event_call(call, mod,
1133 if (!call->name)
1134 continue;
1135 if (call->raw_init) {
1136 ret = call->raw_init(call);
1137 if (ret < 0) {
1138 if (ret != -ENOSYS)
1139 pr_warning("Could not initialize trace "
1140 "point events/%s\n", call->name);
1141 continue;
1142 }
1143 }
1144 /*
1145 * This module has events, create file ops for this module
1146 * if not already done.
1147 */
1148 if (!file_ops) {
1149 file_ops = trace_create_file_ops(mod);
1150 if (!file_ops)
1151 return;
1152 }
1153 call->mod = mod;
1154 ret = event_create_dir(call, d_events,
1155 &file_ops->id, &file_ops->enable, 1308 &file_ops->id, &file_ops->enable,
1156 &file_ops->filter, &file_ops->format); 1309 &file_ops->filter, &file_ops->format);
1157 if (!ret)
1158 list_add(&call->list, &ftrace_events);
1159 } 1310 }
1160} 1311}
1161 1312
@@ -1282,25 +1433,14 @@ static __init int event_trace_init(void)
1282 trace_create_file("enable", 0644, d_events, 1433 trace_create_file("enable", 0644, d_events,
1283 NULL, &ftrace_system_enable_fops); 1434 NULL, &ftrace_system_enable_fops);
1284 1435
1436 if (trace_define_common_fields())
1437 pr_warning("tracing: Failed to allocate common fields");
1438
1285 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1439 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1286 /* The linker may leave blanks */ 1440 __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1287 if (!call->name)
1288 continue;
1289 if (call->raw_init) {
1290 ret = call->raw_init(call);
1291 if (ret < 0) {
1292 if (ret != -ENOSYS)
1293 pr_warning("Could not initialize trace "
1294 "point events/%s\n", call->name);
1295 continue;
1296 }
1297 }
1298 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1299 &ftrace_enable_fops, 1441 &ftrace_enable_fops,
1300 &ftrace_event_filter_fops, 1442 &ftrace_event_filter_fops,
1301 &ftrace_event_format_fops); 1443 &ftrace_event_format_fops);
1302 if (!ret)
1303 list_add(&call->list, &ftrace_events);
1304 } 1444 }
1305 1445
1306 while (true) { 1446 while (true) {
@@ -1388,8 +1528,8 @@ static __init void event_trace_self_tests(void)
1388 1528
1389 list_for_each_entry(call, &ftrace_events, list) { 1529 list_for_each_entry(call, &ftrace_events, list) {
1390 1530
1391 /* Only test those that have a regfunc */ 1531 /* Only test those that have a probe */
1392 if (!call->regfunc) 1532 if (!call->class || !call->class->probe)
1393 continue; 1533 continue;
1394 1534
1395/* 1535/*
@@ -1399,8 +1539,8 @@ static __init void event_trace_self_tests(void)
1399 * syscalls as we test. 1539 * syscalls as we test.
1400 */ 1540 */
1401#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS 1541#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1402 if (call->system && 1542 if (call->class->system &&
1403 strcmp(call->system, "syscalls") == 0) 1543 strcmp(call->class->system, "syscalls") == 0)
1404 continue; 1544 continue;
1405#endif 1545#endif
1406 1546
@@ -1410,7 +1550,7 @@ static __init void event_trace_self_tests(void)
1410 * If an event is already enabled, someone is using 1550 * If an event is already enabled, someone is using
1411 * it and the self test should not be on. 1551 * it and the self test should not be on.
1412 */ 1552 */
1413 if (call->enabled) { 1553 if (call->flags & TRACE_EVENT_FL_ENABLED) {
1414 pr_warning("Enabled event during self test!\n"); 1554 pr_warning("Enabled event during self test!\n");
1415 WARN_ON_ONCE(1); 1555 WARN_ON_ONCE(1);
1416 continue; 1556 continue;
@@ -1487,12 +1627,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1487 struct ftrace_entry *entry; 1627 struct ftrace_entry *entry;
1488 unsigned long flags; 1628 unsigned long flags;
1489 long disabled; 1629 long disabled;
1490 int resched;
1491 int cpu; 1630 int cpu;
1492 int pc; 1631 int pc;
1493 1632
1494 pc = preempt_count(); 1633 pc = preempt_count();
1495 resched = ftrace_preempt_disable(); 1634 preempt_disable_notrace();
1496 cpu = raw_smp_processor_id(); 1635 cpu = raw_smp_processor_id();
1497 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); 1636 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1498 1637
@@ -1514,7 +1653,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1514 1653
1515 out: 1654 out:
1516 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1655 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1517 ftrace_preempt_enable(resched); 1656 preempt_enable_notrace();
1518} 1657}
1519 1658
1520static struct ftrace_ops trace_ops __initdata = 1659static struct ftrace_ops trace_ops __initdata =
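Note (illustrative, not part of the diff): the "format" file above moves from a one-shot kmalloc'd trace_seq (with its "FORMAT TOO BIG" escape hatch) to a proper seq_file iterator, so arbitrarily long field lists stream out one record at a time. A self-contained sketch of the same pattern over a toy array (all example_ names are hypothetical):

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/seq_file.h>

static const char *const example_items[] = { "one", "two", "three" };

static void *example_start(struct seq_file *m, loff_t *pos)
{
	return *pos < ARRAY_SIZE(example_items)
		? (void *)&example_items[*pos] : NULL;
}

static void *example_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return example_start(m, pos);
}

static void example_stop(struct seq_file *m, void *v)
{
}

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char *const *)v);
	return 0;
}

static const struct seq_operations example_seq_ops = {
	.start	= example_start,
	.next	= example_next,
	.stop	= example_stop,
	.show	= example_show,
};

static int example_open(struct inode *inode, struct file *file)
{
	int ret = seq_open(file, &example_seq_ops);

	if (!ret)	/* stash the per-file object, as trace_format_open() does */
		((struct seq_file *)file->private_data)->private = inode->i_private;
	return ret;
}

static const struct file_operations example_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};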
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 88c0b6dbd7fe..36d40104b17f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -497,11 +497,11 @@ void print_subsystem_event_filter(struct event_subsystem *system,
497} 497}
498 498
499static struct ftrace_event_field * 499static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500__find_event_field(struct list_head *head, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 503
504 list_for_each_entry(field, &call->fields, link) { 504 list_for_each_entry(field, head, link) {
505 if (!strcmp(field->name, name)) 505 if (!strcmp(field->name, name))
506 return field; 506 return field;
507 } 507 }
@@ -509,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
509 return NULL; 509 return NULL;
510} 510}
511 511
512static struct ftrace_event_field *
513find_event_field(struct ftrace_event_call *call, char *name)
514{
515 struct ftrace_event_field *field;
516 struct list_head *head;
517
518 field = __find_event_field(&ftrace_common_fields, name);
519 if (field)
520 return field;
521
522 head = trace_get_fields(call);
523 return __find_event_field(head, name);
524}
525
512static void filter_free_pred(struct filter_pred *pred) 526static void filter_free_pred(struct filter_pred *pred)
513{ 527{
514 if (!pred) 528 if (!pred)
@@ -545,7 +559,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
545 struct event_filter *filter = call->filter; 559 struct event_filter *filter = call->filter;
546 int i; 560 int i;
547 561
548 call->filter_active = 0; 562 call->flags &= ~TRACE_EVENT_FL_FILTERED;
549 filter->n_preds = 0; 563 filter->n_preds = 0;
550 564
551 for (i = 0; i < MAX_FILTER_PRED; i++) 565 for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -572,7 +586,7 @@ void destroy_preds(struct ftrace_event_call *call)
572{ 586{
573 __free_preds(call->filter); 587 __free_preds(call->filter);
574 call->filter = NULL; 588 call->filter = NULL;
575 call->filter_active = 0; 589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
576} 590}
577 591
578static struct event_filter *__alloc_preds(void) 592static struct event_filter *__alloc_preds(void)
@@ -611,7 +625,7 @@ static int init_preds(struct ftrace_event_call *call)
611 if (call->filter) 625 if (call->filter)
612 return 0; 626 return 0;
613 627
614 call->filter_active = 0; 628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
615 call->filter = __alloc_preds(); 629 call->filter = __alloc_preds();
616 if (IS_ERR(call->filter)) 630 if (IS_ERR(call->filter))
617 return PTR_ERR(call->filter); 631 return PTR_ERR(call->filter);
@@ -625,10 +639,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
625 int err; 639 int err;
626 640
627 list_for_each_entry(call, &ftrace_events, list) { 641 list_for_each_entry(call, &ftrace_events, list) {
628 if (!call->define_fields) 642 if (strcmp(call->class->system, system->name) != 0)
629 continue;
630
631 if (strcmp(call->system, system->name) != 0)
632 continue; 643 continue;
633 644
634 err = init_preds(call); 645 err = init_preds(call);
@@ -644,10 +655,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
644 struct ftrace_event_call *call; 655 struct ftrace_event_call *call;
645 656
646 list_for_each_entry(call, &ftrace_events, list) { 657 list_for_each_entry(call, &ftrace_events, list) {
647 if (!call->define_fields) 658 if (strcmp(call->class->system, system->name) != 0)
648 continue;
649
650 if (strcmp(call->system, system->name) != 0)
651 continue; 659 continue;
652 660
653 filter_disable_preds(call); 661 filter_disable_preds(call);
@@ -1249,10 +1257,7 @@ static int replace_system_preds(struct event_subsystem *system,
1249 list_for_each_entry(call, &ftrace_events, list) { 1257 list_for_each_entry(call, &ftrace_events, list) {
1250 struct event_filter *filter = call->filter; 1258 struct event_filter *filter = call->filter;
1251 1259
1252 if (!call->define_fields) 1260 if (strcmp(call->class->system, system->name) != 0)
1253 continue;
1254
1255 if (strcmp(call->system, system->name) != 0)
1256 continue; 1261 continue;
1257 1262
1258 /* try to see if the filter can be applied */ 1263 /* try to see if the filter can be applied */
@@ -1266,7 +1271,7 @@ static int replace_system_preds(struct event_subsystem *system,
1266 if (err) 1271 if (err)
1267 filter_disable_preds(call); 1272 filter_disable_preds(call);
1268 else { 1273 else {
1269 call->filter_active = 1; 1274 call->flags |= TRACE_EVENT_FL_FILTERED;
1270 replace_filter_string(filter, filter_string); 1275 replace_filter_string(filter, filter_string);
1271 } 1276 }
1272 fail = false; 1277 fail = false;
@@ -1315,7 +1320,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1315 if (err) 1320 if (err)
1316 append_filter_err(ps, call->filter); 1321 append_filter_err(ps, call->filter);
1317 else 1322 else
1318 call->filter_active = 1; 1323 call->flags |= TRACE_EVENT_FL_FILTERED;
1319out: 1324out:
1320 filter_opstack_clear(ps); 1325 filter_opstack_clear(ps);
1321 postfix_clear(ps); 1326 postfix_clear(ps);
@@ -1393,12 +1398,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1393 mutex_lock(&event_mutex); 1398 mutex_lock(&event_mutex);
1394 1399
1395 list_for_each_entry(call, &ftrace_events, list) { 1400 list_for_each_entry(call, &ftrace_events, list) {
1396 if (call->id == event_id) 1401 if (call->event.type == event_id)
1397 break; 1402 break;
1398 } 1403 }
1399 1404
1400 err = -EINVAL; 1405 err = -EINVAL;
1401 if (!call) 1406 if (&call->list == &ftrace_events)
1402 goto out_unlock; 1407 goto out_unlock;
1403 1408
1404 err = -EEXIST; 1409 err = -EEXIST;
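Note (illustrative, not part of the diff): the "&call->list == &ftrace_events" test above fixes a subtle point -- after list_for_each_entry() runs to completion the cursor is not NULL, it points at the list head reinterpreted as an entry, so the old "if (!call)" could never fire. A tiny sketch of the correct lookup shape:

#include <linux/list.h>

struct example_item {
	struct list_head	link;
	int			id;
};

/* returns NULL when nothing matches; the loop cursor itself never does */
static struct example_item *example_find(struct list_head *head, int id)
{
	struct example_item *it;

	list_for_each_entry(it, head, link) {
		if (it->id == id)
			return it;
	}
	return NULL;
}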
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e091f64ba6ce..4ba44deaac25 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
125 125
126#include "trace_entries.h" 126#include "trace_entries.h"
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{
130 INIT_LIST_HEAD(&call->fields);
131 return 0;
132}
133
134#undef __entry 128#undef __entry
135#define __entry REC 129#define __entry REC
136 130
@@ -153,17 +147,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 147#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154 148
155#undef FTRACE_ENTRY 149#undef FTRACE_ENTRY
156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 150#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
151 \
152struct ftrace_event_class event_class_ftrace_##call = { \
153 .system = __stringify(TRACE_SYSTEM), \
154 .define_fields = ftrace_define_fields_##call, \
155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
156}; \
157 \ 157 \
158struct ftrace_event_call __used \ 158struct ftrace_event_call __used \
159__attribute__((__aligned__(4))) \ 159__attribute__((__aligned__(4))) \
160__attribute__((section("_ftrace_events"))) event_##call = { \ 160__attribute__((section("_ftrace_events"))) event_##call = { \
161 .name = #call, \ 161 .name = #call, \
162 .id = type, \ 162 .event.type = etype, \
163 .system = __stringify(TRACE_SYSTEM), \ 163 .class = &event_class_ftrace_##call, \
164 .raw_init = ftrace_raw_init_event, \
165 .print_fmt = print, \ 164 .print_fmt = print, \
166 .define_fields = ftrace_define_fields_##call, \
167}; \ 165}; \
168 166
169#include "trace_entries.h" 167#include "trace_entries.h"
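Note (illustrative, not part of the diff): with the reworked FTRACE_ENTRY() above, one shared ftrace_event_class now carries the system name, the field list and the define_fields callback, while each ftrace_event_call keeps only its name, its event.type and a pointer to that class. Expanded by hand for a hypothetical entry named foo (TRACE_FOO and the print format are placeholders):

struct ftrace_event_class event_class_ftrace_foo = {
	.system		= __stringify(TRACE_SYSTEM),
	.define_fields	= ftrace_define_fields_foo,
	.fields		= LIST_HEAD_INIT(event_class_ftrace_foo.fields),
};

struct ftrace_event_call __used
__attribute__((__aligned__(4)))
__attribute__((section("_ftrace_events"))) event_foo = {
	.name		= "foo",
	.event.type	= TRACE_FOO,		/* placeholder etype */
	.class		= &event_class_ftrace_foo,
	.print_fmt	= "\"example\"",	/* placeholder */
};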
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
54 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
55 unsigned long flags; 55 unsigned long flags;
56 long disabled; 56 long disabled;
57 int cpu, resched; 57 int cpu;
58 int pc; 58 int pc;
59 59
60 if (unlikely(!ftrace_function_enabled)) 60 if (unlikely(!ftrace_function_enabled))
61 return; 61 return;
62 62
63 pc = preempt_count(); 63 pc = preempt_count();
64 resched = ftrace_preempt_disable(); 64 preempt_disable_notrace();
65 local_save_flags(flags); 65 local_save_flags(flags);
66 cpu = raw_smp_processor_id(); 66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu]; 67 data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
71 trace_function(tr, ip, parent_ip, flags, pc); 71 trace_function(tr, ip, parent_ip, flags, pc);
72 72
73 atomic_dec(&data->disabled); 73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched); 74 preempt_enable_notrace();
75} 75}
76 76
77static void 77static void
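Note (illustrative, not part of the diff): the tracer callbacks above drop the ftrace_preempt_disable()/ftrace_preempt_enable(resched) helpers in favour of the plain notrace preemption guards, which need no saved "resched" state. The resulting shape of such a hook (example_ name is hypothetical):

#include <linux/preempt.h>

static void example_trace_hook(unsigned long ip, unsigned long parent_ip)
{
	preempt_disable_notrace();

	/* per-CPU work that must neither migrate nor be traced recursively */

	preempt_enable_notrace();
}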
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9aed1a5cf553..6f233698518e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -40,7 +40,7 @@ struct fgraph_data {
40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
41#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
44 44
45static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
179 return ret; 179 return ret;
180} 180}
181 181
182static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
183 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
184 unsigned long flags, 184 unsigned long flags,
185 int pc) 185 int pc)
@@ -246,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 246 return trace_graph_entry(trace);
247} 247}
248 248
249static void __trace_graph_return(struct trace_array *tr, 249void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
251 unsigned long flags, 251 unsigned long flags,
252 int pc) 252 int pc)
@@ -490,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
490 * We need to consume the current entry to see 490 * We need to consume the current entry to see
491 * the next one. 491 * the next one.
492 */ 492 */
493 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
494 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
495 NULL); 496 NULL, NULL);
496 } 497 }
497 498
498 if (!event) 499 if (!event)
@@ -506,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter,
506 * if the output fails. 507 * if the output fails.
507 */ 508 */
508 data->ent = *curr; 509 data->ent = *curr;
509 data->ret = *next; 510 /*
511 * If the next event is not a return type, then
512 * we only care about what type it is. Otherwise we can
513 * safely copy the entire event.
514 */
515 if (next->ent.type == TRACE_GRAPH_RET)
516 data->ret = *next;
517 else
518 data->ret.ent.type = next->ent.type;
510 } 519 }
511 } 520 }
512 521
@@ -526,17 +535,18 @@ get_return_for_leaf(struct trace_iterator *iter,
526 535
527/* Signal a overhead of time execution to the output */ 536/* Signal a overhead of time execution to the output */
528static int 537static int
529print_graph_overhead(unsigned long long duration, struct trace_seq *s) 538print_graph_overhead(unsigned long long duration, struct trace_seq *s,
539 u32 flags)
530{ 540{
531 /* If duration disappear, we don't need anything */ 541 /* If duration disappear, we don't need anything */
532 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 542 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
533 return 1; 543 return 1;
534 544
535 /* Non nested entry or return */ 545 /* Non nested entry or return */
536 if (duration == -1) 546 if (duration == -1)
537 return trace_seq_printf(s, " "); 547 return trace_seq_printf(s, " ");
538 548
539 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 549 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
540 /* Duration exceeded 100 msecs */ 550 /* Duration exceeded 100 msecs */
541 if (duration > 100000ULL) 551 if (duration > 100000ULL)
542 return trace_seq_printf(s, "! "); 552 return trace_seq_printf(s, "! ");
@@ -562,7 +572,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
562 572
563static enum print_line_t 573static enum print_line_t
564print_graph_irq(struct trace_iterator *iter, unsigned long addr, 574print_graph_irq(struct trace_iterator *iter, unsigned long addr,
565 enum trace_type type, int cpu, pid_t pid) 575 enum trace_type type, int cpu, pid_t pid, u32 flags)
566{ 576{
567 int ret; 577 int ret;
568 struct trace_seq *s = &iter->seq; 578 struct trace_seq *s = &iter->seq;
@@ -572,21 +582,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
572 return TRACE_TYPE_UNHANDLED; 582 return TRACE_TYPE_UNHANDLED;
573 583
574 /* Absolute time */ 584 /* Absolute time */
575 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 585 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
576 ret = print_graph_abs_time(iter->ts, s); 586 ret = print_graph_abs_time(iter->ts, s);
577 if (!ret) 587 if (!ret)
578 return TRACE_TYPE_PARTIAL_LINE; 588 return TRACE_TYPE_PARTIAL_LINE;
579 } 589 }
580 590
581 /* Cpu */ 591 /* Cpu */
582 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 592 if (flags & TRACE_GRAPH_PRINT_CPU) {
583 ret = print_graph_cpu(s, cpu); 593 ret = print_graph_cpu(s, cpu);
584 if (ret == TRACE_TYPE_PARTIAL_LINE) 594 if (ret == TRACE_TYPE_PARTIAL_LINE)
585 return TRACE_TYPE_PARTIAL_LINE; 595 return TRACE_TYPE_PARTIAL_LINE;
586 } 596 }
587 597
588 /* Proc */ 598 /* Proc */
589 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 599 if (flags & TRACE_GRAPH_PRINT_PROC) {
590 ret = print_graph_proc(s, pid); 600 ret = print_graph_proc(s, pid);
591 if (ret == TRACE_TYPE_PARTIAL_LINE) 601 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE; 602 return TRACE_TYPE_PARTIAL_LINE;
@@ -596,7 +606,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
596 } 606 }
597 607
598 /* No overhead */ 608 /* No overhead */
599 ret = print_graph_overhead(-1, s); 609 ret = print_graph_overhead(-1, s, flags);
600 if (!ret) 610 if (!ret)
601 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
602 612
@@ -609,7 +619,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
609 return TRACE_TYPE_PARTIAL_LINE; 619 return TRACE_TYPE_PARTIAL_LINE;
610 620
611 /* Don't close the duration column if haven't one */ 621 /* Don't close the duration column if haven't one */
612 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 622 if (flags & TRACE_GRAPH_PRINT_DURATION)
613 trace_seq_printf(s, " |"); 623 trace_seq_printf(s, " |");
614 ret = trace_seq_printf(s, "\n"); 624 ret = trace_seq_printf(s, "\n");
615 625
@@ -639,7 +649,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
639 649
640 /* Print nsecs (we don't want to exceed 7 numbers) */ 650 /* Print nsecs (we don't want to exceed 7 numbers) */
641 if (len < 7) { 651 if (len < 7) {
642 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); 652 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
653 nsecs_rem);
643 ret = trace_seq_printf(s, ".%s", nsecs_str); 654 ret = trace_seq_printf(s, ".%s", nsecs_str);
644 if (!ret) 655 if (!ret)
645 return TRACE_TYPE_PARTIAL_LINE; 656 return TRACE_TYPE_PARTIAL_LINE;
@@ -679,7 +690,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
679static enum print_line_t 690static enum print_line_t
680print_graph_entry_leaf(struct trace_iterator *iter, 691print_graph_entry_leaf(struct trace_iterator *iter,
681 struct ftrace_graph_ent_entry *entry, 692 struct ftrace_graph_ent_entry *entry,
682 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 693 struct ftrace_graph_ret_entry *ret_entry,
694 struct trace_seq *s, u32 flags)
683{ 695{
684 struct fgraph_data *data = iter->private; 696 struct fgraph_data *data = iter->private;
685 struct ftrace_graph_ret *graph_ret; 697 struct ftrace_graph_ret *graph_ret;
@@ -711,12 +723,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
711 } 723 }
712 724
713 /* Overhead */ 725 /* Overhead */
714 ret = print_graph_overhead(duration, s); 726 ret = print_graph_overhead(duration, s, flags);
715 if (!ret) 727 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 728 return TRACE_TYPE_PARTIAL_LINE;
717 729
718 /* Duration */ 730 /* Duration */
719 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 731 if (flags & TRACE_GRAPH_PRINT_DURATION) {
720 ret = print_graph_duration(duration, s); 732 ret = print_graph_duration(duration, s);
721 if (ret == TRACE_TYPE_PARTIAL_LINE) 733 if (ret == TRACE_TYPE_PARTIAL_LINE)
722 return TRACE_TYPE_PARTIAL_LINE; 734 return TRACE_TYPE_PARTIAL_LINE;
@@ -739,7 +751,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
739static enum print_line_t 751static enum print_line_t
740print_graph_entry_nested(struct trace_iterator *iter, 752print_graph_entry_nested(struct trace_iterator *iter,
741 struct ftrace_graph_ent_entry *entry, 753 struct ftrace_graph_ent_entry *entry,
742 struct trace_seq *s, int cpu) 754 struct trace_seq *s, int cpu, u32 flags)
743{ 755{
744 struct ftrace_graph_ent *call = &entry->graph_ent; 756 struct ftrace_graph_ent *call = &entry->graph_ent;
745 struct fgraph_data *data = iter->private; 757 struct fgraph_data *data = iter->private;
@@ -759,12 +771,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
759 } 771 }
760 772
761 /* No overhead */ 773 /* No overhead */
762 ret = print_graph_overhead(-1, s); 774 ret = print_graph_overhead(-1, s, flags);
763 if (!ret) 775 if (!ret)
764 return TRACE_TYPE_PARTIAL_LINE; 776 return TRACE_TYPE_PARTIAL_LINE;
765 777
766 /* No time */ 778 /* No time */
767 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 779 if (flags & TRACE_GRAPH_PRINT_DURATION) {
768 ret = trace_seq_printf(s, " | "); 780 ret = trace_seq_printf(s, " | ");
769 if (!ret) 781 if (!ret)
770 return TRACE_TYPE_PARTIAL_LINE; 782 return TRACE_TYPE_PARTIAL_LINE;
@@ -790,7 +802,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
790 802
791static enum print_line_t 803static enum print_line_t
792print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 804print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
793 int type, unsigned long addr) 805 int type, unsigned long addr, u32 flags)
794{ 806{
795 struct fgraph_data *data = iter->private; 807 struct fgraph_data *data = iter->private;
796 struct trace_entry *ent = iter->ent; 808 struct trace_entry *ent = iter->ent;
@@ -803,27 +815,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
803 815
804 if (type) { 816 if (type) {
805 /* Interrupt */ 817 /* Interrupt */
806 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 818 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
807 if (ret == TRACE_TYPE_PARTIAL_LINE) 819 if (ret == TRACE_TYPE_PARTIAL_LINE)
808 return TRACE_TYPE_PARTIAL_LINE; 820 return TRACE_TYPE_PARTIAL_LINE;
809 } 821 }
810 822
811 /* Absolute time */ 823 /* Absolute time */
812 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 824 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
813 ret = print_graph_abs_time(iter->ts, s); 825 ret = print_graph_abs_time(iter->ts, s);
814 if (!ret) 826 if (!ret)
815 return TRACE_TYPE_PARTIAL_LINE; 827 return TRACE_TYPE_PARTIAL_LINE;
816 } 828 }
817 829
818 /* Cpu */ 830 /* Cpu */
819 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 831 if (flags & TRACE_GRAPH_PRINT_CPU) {
820 ret = print_graph_cpu(s, cpu); 832 ret = print_graph_cpu(s, cpu);
821 if (ret == TRACE_TYPE_PARTIAL_LINE) 833 if (ret == TRACE_TYPE_PARTIAL_LINE)
822 return TRACE_TYPE_PARTIAL_LINE; 834 return TRACE_TYPE_PARTIAL_LINE;
823 } 835 }
824 836
825 /* Proc */ 837 /* Proc */
826 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 838 if (flags & TRACE_GRAPH_PRINT_PROC) {
827 ret = print_graph_proc(s, ent->pid); 839 ret = print_graph_proc(s, ent->pid);
828 if (ret == TRACE_TYPE_PARTIAL_LINE) 840 if (ret == TRACE_TYPE_PARTIAL_LINE)
829 return TRACE_TYPE_PARTIAL_LINE; 841 return TRACE_TYPE_PARTIAL_LINE;
@@ -845,7 +857,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
845 857
846static enum print_line_t 858static enum print_line_t
847print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 859print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
848 struct trace_iterator *iter) 860 struct trace_iterator *iter, u32 flags)
849{ 861{
850 struct fgraph_data *data = iter->private; 862 struct fgraph_data *data = iter->private;
851 struct ftrace_graph_ent *call = &field->graph_ent; 863 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -853,14 +865,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
853 static enum print_line_t ret; 865 static enum print_line_t ret;
854 int cpu = iter->cpu; 866 int cpu = iter->cpu;
855 867
856 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 868 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
857 return TRACE_TYPE_PARTIAL_LINE; 869 return TRACE_TYPE_PARTIAL_LINE;
858 870
859 leaf_ret = get_return_for_leaf(iter, field); 871 leaf_ret = get_return_for_leaf(iter, field);
860 if (leaf_ret) 872 if (leaf_ret)
861 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 873 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
862 else 874 else
863 ret = print_graph_entry_nested(iter, field, s, cpu); 875 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
864 876
865 if (data) { 877 if (data) {
866 /* 878 /*
@@ -879,7 +891,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
879 891
880static enum print_line_t 892static enum print_line_t
881print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 893print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
882 struct trace_entry *ent, struct trace_iterator *iter) 894 struct trace_entry *ent, struct trace_iterator *iter,
895 u32 flags)
883{ 896{
884 unsigned long long duration = trace->rettime - trace->calltime; 897 unsigned long long duration = trace->rettime - trace->calltime;
885 struct fgraph_data *data = iter->private; 898 struct fgraph_data *data = iter->private;
@@ -909,16 +922,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
909 } 922 }
910 } 923 }
911 924
912 if (print_graph_prologue(iter, s, 0, 0)) 925 if (print_graph_prologue(iter, s, 0, 0, flags))
913 return TRACE_TYPE_PARTIAL_LINE; 926 return TRACE_TYPE_PARTIAL_LINE;
914 927
915 /* Overhead */ 928 /* Overhead */
916 ret = print_graph_overhead(duration, s); 929 ret = print_graph_overhead(duration, s, flags);
917 if (!ret) 930 if (!ret)
918 return TRACE_TYPE_PARTIAL_LINE; 931 return TRACE_TYPE_PARTIAL_LINE;
919 932
920 /* Duration */ 933 /* Duration */
921 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 934 if (flags & TRACE_GRAPH_PRINT_DURATION) {
922 ret = print_graph_duration(duration, s); 935 ret = print_graph_duration(duration, s);
923 if (ret == TRACE_TYPE_PARTIAL_LINE) 936 if (ret == TRACE_TYPE_PARTIAL_LINE)
924 return TRACE_TYPE_PARTIAL_LINE; 937 return TRACE_TYPE_PARTIAL_LINE;
@@ -948,14 +961,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
948 } 961 }
949 962
950 /* Overrun */ 963 /* Overrun */
951 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 964 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
952 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 965 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
953 trace->overrun); 966 trace->overrun);
954 if (!ret) 967 if (!ret)
955 return TRACE_TYPE_PARTIAL_LINE; 968 return TRACE_TYPE_PARTIAL_LINE;
956 } 969 }
957 970
958 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 971 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
972 cpu, pid, flags);
959 if (ret == TRACE_TYPE_PARTIAL_LINE) 973 if (ret == TRACE_TYPE_PARTIAL_LINE)
960 return TRACE_TYPE_PARTIAL_LINE; 974 return TRACE_TYPE_PARTIAL_LINE;
961 975
@@ -963,8 +977,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
963} 977}
964 978
965static enum print_line_t 979static enum print_line_t
966print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 980print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
967 struct trace_iterator *iter) 981 struct trace_iterator *iter, u32 flags)
968{ 982{
969 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 983 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
970 struct fgraph_data *data = iter->private; 984 struct fgraph_data *data = iter->private;
@@ -976,16 +990,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
976 if (data) 990 if (data)
977 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 991 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
978 992
979 if (print_graph_prologue(iter, s, 0, 0)) 993 if (print_graph_prologue(iter, s, 0, 0, flags))
980 return TRACE_TYPE_PARTIAL_LINE; 994 return TRACE_TYPE_PARTIAL_LINE;
981 995
982 /* No overhead */ 996 /* No overhead */
983 ret = print_graph_overhead(-1, s); 997 ret = print_graph_overhead(-1, s, flags);
984 if (!ret) 998 if (!ret)
985 return TRACE_TYPE_PARTIAL_LINE; 999 return TRACE_TYPE_PARTIAL_LINE;
986 1000
987 /* No time */ 1001 /* No time */
988 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 1002 if (flags & TRACE_GRAPH_PRINT_DURATION) {
989 ret = trace_seq_printf(s, " | "); 1003 ret = trace_seq_printf(s, " | ");
990 if (!ret) 1004 if (!ret)
991 return TRACE_TYPE_PARTIAL_LINE; 1005 return TRACE_TYPE_PARTIAL_LINE;
@@ -1020,7 +1034,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1020 if (!event) 1034 if (!event)
1021 return TRACE_TYPE_UNHANDLED; 1035 return TRACE_TYPE_UNHANDLED;
1022 1036
1023 ret = event->trace(iter, sym_flags); 1037 ret = event->funcs->trace(iter, sym_flags, event);
1024 if (ret != TRACE_TYPE_HANDLED) 1038 if (ret != TRACE_TYPE_HANDLED)
1025 return ret; 1039 return ret;
1026 } 1040 }
@@ -1040,7 +1054,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1040 1054
1041 1055
1042enum print_line_t 1056enum print_line_t
1043print_graph_function(struct trace_iterator *iter) 1057print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1044{ 1058{
1045 struct ftrace_graph_ent_entry *field; 1059 struct ftrace_graph_ent_entry *field;
1046 struct fgraph_data *data = iter->private; 1060 struct fgraph_data *data = iter->private;
@@ -1061,7 +1075,7 @@ print_graph_function(struct trace_iterator *iter)
1061 if (data && data->failed) { 1075 if (data && data->failed) {
1062 field = &data->ent; 1076 field = &data->ent;
1063 iter->cpu = data->cpu; 1077 iter->cpu = data->cpu;
1064 ret = print_graph_entry(field, s, iter); 1078 ret = print_graph_entry(field, s, iter, flags);
1065 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1079 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1066 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1080 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1067 ret = TRACE_TYPE_NO_CONSUME; 1081 ret = TRACE_TYPE_NO_CONSUME;
@@ -1081,32 +1095,50 @@ print_graph_function(struct trace_iterator *iter)
1081 struct ftrace_graph_ent_entry saved; 1095 struct ftrace_graph_ent_entry saved;
1082 trace_assign_type(field, entry); 1096 trace_assign_type(field, entry);
1083 saved = *field; 1097 saved = *field;
1084 return print_graph_entry(&saved, s, iter); 1098 return print_graph_entry(&saved, s, iter, flags);
1085 } 1099 }
1086 case TRACE_GRAPH_RET: { 1100 case TRACE_GRAPH_RET: {
1087 struct ftrace_graph_ret_entry *field; 1101 struct ftrace_graph_ret_entry *field;
1088 trace_assign_type(field, entry); 1102 trace_assign_type(field, entry);
1089 return print_graph_return(&field->ret, s, entry, iter); 1103 return print_graph_return(&field->ret, s, entry, iter, flags);
1090 } 1104 }
1105 case TRACE_STACK:
1106 case TRACE_FN:
 1107 /* don't trace stack and functions as comments */
1108 return TRACE_TYPE_UNHANDLED;
1109
1091 default: 1110 default:
1092 return print_graph_comment(s, entry, iter); 1111 return print_graph_comment(s, entry, iter, flags);
1093 } 1112 }
1094 1113
1095 return TRACE_TYPE_HANDLED; 1114 return TRACE_TYPE_HANDLED;
1096} 1115}
1097 1116
1098static void print_lat_header(struct seq_file *s) 1117static enum print_line_t
1118print_graph_function(struct trace_iterator *iter)
1119{
1120 return print_graph_function_flags(iter, tracer_flags.val);
1121}
1122
1123static enum print_line_t
1124print_graph_function_event(struct trace_iterator *iter, int flags,
1125 struct trace_event *event)
1126{
1127 return print_graph_function(iter);
1128}
1129
1130static void print_lat_header(struct seq_file *s, u32 flags)
1099{ 1131{
1100 static const char spaces[] = " " /* 16 spaces */ 1132 static const char spaces[] = " " /* 16 spaces */
1101 " " /* 4 spaces */ 1133 " " /* 4 spaces */
1102 " "; /* 17 spaces */ 1134 " "; /* 17 spaces */
1103 int size = 0; 1135 int size = 0;
1104 1136
1105 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1137 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1106 size += 16; 1138 size += 16;
1107 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1139 if (flags & TRACE_GRAPH_PRINT_CPU)
1108 size += 4; 1140 size += 4;
1109 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1141 if (flags & TRACE_GRAPH_PRINT_PROC)
1110 size += 17; 1142 size += 17;
1111 1143
1112 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1144 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1117,43 +1149,48 @@ static void print_lat_header(struct seq_file *s)
1117 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1149 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1118} 1150}
1119 1151
1120static void print_graph_headers(struct seq_file *s) 1152void print_graph_headers_flags(struct seq_file *s, u32 flags)
1121{ 1153{
1122 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1154 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1123 1155
1124 if (lat) 1156 if (lat)
1125 print_lat_header(s); 1157 print_lat_header(s, flags);
1126 1158
1127 /* 1st line */ 1159 /* 1st line */
1128 seq_printf(s, "#"); 1160 seq_printf(s, "#");
1129 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1161 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1130 seq_printf(s, " TIME "); 1162 seq_printf(s, " TIME ");
1131 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1163 if (flags & TRACE_GRAPH_PRINT_CPU)
1132 seq_printf(s, " CPU"); 1164 seq_printf(s, " CPU");
1133 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1165 if (flags & TRACE_GRAPH_PRINT_PROC)
1134 seq_printf(s, " TASK/PID "); 1166 seq_printf(s, " TASK/PID ");
1135 if (lat) 1167 if (lat)
1136 seq_printf(s, "|||||"); 1168 seq_printf(s, "|||||");
1137 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1169 if (flags & TRACE_GRAPH_PRINT_DURATION)
1138 seq_printf(s, " DURATION "); 1170 seq_printf(s, " DURATION ");
1139 seq_printf(s, " FUNCTION CALLS\n"); 1171 seq_printf(s, " FUNCTION CALLS\n");
1140 1172
1141 /* 2nd line */ 1173 /* 2nd line */
1142 seq_printf(s, "#"); 1174 seq_printf(s, "#");
1143 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1175 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1144 seq_printf(s, " | "); 1176 seq_printf(s, " | ");
1145 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1177 if (flags & TRACE_GRAPH_PRINT_CPU)
1146 seq_printf(s, " | "); 1178 seq_printf(s, " | ");
1147 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1179 if (flags & TRACE_GRAPH_PRINT_PROC)
1148 seq_printf(s, " | | "); 1180 seq_printf(s, " | | ");
1149 if (lat) 1181 if (lat)
1150 seq_printf(s, "|||||"); 1182 seq_printf(s, "|||||");
1151 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1183 if (flags & TRACE_GRAPH_PRINT_DURATION)
1152 seq_printf(s, " | | "); 1184 seq_printf(s, " | | ");
1153 seq_printf(s, " | | | |\n"); 1185 seq_printf(s, " | | | |\n");
1154} 1186}
1155 1187
1156static void graph_trace_open(struct trace_iterator *iter) 1188void print_graph_headers(struct seq_file *s)
1189{
1190 print_graph_headers_flags(s, tracer_flags.val);
1191}
1192
1193void graph_trace_open(struct trace_iterator *iter)
1157{ 1194{
1158 /* pid and depth on the last trace processed */ 1195 /* pid and depth on the last trace processed */
1159 struct fgraph_data *data; 1196 struct fgraph_data *data;
@@ -1188,7 +1225,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1188 pr_warning("function graph tracer: not enough memory\n"); 1225 pr_warning("function graph tracer: not enough memory\n");
1189} 1226}
1190 1227
1191static void graph_trace_close(struct trace_iterator *iter) 1228void graph_trace_close(struct trace_iterator *iter)
1192{ 1229{
1193 struct fgraph_data *data = iter->private; 1230 struct fgraph_data *data = iter->private;
1194 1231
@@ -1198,6 +1235,20 @@ static void graph_trace_close(struct trace_iterator *iter)
1198 } 1235 }
1199} 1236}
1200 1237
1238static struct trace_event_functions graph_functions = {
1239 .trace = print_graph_function_event,
1240};
1241
1242static struct trace_event graph_trace_entry_event = {
1243 .type = TRACE_GRAPH_ENT,
1244 .funcs = &graph_functions,
1245};
1246
1247static struct trace_event graph_trace_ret_event = {
1248 .type = TRACE_GRAPH_RET,
1249 .funcs = &graph_functions
1250};
1251
1201static struct tracer graph_trace __read_mostly = { 1252static struct tracer graph_trace __read_mostly = {
1202 .name = "function_graph", 1253 .name = "function_graph",
1203 .open = graph_trace_open, 1254 .open = graph_trace_open,
@@ -1219,6 +1270,16 @@ static __init int init_graph_trace(void)
1219{ 1270{
1220 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1271 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1221 1272
1273 if (!register_ftrace_event(&graph_trace_entry_event)) {
1274 pr_warning("Warning: could not register graph trace events\n");
1275 return 1;
1276 }
1277
1278 if (!register_ftrace_event(&graph_trace_ret_event)) {
1279 pr_warning("Warning: could not register graph trace events\n");
1280 return 1;
1281 }
1282
1222 return register_tracer(&graph_trace); 1283 return register_tracer(&graph_trace);
1223} 1284}
1224 1285
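
The graph output path above now takes an explicit u32 flags argument instead of reading the tracer's own tracer_flags.val, which is what allows other tracers to reuse print_graph_function_flags() and print_graph_headers_flags() with their own option bits. As a rough userspace sketch of that idea, the snippet below drives a header printer from a flags word; the DEMO_* bit values are invented for illustration and are not the kernel's TRACE_GRAPH_PRINT_* constants.

/* Standalone illustration of flag-driven column selection, modelled on
 * print_graph_headers_flags(). Bit values are arbitrary for this demo. */
#include <stdio.h>

#define DEMO_PRINT_ABS_TIME  0x1
#define DEMO_PRINT_CPU       0x2
#define DEMO_PRINT_PROC      0x4
#define DEMO_PRINT_DURATION  0x8

static void demo_print_headers(unsigned int flags)
{
	printf("#");
	if (flags & DEMO_PRINT_ABS_TIME)
		printf("     TIME       ");
	if (flags & DEMO_PRINT_CPU)
		printf(" CPU");
	if (flags & DEMO_PRINT_PROC)
		printf("  TASK/PID       ");
	if (flags & DEMO_PRINT_DURATION)
		printf("  DURATION   ");
	printf("               FUNCTION CALLS\n");
}

int main(void)
{
	/* The plain function_graph tracer would pass its own option bits;
	 * a latency tracer can pass a different combination. */
	demo_print_headers(DEMO_PRINT_CPU | DEMO_PRINT_DURATION);
	demo_print_headers(DEMO_PRINT_ABS_TIME | DEMO_PRINT_CPU | DEMO_PRINT_PROC);
	return 0;
}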
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f5..000000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
1/*
2 * h/w branch tracer for x86 based on BTS
3 *
4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */
7#include <linux/kallsyms.h>
8#include <linux/debugfs.h>
9#include <linux/ftrace.h>
10#include <linux/module.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/fs.h>
14
15#include <asm/ds.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20
21#define BTS_BUFFER_SIZE (1 << 13)
22
23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25
26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27
28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
30static struct trace_array *hw_branch_trace __read_mostly;
31
32
33static void bts_trace_init_cpu(int cpu)
34{
35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
39
40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
41 per_cpu(hwb_tracer, cpu) = NULL;
42}
43
44static int bts_trace_init(struct trace_array *tr)
45{
46 int cpu;
47
48 hw_branch_trace = tr;
49 trace_hw_branches_enabled = 0;
50
51 get_online_cpus();
52 for_each_online_cpu(cpu) {
53 bts_trace_init_cpu(cpu);
54
55 if (likely(per_cpu(hwb_tracer, cpu)))
56 trace_hw_branches_enabled = 1;
57 }
58 trace_hw_branches_suspended = 0;
59 put_online_cpus();
60
61 /* If we could not enable tracing on a single cpu, we fail. */
62 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
63}
64
65static void bts_trace_reset(struct trace_array *tr)
66{
67 int cpu;
68
69 get_online_cpus();
70 for_each_online_cpu(cpu) {
71 if (likely(per_cpu(hwb_tracer, cpu))) {
72 ds_release_bts(per_cpu(hwb_tracer, cpu));
73 per_cpu(hwb_tracer, cpu) = NULL;
74 }
75 }
76 trace_hw_branches_enabled = 0;
77 trace_hw_branches_suspended = 0;
78 put_online_cpus();
79}
80
81static void bts_trace_start(struct trace_array *tr)
82{
83 int cpu;
84
85 get_online_cpus();
86 for_each_online_cpu(cpu)
87 if (likely(per_cpu(hwb_tracer, cpu)))
88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
89 trace_hw_branches_suspended = 0;
90 put_online_cpus();
91}
92
93static void bts_trace_stop(struct trace_array *tr)
94{
95 int cpu;
96
97 get_online_cpus();
98 for_each_online_cpu(cpu)
99 if (likely(per_cpu(hwb_tracer, cpu)))
100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
101 trace_hw_branches_suspended = 1;
102 put_online_cpus();
103}
104
105static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
106 unsigned long action, void *hcpu)
107{
108 int cpu = (long)hcpu;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 /* The notification is sent with interrupts enabled. */
114 if (trace_hw_branches_enabled) {
115 bts_trace_init_cpu(cpu);
116
117 if (trace_hw_branches_suspended &&
118 likely(per_cpu(hwb_tracer, cpu)))
119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
120 }
121 break;
122
123 case CPU_DOWN_PREPARE:
124 /* The notification is sent with interrupts enabled. */
125 if (likely(per_cpu(hwb_tracer, cpu))) {
126 ds_release_bts(per_cpu(hwb_tracer, cpu));
127 per_cpu(hwb_tracer, cpu) = NULL;
128 }
129 }
130
131 return NOTIFY_DONE;
132}
133
134static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
135 .notifier_call = bts_hotcpu_handler
136};
137
138static void bts_trace_print_header(struct seq_file *m)
139{
140 seq_puts(m, "# CPU# TO <- FROM\n");
141}
142
143static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
144{
145 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
146 struct trace_entry *entry = iter->ent;
147 struct trace_seq *seq = &iter->seq;
148 struct hw_branch_entry *it;
149
150 trace_assign_type(it, entry);
151
152 if (entry->type == TRACE_HW_BRANCHES) {
153 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
154 seq_print_ip_sym(seq, it->to, symflags) &&
155 trace_seq_printf(seq, "\t <- ") &&
156 seq_print_ip_sym(seq, it->from, symflags) &&
157 trace_seq_printf(seq, "\n"))
158 return TRACE_TYPE_HANDLED;
159 return TRACE_TYPE_PARTIAL_LINE;
160 }
161 return TRACE_TYPE_UNHANDLED;
162}
163
164void trace_hw_branch(u64 from, u64 to)
165{
166 struct ftrace_event_call *call = &event_hw_branch;
167 struct trace_array *tr = hw_branch_trace;
168 struct ring_buffer_event *event;
169 struct ring_buffer *buf;
170 struct hw_branch_entry *entry;
171 unsigned long irq1;
172 int cpu;
173
174 if (unlikely(!tr))
175 return;
176
177 if (unlikely(!trace_hw_branches_enabled))
178 return;
179
180 local_irq_save(irq1);
181 cpu = raw_smp_processor_id();
182 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
183 goto out;
184
185 buf = tr->buffer;
186 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
187 sizeof(*entry), 0, 0);
188 if (!event)
189 goto out;
190 entry = ring_buffer_event_data(event);
191 tracing_generic_entry_update(&entry->ent, 0, from);
192 entry->ent.type = TRACE_HW_BRANCHES;
193 entry->from = from;
194 entry->to = to;
195 if (!filter_check_discard(call, entry, buf, event))
196 trace_buffer_unlock_commit(buf, event, 0, 0);
197
198 out:
199 atomic_dec(&tr->data[cpu]->disabled);
200 local_irq_restore(irq1);
201}
202
203static void trace_bts_at(const struct bts_trace *trace, void *at)
204{
205 struct bts_struct bts;
206 int err = 0;
207
208 WARN_ON_ONCE(!trace->read);
209 if (!trace->read)
210 return;
211
212 err = trace->read(this_tracer, at, &bts);
213 if (err < 0)
214 return;
215
216 switch (bts.qualifier) {
217 case BTS_BRANCH:
218 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
219 break;
220 }
221}
222
223/*
224 * Collect the trace on the current cpu and write it into the ftrace buffer.
225 *
226 * pre: tracing must be suspended on the current cpu
227 */
228static void trace_bts_cpu(void *arg)
229{
230 struct trace_array *tr = (struct trace_array *)arg;
231 const struct bts_trace *trace;
232 unsigned char *at;
233
234 if (unlikely(!tr))
235 return;
236
237 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
238 return;
239
240 if (unlikely(!this_tracer))
241 return;
242
243 trace = ds_read_bts(this_tracer);
244 if (!trace)
245 return;
246
247 for (at = trace->ds.top; (void *)at < trace->ds.end;
248 at += trace->ds.size)
249 trace_bts_at(trace, at);
250
251 for (at = trace->ds.begin; (void *)at < trace->ds.top;
252 at += trace->ds.size)
253 trace_bts_at(trace, at);
254}
255
256static void trace_bts_prepare(struct trace_iterator *iter)
257{
258 int cpu;
259
260 get_online_cpus();
261 for_each_online_cpu(cpu)
262 if (likely(per_cpu(hwb_tracer, cpu)))
263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
264 /*
265 * We need to collect the trace on the respective cpu since ftrace
266 * implicitly adds the record for the current cpu.
267 * Once that is more flexible, we could collect the data from any cpu.
268 */
269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
270
271 for_each_online_cpu(cpu)
272 if (likely(per_cpu(hwb_tracer, cpu)))
273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
274 put_online_cpus();
275}
276
277static void trace_bts_close(struct trace_iterator *iter)
278{
279 tracing_reset_online_cpus(iter->tr);
280}
281
282void trace_hw_branch_oops(void)
283{
284 if (this_tracer) {
285 ds_suspend_bts_noirq(this_tracer);
286 trace_bts_cpu(hw_branch_trace);
287 ds_resume_bts_noirq(this_tracer);
288 }
289}
290
291struct tracer bts_tracer __read_mostly =
292{
293 .name = "hw-branch-tracer",
294 .init = bts_trace_init,
295 .reset = bts_trace_reset,
296 .print_header = bts_trace_print_header,
297 .print_line = bts_trace_print_line,
298 .start = bts_trace_start,
299 .stop = bts_trace_stop,
300 .open = trace_bts_prepare,
301 .close = trace_bts_close,
302#ifdef CONFIG_FTRACE_SELFTEST
303 .selftest = trace_selftest_startup_hw_branches,
304#endif /* CONFIG_FTRACE_SELFTEST */
305};
306
307__init static int init_bts_trace(void)
308{
309 register_hotcpu_notifier(&bts_hotcpu_notifier);
310 return register_tracer(&bts_tracer);
311}
312device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..73a6b0601f2e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,16 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
652 .use_max_tr = 1,
415}; 653};
416# define register_irqsoff(trace) register_tracer(&trace) 654# define register_irqsoff(trace) register_tracer(&trace)
417#else 655#else
@@ -435,9 +673,16 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 673 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 674 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 675 .print_max = 1,
676 .print_header = irqsoff_print_header,
677 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags,
679 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 680#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 681 .selftest = trace_selftest_startup_preemptoff,
440#endif 682#endif
683 .open = irqsoff_trace_open,
684 .close = irqsoff_trace_close,
685 .use_max_tr = 1,
441}; 686};
442# define register_preemptoff(trace) register_tracer(&trace) 687# define register_preemptoff(trace) register_tracer(&trace)
443#else 688#else
@@ -463,9 +708,16 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 708 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 709 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 710 .print_max = 1,
711 .print_header = irqsoff_print_header,
712 .print_line = irqsoff_print_line,
713 .flags = &tracer_flags,
714 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 715#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 716 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 717#endif
718 .open = irqsoff_trace_open,
719 .close = irqsoff_trace_close,
720 .use_max_tr = 1,
469}; 721};
470 722
471# define register_preemptirqsoff(trace) register_tracer(&trace) 723# define register_preemptirqsoff(trace) register_tracer(&trace)
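
In the irqsoff changes above, start_irqsoff_tracer()/stop_irqsoff_tracer() now take a graph argument and register either the plain function-trace ops or the graph entry/return callbacks, and the start path reports failure instead of continuing silently. A minimal standalone sketch of that dispatch pattern, using stand-in register functions rather than the real register_ftrace_function()/register_ftrace_graph() calls, might look like this:

/* Sketch of the start/stop dispatch added to the irqsoff tracer: one
 * "graph" switch decides which backend gets (un)registered. The
 * register_* functions below are stand-ins, not kernel APIs. */
#include <stdio.h>

static int register_function_backend(void)    { puts("function backend on");  return 0; }
static void unregister_function_backend(void) { puts("function backend off"); }
static int register_graph_backend(void)       { puts("graph backend on");     return 0; }
static void unregister_graph_backend(void)    { puts("graph backend off"); }

static int tracer_enabled;

static int demo_start_tracer(int graph)
{
	int ret = graph ? register_graph_backend()
			: register_function_backend();

	/* Only flag the tracer as enabled if registration succeeded. */
	tracer_enabled = (ret == 0);
	return ret;
}

static void demo_stop_tracer(int graph)
{
	tracer_enabled = 0;
	if (graph)
		unregister_graph_backend();
	else
		unregister_function_backend();
}

int main(void)
{
	/* Toggling the display-graph option amounts to stopping one
	 * backend and starting the other, as irqsoff_set_flag() does. */
	demo_start_tracer(0);
	printf("enabled=%d\n", tracer_enabled);
	demo_stop_tracer(0);
	demo_start_tracer(1);
	demo_stop_tracer(1);
	return 0;
}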
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..7b8ecd751d93
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,136 @@
1/*
2 * kdb helper for dumping the ftrace buffer
3 *
4 * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
5 *
6 * ftrace_dump_buf based on ftrace_dump:
7 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
8 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
9 *
10 */
11#include <linux/init.h>
12#include <linux/kgdb.h>
13#include <linux/kdb.h>
14#include <linux/ftrace.h>
15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h"
18#include "trace_output.h"
19
20static void ftrace_dump_buf(int skip_lines, long cpu_file)
21{
22 /* use static because iter can be a bit big for the stack */
23 static struct trace_iterator iter;
24 unsigned int old_userobj;
25 int cnt = 0, cpu;
26
27 trace_init_global_iter(&iter);
28
29 for_each_tracing_cpu(cpu) {
30 atomic_inc(&iter.tr->data[cpu]->disabled);
31 }
32
33 old_userobj = trace_flags;
34
35 /* don't look at user memory in panic mode */
36 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
37
38 kdb_printf("Dumping ftrace buffer:\n");
39
40 /* reset all but tr, trace, and overruns */
41 memset(&iter.seq, 0,
42 sizeof(struct trace_iterator) -
43 offsetof(struct trace_iterator, seq));
44 iter.iter_flags |= TRACE_FILE_LAT_FMT;
45 iter.pos = -1;
46
47 if (cpu_file == TRACE_PIPE_ALL_CPU) {
48 for_each_tracing_cpu(cpu) {
49 iter.buffer_iter[cpu] =
50 ring_buffer_read_prepare(iter.tr->buffer, cpu);
51 ring_buffer_read_start(iter.buffer_iter[cpu]);
52 tracing_iter_reset(&iter, cpu);
53 }
54 } else {
55 iter.cpu_file = cpu_file;
56 iter.buffer_iter[cpu_file] =
57 ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
58 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
59 tracing_iter_reset(&iter, cpu_file);
60 }
61 if (!trace_empty(&iter))
62 trace_find_next_entry_inc(&iter);
63 while (!trace_empty(&iter)) {
64 if (!cnt)
65 kdb_printf("---------------------------------\n");
66 cnt++;
67
68 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
69 print_trace_line(&iter);
70 if (!skip_lines)
71 trace_printk_seq(&iter.seq);
72 else
73 skip_lines--;
74 if (KDB_FLAG(CMD_INTERRUPT))
75 goto out;
76 }
77
78 if (!cnt)
79 kdb_printf(" (ftrace buffer empty)\n");
80 else
81 kdb_printf("---------------------------------\n");
82
83out:
84 trace_flags = old_userobj;
85
86 for_each_tracing_cpu(cpu) {
87 atomic_dec(&iter.tr->data[cpu]->disabled);
88 }
89
90 for_each_tracing_cpu(cpu)
91 if (iter.buffer_iter[cpu])
92 ring_buffer_read_finish(iter.buffer_iter[cpu]);
93}
94
95/*
96 * kdb_ftdump - Dump the ftrace log buffer
97 */
98static int kdb_ftdump(int argc, const char **argv)
99{
100 int skip_lines = 0;
101 long cpu_file;
102 char *cp;
103
104 if (argc > 2)
105 return KDB_ARGCOUNT;
106
107 if (argc) {
108 skip_lines = simple_strtol(argv[1], &cp, 0);
109 if (*cp)
110 skip_lines = 0;
111 }
112
113 if (argc == 2) {
114 cpu_file = simple_strtol(argv[2], &cp, 0);
115 if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
116 !cpu_online(cpu_file))
117 return KDB_BADINT;
118 } else {
119 cpu_file = TRACE_PIPE_ALL_CPU;
120 }
121
122 kdb_trap_printk++;
123 ftrace_dump_buf(skip_lines, cpu_file);
124 kdb_trap_printk--;
125
126 return 0;
127}
128
129static __init int kdb_ftrace_register(void)
130{
131 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
132 "Dump ftrace log", 0, KDB_REPEAT_NONE);
133 return 0;
134}
135
136late_initcall(kdb_ftrace_register);
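
The new kdb_ftdump() command takes an optional number of lines to skip and an optional cpu, parsed with simple_strtol() and rejected when trailing characters or an out-of-range cpu are found. A userspace approximation of that validation is shown below; the DEMO_* limits are placeholders, not kernel values.

/* Userspace sketch of the argument validation in kdb_ftdump():
 * an optional skip count and an optional cpu number, both rejected
 * if they carry trailing characters or are out of range. */
#include <stdio.h>
#include <stdlib.h>

#define DEMO_NR_CPUS 8
#define DEMO_ALL_CPUS (-1L)

int main(int argc, char **argv)
{
	long skip_lines = 0;
	long cpu = DEMO_ALL_CPUS;
	char *end;

	if (argc > 3) {
		fprintf(stderr, "usage: %s [skip_#lines] [cpu]\n", argv[0]);
		return 1;
	}

	if (argc >= 2) {
		skip_lines = strtol(argv[1], &end, 0);
		if (*end)		/* trailing junk: fall back to 0 */
			skip_lines = 0;
	}

	if (argc == 3) {
		cpu = strtol(argv[2], &end, 0);
		if (*end || cpu < 0 || cpu >= DEMO_NR_CPUS) {
			fprintf(stderr, "bad cpu number\n");
			return 1;
		}
	}

	printf("would dump cpu %ld, skipping %ld lines\n", cpu, skip_lines);
	return 0;
}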
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1251e367bae9..544301d29dee 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,10 @@
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <linux/uaccess.h>
35#include <asm/bitsperlong.h>
32 36
33#include "trace.h" 37#include "trace.h"
34#include "trace_output.h" 38#include "trace_output.h"
@@ -36,11 +40,11 @@
36#define MAX_TRACE_ARGS 128 40#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63 41#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64 42#define MAX_EVENT_NAME_LEN 64
43#define MAX_STRING_SIZE PATH_MAX
39#define KPROBE_EVENT_SYSTEM "kprobes" 44#define KPROBE_EVENT_SYSTEM "kprobes"
40 45
41/* Reserved field names */ 46/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip" 47#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip" 48#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func" 49#define FIELD_STRING_FUNC "__probe_func"
46 50
@@ -52,55 +56,214 @@ const char *reserved_field_names[] = {
52 "common_tgid", 56 "common_tgid",
53 "common_lock_depth", 57 "common_lock_depth",
54 FIELD_STRING_IP, 58 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP, 59 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC, 60 FIELD_STRING_FUNC,
58}; 61};
59 62
60struct fetch_func { 63/* Printing function type */
61 unsigned long (*func)(struct pt_regs *, void *); 64typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
62 void *data; 65 void *);
63}; 66#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
64 67#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
65static __kprobes unsigned long call_fetch(struct fetch_func *f, 68
66 struct pt_regs *regs) 69/* Printing in basic type function template */
70#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
71static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
72 const char *name, \
73 void *data, void *ent)\
74{ \
75 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
76} \
77static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
78
79DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
81DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
82DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
83DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
84DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
85DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
86DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
87
88/* data_rloc: data relative location, compatible with u32 */
89#define make_data_rloc(len, roffs) \
90 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
91#define get_rloc_len(dl) ((u32)(dl) >> 16)
92#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
93
94static inline void *get_rloc_data(u32 *dl)
67{ 95{
68 return f->func(regs, f->data); 96 return (u8 *)dl + get_rloc_offs(*dl);
69} 97}
70 98
71/* fetch handlers */ 99/* For data_loc conversion */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs, 100static inline void *get_loc_data(u32 *dl, void *ent)
73 void *offset)
74{ 101{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset)); 102 return (u8 *)ent + get_rloc_offs(*dl);
76} 103}
77 104
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs, 105/*
79 void *num) 106 * Convert data_rloc to data_loc:
80{ 107 * data_rloc stores the offset from data_rloc itself, but data_loc
81 return regs_get_kernel_stack_nth(regs, 108 * stores the offset from event entry.
82 (unsigned int)((unsigned long)num)); 109 */
83} 110#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
84 111
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) 112/* For defining macros, define string/string_size types */
113typedef u32 string;
114typedef u32 string_size;
115
116/* Print type function for string type */
117static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
118 const char *name,
119 void *data, void *ent)
86{ 120{
87 unsigned long retval; 121 int len = *(u32 *)data >> 16;
88 122
89 if (probe_kernel_address(addr, retval)) 123 if (!len)
90 return 0; 124 return trace_seq_printf(s, " %s=(fault)", name);
91 return retval; 125 else
126 return trace_seq_printf(s, " %s=\"%s\"", name,
127 (const char *)get_loc_data(data, ent));
92} 128}
129static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
93 130
94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 131/* Data fetch function type */
95 void *dummy) 132typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
133
134struct fetch_param {
135 fetch_func_t fn;
136 void *data;
137};
138
139static __kprobes void call_fetch(struct fetch_param *fprm,
140 struct pt_regs *regs, void *dest)
96{ 141{
97 return regs_return_value(regs); 142 return fprm->fn(regs, fprm->data, dest);
98} 143}
99 144
100static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, 145#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
101 void *dummy) 146/*
147 * Define macro for basic types - we don't need to define s* types, because
148 * we have to care only about bitwidth at recording time.
149 */
150#define DEFINE_BASIC_FETCH_FUNCS(method) \
151DEFINE_FETCH_##method(u8) \
152DEFINE_FETCH_##method(u16) \
153DEFINE_FETCH_##method(u32) \
154DEFINE_FETCH_##method(u64)
155
156#define CHECK_FETCH_FUNCS(method, fn) \
157 (((FETCH_FUNC_NAME(method, u8) == fn) || \
158 (FETCH_FUNC_NAME(method, u16) == fn) || \
159 (FETCH_FUNC_NAME(method, u32) == fn) || \
160 (FETCH_FUNC_NAME(method, u64) == fn) || \
161 (FETCH_FUNC_NAME(method, string) == fn) || \
162 (FETCH_FUNC_NAME(method, string_size) == fn)) \
163 && (fn != NULL))
164
165/* Data fetch function templates */
166#define DEFINE_FETCH_reg(type) \
167static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
168 void *offset, void *dest) \
169{ \
170 *(type *)dest = (type)regs_get_register(regs, \
171 (unsigned int)((unsigned long)offset)); \
172}
173DEFINE_BASIC_FETCH_FUNCS(reg)
174/* No string on the register */
175#define fetch_reg_string NULL
176#define fetch_reg_string_size NULL
177
178#define DEFINE_FETCH_stack(type) \
179static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
180 void *offset, void *dest) \
181{ \
182 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
183 (unsigned int)((unsigned long)offset)); \
184}
185DEFINE_BASIC_FETCH_FUNCS(stack)
186/* No string on the stack entry */
187#define fetch_stack_string NULL
188#define fetch_stack_string_size NULL
189
190#define DEFINE_FETCH_retval(type) \
191static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
192 void *dummy, void *dest) \
193{ \
194 *(type *)dest = (type)regs_return_value(regs); \
195}
196DEFINE_BASIC_FETCH_FUNCS(retval)
197/* No string on the retval */
198#define fetch_retval_string NULL
199#define fetch_retval_string_size NULL
200
201#define DEFINE_FETCH_memory(type) \
202static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
203 void *addr, void *dest) \
204{ \
205 type retval; \
206 if (probe_kernel_address(addr, retval)) \
207 *(type *)dest = 0; \
208 else \
209 *(type *)dest = retval; \
210}
211DEFINE_BASIC_FETCH_FUNCS(memory)
212/*
213 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
214 * length and relative data location.
215 */
216static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
217 void *addr, void *dest)
102{ 218{
103 return kernel_stack_pointer(regs); 219 long ret;
220 int maxlen = get_rloc_len(*(u32 *)dest);
221 u8 *dst = get_rloc_data(dest);
222 u8 *src = addr;
223 mm_segment_t old_fs = get_fs();
224 if (!maxlen)
225 return;
226 /*
227 * Try to get string again, since the string can be changed while
228 * probing.
229 */
230 set_fs(KERNEL_DS);
231 pagefault_disable();
232 do
233 ret = __copy_from_user_inatomic(dst++, src++, 1);
234 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
235 dst[-1] = '\0';
236 pagefault_enable();
237 set_fs(old_fs);
238
239 if (ret < 0) { /* Failed to fetch string */
240 ((u8 *)get_rloc_data(dest))[0] = '\0';
241 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
242 } else
243 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
244 get_rloc_offs(*(u32 *)dest));
245}
246/* Return the length of string -- including null terminal byte */
247static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
248 void *addr, void *dest)
249{
250 int ret, len = 0;
251 u8 c;
252 mm_segment_t old_fs = get_fs();
253
254 set_fs(KERNEL_DS);
255 pagefault_disable();
256 do {
257 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
258 len++;
259 } while (c && ret == 0 && len < MAX_STRING_SIZE);
260 pagefault_enable();
261 set_fs(old_fs);
262
263 if (ret < 0) /* Failed to check the length */
264 *(u32 *)dest = 0;
265 else
266 *(u32 *)dest = len;
104} 267}
105 268
106/* Memory fetching by symbol */ 269/* Memory fetching by symbol */
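
The string fetch support added in the hunk above packs a 16-bit length and a 16-bit offset into a single u32 through make_data_rloc()/get_rloc_len()/get_rloc_offs(). The self-contained snippet below copies that encoding (using uint32_t in place of the kernel's u32) so it can be compiled and sanity-checked outside the kernel:

/* Standalone check of the data_rloc packing used above: the upper 16 bits
 * hold the string length, the lower 16 bits the offset of the data. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define make_data_rloc(len, roffs)	\
	(((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
#define get_rloc_len(dl)	((uint32_t)(dl) >> 16)
#define get_rloc_offs(dl)	((uint32_t)(dl) & 0xffff)

int main(void)
{
	uint32_t dl = make_data_rloc(13, 40);	/* 13-byte string at offset 40 */

	assert(get_rloc_len(dl) == 13);
	assert(get_rloc_offs(dl) == 40);
	printf("len=%u offs=%u\n", get_rloc_len(dl), get_rloc_offs(dl));
	return 0;
}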
@@ -145,51 +308,168 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
145 return sc; 308 return sc;
146} 309}
147 310
148static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) 311#define DEFINE_FETCH_symbol(type) \
149{ 312static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
150 struct symbol_cache *sc = data; 313 void *data, void *dest) \
151 314{ \
152 if (sc->addr) 315 struct symbol_cache *sc = data; \
153 return fetch_memory(regs, (void *)sc->addr); 316 if (sc->addr) \
154 else 317 fetch_memory_##type(regs, (void *)sc->addr, dest); \
155 return 0; 318 else \
319 *(type *)dest = 0; \
156} 320}
321DEFINE_BASIC_FETCH_FUNCS(symbol)
322DEFINE_FETCH_symbol(string)
323DEFINE_FETCH_symbol(string_size)
157 324
158/* Special indirect memory access interface */ 325/* Dereference memory access function */
159struct indirect_fetch_data { 326struct deref_fetch_param {
160 struct fetch_func orig; 327 struct fetch_param orig;
161 long offset; 328 long offset;
162}; 329};
163 330
164static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) 331#define DEFINE_FETCH_deref(type) \
165{ 332static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
166 struct indirect_fetch_data *ind = data; 333 void *data, void *dest) \
167 unsigned long addr; 334{ \
168 335 struct deref_fetch_param *dprm = data; \
169 addr = call_fetch(&ind->orig, regs); 336 unsigned long addr; \
170 if (addr) { 337 call_fetch(&dprm->orig, regs, &addr); \
171 addr += ind->offset; 338 if (addr) { \
172 return fetch_memory(regs, (void *)addr); 339 addr += dprm->offset; \
173 } else 340 fetch_memory_##type(regs, (void *)addr, dest); \
174 return 0; 341 } else \
342 *(type *)dest = 0; \
175} 343}
344DEFINE_BASIC_FETCH_FUNCS(deref)
345DEFINE_FETCH_deref(string)
346DEFINE_FETCH_deref(string_size)
176 347
177static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) 348static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
178{ 349{
179 if (data->orig.func == fetch_indirect) 350 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
180 free_indirect_fetch_data(data->orig.data); 351 free_deref_fetch_param(data->orig.data);
181 else if (data->orig.func == fetch_symbol) 352 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
182 free_symbol_cache(data->orig.data); 353 free_symbol_cache(data->orig.data);
183 kfree(data); 354 kfree(data);
184} 355}
185 356
357/* Default (unsigned long) fetch type */
358#define __DEFAULT_FETCH_TYPE(t) u##t
359#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
360#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
361#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
362
363/* Fetch types */
364enum {
365 FETCH_MTD_reg = 0,
366 FETCH_MTD_stack,
367 FETCH_MTD_retval,
368 FETCH_MTD_memory,
369 FETCH_MTD_symbol,
370 FETCH_MTD_deref,
371 FETCH_MTD_END,
372};
373
374#define ASSIGN_FETCH_FUNC(method, type) \
375 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
376
377#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
378 {.name = _name, \
379 .size = _size, \
380 .is_signed = sign, \
381 .print = PRINT_TYPE_FUNC_NAME(ptype), \
382 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
383 .fmttype = _fmttype, \
384 .fetch = { \
385ASSIGN_FETCH_FUNC(reg, ftype), \
386ASSIGN_FETCH_FUNC(stack, ftype), \
387ASSIGN_FETCH_FUNC(retval, ftype), \
388ASSIGN_FETCH_FUNC(memory, ftype), \
389ASSIGN_FETCH_FUNC(symbol, ftype), \
390ASSIGN_FETCH_FUNC(deref, ftype), \
391 } \
392 }
393
394#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
395 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
396
397#define FETCH_TYPE_STRING 0
398#define FETCH_TYPE_STRSIZE 1
399
400/* Fetch type information table */
401static const struct fetch_type {
402 const char *name; /* Name of type */
403 size_t size; /* Byte size of type */
404 int is_signed; /* Signed flag */
405 print_type_func_t print; /* Print functions */
 406 const char *fmt; /* Format string */
407 const char *fmttype; /* Name in format file */
408 /* Fetch functions */
409 fetch_func_t fetch[FETCH_MTD_END];
410} fetch_type_table[] = {
411 /* Special types */
412 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
413 sizeof(u32), 1, "__data_loc char[]"),
414 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
415 string_size, sizeof(u32), 0, "u32"),
416 /* Basic types */
417 ASSIGN_FETCH_TYPE(u8, u8, 0),
418 ASSIGN_FETCH_TYPE(u16, u16, 0),
419 ASSIGN_FETCH_TYPE(u32, u32, 0),
420 ASSIGN_FETCH_TYPE(u64, u64, 0),
421 ASSIGN_FETCH_TYPE(s8, u8, 1),
422 ASSIGN_FETCH_TYPE(s16, u16, 1),
423 ASSIGN_FETCH_TYPE(s32, u32, 1),
424 ASSIGN_FETCH_TYPE(s64, u64, 1),
425};
426
427static const struct fetch_type *find_fetch_type(const char *type)
428{
429 int i;
430
431 if (!type)
432 type = DEFAULT_FETCH_TYPE_STR;
433
434 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
435 if (strcmp(type, fetch_type_table[i].name) == 0)
436 return &fetch_type_table[i];
437 return NULL;
438}
439
440/* Special function : only accept unsigned long */
441static __kprobes void fetch_stack_address(struct pt_regs *regs,
442 void *dummy, void *dest)
443{
444 *(unsigned long *)dest = kernel_stack_pointer(regs);
445}
446
447static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
448 fetch_func_t orig_fn)
449{
450 int i;
451
452 if (type != &fetch_type_table[FETCH_TYPE_STRING])
453 return NULL; /* Only string type needs size function */
454 for (i = 0; i < FETCH_MTD_END; i++)
455 if (type->fetch[i] == orig_fn)
456 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
457
458 WARN_ON(1); /* This should not happen */
459 return NULL;
460}
461
186/** 462/**
187 * Kprobe event core functions 463 * Kprobe event core functions
188 */ 464 */
189 465
190struct probe_arg { 466struct probe_arg {
191 struct fetch_func fetch; 467 struct fetch_param fetch;
192 const char *name; 468 struct fetch_param fetch_size;
469 unsigned int offset; /* Offset from argument entry */
470 const char *name; /* Name of this argument */
471 const char *comm; /* Command of this argument */
472 const struct fetch_type *type; /* Type of this argument */
193}; 473};
194 474
195/* Flags for trace_probe */ 475/* Flags for trace_probe */
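
find_fetch_type() in the hunk above falls back to a default type name when none is given and otherwise scans the static fetch_type_table[] by name. The sketch below reproduces just that lookup pattern with an invented demo_type_table; the entries and the default are illustrative, not the kernel's table.

/* Sketch of the table lookup pattern used by find_fetch_type(): a NULL
 * type name falls back to a default, otherwise the static table is
 * scanned by name. */
#include <stdio.h>
#include <string.h>

struct demo_type {
	const char *name;
	size_t size;
};

static const struct demo_type demo_type_table[] = {
	{ "u8", 1 }, { "u16", 2 }, { "u32", 4 }, { "u64", 8 },
};

#define DEMO_DEFAULT_TYPE "u64"	/* stands in for DEFAULT_FETCH_TYPE_STR */

static const struct demo_type *demo_find_type(const char *type)
{
	size_t i;

	if (!type)
		type = DEMO_DEFAULT_TYPE;

	for (i = 0; i < sizeof(demo_type_table) / sizeof(demo_type_table[0]); i++)
		if (strcmp(type, demo_type_table[i].name) == 0)
			return &demo_type_table[i];
	return NULL;
}

int main(void)
{
	const struct demo_type *t = demo_find_type(NULL);

	printf("default type: %s (%zu bytes)\n", t->name, t->size);
	return 0;
}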
@@ -202,8 +482,9 @@ struct trace_probe {
202 unsigned long nhit; 482 unsigned long nhit;
203 unsigned int flags; /* For TP_FLAG_* */ 483 unsigned int flags; /* For TP_FLAG_* */
204 const char *symbol; /* symbol name */ 484 const char *symbol; /* symbol name */
485 struct ftrace_event_class class;
205 struct ftrace_event_call call; 486 struct ftrace_event_call call;
206 struct trace_event event; 487 ssize_t size; /* trace entry size */
207 unsigned int nr_args; 488 unsigned int nr_args;
208 struct probe_arg args[]; 489 struct probe_arg args[];
209}; 490};
@@ -212,6 +493,7 @@ struct trace_probe {
212 (offsetof(struct trace_probe, args) + \ 493 (offsetof(struct trace_probe, args) + \
213 (sizeof(struct probe_arg) * (n))) 494 (sizeof(struct probe_arg) * (n)))
214 495
496
215static __kprobes int probe_is_return(struct trace_probe *tp) 497static __kprobes int probe_is_return(struct trace_probe *tp)
216{ 498{
217 return tp->rp.handler != NULL; 499 return tp->rp.handler != NULL;
@@ -222,49 +504,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
222 return tp->symbol ? tp->symbol : "unknown"; 504 return tp->symbol ? tp->symbol : "unknown";
223} 505}
224 506
225static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
226{
227 int ret = -EINVAL;
228
229 if (ff->func == fetch_register) {
230 const char *name;
231 name = regs_query_register_name((unsigned int)((long)ff->data));
232 ret = snprintf(buf, n, "%%%s", name);
233 } else if (ff->func == fetch_stack)
234 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
235 else if (ff->func == fetch_memory)
236 ret = snprintf(buf, n, "@0x%p", ff->data);
237 else if (ff->func == fetch_symbol) {
238 struct symbol_cache *sc = ff->data;
239 if (sc->offset)
240 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
241 sc->offset);
242 else
243 ret = snprintf(buf, n, "@%s", sc->symbol);
244 } else if (ff->func == fetch_retvalue)
245 ret = snprintf(buf, n, "$retval");
246 else if (ff->func == fetch_stack_address)
247 ret = snprintf(buf, n, "$stack");
248 else if (ff->func == fetch_indirect) {
249 struct indirect_fetch_data *id = ff->data;
250 size_t l = 0;
251 ret = snprintf(buf, n, "%+ld(", id->offset);
252 if (ret >= n)
253 goto end;
254 l += ret;
255 ret = probe_arg_string(buf + l, n - l, &id->orig);
256 if (ret < 0)
257 goto end;
258 l += ret;
259 ret = snprintf(buf + l, n - l, ")");
260 ret += l;
261 }
262end:
263 if (ret >= n)
264 return -ENOSPC;
265 return ret;
266}
267
268static int register_probe_event(struct trace_probe *tp); 507static int register_probe_event(struct trace_probe *tp);
269static void unregister_probe_event(struct trace_probe *tp); 508static void unregister_probe_event(struct trace_probe *tp);
270 509
@@ -275,8 +514,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
275static int kretprobe_dispatcher(struct kretprobe_instance *ri, 514static int kretprobe_dispatcher(struct kretprobe_instance *ri,
276 struct pt_regs *regs); 515 struct pt_regs *regs);
277 516
278/* Check the name is good for event/group */ 517/* Check the name is good for event/group/fields */
279static int check_event_name(const char *name) 518static int is_good_name(const char *name)
280{ 519{
281 if (!isalpha(*name) && *name != '_') 520 if (!isalpha(*name) && *name != '_')
282 return 0; 521 return 0;
@@ -318,22 +557,23 @@ static struct trace_probe *alloc_trace_probe(const char *group,
318 else 557 else
319 tp->rp.kp.pre_handler = kprobe_dispatcher; 558 tp->rp.kp.pre_handler = kprobe_dispatcher;
320 559
321 if (!event || !check_event_name(event)) { 560 if (!event || !is_good_name(event)) {
322 ret = -EINVAL; 561 ret = -EINVAL;
323 goto error; 562 goto error;
324 } 563 }
325 564
565 tp->call.class = &tp->class;
326 tp->call.name = kstrdup(event, GFP_KERNEL); 566 tp->call.name = kstrdup(event, GFP_KERNEL);
327 if (!tp->call.name) 567 if (!tp->call.name)
328 goto error; 568 goto error;
329 569
330 if (!group || !check_event_name(group)) { 570 if (!group || !is_good_name(group)) {
331 ret = -EINVAL; 571 ret = -EINVAL;
332 goto error; 572 goto error;
333 } 573 }
334 574
335 tp->call.system = kstrdup(group, GFP_KERNEL); 575 tp->class.system = kstrdup(group, GFP_KERNEL);
336 if (!tp->call.system) 576 if (!tp->class.system)
337 goto error; 577 goto error;
338 578
339 INIT_LIST_HEAD(&tp->list); 579 INIT_LIST_HEAD(&tp->list);
@@ -347,11 +587,12 @@ error:
347 587
348static void free_probe_arg(struct probe_arg *arg) 588static void free_probe_arg(struct probe_arg *arg)
349{ 589{
350 if (arg->fetch.func == fetch_symbol) 590 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
591 free_deref_fetch_param(arg->fetch.data);
592 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
351 free_symbol_cache(arg->fetch.data); 593 free_symbol_cache(arg->fetch.data);
352 else if (arg->fetch.func == fetch_indirect)
353 free_indirect_fetch_data(arg->fetch.data);
354 kfree(arg->name); 594 kfree(arg->name);
595 kfree(arg->comm);
355} 596}
356 597
357static void free_trace_probe(struct trace_probe *tp) 598static void free_trace_probe(struct trace_probe *tp)
@@ -361,7 +602,7 @@ static void free_trace_probe(struct trace_probe *tp)
361 for (i = 0; i < tp->nr_args; i++) 602 for (i = 0; i < tp->nr_args; i++)
362 free_probe_arg(&tp->args[i]); 603 free_probe_arg(&tp->args[i]);
363 604
364 kfree(tp->call.system); 605 kfree(tp->call.class->system);
365 kfree(tp->call.name); 606 kfree(tp->call.name);
366 kfree(tp->symbol); 607 kfree(tp->symbol);
367 kfree(tp); 608 kfree(tp);
@@ -374,7 +615,7 @@ static struct trace_probe *find_probe_event(const char *event,
374 615
375 list_for_each_entry(tp, &probe_list, list) 616 list_for_each_entry(tp, &probe_list, list)
376 if (strcmp(tp->call.name, event) == 0 && 617 if (strcmp(tp->call.name, event) == 0 &&
377 strcmp(tp->call.system, group) == 0) 618 strcmp(tp->call.class->system, group) == 0)
378 return tp; 619 return tp;
379 return NULL; 620 return NULL;
380} 621}
@@ -399,7 +640,7 @@ static int register_trace_probe(struct trace_probe *tp)
399 mutex_lock(&probe_lock); 640 mutex_lock(&probe_lock);
400 641
401 /* register as an event */ 642 /* register as an event */
402 old_tp = find_probe_event(tp->call.name, tp->call.system); 643 old_tp = find_probe_event(tp->call.name, tp->call.class->system);
403 if (old_tp) { 644 if (old_tp) {
404 /* delete old event */ 645 /* delete old event */
405 unregister_trace_probe(old_tp); 646 unregister_trace_probe(old_tp);
@@ -457,28 +698,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
457#define PARAM_MAX_ARGS 16 698#define PARAM_MAX_ARGS 16
458#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 699#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
459 700
460static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) 701static int parse_probe_vars(char *arg, const struct fetch_type *t,
702 struct fetch_param *f, int is_return)
461{ 703{
462 int ret = 0; 704 int ret = 0;
463 unsigned long param; 705 unsigned long param;
464 706
465 if (strcmp(arg, "retval") == 0) { 707 if (strcmp(arg, "retval") == 0) {
466 if (is_return) { 708 if (is_return)
467 ff->func = fetch_retvalue; 709 f->fn = t->fetch[FETCH_MTD_retval];
468 ff->data = NULL; 710 else
469 } else
470 ret = -EINVAL; 711 ret = -EINVAL;
471 } else if (strncmp(arg, "stack", 5) == 0) { 712 } else if (strncmp(arg, "stack", 5) == 0) {
472 if (arg[5] == '\0') { 713 if (arg[5] == '\0') {
473 ff->func = fetch_stack_address; 714 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
474 ff->data = NULL; 715 f->fn = fetch_stack_address;
716 else
717 ret = -EINVAL;
475 } else if (isdigit(arg[5])) { 718 } else if (isdigit(arg[5])) {
476 ret = strict_strtoul(arg + 5, 10, &param); 719 ret = strict_strtoul(arg + 5, 10, &param);
477 if (ret || param > PARAM_MAX_STACK) 720 if (ret || param > PARAM_MAX_STACK)
478 ret = -EINVAL; 721 ret = -EINVAL;
479 else { 722 else {
480 ff->func = fetch_stack; 723 f->fn = t->fetch[FETCH_MTD_stack];
481 ff->data = (void *)param; 724 f->data = (void *)param;
482 } 725 }
483 } else 726 } else
484 ret = -EINVAL; 727 ret = -EINVAL;
@@ -488,7 +731,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
488} 731}
489 732
490/* Recursive argument parser */ 733/* Recursive argument parser */
491static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 734static int __parse_probe_arg(char *arg, const struct fetch_type *t,
735 struct fetch_param *f, int is_return)
492{ 736{
493 int ret = 0; 737 int ret = 0;
494 unsigned long param; 738 unsigned long param;
@@ -497,13 +741,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
497 741
498 switch (arg[0]) { 742 switch (arg[0]) {
499 case '$': 743 case '$':
500 ret = parse_probe_vars(arg + 1, ff, is_return); 744 ret = parse_probe_vars(arg + 1, t, f, is_return);
501 break; 745 break;
502 case '%': /* named register */ 746 case '%': /* named register */
503 ret = regs_query_register_offset(arg + 1); 747 ret = regs_query_register_offset(arg + 1);
504 if (ret >= 0) { 748 if (ret >= 0) {
505 ff->func = fetch_register; 749 f->fn = t->fetch[FETCH_MTD_reg];
506 ff->data = (void *)(unsigned long)ret; 750 f->data = (void *)(unsigned long)ret;
507 ret = 0; 751 ret = 0;
508 } 752 }
509 break; 753 break;
@@ -512,26 +756,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
512 ret = strict_strtoul(arg + 1, 0, &param); 756 ret = strict_strtoul(arg + 1, 0, &param);
513 if (ret) 757 if (ret)
514 break; 758 break;
515 ff->func = fetch_memory; 759 f->fn = t->fetch[FETCH_MTD_memory];
516 ff->data = (void *)param; 760 f->data = (void *)param;
517 } else { 761 } else {
518 ret = split_symbol_offset(arg + 1, &offset); 762 ret = split_symbol_offset(arg + 1, &offset);
519 if (ret) 763 if (ret)
520 break; 764 break;
521 ff->data = alloc_symbol_cache(arg + 1, offset); 765 f->data = alloc_symbol_cache(arg + 1, offset);
522 if (ff->data) 766 if (f->data)
523 ff->func = fetch_symbol; 767 f->fn = t->fetch[FETCH_MTD_symbol];
524 else
525 ret = -EINVAL;
526 } 768 }
527 break; 769 break;
528 case '+': /* indirect memory */ 770 case '+': /* deref memory */
529 case '-': 771 case '-':
530 tmp = strchr(arg, '('); 772 tmp = strchr(arg, '(');
531 if (!tmp) { 773 if (!tmp)
532 ret = -EINVAL;
533 break; 774 break;
534 }
535 *tmp = '\0'; 775 *tmp = '\0';
536 ret = strict_strtol(arg + 1, 0, &offset); 776 ret = strict_strtol(arg + 1, 0, &offset);
537 if (ret) 777 if (ret)
@@ -541,38 +781,68 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
541 arg = tmp + 1; 781 arg = tmp + 1;
542 tmp = strrchr(arg, ')'); 782 tmp = strrchr(arg, ')');
543 if (tmp) { 783 if (tmp) {
544 struct indirect_fetch_data *id; 784 struct deref_fetch_param *dprm;
785 const struct fetch_type *t2 = find_fetch_type(NULL);
545 *tmp = '\0'; 786 *tmp = '\0';
546 id = kzalloc(sizeof(struct indirect_fetch_data), 787 dprm = kzalloc(sizeof(struct deref_fetch_param),
547 GFP_KERNEL); 788 GFP_KERNEL);
548 if (!id) 789 if (!dprm)
549 return -ENOMEM; 790 return -ENOMEM;
550 id->offset = offset; 791 dprm->offset = offset;
551 ret = __parse_probe_arg(arg, &id->orig, is_return); 792 ret = __parse_probe_arg(arg, t2, &dprm->orig,
793 is_return);
552 if (ret) 794 if (ret)
553 kfree(id); 795 kfree(dprm);
554 else { 796 else {
555 ff->func = fetch_indirect; 797 f->fn = t->fetch[FETCH_MTD_deref];
556 ff->data = (void *)id; 798 f->data = (void *)dprm;
557 } 799 }
558 } else 800 }
559 ret = -EINVAL;
560 break; 801 break;
561 default: 802 }
562 /* TODO: support custom handler */ 803 if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
804 pr_info("%s type has no corresponding fetch method.\n",
805 t->name);
563 ret = -EINVAL; 806 ret = -EINVAL;
564 } 807 }
565 return ret; 808 return ret;
566} 809}
567 810
568/* String length checking wrapper */ 811/* String length checking wrapper */
569static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 812static int parse_probe_arg(char *arg, struct trace_probe *tp,
813 struct probe_arg *parg, int is_return)
570{ 814{
815 const char *t;
816 int ret;
817
571 if (strlen(arg) > MAX_ARGSTR_LEN) { 818 if (strlen(arg) > MAX_ARGSTR_LEN) {
572 pr_info("Argument is too long.: %s\n", arg); 819 pr_info("Argument is too long.: %s\n", arg);
573 return -ENOSPC; 820 return -ENOSPC;
574 } 821 }
575 return __parse_probe_arg(arg, ff, is_return); 822 parg->comm = kstrdup(arg, GFP_KERNEL);
823 if (!parg->comm) {
824 pr_info("Failed to allocate memory for command '%s'.\n", arg);
825 return -ENOMEM;
826 }
827 t = strchr(parg->comm, ':');
828 if (t) {
829 arg[t - parg->comm] = '\0';
830 t++;
831 }
832 parg->type = find_fetch_type(t);
833 if (!parg->type) {
834 pr_info("Unsupported type: %s\n", t);
835 return -EINVAL;
836 }
837 parg->offset = tp->size;
838 tp->size += parg->type->size;
839 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
840 if (ret >= 0) {
841 parg->fetch_size.fn = get_fetch_size_function(parg->type,
842 parg->fetch.fn);
843 parg->fetch_size.data = parg->fetch.data;
844 }
845 return ret;
576} 846}
577 847
578/* Return 1 if name is reserved or already used by another argument */ 848/* Return 1 if name is reserved or already used by another argument */
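
The reworked parse_probe_arg() above keeps a verbatim copy of the argument spec in parg->comm and peels an optional ":TYPE" suffix off it before selecting a fetch type and accumulating per-argument slot offsets into tp->size. Below is a small userspace sketch of the same string handling; the type table, the sizes, and the "u64" default are invented for illustration, while the splitting mirrors create_trace_probe() and parse_probe_arg() in this diff.

#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for the kernel's fetch type table. */
struct fetch_type { const char *name; size_t size; };

static const struct fetch_type fetch_types[] = {
	{ "u8", 1 }, { "u16", 2 }, { "u32", 4 }, { "u64", 8 },
	{ "string", 4 },	/* dynamic data: the slot holds a 32-bit data location */
};

static const struct fetch_type *find_fetch_type(const char *name)
{
	size_t i;

	if (!name)
		name = "u64";	/* assumed default, standing in for the kernel's default type */
	for (i = 0; i < sizeof(fetch_types) / sizeof(fetch_types[0]); i++)
		if (strcmp(fetch_types[i].name, name) == 0)
			return &fetch_types[i];
	return NULL;
}

int main(void)
{
	const char *specs[] = { "fd=%di", "path=+0(%si):string", "count=%cx:u32" };
	size_t total = 0;
	int i;

	for (i = 0; i < 3; i++) {
		char comm[64];
		char *arg = comm, *name = NULL, *type;
		const struct fetch_type *t;

		strncpy(comm, specs[i], sizeof(comm) - 1);
		comm[sizeof(comm) - 1] = '\0';

		/* NAME=FETCHARG: split off an explicit name, as create_trace_probe() does */
		type = strchr(arg, '=');
		if (type) {
			*type = '\0';
			name = arg;
			arg = type + 1;
		}
		/* FETCHARG:TYPE: cut off an optional type suffix, as parse_probe_arg() does */
		type = strchr(arg, ':');
		if (type)
			*type++ = '\0';

		t = find_fetch_type(type);
		if (!t) {
			fprintf(stderr, "Unsupported type: %s\n", type);
			return 1;
		}
		printf("arg%d: name=%s fetch=%s type=%s offset=%zu\n",
		       i + 1, name ? name : "(defaults to argN)", arg, t->name, total);
		total += t->size;	/* slot offsets accumulate into tp->size */
	}
	printf("fixed argument area (tp->size) = %zu bytes\n", total);
	return 0;
}
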
@@ -602,15 +872,18 @@ static int create_trace_probe(int argc, char **argv)
602 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 872 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
603 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 873 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
604 * %REG : fetch register REG 874 * %REG : fetch register REG
605 * Indirect memory fetch: 875 * Dereferencing memory fetch:
606 * +|-offs(ARG) : fetch memory at ARG +|- offs address. 876 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
607 * Alias name of args: 877 * Alias name of args:
608 * NAME=FETCHARG : set NAME as alias of FETCHARG. 878 * NAME=FETCHARG : set NAME as alias of FETCHARG.
879 * Type of args:
880 * FETCHARG:TYPE : use TYPE instead of unsigned long.
609 */ 881 */
610 struct trace_probe *tp; 882 struct trace_probe *tp;
611 int i, ret = 0; 883 int i, ret = 0;
612 int is_return = 0, is_delete = 0; 884 int is_return = 0, is_delete = 0;
613 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 885 char *symbol = NULL, *event = NULL, *group = NULL;
886 char *arg;
614 unsigned long offset = 0; 887 unsigned long offset = 0;
615 void *addr = NULL; 888 void *addr = NULL;
616 char buf[MAX_EVENT_NAME_LEN]; 889 char buf[MAX_EVENT_NAME_LEN];
@@ -652,14 +925,17 @@ static int create_trace_probe(int argc, char **argv)
652 pr_info("Delete command needs an event name.\n"); 925 pr_info("Delete command needs an event name.\n");
653 return -EINVAL; 926 return -EINVAL;
654 } 927 }
928 mutex_lock(&probe_lock);
655 tp = find_probe_event(event, group); 929 tp = find_probe_event(event, group);
656 if (!tp) { 930 if (!tp) {
931 mutex_unlock(&probe_lock);
657 pr_info("Event %s/%s doesn't exist.\n", group, event); 932 pr_info("Event %s/%s doesn't exist.\n", group, event);
658 return -ENOENT; 933 return -ENOENT;
659 } 934 }
660 /* delete an event */ 935 /* delete an event */
661 unregister_trace_probe(tp); 936 unregister_trace_probe(tp);
662 free_trace_probe(tp); 937 free_trace_probe(tp);
938 mutex_unlock(&probe_lock);
663 return 0; 939 return 0;
664 } 940 }
665 941
@@ -716,37 +992,47 @@ static int create_trace_probe(int argc, char **argv)
716 /* parse arguments */ 992 /* parse arguments */
717 ret = 0; 993 ret = 0;
718 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 994 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
995 /* Increment count for freeing args in error case */
996 tp->nr_args++;
997
719 /* Parse argument name */ 998 /* Parse argument name */
720 arg = strchr(argv[i], '='); 999 arg = strchr(argv[i], '=');
721 if (arg) 1000 if (arg) {
722 *arg++ = '\0'; 1001 *arg++ = '\0';
723 else 1002 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
1003 } else {
724 arg = argv[i]; 1004 arg = argv[i];
1005 /* If argument name is omitted, set "argN" */
1006 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
1007 tp->args[i].name = kstrdup(buf, GFP_KERNEL);
1008 }
725 1009
726 if (conflict_field_name(argv[i], tp->args, i)) { 1010 if (!tp->args[i].name) {
727 pr_info("Argument%d name '%s' conflicts with " 1011 pr_info("Failed to allocate argument[%d] name.\n", i);
728 "another field.\n", i, argv[i]); 1012 ret = -ENOMEM;
1013 goto error;
1014 }
1015
1016 if (!is_good_name(tp->args[i].name)) {
1017 pr_info("Invalid argument[%d] name: %s\n",
1018 i, tp->args[i].name);
729 ret = -EINVAL; 1019 ret = -EINVAL;
730 goto error; 1020 goto error;
731 } 1021 }
732 1022
733 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 1023 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
734 if (!tp->args[i].name) { 1024 pr_info("Argument[%d] name '%s' conflicts with "
735 pr_info("Failed to allocate argument%d name '%s'.\n", 1025 "another field.\n", i, argv[i]);
736 i, argv[i]); 1026 ret = -EINVAL;
737 ret = -ENOMEM;
738 goto error; 1027 goto error;
739 } 1028 }
740 1029
741 /* Parse fetch argument */ 1030 /* Parse fetch argument */
742 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); 1031 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
743 if (ret) { 1032 if (ret) {
744 pr_info("Parse error at argument%d. (%d)\n", i, ret); 1033 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
745 kfree(tp->args[i].name);
746 goto error; 1034 goto error;
747 } 1035 }
748
749 tp->nr_args++;
750 } 1036 }
751 1037
752 ret = register_trace_probe(tp); 1038 ret = register_trace_probe(tp);
@@ -794,11 +1080,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
794static int probes_seq_show(struct seq_file *m, void *v) 1080static int probes_seq_show(struct seq_file *m, void *v)
795{ 1081{
796 struct trace_probe *tp = v; 1082 struct trace_probe *tp = v;
797 int i, ret; 1083 int i;
798 char buf[MAX_ARGSTR_LEN + 1];
799 1084
800 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 1085 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
801 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 1086 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
802 1087
803 if (!tp->symbol) 1088 if (!tp->symbol)
804 seq_printf(m, " 0x%p", tp->rp.kp.addr); 1089 seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -807,15 +1092,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
807 else 1092 else
808 seq_printf(m, " %s", probe_symbol(tp)); 1093 seq_printf(m, " %s", probe_symbol(tp));
809 1094
810 for (i = 0; i < tp->nr_args; i++) { 1095 for (i = 0; i < tp->nr_args; i++)
811 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); 1096 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
812 if (ret < 0) {
813 pr_warning("Argument%d decoding error(%d).\n", i, ret);
814 return ret;
815 }
816 seq_printf(m, " %s=%s", tp->args[i].name, buf);
817 }
818 seq_printf(m, "\n"); 1097 seq_printf(m, "\n");
1098
819 return 0; 1099 return 0;
820} 1100}
821 1101
@@ -941,14 +1221,62 @@ static const struct file_operations kprobe_profile_ops = {
941 .release = seq_release, 1221 .release = seq_release,
942}; 1222};
943 1223
 1224/* Sum up total data length for dynamic arrays (strings) */

1225static __kprobes int __get_data_size(struct trace_probe *tp,
1226 struct pt_regs *regs)
1227{
1228 int i, ret = 0;
1229 u32 len;
1230
1231 for (i = 0; i < tp->nr_args; i++)
1232 if (unlikely(tp->args[i].fetch_size.fn)) {
1233 call_fetch(&tp->args[i].fetch_size, regs, &len);
1234 ret += len;
1235 }
1236
1237 return ret;
1238}
1239
1240/* Store the value of each argument */
1241static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
1242 struct pt_regs *regs,
1243 u8 *data, int maxlen)
1244{
1245 int i;
1246 u32 end = tp->size;
1247 u32 *dl; /* Data (relative) location */
1248
1249 for (i = 0; i < tp->nr_args; i++) {
1250 if (unlikely(tp->args[i].fetch_size.fn)) {
1251 /*
1252 * First, we set the relative location and
1253 * maximum data length to *dl
1254 */
1255 dl = (u32 *)(data + tp->args[i].offset);
1256 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
1257 /* Then try to fetch string or dynamic array data */
1258 call_fetch(&tp->args[i].fetch, regs, dl);
1259 /* Reduce maximum length */
1260 end += get_rloc_len(*dl);
1261 maxlen -= get_rloc_len(*dl);
1262 /* Trick here, convert data_rloc to data_loc */
1263 *dl = convert_rloc_to_loc(*dl,
1264 ent_size + tp->args[i].offset);
1265 } else
1266 /* Just fetching data normally */
1267 call_fetch(&tp->args[i].fetch, regs,
1268 data + tp->args[i].offset);
1269 }
1270}
1271
944/* Kprobe handler */ 1272/* Kprobe handler */
945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1273static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
946{ 1274{
947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1275 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
948 struct kprobe_trace_entry *entry; 1276 struct kprobe_trace_entry_head *entry;
949 struct ring_buffer_event *event; 1277 struct ring_buffer_event *event;
950 struct ring_buffer *buffer; 1278 struct ring_buffer *buffer;
951 int size, i, pc; 1279 int size, dsize, pc;
952 unsigned long irq_flags; 1280 unsigned long irq_flags;
953 struct ftrace_event_call *call = &tp->call; 1281 struct ftrace_event_call *call = &tp->call;
954 1282
@@ -957,18 +1285,17 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
957 local_save_flags(irq_flags); 1285 local_save_flags(irq_flags);
958 pc = preempt_count(); 1286 pc = preempt_count();
959 1287
960 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1288 dsize = __get_data_size(tp, regs);
1289 size = sizeof(*entry) + tp->size + dsize;
961 1290
962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1291 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
963 irq_flags, pc); 1292 size, irq_flags, pc);
964 if (!event) 1293 if (!event)
965 return; 1294 return;
966 1295
967 entry = ring_buffer_event_data(event); 1296 entry = ring_buffer_event_data(event);
968 entry->nargs = tp->nr_args;
969 entry->ip = (unsigned long)kp->addr; 1297 entry->ip = (unsigned long)kp->addr;
970 for (i = 0; i < tp->nr_args; i++) 1298 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
971 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
972 1299
973 if (!filter_current_check_discard(buffer, call, entry, event)) 1300 if (!filter_current_check_discard(buffer, call, entry, event))
974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1301 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
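
With the per-argument unsigned long array (and its nargs count) gone, the record reserved by kprobe_trace_func() is a fixed header followed by tp->size bytes of typed argument slots and dsize bytes of trailing dynamic data. A minimal illustration of the size computation; the struct and the numbers are placeholders, only the layout is taken from this hunk.

#include <stdint.h>
#include <stdio.h>

/* Shape of a kprobe trace record after this change (illustrative):
 *
 *   +----------------------------+  <- ring buffer event data
 *   | kprobe_trace_entry_head    |  fixed header (ip, common fields)
 *   +----------------------------+
 *   | fixed argument slots       |  tp->size bytes, one slot per argument
 *   +----------------------------+
 *   | dynamic data (strings ...) |  dsize bytes, found via data_loc words
 *   +----------------------------+
 */
struct entry_head { uint64_t ip; };	/* stand-in for kprobe_trace_entry_head */

int main(void)
{
	uint32_t tp_size = 12;	/* e.g. one u64 slot + one u32 string-location slot */
	uint32_t dsize = 6;	/* e.g. strlen("hello") + 1, summed by __get_data_size() */
	size_t size = sizeof(struct entry_head) + tp_size + dsize;

	printf("reserve %zu bytes in the ring buffer\n", size);
	return 0;
}
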
@@ -979,29 +1306,28 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
979 struct pt_regs *regs) 1306 struct pt_regs *regs)
980{ 1307{
981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1308 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
982 struct kretprobe_trace_entry *entry; 1309 struct kretprobe_trace_entry_head *entry;
983 struct ring_buffer_event *event; 1310 struct ring_buffer_event *event;
984 struct ring_buffer *buffer; 1311 struct ring_buffer *buffer;
985 int size, i, pc; 1312 int size, pc, dsize;
986 unsigned long irq_flags; 1313 unsigned long irq_flags;
987 struct ftrace_event_call *call = &tp->call; 1314 struct ftrace_event_call *call = &tp->call;
988 1315
989 local_save_flags(irq_flags); 1316 local_save_flags(irq_flags);
990 pc = preempt_count(); 1317 pc = preempt_count();
991 1318
992 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1319 dsize = __get_data_size(tp, regs);
1320 size = sizeof(*entry) + tp->size + dsize;
993 1321
994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1322 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
995 irq_flags, pc); 1323 size, irq_flags, pc);
996 if (!event) 1324 if (!event)
997 return; 1325 return;
998 1326
999 entry = ring_buffer_event_data(event); 1327 entry = ring_buffer_event_data(event);
1000 entry->nargs = tp->nr_args;
1001 entry->func = (unsigned long)tp->rp.kp.addr; 1328 entry->func = (unsigned long)tp->rp.kp.addr;
1002 entry->ret_ip = (unsigned long)ri->ret_addr; 1329 entry->ret_ip = (unsigned long)ri->ret_addr;
1003 for (i = 0; i < tp->nr_args; i++) 1330 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1004 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1005 1331
1006 if (!filter_current_check_discard(buffer, call, entry, event)) 1332 if (!filter_current_check_discard(buffer, call, entry, event))
1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1333 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1009,17 +1335,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1009 1335
1010/* Event entry printers */ 1336/* Event entry printers */
1011enum print_line_t 1337enum print_line_t
1012print_kprobe_event(struct trace_iterator *iter, int flags) 1338print_kprobe_event(struct trace_iterator *iter, int flags,
1339 struct trace_event *event)
1013{ 1340{
1014 struct kprobe_trace_entry *field; 1341 struct kprobe_trace_entry_head *field;
1015 struct trace_seq *s = &iter->seq; 1342 struct trace_seq *s = &iter->seq;
1016 struct trace_event *event;
1017 struct trace_probe *tp; 1343 struct trace_probe *tp;
1344 u8 *data;
1018 int i; 1345 int i;
1019 1346
1020 field = (struct kprobe_trace_entry *)iter->ent; 1347 field = (struct kprobe_trace_entry_head *)iter->ent;
1021 event = ftrace_find_event(field->ent.type); 1348 tp = container_of(event, struct trace_probe, call.event);
1022 tp = container_of(event, struct trace_probe, event);
1023 1349
1024 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1350 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1025 goto partial; 1351 goto partial;
@@ -1030,9 +1356,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
1030 if (!trace_seq_puts(s, ")")) 1356 if (!trace_seq_puts(s, ")"))
1031 goto partial; 1357 goto partial;
1032 1358
1033 for (i = 0; i < field->nargs; i++) 1359 data = (u8 *)&field[1];
1034 if (!trace_seq_printf(s, " %s=%lx", 1360 for (i = 0; i < tp->nr_args; i++)
1035 tp->args[i].name, field->args[i])) 1361 if (!tp->args[i].type->print(s, tp->args[i].name,
1362 data + tp->args[i].offset, field))
1036 goto partial; 1363 goto partial;
1037 1364
1038 if (!trace_seq_puts(s, "\n")) 1365 if (!trace_seq_puts(s, "\n"))
@@ -1044,17 +1371,17 @@ partial:
1044} 1371}
1045 1372
1046enum print_line_t 1373enum print_line_t
1047print_kretprobe_event(struct trace_iterator *iter, int flags) 1374print_kretprobe_event(struct trace_iterator *iter, int flags,
1375 struct trace_event *event)
1048{ 1376{
1049 struct kretprobe_trace_entry *field; 1377 struct kretprobe_trace_entry_head *field;
1050 struct trace_seq *s = &iter->seq; 1378 struct trace_seq *s = &iter->seq;
1051 struct trace_event *event;
1052 struct trace_probe *tp; 1379 struct trace_probe *tp;
1380 u8 *data;
1053 int i; 1381 int i;
1054 1382
1055 field = (struct kretprobe_trace_entry *)iter->ent; 1383 field = (struct kretprobe_trace_entry_head *)iter->ent;
1056 event = ftrace_find_event(field->ent.type); 1384 tp = container_of(event, struct trace_probe, call.event);
1057 tp = container_of(event, struct trace_probe, event);
1058 1385
1059 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1386 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1060 goto partial; 1387 goto partial;
@@ -1071,9 +1398,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
1071 if (!trace_seq_puts(s, ")")) 1398 if (!trace_seq_puts(s, ")"))
1072 goto partial; 1399 goto partial;
1073 1400
1074 for (i = 0; i < field->nargs; i++) 1401 data = (u8 *)&field[1];
1075 if (!trace_seq_printf(s, " %s=%lx", 1402 for (i = 0; i < tp->nr_args; i++)
1076 tp->args[i].name, field->args[i])) 1403 if (!tp->args[i].type->print(s, tp->args[i].name,
1404 data + tp->args[i].offset, field))
1077 goto partial; 1405 goto partial;
1078 1406
1079 if (!trace_seq_puts(s, "\n")) 1407 if (!trace_seq_puts(s, "\n"))
@@ -1108,13 +1436,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1108 } 1436 }
1109} 1437}
1110 1438
1111static int probe_event_raw_init(struct ftrace_event_call *event_call)
1112{
1113 INIT_LIST_HEAD(&event_call->fields);
1114
1115 return 0;
1116}
1117
1118#undef DEFINE_FIELD 1439#undef DEFINE_FIELD
1119#define DEFINE_FIELD(type, item, name, is_signed) \ 1440#define DEFINE_FIELD(type, item, name, is_signed) \
1120 do { \ 1441 do { \
@@ -1129,29 +1450,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
1129static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1450static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1130{ 1451{
1131 int ret, i; 1452 int ret, i;
1132 struct kprobe_trace_entry field; 1453 struct kprobe_trace_entry_head field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1454 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134 1455
1135 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1456 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1136 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1137 /* Set argument names as fields */ 1457 /* Set argument names as fields */
1138 for (i = 0; i < tp->nr_args; i++) 1458 for (i = 0; i < tp->nr_args; i++) {
1139 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1459 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1460 tp->args[i].name,
1461 sizeof(field) + tp->args[i].offset,
1462 tp->args[i].type->size,
1463 tp->args[i].type->is_signed,
1464 FILTER_OTHER);
1465 if (ret)
1466 return ret;
1467 }
1140 return 0; 1468 return 0;
1141} 1469}
1142 1470
1143static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) 1471static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1144{ 1472{
1145 int ret, i; 1473 int ret, i;
1146 struct kretprobe_trace_entry field; 1474 struct kretprobe_trace_entry_head field;
1147 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1475 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1148 1476
1149 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1477 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1150 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1478 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1151 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1152 /* Set argument names as fields */ 1479 /* Set argument names as fields */
1153 for (i = 0; i < tp->nr_args; i++) 1480 for (i = 0; i < tp->nr_args; i++) {
1154 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1481 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1482 tp->args[i].name,
1483 sizeof(field) + tp->args[i].offset,
1484 tp->args[i].type->size,
1485 tp->args[i].type->is_signed,
1486 FILTER_OTHER);
1487 if (ret)
1488 return ret;
1489 }
1155 return 0; 1490 return 0;
1156} 1491}
1157 1492
@@ -1176,15 +1511,20 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); 1511 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1177 1512
1178 for (i = 0; i < tp->nr_args; i++) { 1513 for (i = 0; i < tp->nr_args; i++) {
1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", 1514 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1180 tp->args[i].name); 1515 tp->args[i].name, tp->args[i].type->fmt);
1181 } 1516 }
1182 1517
1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1518 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1184 1519
1185 for (i = 0; i < tp->nr_args; i++) { 1520 for (i = 0; i < tp->nr_args; i++) {
1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", 1521 if (strcmp(tp->args[i].type->name, "string") == 0)
1187 tp->args[i].name); 1522 pos += snprintf(buf + pos, LEN_OR_ZERO,
1523 ", __get_str(%s)",
1524 tp->args[i].name);
1525 else
1526 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1527 tp->args[i].name);
1188 } 1528 }
1189 1529
1190#undef LEN_OR_ZERO 1530#undef LEN_OR_ZERO
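
The __set_print_fmt() change above emits per-type printf specifiers and routes string arguments through __get_str() instead of printing every value as %lx. The standalone sketch below reproduces the two loops for a hypothetical probe with one numeric and one string argument; the argument names and the "(%lx)"/REC->ip prefix are illustrative.

#include <stdio.h>

/*
 * Builds the kind of print fmt string __set_print_fmt() now generates for a
 * kprobe event with a mix of numeric and string arguments.
 */
struct arg { const char *name; const char *fmt; int is_string; };

int main(void)
{
	const struct arg args[] = {
		{ "fd",   "%lu", 0 },
		{ "path", "%s",  1 },	/* "string" type: printed via __get_str() */
	};
	char buf[256];
	int pos = 0, i;

	pos += snprintf(buf + pos, sizeof(buf) - pos, "\"(%%lx)");
	for (i = 0; i < 2; i++)
		pos += snprintf(buf + pos, sizeof(buf) - pos, " %s=%s",
				args[i].name, args[i].fmt);
	pos += snprintf(buf + pos, sizeof(buf) - pos, "\", REC->ip");
	for (i = 0; i < 2; i++) {
		if (args[i].is_string)
			pos += snprintf(buf + pos, sizeof(buf) - pos,
					", __get_str(%s)", args[i].name);
		else
			pos += snprintf(buf + pos, sizeof(buf) - pos,
					", REC->%s", args[i].name);
	}
	printf("%s\n", buf);
	/* prints: "(%lx) fd=%lu path=%s", REC->ip, REC->fd, __get_str(path) */
	return 0;
}
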
@@ -1219,28 +1559,29 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1219{ 1559{
1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1560 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1221 struct ftrace_event_call *call = &tp->call; 1561 struct ftrace_event_call *call = &tp->call;
1222 struct kprobe_trace_entry *entry; 1562 struct kprobe_trace_entry_head *entry;
1223 int size, __size, i; 1563 struct hlist_head *head;
1224 unsigned long irq_flags; 1564 int size, __size, dsize;
1225 int rctx; 1565 int rctx;
1226 1566
1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1567 dsize = __get_data_size(tp, regs);
1568 __size = sizeof(*entry) + tp->size + dsize;
1228 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1569 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1229 size -= sizeof(u32); 1570 size -= sizeof(u32);
1230 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1571 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1231 "profile buffer not large enough")) 1572 "profile buffer not large enough"))
1232 return; 1573 return;
1233 1574
1234 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1575 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1235 if (!entry) 1576 if (!entry)
1236 return; 1577 return;
1237 1578
1238 entry->nargs = tp->nr_args;
1239 entry->ip = (unsigned long)kp->addr; 1579 entry->ip = (unsigned long)kp->addr;
1240 for (i = 0; i < tp->nr_args; i++) 1580 memset(&entry[1], 0, dsize);
1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1581 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1242 1582
1243 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); 1583 head = this_cpu_ptr(call->perf_events);
1584 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1244} 1585}
1245 1586
1246/* Kretprobe profile handler */ 1587/* Kretprobe profile handler */
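
The perf handlers keep the existing size trick: the reserved payload plus a u32 size word in front of a raw perf sample must stay u64 aligned, so __size is rounded up with the u32 included and the u32 is then subtracted again. A tiny standalone check of that arithmetic; ALIGN is spelled out here since kernel.h is not available, and the "u32 size word" rationale is the usual explanation rather than something stated in this hunk.

#include <stdio.h>
#include <stdint.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	uint32_t entry_and_args = 30;	/* sizeof(*entry) + tp->size + dsize, say */
	uint32_t size;

	/* same computation as kprobe_perf_func()/kretprobe_perf_func() */
	size = ALIGN(entry_and_args + sizeof(uint32_t), sizeof(uint64_t));
	size -= sizeof(uint32_t);
	printf("__size=%u -> size=%u (u32 header + payload is %zu, a multiple of 8)\n",
	       entry_and_args, size, sizeof(uint32_t) + size);
	return 0;
}
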
@@ -1249,30 +1590,29 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1249{ 1590{
1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1591 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1251 struct ftrace_event_call *call = &tp->call; 1592 struct ftrace_event_call *call = &tp->call;
1252 struct kretprobe_trace_entry *entry; 1593 struct kretprobe_trace_entry_head *entry;
1253 int size, __size, i; 1594 struct hlist_head *head;
1254 unsigned long irq_flags; 1595 int size, __size, dsize;
1255 int rctx; 1596 int rctx;
1256 1597
1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1598 dsize = __get_data_size(tp, regs);
1599 __size = sizeof(*entry) + tp->size + dsize;
1258 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1600 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1259 size -= sizeof(u32); 1601 size -= sizeof(u32);
1260 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1602 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1261 "profile buffer not large enough")) 1603 "profile buffer not large enough"))
1262 return; 1604 return;
1263 1605
1264 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1606 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1265 if (!entry) 1607 if (!entry)
1266 return; 1608 return;
1267 1609
1268 entry->nargs = tp->nr_args;
1269 entry->func = (unsigned long)tp->rp.kp.addr; 1610 entry->func = (unsigned long)tp->rp.kp.addr;
1270 entry->ret_ip = (unsigned long)ri->ret_addr; 1611 entry->ret_ip = (unsigned long)ri->ret_addr;
1271 for (i = 0; i < tp->nr_args; i++) 1612 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1273 1613
1274 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, 1614 head = this_cpu_ptr(call->perf_events);
1275 irq_flags, regs); 1615 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1276} 1616}
1277 1617
1278static int probe_perf_enable(struct ftrace_event_call *call) 1618static int probe_perf_enable(struct ftrace_event_call *call)
@@ -1302,6 +1642,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
1302} 1642}
1303#endif /* CONFIG_PERF_EVENTS */ 1643#endif /* CONFIG_PERF_EVENTS */
1304 1644
1645static __kprobes
1646int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1647{
1648 switch (type) {
1649 case TRACE_REG_REGISTER:
1650 return probe_event_enable(event);
1651 case TRACE_REG_UNREGISTER:
1652 probe_event_disable(event);
1653 return 0;
1654
1655#ifdef CONFIG_PERF_EVENTS
1656 case TRACE_REG_PERF_REGISTER:
1657 return probe_perf_enable(event);
1658 case TRACE_REG_PERF_UNREGISTER:
1659 probe_perf_disable(event);
1660 return 0;
1661#endif
1662 }
1663 return 0;
1664}
1305 1665
1306static __kprobes 1666static __kprobes
1307int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1667int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1331,43 +1691,43 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 1331	return 0;	/* We don't tweak kernel, so just return 0 */	 1691
1332} 1692}
1333 1693
1694static struct trace_event_functions kretprobe_funcs = {
1695 .trace = print_kretprobe_event
1696};
1697
1698static struct trace_event_functions kprobe_funcs = {
1699 .trace = print_kprobe_event
1700};
1701
1334static int register_probe_event(struct trace_probe *tp) 1702static int register_probe_event(struct trace_probe *tp)
1335{ 1703{
1336 struct ftrace_event_call *call = &tp->call; 1704 struct ftrace_event_call *call = &tp->call;
1337 int ret; 1705 int ret;
1338 1706
1339 /* Initialize ftrace_event_call */ 1707 /* Initialize ftrace_event_call */
1708 INIT_LIST_HEAD(&call->class->fields);
1340 if (probe_is_return(tp)) { 1709 if (probe_is_return(tp)) {
1341 tp->event.trace = print_kretprobe_event; 1710 call->event.funcs = &kretprobe_funcs;
1342 call->raw_init = probe_event_raw_init; 1711 call->class->define_fields = kretprobe_event_define_fields;
1343 call->define_fields = kretprobe_event_define_fields;
1344 } else { 1712 } else {
1345 tp->event.trace = print_kprobe_event; 1713 call->event.funcs = &kprobe_funcs;
1346 call->raw_init = probe_event_raw_init; 1714 call->class->define_fields = kprobe_event_define_fields;
1347 call->define_fields = kprobe_event_define_fields;
1348 } 1715 }
1349 if (set_print_fmt(tp) < 0) 1716 if (set_print_fmt(tp) < 0)
1350 return -ENOMEM; 1717 return -ENOMEM;
1351 call->event = &tp->event; 1718 ret = register_ftrace_event(&call->event);
1352 call->id = register_ftrace_event(&tp->event); 1719 if (!ret) {
1353 if (!call->id) {
1354 kfree(call->print_fmt); 1720 kfree(call->print_fmt);
1355 return -ENODEV; 1721 return -ENODEV;
1356 } 1722 }
1357 call->enabled = 0; 1723 call->flags = 0;
1358 call->regfunc = probe_event_enable; 1724 call->class->reg = kprobe_register;
1359 call->unregfunc = probe_event_disable;
1360
1361#ifdef CONFIG_PERF_EVENTS
1362 call->perf_event_enable = probe_perf_enable;
1363 call->perf_event_disable = probe_perf_disable;
1364#endif
1365 call->data = tp; 1725 call->data = tp;
1366 ret = trace_add_event_call(call); 1726 ret = trace_add_event_call(call);
1367 if (ret) { 1727 if (ret) {
1368 pr_info("Failed to register kprobe event: %s\n", call->name); 1728 pr_info("Failed to register kprobe event: %s\n", call->name);
1369 kfree(call->print_fmt); 1729 kfree(call->print_fmt);
1370 unregister_ftrace_event(&tp->event); 1730 unregister_ftrace_event(&call->event);
1371 } 1731 }
1372 return ret; 1732 return ret;
1373} 1733}
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
deleted file mode 100644
index d59cd6879477..000000000000
--- a/kernel/trace/trace_ksym.c
+++ /dev/null
@@ -1,520 +0,0 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/fs.h>
28
29#include "trace_output.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35#include <asm/atomic.h>
36
37/*
38 * For now, let us restrict the no. of symbols traced simultaneously to number
39 * of available hardware breakpoint registers.
40 */
41#define KSYM_TRACER_MAX HBP_NUM
42
43#define KSYM_TRACER_OP_LEN 3 /* rw- */
44
45struct trace_ksym {
46 struct perf_event **ksym_hbp;
47 struct perf_event_attr attr;
48#ifdef CONFIG_PROFILE_KSYM_TRACER
49 atomic64_t counter;
50#endif
51 struct hlist_node ksym_hlist;
52};
53
54static struct trace_array *ksym_trace_array;
55
56static unsigned int ksym_filter_entry_count;
57static unsigned int ksym_tracing_enabled;
58
59static HLIST_HEAD(ksym_filter_head);
60
61static DEFINE_MUTEX(ksym_tracer_mutex);
62
63#ifdef CONFIG_PROFILE_KSYM_TRACER
64
65#define MAX_UL_INT 0xffffffff
66
67void ksym_collect_stats(unsigned long hbp_hit_addr)
68{
69 struct hlist_node *node;
70 struct trace_ksym *entry;
71
72 rcu_read_lock();
73 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
74 if (entry->attr.bp_addr == hbp_hit_addr) {
75 atomic64_inc(&entry->counter);
76 break;
77 }
78 }
79 rcu_read_unlock();
80}
81#endif /* CONFIG_PROFILE_KSYM_TRACER */
82
83void ksym_hbp_handler(struct perf_event *hbp, int nmi,
84 struct perf_sample_data *data,
85 struct pt_regs *regs)
86{
87 struct ring_buffer_event *event;
88 struct ksym_trace_entry *entry;
89 struct ring_buffer *buffer;
90 int pc;
91
92 if (!ksym_tracing_enabled)
93 return;
94
95 buffer = ksym_trace_array->buffer;
96
97 pc = preempt_count();
98
99 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
100 sizeof(*entry), 0, pc);
101 if (!event)
102 return;
103
104 entry = ring_buffer_event_data(event);
105 entry->ip = instruction_pointer(regs);
106 entry->type = hw_breakpoint_type(hbp);
107 entry->addr = hw_breakpoint_addr(hbp);
108 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
109
110#ifdef CONFIG_PROFILE_KSYM_TRACER
111 ksym_collect_stats(hw_breakpoint_addr(hbp));
112#endif /* CONFIG_PROFILE_KSYM_TRACER */
113
114 trace_buffer_unlock_commit(buffer, event, 0, pc);
115}
116
117/* Valid access types are represented as
118 *
119 * rw- : Set Read/Write Access Breakpoint
120 * -w- : Set Write Access Breakpoint
121 * --- : Clear Breakpoints
122 * --x : Set Execution Break points (Not available yet)
123 *
124 */
125static int ksym_trace_get_access_type(char *str)
126{
127 int access = 0;
128
129 if (str[0] == 'r')
130 access |= HW_BREAKPOINT_R;
131
132 if (str[1] == 'w')
133 access |= HW_BREAKPOINT_W;
134
135 if (str[2] == 'x')
136 access |= HW_BREAKPOINT_X;
137
138 switch (access) {
139 case HW_BREAKPOINT_R:
140 case HW_BREAKPOINT_W:
141 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
142 return access;
143 default:
144 return -EINVAL;
145 }
146}
147
148/*
149 * There can be several possible malformed requests and we attempt to capture
150 * all of them. We enumerate some of the rules
151 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
152 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
153 * <module>:<ksym_name>:<op>.
154 * 2. No delimiter symbol ':' in the input string
155 * 3. Spurious operator symbols or symbols not in their respective positions
156 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
157 * 5. Kernel symbol not a part of /proc/kallsyms
158 * 6. Duplicate requests
159 */
160static int parse_ksym_trace_str(char *input_string, char **ksymname,
161 unsigned long *addr)
162{
163 int ret;
164
165 *ksymname = strsep(&input_string, ":");
166 *addr = kallsyms_lookup_name(*ksymname);
167
168 /* Check for malformed request: (2), (1) and (5) */
169 if ((!input_string) ||
170 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
171 (*addr == 0))
172 return -EINVAL;;
173
174 ret = ksym_trace_get_access_type(input_string);
175
176 return ret;
177}
178
179int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
180{
181 struct trace_ksym *entry;
182 int ret = -ENOMEM;
183
184 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
185 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
186 " new requests for tracing can be accepted now.\n",
187 KSYM_TRACER_MAX);
188 return -ENOSPC;
189 }
190
191 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
192 if (!entry)
193 return -ENOMEM;
194
195 hw_breakpoint_init(&entry->attr);
196
197 entry->attr.bp_type = op;
198 entry->attr.bp_addr = addr;
199 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
200
201 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
202 ksym_hbp_handler);
203
204 if (IS_ERR(entry->ksym_hbp)) {
205 ret = PTR_ERR(entry->ksym_hbp);
206 printk(KERN_INFO "ksym_tracer request failed. Try again"
207 " later!!\n");
208 goto err;
209 }
210
211 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
212 ksym_filter_entry_count++;
213
214 return 0;
215
216err:
217 kfree(entry);
218
219 return ret;
220}
221
222static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
223 size_t count, loff_t *ppos)
224{
225 struct trace_ksym *entry;
226 struct hlist_node *node;
227 struct trace_seq *s;
228 ssize_t cnt = 0;
229 int ret;
230
231 s = kmalloc(sizeof(*s), GFP_KERNEL);
232 if (!s)
233 return -ENOMEM;
234 trace_seq_init(s);
235
236 mutex_lock(&ksym_tracer_mutex);
237
238 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
239 ret = trace_seq_printf(s, "%pS:",
240 (void *)(unsigned long)entry->attr.bp_addr);
241 if (entry->attr.bp_type == HW_BREAKPOINT_R)
242 ret = trace_seq_puts(s, "r--\n");
243 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
244 ret = trace_seq_puts(s, "-w-\n");
245 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
246 ret = trace_seq_puts(s, "rw-\n");
247 WARN_ON_ONCE(!ret);
248 }
249
250 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
251
252 mutex_unlock(&ksym_tracer_mutex);
253
254 kfree(s);
255
256 return cnt;
257}
258
259static void __ksym_trace_reset(void)
260{
261 struct trace_ksym *entry;
262 struct hlist_node *node, *node1;
263
264 mutex_lock(&ksym_tracer_mutex);
265 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
266 ksym_hlist) {
267 unregister_wide_hw_breakpoint(entry->ksym_hbp);
268 ksym_filter_entry_count--;
269 hlist_del_rcu(&(entry->ksym_hlist));
270 synchronize_rcu();
271 kfree(entry);
272 }
273 mutex_unlock(&ksym_tracer_mutex);
274}
275
276static ssize_t ksym_trace_filter_write(struct file *file,
277 const char __user *buffer,
278 size_t count, loff_t *ppos)
279{
280 struct trace_ksym *entry;
281 struct hlist_node *node;
282 char *buf, *input_string, *ksymname = NULL;
283 unsigned long ksym_addr = 0;
284 int ret, op, changed = 0;
285
286 buf = kzalloc(count + 1, GFP_KERNEL);
287 if (!buf)
288 return -ENOMEM;
289
290 ret = -EFAULT;
291 if (copy_from_user(buf, buffer, count))
292 goto out;
293
294 buf[count] = '\0';
295 input_string = strstrip(buf);
296
297 /*
298 * Clear all breakpoints if:
299 * 1: echo > ksym_trace_filter
300 * 2: echo 0 > ksym_trace_filter
301 * 3: echo "*:---" > ksym_trace_filter
302 */
303 if (!input_string[0] || !strcmp(input_string, "0") ||
304 !strcmp(input_string, "*:---")) {
305 __ksym_trace_reset();
306 ret = 0;
307 goto out;
308 }
309
310 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
311 if (ret < 0)
312 goto out;
313
314 mutex_lock(&ksym_tracer_mutex);
315
316 ret = -EINVAL;
317 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
318 if (entry->attr.bp_addr == ksym_addr) {
319 /* Check for malformed request: (6) */
320 if (entry->attr.bp_type != op)
321 changed = 1;
322 else
323 goto out_unlock;
324 break;
325 }
326 }
327 if (changed) {
328 unregister_wide_hw_breakpoint(entry->ksym_hbp);
329 entry->attr.bp_type = op;
330 ret = 0;
331 if (op > 0) {
332 entry->ksym_hbp =
333 register_wide_hw_breakpoint(&entry->attr,
334 ksym_hbp_handler);
335 if (IS_ERR(entry->ksym_hbp))
336 ret = PTR_ERR(entry->ksym_hbp);
337 else
338 goto out_unlock;
339 }
340 /* Error or "symbol:---" case: drop it */
341 ksym_filter_entry_count--;
342 hlist_del_rcu(&(entry->ksym_hlist));
343 synchronize_rcu();
344 kfree(entry);
345 goto out_unlock;
346 } else {
347 /* Check for malformed request: (4) */
348 if (op)
349 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
350 }
351out_unlock:
352 mutex_unlock(&ksym_tracer_mutex);
353out:
354 kfree(buf);
355 return !ret ? count : ret;
356}
357
358static const struct file_operations ksym_tracing_fops = {
359 .open = tracing_open_generic,
360 .read = ksym_trace_filter_read,
361 .write = ksym_trace_filter_write,
362};
363
364static void ksym_trace_reset(struct trace_array *tr)
365{
366 ksym_tracing_enabled = 0;
367 __ksym_trace_reset();
368}
369
370static int ksym_trace_init(struct trace_array *tr)
371{
372 int cpu, ret = 0;
373
374 for_each_online_cpu(cpu)
375 tracing_reset(tr, cpu);
376 ksym_tracing_enabled = 1;
377 ksym_trace_array = tr;
378
379 return ret;
380}
381
382static void ksym_trace_print_header(struct seq_file *m)
383{
384 seq_puts(m,
385 "# TASK-PID CPU# Symbol "
386 "Type Function\n");
387 seq_puts(m,
388 "# | | | "
389 " | |\n");
390}
391
392static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
393{
394 struct trace_entry *entry = iter->ent;
395 struct trace_seq *s = &iter->seq;
396 struct ksym_trace_entry *field;
397 char str[KSYM_SYMBOL_LEN];
398 int ret;
399
400 if (entry->type != TRACE_KSYM)
401 return TRACE_TYPE_UNHANDLED;
402
403 trace_assign_type(field, entry);
404
405 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
406 entry->pid, iter->cpu, (char *)field->addr);
407 if (!ret)
408 return TRACE_TYPE_PARTIAL_LINE;
409
410 switch (field->type) {
411 case HW_BREAKPOINT_R:
412 ret = trace_seq_printf(s, " R ");
413 break;
414 case HW_BREAKPOINT_W:
415 ret = trace_seq_printf(s, " W ");
416 break;
417 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
418 ret = trace_seq_printf(s, " RW ");
419 break;
420 default:
421 return TRACE_TYPE_PARTIAL_LINE;
422 }
423
424 if (!ret)
425 return TRACE_TYPE_PARTIAL_LINE;
426
427 sprint_symbol(str, field->ip);
428 ret = trace_seq_printf(s, "%s\n", str);
429 if (!ret)
430 return TRACE_TYPE_PARTIAL_LINE;
431
432 return TRACE_TYPE_HANDLED;
433}
434
435struct tracer ksym_tracer __read_mostly =
436{
437 .name = "ksym_tracer",
438 .init = ksym_trace_init,
439 .reset = ksym_trace_reset,
440#ifdef CONFIG_FTRACE_SELFTEST
441 .selftest = trace_selftest_startup_ksym,
442#endif
443 .print_header = ksym_trace_print_header,
444 .print_line = ksym_trace_output
445};
446
447#ifdef CONFIG_PROFILE_KSYM_TRACER
448static int ksym_profile_show(struct seq_file *m, void *v)
449{
450 struct hlist_node *node;
451 struct trace_ksym *entry;
452 int access_type = 0;
453 char fn_name[KSYM_NAME_LEN];
454
455 seq_puts(m, " Access Type ");
456 seq_puts(m, " Symbol Counter\n");
457 seq_puts(m, " ----------- ");
458 seq_puts(m, " ------ -------\n");
459
460 rcu_read_lock();
461 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
462
463 access_type = entry->attr.bp_type;
464
465 switch (access_type) {
466 case HW_BREAKPOINT_R:
467 seq_puts(m, " R ");
468 break;
469 case HW_BREAKPOINT_W:
470 seq_puts(m, " W ");
471 break;
472 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
473 seq_puts(m, " RW ");
474 break;
475 default:
476 seq_puts(m, " NA ");
477 }
478
479 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
480 seq_printf(m, " %-36s", fn_name);
481 else
482 seq_printf(m, " %-36s", "<NA>");
483 seq_printf(m, " %15llu\n",
484 (unsigned long long)atomic64_read(&entry->counter));
485 }
486 rcu_read_unlock();
487
488 return 0;
489}
490
491static int ksym_profile_open(struct inode *node, struct file *file)
492{
493 return single_open(file, ksym_profile_show, NULL);
494}
495
496static const struct file_operations ksym_profile_fops = {
497 .open = ksym_profile_open,
498 .read = seq_read,
499 .llseek = seq_lseek,
500 .release = single_release,
501};
502#endif /* CONFIG_PROFILE_KSYM_TRACER */
503
504__init static int init_ksym_trace(void)
505{
506 struct dentry *d_tracer;
507
508 d_tracer = tracing_init_dentry();
509
510 trace_create_file("ksym_trace_filter", 0644, d_tracer,
511 NULL, &ksym_tracing_fops);
512
513#ifdef CONFIG_PROFILE_KSYM_TRACER
514 trace_create_file("ksym_profile", 0444, d_tracer,
515 NULL, &ksym_profile_fops);
516#endif
517
518 return register_tracer(&ksym_tracer);
519}
520device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..02272baa2206 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18 18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 20
24static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -209,6 +206,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
209 206
210 return 1; 207 return 1;
211} 208}
209EXPORT_SYMBOL(trace_seq_putc);
212 210
213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 211int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
214{ 212{
@@ -253,7 +251,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 251 void *ret;
254 252
255 if (s->full) 253 if (s->full)
256 return 0; 254 return NULL;
257 255
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 256 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 257 s->full = 1;
@@ -355,6 +353,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
355} 353}
356EXPORT_SYMBOL(ftrace_print_symbols_seq); 354EXPORT_SYMBOL(ftrace_print_symbols_seq);
357 355
356const char *
357ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
358{
359 int i;
360 const char *ret = p->buffer + p->len;
361
362 for (i = 0; i < buf_len; i++)
363 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
364
365 trace_seq_putc(p, 0);
366
367 return ret;
368}
369EXPORT_SYMBOL(ftrace_print_hex_seq);
370
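
The new ftrace_print_hex_seq() helper added above renders a byte buffer as space-separated two-digit hex into a trace_seq. Below is a userspace analogue with printf standing in for the trace_seq plumbing; the likely __print_hex()-style consumer is an assumption based on the naming and is not part of this diff.

#include <stdio.h>

/*
 * Userspace analogue of ftrace_print_hex_seq(): same "%s%2.2x" pattern,
 * printing a separator before every byte except the first.
 */
static void print_hex(const unsigned char *buf, int buf_len)
{
	int i;

	for (i = 0; i < buf_len; i++)
		printf("%s%2.2x", i == 0 ? "" : " ", buf[i]);
	printf("\n");
}

int main(void)
{
	const unsigned char mac[] = { 0x00, 0x1a, 0x2b, 0x3c, 0x4d, 0x5e };

	print_hex(mac, sizeof(mac));	/* -> 00 1a 2b 3c 4d 5e */
	return 0;
}
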
358#ifdef CONFIG_KRETPROBES 371#ifdef CONFIG_KRETPROBES
359static inline const char *kretprobed(const char *name) 372static inline const char *kretprobed(const char *name)
360{ 373{
@@ -726,6 +739,9 @@ int register_ftrace_event(struct trace_event *event)
726 if (WARN_ON(!event)) 739 if (WARN_ON(!event))
727 goto out; 740 goto out;
728 741
742 if (WARN_ON(!event->funcs))
743 goto out;
744
729 INIT_LIST_HEAD(&event->list); 745 INIT_LIST_HEAD(&event->list);
730 746
731 if (!event->type) { 747 if (!event->type) {
@@ -758,14 +774,14 @@ int register_ftrace_event(struct trace_event *event)
758 goto out; 774 goto out;
759 } 775 }
760 776
761 if (event->trace == NULL) 777 if (event->funcs->trace == NULL)
762 event->trace = trace_nop_print; 778 event->funcs->trace = trace_nop_print;
763 if (event->raw == NULL) 779 if (event->funcs->raw == NULL)
764 event->raw = trace_nop_print; 780 event->funcs->raw = trace_nop_print;
765 if (event->hex == NULL) 781 if (event->funcs->hex == NULL)
766 event->hex = trace_nop_print; 782 event->funcs->hex = trace_nop_print;
767 if (event->binary == NULL) 783 if (event->funcs->binary == NULL)
768 event->binary = trace_nop_print; 784 event->funcs->binary = trace_nop_print;
769 785
770 key = event->type & (EVENT_HASHSIZE - 1); 786 key = event->type & (EVENT_HASHSIZE - 1);
771 787
@@ -807,13 +823,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
807 * Standard events 823 * Standard events
808 */ 824 */
809 825
810enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) 826enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
827 struct trace_event *event)
811{ 828{
812 return TRACE_TYPE_HANDLED; 829 return TRACE_TYPE_HANDLED;
813} 830}
814 831
815/* TRACE_FN */ 832/* TRACE_FN */
816static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) 833static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
834 struct trace_event *event)
817{ 835{
818 struct ftrace_entry *field; 836 struct ftrace_entry *field;
819 struct trace_seq *s = &iter->seq; 837 struct trace_seq *s = &iter->seq;
@@ -840,7 +858,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
840 return TRACE_TYPE_PARTIAL_LINE; 858 return TRACE_TYPE_PARTIAL_LINE;
841} 859}
842 860
843static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) 861static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
862 struct trace_event *event)
844{ 863{
845 struct ftrace_entry *field; 864 struct ftrace_entry *field;
846 865
@@ -854,7 +873,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
854 return TRACE_TYPE_HANDLED; 873 return TRACE_TYPE_HANDLED;
855} 874}
856 875
857static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) 876static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
877 struct trace_event *event)
858{ 878{
859 struct ftrace_entry *field; 879 struct ftrace_entry *field;
860 struct trace_seq *s = &iter->seq; 880 struct trace_seq *s = &iter->seq;
@@ -867,7 +887,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
867 return TRACE_TYPE_HANDLED; 887 return TRACE_TYPE_HANDLED;
868} 888}
869 889
870static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) 890static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
891 struct trace_event *event)
871{ 892{
872 struct ftrace_entry *field; 893 struct ftrace_entry *field;
873 struct trace_seq *s = &iter->seq; 894 struct trace_seq *s = &iter->seq;
@@ -880,14 +901,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
880 return TRACE_TYPE_HANDLED; 901 return TRACE_TYPE_HANDLED;
881} 902}
882 903
883static struct trace_event trace_fn_event = { 904static struct trace_event_functions trace_fn_funcs = {
884 .type = TRACE_FN,
885 .trace = trace_fn_trace, 905 .trace = trace_fn_trace,
886 .raw = trace_fn_raw, 906 .raw = trace_fn_raw,
887 .hex = trace_fn_hex, 907 .hex = trace_fn_hex,
888 .binary = trace_fn_bin, 908 .binary = trace_fn_bin,
889}; 909};
890 910
911static struct trace_event trace_fn_event = {
912 .type = TRACE_FN,
913 .funcs = &trace_fn_funcs,
914};
915
 891/* TRACE_CTX and TRACE_WAKE */ 916/* TRACE_CTX and TRACE_WAKE */
892static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, 917static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
893 char *delim) 918 char *delim)
@@ -916,13 +941,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
916 return TRACE_TYPE_HANDLED; 941 return TRACE_TYPE_HANDLED;
917} 942}
918 943
919static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) 944static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
945 struct trace_event *event)
920{ 946{
921 return trace_ctxwake_print(iter, "==>"); 947 return trace_ctxwake_print(iter, "==>");
922} 948}
923 949
924static enum print_line_t trace_wake_print(struct trace_iterator *iter, 950static enum print_line_t trace_wake_print(struct trace_iterator *iter,
925 int flags) 951 int flags, struct trace_event *event)
926{ 952{
927 return trace_ctxwake_print(iter, " +"); 953 return trace_ctxwake_print(iter, " +");
928} 954}
@@ -950,12 +976,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
950 return TRACE_TYPE_HANDLED; 976 return TRACE_TYPE_HANDLED;
951} 977}
952 978
953static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) 979static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
980 struct trace_event *event)
954{ 981{
955 return trace_ctxwake_raw(iter, 0); 982 return trace_ctxwake_raw(iter, 0);
956} 983}
957 984
958static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) 985static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
986 struct trace_event *event)
959{ 987{
960 return trace_ctxwake_raw(iter, '+'); 988 return trace_ctxwake_raw(iter, '+');
961} 989}
@@ -984,18 +1012,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
984 return TRACE_TYPE_HANDLED; 1012 return TRACE_TYPE_HANDLED;
985} 1013}
986 1014
987static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) 1015static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
1016 struct trace_event *event)
988{ 1017{
989 return trace_ctxwake_hex(iter, 0); 1018 return trace_ctxwake_hex(iter, 0);
990} 1019}
991 1020
992static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) 1021static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
1022 struct trace_event *event)
993{ 1023{
994 return trace_ctxwake_hex(iter, '+'); 1024 return trace_ctxwake_hex(iter, '+');
995} 1025}
996 1026
997static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, 1027static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
998 int flags) 1028 int flags, struct trace_event *event)
999{ 1029{
1000 struct ctx_switch_entry *field; 1030 struct ctx_switch_entry *field;
1001 struct trace_seq *s = &iter->seq; 1031 struct trace_seq *s = &iter->seq;
@@ -1012,81 +1042,34 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
1012 return TRACE_TYPE_HANDLED; 1042 return TRACE_TYPE_HANDLED;
1013} 1043}
1014 1044
1015static struct trace_event trace_ctx_event = { 1045static struct trace_event_functions trace_ctx_funcs = {
1016 .type = TRACE_CTX,
1017 .trace = trace_ctx_print, 1046 .trace = trace_ctx_print,
1018 .raw = trace_ctx_raw, 1047 .raw = trace_ctx_raw,
1019 .hex = trace_ctx_hex, 1048 .hex = trace_ctx_hex,
1020 .binary = trace_ctxwake_bin, 1049 .binary = trace_ctxwake_bin,
1021}; 1050};
1022 1051
1023static struct trace_event trace_wake_event = { 1052static struct trace_event trace_ctx_event = {
1024 .type = TRACE_WAKE, 1053 .type = TRACE_CTX,
1054 .funcs = &trace_ctx_funcs,
1055};
1056
1057static struct trace_event_functions trace_wake_funcs = {
1025 .trace = trace_wake_print, 1058 .trace = trace_wake_print,
1026 .raw = trace_wake_raw, 1059 .raw = trace_wake_raw,
1027 .hex = trace_wake_hex, 1060 .hex = trace_wake_hex,
1028 .binary = trace_ctxwake_bin, 1061 .binary = trace_ctxwake_bin,
1029}; 1062};
1030 1063
1031/* TRACE_SPECIAL */ 1064static struct trace_event trace_wake_event = {
1032static enum print_line_t trace_special_print(struct trace_iterator *iter, 1065 .type = TRACE_WAKE,
1033 int flags) 1066 .funcs = &trace_wake_funcs,
1034{
1035 struct special_entry *field;
1036
1037 trace_assign_type(field, iter->ent);
1038
1039 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
1040 field->arg1,
1041 field->arg2,
1042 field->arg3))
1043 return TRACE_TYPE_PARTIAL_LINE;
1044
1045 return TRACE_TYPE_HANDLED;
1046}
1047
1048static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1049 int flags)
1050{
1051 struct special_entry *field;
1052 struct trace_seq *s = &iter->seq;
1053
1054 trace_assign_type(field, iter->ent);
1055
1056 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
1057 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
1058 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1059
1060 return TRACE_TYPE_HANDLED;
1061}
1062
1063static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1064 int flags)
1065{
1066 struct special_entry *field;
1067 struct trace_seq *s = &iter->seq;
1068
1069 trace_assign_type(field, iter->ent);
1070
1071 SEQ_PUT_FIELD_RET(s, field->arg1);
1072 SEQ_PUT_FIELD_RET(s, field->arg2);
1073 SEQ_PUT_FIELD_RET(s, field->arg3);
1074
1075 return TRACE_TYPE_HANDLED;
1076}
1077
1078static struct trace_event trace_special_event = {
1079 .type = TRACE_SPECIAL,
1080 .trace = trace_special_print,
1081 .raw = trace_special_print,
1082 .hex = trace_special_hex,
1083 .binary = trace_special_bin,
1084}; 1067};
1085 1068
1086/* TRACE_STACK */ 1069/* TRACE_STACK */
1087 1070
1088static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1071static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1089 int flags) 1072 int flags, struct trace_event *event)
1090{ 1073{
1091 struct stack_entry *field; 1074 struct stack_entry *field;
1092 struct trace_seq *s = &iter->seq; 1075 struct trace_seq *s = &iter->seq;
@@ -1114,17 +1097,18 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1114 return TRACE_TYPE_PARTIAL_LINE; 1097 return TRACE_TYPE_PARTIAL_LINE;
1115} 1098}
1116 1099
1100static struct trace_event_functions trace_stack_funcs = {
1101 .trace = trace_stack_print,
1102};
1103
1117static struct trace_event trace_stack_event = { 1104static struct trace_event trace_stack_event = {
1118 .type = TRACE_STACK, 1105 .type = TRACE_STACK,
1119 .trace = trace_stack_print, 1106 .funcs = &trace_stack_funcs,
1120 .raw = trace_special_print,
1121 .hex = trace_special_hex,
1122 .binary = trace_special_bin,
1123}; 1107};
1124 1108
1125/* TRACE_USER_STACK */ 1109/* TRACE_USER_STACK */
1126static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, 1110static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1127 int flags) 1111 int flags, struct trace_event *event)
1128{ 1112{
1129 struct userstack_entry *field; 1113 struct userstack_entry *field;
1130 struct trace_seq *s = &iter->seq; 1114 struct trace_seq *s = &iter->seq;
@@ -1143,17 +1127,19 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1143 return TRACE_TYPE_PARTIAL_LINE; 1127 return TRACE_TYPE_PARTIAL_LINE;
1144} 1128}
1145 1129
1130static struct trace_event_functions trace_user_stack_funcs = {
1131 .trace = trace_user_stack_print,
1132};
1133
1146static struct trace_event trace_user_stack_event = { 1134static struct trace_event trace_user_stack_event = {
1147 .type = TRACE_USER_STACK, 1135 .type = TRACE_USER_STACK,
1148 .trace = trace_user_stack_print, 1136 .funcs = &trace_user_stack_funcs,
1149 .raw = trace_special_print,
1150 .hex = trace_special_hex,
1151 .binary = trace_special_bin,
1152}; 1137};
1153 1138
1154/* TRACE_BPRINT */ 1139/* TRACE_BPRINT */
1155static enum print_line_t 1140static enum print_line_t
1156trace_bprint_print(struct trace_iterator *iter, int flags) 1141trace_bprint_print(struct trace_iterator *iter, int flags,
1142 struct trace_event *event)
1157{ 1143{
1158 struct trace_entry *entry = iter->ent; 1144 struct trace_entry *entry = iter->ent;
1159 struct trace_seq *s = &iter->seq; 1145 struct trace_seq *s = &iter->seq;
@@ -1178,7 +1164,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
1178 1164
1179 1165
1180static enum print_line_t 1166static enum print_line_t
1181trace_bprint_raw(struct trace_iterator *iter, int flags) 1167trace_bprint_raw(struct trace_iterator *iter, int flags,
1168 struct trace_event *event)
1182{ 1169{
1183 struct bprint_entry *field; 1170 struct bprint_entry *field;
1184 struct trace_seq *s = &iter->seq; 1171 struct trace_seq *s = &iter->seq;
@@ -1197,16 +1184,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
1197 return TRACE_TYPE_PARTIAL_LINE; 1184 return TRACE_TYPE_PARTIAL_LINE;
1198} 1185}
1199 1186
1187static struct trace_event_functions trace_bprint_funcs = {
1188 .trace = trace_bprint_print,
1189 .raw = trace_bprint_raw,
1190};
1200 1191
1201static struct trace_event trace_bprint_event = { 1192static struct trace_event trace_bprint_event = {
1202 .type = TRACE_BPRINT, 1193 .type = TRACE_BPRINT,
1203 .trace = trace_bprint_print, 1194 .funcs = &trace_bprint_funcs,
1204 .raw = trace_bprint_raw,
1205}; 1195};
1206 1196
1207/* TRACE_PRINT */ 1197/* TRACE_PRINT */
1208static enum print_line_t trace_print_print(struct trace_iterator *iter, 1198static enum print_line_t trace_print_print(struct trace_iterator *iter,
1209 int flags) 1199 int flags, struct trace_event *event)
1210{ 1200{
1211 struct print_entry *field; 1201 struct print_entry *field;
1212 struct trace_seq *s = &iter->seq; 1202 struct trace_seq *s = &iter->seq;
@@ -1225,7 +1215,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1225 return TRACE_TYPE_PARTIAL_LINE; 1215 return TRACE_TYPE_PARTIAL_LINE;
1226} 1216}
1227 1217
1228static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) 1218static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1219 struct trace_event *event)
1229{ 1220{
1230 struct print_entry *field; 1221 struct print_entry *field;
1231 1222
@@ -1240,18 +1231,21 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
1240 return TRACE_TYPE_PARTIAL_LINE; 1231 return TRACE_TYPE_PARTIAL_LINE;
1241} 1232}
1242 1233
1243static struct trace_event trace_print_event = { 1234static struct trace_event_functions trace_print_funcs = {
1244 .type = TRACE_PRINT,
1245 .trace = trace_print_print, 1235 .trace = trace_print_print,
1246 .raw = trace_print_raw, 1236 .raw = trace_print_raw,
1247}; 1237};
1248 1238
1239static struct trace_event trace_print_event = {
1240 .type = TRACE_PRINT,
1241 .funcs = &trace_print_funcs,
1242};
1243
1249 1244
1250static struct trace_event *events[] __initdata = { 1245static struct trace_event *events[] __initdata = {
1251 &trace_fn_event, 1246 &trace_fn_event,
1252 &trace_ctx_event, 1247 &trace_ctx_event,
1253 &trace_wake_event, 1248 &trace_wake_event,
1254 &trace_special_event,
1255 &trace_stack_event, 1249 &trace_stack_event,
1256 &trace_user_stack_event, 1250 &trace_user_stack_event,
1257 &trace_bprint_event, 1251 &trace_bprint_event,
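The trace_output.c hunks above capture the 2.6.36 split of struct trace_event: the per-format print callbacks move into a shared struct trace_event_functions, the event itself keeps only .type plus a .funcs pointer, and every callback gains the owning struct trace_event as a third argument. A minimal sketch of wiring up an output handler under that layout, following the same pattern as the hunks above; my_ev_print, my_ev_funcs, my_ev_event and TRACE_MYEV are illustrative names, not part of this patch:

static enum print_line_t
my_ev_print(struct trace_iterator *iter, int flags, struct trace_event *event)
{
	/* callbacks now also receive the registered trace_event */
	if (!trace_seq_printf(&iter->seq, "my event, type %d\n", event->type))
		return TRACE_TYPE_PARTIAL_LINE;
	return TRACE_TYPE_HANDLED;
}

static struct trace_event_functions my_ev_funcs = {
	.trace	= my_ev_print,		/* .raw/.hex/.binary stay optional */
};

static struct trace_event my_ev_event = {
	.type	= TRACE_MYEV,		/* hypothetical entry type */
	.funcs	= &my_ev_funcs,		/* callbacks hang off .funcs now */
};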
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
25extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags, struct trace_event *event);
29extern int 29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); 30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 31
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
@@ -139,21 +138,21 @@ static int tracing_sched_register(void)
139{ 138{
140 int ret; 139 int ret;
141 140
142 ret = register_trace_sched_wakeup(probe_sched_wakeup); 141 ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
143 if (ret) { 142 if (ret) {
144 pr_info("wakeup trace: Couldn't activate tracepoint" 143 pr_info("wakeup trace: Couldn't activate tracepoint"
145 " probe to kernel_sched_wakeup\n"); 144 " probe to kernel_sched_wakeup\n");
146 return ret; 145 return ret;
147 } 146 }
148 147
149 ret = register_trace_sched_wakeup_new(probe_sched_wakeup); 148 ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
150 if (ret) { 149 if (ret) {
151 pr_info("wakeup trace: Couldn't activate tracepoint" 150 pr_info("wakeup trace: Couldn't activate tracepoint"
152 " probe to kernel_sched_wakeup_new\n"); 151 " probe to kernel_sched_wakeup_new\n");
153 goto fail_deprobe; 152 goto fail_deprobe;
154 } 153 }
155 154
156 ret = register_trace_sched_switch(probe_sched_switch); 155 ret = register_trace_sched_switch(probe_sched_switch, NULL);
157 if (ret) { 156 if (ret) {
158 pr_info("sched trace: Couldn't activate tracepoint" 157 pr_info("sched trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_switch\n"); 158 " probe to kernel_sched_switch\n");
@@ -162,17 +161,17 @@ static int tracing_sched_register(void)
162 161
163 return ret; 162 return ret;
164fail_deprobe_wake_new: 163fail_deprobe_wake_new:
165 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 164 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
166fail_deprobe: 165fail_deprobe:
167 unregister_trace_sched_wakeup(probe_sched_wakeup); 166 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
168 return ret; 167 return ret;
169} 168}
170 169
171static void tracing_sched_unregister(void) 170static void tracing_sched_unregister(void)
172{ 171{
173 unregister_trace_sched_switch(probe_sched_switch); 172 unregister_trace_sched_switch(probe_sched_switch, NULL);
174 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 173 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
175 unregister_trace_sched_wakeup(probe_sched_wakeup); 174 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
176} 175}
177 176
178static void tracing_start_sched_switch(void) 177static void tracing_start_sched_switch(void)
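The register/unregister changes above show the 2.6.36 tracepoint API growing a per-registration data pointer: every probe gains a leading void * argument and each register_trace_*()/unregister_trace_*() call passes the matching cookie (NULL throughout this patch). A minimal sketch against the sched_switch tracepoint used above; probe_example, example_attach and my_cookie are illustrative names:

static void
probe_example(void *cookie, struct task_struct *prev, struct task_struct *next)
{
	/* cookie is whatever pointer was handed in at registration time */
}

static int example_attach(void *my_cookie)
{
	int ret;

	ret = register_trace_sched_switch(probe_example, my_cookie);
	if (ret)
		return ret;
	/* the same (probe, data) pair is required to unregister */
	unregister_trace_sched_switch(probe_example, my_cookie);
	return 0;
}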
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..4086eae6e81b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
46 struct trace_array_cpu *data; 46 struct trace_array_cpu *data;
47 unsigned long flags; 47 unsigned long flags;
48 long disabled; 48 long disabled;
49 int resched;
50 int cpu; 49 int cpu;
51 int pc; 50 int pc;
52 51
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
54 return; 53 return;
55 54
56 pc = preempt_count(); 55 pc = preempt_count();
57 resched = ftrace_preempt_disable(); 56 preempt_disable_notrace();
58 57
59 cpu = raw_smp_processor_id(); 58 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu) 59 if (cpu != wakeup_current_cpu)
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
74 out: 73 out:
75 atomic_dec(&data->disabled); 74 atomic_dec(&data->disabled);
76 out_enable: 75 out_enable:
77 ftrace_preempt_enable(resched); 76 preempt_enable_notrace();
78} 77}
79 78
80static struct ftrace_ops trace_ops __read_mostly = 79static struct ftrace_ops trace_ops __read_mostly =
@@ -98,7 +97,8 @@ static int report_latency(cycle_t delta)
98 return 1; 97 return 1;
99} 98}
100 99
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) 100static void
101probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
102{ 102{
103 if (task != wakeup_task) 103 if (task != wakeup_task)
104 return; 104 return;
@@ -107,8 +107,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 107}
108 108
109static void notrace 109static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(void *ignore,
111 struct task_struct *next) 111 struct task_struct *prev, struct task_struct *next)
112{ 112{
113 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
@@ -200,7 +200,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 200}
201 201
202static void 202static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 203probe_wakeup(void *ignore, struct task_struct *p, int success)
204{ 204{
205 struct trace_array_cpu *data; 205 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 206 int cpu = smp_processor_id();
@@ -264,28 +264,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
264{ 264{
265 int ret; 265 int ret;
266 266
267 ret = register_trace_sched_wakeup(probe_wakeup); 267 ret = register_trace_sched_wakeup(probe_wakeup, NULL);
268 if (ret) { 268 if (ret) {
269 pr_info("wakeup trace: Couldn't activate tracepoint" 269 pr_info("wakeup trace: Couldn't activate tracepoint"
270 " probe to kernel_sched_wakeup\n"); 270 " probe to kernel_sched_wakeup\n");
271 return; 271 return;
272 } 272 }
273 273
274 ret = register_trace_sched_wakeup_new(probe_wakeup); 274 ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
275 if (ret) { 275 if (ret) {
276 pr_info("wakeup trace: Couldn't activate tracepoint" 276 pr_info("wakeup trace: Couldn't activate tracepoint"
277 " probe to kernel_sched_wakeup_new\n"); 277 " probe to kernel_sched_wakeup_new\n");
278 goto fail_deprobe; 278 goto fail_deprobe;
279 } 279 }
280 280
281 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 281 ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
282 if (ret) { 282 if (ret) {
283 pr_info("sched trace: Couldn't activate tracepoint" 283 pr_info("sched trace: Couldn't activate tracepoint"
284 " probe to kernel_sched_switch\n"); 284 " probe to kernel_sched_switch\n");
285 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
286 } 286 }
287 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); 288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
289 if (ret) { 289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint" 290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n"); 291 " probe to kernel_sched_migrate_task\n");
@@ -312,19 +312,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
312 312
313 return; 313 return;
314fail_deprobe_wake_new: 314fail_deprobe_wake_new:
315 unregister_trace_sched_wakeup_new(probe_wakeup); 315 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
316fail_deprobe: 316fail_deprobe:
317 unregister_trace_sched_wakeup(probe_wakeup); 317 unregister_trace_sched_wakeup(probe_wakeup, NULL);
318} 318}
319 319
320static void stop_wakeup_tracer(struct trace_array *tr) 320static void stop_wakeup_tracer(struct trace_array *tr)
321{ 321{
322 tracer_enabled = 0; 322 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 323 unregister_ftrace_function(&trace_ops);
324 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup, NULL);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); 327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
328} 328}
329 329
330static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
@@ -382,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly =
382#ifdef CONFIG_FTRACE_SELFTEST 382#ifdef CONFIG_FTRACE_SELFTEST
383 .selftest = trace_selftest_startup_wakeup, 383 .selftest = trace_selftest_startup_wakeup,
384#endif 384#endif
385 .use_max_tr = 1,
385}; 386};
386 387
387static struct tracer wakeup_rt_tracer __read_mostly = 388static struct tracer wakeup_rt_tracer __read_mostly =
@@ -396,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
396#ifdef CONFIG_FTRACE_SELFTEST 397#ifdef CONFIG_FTRACE_SELFTEST
397 .selftest = trace_selftest_startup_wakeup, 398 .selftest = trace_selftest_startup_wakeup,
398#endif 399#endif
400 .use_max_tr = 1,
399}; 401};
400 402
401__init static int init_wakeup_tracer(void) 403__init static int init_wakeup_tracer(void)
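The trace_sched_wakeup.c hunks above replace the old resched = ftrace_preempt_disable() / ftrace_preempt_enable(resched) pair with plain preempt_disable_notrace()/preempt_enable_notrace(), so the callback no longer carries a saved resched flag. A minimal sketch of the resulting ftrace callback shape; my_trace_call and my_disabled are illustrative, not part of the patch:

static atomic_t my_disabled;

static void my_trace_call(unsigned long ip, unsigned long parent_ip)
{
	preempt_disable_notrace();	/* no saved 'resched' state any more */

	if (atomic_inc_return(&my_disabled) == 1) {
		/* record the event for raw_smp_processor_id() here */
	}
	atomic_dec(&my_disabled);

	preempt_enable_notrace();	/* plain re-enable, no argument */
}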
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 81003b4d617f..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,12 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_WAKE: 13 case TRACE_WAKE:
14 case TRACE_STACK: 14 case TRACE_STACK:
15 case TRACE_PRINT: 15 case TRACE_PRINT:
16 case TRACE_SPECIAL:
17 case TRACE_BRANCH: 16 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
20 case TRACE_HW_BRANCHES:
21 case TRACE_KSYM:
22 return 1; 19 return 1;
23 } 20 }
24 return 0; 21 return 0;
@@ -30,7 +27,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
30 struct trace_entry *entry; 27 struct trace_entry *entry;
31 unsigned int loops = 0; 28 unsigned int loops = 0;
32 29
33 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 30 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
34 entry = ring_buffer_event_data(event); 31 entry = ring_buffer_event_data(event);
35 32
36 /* 33 /*
@@ -256,7 +253,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
256/* Maximum number of functions to trace before diagnosing a hang */ 253/* Maximum number of functions to trace before diagnosing a hang */
257#define GRAPH_MAX_FUNC_TEST 100000000 254#define GRAPH_MAX_FUNC_TEST 100000000
258 255
259static void __ftrace_dump(bool disable_tracing); 256static void
257__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
260static unsigned int graph_hang_thresh; 258static unsigned int graph_hang_thresh;
261 259
262/* Wrap the real function entry probe to avoid possible hanging */ 260/* Wrap the real function entry probe to avoid possible hanging */
@@ -267,7 +265,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
267 ftrace_graph_stop(); 265 ftrace_graph_stop();
268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 266 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
269 if (ftrace_dump_on_oops) 267 if (ftrace_dump_on_oops)
270 __ftrace_dump(false); 268 __ftrace_dump(false, DUMP_ALL);
271 return 0; 269 return 0;
272 } 270 }
273 271
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
691} 689}
692#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 690#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
693 691
694#ifdef CONFIG_SYSPROF_TRACER
695int
696trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
697{
698 unsigned long count;
699 int ret;
700
701 /* start the tracing */
702 ret = tracer_init(trace, tr);
703 if (ret) {
704 warn_failed_init_tracer(trace, ret);
705 return ret;
706 }
707
708 /* Sleep for a 1/10 of a second */
709 msleep(100);
710 /* stop the tracing. */
711 tracing_stop();
712 /* check the trace buffer */
713 ret = trace_test_buffer(tr, &count);
714 trace->reset(tr);
715 tracing_start();
716
717 if (!ret && !count) {
718 printk(KERN_CONT ".. no entries found ..");
719 ret = -1;
720 }
721
722 return ret;
723}
724#endif /* CONFIG_SYSPROF_TRACER */
725
726#ifdef CONFIG_BRANCH_TRACER 692#ifdef CONFIG_BRANCH_TRACER
727int 693int
728trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) 694trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
@@ -755,112 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 721}
756#endif /* CONFIG_BRANCH_TRACER */ 722#endif /* CONFIG_BRANCH_TRACER */
757 723
758#ifdef CONFIG_HW_BRANCH_TRACER
759int
760trace_selftest_startup_hw_branches(struct tracer *trace,
761 struct trace_array *tr)
762{
763 struct trace_iterator *iter;
764 struct tracer tracer;
765 unsigned long count;
766 int ret;
767
768 if (!trace->open) {
769 printk(KERN_CONT "missing open function...");
770 return -1;
771 }
772
773 ret = tracer_init(trace, tr);
774 if (ret) {
775 warn_failed_init_tracer(trace, ret);
776 return ret;
777 }
778
779 /*
780 * The hw-branch tracer needs to collect the trace from the various
781 * cpu trace buffers - before tracing is stopped.
782 */
783 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
784 if (!iter)
785 return -ENOMEM;
786
787 memcpy(&tracer, trace, sizeof(tracer));
788
789 iter->trace = &tracer;
790 iter->tr = tr;
791 iter->pos = -1;
792 mutex_init(&iter->mutex);
793
794 trace->open(iter);
795
796 mutex_destroy(&iter->mutex);
797 kfree(iter);
798
799 tracing_stop();
800
801 ret = trace_test_buffer(tr, &count);
802 trace->reset(tr);
803 tracing_start();
804
805 if (!ret && !count) {
806 printk(KERN_CONT "no entries found..");
807 ret = -1;
808 }
809
810 return ret;
811}
812#endif /* CONFIG_HW_BRANCH_TRACER */
813
814#ifdef CONFIG_KSYM_TRACER
815static int ksym_selftest_dummy;
816
817int
818trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
819{
820 unsigned long count;
821 int ret;
822
823 /* start the tracing */
824 ret = tracer_init(trace, tr);
825 if (ret) {
826 warn_failed_init_tracer(trace, ret);
827 return ret;
828 }
829
830 ksym_selftest_dummy = 0;
831 /* Register the read-write tracing request */
832
833 ret = process_new_ksym_entry("ksym_selftest_dummy",
834 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
835 (unsigned long)(&ksym_selftest_dummy));
836
837 if (ret < 0) {
838 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
839 goto ret_path;
840 }
841 /* Perform a read and a write operation over the dummy variable to
842 * trigger the tracer
843 */
844 if (ksym_selftest_dummy == 0)
845 ksym_selftest_dummy++;
846
847 /* stop the tracing. */
848 tracing_stop();
849 /* check the trace buffer */
850 ret = trace_test_buffer(tr, &count);
851 trace->reset(tr);
852 tracing_start();
853
854 /* read & write operations - one each is performed on the dummy variable
855 * triggering two entries in the trace buffer
856 */
857 if (!ret && count != 2) {
858 printk(KERN_CONT "Ksym tracer startup test failed");
859 ret = -1;
860 }
861
862ret_path:
863 return ret;
864}
865#endif /* CONFIG_KSYM_TRACER */
866
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..a6b7e0e0f3eb 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
110static void 110static void
111stack_trace_call(unsigned long ip, unsigned long parent_ip) 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
112{ 112{
113 int cpu, resched; 113 int cpu;
114 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return; 116 return;
117 117
118 resched = ftrace_preempt_disable(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
121 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
127 out: 127 out:
128 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
129 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
130 ftrace_preempt_enable(resched); 130 preempt_enable_notrace();
131} 131}
132 132
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
@@ -249,7 +249,7 @@ static int trace_lookup_stack(struct seq_file *m, long i)
249{ 249{
250 unsigned long addr = stack_dump_trace[i]; 250 unsigned long addr = stack_dump_trace[i];
251 251
252 return seq_printf(m, "%pF\n", (void *)addr); 252 return seq_printf(m, "%pS\n", (void *)addr);
253} 253}
254 254
255static void print_disabled(struct seq_file *m) 255static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4d6d711717f2..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -15,6 +15,55 @@ static int sys_refcount_exit;
15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17 17
18static int syscall_enter_register(struct ftrace_event_call *event,
19 enum trace_reg type);
20static int syscall_exit_register(struct ftrace_event_call *event,
21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
29static struct list_head *
30syscall_get_enter_fields(struct ftrace_event_call *call)
31{
32 struct syscall_metadata *entry = call->data;
33
34 return &entry->enter_fields;
35}
36
37static struct list_head *
38syscall_get_exit_fields(struct ftrace_event_call *call)
39{
40 return &syscall_exit_fields;
41}
42
43struct trace_event_functions enter_syscall_print_funcs = {
44 .trace = print_syscall_enter,
45};
46
47struct trace_event_functions exit_syscall_print_funcs = {
48 .trace = print_syscall_exit,
49};
50
51struct ftrace_event_class event_class_syscall_enter = {
52 .system = "syscalls",
53 .reg = syscall_enter_register,
54 .define_fields = syscall_enter_define_fields,
55 .get_fields = syscall_get_enter_fields,
56 .raw_init = init_syscall_trace,
57};
58
59struct ftrace_event_class event_class_syscall_exit = {
60 .system = "syscalls",
61 .reg = syscall_exit_register,
62 .define_fields = syscall_exit_define_fields,
63 .get_fields = syscall_get_exit_fields,
64 .raw_init = init_syscall_trace,
65};
66
18extern unsigned long __start_syscalls_metadata[]; 67extern unsigned long __start_syscalls_metadata[];
19extern unsigned long __stop_syscalls_metadata[]; 68extern unsigned long __stop_syscalls_metadata[];
20 69
@@ -53,7 +102,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
53} 102}
54 103
55enum print_line_t 104enum print_line_t
56print_syscall_enter(struct trace_iterator *iter, int flags) 105print_syscall_enter(struct trace_iterator *iter, int flags,
106 struct trace_event *event)
57{ 107{
58 struct trace_seq *s = &iter->seq; 108 struct trace_seq *s = &iter->seq;
59 struct trace_entry *ent = iter->ent; 109 struct trace_entry *ent = iter->ent;
@@ -68,7 +118,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
68 if (!entry) 118 if (!entry)
69 goto end; 119 goto end;
70 120
71 if (entry->enter_event->id != ent->type) { 121 if (entry->enter_event->event.type != ent->type) {
72 WARN_ON_ONCE(1); 122 WARN_ON_ONCE(1);
73 goto end; 123 goto end;
74 } 124 }
@@ -105,7 +155,8 @@ end:
105} 155}
106 156
107enum print_line_t 157enum print_line_t
108print_syscall_exit(struct trace_iterator *iter, int flags) 158print_syscall_exit(struct trace_iterator *iter, int flags,
159 struct trace_event *event)
109{ 160{
110 struct trace_seq *s = &iter->seq; 161 struct trace_seq *s = &iter->seq;
111 struct trace_entry *ent = iter->ent; 162 struct trace_entry *ent = iter->ent;
@@ -123,7 +174,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
123 return TRACE_TYPE_HANDLED; 174 return TRACE_TYPE_HANDLED;
124 } 175 }
125 176
126 if (entry->exit_event->id != ent->type) { 177 if (entry->exit_event->event.type != ent->type) {
127 WARN_ON_ONCE(1); 178 WARN_ON_ONCE(1);
128 return TRACE_TYPE_UNHANDLED; 179 return TRACE_TYPE_UNHANDLED;
129 } 180 }
@@ -205,7 +256,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
205 kfree(call->print_fmt); 256 kfree(call->print_fmt);
206} 257}
207 258
208int syscall_enter_define_fields(struct ftrace_event_call *call) 259static int syscall_enter_define_fields(struct ftrace_event_call *call)
209{ 260{
210 struct syscall_trace_enter trace; 261 struct syscall_trace_enter trace;
211 struct syscall_metadata *meta = call->data; 262 struct syscall_metadata *meta = call->data;
@@ -228,7 +279,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
228 return ret; 279 return ret;
229} 280}
230 281
231int syscall_exit_define_fields(struct ftrace_event_call *call) 282static int syscall_exit_define_fields(struct ftrace_event_call *call)
232{ 283{
233 struct syscall_trace_exit trace; 284 struct syscall_trace_exit trace;
234 int ret; 285 int ret;
@@ -243,7 +294,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
243 return ret; 294 return ret;
244} 295}
245 296
246void ftrace_syscall_enter(struct pt_regs *regs, long id) 297void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
247{ 298{
248 struct syscall_trace_enter *entry; 299 struct syscall_trace_enter *entry;
249 struct syscall_metadata *sys_data; 300 struct syscall_metadata *sys_data;
@@ -265,7 +316,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
265 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 316 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
266 317
267 event = trace_current_buffer_lock_reserve(&buffer, 318 event = trace_current_buffer_lock_reserve(&buffer,
268 sys_data->enter_event->id, size, 0, 0); 319 sys_data->enter_event->event.type, size, 0, 0);
269 if (!event) 320 if (!event)
270 return; 321 return;
271 322
@@ -278,7 +329,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
278 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 329 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
279} 330}
280 331
281void ftrace_syscall_exit(struct pt_regs *regs, long ret) 332void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
282{ 333{
283 struct syscall_trace_exit *entry; 334 struct syscall_trace_exit *entry;
284 struct syscall_metadata *sys_data; 335 struct syscall_metadata *sys_data;
@@ -297,7 +348,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
297 return; 348 return;
298 349
299 event = trace_current_buffer_lock_reserve(&buffer, 350 event = trace_current_buffer_lock_reserve(&buffer,
300 sys_data->exit_event->id, sizeof(*entry), 0, 0); 351 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
301 if (!event) 352 if (!event)
302 return; 353 return;
303 354
@@ -320,7 +371,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
320 return -ENOSYS; 371 return -ENOSYS;
321 mutex_lock(&syscall_trace_lock); 372 mutex_lock(&syscall_trace_lock);
322 if (!sys_refcount_enter) 373 if (!sys_refcount_enter)
323 ret = register_trace_sys_enter(ftrace_syscall_enter); 374 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
324 if (!ret) { 375 if (!ret) {
325 set_bit(num, enabled_enter_syscalls); 376 set_bit(num, enabled_enter_syscalls);
326 sys_refcount_enter++; 377 sys_refcount_enter++;
@@ -340,7 +391,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
340 sys_refcount_enter--; 391 sys_refcount_enter--;
341 clear_bit(num, enabled_enter_syscalls); 392 clear_bit(num, enabled_enter_syscalls);
342 if (!sys_refcount_enter) 393 if (!sys_refcount_enter)
343 unregister_trace_sys_enter(ftrace_syscall_enter); 394 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
344 mutex_unlock(&syscall_trace_lock); 395 mutex_unlock(&syscall_trace_lock);
345} 396}
346 397
@@ -354,7 +405,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
354 return -ENOSYS; 405 return -ENOSYS;
355 mutex_lock(&syscall_trace_lock); 406 mutex_lock(&syscall_trace_lock);
356 if (!sys_refcount_exit) 407 if (!sys_refcount_exit)
357 ret = register_trace_sys_exit(ftrace_syscall_exit); 408 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
358 if (!ret) { 409 if (!ret) {
359 set_bit(num, enabled_exit_syscalls); 410 set_bit(num, enabled_exit_syscalls);
360 sys_refcount_exit++; 411 sys_refcount_exit++;
@@ -374,7 +425,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
374 sys_refcount_exit--; 425 sys_refcount_exit--;
375 clear_bit(num, enabled_exit_syscalls); 426 clear_bit(num, enabled_exit_syscalls);
376 if (!sys_refcount_exit) 427 if (!sys_refcount_exit)
377 unregister_trace_sys_exit(ftrace_syscall_exit); 428 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
378 mutex_unlock(&syscall_trace_lock); 429 mutex_unlock(&syscall_trace_lock);
379} 430}
380 431
@@ -434,11 +485,11 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
434static int sys_perf_refcount_enter; 485static int sys_perf_refcount_enter;
435static int sys_perf_refcount_exit; 486static int sys_perf_refcount_exit;
436 487
437static void perf_syscall_enter(struct pt_regs *regs, long id) 488static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
438{ 489{
439 struct syscall_metadata *sys_data; 490 struct syscall_metadata *sys_data;
440 struct syscall_trace_enter *rec; 491 struct syscall_trace_enter *rec;
441 unsigned long flags; 492 struct hlist_head *head;
442 int syscall_nr; 493 int syscall_nr;
443 int rctx; 494 int rctx;
444 int size; 495 int size;
@@ -461,14 +512,16 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
461 return; 512 return;
462 513
463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 514 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
464 sys_data->enter_event->id, &rctx, &flags); 515 sys_data->enter_event->event.type, regs, &rctx);
465 if (!rec) 516 if (!rec)
466 return; 517 return;
467 518
468 rec->nr = syscall_nr; 519 rec->nr = syscall_nr;
469 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 520 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
470 (unsigned long *)&rec->args); 521 (unsigned long *)&rec->args);
471 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 522
523 head = this_cpu_ptr(sys_data->enter_event->perf_events);
524 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
472} 525}
473 526
474int perf_sysenter_enable(struct ftrace_event_call *call) 527int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -480,7 +533,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
480 533
481 mutex_lock(&syscall_trace_lock); 534 mutex_lock(&syscall_trace_lock);
482 if (!sys_perf_refcount_enter) 535 if (!sys_perf_refcount_enter)
483 ret = register_trace_sys_enter(perf_syscall_enter); 536 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
484 if (ret) { 537 if (ret) {
485 pr_info("event trace: Could not activate" 538 pr_info("event trace: Could not activate"
486 "syscall entry trace point"); 539 "syscall entry trace point");
@@ -502,15 +555,15 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
502 sys_perf_refcount_enter--; 555 sys_perf_refcount_enter--;
503 clear_bit(num, enabled_perf_enter_syscalls); 556 clear_bit(num, enabled_perf_enter_syscalls);
504 if (!sys_perf_refcount_enter) 557 if (!sys_perf_refcount_enter)
505 unregister_trace_sys_enter(perf_syscall_enter); 558 unregister_trace_sys_enter(perf_syscall_enter, NULL);
506 mutex_unlock(&syscall_trace_lock); 559 mutex_unlock(&syscall_trace_lock);
507} 560}
508 561
509static void perf_syscall_exit(struct pt_regs *regs, long ret) 562static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
510{ 563{
511 struct syscall_metadata *sys_data; 564 struct syscall_metadata *sys_data;
512 struct syscall_trace_exit *rec; 565 struct syscall_trace_exit *rec;
513 unsigned long flags; 566 struct hlist_head *head;
514 int syscall_nr; 567 int syscall_nr;
515 int rctx; 568 int rctx;
516 int size; 569 int size;
@@ -536,14 +589,15 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
536 return; 589 return;
537 590
538 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 591 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
539 sys_data->exit_event->id, &rctx, &flags); 592 sys_data->exit_event->event.type, regs, &rctx);
540 if (!rec) 593 if (!rec)
541 return; 594 return;
542 595
543 rec->nr = syscall_nr; 596 rec->nr = syscall_nr;
544 rec->ret = syscall_get_return_value(current, regs); 597 rec->ret = syscall_get_return_value(current, regs);
545 598
546 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 599 head = this_cpu_ptr(sys_data->exit_event->perf_events);
600 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
547} 601}
548 602
549int perf_sysexit_enable(struct ftrace_event_call *call) 603int perf_sysexit_enable(struct ftrace_event_call *call)
@@ -555,7 +609,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
555 609
556 mutex_lock(&syscall_trace_lock); 610 mutex_lock(&syscall_trace_lock);
557 if (!sys_perf_refcount_exit) 611 if (!sys_perf_refcount_exit)
558 ret = register_trace_sys_exit(perf_syscall_exit); 612 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
559 if (ret) { 613 if (ret) {
560 pr_info("event trace: Could not activate" 614 pr_info("event trace: Could not activate"
561 "syscall exit trace point"); 615 "syscall exit trace point");
@@ -577,9 +631,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
577 sys_perf_refcount_exit--; 631 sys_perf_refcount_exit--;
578 clear_bit(num, enabled_perf_exit_syscalls); 632 clear_bit(num, enabled_perf_exit_syscalls);
579 if (!sys_perf_refcount_exit) 633 if (!sys_perf_refcount_exit)
580 unregister_trace_sys_exit(perf_syscall_exit); 634 unregister_trace_sys_exit(perf_syscall_exit, NULL);
581 mutex_unlock(&syscall_trace_lock); 635 mutex_unlock(&syscall_trace_lock);
582} 636}
583 637
584#endif /* CONFIG_PERF_EVENTS */ 638#endif /* CONFIG_PERF_EVENTS */
585 639
640static int syscall_enter_register(struct ftrace_event_call *event,
641 enum trace_reg type)
642{
643 switch (type) {
644 case TRACE_REG_REGISTER:
645 return reg_event_syscall_enter(event);
646 case TRACE_REG_UNREGISTER:
647 unreg_event_syscall_enter(event);
648 return 0;
649
650#ifdef CONFIG_PERF_EVENTS
651 case TRACE_REG_PERF_REGISTER:
652 return perf_sysenter_enable(event);
653 case TRACE_REG_PERF_UNREGISTER:
654 perf_sysenter_disable(event);
655 return 0;
656#endif
657 }
658 return 0;
659}
660
661static int syscall_exit_register(struct ftrace_event_call *event,
662 enum trace_reg type)
663{
664 switch (type) {
665 case TRACE_REG_REGISTER:
666 return reg_event_syscall_exit(event);
667 case TRACE_REG_UNREGISTER:
668 unreg_event_syscall_exit(event);
669 return 0;
670
671#ifdef CONFIG_PERF_EVENTS
672 case TRACE_REG_PERF_REGISTER:
673 return perf_sysexit_enable(event);
674 case TRACE_REG_PERF_UNREGISTER:
675 perf_sysexit_disable(event);
676 return 0;
677#endif
678 }
679 return 0;
680}
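The trace_syscalls.c additions above route both ftrace and perf enable/disable through an ftrace_event_class whose .reg() callback switches on enum trace_reg. A minimal sketch of that pattern for a hypothetical event class; my_reg, my_class and the my_*_attach/detach helpers are illustrative placeholders, not functions this patch defines:

static int my_reg(struct ftrace_event_call *event, enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return my_ftrace_attach(event);		/* placeholder helper */
	case TRACE_REG_UNREGISTER:
		my_ftrace_detach(event);		/* placeholder helper */
		return 0;
#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return my_perf_attach(event);		/* placeholder helper */
	case TRACE_REG_PERF_UNREGISTER:
		my_perf_detach(event);			/* placeholder helper */
		return 0;
#endif
	}
	return 0;
}

static struct ftrace_event_class my_class = {
	.system	= "my_subsys",		/* groups the events under one directory */
	.reg	= my_reg,		/* single mux point for ftrace and perf */
};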
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index a7974a552ca9..000000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,329 +0,0 @@
1/*
2 * trace stack traces
3 *
4 * Copyright (C) 2004-2008, Soeren Sandmann
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/kallsyms.h>
9#include <linux/debugfs.h>
10#include <linux/hrtimer.h>
11#include <linux/uaccess.h>
12#include <linux/ftrace.h>
13#include <linux/module.h>
14#include <linux/irq.h>
15#include <linux/fs.h>
16
17#include <asm/stacktrace.h>
18
19#include "trace.h"
20
21static struct trace_array *sysprof_trace;
22static int __read_mostly tracer_enabled;
23
24/*
25 * 1 msec sample interval by default:
26 */
27static unsigned long sample_period = 1000000;
28static const unsigned int sample_max_depth = 512;
29
30static DEFINE_MUTEX(sample_timer_lock);
31/*
32 * Per CPU hrtimers that do the profiling:
33 */
34static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
35
36struct stack_frame {
37 const void __user *next_fp;
38 unsigned long return_address;
39};
40
41static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
42{
43 int ret;
44
45 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
46 return 0;
47
48 ret = 1;
49 pagefault_disable();
50 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
51 ret = 0;
52 pagefault_enable();
53
54 return ret;
55}
56
57struct backtrace_info {
58 struct trace_array_cpu *data;
59 struct trace_array *tr;
60 int pos;
61};
62
63static void
64backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
65{
66 /* Ignore warnings */
67}
68
69static void backtrace_warning(void *data, char *msg)
70{
71 /* Ignore warnings */
72}
73
74static int backtrace_stack(void *data, char *name)
75{
76 /* Don't bother with IRQ stacks for now */
77 return -1;
78}
79
80static void backtrace_address(void *data, unsigned long addr, int reliable)
81{
82 struct backtrace_info *info = data;
83
84 if (info->pos < sample_max_depth && reliable) {
85 __trace_special(info->tr, info->data, 1, addr, 0);
86
87 info->pos++;
88 }
89}
90
91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack,
95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
97};
98
99static int
100trace_kernel(struct pt_regs *regs, struct trace_array *tr,
101 struct trace_array_cpu *data)
102{
103 struct backtrace_info info;
104 unsigned long bp;
105 char *stack;
106
107 info.tr = tr;
108 info.data = data;
109 info.pos = 1;
110
111 __trace_special(info.tr, info.data, 1, regs->ip, 0);
112
113 stack = ((char *)regs + sizeof(struct pt_regs));
114#ifdef CONFIG_FRAME_POINTER
115 bp = regs->bp;
116#else
117 bp = 0;
118#endif
119
120 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
121
122 return info.pos;
123}
124
125static void timer_notify(struct pt_regs *regs, int cpu)
126{
127 struct trace_array_cpu *data;
128 struct stack_frame frame;
129 struct trace_array *tr;
130 const void __user *fp;
131 int is_user;
132 int i;
133
134 if (!regs)
135 return;
136
137 tr = sysprof_trace;
138 data = tr->data[cpu];
139 is_user = user_mode(regs);
140
141 if (!current || current->pid == 0)
142 return;
143
144 if (is_user && current->state != TASK_RUNNING)
145 return;
146
147 __trace_special(tr, data, 0, 0, current->pid);
148
149 if (!is_user)
150 i = trace_kernel(regs, tr, data);
151 else
152 i = 0;
153
154 /*
155 * Trace user stack if we are not a kernel thread
156 */
157 if (current->mm && i < sample_max_depth) {
158 regs = (struct pt_regs *)current->thread.sp0 - 1;
159
160 fp = (void __user *)regs->bp;
161
162 __trace_special(tr, data, 2, regs->ip, 0);
163
164 while (i < sample_max_depth) {
165 frame.next_fp = NULL;
166 frame.return_address = 0;
167 if (!copy_stack_frame(fp, &frame))
168 break;
169 if ((unsigned long)fp < regs->sp)
170 break;
171
172 __trace_special(tr, data, 2, frame.return_address,
173 (unsigned long)fp);
174 fp = frame.next_fp;
175
176 i++;
177 }
178
179 }
180
181 /*
182 * Special trace entry if we overflow the max depth:
183 */
184 if (i == sample_max_depth)
185 __trace_special(tr, data, -1, -1, -1);
186
187 __trace_special(tr, data, 3, current->pid, i);
188}
189
190static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
191{
192 /* trace here */
193 timer_notify(get_irq_regs(), smp_processor_id());
194
195 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
196
197 return HRTIMER_RESTART;
198}
199
200static void start_stack_timer(void *unused)
201{
202 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
203
204 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 hrtimer->function = stack_trace_timer_fn;
206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
208 HRTIMER_MODE_REL_PINNED);
209}
210
211static void start_stack_timers(void)
212{
213 on_each_cpu(start_stack_timer, NULL, 1);
214}
215
216static void stop_stack_timer(int cpu)
217{
218 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
219
220 hrtimer_cancel(hrtimer);
221}
222
223static void stop_stack_timers(void)
224{
225 int cpu;
226
227 for_each_online_cpu(cpu)
228 stop_stack_timer(cpu);
229}
230
231static void stop_stack_trace(struct trace_array *tr)
232{
233 mutex_lock(&sample_timer_lock);
234 stop_stack_timers();
235 tracer_enabled = 0;
236 mutex_unlock(&sample_timer_lock);
237}
238
239static int stack_trace_init(struct trace_array *tr)
240{
241 sysprof_trace = tr;
242
243 tracing_start_cmdline_record();
244
245 mutex_lock(&sample_timer_lock);
246 start_stack_timers();
247 tracer_enabled = 1;
248 mutex_unlock(&sample_timer_lock);
249 return 0;
250}
251
252static void stack_trace_reset(struct trace_array *tr)
253{
254 tracing_stop_cmdline_record();
255 stop_stack_trace(tr);
256}
257
258static struct tracer stack_trace __read_mostly =
259{
260 .name = "sysprof",
261 .init = stack_trace_init,
262 .reset = stack_trace_reset,
263#ifdef CONFIG_FTRACE_SELFTEST
264 .selftest = trace_selftest_startup_sysprof,
265#endif
266};
267
268__init static int init_stack_trace(void)
269{
270 return register_tracer(&stack_trace);
271}
272device_initcall(init_stack_trace);
273
274#define MAX_LONG_DIGITS 22
275
276static ssize_t
277sysprof_sample_read(struct file *filp, char __user *ubuf,
278 size_t cnt, loff_t *ppos)
279{
280 char buf[MAX_LONG_DIGITS];
281 int r;
282
283 r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
284
285 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
286}
287
288static ssize_t
289sysprof_sample_write(struct file *filp, const char __user *ubuf,
290 size_t cnt, loff_t *ppos)
291{
292 char buf[MAX_LONG_DIGITS];
293 unsigned long val;
294
295 if (cnt > MAX_LONG_DIGITS-1)
296 cnt = MAX_LONG_DIGITS-1;
297
298 if (copy_from_user(&buf, ubuf, cnt))
299 return -EFAULT;
300
301 buf[cnt] = 0;
302
303 val = simple_strtoul(buf, NULL, 10);
304 /*
305 * Enforce a minimum sample period of 100 usecs:
306 */
307 if (val < 100)
308 val = 100;
309
310 mutex_lock(&sample_timer_lock);
311 stop_stack_timers();
312 sample_period = val * 1000;
313 start_stack_timers();
314 mutex_unlock(&sample_timer_lock);
315
316 return cnt;
317}
318
319static const struct file_operations sysprof_sample_fops = {
320 .read = sysprof_sample_read,
321 .write = sysprof_sample_write,
322};
323
324void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
325{
326
327 trace_create_file("sysprof_sample_period", 0644,
328 d_tracer, NULL, &sysprof_sample_fops);
329}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cc2d2faa7d9e..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
49 49
50/* Insertion of a work */ 50/* Insertion of a work */
51static void 51static void
52probe_workqueue_insertion(struct task_struct *wq_thread, 52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
53 struct work_struct *work) 54 struct work_struct *work)
54{ 55{
55 int cpu = cpumask_first(&wq_thread->cpus_allowed); 56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -70,7 +71,8 @@ found:
70 71
71/* Execution of a work */ 72/* Execution of a work */
72static void 73static void
73probe_workqueue_execution(struct task_struct *wq_thread, 74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
74 struct work_struct *work) 76 struct work_struct *work)
75{ 77{
76 int cpu = cpumask_first(&wq_thread->cpus_allowed); 78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -90,7 +92,8 @@ found:
90} 92}
91 93
92/* Creation of a cpu workqueue thread */ 94/* Creation of a cpu workqueue thread */
93static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) 95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
94{ 97{
95 struct cpu_workqueue_stats *cws; 98 struct cpu_workqueue_stats *cws;
96 unsigned long flags; 99 unsigned long flags;
@@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
114} 117}
115 118
116/* Destruction of a cpu workqueue thread */ 119/* Destruction of a cpu workqueue thread */
117static void probe_workqueue_destruction(struct task_struct *wq_thread) 120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
118{ 122{
119 /* Workqueue only execute on one cpu */ 123 /* Workqueue only execute on one cpu */
120 int cpu = cpumask_first(&wq_thread->cpus_allowed); 124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void)
259{ 263{
260 int ret, cpu; 264 int ret, cpu;
261 265
262 ret = register_trace_workqueue_insertion(probe_workqueue_insertion); 266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
263 if (ret) 267 if (ret)
264 goto out; 268 goto out;
265 269
266 ret = register_trace_workqueue_execution(probe_workqueue_execution); 270 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
267 if (ret) 271 if (ret)
268 goto no_insertion; 272 goto no_insertion;
269 273
270 ret = register_trace_workqueue_creation(probe_workqueue_creation); 274 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
271 if (ret) 275 if (ret)
272 goto no_execution; 276 goto no_execution;
273 277
274 ret = register_trace_workqueue_destruction(probe_workqueue_destruction); 278 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
275 if (ret) 279 if (ret)
276 goto no_creation; 280 goto no_creation;
277 281
@@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void)
283 return 0; 287 return 0;
284 288
285no_creation: 289no_creation:
286 unregister_trace_workqueue_creation(probe_workqueue_creation); 290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
287no_execution: 291no_execution:
288 unregister_trace_workqueue_execution(probe_workqueue_execution); 292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
289no_insertion: 293no_insertion:
290 unregister_trace_workqueue_insertion(probe_workqueue_insertion); 294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
291out: 295out:
292 pr_warning("trace_workqueue: unable to trace workqueues\n"); 296 pr_warning("trace_workqueue: unable to trace workqueues\n");
293 297
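The kernel/tracepoint.c hunks below convert the bare void ** probe arrays into NULL-terminated struct tracepoint_func pairs, so each probe travels with its registration data. A minimal sketch of walking such an array, modelled on the debug_print_probes() change below; dump_probes is an illustrative name and assumes the 2.6.36 struct tracepoint_func { void *func; void *data; } layout:

static void dump_probes(struct tracepoint_func *funcs)
{
	int i;

	/* the array is terminated by an entry whose .func is NULL */
	for (i = 0; funcs && funcs[i].func; i++)
		printk(KERN_DEBUG "probe %d: %p (data %p)\n",
		       i, funcs[i].func, funcs[i].data);
}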
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
54 */ 54 */
55struct tracepoint_entry { 55struct tracepoint_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 void **funcs; 57 struct tracepoint_func *funcs;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; 59 char name[0];
60}; 60};
@@ -64,12 +64,12 @@ struct tp_probes {
64 struct rcu_head rcu; 64 struct rcu_head rcu;
65 struct list_head list; 65 struct list_head list;
66 } u; 66 } u;
67 void *probes[0]; 67 struct tracepoint_func probes[0];
68}; 68};
69 69
70static inline void *allocate_probes(int count) 70static inline void *allocate_probes(int count)
71{ 71{
72 struct tp_probes *p = kmalloc(count * sizeof(void *) 72 struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
73 + sizeof(struct tp_probes), GFP_KERNEL); 73 + sizeof(struct tp_probes), GFP_KERNEL);
74 return p == NULL ? NULL : p->probes; 74 return p == NULL ? NULL : p->probes;
75} 75}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
79 kfree(container_of(head, struct tp_probes, u.rcu)); 79 kfree(container_of(head, struct tp_probes, u.rcu));
80} 80}
81 81
82static inline void release_probes(void *old) 82static inline void release_probes(struct tracepoint_func *old)
83{ 83{
84 if (old) { 84 if (old) {
85 struct tp_probes *tp_probes = container_of(old, 85 struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
95 if (!tracepoint_debug || !entry->funcs) 95 if (!tracepoint_debug || !entry->funcs)
96 return; 96 return;
97 97
98 for (i = 0; entry->funcs[i]; i++) 98 for (i = 0; entry->funcs[i].func; i++)
99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); 99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
100} 100}
101 101
102static void * 102static struct tracepoint_func *
103tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) 103tracepoint_entry_add_probe(struct tracepoint_entry *entry,
104 void *probe, void *data)
104{ 105{
105 int nr_probes = 0; 106 int nr_probes = 0;
106 void **old, **new; 107 struct tracepoint_func *old, *new;
107 108
108 WARN_ON(!probe); 109 WARN_ON(!probe);
109 110
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 old = entry->funcs; 112 old = entry->funcs;
112 if (old) { 113 if (old) {
113 /* (N -> N+1), (N != 0, 1) probes */ 114 /* (N -> N+1), (N != 0, 1) probes */
114 for (nr_probes = 0; old[nr_probes]; nr_probes++) 115 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
115 if (old[nr_probes] == probe) 116 if (old[nr_probes].func == probe &&
117 old[nr_probes].data == data)
116 return ERR_PTR(-EEXIST); 118 return ERR_PTR(-EEXIST);
117 } 119 }
118 /* + 2 : one for new probe, one for NULL func */ 120 /* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
120 if (new == NULL) 122 if (new == NULL)
121 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
122 if (old) 124 if (old)
123 memcpy(new, old, nr_probes * sizeof(void *)); 125 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
124 new[nr_probes] = probe; 126 new[nr_probes].func = probe;
125 new[nr_probes + 1] = NULL; 127 new[nr_probes].data = data;
128 new[nr_probes + 1].func = NULL;
126 entry->refcount = nr_probes + 1; 129 entry->refcount = nr_probes + 1;
127 entry->funcs = new; 130 entry->funcs = new;
128 debug_print_probes(entry); 131 debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
130} 133}
131 134
132static void * 135static void *
133tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) 136tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
137 void *probe, void *data)
134{ 138{
135 int nr_probes = 0, nr_del = 0, i; 139 int nr_probes = 0, nr_del = 0, i;
136 void **old, **new; 140 struct tracepoint_func *old, *new;
137 141
138 old = entry->funcs; 142 old = entry->funcs;
139 143
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
142 146
143 debug_print_probes(entry); 147 debug_print_probes(entry);
144 /* (N -> M), (N > 1, M >= 0) probes */ 148 /* (N -> M), (N > 1, M >= 0) probes */
145 for (nr_probes = 0; old[nr_probes]; nr_probes++) { 149 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
146 if ((!probe || old[nr_probes] == probe)) 150 if (!probe ||
151 (old[nr_probes].func == probe &&
152 old[nr_probes].data == data))
147 nr_del++; 153 nr_del++;
148 } 154 }
149 155
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
160 new = allocate_probes(nr_probes - nr_del + 1); 166 new = allocate_probes(nr_probes - nr_del + 1);
161 if (new == NULL) 167 if (new == NULL)
162 return ERR_PTR(-ENOMEM); 168 return ERR_PTR(-ENOMEM);
163 for (i = 0; old[i]; i++) 169 for (i = 0; old[i].func; i++)
164 if ((probe && old[i] != probe)) 170 if (probe &&
171 (old[i].func != probe || old[i].data != data))
165 new[j++] = old[i]; 172 new[j++] = old[i];
166 new[nr_probes - nr_del] = NULL; 173 new[nr_probes - nr_del].func = NULL;
167 entry->refcount = nr_probes - nr_del; 174 entry->refcount = nr_probes - nr_del;
168 entry->funcs = new; 175 entry->funcs = new;
169 } 176 }
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
315 module_update_tracepoints(); 322 module_update_tracepoints();
316} 323}
317 324
318static void *tracepoint_add_probe(const char *name, void *probe) 325static struct tracepoint_func *
326tracepoint_add_probe(const char *name, void *probe, void *data)
319{ 327{
320 struct tracepoint_entry *entry; 328 struct tracepoint_entry *entry;
321 void *old; 329 struct tracepoint_func *old;
322 330
323 entry = get_tracepoint(name); 331 entry = get_tracepoint(name);
324 if (!entry) { 332 if (!entry) {
325 entry = add_tracepoint(name); 333 entry = add_tracepoint(name);
326 if (IS_ERR(entry)) 334 if (IS_ERR(entry))
327 return entry; 335 return (struct tracepoint_func *)entry;
328 } 336 }
329 old = tracepoint_entry_add_probe(entry, probe); 337 old = tracepoint_entry_add_probe(entry, probe, data);
330 if (IS_ERR(old) && !entry->refcount) 338 if (IS_ERR(old) && !entry->refcount)
331 remove_tracepoint(entry); 339 remove_tracepoint(entry);
332 return old; 340 return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
340 * Returns 0 if ok, error value on error. 348 * Returns 0 if ok, error value on error.
341 * The probe address must at least be aligned on the architecture pointer size. 349 * The probe address must at least be aligned on the architecture pointer size.
342 */ 350 */
343int tracepoint_probe_register(const char *name, void *probe) 351int tracepoint_probe_register(const char *name, void *probe, void *data)
344{ 352{
345 void *old; 353 struct tracepoint_func *old;
346 354
347 mutex_lock(&tracepoints_mutex); 355 mutex_lock(&tracepoints_mutex);
348 old = tracepoint_add_probe(name, probe); 356 old = tracepoint_add_probe(name, probe, data);
349 mutex_unlock(&tracepoints_mutex); 357 mutex_unlock(&tracepoints_mutex);
350 if (IS_ERR(old)) 358 if (IS_ERR(old))
351 return PTR_ERR(old); 359 return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
356} 364}
357EXPORT_SYMBOL_GPL(tracepoint_probe_register); 365EXPORT_SYMBOL_GPL(tracepoint_probe_register);
358 366
359static void *tracepoint_remove_probe(const char *name, void *probe) 367static struct tracepoint_func *
368tracepoint_remove_probe(const char *name, void *probe, void *data)
360{ 369{
361 struct tracepoint_entry *entry; 370 struct tracepoint_entry *entry;
362 void *old; 371 struct tracepoint_func *old;
363 372
364 entry = get_tracepoint(name); 373 entry = get_tracepoint(name);
365 if (!entry) 374 if (!entry)
366 return ERR_PTR(-ENOENT); 375 return ERR_PTR(-ENOENT);
367 old = tracepoint_entry_remove_probe(entry, probe); 376 old = tracepoint_entry_remove_probe(entry, probe, data);
368 if (IS_ERR(old)) 377 if (IS_ERR(old))
369 return old; 378 return old;
370 if (!entry->refcount) 379 if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
382 * itself uses stop_machine(), which ensures that every preempt disabled section 391 * itself uses stop_machine(), which ensures that every preempt disabled section
383 * has finished. 392 * has finished.
384 */ 393 */
385int tracepoint_probe_unregister(const char *name, void *probe) 394int tracepoint_probe_unregister(const char *name, void *probe, void *data)
386{ 395{
387 void *old; 396 struct tracepoint_func *old;
388 397
389 mutex_lock(&tracepoints_mutex); 398 mutex_lock(&tracepoints_mutex);
390 old = tracepoint_remove_probe(name, probe); 399 old = tracepoint_remove_probe(name, probe, data);
391 mutex_unlock(&tracepoints_mutex); 400 mutex_unlock(&tracepoints_mutex);
392 if (IS_ERR(old)) 401 if (IS_ERR(old))
393 return PTR_ERR(old); 402 return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
418 * 427 *
419 * caller must call tracepoint_probe_update_all() 428 * caller must call tracepoint_probe_update_all()
420 */ 429 */
421int tracepoint_probe_register_noupdate(const char *name, void *probe) 430int tracepoint_probe_register_noupdate(const char *name, void *probe,
431 void *data)
422{ 432{
423 void *old; 433 struct tracepoint_func *old;
424 434
425 mutex_lock(&tracepoints_mutex); 435 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_add_probe(name, probe); 436 old = tracepoint_add_probe(name, probe, data);
427 if (IS_ERR(old)) { 437 if (IS_ERR(old)) {
428 mutex_unlock(&tracepoints_mutex); 438 mutex_unlock(&tracepoints_mutex);
429 return PTR_ERR(old); 439 return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
441 * 451 *
442 * caller must call tracepoint_probe_update_all() 452 * caller must call tracepoint_probe_update_all()
443 */ 453 */
444int tracepoint_probe_unregister_noupdate(const char *name, void *probe) 454int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
455 void *data)
445{ 456{
446 void *old; 457 struct tracepoint_func *old;
447 458
448 mutex_lock(&tracepoints_mutex); 459 mutex_lock(&tracepoints_mutex);
449 old = tracepoint_remove_probe(name, probe); 460 old = tracepoint_remove_probe(name, probe, data);
450 if (IS_ERR(old)) { 461 if (IS_ERR(old)) {
451 mutex_unlock(&tracepoints_mutex); 462 mutex_unlock(&tracepoints_mutex);
452 return PTR_ERR(old); 463 return PTR_ERR(old);
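
The hunk above extends the tracepoint probe API so that each probe carries a private data pointer: probes are stored as struct tracepoint_func { .func, .data } pairs, and register/unregister now take a (name, probe, data) triple. A minimal caller-side sketch of the new convention follows; the tracepoint name, the probe's argument list and all "my_*" identifiers are illustrative assumptions, not part of this diff (a real probe must mirror the tracepoint's TP_PROTO).

#include <linux/module.h>
#include <linux/tracepoint.h>

/* Sketch only: hypothetical probe with private data, registered through
 * the (name, probe, data) API introduced above. */
static unsigned long my_hits;

static void my_probe(void *data, int value)
{
	unsigned long *hits = data;	/* the pointer given at register time */

	(*hits)++;
}

static int __init my_probe_init(void)
{
	/* stored as one struct tracepoint_func { .func, .data } entry */
	return tracepoint_probe_register("my_subsys_event",
					 (void *)my_probe, &my_hits);
}

static void __exit my_probe_exit(void)
{
	/* removal matches on the (probe, data) pair, not the probe alone */
	tracepoint_probe_unregister("my_subsys_event",
				    (void *)my_probe, &my_hits);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");

In-tree callers typically reach these functions through the register_trace_<name>()/unregister_trace_<name>() wrappers generated by DECLARE_TRACE(), which forward the extra data argument.
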
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb7..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
137 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
138 struct user_struct *up, *new; 137 struct user_struct *up, *new;
139 138
140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
141 * atomic.
142 */
143 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
144 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
145 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
161 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
162 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
163 if (up) { 159 if (up) {
164 /* This case is not possible when CONFIG_USER_SCHED
165 * is defined, since we serialize alloc_uid() using
166 * uids_mutex. Hence no need to call
167 * sched_destroy_user() or remove_user_sysfs_dir().
168 */
169 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
170 key_put(new->session_keyring); 161 key_put(new->session_keyring);
171 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
178 169
179 return up; 170 return up;
180 171
181 put_user_ns(new->user_ns);
182 kmem_cache_free(uid_cachep, new);
183out_unlock: 172out_unlock:
184 return NULL; 173 return NULL;
185} 174}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8215b0..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/highuid.h>
12#include <linux/cred.h> 13#include <linux/cred.h>
13 14
14/* 15/*
@@ -54,8 +55,8 @@ int create_user_ns(struct cred *new)
54#endif 55#endif
55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 56 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
56 57
57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */ 58 /* root_user holds a reference to ns, our reference can be dropped */
58 kref_set(&ns->kref, 1); 59 put_user_ns(ns);
59 60
60 return 0; 61 return 0;
61} 62}
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
82 schedule_work(&ns->destroyer); 83 schedule_work(&ns->destroyer);
83} 84}
84EXPORT_SYMBOL(free_user_ns); 85EXPORT_SYMBOL(free_user_ns);
86
87uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
88{
89 struct user_namespace *tmp;
90
91 if (likely(to == cred->user->user_ns))
92 return uid;
93
94
95 /* Is cred->user the creator of the target user_ns
96 * or the creator of one of its parents?
97 */
98 for ( tmp = to; tmp != &init_user_ns;
99 tmp = tmp->creator->user_ns ) {
100 if (cred->user == tmp->creator) {
101 return (uid_t)0;
102 }
103 }
104
105 /* No useful relationship so no mapping */
106 return overflowuid;
107}
108
109gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
110{
111 struct user_namespace *tmp;
112
113 if (likely(to == cred->user->user_ns))
114 return gid;
115
116 /* Is cred->user the creator of the target user_ns
117 * or the creator of one of its parents?
118 */
119 for ( tmp = to; tmp != &init_user_ns;
120 tmp = tmp->creator->user_ns ) {
121 if (cred->user == tmp->creator) {
122 return (gid_t)0;
123 }
124 }
125
126 /* No useful relationship so no mapping */
127 return overflowgid;
128}
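
The two helpers added above give a deliberately coarse uid/gid mapping between user namespaces: identity, root, or the overflow id. A hedged sketch of how a caller might use them is below; the wrapper and its parameter names are hypothetical and not part of this diff.

#include <linux/cred.h>
#include <linux/user_namespace.h>

/* Sketch only: translate ownership ids before presenting them to a task
 * whose credentials sit in another user namespace. */
static void ids_as_seen_in(struct user_namespace *viewer_ns,
			   const struct cred *owner_cred,
			   uid_t *uid, gid_t *gid)
{
	/* unchanged when owner_cred's user lives in viewer_ns;
	 * 0 (root) when owner_cred->user created viewer_ns or one of
	 * its ancestors; otherwise overflowuid/overflowgid, meaning
	 * "no useful relationship, no mapping" */
	*uid = user_ns_map_uid(viewer_ns, owner_cred, *uid);
	*gid = user_ns_map_gid(viewer_ns, owner_cred, *gid);
}
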
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000000..7f9c3c52ecc1
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,577 @@
1/*
2 * Detect hard and soft lockups on a system
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
6 * this code detects hard lockups: incidents where, on a CPU,
7 * the kernel does not respond to anything except NMI.
8 *
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well.
13 */
14
15#include <linux/mm.h>
16#include <linux/cpu.h>
17#include <linux/nmi.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/freezer.h>
21#include <linux/kthread.h>
22#include <linux/lockdep.h>
23#include <linux/notifier.h>
24#include <linux/module.h>
25#include <linux/sysctl.h>
26
27#include <asm/irq_regs.h>
28#include <linux/perf_event.h>
29
30int watchdog_enabled;
31int __read_mostly softlockup_thresh = 60;
32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif
45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog;
48
49
50/* boot commands */
51/*
52 * Should we panic when a soft-lockup or hard-lockup occurs:
53 */
54#ifdef CONFIG_HARDLOCKUP_DETECTOR
55static int hardlockup_panic;
56
57static int __init hardlockup_panic_setup(char *str)
58{
59 if (!strncmp(str, "panic", 5))
60 hardlockup_panic = 1;
61 return 1;
62}
63__setup("nmi_watchdog=", hardlockup_panic_setup);
64#endif
65
66unsigned int __read_mostly softlockup_panic =
67 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
68
69static int __init softlockup_panic_setup(char *str)
70{
71 softlockup_panic = simple_strtoul(str, NULL, 0);
72
73 return 1;
74}
75__setup("softlockup_panic=", softlockup_panic_setup);
76
77static int __init nowatchdog_setup(char *str)
78{
79 no_watchdog = 1;
80 return 1;
81}
82__setup("nowatchdog", nowatchdog_setup);
83
84/* deprecated */
85static int __init nosoftlockup_setup(char *str)
86{
87 no_watchdog = 1;
88 return 1;
89}
90__setup("nosoftlockup", nosoftlockup_setup);
91/* */
92
93
94/*
95 * Returns seconds, approximately. We don't need nanosecond
96 * resolution, and we don't need to waste time with a big divide when
97 * 2^30ns == 1.074s.
98 */
99static unsigned long get_timestamp(int this_cpu)
100{
101 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
102}
103
104static unsigned long get_sample_period(void)
105{
106 /*
107 * convert softlockup_thresh from seconds to ns
108 * the divide by 5 is to give hrtimer 5 chances to
109 * increment before the hardlockup detector generates
110 * a warning
111 */
112 return softlockup_thresh / 5 * NSEC_PER_SEC;
113}
114
115/* Commands for resetting the watchdog */
116static void __touch_watchdog(void)
117{
118 int this_cpu = smp_processor_id();
119
120 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
121}
122
123void touch_softlockup_watchdog(void)
124{
125 __raw_get_cpu_var(watchdog_touch_ts) = 0;
126}
127EXPORT_SYMBOL(touch_softlockup_watchdog);
128
129void touch_all_softlockup_watchdogs(void)
130{
131 int cpu;
132
133 /*
134 * this is done lockless
135 * do we care if a 0 races with a timestamp?
136 * all it means is the softlockup check starts one cycle later
137 */
138 for_each_online_cpu(cpu)
139 per_cpu(watchdog_touch_ts, cpu) = 0;
140}
141
142#ifdef CONFIG_HARDLOCKUP_DETECTOR
143void touch_nmi_watchdog(void)
144{
145 if (watchdog_enabled) {
146 unsigned cpu;
147
148 for_each_present_cpu(cpu) {
149 if (per_cpu(watchdog_nmi_touch, cpu) != true)
150 per_cpu(watchdog_nmi_touch, cpu) = true;
151 }
152 }
153 touch_softlockup_watchdog();
154}
155EXPORT_SYMBOL(touch_nmi_watchdog);
156
157#endif
158
159void touch_softlockup_watchdog_sync(void)
160{
161 __raw_get_cpu_var(softlockup_touch_sync) = true;
162 __raw_get_cpu_var(watchdog_touch_ts) = 0;
163}
164
165#ifdef CONFIG_HARDLOCKUP_DETECTOR
166/* watchdog detector functions */
167static int is_hardlockup(void)
168{
169 unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
170
171 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
172 return 1;
173
174 __get_cpu_var(hrtimer_interrupts_saved) = hrint;
175 return 0;
176}
177#endif
178
179static int is_softlockup(unsigned long touch_ts)
180{
181 unsigned long now = get_timestamp(smp_processor_id());
182
183 /* Warn about unreasonable delays: */
184 if (time_after(now, touch_ts + softlockup_thresh))
185 return now - touch_ts;
186
187 return 0;
188}
189
190static int
191watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
192{
193 did_panic = 1;
194
195 return NOTIFY_DONE;
196}
197
198static struct notifier_block panic_block = {
199 .notifier_call = watchdog_panic,
200};
201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE,
205 .config = PERF_COUNT_HW_CPU_CYCLES,
206 .size = sizeof(struct perf_event_attr),
207 .pinned = 1,
208 .disabled = 1,
209};
210
211/* Callback function for perf event subsystem */
212void watchdog_overflow_callback(struct perf_event *event, int nmi,
213 struct perf_sample_data *data,
214 struct pt_regs *regs)
215{
216 /* Ensure the watchdog never gets throttled */
217 event->hw.interrupts = 0;
218
219 if (__get_cpu_var(watchdog_nmi_touch) == true) {
220 __get_cpu_var(watchdog_nmi_touch) = false;
221 return;
222 }
223
224 /* check for a hardlockup
225 * This is done by making sure our timer interrupt
226 * is incrementing. The timer interrupt should have
227 * fired multiple times before we overflowed. If it hasn't
228 * then this is a good indication the cpu is stuck
229 */
230 if (is_hardlockup()) {
231 int this_cpu = smp_processor_id();
232
233 /* only print hardlockups once */
234 if (__get_cpu_var(hard_watchdog_warn) == true)
235 return;
236
237 if (hardlockup_panic)
238 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
239 else
240 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
241
242 __get_cpu_var(hard_watchdog_warn) = true;
243 return;
244 }
245
246 __get_cpu_var(hard_watchdog_warn) = false;
247 return;
248}
249static void watchdog_interrupt_count(void)
250{
251 __get_cpu_var(hrtimer_interrupts)++;
252}
253#else
254static inline void watchdog_interrupt_count(void) { return; }
255#endif /* CONFIG_HARDLOCKUP_DETECTOR */
256
257/* watchdog kicker functions */
258static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
259{
260 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
261 struct pt_regs *regs = get_irq_regs();
262 int duration;
263
264 /* kick the hardlockup detector */
265 watchdog_interrupt_count();
266
267 /* kick the softlockup detector */
268 wake_up_process(__get_cpu_var(softlockup_watchdog));
269
270 /* .. and repeat */
271 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
272
273 if (touch_ts == 0) {
274 if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
275 /*
276 * If the time stamp was touched atomically
277 * make sure the scheduler tick is up to date.
278 */
279 __get_cpu_var(softlockup_touch_sync) = false;
280 sched_clock_tick();
281 }
282 __touch_watchdog();
283 return HRTIMER_RESTART;
284 }
285
286 /* check for a softlockup
287 * This is done by making sure a high priority task is
288 * being scheduled. The task touches the watchdog to
289 * indicate it is getting cpu time. If it hasn't then
290 * this is a good indication some task is hogging the cpu
291 */
292 duration = is_softlockup(touch_ts);
293 if (unlikely(duration)) {
294 /* only warn once */
295 if (__get_cpu_var(soft_watchdog_warn) == true)
296 return HRTIMER_RESTART;
297
298 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
299 smp_processor_id(), duration,
300 current->comm, task_pid_nr(current));
301 print_modules();
302 print_irqtrace_events(current);
303 if (regs)
304 show_regs(regs);
305 else
306 dump_stack();
307
308 if (softlockup_panic)
309 panic("softlockup: hung tasks");
310 __get_cpu_var(soft_watchdog_warn) = true;
311 } else
312 __get_cpu_var(soft_watchdog_warn) = false;
313
314 return HRTIMER_RESTART;
315}
316
317
318/*
319 * The watchdog thread - touches the timestamp.
320 */
321static int watchdog(void *unused)
322{
323 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
324 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
325
326 sched_setscheduler(current, SCHED_FIFO, &param);
327
328 /* initialize timestamp */
329 __touch_watchdog();
330
331 /* kick off the timer for the hardlockup detector */
332 /* done here because hrtimer_start can only pin to smp_processor_id() */
333 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
334 HRTIMER_MODE_REL_PINNED);
335
336 set_current_state(TASK_INTERRUPTIBLE);
337 /*
338 * Run briefly once per second to reset the softlockup timestamp.
339 * If this gets delayed for more than 60 seconds then the
340 * debug-printout triggers in watchdog_timer_fn().
341 */
342 while (!kthread_should_stop()) {
343 __touch_watchdog();
344 schedule();
345
346 if (kthread_should_stop())
347 break;
348
349 set_current_state(TASK_INTERRUPTIBLE);
350 }
351 __set_current_state(TASK_RUNNING);
352
353 return 0;
354}
355
356
357#ifdef CONFIG_HARDLOCKUP_DETECTOR
358static int watchdog_nmi_enable(int cpu)
359{
360 struct perf_event_attr *wd_attr;
361 struct perf_event *event = per_cpu(watchdog_ev, cpu);
362
363 /* is it already setup and enabled? */
364 if (event && event->state > PERF_EVENT_STATE_OFF)
365 goto out;
366
367 /* it is setup but not enabled */
368 if (event != NULL)
369 goto out_enable;
370
371 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period();
374 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
375 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save;
378 }
379
380 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
381 return -1;
382
383 /* success path */
384out_save:
385 per_cpu(watchdog_ev, cpu) = event;
386out_enable:
387 perf_event_enable(per_cpu(watchdog_ev, cpu));
388out:
389 return 0;
390}
391
392static void watchdog_nmi_disable(int cpu)
393{
394 struct perf_event *event = per_cpu(watchdog_ev, cpu);
395
396 if (event) {
397 perf_event_disable(event);
398 per_cpu(watchdog_ev, cpu) = NULL;
399
400 /* should be in cleanup, but blocks oprofile */
401 perf_event_release_kernel(event);
402 }
403 return;
404}
405#else
406static int watchdog_nmi_enable(int cpu) { return 0; }
407static void watchdog_nmi_disable(int cpu) { return; }
408#endif /* CONFIG_HARDLOCKUP_DETECTOR */
409
410/* prepare/enable/disable routines */
411static int watchdog_prepare_cpu(int cpu)
412{
413 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
414
415 WARN_ON(per_cpu(softlockup_watchdog, cpu));
416 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
417 hrtimer->function = watchdog_timer_fn;
418
419 return 0;
420}
421
422static int watchdog_enable(int cpu)
423{
424 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
425
426 /* enable the perf event */
427 if (watchdog_nmi_enable(cpu) != 0)
428 return -1;
429
430 /* create the watchdog thread */
431 if (!p) {
432 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
433 if (IS_ERR(p)) {
434 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
435 return -1;
436 }
437 kthread_bind(p, cpu);
438 per_cpu(watchdog_touch_ts, cpu) = 0;
439 per_cpu(softlockup_watchdog, cpu) = p;
440 wake_up_process(p);
441 }
442
443 /* if any cpu succeeds, watchdog is considered enabled for the system */
444 watchdog_enabled = 1;
445
446 return 0;
447}
448
449static void watchdog_disable(int cpu)
450{
451 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
452 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
453
454 /*
455 * cancel the timer first to stop incrementing the stats
456 * and waking up the kthread
457 */
458 hrtimer_cancel(hrtimer);
459
460 /* disable the perf event */
461 watchdog_nmi_disable(cpu);
462
463 /* stop the watchdog thread */
464 if (p) {
465 per_cpu(softlockup_watchdog, cpu) = NULL;
466 kthread_stop(p);
467 }
468}
469
470static void watchdog_enable_all_cpus(void)
471{
472 int cpu;
473 int result = 0;
474
475 for_each_online_cpu(cpu)
476 result += watchdog_enable(cpu);
477
478 if (result)
479 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
480
481}
482
483static void watchdog_disable_all_cpus(void)
484{
485 int cpu;
486
487 for_each_online_cpu(cpu)
488 watchdog_disable(cpu);
489
490 /* if all watchdogs are disabled, then they are disabled for the system */
491 watchdog_enabled = 0;
492}
493
494
495/* sysctl functions */
496#ifdef CONFIG_SYSCTL
497/*
498 * proc handler for /proc/sys/kernel/nmi_watchdog
499 */
500
501int proc_dowatchdog_enabled(struct ctl_table *table, int write,
502 void __user *buffer, size_t *length, loff_t *ppos)
503{
504 proc_dointvec(table, write, buffer, length, ppos);
505
506 if (watchdog_enabled)
507 watchdog_enable_all_cpus();
508 else
509 watchdog_disable_all_cpus();
510 return 0;
511}
512
513int proc_dowatchdog_thresh(struct ctl_table *table, int write,
514 void __user *buffer,
515 size_t *lenp, loff_t *ppos)
516{
517 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
518}
519#endif /* CONFIG_SYSCTL */
520
521
522/*
523 * Create/destroy watchdog threads as CPUs come and go:
524 */
525static int __cpuinit
526cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
527{
528 int hotcpu = (unsigned long)hcpu;
529
530 switch (action) {
531 case CPU_UP_PREPARE:
532 case CPU_UP_PREPARE_FROZEN:
533 if (watchdog_prepare_cpu(hotcpu))
534 return NOTIFY_BAD;
535 break;
536 case CPU_ONLINE:
537 case CPU_ONLINE_FROZEN:
538 if (watchdog_enable(hotcpu))
539 return NOTIFY_BAD;
540 break;
541#ifdef CONFIG_HOTPLUG_CPU
542 case CPU_UP_CANCELED:
543 case CPU_UP_CANCELED_FROZEN:
544 watchdog_disable(hotcpu);
545 break;
546 case CPU_DEAD:
547 case CPU_DEAD_FROZEN:
548 watchdog_disable(hotcpu);
549 break;
550#endif /* CONFIG_HOTPLUG_CPU */
551 }
552 return NOTIFY_OK;
553}
554
555static struct notifier_block __cpuinitdata cpu_nfb = {
556 .notifier_call = cpu_callback
557};
558
559static int __init spawn_watchdog_task(void)
560{
561 void *cpu = (void *)(long)smp_processor_id();
562 int err;
563
564 if (no_watchdog)
565 return 0;
566
567 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
568 WARN_ON(err == NOTIFY_BAD);
569
570 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
571 register_cpu_notifier(&cpu_nfb);
572
573 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
574
575 return 0;
576}
577early_initcall(spawn_watchdog_task);
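
kernel/watchdog.c above couples two detectors: a SCHED_FIFO per-cpu thread plus hrtimer for soft lockups (the thread must get CPU time to refresh watchdog_touch_ts) and, under CONFIG_HARDLOCKUP_DETECTOR, a cycle-counting perf NMI for hard lockups (the hrtimer must keep bumping hrtimer_interrupts between NMIs). The stand-alone model below restates the two checks with simplified types to make the rules explicit; it mirrors is_softlockup()/is_hardlockup() above and is a sketch, not kernel code.

#include <stdbool.h>

/* Sketch only: per-cpu state reduced to the fields the checks need. */
struct wd_state {
	unsigned long touch_ts;                 /* seconds; watchdog thread refreshes it */
	unsigned long hrtimer_interrupts;       /* bumped by the hrtimer callback */
	unsigned long hrtimer_interrupts_saved; /* value seen at the previous NMI */
};

/* Soft lockup: the watchdog thread has not run for softlockup_thresh
 * seconds, so something is hogging the CPU without sleeping.
 * (The kernel uses time_after() here to survive counter wraparound.) */
static bool soft_lockup(const struct wd_state *s, unsigned long now_sec,
			unsigned long softlockup_thresh)
{
	return now_sec > s->touch_ts + softlockup_thresh;
}

/* Hard lockup: the hrtimer did not fire between two perf NMIs, so even
 * interrupts are no longer being serviced on this CPU. */
static bool hard_lockup(struct wd_state *s)
{
	if (s->hrtimer_interrupts == s->hrtimer_interrupts_saved)
		return true;
	s->hrtimer_interrupts_saved = s->hrtimer_interrupts;
	return false;
}
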
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5bfb213984b2..f77afd939229 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,19 +1,26 @@
1/* 1/*
2 * linux/kernel/workqueue.c 2 * kernel/workqueue.c - generic async execution with shared worker pool
3 * 3 *
4 * Generic mechanism for defining kernel helper threads for running 4 * Copyright (C) 2002 Ingo Molnar
5 * arbitrary tasks in process context.
6 * 5 *
7 * Started by Ingo Molnar, Copyright (C) 2002 6 * Derived from the taskqueue/keventd code by:
7 * David Woodhouse <dwmw2@infradead.org>
8 * Andrew Morton
9 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
10 * Theodore Ts'o <tytso@mit.edu>
8 * 11 *
9 * Derived from the taskqueue/keventd code by: 12 * Made to use alloc_percpu by Christoph Lameter.
10 * 13 *
11 * David Woodhouse <dwmw2@infradead.org> 14 * Copyright (C) 2010 SUSE Linux Products GmbH
12 * Andrew Morton 15 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu>
15 * 16 *
16 * Made to use alloc_percpu by Christoph Lameter. 17 * This is the generic async execution mechanism. Work items are
18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and
20 * one extra for works which are better served by workers which are
21 * not bound to any specific CPU.
22 *
23 * Please read Documentation/workqueue.txt for details.
17 */ 24 */
18 25
19#include <linux/module.h> 26#include <linux/module.h>
@@ -33,41 +40,291 @@
33#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h>
44
36#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
37#include <trace/events/workqueue.h> 46#include <trace/events/workqueue.h>
38 47
48#include "workqueue_sched.h"
49
50enum {
51 /* global_cwq flags */
52 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
53 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
54 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
55 GCWQ_FREEZING = 1 << 3, /* freeze in progress */
56 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
57
58 /* worker flags */
59 WORKER_STARTED = 1 << 0, /* started */
60 WORKER_DIE = 1 << 1, /* die die die */
61 WORKER_IDLE = 1 << 2, /* is idle */
62 WORKER_PREP = 1 << 3, /* preparing to run works */
63 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
64 WORKER_REBIND = 1 << 5, /* mom is home, come back */
65 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
66 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
67
68 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
69 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
70
71 /* gcwq->trustee_state */
72 TRUSTEE_START = 0, /* start */
73 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
74 TRUSTEE_BUTCHER = 2, /* butcher workers */
75 TRUSTEE_RELEASE = 3, /* release workers */
76 TRUSTEE_DONE = 4, /* trustee is done */
77
78 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
79 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
80 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
81
82 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
83 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
84
85 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
86 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
87 CREATE_COOLDOWN = HZ, /* time to breathe after fail */
88 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
89
90 /*
91 * Rescue workers are used only on emergencies and shared by
92 * all cpus. Give -20.
93 */
94 RESCUER_NICE_LEVEL = -20,
95};
96
39/* 97/*
40 * The per-CPU workqueue (if single thread, we always use the first 98 * Structure fields follow one of the following exclusion rules.
41 * possible cpu). 99 *
100 * I: Modifiable by initialization/destruction paths and read-only for
101 * everyone else.
102 *
103 * P: Preemption protected. Disabling preemption is enough and should
104 * only be modified and accessed from the local cpu.
105 *
106 * L: gcwq->lock protected. Access with gcwq->lock held.
107 *
108 * X: During normal operation, modification requires gcwq->lock and
109 * should be done only from local cpu. Either disabling preemption
110 * on local cpu or grabbing gcwq->lock is enough for read access.
111 * If GCWQ_DISASSOCIATED is set, it's identical to L.
112 *
113 * F: wq->flush_mutex protected.
114 *
115 * W: workqueue_lock protected.
42 */ 116 */
43struct cpu_workqueue_struct {
44 117
45 spinlock_t lock; 118struct global_cwq;
46 119
47 struct list_head worklist; 120/*
48 wait_queue_head_t more_work; 121 * The poor guys doing the actual heavy lifting. All on-duty workers
49 struct work_struct *current_work; 122 * are either serving the manager role, on idle list or on busy hash.
123 */
124struct worker {
125 /* on idle list while idle, on busy hash table while busy */
126 union {
127 struct list_head entry; /* L: while idle */
128 struct hlist_node hentry; /* L: while busy */
129 };
50 130
51 struct workqueue_struct *wq; 131 struct work_struct *current_work; /* L: work being processed */
52 struct task_struct *thread; 132 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
53} ____cacheline_aligned; 133 struct list_head scheduled; /* L: scheduled works */
134 struct task_struct *task; /* I: worker task */
135 struct global_cwq *gcwq; /* I: the associated gcwq */
136 /* 64 bytes boundary on 64bit, 32 on 32bit */
137 unsigned long last_active; /* L: last active timestamp */
138 unsigned int flags; /* X: flags */
139 int id; /* I: worker id */
140 struct work_struct rebind_work; /* L: rebind worker to cpu */
141};
142
143/*
144 * Global per-cpu workqueue. There's one and only one for each cpu
145 * and all works are queued and processed here regardless of their
146 * target workqueues.
147 */
148struct global_cwq {
149 spinlock_t lock; /* the gcwq lock */
150 struct list_head worklist; /* L: list of pending works */
151 unsigned int cpu; /* I: the associated cpu */
152 unsigned int flags; /* L: GCWQ_* flags */
153
154 int nr_workers; /* L: total number of workers */
155 int nr_idle; /* L: currently idle ones */
156
157 /* workers are chained either in the idle_list or busy_hash */
158 struct list_head idle_list; /* X: list of idle workers */
159 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
160 /* L: hash of busy workers */
161
162 struct timer_list idle_timer; /* L: worker idle timeout */
163 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
164
165 struct ida worker_ida; /* L: for worker IDs */
166
167 struct task_struct *trustee; /* L: for gcwq shutdown */
168 unsigned int trustee_state; /* L: trustee state */
169 wait_queue_head_t trustee_wait; /* trustee wait */
170 struct worker *first_idle; /* L: first idle worker */
171} ____cacheline_aligned_in_smp;
172
173/*
174 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
175 * work_struct->data are used for flags and thus cwqs need to be
176 * aligned at two's power of the number of flag bits.
177 */
178struct cpu_workqueue_struct {
179 struct global_cwq *gcwq; /* I: the associated gcwq */
180 struct workqueue_struct *wq; /* I: the owning workqueue */
181 int work_color; /* L: current color */
182 int flush_color; /* L: flushing color */
183 int nr_in_flight[WORK_NR_COLORS];
184 /* L: nr of in_flight works */
185 int nr_active; /* L: nr of active works */
186 int max_active; /* L: max active works */
187 struct list_head delayed_works; /* L: delayed works */
188};
189
190/*
191 * Structure used to wait for workqueue flush.
192 */
193struct wq_flusher {
194 struct list_head list; /* F: list of flushers */
195 int flush_color; /* F: flush color waiting for */
196 struct completion done; /* flush completion */
197};
198
199/*
200 * All cpumasks are assumed to be always set on UP and thus can't be
201 * used to determine whether there's something to be done.
202 */
203#ifdef CONFIG_SMP
204typedef cpumask_var_t mayday_mask_t;
205#define mayday_test_and_set_cpu(cpu, mask) \
206 cpumask_test_and_set_cpu((cpu), (mask))
207#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
208#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
209#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
210#define free_mayday_mask(mask) free_cpumask_var((mask))
211#else
212typedef unsigned long mayday_mask_t;
213#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
214#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
215#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
216#define alloc_mayday_mask(maskp, gfp) true
217#define free_mayday_mask(mask) do { } while (0)
218#endif
54 219
55/* 220/*
56 * The externally visible workqueue abstraction is an array of 221 * The externally visible workqueue abstraction is an array of
57 * per-CPU workqueues: 222 * per-CPU workqueues:
58 */ 223 */
59struct workqueue_struct { 224struct workqueue_struct {
60 struct cpu_workqueue_struct *cpu_wq; 225 unsigned int flags; /* I: WQ_* flags */
61 struct list_head list; 226 union {
62 const char *name; 227 struct cpu_workqueue_struct __percpu *pcpu;
63 int singlethread; 228 struct cpu_workqueue_struct *single;
64 int freezeable; /* Freeze threads during suspend */ 229 unsigned long v;
65 int rt; 230 } cpu_wq; /* I: cwq's */
231 struct list_head list; /* W: list of all workqueues */
232
233 struct mutex flush_mutex; /* protects wq flushing */
234 int work_color; /* F: current work color */
235 int flush_color; /* F: current flush color */
236 atomic_t nr_cwqs_to_flush; /* flush in progress */
237 struct wq_flusher *first_flusher; /* F: first flusher */
238 struct list_head flusher_queue; /* F: flush waiters */
239 struct list_head flusher_overflow; /* F: flush overflow list */
240
241 mayday_mask_t mayday_mask; /* cpus requesting rescue */
242 struct worker *rescuer; /* I: rescue worker */
243
244 int saved_max_active; /* W: saved cwq max_active */
245 const char *name; /* I: workqueue name */
66#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
67 struct lockdep_map lockdep_map; 247 struct lockdep_map lockdep_map;
68#endif 248#endif
69}; 249};
70 250
251struct workqueue_struct *system_wq __read_mostly;
252struct workqueue_struct *system_long_wq __read_mostly;
253struct workqueue_struct *system_nrt_wq __read_mostly;
254struct workqueue_struct *system_unbound_wq __read_mostly;
255EXPORT_SYMBOL_GPL(system_wq);
256EXPORT_SYMBOL_GPL(system_long_wq);
257EXPORT_SYMBOL_GPL(system_nrt_wq);
258EXPORT_SYMBOL_GPL(system_unbound_wq);
259
260#define for_each_busy_worker(worker, i, pos, gcwq) \
261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
262 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
263
264static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
265 unsigned int sw)
266{
267 if (cpu < nr_cpu_ids) {
268 if (sw & 1) {
269 cpu = cpumask_next(cpu, mask);
270 if (cpu < nr_cpu_ids)
271 return cpu;
272 }
273 if (sw & 2)
274 return WORK_CPU_UNBOUND;
275 }
276 return WORK_CPU_NONE;
277}
278
279static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
280 struct workqueue_struct *wq)
281{
282 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
283}
284
285/*
286 * CPU iterators
287 *
288 * An extra gcwq is defined for an invalid cpu number
289 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
290 * specific CPU. The following iterators are similar to
291 * for_each_*_cpu() iterators but also considers the unbound gcwq.
292 *
293 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
294 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
295 * for_each_cwq_cpu() : possible CPUs for bound workqueues,
296 * WORK_CPU_UNBOUND for unbound workqueues
297 */
298#define for_each_gcwq_cpu(cpu) \
299 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
300 (cpu) < WORK_CPU_NONE; \
301 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
302
303#define for_each_online_gcwq_cpu(cpu) \
304 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
305 (cpu) < WORK_CPU_NONE; \
306 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
307
308#define for_each_cwq_cpu(cpu, wq) \
309 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
310 (cpu) < WORK_CPU_NONE; \
311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
312
313#ifdef CONFIG_LOCKDEP
314/**
315 * in_workqueue_context() - in context of specified workqueue?
316 * @wq: the workqueue of interest
317 *
318 * Checks lockdep state to see if the current task is executing from
319 * within a workqueue item. This function exists only if lockdep is
320 * enabled.
321 */
322int in_workqueue_context(struct workqueue_struct *wq)
323{
324 return lock_is_held(&wq->lockdep_map);
325}
326#endif
327
71#ifdef CONFIG_DEBUG_OBJECTS_WORK 328#ifdef CONFIG_DEBUG_OBJECTS_WORK
72 329
73static struct debug_obj_descr work_debug_descr; 330static struct debug_obj_descr work_debug_descr;
@@ -107,7 +364,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
107 * statically initialized. We just make sure that it 364 * statically initialized. We just make sure that it
108 * is tracked in the object tracker. 365 * is tracked in the object tracker.
109 */ 366 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { 367 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr); 368 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr); 369 debug_object_activate(work, &work_debug_descr);
113 return 0; 370 return 0;
@@ -181,84 +438,582 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
181/* Serializes the accesses to the list of workqueues. */ 438/* Serializes the accesses to the list of workqueues. */
182static DEFINE_SPINLOCK(workqueue_lock); 439static DEFINE_SPINLOCK(workqueue_lock);
183static LIST_HEAD(workqueues); 440static LIST_HEAD(workqueues);
441static bool workqueue_freezing; /* W: have wqs started freezing? */
184 442
185static int singlethread_cpu __read_mostly;
186static const struct cpumask *cpu_singlethread_map __read_mostly;
187/* 443/*
188 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 444 * The almighty global cpu workqueues. nr_running is the only field
189 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 445 * which is expected to be used frequently by other cpus via
190 * which comes in between can't use for_each_online_cpu(). We could 446 * try_to_wake_up(). Put it in a separate cacheline.
191 * use cpu_possible_map, the cpumask below is more a documentation
192 * than optimization.
193 */ 447 */
194static cpumask_var_t cpu_populated_map __read_mostly; 448static DEFINE_PER_CPU(struct global_cwq, global_cwq);
449static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
450
451/*
452 * Global cpu workqueue and nr_running counter for unbound gcwq. The
453 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
454 * workers have WORKER_UNBOUND set.
455 */
456static struct global_cwq unbound_global_cwq;
457static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
458
459static int worker_thread(void *__worker);
460
461static struct global_cwq *get_gcwq(unsigned int cpu)
462{
463 if (cpu != WORK_CPU_UNBOUND)
464 return &per_cpu(global_cwq, cpu);
465 else
466 return &unbound_global_cwq;
467}
468
469static atomic_t *get_gcwq_nr_running(unsigned int cpu)
470{
471 if (cpu != WORK_CPU_UNBOUND)
472 return &per_cpu(gcwq_nr_running, cpu);
473 else
474 return &unbound_gcwq_nr_running;
475}
195 476
196/* If it's single threaded, it isn't in the list of workqueues. */ 477static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
197static inline int is_wq_single_threaded(struct workqueue_struct *wq) 478 struct workqueue_struct *wq)
198{ 479{
199 return wq->singlethread; 480 if (!(wq->flags & WQ_UNBOUND)) {
481 if (likely(cpu < nr_cpu_ids)) {
482#ifdef CONFIG_SMP
483 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
484#else
485 return wq->cpu_wq.single;
486#endif
487 }
488 } else if (likely(cpu == WORK_CPU_UNBOUND))
489 return wq->cpu_wq.single;
490 return NULL;
200} 491}
201 492
202static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) 493static unsigned int work_color_to_flags(int color)
203{ 494{
204 return is_wq_single_threaded(wq) 495 return color << WORK_STRUCT_COLOR_SHIFT;
205 ? cpu_singlethread_map : cpu_populated_map;
206} 496}
207 497
208static 498static int get_work_color(struct work_struct *work)
209struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
210{ 499{
211 if (unlikely(is_wq_single_threaded(wq))) 500 return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
212 cpu = singlethread_cpu; 501 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
213 return per_cpu_ptr(wq->cpu_wq, cpu); 502}
503
504static int work_next_color(int color)
505{
506 return (color + 1) % WORK_NR_COLORS;
214} 507}
215 508
216/* 509/*
217 * Set the workqueue on which a work item is to be run 510 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
218 * - Must *only* be called if the pending flag is set 511 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
512 * cleared and the work data contains the cpu number it was last on.
513 *
514 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
515 * cwq, cpu or clear work->data. These functions should only be
516 * called while the work is owned - ie. while the PENDING bit is set.
517 *
518 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
519 * corresponding to a work. gcwq is available once the work has been
520 * queued anywhere after initialization. cwq is available only from
521 * queueing until execution starts.
219 */ 522 */
220static inline void set_wq_data(struct work_struct *work, 523static inline void set_work_data(struct work_struct *work, unsigned long data,
221 struct cpu_workqueue_struct *cwq) 524 unsigned long flags)
222{ 525{
223 unsigned long new;
224
225 BUG_ON(!work_pending(work)); 526 BUG_ON(!work_pending(work));
527 atomic_long_set(&work->data, data | flags | work_static(work));
528}
529
530static void set_work_cwq(struct work_struct *work,
531 struct cpu_workqueue_struct *cwq,
532 unsigned long extra_flags)
533{
534 set_work_data(work, (unsigned long)cwq,
535 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
536}
537
538static void set_work_cpu(struct work_struct *work, unsigned int cpu)
539{
540 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
541}
542
543static void clear_work_data(struct work_struct *work)
544{
545 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
546}
547
548static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
549{
550 unsigned long data = atomic_long_read(&work->data);
551
552 if (data & WORK_STRUCT_CWQ)
553 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
554 else
555 return NULL;
556}
557
558static struct global_cwq *get_work_gcwq(struct work_struct *work)
559{
560 unsigned long data = atomic_long_read(&work->data);
561 unsigned int cpu;
562
563 if (data & WORK_STRUCT_CWQ)
564 return ((struct cpu_workqueue_struct *)
565 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
566
567 cpu = data >> WORK_STRUCT_FLAG_BITS;
568 if (cpu == WORK_CPU_NONE)
569 return NULL;
226 570
227 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); 571 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
228 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); 572 return get_gcwq(cpu);
229 atomic_long_set(&work->data, new); 573}
574
575/*
576 * Policy functions. These define the policies on how the global
577 * worker pool is managed. Unless noted otherwise, these functions
578 * assume that they're being called with gcwq->lock held.
579 */
580
581static bool __need_more_worker(struct global_cwq *gcwq)
582{
583 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
584 gcwq->flags & GCWQ_HIGHPRI_PENDING;
585}
586
587/*
588 * Need to wake up a worker? Called from anything but currently
589 * running workers.
590 */
591static bool need_more_worker(struct global_cwq *gcwq)
592{
593 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
230} 594}
231 595
232static inline 596/* Can I start working? Called from busy but !running workers. */
233struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 597static bool may_start_working(struct global_cwq *gcwq)
234{ 598{
235 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 599 return gcwq->nr_idle;
236} 600}
237 601
602/* Do I need to keep working? Called from currently running workers. */
603static bool keep_working(struct global_cwq *gcwq)
604{
605 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
606
607 return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1;
608}
609
610/* Do we need a new worker? Called from manager. */
611static bool need_to_create_worker(struct global_cwq *gcwq)
612{
613 return need_more_worker(gcwq) && !may_start_working(gcwq);
614}
615
616/* Do I need to be the manager? */
617static bool need_to_manage_workers(struct global_cwq *gcwq)
618{
619 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
620}
621
622/* Do we have too many workers and should some go away? */
623static bool too_many_workers(struct global_cwq *gcwq)
624{
625 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
626 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
627 int nr_busy = gcwq->nr_workers - nr_idle;
628
629 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
630}
631
632/*
633 * Wake up functions.
634 */
635
636/* Return the first worker. Safe with preemption disabled */
637static struct worker *first_worker(struct global_cwq *gcwq)
638{
639 if (unlikely(list_empty(&gcwq->idle_list)))
640 return NULL;
641
642 return list_first_entry(&gcwq->idle_list, struct worker, entry);
643}
644
645/**
646 * wake_up_worker - wake up an idle worker
647 * @gcwq: gcwq to wake worker for
648 *
649 * Wake up the first idle worker of @gcwq.
650 *
651 * CONTEXT:
652 * spin_lock_irq(gcwq->lock).
653 */
654static void wake_up_worker(struct global_cwq *gcwq)
655{
656 struct worker *worker = first_worker(gcwq);
657
658 if (likely(worker))
659 wake_up_process(worker->task);
660}
661
662/**
663 * wq_worker_waking_up - a worker is waking up
664 * @task: task waking up
665 * @cpu: CPU @task is waking up to
666 *
667 * This function is called during try_to_wake_up() when a worker is
668 * being awoken.
669 *
670 * CONTEXT:
671 * spin_lock_irq(rq->lock)
672 */
673void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
674{
675 struct worker *worker = kthread_data(task);
676
677 if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
678 atomic_inc(get_gcwq_nr_running(cpu));
679}
680
681/**
682 * wq_worker_sleeping - a worker is going to sleep
683 * @task: task going to sleep
684 * @cpu: CPU in question, must be the current CPU number
685 *
686 * This function is called during schedule() when a busy worker is
687 * going to sleep. Worker on the same cpu can be woken up by
688 * returning pointer to its task.
689 *
690 * CONTEXT:
691 * spin_lock_irq(rq->lock)
692 *
693 * RETURNS:
694 * Worker task on @cpu to wake up, %NULL if none.
695 */
696struct task_struct *wq_worker_sleeping(struct task_struct *task,
697 unsigned int cpu)
698{
699 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
700 struct global_cwq *gcwq = get_gcwq(cpu);
701 atomic_t *nr_running = get_gcwq_nr_running(cpu);
702
703 if (unlikely(worker->flags & WORKER_NOT_RUNNING))
704 return NULL;
705
706 /* this can only happen on the local cpu */
707 BUG_ON(cpu != raw_smp_processor_id());
708
709 /*
710 * The counterpart of the following dec_and_test, implied mb,
711 * worklist not empty test sequence is in insert_work().
712 * Please read comment there.
713 *
714 * NOT_RUNNING is clear. This means that trustee is not in
715 * charge and we're running on the local cpu w/ rq lock held
716 * and preemption disabled, which in turn means that no one else
717 * could be manipulating idle_list, so dereferencing idle_list
718 * without gcwq lock is safe.
719 */
720 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
721 to_wakeup = first_worker(gcwq);
722 return to_wakeup ? to_wakeup->task : NULL;
723}
724
725/**
726 * worker_set_flags - set worker flags and adjust nr_running accordingly
727 * @worker: self
728 * @flags: flags to set
729 * @wakeup: wakeup an idle worker if necessary
730 *
731 * Set @flags in @worker->flags and adjust nr_running accordingly. If
732 * nr_running becomes zero and @wakeup is %true, an idle worker is
733 * woken up.
734 *
735 * CONTEXT:
736 * spin_lock_irq(gcwq->lock)
737 */
738static inline void worker_set_flags(struct worker *worker, unsigned int flags,
739 bool wakeup)
740{
741 struct global_cwq *gcwq = worker->gcwq;
742
743 WARN_ON_ONCE(worker->task != current);
744
745 /*
746 * If transitioning into NOT_RUNNING, adjust nr_running and
747 * wake up an idle worker as necessary if requested by
748 * @wakeup.
749 */
750 if ((flags & WORKER_NOT_RUNNING) &&
751 !(worker->flags & WORKER_NOT_RUNNING)) {
752 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
753
754 if (wakeup) {
755 if (atomic_dec_and_test(nr_running) &&
756 !list_empty(&gcwq->worklist))
757 wake_up_worker(gcwq);
758 } else
759 atomic_dec(nr_running);
760 }
761
762 worker->flags |= flags;
763}
764
765/**
766 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
767 * @worker: self
768 * @flags: flags to clear
769 *
770 * Clear @flags in @worker->flags and adjust nr_running accordingly.
771 *
772 * CONTEXT:
773 * spin_lock_irq(gcwq->lock)
774 */
775static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
776{
777 struct global_cwq *gcwq = worker->gcwq;
778 unsigned int oflags = worker->flags;
779
780 WARN_ON_ONCE(worker->task != current);
781
782 worker->flags &= ~flags;
783
784 /* if transitioning out of NOT_RUNNING, increment nr_running */
785 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
786 if (!(worker->flags & WORKER_NOT_RUNNING))
787 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
788}
789
790/**
791 * busy_worker_head - return the busy hash head for a work
792 * @gcwq: gcwq of interest
793 * @work: work to be hashed
794 *
795 * Return hash head of @gcwq for @work.
796 *
797 * CONTEXT:
798 * spin_lock_irq(gcwq->lock).
799 *
800 * RETURNS:
801 * Pointer to the hash head.
802 */
803static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
804 struct work_struct *work)
805{
806 const int base_shift = ilog2(sizeof(struct work_struct));
807 unsigned long v = (unsigned long)work;
808
809 /* simple shift and fold hash, do we need something better? */
810 v >>= base_shift;
811 v += v >> BUSY_WORKER_HASH_ORDER;
812 v &= BUSY_WORKER_HASH_MASK;
813
814 return &gcwq->busy_hash[v];
815}
816
817/**
818 * __find_worker_executing_work - find worker which is executing a work
819 * @gcwq: gcwq of interest
820 * @bwh: hash head as returned by busy_worker_head()
821 * @work: work to find worker for
822 *
823 * Find a worker which is executing @work on @gcwq. @bwh should be
824 * the hash head obtained by calling busy_worker_head() with the same
825 * work.
826 *
827 * CONTEXT:
828 * spin_lock_irq(gcwq->lock).
829 *
830 * RETURNS:
831 * Pointer to worker which is executing @work if found, NULL
832 * otherwise.
833 */
834static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
835 struct hlist_head *bwh,
836 struct work_struct *work)
837{
838 struct worker *worker;
839 struct hlist_node *tmp;
840
841 hlist_for_each_entry(worker, tmp, bwh, hentry)
842 if (worker->current_work == work)
843 return worker;
844 return NULL;
845}
846
847/**
848 * find_worker_executing_work - find worker which is executing a work
849 * @gcwq: gcwq of interest
850 * @work: work to find worker for
851 *
852 * Find a worker which is executing @work on @gcwq. This function is
853 * identical to __find_worker_executing_work() except that this
854 * function calculates @bwh itself.
855 *
856 * CONTEXT:
857 * spin_lock_irq(gcwq->lock).
858 *
859 * RETURNS:
860 * Pointer to worker which is executing @work if found, NULL
861 * otherwise.
862 */
863static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
864 struct work_struct *work)
865{
866 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
867 work);
868}
869
870/**
871 * gcwq_determine_ins_pos - find insertion position
872 * @gcwq: gcwq of interest
873 * @cwq: cwq a work is being queued for
874 *
875 * A work for @cwq is about to be queued on @gcwq, determine insertion
876 * position for the work. If @cwq is for HIGHPRI wq, the work is
877 * queued at the head of the queue but in FIFO order with respect to
878 * other HIGHPRI works; otherwise, at the end of the queue. This
879 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
880 * there are HIGHPRI works pending.
881 *
882 * CONTEXT:
883 * spin_lock_irq(gcwq->lock).
884 *
885 * RETURNS:
886 * Pointer to insertion position.
887 */
888static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
889 struct cpu_workqueue_struct *cwq)
890{
891 struct work_struct *twork;
892
893 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
894 return &gcwq->worklist;
895
896 list_for_each_entry(twork, &gcwq->worklist, entry) {
897 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
898
899 if (!(tcwq->wq->flags & WQ_HIGHPRI))
900 break;
901 }
902
903 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
904 return &twork->entry;
905}
906
907/**
908 * insert_work - insert a work into gcwq
909 * @cwq: cwq @work belongs to
910 * @work: work to insert
911 * @head: insertion point
912 * @extra_flags: extra WORK_STRUCT_* flags to set
913 *
914 * Insert @work which belongs to @cwq into @gcwq after @head.
915 * @extra_flags is or'd to work_struct flags.
916 *
917 * CONTEXT:
918 * spin_lock_irq(gcwq->lock).
919 */
238static void insert_work(struct cpu_workqueue_struct *cwq, 920static void insert_work(struct cpu_workqueue_struct *cwq,
239 struct work_struct *work, struct list_head *head) 921 struct work_struct *work, struct list_head *head,
922 unsigned int extra_flags)
240{ 923{
241 trace_workqueue_insertion(cwq->thread, work); 924 struct global_cwq *gcwq = cwq->gcwq;
925
926 /* we own @work, set data and link */
927 set_work_cwq(work, cwq, extra_flags);
242 928
243 set_wq_data(work, cwq);
244 /* 929 /*
245 * Ensure that we get the right work->data if we see the 930 * Ensure that we get the right work->data if we see the
246 * result of list_add() below, see try_to_grab_pending(). 931 * result of list_add() below, see try_to_grab_pending().
247 */ 932 */
248 smp_wmb(); 933 smp_wmb();
934
249 list_add_tail(&work->entry, head); 935 list_add_tail(&work->entry, head);
250 wake_up(&cwq->more_work); 936
937 /*
938 * Ensure either worker_sched_deactivated() sees the above
939 * list_add_tail() or we see zero nr_running to avoid workers
940 * lying around lazily while there are works to be processed.
941 */
942 smp_mb();
943
944 if (__need_more_worker(gcwq))
945 wake_up_worker(gcwq);
251} 946}
252 947
253static void __queue_work(struct cpu_workqueue_struct *cwq, 948static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
254 struct work_struct *work) 949 struct work_struct *work)
255{ 950{
951 struct global_cwq *gcwq;
952 struct cpu_workqueue_struct *cwq;
953 struct list_head *worklist;
954 unsigned int work_flags;
256 unsigned long flags; 955 unsigned long flags;
257 956
258 debug_work_activate(work); 957 debug_work_activate(work);
259 spin_lock_irqsave(&cwq->lock, flags); 958
260 insert_work(cwq, work, &cwq->worklist); 959 if (WARN_ON_ONCE(wq->flags & WQ_DYING))
261 spin_unlock_irqrestore(&cwq->lock, flags); 960 return;
961
962 /* determine gcwq to use */
963 if (!(wq->flags & WQ_UNBOUND)) {
964 struct global_cwq *last_gcwq;
965
966 if (unlikely(cpu == WORK_CPU_UNBOUND))
967 cpu = raw_smp_processor_id();
968
969 /*
970 * It's multi cpu. If @wq is non-reentrant and @work
971 * was previously on a different cpu, it might still
972 * be running there, in which case the work needs to
973 * be queued on that cpu to guarantee non-reentrance.
974 */
975 gcwq = get_gcwq(cpu);
976 if (wq->flags & WQ_NON_REENTRANT &&
977 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
978 struct worker *worker;
979
980 spin_lock_irqsave(&last_gcwq->lock, flags);
981
982 worker = find_worker_executing_work(last_gcwq, work);
983
984 if (worker && worker->current_cwq->wq == wq)
985 gcwq = last_gcwq;
986 else {
987 /* meh... not running there, queue here */
988 spin_unlock_irqrestore(&last_gcwq->lock, flags);
989 spin_lock_irqsave(&gcwq->lock, flags);
990 }
991 } else
992 spin_lock_irqsave(&gcwq->lock, flags);
993 } else {
994 gcwq = get_gcwq(WORK_CPU_UNBOUND);
995 spin_lock_irqsave(&gcwq->lock, flags);
996 }
997
998 /* gcwq determined, get cwq and queue */
999 cwq = get_cwq(gcwq->cpu, wq);
1000
1001 BUG_ON(!list_empty(&work->entry));
1002
1003 cwq->nr_in_flight[cwq->work_color]++;
1004 work_flags = work_color_to_flags(cwq->work_color);
1005
1006 if (likely(cwq->nr_active < cwq->max_active)) {
1007 cwq->nr_active++;
1008 worklist = gcwq_determine_ins_pos(gcwq, cwq);
1009 } else {
1010 work_flags |= WORK_STRUCT_DELAYED;
1011 worklist = &cwq->delayed_works;
1012 }
1013
1014 insert_work(cwq, work, worklist, work_flags);
1015
1016 spin_unlock_irqrestore(&gcwq->lock, flags);
262} 1017}
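A hedged sketch of how a work item normally reaches __queue_work(): callers go through queue_work() or queue_work_on(), which set WORK_STRUCT_PENDING before handing over the CPU hint. The names below are illustrative and not part of the patch.

#include <linux/workqueue.h>

static void refresh_fn(struct work_struct *work)
{
        /* executed later in process context by a gcwq worker */
}

static DECLARE_WORK(refresh_work, refresh_fn);

static void kick_refresh(struct workqueue_struct *wq)
{
        /*
         * queue_work() queues on the local CPU's gcwq (unless the
         * workqueue is WQ_UNBOUND or non-reentrancy picks another gcwq);
         * queue_work_on() lets the caller choose the CPU explicitly.
         */
        queue_work(wq, &refresh_work);
}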
263 1018
264/** 1019/**
@@ -298,9 +1053,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
298{ 1053{
299 int ret = 0; 1054 int ret = 0;
300 1055
301 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1056 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
302 BUG_ON(!list_empty(&work->entry)); 1057 __queue_work(cpu, wq, work);
303 __queue_work(wq_per_cpu(wq, cpu), work);
304 ret = 1; 1058 ret = 1;
305 } 1059 }
306 return ret; 1060 return ret;
@@ -310,10 +1064,9 @@ EXPORT_SYMBOL_GPL(queue_work_on);
310static void delayed_work_timer_fn(unsigned long __data) 1064static void delayed_work_timer_fn(unsigned long __data)
311{ 1065{
312 struct delayed_work *dwork = (struct delayed_work *)__data; 1066 struct delayed_work *dwork = (struct delayed_work *)__data;
313 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); 1067 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
314 struct workqueue_struct *wq = cwq->wq;
315 1068
316 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); 1069 __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
317} 1070}
318 1071
319/** 1072/**
@@ -350,14 +1103,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
350 struct timer_list *timer = &dwork->timer; 1103 struct timer_list *timer = &dwork->timer;
351 struct work_struct *work = &dwork->work; 1104 struct work_struct *work = &dwork->work;
352 1105
353 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1106 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1107 unsigned int lcpu;
1108
354 BUG_ON(timer_pending(timer)); 1109 BUG_ON(timer_pending(timer));
355 BUG_ON(!list_empty(&work->entry)); 1110 BUG_ON(!list_empty(&work->entry));
356 1111
357 timer_stats_timer_set_start_info(&dwork->timer); 1112 timer_stats_timer_set_start_info(&dwork->timer);
358 1113
359 /* This stores cwq for the moment, for the timer_fn */ 1114 /*
360 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 1115 * This stores cwq for the moment, for the timer_fn.
1116 * Note that the work's gcwq is preserved to allow
1117 * reentrance detection for delayed works.
1118 */
1119 if (!(wq->flags & WQ_UNBOUND)) {
1120 struct global_cwq *gcwq = get_work_gcwq(work);
1121
1122 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1123 lcpu = gcwq->cpu;
1124 else
1125 lcpu = raw_smp_processor_id();
1126 } else
1127 lcpu = WORK_CPU_UNBOUND;
1128
1129 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1130
361 timer->expires = jiffies + delay; 1131 timer->expires = jiffies + delay;
362 timer->data = (unsigned long)dwork; 1132 timer->data = (unsigned long)dwork;
363 timer->function = delayed_work_timer_fn; 1133 timer->function = delayed_work_timer_fn;
@@ -372,80 +1142,888 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
372} 1142}
373EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1143EXPORT_SYMBOL_GPL(queue_delayed_work_on);
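A minimal self-rearming delayed work, sketched against the API shown above; the 500 ms period and the names are invented.

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_work, poll_fn);

static void poll_fn(struct work_struct *work)
{
        /* do the periodic check, then re-arm through the timer path above */
        schedule_delayed_work(&poll_work, msecs_to_jiffies(500));
}

static void poll_start(void)
{
        /* delayed_work_timer_fn() eventually hands this to __queue_work() */
        schedule_delayed_work(&poll_work, msecs_to_jiffies(500));
}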
374 1144
375static void run_workqueue(struct cpu_workqueue_struct *cwq) 1145/**
1146 * worker_enter_idle - enter idle state
1147 * @worker: worker which is entering idle state
1148 *
1149 * @worker is entering idle state. Update stats and idle timer if
1150 * necessary.
1151 *
1152 * LOCKING:
1153 * spin_lock_irq(gcwq->lock).
1154 */
1155static void worker_enter_idle(struct worker *worker)
376{ 1156{
377 spin_lock_irq(&cwq->lock); 1157 struct global_cwq *gcwq = worker->gcwq;
378 while (!list_empty(&cwq->worklist)) { 1158
379 struct work_struct *work = list_entry(cwq->worklist.next, 1159 BUG_ON(worker->flags & WORKER_IDLE);
380 struct work_struct, entry); 1160 BUG_ON(!list_empty(&worker->entry) &&
381 work_func_t f = work->func; 1161 (worker->hentry.next || worker->hentry.pprev));
382#ifdef CONFIG_LOCKDEP 1162
1163 /* can't use worker_set_flags(), also called from start_worker() */
1164 worker->flags |= WORKER_IDLE;
1165 gcwq->nr_idle++;
1166 worker->last_active = jiffies;
1167
1168 /* idle_list is LIFO */
1169 list_add(&worker->entry, &gcwq->idle_list);
1170
1171 if (likely(!(worker->flags & WORKER_ROGUE))) {
1172 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1173 mod_timer(&gcwq->idle_timer,
1174 jiffies + IDLE_WORKER_TIMEOUT);
1175 } else
1176 wake_up_all(&gcwq->trustee_wait);
1177
1178 /* sanity check nr_running */
1179 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1180 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1181}
1182
1183/**
1184 * worker_leave_idle - leave idle state
1185 * @worker: worker which is leaving idle state
1186 *
1187 * @worker is leaving idle state. Update stats.
1188 *
1189 * LOCKING:
1190 * spin_lock_irq(gcwq->lock).
1191 */
1192static void worker_leave_idle(struct worker *worker)
1193{
1194 struct global_cwq *gcwq = worker->gcwq;
1195
1196 BUG_ON(!(worker->flags & WORKER_IDLE));
1197 worker_clr_flags(worker, WORKER_IDLE);
1198 gcwq->nr_idle--;
1199 list_del_init(&worker->entry);
1200}
1201
1202/**
1203 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1204 * @worker: self
1205 *
1206 * Works which are scheduled while the cpu is online must at least be
1207 * scheduled to a worker which is bound to the cpu so that if they are
 1208 * flushed from cpu callbacks while the cpu is going down, they are
1209 * guaranteed to execute on the cpu.
1210 *
1211 * This function is to be used by rogue workers and rescuers to bind
1212 * themselves to the target cpu and may race with cpu going down or
1213 * coming online. kthread_bind() can't be used because it may put the
 1214 * the worker to an already dead cpu and set_cpus_allowed_ptr() can't be used
1215 * verbatim as it's best effort and blocking and gcwq may be
1216 * [dis]associated in the meantime.
1217 *
 1218 * This function tries set_cpus_allowed(), locks gcwq and verifies
1219 * the binding against GCWQ_DISASSOCIATED which is set during
1220 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1221 * idle state or fetches works without dropping lock, it can guarantee
1222 * the scheduling requirement described in the first paragraph.
1223 *
1224 * CONTEXT:
1225 * Might sleep. Called without any lock but returns with gcwq->lock
1226 * held.
1227 *
1228 * RETURNS:
1229 * %true if the associated gcwq is online (@worker is successfully
1230 * bound), %false if offline.
1231 */
1232static bool worker_maybe_bind_and_lock(struct worker *worker)
1233__acquires(&gcwq->lock)
1234{
1235 struct global_cwq *gcwq = worker->gcwq;
1236 struct task_struct *task = worker->task;
1237
1238 while (true) {
383 /* 1239 /*
384 * It is permissible to free the struct work_struct 1240 * The following call may fail, succeed or succeed
385 * from inside the function that is called from it, 1241 * without actually migrating the task to the cpu if
386 * this we need to take into account for lockdep too. 1242 * it races with cpu hotunplug operation. Verify
387 * To avoid bogus "held lock freed" warnings as well 1243 * against GCWQ_DISASSOCIATED.
388 * as problems when looking into work->lockdep_map,
389 * make a copy and use that here.
390 */ 1244 */
391 struct lockdep_map lockdep_map = work->lockdep_map; 1245 if (!(gcwq->flags & GCWQ_DISASSOCIATED))
392#endif 1246 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
393 trace_workqueue_execution(cwq->thread, work); 1247
394 debug_work_deactivate(work); 1248 spin_lock_irq(&gcwq->lock);
395 cwq->current_work = work; 1249 if (gcwq->flags & GCWQ_DISASSOCIATED)
396 list_del_init(cwq->worklist.next); 1250 return false;
397 spin_unlock_irq(&cwq->lock); 1251 if (task_cpu(task) == gcwq->cpu &&
398 1252 cpumask_equal(&current->cpus_allowed,
399 BUG_ON(get_wq_data(work) != cwq); 1253 get_cpu_mask(gcwq->cpu)))
400 work_clear_pending(work); 1254 return true;
401 lock_map_acquire(&cwq->wq->lockdep_map); 1255 spin_unlock_irq(&gcwq->lock);
402 lock_map_acquire(&lockdep_map); 1256
 403 f(work); 1257 /* CPU has come up in between, retry migration */
404 lock_map_release(&lockdep_map); 1258 cpu_relax();
405 lock_map_release(&cwq->wq->lockdep_map); 1259 }
406 1260}
407 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 1261
408 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 1262/*
409 "%s/0x%08x/%d\n", 1263 * Function for worker->rebind_work used to rebind rogue busy workers
410 current->comm, preempt_count(), 1264 * to the associated cpu which is coming back online. This is
411 task_pid_nr(current)); 1265 * scheduled by cpu up but can race with other cpu hotplug operations
412 printk(KERN_ERR " last function: "); 1266 * and may be executed twice without intervening cpu down.
413 print_symbol("%s\n", (unsigned long)f); 1267 */
414 debug_show_held_locks(current); 1268static void worker_rebind_fn(struct work_struct *work)
415 dump_stack(); 1269{
1270 struct worker *worker = container_of(work, struct worker, rebind_work);
1271 struct global_cwq *gcwq = worker->gcwq;
1272
1273 if (worker_maybe_bind_and_lock(worker))
1274 worker_clr_flags(worker, WORKER_REBIND);
1275
1276 spin_unlock_irq(&gcwq->lock);
1277}
1278
1279static struct worker *alloc_worker(void)
1280{
1281 struct worker *worker;
1282
1283 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1284 if (worker) {
1285 INIT_LIST_HEAD(&worker->entry);
1286 INIT_LIST_HEAD(&worker->scheduled);
1287 INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1288 /* on creation a worker is in !idle && prep state */
1289 worker->flags = WORKER_PREP;
1290 }
1291 return worker;
1292}
1293
1294/**
1295 * create_worker - create a new workqueue worker
1296 * @gcwq: gcwq the new worker will belong to
1297 * @bind: whether to set affinity to @cpu or not
1298 *
1299 * Create a new worker which is bound to @gcwq. The returned worker
1300 * can be started by calling start_worker() or destroyed using
1301 * destroy_worker().
1302 *
1303 * CONTEXT:
1304 * Might sleep. Does GFP_KERNEL allocations.
1305 *
1306 * RETURNS:
1307 * Pointer to the newly created worker.
1308 */
1309static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1310{
1311 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1312 struct worker *worker = NULL;
1313 int id = -1;
1314
1315 spin_lock_irq(&gcwq->lock);
1316 while (ida_get_new(&gcwq->worker_ida, &id)) {
1317 spin_unlock_irq(&gcwq->lock);
1318 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1319 goto fail;
1320 spin_lock_irq(&gcwq->lock);
1321 }
1322 spin_unlock_irq(&gcwq->lock);
1323
1324 worker = alloc_worker();
1325 if (!worker)
1326 goto fail;
1327
1328 worker->gcwq = gcwq;
1329 worker->id = id;
1330
1331 if (!on_unbound_cpu)
1332 worker->task = kthread_create(worker_thread, worker,
1333 "kworker/%u:%d", gcwq->cpu, id);
1334 else
1335 worker->task = kthread_create(worker_thread, worker,
1336 "kworker/u:%d", id);
1337 if (IS_ERR(worker->task))
1338 goto fail;
1339
1340 /*
1341 * A rogue worker will become a regular one if CPU comes
1342 * online later on. Make sure every worker has
1343 * PF_THREAD_BOUND set.
1344 */
1345 if (bind && !on_unbound_cpu)
1346 kthread_bind(worker->task, gcwq->cpu);
1347 else {
1348 worker->task->flags |= PF_THREAD_BOUND;
1349 if (on_unbound_cpu)
1350 worker->flags |= WORKER_UNBOUND;
1351 }
1352
1353 return worker;
1354fail:
1355 if (id >= 0) {
1356 spin_lock_irq(&gcwq->lock);
1357 ida_remove(&gcwq->worker_ida, id);
1358 spin_unlock_irq(&gcwq->lock);
1359 }
1360 kfree(worker);
1361 return NULL;
1362}
1363
1364/**
1365 * start_worker - start a newly created worker
1366 * @worker: worker to start
1367 *
1368 * Make the gcwq aware of @worker and start it.
1369 *
1370 * CONTEXT:
1371 * spin_lock_irq(gcwq->lock).
1372 */
1373static void start_worker(struct worker *worker)
1374{
1375 worker->flags |= WORKER_STARTED;
1376 worker->gcwq->nr_workers++;
1377 worker_enter_idle(worker);
1378 wake_up_process(worker->task);
1379}
1380
1381/**
1382 * destroy_worker - destroy a workqueue worker
1383 * @worker: worker to be destroyed
1384 *
1385 * Destroy @worker and adjust @gcwq stats accordingly.
1386 *
1387 * CONTEXT:
1388 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1389 */
1390static void destroy_worker(struct worker *worker)
1391{
1392 struct global_cwq *gcwq = worker->gcwq;
1393 int id = worker->id;
1394
1395 /* sanity check frenzy */
1396 BUG_ON(worker->current_work);
1397 BUG_ON(!list_empty(&worker->scheduled));
1398
1399 if (worker->flags & WORKER_STARTED)
1400 gcwq->nr_workers--;
1401 if (worker->flags & WORKER_IDLE)
1402 gcwq->nr_idle--;
1403
1404 list_del_init(&worker->entry);
1405 worker->flags |= WORKER_DIE;
1406
1407 spin_unlock_irq(&gcwq->lock);
1408
1409 kthread_stop(worker->task);
1410 kfree(worker);
1411
1412 spin_lock_irq(&gcwq->lock);
1413 ida_remove(&gcwq->worker_ida, id);
1414}
1415
1416static void idle_worker_timeout(unsigned long __gcwq)
1417{
1418 struct global_cwq *gcwq = (void *)__gcwq;
1419
1420 spin_lock_irq(&gcwq->lock);
1421
1422 if (too_many_workers(gcwq)) {
1423 struct worker *worker;
1424 unsigned long expires;
1425
1426 /* idle_list is kept in LIFO order, check the last one */
1427 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1428 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1429
1430 if (time_before(jiffies, expires))
1431 mod_timer(&gcwq->idle_timer, expires);
1432 else {
1433 /* it's been idle for too long, wake up manager */
1434 gcwq->flags |= GCWQ_MANAGE_WORKERS;
1435 wake_up_worker(gcwq);
416 } 1436 }
1437 }
417 1438
418 spin_lock_irq(&cwq->lock); 1439 spin_unlock_irq(&gcwq->lock);
419 cwq->current_work = NULL; 1440}
1441
1442static bool send_mayday(struct work_struct *work)
1443{
1444 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1445 struct workqueue_struct *wq = cwq->wq;
1446 unsigned int cpu;
1447
1448 if (!(wq->flags & WQ_RESCUER))
1449 return false;
1450
1451 /* mayday mayday mayday */
1452 cpu = cwq->gcwq->cpu;
1453 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1454 if (cpu == WORK_CPU_UNBOUND)
1455 cpu = 0;
1456 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1457 wake_up_process(wq->rescuer->task);
1458 return true;
1459}
1460
1461static void gcwq_mayday_timeout(unsigned long __gcwq)
1462{
1463 struct global_cwq *gcwq = (void *)__gcwq;
1464 struct work_struct *work;
1465
1466 spin_lock_irq(&gcwq->lock);
1467
1468 if (need_to_create_worker(gcwq)) {
1469 /*
1470 * We've been trying to create a new worker but
1471 * haven't been successful. We might be hitting an
1472 * allocation deadlock. Send distress signals to
1473 * rescuers.
1474 */
1475 list_for_each_entry(work, &gcwq->worklist, entry)
1476 send_mayday(work);
420 } 1477 }
421 spin_unlock_irq(&cwq->lock); 1478
1479 spin_unlock_irq(&gcwq->lock);
1480
1481 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
422} 1482}
423 1483
424static int worker_thread(void *__cwq) 1484/**
1485 * maybe_create_worker - create a new worker if necessary
1486 * @gcwq: gcwq to create a new worker for
1487 *
1488 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
1489 * have at least one idle worker on return from this function. If
1490 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1491 * sent to all rescuers with works scheduled on @gcwq to resolve
1492 * possible allocation deadlock.
1493 *
1494 * On return, need_to_create_worker() is guaranteed to be false and
1495 * may_start_working() true.
1496 *
1497 * LOCKING:
1498 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1499 * multiple times. Does GFP_KERNEL allocations. Called only from
1500 * manager.
1501 *
1502 * RETURNS:
1503 * false if no action was taken and gcwq->lock stayed locked, true
1504 * otherwise.
1505 */
1506static bool maybe_create_worker(struct global_cwq *gcwq)
1507__releases(&gcwq->lock)
1508__acquires(&gcwq->lock)
425{ 1509{
426 struct cpu_workqueue_struct *cwq = __cwq; 1510 if (!need_to_create_worker(gcwq))
427 DEFINE_WAIT(wait); 1511 return false;
1512restart:
1513 spin_unlock_irq(&gcwq->lock);
1514
1515 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1516 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1517
1518 while (true) {
1519 struct worker *worker;
1520
1521 worker = create_worker(gcwq, true);
1522 if (worker) {
1523 del_timer_sync(&gcwq->mayday_timer);
1524 spin_lock_irq(&gcwq->lock);
1525 start_worker(worker);
1526 BUG_ON(need_to_create_worker(gcwq));
1527 return true;
1528 }
1529
1530 if (!need_to_create_worker(gcwq))
1531 break;
1532
1533 __set_current_state(TASK_INTERRUPTIBLE);
1534 schedule_timeout(CREATE_COOLDOWN);
428 1535
429 if (cwq->wq->freezeable) 1536 if (!need_to_create_worker(gcwq))
430 set_freezable(); 1537 break;
1538 }
1539
1540 del_timer_sync(&gcwq->mayday_timer);
1541 spin_lock_irq(&gcwq->lock);
1542 if (need_to_create_worker(gcwq))
1543 goto restart;
1544 return true;
1545}
1546
1547/**
1548 * maybe_destroy_worker - destroy workers which have been idle for a while
1549 * @gcwq: gcwq to destroy workers for
1550 *
1551 * Destroy @gcwq workers which have been idle for longer than
1552 * IDLE_WORKER_TIMEOUT.
1553 *
1554 * LOCKING:
1555 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1556 * multiple times. Called only from manager.
1557 *
1558 * RETURNS:
1559 * false if no action was taken and gcwq->lock stayed locked, true
1560 * otherwise.
1561 */
1562static bool maybe_destroy_workers(struct global_cwq *gcwq)
1563{
1564 bool ret = false;
431 1565
432 for (;;) { 1566 while (too_many_workers(gcwq)) {
433 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 1567 struct worker *worker;
434 if (!freezing(current) && 1568 unsigned long expires;
435 !kthread_should_stop() &&
436 list_empty(&cwq->worklist))
437 schedule();
438 finish_wait(&cwq->more_work, &wait);
439 1569
440 try_to_freeze(); 1570 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1571 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
441 1572
442 if (kthread_should_stop()) 1573 if (time_before(jiffies, expires)) {
1574 mod_timer(&gcwq->idle_timer, expires);
443 break; 1575 break;
1576 }
444 1577
445 run_workqueue(cwq); 1578 destroy_worker(worker);
1579 ret = true;
446 } 1580 }
447 1581
448 return 0; 1582 return ret;
1583}
1584
1585/**
1586 * manage_workers - manage worker pool
1587 * @worker: self
1588 *
1589 * Assume the manager role and manage gcwq worker pool @worker belongs
1590 * to. At any given time, there can be only zero or one manager per
1591 * gcwq. The exclusion is handled automatically by this function.
1592 *
1593 * The caller can safely start processing works on false return. On
1594 * true return, it's guaranteed that need_to_create_worker() is false
1595 * and may_start_working() is true.
1596 *
1597 * CONTEXT:
1598 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1599 * multiple times. Does GFP_KERNEL allocations.
1600 *
1601 * RETURNS:
1602 * false if no action was taken and gcwq->lock stayed locked, true if
1603 * some action was taken.
1604 */
1605static bool manage_workers(struct worker *worker)
1606{
1607 struct global_cwq *gcwq = worker->gcwq;
1608 bool ret = false;
1609
1610 if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1611 return ret;
1612
1613 gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1614 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1615
1616 /*
1617 * Destroy and then create so that may_start_working() is true
1618 * on return.
1619 */
1620 ret |= maybe_destroy_workers(gcwq);
1621 ret |= maybe_create_worker(gcwq);
1622
1623 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1624
1625 /*
1626 * The trustee might be waiting to take over the manager
1627 * position, tell it we're done.
1628 */
1629 if (unlikely(gcwq->trustee))
1630 wake_up_all(&gcwq->trustee_wait);
1631
1632 return ret;
1633}
1634
1635/**
1636 * move_linked_works - move linked works to a list
1637 * @work: start of series of works to be scheduled
1638 * @head: target list to append @work to
 1639 * @nextp: out parameter for nested worklist walking
1640 *
1641 * Schedule linked works starting from @work to @head. Work series to
1642 * be scheduled starts at @work and includes any consecutive work with
1643 * WORK_STRUCT_LINKED set in its predecessor.
1644 *
1645 * If @nextp is not NULL, it's updated to point to the next work of
1646 * the last scheduled work. This allows move_linked_works() to be
1647 * nested inside outer list_for_each_entry_safe().
1648 *
1649 * CONTEXT:
1650 * spin_lock_irq(gcwq->lock).
1651 */
1652static void move_linked_works(struct work_struct *work, struct list_head *head,
1653 struct work_struct **nextp)
1654{
1655 struct work_struct *n;
1656
1657 /*
1658 * Linked worklist will always end before the end of the list,
1659 * use NULL for list head.
1660 */
1661 list_for_each_entry_safe_from(work, n, NULL, entry) {
1662 list_move_tail(&work->entry, head);
1663 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1664 break;
1665 }
1666
1667 /*
1668 * If we're already inside safe list traversal and have moved
1669 * multiple works to the scheduled queue, the next position
1670 * needs to be updated.
1671 */
1672 if (nextp)
1673 *nextp = n;
1674}
1675
1676static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1677{
1678 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1679 struct work_struct, entry);
1680 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1681
1682 move_linked_works(work, pos, NULL);
1683 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1684 cwq->nr_active++;
1685}
1686
1687/**
1688 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1689 * @cwq: cwq of interest
1690 * @color: color of work which left the queue
1691 * @delayed: for a delayed work
1692 *
1693 * A work either has completed or is removed from pending queue,
1694 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1695 *
1696 * CONTEXT:
1697 * spin_lock_irq(gcwq->lock).
1698 */
1699static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1700 bool delayed)
1701{
1702 /* ignore uncolored works */
1703 if (color == WORK_NO_COLOR)
1704 return;
1705
1706 cwq->nr_in_flight[color]--;
1707
1708 if (!delayed) {
1709 cwq->nr_active--;
1710 if (!list_empty(&cwq->delayed_works)) {
1711 /* one down, submit a delayed one */
1712 if (cwq->nr_active < cwq->max_active)
1713 cwq_activate_first_delayed(cwq);
1714 }
1715 }
1716
1717 /* is flush in progress and are we at the flushing tip? */
1718 if (likely(cwq->flush_color != color))
1719 return;
1720
1721 /* are there still in-flight works? */
1722 if (cwq->nr_in_flight[color])
1723 return;
1724
1725 /* this cwq is done, clear flush_color */
1726 cwq->flush_color = -1;
1727
1728 /*
1729 * If this was the last cwq, wake up the first flusher. It
1730 * will handle the rest.
1731 */
1732 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1733 complete(&cwq->wq->first_flusher->done);
1734}
1735
1736/**
1737 * process_one_work - process single work
1738 * @worker: self
1739 * @work: work to process
1740 *
 1741 * Process @work.  This function contains all the logic necessary to
1742 * process a single work including synchronization against and
1743 * interaction with other workers on the same cpu, queueing and
1744 * flushing. As long as context requirement is met, any worker can
1745 * call this function to process a work.
1746 *
1747 * CONTEXT:
1748 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1749 */
1750static void process_one_work(struct worker *worker, struct work_struct *work)
1751__releases(&gcwq->lock)
1752__acquires(&gcwq->lock)
1753{
1754 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1755 struct global_cwq *gcwq = cwq->gcwq;
1756 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1757 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1758 work_func_t f = work->func;
1759 int work_color;
1760 struct worker *collision;
1761#ifdef CONFIG_LOCKDEP
1762 /*
1763 * It is permissible to free the struct work_struct from
1764 * inside the function that is called from it, this we need to
1765 * take into account for lockdep too. To avoid bogus "held
1766 * lock freed" warnings as well as problems when looking into
1767 * work->lockdep_map, make a copy and use that here.
1768 */
1769 struct lockdep_map lockdep_map = work->lockdep_map;
1770#endif
1771 /*
1772 * A single work shouldn't be executed concurrently by
1773 * multiple workers on a single cpu. Check whether anyone is
1774 * already processing the work. If so, defer the work to the
1775 * currently executing one.
1776 */
1777 collision = __find_worker_executing_work(gcwq, bwh, work);
1778 if (unlikely(collision)) {
1779 move_linked_works(work, &collision->scheduled, NULL);
1780 return;
1781 }
1782
1783 /* claim and process */
1784 debug_work_deactivate(work);
1785 hlist_add_head(&worker->hentry, bwh);
1786 worker->current_work = work;
1787 worker->current_cwq = cwq;
1788 work_color = get_work_color(work);
1789
1790 /* record the current cpu number in the work data and dequeue */
1791 set_work_cpu(work, gcwq->cpu);
1792 list_del_init(&work->entry);
1793
1794 /*
1795 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1796 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1797 */
1798 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1799 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1800 struct work_struct, entry);
1801
1802 if (!list_empty(&gcwq->worklist) &&
1803 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1804 wake_up_worker(gcwq);
1805 else
1806 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1807 }
1808
1809 /*
1810 * CPU intensive works don't participate in concurrency
1811 * management. They're the scheduler's responsibility.
1812 */
1813 if (unlikely(cpu_intensive))
1814 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1815
1816 spin_unlock_irq(&gcwq->lock);
1817
1818 work_clear_pending(work);
1819 lock_map_acquire(&cwq->wq->lockdep_map);
1820 lock_map_acquire(&lockdep_map);
1821 trace_workqueue_execute_start(work);
1822 f(work);
1823 /*
1824 * While we must be careful to not use "work" after this, the trace
1825 * point will only record its address.
1826 */
1827 trace_workqueue_execute_end(work);
1828 lock_map_release(&lockdep_map);
1829 lock_map_release(&cwq->wq->lockdep_map);
1830
1831 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1832 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1833 "%s/0x%08x/%d\n",
1834 current->comm, preempt_count(), task_pid_nr(current));
1835 printk(KERN_ERR " last function: ");
1836 print_symbol("%s\n", (unsigned long)f);
1837 debug_show_held_locks(current);
1838 dump_stack();
1839 }
1840
1841 spin_lock_irq(&gcwq->lock);
1842
1843 /* clear cpu intensive status */
1844 if (unlikely(cpu_intensive))
1845 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1846
1847 /* we're done with it, release */
1848 hlist_del_init(&worker->hentry);
1849 worker->current_work = NULL;
1850 worker->current_cwq = NULL;
1851 cwq_dec_nr_in_flight(cwq, work_color, false);
1852}
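The WQ_CPU_INTENSIVE exemption described above is chosen when the workqueue is created. A sketch under that assumption; the names are made up.

#include <linux/errno.h>
#include <linux/workqueue.h>

static void crunch_fn(struct work_struct *work)
{
        /* long, CPU-bound payload that must not stall concurrency management */
}

static DECLARE_WORK(crunch_work, crunch_fn);

static int crunch_start(void)
{
        /*
         * Workers running works from a WQ_CPU_INTENSIVE queue get
         * WORKER_CPU_INTENSIVE set in process_one_work() and are left
         * out of the gcwq's nr_running accounting while they run.
         */
        struct workqueue_struct *wq =
                alloc_workqueue("crunch_example", WQ_CPU_INTENSIVE, 0);

        if (!wq)
                return -ENOMEM;
        queue_work(wq, &crunch_work);
        return 0;
}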
1853
1854/**
1855 * process_scheduled_works - process scheduled works
1856 * @worker: self
1857 *
1858 * Process all scheduled works. Please note that the scheduled list
1859 * may change while processing a work, so this function repeatedly
1860 * fetches a work from the top and executes it.
1861 *
1862 * CONTEXT:
1863 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1864 * multiple times.
1865 */
1866static void process_scheduled_works(struct worker *worker)
1867{
1868 while (!list_empty(&worker->scheduled)) {
1869 struct work_struct *work = list_first_entry(&worker->scheduled,
1870 struct work_struct, entry);
1871 process_one_work(worker, work);
1872 }
1873}
1874
1875/**
1876 * worker_thread - the worker thread function
1877 * @__worker: self
1878 *
1879 * The gcwq worker thread function. There's a single dynamic pool of
 1880 * these per cpu.  These workers process all works regardless of
1881 * their specific target workqueue. The only exception is works which
1882 * belong to workqueues with a rescuer which will be explained in
1883 * rescuer_thread().
1884 */
1885static int worker_thread(void *__worker)
1886{
1887 struct worker *worker = __worker;
1888 struct global_cwq *gcwq = worker->gcwq;
1889
1890 /* tell the scheduler that this is a workqueue worker */
1891 worker->task->flags |= PF_WQ_WORKER;
1892woke_up:
1893 spin_lock_irq(&gcwq->lock);
1894
1895 /* DIE can be set only while we're idle, checking here is enough */
1896 if (worker->flags & WORKER_DIE) {
1897 spin_unlock_irq(&gcwq->lock);
1898 worker->task->flags &= ~PF_WQ_WORKER;
1899 return 0;
1900 }
1901
1902 worker_leave_idle(worker);
1903recheck:
1904 /* no more worker necessary? */
1905 if (!need_more_worker(gcwq))
1906 goto sleep;
1907
1908 /* do we need to manage? */
1909 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1910 goto recheck;
1911
1912 /*
1913 * ->scheduled list can only be filled while a worker is
1914 * preparing to process a work or actually processing it.
1915 * Make sure nobody diddled with it while I was sleeping.
1916 */
1917 BUG_ON(!list_empty(&worker->scheduled));
1918
1919 /*
1920 * When control reaches this point, we're guaranteed to have
1921 * at least one idle worker or that someone else has already
1922 * assumed the manager role.
1923 */
1924 worker_clr_flags(worker, WORKER_PREP);
1925
1926 do {
1927 struct work_struct *work =
1928 list_first_entry(&gcwq->worklist,
1929 struct work_struct, entry);
1930
1931 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1932 /* optimization path, not strictly necessary */
1933 process_one_work(worker, work);
1934 if (unlikely(!list_empty(&worker->scheduled)))
1935 process_scheduled_works(worker);
1936 } else {
1937 move_linked_works(work, &worker->scheduled, NULL);
1938 process_scheduled_works(worker);
1939 }
1940 } while (keep_working(gcwq));
1941
1942 worker_set_flags(worker, WORKER_PREP, false);
1943sleep:
1944 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1945 goto recheck;
1946
1947 /*
1948 * gcwq->lock is held and there's no work to process and no
1949 * need to manage, sleep. Workers are woken up only while
1950 * holding gcwq->lock or from local cpu, so setting the
1951 * current state before releasing gcwq->lock is enough to
1952 * prevent losing any event.
1953 */
1954 worker_enter_idle(worker);
1955 __set_current_state(TASK_INTERRUPTIBLE);
1956 spin_unlock_irq(&gcwq->lock);
1957 schedule();
1958 goto woke_up;
1959}
1960
1961/**
1962 * rescuer_thread - the rescuer thread function
1963 * @__wq: the associated workqueue
1964 *
1965 * Workqueue rescuer thread function. There's one rescuer for each
1966 * workqueue which has WQ_RESCUER set.
1967 *
1968 * Regular work processing on a gcwq may block trying to create a new
 1969 * worker which uses a GFP_KERNEL allocation, which has a slight chance of
 1970 * developing into a deadlock if some works currently on the same queue
1971 * need to be processed to satisfy the GFP_KERNEL allocation. This is
1972 * the problem rescuer solves.
1973 *
 1974 * When such a condition is possible, the gcwq summons the rescuers of all
 1975 * workqueues which have works queued on the gcwq and lets them process
 1976 * those works so that forward progress can be guaranteed.
1977 *
1978 * This should happen rarely.
1979 */
1980static int rescuer_thread(void *__wq)
1981{
1982 struct workqueue_struct *wq = __wq;
1983 struct worker *rescuer = wq->rescuer;
1984 struct list_head *scheduled = &rescuer->scheduled;
1985 bool is_unbound = wq->flags & WQ_UNBOUND;
1986 unsigned int cpu;
1987
1988 set_user_nice(current, RESCUER_NICE_LEVEL);
1989repeat:
1990 set_current_state(TASK_INTERRUPTIBLE);
1991
1992 if (kthread_should_stop())
1993 return 0;
1994
1995 /*
1996 * See whether any cpu is asking for help. Unbounded
1997 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
1998 */
1999 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2000 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2001 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2002 struct global_cwq *gcwq = cwq->gcwq;
2003 struct work_struct *work, *n;
2004
2005 __set_current_state(TASK_RUNNING);
2006 mayday_clear_cpu(cpu, wq->mayday_mask);
2007
2008 /* migrate to the target cpu if possible */
2009 rescuer->gcwq = gcwq;
2010 worker_maybe_bind_and_lock(rescuer);
2011
2012 /*
2013 * Slurp in all works issued via this workqueue and
2014 * process'em.
2015 */
2016 BUG_ON(!list_empty(&rescuer->scheduled));
2017 list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2018 if (get_work_cwq(work) == cwq)
2019 move_linked_works(work, scheduled, &n);
2020
2021 process_scheduled_works(rescuer);
2022 spin_unlock_irq(&gcwq->lock);
2023 }
2024
2025 schedule();
2026 goto repeat;
449} 2027}
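A rescuer exists only for workqueues created with the rescuer flag. A hedged sketch for a queue that must keep making progress under memory pressure; the name is illustrative, and in this tree the flag is spelled WQ_RESCUER.

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *reclaim_wq;

static int reclaim_wq_init(void)
{
        /*
         * With WQ_RESCUER, send_mayday() can hand stalled works to the
         * rescuer thread above when worker creation hits an allocation
         * deadlock, so works on this queue keep moving forward.
         */
        reclaim_wq = alloc_workqueue("reclaim_example", WQ_RESCUER, 1);
        return reclaim_wq ? 0 : -ENOMEM;
}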
450 2028
451struct wq_barrier { 2029struct wq_barrier {
@@ -459,44 +2037,137 @@ static void wq_barrier_func(struct work_struct *work)
459 complete(&barr->done); 2037 complete(&barr->done);
460} 2038}
461 2039
2040/**
2041 * insert_wq_barrier - insert a barrier work
2042 * @cwq: cwq to insert barrier into
2043 * @barr: wq_barrier to insert
2044 * @target: target work to attach @barr to
2045 * @worker: worker currently executing @target, NULL if @target is not executing
2046 *
2047 * @barr is linked to @target such that @barr is completed only after
2048 * @target finishes execution. Please note that the ordering
2049 * guarantee is observed only with respect to @target and on the local
2050 * cpu.
2051 *
2052 * Currently, a queued barrier can't be canceled. This is because
2053 * try_to_grab_pending() can't determine whether the work to be
2054 * grabbed is at the head of the queue and thus can't clear LINKED
2055 * flag of the previous work while there must be a valid next work
2056 * after a work with LINKED flag set.
2057 *
2058 * Note that when @worker is non-NULL, @target may be modified
2059 * underneath us, so we can't reliably determine cwq from @target.
2060 *
2061 * CONTEXT:
2062 * spin_lock_irq(gcwq->lock).
2063 */
462static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2064static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
463 struct wq_barrier *barr, struct list_head *head) 2065 struct wq_barrier *barr,
2066 struct work_struct *target, struct worker *worker)
464{ 2067{
2068 struct list_head *head;
2069 unsigned int linked = 0;
2070
465 /* 2071 /*
466 * debugobject calls are safe here even with cwq->lock locked 2072 * debugobject calls are safe here even with gcwq->lock locked
467 * as we know for sure that this will not trigger any of the 2073 * as we know for sure that this will not trigger any of the
468 * checks and call back into the fixup functions where we 2074 * checks and call back into the fixup functions where we
469 * might deadlock. 2075 * might deadlock.
470 */ 2076 */
471 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2077 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
472 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 2078 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
473
474 init_completion(&barr->done); 2079 init_completion(&barr->done);
475 2080
2081 /*
2082 * If @target is currently being executed, schedule the
2083 * barrier to the worker; otherwise, put it after @target.
2084 */
2085 if (worker)
2086 head = worker->scheduled.next;
2087 else {
2088 unsigned long *bits = work_data_bits(target);
2089
2090 head = target->entry.next;
2091 /* there can already be other linked works, inherit and set */
2092 linked = *bits & WORK_STRUCT_LINKED;
2093 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2094 }
2095
476 debug_work_activate(&barr->work); 2096 debug_work_activate(&barr->work);
477 insert_work(cwq, &barr->work, head); 2097 insert_work(cwq, &barr->work, head,
2098 work_color_to_flags(WORK_NO_COLOR) | linked);
478} 2099}
479 2100
480static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 2101/**
2102 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2103 * @wq: workqueue being flushed
2104 * @flush_color: new flush color, < 0 for no-op
2105 * @work_color: new work color, < 0 for no-op
2106 *
2107 * Prepare cwqs for workqueue flushing.
2108 *
2109 * If @flush_color is non-negative, flush_color on all cwqs should be
2110 * -1. If no cwq has in-flight commands at the specified color, all
2111 * cwq->flush_color's stay at -1 and %false is returned. If any cwq
2112 * has in flight commands, its cwq->flush_color is set to
2113 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2114 * wakeup logic is armed and %true is returned.
2115 *
2116 * The caller should have initialized @wq->first_flusher prior to
2117 * calling this function with non-negative @flush_color. If
2118 * @flush_color is negative, no flush color update is done and %false
2119 * is returned.
2120 *
2121 * If @work_color is non-negative, all cwqs should have the same
2122 * work_color which is previous to @work_color and all will be
2123 * advanced to @work_color.
2124 *
2125 * CONTEXT:
2126 * mutex_lock(wq->flush_mutex).
2127 *
2128 * RETURNS:
2129 * %true if @flush_color >= 0 and there's something to flush. %false
2130 * otherwise.
2131 */
2132static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2133 int flush_color, int work_color)
481{ 2134{
482 int active = 0; 2135 bool wait = false;
483 struct wq_barrier barr; 2136 unsigned int cpu;
484
485 WARN_ON(cwq->thread == current);
486 2137
487 spin_lock_irq(&cwq->lock); 2138 if (flush_color >= 0) {
488 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 2139 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
489 insert_wq_barrier(cwq, &barr, &cwq->worklist); 2140 atomic_set(&wq->nr_cwqs_to_flush, 1);
490 active = 1;
491 } 2141 }
492 spin_unlock_irq(&cwq->lock);
493 2142
494 if (active) { 2143 for_each_cwq_cpu(cpu, wq) {
495 wait_for_completion(&barr.done); 2144 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
496 destroy_work_on_stack(&barr.work); 2145 struct global_cwq *gcwq = cwq->gcwq;
2146
2147 spin_lock_irq(&gcwq->lock);
2148
2149 if (flush_color >= 0) {
2150 BUG_ON(cwq->flush_color != -1);
2151
2152 if (cwq->nr_in_flight[flush_color]) {
2153 cwq->flush_color = flush_color;
2154 atomic_inc(&wq->nr_cwqs_to_flush);
2155 wait = true;
2156 }
2157 }
2158
2159 if (work_color >= 0) {
2160 BUG_ON(work_color != work_next_color(cwq->work_color));
2161 cwq->work_color = work_color;
2162 }
2163
2164 spin_unlock_irq(&gcwq->lock);
497 } 2165 }
498 2166
499 return active; 2167 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2168 complete(&wq->first_flusher->done);
2169
2170 return wait;
500} 2171}
501 2172
502/** 2173/**
@@ -508,20 +2179,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
508 * 2179 *
509 * We sleep until all works which were queued on entry have been handled, 2180 * We sleep until all works which were queued on entry have been handled,
510 * but we are not livelocked by new incoming ones. 2181 * but we are not livelocked by new incoming ones.
511 *
512 * This function used to run the workqueues itself. Now we just wait for the
513 * helper threads to do it.
514 */ 2182 */
515void flush_workqueue(struct workqueue_struct *wq) 2183void flush_workqueue(struct workqueue_struct *wq)
516{ 2184{
517 const struct cpumask *cpu_map = wq_cpu_map(wq); 2185 struct wq_flusher this_flusher = {
518 int cpu; 2186 .list = LIST_HEAD_INIT(this_flusher.list),
2187 .flush_color = -1,
2188 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2189 };
2190 int next_color;
519 2191
520 might_sleep();
521 lock_map_acquire(&wq->lockdep_map); 2192 lock_map_acquire(&wq->lockdep_map);
522 lock_map_release(&wq->lockdep_map); 2193 lock_map_release(&wq->lockdep_map);
523 for_each_cpu(cpu, cpu_map) 2194
524 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 2195 mutex_lock(&wq->flush_mutex);
2196
2197 /*
2198 * Start-to-wait phase
2199 */
2200 next_color = work_next_color(wq->work_color);
2201
2202 if (next_color != wq->flush_color) {
2203 /*
2204 * Color space is not full. The current work_color
2205 * becomes our flush_color and work_color is advanced
2206 * by one.
2207 */
2208 BUG_ON(!list_empty(&wq->flusher_overflow));
2209 this_flusher.flush_color = wq->work_color;
2210 wq->work_color = next_color;
2211
2212 if (!wq->first_flusher) {
2213 /* no flush in progress, become the first flusher */
2214 BUG_ON(wq->flush_color != this_flusher.flush_color);
2215
2216 wq->first_flusher = &this_flusher;
2217
2218 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2219 wq->work_color)) {
2220 /* nothing to flush, done */
2221 wq->flush_color = next_color;
2222 wq->first_flusher = NULL;
2223 goto out_unlock;
2224 }
2225 } else {
2226 /* wait in queue */
2227 BUG_ON(wq->flush_color == this_flusher.flush_color);
2228 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2229 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2230 }
2231 } else {
2232 /*
2233 * Oops, color space is full, wait on overflow queue.
2234 * The next flush completion will assign us
2235 * flush_color and transfer to flusher_queue.
2236 */
2237 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2238 }
2239
2240 mutex_unlock(&wq->flush_mutex);
2241
2242 wait_for_completion(&this_flusher.done);
2243
2244 /*
2245 * Wake-up-and-cascade phase
2246 *
2247 * First flushers are responsible for cascading flushes and
2248 * handling overflow. Non-first flushers can simply return.
2249 */
2250 if (wq->first_flusher != &this_flusher)
2251 return;
2252
2253 mutex_lock(&wq->flush_mutex);
2254
2255 /* we might have raced, check again with mutex held */
2256 if (wq->first_flusher != &this_flusher)
2257 goto out_unlock;
2258
2259 wq->first_flusher = NULL;
2260
2261 BUG_ON(!list_empty(&this_flusher.list));
2262 BUG_ON(wq->flush_color != this_flusher.flush_color);
2263
2264 while (true) {
2265 struct wq_flusher *next, *tmp;
2266
2267 /* complete all the flushers sharing the current flush color */
2268 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2269 if (next->flush_color != wq->flush_color)
2270 break;
2271 list_del_init(&next->list);
2272 complete(&next->done);
2273 }
2274
2275 BUG_ON(!list_empty(&wq->flusher_overflow) &&
2276 wq->flush_color != work_next_color(wq->work_color));
2277
2278 /* this flush_color is finished, advance by one */
2279 wq->flush_color = work_next_color(wq->flush_color);
2280
2281 /* one color has been freed, handle overflow queue */
2282 if (!list_empty(&wq->flusher_overflow)) {
2283 /*
2284 * Assign the same color to all overflowed
2285 * flushers, advance work_color and append to
2286 * flusher_queue. This is the start-to-wait
2287 * phase for these overflowed flushers.
2288 */
2289 list_for_each_entry(tmp, &wq->flusher_overflow, list)
2290 tmp->flush_color = wq->work_color;
2291
2292 wq->work_color = work_next_color(wq->work_color);
2293
2294 list_splice_tail_init(&wq->flusher_overflow,
2295 &wq->flusher_queue);
2296 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2297 }
2298
2299 if (list_empty(&wq->flusher_queue)) {
2300 BUG_ON(wq->flush_color != wq->work_color);
2301 break;
2302 }
2303
2304 /*
2305 * Need to flush more colors. Make the next flusher
2306 * the new first flusher and arm cwqs.
2307 */
2308 BUG_ON(wq->flush_color == wq->work_color);
2309 BUG_ON(wq->flush_color != next->flush_color);
2310
2311 list_del_init(&next->list);
2312 wq->first_flusher = next;
2313
2314 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2315 break;
2316
2317 /*
2318 * Meh... this color is already done, clear first
2319 * flusher and repeat cascading.
2320 */
2321 wq->first_flusher = NULL;
2322 }
2323
2324out_unlock:
2325 mutex_unlock(&wq->flush_mutex);
525} 2326}
526EXPORT_SYMBOL_GPL(flush_workqueue); 2327EXPORT_SYMBOL_GPL(flush_workqueue);
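Typical caller-side use of the flush machinery above, for example during driver teardown; a sketch with assumed names.

#include <linux/workqueue.h>

static void example_shutdown(struct workqueue_struct *wq)
{
        /*
         * Waits for every work queued before this call; later flushers
         * simply join the colour cascade handled above instead of
         * livelocking on newly queued works.
         */
        flush_workqueue(wq);
        destroy_workqueue(wq);
}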
527 2328
@@ -537,43 +2338,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
537 */ 2338 */
538int flush_work(struct work_struct *work) 2339int flush_work(struct work_struct *work)
539{ 2340{
2341 struct worker *worker = NULL;
2342 struct global_cwq *gcwq;
540 struct cpu_workqueue_struct *cwq; 2343 struct cpu_workqueue_struct *cwq;
541 struct list_head *prev;
542 struct wq_barrier barr; 2344 struct wq_barrier barr;
543 2345
544 might_sleep(); 2346 might_sleep();
545 cwq = get_wq_data(work); 2347 gcwq = get_work_gcwq(work);
546 if (!cwq) 2348 if (!gcwq)
547 return 0; 2349 return 0;
548 2350
549 lock_map_acquire(&cwq->wq->lockdep_map); 2351 spin_lock_irq(&gcwq->lock);
550 lock_map_release(&cwq->wq->lockdep_map);
551
552 prev = NULL;
553 spin_lock_irq(&cwq->lock);
554 if (!list_empty(&work->entry)) { 2352 if (!list_empty(&work->entry)) {
555 /* 2353 /*
556 * See the comment near try_to_grab_pending()->smp_rmb(). 2354 * See the comment near try_to_grab_pending()->smp_rmb().
557 * If it was re-queued under us we are not going to wait. 2355 * If it was re-queued to a different gcwq under us, we
2356 * are not going to wait.
558 */ 2357 */
559 smp_rmb(); 2358 smp_rmb();
560 if (unlikely(cwq != get_wq_data(work))) 2359 cwq = get_work_cwq(work);
561 goto out; 2360 if (unlikely(!cwq || gcwq != cwq->gcwq))
562 prev = &work->entry; 2361 goto already_gone;
563 } else { 2362 } else {
564 if (cwq->current_work != work) 2363 worker = find_worker_executing_work(gcwq, work);
565 goto out; 2364 if (!worker)
566 prev = &cwq->worklist; 2365 goto already_gone;
2366 cwq = worker->current_cwq;
567 } 2367 }
568 insert_wq_barrier(cwq, &barr, prev->next); 2368
569out: 2369 insert_wq_barrier(cwq, &barr, work, worker);
570 spin_unlock_irq(&cwq->lock); 2370 spin_unlock_irq(&gcwq->lock);
571 if (!prev) 2371
572 return 0; 2372 lock_map_acquire(&cwq->wq->lockdep_map);
2373 lock_map_release(&cwq->wq->lockdep_map);
573 2374
574 wait_for_completion(&barr.done); 2375 wait_for_completion(&barr.done);
575 destroy_work_on_stack(&barr.work); 2376 destroy_work_on_stack(&barr.work);
576 return 1; 2377 return 1;
2378already_gone:
2379 spin_unlock_irq(&gcwq->lock);
2380 return 0;
577} 2381}
578EXPORT_SYMBOL_GPL(flush_work); 2382EXPORT_SYMBOL_GPL(flush_work);
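flush_work() waits for one specific work item only. A sketch of the usual pattern; the names are invented.

#include <linux/workqueue.h>

static void stats_fn(struct work_struct *work)
{
        /* gather statistics */
}

static DECLARE_WORK(stats_work, stats_fn);

static void stats_sync(void)
{
        /*
         * Returns 1 once the barrier completes if stats_work was queued
         * or being executed (found via find_worker_executing_work()),
         * 0 if there was nothing to wait for.
         */
        flush_work(&stats_work);
}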
579 2383
@@ -583,54 +2387,56 @@ EXPORT_SYMBOL_GPL(flush_work);
583 */ 2387 */
584static int try_to_grab_pending(struct work_struct *work) 2388static int try_to_grab_pending(struct work_struct *work)
585{ 2389{
586 struct cpu_workqueue_struct *cwq; 2390 struct global_cwq *gcwq;
587 int ret = -1; 2391 int ret = -1;
588 2392
589 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) 2393 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
590 return 0; 2394 return 0;
591 2395
592 /* 2396 /*
593 * The queueing is in progress, or it is already queued. Try to 2397 * The queueing is in progress, or it is already queued. Try to
594 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 2398 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
595 */ 2399 */
596 2400 gcwq = get_work_gcwq(work);
597 cwq = get_wq_data(work); 2401 if (!gcwq)
598 if (!cwq)
599 return ret; 2402 return ret;
600 2403
601 spin_lock_irq(&cwq->lock); 2404 spin_lock_irq(&gcwq->lock);
602 if (!list_empty(&work->entry)) { 2405 if (!list_empty(&work->entry)) {
603 /* 2406 /*
604 * This work is queued, but perhaps we locked the wrong cwq. 2407 * This work is queued, but perhaps we locked the wrong gcwq.
605 * In that case we must see the new value after rmb(), see 2408 * In that case we must see the new value after rmb(), see
606 * insert_work()->wmb(). 2409 * insert_work()->wmb().
607 */ 2410 */
608 smp_rmb(); 2411 smp_rmb();
609 if (cwq == get_wq_data(work)) { 2412 if (gcwq == get_work_gcwq(work)) {
610 debug_work_deactivate(work); 2413 debug_work_deactivate(work);
611 list_del_init(&work->entry); 2414 list_del_init(&work->entry);
2415 cwq_dec_nr_in_flight(get_work_cwq(work),
2416 get_work_color(work),
2417 *work_data_bits(work) & WORK_STRUCT_DELAYED);
612 ret = 1; 2418 ret = 1;
613 } 2419 }
614 } 2420 }
615 spin_unlock_irq(&cwq->lock); 2421 spin_unlock_irq(&gcwq->lock);
616 2422
617 return ret; 2423 return ret;
618} 2424}
619 2425
620static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, 2426static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
621 struct work_struct *work)
622{ 2427{
623 struct wq_barrier barr; 2428 struct wq_barrier barr;
624 int running = 0; 2429 struct worker *worker;
625 2430
626 spin_lock_irq(&cwq->lock); 2431 spin_lock_irq(&gcwq->lock);
627 if (unlikely(cwq->current_work == work)) { 2432
628 insert_wq_barrier(cwq, &barr, cwq->worklist.next); 2433 worker = find_worker_executing_work(gcwq, work);
629 running = 1; 2434 if (unlikely(worker))
630 } 2435 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
631 spin_unlock_irq(&cwq->lock);
632 2436
633 if (unlikely(running)) { 2437 spin_unlock_irq(&gcwq->lock);
2438
2439 if (unlikely(worker)) {
634 wait_for_completion(&barr.done); 2440 wait_for_completion(&barr.done);
635 destroy_work_on_stack(&barr.work); 2441 destroy_work_on_stack(&barr.work);
636 } 2442 }
@@ -638,9 +2444,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
638 2444
639static void wait_on_work(struct work_struct *work) 2445static void wait_on_work(struct work_struct *work)
640{ 2446{
641 struct cpu_workqueue_struct *cwq;
642 struct workqueue_struct *wq;
643 const struct cpumask *cpu_map;
644 int cpu; 2447 int cpu;
645 2448
646 might_sleep(); 2449 might_sleep();
@@ -648,15 +2451,8 @@ static void wait_on_work(struct work_struct *work)
648 lock_map_acquire(&work->lockdep_map); 2451 lock_map_acquire(&work->lockdep_map);
649 lock_map_release(&work->lockdep_map); 2452 lock_map_release(&work->lockdep_map);
650 2453
651 cwq = get_wq_data(work); 2454 for_each_gcwq_cpu(cpu)
652 if (!cwq) 2455 wait_on_cpu_work(get_gcwq(cpu), work);
653 return;
654
655 wq = cwq->wq;
656 cpu_map = wq_cpu_map(wq);
657
658 for_each_cpu(cpu, cpu_map)
659 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
660} 2456}
661 2457
662static int __cancel_work_timer(struct work_struct *work, 2458static int __cancel_work_timer(struct work_struct *work,
@@ -671,7 +2467,7 @@ static int __cancel_work_timer(struct work_struct *work,
671 wait_on_work(work); 2467 wait_on_work(work);
672 } while (unlikely(ret < 0)); 2468 } while (unlikely(ret < 0));
673 2469
674 work_clear_pending(work); 2470 clear_work_data(work);
675 return ret; 2471 return ret;
676} 2472}
677 2473
@@ -717,8 +2513,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork)
717} 2513}
718EXPORT_SYMBOL(cancel_delayed_work_sync); 2514EXPORT_SYMBOL(cancel_delayed_work_sync);
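Caller-side sketch of the cancel path this hunk touches; the function name is illustrative and the delayed work is assumed to have been queued elsewhere.

#include <linux/workqueue.h>

static void example_stop(struct delayed_work *dwork)
{
        /*
         * Deactivates a pending timer or queued instance via
         * try_to_grab_pending() and waits for a running instance;
         * clear_work_data() then drops the recorded gcwq association.
         */
        cancel_delayed_work_sync(dwork);
}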
719 2515
720static struct workqueue_struct *keventd_wq __read_mostly;
721
722/** 2516/**
723 * schedule_work - put work task in global workqueue 2517 * schedule_work - put work task in global workqueue
724 * @work: job to be done 2518 * @work: job to be done
@@ -732,7 +2526,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
732 */ 2526 */
733int schedule_work(struct work_struct *work) 2527int schedule_work(struct work_struct *work)
734{ 2528{
735 return queue_work(keventd_wq, work); 2529 return queue_work(system_wq, work);
736} 2530}
737EXPORT_SYMBOL(schedule_work); 2531EXPORT_SYMBOL(schedule_work);
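schedule_work() is now simply queue_work() on system_wq; a minimal sketch with invented names.

#include <linux/workqueue.h>

static void notify_fn(struct work_struct *work)
{
        /* short, mostly non-blocking payload suited to the shared system_wq */
}

static DECLARE_WORK(notify_work, notify_fn);

static void notify_later(void)
{
        schedule_work(&notify_work);    /* same as queue_work(system_wq, &notify_work) */
}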
738 2532
@@ -745,7 +2539,7 @@ EXPORT_SYMBOL(schedule_work);
745 */ 2539 */
746int schedule_work_on(int cpu, struct work_struct *work) 2540int schedule_work_on(int cpu, struct work_struct *work)
747{ 2541{
748 return queue_work_on(cpu, keventd_wq, work); 2542 return queue_work_on(cpu, system_wq, work);
749} 2543}
750EXPORT_SYMBOL(schedule_work_on); 2544EXPORT_SYMBOL(schedule_work_on);
751 2545
@@ -760,7 +2554,7 @@ EXPORT_SYMBOL(schedule_work_on);
760int schedule_delayed_work(struct delayed_work *dwork, 2554int schedule_delayed_work(struct delayed_work *dwork,
761 unsigned long delay) 2555 unsigned long delay)
762{ 2556{
763 return queue_delayed_work(keventd_wq, dwork, delay); 2557 return queue_delayed_work(system_wq, dwork, delay);
764} 2558}
765EXPORT_SYMBOL(schedule_delayed_work); 2559EXPORT_SYMBOL(schedule_delayed_work);
766 2560
@@ -773,9 +2567,8 @@ EXPORT_SYMBOL(schedule_delayed_work);
773void flush_delayed_work(struct delayed_work *dwork) 2567void flush_delayed_work(struct delayed_work *dwork)
774{ 2568{
775 if (del_timer_sync(&dwork->timer)) { 2569 if (del_timer_sync(&dwork->timer)) {
776 struct cpu_workqueue_struct *cwq; 2570 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
777 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); 2571 &dwork->work);
778 __queue_work(cwq, &dwork->work);
779 put_cpu(); 2572 put_cpu();
780 } 2573 }
781 flush_work(&dwork->work); 2574 flush_work(&dwork->work);
@@ -794,7 +2587,7 @@ EXPORT_SYMBOL(flush_delayed_work);
794int schedule_delayed_work_on(int cpu, 2587int schedule_delayed_work_on(int cpu,
795 struct delayed_work *dwork, unsigned long delay) 2588 struct delayed_work *dwork, unsigned long delay)
796{ 2589{
797 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 2590 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
798} 2591}
799EXPORT_SYMBOL(schedule_delayed_work_on); 2592EXPORT_SYMBOL(schedule_delayed_work_on);
800 2593
@@ -810,8 +2603,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
810int schedule_on_each_cpu(work_func_t func) 2603int schedule_on_each_cpu(work_func_t func)
811{ 2604{
812 int cpu; 2605 int cpu;
813 int orig = -1; 2606 struct work_struct __percpu *works;
814 struct work_struct *works;
815 2607
816 works = alloc_percpu(struct work_struct); 2608 works = alloc_percpu(struct work_struct);
817 if (!works) 2609 if (!works)
@@ -819,23 +2611,12 @@ int schedule_on_each_cpu(work_func_t func)
819 2611
820 get_online_cpus(); 2612 get_online_cpus();
821 2613
822 /*
823 * When running in keventd don't schedule a work item on
824 * itself. Can just call directly because the work queue is
825 * already bound. This also is faster.
826 */
827 if (current_is_keventd())
828 orig = raw_smp_processor_id();
829
830 for_each_online_cpu(cpu) { 2614 for_each_online_cpu(cpu) {
831 struct work_struct *work = per_cpu_ptr(works, cpu); 2615 struct work_struct *work = per_cpu_ptr(works, cpu);
832 2616
833 INIT_WORK(work, func); 2617 INIT_WORK(work, func);
834 if (cpu != orig) 2618 schedule_work_on(cpu, work);
835 schedule_work_on(cpu, work);
836 } 2619 }
837 if (orig >= 0)
838 func(per_cpu_ptr(works, orig));
839 2620
840 for_each_online_cpu(cpu) 2621 for_each_online_cpu(cpu)
841 flush_work(per_cpu_ptr(works, cpu)); 2622 flush_work(per_cpu_ptr(works, cpu));
@@ -845,9 +2626,33 @@ int schedule_on_each_cpu(work_func_t func)
845 return 0; 2626 return 0;
846} 2627}
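Sketch of a schedule_on_each_cpu() caller; with the keventd shortcut removed, the function now queues and flushes a work item on every online CPU, including the local one. The function names are invented.

#include <linux/workqueue.h>

static void drain_local_state(struct work_struct *unused)
{
        /* runs once on each online CPU, in that CPU's workqueue context */
}

static int drain_all_cpus(void)
{
        /* returns 0 on success, a negative errno if the per-cpu works can't be allocated */
        return schedule_on_each_cpu(drain_local_state);
}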
847 2628
2629/**
2630 * flush_scheduled_work - ensure that any scheduled work has run to completion.
2631 *
2632 * Forces execution of the kernel-global workqueue and blocks until its
2633 * completion.
2634 *
2635 * Think twice before calling this function! It's very easy to get into
2636 * trouble if you don't take great care. Either of the following situations
2637 * will lead to deadlock:
2638 *
2639 * One of the work items currently on the workqueue needs to acquire
2640 * a lock held by your code or its caller.
2641 *
2642 * Your code is running in the context of a work routine.
2643 *
2644 * They will be detected by lockdep when they occur, but the first might not
2645 * occur very often. It depends on what work items are on the workqueue and
2646 * what locks they need, which you have no control over.
2647 *
2648 * In most situations flushing the entire workqueue is overkill; you merely
2649 * need to know that a particular work item isn't queued and isn't running.
2650 * In such cases you should use cancel_delayed_work_sync() or
2651 * cancel_work_sync() instead.
2652 */
848void flush_scheduled_work(void) 2653void flush_scheduled_work(void)
849{ 2654{
850 flush_workqueue(keventd_wq); 2655 flush_workqueue(system_wq);
851} 2656}
852EXPORT_SYMBOL(flush_scheduled_work); 2657EXPORT_SYMBOL(flush_scheduled_work);
853 2658
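As the comment recommends, a caller that only needs one specific item to be idle should target it directly rather than flush the whole system workqueue. A sketch with a hypothetical driver work item (names are made up for illustration):

#include <linux/workqueue.h>

static void example_poll_fn(struct work_struct *work)
{
	/* hypothetical periodic polling body */
}

static DECLARE_DELAYED_WORK(example_poll_work, example_poll_fn);

static void example_shutdown(void)
{
	/*
	 * Waits only for example_poll_work: kills a pending timer or
	 * queue entry and, if the callback is already running, waits
	 * for it to finish.  Nothing else on system_wq is touched.
	 */
	cancel_delayed_work_sync(&example_poll_work);
}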
@@ -879,170 +2684,169 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
879 2684
880int keventd_up(void) 2685int keventd_up(void)
881{ 2686{
882 return keventd_wq != NULL; 2687 return system_wq != NULL;
883} 2688}
884 2689
885int current_is_keventd(void) 2690static int alloc_cwqs(struct workqueue_struct *wq)
886{ 2691{
887 struct cpu_workqueue_struct *cwq; 2692 /*
888 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ 2693 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
889 int ret = 0; 2694 * Make sure that the alignment isn't lower than that of
890 2695 * unsigned long long.
891 BUG_ON(!keventd_wq); 2696 */
2697 const size_t size = sizeof(struct cpu_workqueue_struct);
2698 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2699 __alignof__(unsigned long long));
2700#ifdef CONFIG_SMP
2701 bool percpu = !(wq->flags & WQ_UNBOUND);
2702#else
2703 bool percpu = false;
2704#endif
892 2705
893 cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); 2706 if (percpu)
894 if (current == cwq->thread) 2707 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
895 ret = 1; 2708 else {
2709 void *ptr;
896 2710
897 return ret; 2711 /*
2712 * Allocate enough room to align cwq and put an extra
2713 * pointer at the end pointing back to the originally
2714 * allocated pointer which will be used for free.
2715 */
2716 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2717 if (ptr) {
2718 wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2719 *(void **)(wq->cpu_wq.single + 1) = ptr;
2720 }
2721 }
898 2722
2723 /* just in case, make sure it's actually aligned */
2724 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2725 return wq->cpu_wq.v ? 0 : -ENOMEM;
899} 2726}
900 2727
901static struct cpu_workqueue_struct * 2728static void free_cwqs(struct workqueue_struct *wq)
902init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
903{ 2729{
904 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 2730#ifdef CONFIG_SMP
905 2731 bool percpu = !(wq->flags & WQ_UNBOUND);
906 cwq->wq = wq; 2732#else
907 spin_lock_init(&cwq->lock); 2733 bool percpu = false;
908 INIT_LIST_HEAD(&cwq->worklist); 2734#endif
909 init_waitqueue_head(&cwq->more_work);
910 2735
911 return cwq; 2736 if (percpu)
2737 free_percpu(wq->cpu_wq.pcpu);
2738 else if (wq->cpu_wq.single) {
2739 /* the pointer to free is stored right after the cwq */
2740 kfree(*(void **)(wq->cpu_wq.single + 1));
2741 }
912} 2742}
913 2743
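The non-percpu branch of alloc_cwqs() above over-allocates, aligns the object with PTR_ALIGN() and stashes the original pointer just past the aligned object so free_cwqs() can recover it. The same pattern in isolation, with a made-up payload type (illustrative sketch only):

#include <linux/kernel.h>
#include <linux/slab.h>

struct example_payload { unsigned long data[4]; };

static struct example_payload *example_alloc_aligned(size_t align)
{
	void *raw;
	struct example_payload *obj;

	/* room for the object, worst-case alignment slack and a back pointer */
	raw = kzalloc(sizeof(*obj) + align + sizeof(void *), GFP_KERNEL);
	if (!raw)
		return NULL;

	obj = PTR_ALIGN(raw, align);
	*(void **)(obj + 1) = raw;	/* remember what to kfree() later */
	return obj;
}

static void example_free_aligned(struct example_payload *obj)
{
	if (obj)
		kfree(*(void **)(obj + 1));	/* free the original allocation */
}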
914static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2744static int wq_clamp_max_active(int max_active, unsigned int flags,
2745 const char *name)
915{ 2746{
916 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 2747 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
917 struct workqueue_struct *wq = cwq->wq;
918 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
919 struct task_struct *p;
920 2748
921 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); 2749 if (max_active < 1 || max_active > lim)
922 /* 2750 printk(KERN_WARNING "workqueue: max_active %d requested for %s "
923 * Nobody can add the work_struct to this cwq, 2751 "is out of range, clamping between %d and %d\n",
924 * if (caller is __create_workqueue) 2752 max_active, name, 1, lim);
925 * nobody should see this wq
926 * else // caller is CPU_UP_PREPARE
927 * cpu is not on cpu_online_map
928 * so we can abort safely.
929 */
930 if (IS_ERR(p))
931 return PTR_ERR(p);
932 if (cwq->wq->rt)
933 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
934 cwq->thread = p;
935 2753
936 trace_workqueue_creation(cwq->thread, cpu); 2754 return clamp_val(max_active, 1, lim);
937
938 return 0;
939} 2755}
940 2756
941static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2757struct workqueue_struct *__alloc_workqueue_key(const char *name,
2758 unsigned int flags,
2759 int max_active,
2760 struct lock_class_key *key,
2761 const char *lock_name)
942{ 2762{
943 struct task_struct *p = cwq->thread; 2763 struct workqueue_struct *wq;
2764 unsigned int cpu;
944 2765
945 if (p != NULL) { 2766 /*
946 if (cpu >= 0) 2767 * Unbound workqueues aren't concurrency managed and should be
947 kthread_bind(p, cpu); 2768 * dispatched to workers immediately.
948 wake_up_process(p); 2769 */
949 } 2770 if (flags & WQ_UNBOUND)
950} 2771 flags |= WQ_HIGHPRI;
951 2772
952struct workqueue_struct *__create_workqueue_key(const char *name, 2773 max_active = max_active ?: WQ_DFL_ACTIVE;
953 int singlethread, 2774 max_active = wq_clamp_max_active(max_active, flags, name);
954 int freezeable,
955 int rt,
956 struct lock_class_key *key,
957 const char *lock_name)
958{
959 struct workqueue_struct *wq;
960 struct cpu_workqueue_struct *cwq;
961 int err = 0, cpu;
962 2775
963 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 2776 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
964 if (!wq) 2777 if (!wq)
965 return NULL; 2778 goto err;
966 2779
967 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 2780 wq->flags = flags;
968 if (!wq->cpu_wq) { 2781 wq->saved_max_active = max_active;
969 kfree(wq); 2782 mutex_init(&wq->flush_mutex);
970 return NULL; 2783 atomic_set(&wq->nr_cwqs_to_flush, 0);
971 } 2784 INIT_LIST_HEAD(&wq->flusher_queue);
2785 INIT_LIST_HEAD(&wq->flusher_overflow);
972 2786
973 wq->name = name; 2787 wq->name = name;
974 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 2788 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
975 wq->singlethread = singlethread;
976 wq->freezeable = freezeable;
977 wq->rt = rt;
978 INIT_LIST_HEAD(&wq->list); 2789 INIT_LIST_HEAD(&wq->list);
979 2790
980 if (singlethread) { 2791 if (alloc_cwqs(wq) < 0)
981 cwq = init_cpu_workqueue(wq, singlethread_cpu); 2792 goto err;
982 err = create_workqueue_thread(cwq, singlethread_cpu); 2793
983 start_workqueue_thread(cwq, -1); 2794 for_each_cwq_cpu(cpu, wq) {
984 } else { 2795 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
985 cpu_maps_update_begin(); 2796 struct global_cwq *gcwq = get_gcwq(cpu);
986 /* 2797
987 * We must place this wq on list even if the code below fails. 2798 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
988 * cpu_down(cpu) can remove cpu from cpu_populated_map before 2799 cwq->gcwq = gcwq;
989 * destroy_workqueue() takes the lock, in that case we leak 2800 cwq->wq = wq;
990 * cwq[cpu]->thread. 2801 cwq->flush_color = -1;
991 */ 2802 cwq->max_active = max_active;
992 spin_lock(&workqueue_lock); 2803 INIT_LIST_HEAD(&cwq->delayed_works);
993 list_add(&wq->list, &workqueues);
994 spin_unlock(&workqueue_lock);
995 /*
996 * We must initialize cwqs for each possible cpu even if we
997 * are going to call destroy_workqueue() finally. Otherwise
998 * cpu_up() can hit the uninitialized cwq once we drop the
999 * lock.
1000 */
1001 for_each_possible_cpu(cpu) {
1002 cwq = init_cpu_workqueue(wq, cpu);
1003 if (err || !cpu_online(cpu))
1004 continue;
1005 err = create_workqueue_thread(cwq, cpu);
1006 start_workqueue_thread(cwq, cpu);
1007 }
1008 cpu_maps_update_done();
1009 } 2804 }
1010 2805
1011 if (err) { 2806 if (flags & WQ_RESCUER) {
1012 destroy_workqueue(wq); 2807 struct worker *rescuer;
1013 wq = NULL; 2808
2809 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
2810 goto err;
2811
2812 wq->rescuer = rescuer = alloc_worker();
2813 if (!rescuer)
2814 goto err;
2815
2816 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
2817 if (IS_ERR(rescuer->task))
2818 goto err;
2819
2820 rescuer->task->flags |= PF_THREAD_BOUND;
2821 wake_up_process(rescuer->task);
1014 } 2822 }
1015 return wq;
1016}
1017EXPORT_SYMBOL_GPL(__create_workqueue_key);
1018 2823
1019static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1020{
1021 /* 2824 /*
1022 * Our caller is either destroy_workqueue() or CPU_POST_DEAD, 2825 * workqueue_lock protects global freeze state and workqueues
1023 * cpu_add_remove_lock protects cwq->thread. 2826 * list. Grab it, set max_active accordingly and add the new
2827 * workqueue to workqueues list.
1024 */ 2828 */
1025 if (cwq->thread == NULL) 2829 spin_lock(&workqueue_lock);
1026 return;
1027 2830
1028 lock_map_acquire(&cwq->wq->lockdep_map); 2831 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
1029 lock_map_release(&cwq->wq->lockdep_map); 2832 for_each_cwq_cpu(cpu, wq)
2833 get_cwq(cpu, wq)->max_active = 0;
1030 2834
1031 flush_cpu_workqueue(cwq); 2835 list_add(&wq->list, &workqueues);
1032 /* 2836
1033 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, 2837 spin_unlock(&workqueue_lock);
1034 * a concurrent flush_workqueue() can insert a barrier after us. 2838
1035 * However, in that case run_workqueue() won't return and check 2839 return wq;
1036 * kthread_should_stop() until it flushes all work_struct's. 2840err:
1037 * When ->worklist becomes empty it is safe to exit because no 2841 if (wq) {
1038 * more work_structs can be queued on this cwq: flush_workqueue 2842 free_cwqs(wq);
1039 * checks list_empty(), and a "normal" queue_work() can't use 2843 free_mayday_mask(wq->mayday_mask);
1040 * a dead CPU. 2844 kfree(wq->rescuer);
1041 */ 2845 kfree(wq);
1042 trace_workqueue_destruction(cwq->thread); 2846 }
1043 kthread_stop(cwq->thread); 2847 return NULL;
1044 cwq->thread = NULL;
1045} 2848}
2849EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
1046 2850
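__alloc_workqueue_key() is normally reached through the alloc_workqueue() wrapper, as the calls in init_workqueues() further down this file show. A hedged sketch of creating and tearing down a private queue (names and flags chosen purely for illustration):

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int example_init(void)
{
	/*
	 * Hypothetical queue: not bound to any particular CPU; a
	 * max_active of 0 lets the workqueue code pick WQ_DFL_ACTIVE.
	 */
	example_wq = alloc_workqueue("example", WQ_UNBOUND, 0);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}

static void example_exit(void)
{
	destroy_workqueue(example_wq);
}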
1047/** 2851/**
1048 * destroy_workqueue - safely terminate a workqueue 2852 * destroy_workqueue - safely terminate a workqueue
@@ -1052,71 +2856,520 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1052 */ 2856 */
1053void destroy_workqueue(struct workqueue_struct *wq) 2857void destroy_workqueue(struct workqueue_struct *wq)
1054{ 2858{
1055 const struct cpumask *cpu_map = wq_cpu_map(wq); 2859 unsigned int cpu;
1056 int cpu;
1057 2860
1058 cpu_maps_update_begin(); 2861 wq->flags |= WQ_DYING;
2862 flush_workqueue(wq);
2863
2864 /*
2865 * wq list is used to freeze wq, remove from list after
2866 * flushing is complete in case freeze races us.
2867 */
1059 spin_lock(&workqueue_lock); 2868 spin_lock(&workqueue_lock);
1060 list_del(&wq->list); 2869 list_del(&wq->list);
1061 spin_unlock(&workqueue_lock); 2870 spin_unlock(&workqueue_lock);
1062 2871
1063 for_each_cpu(cpu, cpu_map) 2872 /* sanity check */
1064 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 2873 for_each_cwq_cpu(cpu, wq) {
1065 cpu_maps_update_done(); 2874 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2875 int i;
2876
2877 for (i = 0; i < WORK_NR_COLORS; i++)
2878 BUG_ON(cwq->nr_in_flight[i]);
2879 BUG_ON(cwq->nr_active);
2880 BUG_ON(!list_empty(&cwq->delayed_works));
2881 }
2882
2883 if (wq->flags & WQ_RESCUER) {
2884 kthread_stop(wq->rescuer->task);
2885 free_mayday_mask(wq->mayday_mask);
2886 kfree(wq->rescuer);
2887 }
1066 2888
1067 free_percpu(wq->cpu_wq); 2889 free_cwqs(wq);
1068 kfree(wq); 2890 kfree(wq);
1069} 2891}
1070EXPORT_SYMBOL_GPL(destroy_workqueue); 2892EXPORT_SYMBOL_GPL(destroy_workqueue);
1071 2893
2894/**
2895 * workqueue_set_max_active - adjust max_active of a workqueue
2896 * @wq: target workqueue
2897 * @max_active: new max_active value.
2898 *
2899 * Set max_active of @wq to @max_active.
2900 *
2901 * CONTEXT:
2902 * Don't call from IRQ context.
2903 */
2904void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2905{
2906 unsigned int cpu;
2907
2908 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
2909
2910 spin_lock(&workqueue_lock);
2911
2912 wq->saved_max_active = max_active;
2913
2914 for_each_cwq_cpu(cpu, wq) {
2915 struct global_cwq *gcwq = get_gcwq(cpu);
2916
2917 spin_lock_irq(&gcwq->lock);
2918
2919 if (!(wq->flags & WQ_FREEZEABLE) ||
2920 !(gcwq->flags & GCWQ_FREEZING))
2921 get_cwq(gcwq->cpu, wq)->max_active = max_active;
2922
2923 spin_unlock_irq(&gcwq->lock);
2924 }
2925
2926 spin_unlock(&workqueue_lock);
2927}
2928EXPORT_SYMBOL_GPL(workqueue_set_max_active);
2929
2930/**
2931 * workqueue_congested - test whether a workqueue is congested
2932 * @cpu: CPU in question
2933 * @wq: target workqueue
2934 *
2935 * Test whether @wq's cpu workqueue for @cpu is congested. There is
2936 * no synchronization around this function and the test result is
2937 * unreliable and only useful as advisory hints or for debugging.
2938 *
2939 * RETURNS:
2940 * %true if congested, %false otherwise.
2941 */
2942bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
2943{
2944 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2945
2946 return !list_empty(&cwq->delayed_works);
2947}
2948EXPORT_SYMBOL_GPL(workqueue_congested);
2949
2950/**
2951 * work_cpu - return the last known associated cpu for @work
2952 * @work: the work of interest
2953 *
2954 * RETURNS:
2955 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
2956 */
2957unsigned int work_cpu(struct work_struct *work)
2958{
2959 struct global_cwq *gcwq = get_work_gcwq(work);
2960
2961 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
2962}
2963EXPORT_SYMBOL_GPL(work_cpu);
2964
2965/**
2966 * work_busy - test whether a work is currently pending or running
2967 * @work: the work to be tested
2968 *
2969 * Test whether @work is currently pending or running. There is no
2970 * synchronization around this function and the test result is
2971 * unreliable and only useful as advisory hints or for debugging.
2972 * Especially for reentrant wqs, the pending state might hide the
2973 * running state.
2974 *
2975 * RETURNS:
2976 * OR'd bitmask of WORK_BUSY_* bits.
2977 */
2978unsigned int work_busy(struct work_struct *work)
2979{
2980 struct global_cwq *gcwq = get_work_gcwq(work);
2981 unsigned long flags;
2982 unsigned int ret = 0;
2983
2984 if (!gcwq)
2985 return false;
2986
2987 spin_lock_irqsave(&gcwq->lock, flags);
2988
2989 if (work_pending(work))
2990 ret |= WORK_BUSY_PENDING;
2991 if (find_worker_executing_work(gcwq, work))
2992 ret |= WORK_BUSY_RUNNING;
2993
2994 spin_unlock_irqrestore(&gcwq->lock, flags);
2995
2996 return ret;
2997}
2998EXPORT_SYMBOL_GPL(work_busy);
2999
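workqueue_congested(), work_cpu() and work_busy() are all documented above as unsynchronized, advisory-only helpers, which makes them suitable for debugging or statistics paths. A small sketch, assuming a hypothetical work item and queue passed in by the caller:

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void example_report(struct workqueue_struct *wq,
			   struct work_struct *work)
{
	unsigned int cpu = work_cpu(work);
	unsigned int busy = work_busy(work);

	/* snapshot only; the state may change immediately afterwards */
	pr_debug("last cpu %u, pending %d, running %d, cpu0 congested %d\n",
		 cpu,
		 !!(busy & WORK_BUSY_PENDING),
		 !!(busy & WORK_BUSY_RUNNING),
		 workqueue_congested(0, wq));
}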
3000/*
3001 * CPU hotplug.
3002 *
3003 * There are two challenges in supporting CPU hotplug. Firstly, there
3004 * are a lot of assumptions on strong associations among work, cwq and
3005 * gcwq which make migrating pending and scheduled works very
3006 * difficult to implement without impacting hot paths. Secondly,
3007 * gcwqs serve a mix of short, long and very long running works, making
3008 * blocked draining impractical.
3009 *
3010 * This is solved by allowing a gcwq to be detached from CPU, running
3011 * it with unbound (rogue) workers and allowing it to be reattached
3012 * later if the cpu comes back online. A separate thread is created
3013 * to govern a gcwq in such state and is called the trustee of the
3014 * gcwq.
3015 *
3016 * Trustee states and their descriptions.
3017 *
3018 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3019 * new trustee is started with this state.
3020 *
3021 * IN_CHARGE Once started, trustee will enter this state after
3022 * assuming the manager role and making all existing
3023 * workers rogue. DOWN_PREPARE waits for trustee to
3024 * enter this state. After reaching IN_CHARGE, trustee
3025 * tries to execute the pending worklist until it's empty
3026 * and the state is set to BUTCHER, or the state is set
3027 * to RELEASE.
3028 *
3029 * BUTCHER Command state which is set by the cpu callback after
3030 * the cpu has gone down. Once this state is set, the trustee
3031 * knows that there will be no new works on the worklist
3032 * and once the worklist is empty it can proceed to
3033 * killing idle workers.
3034 *
3035 * RELEASE Command state which is set by the cpu callback if the
3036 * cpu down has been canceled or it has come online
3037 * again. After recognizing this state, trustee stops
3038 * trying to drain or butcher and clears ROGUE, rebinds
3039 * all remaining workers back to the cpu and releases
3040 * manager role.
3041 *
3042 * DONE Trustee will enter this state after BUTCHER or RELEASE
3043 * is complete.
3044 *
3045 *          trustee                 CPU                draining
3046 *          took over               down               complete
3047 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3048 *                        |                     |                  ^
3049 *                        | CPU is back online  v   return workers |
3050 *                         ----------------> RELEASE --------------
3051 */
3052
3053/**
3054 * trustee_wait_event_timeout - timed event wait for trustee
3055 * @cond: condition to wait for
3056 * @timeout: timeout in jiffies
3057 *
3058 * wait_event_timeout() for trustee to use. Handles locking and
3059 * checks for RELEASE request.
3060 *
3061 * CONTEXT:
3062 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3063 * multiple times. To be used by trustee.
3064 *
3065 * RETURNS:
3066 * Positive indicating left time if @cond is satisfied, 0 if timed
3067 * out, -1 if canceled.
3068 */
3069#define trustee_wait_event_timeout(cond, timeout) ({ \
3070 long __ret = (timeout); \
3071 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3072 __ret) { \
3073 spin_unlock_irq(&gcwq->lock); \
3074 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3075 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3076 __ret); \
3077 spin_lock_irq(&gcwq->lock); \
3078 } \
3079 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3080})
3081
3082/**
3083 * trustee_wait_event - event wait for trustee
3084 * @cond: condition to wait for
3085 *
3086 * wait_event() for trustee to use. Automatically handles locking and
3087 * checks for CANCEL request.
3088 *
3089 * CONTEXT:
3090 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3091 * multiple times. To be used by trustee.
3092 *
3093 * RETURNS:
3094 * 0 if @cond is satisfied, -1 if canceled.
3095 */
3096#define trustee_wait_event(cond) ({ \
3097 long __ret1; \
3098 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3099 __ret1 < 0 ? -1 : 0; \
3100})
3101
3102static int __cpuinit trustee_thread(void *__gcwq)
3103{
3104 struct global_cwq *gcwq = __gcwq;
3105 struct worker *worker;
3106 struct work_struct *work;
3107 struct hlist_node *pos;
3108 long rc;
3109 int i;
3110
3111 BUG_ON(gcwq->cpu != smp_processor_id());
3112
3113 spin_lock_irq(&gcwq->lock);
3114 /*
3115 * Claim the manager position and make all workers rogue.
3116 * Trustee must be bound to the target cpu and can't be
3117 * cancelled.
3118 */
3119 BUG_ON(gcwq->cpu != smp_processor_id());
3120 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3121 BUG_ON(rc < 0);
3122
3123 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3124
3125 list_for_each_entry(worker, &gcwq->idle_list, entry)
3126 worker->flags |= WORKER_ROGUE;
3127
3128 for_each_busy_worker(worker, i, pos, gcwq)
3129 worker->flags |= WORKER_ROGUE;
3130
3131 /*
3132 * Call schedule() so that we cross rq->lock and thus can
3133 * guarantee sched callbacks see the rogue flag. This is
3134 * necessary as scheduler callbacks may be invoked from other
3135 * cpus.
3136 */
3137 spin_unlock_irq(&gcwq->lock);
3138 schedule();
3139 spin_lock_irq(&gcwq->lock);
3140
3141 /*
3142 * Sched callbacks are disabled now. Zap nr_running. After
3143 * this, nr_running stays zero and need_more_worker() and
3144 * keep_working() are always true as long as the worklist is
3145 * not empty.
3146 */
3147 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3148
3149 spin_unlock_irq(&gcwq->lock);
3150 del_timer_sync(&gcwq->idle_timer);
3151 spin_lock_irq(&gcwq->lock);
3152
3153 /*
3154 * We're now in charge. Notify and proceed to drain. We need
3155 * to keep the gcwq running during the whole CPU down
3156 * procedure as other cpu hotunplug callbacks may need to
3157 * flush currently running tasks.
3158 */
3159 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3160 wake_up_all(&gcwq->trustee_wait);
3161
3162 /*
3163 * The original cpu is in the process of dying and may go away
3164 * anytime now. When that happens, we and all workers would
3165 * be migrated to other cpus. Try draining any left work. We
3166 * want to get it over with ASAP - spam rescuers, wake up as
3167 * many idlers as necessary and create new ones till the
3168 * worklist is empty. Note that if the gcwq is frozen, there
3169 * may be frozen works in freezeable cwqs. Don't declare
3170 * completion while frozen.
3171 */
3172 while (gcwq->nr_workers != gcwq->nr_idle ||
3173 gcwq->flags & GCWQ_FREEZING ||
3174 gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3175 int nr_works = 0;
3176
3177 list_for_each_entry(work, &gcwq->worklist, entry) {
3178 send_mayday(work);
3179 nr_works++;
3180 }
3181
3182 list_for_each_entry(worker, &gcwq->idle_list, entry) {
3183 if (!nr_works--)
3184 break;
3185 wake_up_process(worker->task);
3186 }
3187
3188 if (need_to_create_worker(gcwq)) {
3189 spin_unlock_irq(&gcwq->lock);
3190 worker = create_worker(gcwq, false);
3191 spin_lock_irq(&gcwq->lock);
3192 if (worker) {
3193 worker->flags |= WORKER_ROGUE;
3194 start_worker(worker);
3195 }
3196 }
3197
3198 /* give a breather */
3199 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3200 break;
3201 }
3202
3203 /*
3204 * Either all works have been scheduled and cpu is down, or
3205 * cpu down has already been canceled. Wait for and butcher
3206 * all workers till we're canceled.
3207 */
3208 do {
3209 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3210 while (!list_empty(&gcwq->idle_list))
3211 destroy_worker(list_first_entry(&gcwq->idle_list,
3212 struct worker, entry));
3213 } while (gcwq->nr_workers && rc >= 0);
3214
3215 /*
3216 * At this point, either draining has completed and no worker
3217 * is left, or cpu down has been canceled or the cpu is being
3218 * brought back up. There shouldn't be any idle one left.
3219 * Tell each remaining busy one to rebind once it finishes its
3220 * currently scheduled works by scheduling the rebind_work.
3221 */
3222 WARN_ON(!list_empty(&gcwq->idle_list));
3223
3224 for_each_busy_worker(worker, i, pos, gcwq) {
3225 struct work_struct *rebind_work = &worker->rebind_work;
3226
3227 /*
3228 * Rebind_work may race with future cpu hotplug
3229 * operations. Use a separate flag to mark that
3230 * rebinding is scheduled.
3231 */
3232 worker->flags |= WORKER_REBIND;
3233 worker->flags &= ~WORKER_ROGUE;
3234
3235 /* queue rebind_work, wq doesn't matter, use the default one */
3236 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3237 work_data_bits(rebind_work)))
3238 continue;
3239
3240 debug_work_activate(rebind_work);
3241 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3242 worker->scheduled.next,
3243 work_color_to_flags(WORK_NO_COLOR));
3244 }
3245
3246 /* relinquish manager role */
3247 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3248
3249 /* notify completion */
3250 gcwq->trustee = NULL;
3251 gcwq->trustee_state = TRUSTEE_DONE;
3252 wake_up_all(&gcwq->trustee_wait);
3253 spin_unlock_irq(&gcwq->lock);
3254 return 0;
3255}
3256
3257/**
3258 * wait_trustee_state - wait for trustee to enter the specified state
3259 * @gcwq: gcwq the trustee of interest belongs to
3260 * @state: target state to wait for
3261 *
3262 * Wait for the trustee to reach @state. DONE is already matched.
3263 *
3264 * CONTEXT:
3265 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3266 * multiple times. To be used by cpu_callback.
3267 */
3268static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3269__releases(&gcwq->lock)
3270__acquires(&gcwq->lock)
3271{
3272 if (!(gcwq->trustee_state == state ||
3273 gcwq->trustee_state == TRUSTEE_DONE)) {
3274 spin_unlock_irq(&gcwq->lock);
3275 __wait_event(gcwq->trustee_wait,
3276 gcwq->trustee_state == state ||
3277 gcwq->trustee_state == TRUSTEE_DONE);
3278 spin_lock_irq(&gcwq->lock);
3279 }
3280}
3281
1072static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 3282static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1073 unsigned long action, 3283 unsigned long action,
1074 void *hcpu) 3284 void *hcpu)
1075{ 3285{
1076 unsigned int cpu = (unsigned long)hcpu; 3286 unsigned int cpu = (unsigned long)hcpu;
1077 struct cpu_workqueue_struct *cwq; 3287 struct global_cwq *gcwq = get_gcwq(cpu);
1078 struct workqueue_struct *wq; 3288 struct task_struct *new_trustee = NULL;
1079 int ret = NOTIFY_OK; 3289 struct worker *uninitialized_var(new_worker);
3290 unsigned long flags;
1080 3291
1081 action &= ~CPU_TASKS_FROZEN; 3292 action &= ~CPU_TASKS_FROZEN;
1082 3293
1083 switch (action) { 3294 switch (action) {
3295 case CPU_DOWN_PREPARE:
3296 new_trustee = kthread_create(trustee_thread, gcwq,
3297 "workqueue_trustee/%d\n", cpu);
3298 if (IS_ERR(new_trustee))
3299 return notifier_from_errno(PTR_ERR(new_trustee));
3300 kthread_bind(new_trustee, cpu);
3301 /* fall through */
1084 case CPU_UP_PREPARE: 3302 case CPU_UP_PREPARE:
1085 cpumask_set_cpu(cpu, cpu_populated_map); 3303 BUG_ON(gcwq->first_idle);
1086 } 3304 new_worker = create_worker(gcwq, false);
1087undo: 3305 if (!new_worker) {
1088 list_for_each_entry(wq, &workqueues, list) { 3306 if (new_trustee)
1089 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 3307 kthread_stop(new_trustee);
1090 3308 return NOTIFY_BAD;
1091 switch (action) {
1092 case CPU_UP_PREPARE:
1093 if (!create_workqueue_thread(cwq, cpu))
1094 break;
1095 printk(KERN_ERR "workqueue [%s] for %i failed\n",
1096 wq->name, cpu);
1097 action = CPU_UP_CANCELED;
1098 ret = NOTIFY_BAD;
1099 goto undo;
1100
1101 case CPU_ONLINE:
1102 start_workqueue_thread(cwq, cpu);
1103 break;
1104
1105 case CPU_UP_CANCELED:
1106 start_workqueue_thread(cwq, -1);
1107 case CPU_POST_DEAD:
1108 cleanup_workqueue_thread(cwq);
1109 break;
1110 } 3309 }
1111 } 3310 }
1112 3311
3312 /* some are called w/ irq disabled, don't disturb irq status */
3313 spin_lock_irqsave(&gcwq->lock, flags);
3314
1113 switch (action) { 3315 switch (action) {
1114 case CPU_UP_CANCELED: 3316 case CPU_DOWN_PREPARE:
3317 /* initialize trustee and tell it to acquire the gcwq */
3318 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3319 gcwq->trustee = new_trustee;
3320 gcwq->trustee_state = TRUSTEE_START;
3321 wake_up_process(gcwq->trustee);
3322 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3323 /* fall through */
3324 case CPU_UP_PREPARE:
3325 BUG_ON(gcwq->first_idle);
3326 gcwq->first_idle = new_worker;
3327 break;
3328
3329 case CPU_DYING:
3330 /*
3331 * Before this, the trustee and all workers except for
3332 * the ones which are still executing works from
3333 * before the last CPU down must be on the cpu. After
3334 * this, they'll all be diasporas.
3335 */
3336 gcwq->flags |= GCWQ_DISASSOCIATED;
3337 break;
3338
1115 case CPU_POST_DEAD: 3339 case CPU_POST_DEAD:
1116 cpumask_clear_cpu(cpu, cpu_populated_map); 3340 gcwq->trustee_state = TRUSTEE_BUTCHER;
3341 /* fall through */
3342 case CPU_UP_CANCELED:
3343 destroy_worker(gcwq->first_idle);
3344 gcwq->first_idle = NULL;
3345 break;
3346
3347 case CPU_DOWN_FAILED:
3348 case CPU_ONLINE:
3349 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3350 if (gcwq->trustee_state != TRUSTEE_DONE) {
3351 gcwq->trustee_state = TRUSTEE_RELEASE;
3352 wake_up_process(gcwq->trustee);
3353 wait_trustee_state(gcwq, TRUSTEE_DONE);
3354 }
3355
3356 /*
3357 * Trustee is done and there might be no worker left.
3358 * Put the first_idle in and request a real manager to
3359 * take a look.
3360 */
3361 spin_unlock_irq(&gcwq->lock);
3362 kthread_bind(gcwq->first_idle->task, cpu);
3363 spin_lock_irq(&gcwq->lock);
3364 gcwq->flags |= GCWQ_MANAGE_WORKERS;
3365 start_worker(gcwq->first_idle);
3366 gcwq->first_idle = NULL;
3367 break;
1117 } 3368 }
1118 3369
1119 return ret; 3370 spin_unlock_irqrestore(&gcwq->lock, flags);
3371
3372 return notifier_from_errno(0);
1120} 3373}
1121 3374
1122#ifdef CONFIG_SMP 3375#ifdef CONFIG_SMP
@@ -1166,14 +3419,200 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
1166EXPORT_SYMBOL_GPL(work_on_cpu); 3419EXPORT_SYMBOL_GPL(work_on_cpu);
1167#endif /* CONFIG_SMP */ 3420#endif /* CONFIG_SMP */
1168 3421
1169void __init init_workqueues(void) 3422#ifdef CONFIG_FREEZER
3423
3424/**
3425 * freeze_workqueues_begin - begin freezing workqueues
3426 *
3427 * Start freezing workqueues. After this function returns, all
3428 * freezeable workqueues will queue new works to their frozen_works
3429 * list instead of gcwq->worklist.
3430 *
3431 * CONTEXT:
3432 * Grabs and releases workqueue_lock and gcwq->lock's.
3433 */
3434void freeze_workqueues_begin(void)
3435{
3436 unsigned int cpu;
3437
3438 spin_lock(&workqueue_lock);
3439
3440 BUG_ON(workqueue_freezing);
3441 workqueue_freezing = true;
3442
3443 for_each_gcwq_cpu(cpu) {
3444 struct global_cwq *gcwq = get_gcwq(cpu);
3445 struct workqueue_struct *wq;
3446
3447 spin_lock_irq(&gcwq->lock);
3448
3449 BUG_ON(gcwq->flags & GCWQ_FREEZING);
3450 gcwq->flags |= GCWQ_FREEZING;
3451
3452 list_for_each_entry(wq, &workqueues, list) {
3453 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3454
3455 if (cwq && wq->flags & WQ_FREEZEABLE)
3456 cwq->max_active = 0;
3457 }
3458
3459 spin_unlock_irq(&gcwq->lock);
3460 }
3461
3462 spin_unlock(&workqueue_lock);
3463}
3464
3465/**
3466 * freeze_workqueues_busy - are freezeable workqueues still busy?
3467 *
3468 * Check whether freezing is complete. This function must be called
3469 * between freeze_workqueues_begin() and thaw_workqueues().
3470 *
3471 * CONTEXT:
3472 * Grabs and releases workqueue_lock.
3473 *
3474 * RETURNS:
3475 * %true if some freezeable workqueues are still busy. %false if
3476 * freezing is complete.
3477 */
3478bool freeze_workqueues_busy(void)
3479{
3480 unsigned int cpu;
3481 bool busy = false;
3482
3483 spin_lock(&workqueue_lock);
3484
3485 BUG_ON(!workqueue_freezing);
3486
3487 for_each_gcwq_cpu(cpu) {
3488 struct workqueue_struct *wq;
3489 /*
3490 * nr_active is monotonically decreasing. It's safe
3491 * to peek without lock.
3492 */
3493 list_for_each_entry(wq, &workqueues, list) {
3494 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3495
3496 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3497 continue;
3498
3499 BUG_ON(cwq->nr_active < 0);
3500 if (cwq->nr_active) {
3501 busy = true;
3502 goto out_unlock;
3503 }
3504 }
3505 }
3506out_unlock:
3507 spin_unlock(&workqueue_lock);
3508 return busy;
3509}
3510
3511/**
3512 * thaw_workqueues - thaw workqueues
3513 *
3514 * Thaw workqueues. Normal queueing is restored and all collected
3515 * frozen works are transferred to their respective gcwq worklists.
3516 *
3517 * CONTEXT:
3518 * Grabs and releases workqueue_lock and gcwq->lock's.
3519 */
3520void thaw_workqueues(void)
3521{
3522 unsigned int cpu;
3523
3524 spin_lock(&workqueue_lock);
3525
3526 if (!workqueue_freezing)
3527 goto out_unlock;
3528
3529 for_each_gcwq_cpu(cpu) {
3530 struct global_cwq *gcwq = get_gcwq(cpu);
3531 struct workqueue_struct *wq;
3532
3533 spin_lock_irq(&gcwq->lock);
3534
3535 BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3536 gcwq->flags &= ~GCWQ_FREEZING;
3537
3538 list_for_each_entry(wq, &workqueues, list) {
3539 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3540
3541 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3542 continue;
3543
3544 /* restore max_active and repopulate worklist */
3545 cwq->max_active = wq->saved_max_active;
3546
3547 while (!list_empty(&cwq->delayed_works) &&
3548 cwq->nr_active < cwq->max_active)
3549 cwq_activate_first_delayed(cwq);
3550 }
3551
3552 wake_up_worker(gcwq);
3553
3554 spin_unlock_irq(&gcwq->lock);
3555 }
3556
3557 workqueue_freezing = false;
3558out_unlock:
3559 spin_unlock(&workqueue_lock);
3560}
3561#endif /* CONFIG_FREEZER */
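The three freezer hooks form a small protocol: begin freezing, poll until no freezeable workqueue is still active, and thaw on resume. The sketch below shows one plausible calling order under CONFIG_FREEZER; it is not the actual kernel/power code, and the retry loop is purely illustrative:

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static int example_freeze_wqs(void)
{
	int tries = 10;

	freeze_workqueues_begin();

	/* new work now lands on the delayed lists; wait for in-flight items */
	while (freeze_workqueues_busy()) {
		if (!--tries) {
			thaw_workqueues();	/* give up and undo */
			return -EBUSY;
		}
		msleep(10);
	}
	return 0;	/* caller runs thaw_workqueues() on resume */
}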
3562
3563static int __init init_workqueues(void)
1170{ 3564{
1171 alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); 3565 unsigned int cpu;
3566 int i;
3567
3568 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
3569
3570 /* initialize gcwqs */
3571 for_each_gcwq_cpu(cpu) {
3572 struct global_cwq *gcwq = get_gcwq(cpu);
3573
3574 spin_lock_init(&gcwq->lock);
3575 INIT_LIST_HEAD(&gcwq->worklist);
3576 gcwq->cpu = cpu;
3577 gcwq->flags |= GCWQ_DISASSOCIATED;
3578
3579 INIT_LIST_HEAD(&gcwq->idle_list);
3580 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3581 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3582
3583 init_timer_deferrable(&gcwq->idle_timer);
3584 gcwq->idle_timer.function = idle_worker_timeout;
3585 gcwq->idle_timer.data = (unsigned long)gcwq;
1172 3586
1173 cpumask_copy(cpu_populated_map, cpu_online_mask); 3587 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
1174 singlethread_cpu = cpumask_first(cpu_possible_mask); 3588 (unsigned long)gcwq);
1175 cpu_singlethread_map = cpumask_of(singlethread_cpu); 3589
1176 hotcpu_notifier(workqueue_cpu_callback, 0); 3590 ida_init(&gcwq->worker_ida);
1177 keventd_wq = create_workqueue("events"); 3591
1178 BUG_ON(!keventd_wq); 3592 gcwq->trustee_state = TRUSTEE_DONE;
3593 init_waitqueue_head(&gcwq->trustee_wait);
3594 }
3595
3596 /* create the initial worker */
3597 for_each_online_gcwq_cpu(cpu) {
3598 struct global_cwq *gcwq = get_gcwq(cpu);
3599 struct worker *worker;
3600
3601 if (cpu != WORK_CPU_UNBOUND)
3602 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3603 worker = create_worker(gcwq, true);
3604 BUG_ON(!worker);
3605 spin_lock_irq(&gcwq->lock);
3606 start_worker(worker);
3607 spin_unlock_irq(&gcwq->lock);
3608 }
3609
3610 system_wq = alloc_workqueue("events", 0, 0);
3611 system_long_wq = alloc_workqueue("events_long", 0, 0);
3612 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3613 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3614 WQ_UNBOUND_MAX_ACTIVE);
3615 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
3616 return 0;
1179} 3617}
3618early_initcall(init_workqueues);
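With the system workqueues created above, existing users can keep calling schedule_work()/schedule_delayed_work(), which, as the earlier hunks in this file show, now queue onto system_wq; work that may sleep for long stretches can target system_long_wq explicitly. A minimal sketch with a hypothetical work item:

#include <linux/workqueue.h>

static void example_fn(struct work_struct *work)
{
	/* body that may sleep for a long time */
}

static DECLARE_WORK(example_work, example_fn);

static void example_kick(void)
{
	/*
	 * schedule_work(&example_work) would use system_wq; long-running
	 * items can be pointed at system_long_wq instead.
	 */
	queue_work(system_long_wq, &example_work);
}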
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..2d10fc98dc79
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,9 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);