Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 15
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/async.c | 141
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit.h | 26
-rw-r--r--  kernel/audit_tree.c | 237
-rw-r--r--  kernel/audit_watch.c | 274
-rw-r--r--  kernel/auditfilter.c | 39
-rw-r--r--  kernel/auditsc.c | 19
-rw-r--r--  kernel/cgroup.c | 177
-rw-r--r--  kernel/cgroup_freezer.c | 72
-rw-r--r--  kernel/compat.c | 38
-rw-r--r--  kernel/configs.c | 1
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpuset.c | 27
-rw-r--r--  kernel/cred.c | 8
-rw-r--r--  kernel/debug/debug_core.c | 145
-rw-r--r--  kernel/debug/debug_core.h | 1
-rw-r--r--  kernel/debug/gdbstub.c | 191
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 156
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 43
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/early_res.c | 590
-rw-r--r--  kernel/exec_domain.c | 22
-rw-r--r--  kernel/exit.c | 21
-rw-r--r--  kernel/fork.c | 39
-rw-r--r--  kernel/futex.c | 68
-rw-r--r--  kernel/futex_compat.c | 2
-rw-r--r--  kernel/gcov/fs.c | 245
-rw-r--r--  kernel/groups.c | 5
-rw-r--r--  kernel/hrtimer.c | 33
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/hw_breakpoint.c | 160
-rw-r--r--  kernel/irq/Kconfig | 53
-rw-r--r--  kernel/irq/Makefile | 3
-rw-r--r--  kernel/irq/autoprobe.c | 15
-rw-r--r--  kernel/irq/chip.c | 378
-rw-r--r--  kernel/irq/dummychip.c | 68
-rw-r--r--  kernel/irq/handle.c | 341
-rw-r--r--  kernel/irq/internals.h | 39
-rw-r--r--  kernel/irq/irqdesc.c | 410
-rw-r--r--  kernel/irq/manage.c | 89
-rw-r--r--  kernel/irq/migration.c | 12
-rw-r--r--  kernel/irq/numa_migrate.c | 120
-rw-r--r--  kernel/irq/proc.c | 26
-rw-r--r--  kernel/irq/resend.c | 5
-rw-r--r--  kernel/irq/spurious.c | 8
-rw-r--r--  kernel/irq_work.c | 164
-rw-r--r--  kernel/jump_label.c | 429
-rw-r--r--  kernel/kexec.c | 10
-rw-r--r--  kernel/kfifo.c | 751
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 34
-rw-r--r--  kernel/kthread.c | 164
-rw-r--r--  kernel/lockdep.c | 53
-rw-r--r--  kernel/module.c | 1098
-rw-r--r--  kernel/mutex.c | 23
-rw-r--r--  kernel/ns_cgroup.c | 8
-rw-r--r--  kernel/padata.c | 755
-rw-r--r--  kernel/panic.c | 60
-rw-r--r--  kernel/params.c | 233
-rw-r--r--  kernel/perf_event.c | 3016
-rw-r--r--  kernel/pid.c | 59
-rw-r--r--  kernel/pm_qos_params.c | 232
-rw-r--r--  kernel/posix-cpu-timers.c | 44
-rw-r--r--  kernel/posix-timers.c | 11
-rw-r--r--  kernel/power/Kconfig | 17
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 51
-rw-r--r--  kernel/power/main.c | 76
-rw-r--r--  kernel/power/power.h | 10
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 32
-rw-r--r--  kernel/power/snapshot.c | 118
-rw-r--r--  kernel/power/suspend.c | 13
-rw-r--r--  kernel/power/swap.c | 312
-rw-r--r--  kernel/printk.c | 52
-rw-r--r--  kernel/profile.c | 1
-rw-r--r--  kernel/ptrace.c | 48
-rw-r--r--  kernel/range.c | 4
-rw-r--r--  kernel/rcupdate.c | 168
-rw-r--r--  kernel/rcutiny.c | 35
-rw-r--r--  kernel/rcutiny_plugin.h | 582
-rw-r--r--  kernel/rcutorture.c | 20
-rw-r--r--  kernel/rcutree.c | 94
-rw-r--r--  kernel/rcutree.h | 20
-rw-r--r--  kernel/rcutree_plugin.h | 47
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/resource.c | 153
-rw-r--r--  kernel/rtmutex-tester.c | 6
-rw-r--r--  kernel/sched.c | 730
-rw-r--r--  kernel/sched_clock.c | 95
-rw-r--r--  kernel/sched_cpupri.c | 8
-rw-r--r--  kernel/sched_cpupri.h | 2
-rw-r--r--  kernel/sched_debug.c | 2
-rw-r--r--  kernel/sched_fair.c | 626
-rw-r--r--  kernel/sched_features.h | 5
-rw-r--r--  kernel/sched_rt.c | 43
-rw-r--r--  kernel/sched_stats.h | 27
-rw-r--r--  kernel/sched_stoptask.c | 108
-rw-r--r--  kernel/signal.c | 22
-rw-r--r--  kernel/slow-work-debugfs.c | 227
-rw-r--r--  kernel/slow-work.c | 1068
-rw-r--r--  kernel/slow-work.h | 72
-rw-r--r--  kernel/smp.c | 25
-rw-r--r--  kernel/softirq.c | 91
-rw-r--r--  kernel/softlockup.c | 293
-rw-r--r--  kernel/srcu.c | 2
-rw-r--r--  kernel/stop_machine.c | 16
-rw-r--r--  kernel/sys.c | 204
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 97
-rw-r--r--  kernel/sysctl_check.c | 9
-rw-r--r--  kernel/taskstats.c | 172
-rw-r--r--  kernel/test_kprobes.c | 12
-rw-r--r--  kernel/time.c | 16
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/clocksource.c | 33
-rw-r--r--  kernel/time/ntp.c | 14
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 15
-rw-r--r--  kernel/time/timekeeping.c | 93
-rw-r--r--  kernel/timer.c | 61
-rw-r--r--  kernel/trace/Kconfig | 90
-rw-r--r--  kernel/trace/Makefile | 7
-rw-r--r--  kernel/trace/blktrace.c | 100
-rw-r--r--  kernel/trace/ftrace.c | 149
-rw-r--r--  kernel/trace/kmemtrace.c | 529
-rw-r--r--  kernel/trace/ring_buffer.c | 402
-rw-r--r--  kernel/trace/trace.c | 199
-rw-r--r--  kernel/trace/trace.h | 111
-rw-r--r--  kernel/trace/trace_boot.c | 185
-rw-r--r--  kernel/trace/trace_clock.c | 7
-rw-r--r--  kernel/trace/trace_entries.h | 94
-rw-r--r--  kernel/trace/trace_event_perf.c | 58
-rw-r--r--  kernel/trace/trace_events.c | 431
-rw-r--r--  kernel/trace/trace_events_filter.c | 27
-rw-r--r--  kernel/trace/trace_export.c | 8
-rw-r--r--  kernel/trace/trace_functions.c | 6
-rw-r--r--  kernel/trace/trace_functions_graph.c | 218
-rw-r--r--  kernel/trace/trace_irqsoff.c | 155
-rw-r--r--  kernel/trace/trace_kdb.c | 135
-rw-r--r--  kernel/trace/trace_kprobe.c | 427
-rw-r--r--  kernel/trace/trace_ksym.c | 508
-rw-r--r--  kernel/trace/trace_output.c | 69
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 263
-rw-r--r--  kernel/trace/trace_selftest.c | 87
-rw-r--r--  kernel/trace/trace_stack.c | 9
-rw-r--r--  kernel/trace/trace_syscalls.c | 7
-rw-r--r--  kernel/trace/trace_sysprof.c | 329
-rw-r--r--  kernel/trace/trace_workqueue.c | 10
-rw-r--r--  kernel/tracepoint.c | 14
-rw-r--r--  kernel/tsacct.c | 10
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/user_namespace.c | 44
-rw-r--r--  kernel/wait.c | 6
-rw-r--r--  kernel/watchdog.c | 566
-rw-r--r--  kernel/workqueue.c | 3478
-rw-r--r--  kernel/workqueue_sched.h | 9
162 files changed, 15552 insertions, 10806 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 057472fbc272..0b5ff083fa22 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o jump_label.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
15obj-y += groups.o 14obj-y += groups.o
16 15
17ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -23,6 +22,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
23CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
24CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
25CFLAGS_REMOVE_perf_event.o = -pg 24CFLAGS_REMOVE_perf_event.o = -pg
25CFLAGS_REMOVE_irq_work.o = -pg
26endif 26endif
27 27
28obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
@@ -70,14 +70,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_SMP) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
75obj-$(CONFIG_GCOV_KERNEL) += gcov/ 75obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
77obj-$(CONFIG_GCOV_KERNEL) += gcov/
77obj-$(CONFIG_KPROBES) += kprobes.o 78obj-$(CONFIG_KPROBES) += kprobes.o
78obj-$(CONFIG_KGDB) += debug/ 79obj-$(CONFIG_KGDB) += debug/
79obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
81obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
81obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 82obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
82obj-$(CONFIG_SECCOMP) += seccomp.o 83obj-$(CONFIG_SECCOMP) += seccomp.o
83obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -85,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
85obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
86obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
87obj-$(CONFIG_TINY_RCU) += rcutiny.o 88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
88obj-$(CONFIG_RELAY) += relay.o 90obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -99,8 +101,7 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_SMP) += sched_cpupri.o 103obj-$(CONFIG_SMP) += sched_cpupri.o
102obj-$(CONFIG_SLOW_WORK) += slow-work.o 104obj-$(CONFIG_IRQ_WORK) += irq_work.o
103obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
104obj-$(CONFIG_PERF_EVENTS) += perf_event.o 105obj-$(CONFIG_PERF_EVENTS) += perf_event.o
105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 106obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 385b88461c29..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
122 spin_unlock(&acct_lock); 122 spin_unlock(&acct_lock);
123 123
124 /* May block */ 124 /* May block */
125 if (vfs_statfs(file->f_path.dentry, &sbuf)) 125 if (vfs_statfs(&file->f_path, &sbuf))
126 return res; 126 return res;
127 suspend = sbuf.f_blocks * SUSPEND; 127 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 128 resume = sbuf.f_blocks * RESUME;
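The acct.c hunk above is a caller-side update for the VFS change in this series: vfs_statfs() now takes a struct path * rather than a dentry. A minimal sketch of the new calling convention, assuming an open struct file is already in hand; the helper name and the use of f_bavail are illustrative, not part of the patch:

#include <linux/fs.h>
#include <linux/statfs.h>

/* Sketch: query filesystem free space via the path-based statfs API. */
static int example_free_blocks(struct file *file, u64 *blocks)
{
	struct kstatfs sbuf;
	int err;

	/* may block; note &file->f_path instead of file->f_path.dentry */
	err = vfs_statfs(&file->f_path, &sbuf);
	if (err)
		return err;

	*blocks = sbuf.f_bavail;
	return 0;
}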
diff --git a/kernel/async.c b/kernel/async.c
index 15319d6c18fe..cd9dbb913c77 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/bug.h>
53#include <linux/module.h> 52#include <linux/module.h>
54#include <linux/wait.h> 53#include <linux/wait.h>
55#include <linux/sched.h> 54#include <linux/sched.h>
56#include <linux/init.h>
57#include <linux/kthread.h>
58#include <linux/delay.h>
59#include <linux/slab.h> 55#include <linux/slab.h>
56#include <linux/workqueue.h>
60#include <asm/atomic.h> 57#include <asm/atomic.h>
61 58
62static async_cookie_t next_cookie = 1; 59static async_cookie_t next_cookie = 1;
63 60
64#define MAX_THREADS 256
65#define MAX_WORK 32768 61#define MAX_WORK 32768
66 62
67static LIST_HEAD(async_pending); 63static LIST_HEAD(async_pending);
68static LIST_HEAD(async_running); 64static LIST_HEAD(async_running);
69static DEFINE_SPINLOCK(async_lock); 65static DEFINE_SPINLOCK(async_lock);
70 66
71static int async_enabled = 0;
72
73struct async_entry { 67struct async_entry {
74 struct list_head list; 68 struct list_head list;
75 async_cookie_t cookie; 69 struct work_struct work;
76 async_func_ptr *func; 70 async_cookie_t cookie;
77 void *data; 71 async_func_ptr *func;
78 struct list_head *running; 72 void *data;
73 struct list_head *running;
79}; 74};
80 75
81static DECLARE_WAIT_QUEUE_HEAD(async_done); 76static DECLARE_WAIT_QUEUE_HEAD(async_done);
82static DECLARE_WAIT_QUEUE_HEAD(async_new);
83 77
84static atomic_t entry_count; 78static atomic_t entry_count;
85static atomic_t thread_count;
86 79
87extern int initcall_debug; 80extern int initcall_debug;
88 81
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running)
117 spin_unlock_irqrestore(&async_lock, flags); 110 spin_unlock_irqrestore(&async_lock, flags);
118 return ret; 111 return ret;
119} 112}
113
120/* 114/*
121 * pick the first pending entry and run it 115 * pick the first pending entry and run it
122 */ 116 */
123static void run_one_entry(void) 117static void async_run_entry_fn(struct work_struct *work)
124{ 118{
119 struct async_entry *entry =
120 container_of(work, struct async_entry, work);
125 unsigned long flags; 121 unsigned long flags;
126 struct async_entry *entry;
127 ktime_t calltime, delta, rettime; 122 ktime_t calltime, delta, rettime;
128 123
129 /* 1) pick one task from the pending queue */ 124 /* 1) move self to the running queue */
130
131 spin_lock_irqsave(&async_lock, flags); 125 spin_lock_irqsave(&async_lock, flags);
132 if (list_empty(&async_pending))
133 goto out;
134 entry = list_first_entry(&async_pending, struct async_entry, list);
135
136 /* 2) move it to the running queue */
137 list_move_tail(&entry->list, entry->running); 126 list_move_tail(&entry->list, entry->running);
138 spin_unlock_irqrestore(&async_lock, flags); 127 spin_unlock_irqrestore(&async_lock, flags);
139 128
140 /* 3) run it (and print duration)*/ 129 /* 2) run (and print duration) */
141 if (initcall_debug && system_state == SYSTEM_BOOTING) { 130 if (initcall_debug && system_state == SYSTEM_BOOTING) {
142 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
143 entry->func, task_pid_nr(current)); 132 entry->func, task_pid_nr(current));
@@ -153,31 +142,25 @@ static void run_one_entry(void)
153 (long long)ktime_to_ns(delta) >> 10); 142 (long long)ktime_to_ns(delta) >> 10);
154 } 143 }
155 144
156 /* 4) remove it from the running queue */ 145 /* 3) remove self from the running queue */
157 spin_lock_irqsave(&async_lock, flags); 146 spin_lock_irqsave(&async_lock, flags);
158 list_del(&entry->list); 147 list_del(&entry->list);
159 148
160 /* 5) free the entry */ 149 /* 4) free the entry */
161 kfree(entry); 150 kfree(entry);
162 atomic_dec(&entry_count); 151 atomic_dec(&entry_count);
163 152
164 spin_unlock_irqrestore(&async_lock, flags); 153 spin_unlock_irqrestore(&async_lock, flags);
165 154
166 /* 6) wake up any waiters. */ 155 /* 5) wake up any waiters */
167 wake_up(&async_done); 156 wake_up(&async_done);
168 return;
169
170out:
171 spin_unlock_irqrestore(&async_lock, flags);
172} 157}
173 158
174
175static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) 159static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
176{ 160{
177 struct async_entry *entry; 161 struct async_entry *entry;
178 unsigned long flags; 162 unsigned long flags;
179 async_cookie_t newcookie; 163 async_cookie_t newcookie;
180
181 164
182 /* allow irq-off callers */ 165 /* allow irq-off callers */
183 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); 166 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
186 * If we're out of memory or if there's too much work 169 * If we're out of memory or if there's too much work
187 * pending already, we execute synchronously. 170 * pending already, we execute synchronously.
188 */ 171 */
189 if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { 172 if (!entry || atomic_read(&entry_count) > MAX_WORK) {
190 kfree(entry); 173 kfree(entry);
191 spin_lock_irqsave(&async_lock, flags); 174 spin_lock_irqsave(&async_lock, flags);
192 newcookie = next_cookie++; 175 newcookie = next_cookie++;
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
196 ptr(data, newcookie); 179 ptr(data, newcookie);
197 return newcookie; 180 return newcookie;
198 } 181 }
182 INIT_WORK(&entry->work, async_run_entry_fn);
199 entry->func = ptr; 183 entry->func = ptr;
200 entry->data = data; 184 entry->data = data;
201 entry->running = running; 185 entry->running = running;
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
205 list_add_tail(&entry->list, &async_pending); 189 list_add_tail(&entry->list, &async_pending);
206 atomic_inc(&entry_count); 190 atomic_inc(&entry_count);
207 spin_unlock_irqrestore(&async_lock, flags); 191 spin_unlock_irqrestore(&async_lock, flags);
208 wake_up(&async_new); 192
193 /* schedule for execution */
194 queue_work(system_unbound_wq, &entry->work);
195
209 return newcookie; 196 return newcookie;
210} 197}
211 198
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie)
312 async_synchronize_cookie_domain(cookie, &async_running); 299 async_synchronize_cookie_domain(cookie, &async_running);
313} 300}
314EXPORT_SYMBOL_GPL(async_synchronize_cookie); 301EXPORT_SYMBOL_GPL(async_synchronize_cookie);
315
316
317static int async_thread(void *unused)
318{
319 DECLARE_WAITQUEUE(wq, current);
320 add_wait_queue(&async_new, &wq);
321
322 while (!kthread_should_stop()) {
323 int ret = HZ;
324 set_current_state(TASK_INTERRUPTIBLE);
325 /*
326 * check the list head without lock.. false positives
327 * are dealt with inside run_one_entry() while holding
328 * the lock.
329 */
330 rmb();
331 if (!list_empty(&async_pending))
332 run_one_entry();
333 else
334 ret = schedule_timeout(HZ);
335
336 if (ret == 0) {
337 /*
338 * we timed out, this means we as thread are redundant.
339 * we sign off and die, but we to avoid any races there
340 * is a last-straw check to see if work snuck in.
341 */
342 atomic_dec(&thread_count);
343 wmb(); /* manager must see our departure first */
344 if (list_empty(&async_pending))
345 break;
346 /*
347 * woops work came in between us timing out and us
348 * signing off; we need to stay alive and keep working.
349 */
350 atomic_inc(&thread_count);
351 }
352 }
353 remove_wait_queue(&async_new, &wq);
354
355 return 0;
356}
357
358static int async_manager_thread(void *unused)
359{
360 DECLARE_WAITQUEUE(wq, current);
361 add_wait_queue(&async_new, &wq);
362
363 while (!kthread_should_stop()) {
364 int tc, ec;
365
366 set_current_state(TASK_INTERRUPTIBLE);
367
368 tc = atomic_read(&thread_count);
369 rmb();
370 ec = atomic_read(&entry_count);
371
372 while (tc < ec && tc < MAX_THREADS) {
373 if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
374 tc))) {
375 msleep(100);
376 continue;
377 }
378 atomic_inc(&thread_count);
379 tc++;
380 }
381
382 schedule();
383 }
384 remove_wait_queue(&async_new, &wq);
385
386 return 0;
387}
388
389static int __init async_init(void)
390{
391 async_enabled =
392 !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
393
394 WARN_ON(!async_enabled);
395 return 0;
396}
397
398core_initcall(async_init);
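The async.c diff above deletes the hand-rolled async/mgr kthread pool and instead runs every async entry as a work item on the shared unbound workqueue. A condensed sketch of that pattern, built only from calls that appear in the hunks; the example_* names and the trimmed entry layout are illustrative:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct example_entry {
	struct work_struct work;	/* embedded work item */
	void *data;
};

static void example_run_fn(struct work_struct *work)
{
	struct example_entry *entry = container_of(work, struct example_entry, work);

	/* ... run the deferred callback with entry->data ... */
	kfree(entry);
}

static void example_schedule(void *data)
{
	struct example_entry *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);

	if (!entry)
		return;		/* the real code falls back to running synchronously */

	entry->data = data;
	INIT_WORK(&entry->work, example_run_fn);
	queue_work(system_unbound_wq, &entry->work);	/* unbound: not pinned to a CPU */
}

Because system_unbound_wq provides concurrency on demand, the MAX_THREADS limit, the async_enabled flag, and the manager/worker kthreads in the old code become unnecessary, which is exactly what the hunk removes.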
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#include <linux/netlink.h> 58#include <linux/netlink.h>
59#include <linux/inotify.h>
60#include <linux/freezer.h> 59#include <linux/freezer.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
62 61
@@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
407 audit_hold_skb(skb); 406 audit_hold_skb(skb);
408 } else 407 } else
409 /* drop the extra reference if sent ok */ 408 /* drop the extra reference if sent ok */
410 kfree_skb(skb); 409 consume_skb(skb);
411} 410}
412 411
413static int kauditd_thread(void *dummy) 412static int kauditd_thread(void *dummy)
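The audit.c change above swaps kfree_skb() for consume_skb() on the successful-send path. Both release the buffer; the difference is that consume_skb() records a normal end of life rather than a drop, so drop-monitoring tooling is not misled. A small sketch of the idiom; the function name is illustrative:

#include <linux/skbuff.h>

static void example_send_done(struct sk_buff *skb, int err)
{
	if (err)
		kfree_skb(skb);		/* counted as a drop */
	else
		consume_skb(skb);	/* delivered; not a drop */
}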
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
103extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
104extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
105 105
106extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
107
106/* audit watch functions */ 108/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch); 109#ifdef CONFIG_AUDIT_WATCH
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch); 110extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch); 111extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); 112extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule); 113extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
113extern void audit_remove_watch(struct audit_watch *watch); 114extern void audit_remove_watch_rule(struct audit_krule *krule);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch); 115extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch); 116extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
118 117#else
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, 118#define audit_put_watch(w) {}
120 struct audit_watch *watch); 119#define audit_get_watch(w) {}
120#define audit_to_watch(k, p, l, o) (-EINVAL)
121#define audit_add_watch(k, l) (-EINVAL)
122#define audit_remove_watch_rule(k) BUG()
123#define audit_watch_path(w) ""
124#define audit_watch_compare(w, i, d) 0
125
126#endif /* CONFIG_AUDIT_WATCH */
121 127
122#ifdef CONFIG_AUDIT_TREE 128#ifdef CONFIG_AUDIT_TREE
123extern struct audit_chunk *audit_tree_lookup(const struct inode *); 129extern struct audit_chunk *audit_tree_lookup(const struct inode *);
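The audit.h hunk uses the common kernel header pattern of pairing real prototypes with no-op or error-returning stand-ins when the config option is off, so callers compile without their own #ifdefs. A minimal sketch of the pattern with entirely made-up names (CONFIG_EXAMPLE_FEATURE, example_*):

#include <linux/errno.h>

struct example_obj;

#ifdef CONFIG_EXAMPLE_FEATURE
extern int example_attach(struct example_obj *obj);
extern void example_detach(struct example_obj *obj);
#else
#define example_attach(o)	(-EINVAL)	/* feature compiled out */
#define example_detach(o)	{}		/* silently do nothing */
#endif /* CONFIG_EXAMPLE_FEATURE */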
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
1#include "audit.h" 1#include "audit.h"
2#include <linux/inotify.h> 2#include <linux/fsnotify_backend.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
22 22
23struct audit_chunk { 23struct audit_chunk {
24 struct list_head hash; 24 struct list_head hash;
25 struct inotify_watch watch; 25 struct fsnotify_mark mark;
26 struct list_head trees; /* with root here */ 26 struct list_head trees; /* with root here */
27 int dead; 27 int dead;
28 int count; 28 int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
59 * tree is refcounted; one reference for "some rules on rules_list refer to 59 * tree is refcounted; one reference for "some rules on rules_list refer to
60 * it", one for each chunk with pointer to it. 60 * it", one for each chunk with pointer to it.
61 * 61 *
62 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount 62 * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
63 * of watch contributes 1 to .refs). 63 * of watch contributes 1 to .refs).
64 * 64 *
65 * node.index allows to get from node.list to containing chunk. 65 * node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
68 * that makes a difference. Some. 68 * that makes a difference. Some.
69 */ 69 */
70 70
71static struct inotify_handle *rtree_ih; 71static struct fsnotify_group *audit_tree_group;
72 72
73static struct audit_tree *alloc_tree(const char *s) 73static struct audit_tree *alloc_tree(const char *s)
74{ 74{
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
111 return tree->pathname; 111 return tree->pathname;
112} 112}
113 113
114static struct audit_chunk *alloc_chunk(int count)
115{
116 struct audit_chunk *chunk;
117 size_t size;
118 int i;
119
120 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
121 chunk = kzalloc(size, GFP_KERNEL);
122 if (!chunk)
123 return NULL;
124
125 INIT_LIST_HEAD(&chunk->hash);
126 INIT_LIST_HEAD(&chunk->trees);
127 chunk->count = count;
128 atomic_long_set(&chunk->refs, 1);
129 for (i = 0; i < count; i++) {
130 INIT_LIST_HEAD(&chunk->owners[i].list);
131 chunk->owners[i].index = i;
132 }
133 inotify_init_watch(&chunk->watch);
134 return chunk;
135}
136
137static void free_chunk(struct audit_chunk *chunk) 114static void free_chunk(struct audit_chunk *chunk)
138{ 115{
139 int i; 116 int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
157 audit_put_chunk(chunk); 134 audit_put_chunk(chunk);
158} 135}
159 136
137static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
138{
139 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
140 call_rcu(&chunk->head, __put_chunk);
141}
142
143static struct audit_chunk *alloc_chunk(int count)
144{
145 struct audit_chunk *chunk;
146 size_t size;
147 int i;
148
149 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
150 chunk = kzalloc(size, GFP_KERNEL);
151 if (!chunk)
152 return NULL;
153
154 INIT_LIST_HEAD(&chunk->hash);
155 INIT_LIST_HEAD(&chunk->trees);
156 chunk->count = count;
157 atomic_long_set(&chunk->refs, 1);
158 for (i = 0; i < count; i++) {
159 INIT_LIST_HEAD(&chunk->owners[i].list);
160 chunk->owners[i].index = i;
161 }
162 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
163 return chunk;
164}
165
160enum {HASH_SIZE = 128}; 166enum {HASH_SIZE = 128};
161static struct list_head chunk_hash_heads[HASH_SIZE]; 167static struct list_head chunk_hash_heads[HASH_SIZE];
162static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); 168static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
167 return chunk_hash_heads + n % HASH_SIZE; 173 return chunk_hash_heads + n % HASH_SIZE;
168} 174}
169 175
170/* hash_lock is held by caller */ 176/* hash_lock & entry->lock is held by caller */
171static void insert_hash(struct audit_chunk *chunk) 177static void insert_hash(struct audit_chunk *chunk)
172{ 178{
173 struct list_head *list = chunk_hash(chunk->watch.inode); 179 struct fsnotify_mark *entry = &chunk->mark;
180 struct list_head *list;
181
182 if (!entry->i.inode)
183 return;
184 list = chunk_hash(entry->i.inode);
174 list_add_rcu(&chunk->hash, list); 185 list_add_rcu(&chunk->hash, list);
175} 186}
176 187
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
181 struct audit_chunk *p; 192 struct audit_chunk *p;
182 193
183 list_for_each_entry_rcu(p, list, hash) { 194 list_for_each_entry_rcu(p, list, hash) {
184 if (p->watch.inode == inode) { 195 /* mark.inode may have gone NULL, but who cares? */
196 if (p->mark.i.inode == inode) {
185 atomic_long_inc(&p->refs); 197 atomic_long_inc(&p->refs);
186 return p; 198 return p;
187 } 199 }
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
210static void untag_chunk(struct node *p) 222static void untag_chunk(struct node *p)
211{ 223{
212 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark;
213 struct audit_chunk *new; 226 struct audit_chunk *new;
214 struct audit_tree *owner; 227 struct audit_tree *owner;
215 int size = chunk->count - 1; 228 int size = chunk->count - 1;
216 int i, j; 229 int i, j;
217 230
218 if (!pin_inotify_watch(&chunk->watch)) { 231 fsnotify_get_mark(entry);
219 /*
220 * Filesystem is shutting down; all watches are getting
221 * evicted, just take it off the node list for this
222 * tree and let the eviction logics take care of the
223 * rest.
224 */
225 owner = p->owner;
226 if (owner->root == chunk) {
227 list_del_init(&owner->same_root);
228 owner->root = NULL;
229 }
230 list_del_init(&p->list);
231 p->owner = NULL;
232 put_tree(owner);
233 return;
234 }
235 232
236 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
237 234
238 /* 235 spin_lock(&entry->lock);
239 * pin_inotify_watch() succeeded, so the watch won't go away 236 if (chunk->dead || !entry->i.inode) {
240 * from under us. 237 spin_unlock(&entry->lock);
241 */
242 mutex_lock(&chunk->watch.inode->inotify_mutex);
243 if (chunk->dead) {
244 mutex_unlock(&chunk->watch.inode->inotify_mutex);
245 goto out; 238 goto out;
246 } 239 }
247 240
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
256 list_del_init(&p->list); 249 list_del_init(&p->list);
257 list_del_rcu(&chunk->hash); 250 list_del_rcu(&chunk->hash);
258 spin_unlock(&hash_lock); 251 spin_unlock(&hash_lock);
259 inotify_evict_watch(&chunk->watch); 252 spin_unlock(&entry->lock);
260 mutex_unlock(&chunk->watch.inode->inotify_mutex); 253 fsnotify_destroy_mark(entry);
261 put_inotify_watch(&chunk->watch); 254 fsnotify_put_mark(entry);
262 goto out; 255 goto out;
263 } 256 }
264 257
265 new = alloc_chunk(size); 258 new = alloc_chunk(size);
266 if (!new) 259 if (!new)
267 goto Fallback; 260 goto Fallback;
268 if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { 261 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
269 free_chunk(new); 263 free_chunk(new);
270 goto Fallback; 264 goto Fallback;
271 } 265 }
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
298 list_for_each_entry(owner, &new->trees, same_root) 292 list_for_each_entry(owner, &new->trees, same_root)
299 owner->root = new; 293 owner->root = new;
300 spin_unlock(&hash_lock); 294 spin_unlock(&hash_lock);
301 inotify_evict_watch(&chunk->watch); 295 spin_unlock(&entry->lock);
302 mutex_unlock(&chunk->watch.inode->inotify_mutex); 296 fsnotify_destroy_mark(entry);
303 put_inotify_watch(&chunk->watch); 297 fsnotify_put_mark(entry);
304 goto out; 298 goto out;
305 299
306Fallback: 300Fallback:
@@ -314,31 +308,33 @@ Fallback:
314 p->owner = NULL; 308 p->owner = NULL;
315 put_tree(owner); 309 put_tree(owner);
316 spin_unlock(&hash_lock); 310 spin_unlock(&hash_lock);
317 mutex_unlock(&chunk->watch.inode->inotify_mutex); 311 spin_unlock(&entry->lock);
318out: 312out:
319 unpin_inotify_watch(&chunk->watch); 313 fsnotify_put_mark(entry);
320 spin_lock(&hash_lock); 314 spin_lock(&hash_lock);
321} 315}
322 316
323static int create_chunk(struct inode *inode, struct audit_tree *tree) 317static int create_chunk(struct inode *inode, struct audit_tree *tree)
324{ 318{
319 struct fsnotify_mark *entry;
325 struct audit_chunk *chunk = alloc_chunk(1); 320 struct audit_chunk *chunk = alloc_chunk(1);
326 if (!chunk) 321 if (!chunk)
327 return -ENOMEM; 322 return -ENOMEM;
328 323
329 if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { 324 entry = &chunk->mark;
325 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
330 free_chunk(chunk); 326 free_chunk(chunk);
331 return -ENOSPC; 327 return -ENOSPC;
332 } 328 }
333 329
334 mutex_lock(&inode->inotify_mutex); 330 spin_lock(&entry->lock);
335 spin_lock(&hash_lock); 331 spin_lock(&hash_lock);
336 if (tree->goner) { 332 if (tree->goner) {
337 spin_unlock(&hash_lock); 333 spin_unlock(&hash_lock);
338 chunk->dead = 1; 334 chunk->dead = 1;
339 inotify_evict_watch(&chunk->watch); 335 spin_unlock(&entry->lock);
340 mutex_unlock(&inode->inotify_mutex); 336 fsnotify_destroy_mark(entry);
341 put_inotify_watch(&chunk->watch); 337 fsnotify_put_mark(entry);
342 return 0; 338 return 0;
343 } 339 }
344 chunk->owners[0].index = (1U << 31); 340 chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
351 } 347 }
352 insert_hash(chunk); 348 insert_hash(chunk);
353 spin_unlock(&hash_lock); 349 spin_unlock(&hash_lock);
354 mutex_unlock(&inode->inotify_mutex); 350 spin_unlock(&entry->lock);
355 return 0; 351 return 0;
356} 352}
357 353
358/* the first tagged inode becomes root of tree */ 354/* the first tagged inode becomes root of tree */
359static int tag_chunk(struct inode *inode, struct audit_tree *tree) 355static int tag_chunk(struct inode *inode, struct audit_tree *tree)
360{ 356{
361 struct inotify_watch *watch; 357 struct fsnotify_mark *old_entry, *chunk_entry;
362 struct audit_tree *owner; 358 struct audit_tree *owner;
363 struct audit_chunk *chunk, *old; 359 struct audit_chunk *chunk, *old;
364 struct node *p; 360 struct node *p;
365 int n; 361 int n;
366 362
367 if (inotify_find_watch(rtree_ih, inode, &watch) < 0) 363 old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
364 if (!old_entry)
368 return create_chunk(inode, tree); 365 return create_chunk(inode, tree);
369 366
370 old = container_of(watch, struct audit_chunk, watch); 367 old = container_of(old_entry, struct audit_chunk, mark);
371 368
372 /* are we already there? */ 369 /* are we already there? */
373 spin_lock(&hash_lock); 370 spin_lock(&hash_lock);
374 for (n = 0; n < old->count; n++) { 371 for (n = 0; n < old->count; n++) {
375 if (old->owners[n].owner == tree) { 372 if (old->owners[n].owner == tree) {
376 spin_unlock(&hash_lock); 373 spin_unlock(&hash_lock);
377 put_inotify_watch(&old->watch); 374 fsnotify_put_mark(old_entry);
378 return 0; 375 return 0;
379 } 376 }
380 } 377 }
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
382 379
383 chunk = alloc_chunk(old->count + 1); 380 chunk = alloc_chunk(old->count + 1);
384 if (!chunk) { 381 if (!chunk) {
385 put_inotify_watch(&old->watch); 382 fsnotify_put_mark(old_entry);
386 return -ENOMEM; 383 return -ENOMEM;
387 } 384 }
388 385
389 mutex_lock(&inode->inotify_mutex); 386 chunk_entry = &chunk->mark;
390 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 387
391 mutex_unlock(&inode->inotify_mutex); 388 spin_lock(&old_entry->lock);
392 put_inotify_watch(&old->watch); 389 if (!old_entry->i.inode) {
390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry);
393 free_chunk(chunk); 393 free_chunk(chunk);
394 return -ENOENT;
395 }
396
397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock);
400 free_chunk(chunk);
401 fsnotify_put_mark(old_entry);
394 return -ENOSPC; 402 return -ENOSPC;
395 } 403 }
404
405 /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
406 spin_lock(&chunk_entry->lock);
396 spin_lock(&hash_lock); 407 spin_lock(&hash_lock);
408
409 /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
397 if (tree->goner) { 410 if (tree->goner) {
398 spin_unlock(&hash_lock); 411 spin_unlock(&hash_lock);
399 chunk->dead = 1; 412 chunk->dead = 1;
400 inotify_evict_watch(&chunk->watch); 413 spin_unlock(&chunk_entry->lock);
401 mutex_unlock(&inode->inotify_mutex); 414 spin_unlock(&old_entry->lock);
402 put_inotify_watch(&old->watch); 415
403 put_inotify_watch(&chunk->watch); 416 fsnotify_destroy_mark(chunk_entry);
417
418 fsnotify_put_mark(chunk_entry);
419 fsnotify_put_mark(old_entry);
404 return 0; 420 return 0;
405 } 421 }
406 list_replace_init(&old->trees, &chunk->trees); 422 list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
426 list_add(&tree->same_root, &chunk->trees); 442 list_add(&tree->same_root, &chunk->trees);
427 } 443 }
428 spin_unlock(&hash_lock); 444 spin_unlock(&hash_lock);
429 inotify_evict_watch(&old->watch); 445 spin_unlock(&chunk_entry->lock);
430 mutex_unlock(&inode->inotify_mutex); 446 spin_unlock(&old_entry->lock);
431 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ 447 fsnotify_destroy_mark(old_entry);
432 put_inotify_watch(&old->watch); /* and kill it */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
449 fsnotify_put_mark(old_entry); /* and kill it */
433 return 0; 450 return 0;
434} 451}
435 452
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
584 601
585 spin_lock(&hash_lock); 602 spin_lock(&hash_lock);
586 list_for_each_entry(node, &tree->chunks, list) { 603 list_for_each_entry(node, &tree->chunks, list) {
587 struct inode *inode = find_chunk(node)->watch.inode; 604 struct audit_chunk *chunk = find_chunk(node);
605 /* this could be NULL if the watch is dieing else where... */
606 struct inode *inode = chunk->mark.i.inode;
588 node->index |= 1U<<31; 607 node->index |= 1U<<31;
589 if (iterate_mounts(compare_root, inode, root_mnt)) 608 if (iterate_mounts(compare_root, inode, root_mnt))
590 node->index &= ~(1U<<31); 609 node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
846 * Here comes the stuff asynchronous to auditctl operations 865 * Here comes the stuff asynchronous to auditctl operations
847 */ 866 */
848 867
849/* inode->inotify_mutex is locked */
850static void evict_chunk(struct audit_chunk *chunk) 868static void evict_chunk(struct audit_chunk *chunk)
851{ 869{
852 struct audit_tree *owner; 870 struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
885 mutex_unlock(&audit_filter_mutex); 903 mutex_unlock(&audit_filter_mutex);
886} 904}
887 905
888static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, 906static int audit_tree_handle_event(struct fsnotify_group *group,
889 u32 cookie, const char *dname, struct inode *inode) 907 struct fsnotify_mark *inode_mark,
908 struct fsnotify_mark *vfsmonut_mark,
909 struct fsnotify_event *event)
910{
911 BUG();
912 return -EOPNOTSUPP;
913}
914
915static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
890{ 916{
891 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 917 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
892 918
893 if (mask & IN_IGNORED) { 919 evict_chunk(chunk);
894 evict_chunk(chunk); 920 fsnotify_put_mark(entry);
895 put_inotify_watch(watch);
896 }
897} 921}
898 922
899static void destroy_watch(struct inotify_watch *watch) 923static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
924 struct fsnotify_mark *inode_mark,
925 struct fsnotify_mark *vfsmount_mark,
926 __u32 mask, void *data, int data_type)
900{ 927{
901 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 928 return false;
902 call_rcu(&chunk->head, __put_chunk);
903} 929}
904 930
905static const struct inotify_operations rtree_inotify_ops = { 931static const struct fsnotify_ops audit_tree_ops = {
906 .handle_event = handle_event, 932 .handle_event = audit_tree_handle_event,
907 .destroy_watch = destroy_watch, 933 .should_send_event = audit_tree_send_event,
934 .free_group_priv = NULL,
935 .free_event_priv = NULL,
936 .freeing_mark = audit_tree_freeing_mark,
908}; 937};
909 938
910static int __init audit_tree_init(void) 939static int __init audit_tree_init(void)
911{ 940{
912 int i; 941 int i;
913 942
914 rtree_ih = inotify_init(&rtree_inotify_ops); 943 audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
915 if (IS_ERR(rtree_ih)) 944 if (IS_ERR(audit_tree_group))
916 audit_panic("cannot initialize inotify handle for rectree watches"); 945 audit_panic("cannot initialize fsnotify group for rectree watches");
917 946
918 for (i = 0; i < HASH_SIZE; i++) 947 for (i = 0; i < HASH_SIZE; i++)
919 INIT_LIST_HEAD(&chunk_hash_heads[i]); 948 INIT_LIST_HEAD(&chunk_hash_heads[i]);
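The audit_tree.c conversion replaces the inotify watch embedded in each chunk with an fsnotify mark, so the lifecycle becomes: allocate one fsnotify group at init, then add, destroy, and put marks per inode. A condensed sketch of that lifecycle using only calls and signatures visible in the hunks above; the example_* wrappers are illustrative and error handling is trimmed:

#include <linux/fsnotify_backend.h>

static struct fsnotify_group *example_group;

static void example_setup(const struct fsnotify_ops *ops)
{
	/* one group for all marks; fsnotify_alloc_group() can return an ERR_PTR */
	example_group = fsnotify_alloc_group(ops);
}

static int example_tag_inode(struct fsnotify_mark *mark, struct inode *inode,
			     void (*free_fn)(struct fsnotify_mark *))
{
	fsnotify_init_mark(mark, free_fn);	/* free_fn is the mark's destructor */
	return fsnotify_add_mark(mark, example_group, inode, NULL, 0);
}

static void example_untag(struct fsnotify_mark *mark)
{
	fsnotify_destroy_mark(mark);	/* detach the mark from its inode */
	fsnotify_put_mark(mark);	/* drop our reference */
}

Note the locking the patch settles on: the mark's own spinlock takes over the role of inode->inotify_mutex, with hash_lock continuing to nest inside it.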
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..f0c9b2e7542d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fsnotify_backend.h>
27#include <linux/namei.h> 28#include <linux/namei.h>
28#include <linux/netlink.h> 29#include <linux/netlink.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/slab.h> 31#include <linux/slab.h>
31#include <linux/inotify.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include "audit.h" 33#include "audit.h"
34 34
35/* 35/*
36 * Reference counting: 36 * Reference counting:
37 * 37 *
38 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED 38 * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
39 * event. Each audit_watch holds a reference to its associated parent. 39 * event. Each audit_watch holds a reference to its associated parent.
40 * 40 *
41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to 41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
51 unsigned long ino; /* associated inode number */ 51 unsigned long ino; /* associated inode number */
52 struct audit_parent *parent; /* associated parent */ 52 struct audit_parent *parent; /* associated parent */
53 struct list_head wlist; /* entry in parent->watches list */ 53 struct list_head wlist; /* entry in parent->watches list */
54 struct list_head rules; /* associated rules */ 54 struct list_head rules; /* anchor for krule->rlist */
55}; 55};
56 56
57struct audit_parent { 57struct audit_parent {
58 struct list_head ilist; /* entry in inotify registration list */ 58 struct list_head watches; /* anchor for audit_watch->wlist */
59 struct list_head watches; /* associated watches */ 59 struct fsnotify_mark mark; /* fsnotify mark on the inode */
60 struct inotify_watch wdata; /* inotify watch data */
61 unsigned flags; /* status flags */
62}; 60};
63 61
64/* Inotify handle. */ 62/* fsnotify handle. */
65struct inotify_handle *audit_ih; 63struct fsnotify_group *audit_watch_group;
66 64
67/* 65/* fsnotify events we care about. */
68 * audit_parent status flags: 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
69 * 67 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
70 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
71 * a filesystem event to ensure we're adding audit watches to a valid parent.
72 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
73 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
74 * we can receive while holding nameidata.
75 */
76#define AUDIT_PARENT_INVALID 0x001
77 68
78/* Inotify events we care about. */ 69static void audit_free_parent(struct audit_parent *parent)
79#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 70{
71 WARN_ON(!list_empty(&parent->watches));
72 kfree(parent);
73}
80 74
81static void audit_free_parent(struct inotify_watch *i_watch) 75static void audit_watch_free_mark(struct fsnotify_mark *entry)
82{ 76{
83 struct audit_parent *parent; 77 struct audit_parent *parent;
84 78
85 parent = container_of(i_watch, struct audit_parent, wdata); 79 parent = container_of(entry, struct audit_parent, mark);
86 WARN_ON(!list_empty(&parent->watches)); 80 audit_free_parent(parent);
87 kfree(parent); 81}
82
83static void audit_get_parent(struct audit_parent *parent)
84{
85 if (likely(parent))
86 fsnotify_get_mark(&parent->mark);
87}
88
89static void audit_put_parent(struct audit_parent *parent)
90{
91 if (likely(parent))
92 fsnotify_put_mark(&parent->mark);
93}
94
95/*
96 * Find and return the audit_parent on the given inode. If found a reference
97 * is taken on this parent.
98 */
99static inline struct audit_parent *audit_find_parent(struct inode *inode)
100{
101 struct audit_parent *parent = NULL;
102 struct fsnotify_mark *entry;
103
104 entry = fsnotify_find_inode_mark(audit_watch_group, inode);
105 if (entry)
106 parent = container_of(entry, struct audit_parent, mark);
107
108 return parent;
88} 109}
89 110
90void audit_get_watch(struct audit_watch *watch) 111void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
105void audit_remove_watch(struct audit_watch *watch) 126void audit_remove_watch(struct audit_watch *watch)
106{ 127{
107 list_del(&watch->wlist); 128 list_del(&watch->wlist);
108 put_inotify_watch(&watch->parent->wdata); 129 audit_put_parent(watch->parent);
109 watch->parent = NULL; 130 watch->parent = NULL;
110 audit_put_watch(watch); /* match initial get */ 131 audit_put_watch(watch); /* match initial get */
111} 132}
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
115 return watch->path; 136 return watch->path;
116} 137}
117 138
118struct list_head *audit_watch_rules(struct audit_watch *watch) 139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
119{
120 return &watch->rules;
121}
122
123unsigned long audit_watch_inode(struct audit_watch *watch)
124{ 140{
125 return watch->ino; 141 return (watch->ino != (unsigned long)-1) &&
126} 142 (watch->ino == ino) &&
127 143 (watch->dev == dev);
128dev_t audit_watch_dev(struct audit_watch *watch)
129{
130 return watch->dev;
131} 144}
132 145
133/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
134static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct nameidata *ndp)
135{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode;
136 struct audit_parent *parent; 150 struct audit_parent *parent;
137 s32 wd; 151 int ret;
138 152
139 parent = kzalloc(sizeof(*parent), GFP_KERNEL); 153 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
140 if (unlikely(!parent)) 154 if (unlikely(!parent))
141 return ERR_PTR(-ENOMEM); 155 return ERR_PTR(-ENOMEM);
142 156
143 INIT_LIST_HEAD(&parent->watches); 157 INIT_LIST_HEAD(&parent->watches);
144 parent->flags = 0; 158
145 159 fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
146 inotify_init_watch(&parent->wdata); 160 parent->mark.mask = AUDIT_FS_WATCH;
147 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ 161 ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
148 get_inotify_watch(&parent->wdata); 162 if (ret < 0) {
149 wd = inotify_add_watch(audit_ih, &parent->wdata, 163 audit_free_parent(parent);
150 ndp->path.dentry->d_inode, AUDIT_IN_WATCH); 164 return ERR_PTR(ret);
151 if (wd < 0) {
152 audit_free_parent(&parent->wdata);
153 return ERR_PTR(wd);
154 } 165 }
155 166
156 return parent; 167 return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
179{ 190{
180 struct audit_watch *watch; 191 struct audit_watch *watch;
181 192
182 if (!audit_ih) 193 if (!audit_watch_group)
183 return -EOPNOTSUPP; 194 return -EOPNOTSUPP;
184 195
185 if (path[0] != '/' || path[len-1] == '/' || 196 if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
217 228
218 new->dev = old->dev; 229 new->dev = old->dev;
219 new->ino = old->ino; 230 new->ino = old->ino;
220 get_inotify_watch(&old->parent->wdata); 231 audit_get_parent(old->parent);
221 new->parent = old->parent; 232 new->parent = old->parent;
222 233
223out: 234out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
251 struct audit_entry *oentry, *nentry; 262 struct audit_entry *oentry, *nentry;
252 263
253 mutex_lock(&audit_filter_mutex); 264 mutex_lock(&audit_filter_mutex);
265 /* Run all of the watches on this parent looking for the one that
266 * matches the given dname */
254 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 267 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
255 if (audit_compare_dname_path(dname, owatch->path, NULL)) 268 if (audit_compare_dname_path(dname, owatch->path, NULL))
256 continue; 269 continue;
257 270
258 /* If the update involves invalidating rules, do the inode-based 271 /* If the update involves invalidating rules, do the inode-based
259 * filtering now, so we don't omit records. */ 272 * filtering now, so we don't omit records. */
260 if (invalidating && current->audit_context) 273 if (invalidating && !audit_dummy_context())
261 audit_filter_inodes(current, current->audit_context); 274 audit_filter_inodes(current, current->audit_context);
262 275
276 /* updating ino will likely change which audit_hash_list we
277 * are on so we need a new watch for the new list */
263 nwatch = audit_dupe_watch(owatch); 278 nwatch = audit_dupe_watch(owatch);
264 if (IS_ERR(nwatch)) { 279 if (IS_ERR(nwatch)) {
265 mutex_unlock(&audit_filter_mutex); 280 mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
275 list_del(&oentry->rule.rlist); 290 list_del(&oentry->rule.rlist);
276 list_del_rcu(&oentry->list); 291 list_del_rcu(&oentry->list);
277 292
278 nentry = audit_dupe_rule(&oentry->rule, nwatch); 293 nentry = audit_dupe_rule(&oentry->rule);
279 if (IS_ERR(nentry)) { 294 if (IS_ERR(nentry)) {
280 list_del(&oentry->rule.list); 295 list_del(&oentry->rule.list);
281 audit_panic("error updating watch, removing"); 296 audit_panic("error updating watch, removing");
282 } else { 297 } else {
283 int h = audit_hash_ino((u32)ino); 298 int h = audit_hash_ino((u32)ino);
299
300 /*
301 * nentry->rule.watch == oentry->rule.watch so
302 * we must drop that reference and set it to our
303 * new watch.
304 */
305 audit_put_watch(nentry->rule.watch);
306 audit_get_watch(nwatch);
307 nentry->rule.watch = nwatch;
284 list_add(&nentry->rule.rlist, &nwatch->rules); 308 list_add(&nentry->rule.rlist, &nwatch->rules);
285 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 309 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
286 list_replace(&oentry->rule.list, 310 list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
312 struct audit_entry *e; 336 struct audit_entry *e;
313 337
314 mutex_lock(&audit_filter_mutex); 338 mutex_lock(&audit_filter_mutex);
315 parent->flags |= AUDIT_PARENT_INVALID;
316 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 339 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
317 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 340 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
318 e = container_of(r, struct audit_entry, rule); 341 e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
325 audit_remove_watch(w); 348 audit_remove_watch(w);
326 } 349 }
327 mutex_unlock(&audit_filter_mutex); 350 mutex_unlock(&audit_filter_mutex);
328}
329
330/* Unregister inotify watches for parents on in_list.
331 * Generates an IN_IGNORED event. */
332void audit_inotify_unregister(struct list_head *in_list)
333{
334 struct audit_parent *p, *n;
335 351
336 list_for_each_entry_safe(p, n, in_list, ilist) { 352 fsnotify_destroy_mark(&parent->mark);
337 list_del(&p->ilist);
338 inotify_rm_watch(audit_ih, &p->wdata);
339 /* the unpin matching the pin in audit_do_del_rule() */
340 unpin_inotify_watch(&p->wdata);
341 }
342} 353}
343 354
344/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
389 } 400 }
390} 401}
391 402
392/* Associate the given rule with an existing parent inotify_watch. 403/* Associate the given rule with an existing parent.
393 * Caller must hold audit_filter_mutex. */ 404 * Caller must hold audit_filter_mutex. */
394static void audit_add_to_parent(struct audit_krule *krule, 405static void audit_add_to_parent(struct audit_krule *krule,
395 struct audit_parent *parent) 406 struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
397 struct audit_watch *w, *watch = krule->watch; 408 struct audit_watch *w, *watch = krule->watch;
398 int watch_found = 0; 409 int watch_found = 0;
399 410
411 BUG_ON(!mutex_is_locked(&audit_filter_mutex));
412
400 list_for_each_entry(w, &parent->watches, wlist) { 413 list_for_each_entry(w, &parent->watches, wlist) {
401 if (strcmp(watch->path, w->path)) 414 if (strcmp(watch->path, w->path))
402 continue; 415 continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
413 } 426 }
414 427
415 if (!watch_found) { 428 if (!watch_found) {
416 get_inotify_watch(&parent->wdata); 429 audit_get_parent(parent);
417 watch->parent = parent; 430 watch->parent = parent;
418 431
419 list_add(&watch->wlist, &parent->watches); 432 list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule,
423 436
424/* Find a matching watch entry, or add this one. 437/* Find a matching watch entry, or add this one.
425 * Caller must hold audit_filter_mutex. */ 438 * Caller must hold audit_filter_mutex. */
426int audit_add_watch(struct audit_krule *krule) 439int audit_add_watch(struct audit_krule *krule, struct list_head **list)
427{ 440{
428 struct audit_watch *watch = krule->watch; 441 struct audit_watch *watch = krule->watch;
429 struct inotify_watch *i_watch;
430 struct audit_parent *parent; 442 struct audit_parent *parent;
431 struct nameidata *ndp = NULL, *ndw = NULL; 443 struct nameidata *ndp = NULL, *ndw = NULL;
432 int ret = 0; 444 int h, ret = 0;
433 445
434 mutex_unlock(&audit_filter_mutex); 446 mutex_unlock(&audit_filter_mutex);
435 447
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
441 goto error; 453 goto error;
442 } 454 }
443 455
456 mutex_lock(&audit_filter_mutex);
457
444 /* update watch filter fields */ 458 /* update watch filter fields */
445 if (ndw) { 459 if (ndw) {
446 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; 460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
447 watch->ino = ndw->path.dentry->d_inode->i_ino; 461 watch->ino = ndw->path.dentry->d_inode->i_ino;
448 } 462 }
449 463
450 /* The audit_filter_mutex must not be held during inotify calls because 464 /* either find an old parent or attach a new one */
451 * we hold it during inotify event callback processing. If an existing 465 parent = audit_find_parent(ndp->path.dentry->d_inode);
452 * inotify watch is found, inotify_find_watch() grabs a reference before 466 if (!parent) {
453 * returning.
454 */
455 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
456 &i_watch) < 0) {
457 parent = audit_init_parent(ndp); 467 parent = audit_init_parent(ndp);
458 if (IS_ERR(parent)) { 468 if (IS_ERR(parent)) {
459 /* caller expects mutex locked */
460 mutex_lock(&audit_filter_mutex);
461 ret = PTR_ERR(parent); 469 ret = PTR_ERR(parent);
462 goto error; 470 goto error;
463 } 471 }
464 } else 472 }
465 parent = container_of(i_watch, struct audit_parent, wdata);
466
467 mutex_lock(&audit_filter_mutex);
468 473
469 /* parent was moved before we took audit_filter_mutex */ 474 audit_add_to_parent(krule, parent);
470 if (parent->flags & AUDIT_PARENT_INVALID)
471 ret = -ENOENT;
472 else
473 audit_add_to_parent(krule, parent);
474 475
475 /* match get in audit_init_parent or inotify_find_watch */ 476 /* match get in audit_find_parent or audit_init_parent */
476 put_inotify_watch(&parent->wdata); 477 audit_put_parent(parent);
477 478
479 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h];
478error: 481error:
479 audit_put_nd(ndp, ndw); /* NULL args OK */ 482 audit_put_nd(ndp, ndw); /* NULL args OK */
480 return ret; 483 return ret;
481 484
482} 485}
483 486
484void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) 487void audit_remove_watch_rule(struct audit_krule *krule)
485{ 488{
486 struct audit_watch *watch = krule->watch; 489 struct audit_watch *watch = krule->watch;
487 struct audit_parent *parent = watch->parent; 490 struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
492 audit_remove_watch(watch); 495 audit_remove_watch(watch);
493 496
494 if (list_empty(&parent->watches)) { 497 if (list_empty(&parent->watches)) {
495 /* Put parent on the inotify un-registration 498 audit_get_parent(parent);
496 * list. Grab a reference before releasing 499 fsnotify_destroy_mark(&parent->mark);
497 * audit_filter_mutex, to be released in 500 audit_put_parent(parent);
498 * audit_inotify_unregister().
499 * If filesystem is going away, just leave
500 * the sucker alone, eviction will take
501 * care of it. */
502 if (pin_inotify_watch(&parent->wdata))
503 list_add(&parent->ilist, list);
504 } 501 }
505 } 502 }
506} 503}
507 504
508/* Update watch data in audit rules based on inotify events. */ 505static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
509static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, 506 struct fsnotify_mark *inode_mark,
510 u32 cookie, const char *dname, struct inode *inode) 507 struct fsnotify_mark *vfsmount_mark,
508 __u32 mask, void *data, int data_type)
509{
510 return true;
511}
512
513/* Update watch data in audit rules based on fsnotify events. */
514static int audit_watch_handle_event(struct fsnotify_group *group,
515 struct fsnotify_mark *inode_mark,
516 struct fsnotify_mark *vfsmount_mark,
517 struct fsnotify_event *event)
511{ 518{
519 struct inode *inode;
520 __u32 mask = event->mask;
521 const char *dname = event->file_name;
512 struct audit_parent *parent; 522 struct audit_parent *parent;
513 523
514 parent = container_of(i_watch, struct audit_parent, wdata); 524 parent = container_of(inode_mark, struct audit_parent, mark);
515 525
516 if (mask & (IN_CREATE|IN_MOVED_TO) && inode) 526 BUG_ON(group != audit_watch_group);
517 audit_update_watch(parent, dname, inode->i_sb->s_dev, 527
518 inode->i_ino, 0); 528 switch (event->data_type) {
519 else if (mask & (IN_DELETE|IN_MOVED_FROM)) 529 case (FSNOTIFY_EVENT_PATH):
530 inode = event->path.dentry->d_inode;
531 break;
532 case (FSNOTIFY_EVENT_INODE):
533 inode = event->inode;
534 break;
535 default:
536 BUG();
537 inode = NULL;
538 break;
539 };
540
541 if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
542 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
543 else if (mask & (FS_DELETE|FS_MOVED_FROM))
520 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); 544 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
521 /* inotify automatically removes the watch and sends IN_IGNORED */ 545 else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
522 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
523 audit_remove_parent_watches(parent);
524 /* inotify does not remove the watch, so remove it manually */
525 else if(mask & IN_MOVE_SELF) {
526 audit_remove_parent_watches(parent); 546 audit_remove_parent_watches(parent);
527 inotify_remove_watch_locked(audit_ih, i_watch); 547
528 } else if (mask & IN_IGNORED) 548 return 0;
529 put_inotify_watch(i_watch);
530} 549}
531 550
532static const struct inotify_operations audit_inotify_ops = { 551static const struct fsnotify_ops audit_watch_fsnotify_ops = {
533 .handle_event = audit_handle_ievent, 552 .should_send_event = audit_watch_should_send_event,
534 .destroy_watch = audit_free_parent, 553 .handle_event = audit_watch_handle_event,
554 .free_group_priv = NULL,
555 .freeing_mark = NULL,
556 .free_event_priv = NULL,
535}; 557};
536 558
537static int __init audit_watch_init(void) 559static int __init audit_watch_init(void)
538{ 560{
539 audit_ih = inotify_init(&audit_inotify_ops); 561 audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
540 if (IS_ERR(audit_ih)) 562 if (IS_ERR(audit_watch_group)) {
541 audit_panic("cannot initialize inotify handle"); 563 audit_watch_group = NULL;
564 audit_panic("cannot create audit fsnotify group");
565 }
542 return 0; 566 return 0;
543} 567}
544subsys_initcall(audit_watch_init); 568device_initcall(audit_watch_init);
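
Reading aid (not part of the patch): the audit_watch.c hunks above drop the private inotify handle in favour of an fsnotify group. Below is a minimal sketch of that registration pattern, using only the calls and callback signatures visible in the hunks; every name prefixed with example_ is hypothetical.

#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/err.h>

static struct fsnotify_group *example_group;

/* Accept every event; real filtering can happen in handle_event. */
static bool example_should_send_event(struct fsnotify_group *group,
				      struct inode *inode,
				      struct fsnotify_mark *inode_mark,
				      struct fsnotify_mark *vfsmount_mark,
				      __u32 mask, void *data, int data_type)
{
	return true;
}

static int example_handle_event(struct fsnotify_group *group,
				struct fsnotify_mark *inode_mark,
				struct fsnotify_mark *vfsmount_mark,
				struct fsnotify_event *event)
{
	/* React to event->mask and event->file_name here. */
	return 0;
}

static const struct fsnotify_ops example_fsnotify_ops = {
	.should_send_event = example_should_send_event,
	.handle_event      = example_handle_event,
	.free_group_priv   = NULL,
	.freeing_mark      = NULL,
	.free_event_priv   = NULL,
};

static int __init example_init(void)
{
	example_group = fsnotify_alloc_group(&example_fsnotify_ops);
	if (IS_ERR(example_group))
		return PTR_ERR(example_group);	/* mirrors the audit error path */
	return 0;
}
device_initcall(example_init);
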
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce08041f578d..eb7675499fb5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e)
71{ 71{
72 int i; 72 int i;
73 struct audit_krule *erule = &e->rule; 73 struct audit_krule *erule = &e->rule;
74
74 /* some rules don't have associated watches */ 75 /* some rules don't have associated watches */
75 if (erule->watch) 76 if (erule->watch)
76 audit_put_watch(erule->watch); 77 audit_put_watch(erule->watch);
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
746 * rule with the new rule in the filterlist, then free the old rule. 747 * rule with the new rule in the filterlist, then free the old rule.
747 * The rlist element is undefined; list manipulations are handled apart from 748 * The rlist element is undefined; list manipulations are handled apart from
748 * the initial copy. */ 749 * the initial copy. */
749struct audit_entry *audit_dupe_rule(struct audit_krule *old, 750struct audit_entry *audit_dupe_rule(struct audit_krule *old)
750 struct audit_watch *watch)
751{ 751{
752 u32 fcount = old->field_count; 752 u32 fcount = old->field_count;
753 struct audit_entry *entry; 753 struct audit_entry *entry;
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
769 new->prio = old->prio; 769 new->prio = old->prio;
770 new->buflen = old->buflen; 770 new->buflen = old->buflen;
771 new->inode_f = old->inode_f; 771 new->inode_f = old->inode_f;
772 new->watch = NULL;
773 new->field_count = old->field_count; 772 new->field_count = old->field_count;
773
774 /* 774 /*
775 * note that we are OK with not refcounting here; audit_match_tree() 775 * note that we are OK with not refcounting here; audit_match_tree()
776 * never dereferences tree and we can't get false positives there 776 * never dereferences tree and we can't get false positives there
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
811 } 811 }
812 } 812 }
813 813
814 if (watch) { 814 if (old->watch) {
815 audit_get_watch(watch); 815 audit_get_watch(old->watch);
816 new->watch = watch; 816 new->watch = old->watch;
817 } 817 }
818 818
819 return entry; 819 return entry;
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
866 struct audit_watch *watch = entry->rule.watch; 866 struct audit_watch *watch = entry->rule.watch;
867 struct audit_tree *tree = entry->rule.tree; 867 struct audit_tree *tree = entry->rule.tree;
868 struct list_head *list; 868 struct list_head *list;
869 int h, err; 869 int err;
870#ifdef CONFIG_AUDITSYSCALL 870#ifdef CONFIG_AUDITSYSCALL
871 int dont_count = 0; 871 int dont_count = 0;
872 872
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry)
889 889
890 if (watch) { 890 if (watch) {
891 /* audit_filter_mutex is dropped and re-taken during this call */ 891 /* audit_filter_mutex is dropped and re-taken during this call */
892 err = audit_add_watch(&entry->rule); 892 err = audit_add_watch(&entry->rule, &list);
893 if (err) { 893 if (err) {
894 mutex_unlock(&audit_filter_mutex); 894 mutex_unlock(&audit_filter_mutex);
895 goto error; 895 goto error;
896 } 896 }
897 /* entry->rule.watch may have changed during audit_add_watch() */
898 watch = entry->rule.watch;
899 h = audit_hash_ino((u32)audit_watch_inode(watch));
900 list = &audit_inode_hash[h];
901 } 897 }
902 if (tree) { 898 if (tree) {
903 err = audit_add_tree_rule(&entry->rule); 899 err = audit_add_tree_rule(&entry->rule);
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
949 struct audit_watch *watch = entry->rule.watch; 945 struct audit_watch *watch = entry->rule.watch;
950 struct audit_tree *tree = entry->rule.tree; 946 struct audit_tree *tree = entry->rule.tree;
951 struct list_head *list; 947 struct list_head *list;
952 LIST_HEAD(inotify_list);
953 int ret = 0; 948 int ret = 0;
954#ifdef CONFIG_AUDITSYSCALL 949#ifdef CONFIG_AUDITSYSCALL
955 int dont_count = 0; 950 int dont_count = 0;
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
969 } 964 }
970 965
971 if (e->rule.watch) 966 if (e->rule.watch)
972 audit_remove_watch_rule(&e->rule, &inotify_list); 967 audit_remove_watch_rule(&e->rule);
973 968
974 if (e->rule.tree) 969 if (e->rule.tree)
975 audit_remove_tree_rule(&e->rule); 970 audit_remove_tree_rule(&e->rule);
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
987#endif 982#endif
988 mutex_unlock(&audit_filter_mutex); 983 mutex_unlock(&audit_filter_mutex);
989 984
990 if (!list_empty(&inotify_list))
991 audit_inotify_unregister(&inotify_list);
992
993out: 985out:
994 if (watch) 986 if (watch)
995 audit_put_watch(watch); /* match initial get */ 987 audit_put_watch(watch); /* match initial get */
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r)
1323{ 1315{
1324 struct audit_entry *entry = container_of(r, struct audit_entry, rule); 1316 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1325 struct audit_entry *nentry; 1317 struct audit_entry *nentry;
1326 struct audit_watch *watch;
1327 struct audit_tree *tree;
1328 int err = 0; 1318 int err = 0;
1329 1319
1330 if (!security_audit_rule_known(r)) 1320 if (!security_audit_rule_known(r))
1331 return 0; 1321 return 0;
1332 1322
1333 watch = r->watch; 1323 nentry = audit_dupe_rule(r);
1334 tree = r->tree;
1335 nentry = audit_dupe_rule(r, watch);
1336 if (IS_ERR(nentry)) { 1324 if (IS_ERR(nentry)) {
1337 /* save the first error encountered for the 1325 /* save the first error encountered for the
1338 * return value */ 1326 * return value */
1339 err = PTR_ERR(nentry); 1327 err = PTR_ERR(nentry);
1340 audit_panic("error updating LSM filters"); 1328 audit_panic("error updating LSM filters");
1341 if (watch) 1329 if (r->watch)
1342 list_del(&r->rlist); 1330 list_del(&r->rlist);
1343 list_del_rcu(&entry->list); 1331 list_del_rcu(&entry->list);
1344 list_del(&r->list); 1332 list_del(&r->list);
1345 } else { 1333 } else {
1346 if (watch) { 1334 if (r->watch || r->tree)
1347 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1348 list_del(&r->rlist);
1349 } else if (tree)
1350 list_replace_init(&r->rlist, &nentry->rule.rlist); 1335 list_replace_init(&r->rlist, &nentry->rule.rlist);
1351 list_replace_rcu(&entry->list, &nentry->list); 1336 list_replace_rcu(&entry->list, &nentry->list);
1352 list_replace(&r->list, &nentry->rule.list); 1337 list_replace(&r->list, &nentry->rule.list);
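
Reading aid (not part of the patch): after this change audit_add_watch() hands the destination hash bucket back through a pointer instead of the caller recomputing audit_hash_ino() itself. A condensed sketch of the new caller pattern follows; example_insert_watch_rule is a hypothetical helper, the real logic lives in audit_add_rule() above.

static int example_insert_watch_rule(struct audit_entry *entry)
{
	struct list_head *list;
	int err;

	/* audit_filter_mutex is dropped and re-taken inside this call. */
	err = audit_add_watch(&entry->rule, &list);
	if (err)
		return err;

	/* 'list' already points at the right audit_inode_hash[] bucket. */
	list_add_rcu(&entry->list, list);
	return 0;
}
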
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3828ad5fb8f1..1b31c130d034 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
65#include <linux/binfmts.h> 65#include <linux/binfmts.h>
66#include <linux/highmem.h> 66#include <linux/highmem.h>
67#include <linux/syscalls.h> 67#include <linux/syscalls.h>
68#include <linux/inotify.h>
69#include <linux/capability.h> 68#include <linux/capability.h>
70#include <linux/fs_struct.h> 69#include <linux/fs_struct.h>
71 70
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk,
549 } 548 }
550 break; 549 break;
551 case AUDIT_WATCH: 550 case AUDIT_WATCH:
552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) 551 if (name)
553 result = (name->dev == audit_watch_dev(rule->watch) && 552 result = audit_watch_compare(rule->watch, name->ino, name->dev);
554 name->ino == audit_watch_inode(rule->watch));
555 break; 553 break;
556 case AUDIT_DIR: 554 case AUDIT_DIR:
557 if (ctx) 555 if (ctx)
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
1726 struct audit_tree_refs *p; 1724 struct audit_tree_refs *p;
1727 struct audit_chunk *chunk; 1725 struct audit_chunk *chunk;
1728 int count; 1726 int count;
1729 if (likely(list_empty(&inode->inotify_watches))) 1727 if (likely(hlist_empty(&inode->i_fsnotify_marks)))
1730 return; 1728 return;
1731 context = current->audit_context; 1729 context = current->audit_context;
1732 p = context->trees; 1730 p = context->trees;
@@ -1769,7 +1767,7 @@ retry:
1769 seq = read_seqbegin(&rename_lock); 1767 seq = read_seqbegin(&rename_lock);
1770 for(;;) { 1768 for(;;) {
1771 struct inode *inode = d->d_inode; 1769 struct inode *inode = d->d_inode;
1772 if (inode && unlikely(!list_empty(&inode->inotify_watches))) { 1770 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
1773 struct audit_chunk *chunk; 1771 struct audit_chunk *chunk;
1774 chunk = audit_tree_lookup(inode); 1772 chunk = audit_tree_lookup(inode);
1775 if (chunk) { 1773 if (chunk) {
@@ -1837,13 +1835,8 @@ void __audit_getname(const char *name)
1837 context->names[context->name_count].ino = (unsigned long)-1; 1835 context->names[context->name_count].ino = (unsigned long)-1;
1838 context->names[context->name_count].osid = 0; 1836 context->names[context->name_count].osid = 0;
1839 ++context->name_count; 1837 ++context->name_count;
1840 if (!context->pwd.dentry) { 1838 if (!context->pwd.dentry)
1841 read_lock(&current->fs->lock); 1839 get_fs_pwd(current->fs, &context->pwd);
1842 context->pwd = current->fs->pwd;
1843 path_get(&current->fs->pwd);
1844 read_unlock(&current->fs->lock);
1845 }
1846
1847} 1840}
1848 1841
1849/* audit_putname - intercept a putname request 1842/* audit_putname - intercept a putname request
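
Reading aid (not part of the patch): the __audit_getname() hunk swaps the open-coded fs->lock / path_get() sequence for the get_fs_pwd() helper. A small usage sketch under the assumption that the helper copies fs->pwd and takes a path reference which the caller later drops with path_put(); example_snapshot_pwd is a hypothetical name.

#include <linux/fs_struct.h>
#include <linux/path.h>
#include <linux/sched.h>

static void example_snapshot_pwd(struct path *pwd)
{
	/* Copies current->fs->pwd into *pwd and grabs a reference. */
	get_fs_pwd(current->fs, pwd);
}

/* The caller eventually releases it with path_put(pwd). */
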
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3ac6f5b0a64b..5cf366965d0c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,6 @@
52#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
53#include <linux/hash.h> 53#include <linux/hash.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/smp_lock.h>
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
@@ -138,7 +137,7 @@ struct css_id {
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 137 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * css_tryget() should be used for avoiding race. 138 * css_tryget() should be used for avoiding race.
140 */ 139 */
141 struct cgroup_subsys_state *css; 140 struct cgroup_subsys_state __rcu *css;
142 /* 141 /*
143 * ID of this css. 142 * ID of this css.
144 */ 143 */
@@ -244,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 243 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
245} 244}
246 245
246static int clone_children(const struct cgroup *cgrp)
247{
248 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
249}
250
247/* 251/*
248 * for_each_subsys() allows you to iterate on each subsystem attached to 252 * for_each_subsys() allows you to iterate on each subsystem attached to
249 * an active hierarchy 253 * an active hierarchy
@@ -778,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
778 struct inode *inode = new_inode(sb); 782 struct inode *inode = new_inode(sb);
779 783
780 if (inode) { 784 if (inode) {
785 inode->i_ino = get_next_ino();
781 inode->i_mode = mode; 786 inode->i_mode = mode;
782 inode->i_uid = current_fsuid(); 787 inode->i_uid = current_fsuid();
783 inode->i_gid = current_fsgid(); 788 inode->i_gid = current_fsgid();
@@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1040 seq_puts(seq, ",noprefix"); 1045 seq_puts(seq, ",noprefix");
1041 if (strlen(root->release_agent_path)) 1046 if (strlen(root->release_agent_path))
1042 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1047 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1048 if (clone_children(&root->top_cgroup))
1049 seq_puts(seq, ",clone_children");
1043 if (strlen(root->name)) 1050 if (strlen(root->name))
1044 seq_printf(seq, ",name=%s", root->name); 1051 seq_printf(seq, ",name=%s", root->name);
1045 mutex_unlock(&cgroup_mutex); 1052 mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1057,7 @@ struct cgroup_sb_opts {
1050 unsigned long subsys_bits; 1057 unsigned long subsys_bits;
1051 unsigned long flags; 1058 unsigned long flags;
1052 char *release_agent; 1059 char *release_agent;
1060 bool clone_children;
1053 char *name; 1061 char *name;
1054 /* User explicitly requested empty subsystem */ 1062 /* User explicitly requested empty subsystem */
1055 bool none; 1063 bool none;
@@ -1066,7 +1074,8 @@ struct cgroup_sb_opts {
1066 */ 1074 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1075static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1068{ 1076{
1069 char *token, *o = data ?: "all"; 1077 char *token, *o = data;
1078 bool all_ss = false, one_ss = false;
1070 unsigned long mask = (unsigned long)-1; 1079 unsigned long mask = (unsigned long)-1;
1071 int i; 1080 int i;
1072 bool module_pin_failed = false; 1081 bool module_pin_failed = false;
@@ -1082,30 +1091,37 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1082 while ((token = strsep(&o, ",")) != NULL) { 1091 while ((token = strsep(&o, ",")) != NULL) {
1083 if (!*token) 1092 if (!*token)
1084 return -EINVAL; 1093 return -EINVAL;
1085 if (!strcmp(token, "all")) { 1094 if (!strcmp(token, "none")) {
1086 /* Add all non-disabled subsystems */
1087 opts->subsys_bits = 0;
1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
1092 if (!ss->disabled)
1093 opts->subsys_bits |= 1ul << i;
1094 }
1095 } else if (!strcmp(token, "none")) {
1096 /* Explicitly have no subsystems */ 1095 /* Explicitly have no subsystems */
1097 opts->none = true; 1096 opts->none = true;
1098 } else if (!strcmp(token, "noprefix")) { 1097 continue;
1098 }
1099 if (!strcmp(token, "all")) {
1100 /* Mutually exclusive option 'all' + subsystem name */
1101 if (one_ss)
1102 return -EINVAL;
1103 all_ss = true;
1104 continue;
1105 }
1106 if (!strcmp(token, "noprefix")) {
1099 set_bit(ROOT_NOPREFIX, &opts->flags); 1107 set_bit(ROOT_NOPREFIX, &opts->flags);
1100 } else if (!strncmp(token, "release_agent=", 14)) { 1108 continue;
1109 }
1110 if (!strcmp(token, "clone_children")) {
1111 opts->clone_children = true;
1112 continue;
1113 }
1114 if (!strncmp(token, "release_agent=", 14)) {
1101 /* Specifying two release agents is forbidden */ 1115 /* Specifying two release agents is forbidden */
1102 if (opts->release_agent) 1116 if (opts->release_agent)
1103 return -EINVAL; 1117 return -EINVAL;
1104 opts->release_agent = 1118 opts->release_agent =
1105 kstrndup(token + 14, PATH_MAX, GFP_KERNEL); 1119 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1120 if (!opts->release_agent)
1107 return -ENOMEM; 1121 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1122 continue;
1123 }
1124 if (!strncmp(token, "name=", 5)) {
1109 const char *name = token + 5; 1125 const char *name = token + 5;
1110 /* Can't specify an empty name */ 1126 /* Can't specify an empty name */
1111 if (!strlen(name)) 1127 if (!strlen(name))
@@ -1123,24 +1139,48 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1123 if (opts->name) 1139 if (opts->name)
1124 return -EINVAL; 1140 return -EINVAL;
1125 opts->name = kstrndup(name, 1141 opts->name = kstrndup(name,
1126 MAX_CGROUP_ROOT_NAMELEN, 1142 MAX_CGROUP_ROOT_NAMELEN - 1,
1127 GFP_KERNEL); 1143 GFP_KERNEL);
1128 if (!opts->name) 1144 if (!opts->name)
1129 return -ENOMEM; 1145 return -ENOMEM;
1130 } else { 1146
1131 struct cgroup_subsys *ss; 1147 continue;
1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1148 }
1133 ss = subsys[i]; 1149
1134 if (ss == NULL) 1150 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1135 continue; 1151 struct cgroup_subsys *ss = subsys[i];
1136 if (!strcmp(token, ss->name)) { 1152 if (ss == NULL)
1137 if (!ss->disabled) 1153 continue;
1138 set_bit(i, &opts->subsys_bits); 1154 if (strcmp(token, ss->name))
1139 break; 1155 continue;
1140 } 1156 if (ss->disabled)
1141 } 1157 continue;
1142 if (i == CGROUP_SUBSYS_COUNT) 1158
1143 return -ENOENT; 1159 /* Mutually exclusive option 'all' + subsystem name */
1160 if (all_ss)
1161 return -EINVAL;
1162 set_bit(i, &opts->subsys_bits);
1163 one_ss = true;
1164
1165 break;
1166 }
1167 if (i == CGROUP_SUBSYS_COUNT)
1168 return -ENOENT;
1169 }
1170
1171 /*
1172 * If the 'all' option was specified select all the subsystems,
1173 * otherwise 'all, 'none' and a subsystem name options were not
1174 * specified, let's default to 'all'
1175 */
1176 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1177 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1178 struct cgroup_subsys *ss = subsys[i];
1179 if (ss == NULL)
1180 continue;
1181 if (ss->disabled)
1182 continue;
1183 set_bit(i, &opts->subsys_bits);
1144 } 1184 }
1145 } 1185 }
1146 1186
@@ -1222,7 +1262,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1222 struct cgroup *cgrp = &root->top_cgroup; 1262 struct cgroup *cgrp = &root->top_cgroup;
1223 struct cgroup_sb_opts opts; 1263 struct cgroup_sb_opts opts;
1224 1264
1225 lock_kernel();
1226 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1265 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1227 mutex_lock(&cgroup_mutex); 1266 mutex_lock(&cgroup_mutex);
1228 1267
@@ -1255,7 +1294,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1255 kfree(opts.name); 1294 kfree(opts.name);
1256 mutex_unlock(&cgroup_mutex); 1295 mutex_unlock(&cgroup_mutex);
1257 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1296 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1258 unlock_kernel();
1259 return ret; 1297 return ret;
1260} 1298}
1261 1299
@@ -1357,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1357 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1358 if (opts->name) 1396 if (opts->name)
1359 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1398 if (opts->clone_children)
1399 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1360 return root; 1400 return root;
1361} 1401}
1362 1402
@@ -1568,7 +1608,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1568 out_err: 1608 out_err:
1569 kfree(opts.release_agent); 1609 kfree(opts.release_agent);
1570 kfree(opts.name); 1610 kfree(opts.name);
1571
1572 return ret; 1611 return ret;
1573} 1612}
1574 1613
@@ -1623,6 +1662,8 @@ static struct file_system_type cgroup_fs_type = {
1623 .kill_sb = cgroup_kill_sb, 1662 .kill_sb = cgroup_kill_sb,
1624}; 1663};
1625 1664
1665static struct kobject *cgroup_kobj;
1666
1626static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1667static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1627{ 1668{
1628 return dentry->d_fsdata; 1669 return dentry->d_fsdata;
@@ -1788,6 +1829,30 @@ out:
1788 return retval; 1829 return retval;
1789} 1830}
1790 1831
1832/**
1833 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
1834 * @from: attach to all cgroups of a given task
1835 * @tsk: the task to be attached
1836 */
1837int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1838{
1839 struct cgroupfs_root *root;
1840 int retval = 0;
1841
1842 cgroup_lock();
1843 for_each_active_root(root) {
1844 struct cgroup *from_cg = task_cgroup_from_root(from, root);
1845
1846 retval = cgroup_attach_task(from_cg, tsk);
1847 if (retval)
1848 break;
1849 }
1850 cgroup_unlock();
1851
1852 return retval;
1853}
1854EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1855
1791/* 1856/*
1792 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1857 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1793 * held. May take task_lock of task 1858 * held. May take task_lock of task
@@ -1857,6 +1922,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1857 const char *buffer) 1922 const char *buffer)
1858{ 1923{
1859 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 1924 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1925 if (strlen(buffer) >= PATH_MAX)
1926 return -EINVAL;
1860 if (!cgroup_lock_live_group(cgrp)) 1927 if (!cgroup_lock_live_group(cgrp))
1861 return -ENODEV; 1928 return -ENODEV;
1862 strcpy(cgrp->root->release_agent_path, buffer); 1929 strcpy(cgrp->root->release_agent_path, buffer);
@@ -3150,6 +3217,23 @@ fail:
3150 return ret; 3217 return ret;
3151} 3218}
3152 3219
3220static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3221 struct cftype *cft)
3222{
3223 return clone_children(cgrp);
3224}
3225
3226static int cgroup_clone_children_write(struct cgroup *cgrp,
3227 struct cftype *cft,
3228 u64 val)
3229{
3230 if (val)
3231 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3232 else
3233 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3234 return 0;
3235}
3236
3153/* 3237/*
3154 * for the common functions, 'private' gives the type of file 3238 * for the common functions, 'private' gives the type of file
3155 */ 3239 */
@@ -3180,6 +3264,11 @@ static struct cftype files[] = {
3180 .write_string = cgroup_write_event_control, 3264 .write_string = cgroup_write_event_control,
3181 .mode = S_IWUGO, 3265 .mode = S_IWUGO,
3182 }, 3266 },
3267 {
3268 .name = "cgroup.clone_children",
3269 .read_u64 = cgroup_clone_children_read,
3270 .write_u64 = cgroup_clone_children_write,
3271 },
3183}; 3272};
3184 3273
3185static struct cftype cft_release_agent = { 3274static struct cftype cft_release_agent = {
@@ -3309,6 +3398,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3309 if (notify_on_release(parent)) 3398 if (notify_on_release(parent))
3310 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3399 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3311 3400
3401 if (clone_children(parent))
3402 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3403
3312 for_each_subsys(root, ss) { 3404 for_each_subsys(root, ss) {
3313 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3405 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3314 3406
@@ -3323,6 +3415,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3323 goto err_destroy; 3415 goto err_destroy;
3324 } 3416 }
3325 /* At error, ->destroy() callback has to free assigned ID. */ 3417 /* At error, ->destroy() callback has to free assigned ID. */
3418 if (clone_children(parent) && ss->post_clone)
3419 ss->post_clone(ss, cgrp);
3326 } 3420 }
3327 3421
3328 cgroup_lock_hierarchy(root); 3422 cgroup_lock_hierarchy(root);
@@ -3871,9 +3965,18 @@ int __init cgroup_init(void)
3871 hhead = css_set_hash(init_css_set.subsys); 3965 hhead = css_set_hash(init_css_set.subsys);
3872 hlist_add_head(&init_css_set.hlist, hhead); 3966 hlist_add_head(&init_css_set.hlist, hhead);
3873 BUG_ON(!init_root_id(&rootnode)); 3967 BUG_ON(!init_root_id(&rootnode));
3968
3969 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
3970 if (!cgroup_kobj) {
3971 err = -ENOMEM;
3972 goto out;
3973 }
3974
3874 err = register_filesystem(&cgroup_fs_type); 3975 err = register_filesystem(&cgroup_fs_type);
3875 if (err < 0) 3976 if (err < 0) {
3977 kobject_put(cgroup_kobj);
3876 goto out; 3978 goto out;
3979 }
3877 3980
3878 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 3981 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
3879 3982
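
Reading aid (not part of the patch): cgroup_attach_task_all() is newly exported in the hunks above. A minimal caller sketch, assuming only the behaviour stated in its kerneldoc comment (walk every active hierarchy and attach tsk alongside from); example_adopt_cgroups is a hypothetical name.

#include <linux/cgroup.h>
#include <linux/sched.h>

static int example_adopt_cgroups(struct task_struct *from,
				 struct task_struct *tsk)
{
	/* Returns 0 on success, or the first failing hierarchy's error. */
	return cgroup_attach_task_all(from, tsk);
}
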
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e7bebb7c6c38 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51int cgroup_freezing_or_frozen(struct task_struct *task) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
52{ 52{
53 struct freezer *freezer; 53 enum freezer_state state = task_freezer(task)->state;
54 enum freezer_state state; 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
55}
55 56
57int cgroup_freezing_or_frozen(struct task_struct *task)
58{
59 int result;
56 task_lock(task); 60 task_lock(task);
57 freezer = task_freezer(task); 61 result = __cgroup_freezing_or_frozen(task);
58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
62 task_unlock(task); 62 task_unlock(task);
63 63 return result;
64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
65} 64}
66 65
67/* 66/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
154 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
155} 154}
156 155
157/* Task is frozen or will freeze immediately when next it gets woken */
158static bool is_task_frozen_enough(struct task_struct *task)
159{
160 return frozen(task) ||
161 (task_is_stopped_or_traced(task) && freezing(task));
162}
163
164/* 156/*
165 * The call to cgroup_lock() in the freezer.state write method prevents 157 * The call to cgroup_lock() in the freezer.state write method prevents
166 * a write to that file racing against an attach, and hence the 158 * a write to that file racing against an attach, and hence the
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
174 166
175 /* 167 /*
176 * Anything frozen can't move or be moved to/from. 168 * Anything frozen can't move or be moved to/from.
177 *
178 * Since orig_freezer->state == FROZEN means that @task has been
179 * frozen, so it's sufficient to check the latter condition.
180 */ 169 */
181 170
182 if (is_task_frozen_enough(task)) 171 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED)
183 return -EBUSY; 173 return -EBUSY;
184 174
185 freezer = cgroup_freezer(new_cgroup); 175 rcu_read_lock();
186 if (freezer->state == CGROUP_FROZEN) 176 if (__cgroup_freezing_or_frozen(task)) {
177 rcu_read_unlock();
187 return -EBUSY; 178 return -EBUSY;
179 }
180 rcu_read_unlock();
188 181
189 if (threadgroup) { 182 if (threadgroup) {
190 struct task_struct *c; 183 struct task_struct *c;
191 184
192 rcu_read_lock(); 185 rcu_read_lock();
193 list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
194 if (is_task_frozen_enough(c)) { 187 if (__cgroup_freezing_or_frozen(c)) {
195 rcu_read_unlock(); 188 rcu_read_unlock();
196 return -EBUSY; 189 return -EBUSY;
197 } 190 }
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
236/* 229/*
237 * caller must hold freezer->lock 230 * caller must hold freezer->lock
238 */ 231 */
239static void update_freezer_state(struct cgroup *cgroup, 232static void update_if_frozen(struct cgroup *cgroup,
240 struct freezer *freezer) 233 struct freezer *freezer)
241{ 234{
242 struct cgroup_iter it; 235 struct cgroup_iter it;
243 struct task_struct *task; 236 struct task_struct *task;
244 unsigned int nfrozen = 0, ntotal = 0; 237 unsigned int nfrozen = 0, ntotal = 0;
238 enum freezer_state old_state = freezer->state;
245 239
246 cgroup_iter_start(cgroup, &it); 240 cgroup_iter_start(cgroup, &it);
247 while ((task = cgroup_iter_next(cgroup, &it))) { 241 while ((task = cgroup_iter_next(cgroup, &it))) {
248 ntotal++; 242 ntotal++;
249 if (is_task_frozen_enough(task)) 243 if (frozen(task))
250 nfrozen++; 244 nfrozen++;
251 } 245 }
252 246
253 /* 247 if (old_state == CGROUP_THAWED) {
254 * Transition to FROZEN when no new tasks can be added ensures 248 BUG_ON(nfrozen > 0);
255 * that we never exist in the FROZEN state while there are unfrozen 249 } else if (old_state == CGROUP_FREEZING) {
256 * tasks. 250 if (nfrozen == ntotal)
257 */ 251 freezer->state = CGROUP_FROZEN;
258 if (nfrozen == ntotal) 252 } else { /* old_state == CGROUP_FROZEN */
259 freezer->state = CGROUP_FROZEN; 253 BUG_ON(nfrozen != ntotal);
260 else if (nfrozen > 0) 254 }
261 freezer->state = CGROUP_FREEZING; 255
262 else
263 freezer->state = CGROUP_THAWED;
264 cgroup_iter_end(cgroup, &it); 256 cgroup_iter_end(cgroup, &it);
265} 257}
266 258
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
279 if (state == CGROUP_FREEZING) { 271 if (state == CGROUP_FREEZING) {
280 /* We change from FREEZING to FROZEN lazily if the cgroup was 272 /* We change from FREEZING to FROZEN lazily if the cgroup was
281 * only partially frozen when we exitted write. */ 273 * only partially frozen when we exitted write. */
282 update_freezer_state(cgroup, freezer); 274 update_if_frozen(cgroup, freezer);
283 state = freezer->state; 275 state = freezer->state;
284 } 276 }
285 spin_unlock_irq(&freezer->lock); 277 spin_unlock_irq(&freezer->lock);
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
301 while ((task = cgroup_iter_next(cgroup, &it))) { 293 while ((task = cgroup_iter_next(cgroup, &it))) {
302 if (!freeze_task(task, true)) 294 if (!freeze_task(task, true))
303 continue; 295 continue;
304 if (is_task_frozen_enough(task)) 296 if (frozen(task))
305 continue; 297 continue;
306 if (!freezing(task) && !freezer_should_skip(task)) 298 if (!freezing(task) && !freezer_should_skip(task))
307 num_cant_freeze_now++; 299 num_cant_freeze_now++;
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,
335 327
336 spin_lock_irq(&freezer->lock); 328 spin_lock_irq(&freezer->lock);
337 329
338 update_freezer_state(cgroup, freezer); 330 update_if_frozen(cgroup, freezer);
339 if (goal_state == freezer->state) 331 if (goal_state == freezer->state)
340 goto out; 332 goto out;
341 333
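
Reading aid (not part of the patch): the freezer hunks split the check into an unlocked helper plus callers that take either task_lock() or rcu_read_lock(), matching how they already walk the thread group. A sketch of a hypothetical RCU-side caller living in the same file:

static int example_freezing_or_frozen_rcu(struct task_struct *task)
{
	int ret;

	rcu_read_lock();
	ret = __cgroup_freezing_or_frozen(task);	/* unlocked helper above */
	rcu_read_unlock();
	return ret;
}
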
diff --git a/kernel/compat.c b/kernel/compat.c
index 5adab05a3172..c9e2ec0b34a8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
279 struct compat_rlimit __user *rlim) 279 struct compat_rlimit __user *rlim)
280{ 280{
281 struct rlimit r; 281 struct rlimit r;
282 int ret;
283 mm_segment_t old_fs = get_fs ();
284
285 if (resource >= RLIM_NLIMITS)
286 return -EINVAL;
287 282
288 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || 283 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
289 __get_user(r.rlim_cur, &rlim->rlim_cur) || 284 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
294 r.rlim_cur = RLIM_INFINITY; 289 r.rlim_cur = RLIM_INFINITY;
295 if (r.rlim_max == COMPAT_RLIM_INFINITY) 290 if (r.rlim_max == COMPAT_RLIM_INFINITY)
296 r.rlim_max = RLIM_INFINITY; 291 r.rlim_max = RLIM_INFINITY;
297 set_fs(KERNEL_DS); 292 return do_prlimit(current, resource, &r, NULL);
298 ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
299 set_fs(old_fs);
300 return ret;
301} 293}
302 294
303#ifdef COMPAT_RLIM_OLD_INFINITY 295#ifdef COMPAT_RLIM_OLD_INFINITY
@@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
329 321
330#endif 322#endif
331 323
332asmlinkage long compat_sys_getrlimit (unsigned int resource, 324asmlinkage long compat_sys_getrlimit(unsigned int resource,
333 struct compat_rlimit __user *rlim) 325 struct compat_rlimit __user *rlim)
334{ 326{
335 struct rlimit r; 327 struct rlimit r;
336 int ret; 328 int ret;
337 mm_segment_t old_fs = get_fs();
338 329
339 set_fs(KERNEL_DS); 330 ret = do_prlimit(current, resource, NULL, &r);
340 ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
341 set_fs(old_fs);
342 if (!ret) { 331 if (!ret) {
343 if (r.rlim_cur > COMPAT_RLIM_INFINITY) 332 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
344 r.rlim_cur = COMPAT_RLIM_INFINITY; 333 r.rlim_cur = COMPAT_RLIM_INFINITY;
@@ -1137,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1137 1126
1138 return 0; 1127 return 0;
1139} 1128}
1129
1130/*
1131 * Allocate user-space memory for the duration of a single system call,
1132 * in order to marshall parameters inside a compat thunk.
1133 */
1134void __user *compat_alloc_user_space(unsigned long len)
1135{
1136 void __user *ptr;
1137
1138 /* If len would occupy more than half of the entire compat space... */
1139 if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
1140 return NULL;
1141
1142 ptr = arch_compat_alloc_user_space(len);
1143
1144 if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
1145 return NULL;
1146
1147 return ptr;
1148}
1149EXPORT_SYMBOL_GPL(compat_alloc_user_space);
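
Reading aid (not part of the patch): compat_alloc_user_space() is meant to be called from compat syscall thunks, as its comment says. Below is a sketch of the usual marshalling pattern under that assumption; every example_* and compat_example_* name is hypothetical.

#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

struct compat_example_args {
	compat_ulong_t a;
	compat_ulong_t b;
};

struct example_args {
	unsigned long a;
	unsigned long b;
};

static long example_native_call(struct example_args __user *args)
{
	/* Stand-in for the real 64-bit syscall handler. */
	return 0;
}

static long example_compat_thunk(struct compat_example_args __user *carg)
{
	struct example_args __user *narg;
	compat_ulong_t a, b;

	if (get_user(a, &carg->a) || get_user(b, &carg->b))
		return -EFAULT;

	/* Scratch area carved out of the user address range; nothing to free. */
	narg = compat_alloc_user_space(sizeof(*narg));
	if (!narg)
		return -EFAULT;

	if (put_user((unsigned long)a, &narg->a) ||
	    put_user((unsigned long)b, &narg->b))
		return -EFAULT;

	return example_native_call(narg);
}
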
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecbf..b4066b44a99d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
66static const struct file_operations ikconfig_file_ops = { 66static const struct file_operations ikconfig_file_ops = {
67 .owner = THIS_MODULE, 67 .owner = THIS_MODULE,
68 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
69 .llseek = default_llseek,
69}; 70};
70 71
71static int __init ikconfig_init(void) 72static int __init ikconfig_init(void)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
235 return -EINVAL; 235 return -EINVAL;
236 236
237 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
238 set_cpu_active(cpu, false);
239 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 238 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
240 if (err) { 239 if (err) {
241 set_cpu_active(cpu, true);
242
243 nr_calls--; 240 nr_calls--;
244 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 241 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
245 printk("%s: attempt to take down CPU %u failed\n", 242 printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
249 246
250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 247 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
251 if (err) { 248 if (err) {
252 set_cpu_active(cpu, true);
253 /* CPU didn't die: tell everyone. Can't complain. */ 249 /* CPU didn't die: tell everyone. Can't complain. */
254 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 250 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
255 251
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
321 goto out_notify; 317 goto out_notify;
322 BUG_ON(!cpu_online(cpu)); 318 BUG_ON(!cpu_online(cpu));
323 319
324 set_cpu_active(cpu, true);
325
326 /* Now call notifier in preparation. */ 320 /* Now call notifier in preparation. */
327 cpu_notify(CPU_ONLINE | mod, hcpu); 321 cpu_notify(CPU_ONLINE | mod, hcpu);
328 322
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..51b143e2a07a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
105 /* for custom sched domain */ 105 /* for custom sched domain */
106 int relax_domain_level; 106 int relax_domain_level;
107 107
108 /* used for walking a cpuset heirarchy */ 108 /* used for walking a cpuset hierarchy */
109 struct list_head stack_list; 109 struct list_head stack_list;
110}; 110};
111 111
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1397 if (tsk->flags & PF_THREAD_BOUND) 1397 if (tsk->flags & PF_THREAD_BOUND)
1398 return -EINVAL; 1398 return -EINVAL;
1399 1399
1400 ret = security_task_setscheduler(tsk, 0, NULL); 1400 ret = security_task_setscheduler(tsk);
1401 if (ret) 1401 if (ret)
1402 return ret; 1402 return ret;
1403 if (threadgroup) { 1403 if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1405 1405
1406 rcu_read_lock(); 1406 rcu_read_lock();
1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1408 ret = security_task_setscheduler(c, 0, NULL); 1408 ret = security_task_setscheduler(c);
1409 if (ret) { 1409 if (ret) {
1410 rcu_read_unlock(); 1410 rcu_read_unlock();
1411 return ret; 1411 return ret;
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2113 * but making no active use of cpusets. 2113 * but making no active use of cpusets.
2114 * 2114 *
2115 * This routine ensures that top_cpuset.cpus_allowed tracks 2115 * This routine ensures that top_cpuset.cpus_allowed tracks
2116 * cpu_online_map on each CPU hotplug (cpuhp) event. 2116 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2117 * 2117 *
2118 * Called within get_online_cpus(). Needs to call cgroup_lock() 2118 * Called within get_online_cpus(). Needs to call cgroup_lock()
2119 * before calling generate_sched_domains(). 2119 * before calling generate_sched_domains().
2120 */ 2120 */
2121static int cpuset_track_online_cpus(struct notifier_block *unused_nb, 2121void cpuset_update_active_cpus(void)
2122 unsigned long phase, void *unused_cpu)
2123{ 2122{
2124 struct sched_domain_attr *attr; 2123 struct sched_domain_attr *attr;
2125 cpumask_var_t *doms; 2124 cpumask_var_t *doms;
2126 int ndoms; 2125 int ndoms;
2127 2126
2128 switch (phase) {
2129 case CPU_ONLINE:
2130 case CPU_ONLINE_FROZEN:
2131 case CPU_DOWN_PREPARE:
2132 case CPU_DOWN_PREPARE_FROZEN:
2133 case CPU_DOWN_FAILED:
2134 case CPU_DOWN_FAILED_FROZEN:
2135 break;
2136
2137 default:
2138 return NOTIFY_DONE;
2139 }
2140
2141 cgroup_lock(); 2127 cgroup_lock();
2142 mutex_lock(&callback_mutex); 2128 mutex_lock(&callback_mutex);
2143 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2129 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2148 2134
2149 /* Have scheduler rebuild the domains */ 2135 /* Have scheduler rebuild the domains */
2150 partition_sched_domains(ndoms, doms, attr); 2136 partition_sched_domains(ndoms, doms, attr);
2151
2152 return NOTIFY_OK;
2153} 2137}
2154 2138
2155#ifdef CONFIG_MEMORY_HOTPLUG 2139#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
2203 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2187 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2204 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2188 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2205 2189
2206 hotcpu_notifier(cpuset_track_online_cpus, 0);
2207 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2190 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2208 2191
2209 cpuset_wq = create_singlethread_workqueue("cpuset"); 2192 cpuset_wq = create_singlethread_workqueue("cpuset");
diff --git a/kernel/cred.c b/kernel/cred.c
index 60bc8b1e32e6..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -22,10 +22,6 @@
22#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
24#else 24#else
25static inline __attribute__((format(printf, 1, 2)))
26void no_printk(const char *fmt, ...)
27{
28}
29#define kdebug(FMT, ...) \ 25#define kdebug(FMT, ...) \
30 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 26 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
31#endif 27#endif
@@ -329,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
329 325
330/* 326/*
331 * Prepare credentials for current to perform an execve() 327 * Prepare credentials for current to perform an execve()
332 * - The caller must hold current->cred_guard_mutex 328 * - The caller must hold ->cred_guard_mutex
333 */ 329 */
334struct cred *prepare_exec_creds(void) 330struct cred *prepare_exec_creds(void)
335{ 331{
@@ -388,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
388 struct cred *new; 384 struct cred *new;
389 int ret; 385 int ret;
390 386
391 mutex_init(&p->cred_guard_mutex);
392
393 if ( 387 if (
394#ifdef CONFIG_KEYS 388#ifdef CONFIG_KEYS
395 !p->cred->thread_keyring && 389 !p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 8bc5eeffec8a..fec596da9bd0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2000-2001 VERITAS Software Corporation. 6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation 7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> 8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> 9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> 10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. 11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc. 12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -47,6 +47,7 @@
47#include <linux/pid.h> 47#include <linux/pid.h>
48#include <linux/smp.h> 48#include <linux/smp.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/rcupdate.h>
50 51
51#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
52#include <asm/byteorder.h> 53#include <asm/byteorder.h>
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
109 */ 110 */
110atomic_t kgdb_active = ATOMIC_INIT(-1); 111atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active); 112EXPORT_SYMBOL_GPL(kgdb_active);
113static DEFINE_RAW_SPINLOCK(dbg_master_lock);
114static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
112 115
113/* 116/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early 117 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet): 118 * bootup code (which might not have percpu set up yet):
116 */ 119 */
117static atomic_t passive_cpu_wait[NR_CPUS]; 120static atomic_t masters_in_kgdb;
118static atomic_t cpu_in_kgdb[NR_CPUS]; 121static atomic_t slaves_in_kgdb;
119static atomic_t kgdb_break_tasklet_var; 122static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint; 123atomic_t kgdb_setting_breakpoint;
121 124
@@ -457,26 +460,32 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
457 return 1; 460 return 1;
458} 461}
459 462
460static void dbg_cpu_switch(int cpu, int next_cpu) 463static void dbg_touch_watchdogs(void)
461{ 464{
462 /* Mark the cpu we are switching away from as a slave when it 465 touch_softlockup_watchdog_sync();
463 * holds the kgdb_active token. This must be done so that the 466 clocksource_touch_watchdog();
464 * that all the cpus wait in for the debug core will not enter 467 rcu_cpu_stall_reset();
465 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471} 468}
472 469
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) 470static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
471 int exception_state)
474{ 472{
475 unsigned long flags; 473 unsigned long flags;
476 int sstep_tries = 100; 474 int sstep_tries = 100;
477 int error; 475 int error;
478 int i, cpu; 476 int cpu;
479 int trace_on = 0; 477 int trace_on = 0;
478 int online_cpus = num_online_cpus();
479
480 kgdb_info[ks->cpu].enter_kgdb++;
481 kgdb_info[ks->cpu].exception_state |= exception_state;
482
483 if (exception_state == DCPU_WANT_MASTER)
484 atomic_inc(&masters_in_kgdb);
485 else
486 atomic_inc(&slaves_in_kgdb);
487 kgdb_disable_hw_debug(ks->linux_regs);
488
480acquirelock: 489acquirelock:
481 /* 490 /*
482 * Interrupts will be restored by the 'trap return' code, except when 491 * Interrupts will be restored by the 'trap return' code, except when
@@ -489,14 +498,15 @@ acquirelock:
489 kgdb_info[cpu].task = current; 498 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0; 499 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; 500 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497 501
498 if (exception_level == 1) 502 /* Make sure the above info reaches the primary CPU */
503 smp_mb();
504
505 if (exception_level == 1) {
506 if (raw_spin_trylock(&dbg_master_lock))
507 atomic_xchg(&kgdb_active, cpu);
499 goto cpu_master_loop; 508 goto cpu_master_loop;
509 }
500 510
501 /* 511 /*
502 * CPU will loop if it is a slave or request to become a kgdb 512 * CPU will loop if it is a slave or request to become a kgdb
@@ -508,10 +518,12 @@ cpu_loop:
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; 518 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop; 519 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { 520 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) 521 if (raw_spin_trylock(&dbg_master_lock)) {
522 atomic_xchg(&kgdb_active, cpu);
512 break; 523 break;
524 }
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { 525 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu])) 526 if (!raw_spin_is_locked(&dbg_slave_lock))
515 goto return_normal; 527 goto return_normal;
516 } else { 528 } else {
517return_normal: 529return_normal:
@@ -522,9 +534,12 @@ return_normal:
522 arch_kgdb_ops.correct_hw_break(); 534 arch_kgdb_ops.correct_hw_break();
523 if (trace_on) 535 if (trace_on)
524 tracing_on(); 536 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]); 537 kgdb_info[cpu].exception_state &=
526 touch_softlockup_watchdog_sync(); 538 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
527 clocksource_touch_watchdog(); 539 kgdb_info[cpu].enter_kgdb--;
540 smp_mb__before_atomic_dec();
541 atomic_dec(&slaves_in_kgdb);
542 dbg_touch_watchdogs();
528 local_irq_restore(flags); 543 local_irq_restore(flags);
529 return 0; 544 return 0;
530 } 545 }
@@ -541,8 +556,8 @@ return_normal:
541 (kgdb_info[cpu].task && 556 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 557 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1); 558 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync(); 559 raw_spin_unlock(&dbg_master_lock);
545 clocksource_touch_watchdog(); 560 dbg_touch_watchdogs();
546 local_irq_restore(flags); 561 local_irq_restore(flags);
547 562
548 goto acquirelock; 563 goto acquirelock;
@@ -563,16 +578,12 @@ return_normal:
563 if (dbg_io_ops->pre_exception) 578 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception(); 579 dbg_io_ops->pre_exception();
565 580
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /* 581 /*
569 * Get the passive CPU lock which will hold all the non-primary 582 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active 583 * CPU in a spin state while the debugger is active
571 */ 584 */
572 if (!kgdb_single_step) { 585 if (!kgdb_single_step)
573 for (i = 0; i < NR_CPUS; i++) 586 raw_spin_lock(&dbg_slave_lock);
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576 587
577#ifdef CONFIG_SMP 588#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */ 589 /* Signal the other CPUs to enter kgdb_wait() */
@@ -583,10 +594,9 @@ return_normal:
583 /* 594 /*
584 * Wait for the other CPUs to be notified and be waiting for us: 595 * Wait for the other CPUs to be notified and be waiting for us:
585 */ 596 */
586 for_each_online_cpu(i) { 597 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) 598 atomic_read(&slaves_in_kgdb)) != online_cpus)
588 cpu_relax(); 599 cpu_relax();
589 }
590 600
591 /* 601 /*
592 * At this point the primary processor is completely 602 * At this point the primary processor is completely
@@ -605,6 +615,8 @@ cpu_master_loop:
605 if (dbg_kdb_mode) { 615 if (dbg_kdb_mode) {
606 kgdb_connected = 1; 616 kgdb_connected = 1;
607 error = kdb_stub(ks); 617 error = kdb_stub(ks);
618 if (error == -1)
619 continue;
608 kgdb_connected = 0; 620 kgdb_connected = 0;
609 } else { 621 } else {
610 error = gdb_serial_stub(ks); 622 error = gdb_serial_stub(ks);
@@ -613,7 +625,8 @@ cpu_master_loop:
613 if (error == DBG_PASS_EVENT) { 625 if (error == DBG_PASS_EVENT) {
614 dbg_kdb_mode = !dbg_kdb_mode; 626 dbg_kdb_mode = !dbg_kdb_mode;
615 } else if (error == DBG_SWITCH_CPU_EVENT) { 627 } else if (error == DBG_SWITCH_CPU_EVENT) {
616 dbg_cpu_switch(cpu, dbg_switch_cpu); 628 kgdb_info[dbg_switch_cpu].exception_state |=
629 DCPU_NEXT_MASTER;
617 goto cpu_loop; 630 goto cpu_loop;
618 } else { 631 } else {
619 kgdb_info[cpu].ret_state = error; 632 kgdb_info[cpu].ret_state = error;
@@ -625,24 +638,11 @@ cpu_master_loop:
625 if (dbg_io_ops->post_exception) 638 if (dbg_io_ops->post_exception)
626 dbg_io_ops->post_exception(); 639 dbg_io_ops->post_exception();
627 640
628 atomic_dec(&cpu_in_kgdb[ks->cpu]);
629
630 if (!kgdb_single_step) { 641 if (!kgdb_single_step) {
631 for (i = NR_CPUS-1; i >= 0; i--) 642 raw_spin_unlock(&dbg_slave_lock);
632 atomic_dec(&passive_cpu_wait[i]); 643 /* Wait till all the CPUs have quit from the debugger. */
633 /* 644 while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
634 * Wait till all the CPUs have quit from the debugger, 645 cpu_relax();
635 * but allow a CPU that hit an exception and is
636 * waiting to become the master to remain in the debug
637 * core.
638 */
639 for_each_online_cpu(i) {
640 while (kgdb_do_roundup &&
641 atomic_read(&cpu_in_kgdb[i]) &&
642 !(kgdb_info[i].exception_state &
643 DCPU_WANT_MASTER))
644 cpu_relax();
645 }
646 } 646 }
647 647
648kgdb_restore: 648kgdb_restore:
@@ -653,12 +653,20 @@ kgdb_restore:
653 else 653 else
654 kgdb_sstep_pid = 0; 654 kgdb_sstep_pid = 0;
655 } 655 }
656 if (arch_kgdb_ops.correct_hw_break)
657 arch_kgdb_ops.correct_hw_break();
656 if (trace_on) 658 if (trace_on)
657 tracing_on(); 659 tracing_on();
660
661 kgdb_info[cpu].exception_state &=
662 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
663 kgdb_info[cpu].enter_kgdb--;
664 smp_mb__before_atomic_dec();
665 atomic_dec(&masters_in_kgdb);
658 /* Free kgdb_active */ 666 /* Free kgdb_active */
659 atomic_set(&kgdb_active, -1); 667 atomic_set(&kgdb_active, -1);
660 touch_softlockup_watchdog_sync(); 668 raw_spin_unlock(&dbg_master_lock);
661 clocksource_touch_watchdog(); 669 dbg_touch_watchdogs();
662 local_irq_restore(flags); 670 local_irq_restore(flags);
663 671
664 return kgdb_info[cpu].ret_state; 672 return kgdb_info[cpu].ret_state;
@@ -676,7 +684,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
676{ 684{
677 struct kgdb_state kgdb_var; 685 struct kgdb_state kgdb_var;
678 struct kgdb_state *ks = &kgdb_var; 686 struct kgdb_state *ks = &kgdb_var;
679 int ret;
680 687
681 ks->cpu = raw_smp_processor_id(); 688 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector; 689 ks->ex_vector = evector;
@@ -687,11 +694,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
687 694
688 if (kgdb_reenter_check(ks)) 695 if (kgdb_reenter_check(ks))
689 return 0; /* Ouch, double exception ! */ 696 return 0; /* Ouch, double exception ! */
690 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; 697 if (kgdb_info[ks->cpu].enter_kgdb != 0)
691 ret = kgdb_cpu_enter(ks, regs); 698 return 0;
692 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | 699
693 DCPU_IS_SLAVE); 700 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
694 return ret;
695} 701}
696 702
697int kgdb_nmicallback(int cpu, void *regs) 703int kgdb_nmicallback(int cpu, void *regs)
@@ -704,12 +710,9 @@ int kgdb_nmicallback(int cpu, void *regs)
704 ks->cpu = cpu; 710 ks->cpu = cpu;
705 ks->linux_regs = regs; 711 ks->linux_regs = regs;
706 712
707 if (!atomic_read(&cpu_in_kgdb[cpu]) && 713 if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
708 atomic_read(&kgdb_active) != -1 && 714 raw_spin_is_locked(&dbg_master_lock)) {
709 atomic_read(&kgdb_active) != cpu) { 715 kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
710 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
711 kgdb_cpu_enter(ks, regs);
712 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
713 return 0; 716 return 0;
714 } 717 }
715#endif 718#endif
@@ -739,7 +742,7 @@ static struct console kgdbcons = {
739}; 742};
740 743
741#ifdef CONFIG_MAGIC_SYSRQ 744#ifdef CONFIG_MAGIC_SYSRQ
742static void sysrq_handle_dbg(int key, struct tty_struct *tty) 745static void sysrq_handle_dbg(int key)
743{ 746{
744 if (!dbg_io_ops) { 747 if (!dbg_io_ops) {
745 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); 748 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index c5d753d80f67..3494c28a7e7a 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -40,6 +40,7 @@ struct debuggerinfo_struct {
40 int exception_state; 40 int exception_state;
41 int ret_state; 41 int ret_state;
42 int irq_depth; 42 int irq_depth;
43 int enter_kgdb;
43}; 44};
44 45
45extern struct debuggerinfo_struct kgdb_info[]; 46extern struct debuggerinfo_struct kgdb_info[];
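
Reading aid (not part of the patch): the debug_core hunks replace the cmpxchg-on-kgdb_active election and the per-CPU passive_cpu_wait counters with two raw spinlocks plus masters_in_kgdb/slaves_in_kgdb counts. A self-contained sketch of just the election step, built from the same primitives; the example_* names are hypothetical.

#include <linux/spinlock.h>
#include <linux/types.h>
#include <asm/atomic.h>

static DEFINE_RAW_SPINLOCK(example_master_lock);
static atomic_t example_active_cpu = ATOMIC_INIT(-1);

/* Exactly one CPU wins the trylock and becomes the debug master. */
static bool example_try_become_master(int cpu)
{
	if (!raw_spin_trylock(&example_master_lock))
		return false;
	atomic_xchg(&example_active_cpu, cpu);
	return true;
}

static void example_release_master(void)
{
	atomic_set(&example_active_cpu, -1);
	raw_spin_unlock(&example_master_lock);
}
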
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index e8fd6868682d..481a7bd2dfe7 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2000-2001 VERITAS Software Corporation. 6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation 7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> 8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> 9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> 10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. 11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc. 12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES +
52 * GDB remote protocol parser: 52 * GDB remote protocol parser:
53 */ 53 */
54 54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB 55#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void) 56static int gdbstub_read_wait(void)
68{ 57{
@@ -123,8 +112,8 @@ static void get_packet(char *buffer)
123 buffer[count] = 0; 112 buffer[count] = 0;
124 113
125 if (ch == '#') { 114 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4; 115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait()); 116 xmitcsum += hex_to_bin(gdbstub_read_wait());
128 117
129 if (checksum != xmitcsum) 118 if (checksum != xmitcsum)
130 /* failed checksum */ 119 /* failed checksum */
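The two hunks above retire the stub's private hex() helper in favour of the kernel's generic hex_to_bin(). As a rough user-space sketch of the contract both rely on (a valid digit maps to 0..15, anything else to a negative value); the names below are illustrative only, not the kernel API:

/*
 * Illustrative sketch only, not kernel code: a hex-digit decoder with the
 * same contract as the helper the gdb stub now uses.
 */
#include <stdio.h>

static int hex_digit_to_bin(char ch)
{
	if (ch >= '0' && ch <= '9')
		return ch - '0';
	if (ch >= 'a' && ch <= 'f')
		return ch - 'a' + 10;
	if (ch >= 'A' && ch <= 'F')
		return ch - 'A' + 10;
	return -1;			/* not a hex digit */
}

int main(void)
{
	/* Decode a two-digit checksum the way get_packet() does. */
	int csum = (hex_digit_to_bin('a') << 4) + hex_digit_to_bin('5');

	printf("checksum = 0x%02x\n", csum);	/* prints 0xa5 */
	return 0;
}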
@@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len)
236 * buf. Return a pointer to the last char put in buf (null). May 225 * buf. Return a pointer to the last char put in buf (null). May
237 * return an error. 226 * return an error.
238 */ 227 */
239int kgdb_mem2hex(char *mem, char *buf, int count) 228char *kgdb_mem2hex(char *mem, char *buf, int count)
240{ 229{
241 char *tmp; 230 char *tmp;
242 int err; 231 int err;
@@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
248 tmp = buf + count; 237 tmp = buf + count;
249 238
250 err = probe_kernel_read(tmp, mem, count); 239 err = probe_kernel_read(tmp, mem, count);
251 if (!err) { 240 if (err)
252 while (count > 0) { 241 return NULL;
253 buf = pack_hex_byte(buf, *tmp); 242 while (count > 0) {
254 tmp++; 243 buf = pack_hex_byte(buf, *tmp);
255 count--; 244 tmp++;
256 } 245 count--;
257
258 *buf = 0;
259 } 246 }
247 *buf = 0;
260 248
261 return err; 249 return buf;
262} 250}
263 251
264/* 252/*
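kgdb_mem2hex() now hex-encodes in place on a buffer filled by probe_kernel_read() and returns a pointer to the terminating NUL (or NULL on a faulted read) instead of an error code. A small user-space approximation of that return convention, with made-up helper names and no fault handling since plain memory reads cannot fail here:

#include <stdio.h>

static const char hex_asc[] = "0123456789abcdef";

/* Append one byte as two lowercase hex digits; return the new end pointer. */
static char *pack_hex_byte_sketch(char *buf, unsigned char byte)
{
	*buf++ = hex_asc[byte >> 4];
	*buf++ = hex_asc[byte & 0x0f];
	return buf;
}

/*
 * Encode count bytes from mem into buf and return a pointer to the
 * terminating NUL, mirroring the new kgdb_mem2hex() return convention.
 */
static char *mem2hex_sketch(const unsigned char *mem, char *buf, int count)
{
	while (count-- > 0)
		buf = pack_hex_byte_sketch(buf, *mem++);
	*buf = '\0';
	return buf;
}

int main(void)
{
	char out[32];
	char *end = mem2hex_sketch((const unsigned char *)"OK", out, 2);

	printf("%s (len %zu)\n", out, (size_t)(end - out));	/* 4f4b (len 4) */
	return 0;
}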
@@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
280 tmp_hex = tmp_raw - 1; 268 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) { 269 while (tmp_hex >= buf) {
282 tmp_raw--; 270 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--); 271 *tmp_raw = hex_to_bin(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4; 272 *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
285 } 273 }
286 274
287 return probe_kernel_write(mem, tmp_raw, count); 275 return probe_kernel_write(mem, tmp_raw, count);
@@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
304 (*ptr)++; 292 (*ptr)++;
305 } 293 }
306 while (**ptr) { 294 while (**ptr) {
307 hex_val = hex(**ptr); 295 hex_val = hex_to_bin(**ptr);
308 if (hex_val < 0) 296 if (hex_val < 0)
309 break; 297 break;
310 298
@@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
339 return probe_kernel_write(mem, c, size); 327 return probe_kernel_write(mem, c, size);
340} 328}
341 329
330#if DBG_MAX_REG_NUM > 0
331void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
332{
333 int i;
334 int idx = 0;
335 char *ptr = (char *)gdb_regs;
336
337 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
338 dbg_get_reg(i, ptr + idx, regs);
339 idx += dbg_reg_def[i].size;
340 }
341}
342
343void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
344{
345 int i;
346 int idx = 0;
347 char *ptr = (char *)gdb_regs;
348
349 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
350 dbg_set_reg(i, ptr + idx, regs);
351 idx += dbg_reg_def[i].size;
352 }
353}
354#endif /* DBG_MAX_REG_NUM > 0 */
355
342/* Write memory due to an 'M' or 'X' packet. */ 356/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary) 357static int write_mem_msg(int binary)
344{ 358{
@@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error)
378 * remapped to negative TIDs. 392 * remapped to negative TIDs.
379 */ 393 */
380 394
381#define BUF_THREAD_ID_SIZE 16 395#define BUF_THREAD_ID_SIZE 8
382 396
383static char *pack_threadid(char *pkt, unsigned char *id) 397static char *pack_threadid(char *pkt, unsigned char *id)
384{ 398{
385 char *limit; 399 unsigned char *limit;
400 int lzero = 1;
401
402 limit = id + (BUF_THREAD_ID_SIZE / 2);
403 while (id < limit) {
404 if (!lzero || *id != 0) {
405 pkt = pack_hex_byte(pkt, *id);
406 lzero = 0;
407 }
408 id++;
409 }
386 410
387 limit = pkt + BUF_THREAD_ID_SIZE; 411 if (lzero)
388 while (pkt < limit) 412 pkt = pack_hex_byte(pkt, 0);
389 pkt = pack_hex_byte(pkt, *id++);
390 413
391 return pkt; 414 return pkt;
392} 415}
393 416
394static void int_to_threadref(unsigned char *id, int value) 417static void int_to_threadref(unsigned char *id, int value)
395{ 418{
396 unsigned char *scan; 419 put_unaligned_be32(value, id);
397 int i = 4;
398
399 scan = (unsigned char *)id;
400 while (i--)
401 *scan++ = 0;
402 put_unaligned_be32(value, scan);
403} 420}
404 421
405static struct task_struct *getthread(struct pt_regs *regs, int tid) 422static struct task_struct *getthread(struct pt_regs *regs, int tid)
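The thread-ID packing above shrinks BUF_THREAD_ID_SIZE to 8 and suppresses leading zero bytes, while int_to_threadref() now simply stores the value big-endian. A standalone sketch of the same behaviour, with put_unaligned_be32() replaced by explicit shifts because this is plain user-space C:

#include <stdio.h>

#define BUF_THREAD_ID_SIZE 8

static const char hex_asc[] = "0123456789abcdef";

static char *pack_hex_byte_sketch(char *pkt, unsigned char byte)
{
	*pkt++ = hex_asc[byte >> 4];
	*pkt++ = hex_asc[byte & 0x0f];
	return pkt;
}

/* Store a 32-bit value big-endian, as the new int_to_threadref() does. */
static void int_to_threadref_sketch(unsigned char *id, int value)
{
	id[0] = (value >> 24) & 0xff;
	id[1] = (value >> 16) & 0xff;
	id[2] = (value >> 8) & 0xff;
	id[3] = value & 0xff;
}

/* Hex-encode the id, skipping leading zero bytes but never emitting "". */
static char *pack_threadid_sketch(char *pkt, const unsigned char *id)
{
	const unsigned char *limit = id + BUF_THREAD_ID_SIZE / 2;
	int lzero = 1;

	while (id < limit) {
		if (!lzero || *id != 0) {
			pkt = pack_hex_byte_sketch(pkt, *id);
			lzero = 0;
		}
		id++;
	}
	if (lzero)
		pkt = pack_hex_byte_sketch(pkt, 0);
	return pkt;
}

int main(void)
{
	unsigned char id[BUF_THREAD_ID_SIZE / 2];
	char out[BUF_THREAD_ID_SIZE + 1];
	char *end;

	int_to_threadref_sketch(id, 4242);	/* 0x1092 */
	end = pack_threadid_sketch(out, id);
	*end = '\0';
	printf("%s\n", out);	/* "1092" instead of "00001092" */
	return 0;
}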
@@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo); 480 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464} 481}
465 482
466/* Handle the 'g' get registers request */ 483static void gdb_get_regs_helper(struct kgdb_state *ks)
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{ 484{
469 struct task_struct *thread; 485 struct task_struct *thread;
470 void *local_debuggerinfo; 486 void *local_debuggerinfo;
@@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
505 */ 521 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread); 522 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 } 523 }
524}
525
526/* Handle the 'g' get registers request */
527static void gdb_cmd_getregs(struct kgdb_state *ks)
528{
529 gdb_get_regs_helper(ks);
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); 530 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509} 531}
510 532
@@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks)
527 char *ptr = &remcom_in_buffer[1]; 549 char *ptr = &remcom_in_buffer[1];
528 unsigned long length; 550 unsigned long length;
529 unsigned long addr; 551 unsigned long addr;
530 int err; 552 char *err;
531 553
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && 554 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) { 555 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); 556 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err) 557 if (!err)
536 error_packet(remcom_out_buffer, err); 558 error_packet(remcom_out_buffer, -EINVAL);
537 } else { 559 } else {
538 error_packet(remcom_out_buffer, -EINVAL); 560 error_packet(remcom_out_buffer, -EINVAL);
539 } 561 }
@@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
550 strcpy(remcom_out_buffer, "OK"); 572 strcpy(remcom_out_buffer, "OK");
551} 573}
552 574
575#if DBG_MAX_REG_NUM > 0
576static char *gdb_hex_reg_helper(int regnum, char *out)
577{
578 int i;
579 int offset = 0;
580
581 for (i = 0; i < regnum; i++)
582 offset += dbg_reg_def[i].size;
583 return kgdb_mem2hex((char *)gdb_regs + offset, out,
584 dbg_reg_def[i].size);
585}
586
587/* Handle the 'p' individual register get */
588static void gdb_cmd_reg_get(struct kgdb_state *ks)
589{
590 unsigned long regnum;
591 char *ptr = &remcom_in_buffer[1];
592
593 kgdb_hex2long(&ptr, &regnum);
594 if (regnum >= DBG_MAX_REG_NUM) {
595 error_packet(remcom_out_buffer, -EINVAL);
596 return;
597 }
598 gdb_get_regs_helper(ks);
599 gdb_hex_reg_helper(regnum, remcom_out_buffer);
600}
601
602/* Handle the 'P' individual register set */
603static void gdb_cmd_reg_set(struct kgdb_state *ks)
604{
605 unsigned long regnum;
606 char *ptr = &remcom_in_buffer[1];
607 int i = 0;
608
609 kgdb_hex2long(&ptr, &regnum);
610 if (*ptr++ != '=' ||
611 !(!kgdb_usethread || kgdb_usethread == current) ||
612 !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
613 error_packet(remcom_out_buffer, -EINVAL);
614 return;
615 }
616 memset(gdb_regs, 0, sizeof(gdb_regs));
617 while (i < sizeof(gdb_regs) * 2)
618 if (hex_to_bin(ptr[i]) >= 0)
619 i++;
620 else
621 break;
622 i = i / 2;
623 kgdb_hex2mem(ptr, (char *)gdb_regs, i);
624 dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
625 strcpy(remcom_out_buffer, "OK");
626}
627#endif /* DBG_MAX_REG_NUM > 0 */
628
553/* Handle the 'X' memory binary write bytes */ 629/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks) 630static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{ 631{
@@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
612{ 688{
613 struct task_struct *g; 689 struct task_struct *g;
614 struct task_struct *p; 690 struct task_struct *p;
615 unsigned char thref[8]; 691 unsigned char thref[BUF_THREAD_ID_SIZE];
616 char *ptr; 692 char *ptr;
617 int i; 693 int i;
618 int cpu; 694 int cpu;
@@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
632 for_each_online_cpu(cpu) { 708 for_each_online_cpu(cpu) {
633 ks->thr_query = 0; 709 ks->thr_query = 0;
634 int_to_threadref(thref, -cpu - 2); 710 int_to_threadref(thref, -cpu - 2);
635 pack_threadid(ptr, thref); 711 ptr = pack_threadid(ptr, thref);
636 ptr += BUF_THREAD_ID_SIZE;
637 *(ptr++) = ','; 712 *(ptr++) = ',';
638 i++; 713 i++;
639 } 714 }
@@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
642 do_each_thread(g, p) { 717 do_each_thread(g, p) {
643 if (i >= ks->thr_query && !finished) { 718 if (i >= ks->thr_query && !finished) {
644 int_to_threadref(thref, p->pid); 719 int_to_threadref(thref, p->pid);
645 pack_threadid(ptr, thref); 720 ptr = pack_threadid(ptr, thref);
646 ptr += BUF_THREAD_ID_SIZE;
647 *(ptr++) = ','; 721 *(ptr++) = ',';
648 ks->thr_query++; 722 ks->thr_query++;
649 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) 723 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
@@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
858 int error = 0; 932 int error = 0;
859 int tmp; 933 int tmp;
860 934
861 /* Clear the out buffer. */ 935 /* Initialize comm buffer and globals. */
862 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); 936 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
937 kgdb_usethread = kgdb_info[ks->cpu].task;
938 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
939 ks->pass_exception = 0;
863 940
864 if (kgdb_connected) { 941 if (kgdb_connected) {
865 unsigned char thref[8]; 942 unsigned char thref[BUF_THREAD_ID_SIZE];
866 char *ptr; 943 char *ptr;
867 944
868 /* Reply to host that an exception has occurred */ 945 /* Reply to host that an exception has occurred */
@@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks)
876 put_packet(remcom_out_buffer); 953 put_packet(remcom_out_buffer);
877 } 954 }
878 955
879 kgdb_usethread = kgdb_info[ks->cpu].task;
880 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
881 ks->pass_exception = 0;
882
883 while (1) { 956 while (1) {
884 error = 0; 957 error = 0;
885 958
@@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
904 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 977 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
905 gdb_cmd_memwrite(ks); 978 gdb_cmd_memwrite(ks);
906 break; 979 break;
980#if DBG_MAX_REG_NUM > 0
981 case 'p': /* pXX Return gdb register XX (in hex) */
982 gdb_cmd_reg_get(ks);
983 break;
984 case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
985 gdb_cmd_reg_set(ks);
986 break;
987#endif /* DBG_MAX_REG_NUM > 0 */
907 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 988 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_binwrite(ks); 989 gdb_cmd_binwrite(ks);
909 break; 990 break;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 75bd9b3ebbb7..20059ef4459a 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv)
274 int i, bpno; 274 int i, bpno;
275 kdb_bp_t *bp, *bp_check; 275 kdb_bp_t *bp, *bp_check;
276 int diag; 276 int diag;
277 int free;
278 char *symname = NULL; 277 char *symname = NULL;
279 long offset = 0ul; 278 long offset = 0ul;
280 int nextarg; 279 int nextarg;
@@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv)
305 /* 304 /*
306 * Find an empty bp structure to allocate 305 * Find an empty bp structure to allocate
307 */ 306 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { 307 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free) 308 if (bp->bp_free)
311 break; 309 break;
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index bf6e8270e957..dd0b1b7dd02c 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks)
86 } 86 }
87 /* Set initial kdb state variables */ 87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS); 88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu; 89 kdb_initial_cpu = atomic_read(&kgdb_active);
90 kdb_current_task = kgdb_info[ks->cpu].task; 90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; 91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */ 92 /* Remove any breakpoints as needed by kdb and clear single step */
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks)
105 ks->pass_exception = 1; 105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC); 106 KDB_FLAG_SET(CATASTROPHIC);
107 } 107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 108 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT); 109 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS); 110 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index c9b7f4f90bba..96fdaac46a80 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...)
823 823
824 return r; 824 return r;
825} 825}
826 826EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ebe4a287419e..d7bda21a106b 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value)
312 312
313 if (endp == arg) { 313 if (endp == arg) {
314 /* 314 /*
315 * Try base 16, for us folks too lazy to type the 315 * Also try base 16, for us folks too lazy to type the
316 * leading 0x... 316 * leading 0x...
317 */ 317 */
318 val = simple_strtoul(arg, &endp, 16); 318 val = simple_strtoul(arg, &endp, 16);
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value)
325 return 0; 325 return 0;
326} 326}
327 327
328int kdbgetu64arg(const char *arg, u64 *value)
329{
330 char *endp;
331 u64 val;
332
333 val = simple_strtoull(arg, &endp, 0);
334
335 if (endp == arg) {
336
337 val = simple_strtoull(arg, &endp, 16);
338 if (endp == arg)
339 return KDB_BADINT;
340 }
341
342 *value = val;
343
344 return 0;
345}
346
328/* 347/*
329 * kdb_set - This function implements the 'set' command. Alter an 348 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one. 349 * existing environment variable or create a new one.
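The new kdbgetu64arg() mirrors kdbgetularg(): parse with the usual base-0 rules first, then retry in base 16 so a bare hex string without the 0x prefix is still accepted. A user-space equivalent using strtoull(), with a stand-in error code rather than the real KDB_BADINT:

#include <stdio.h>
#include <stdlib.h>

#define PARSE_BADINT (-15)	/* stand-in for KDB_BADINT */

/*
 * Try base 0 (0x.., 0.., decimal) first, then fall back to plain base 16
 * so "deadbeef" parses without its 0x prefix.
 */
static int parse_u64_arg(const char *arg, unsigned long long *value)
{
	char *endp;
	unsigned long long val;

	val = strtoull(arg, &endp, 0);
	if (endp == arg) {
		val = strtoull(arg, &endp, 16);
		if (endp == arg)
			return PARSE_BADINT;
	}
	*value = val;
	return 0;
}

int main(void)
{
	unsigned long long v;

	if (!parse_u64_arg("deadbeef", &v))
		printf("parsed 0x%llx\n", v);	/* parsed 0xdeadbeef */
	return 0;
}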
@@ -1730,13 +1749,13 @@ static int kdb_go(int argc, const char **argv)
1730 int nextarg; 1749 int nextarg;
1731 long offset; 1750 long offset;
1732 1751
1752 if (raw_smp_processor_id() != kdb_initial_cpu) {
1753 kdb_printf("go must execute on the entry cpu, "
1754 "please use \"cpu %d\" and then execute go\n",
1755 kdb_initial_cpu);
1756 return KDB_BADCPUNUM;
1757 }
1733 if (argc == 1) { 1758 if (argc == 1) {
1734 if (raw_smp_processor_id() != kdb_initial_cpu) {
1735 kdb_printf("go <address> must be issued from the "
1736 "initial cpu, do cpu %d first\n",
1737 kdb_initial_cpu);
1738 return KDB_ARGCOUNT;
1739 }
1740 nextarg = 1; 1759 nextarg = 1;
1741 diag = kdbgetaddrarg(argc, argv, &nextarg, 1760 diag = kdbgetaddrarg(argc, argv, &nextarg,
1742 &addr, &offset, NULL); 1761 &addr, &offset, NULL);
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv)
1770 */ 1789 */
1771static int kdb_rd(int argc, const char **argv) 1790static int kdb_rd(int argc, const char **argv)
1772{ 1791{
1773 int diag = kdb_check_regs(); 1792 int len = kdb_check_regs();
1774 if (diag) 1793#if DBG_MAX_REG_NUM > 0
1775 return diag; 1794 int i;
1795 char *rname;
1796 int rsize;
1797 u64 reg64;
1798 u32 reg32;
1799 u16 reg16;
1800 u8 reg8;
1801
1802 if (len)
1803 return len;
1804
1805 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1806 rsize = dbg_reg_def[i].size * 2;
1807 if (rsize > 16)
1808 rsize = 2;
1809 if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
1810 len = 0;
1811 kdb_printf("\n");
1812 }
1813 if (len)
1814 len += kdb_printf(" ");
1815 switch(dbg_reg_def[i].size * 8) {
1816 case 8:
1817 rname = dbg_get_reg(i, &reg8, kdb_current_regs);
1818 if (!rname)
1819 break;
1820 len += kdb_printf("%s: %02x", rname, reg8);
1821 break;
1822 case 16:
1823 rname = dbg_get_reg(i, &reg16, kdb_current_regs);
1824 if (!rname)
1825 break;
1826 len += kdb_printf("%s: %04x", rname, reg16);
1827 break;
1828 case 32:
1829 rname = dbg_get_reg(i, &reg32, kdb_current_regs);
1830 if (!rname)
1831 break;
1832 len += kdb_printf("%s: %08x", rname, reg32);
1833 break;
1834 case 64:
1835 rname = dbg_get_reg(i, &reg64, kdb_current_regs);
1836 if (!rname)
1837 break;
1838 len += kdb_printf("%s: %016llx", rname, reg64);
1839 break;
1840 default:
1841 len += kdb_printf("%s: ??", dbg_reg_def[i].name);
1842 }
1843 }
1844 kdb_printf("\n");
1845#else
1846 if (len)
1847 return len;
1776 1848
1777 kdb_dumpregs(kdb_current_regs); 1849 kdb_dumpregs(kdb_current_regs);
1850#endif
1778 return 0; 1851 return 0;
1779} 1852}
1780 1853
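The rewritten 'rd' command prints each register at its natural width by switching on dbg_reg_def[i].size. A reduced user-space version of that size dispatch is sketched below; the register names and values are fabricated for illustration:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical register store: name, width in bytes, raw value. */
struct reg {
	const char *name;
	int size;
	uint64_t value;
};

static const struct reg regs[] = {
	{ "al", 1, 0x7f }, { "cs", 2, 0x0010 },
	{ "eflags", 4, 0x246 }, { "rip", 8, 0xffffffff81000000ULL },
};

int main(void)
{
	size_t i;

	/* Print each register at its natural width, like kdb's 8/16/32/64 switch. */
	for (i = 0; i < sizeof(regs) / sizeof(regs[0]); i++) {
		switch (regs[i].size * 8) {
		case 8:
			printf("%s: %02x\n", regs[i].name,
			       (unsigned int)(regs[i].value & 0xff));
			break;
		case 16:
			printf("%s: %04x\n", regs[i].name,
			       (unsigned int)(regs[i].value & 0xffff));
			break;
		case 32:
			printf("%s: %08x\n", regs[i].name,
			       (unsigned int)(regs[i].value & 0xffffffffu));
			break;
		case 64:
			printf("%s: %016llx\n", regs[i].name,
			       (unsigned long long)regs[i].value);
			break;
		default:
			printf("%s: ??\n", regs[i].name);
		}
	}
	return 0;
}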
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv)
1782 * kdb_rm - This function implements the 'rm' (register modify) command. 1855 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents 1856 * rm register-name new-contents
1784 * Remarks: 1857 * Remarks:
1785 * Currently doesn't allow modification of control or 1858 * Allows register modification with the same restrictions as gdb
1786 * debug registers.
1787 */ 1859 */
1788static int kdb_rm(int argc, const char **argv) 1860static int kdb_rm(int argc, const char **argv)
1789{ 1861{
1862#if DBG_MAX_REG_NUM > 0
1790 int diag; 1863 int diag;
1791 int ind = 0; 1864 const char *rname;
1792 unsigned long contents; 1865 int i;
1866 u64 reg64;
1867 u32 reg32;
1868 u16 reg16;
1869 u8 reg8;
1793 1870
1794 if (argc != 2) 1871 if (argc != 2)
1795 return KDB_ARGCOUNT; 1872 return KDB_ARGCOUNT;
1796 /* 1873 /*
1797 * Allow presence or absence of leading '%' symbol. 1874 * Allow presence or absence of leading '%' symbol.
1798 */ 1875 */
1799 if (argv[1][0] == '%') 1876 rname = argv[1];
1800 ind = 1; 1877 if (*rname == '%')
1878 rname++;
1801 1879
1802 diag = kdbgetularg(argv[2], &contents); 1880 diag = kdbgetu64arg(argv[2], &reg64);
1803 if (diag) 1881 if (diag)
1804 return diag; 1882 return diag;
1805 1883
1806 diag = kdb_check_regs(); 1884 diag = kdb_check_regs();
1807 if (diag) 1885 if (diag)
1808 return diag; 1886 return diag;
1887
1888 diag = KDB_BADREG;
1889 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1890 if (strcmp(rname, dbg_reg_def[i].name) == 0) {
1891 diag = 0;
1892 break;
1893 }
1894 }
1895 if (!diag) {
1896 switch(dbg_reg_def[i].size * 8) {
1897 case 8:
1898 reg8 = reg64;
1899 dbg_set_reg(i, &reg8, kdb_current_regs);
1900 break;
1901 case 16:
1902 reg16 = reg64;
1903 dbg_set_reg(i, &reg16, kdb_current_regs);
1904 break;
1905 case 32:
1906 reg32 = reg64;
1907 dbg_set_reg(i, &reg32, kdb_current_regs);
1908 break;
1909 case 64:
1910 dbg_set_reg(i, &reg64, kdb_current_regs);
1911 break;
1912 }
1913 }
1914 return diag;
1915#else
1809 kdb_printf("ERROR: Register set currently not implemented\n"); 1916 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0; 1917 return 0;
1918#endif
1811} 1919}
1812 1920
1813#if defined(CONFIG_MAGIC_SYSRQ) 1921#if defined(CONFIG_MAGIC_SYSRQ)
@@ -1821,7 +1929,7 @@ static int kdb_sr(int argc, const char **argv)
1821 if (argc != 1) 1929 if (argc != 1)
1822 return KDB_ARGCOUNT; 1930 return KDB_ARGCOUNT;
1823 kdb_trap_printk++; 1931 kdb_trap_printk++;
1824 __handle_sysrq(*argv[1], NULL, 0); 1932 __handle_sysrq(*argv[1], false);
1825 kdb_trap_printk--; 1933 kdb_trap_printk--;
1826 1934
1827 return 0; 1935 return 0;
@@ -2440,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val)
2440 */ 2548 */
2441static int kdb_summary(int argc, const char **argv) 2549static int kdb_summary(int argc, const char **argv)
2442{ 2550{
2551 struct timespec now;
2443 struct kdb_tm tm; 2552 struct kdb_tm tm;
2444 struct sysinfo val; 2553 struct sysinfo val;
2445 2554
@@ -2454,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv)
2454 kdb_printf("domainname %s\n", init_uts_ns.name.domainname); 2563 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2455 kdb_printf("ccversion %s\n", __stringify(CCVERSION)); 2564 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2456 2565
2457 kdb_gmtime(&xtime, &tm); 2566 now = __current_kernel_time();
2567 kdb_gmtime(&now, &tm);
2458 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " 2568 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2459 "tz_minuteswest %d\n", 2569 "tz_minuteswest %d\n",
2460 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, 2570 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
@@ -2673,6 +2783,8 @@ int kdb_register_repeat(char *cmd,
2673 2783
2674 return 0; 2784 return 0;
2675} 2785}
2786EXPORT_SYMBOL_GPL(kdb_register_repeat);
2787
2676 2788
2677/* 2789/*
2678 * kdb_register - Compatibility register function for commands that do 2790 * kdb_register - Compatibility register function for commands that do
@@ -2695,6 +2807,7 @@ int kdb_register(char *cmd,
2695 return kdb_register_repeat(cmd, func, usage, help, minlen, 2807 return kdb_register_repeat(cmd, func, usage, help, minlen,
2696 KDB_REPEAT_NONE); 2808 KDB_REPEAT_NONE);
2697} 2809}
2810EXPORT_SYMBOL_GPL(kdb_register);
2698 2811
2699/* 2812/*
2700 * kdb_unregister - This function is used to unregister a kernel 2813 * kdb_unregister - This function is used to unregister a kernel
@@ -2713,7 +2826,7 @@ int kdb_unregister(char *cmd)
2713 /* 2826 /*
2714 * find the command. 2827 * find the command.
2715 */ 2828 */
2716 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { 2829 for_each_kdbcmd(kp, i) {
2717 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { 2830 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2718 kp->cmd_name = NULL; 2831 kp->cmd_name = NULL;
2719 return 0; 2832 return 0;
@@ -2723,6 +2836,7 @@ int kdb_unregister(char *cmd)
2723 /* Couldn't find it. */ 2836 /* Couldn't find it. */
2724 return 1; 2837 return 1;
2725} 2838}
2839EXPORT_SYMBOL_GPL(kdb_unregister);
2726 2840
2727/* Initialize the kdb command table. */ 2841/* Initialize the kdb command table. */
2728static void __init kdb_inittab(void) 2842static void __init kdb_inittab(void)
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 97d3ba69775d..35d69ed1dfb5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -15,29 +15,6 @@
15#include <linux/kgdb.h> 15#include <linux/kgdb.h>
16#include "../debug_core.h" 16#include "../debug_core.h"
17 17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */ 18/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
@@ -93,17 +70,6 @@
93 */ 70 */
94#define KDB_MAXBPT 16 71#define KDB_MAXBPT 16
95 72
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */ 73/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab { 74typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */ 75 unsigned long value; /* Address of symbol */
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len); 89extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124 90
125/* Exported Symbols for kernel loadable modules to use. */ 91/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t); 92extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t); 93extern int kdb_putarea_size(unsigned long, void *, size_t);
133 94
@@ -144,9 +105,8 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 105extern int kdb_putword(unsigned long, unsigned long, size_t);
145 106
146extern int kdbgetularg(const char *, unsigned long *); 107extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **); 108extern int kdbgetu64arg(const char *, u64 *);
148extern char *kdbgetenv(const char *); 109extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 110extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **); 111 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *); 112extern int kdbgetsymval(const char *, kdb_symtab_t *);
@@ -257,7 +217,6 @@ extern void kdb_ps1(const struct task_struct *p);
257extern void kdb_print_nameval(const char *name, unsigned long val); 217extern void kdb_print_nameval(const char *name, unsigned long val);
258extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
259extern void kdb_meminfo_proc_show(void); 219extern void kdb_meminfo_proc_show(void);
260extern const char *kdb_walk_kallsyms(loff_t *pos);
261extern char *kdb_getstr(char *, size_t, char *); 220extern char *kdb_getstr(char *, size_t, char *);
262 221
263/* Defines for kdb_symbol_print */ 222/* Defines for kdb_symbol_print */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 45344d5c53dd..6b2485dcb050 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -82,8 +82,8 @@ static char *kdb_name_table[100]; /* arbitrary size */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) 82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{ 83{
84 int ret = 0; 84 int ret = 0;
85 unsigned long symbolsize; 85 unsigned long symbolsize = 0;
86 unsigned long offset; 86 unsigned long offset = 0;
87#define knt1_size 128 /* must be >= kallsyms table size */ 87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL; 88 char *knt1 = NULL;
89 89
diff --git a/kernel/early_res.c b/kernel/early_res.c
deleted file mode 100644
index 7bfae887f211..000000000000
--- a/kernel/early_res.c
+++ /dev/null
@@ -1,590 +0,0 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
12
13/*
14 * Early reserved memory areas.
15 */
16/*
17 * need to make sure this one is big enough before
18 * find_fw_memmap_area could be used
19 */
20#define MAX_EARLY_RES_X 32
21
22struct early_res {
23 u64 start, end;
24 char name[15];
25 char overlap_ok;
26};
27static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
28
29static int max_early_res __initdata = MAX_EARLY_RES_X;
30static struct early_res *early_res __initdata = &early_res_x[0];
31static int early_res_count __initdata;
32
33static int __init find_overlapped_early(u64 start, u64 end)
34{
35 int i;
36 struct early_res *r;
37
38 for (i = 0; i < max_early_res && early_res[i].end; i++) {
39 r = &early_res[i];
40 if (end > r->start && start < r->end)
41 break;
42 }
43
44 return i;
45}
46
47/*
48 * Drop the i-th range from the early reservation map,
49 * by copying any higher ranges down one over it, and
50 * clearing what had been the last slot.
51 */
52static void __init drop_range(int i)
53{
54 int j;
55
56 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
57 ;
58
59 memmove(&early_res[i], &early_res[i + 1],
60 (j - 1 - i) * sizeof(struct early_res));
61
62 early_res[j - 1].end = 0;
63 early_res_count--;
64}
65
66static void __init drop_range_partial(int i, u64 start, u64 end)
67{
68 u64 common_start, common_end;
69 u64 old_start, old_end;
70
71 old_start = early_res[i].start;
72 old_end = early_res[i].end;
73 common_start = max(old_start, start);
74 common_end = min(old_end, end);
75
76 /* no overlap ? */
77 if (common_start >= common_end)
78 return;
79
80 if (old_start < common_start) {
81 /* make head segment */
82 early_res[i].end = common_start;
83 if (old_end > common_end) {
84 char name[15];
85
86 /*
87 * Save a local copy of the name, since the
88 * early_res array could get resized inside
89 * reserve_early_without_check() ->
90 * __check_and_double_early_res(), which would
91 * make the current name pointer invalid.
92 */
93 strncpy(name, early_res[i].name,
94 sizeof(early_res[i].name) - 1);
95 /* add another for left over on tail */
96 reserve_early_without_check(common_end, old_end, name);
97 }
98 return;
99 } else {
100 if (old_end > common_end) {
101 /* reuse the entry for tail left */
102 early_res[i].start = common_end;
103 return;
104 }
105 /* all covered */
106 drop_range(i);
107 }
108}
109
110/*
111 * Split any existing ranges that:
112 * 1) are marked 'overlap_ok', and
113 * 2) overlap with the stated range [start, end)
114 * into whatever portion (if any) of the existing range is entirely
115 * below or entirely above the stated range. Drop the portion
116 * of the existing range that overlaps with the stated range,
117 * which will allow the caller of this routine to then add that
118 * stated range without conflicting with any existing range.
119 */
120static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
121{
122 int i;
123 struct early_res *r;
124 u64 lower_start, lower_end;
125 u64 upper_start, upper_end;
126 char name[15];
127
128 for (i = 0; i < max_early_res && early_res[i].end; i++) {
129 r = &early_res[i];
130
131 /* Continue past non-overlapping ranges */
132 if (end <= r->start || start >= r->end)
133 continue;
134
135 /*
136 * Leave non-ok overlaps as is; let caller
137 * panic "Overlapping early reservations"
138 * when it hits this overlap.
139 */
140 if (!r->overlap_ok)
141 return;
142
143 /*
144 * We have an ok overlap. We will drop it from the early
145 * reservation map, and add back in any non-overlapping
146 * portions (lower or upper) as separate, overlap_ok,
147 * non-overlapping ranges.
148 */
149
150 /* 1. Note any non-overlapping (lower or upper) ranges. */
151 strncpy(name, r->name, sizeof(name) - 1);
152
153 lower_start = lower_end = 0;
154 upper_start = upper_end = 0;
155 if (r->start < start) {
156 lower_start = r->start;
157 lower_end = start;
158 }
159 if (r->end > end) {
160 upper_start = end;
161 upper_end = r->end;
162 }
163
164 /* 2. Drop the original ok overlapping range */
165 drop_range(i);
166
167 i--; /* resume for-loop on copied down entry */
168
169 /* 3. Add back in any non-overlapping ranges. */
170 if (lower_end)
171 reserve_early_overlap_ok(lower_start, lower_end, name);
172 if (upper_end)
173 reserve_early_overlap_ok(upper_start, upper_end, name);
174 }
175}
176
177static void __init __reserve_early(u64 start, u64 end, char *name,
178 int overlap_ok)
179{
180 int i;
181 struct early_res *r;
182
183 i = find_overlapped_early(start, end);
184 if (i >= max_early_res)
185 panic("Too many early reservations");
186 r = &early_res[i];
187 if (r->end)
188 panic("Overlapping early reservations "
189 "%llx-%llx %s to %llx-%llx %s\n",
190 start, end - 1, name ? name : "", r->start,
191 r->end - 1, r->name);
192 r->start = start;
193 r->end = end;
194 r->overlap_ok = overlap_ok;
195 if (name)
196 strncpy(r->name, name, sizeof(r->name) - 1);
197 early_res_count++;
198}
199
200/*
201 * A few early reservations come here.
202 *
203 * The 'overlap_ok' in the name of this routine does -not- mean it
204 * is ok for these reservations to overlap an earlier reservation.
205 * Rather it means that it is ok for subsequent reservations to
206 * overlap this one.
207 *
208 * Use this entry point to reserve early ranges when you are doing
209 * so out of "Paranoia", reserving perhaps more memory than you need,
210 * just in case, and don't mind a subsequent overlapping reservation
211 * that is known to be needed.
212 *
213 * The drop_overlaps_that_are_ok() call here isn't really needed.
214 * It would be needed if we had two colliding 'overlap_ok'
215 * reservations, so that the second such would not panic on the
216 * overlap with the first. We don't have any such as of this
217 * writing, but might as well tolerate such if it happens in
218 * the future.
219 */
220void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
221{
222 drop_overlaps_that_are_ok(start, end);
223 __reserve_early(start, end, name, 1);
224}
225
226static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
227{
228 u64 start, end, size, mem;
229 struct early_res *new;
230
231 /* do we have enough slots left ? */
232 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
233 return;
234
235 /* double it */
236 mem = -1ULL;
237 size = sizeof(struct early_res) * max_early_res * 2;
238 if (early_res == early_res_x)
239 start = 0;
240 else
241 start = early_res[0].end;
242 end = ex_start;
243 if (start + size < end)
244 mem = find_fw_memmap_area(start, end, size,
245 sizeof(struct early_res));
246 if (mem == -1ULL) {
247 start = ex_end;
248 end = get_max_mapped();
249 if (start + size < end)
250 mem = find_fw_memmap_area(start, end, size,
251 sizeof(struct early_res));
252 }
253 if (mem == -1ULL)
254 panic("can not find more space for early_res array");
255
256 new = __va(mem);
257 /* save the first one for own */
258 new[0].start = mem;
259 new[0].end = mem + size;
260 new[0].overlap_ok = 0;
261 /* copy old to new */
262 if (early_res == early_res_x) {
263 memcpy(&new[1], &early_res[0],
264 sizeof(struct early_res) * max_early_res);
265 memset(&new[max_early_res+1], 0,
266 sizeof(struct early_res) * (max_early_res - 1));
267 early_res_count++;
268 } else {
269 memcpy(&new[1], &early_res[1],
270 sizeof(struct early_res) * (max_early_res - 1));
271 memset(&new[max_early_res], 0,
272 sizeof(struct early_res) * max_early_res);
273 }
274 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
275 early_res = new;
276 max_early_res *= 2;
277 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
278 max_early_res, mem, mem + size - 1);
279}
280
281/*
282 * Most early reservations come here.
283 *
284 * We first have drop_overlaps_that_are_ok() drop any pre-existing
285 * 'overlap_ok' ranges, so that we can then reserve this memory
286 * range without risk of panic'ing on an overlapping overlap_ok
287 * early reservation.
288 */
289void __init reserve_early(u64 start, u64 end, char *name)
290{
291 if (start >= end)
292 return;
293
294 __check_and_double_early_res(start, end);
295
296 drop_overlaps_that_are_ok(start, end);
297 __reserve_early(start, end, name, 0);
298}
299
300void __init reserve_early_without_check(u64 start, u64 end, char *name)
301{
302 struct early_res *r;
303
304 if (start >= end)
305 return;
306
307 __check_and_double_early_res(start, end);
308
309 r = &early_res[early_res_count];
310
311 r->start = start;
312 r->end = end;
313 r->overlap_ok = 0;
314 if (name)
315 strncpy(r->name, name, sizeof(r->name) - 1);
316 early_res_count++;
317}
318
319void __init free_early(u64 start, u64 end)
320{
321 struct early_res *r;
322 int i;
323
324 kmemleak_free_part(__va(start), end - start);
325
326 i = find_overlapped_early(start, end);
327 r = &early_res[i];
328 if (i >= max_early_res || r->end != end || r->start != start)
329 panic("free_early on not reserved area: %llx-%llx!",
330 start, end - 1);
331
332 drop_range(i);
333}
334
335void __init free_early_partial(u64 start, u64 end)
336{
337 struct early_res *r;
338 int i;
339
340 kmemleak_free_part(__va(start), end - start);
341
342 if (start == end)
343 return;
344
345 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
346 return;
347
348try_next:
349 i = find_overlapped_early(start, end);
350 if (i >= max_early_res)
351 return;
352
353 r = &early_res[i];
354 /* hole ? */
355 if (r->end >= end && r->start <= start) {
356 drop_range_partial(i, start, end);
357 return;
358 }
359
360 drop_range_partial(i, start, end);
361 goto try_next;
362}
363
364#ifdef CONFIG_NO_BOOTMEM
365static void __init subtract_early_res(struct range *range, int az)
366{
367 int i, count;
368 u64 final_start, final_end;
369 int idx = 0;
370
371 count = 0;
372 for (i = 0; i < max_early_res && early_res[i].end; i++)
373 count++;
374
375 /* need to skip first one ?*/
376 if (early_res != early_res_x)
377 idx = 1;
378
379#define DEBUG_PRINT_EARLY_RES 1
380
381#if DEBUG_PRINT_EARLY_RES
382 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
383#endif
384 for (i = idx; i < count; i++) {
385 struct early_res *r = &early_res[i];
386#if DEBUG_PRINT_EARLY_RES
387 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
388 r->start, r->end, r->name);
389#endif
390 final_start = PFN_DOWN(r->start);
391 final_end = PFN_UP(r->end);
392 if (final_start >= final_end)
393 continue;
394 subtract_range(range, az, final_start, final_end);
395 }
396
397}
398
399int __init get_free_all_memory_range(struct range **rangep, int nodeid)
400{
401 int i, count;
402 u64 start = 0, end;
403 u64 size;
404 u64 mem;
405 struct range *range;
406 int nr_range;
407
408 count = 0;
409 for (i = 0; i < max_early_res && early_res[i].end; i++)
410 count++;
411
412 count *= 2;
413
414 size = sizeof(struct range) * count;
415 end = get_max_mapped();
416#ifdef MAX_DMA32_PFN
417 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
418 start = MAX_DMA32_PFN << PAGE_SHIFT;
419#endif
420 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
421 if (mem == -1ULL)
422 panic("can not find more space for range free");
423
424 range = __va(mem);
425 /* use early_node_map[] and early_res to get range array at first */
426 memset(range, 0, size);
427 nr_range = 0;
428
429 /* need to go over early_node_map to find out good range for node */
430 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
431#ifdef CONFIG_X86_32
432 subtract_range(range, count, max_low_pfn, -1ULL);
433#endif
434 subtract_early_res(range, count);
435 nr_range = clean_sort_range(range, count);
436
437 /* need to clear it ? */
438 if (nodeid == MAX_NUMNODES) {
439 memset(&early_res[0], 0,
440 sizeof(struct early_res) * max_early_res);
441 early_res = NULL;
442 max_early_res = 0;
443 }
444
445 *rangep = range;
446 return nr_range;
447}
448#else
449void __init early_res_to_bootmem(u64 start, u64 end)
450{
451 int i, count;
452 u64 final_start, final_end;
453 int idx = 0;
454
455 count = 0;
456 for (i = 0; i < max_early_res && early_res[i].end; i++)
457 count++;
458
459 /* need to skip first one ?*/
460 if (early_res != early_res_x)
461 idx = 1;
462
463 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
464 count - idx, max_early_res, start, end);
465 for (i = idx; i < count; i++) {
466 struct early_res *r = &early_res[i];
467 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
468 r->start, r->end, r->name);
469 final_start = max(start, r->start);
470 final_end = min(end, r->end);
471 if (final_start >= final_end) {
472 printk(KERN_CONT "\n");
473 continue;
474 }
475 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
476 final_start, final_end);
477 reserve_bootmem_generic(final_start, final_end - final_start,
478 BOOTMEM_DEFAULT);
479 }
480 /* clear them */
481 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
482 early_res = NULL;
483 max_early_res = 0;
484 early_res_count = 0;
485}
486#endif
487
488/* Check for already reserved areas */
489static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
490{
491 int i;
492 u64 addr = *addrp;
493 int changed = 0;
494 struct early_res *r;
495again:
496 i = find_overlapped_early(addr, addr + size);
497 r = &early_res[i];
498 if (i < max_early_res && r->end) {
499 *addrp = addr = round_up(r->end, align);
500 changed = 1;
501 goto again;
502 }
503 return changed;
504}
505
506/* Check for already reserved areas */
507static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
508{
509 int i;
510 u64 addr = *addrp, last;
511 u64 size = *sizep;
512 int changed = 0;
513again:
514 last = addr + size;
515 for (i = 0; i < max_early_res && early_res[i].end; i++) {
516 struct early_res *r = &early_res[i];
517 if (last > r->start && addr < r->start) {
518 size = r->start - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last > r->end && addr < r->end) {
523 addr = round_up(r->end, align);
524 size = last - addr;
525 changed = 1;
526 goto again;
527 }
528 if (last <= r->end && addr >= r->start) {
529 (*sizep)++;
530 return 0;
531 }
532 }
533 if (changed) {
534 *addrp = addr;
535 *sizep = size;
536 }
537 return changed;
538}
539
540/*
541 * Find a free area with specified alignment in a specific range.
542 * only the area between start and end is an active range from early_node_map,
543 * so they are good as RAM
544 */
545u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
546 u64 size, u64 align)
547{
548 u64 addr, last;
549
550 addr = round_up(ei_start, align);
551 if (addr < start)
552 addr = round_up(start, align);
553 if (addr >= ei_last)
554 goto out;
555 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
556 ;
557 last = addr + size;
558 if (last > ei_last)
559 goto out;
560 if (last > end)
561 goto out;
562
563 return addr;
564
565out:
566 return -1ULL;
567}
568
569u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
570 u64 *sizep, u64 align)
571{
572 u64 addr, last;
573
574 addr = round_up(ei_start, align);
575 if (addr < start)
576 addr = round_up(start, align);
577 if (addr >= ei_last)
578 goto out;
579 *sizep = ei_last - addr;
580 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
581 ;
582 last = addr + *sizep;
583 if (last > ei_last)
584 goto out;
585
586 return addr;
587
588out:
589 return -1ULL;
590}
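Much of the deleted early_res allocator revolves around one half-open range overlap test, end > r->start && start < r->end, used by find_overlapped_early() and the drop/split helpers above. A tiny standalone demonstration of that predicate, with a reduced stand-in for struct early_res:

#include <stdio.h>

/* Reduced form of struct early_res: a half-open [start, end) byte range. */
struct res_range {
	unsigned long long start, end;
};

/*
 * Two half-open ranges overlap exactly when each one starts before the
 * other ends; ranges that merely touch at a boundary do not overlap.
 */
static int ranges_overlap(struct res_range a, struct res_range b)
{
	return a.end > b.start && a.start < b.end;
}

int main(void)
{
	struct res_range reserved = { 0x1000, 0x2000 };
	struct res_range probe1 = { 0x1800, 0x3000 };	/* overlaps */
	struct res_range probe2 = { 0x2000, 0x3000 };	/* only touches */

	printf("%d %d\n", ranges_overlap(reserved, probe1),
	       ranges_overlap(reserved, probe2));	/* prints "1 0" */
	return 0;
}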
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index dd62f8e714ca..0dbeae374225 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -134,23 +134,14 @@ unregister:
134 return 0; 134 return 0;
135} 135}
136 136
137int 137int __set_personality(unsigned int personality)
138__set_personality(unsigned int personality)
139{ 138{
140 struct exec_domain *ep, *oep; 139 struct exec_domain *oep = current_thread_info()->exec_domain;
141
142 ep = lookup_exec_domain(personality);
143 if (ep == current_thread_info()->exec_domain) {
144 current->personality = personality;
145 module_put(ep->module);
146 return 0;
147 }
148 140
141 current_thread_info()->exec_domain = lookup_exec_domain(personality);
149 current->personality = personality; 142 current->personality = personality;
150 oep = current_thread_info()->exec_domain;
151 current_thread_info()->exec_domain = ep;
152
153 module_put(oep->module); 143 module_put(oep->module);
144
154 return 0; 145 return 0;
155} 146}
156 147
@@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 183{
193 unsigned int old = current->personality; 184 unsigned int old = current->personality;
194 185
195 if (personality != 0xffffffff) { 186 if (personality != 0xffffffff)
196 set_personality(personality); 187 set_personality(personality);
197 if (current->personality != personality)
198 return -EINVAL;
199 }
200 188
201 return old; 189 return old;
202} 190}
diff --git a/kernel/exit.c b/kernel/exit.c
index ceffc67b564a..b194febf5799 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -149,9 +150,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
149{ 150{
150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 151 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
151 152
152#ifdef CONFIG_PERF_EVENTS 153 perf_event_delayed_put(tsk);
153 WARN_ON_ONCE(tsk->perf_event_ctxp);
154#endif
155 trace_sched_process_free(tsk); 154 trace_sched_process_free(tsk);
156 put_task_struct(tsk); 155 put_task_struct(tsk);
157} 156}
@@ -689,6 +688,8 @@ static void exit_mm(struct task_struct * tsk)
689 enter_lazy_tlb(mm, current); 688 enter_lazy_tlb(mm, current);
690 /* We don't want this task to be frozen prematurely */ 689 /* We don't want this task to be frozen prematurely */
691 clear_freeze_flag(tsk); 690 clear_freeze_flag(tsk);
691 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
692 atomic_dec(&mm->oom_disable_count);
692 task_unlock(tsk); 693 task_unlock(tsk);
693 mm_update_next_owner(mm); 694 mm_update_next_owner(mm);
694 mmput(mm); 695 mmput(mm);
@@ -702,6 +703,8 @@ static void exit_mm(struct task_struct * tsk)
702 * space. 703 * space.
703 */ 704 */
704static struct task_struct *find_new_reaper(struct task_struct *father) 705static struct task_struct *find_new_reaper(struct task_struct *father)
706 __releases(&tasklist_lock)
707 __acquires(&tasklist_lock)
705{ 708{
706 struct pid_namespace *pid_ns = task_active_pid_ns(father); 709 struct pid_namespace *pid_ns = task_active_pid_ns(father);
707 struct task_struct *thread; 710 struct task_struct *thread;
@@ -771,9 +774,12 @@ static void forget_original_parent(struct task_struct *father)
771 struct task_struct *p, *n, *reaper; 774 struct task_struct *p, *n, *reaper;
772 LIST_HEAD(dead_children); 775 LIST_HEAD(dead_children);
773 776
774 exit_ptrace(father);
775
776 write_lock_irq(&tasklist_lock); 777 write_lock_irq(&tasklist_lock);
778 /*
779 * Note that exit_ptrace() and find_new_reaper() might
780 * drop tasklist_lock and reacquire it.
781 */
782 exit_ptrace(father);
777 reaper = find_new_reaper(father); 783 reaper = find_new_reaper(father);
778 784
779 list_for_each_entry_safe(p, n, &father->children, sibling) { 785 list_for_each_entry_safe(p, n, &father->children, sibling) {
@@ -1383,8 +1389,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1383 if (!unlikely(wo->wo_flags & WNOWAIT)) 1389 if (!unlikely(wo->wo_flags & WNOWAIT))
1384 *p_code = 0; 1390 *p_code = 0;
1385 1391
1386 /* don't need the RCU readlock here as we're holding a spinlock */ 1392 uid = task_uid(p);
1387 uid = __task_cred(p)->uid;
1388unlock_sig: 1393unlock_sig:
1389 spin_unlock_irq(&p->sighand->siglock); 1394 spin_unlock_irq(&p->sighand->siglock);
1390 if (!exit_code) 1395 if (!exit_code)
@@ -1457,7 +1462,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1457 } 1462 }
1458 if (!unlikely(wo->wo_flags & WNOWAIT)) 1463 if (!unlikely(wo->wo_flags & WNOWAIT))
1459 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1464 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1460 uid = __task_cred(p)->uid; 1465 uid = task_uid(p);
1461 spin_unlock_irq(&p->sighand->siglock); 1466 spin_unlock_irq(&p->sighand->siglock);
1462 1467
1463 pid = task_pid_vnr(p); 1468 pid = task_pid_vnr(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14ba047..3b159c5991b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h>
68 69
69#include <asm/pgtable.h> 70#include <asm/pgtable.h>
70#include <asm/pgalloc.h> 71#include <asm/pgalloc.h>
@@ -300,7 +301,7 @@ out:
300#ifdef CONFIG_MMU 301#ifdef CONFIG_MMU
301static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 302static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
302{ 303{
303 struct vm_area_struct *mpnt, *tmp, **pprev; 304 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
304 struct rb_node **rb_link, *rb_parent; 305 struct rb_node **rb_link, *rb_parent;
305 int retval; 306 int retval;
306 unsigned long charge; 307 unsigned long charge;
@@ -328,6 +329,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (retval) 329 if (retval)
329 goto out; 330 goto out;
330 331
332 prev = NULL;
331 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 333 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
332 struct file *file; 334 struct file *file;
333 335
@@ -355,11 +357,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
355 if (IS_ERR(pol)) 357 if (IS_ERR(pol))
356 goto fail_nomem_policy; 358 goto fail_nomem_policy;
357 vma_set_policy(tmp, pol); 359 vma_set_policy(tmp, pol);
360 tmp->vm_mm = mm;
358 if (anon_vma_fork(tmp, mpnt)) 361 if (anon_vma_fork(tmp, mpnt))
359 goto fail_nomem_anon_vma_fork; 362 goto fail_nomem_anon_vma_fork;
360 tmp->vm_flags &= ~VM_LOCKED; 363 tmp->vm_flags &= ~VM_LOCKED;
361 tmp->vm_mm = mm; 364 tmp->vm_next = tmp->vm_prev = NULL;
362 tmp->vm_next = NULL;
363 file = tmp->vm_file; 365 file = tmp->vm_file;
364 if (file) { 366 if (file) {
365 struct inode *inode = file->f_path.dentry->d_inode; 367 struct inode *inode = file->f_path.dentry->d_inode;
@@ -392,6 +394,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
392 */ 394 */
393 *pprev = tmp; 395 *pprev = tmp;
394 pprev = &tmp->vm_next; 396 pprev = &tmp->vm_next;
397 tmp->vm_prev = prev;
398 prev = tmp;
395 399
396 __vma_link_rb(mm, tmp, rb_link, rb_parent); 400 __vma_link_rb(mm, tmp, rb_link, rb_parent);
397 rb_link = &tmp->vm_rb.rb_right; 401 rb_link = &tmp->vm_rb.rb_right;
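The dup_mmap() hunks above thread a vm_prev pointer through the copied VMA list by carrying a prev variable across loop iterations. The same pattern on a toy linked list, shown here as a hedged sketch rather than kernel code:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a VMA: only the list linkage matters here. */
struct node {
	int id;
	struct node *next;
	struct node *prev;
};

/*
 * Copy a singly linked source list while threading both next and prev
 * pointers in the copy, using the "prev = NULL; ... tmp->prev = prev;
 * prev = tmp;" pattern dup_mmap() now uses to keep vm_prev consistent.
 */
static struct node *copy_list(const struct node *src)
{
	struct node *head = NULL, **pprev = &head, *prev = NULL;

	for (; src; src = src->next) {
		struct node *tmp = malloc(sizeof(*tmp));

		if (!tmp)
			exit(1);
		tmp->id = src->id;
		tmp->next = NULL;
		*pprev = tmp;		/* link into the forward chain */
		pprev = &tmp->next;
		tmp->prev = prev;	/* back-link to the previous copy */
		prev = tmp;
	}
	return head;
}

int main(void)
{
	struct node c = { 3, NULL, NULL }, b = { 2, &c, NULL }, a = { 1, &b, NULL };
	struct node *copy = copy_list(&a);

	/* Walk forward and report each node's prev id (0 means none). */
	for (; copy; copy = copy->next)
		printf("node %d prev %d\n", copy->id,
		       copy->prev ? copy->prev->id : 0);
	return 0;	/* copies intentionally not freed in this sketch */
}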
@@ -485,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
485 mm->cached_hole_size = ~0UL; 489 mm->cached_hole_size = ~0UL;
486 mm_init_aio(mm); 490 mm_init_aio(mm);
487 mm_init_owner(mm, p); 491 mm_init_owner(mm, p);
492 atomic_set(&mm->oom_disable_count, 0);
488 493
489 if (likely(!mm_alloc_pgd(mm))) { 494 if (likely(!mm_alloc_pgd(mm))) {
490 mm->def_flags = 0; 495 mm->def_flags = 0;
@@ -738,6 +743,8 @@ good_mm:
738 /* Initializing for Swap token stuff */ 743 /* Initializing for Swap token stuff */
739 mm->token_priority = 0; 744 mm->token_priority = 0;
740 mm->last_interval = 0; 745 mm->last_interval = 0;
746 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
747 atomic_inc(&mm->oom_disable_count);
741 748
742 tsk->mm = mm; 749 tsk->mm = mm;
743 tsk->active_mm = mm; 750 tsk->active_mm = mm;
@@ -752,13 +759,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
752 struct fs_struct *fs = current->fs; 759 struct fs_struct *fs = current->fs;
753 if (clone_flags & CLONE_FS) { 760 if (clone_flags & CLONE_FS) {
754 /* tsk->fs is already what we want */ 761 /* tsk->fs is already what we want */
755 write_lock(&fs->lock); 762 spin_lock(&fs->lock);
756 if (fs->in_exec) { 763 if (fs->in_exec) {
757 write_unlock(&fs->lock); 764 spin_unlock(&fs->lock);
758 return -EAGAIN; 765 return -EAGAIN;
759 } 766 }
760 fs->users++; 767 fs->users++;
761 write_unlock(&fs->lock); 768 spin_unlock(&fs->lock);
762 return 0; 769 return 0;
763 } 770 }
764 tsk->fs = copy_fs_struct(fs); 771 tsk->fs = copy_fs_struct(fs);
@@ -899,6 +906,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
899 tty_audit_fork(sig); 906 tty_audit_fork(sig);
900 907
901 sig->oom_adj = current->signal->oom_adj; 908 sig->oom_adj = current->signal->oom_adj;
909 sig->oom_score_adj = current->signal->oom_score_adj;
910
911 mutex_init(&sig->cred_guard_mutex);
902 912
903 return 0; 913 return 0;
904} 914}
@@ -907,7 +917,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
907{ 917{
908 unsigned long new_flags = p->flags; 918 unsigned long new_flags = p->flags;
909 919
910 new_flags &= ~PF_SUPERPRIV; 920 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
911 new_flags |= PF_FORKNOEXEC; 921 new_flags |= PF_FORKNOEXEC;
912 new_flags |= PF_STARTING; 922 new_flags |= PF_STARTING;
913 p->flags = new_flags; 923 p->flags = new_flags;
@@ -1295,8 +1305,13 @@ bad_fork_cleanup_io:
1295bad_fork_cleanup_namespaces: 1305bad_fork_cleanup_namespaces:
1296 exit_task_namespaces(p); 1306 exit_task_namespaces(p);
1297bad_fork_cleanup_mm: 1307bad_fork_cleanup_mm:
1298 if (p->mm) 1308 if (p->mm) {
1309 task_lock(p);
1310 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1311 atomic_dec(&p->mm->oom_disable_count);
1312 task_unlock(p);
1299 mmput(p->mm); 1313 mmput(p->mm);
1314 }
1300bad_fork_cleanup_signal: 1315bad_fork_cleanup_signal:
1301 if (!(clone_flags & CLONE_THREAD)) 1316 if (!(clone_flags & CLONE_THREAD))
1302 free_signal_struct(p->signal); 1317 free_signal_struct(p->signal);
@@ -1675,13 +1690,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1675 1690
1676 if (new_fs) { 1691 if (new_fs) {
1677 fs = current->fs; 1692 fs = current->fs;
1678 write_lock(&fs->lock); 1693 spin_lock(&fs->lock);
1679 current->fs = new_fs; 1694 current->fs = new_fs;
1680 if (--fs->users) 1695 if (--fs->users)
1681 new_fs = NULL; 1696 new_fs = NULL;
1682 else 1697 else
1683 new_fs = fs; 1698 new_fs = fs;
1684 write_unlock(&fs->lock); 1699 spin_unlock(&fs->lock);
1685 } 1700 }
1686 1701
1687 if (new_mm) { 1702 if (new_mm) {
@@ -1689,6 +1704,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1689 active_mm = current->active_mm; 1704 active_mm = current->active_mm;
1690 current->mm = new_mm; 1705 current->mm = new_mm;
1691 current->active_mm = new_mm; 1706 current->active_mm = new_mm;
1707 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1708 atomic_dec(&mm->oom_disable_count);
1709 atomic_inc(&new_mm->oom_disable_count);
1710 }
1692 activate_mm(active_mm, new_mm); 1711 activate_mm(active_mm, new_mm);
1693 new_mm = mm; 1712 new_mm = mm;
1694 } 1713 }
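
The fork.c hunks above keep mm->oom_disable_count in step with the number of attached tasks whose oom_score_adj is OOM_SCORE_ADJ_MIN, open-coding the adjustment at every point where a task gains, loses, or swaps an mm. A minimal sketch of that rule follows; the helper name is hypothetical (the patch has no such helper), while the fields it touches are the ones used in the hunks above.

/*
 * Sketch only: the accounting rule the hunks above implement by hand at
 * each call site. move_oom_disable_count() is an invented name.
 */
#include <linux/oom.h>
#include <linux/sched.h>

static void move_oom_disable_count(struct task_struct *tsk,
                                   struct mm_struct *old_mm,
                                   struct mm_struct *new_mm)
{
        /* Only OOM-immune tasks contribute to the per-mm count. */
        if (tsk->signal->oom_score_adj != OOM_SCORE_ADJ_MIN)
                return;

        task_lock(tsk);         /* keep tsk->mm stable vs. the OOM killer */
        if (old_mm)
                atomic_dec(&old_mm->oom_disable_count);
        if (new_mm)
                atomic_inc(&new_mm->oom_disable_count);
        task_unlock(tsk);
}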
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..6c683b37f2ce 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -91,6 +91,7 @@ struct futex_pi_state {
91 91
92/** 92/**
93 * struct futex_q - The hashed futex queue entry, one per waiting task 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @list: priority-sorted list of tasks waiting on this futex
94 * @task: the task waiting on the futex 95 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock 96 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on 97 * @key: the key the futex is hashed on
@@ -104,7 +105,7 @@ struct futex_pi_state {
104 * 105 *
105 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 106 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 107 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
107 * The order of wakup is always to make the first condition true, then 108 * The order of wakeup is always to make the first condition true, then
108 * the second. 109 * the second.
109 * 110 *
110 * PI futexes are typically woken before they are removed from the hash list via 111 * PI futexes are typically woken before they are removed from the hash list via
@@ -168,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key)
168 169
169 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
170 case FUT_OFF_INODE: 171 case FUT_OFF_INODE:
171 atomic_inc(&key->shared.inode->i_count); 172 ihold(key->shared.inode);
172 break; 173 break;
173 case FUT_OFF_MMSHARED: 174 case FUT_OFF_MMSHARED:
174 atomic_inc(&key->private.mm->mm_count); 175 atomic_inc(&key->private.mm->mm_count);
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key)
295 * Slow path to fixup the fault we just took in the atomic write 296 * Slow path to fixup the fault we just took in the atomic write
296 * access to @uaddr. 297 * access to @uaddr.
297 * 298 *
298 * We have no generic implementation of a non destructive write to the 299 * We have no generic implementation of a non-destructive write to the
299 * user address. We know that we faulted in the atomic pagefault 300 * user address. We know that we faulted in the atomic pagefault
300 * disabled section so we can as well avoid the #PF overhead by 301 * disabled section so we can as well avoid the #PF overhead by
301 * calling get_user_pages() right away. 302 * calling get_user_pages() right away.
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
515 */ 516 */
516 pi_state = this->pi_state; 517 pi_state = this->pi_state;
517 /* 518 /*
518 * Userspace might have messed up non PI and PI futexes 519 * Userspace might have messed up non-PI and PI futexes
519 */ 520 */
520 if (unlikely(!pi_state)) 521 if (unlikely(!pi_state))
521 return -EINVAL; 522 return -EINVAL;
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q)
736 737
737 /* 738 /*
738 * We set q->lock_ptr = NULL _before_ we wake up the task. If 739 * We set q->lock_ptr = NULL _before_ we wake up the task. If
739 * a non futex wake up happens on another CPU then the task 740 * a non-futex wake up happens on another CPU then the task
740 * might exit and p would dereference a non existing task 741 * might exit and p would dereference a non-existing task
741 * struct. Prevent this by holding a reference on p across the 742 * struct. Prevent this by holding a reference on p across the
742 * wake up. 743 * wake up.
743 */ 744 */
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1131 1132
1132/** 1133/**
1133 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1134 * uaddr1: source futex user address 1135 * @uaddr1: source futex user address
1135 * uaddr2: target futex user address 1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
1136 * nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1137 * @uaddr2: target futex user address
1137 * nr_requeue: number of waiters to requeue (0-INT_MAX) 1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1138 * requeue_pi: if we are attempting to requeue from a non-pi futex to a 1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1139 * pi futex (pi to pi requeue is not supported) 1142 * pi futex (pi to pi requeue is not supported)
1140 * 1143 *
1141 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
@@ -1360,10 +1363,10 @@ out:
1360 1363
1361/* The key must be already stored in q->key. */ 1364/* The key must be already stored in q->key. */
1362static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1365static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1366 __acquires(&hb->lock)
1363{ 1367{
1364 struct futex_hash_bucket *hb; 1368 struct futex_hash_bucket *hb;
1365 1369
1366 get_futex_key_refs(&q->key);
1367 hb = hash_futex(&q->key); 1370 hb = hash_futex(&q->key);
1368 q->lock_ptr = &hb->lock; 1371 q->lock_ptr = &hb->lock;
1369 1372
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1373 1376
1374static inline void 1377static inline void
1375queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1378queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1379 __releases(&hb->lock)
1376{ 1380{
1377 spin_unlock(&hb->lock); 1381 spin_unlock(&hb->lock);
1378 drop_futex_key_refs(&q->key);
1379} 1382}
1380 1383
1381/** 1384/**
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1391 * an example). 1394 * an example).
1392 */ 1395 */
1393static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1396static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1397 __releases(&hb->lock)
1394{ 1398{
1395 int prio; 1399 int prio;
1396 1400
@@ -1471,6 +1475,7 @@ retry:
1471 * and dropped here. 1475 * and dropped here.
1472 */ 1476 */
1473static void unqueue_me_pi(struct futex_q *q) 1477static void unqueue_me_pi(struct futex_q *q)
1478 __releases(q->lock_ptr)
1474{ 1479{
1475 WARN_ON(plist_node_empty(&q->list)); 1480 WARN_ON(plist_node_empty(&q->list));
1476 plist_del(&q->list, &q->list.plist); 1481 plist_del(&q->list, &q->list.plist);
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q)
1480 q->pi_state = NULL; 1485 q->pi_state = NULL;
1481 1486
1482 spin_unlock(q->lock_ptr); 1487 spin_unlock(q->lock_ptr);
1483
1484 drop_futex_key_refs(&q->key);
1485} 1488}
1486 1489
1487/* 1490/*
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1812 } 1815 }
1813 1816
1814retry: 1817retry:
1815 /* Prepare to wait on uaddr. */ 1818 /*
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs.
1821 */
1816 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1817 if (ret) 1823 if (ret)
1818 goto out; 1824 goto out;
@@ -1822,28 +1828,27 @@ retry:
1822 1828
1823 /* If we were woken (and unqueued), we succeeded, whatever. */ 1829 /* If we were woken (and unqueued), we succeeded, whatever. */
1824 ret = 0; 1830 ret = 0;
1831 /* unqueue_me() drops q.key ref */
1825 if (!unqueue_me(&q)) 1832 if (!unqueue_me(&q))
1826 goto out_put_key; 1833 goto out;
1827 ret = -ETIMEDOUT; 1834 ret = -ETIMEDOUT;
1828 if (to && !to->task) 1835 if (to && !to->task)
1829 goto out_put_key; 1836 goto out;
1830 1837
1831 /* 1838 /*
1832 * We expect signal_pending(current), but we might be the 1839 * We expect signal_pending(current), but we might be the
1833 * victim of a spurious wakeup as well. 1840 * victim of a spurious wakeup as well.
1834 */ 1841 */
1835 if (!signal_pending(current)) { 1842 if (!signal_pending(current))
1836 put_futex_key(fshared, &q.key);
1837 goto retry; 1843 goto retry;
1838 }
1839 1844
1840 ret = -ERESTARTSYS; 1845 ret = -ERESTARTSYS;
1841 if (!abs_time) 1846 if (!abs_time)
1842 goto out_put_key; 1847 goto out;
1843 1848
1844 restart = &current_thread_info()->restart_block; 1849 restart = &current_thread_info()->restart_block;
1845 restart->fn = futex_wait_restart; 1850 restart->fn = futex_wait_restart;
1846 restart->futex.uaddr = (u32 *)uaddr; 1851 restart->futex.uaddr = uaddr;
1847 restart->futex.val = val; 1852 restart->futex.val = val;
1848 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1849 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
@@ -1856,8 +1861,6 @@ retry:
1856 1861
1857 ret = -ERESTART_RESTARTBLOCK; 1862 ret = -ERESTART_RESTARTBLOCK;
1858 1863
1859out_put_key:
1860 put_futex_key(fshared, &q.key);
1861out: 1864out:
1862 if (to) { 1865 if (to) {
1863 hrtimer_cancel(&to->timer); 1866 hrtimer_cancel(&to->timer);
@@ -1869,7 +1872,7 @@ out:
1869 1872
1870static long futex_wait_restart(struct restart_block *restart) 1873static long futex_wait_restart(struct restart_block *restart)
1871{ 1874{
1872 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1875 u32 __user *uaddr = restart->futex.uaddr;
1873 int fshared = 0; 1876 int fshared = 0;
1874 ktime_t t, *tp = NULL; 1877 ktime_t t, *tp = NULL;
1875 1878
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2236 q.rt_waiter = &rt_waiter; 2239 q.rt_waiter = &rt_waiter;
2237 q.requeue_pi_key = &key2; 2240 q.requeue_pi_key = &key2;
2238 2241
2239 /* Prepare to wait on uaddr. */ 2242 /*
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count.
2245 */
2240 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2241 if (ret) 2247 if (ret)
2242 goto out_key2; 2248 goto out_key2;
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2254 * In order for us to be here, we know our q.key == key2, and since 2260 * In order for us to be here, we know our q.key == key2, and since
2255 * we took the hb->lock above, we also know that futex_requeue() has 2261 * we took the hb->lock above, we also know that futex_requeue() has
2256 * completed and we no longer have to concern ourselves with a wakeup 2262 * completed and we no longer have to concern ourselves with a wakeup
2257 * race with the atomic proxy lock acquition by the requeue code. 2263 * race with the atomic proxy lock acquisition by the requeue code. The
2264 * futex_requeue dropped our key1 reference and incremented our key2
2265 * reference count.
2258 */ 2266 */
2259 2267
2260 /* Check if the requeue code acquired the second futex for us. */ 2268 /* Check if the requeue code acquired the second futex for us. */
@@ -2458,7 +2466,7 @@ retry:
2458 */ 2466 */
2459static inline int fetch_robust_entry(struct robust_list __user **entry, 2467static inline int fetch_robust_entry(struct robust_list __user **entry,
2460 struct robust_list __user * __user *head, 2468 struct robust_list __user * __user *head,
2461 int *pi) 2469 unsigned int *pi)
2462{ 2470{
2463 unsigned long uentry; 2471 unsigned long uentry;
2464 2472
@@ -2647,7 +2655,7 @@ static int __init futex_init(void)
2647 * of the complex code paths. Also we want to prevent 2655 * of the complex code paths. Also we want to prevent
2648 * registration of robust lists in that case. NULL is 2656 * registration of robust lists in that case. NULL is
2649 * guaranteed to fault and we get -EFAULT on functional 2657 * guaranteed to fault and we get -EFAULT on functional
2650 * implementation, the non functional ones will return 2658 * implementation, the non-functional ones will return
2651 * -ENOSYS. 2659 * -ENOSYS.
2652 */ 2660 */
2653 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2661 curval = cmpxchg_futex_value_locked(NULL, 0, 0);
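
Besides dropping the extra key reference previously taken in queue_lock() and released in queue_unlock(), the futex.c hunks above add sparse lock annotations to functions that deliberately return with a lock held or release a lock taken elsewhere. A minimal sketch of that annotation style, assuming only <linux/spinlock.h>; the function names are made up.

#include <linux/spinlock.h>

/* Returns with *lock held; __acquires() tells sparse this is intentional. */
static void example_lock(spinlock_t *lock)
        __acquires(lock)
{
        spin_lock(lock);
}

/* Releases a lock taken elsewhere; __releases() silences the imbalance warning. */
static void example_unlock(spinlock_t *lock)
        __releases(lock)
{
        spin_unlock(lock);
}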
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e5..06da4dfc339b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
19 */ 19 */
20static inline int 20static inline int
21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, 21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
22 compat_uptr_t __user *head, int *pi) 22 compat_uptr_t __user *head, unsigned int *pi)
23{ 23{
24 if (get_user(*uentry, head)) 24 if (get_user(*uentry, head))
25 return -EFAULT; 25 return -EFAULT;
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index ef3c3f88a7a3..9bd0934f6c33 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -33,10 +33,11 @@
33 * @children: child nodes 33 * @children: child nodes
34 * @all: list head for list of all nodes 34 * @all: list head for list of all nodes
35 * @parent: parent node 35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory 36 * @loaded_info: array of pointers to profiling data sets for loaded object
37 * @ghost: when an object file containing profiling data is unloaded we keep a 37 * files.
38 * copy of the profiling data here to allow collecting coverage data 38 * @num_loaded: number of profiling data sets for loaded object files.
39 * for cleanup code. Such a node is called a "ghost". 39 * @unloaded_info: accumulated copy of profiling data sets for unloaded
40 * object files. Used only when gcov_persist=1.
40 * @dentry: main debugfs entry, either a directory or data file 41 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links 42 * @links: associated symbolic links
42 * @name: data file basename 43 * @name: data file basename
@@ -51,10 +52,11 @@ struct gcov_node {
51 struct list_head children; 52 struct list_head children;
52 struct list_head all; 53 struct list_head all;
53 struct gcov_node *parent; 54 struct gcov_node *parent;
54 struct gcov_info *info; 55 struct gcov_info **loaded_info;
55 struct gcov_info *ghost; 56 struct gcov_info *unloaded_info;
56 struct dentry *dentry; 57 struct dentry *dentry;
57 struct dentry **links; 58 struct dentry **links;
59 int num_loaded;
58 char name[0]; 60 char name[0];
59}; 61};
60 62
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = {
136}; 138};
137 139
138/* 140/*
139 * Return the profiling data set for a given node. This can either be the 141 * Return a profiling data set associated with the given node. This is
140 * original profiling data structure or a duplicate (also called "ghost") 142 * either a data set for a loaded object file or a data set copy in case
141 * in case the associated object file has been unloaded. 143 * all associated object files have been unloaded.
142 */ 144 */
143static struct gcov_info *get_node_info(struct gcov_node *node) 145static struct gcov_info *get_node_info(struct gcov_node *node)
144{ 146{
145 if (node->info) 147 if (node->num_loaded > 0)
146 return node->info; 148 return node->loaded_info[0];
147 149
148 return node->ghost; 150 return node->unloaded_info;
151}
152
153/*
154 * Return a newly allocated profiling data set which contains the sum of
155 * all profiling data associated with the given node.
156 */
157static struct gcov_info *get_accumulated_info(struct gcov_node *node)
158{
159 struct gcov_info *info;
160 int i = 0;
161
162 if (node->unloaded_info)
163 info = gcov_info_dup(node->unloaded_info);
164 else
165 info = gcov_info_dup(node->loaded_info[i++]);
166 if (!info)
167 return NULL;
168 for (; i < node->num_loaded; i++)
169 gcov_info_add(info, node->loaded_info[i]);
170
171 return info;
149} 172}
150 173
151/* 174/*
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file)
163 mutex_lock(&node_lock); 186 mutex_lock(&node_lock);
164 /* 187 /*
165 * Read from a profiling data copy to minimize reference tracking 188 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access. 189 * complexity and concurrent access and to keep accumulating multiple
190 * profiling data sets associated with one node simple.
167 */ 191 */
168 info = gcov_info_dup(get_node_info(node)); 192 info = get_accumulated_info(node);
169 if (!info) 193 if (!info)
170 goto out_unlock; 194 goto out_unlock;
171 iter = gcov_iter_new(info); 195 iter = gcov_iter_new(info);
@@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name)
225 return NULL; 249 return NULL;
226} 250}
227 251
252/*
253 * Reset all profiling data associated with the specified node.
254 */
255static void reset_node(struct gcov_node *node)
256{
257 int i;
258
259 if (node->unloaded_info)
260 gcov_info_reset(node->unloaded_info);
261 for (i = 0; i < node->num_loaded; i++)
262 gcov_info_reset(node->loaded_info[i]);
263}
264
228static void remove_node(struct gcov_node *node); 265static void remove_node(struct gcov_node *node);
229 266
230/* 267/*
231 * write() implementation for gcov data files. Reset profiling data for the 268 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is 269 * corresponding file. If all associated object files have been unloaded,
233 * a "ghost" node), remove the debug fs node as well. 270 * remove the debug fs node as well.
234 */ 271 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr, 272static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos) 273 size_t len, loff_t *pos)
@@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
245 node = get_node_by_name(info->filename); 282 node = get_node_by_name(info->filename);
246 if (node) { 283 if (node) {
247 /* Reset counts or remove node for unloaded modules. */ 284 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost) 285 if (node->num_loaded == 0)
249 remove_node(node); 286 remove_node(node);
250 else 287 else
251 gcov_info_reset(node->info); 288 reset_node(node);
252 } 289 }
253 /* Reset counts for open file. */ 290 /* Reset counts for open file. */
254 gcov_info_reset(info); 291 gcov_info_reset(info);
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info,
378 INIT_LIST_HEAD(&node->list); 415 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children); 416 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all); 417 INIT_LIST_HEAD(&node->all);
381 node->info = info; 418 if (node->loaded_info) {
419 node->loaded_info[0] = info;
420 node->num_loaded = 1;
421 }
382 node->parent = parent; 422 node->parent = parent;
383 if (name) 423 if (name)
384 strcpy(node->name, name); 424 strcpy(node->name, name);
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent,
394 struct gcov_node *node; 434 struct gcov_node *node;
395 435
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); 436 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) { 437 if (!node)
398 pr_warning("out of memory\n"); 438 goto err_nomem;
399 return NULL; 439 if (info) {
440 node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
441 GFP_KERNEL);
442 if (!node->loaded_info)
443 goto err_nomem;
400 } 444 }
401 init_node(node, info, name, parent); 445 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */ 446 /* Differentiate between gcov data file nodes and directory nodes. */
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent,
416 list_add(&node->all, &all_head); 460 list_add(&node->all, &all_head);
417 461
418 return node; 462 return node;
463
464err_nomem:
465 kfree(node);
466 pr_warning("out of memory\n");
467 return NULL;
419} 468}
420 469
421/* Remove symbolic links associated with node. */ 470/* Remove symbolic links associated with node. */
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node)
441 list_del(&node->all); 490 list_del(&node->all);
442 debugfs_remove(node->dentry); 491 debugfs_remove(node->dentry);
443 remove_links(node); 492 remove_links(node);
444 if (node->ghost) 493 kfree(node->loaded_info);
445 gcov_info_free(node->ghost); 494 if (node->unloaded_info)
495 gcov_info_free(node->unloaded_info);
446 kfree(node); 496 kfree(node);
447} 497}
448 498
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent,
477 527
478/* 528/*
479 * write() implementation for reset file. Reset all profiling data to zero 529 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes. 530 * and remove nodes for which all associated object files are unloaded.
481 */ 531 */
482static ssize_t reset_write(struct file *file, const char __user *addr, 532static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos) 533 size_t len, loff_t *pos)
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr,
487 mutex_lock(&node_lock); 537 mutex_lock(&node_lock);
488restart: 538restart:
489 list_for_each_entry(node, &all_head, all) { 539 list_for_each_entry(node, &all_head, all) {
490 if (node->info) 540 if (node->num_loaded > 0)
491 gcov_info_reset(node->info); 541 reset_node(node);
492 else if (list_empty(&node->children)) { 542 else if (list_empty(&node->children)) {
493 remove_node(node); 543 remove_node(node);
494 /* Several nodes may have gone - restart loop. */ 544 /* Several nodes may have gone - restart loop. */
@@ -511,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
511static const struct file_operations gcov_reset_fops = { 561static const struct file_operations gcov_reset_fops = {
512 .write = reset_write, 562 .write = reset_write,
513 .read = reset_read, 563 .read = reset_read,
564 .llseek = noop_llseek,
514}; 565};
515 566
516/* 567/*
@@ -564,37 +615,115 @@ err_remove:
564} 615}
565 616
566/* 617/*
567 * The profiling data set associated with this node is being unloaded. Store a 618 * Associate a profiling data set with an existing node. Needs to be called
568 * copy of the profiling data and turn this node into a "ghost". 619 * with node_lock held.
569 */ 620 */
570static int ghost_node(struct gcov_node *node) 621static void add_info(struct gcov_node *node, struct gcov_info *info)
571{ 622{
572 node->ghost = gcov_info_dup(node->info); 623 struct gcov_info **loaded_info;
573 if (!node->ghost) { 624 int num = node->num_loaded;
574 pr_warning("could not save data for '%s' (out of memory)\n", 625
575 node->info->filename); 626 /*
576 return -ENOMEM; 627 * Prepare new array. This is done first to simplify cleanup in
628 * case the new data set is incompatible, the node only contains
629 * unloaded data sets and there's not enough memory for the array.
630 */
631 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
632 if (!loaded_info) {
633 pr_warning("could not add '%s' (out of memory)\n",
634 info->filename);
635 return;
636 }
637 memcpy(loaded_info, node->loaded_info,
638 num * sizeof(struct gcov_info *));
639 loaded_info[num] = info;
640 /* Check if the new data set is compatible. */
641 if (num == 0) {
642 /*
643 * A module was unloaded, modified and reloaded. The new
644 * data set replaces the copy of the last one.
645 */
646 if (!gcov_info_is_compatible(node->unloaded_info, info)) {
647 pr_warning("discarding saved data for %s "
648 "(incompatible version)\n", info->filename);
649 gcov_info_free(node->unloaded_info);
650 node->unloaded_info = NULL;
651 }
652 } else {
653 /*
654 * Two different versions of the same object file are loaded.
655 * The initial one takes precedence.
656 */
657 if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
658 pr_warning("could not add '%s' (incompatible "
659 "version)\n", info->filename);
660 kfree(loaded_info);
661 return;
662 }
577 } 663 }
578 node->info = NULL; 664 /* Overwrite previous array. */
665 kfree(node->loaded_info);
666 node->loaded_info = loaded_info;
667 node->num_loaded = num + 1;
668}
579 669
580 return 0; 670/*
671 * Return the index of a profiling data set associated with a node.
672 */
673static int get_info_index(struct gcov_node *node, struct gcov_info *info)
674{
675 int i;
676
677 for (i = 0; i < node->num_loaded; i++) {
678 if (node->loaded_info[i] == info)
679 return i;
680 }
681 return -ENOENT;
581} 682}
582 683
583/* 684/*
584 * Profiling data for this node has been loaded again. Add profiling data 685 * Save the data of a profiling data set which is being unloaded.
585 * from previous instantiation and turn this node into a regular node.
586 */ 686 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info) 687static void save_info(struct gcov_node *node, struct gcov_info *info)
588{ 688{
589 if (gcov_info_is_compatible(node->ghost, info)) 689 if (node->unloaded_info)
590 gcov_info_add(info, node->ghost); 690 gcov_info_add(node->unloaded_info, info);
591 else { 691 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n", 692 node->unloaded_info = gcov_info_dup(info);
693 if (!node->unloaded_info) {
694 pr_warning("could not save data for '%s' "
695 "(out of memory)\n", info->filename);
696 }
697 }
698}
699
700/*
701 * Disassociate a profiling data set from a node. Needs to be called with
702 * node_lock held.
703 */
704static void remove_info(struct gcov_node *node, struct gcov_info *info)
705{
706 int i;
707
708 i = get_info_index(node, info);
709 if (i < 0) {
710 pr_warning("could not remove '%s' (not found)\n",
593 info->filename); 711 info->filename);
712 return;
594 } 713 }
595 gcov_info_free(node->ghost); 714 if (gcov_persist)
596 node->ghost = NULL; 715 save_info(node, info);
597 node->info = info; 716 /* Shrink array. */
717 node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
718 node->num_loaded--;
719 if (node->num_loaded > 0)
720 return;
721 /* Last loaded data set was removed. */
722 kfree(node->loaded_info);
723 node->loaded_info = NULL;
724 node->num_loaded = 0;
725 if (!node->unloaded_info)
726 remove_node(node);
598} 727}
599 728
600/* 729/*
@@ -609,30 +738,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
609 node = get_node_by_name(info->filename); 738 node = get_node_by_name(info->filename);
610 switch (action) { 739 switch (action) {
611 case GCOV_ADD: 740 case GCOV_ADD:
612 /* Add new node or revive ghost. */ 741 if (node)
613 if (!node) { 742 add_info(node, info);
743 else
614 add_node(info); 744 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break; 745 break;
624 case GCOV_REMOVE: 746 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */ 747 if (node)
626 if (!node) { 748 remove_info(node, info);
749 else {
627 pr_warning("could not remove '%s' (not found)\n", 750 pr_warning("could not remove '%s' (not found)\n",
628 info->filename); 751 info->filename);
629 break;
630 } 752 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break; 753 break;
637 } 754 }
638 mutex_unlock(&node_lock); 755 mutex_unlock(&node_lock);
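
The gcov/fs.c rework replaces the single info/ghost pair with an array of data sets for loaded object files plus one accumulated copy for unloaded ones. A compact restatement of the node states this implies; the helper name is purely hypothetical (the real code open-codes these checks in add_info() and remove_info()), and struct gcov_node is the file-local type shown above.

/*
 * Sketch of the node states implied by the new layout:
 *
 *   num_loaded > 0,  unloaded_info == NULL  only loaded object files
 *   num_loaded > 0,  unloaded_info != NULL  reloaded after an unload
 *                                           (gcov_persist=1 kept a copy)
 *   num_loaded == 0, unloaded_info != NULL  all objects unloaded, copy kept
 *   num_loaded == 0, unloaded_info == NULL  nothing left, node gets removed
 */
static bool gcov_node_has_data(const struct gcov_node *node)
{
        return node->num_loaded > 0 || node->unloaded_info != NULL;
}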
diff --git a/kernel/groups.c b/kernel/groups.c
index 53b1916c9492..253dc0f35cf4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid); 146 if (grp > GROUP_AT(group_info, mid))
147 if (cmp > 0)
148 left = mid + 1; 147 left = mid + 1;
149 else if (cmp < 0) 148 else if (grp < GROUP_AT(group_info, mid))
150 right = mid; 149 right = mid;
151 else 150 else
152 return 1; 151 return 1;
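
The groups.c hunk exists because gid_t is unsigned: subtracting two gids can wrap around instead of going negative, which mis-steers the binary search, so the comparison is done directly. A userspace illustration of the failure mode (not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned int grp = 0;                   /* gid being looked up        */
        unsigned int mid = 0x90000000u;         /* gid at the probe position  */
        int cmp = grp - mid;                    /* wraps to 0x70000000: positive */

        /* Old logic: cmp > 0 means "search the upper half", wrong here. */
        printf("cmp = %d, but grp > mid is %d\n", cmp, grp > mid);
        return 0;
}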
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..72206cf5c6cf 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
90 do { 90 do {
91 seq = read_seqbegin(&xtime_lock); 91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time(); 92 xts = __current_kernel_time();
93 tom = wall_to_monotonic; 93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq)); 94 } while (read_seqretry(&xtime_lock, seq));
95 95
96 xtim = timespec_to_ktime(xts); 96 xtim = timespec_to_ktime(xts);
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
144static int hrtimer_get_target(int this_cpu, int pinned) 144static int hrtimer_get_target(int this_cpu, int pinned)
145{ 145{
146#ifdef CONFIG_NO_HZ 146#ifdef CONFIG_NO_HZ
147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { 147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
148 int preferred_cpu = get_nohz_load_balancer(); 148 return get_nohz_timer_target();
149
150 if (preferred_cpu >= 0)
151 return preferred_cpu;
152 }
153#endif 149#endif
154 return this_cpu; 150 return this_cpu;
155} 151}
@@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
612static void retrigger_next_event(void *arg) 608static void retrigger_next_event(void *arg)
613{ 609{
614 struct hrtimer_cpu_base *base; 610 struct hrtimer_cpu_base *base;
615 struct timespec realtime_offset; 611 struct timespec realtime_offset, wtm;
616 unsigned long seq; 612 unsigned long seq;
617 613
618 if (!hrtimer_hres_active()) 614 if (!hrtimer_hres_active())
@@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg)
620 616
621 do { 617 do {
622 seq = read_seqbegin(&xtime_lock); 618 seq = read_seqbegin(&xtime_lock);
623 set_normalized_timespec(&realtime_offset, 619 wtm = __get_wall_to_monotonic();
624 -wall_to_monotonic.tv_sec,
625 -wall_to_monotonic.tv_nsec);
626 } while (read_seqretry(&xtime_lock, seq)); 620 } while (read_seqretry(&xtime_lock, seq));
621 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
627 622
628 base = &__get_cpu_var(hrtimer_bases); 623 base = &__get_cpu_var(hrtimer_bases);
629 624
@@ -936,6 +931,7 @@ static inline int
936remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) 931remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
937{ 932{
938 if (hrtimer_is_queued(timer)) { 933 if (hrtimer_is_queued(timer)) {
934 unsigned long state;
939 int reprogram; 935 int reprogram;
940 936
941 /* 937 /*
@@ -949,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
949 debug_deactivate(timer); 945 debug_deactivate(timer);
950 timer_stats_hrtimer_clear_start_info(timer); 946 timer_stats_hrtimer_clear_start_info(timer);
951 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 947 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
952 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 948 /*
953 reprogram); 949 * We must preserve the CALLBACK state flag here,
950 * otherwise we could move the timer base in
951 * switch_hrtimer_base.
952 */
953 state = timer->state & HRTIMER_STATE_CALLBACK;
954 __remove_hrtimer(timer, base, state, reprogram);
954 return 1; 955 return 1;
955 } 956 }
956 return 0; 957 return 0;
@@ -1096,11 +1097,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
1096 */ 1097 */
1097ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 1098ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1098{ 1099{
1099 struct hrtimer_clock_base *base;
1100 unsigned long flags; 1100 unsigned long flags;
1101 ktime_t rem; 1101 ktime_t rem;
1102 1102
1103 base = lock_hrtimer_base(timer, &flags); 1103 lock_hrtimer_base(timer, &flags);
1104 rem = hrtimer_expires_remaining(timer); 1104 rem = hrtimer_expires_remaining(timer);
1105 unlock_hrtimer_base(timer, &flags); 1105 unlock_hrtimer_base(timer, &flags);
1106 1106
@@ -1237,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1237 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1237 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1238 enqueue_hrtimer(timer, base); 1238 enqueue_hrtimer(timer, base);
1239 } 1239 }
1240
1241 WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
1242
1240 timer->state &= ~HRTIMER_STATE_CALLBACK; 1243 timer->state &= ~HRTIMER_STATE_CALLBACK;
1241} 1244}
1242 1245
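
Two patterns recur in the hrtimer.c hunks: xtime_lock readers now only snapshot wall_to_monotonic inside the seqlock retry loop and do the negation afterwards, and remove_hrtimer() preserves HRTIMER_STATE_CALLBACK so a running callback cannot have its base migrated underneath it. A minimal sketch of the first pattern; the function name is hypothetical.

#include <linux/seqlock.h>
#include <linux/time.h>

/* Hypothetical helper: copy the shared value inside the retry loop only. */
static struct timespec snapshot_negated(seqlock_t *lock,
                                        const struct timespec *shared_wtm)
{
        struct timespec wtm, offset;
        unsigned long seq;

        do {
                seq = read_seqbegin(lock);
                wtm = *shared_wtm;              /* raw copy, nothing else */
        } while (read_seqretry(lock, seq));

        /* Derived work happens after the loop, as in retrigger_next_event(). */
        set_normalized_timespec(&offset, -wtm.tv_sec, -wtm.tv_nsec);
        return offset;
}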
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac2..53ead174da2f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n"); 99 " disables this message.\n");
100 sched_show_task(t); 100 sched_show_task(t);
101 __debug_show_held_locks(t); 101 debug_show_held_locks(t);
102 102
103 touch_nmi_watchdog(); 103 touch_nmi_watchdog();
104 104
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
111 * periodically exit the critical section and enter a new one. 111 * periodically exit the critical section and enter a new one.
112 * 112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * exit the grace period. For classic RCU, a reschedule is required. 114 * to exit the grace period. For classic RCU, a reschedule is required.
115 */ 115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{ 117{
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 7a56b22e0602..2c9120f0afca 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -41,6 +41,7 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/list.h>
44#include <linux/cpu.h> 45#include <linux/cpu.h>
45#include <linux/smp.h> 46#include <linux/smp.h>
46 47
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62 63
63static int nr_slots[TYPE_MAX]; 64static int nr_slots[TYPE_MAX];
64 65
66/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head);
68
65static int constraints_initialized; 69static int constraints_initialized;
66 70
67/* Gather the number of total pinned and un-pinned bp in a cpuset */ 71/* Gather the number of total pinned and un-pinned bp in a cpuset */
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
103 return 0; 107 return 0;
104} 108}
105 109
106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) 110/*
111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list.
113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
107{ 115{
108 struct perf_event_context *ctx = tsk->perf_event_ctxp; 116 struct task_struct *tsk = bp->hw.bp_target;
109 struct list_head *list; 117 struct perf_event *iter;
110 struct perf_event *bp;
111 unsigned long flags;
112 int count = 0; 118 int count = 0;
113 119
114 if (WARN_ONCE(!ctx, "No perf context for this task")) 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
115 return 0; 121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
116 122 count += hw_breakpoint_weight(iter);
117 list = &ctx->event_list;
118
119 raw_spin_lock_irqsave(&ctx->lock, flags);
120
121 /*
122 * The current breakpoint counter is not included in the list
123 * at the open() callback time
124 */
125 list_for_each_entry(bp, list, event_entry) {
126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
129 } 123 }
130 124
131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
132
133 return count; 125 return count;
134} 126}
135 127
@@ -142,14 +134,14 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
142 enum bp_type_idx type) 134 enum bp_type_idx type)
143{ 135{
144 int cpu = bp->cpu; 136 int cpu = bp->cpu;
145 struct task_struct *tsk = bp->ctx->task; 137 struct task_struct *tsk = bp->hw.bp_target;
146 138
147 if (cpu >= 0) { 139 if (cpu >= 0) {
148 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); 140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
149 if (!tsk) 141 if (!tsk)
150 slots->pinned += max_task_bp_pinned(cpu, type); 142 slots->pinned += max_task_bp_pinned(cpu, type);
151 else 143 else
152 slots->pinned += task_bp_pinned(tsk, type); 144 slots->pinned += task_bp_pinned(bp, type);
153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
154 146
155 return; 147 return;
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
162 if (!tsk) 154 if (!tsk)
163 nr += max_task_bp_pinned(cpu, type); 155 nr += max_task_bp_pinned(cpu, type);
164 else 156 else
165 nr += task_bp_pinned(tsk, type); 157 nr += task_bp_pinned(bp, type);
166 158
167 if (nr > slots->pinned) 159 if (nr > slots->pinned)
168 slots->pinned = nr; 160 slots->pinned = nr;
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
188/* 180/*
189 * Add a pinned breakpoint for the given task in our constraint table 181 * Add a pinned breakpoint for the given task in our constraint table
190 */ 182 */
191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, 183static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
192 enum bp_type_idx type, int weight) 184 enum bp_type_idx type, int weight)
193{ 185{
194 unsigned int *tsk_pinned; 186 unsigned int *tsk_pinned;
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
196 int old_idx = 0; 188 int old_idx = 0;
197 int idx = 0; 189 int idx = 0;
198 190
199 old_count = task_bp_pinned(tsk, type); 191 old_count = task_bp_pinned(bp, type);
200 old_idx = old_count - 1; 192 old_idx = old_count - 1;
201 idx = old_idx + weight; 193 idx = old_idx + weight;
202 194
195 /* tsk_pinned[n] is the number of tasks having n breakpoints */
203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); 196 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
204 if (enable) { 197 if (enable) {
205 tsk_pinned[idx]++; 198 tsk_pinned[idx]++;
@@ -220,25 +213,43 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
220 int weight) 213 int weight)
221{ 214{
222 int cpu = bp->cpu; 215 int cpu = bp->cpu;
223 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->hw.bp_target;
217
218 /* Pinned counter cpu profiling */
219 if (!tsk) {
220
221 if (enable)
222 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
223 else
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
225 return;
226 }
224 227
225 /* Pinned counter task profiling */ 228 /* Pinned counter task profiling */
226 if (tsk) {
227 if (cpu >= 0) {
228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
229 return;
230 }
231 229
230 if (!enable)
231 list_del(&bp->hw.bp_list);
232
233 if (cpu >= 0) {
234 toggle_bp_task_slot(bp, cpu, enable, type, weight);
235 } else {
232 for_each_online_cpu(cpu) 236 for_each_online_cpu(cpu)
233 toggle_bp_task_slot(tsk, cpu, enable, type, weight); 237 toggle_bp_task_slot(bp, cpu, enable, type, weight);
234 return;
235 } 238 }
236 239
237 /* Pinned counter cpu profiling */
238 if (enable) 240 if (enable)
239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; 241 list_add_tail(&bp->hw.bp_list, &bp_task_head);
240 else 242}
241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; 243
244/*
245 * Function to perform processor-specific cleanup during unregistration
246 */
247__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
248{
249 /*
250 * A weak stub function here for those archs that don't define
251 * it inside arch/.../kernel/hw_breakpoint.c
252 */
242} 253}
243 254
244/* 255/*
@@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
301 weight = hw_breakpoint_weight(bp); 312 weight = hw_breakpoint_weight(bp);
302 313
303 fetch_bp_busy_slots(&slots, bp, type); 314 fetch_bp_busy_slots(&slots, bp, type);
315 /*
316 * Simulate the addition of this breakpoint to the constraints
317 * and see the result.
318 */
304 fetch_this_slot(&slots, weight); 319 fetch_this_slot(&slots, weight);
305 320
306 /* Flexible counters need to keep at least one slot */ 321 /* Flexible counters need to keep at least one slot */
@@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp)
339{ 354{
340 mutex_lock(&nr_bp_mutex); 355 mutex_lock(&nr_bp_mutex);
341 356
357 arch_unregister_hw_breakpoint(bp);
342 __release_bp_slot(bp); 358 __release_bp_slot(bp);
343 359
344 mutex_unlock(&nr_bp_mutex); 360 mutex_unlock(&nr_bp_mutex);
@@ -417,7 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
417 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
418 struct task_struct *tsk) 434 struct task_struct *tsk)
419{ 435{
420 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
421} 437}
422EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
423 439
@@ -499,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
499 get_online_cpus(); 515 get_online_cpus();
500 for_each_online_cpu(cpu) { 516 for_each_online_cpu(cpu) {
501 pevent = per_cpu_ptr(cpu_events, cpu); 517 pevent = per_cpu_ptr(cpu_events, cpu);
502 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
503 519
504 *pevent = bp; 520 *pevent = bp;
505 521
@@ -549,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
549 .priority = 0x7fffffff 565 .priority = 0x7fffffff
550}; 566};
551 567
568static void bp_perf_event_destroy(struct perf_event *event)
569{
570 release_bp_slot(event);
571}
572
573static int hw_breakpoint_event_init(struct perf_event *bp)
574{
575 int err;
576
577 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
578 return -ENOENT;
579
580 err = register_perf_hw_breakpoint(bp);
581 if (err)
582 return err;
583
584 bp->destroy = bp_perf_event_destroy;
585
586 return 0;
587}
588
589static int hw_breakpoint_add(struct perf_event *bp, int flags)
590{
591 if (!(flags & PERF_EF_START))
592 bp->hw.state = PERF_HES_STOPPED;
593
594 return arch_install_hw_breakpoint(bp);
595}
596
597static void hw_breakpoint_del(struct perf_event *bp, int flags)
598{
599 arch_uninstall_hw_breakpoint(bp);
600}
601
602static void hw_breakpoint_start(struct perf_event *bp, int flags)
603{
604 bp->hw.state = 0;
605}
606
607static void hw_breakpoint_stop(struct perf_event *bp, int flags)
608{
609 bp->hw.state = PERF_HES_STOPPED;
610}
611
612static struct pmu perf_breakpoint = {
613 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
614
615 .event_init = hw_breakpoint_event_init,
616 .add = hw_breakpoint_add,
617 .del = hw_breakpoint_del,
618 .start = hw_breakpoint_start,
619 .stop = hw_breakpoint_stop,
620 .read = hw_breakpoint_pmu_read,
621};
622
552static int __init init_hw_breakpoint(void) 623static int __init init_hw_breakpoint(void)
553{ 624{
554 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
@@ -570,6 +641,8 @@ static int __init init_hw_breakpoint(void)
570 641
571 constraints_initialized = 1; 642 constraints_initialized = 1;
572 643
644 perf_pmu_register(&perf_breakpoint);
645
573 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
574 647
575 err_alloc: 648 err_alloc:
@@ -585,8 +658,3 @@ static int __init init_hw_breakpoint(void)
585core_initcall(init_hw_breakpoint); 658core_initcall(init_hw_breakpoint);
586 659
587 660
588struct pmu perf_ops_bp = {
589 .enable = arch_install_hw_breakpoint,
590 .disable = arch_uninstall_hw_breakpoint,
591 .read = hw_breakpoint_pmu_read,
592};
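
With the hw_breakpoint.c changes, breakpoints are exposed through a regular struct pmu registered via perf_pmu_register(), and kernel callers pass a task_struct pointer (or NULL for a CPU-wide counter) instead of a pid. A usage sketch under those assumptions; the helper name is invented, and the attr setup mirrors register_user_hw_breakpoint() above.

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>

/* Hypothetical caller: watch a 4-byte data write in a given task. */
static struct perf_event *watch_task_write(struct task_struct *tsk,
                                           unsigned long addr,
                                           perf_overflow_handler_t handler)
{
        struct perf_event_attr attr;

        hw_breakpoint_init(&attr);              /* PERF_TYPE_BREAKPOINT defaults */
        attr.bp_addr = addr;
        attr.bp_len  = HW_BREAKPOINT_LEN_4;
        attr.bp_type = HW_BREAKPOINT_W;

        /* cpu == -1: follow the task; tsk == NULL would mean a per-CPU counter */
        return perf_event_create_kernel_counter(&attr, -1, tsk, handler);
}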
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 000000000000..31d766bf5d2e
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,53 @@
1config HAVE_GENERIC_HARDIRQS
2 def_bool n
3
4if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem"
6#
7# Interrupt subsystem related configuration options
8#
9config GENERIC_HARDIRQS
10 def_bool y
11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n
18
19# Options selectable by the architecture code
20config HAVE_SPARSE_IRQ
21 def_bool n
22
23config GENERIC_IRQ_PROBE
24 def_bool n
25
26config GENERIC_PENDING_IRQ
27 def_bool n
28
29config AUTO_IRQ_AFFINITY
30 def_bool n
31
32config IRQ_PER_CPU
33 def_bool n
34
35config HARDIRQS_SW_RESEND
36 def_bool n
37
38config SPARSE_IRQ
39 bool "Support sparse irq numbering"
40 depends on HAVE_SPARSE_IRQ
41 ---help---
42
43 Sparse irq numbering is useful for distro kernels that want
44 to define a high CONFIG_NR_CPUS value but still want to have
45 low kernel memory footprint on smaller machines.
46
47 ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
48 out the interrupt descriptors in a more NUMA-friendly way. )
49
50 If you don't know what to do here, say N.
51
52endmenu
53endif
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d047808419d..54329cd7b3ee 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,6 @@
1 1
2obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 6obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 2295a31ef110..505798f86c36 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void)
57 * Some chips need to know about probing in 57 * Some chips need to know about probing in
58 * progress: 58 * progress:
59 */ 59 */
60 if (desc->chip->set_type) 60 if (desc->irq_data.chip->irq_set_type)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 desc->chip->startup(i); 62 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data);
63 } 64 }
64 raw_spin_unlock_irq(&desc->lock); 65 raw_spin_unlock_irq(&desc->lock);
65 } 66 }
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void)
76 raw_spin_lock_irq(&desc->lock); 77 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 80 if (desc->irq_data.chip->irq_startup(&desc->irq_data))
80 desc->status |= IRQ_PENDING; 81 desc->status |= IRQ_PENDING;
81 } 82 }
82 raw_spin_unlock_irq(&desc->lock); 83 raw_spin_unlock_irq(&desc->lock);
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void)
98 /* It triggered already - consider it spurious. */ 99 /* It triggered already - consider it spurious. */
99 if (!(status & IRQ_WAITING)) { 100 if (!(status & IRQ_WAITING)) {
100 desc->status = status & ~IRQ_AUTODETECT; 101 desc->status = status & ~IRQ_AUTODETECT;
101 desc->chip->shutdown(i); 102 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
102 } else 103 } else
103 if (i < 32) 104 if (i < 32)
104 mask |= 1 << i; 105 mask |= 1 << i;
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val)
137 mask |= 1 << i; 138 mask |= 1 << i;
138 139
139 desc->status = status & ~IRQ_AUTODETECT; 140 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 141 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
141 } 142 }
142 raw_spin_unlock_irq(&desc->lock); 143 raw_spin_unlock_irq(&desc->lock);
143 } 144 }
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val)
181 nr_of_irqs++; 182 nr_of_irqs++;
182 } 183 }
183 desc->status = status & ~IRQ_AUTODETECT; 184 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 185 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
185 } 186 }
186 raw_spin_unlock_irq(&desc->lock); 187 raw_spin_unlock_irq(&desc->lock);
187 } 188 }
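
The autoprobe.c hunk above and the chip.c changes below move every irqchip callback from an irq-number signature to a struct irq_data one; drivers recover the irq number and their private data from the irq_data instead of looking them up again. A minimal sketch of a converted callback, with an invented driver and register layout.

#include <linux/io.h>
#include <linux/irq.h>

/* Invented per-chip state; registered earlier via set_irq_chip_data(). */
struct demo_chip_priv {
        void __iomem    *mask_reg;
        u32             bit;
};

/* New style: the callback receives a struct irq_data, not an irq number. */
static void demo_irq_mask(struct irq_data *d)
{
        struct demo_chip_priv *p = d->chip_data;  /* was get_irq_chip_data(irq) */

        writel(p->bit, p->mask_reg);
}

static struct irq_chip demo_chip = {
        .name           = "demo",
        .irq_mask       = demo_irq_mask,
};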
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7091d5ca2f8..baa5c4acad83 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,108 +18,6 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22{
23 struct irq_desc *desc;
24 unsigned long flags;
25
26 desc = irq_to_desc(irq);
27 if (!desc) {
28 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
29 return;
30 }
31
32 /* Ensure we don't have left over values from a previous use of this irq */
33 raw_spin_lock_irqsave(&desc->lock, flags);
34 desc->status = IRQ_DISABLED;
35 desc->chip = &no_irq_chip;
36 desc->handle_irq = handle_bad_irq;
37 desc->depth = 1;
38 desc->msi_desc = NULL;
39 desc->handler_data = NULL;
40 if (!keep_chip_data)
41 desc->chip_data = NULL;
42 desc->action = NULL;
43 desc->irq_count = 0;
44 desc->irqs_unhandled = 0;
45#ifdef CONFIG_SMP
46 cpumask_setall(desc->affinity);
47#ifdef CONFIG_GENERIC_PENDING_IRQ
48 cpumask_clear(desc->pending_mask);
49#endif
50#endif
51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52}
53
54/**
55 * dynamic_irq_init - initialize a dynamically allocated irq
56 * @irq: irq number to initialize
57 */
58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
75{
76 struct irq_desc *desc = irq_to_desc(irq);
77 unsigned long flags;
78
79 if (!desc) {
80 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
81 return;
82 }
83
84 raw_spin_lock_irqsave(&desc->lock, flags);
85 if (desc->action) {
86 raw_spin_unlock_irqrestore(&desc->lock, flags);
87 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
88 irq);
89 return;
90 }
91 desc->msi_desc = NULL;
92 desc->handler_data = NULL;
93 if (!keep_chip_data)
94 desc->chip_data = NULL;
95 desc->handle_irq = handle_bad_irq;
96 desc->chip = &no_irq_chip;
97 desc->name = NULL;
98 clear_kstat_irqs(desc);
99 raw_spin_unlock_irqrestore(&desc->lock, flags);
100}
101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
122
123/** 21/**
124 * set_irq_chip - set the irq chip for an irq 22 * set_irq_chip - set the irq chip for an irq
125 * @irq: irq number 23 * @irq: irq number
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
140 38
141 raw_spin_lock_irqsave(&desc->lock, flags); 39 raw_spin_lock_irqsave(&desc->lock, flags);
142 irq_chip_set_defaults(chip); 40 irq_chip_set_defaults(chip);
143 desc->chip = chip; 41 desc->irq_data.chip = chip;
144 raw_spin_unlock_irqrestore(&desc->lock, flags); 42 raw_spin_unlock_irqrestore(&desc->lock, flags);
145 43
146 return 0; 44 return 0;
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data)
193 } 91 }
194 92
195 raw_spin_lock_irqsave(&desc->lock, flags); 93 raw_spin_lock_irqsave(&desc->lock, flags);
196 desc->handler_data = data; 94 desc->irq_data.handler_data = data;
197 raw_spin_unlock_irqrestore(&desc->lock, flags); 95 raw_spin_unlock_irqrestore(&desc->lock, flags);
198 return 0; 96 return 0;
199} 97}
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
218 } 116 }
219 117
220 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
221 desc->msi_desc = entry; 119 desc->irq_data.msi_desc = entry;
222 if (entry) 120 if (entry)
223 entry->irq = irq; 121 entry->irq = irq;
224 raw_spin_unlock_irqrestore(&desc->lock, flags); 122 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data)
243 return -EINVAL; 141 return -EINVAL;
244 } 142 }
245 143
246 if (!desc->chip) { 144 if (!desc->irq_data.chip) {
247 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); 145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
248 return -EINVAL; 146 return -EINVAL;
249 } 147 }
250 148
251 raw_spin_lock_irqsave(&desc->lock, flags); 149 raw_spin_lock_irqsave(&desc->lock, flags);
252 desc->chip_data = data; 150 desc->irq_data.chip_data = data;
253 raw_spin_unlock_irqrestore(&desc->lock, flags); 151 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 152
255 return 0; 153 return 0;
256} 154}
257EXPORT_SYMBOL(set_irq_chip_data); 155EXPORT_SYMBOL(set_irq_chip_data);
258 156
157struct irq_data *irq_get_irq_data(unsigned int irq)
158{
159 struct irq_desc *desc = irq_to_desc(irq);
160
161 return desc ? &desc->irq_data : NULL;
162}
163EXPORT_SYMBOL_GPL(irq_get_irq_data);
164
259/** 165/**
260 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq 166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
261 * 167 *
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread);
287/* 193/*
288 * default enable function 194 * default enable function
289 */ 195 */
290static void default_enable(unsigned int irq) 196static void default_enable(struct irq_data *data)
291{ 197{
292 struct irq_desc *desc = irq_to_desc(irq); 198 struct irq_desc *desc = irq_data_to_desc(data);
293 199
294 desc->chip->unmask(irq); 200 desc->irq_data.chip->irq_unmask(&desc->irq_data);
295 desc->status &= ~IRQ_MASKED; 201 desc->status &= ~IRQ_MASKED;
296} 202}
297 203
298/* 204/*
299 * default disable function 205 * default disable function
300 */ 206 */
301static void default_disable(unsigned int irq) 207static void default_disable(struct irq_data *data)
302{ 208{
303} 209}
304 210
305/* 211/*
306 * default startup function 212 * default startup function
307 */ 213 */
308static unsigned int default_startup(unsigned int irq) 214static unsigned int default_startup(struct irq_data *data)
309{ 215{
310 struct irq_desc *desc = irq_to_desc(irq); 216 struct irq_desc *desc = irq_data_to_desc(data);
311 217
312 desc->chip->enable(irq); 218 desc->irq_data.chip->irq_enable(data);
313 return 0; 219 return 0;
314} 220}
315 221
316/* 222/*
317 * default shutdown function 223 * default shutdown function
318 */ 224 */
319static void default_shutdown(unsigned int irq) 225static void default_shutdown(struct irq_data *data)
320{ 226{
321 struct irq_desc *desc = irq_to_desc(irq); 227 struct irq_desc *desc = irq_data_to_desc(data);
322 228
323 desc->chip->mask(irq); 229 desc->irq_data.chip->irq_mask(&desc->irq_data);
324 desc->status |= IRQ_MASKED; 230 desc->status |= IRQ_MASKED;
325} 231}
326 232
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
234/* Temporary migration helpers */
235static void compat_irq_mask(struct irq_data *data)
236{
237 data->chip->mask(data->irq);
238}
239
240static void compat_irq_unmask(struct irq_data *data)
241{
242 data->chip->unmask(data->irq);
243}
244
245static void compat_irq_ack(struct irq_data *data)
246{
247 data->chip->ack(data->irq);
248}
249
250static void compat_irq_mask_ack(struct irq_data *data)
251{
252 data->chip->mask_ack(data->irq);
253}
254
255static void compat_irq_eoi(struct irq_data *data)
256{
257 data->chip->eoi(data->irq);
258}
259
260static void compat_irq_enable(struct irq_data *data)
261{
262 data->chip->enable(data->irq);
263}
264
265static void compat_irq_disable(struct irq_data *data)
266{
267 data->chip->disable(data->irq);
268}
269
270static void compat_irq_shutdown(struct irq_data *data)
271{
272 data->chip->shutdown(data->irq);
273}
274
275static unsigned int compat_irq_startup(struct irq_data *data)
276{
277 return data->chip->startup(data->irq);
278}
279
280static int compat_irq_set_affinity(struct irq_data *data,
281 const struct cpumask *dest, bool force)
282{
283 return data->chip->set_affinity(data->irq, dest);
284}
285
286static int compat_irq_set_type(struct irq_data *data, unsigned int type)
287{
288 return data->chip->set_type(data->irq, type);
289}
290
291static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
292{
293 return data->chip->set_wake(data->irq, on);
294}
295
296static int compat_irq_retrigger(struct irq_data *data)
297{
298 return data->chip->retrigger(data->irq);
299}
300
301static void compat_bus_lock(struct irq_data *data)
302{
303 data->chip->bus_lock(data->irq);
304}
305
306static void compat_bus_sync_unlock(struct irq_data *data)
307{
308 data->chip->bus_sync_unlock(data->irq);
309}
310#endif
311
327/* 312/*
328 * Fixup enable/disable function pointers 313 * Fixup enable/disable function pointers
329 */ 314 */
330void irq_chip_set_defaults(struct irq_chip *chip) 315void irq_chip_set_defaults(struct irq_chip *chip)
331{ 316{
332 if (!chip->enable) 317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
333 chip->enable = default_enable;
334 if (!chip->disable)
335 chip->disable = default_disable;
336 if (!chip->startup)
337 chip->startup = default_startup;
338 /* 318 /*
339 * We use chip->disable, when the user provided its own. When 319 * Compat fixup functions need to be before we set the
340 * we have default_disable set for chip->disable, then we need 320 * defaults for enable/disable/startup/shutdown
321 */
322 if (chip->enable)
323 chip->irq_enable = compat_irq_enable;
324 if (chip->disable)
325 chip->irq_disable = compat_irq_disable;
326 if (chip->shutdown)
327 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup)
329 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
341 * to use default_shutdown, otherwise the irq line is not 343 * to use default_shutdown, otherwise the irq line is not
342 * disabled on free_irq(): 344 * disabled on free_irq():
343 */ 345 */
344 if (!chip->shutdown) 346 if (!chip->irq_shutdown)
345 chip->shutdown = chip->disable != default_disable ? 347 chip->irq_shutdown = chip->irq_disable != default_disable ?
346 chip->disable : default_shutdown; 348 chip->irq_disable : default_shutdown;
347 if (!chip->name) 349
348 chip->name = chip->typename; 350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
349 if (!chip->end) 351 if (!chip->end)
350 chip->end = dummy_irq_chip.end; 352 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock)
360 chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
361 if (chip->mask)
362 chip->irq_mask = compat_irq_mask;
363 if (chip->unmask)
364 chip->irq_unmask = compat_irq_unmask;
365 if (chip->ack)
366 chip->irq_ack = compat_irq_ack;
367 if (chip->mask_ack)
368 chip->irq_mask_ack = compat_irq_mask_ack;
369 if (chip->eoi)
370 chip->irq_eoi = compat_irq_eoi;
371 if (chip->set_affinity)
372 chip->irq_set_affinity = compat_irq_set_affinity;
373 if (chip->set_type)
374 chip->irq_set_type = compat_irq_set_type;
375 if (chip->set_wake)
376 chip->irq_set_wake = compat_irq_set_wake;
377 if (chip->retrigger)
378 chip->irq_retrigger = compat_irq_retrigger;
379#endif
351} 380}
352 381
353static inline void mask_ack_irq(struct irq_desc *desc, int irq) 382static inline void mask_ack_irq(struct irq_desc *desc)
354{ 383{
355 if (desc->chip->mask_ack) 384 if (desc->irq_data.chip->irq_mask_ack)
356 desc->chip->mask_ack(irq); 385 desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
357 else { 386 else {
358 desc->chip->mask(irq); 387 desc->irq_data.chip->irq_mask(&desc->irq_data);
359 if (desc->chip->ack) 388 if (desc->irq_data.chip->irq_ack)
360 desc->chip->ack(irq); 389 desc->irq_data.chip->irq_ack(&desc->irq_data);
361 } 390 }
362 desc->status |= IRQ_MASKED; 391 desc->status |= IRQ_MASKED;
363} 392}
364 393
365static inline void mask_irq(struct irq_desc *desc, int irq) 394static inline void mask_irq(struct irq_desc *desc)
366{ 395{
367 if (desc->chip->mask) { 396 if (desc->irq_data.chip->irq_mask) {
368 desc->chip->mask(irq); 397 desc->irq_data.chip->irq_mask(&desc->irq_data);
369 desc->status |= IRQ_MASKED; 398 desc->status |= IRQ_MASKED;
370 } 399 }
371} 400}
372 401
373static inline void unmask_irq(struct irq_desc *desc, int irq) 402static inline void unmask_irq(struct irq_desc *desc)
374{ 403{
375 if (desc->chip->unmask) { 404 if (desc->irq_data.chip->irq_unmask) {
376 desc->chip->unmask(irq); 405 desc->irq_data.chip->irq_unmask(&desc->irq_data);
377 desc->status &= ~IRQ_MASKED; 406 desc->status &= ~IRQ_MASKED;
378 } 407 }
379} 408}
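The compat_irq_* trampolines above only bridge chips that still provide the old unsigned-int callbacks; converted chips implement the irq_data based methods directly and pull their private state out of irq_data. A hedged sketch of such a converted chip (register layout, names and the BIT() mapping are invented, not taken from the patch):

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/irq.h>

struct my_pic {				/* hypothetical controller */
	void __iomem *mask_set;		/* write 1 to mask a source */
	void __iomem *mask_clr;		/* write 1 to unmask a source */
	unsigned int irq_base;
};

static void my_pic_irq_mask(struct irq_data *d)
{
	struct my_pic *pic = d->chip_data;	/* stored via set_irq_chip_data() */

	writel(BIT(d->irq - pic->irq_base), pic->mask_set);
}

static void my_pic_irq_unmask(struct irq_data *d)
{
	struct my_pic *pic = d->chip_data;

	writel(BIT(d->irq - pic->irq_base), pic->mask_clr);
}

static struct irq_chip my_pic_chip = {
	.name		= "my-pic",
	.irq_mask	= my_pic_irq_mask,
	.irq_unmask	= my_pic_irq_unmask,
};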
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
476 irqreturn_t action_ret; 505 irqreturn_t action_ret;
477 506
478 raw_spin_lock(&desc->lock); 507 raw_spin_lock(&desc->lock);
479 mask_ack_irq(desc, irq); 508 mask_ack_irq(desc);
480 509
481 if (unlikely(desc->status & IRQ_INPROGRESS)) 510 if (unlikely(desc->status & IRQ_INPROGRESS))
482 goto out_unlock; 511 goto out_unlock;
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
502 desc->status &= ~IRQ_INPROGRESS; 531 desc->status &= ~IRQ_INPROGRESS;
503 532
504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
505 unmask_irq(desc, irq); 534 unmask_irq(desc);
506out_unlock: 535out_unlock:
507 raw_spin_unlock(&desc->lock); 536 raw_spin_unlock(&desc->lock);
508} 537}
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
539 action = desc->action; 568 action = desc->action;
540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
541 desc->status |= IRQ_PENDING; 570 desc->status |= IRQ_PENDING;
542 mask_irq(desc, irq); 571 mask_irq(desc);
543 goto out; 572 goto out;
544 } 573 }
545 574
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
554 raw_spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
555 desc->status &= ~IRQ_INPROGRESS; 584 desc->status &= ~IRQ_INPROGRESS;
556out: 585out:
557 desc->chip->eoi(irq); 586 desc->irq_data.chip->irq_eoi(&desc->irq_data);
558 587
559 raw_spin_unlock(&desc->lock); 588 raw_spin_unlock(&desc->lock);
560} 589}
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
590 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
591 !desc->action)) { 620 !desc->action)) {
592 desc->status |= (IRQ_PENDING | IRQ_MASKED); 621 desc->status |= (IRQ_PENDING | IRQ_MASKED);
593 mask_ack_irq(desc, irq); 622 mask_ack_irq(desc);
594 goto out_unlock; 623 goto out_unlock;
595 } 624 }
596 kstat_incr_irqs_this_cpu(irq, desc); 625 kstat_incr_irqs_this_cpu(irq, desc);
597 626
598 /* Start handling the irq */ 627 /* Start handling the irq */
599 if (desc->chip->ack) 628 desc->irq_data.chip->irq_ack(&desc->irq_data);
600 desc->chip->ack(irq);
601 629
602 /* Mark the IRQ currently in progress.*/ 630 /* Mark the IRQ currently in progress.*/
603 desc->status |= IRQ_INPROGRESS; 631 desc->status |= IRQ_INPROGRESS;
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
607 irqreturn_t action_ret; 635 irqreturn_t action_ret;
608 636
609 if (unlikely(!action)) { 637 if (unlikely(!action)) {
610 mask_irq(desc, irq); 638 mask_irq(desc);
611 goto out_unlock; 639 goto out_unlock;
612 } 640 }
613 641
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
619 if (unlikely((desc->status & 647 if (unlikely((desc->status &
620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
621 (IRQ_PENDING | IRQ_MASKED))) { 649 (IRQ_PENDING | IRQ_MASKED))) {
622 unmask_irq(desc, irq); 650 unmask_irq(desc);
623 } 651 }
624 652
625 desc->status &= ~IRQ_PENDING; 653 desc->status &= ~IRQ_PENDING;
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
650 678
651 kstat_incr_irqs_this_cpu(irq, desc); 679 kstat_incr_irqs_this_cpu(irq, desc);
652 680
653 if (desc->chip->ack) 681 if (desc->irq_data.chip->irq_ack)
654 desc->chip->ack(irq); 682 desc->irq_data.chip->irq_ack(&desc->irq_data);
655 683
656 action_ret = handle_IRQ_event(irq, desc->action); 684 action_ret = handle_IRQ_event(irq, desc->action);
657 if (!noirqdebug) 685 if (!noirqdebug)
658 note_interrupt(irq, desc, action_ret); 686 note_interrupt(irq, desc, action_ret);
659 687
660 if (desc->chip->eoi) 688 if (desc->irq_data.chip->irq_eoi)
661 desc->chip->eoi(irq); 689 desc->irq_data.chip->irq_eoi(&desc->irq_data);
662} 690}
663 691
664void 692void
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
676 704
677 if (!handle) 705 if (!handle)
678 handle = handle_bad_irq; 706 handle = handle_bad_irq;
679 else if (desc->chip == &no_irq_chip) { 707 else if (desc->irq_data.chip == &no_irq_chip) {
680 printk(KERN_WARNING "Trying to install %sinterrupt handler " 708 printk(KERN_WARNING "Trying to install %sinterrupt handler "
681 "for IRQ%d\n", is_chained ? "chained " : "", irq); 709 "for IRQ%d\n", is_chained ? "chained " : "", irq);
682 /* 710 /*
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
686 * prevent us to setup the interrupt at all. Switch it to 714 * prevent us to setup the interrupt at all. Switch it to
687 * dummy_irq_chip for easy transition. 715 * dummy_irq_chip for easy transition.
688 */ 716 */
689 desc->chip = &dummy_irq_chip; 717 desc->irq_data.chip = &dummy_irq_chip;
690 } 718 }
691 719
692 chip_bus_lock(irq, desc); 720 chip_bus_lock(desc);
693 raw_spin_lock_irqsave(&desc->lock, flags); 721 raw_spin_lock_irqsave(&desc->lock, flags);
694 722
695 /* Uninstall? */ 723 /* Uninstall? */
696 if (handle == handle_bad_irq) { 724 if (handle == handle_bad_irq) {
697 if (desc->chip != &no_irq_chip) 725 if (desc->irq_data.chip != &no_irq_chip)
698 mask_ack_irq(desc, irq); 726 mask_ack_irq(desc);
699 desc->status |= IRQ_DISABLED; 727 desc->status |= IRQ_DISABLED;
700 desc->depth = 1; 728 desc->depth = 1;
701 } 729 }
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
706 desc->status &= ~IRQ_DISABLED; 734 desc->status &= ~IRQ_DISABLED;
707 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
708 desc->depth = 0; 736 desc->depth = 0;
709 desc->chip->startup(irq); 737 desc->irq_data.chip->irq_startup(&desc->irq_data);
710 } 738 }
711 raw_spin_unlock_irqrestore(&desc->lock, flags); 739 raw_spin_unlock_irqrestore(&desc->lock, flags);
712 chip_bus_sync_unlock(irq, desc); 740 chip_bus_sync_unlock(desc);
713} 741}
714EXPORT_SYMBOL_GPL(__set_irq_handler); 742EXPORT_SYMBOL_GPL(__set_irq_handler);
715 743
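For completeness, a hedged sketch of how a board file would wire such a converted chip to a line through the interfaces touched above; my_pic_chip is the hypothetical chip from the previous sketch:

#include <linux/irq.h>

extern struct irq_chip my_pic_chip;	/* hypothetical, see earlier sketch */

static void my_pic_map_line(unsigned int irq)
{
	set_irq_chip_and_handler_name(irq, &my_pic_chip, handle_level_irq,
				      "level");
}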
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
729 __set_irq_handler(irq, handle, 0, name); 757 __set_irq_handler(irq, handle, 0, name);
730} 758}
731 759
732void set_irq_noprobe(unsigned int irq) 760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
733{ 761{
734 struct irq_desc *desc = irq_to_desc(irq); 762 struct irq_desc *desc = irq_to_desc(irq);
735 unsigned long flags; 763 unsigned long flags;
736 764
737 if (!desc) { 765 if (!desc)
738 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
739 return; 766 return;
740 }
741
742 raw_spin_lock_irqsave(&desc->lock, flags);
743 desc->status |= IRQ_NOPROBE;
744 raw_spin_unlock_irqrestore(&desc->lock, flags);
745}
746
747void set_irq_probe(unsigned int irq)
748{
749 struct irq_desc *desc = irq_to_desc(irq);
750 unsigned long flags;
751 767
752 if (!desc) { 768 /* Sanitize flags */
753 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 769 set &= IRQF_MODIFY_MASK;
754 return; 770 clr &= IRQF_MODIFY_MASK;
755 }
756 771
757 raw_spin_lock_irqsave(&desc->lock, flags); 772 raw_spin_lock_irqsave(&desc->lock, flags);
758 desc->status &= ~IRQ_NOPROBE; 773 desc->status &= ~clr;
774 desc->status |= set;
759 raw_spin_unlock_irqrestore(&desc->lock, flags); 775 raw_spin_unlock_irqrestore(&desc->lock, flags);
760} 776}
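With irq_modify_status() replacing the open-coded set_irq_noprobe()/set_irq_probe() above, the old entry points can survive as thin wrappers. Roughly (the real definitions live in include/linux/irq.h, outside this diff, and this assumes IRQ_NOPROBE is covered by IRQF_MODIFY_MASK):

static inline void set_irq_noprobe(unsigned int irq)
{
	irq_modify_status(irq, 0, IRQ_NOPROBE);
}

static inline void set_irq_probe(unsigned int irq)
{
	irq_modify_status(irq, IRQ_NOPROBE, 0);
}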
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 000000000000..20dc5474947e
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the dummy interrupt chip implementation
6 */
7#include <linux/interrupt.h>
8#include <linux/irq.h>
9
10#include "internals.h"
11
12/*
13 * What should we do if we get a hw irq event on an illegal vector?
14 * Each architecture has to answer this themself.
15 */
16static void ack_bad(struct irq_data *data)
17{
18 struct irq_desc *desc = irq_data_to_desc(data);
19
20 print_irq_desc(data->irq, desc);
21 ack_bad_irq(data->irq);
22}
23
24/*
25 * NOP functions
26 */
27static void noop(struct irq_data *data) { }
28
29static unsigned int noop_ret(struct irq_data *data)
30{
31 return 0;
32}
33
34#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
35static void compat_noop(unsigned int irq) { }
36#define END_INIT .end = compat_noop
37#else
38#define END_INIT
39#endif
40
41/*
42 * Generic no controller implementation
43 */
44struct irq_chip no_irq_chip = {
45 .name = "none",
46 .irq_startup = noop_ret,
47 .irq_shutdown = noop,
48 .irq_enable = noop,
49 .irq_disable = noop,
50 .irq_ack = ack_bad,
51 END_INIT
52};
53
54/*
55 * Generic dummy implementation which can be used for
56 * real dumb interrupt sources
57 */
58struct irq_chip dummy_irq_chip = {
59 .name = "dummy",
60 .irq_startup = noop_ret,
61 .irq_shutdown = noop,
62 .irq_enable = noop,
63 .irq_disable = noop,
64 .irq_ack = noop,
65 .irq_mask = noop,
66 .irq_unmask = noop,
67 END_INIT
68};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c6911223..e2347eb63306 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/sched.h>
18#include <linux/interrupt.h> 16#include <linux/interrupt.h>
19#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 18
21#include <linux/hash.h>
22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 19#include <trace/events/irq.h>
24 20
25#include "internals.h" 21#include "internals.h"
26 22
27/*
28 * lockdep: we want to handle all irq_desc locks as a single lock-class:
29 */
30struct lock_class_key irq_desc_lock_class;
31
32/** 23/**
33 * handle_bad_irq - handle spurious and unhandled irqs 24 * handle_bad_irq - handle spurious and unhandled irqs
34 * @irq: the interrupt number 25 * @irq: the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
43 ack_bad_irq(irq); 34 ack_bad_irq(irq);
44} 35}
45 36
46#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
47static void __init init_irq_default_affinity(void)
48{
49 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
50 cpumask_setall(irq_default_affinity);
51}
52#else
53static void __init init_irq_default_affinity(void)
54{
55}
56#endif
57
58/*
59 * Linux has a controller-independent interrupt architecture.
60 * Every controller has a 'controller-template', that is used
61 * by the main code to do the right thing. Each driver-visible
62 * interrupt source is transparently wired to the appropriate
63 * controller. Thus drivers need not be aware of the
64 * interrupt-controller.
65 *
66 * The code is designed to be easily extended with new/different
67 * interrupt controllers, without having to do assembly magic or
68 * having to touch the generic code.
69 *
70 * Controller mappings for all interrupt sources:
71 */
72int nr_irqs = NR_IRQS;
73EXPORT_SYMBOL_GPL(nr_irqs);
74
75#ifdef CONFIG_SPARSE_IRQ
76
77static struct irq_desc irq_desc_init = {
78 .irq = -1,
79 .status = IRQ_DISABLED,
80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq,
82 .depth = 1,
83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84};
85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{
88 void *ptr;
89
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92
93 /*
94 * don't overwite if can not get new one
95 * init_copy_kstat_irqs() could still use old one
96 */
97 if (ptr) {
98 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
99 desc->kstat_irqs = ptr;
100 }
101}
102
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106
107 raw_spin_lock_init(&desc->lock);
108 desc->irq = irq;
109#ifdef CONFIG_SMP
110 desc->node = node;
111#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1);
117 }
118 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1);
121 }
122 init_desc_masks(desc);
123 arch_init_chip_data(desc, node);
124}
125
126/*
127 * Protect the sparse_irqs:
128 */
129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
130
131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
151
152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
153 [0 ... NR_IRQS_LEGACY-1] = {
154 .irq = -1,
155 .status = IRQ_DISABLED,
156 .chip = &no_irq_chip,
157 .handle_irq = handle_bad_irq,
158 .depth = 1,
159 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
160 }
161};
162
163static unsigned int *kstat_irqs_legacy;
164
165int __init early_irq_init(void)
166{
167 struct irq_desc *desc;
168 int legacy_count;
169 int node;
170 int i;
171
172 init_irq_default_affinity();
173
174 /* initialize nr_irqs based on nr_cpu_ids */
175 arch_probe_nr_irqs();
176 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
177
178 desc = irq_desc_legacy;
179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
180 node = first_online_node;
181
182 /* allocate based on nr_cpu_ids */
183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
184 sizeof(int), GFP_NOWAIT, node);
185
186 for (i = 0; i < legacy_count; i++) {
187 desc[i].irq = i;
188#ifdef CONFIG_SMP
189 desc[i].node = node;
190#endif
191 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
193 alloc_desc_masks(&desc[i], node, true);
194 init_desc_masks(&desc[i]);
195 set_irq_desc(i, &desc[i]);
196 }
197
198 return arch_early_irq_init();
199}
200
201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
202{
203 struct irq_desc *desc;
204 unsigned long flags;
205
206 if (irq >= nr_irqs) {
207 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
208 irq, nr_irqs);
209 return NULL;
210 }
211
212 desc = irq_to_desc(irq);
213 if (desc)
214 return desc;
215
216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
217
218 /* We have to check it to avoid races with another CPU */
219 desc = irq_to_desc(irq);
220 if (desc)
221 goto out_unlock;
222
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224
225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
226 if (!desc) {
227 printk(KERN_ERR "can not alloc irq_desc\n");
228 BUG_ON(1);
229 }
230 init_one_irq_desc(irq, desc, node);
231
232 set_irq_desc(irq, desc);
233
234out_unlock:
235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
236
237 return desc;
238}
239
240#else /* !CONFIG_SPARSE_IRQ */
241
242struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
243 [0 ... NR_IRQS-1] = {
244 .status = IRQ_DISABLED,
245 .chip = &no_irq_chip,
246 .handle_irq = handle_bad_irq,
247 .depth = 1,
248 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
249 }
250};
251
252static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
253int __init early_irq_init(void)
254{
255 struct irq_desc *desc;
256 int count;
257 int i;
258
259 init_irq_default_affinity();
260
261 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
262
263 desc = irq_desc;
264 count = ARRAY_SIZE(irq_desc);
265
266 for (i = 0; i < count; i++) {
267 desc[i].irq = i;
268 alloc_desc_masks(&desc[i], 0, true);
269 init_desc_masks(&desc[i]);
270 desc[i].kstat_irqs = kstat_irqs_all[i];
271 }
272 return arch_early_irq_init();
273}
274
275struct irq_desc *irq_to_desc(unsigned int irq)
276{
277 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
278}
279
280struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
281{
282 return irq_to_desc(irq);
283}
284#endif /* !CONFIG_SPARSE_IRQ */
285
286void clear_kstat_irqs(struct irq_desc *desc)
287{
288 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
289}
290
291/*
292 * What should we do if we get a hw irq event on an illegal vector?
293 * Each architecture has to answer this themself.
294 */
295static void ack_bad(unsigned int irq)
296{
297 struct irq_desc *desc = irq_to_desc(irq);
298
299 print_irq_desc(irq, desc);
300 ack_bad_irq(irq);
301}
302
303/*
304 * NOP functions
305 */
306static void noop(unsigned int irq)
307{
308}
309
310static unsigned int noop_ret(unsigned int irq)
311{
312 return 0;
313}
314
315/*
316 * Generic no controller implementation
317 */
318struct irq_chip no_irq_chip = {
319 .name = "none",
320 .startup = noop_ret,
321 .shutdown = noop,
322 .enable = noop,
323 .disable = noop,
324 .ack = ack_bad,
325 .end = noop,
326};
327
328/*
329 * Generic dummy implementation which can be used for
330 * real dumb interrupt sources
331 */
332struct irq_chip dummy_irq_chip = {
333 .name = "dummy",
334 .startup = noop_ret,
335 .shutdown = noop,
336 .enable = noop,
337 .disable = noop,
338 .ack = noop,
339 .mask = noop,
340 .unmask = noop,
341 .end = noop,
342};
343
344/* 37/*
345 * Special, empty irq handler: 38 * Special, empty irq handler:
346 */ 39 */
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq)
457 /* 150 /*
458 * No locking required for CPU-local interrupts: 151 * No locking required for CPU-local interrupts:
459 */ 152 */
460 if (desc->chip->ack) 153 if (desc->irq_data.chip->ack)
461 desc->chip->ack(irq); 154 desc->irq_data.chip->ack(irq);
462 if (likely(!(desc->status & IRQ_DISABLED))) { 155 if (likely(!(desc->status & IRQ_DISABLED))) {
463 action_ret = handle_IRQ_event(irq, desc->action); 156 action_ret = handle_IRQ_event(irq, desc->action);
464 if (!noirqdebug) 157 if (!noirqdebug)
465 note_interrupt(irq, desc, action_ret); 158 note_interrupt(irq, desc, action_ret);
466 } 159 }
467 desc->chip->end(irq); 160 desc->irq_data.chip->end(irq);
468 return 1; 161 return 1;
469 } 162 }
470 163
471 raw_spin_lock(&desc->lock); 164 raw_spin_lock(&desc->lock);
472 if (desc->chip->ack) 165 if (desc->irq_data.chip->ack)
473 desc->chip->ack(irq); 166 desc->irq_data.chip->ack(irq);
474 /* 167 /*
475 * REPLAY is when Linux resends an IRQ that was dropped earlier 168 * REPLAY is when Linux resends an IRQ that was dropped earlier
476 * WAITING is used by probe to mark irqs that are being tested 169 * WAITING is used by probe to mark irqs that are being tested
@@ -530,27 +223,9 @@ out:
530 * The ->end() handler has to deal with interrupts which got 223 * The ->end() handler has to deal with interrupts which got
531 * disabled while the handler was running. 224 * disabled while the handler was running.
532 */ 225 */
533 desc->chip->end(irq); 226 desc->irq_data.chip->end(irq);
534 raw_spin_unlock(&desc->lock); 227 raw_spin_unlock(&desc->lock);
535 228
536 return 1; 229 return 1;
537} 230}
538#endif 231#endif
539
540void early_init_irq_lock_class(void)
541{
542 struct irq_desc *desc;
543 int i;
544
545 for_each_irq_desc(i, desc) {
546 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
547 }
548}
549
550unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
551{
552 struct irq_desc *desc = irq_to_desc(irq);
553 return desc ? desc->kstat_irqs[cpu] : 0;
554}
555EXPORT_SYMBOL(kstat_irqs_cpu);
556
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c63f3bc88f0b..4571ae7e085a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,9 +1,12 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 */ 3 */
4#include <linux/irqdesc.h>
4 5
5extern int noirqdebug; 6extern int noirqdebug;
6 7
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
9
7/* Set default functions for irq_chip structures: */ 10/* Set default functions for irq_chip structures: */
8extern void irq_chip_set_defaults(struct irq_chip *chip); 11extern void irq_chip_set_defaults(struct irq_chip *chip);
9 12
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 18extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 19extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 20
18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 21extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23/* Resending of interrupts :*/
24void replace_irq_desc(unsigned int irq, struct irq_desc *desc); 24void check_irq_resend(struct irq_desc *desc, unsigned int irq);
25#endif
26 25
27#ifdef CONFIG_PROC_FS 26#ifdef CONFIG_PROC_FS
28extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 27extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
28extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
29extern void register_handler_proc(unsigned int irq, struct irqaction *action); 29extern void register_handler_proc(unsigned int irq, struct irqaction *action);
30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
31#else 31#else
32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } 32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void register_handler_proc(unsigned int irq, 34static inline void register_handler_proc(unsigned int irq,
34 struct irqaction *action) { } 35 struct irqaction *action) { }
35static inline void unregister_handler_proc(unsigned int irq, 36static inline void unregister_handler_proc(unsigned int irq,
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq);
40 41
41extern void irq_set_thread_affinity(struct irq_desc *desc); 42extern void irq_set_thread_affinity(struct irq_desc *desc);
42 43
44#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
45static inline void irq_end(unsigned int irq, struct irq_desc *desc)
46{
47 if (desc->irq_data.chip && desc->irq_data.chip->end)
48 desc->irq_data.chip->end(irq);
49}
50#else
51static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
52#endif
53
43/* Inline functions for support of irq chips on slow busses */ 54/* Inline functions for support of irq chips on slow busses */
44static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) 55static inline void chip_bus_lock(struct irq_desc *desc)
45{ 56{
46 if (unlikely(desc->chip->bus_lock)) 57 if (unlikely(desc->irq_data.chip->irq_bus_lock))
47 desc->chip->bus_lock(irq); 58 desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
48} 59}
49 60
50static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) 61static inline void chip_bus_sync_unlock(struct irq_desc *desc)
51{ 62{
52 if (unlikely(desc->chip->bus_sync_unlock)) 63 if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
53 desc->chip->bus_sync_unlock(irq); 64 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
54} 65}
55 66
56/* 67/*
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
67 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); 78 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
68 printk("->handle_irq(): %p, ", desc->handle_irq); 79 printk("->handle_irq(): %p, ", desc->handle_irq);
69 print_symbol("%s\n", (unsigned long)desc->handle_irq); 80 print_symbol("%s\n", (unsigned long)desc->handle_irq);
70 printk("->chip(): %p, ", desc->chip); 81 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
71 print_symbol("%s\n", (unsigned long)desc->chip); 82 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
72 printk("->action(): %p\n", desc->action); 83 printk("->action(): %p\n", desc->action);
73 if (desc->action) { 84 if (desc->action) {
74 printk("->action->handler(): %p, ", desc->action->handler); 85 printk("->action->handler(): %p, ", desc->action->handler);
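chip_bus_lock()/chip_bus_sync_unlock() now hand struct irq_data to the bus hooks as well. For context, a hedged sketch of the kind of slow-bus controller these hooks exist for: an imaginary I2C GPIO expander whose mask register can only be written from sleepable context, so irq_mask only updates a cache and the write is deferred to irq_bus_sync_unlock (device, register address and field names are made up):

#include <linux/bitops.h>
#include <linux/i2c.h>
#include <linux/irq.h>
#include <linux/mutex.h>

struct my_expander {			/* hypothetical I2C expander */
	struct i2c_client *client;
	struct mutex lock;
	u8 mask_cache;			/* shadow of the chip's mask register */
	unsigned int irq_base;
};

static void my_exp_irq_bus_lock(struct irq_data *d)
{
	struct my_expander *exp = d->chip_data;

	mutex_lock(&exp->lock);
}

static void my_exp_irq_bus_sync_unlock(struct irq_data *d)
{
	struct my_expander *exp = d->chip_data;

	/* Sleeping is fine here: we are not under desc->lock */
	i2c_smbus_write_byte_data(exp->client, 0x01 /* made-up reg */,
				  exp->mask_cache);
	mutex_unlock(&exp->lock);
}

static void my_exp_irq_mask(struct irq_data *d)
{
	struct my_expander *exp = d->chip_data;

	/* only update the cache; the write happens in bus_sync_unlock */
	exp->mask_cache |= BIT(d->irq - exp->irq_base);
}

static void my_exp_irq_unmask(struct irq_data *d)
{
	struct my_expander *exp = d->chip_data;

	exp->mask_cache &= ~BIT(d->irq - exp->irq_base);
}

static struct irq_chip my_exp_chip = {
	.name			= "my-expander",
	.irq_mask		= my_exp_irq_mask,
	.irq_unmask		= my_exp_irq_unmask,
	.irq_bus_lock		= my_exp_irq_bus_lock,
	.irq_bus_sync_unlock	= my_exp_irq_bus_sync_unlock,
};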
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 000000000000..9988d03797f5
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,410 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the interrupt descriptor management code
6 *
7 * Detailed information is available in Documentation/DocBook/genericirq
8 *
9 */
10#include <linux/irq.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h>
16#include <linux/bitmap.h>
17
18#include "internals.h"
19
20/*
21 * lockdep: we want to handle all irq_desc locks as a single lock-class:
22 */
23static struct lock_class_key irq_desc_lock_class;
24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
26static void __init init_irq_default_affinity(void)
27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
29 cpumask_setall(irq_default_affinity);
30}
31#else
32static void __init init_irq_default_affinity(void)
33{
34}
35#endif
36
37#ifdef CONFIG_SMP
38static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
39{
40 if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
41 return -ENOMEM;
42
43#ifdef CONFIG_GENERIC_PENDING_IRQ
44 if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
45 free_cpumask_var(desc->irq_data.affinity);
46 return -ENOMEM;
47 }
48#endif
49 return 0;
50}
51
52static void desc_smp_init(struct irq_desc *desc, int node)
53{
54 desc->irq_data.node = node;
55 cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
56#ifdef CONFIG_GENERIC_PENDING_IRQ
57 cpumask_clear(desc->pending_mask);
58#endif
59}
60
61static inline int desc_node(struct irq_desc *desc)
62{
63 return desc->irq_data.node;
64}
65
66#else
67static inline int
68alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
69static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif
72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{
75 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL;
78 desc->irq_data.handler_data = NULL;
79 desc->irq_data.msi_desc = NULL;
80 desc->status = IRQ_DEFAULT_INIT_FLAGS;
81 desc->handle_irq = handle_bad_irq;
82 desc->depth = 1;
83 desc->irq_count = 0;
84 desc->irqs_unhandled = 0;
85 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
87 desc_smp_init(desc, node);
88}
89
90int nr_irqs = NR_IRQS;
91EXPORT_SYMBOL_GPL(nr_irqs);
92
93static DEFINE_MUTEX(sparse_irq_lock);
94static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
95
96#ifdef CONFIG_SPARSE_IRQ
97
98static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
99
100static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
101{
102 radix_tree_insert(&irq_desc_tree, irq, desc);
103}
104
105struct irq_desc *irq_to_desc(unsigned int irq)
106{
107 return radix_tree_lookup(&irq_desc_tree, irq);
108}
109
110static void delete_irq_desc(unsigned int irq)
111{
112 radix_tree_delete(&irq_desc_tree, irq);
113}
114
115#ifdef CONFIG_SMP
116static void free_masks(struct irq_desc *desc)
117{
118#ifdef CONFIG_GENERIC_PENDING_IRQ
119 free_cpumask_var(desc->pending_mask);
120#endif
121 free_cpumask_var(desc->irq_data.affinity);
122}
123#else
124static inline void free_masks(struct irq_desc *desc) { }
125#endif
126
127static struct irq_desc *alloc_desc(int irq, int node)
128{
129 struct irq_desc *desc;
130 gfp_t gfp = GFP_KERNEL;
131
132 desc = kzalloc_node(sizeof(*desc), gfp, node);
133 if (!desc)
134 return NULL;
135 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
137 gfp, node);
138 if (!desc->kstat_irqs)
139 goto err_desc;
140
141 if (alloc_masks(desc, gfp, node))
142 goto err_kstat;
143
144 raw_spin_lock_init(&desc->lock);
145 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
146
147 desc_set_defaults(irq, desc, node);
148
149 return desc;
150
151err_kstat:
152 kfree(desc->kstat_irqs);
153err_desc:
154 kfree(desc);
155 return NULL;
156}
157
158static void free_desc(unsigned int irq)
159{
160 struct irq_desc *desc = irq_to_desc(irq);
161
162 unregister_irq_proc(irq, desc);
163
164 mutex_lock(&sparse_irq_lock);
165 delete_irq_desc(irq);
166 mutex_unlock(&sparse_irq_lock);
167
168 free_masks(desc);
169 kfree(desc->kstat_irqs);
170 kfree(desc);
171}
172
173static int alloc_descs(unsigned int start, unsigned int cnt, int node)
174{
175 struct irq_desc *desc;
176 int i;
177
178 for (i = 0; i < cnt; i++) {
179 desc = alloc_desc(start + i, node);
180 if (!desc)
181 goto err;
182 mutex_lock(&sparse_irq_lock);
183 irq_insert_desc(start + i, desc);
184 mutex_unlock(&sparse_irq_lock);
185 }
186 return start;
187
188err:
189 for (i--; i >= 0; i--)
190 free_desc(start + i);
191
192 mutex_lock(&sparse_irq_lock);
193 bitmap_clear(allocated_irqs, start, cnt);
194 mutex_unlock(&sparse_irq_lock);
195 return -ENOMEM;
196}
197
198struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
199{
200 int res = irq_alloc_descs(irq, irq, 1, node);
201
202 if (res == -EEXIST || res == irq)
203 return irq_to_desc(irq);
204 return NULL;
205}
206
207int __init early_irq_init(void)
208{
209 int i, initcnt, node = first_online_node;
210 struct irq_desc *desc;
211
212 init_irq_default_affinity();
213
214 /* Let arch update nr_irqs and return the nr of preallocated irqs */
215 initcnt = arch_probe_nr_irqs();
216 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
217
218 for (i = 0; i < initcnt; i++) {
219 desc = alloc_desc(i, node);
220 set_bit(i, allocated_irqs);
221 irq_insert_desc(i, desc);
222 }
223 return arch_early_irq_init();
224}
225
226#else /* !CONFIG_SPARSE_IRQ */
227
228struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
229 [0 ... NR_IRQS-1] = {
230 .status = IRQ_DEFAULT_INIT_FLAGS,
231 .handle_irq = handle_bad_irq,
232 .depth = 1,
233 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
234 }
235};
236
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void)
239{
240 int count, i, node = first_online_node;
241 struct irq_desc *desc;
242
243 init_irq_default_affinity();
244
245 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
246
247 desc = irq_desc;
248 count = ARRAY_SIZE(irq_desc);
249
250 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i];
254 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
257 }
258 return arch_early_irq_init();
259}
260
261struct irq_desc *irq_to_desc(unsigned int irq)
262{
263 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
264}
265
266struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
267{
268 return irq_to_desc(irq);
269}
270
271static void free_desc(unsigned int irq)
272{
273 dynamic_irq_cleanup(irq);
274}
275
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{
278 return start;
279}
280#endif /* !CONFIG_SPARSE_IRQ */
281
282/* Dynamic interrupt handling */
283
284/**
285 * irq_free_descs - free irq descriptors
286 * @from: Start of descriptor range
287 * @cnt: Number of consecutive irqs to free
288 */
289void irq_free_descs(unsigned int from, unsigned int cnt)
290{
291 int i;
292
293 if (from >= nr_irqs || (from + cnt) > nr_irqs)
294 return;
295
296 for (i = 0; i < cnt; i++)
297 free_desc(from + i);
298
299 mutex_lock(&sparse_irq_lock);
300 bitmap_clear(allocated_irqs, from, cnt);
301 mutex_unlock(&sparse_irq_lock);
302}
303
304/**
305 * irq_alloc_descs - allocate and initialize a range of irq descriptors
306 * @irq: Allocate for specific irq number if irq >= 0
307 * @from: Start the search from this irq number
308 * @cnt: Number of consecutive irqs to allocate.
309 * @node: Preferred node on which the irq descriptor should be allocated
310 *
311 * Returns the first irq number or error code
312 */
313int __ref
314irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
315{
316 int start, ret;
317
318 if (!cnt)
319 return -EINVAL;
320
321 mutex_lock(&sparse_irq_lock);
322
323 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
324 ret = -EEXIST;
325 if (irq >=0 && start != irq)
326 goto err;
327
328 ret = -ENOMEM;
329 if (start >= nr_irqs)
330 goto err;
331
332 bitmap_set(allocated_irqs, start, cnt);
333 mutex_unlock(&sparse_irq_lock);
334 return alloc_descs(start, cnt, node);
335
336err:
337 mutex_unlock(&sparse_irq_lock);
338 return ret;
339}
340
341/**
342 * irq_reserve_irqs - mark irqs allocated
343 * @from: mark from irq number
344 * @cnt: number of irqs to mark
345 *
346 * Returns 0 on success or an appropriate error code
347 */
348int irq_reserve_irqs(unsigned int from, unsigned int cnt)
349{
350 unsigned int start;
351 int ret = 0;
352
353 if (!cnt || (from + cnt) > nr_irqs)
354 return -EINVAL;
355
356 mutex_lock(&sparse_irq_lock);
357 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
358 if (start == from)
359 bitmap_set(allocated_irqs, start, cnt);
360 else
361 ret = -EEXIST;
362 mutex_unlock(&sparse_irq_lock);
363 return ret;
364}
365
366/**
367 * irq_get_next_irq - get next allocated irq number
368 * @offset: where to start the search
369 *
370 * Returns next irq number after offset or nr_irqs if none is found.
371 */
372unsigned int irq_get_next_irq(unsigned int offset)
373{
374 return find_next_bit(allocated_irqs, nr_irqs, offset);
375}
376
377/**
378 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
379 * @irq: irq number to initialize
380 */
381void dynamic_irq_cleanup(unsigned int irq)
382{
383 struct irq_desc *desc = irq_to_desc(irq);
384 unsigned long flags;
385
386 raw_spin_lock_irqsave(&desc->lock, flags);
387 desc_set_defaults(irq, desc, desc_node(desc));
388 raw_spin_unlock_irqrestore(&desc->lock, flags);
389}
390
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{
393 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0;
395}
396
397#ifdef CONFIG_GENERIC_HARDIRQS
398unsigned int kstat_irqs(unsigned int irq)
399{
400 struct irq_desc *desc = irq_to_desc(irq);
401 int cpu;
402 int sum = 0;
403
404 if (!desc)
405 return 0;
406 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu];
408 return sum;
409}
410#endif /* CONFIG_GENERIC_HARDIRQS */
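The allocator above replaces the ad-hoc descriptor bookkeeping that used to live in handle.c. An illustrative sketch of the intended calling convention (the board-level helpers are hypothetical): passing irq = -1 asks irq_alloc_descs() for any free range starting at 'from', and a negative return is an error code:

#include <linux/irq.h>

/* hypothetical: carve out a block of interrupt numbers for a device */
static int my_board_alloc_irqs(unsigned int cnt, int node)
{
	int base = irq_alloc_descs(-1, 0, cnt, node);	/* any free range */

	if (base < 0)
		return base;		/* -EEXIST or -ENOMEM */

	/* descriptors base .. base + cnt - 1 now exist and are reserved */
	return base;
}

static void my_board_free_irqs(unsigned int base, unsigned int cnt)
{
	irq_free_descs(base, cnt);
}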
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e1497481fe8a..644e8d5fa367 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 73{
74 struct irq_desc *desc = irq_to_desc(irq); 74 struct irq_desc *desc = irq_to_desc(irq);
75 75
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
77 !desc->chip->set_affinity) 77 !desc->irq_data.chip->irq_set_affinity)
78 return 0; 78 return 0;
79 79
80 return 1; 80 return 1;
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc)
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
110{ 110{
111 struct irq_desc *desc = irq_to_desc(irq); 111 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip;
112 unsigned long flags; 113 unsigned long flags;
113 114
114 if (!desc->chip->set_affinity) 115 if (!chip->irq_set_affinity)
115 return -EINVAL; 116 return -EINVAL;
116 117
117 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
118 119
119#ifdef CONFIG_GENERIC_PENDING_IRQ 120#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 121 if (desc->status & IRQ_MOVE_PCNTXT) {
121 if (!desc->chip->set_affinity(irq, cpumask)) { 122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
122 cpumask_copy(desc->affinity, cpumask); 123 cpumask_copy(desc->irq_data.affinity, cpumask);
123 irq_set_thread_affinity(desc); 124 irq_set_thread_affinity(desc);
124 } 125 }
125 } 126 }
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
128 cpumask_copy(desc->pending_mask, cpumask); 129 cpumask_copy(desc->pending_mask, cpumask);
129 } 130 }
130#else 131#else
131 if (!desc->chip->set_affinity(irq, cpumask)) { 132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
132 cpumask_copy(desc->affinity, cpumask); 133 cpumask_copy(desc->irq_data.affinity, cpumask);
133 irq_set_thread_affinity(desc); 134 irq_set_thread_affinity(desc);
134 } 135 }
135#endif 136#endif
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * one of the targets is online. 169 * one of the targets is online.
169 */ 170 */
170 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
171 if (cpumask_any_and(desc->affinity, cpu_online_mask) 172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
172 < nr_cpu_ids) 173 < nr_cpu_ids)
173 goto set_affinity; 174 goto set_affinity;
174 else 175 else
175 desc->status &= ~IRQ_AFFINITY_SET; 176 desc->status &= ~IRQ_AFFINITY_SET;
176 } 177 }
177 178
178 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); 179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
179set_affinity: 180set_affinity:
180 desc->chip->set_affinity(irq, desc->affinity); 181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
181 182
182 return 0; 183 return 0;
183} 184}
@@ -216,14 +217,14 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
216void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) 217void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
217{ 218{
218 if (suspend) { 219 if (suspend) {
219 if (!desc->action || (desc->action->flags & IRQF_TIMER)) 220 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
220 return; 221 return;
221 desc->status |= IRQ_SUSPENDED; 222 desc->status |= IRQ_SUSPENDED;
222 } 223 }
223 224
224 if (!desc->depth++) { 225 if (!desc->depth++) {
225 desc->status |= IRQ_DISABLED; 226 desc->status |= IRQ_DISABLED;
226 desc->chip->disable(irq); 227 desc->irq_data.chip->irq_disable(&desc->irq_data);
227 } 228 }
228} 229}
229 230
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq)
246 if (!desc) 247 if (!desc)
247 return; 248 return;
248 249
249 chip_bus_lock(irq, desc); 250 chip_bus_lock(desc);
250 raw_spin_lock_irqsave(&desc->lock, flags); 251 raw_spin_lock_irqsave(&desc->lock, flags);
251 __disable_irq(desc, irq, false); 252 __disable_irq(desc, irq, false);
252 raw_spin_unlock_irqrestore(&desc->lock, flags); 253 raw_spin_unlock_irqrestore(&desc->lock, flags);
253 chip_bus_sync_unlock(irq, desc); 254 chip_bus_sync_unlock(desc);
254} 255}
255EXPORT_SYMBOL(disable_irq_nosync); 256EXPORT_SYMBOL(disable_irq_nosync);
256 257
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
313 * IRQ line is re-enabled. 314 * IRQ line is re-enabled.
314 * 315 *
315 * This function may be called from IRQ context only when 316 * This function may be called from IRQ context only when
316 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! 317 * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
317 */ 318 */
318void enable_irq(unsigned int irq) 319void enable_irq(unsigned int irq)
319{ 320{
@@ -323,11 +324,11 @@ void enable_irq(unsigned int irq)
323 if (!desc) 324 if (!desc)
324 return; 325 return;
325 326
326 chip_bus_lock(irq, desc); 327 chip_bus_lock(desc);
327 raw_spin_lock_irqsave(&desc->lock, flags); 328 raw_spin_lock_irqsave(&desc->lock, flags);
328 __enable_irq(desc, irq, false); 329 __enable_irq(desc, irq, false);
329 raw_spin_unlock_irqrestore(&desc->lock, flags); 330 raw_spin_unlock_irqrestore(&desc->lock, flags);
330 chip_bus_sync_unlock(irq, desc); 331 chip_bus_sync_unlock(desc);
331} 332}
332EXPORT_SYMBOL(enable_irq); 333EXPORT_SYMBOL(enable_irq);
333 334
@@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
336 struct irq_desc *desc = irq_to_desc(irq); 337 struct irq_desc *desc = irq_to_desc(irq);
337 int ret = -ENXIO; 338 int ret = -ENXIO;
338 339
339 if (desc->chip->set_wake) 340 if (desc->irq_data.chip->irq_set_wake)
340 ret = desc->chip->set_wake(irq, on); 341 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
341 342
342 return ret; 343 return ret;
343} 344}
@@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
429} 430}
430 431
431int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 432int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
432 unsigned long flags) 433 unsigned long flags)
433{ 434{
434 int ret; 435 int ret;
435 struct irq_chip *chip = desc->chip; 436 struct irq_chip *chip = desc->irq_data.chip;
436 437
437 if (!chip || !chip->set_type) { 438 if (!chip || !chip->irq_set_type) {
438 /* 439 /*
439 * IRQF_TRIGGER_* but the PIC does not support multiple 440 * IRQF_TRIGGER_* but the PIC does not support multiple
440 * flow-types? 441 * flow-types?
@@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
445 } 446 }
446 447
447 /* caller masked out all except trigger mode flags */ 448 /* caller masked out all except trigger mode flags */
448 ret = chip->set_type(irq, flags); 449 ret = chip->irq_set_type(&desc->irq_data, flags);
449 450
450 if (ret) 451 if (ret)
451 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 452 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
452 (int)flags, irq, chip->set_type); 453 flags, irq, chip->irq_set_type);
453 else { 454 else {
454 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) 455 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
455 flags |= IRQ_LEVEL; 456 flags |= IRQ_LEVEL;
@@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 458 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
458 desc->status |= flags; 459 desc->status |= flags;
459 460
460 if (chip != desc->chip) 461 if (chip != desc->irq_data.chip)
461 irq_chip_set_defaults(desc->chip); 462 irq_chip_set_defaults(desc->irq_data.chip);
462 } 463 }
463 464
464 return ret; 465 return ret;
@@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 508static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
508{ 509{
509again: 510again:
510 chip_bus_lock(irq, desc); 511 chip_bus_lock(desc);
511 raw_spin_lock_irq(&desc->lock); 512 raw_spin_lock_irq(&desc->lock);
512 513
513 /* 514 /*
@@ -521,17 +522,17 @@ again:
521 */ 522 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) { 523 if (unlikely(desc->status & IRQ_INPROGRESS)) {
523 raw_spin_unlock_irq(&desc->lock); 524 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc); 525 chip_bus_sync_unlock(desc);
525 cpu_relax(); 526 cpu_relax();
526 goto again; 527 goto again;
527 } 528 }
528 529
529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 530 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
530 desc->status &= ~IRQ_MASKED; 531 desc->status &= ~IRQ_MASKED;
531 desc->chip->unmask(irq); 532 desc->irq_data.chip->irq_unmask(&desc->irq_data);
532 } 533 }
533 raw_spin_unlock_irq(&desc->lock); 534 raw_spin_unlock_irq(&desc->lock);
534 chip_bus_sync_unlock(irq, desc); 535 chip_bus_sync_unlock(desc);
535} 536}
536 537
537#ifdef CONFIG_SMP 538#ifdef CONFIG_SMP
@@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
556 } 557 }
557 558
558 raw_spin_lock_irq(&desc->lock); 559 raw_spin_lock_irq(&desc->lock);
559 cpumask_copy(mask, desc->affinity); 560 cpumask_copy(mask, desc->irq_data.affinity);
560 raw_spin_unlock_irq(&desc->lock); 561 raw_spin_unlock_irq(&desc->lock);
561 562
562 set_cpus_allowed_ptr(current, mask); 563 set_cpus_allowed_ptr(current, mask);
@@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657 if (!desc) 658 if (!desc)
658 return -EINVAL; 659 return -EINVAL;
659 660
660 if (desc->chip == &no_irq_chip) 661 if (desc->irq_data.chip == &no_irq_chip)
661 return -ENOSYS; 662 return -ENOSYS;
662 /* 663 /*
663 * Some drivers like serial.c use request_irq() heavily, 664 * Some drivers like serial.c use request_irq() heavily,
@@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
752 } 753 }
753 754
754 if (!shared) { 755 if (!shared) {
755 irq_chip_set_defaults(desc->chip); 756 irq_chip_set_defaults(desc->irq_data.chip);
756 757
757 init_waitqueue_head(&desc->wait_for_threads); 758 init_waitqueue_head(&desc->wait_for_threads);
758 759
@@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
779 if (!(desc->status & IRQ_NOAUTOEN)) { 780 if (!(desc->status & IRQ_NOAUTOEN)) {
780 desc->depth = 0; 781 desc->depth = 0;
781 desc->status &= ~IRQ_DISABLED; 782 desc->status &= ~IRQ_DISABLED;
782 desc->chip->startup(irq); 783 desc->irq_data.chip->irq_startup(&desc->irq_data);
783 } else 784 } else
784 /* Undo nested disables: */ 785 /* Undo nested disables: */
785 desc->depth = 1; 786 desc->depth = 1;
@@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
912 913
913 /* Currently used only by UML, might disappear one day: */ 914 /* Currently used only by UML, might disappear one day: */
914#ifdef CONFIG_IRQ_RELEASE_METHOD 915#ifdef CONFIG_IRQ_RELEASE_METHOD
915 if (desc->chip->release) 916 if (desc->irq_data.chip->release)
916 desc->chip->release(irq, dev_id); 917 desc->irq_data.chip->release(irq, dev_id);
917#endif 918#endif
918 919
919 /* If this was the last handler, shut down the IRQ line: */ 920 /* If this was the last handler, shut down the IRQ line: */
920 if (!desc->action) { 921 if (!desc->action) {
921 desc->status |= IRQ_DISABLED; 922 desc->status |= IRQ_DISABLED;
922 if (desc->chip->shutdown) 923 if (desc->irq_data.chip->irq_shutdown)
923 desc->chip->shutdown(irq); 924 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
924 else 925 else
925 desc->chip->disable(irq); 926 desc->irq_data.chip->irq_disable(&desc->irq_data);
926 } 927 }
927 928
928#ifdef CONFIG_SMP 929#ifdef CONFIG_SMP
@@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id)
997 if (!desc) 998 if (!desc)
998 return; 999 return;
999 1000
1000 chip_bus_lock(irq, desc); 1001 chip_bus_lock(desc);
1001 kfree(__free_irq(irq, dev_id)); 1002 kfree(__free_irq(irq, dev_id));
1002 chip_bus_sync_unlock(irq, desc); 1003 chip_bus_sync_unlock(desc);
1003} 1004}
1004EXPORT_SYMBOL(free_irq); 1005EXPORT_SYMBOL(free_irq);
1005 1006
@@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1086 action->name = devname; 1087 action->name = devname;
1087 action->dev_id = dev_id; 1088 action->dev_id = dev_id;
1088 1089
1089 chip_bus_lock(irq, desc); 1090 chip_bus_lock(desc);
1090 retval = __setup_irq(irq, desc, action); 1091 retval = __setup_irq(irq, desc, action);
1091 chip_bus_sync_unlock(irq, desc); 1092 chip_bus_sync_unlock(desc);
1092 1093
1093 if (retval) 1094 if (retval)
1094 kfree(action); 1095 kfree(action);
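The affinity path above also shows the new chip prototype: irq_set_affinity() now receives the irq_data plus a 'force' flag, and on a zero return the core copies the mask into irq_data.affinity itself. A hedged sketch of a driver-side callback written against that contract (the routing register layout is invented):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/irq.h>

struct my_pic {				/* hypothetical controller */
	void __iomem *route_base;	/* one 32-bit routing register per line */
	unsigned int irq_base;
};

static int my_pic_irq_set_affinity(struct irq_data *d,
				   const struct cpumask *mask, bool force)
{
	struct my_pic *pic = d->chip_data;
	unsigned int cpu = cpumask_any_and(mask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		return -EINVAL;

	writel(cpu, pic->route_base + 4 * (d->irq - pic->irq_base));
	return 0;	/* 0 lets the core update irq_data.affinity */
}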
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 241962280836..1d2541940480 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,6 +7,7 @@
7void move_masked_irq(int irq) 7void move_masked_irq(int irq)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_to_desc(irq);
10 struct irq_chip *chip = desc->irq_data.chip;
10 11
11 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
12 return; 13 return;
@@ -24,7 +25,7 @@ void move_masked_irq(int irq)
24 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
25 return; 26 return;
26 27
27 if (!desc->chip->set_affinity) 28 if (!chip->irq_set_affinity)
28 return; 29 return;
29 30
30 assert_raw_spin_locked(&desc->lock); 31 assert_raw_spin_locked(&desc->lock);
@@ -43,8 +44,9 @@ void move_masked_irq(int irq)
43 */ 44 */
44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
45 < nr_cpu_ids)) 46 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 47 if (!chip->irq_set_affinity(&desc->irq_data,
47 cpumask_copy(desc->affinity, desc->pending_mask); 48 desc->pending_mask, false)) {
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc); 50 irq_set_thread_affinity(desc);
49 } 51 }
50 52
@@ -61,8 +63,8 @@ void move_native_irq(int irq)
61 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(desc->status & IRQ_DISABLED))
62 return; 64 return;
63 65
64 desc->chip->mask(irq); 66 desc->irq_data.chip->irq_mask(&desc->irq_data);
65 move_masked_irq(irq); 67 move_masked_irq(irq);
66 desc->chip->unmask(irq); 68 desc->irq_data.chip->irq_unmask(&desc->irq_data);
67} 69}
68 70
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
deleted file mode 100644
index 65d3845665ac..000000000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/slab.h>
10#include <linux/module.h>
11#include <linux/random.h>
12#include <linux/interrupt.h>
13#include <linux/kernel_stat.h>
14
15#include "internals.h"
16
17static void init_copy_kstat_irqs(struct irq_desc *old_desc,
18 struct irq_desc *desc,
19 int node, int nr)
20{
21 init_kstat_irqs(desc, node, nr);
22
23 if (desc->kstat_irqs != old_desc->kstat_irqs)
24 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
25 nr * sizeof(*desc->kstat_irqs));
26}
27
28static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
29{
30 if (old_desc->kstat_irqs == desc->kstat_irqs)
31 return;
32
33 kfree(old_desc->kstat_irqs);
34 old_desc->kstat_irqs = NULL;
35}
36
37static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
38 struct irq_desc *desc, int node)
39{
40 memcpy(desc, old_desc, sizeof(struct irq_desc));
41 if (!alloc_desc_masks(desc, node, false)) {
42 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
43 "for migration.\n", irq);
44 return false;
45 }
46 raw_spin_lock_init(&desc->lock);
47 desc->node = node;
48 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
49 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
50 init_copy_desc_masks(old_desc, desc);
51 arch_init_copy_chip_data(old_desc, desc, node);
52 return true;
53}
54
55static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
56{
57 free_kstat_irqs(old_desc, desc);
58 free_desc_masks(old_desc, desc);
59 arch_free_chip_data(old_desc, desc);
60}
61
62static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
63 int node)
64{
65 struct irq_desc *desc;
66 unsigned int irq;
67 unsigned long flags;
68
69 irq = old_desc->irq;
70
71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
72
73 /* We have to check it to avoid races with another CPU */
74 desc = irq_to_desc(irq);
75
76 if (desc && old_desc != desc)
77 goto out_unlock;
78
79 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
80 if (!desc) {
81 printk(KERN_ERR "irq %d: can not get new irq_desc "
82 "for migration.\n", irq);
83 /* still use old one */
84 desc = old_desc;
85 goto out_unlock;
86 }
87 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
88 /* still use old one */
89 kfree(desc);
90 desc = old_desc;
91 goto out_unlock;
92 }
93
94 replace_irq_desc(irq, desc);
95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
96
97 /* free the old one */
98 free_one_irq_desc(old_desc, desc);
99 kfree(old_desc);
100
101 return desc;
102
103out_unlock:
104 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
105
106 return desc;
107}
108
109struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
110{
111 /* those static or target node is -1, do not move them */
112 if (desc->irq < NR_IRQS_LEGACY || node == -1)
113 return desc;
114
115 if (desc->node != node)
116 desc = __real_move_irq_desc(desc, node);
117
118 return desc;
119}
120
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..01b1d3a88983 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir;
21static int irq_affinity_proc_show(struct seq_file *m, void *v) 21static int irq_affinity_proc_show(struct seq_file *m, void *v)
22{ 22{
23 struct irq_desc *desc = irq_to_desc((long)m->private); 23 struct irq_desc *desc = irq_to_desc((long)m->private);
24 const struct cpumask *mask = desc->affinity; 24 const struct cpumask *mask = desc->irq_data.affinity;
25 25
26#ifdef CONFIG_GENERIC_PENDING_IRQ 26#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 27 if (desc->status & IRQ_MOVE_PENDING)
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 65 cpumask_var_t new_value;
66 int err; 66 int err;
67 67
68 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
69 irq_balancing_disabled(irq)) 69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
185{ 185{
186 struct irq_desc *desc = irq_to_desc((long) m->private); 186 struct irq_desc *desc = irq_to_desc((long) m->private);
187 187
188 seq_printf(m, "%d\n", desc->node); 188 seq_printf(m, "%d\n", desc->irq_data.node);
189 return 0; 189 return 0;
190} 190}
191 191
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
269{ 269{
270 char name [MAX_NAMELEN]; 270 char name [MAX_NAMELEN];
271 271
272 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 272 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
273 return; 273 return;
274 274
275 memset(name, 0, MAX_NAMELEN); 275 memset(name, 0, MAX_NAMELEN);
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
297 &irq_spurious_proc_fops, (void *)(long)irq); 297 &irq_spurious_proc_fops, (void *)(long)irq);
298} 298}
299 299
300void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
301{
302 char name [MAX_NAMELEN];
303
304 if (!root_irq_dir || !desc->dir)
305 return;
306#ifdef CONFIG_SMP
307 remove_proc_entry("smp_affinity", desc->dir);
308 remove_proc_entry("affinity_hint", desc->dir);
309 remove_proc_entry("node", desc->dir);
310#endif
311 remove_proc_entry("spurious", desc->dir);
312
313 memset(name, 0, MAX_NAMELEN);
314 sprintf(name, "%u", irq);
315 remove_proc_entry(name, root_irq_dir);
316}
317
300#undef MAX_NAMELEN 318#undef MAX_NAMELEN
301 319
302void unregister_handler_proc(unsigned int irq, struct irqaction *action) 320void unregister_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 090c3763f3a2..891115a929aa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
60 /* 60 /*
61 * Make sure the interrupt is enabled, before resending it: 61 * Make sure the interrupt is enabled, before resending it:
62 */ 62 */
63 desc->chip->enable(irq); 63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64 64
65 /* 65 /*
66 * We do not resend level type interrupts. Level type 66 * We do not resend level type interrupts. Level type
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { 73 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
74#ifdef CONFIG_HARDIRQS_SW_RESEND 75#ifdef CONFIG_HARDIRQS_SW_RESEND
75 /* Set it pending and activate the softirq: */ 76 /* Set it pending and activate the softirq: */
76 set_bit(irq, irqs_resend); 77 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89fb90ae534f..3089d3b9d5f3 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,6 +14,8 @@
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16 16
17#include "internals.h"
18
17static int irqfixup __read_mostly; 19static int irqfixup __read_mostly;
18 20
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc)
78 * If we did actual work for the real IRQ line we must let the 80 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too 81 * IRQ controller clean up too
80 */ 82 */
81 if (work && desc->chip && desc->chip->end) 83 if (work)
82 desc->chip->end(irq); 84 irq_end(irq, desc);
83 raw_spin_unlock(&desc->lock); 85 raw_spin_unlock(&desc->lock);
84 86
85 return ok; 87 return ok;
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
255 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
256 desc->depth++; 258 desc->depth++;
257 desc->chip->disable(irq); 259 desc->irq_data.chip->irq_disable(&desc->irq_data);
258 260
259 mod_timer(&poll_spurious_irq_timer, 261 mod_timer(&poll_spurious_irq_timer,
260 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000000..f16763ff8481
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
3 *
4 * Provides a framework for enqueueing and running callbacks from hardirq
5 * context. The enqueueing is NMI-safe.
6 */
7
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/irq_work.h>
11#include <linux/hardirq.h>
12
13/*
14 * An entry can be in one of four states:
15 *
16 * free NULL, 0 -> {claimed} : free to be used
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */
24
25#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL
28
29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49
50/*
51 * Claim the entry so that no one else will poke at it.
52 */
53static bool irq_work_claim(struct irq_work *entry)
54{
55 struct irq_work *next, *nflags;
56
57 do {
58 next = entry->next;
59 if ((unsigned long)next & IRQ_WORK_PENDING)
60 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS);
62 } while (cmpxchg(&entry->next, next, nflags) != next);
63
64 return true;
65}
66
67
68void __weak arch_irq_work_raise(void)
69{
70 /*
71 * Lame architectures will get the timer tick callback
72 */
73}
74
75/*
76 * Queue the entry and raise the IPI if needed.
77 */
78static void __irq_work_queue(struct irq_work *entry)
79{
80 struct irq_work **head, *next;
81
82 head = &get_cpu_var(irq_work_list);
83
84 do {
85 next = *head;
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry))
92 arch_irq_work_raise();
93
94 put_cpu_var(irq_work_list);
95}
96
97/*
98 * Enqueue the irq_work @entry, returns true on success, failure when the
99 * @entry was already enqueued by someone else.
100 *
101 * Can be re-enqueued while the callback is still in progress.
102 */
103bool irq_work_queue(struct irq_work *entry)
104{
105 if (!irq_work_claim(entry)) {
106 /*
107 * Already enqueued, can't do!
108 */
109 return false;
110 }
111
112 __irq_work_queue(entry);
113 return true;
114}
115EXPORT_SYMBOL_GPL(irq_work_queue);
116
117/*
118 * Run the irq_work entries on this cpu. Must be run from hardirq
119 * context with local IRQs disabled.
120 */
121void irq_work_run(void)
122{
123 struct irq_work *list, **head;
124
125 head = &__get_cpu_var(irq_work_list);
126 if (*head == NULL)
127 return;
128
129 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled());
131
132 list = xchg(head, NULL);
133 while (list != NULL) {
134 struct irq_work *entry = list;
135
136 list = irq_work_next(list);
137
138 /*
139 * Clear the PENDING bit, after this point the @entry
140 * can be re-used.
141 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY);
143 entry->func(entry);
144 /*
145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile.
147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
149 }
150}
151EXPORT_SYMBOL_GPL(irq_work_run);
152
153/*
154 * Synchronize against the irq_work @entry, ensures the entry is not
155 * currently in use.
156 */
157void irq_work_sync(struct irq_work *entry)
158{
159 WARN_ON_ONCE(irqs_disabled());
160
161 while (irq_work_is_set(entry, IRQ_WORK_BUSY))
162 cpu_relax();
163}
164EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..7be868bf25c6
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
1/*
2 * jump label support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h>
14#include <linux/sort.h>
15#include <linux/err.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
 23/* mutex to protect coming/going of the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex);
25
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42static int jump_label_cmp(const void *a, const void *b)
43{
44 const struct jump_entry *jea = a;
45 const struct jump_entry *jeb = b;
46
47 if (jea->key < jeb->key)
48 return -1;
49
50 if (jea->key > jeb->key)
51 return 1;
52
53 return 0;
54}
55
56static void
57sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
58{
59 unsigned long size;
60
61 size = (((unsigned long)stop - (unsigned long)start)
62 / sizeof(struct jump_entry));
63 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
64}
65
66static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
67{
68 struct hlist_head *head;
69 struct hlist_node *node;
70 struct jump_label_entry *e;
71 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
72
73 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
74 hlist_for_each_entry(e, node, head, hlist) {
75 if (key == e->key)
76 return e;
77 }
78 return NULL;
79}
80
81static struct jump_label_entry *
82add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
83{
84 struct hlist_head *head;
85 struct jump_label_entry *e;
86 u32 hash;
87
88 e = get_jump_label_entry(key);
89 if (e)
90 return ERR_PTR(-EEXIST);
91
92 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
93 if (!e)
94 return ERR_PTR(-ENOMEM);
95
96 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
97 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
98 e->key = key;
99 e->table = table;
100 e->nr_entries = nr_entries;
101 INIT_HLIST_HEAD(&(e->modules));
102 hlist_add_head(&e->hlist, head);
103 return e;
104}
105
106static int
107build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
108{
109 struct jump_entry *iter, *iter_begin;
110 struct jump_label_entry *entry;
111 int count;
112
113 sort_jump_label_entries(start, stop);
114 iter = start;
115 while (iter < stop) {
116 entry = get_jump_label_entry(iter->key);
117 if (!entry) {
118 iter_begin = iter;
119 count = 0;
120 while ((iter < stop) &&
121 (iter->key == iter_begin->key)) {
122 iter++;
123 count++;
124 }
125 entry = add_jump_label_entry(iter_begin->key,
126 count, iter_begin);
127 if (IS_ERR(entry))
128 return PTR_ERR(entry);
129 } else {
130 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
131 return -1;
132 }
133 }
134 return 0;
135}
136
137/***
138 * jump_label_update - update jump label text
 139 * @key - key value associated with a jump label
140 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
141 *
142 * Will enable/disable the jump for jump label @key, depending on the
143 * value of @type.
144 *
145 */
146
147void jump_label_update(unsigned long key, enum jump_label_type type)
148{
149 struct jump_entry *iter;
150 struct jump_label_entry *entry;
151 struct hlist_node *module_node;
152 struct jump_label_module_entry *e_module;
153 int count;
154
155 mutex_lock(&jump_label_mutex);
156 entry = get_jump_label_entry((jump_label_t)key);
157 if (entry) {
158 count = entry->nr_entries;
159 iter = entry->table;
160 while (count--) {
161 if (kernel_text_address(iter->code))
162 arch_jump_label_transform(iter, type);
163 iter++;
164 }
 165 /* enable/disable jump labels in modules */
166 hlist_for_each_entry(e_module, module_node, &(entry->modules),
167 hlist) {
168 count = e_module->nr_entries;
169 iter = e_module->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 }
176 }
177 mutex_unlock(&jump_label_mutex);
178}
179
180static int addr_conflict(struct jump_entry *entry, void *start, void *end)
181{
182 if (entry->code <= (unsigned long)end &&
183 entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
184 return 1;
185
186 return 0;
187}
188
189#ifdef CONFIG_MODULES
190
191static int module_conflict(void *start, void *end)
192{
193 struct hlist_head *head;
194 struct hlist_node *node, *node_next, *module_node, *module_node_next;
195 struct jump_label_entry *e;
196 struct jump_label_module_entry *e_module;
197 struct jump_entry *iter;
198 int i, count;
199 int conflict = 0;
200
201 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
202 head = &jump_label_table[i];
203 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
204 hlist_for_each_entry_safe(e_module, module_node,
205 module_node_next,
206 &(e->modules), hlist) {
207 count = e_module->nr_entries;
208 iter = e_module->table;
209 while (count--) {
210 if (addr_conflict(iter, start, end)) {
211 conflict = 1;
212 goto out;
213 }
214 iter++;
215 }
216 }
217 }
218 }
219out:
220 return conflict;
221}
222
223#endif
224
225/***
226 * jump_label_text_reserved - check if addr range is reserved
227 * @start: start text addr
228 * @end: end text addr
229 *
230 * checks if the text addr located between @start and @end
231 * overlaps with any of the jump label patch addresses. Code
232 * that wants to modify kernel text should first verify that
233 * it does not overlap with any of the jump label addresses.
234 *
235 * returns 1 if there is an overlap, 0 otherwise
236 */
237int jump_label_text_reserved(void *start, void *end)
238{
239 struct jump_entry *iter;
240 struct jump_entry *iter_start = __start___jump_table;
 241 struct jump_entry *iter_stop = __stop___jump_table;
242 int conflict = 0;
243
244 mutex_lock(&jump_label_mutex);
245 iter = iter_start;
246 while (iter < iter_stop) {
247 if (addr_conflict(iter, start, end)) {
248 conflict = 1;
249 goto out;
250 }
251 iter++;
252 }
253
254 /* now check modules */
255#ifdef CONFIG_MODULES
256 conflict = module_conflict(start, end);
257#endif
258out:
259 mutex_unlock(&jump_label_mutex);
260 return conflict;
261}
262
263static __init int init_jump_label(void)
264{
265 int ret;
266 struct jump_entry *iter_start = __start___jump_table;
267 struct jump_entry *iter_stop = __stop___jump_table;
268 struct jump_entry *iter;
269
270 mutex_lock(&jump_label_mutex);
271 ret = build_jump_label_hashtable(__start___jump_table,
272 __stop___jump_table);
273 iter = iter_start;
274 while (iter < iter_stop) {
275 arch_jump_label_text_poke_early(iter->code);
276 iter++;
277 }
278 mutex_unlock(&jump_label_mutex);
279 return ret;
280}
281early_initcall(init_jump_label);
282
283#ifdef CONFIG_MODULES
284
285static struct jump_label_module_entry *
286add_jump_label_module_entry(struct jump_label_entry *entry,
287 struct jump_entry *iter_begin,
288 int count, struct module *mod)
289{
290 struct jump_label_module_entry *e;
291
292 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
293 if (!e)
294 return ERR_PTR(-ENOMEM);
295 e->mod = mod;
296 e->nr_entries = count;
297 e->table = iter_begin;
298 hlist_add_head(&e->hlist, &entry->modules);
299 return e;
300}
301
302static int add_jump_label_module(struct module *mod)
303{
304 struct jump_entry *iter, *iter_begin;
305 struct jump_label_entry *entry;
306 struct jump_label_module_entry *module_entry;
307 int count;
308
309 /* if the module doesn't have jump label entries, just return */
310 if (!mod->num_jump_entries)
311 return 0;
312
313 sort_jump_label_entries(mod->jump_entries,
314 mod->jump_entries + mod->num_jump_entries);
315 iter = mod->jump_entries;
316 while (iter < mod->jump_entries + mod->num_jump_entries) {
317 entry = get_jump_label_entry(iter->key);
318 iter_begin = iter;
319 count = 0;
320 while ((iter < mod->jump_entries + mod->num_jump_entries) &&
321 (iter->key == iter_begin->key)) {
322 iter++;
323 count++;
324 }
325 if (!entry) {
326 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
327 if (IS_ERR(entry))
328 return PTR_ERR(entry);
329 }
330 module_entry = add_jump_label_module_entry(entry, iter_begin,
331 count, mod);
332 if (IS_ERR(module_entry))
333 return PTR_ERR(module_entry);
334 }
335 return 0;
336}
337
338static void remove_jump_label_module(struct module *mod)
339{
340 struct hlist_head *head;
341 struct hlist_node *node, *node_next, *module_node, *module_node_next;
342 struct jump_label_entry *e;
343 struct jump_label_module_entry *e_module;
344 int i;
345
346 /* if the module doesn't have jump label entries, just return */
347 if (!mod->num_jump_entries)
348 return;
349
350 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
351 head = &jump_label_table[i];
352 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
353 hlist_for_each_entry_safe(e_module, module_node,
354 module_node_next,
355 &(e->modules), hlist) {
356 if (e_module->mod == mod) {
357 hlist_del(&e_module->hlist);
358 kfree(e_module);
359 }
360 }
361 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
362 hlist_del(&e->hlist);
363 kfree(e);
364 }
365 }
366 }
367}
368
369static int
370jump_label_module_notify(struct notifier_block *self, unsigned long val,
371 void *data)
372{
373 struct module *mod = data;
374 int ret = 0;
375
376 switch (val) {
377 case MODULE_STATE_COMING:
378 mutex_lock(&jump_label_mutex);
379 ret = add_jump_label_module(mod);
380 if (ret)
381 remove_jump_label_module(mod);
382 mutex_unlock(&jump_label_mutex);
383 break;
384 case MODULE_STATE_GOING:
385 mutex_lock(&jump_label_mutex);
386 remove_jump_label_module(mod);
387 mutex_unlock(&jump_label_mutex);
388 break;
389 }
390 return ret;
391}
392
393/***
 394 * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
395 * @mod: module to patch
396 *
397 * Allow for run-time selection of the optimal nops. Before the module
398 * loads patch these with arch_get_jump_label_nop(), which is specified by
399 * the arch specific jump label code.
400 */
401void jump_label_apply_nops(struct module *mod)
402{
403 struct jump_entry *iter;
404
405 /* if the module doesn't have jump label entries, just return */
406 if (!mod->num_jump_entries)
407 return;
408
409 iter = mod->jump_entries;
410 while (iter < mod->jump_entries + mod->num_jump_entries) {
411 arch_jump_label_text_poke_early(iter->code);
412 iter++;
413 }
414}
415
416struct notifier_block jump_label_module_nb = {
417 .notifier_call = jump_label_module_notify,
418 .priority = 0,
419};
420
421static __init int init_jump_label_module(void)
422{
423 return register_module_notifier(&jump_label_module_nb);
424}
425early_initcall(init_jump_label_module);
426
427#endif /* CONFIG_MODULES */
428
429#endif
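jump_label_text_reserved() above exists so that other text-patching code can refuse to touch addresses a jump label may rewrite; the kprobes hunk later in this series calls it in exactly that way before registering a probe. A hedged sketch of the intended call pattern (patch_text_range() and its body are illustrative, not part of this patch):

        #include <linux/jump_label.h>
        #include <linux/errno.h>

        /* Refuse to modify kernel text that overlaps a jump label patch site. */
        static int patch_text_range(void *start, void *end)
        {
                if (jump_label_text_reserved(start, end))
                        return -EBUSY;

                /* ... safe to patch [start, end) here, e.g. via text_poke() ... */
                return 0;
        }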
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 131b1703936f..b55045bc7563 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
151 image->nr_segments = nr_segments; 151 image->nr_segments = nr_segments;
152 segment_bytes = nr_segments * sizeof(*segments); 152 segment_bytes = nr_segments * sizeof(*segments);
153 result = copy_from_user(image->segment, segments, segment_bytes); 153 result = copy_from_user(image->segment, segments, segment_bytes);
154 if (result) 154 if (result) {
155 result = -EFAULT;
155 goto out; 156 goto out;
157 }
156 158
157 /* 159 /*
158 * Verify we have good destination addresses. The caller is 160 * Verify we have good destination addresses. The caller is
@@ -814,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
814 816
815 ptr = kmap(page); 817 ptr = kmap(page);
816 /* Start with a clear page */ 818 /* Start with a clear page */
817 memset(ptr, 0, PAGE_SIZE); 819 clear_page(ptr);
818 ptr += maddr & ~PAGE_MASK; 820 ptr += maddr & ~PAGE_MASK;
819 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
820 if (mchunk > mbytes) 822 if (mchunk > mbytes)
@@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image,
827 result = copy_from_user(ptr, buf, uchunk); 829 result = copy_from_user(ptr, buf, uchunk);
828 kunmap(page); 830 kunmap(page);
829 if (result) { 831 if (result) {
830 result = (result < 0) ? result : -EIO; 832 result = -EFAULT;
831 goto out; 833 goto out;
832 } 834 }
833 ubytes -= uchunk; 835 ubytes -= uchunk;
@@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image,
882 kexec_flush_icache_page(page); 884 kexec_flush_icache_page(page);
883 kunmap(page); 885 kunmap(page);
884 if (result) { 886 if (result) {
885 result = (result < 0) ? result : -EIO; 887 result = -EFAULT;
886 goto out; 888 goto out;
887 } 889 }
888 ubytes -= uchunk; 890 ubytes -= uchunk;
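The kexec hunks above drop the old `(result < 0) ? result : -EIO` mapping because copy_from_user() never returns a negative value: it returns the number of bytes it could not copy, and any non-zero result should simply become -EFAULT. A minimal sketch of that idiom (the helper name and buffer are illustrative):

        #include <linux/uaccess.h>
        #include <linux/errno.h>

        /* copy_from_user() returns the count of uncopied bytes, never < 0. */
        static int fetch_from_user(void *dst, const void __user *src, size_t len)
        {
                if (copy_from_user(dst, src, len))
                        return -EFAULT;         /* partial or failed copy */
                return 0;
        }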
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 35edbe22e9a9..01a0700e873f 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,8 +1,7 @@
1/* 1/*
2 * A generic kernel FIFO implementation. 2 * A generic kernel FIFO implementation
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> 4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
6 * 5 *
7 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -11,7 +10,7 @@
11 * 10 *
12 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details. 14 * GNU General Public License for more details.
16 * 15 *
17 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
@@ -24,422 +23,586 @@
24#include <linux/module.h> 23#include <linux/module.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/err.h> 25#include <linux/err.h>
27#include <linux/kfifo.h>
28#include <linux/log2.h> 26#include <linux/log2.h>
29#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
30 29
31static void _kfifo_init(struct kfifo *fifo, void *buffer, 30/*
32 unsigned int size) 31 * internal helper to calculate the unused elements in a fifo
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
39
40/**
41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
43 * @buffer: the preallocated buffer to be used.
44 * @size: the size of the internal buffer, this has to be a power of 2.
45 *
46 */ 32 */
47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) 33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
48{ 34{
49 /* size must be a power of 2 */ 35 return (fifo->mask + 1) - (fifo->in - fifo->out);
50 BUG_ON(!is_power_of_2(size));
51
52 _kfifo_init(fifo, buffer, size);
53} 36}
54EXPORT_SYMBOL(kfifo_init);
55 37
56/** 38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
57 * kfifo_alloc - allocates a new FIFO internal buffer 39 size_t esize, gfp_t gfp_mask)
58 * @fifo: the fifo to assign then new buffer
59 * @size: the size of the buffer to be allocated, this have to be a power of 2.
60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
61 *
62 * This function dynamically allocates a new fifo internal buffer
63 *
64 * The size will be rounded-up to a power of 2.
65 * The buffer will be release with kfifo_free().
66 * Return 0 if no error, otherwise the an error code
67 */
68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 40{
70 unsigned char *buffer;
71
72 /* 41 /*
73 * round up to the next power of 2, since our 'let the indices 42 * round down to the next power of 2, since our 'let the indices
74 * wrap' technique works only in this case. 43 * wrap' technique works only in this case.
75 */ 44 */
76 if (!is_power_of_2(size)) { 45 if (!is_power_of_2(size))
77 BUG_ON(size > 0x80000000); 46 size = rounddown_pow_of_two(size);
78 size = roundup_pow_of_two(size); 47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
79 } 56 }
80 57
81 buffer = kmalloc(size, gfp_mask); 58 fifo->data = kmalloc(size * esize, gfp_mask);
82 if (!buffer) { 59
83 _kfifo_init(fifo, NULL, 0); 60 if (!fifo->data) {
61 fifo->mask = 0;
84 return -ENOMEM; 62 return -ENOMEM;
85 } 63 }
86 64 fifo->mask = size - 1;
87 _kfifo_init(fifo, buffer, size);
88 65
89 return 0; 66 return 0;
90} 67}
91EXPORT_SYMBOL(kfifo_alloc); 68EXPORT_SYMBOL(__kfifo_alloc);
92 69
93/** 70void __kfifo_free(struct __kfifo *fifo)
94 * kfifo_free - frees the FIFO internal buffer
95 * @fifo: the fifo to be freed.
96 */
97void kfifo_free(struct kfifo *fifo)
98{ 71{
99 kfree(fifo->buffer); 72 kfree(fifo->data);
100 _kfifo_init(fifo, NULL, 0); 73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
101} 78}
102EXPORT_SYMBOL(kfifo_free); 79EXPORT_SYMBOL(__kfifo_free);
103 80
104/** 81int __kfifo_init(struct __kfifo *fifo, void *buffer,
105 * kfifo_skip - skip output data 82 unsigned int size, size_t esize)
106 * @fifo: the fifo to be used.
107 * @len: number of bytes to skip
108 */
109void kfifo_skip(struct kfifo *fifo, unsigned int len)
110{ 83{
111 if (len < kfifo_len(fifo)) { 84 size /= esize;
112 __kfifo_add_out(fifo, len); 85
113 return; 86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
114 } 97 }
115 kfifo_reset_out(fifo); 98 fifo->mask = size - 1;
99
100 return 0;
116} 101}
117EXPORT_SYMBOL(kfifo_skip); 102EXPORT_SYMBOL(__kfifo_init);
118 103
119static inline void __kfifo_in_data(struct kfifo *fifo, 104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
120 const void *from, unsigned int len, unsigned int off) 105 unsigned int len, unsigned int off)
121{ 106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
122 unsigned int l; 109 unsigned int l;
123 110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
124 /* 121 /*
125 * Ensure that we sample the fifo->out index -before- we 122 * make sure that the data in the fifo is up to date before
126 * start putting bytes into the kfifo. 123 * incrementing the fifo->in index counter
127 */ 124 */
125 smp_wmb();
126}
128 127
129 smp_mb(); 128unsigned int __kfifo_in(struct __kfifo *fifo,
130 129 const void *buf, unsigned int len)
131 off = __kfifo_off(fifo, fifo->in + off); 130{
131 unsigned int l;
132 132
133 /* first put the data starting from fifo->in to buffer end */ 133 l = kfifo_unused(fifo);
134 l = min(len, fifo->size - off); 134 if (len > l)
135 memcpy(fifo->buffer + off, from, l); 135 len = l;
136 136
137 /* then put the rest (if any) at the beginning of the buffer */ 137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 memcpy(fifo->buffer, from + l, len - l); 138 fifo->in += len;
139 return len;
139} 140}
141EXPORT_SYMBOL(__kfifo_in);
140 142
141static inline void __kfifo_out_data(struct kfifo *fifo, 143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
142 void *to, unsigned int len, unsigned int off) 144 unsigned int len, unsigned int off)
143{ 145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
144 unsigned int l; 148 unsigned int l;
145 149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
146 /* 160 /*
147 * Ensure that we sample the fifo->in index -before- we 161 * make sure that the data is copied before
148 * start removing bytes from the kfifo. 162 * incrementing the fifo->out index counter
149 */ 163 */
164 smp_wmb();
165}
150 166
151 smp_rmb(); 167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
152 171
153 off = __kfifo_off(fifo, fifo->out + off); 172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
154 175
155 /* first get the data from fifo->out until the end of the buffer */ 176 kfifo_copy_out(fifo, buf, len, fifo->out);
156 l = min(len, fifo->size - off); 177 return len;
157 memcpy(to, fifo->buffer + off, l); 178}
179EXPORT_SYMBOL(__kfifo_out_peek);
158 180
159 /* then get the rest (if any) from the beginning of the buffer */ 181unsigned int __kfifo_out(struct __kfifo *fifo,
160 memcpy(to + l, fifo->buffer, len - l); 182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
161} 187}
188EXPORT_SYMBOL(__kfifo_out);
162 189
163static inline int __kfifo_from_user_data(struct kfifo *fifo, 190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off, 191 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout) 192 unsigned int *copied)
166{ 193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
167 unsigned int l; 196 unsigned int l;
168 int ret; 197 unsigned long ret;
169 198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
170 /* 215 /*
171 * Ensure that we sample the fifo->out index -before- we 216 * make sure that the data in the fifo is up to date before
172 * start putting bytes into the kfifo. 217 * incrementing the fifo->in index counter
173 */ 218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
174 224
175 smp_mb(); 225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
176 232
177 off = __kfifo_off(fifo, fifo->in + off); 233 if (esize != 1)
234 len /= esize;
178 235
179 /* first put the data starting from fifo->in to buffer end */ 236 l = kfifo_unused(fifo);
180 l = min(len, fifo->size - off); 237 if (len > l)
181 ret = copy_from_user(fifo->buffer + off, from, l); 238 len = l;
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
187 239
188 /* then put the rest (if any) at the beginning of the buffer */ 240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
189 ret = copy_from_user(fifo->buffer, from + l, len - l); 241 if (unlikely(ret)) {
190 *lenout += ret ? ret : len - l; 242 len -= ret;
191 return ret ? -EFAULT : 0; 243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
192} 248}
249EXPORT_SYMBOL(__kfifo_from_user);
193 250
194static inline int __kfifo_to_user_data(struct kfifo *fifo, 251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout) 252 unsigned int len, unsigned int off, unsigned int *copied)
196{ 253{
197 unsigned int l; 254 unsigned int l;
198 int ret; 255 unsigned long ret;
199 256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
200 /* 275 /*
201 * Ensure that we sample the fifo->in index -before- we 276 * make sure that the data is copied before
202 * start removing bytes from the kfifo. 277 * incrementing the fifo->out index counter
203 */ 278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
204 284
205 smp_rmb(); 285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
206 292
207 off = __kfifo_off(fifo, fifo->out + off); 293 if (esize != 1)
294 len /= esize;
208 295
209 /* first get the data from fifo->out until the end of the buffer */ 296 l = fifo->in - fifo->out;
210 l = min(len, fifo->size - off); 297 if (len > l)
211 ret = copy_to_user(to, fifo->buffer + off, l); 298 len = l;
212 *lenout = l; 299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
213 if (unlikely(ret)) { 300 if (unlikely(ret)) {
214 *lenout -= ret; 301 len -= ret;
215 return -EFAULT; 302 err = -EFAULT;
216 } 303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
217 309
218 /* then get the rest (if any) from the beginning of the buffer */ 310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
219 len -= l; 311 int nents, unsigned int len)
220 ret = copy_to_user(to + l, fifo->buffer, len); 312{
221 if (unlikely(ret)) { 313 int n;
222 *lenout += len - ret; 314 unsigned int l;
223 return -EFAULT; 315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
224 } 344 }
225 *lenout += len; 345 sg_set_page(sgl, page, len, off);
226 return 0; 346 return n + 1;
227} 347}
228 348
229unsigned int __kfifo_in_n(struct kfifo *fifo, 349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
230 const void *from, unsigned int len, unsigned int recsize) 350 int nents, unsigned int len, unsigned int off)
231{ 351{
232 if (kfifo_avail(fifo) < len + recsize) 352 unsigned int size = fifo->mask + 1;
233 return len + 1; 353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
234 356
235 __kfifo_in_data(fifo, from, len, recsize); 357 off &= fifo->mask;
236 return 0; 358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 return n;
237} 369}
238EXPORT_SYMBOL(__kfifo_in_n);
239 370
240/** 371unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
241 * kfifo_in - puts some data into the FIFO 372 struct scatterlist *sgl, int nents, unsigned int len)
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{ 373{
256 len = min(kfifo_avail(fifo), len); 374 unsigned int l;
257 375
258 __kfifo_in_data(fifo, from, len, 0); 376 l = kfifo_unused(fifo);
259 __kfifo_add_in(fifo, len); 377 if (len > l)
260 return len; 378 len = l;
379
380 return setup_sgl(fifo, sgl, nents, len, fifo->in);
261} 381}
262EXPORT_SYMBOL(kfifo_in); 382EXPORT_SYMBOL(__kfifo_dma_in_prepare);
263 383
264unsigned int __kfifo_in_generic(struct kfifo *fifo, 384unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize) 385 struct scatterlist *sgl, int nents, unsigned int len)
266{ 386{
267 return __kfifo_in_rec(fifo, from, len, recsize); 387 unsigned int l;
388
389 l = fifo->in - fifo->out;
390 if (len > l)
391 len = l;
392
393 return setup_sgl(fifo, sgl, nents, len, fifo->out);
268} 394}
269EXPORT_SYMBOL(__kfifo_in_generic); 395EXPORT_SYMBOL(__kfifo_dma_out_prepare);
270 396
271unsigned int __kfifo_out_n(struct kfifo *fifo, 397unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
272 void *to, unsigned int len, unsigned int recsize)
273{ 398{
274 if (kfifo_len(fifo) < len + recsize) 399 unsigned int max = (1 << (recsize << 3)) - 1;
275 return len;
276 400
277 __kfifo_out_data(fifo, to, len, recsize); 401 if (len > max)
278 __kfifo_add_out(fifo, len + recsize); 402 return max;
279 return 0; 403 return len;
280} 404}
281EXPORT_SYMBOL(__kfifo_out_n);
282 405
283/** 406#define __KFIFO_PEEK(data, out, mask) \
284 * kfifo_out - gets some data from the FIFO 407 ((data)[(out) & (mask)])
285 * @fifo: the fifo to be used. 408/*
 286 * @to: where the data must be copied. 409 * __kfifo_peek_n internal helper function for determining the length of
287 * @len: the size of the destination buffer. 410 * the next record in the fifo
288 *
289 * This function copies at most @len bytes from the FIFO into the
290 * @to buffer and returns the number of copied bytes.
291 *
292 * Note that with only one concurrent reader and one concurrent
293 * writer, you don't need extra locking to use these functions.
294 */ 411 */
295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) 412static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
296{ 413{
297 len = min(kfifo_len(fifo), len); 414 unsigned int l;
415 unsigned int mask = fifo->mask;
416 unsigned char *data = fifo->data;
298 417
299 __kfifo_out_data(fifo, to, len, 0); 418 l = __KFIFO_PEEK(data, fifo->out, mask);
300 __kfifo_add_out(fifo, len);
301 419
302 return len; 420 if (--recsize)
421 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
422
423 return l;
303} 424}
304EXPORT_SYMBOL(kfifo_out);
305 425
306/** 426#define __KFIFO_POKE(data, in, mask, val) \
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it 427 ( \
308 * @fifo: the fifo to be used. 428 (data)[(in) & (mask)] = (unsigned char)(val) \
309 * @to: where the data must be copied. 429 )
310 * @len: the size of the destination buffer. 430
311 * @offset: offset into the fifo 431/*
312 * 432 * __kfifo_poke_n internal helper function for storeing the length of
313 * This function copies at most @len bytes at @offset from the FIFO 433 * the record into the fifo
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */ 434 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, 435static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
318 unsigned offset)
319{ 436{
320 len = min(kfifo_len(fifo), len + offset); 437 unsigned int mask = fifo->mask;
438 unsigned char *data = fifo->data;
321 439
322 __kfifo_out_data(fifo, to, len, offset); 440 __KFIFO_POKE(data, fifo->in, mask, n);
323 return len; 441
442 if (recsize > 1)
443 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
324} 444}
325EXPORT_SYMBOL(kfifo_out_peek);
326 445
327unsigned int __kfifo_out_generic(struct kfifo *fifo, 446unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{ 447{
331 return __kfifo_out_rec(fifo, to, len, recsize, total); 448 return __kfifo_peek_n(fifo, recsize);
332} 449}
333EXPORT_SYMBOL(__kfifo_out_generic); 450EXPORT_SYMBOL(__kfifo_len_r);
334 451
335unsigned int __kfifo_from_user_n(struct kfifo *fifo, 452unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
336 const void __user *from, unsigned int len, unsigned int recsize) 453 unsigned int len, size_t recsize)
337{ 454{
338 unsigned total; 455 if (len + recsize > kfifo_unused(fifo))
456 return 0;
339 457
340 if (kfifo_avail(fifo) < len + recsize) 458 __kfifo_poke_n(fifo, len, recsize);
341 return len + 1;
342 459
343 __kfifo_from_user_data(fifo, from, len, recsize, &total); 460 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
344 return total; 461 fifo->in += len + recsize;
462 return len;
345} 463}
346EXPORT_SYMBOL(__kfifo_from_user_n); 464EXPORT_SYMBOL(__kfifo_in_r);
347 465
348/** 466static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
349 * kfifo_from_user - puts some data from user space into the FIFO 467 void *buf, unsigned int len, size_t recsize, unsigned int *n)
350 * @fifo: the fifo to be used. 468{
351 * @from: pointer to the data to be added. 469 *n = __kfifo_peek_n(fifo, recsize);
352 * @len: the length of the data to be added. 470
353 * @total: the actual returned data length. 471 if (len > *n)
354 * 472 len = *n;
355 * This function copies at most @len bytes from the @from into the 473
356 * FIFO depending and returns -EFAULT/0. 474 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
357 * 475 return len;
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
371} 476}
372EXPORT_SYMBOL(kfifo_from_user);
373 477
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo, 478unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
375 const void __user *from, unsigned int len, unsigned int recsize) 479 unsigned int len, size_t recsize)
376{ 480{
377 return __kfifo_from_user_rec(fifo, from, len, recsize); 481 unsigned int n;
482
483 if (fifo->in == fifo->out)
484 return 0;
485
486 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
378} 487}
379EXPORT_SYMBOL(__kfifo_from_user_generic); 488EXPORT_SYMBOL(__kfifo_out_peek_r);
380 489
381unsigned int __kfifo_to_user_n(struct kfifo *fifo, 490unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
382 void __user *to, unsigned int len, unsigned int reclen, 491 unsigned int len, size_t recsize)
383 unsigned int recsize)
384{ 492{
385 unsigned int ret, total; 493 unsigned int n;
386 494
387 if (kfifo_len(fifo) < reclen + recsize) 495 if (fifo->in == fifo->out)
388 return len; 496 return 0;
389 497
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); 498 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
499 fifo->out += n + recsize;
500 return len;
501}
502EXPORT_SYMBOL(__kfifo_out_r);
391 503
392 if (likely(ret == 0)) 504void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
393 __kfifo_add_out(fifo, reclen + recsize); 505{
506 unsigned int n;
394 507
395 return total; 508 n = __kfifo_peek_n(fifo, recsize);
509 fifo->out += n + recsize;
396} 510}
397EXPORT_SYMBOL(__kfifo_to_user_n); 511EXPORT_SYMBOL(__kfifo_skip_r);
398 512
399/** 513int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
400 * kfifo_to_user - gets data from the FIFO and write it to user space 514 unsigned long len, unsigned int *copied, size_t recsize)
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{ 515{
415 int ret; 516 unsigned long ret;
416 len = min(kfifo_len(fifo), len); 517
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); 518 len = __kfifo_max_r(len, recsize);
418 __kfifo_add_out(fifo, *lenout); 519
419 return ret; 520 if (len + recsize > kfifo_unused(fifo)) {
521 *copied = 0;
522 return 0;
523 }
524
525 __kfifo_poke_n(fifo, len, recsize);
526
527 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
528 if (unlikely(ret)) {
529 *copied = 0;
530 return -EFAULT;
531 }
532 fifo->in += len + recsize;
533 return 0;
420} 534}
421EXPORT_SYMBOL(kfifo_to_user); 535EXPORT_SYMBOL(__kfifo_from_user_r);
422 536
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo, 537int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
424 void __user *to, unsigned int len, unsigned int recsize, 538 unsigned long len, unsigned int *copied, size_t recsize)
425 unsigned int *total)
426{ 539{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total); 540 unsigned long ret;
541 unsigned int n;
542
543 if (fifo->in == fifo->out) {
544 *copied = 0;
545 return 0;
546 }
547
548 n = __kfifo_peek_n(fifo, recsize);
549 if (len > n)
550 len = n;
551
552 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
553 if (unlikely(ret)) {
554 *copied = 0;
555 return -EFAULT;
556 }
557 fifo->out += n + recsize;
558 return 0;
428} 559}
429EXPORT_SYMBOL(__kfifo_to_user_generic); 560EXPORT_SYMBOL(__kfifo_to_user_r);
430 561
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) 562unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
563 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
432{ 564{
433 if (recsize == 0) 565 if (!nents)
434 return kfifo_avail(fifo); 566 BUG();
435 567
436 return __kfifo_peek_n(fifo, recsize); 568 len = __kfifo_max_r(len, recsize);
569
570 if (len + recsize > kfifo_unused(fifo))
571 return 0;
572
573 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
437} 574}
438EXPORT_SYMBOL(__kfifo_peek_generic); 575EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
439 576
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) 577void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
578 unsigned int len, size_t recsize)
441{ 579{
442 __kfifo_skip_rec(fifo, recsize); 580 len = __kfifo_max_r(len, recsize);
581 __kfifo_poke_n(fifo, len, recsize);
582 fifo->in += len + recsize;
443} 583}
444EXPORT_SYMBOL(__kfifo_skip_generic); 584EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
445 585
586unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
587 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
588{
589 if (!nents)
590 BUG();
591
592 len = __kfifo_max_r(len, recsize);
593
594 if (len + recsize > fifo->in - fifo->out)
595 return 0;
596
597 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
598}
599EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
600
601void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
602{
603 unsigned int len;
604
605 len = __kfifo_peek_n(fifo, recsize);
606 fifo->out += len + recsize;
607}
608EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
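The rewritten kfifo above keeps fifo->in and fifo->out as free-running counters and forces the buffer size to a power of two, so the real index is just `counter & mask` and the fill level is `in - out` even after the counters wrap; kfifo_unused() and the copy helpers are all built on that invariant. Below is a small stand-alone user-space model of the same index scheme for a single producer and single consumer; the names and the fixed 8-byte buffer are illustrative, not the kernel API.

        #include <stdio.h>
        #include <string.h>

        #define FIFO_SIZE 8                     /* must be a power of two */
        #define FIFO_MASK (FIFO_SIZE - 1)

        struct model_fifo {
                unsigned char data[FIFO_SIZE];
                unsigned int in;                /* free-running write counter */
                unsigned int out;               /* free-running read counter */
        };

        /* Mirrors kfifo_unused(): capacity minus the used element count. */
        static unsigned int model_unused(const struct model_fifo *f)
        {
                return FIFO_SIZE - (f->in - f->out);
        }

        /* Mirrors kfifo_copy_in() + __kfifo_in() for esize == 1. */
        static unsigned int model_in(struct model_fifo *f, const void *src,
                                     unsigned int len)
        {
                unsigned int off = f->in & FIFO_MASK;
                unsigned int l;

                if (len > model_unused(f))
                        len = model_unused(f);
                l = len < FIFO_SIZE - off ? len : FIFO_SIZE - off;
                memcpy(f->data + off, src, l);                   /* up to the end */
                memcpy(f->data, (const char *)src + l, len - l); /* wrap to start */
                f->in += len;
                return len;
        }

        int main(void)
        {
                struct model_fifo f = { .in = 0, .out = 0 };

                printf("stored %u of 5 bytes\n", model_in(&f, "hello", 5));
                return 0;
        }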
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6e9b19667a8d..9cd0591c96a2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data)
153 goto fail; 153 goto fail;
154 } 154 }
155 155
156 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 156 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp);
157 159
158 /* Exec failed? */ 160 /* Exec failed? */
159fail: 161fail:
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..99865c33a60d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
47#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/jump_label.h>
50 51
51#include <asm-generic/sections.h> 52#include <asm-generic/sections.h>
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
@@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
73/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
74static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
75 76
76static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
77static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
78static struct { 80static struct {
79 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -399,7 +401,7 @@ static inline int kprobe_optready(struct kprobe *p)
399 * Return an optimized kprobe whose optimizing code replaces 401 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint). 402 * instructions including addr (exclude breakpoint).
401 */ 403 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 404static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{ 405{
404 int i; 406 int i;
405 struct kprobe *p = NULL; 407 struct kprobe *p = NULL;
@@ -594,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
594} 596}
595 597
596#ifdef CONFIG_SYSCTL 598#ifdef CONFIG_SYSCTL
599/* This should be called with kprobe_mutex locked */
597static void __kprobes optimize_all_kprobes(void) 600static void __kprobes optimize_all_kprobes(void)
598{ 601{
599 struct hlist_head *head; 602 struct hlist_head *head;
@@ -606,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
606 return; 609 return;
607 610
608 kprobes_allow_optimization = true; 611 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 612 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i]; 613 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist) 614 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p)) 615 if (!kprobe_disabled(p))
614 optimize_kprobe(p); 616 optimize_kprobe(p);
615 } 617 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n"); 618 printk(KERN_INFO "Kprobes globally optimized\n");
618} 619}
619 620
621/* This should be called with kprobe_mutex locked */
620static void __kprobes unoptimize_all_kprobes(void) 622static void __kprobes unoptimize_all_kprobes(void)
621{ 623{
622 struct hlist_head *head; 624 struct hlist_head *head;
@@ -831,6 +833,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
831 833
832void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 834void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
833 struct hlist_head **head, unsigned long *flags) 835 struct hlist_head **head, unsigned long *flags)
836__acquires(hlist_lock)
834{ 837{
835 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 838 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
836 spinlock_t *hlist_lock; 839 spinlock_t *hlist_lock;
@@ -842,6 +845,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
842 845
843static void __kprobes kretprobe_table_lock(unsigned long hash, 846static void __kprobes kretprobe_table_lock(unsigned long hash,
844 unsigned long *flags) 847 unsigned long *flags)
848__acquires(hlist_lock)
845{ 849{
846 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 850 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
847 spin_lock_irqsave(hlist_lock, *flags); 851 spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +853,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
849 853
850void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 854void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
851 unsigned long *flags) 855 unsigned long *flags)
856__releases(hlist_lock)
852{ 857{
853 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 858 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
854 spinlock_t *hlist_lock; 859 spinlock_t *hlist_lock;
@@ -857,7 +862,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
857 spin_unlock_irqrestore(hlist_lock, *flags); 862 spin_unlock_irqrestore(hlist_lock, *flags);
858} 863}
859 864
860void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 865static void __kprobes kretprobe_table_unlock(unsigned long hash,
866 unsigned long *flags)
867__releases(hlist_lock)
861{ 868{
862 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 869 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
863 spin_unlock_irqrestore(hlist_lock, *flags); 870 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1148,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1141 preempt_disable(); 1148 preempt_disable();
1142 if (!kernel_text_address((unsigned long) p->addr) || 1149 if (!kernel_text_address((unsigned long) p->addr) ||
1143 in_kprobes_functions((unsigned long) p->addr) || 1150 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) { 1151 ftrace_text_reserved(p->addr, p->addr) ||
1152 jump_label_text_reserved(p->addr, p->addr)) {
1145 preempt_enable(); 1153 preempt_enable();
1146 return -EINVAL; 1154 return -EINVAL;
1147 } 1155 }
@@ -1339,18 +1347,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1339 if (num <= 0) 1347 if (num <= 0)
1340 return -EINVAL; 1348 return -EINVAL;
1341 for (i = 0; i < num; i++) { 1349 for (i = 0; i < num; i++) {
1342 unsigned long addr; 1350 unsigned long addr, offset;
1343 jp = jps[i]; 1351 jp = jps[i];
1344 addr = arch_deref_entry_point(jp->entry); 1352 addr = arch_deref_entry_point(jp->entry);
1345 1353
1346 if (!kernel_text_address(addr)) 1354 /* Verify probepoint is a function entry point */
1347 ret = -EINVAL; 1355 if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
1348 else { 1356 offset == 0) {
1349 /* Todo: Verify probepoint is a function entry point */
1350 jp->kp.pre_handler = setjmp_pre_handler; 1357 jp->kp.pre_handler = setjmp_pre_handler;
1351 jp->kp.break_handler = longjmp_break_handler; 1358 jp->kp.break_handler = longjmp_break_handler;
1352 ret = register_kprobe(&jp->kp); 1359 ret = register_kprobe(&jp->kp);
1353 } 1360 } else
1361 ret = -EINVAL;
1362
1354 if (ret < 0) { 1363 if (ret < 0) {
1355 if (i > 0) 1364 if (i > 0)
1356 unregister_jprobes(jps, i); 1365 unregister_jprobes(jps, i);
@@ -1992,6 +2001,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1992static const struct file_operations fops_kp = { 2001static const struct file_operations fops_kp = {
1993 .read = read_enabled_file_bool, 2002 .read = read_enabled_file_bool,
1994 .write = write_enabled_file_bool, 2003 .write = write_enabled_file_bool,
2004 .llseek = default_llseek,
1995}; 2005};
1996 2006
1997static int __kprobes debugfs_kprobe_init(void) 2007static int __kprobes debugfs_kprobe_init(void)
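With this hunk, register_jprobes() stops accepting any address that merely lies in kernel text and instead requires, via kallsyms_lookup_size_offset(), that the jprobe entry resolve to a function entry point (offset 0); anything else now fails with -EINVAL. For reference, a conforming user looks roughly like the sketch below, patterned on the usual samples/kprobes jprobe example; the do_fork() symbol and mirror signature are illustrative for kernels of this era and are not part of this patch.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

/* Mirror routine: must match the probed function's signature exactly. */
static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
		     struct pt_regs *regs, unsigned long stack_size,
		     int __user *parent_tidptr, int __user *child_tidptr)
{
	pr_info("jprobe: clone_flags=0x%lx stack_size=0x%lx\n",
		clone_flags, stack_size);
	jprobe_return();	/* mandatory: hands control back to do_fork() */
	return 0;
}

static struct jprobe my_jprobe = {
	.entry	= jdo_fork,	/* must be a real function entry now */
	.kp	= { .symbol_name = "do_fork" },
};

static int __init jprobe_init(void)
{
	return register_jprobe(&my_jprobe);
}

static void __exit jprobe_exit(void)
{
	unregister_jprobe(&my_jprobe);
}

module_init(jprobe_init);
module_exit(jprobe_exit);
MODULE_LICENSE("GPL");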
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c780175..2dc3786349d1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,8 @@
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
18#include <linux/freezer.h>
17#include <trace/events/sched.h> 19#include <trace/events/sched.h>
18 20
19static DEFINE_SPINLOCK(kthread_create_lock); 21static DEFINE_SPINLOCK(kthread_create_lock);
@@ -35,6 +37,7 @@ struct kthread_create_info
35 37
36struct kthread { 38struct kthread {
37 int should_stop; 39 int should_stop;
40 void *data;
38 struct completion exited; 41 struct completion exited;
39}; 42};
40 43
@@ -54,6 +57,19 @@ int kthread_should_stop(void)
54} 57}
55EXPORT_SYMBOL(kthread_should_stop); 58EXPORT_SYMBOL(kthread_should_stop);
56 59
60/**
61 * kthread_data - return data value specified on kthread creation
62 * @task: kthread task in question
63 *
64 * Return the data value specified when kthread @task was created.
65 * The caller is responsible for ensuring the validity of @task when
66 * calling this function.
67 */
68void *kthread_data(struct task_struct *task)
69{
70 return to_kthread(task)->data;
71}
72
57static int kthread(void *_create) 73static int kthread(void *_create)
58{ 74{
59 /* Copy data: it's on kthread's stack */ 75 /* Copy data: it's on kthread's stack */
@@ -64,6 +80,7 @@ static int kthread(void *_create)
64 int ret; 80 int ret;
65 81
66 self.should_stop = 0; 82 self.should_stop = 0;
83 self.data = data;
67 init_completion(&self.exited); 84 init_completion(&self.exited);
68 current->vfork_done = &self.exited; 85 current->vfork_done = &self.exited;
69 86
@@ -247,3 +264,150 @@ int kthreadd(void *unused)
247 264
248 return 0; 265 return 0;
249} 266}
267
268/**
269 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker
271 *
272 * This function can be used as @threadfn to kthread_create() or
273 * kthread_run() with @worker_ptr argument pointing to an initialized
274 * kthread_worker. The started kthread will process work_list until
275 * the it is stopped with kthread_stop(). A kthread can also call
276 * this function directly after extra initialization.
277 *
278 * Different kthreads can be used for the same kthread_worker as long
279 * as there's only one kthread attached to it at any given time. A
280 * kthread_worker without an attached kthread simply collects queued
281 * kthread_works.
282 */
283int kthread_worker_fn(void *worker_ptr)
284{
285 struct kthread_worker *worker = worker_ptr;
286 struct kthread_work *work;
287
288 WARN_ON(worker->task);
289 worker->task = current;
290repeat:
291 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
292
293 if (kthread_should_stop()) {
294 __set_current_state(TASK_RUNNING);
295 spin_lock_irq(&worker->lock);
296 worker->task = NULL;
297 spin_unlock_irq(&worker->lock);
298 return 0;
299 }
300
301 work = NULL;
302 spin_lock_irq(&worker->lock);
303 if (!list_empty(&worker->work_list)) {
304 work = list_first_entry(&worker->work_list,
305 struct kthread_work, node);
306 list_del_init(&work->node);
307 }
308 spin_unlock_irq(&worker->lock);
309
310 if (work) {
311 __set_current_state(TASK_RUNNING);
312 work->func(work);
313 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
314 work->done_seq = work->queue_seq;
315 smp_mb(); /* mb worker-b1 paired with flush-b0 */
316 if (atomic_read(&work->flushing))
317 wake_up_all(&work->done);
318 } else if (!freezing(current))
319 schedule();
320
321 try_to_freeze();
322 goto repeat;
323}
324EXPORT_SYMBOL_GPL(kthread_worker_fn);
325
326/**
327 * queue_kthread_work - queue a kthread_work
328 * @worker: target kthread_worker
329 * @work: kthread_work to queue
330 *
 331 * Queue @work to the kthread_worker @worker for async execution.
 332 * Returns %true if @work was successfully queued, %false if it was
 333 * already pending.
334 */
335bool queue_kthread_work(struct kthread_worker *worker,
336 struct kthread_work *work)
337{
338 bool ret = false;
339 unsigned long flags;
340
341 spin_lock_irqsave(&worker->lock, flags);
342 if (list_empty(&work->node)) {
343 list_add_tail(&work->node, &worker->work_list);
344 work->queue_seq++;
345 if (likely(worker->task))
346 wake_up_process(worker->task);
347 ret = true;
348 }
349 spin_unlock_irqrestore(&worker->lock, flags);
350 return ret;
351}
352EXPORT_SYMBOL_GPL(queue_kthread_work);
353
354/**
355 * flush_kthread_work - flush a kthread_work
356 * @work: work to flush
357 *
358 * If @work is queued or executing, wait for it to finish execution.
359 */
360void flush_kthread_work(struct kthread_work *work)
361{
362 int seq = work->queue_seq;
363
364 atomic_inc(&work->flushing);
365
366 /*
367 * mb flush-b0 paired with worker-b1, to make sure either
368 * worker sees the above increment or we see done_seq update.
369 */
370 smp_mb__after_atomic_inc();
371
372 /* A - B <= 0 tests whether B is in front of A regardless of overflow */
373 wait_event(work->done, seq - work->done_seq <= 0);
374 atomic_dec(&work->flushing);
375
376 /*
377 * rmb flush-b1 paired with worker-b0, to make sure our caller
378 * sees every change made by work->func().
379 */
380 smp_mb__after_atomic_dec();
381}
382EXPORT_SYMBOL_GPL(flush_kthread_work);
383
384struct kthread_flush_work {
385 struct kthread_work work;
386 struct completion done;
387};
388
389static void kthread_flush_work_fn(struct kthread_work *work)
390{
391 struct kthread_flush_work *fwork =
392 container_of(work, struct kthread_flush_work, work);
393 complete(&fwork->done);
394}
395
396/**
397 * flush_kthread_worker - flush all current works on a kthread_worker
398 * @worker: worker to flush
399 *
400 * Wait until all currently executing or pending works on @worker are
401 * finished.
402 */
403void flush_kthread_worker(struct kthread_worker *worker)
404{
405 struct kthread_flush_work fwork = {
406 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
407 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
408 };
409
410 queue_kthread_work(worker, &fwork.work);
411 wait_for_completion(&fwork.done);
412}
413EXPORT_SYMBOL_GPL(flush_kthread_worker);
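Taken together, kthread_worker_fn(), queue_kthread_work(), flush_kthread_work() and flush_kthread_worker() form a minimal single-thread work queue. A rough usage sketch follows; it assumes the companion declarations added to include/linux/kthread.h in the same series (struct kthread_worker, struct kthread_work, init_kthread_worker(), init_kthread_work()), which are not part of this hunk, and every other name below is made up.

#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/err.h>

struct frob_item {
	struct kthread_work work;
	int payload;			/* hypothetical per-item data */
};

static struct kthread_worker frob_worker;
static struct task_struct *frob_task;
static struct frob_item frob_item;

static void frob_work_fn(struct kthread_work *work)
{
	struct frob_item *item = container_of(work, struct frob_item, work);

	pr_info("frob: handling payload %d\n", item->payload);
}

static int frob_start(void)
{
	init_kthread_worker(&frob_worker);

	/* The worker thread does nothing but run kthread_worker_fn(). */
	frob_task = kthread_run(kthread_worker_fn, &frob_worker, "frobd");
	if (IS_ERR(frob_task))
		return PTR_ERR(frob_task);

	/* Per the hunk above, kthread_data(frob_task) now returns &frob_worker. */

	init_kthread_work(&frob_item.work, frob_work_fn);
	frob_item.payload = 42;
	queue_kthread_work(&frob_worker, &frob_item.work);

	/* Wait for this item only ... */
	flush_kthread_work(&frob_item.work);
	/* ... or for everything queued so far. */
	flush_kthread_worker(&frob_worker);
	return 0;
}

static void frob_stop(void)
{
	flush_kthread_worker(&frob_worker);
	kthread_stop(frob_task);	/* kthread_worker_fn() returns 0 here */
}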
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..42ba65dff7d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
146 146
147static inline u64 lockstat_clock(void) 147static inline u64 lockstat_clock(void)
148{ 148{
149 return cpu_clock(smp_processor_id()); 149 return local_clock();
150} 150}
151 151
152static int lock_point(unsigned long points[], unsigned long ip) 152static int lock_point(unsigned long points[], unsigned long ip)
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
639 } 639 }
640#endif 640#endif
641 641
642 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
643 debug_locks_off();
644 printk(KERN_ERR
645 "BUG: looking up invalid subclass: %u\n", subclass);
646 printk(KERN_ERR
647 "turning off the locking correctness validator.\n");
648 dump_stack();
649 return NULL;
650 }
651
642 /* 652 /*
643 * Static locks do not have their class-keys yet - for them the key 653 * Static locks do not have their class-keys yet - for them the key
644 * is the lock object itself: 654 * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
774 raw_local_irq_restore(flags); 784 raw_local_irq_restore(flags);
775 785
776 if (!subclass || force) 786 if (!subclass || force)
777 lock->class_cache = class; 787 lock->class_cache[0] = class;
788 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
789 lock->class_cache[subclass] = class;
778 790
779 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 791 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
780 return NULL; 792 return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2679void lockdep_init_map(struct lockdep_map *lock, const char *name, 2691void lockdep_init_map(struct lockdep_map *lock, const char *name,
2680 struct lock_class_key *key, int subclass) 2692 struct lock_class_key *key, int subclass)
2681{ 2693{
2682 lock->class_cache = NULL; 2694 int i;
2695
2696 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2697 lock->class_cache[i] = NULL;
2698
2683#ifdef CONFIG_LOCK_STAT 2699#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2700 lock->cpu = raw_smp_processor_id();
2685#endif 2701#endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2739 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2755 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2740 return 0; 2756 return 0;
2741 2757
2742 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
2743 debug_locks_off();
2744 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2745 printk("turning off the locking correctness validator.\n");
2746 dump_stack();
2747 return 0;
2748 }
2749
2750 if (lock->key == &__lockdep_no_validate__) 2758 if (lock->key == &__lockdep_no_validate__)
2751 check = 1; 2759 check = 1;
2752 2760
2753 if (!subclass) 2761 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
2754 class = lock->class_cache; 2762 class = lock->class_cache[subclass];
2755 /* 2763 /*
2756 * Not cached yet or subclass? 2764 * Not cached?
2757 */ 2765 */
2758 if (unlikely(!class)) { 2766 if (unlikely(!class)) {
2759 class = register_lock_class(lock, subclass, 0); 2767 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2918 return 1; 2926 return 1;
2919 2927
2920 if (hlock->references) { 2928 if (hlock->references) {
2921 struct lock_class *class = lock->class_cache; 2929 struct lock_class *class = lock->class_cache[0];
2922 2930
2923 if (!class) 2931 if (!class)
2924 class = look_up_lock_class(lock, 0); 2932 class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3559 if (list_empty(head)) 3567 if (list_empty(head))
3560 continue; 3568 continue;
3561 list_for_each_entry_safe(class, next, head, hash_entry) { 3569 list_for_each_entry_safe(class, next, head, hash_entry) {
3562 if (unlikely(class == lock->class_cache)) { 3570 int match = 0;
3571
3572 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
3573 match |= class == lock->class_cache[j];
3574
3575 if (unlikely(match)) {
3563 if (debug_locks_off_graph_unlock()) 3576 if (debug_locks_off_graph_unlock())
3564 WARN_ON(1); 3577 WARN_ON(1);
3565 goto out_restore; 3578 goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
3775 * Careful: only use this function if you are sure that 3788 * Careful: only use this function if you are sure that
3776 * the task cannot run in parallel! 3789 * the task cannot run in parallel!
3777 */ 3790 */
3778void __debug_show_held_locks(struct task_struct *task) 3791void debug_show_held_locks(struct task_struct *task)
3779{ 3792{
3780 if (unlikely(!debug_locks)) { 3793 if (unlikely(!debug_locks)) {
3781 printk("INFO: lockdep is turned off.\n"); 3794 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
3783 } 3796 }
3784 lockdep_print_held_locks(task); 3797 lockdep_print_held_locks(task);
3785} 3798}
3786EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3787
3788void debug_show_held_locks(struct task_struct *task)
3789{
3790 __debug_show_held_locks(task);
3791}
3792EXPORT_SYMBOL_GPL(debug_show_held_locks); 3799EXPORT_SYMBOL_GPL(debug_show_held_locks);
3793 3800
3794void lockdep_sys_exit(void) 3801void lockdep_sys_exit(void)
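The class_cache[] change matters for the _nested() lock APIs: each distinct subclass maps to its own lock class, and with a single cache slot every acquisition with subclass > 0 missed the cache and fell back to register_lock_class(). A minimal sketch of the pattern that now hits class_cache[1]; the struct and function names are invented, and real code would also order the two locks (e.g. by address) to avoid ABBA deadlocks.

#include <linux/spinlock.h>

struct frob {
	spinlock_t lock;	/* both instances share one lock class */
	int value;
};

static void frob_transfer(struct frob *from, struct frob *to, int amount)
{
	spin_lock(&from->lock);			/* subclass 0 -> class_cache[0] */
	spin_lock_nested(&to->lock, SINGLE_DEPTH_NESTING);
						/* subclass 1 -> class_cache[1] */
	from->value -= amount;
	to->value += amount;
	spin_unlock(&to->lock);
	spin_unlock(&from->lock);
}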
diff --git a/kernel/module.c b/kernel/module.c
index 6c562828c85c..437a74a7524a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,6 +1,6 @@
1/* 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU General Public License as published by
@@ -55,6 +55,7 @@
55#include <linux/async.h> 55#include <linux/async.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h>
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 61#include <trace/events/module.h>
@@ -110,6 +111,20 @@ int unregister_module_notifier(struct notifier_block * nb)
110} 111}
111EXPORT_SYMBOL(unregister_module_notifier); 112EXPORT_SYMBOL(unregister_module_notifier);
112 113
114struct load_info {
115 Elf_Ehdr *hdr;
116 unsigned long len;
117 Elf_Shdr *sechdrs;
118 char *secstrings, *strtab;
119 unsigned long *strmap;
120 unsigned long symoffs, stroffs;
121 struct _ddebug *debug;
122 unsigned int num_debug;
123 struct {
124 unsigned int sym, str, mod, vers, info, pcpu;
125 } index;
126};
127
113/* We require a truly strong try_module_get(): 0 means failure due to 128/* We require a truly strong try_module_get(): 0 means failure due to
114 ongoing or failed initialization etc. */ 129 ongoing or failed initialization etc. */
115static inline int strong_try_module_get(struct module *mod) 130static inline int strong_try_module_get(struct module *mod)
@@ -140,42 +155,38 @@ void __module_put_and_exit(struct module *mod, long code)
140EXPORT_SYMBOL(__module_put_and_exit); 155EXPORT_SYMBOL(__module_put_and_exit);
141 156
142/* Find a module section: 0 means not found. */ 157/* Find a module section: 0 means not found. */
143static unsigned int find_sec(Elf_Ehdr *hdr, 158static unsigned int find_sec(const struct load_info *info, const char *name)
144 Elf_Shdr *sechdrs,
145 const char *secstrings,
146 const char *name)
147{ 159{
148 unsigned int i; 160 unsigned int i;
149 161
150 for (i = 1; i < hdr->e_shnum; i++) 162 for (i = 1; i < info->hdr->e_shnum; i++) {
163 Elf_Shdr *shdr = &info->sechdrs[i];
151 /* Alloc bit cleared means "ignore it." */ 164 /* Alloc bit cleared means "ignore it." */
152 if ((sechdrs[i].sh_flags & SHF_ALLOC) 165 if ((shdr->sh_flags & SHF_ALLOC)
153 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) 166 && strcmp(info->secstrings + shdr->sh_name, name) == 0)
154 return i; 167 return i;
168 }
155 return 0; 169 return 0;
156} 170}
157 171
158/* Find a module section, or NULL. */ 172/* Find a module section, or NULL. */
159static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, 173static void *section_addr(const struct load_info *info, const char *name)
160 const char *secstrings, const char *name)
161{ 174{
162 /* Section 0 has sh_addr 0. */ 175 /* Section 0 has sh_addr 0. */
163 return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; 176 return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
164} 177}
165 178
166/* Find a module section, or NULL. Fill in number of "objects" in section. */ 179/* Find a module section, or NULL. Fill in number of "objects" in section. */
167static void *section_objs(Elf_Ehdr *hdr, 180static void *section_objs(const struct load_info *info,
168 Elf_Shdr *sechdrs,
169 const char *secstrings,
170 const char *name, 181 const char *name,
171 size_t object_size, 182 size_t object_size,
172 unsigned int *num) 183 unsigned int *num)
173{ 184{
174 unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); 185 unsigned int sec = find_sec(info, name);
175 186
176 /* Section 0 has sh_addr 0 and sh_size 0. */ 187 /* Section 0 has sh_addr 0 and sh_size 0. */
177 *num = sechdrs[sec].sh_size / object_size; 188 *num = info->sechdrs[sec].sh_size / object_size;
178 return (void *)sechdrs[sec].sh_addr; 189 return (void *)info->sechdrs[sec].sh_addr;
179} 190}
180 191
181/* Provided by the linker */ 192/* Provided by the linker */
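The practical effect of struct load_info shows up in the helpers just above: find_sec(), section_addr() and section_objs() drop from four-plus parameters to an (info, name) pair, and every later consumer can follow suit. As a sketch of what that buys, a new-style helper needs nothing beyond the info pointer; module_has_section() below is hypothetical and not part of this patch.

/* Hypothetical helper in the post-refactor style. */
static bool module_has_section(const struct load_info *info, const char *name)
{
	/* find_sec() returns 0 for "not found"; section 0 is never a match. */
	return find_sec(info, name) != 0;
}

A caller inside load_module() could then write, say, module_has_section(info, ".data..percpu") the same way find_pcpusec() already does.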
@@ -227,7 +238,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
227 unsigned int symnum, void *data), void *data) 238 unsigned int symnum, void *data), void *data)
228{ 239{
229 struct module *mod; 240 struct module *mod;
230 const struct symsearch arr[] = { 241 static const struct symsearch arr[] = {
231 { __start___ksymtab, __stop___ksymtab, __start___kcrctab, 242 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
232 NOT_GPL_ONLY, false }, 243 NOT_GPL_ONLY, false },
233 { __start___ksymtab_gpl, __stop___ksymtab_gpl, 244 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
@@ -392,7 +403,8 @@ static int percpu_modalloc(struct module *mod,
392 mod->percpu = __alloc_reserved_percpu(size, align); 403 mod->percpu = __alloc_reserved_percpu(size, align);
393 if (!mod->percpu) { 404 if (!mod->percpu) {
394 printk(KERN_WARNING 405 printk(KERN_WARNING
395 "Could not allocate %lu bytes percpu data\n", size); 406 "%s: Could not allocate %lu bytes percpu data\n",
407 mod->name, size);
396 return -ENOMEM; 408 return -ENOMEM;
397 } 409 }
398 mod->percpu_size = size; 410 mod->percpu_size = size;
@@ -404,11 +416,9 @@ static void percpu_modfree(struct module *mod)
404 free_percpu(mod->percpu); 416 free_percpu(mod->percpu);
405} 417}
406 418
407static unsigned int find_pcpusec(Elf_Ehdr *hdr, 419static unsigned int find_pcpusec(struct load_info *info)
408 Elf_Shdr *sechdrs,
409 const char *secstrings)
410{ 420{
411 return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); 421 return find_sec(info, ".data..percpu");
412} 422}
413 423
414static void percpu_modcopy(struct module *mod, 424static void percpu_modcopy(struct module *mod,
@@ -468,9 +478,7 @@ static inline int percpu_modalloc(struct module *mod,
468static inline void percpu_modfree(struct module *mod) 478static inline void percpu_modfree(struct module *mod)
469{ 479{
470} 480}
471static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 481static unsigned int find_pcpusec(struct load_info *info)
472 Elf_Shdr *sechdrs,
473 const char *secstrings)
474{ 482{
475 return 0; 483 return 0;
476} 484}
@@ -524,21 +532,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
524EXPORT_TRACEPOINT_SYMBOL(module_get); 532EXPORT_TRACEPOINT_SYMBOL(module_get);
525 533
526/* Init the unload section of the module. */ 534/* Init the unload section of the module. */
527static void module_unload_init(struct module *mod) 535static int module_unload_init(struct module *mod)
528{ 536{
529 int cpu; 537 mod->refptr = alloc_percpu(struct module_ref);
538 if (!mod->refptr)
539 return -ENOMEM;
530 540
531 INIT_LIST_HEAD(&mod->source_list); 541 INIT_LIST_HEAD(&mod->source_list);
532 INIT_LIST_HEAD(&mod->target_list); 542 INIT_LIST_HEAD(&mod->target_list);
533 for_each_possible_cpu(cpu) {
534 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
535 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
536 }
537 543
538 /* Hold reference count during initialization. */ 544 /* Hold reference count during initialization. */
539 __this_cpu_write(mod->refptr->incs, 1); 545 __this_cpu_write(mod->refptr->incs, 1);
540 /* Backwards compatibility macros put refcount during init. */ 546 /* Backwards compatibility macros put refcount during init. */
541 mod->waiter = current; 547 mod->waiter = current;
548
549 return 0;
542} 550}
543 551
544/* Does a already use b? */ 552/* Does a already use b? */
@@ -618,6 +626,8 @@ static void module_unload_free(struct module *mod)
618 kfree(use); 626 kfree(use);
619 } 627 }
620 mutex_unlock(&module_mutex); 628 mutex_unlock(&module_mutex);
629
630 free_percpu(mod->refptr);
621} 631}
622 632
623#ifdef CONFIG_MODULE_FORCE_UNLOAD 633#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -891,8 +901,9 @@ int ref_module(struct module *a, struct module *b)
891} 901}
892EXPORT_SYMBOL_GPL(ref_module); 902EXPORT_SYMBOL_GPL(ref_module);
893 903
894static inline void module_unload_init(struct module *mod) 904static inline int module_unload_init(struct module *mod)
895{ 905{
906 return 0;
896} 907}
897#endif /* CONFIG_MODULE_UNLOAD */ 908#endif /* CONFIG_MODULE_UNLOAD */
898 909
@@ -1051,10 +1062,9 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1051#endif /* CONFIG_MODVERSIONS */ 1062#endif /* CONFIG_MODVERSIONS */
1052 1063
1053/* Resolve a symbol for this module. I.e. if we find one, record usage. */ 1064/* Resolve a symbol for this module. I.e. if we find one, record usage. */
1054static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1065static const struct kernel_symbol *resolve_symbol(struct module *mod,
1055 unsigned int versindex, 1066 const struct load_info *info,
1056 const char *name, 1067 const char *name,
1057 struct module *mod,
1058 char ownername[]) 1068 char ownername[])
1059{ 1069{
1060 struct module *owner; 1070 struct module *owner;
@@ -1068,7 +1078,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1068 if (!sym) 1078 if (!sym)
1069 goto unlock; 1079 goto unlock;
1070 1080
1071 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { 1081 if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
1082 owner)) {
1072 sym = ERR_PTR(-EINVAL); 1083 sym = ERR_PTR(-EINVAL);
1073 goto getname; 1084 goto getname;
1074 } 1085 }
@@ -1087,21 +1098,20 @@ unlock:
1087 return sym; 1098 return sym;
1088} 1099}
1089 1100
1090static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, 1101static const struct kernel_symbol *
1091 unsigned int versindex, 1102resolve_symbol_wait(struct module *mod,
1092 const char *name, 1103 const struct load_info *info,
1093 struct module *mod) 1104 const char *name)
1094{ 1105{
1095 const struct kernel_symbol *ksym; 1106 const struct kernel_symbol *ksym;
1096 char ownername[MODULE_NAME_LEN]; 1107 char owner[MODULE_NAME_LEN];
1097 1108
1098 if (wait_event_interruptible_timeout(module_wq, 1109 if (wait_event_interruptible_timeout(module_wq,
1099 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, 1110 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
1100 mod, ownername)) || 1111 || PTR_ERR(ksym) != -EBUSY,
1101 PTR_ERR(ksym) != -EBUSY,
1102 30 * HZ) <= 0) { 1112 30 * HZ) <= 0) {
1103 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", 1113 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1104 mod->name, ownername); 1114 mod->name, owner);
1105 } 1115 }
1106 return ksym; 1116 return ksym;
1107} 1117}
@@ -1110,8 +1120,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1110 * /sys/module/foo/sections stuff 1120 * /sys/module/foo/sections stuff
1111 * J. Corbet <corbet@lwn.net> 1121 * J. Corbet <corbet@lwn.net>
1112 */ 1122 */
1113#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1123#ifdef CONFIG_SYSFS
1114 1124
1125#ifdef CONFIG_KALLSYMS
1115static inline bool sect_empty(const Elf_Shdr *sect) 1126static inline bool sect_empty(const Elf_Shdr *sect)
1116{ 1127{
1117 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1128 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
@@ -1148,8 +1159,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1148 kfree(sect_attrs); 1159 kfree(sect_attrs);
1149} 1160}
1150 1161
1151static void add_sect_attrs(struct module *mod, unsigned int nsect, 1162static void add_sect_attrs(struct module *mod, const struct load_info *info)
1152 char *secstrings, Elf_Shdr *sechdrs)
1153{ 1163{
1154 unsigned int nloaded = 0, i, size[2]; 1164 unsigned int nloaded = 0, i, size[2];
1155 struct module_sect_attrs *sect_attrs; 1165 struct module_sect_attrs *sect_attrs;
@@ -1157,8 +1167,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1157 struct attribute **gattr; 1167 struct attribute **gattr;
1158 1168
1159 /* Count loaded sections and allocate structures */ 1169 /* Count loaded sections and allocate structures */
1160 for (i = 0; i < nsect; i++) 1170 for (i = 0; i < info->hdr->e_shnum; i++)
1161 if (!sect_empty(&sechdrs[i])) 1171 if (!sect_empty(&info->sechdrs[i]))
1162 nloaded++; 1172 nloaded++;
1163 size[0] = ALIGN(sizeof(*sect_attrs) 1173 size[0] = ALIGN(sizeof(*sect_attrs)
1164 + nloaded * sizeof(sect_attrs->attrs[0]), 1174 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1175,11 +1185,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1175 sect_attrs->nsections = 0; 1185 sect_attrs->nsections = 0;
1176 sattr = &sect_attrs->attrs[0]; 1186 sattr = &sect_attrs->attrs[0];
1177 gattr = &sect_attrs->grp.attrs[0]; 1187 gattr = &sect_attrs->grp.attrs[0];
1178 for (i = 0; i < nsect; i++) { 1188 for (i = 0; i < info->hdr->e_shnum; i++) {
1179 if (sect_empty(&sechdrs[i])) 1189 Elf_Shdr *sec = &info->sechdrs[i];
1190 if (sect_empty(sec))
1180 continue; 1191 continue;
1181 sattr->address = sechdrs[i].sh_addr; 1192 sattr->address = sec->sh_addr;
1182 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1193 sattr->name = kstrdup(info->secstrings + sec->sh_name,
1183 GFP_KERNEL); 1194 GFP_KERNEL);
1184 if (sattr->name == NULL) 1195 if (sattr->name == NULL)
1185 goto out; 1196 goto out;
@@ -1247,8 +1258,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1247 kfree(notes_attrs); 1258 kfree(notes_attrs);
1248} 1259}
1249 1260
1250static void add_notes_attrs(struct module *mod, unsigned int nsect, 1261static void add_notes_attrs(struct module *mod, const struct load_info *info)
1251 char *secstrings, Elf_Shdr *sechdrs)
1252{ 1262{
1253 unsigned int notes, loaded, i; 1263 unsigned int notes, loaded, i;
1254 struct module_notes_attrs *notes_attrs; 1264 struct module_notes_attrs *notes_attrs;
@@ -1260,9 +1270,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1260 1270
1261 /* Count notes sections and allocate structures. */ 1271 /* Count notes sections and allocate structures. */
1262 notes = 0; 1272 notes = 0;
1263 for (i = 0; i < nsect; i++) 1273 for (i = 0; i < info->hdr->e_shnum; i++)
1264 if (!sect_empty(&sechdrs[i]) && 1274 if (!sect_empty(&info->sechdrs[i]) &&
1265 (sechdrs[i].sh_type == SHT_NOTE)) 1275 (info->sechdrs[i].sh_type == SHT_NOTE))
1266 ++notes; 1276 ++notes;
1267 1277
1268 if (notes == 0) 1278 if (notes == 0)
@@ -1276,15 +1286,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1276 1286
1277 notes_attrs->notes = notes; 1287 notes_attrs->notes = notes;
1278 nattr = &notes_attrs->attrs[0]; 1288 nattr = &notes_attrs->attrs[0];
1279 for (loaded = i = 0; i < nsect; ++i) { 1289 for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
1280 if (sect_empty(&sechdrs[i])) 1290 if (sect_empty(&info->sechdrs[i]))
1281 continue; 1291 continue;
1282 if (sechdrs[i].sh_type == SHT_NOTE) { 1292 if (info->sechdrs[i].sh_type == SHT_NOTE) {
1283 sysfs_bin_attr_init(nattr); 1293 sysfs_bin_attr_init(nattr);
1284 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1294 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1285 nattr->attr.mode = S_IRUGO; 1295 nattr->attr.mode = S_IRUGO;
1286 nattr->size = sechdrs[i].sh_size; 1296 nattr->size = info->sechdrs[i].sh_size;
1287 nattr->private = (void *) sechdrs[i].sh_addr; 1297 nattr->private = (void *) info->sechdrs[i].sh_addr;
1288 nattr->read = module_notes_read; 1298 nattr->read = module_notes_read;
1289 ++nattr; 1299 ++nattr;
1290 } 1300 }
@@ -1315,8 +1325,8 @@ static void remove_notes_attrs(struct module *mod)
1315 1325
1316#else 1326#else
1317 1327
1318static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1328static inline void add_sect_attrs(struct module *mod,
1319 char *sectstrings, Elf_Shdr *sechdrs) 1329 const struct load_info *info)
1320{ 1330{
1321} 1331}
1322 1332
@@ -1324,17 +1334,16 @@ static inline void remove_sect_attrs(struct module *mod)
1324{ 1334{
1325} 1335}
1326 1336
1327static inline void add_notes_attrs(struct module *mod, unsigned int nsect, 1337static inline void add_notes_attrs(struct module *mod,
1328 char *sectstrings, Elf_Shdr *sechdrs) 1338 const struct load_info *info)
1329{ 1339{
1330} 1340}
1331 1341
1332static inline void remove_notes_attrs(struct module *mod) 1342static inline void remove_notes_attrs(struct module *mod)
1333{ 1343{
1334} 1344}
1335#endif 1345#endif /* CONFIG_KALLSYMS */
1336 1346
1337#ifdef CONFIG_SYSFS
1338static void add_usage_links(struct module *mod) 1347static void add_usage_links(struct module *mod)
1339{ 1348{
1340#ifdef CONFIG_MODULE_UNLOAD 1349#ifdef CONFIG_MODULE_UNLOAD
@@ -1439,6 +1448,7 @@ out:
1439} 1448}
1440 1449
1441static int mod_sysfs_setup(struct module *mod, 1450static int mod_sysfs_setup(struct module *mod,
1451 const struct load_info *info,
1442 struct kernel_param *kparam, 1452 struct kernel_param *kparam,
1443 unsigned int num_params) 1453 unsigned int num_params)
1444{ 1454{
@@ -1463,6 +1473,8 @@ static int mod_sysfs_setup(struct module *mod,
1463 goto out_unreg_param; 1473 goto out_unreg_param;
1464 1474
1465 add_usage_links(mod); 1475 add_usage_links(mod);
1476 add_sect_attrs(mod, info);
1477 add_notes_attrs(mod, info);
1466 1478
1467 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1479 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1468 return 0; 1480 return 0;
@@ -1479,33 +1491,26 @@ out:
1479 1491
1480static void mod_sysfs_fini(struct module *mod) 1492static void mod_sysfs_fini(struct module *mod)
1481{ 1493{
1494 remove_notes_attrs(mod);
1495 remove_sect_attrs(mod);
1482 kobject_put(&mod->mkobj.kobj); 1496 kobject_put(&mod->mkobj.kobj);
1483} 1497}
1484 1498
1485#else /* CONFIG_SYSFS */ 1499#else /* !CONFIG_SYSFS */
1486
1487static inline int mod_sysfs_init(struct module *mod)
1488{
1489 return 0;
1490}
1491 1500
1492static inline int mod_sysfs_setup(struct module *mod, 1501static int mod_sysfs_setup(struct module *mod,
1502 const struct load_info *info,
1493 struct kernel_param *kparam, 1503 struct kernel_param *kparam,
1494 unsigned int num_params) 1504 unsigned int num_params)
1495{ 1505{
1496 return 0; 1506 return 0;
1497} 1507}
1498 1508
1499static inline int module_add_modinfo_attrs(struct module *mod) 1509static void mod_sysfs_fini(struct module *mod)
1500{
1501 return 0;
1502}
1503
1504static inline void module_remove_modinfo_attrs(struct module *mod)
1505{ 1510{
1506} 1511}
1507 1512
1508static void mod_sysfs_fini(struct module *mod) 1513static void module_remove_modinfo_attrs(struct module *mod)
1509{ 1514{
1510} 1515}
1511 1516
@@ -1515,7 +1520,7 @@ static void del_usage_links(struct module *mod)
1515 1520
1516#endif /* CONFIG_SYSFS */ 1521#endif /* CONFIG_SYSFS */
1517 1522
1518static void mod_kobject_remove(struct module *mod) 1523static void mod_sysfs_teardown(struct module *mod)
1519{ 1524{
1520 del_usage_links(mod); 1525 del_usage_links(mod);
1521 module_remove_modinfo_attrs(mod); 1526 module_remove_modinfo_attrs(mod);
@@ -1533,6 +1538,7 @@ static int __unlink_module(void *_mod)
1533{ 1538{
1534 struct module *mod = _mod; 1539 struct module *mod = _mod;
1535 list_del(&mod->list); 1540 list_del(&mod->list);
1541 module_bug_cleanup(mod);
1536 return 0; 1542 return 0;
1537} 1543}
1538 1544
@@ -1545,9 +1551,7 @@ static void free_module(struct module *mod)
1545 mutex_lock(&module_mutex); 1551 mutex_lock(&module_mutex);
1546 stop_machine(__unlink_module, mod, NULL); 1552 stop_machine(__unlink_module, mod, NULL);
1547 mutex_unlock(&module_mutex); 1553 mutex_unlock(&module_mutex);
1548 remove_notes_attrs(mod); 1554 mod_sysfs_teardown(mod);
1549 remove_sect_attrs(mod);
1550 mod_kobject_remove(mod);
1551 1555
1552 /* Remove dynamic debug info */ 1556 /* Remove dynamic debug info */
1553 ddebug_remove_module(mod->name); 1557 ddebug_remove_module(mod->name);
@@ -1565,10 +1569,7 @@ static void free_module(struct module *mod)
1565 module_free(mod, mod->module_init); 1569 module_free(mod, mod->module_init);
1566 kfree(mod->args); 1570 kfree(mod->args);
1567 percpu_modfree(mod); 1571 percpu_modfree(mod);
1568#if defined(CONFIG_MODULE_UNLOAD) 1572
1569 if (mod->refptr)
1570 free_percpu(mod->refptr);
1571#endif
1572 /* Free lock-classes: */ 1573 /* Free lock-classes: */
1573 lockdep_free_key_range(mod->module_core, mod->core_size); 1574 lockdep_free_key_range(mod->module_core, mod->core_size);
1574 1575
@@ -1634,25 +1635,23 @@ static int verify_export_symbols(struct module *mod)
1634} 1635}
1635 1636
1636/* Change all symbols so that st_value encodes the pointer directly. */ 1637/* Change all symbols so that st_value encodes the pointer directly. */
1637static int simplify_symbols(Elf_Shdr *sechdrs, 1638static int simplify_symbols(struct module *mod, const struct load_info *info)
1638 unsigned int symindex, 1639{
1639 const char *strtab, 1640 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
1640 unsigned int versindex, 1641 Elf_Sym *sym = (void *)symsec->sh_addr;
1641 unsigned int pcpuindex,
1642 struct module *mod)
1643{
1644 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1645 unsigned long secbase; 1642 unsigned long secbase;
1646 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1643 unsigned int i;
1647 int ret = 0; 1644 int ret = 0;
1648 const struct kernel_symbol *ksym; 1645 const struct kernel_symbol *ksym;
1649 1646
1650 for (i = 1; i < n; i++) { 1647 for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
1648 const char *name = info->strtab + sym[i].st_name;
1649
1651 switch (sym[i].st_shndx) { 1650 switch (sym[i].st_shndx) {
1652 case SHN_COMMON: 1651 case SHN_COMMON:
1653 /* We compiled with -fno-common. These are not 1652 /* We compiled with -fno-common. These are not
1654 supposed to happen. */ 1653 supposed to happen. */
1655 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); 1654 DEBUGP("Common symbol: %s\n", name);
1656 printk("%s: please compile with -fno-common\n", 1655 printk("%s: please compile with -fno-common\n",
1657 mod->name); 1656 mod->name);
1658 ret = -ENOEXEC; 1657 ret = -ENOEXEC;
@@ -1665,9 +1664,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1665 break; 1664 break;
1666 1665
1667 case SHN_UNDEF: 1666 case SHN_UNDEF:
1668 ksym = resolve_symbol_wait(sechdrs, versindex, 1667 ksym = resolve_symbol_wait(mod, info, name);
1669 strtab + sym[i].st_name,
1670 mod);
1671 /* Ok if resolved. */ 1668 /* Ok if resolved. */
1672 if (ksym && !IS_ERR(ksym)) { 1669 if (ksym && !IS_ERR(ksym)) {
1673 sym[i].st_value = ksym->value; 1670 sym[i].st_value = ksym->value;
@@ -1679,17 +1676,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1679 break; 1676 break;
1680 1677
1681 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", 1678 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1682 mod->name, strtab + sym[i].st_name, 1679 mod->name, name, PTR_ERR(ksym));
1683 PTR_ERR(ksym));
1684 ret = PTR_ERR(ksym) ?: -ENOENT; 1680 ret = PTR_ERR(ksym) ?: -ENOENT;
1685 break; 1681 break;
1686 1682
1687 default: 1683 default:
1688 /* Divert to percpu allocation if a percpu var. */ 1684 /* Divert to percpu allocation if a percpu var. */
1689 if (sym[i].st_shndx == pcpuindex) 1685 if (sym[i].st_shndx == info->index.pcpu)
1690 secbase = (unsigned long)mod_percpu(mod); 1686 secbase = (unsigned long)mod_percpu(mod);
1691 else 1687 else
1692 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1688 secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
1693 sym[i].st_value += secbase; 1689 sym[i].st_value += secbase;
1694 break; 1690 break;
1695 } 1691 }
@@ -1698,6 +1694,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1698 return ret; 1694 return ret;
1699} 1695}
1700 1696
1697static int apply_relocations(struct module *mod, const struct load_info *info)
1698{
1699 unsigned int i;
1700 int err = 0;
1701
1702 /* Now do relocations. */
1703 for (i = 1; i < info->hdr->e_shnum; i++) {
1704 unsigned int infosec = info->sechdrs[i].sh_info;
1705
1706 /* Not a valid relocation section? */
1707 if (infosec >= info->hdr->e_shnum)
1708 continue;
1709
1710 /* Don't bother with non-allocated sections */
1711 if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
1712 continue;
1713
1714 if (info->sechdrs[i].sh_type == SHT_REL)
1715 err = apply_relocate(info->sechdrs, info->strtab,
1716 info->index.sym, i, mod);
1717 else if (info->sechdrs[i].sh_type == SHT_RELA)
1718 err = apply_relocate_add(info->sechdrs, info->strtab,
1719 info->index.sym, i, mod);
1720 if (err < 0)
1721 break;
1722 }
1723 return err;
1724}
1725
1701/* Additional bytes needed by arch in front of individual sections */ 1726/* Additional bytes needed by arch in front of individual sections */
1702unsigned int __weak arch_mod_section_prepend(struct module *mod, 1727unsigned int __weak arch_mod_section_prepend(struct module *mod,
1703 unsigned int section) 1728 unsigned int section)
@@ -1722,10 +1747,7 @@ static long get_offset(struct module *mod, unsigned int *size,
1722 might -- code, read-only data, read-write data, small data. Tally 1747 might -- code, read-only data, read-write data, small data. Tally
1723 sizes, and place the offsets into sh_entsize fields: high bit means it 1748 sizes, and place the offsets into sh_entsize fields: high bit means it
1724 belongs in init. */ 1749 belongs in init. */
1725static void layout_sections(struct module *mod, 1750static void layout_sections(struct module *mod, struct load_info *info)
1726 const Elf_Ehdr *hdr,
1727 Elf_Shdr *sechdrs,
1728 const char *secstrings)
1729{ 1751{
1730 static unsigned long const masks[][2] = { 1752 static unsigned long const masks[][2] = {
1731 /* NOTE: all executable code must be the first section 1753 /* NOTE: all executable code must be the first section
@@ -1738,21 +1760,22 @@ static void layout_sections(struct module *mod,
1738 }; 1760 };
1739 unsigned int m, i; 1761 unsigned int m, i;
1740 1762
1741 for (i = 0; i < hdr->e_shnum; i++) 1763 for (i = 0; i < info->hdr->e_shnum; i++)
1742 sechdrs[i].sh_entsize = ~0UL; 1764 info->sechdrs[i].sh_entsize = ~0UL;
1743 1765
1744 DEBUGP("Core section allocation order:\n"); 1766 DEBUGP("Core section allocation order:\n");
1745 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1767 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1746 for (i = 0; i < hdr->e_shnum; ++i) { 1768 for (i = 0; i < info->hdr->e_shnum; ++i) {
1747 Elf_Shdr *s = &sechdrs[i]; 1769 Elf_Shdr *s = &info->sechdrs[i];
1770 const char *sname = info->secstrings + s->sh_name;
1748 1771
1749 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1772 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1750 || (s->sh_flags & masks[m][1]) 1773 || (s->sh_flags & masks[m][1])
1751 || s->sh_entsize != ~0UL 1774 || s->sh_entsize != ~0UL
1752 || strstarts(secstrings + s->sh_name, ".init")) 1775 || strstarts(sname, ".init"))
1753 continue; 1776 continue;
1754 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1777 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
 1755 DEBUGP("\t%s\n", secstrings + s->sh_name); 1778 DEBUGP("\t%s\n", sname);
1756 } 1779 }
1757 if (m == 0) 1780 if (m == 0)
1758 mod->core_text_size = mod->core_size; 1781 mod->core_text_size = mod->core_size;
@@ -1760,17 +1783,18 @@ static void layout_sections(struct module *mod,
1760 1783
1761 DEBUGP("Init section allocation order:\n"); 1784 DEBUGP("Init section allocation order:\n");
1762 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1785 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1763 for (i = 0; i < hdr->e_shnum; ++i) { 1786 for (i = 0; i < info->hdr->e_shnum; ++i) {
1764 Elf_Shdr *s = &sechdrs[i]; 1787 Elf_Shdr *s = &info->sechdrs[i];
1788 const char *sname = info->secstrings + s->sh_name;
1765 1789
1766 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1790 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1767 || (s->sh_flags & masks[m][1]) 1791 || (s->sh_flags & masks[m][1])
1768 || s->sh_entsize != ~0UL 1792 || s->sh_entsize != ~0UL
1769 || !strstarts(secstrings + s->sh_name, ".init")) 1793 || !strstarts(sname, ".init"))
1770 continue; 1794 continue;
1771 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1795 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1772 | INIT_OFFSET_MASK); 1796 | INIT_OFFSET_MASK);
1773 DEBUGP("\t%s\n", secstrings + s->sh_name); 1797 DEBUGP("\t%s\n", sname);
1774 } 1798 }
1775 if (m == 0) 1799 if (m == 0)
1776 mod->init_text_size = mod->init_size; 1800 mod->init_text_size = mod->init_size;
@@ -1809,33 +1833,28 @@ static char *next_string(char *string, unsigned long *secsize)
1809 return string; 1833 return string;
1810} 1834}
1811 1835
1812static char *get_modinfo(Elf_Shdr *sechdrs, 1836static char *get_modinfo(struct load_info *info, const char *tag)
1813 unsigned int info,
1814 const char *tag)
1815{ 1837{
1816 char *p; 1838 char *p;
1817 unsigned int taglen = strlen(tag); 1839 unsigned int taglen = strlen(tag);
1818 unsigned long size = sechdrs[info].sh_size; 1840 Elf_Shdr *infosec = &info->sechdrs[info->index.info];
1841 unsigned long size = infosec->sh_size;
1819 1842
1820 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { 1843 for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
1821 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') 1844 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1822 return p + taglen + 1; 1845 return p + taglen + 1;
1823 } 1846 }
1824 return NULL; 1847 return NULL;
1825} 1848}
1826 1849
1827static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, 1850static void setup_modinfo(struct module *mod, struct load_info *info)
1828 unsigned int infoindex)
1829{ 1851{
1830 struct module_attribute *attr; 1852 struct module_attribute *attr;
1831 int i; 1853 int i;
1832 1854
1833 for (i = 0; (attr = modinfo_attrs[i]); i++) { 1855 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1834 if (attr->setup) 1856 if (attr->setup)
1835 attr->setup(mod, 1857 attr->setup(mod, get_modinfo(info, attr->attr.name));
1836 get_modinfo(sechdrs,
1837 infoindex,
1838 attr->attr.name));
1839 } 1858 }
1840} 1859}
1841 1860
@@ -1876,11 +1895,10 @@ static int is_exported(const char *name, unsigned long value,
1876} 1895}
1877 1896
1878/* As per nm */ 1897/* As per nm */
1879static char elf_type(const Elf_Sym *sym, 1898static char elf_type(const Elf_Sym *sym, const struct load_info *info)
1880 Elf_Shdr *sechdrs,
1881 const char *secstrings,
1882 struct module *mod)
1883{ 1899{
1900 const Elf_Shdr *sechdrs = info->sechdrs;
1901
1884 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { 1902 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1885 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) 1903 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1886 return 'v'; 1904 return 'v';
@@ -1910,8 +1928,10 @@ static char elf_type(const Elf_Sym *sym,
1910 else 1928 else
1911 return 'b'; 1929 return 'b';
1912 } 1930 }
1913 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) 1931 if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
1932 ".debug")) {
1914 return 'n'; 1933 return 'n';
1934 }
1915 return '?'; 1935 return '?';
1916} 1936}
1917 1937
@@ -1936,127 +1956,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1936 return true; 1956 return true;
1937} 1957}
1938 1958
1939static unsigned long layout_symtab(struct module *mod, 1959static void layout_symtab(struct module *mod, struct load_info *info)
1940 Elf_Shdr *sechdrs,
1941 unsigned int symindex,
1942 unsigned int strindex,
1943 const Elf_Ehdr *hdr,
1944 const char *secstrings,
1945 unsigned long *pstroffs,
1946 unsigned long *strmap)
1947{ 1960{
1948 unsigned long symoffs; 1961 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
1949 Elf_Shdr *symsect = sechdrs + symindex; 1962 Elf_Shdr *strsect = info->sechdrs + info->index.str;
1950 Elf_Shdr *strsect = sechdrs + strindex;
1951 const Elf_Sym *src; 1963 const Elf_Sym *src;
1952 const char *strtab;
1953 unsigned int i, nsrc, ndst; 1964 unsigned int i, nsrc, ndst;
1954 1965
1955 /* Put symbol section at end of init part of module. */ 1966 /* Put symbol section at end of init part of module. */
1956 symsect->sh_flags |= SHF_ALLOC; 1967 symsect->sh_flags |= SHF_ALLOC;
1957 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 1968 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1958 symindex) | INIT_OFFSET_MASK; 1969 info->index.sym) | INIT_OFFSET_MASK;
1959 DEBUGP("\t%s\n", secstrings + symsect->sh_name); 1970 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
1960 1971
1961 src = (void *)hdr + symsect->sh_offset; 1972 src = (void *)info->hdr + symsect->sh_offset;
1962 nsrc = symsect->sh_size / sizeof(*src); 1973 nsrc = symsect->sh_size / sizeof(*src);
1963 strtab = (void *)hdr + strsect->sh_offset;
1964 for (ndst = i = 1; i < nsrc; ++i, ++src) 1974 for (ndst = i = 1; i < nsrc; ++i, ++src)
1965 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { 1975 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
1966 unsigned int j = src->st_name; 1976 unsigned int j = src->st_name;
1967 1977
1968 while(!__test_and_set_bit(j, strmap) && strtab[j]) 1978 while (!__test_and_set_bit(j, info->strmap)
1979 && info->strtab[j])
1969 ++j; 1980 ++j;
1970 ++ndst; 1981 ++ndst;
1971 } 1982 }
1972 1983
1973 /* Append room for core symbols at end of core part. */ 1984 /* Append room for core symbols at end of core part. */
1974 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 1985 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1975 mod->core_size = symoffs + ndst * sizeof(Elf_Sym); 1986 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
1976 1987
1977 /* Put string table section at end of init part of module. */ 1988 /* Put string table section at end of init part of module. */
1978 strsect->sh_flags |= SHF_ALLOC; 1989 strsect->sh_flags |= SHF_ALLOC;
1979 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 1990 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1980 strindex) | INIT_OFFSET_MASK; 1991 info->index.str) | INIT_OFFSET_MASK;
1981 DEBUGP("\t%s\n", secstrings + strsect->sh_name); 1992 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
1982 1993
1983 /* Append room for core symbols' strings at end of core part. */ 1994 /* Append room for core symbols' strings at end of core part. */
1984 *pstroffs = mod->core_size; 1995 info->stroffs = mod->core_size;
1985 __set_bit(0, strmap); 1996 __set_bit(0, info->strmap);
1986 mod->core_size += bitmap_weight(strmap, strsect->sh_size); 1997 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
1987
1988 return symoffs;
1989} 1998}
1990 1999
1991static void add_kallsyms(struct module *mod, 2000static void add_kallsyms(struct module *mod, const struct load_info *info)
1992 Elf_Shdr *sechdrs,
1993 unsigned int shnum,
1994 unsigned int symindex,
1995 unsigned int strindex,
1996 unsigned long symoffs,
1997 unsigned long stroffs,
1998 const char *secstrings,
1999 unsigned long *strmap)
2000{ 2001{
2001 unsigned int i, ndst; 2002 unsigned int i, ndst;
2002 const Elf_Sym *src; 2003 const Elf_Sym *src;
2003 Elf_Sym *dst; 2004 Elf_Sym *dst;
2004 char *s; 2005 char *s;
2006 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
2005 2007
2006 mod->symtab = (void *)sechdrs[symindex].sh_addr; 2008 mod->symtab = (void *)symsec->sh_addr;
2007 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 2009 mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
2008 mod->strtab = (void *)sechdrs[strindex].sh_addr; 2010 /* Make sure we get permanent strtab: don't use info->strtab. */
2011 mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
2009 2012
2010 /* Set types up while we still have access to sections. */ 2013 /* Set types up while we still have access to sections. */
2011 for (i = 0; i < mod->num_symtab; i++) 2014 for (i = 0; i < mod->num_symtab; i++)
2012 mod->symtab[i].st_info 2015 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2013 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
2014 2016
2015 mod->core_symtab = dst = mod->module_core + symoffs; 2017 mod->core_symtab = dst = mod->module_core + info->symoffs;
2016 src = mod->symtab; 2018 src = mod->symtab;
2017 *dst = *src; 2019 *dst = *src;
2018 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2020 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2019 if (!is_core_symbol(src, sechdrs, shnum)) 2021 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2020 continue; 2022 continue;
2021 dst[ndst] = *src; 2023 dst[ndst] = *src;
2022 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); 2024 dst[ndst].st_name = bitmap_weight(info->strmap,
2025 dst[ndst].st_name);
2023 ++ndst; 2026 ++ndst;
2024 } 2027 }
2025 mod->core_num_syms = ndst; 2028 mod->core_num_syms = ndst;
2026 2029
2027 mod->core_strtab = s = mod->module_core + stroffs; 2030 mod->core_strtab = s = mod->module_core + info->stroffs;
2028 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) 2031 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2029 if (test_bit(i, strmap)) 2032 if (test_bit(i, info->strmap))
2030 *++s = mod->strtab[i]; 2033 *++s = mod->strtab[i];
2031} 2034}
2032#else 2035#else
2033static inline unsigned long layout_symtab(struct module *mod, 2036static inline void layout_symtab(struct module *mod, struct load_info *info)
2034 Elf_Shdr *sechdrs,
2035 unsigned int symindex,
2036 unsigned int strindex,
2037 const Elf_Ehdr *hdr,
2038 const char *secstrings,
2039 unsigned long *pstroffs,
2040 unsigned long *strmap)
2041{ 2037{
2042 return 0;
2043} 2038}
2044 2039
2045static inline void add_kallsyms(struct module *mod, 2040static void add_kallsyms(struct module *mod, const struct load_info *info)
2046 Elf_Shdr *sechdrs,
2047 unsigned int shnum,
2048 unsigned int symindex,
2049 unsigned int strindex,
2050 unsigned long symoffs,
2051 unsigned long stroffs,
2052 const char *secstrings,
2053 const unsigned long *strmap)
2054{ 2041{
2055} 2042}
2056#endif /* CONFIG_KALLSYMS */ 2043#endif /* CONFIG_KALLSYMS */
2057 2044
2058static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) 2045static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
2059{ 2046{
2047 if (!debug)
2048 return;
2060#ifdef CONFIG_DYNAMIC_DEBUG 2049#ifdef CONFIG_DYNAMIC_DEBUG
2061 if (ddebug_add_module(debug, num, debug->modname)) 2050 if (ddebug_add_module(debug, num, debug->modname))
2062 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2051 printk(KERN_ERR "dynamic debug error adding module: %s\n",
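The strmap bitmap is the subtle piece of the KALLSYMS path above: layout_symtab() sets one bit for every .strtab byte that belongs to a string surviving into the core image (plus byte 0), so the compacted table's size is bitmap_weight() over the whole map, and add_kallsyms() renumbers each kept symbol's st_name by counting the set bits below its old offset. Restated as a standalone helper, this is just a paraphrase of the dst[ndst].st_name assignment above, not new functionality.

#include <linux/bitmap.h>

/*
 * New offset of a kept string in the compacted core strtab: the number
 * of kept bytes (set bits in strmap) that precede its old offset.
 */
static unsigned long core_strtab_offset(const unsigned long *strmap,
					unsigned long old_st_name)
{
	return bitmap_weight(strmap, old_st_name);
}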
@@ -2087,65 +2076,47 @@ static void *module_alloc_update_bounds(unsigned long size)
2087} 2076}
2088 2077
2089#ifdef CONFIG_DEBUG_KMEMLEAK 2078#ifdef CONFIG_DEBUG_KMEMLEAK
2090static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2079static void kmemleak_load_module(const struct module *mod,
2091 Elf_Shdr *sechdrs, char *secstrings) 2080 const struct load_info *info)
2092{ 2081{
2093 unsigned int i; 2082 unsigned int i;
2094 2083
2095 /* only scan the sections containing data */ 2084 /* only scan the sections containing data */
2096 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); 2085 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2097 2086
2098 for (i = 1; i < hdr->e_shnum; i++) { 2087 for (i = 1; i < info->hdr->e_shnum; i++) {
2099 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2088 const char *name = info->secstrings + info->sechdrs[i].sh_name;
2089 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
2100 continue; 2090 continue;
2101 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 2091 if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
2102 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2103 continue; 2092 continue;
2104 2093
2105 kmemleak_scan_area((void *)sechdrs[i].sh_addr, 2094 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
2106 sechdrs[i].sh_size, GFP_KERNEL); 2095 info->sechdrs[i].sh_size, GFP_KERNEL);
2107 } 2096 }
2108} 2097}
2109#else 2098#else
2110static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2099static inline void kmemleak_load_module(const struct module *mod,
2111 Elf_Shdr *sechdrs, char *secstrings) 2100 const struct load_info *info)
2112{ 2101{
2113} 2102}
2114#endif 2103#endif
2115 2104
2116/* Allocate and load the module: note that size of section 0 is always 2105/* Sets info->hdr and info->len. */
2117 zero, and we rely on this for optional sections. */ 2106static int copy_and_check(struct load_info *info,
2118static noinline struct module *load_module(void __user *umod, 2107 const void __user *umod, unsigned long len,
2119 unsigned long len, 2108 const char __user *uargs)
2120 const char __user *uargs)
2121{ 2109{
2110 int err;
2122 Elf_Ehdr *hdr; 2111 Elf_Ehdr *hdr;
2123 Elf_Shdr *sechdrs;
2124 char *secstrings, *args, *modmagic, *strtab = NULL;
2125 char *staging;
2126 unsigned int i;
2127 unsigned int symindex = 0;
2128 unsigned int strindex = 0;
2129 unsigned int modindex, versindex, infoindex, pcpuindex;
2130 struct module *mod;
2131 long err = 0;
2132 void *ptr = NULL; /* Stops spurious gcc warning */
2133 unsigned long symoffs, stroffs, *strmap;
2134 void __percpu *percpu;
2135 struct _ddebug *debug = NULL;
2136 unsigned int num_debug = 0;
2137 2112
2138 mm_segment_t old_fs;
2139
2140 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2141 umod, len, uargs);
2142 if (len < sizeof(*hdr)) 2113 if (len < sizeof(*hdr))
2143 return ERR_PTR(-ENOEXEC); 2114 return -ENOEXEC;
2144 2115
2145 /* Suck in entire file: we'll want most of it. */ 2116 /* Suck in entire file: we'll want most of it. */
2146 /* vmalloc barfs on "unusual" numbers. Check here */ 2117 /* vmalloc barfs on "unusual" numbers. Check here */
2147 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 2118 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2148 return ERR_PTR(-ENOMEM); 2119 return -ENOMEM;
2149 2120
2150 if (copy_from_user(hdr, umod, len) != 0) { 2121 if (copy_from_user(hdr, umod, len) != 0) {
2151 err = -EFAULT; 2122 err = -EFAULT;
@@ -2153,135 +2124,230 @@ static noinline struct module *load_module(void __user *umod,
2153 } 2124 }
2154 2125
2155 /* Sanity checks against insmoding binaries or wrong arch, 2126 /* Sanity checks against insmoding binaries or wrong arch,
2156 weird elf version */ 2127 weird elf version */
2157 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2128 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
2158 || hdr->e_type != ET_REL 2129 || hdr->e_type != ET_REL
2159 || !elf_check_arch(hdr) 2130 || !elf_check_arch(hdr)
2160 || hdr->e_shentsize != sizeof(*sechdrs)) { 2131 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2161 err = -ENOEXEC; 2132 err = -ENOEXEC;
2162 goto free_hdr; 2133 goto free_hdr;
2163 } 2134 }
2164 2135
2165 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) 2136 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
2166 goto truncated; 2137 err = -ENOEXEC;
2138 goto free_hdr;
2139 }
2167 2140
2168 /* Convenience variables */ 2141 info->hdr = hdr;
2169 sechdrs = (void *)hdr + hdr->e_shoff; 2142 info->len = len;
2170 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 2143 return 0;
2171 sechdrs[0].sh_addr = 0;
2172 2144
2173 for (i = 1; i < hdr->e_shnum; i++) { 2145free_hdr:
2174 if (sechdrs[i].sh_type != SHT_NOBITS 2146 vfree(hdr);
2175 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) 2147 return err;
2176 goto truncated; 2148}
2149
2150static void free_copy(struct load_info *info)
2151{
2152 vfree(info->hdr);
2153}
2154
2155static int rewrite_section_headers(struct load_info *info)
2156{
2157 unsigned int i;
2158
2159 /* This should always be true, but let's be sure. */
2160 info->sechdrs[0].sh_addr = 0;
2161
2162 for (i = 1; i < info->hdr->e_shnum; i++) {
2163 Elf_Shdr *shdr = &info->sechdrs[i];
2164 if (shdr->sh_type != SHT_NOBITS
2165 && info->len < shdr->sh_offset + shdr->sh_size) {
2166 printk(KERN_ERR "Module len %lu truncated\n",
2167 info->len);
2168 return -ENOEXEC;
2169 }
2177 2170
2178 /* Mark all sections sh_addr with their address in the 2171 /* Mark all sections sh_addr with their address in the
2179 temporary image. */ 2172 temporary image. */
2180 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; 2173 shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
2181 2174
2182 /* Internal symbols and strings. */
2183 if (sechdrs[i].sh_type == SHT_SYMTAB) {
2184 symindex = i;
2185 strindex = sechdrs[i].sh_link;
2186 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
2187 }
2188#ifndef CONFIG_MODULE_UNLOAD 2175#ifndef CONFIG_MODULE_UNLOAD
2189 /* Don't load .exit sections */ 2176 /* Don't load .exit sections */
2190 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) 2177 if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
2191 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 2178 shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
2192#endif 2179#endif
2193 } 2180 }
2194 2181
2195 modindex = find_sec(hdr, sechdrs, secstrings, 2182 /* Track but don't keep modinfo and version sections. */
2196 ".gnu.linkonce.this_module"); 2183 info->index.vers = find_sec(info, "__versions");
2197 if (!modindex) { 2184 info->index.info = find_sec(info, ".modinfo");
2185 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2186 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
2187 return 0;
2188}
2189
2190/*
2191 * Set up our basic convenience variables (pointers to section headers,
2192 * search for module section index etc), and do some basic section
2193 * verification.
2194 *
2195 * Return the temporary module pointer (we'll replace it with the final
2196 * one when we move the module sections around).
2197 */
2198static struct module *setup_load_info(struct load_info *info)
2199{
2200 unsigned int i;
2201 int err;
2202 struct module *mod;
2203
2204 /* Set up the convenience variables */
2205 info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
2206 info->secstrings = (void *)info->hdr
2207 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2208
2209 err = rewrite_section_headers(info);
2210 if (err)
2211 return ERR_PTR(err);
2212
2213 /* Find internal symbols and strings. */
2214 for (i = 1; i < info->hdr->e_shnum; i++) {
2215 if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
2216 info->index.sym = i;
2217 info->index.str = info->sechdrs[i].sh_link;
2218 info->strtab = (char *)info->hdr
2219 + info->sechdrs[info->index.str].sh_offset;
2220 break;
2221 }
2222 }
2223
2224 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2225 if (!info->index.mod) {
2198 printk(KERN_WARNING "No module found in object\n"); 2226 printk(KERN_WARNING "No module found in object\n");
2199 err = -ENOEXEC; 2227 return ERR_PTR(-ENOEXEC);
2200 goto free_hdr;
2201 } 2228 }
2202 /* This is temporary: point mod into copy of data. */ 2229 /* This is temporary: point mod into copy of data. */
2203 mod = (void *)sechdrs[modindex].sh_addr; 2230 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2204 2231
2205 if (symindex == 0) { 2232 if (info->index.sym == 0) {
2206 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2233 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
2207 mod->name); 2234 mod->name);
2208 err = -ENOEXEC; 2235 return ERR_PTR(-ENOEXEC);
2209 goto free_hdr;
2210 } 2236 }
2211 2237
2212 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 2238 info->index.pcpu = find_pcpusec(info);
2213 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
2214 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
2215
2216 /* Don't keep modinfo and version sections. */
2217 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2218 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2219 2239
2220 /* Check module struct version now, before we try to use module. */ 2240 /* Check module struct version now, before we try to use module. */
2221 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2241 if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
2222 err = -ENOEXEC; 2242 return ERR_PTR(-ENOEXEC);
2223 goto free_hdr; 2243
2224 } 2244 return mod;
2245}
2246
2247static int check_modinfo(struct module *mod, struct load_info *info)
2248{
2249 const char *modmagic = get_modinfo(info, "vermagic");
2250 int err;
2225 2251
2226 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2227 /* This is allowed: modprobe --force will invalidate it. */ 2252 /* This is allowed: modprobe --force will invalidate it. */
2228 if (!modmagic) { 2253 if (!modmagic) {
2229 err = try_to_force_load(mod, "bad vermagic"); 2254 err = try_to_force_load(mod, "bad vermagic");
2230 if (err) 2255 if (err)
2231 goto free_hdr; 2256 return err;
2232 } else if (!same_magic(modmagic, vermagic, versindex)) { 2257 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2233 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2258 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
2234 mod->name, modmagic, vermagic); 2259 mod->name, modmagic, vermagic);
2235 err = -ENOEXEC; 2260 return -ENOEXEC;
2236 goto free_hdr;
2237 } 2261 }
2238 2262
2239 staging = get_modinfo(sechdrs, infoindex, "staging"); 2263 if (get_modinfo(info, "staging")) {
2240 if (staging) {
2241 add_taint_module(mod, TAINT_CRAP); 2264 add_taint_module(mod, TAINT_CRAP);
2242 printk(KERN_WARNING "%s: module is from the staging directory," 2265 printk(KERN_WARNING "%s: module is from the staging directory,"
2243 " the quality is unknown, you have been warned.\n", 2266 " the quality is unknown, you have been warned.\n",
2244 mod->name); 2267 mod->name);
2245 } 2268 }
2246 2269
2247 /* Now copy in args */ 2270 /* Set up license info based on the info section */
2248 args = strndup_user(uargs, ~0UL >> 1); 2271 set_license(mod, get_modinfo(info, "license"));
2249 if (IS_ERR(args)) {
2250 err = PTR_ERR(args);
2251 goto free_hdr;
2252 }
2253 2272
2254 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) 2273 return 0;
2255 * sizeof(long), GFP_KERNEL); 2274}
2256 if (!strmap) {
2257 err = -ENOMEM;
2258 goto free_mod;
2259 }
2260 2275
2261 mod->state = MODULE_STATE_COMING; 2276static void find_module_sections(struct module *mod, struct load_info *info)
2277{
2278 mod->kp = section_objs(info, "__param",
2279 sizeof(*mod->kp), &mod->num_kp);
2280 mod->syms = section_objs(info, "__ksymtab",
2281 sizeof(*mod->syms), &mod->num_syms);
2282 mod->crcs = section_addr(info, "__kcrctab");
2283 mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
2284 sizeof(*mod->gpl_syms),
2285 &mod->num_gpl_syms);
2286 mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
2287 mod->gpl_future_syms = section_objs(info,
2288 "__ksymtab_gpl_future",
2289 sizeof(*mod->gpl_future_syms),
2290 &mod->num_gpl_future_syms);
2291 mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
2262 2292
2263 /* Allow arches to frob section contents and sizes. */ 2293#ifdef CONFIG_UNUSED_SYMBOLS
2264 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); 2294 mod->unused_syms = section_objs(info, "__ksymtab_unused",
2265 if (err < 0) 2295 sizeof(*mod->unused_syms),
2266 goto free_mod; 2296 &mod->num_unused_syms);
2297 mod->unused_crcs = section_addr(info, "__kcrctab_unused");
2298 mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
2299 sizeof(*mod->unused_gpl_syms),
2300 &mod->num_unused_gpl_syms);
2301 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2302#endif
2303#ifdef CONFIG_CONSTRUCTORS
2304 mod->ctors = section_objs(info, ".ctors",
2305 sizeof(*mod->ctors), &mod->num_ctors);
2306#endif
2267 2307
2268 if (pcpuindex) { 2308#ifdef CONFIG_TRACEPOINTS
2269 /* We have a special allocation for this section. */ 2309 mod->tracepoints = section_objs(info, "__tracepoints",
2270 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, 2310 sizeof(*mod->tracepoints),
2271 sechdrs[pcpuindex].sh_addralign); 2311 &mod->num_tracepoints);
2272 if (err) 2312#endif
2273 goto free_mod; 2313#ifdef HAVE_JUMP_LABEL
2274 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2314 mod->jump_entries = section_objs(info, "__jump_table",
2275 } 2315 sizeof(*mod->jump_entries),
2276 /* Keep this around for failure path. */ 2316 &mod->num_jump_entries);
2277 percpu = mod_percpu(mod); 2317#endif
2318#ifdef CONFIG_EVENT_TRACING
2319 mod->trace_events = section_objs(info, "_ftrace_events",
2320 sizeof(*mod->trace_events),
2321 &mod->num_trace_events);
2322 /*
2323 * This section contains pointers to allocated objects in the trace
2324 * code and not scanning it leads to false positives.
2325 */
2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2327 mod->num_trace_events, GFP_KERNEL);
2328#endif
2329#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2330 /* sechdrs[0].sh_size is always zero */
2331 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
2332 sizeof(*mod->ftrace_callsites),
2333 &mod->num_ftrace_callsites);
2334#endif
2278 2335
2279 /* Determine total sizes, and put offsets in sh_entsize. For now 2336 mod->extable = section_objs(info, "__ex_table",
2280 this is done generically; there doesn't appear to be any 2337 sizeof(*mod->extable), &mod->num_exentries);
2281 special cases for the architectures. */ 2338
2282 layout_sections(mod, hdr, sechdrs, secstrings); 2339 if (section_addr(info, "__obsparm"))
2283 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, 2340 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2284 secstrings, &stroffs, strmap); 2341 mod->name);
2342
2343 info->debug = section_objs(info, "__verbose",
2344 sizeof(*info->debug), &info->num_debug);
2345}
2346
2347static int move_module(struct module *mod, struct load_info *info)
2348{
2349 int i;
2350 void *ptr;
2285 2351
2286 /* Do the allocs. */ 2352 /* Do the allocs. */
2287 ptr = module_alloc_update_bounds(mod->core_size); 2353 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2291,10 +2357,9 @@ static noinline struct module *load_module(void __user *umod,
2291 * leak. 2357 * leak.
2292 */ 2358 */
2293 kmemleak_not_leak(ptr); 2359 kmemleak_not_leak(ptr);
2294 if (!ptr) { 2360 if (!ptr)
2295 err = -ENOMEM; 2361 return -ENOMEM;
2296 goto free_percpu; 2362
2297 }
2298 memset(ptr, 0, mod->core_size); 2363 memset(ptr, 0, mod->core_size);
2299 mod->module_core = ptr; 2364 mod->module_core = ptr;
2300 2365
@@ -2307,50 +2372,40 @@ static noinline struct module *load_module(void __user *umod,
2307 */ 2372 */
2308 kmemleak_ignore(ptr); 2373 kmemleak_ignore(ptr);
2309 if (!ptr && mod->init_size) { 2374 if (!ptr && mod->init_size) {
2310 err = -ENOMEM; 2375 module_free(mod, mod->module_core);
2311 goto free_core; 2376 return -ENOMEM;
2312 } 2377 }
2313 memset(ptr, 0, mod->init_size); 2378 memset(ptr, 0, mod->init_size);
2314 mod->module_init = ptr; 2379 mod->module_init = ptr;
2315 2380
2316 /* Transfer each section which specifies SHF_ALLOC */ 2381 /* Transfer each section which specifies SHF_ALLOC */
2317 DEBUGP("final section addresses:\n"); 2382 DEBUGP("final section addresses:\n");
2318 for (i = 0; i < hdr->e_shnum; i++) { 2383 for (i = 0; i < info->hdr->e_shnum; i++) {
2319 void *dest; 2384 void *dest;
2385 Elf_Shdr *shdr = &info->sechdrs[i];
2320 2386
2321 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2387 if (!(shdr->sh_flags & SHF_ALLOC))
2322 continue; 2388 continue;
2323 2389
2324 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) 2390 if (shdr->sh_entsize & INIT_OFFSET_MASK)
2325 dest = mod->module_init 2391 dest = mod->module_init
2326 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); 2392 + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
2327 else 2393 else
2328 dest = mod->module_core + sechdrs[i].sh_entsize; 2394 dest = mod->module_core + shdr->sh_entsize;
2329 2395
2330 if (sechdrs[i].sh_type != SHT_NOBITS) 2396 if (shdr->sh_type != SHT_NOBITS)
2331 memcpy(dest, (void *)sechdrs[i].sh_addr, 2397 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2332 sechdrs[i].sh_size);
2333 /* Update sh_addr to point to copy in image. */ 2398 /* Update sh_addr to point to copy in image. */
2334 sechdrs[i].sh_addr = (unsigned long)dest; 2399 shdr->sh_addr = (unsigned long)dest;
2335 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); 2400 DEBUGP("\t0x%lx %s\n",
2336 } 2401 shdr->sh_addr, info->secstrings + shdr->sh_name);
2337 /* Module has been moved. */
2338 mod = (void *)sechdrs[modindex].sh_addr;
2339 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2340
2341#if defined(CONFIG_MODULE_UNLOAD)
2342 mod->refptr = alloc_percpu(struct module_ref);
2343 if (!mod->refptr) {
2344 err = -ENOMEM;
2345 goto free_init;
2346 } 2402 }
2347#endif
2348 /* Now we've moved module, initialize linked lists, etc. */
2349 module_unload_init(mod);
2350 2403
2351 /* Set up license info based on the info section */ 2404 return 0;
2352 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2405}
2353 2406
2407static int check_module_license_and_versions(struct module *mod)
2408{
2354 /* 2409 /*
2355 * ndiswrapper is under GPL by itself, but loads proprietary modules. 2410 * ndiswrapper is under GPL by itself, but loads proprietary modules.
2356 * Don't use add_taint_module(), as it would prevent ndiswrapper from 2411 * Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2363,77 +2418,6 @@ static noinline struct module *load_module(void __user *umod,
2363 if (strcmp(mod->name, "driverloader") == 0) 2418 if (strcmp(mod->name, "driverloader") == 0)
2364 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2419 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2365 2420
2366 /* Set up MODINFO_ATTR fields */
2367 setup_modinfo(mod, sechdrs, infoindex);
2368
2369 /* Fix up syms, so that st_value is a pointer to location. */
2370 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
2371 mod);
2372 if (err < 0)
2373 goto cleanup;
2374
2375 /* Now we've got everything in the final locations, we can
2376 * find optional sections. */
2377 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2378 sizeof(*mod->kp), &mod->num_kp);
2379 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2380 sizeof(*mod->syms), &mod->num_syms);
2381 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2382 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2383 sizeof(*mod->gpl_syms),
2384 &mod->num_gpl_syms);
2385 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2386 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2387 "__ksymtab_gpl_future",
2388 sizeof(*mod->gpl_future_syms),
2389 &mod->num_gpl_future_syms);
2390 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2391 "__kcrctab_gpl_future");
2392
2393#ifdef CONFIG_UNUSED_SYMBOLS
2394 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2395 "__ksymtab_unused",
2396 sizeof(*mod->unused_syms),
2397 &mod->num_unused_syms);
2398 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2399 "__kcrctab_unused");
2400 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2401 "__ksymtab_unused_gpl",
2402 sizeof(*mod->unused_gpl_syms),
2403 &mod->num_unused_gpl_syms);
2404 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2405 "__kcrctab_unused_gpl");
2406#endif
2407#ifdef CONFIG_CONSTRUCTORS
2408 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2409 sizeof(*mod->ctors), &mod->num_ctors);
2410#endif
2411
2412#ifdef CONFIG_TRACEPOINTS
2413 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2414 "__tracepoints",
2415 sizeof(*mod->tracepoints),
2416 &mod->num_tracepoints);
2417#endif
2418#ifdef CONFIG_EVENT_TRACING
2419 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2420 "_ftrace_events",
2421 sizeof(*mod->trace_events),
2422 &mod->num_trace_events);
2423 /*
2424 * This section contains pointers to allocated objects in the trace
2425 * code and not scanning it leads to false positives.
2426 */
2427 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2428 mod->num_trace_events, GFP_KERNEL);
2429#endif
2430#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2431 /* sechdrs[0].sh_size is always zero */
2432 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2433 "__mcount_loc",
2434 sizeof(*mod->ftrace_callsites),
2435 &mod->num_ftrace_callsites);
2436#endif
2437#ifdef CONFIG_MODVERSIONS 2421#ifdef CONFIG_MODVERSIONS
2438 if ((mod->num_syms && !mod->crcs) 2422 if ((mod->num_syms && !mod->crcs)
2439 || (mod->num_gpl_syms && !mod->gpl_crcs) 2423 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2443,56 +2427,16 @@ static noinline struct module *load_module(void __user *umod,
2443 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2427 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2444#endif 2428#endif
2445 ) { 2429 ) {
2446 err = try_to_force_load(mod, 2430 return try_to_force_load(mod,
2447 "no versions for exported symbols"); 2431 "no versions for exported symbols");
2448 if (err)
2449 goto cleanup;
2450 } 2432 }
2451#endif 2433#endif
2434 return 0;
2435}
2452 2436
2453 /* Now do relocations. */ 2437static void flush_module_icache(const struct module *mod)
2454 for (i = 1; i < hdr->e_shnum; i++) { 2438{
2455 const char *strtab = (char *)sechdrs[strindex].sh_addr; 2439 mm_segment_t old_fs;
2456 unsigned int info = sechdrs[i].sh_info;
2457
2458 /* Not a valid relocation section? */
2459 if (info >= hdr->e_shnum)
2460 continue;
2461
2462 /* Don't bother with non-allocated sections */
2463 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
2464 continue;
2465
2466 if (sechdrs[i].sh_type == SHT_REL)
2467 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
2468 else if (sechdrs[i].sh_type == SHT_RELA)
2469 err = apply_relocate_add(sechdrs, strtab, symindex, i,
2470 mod);
2471 if (err < 0)
2472 goto cleanup;
2473 }
2474
2475 /* Set up and sort exception table */
2476 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2477 sizeof(*mod->extable), &mod->num_exentries);
2478 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2479
2480 /* Finally, copy percpu area over. */
2481 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2482 sechdrs[pcpuindex].sh_size);
2483
2484 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2485 symoffs, stroffs, secstrings, strmap);
2486 kfree(strmap);
2487 strmap = NULL;
2488
2489 if (!mod->taints)
2490 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2491 sizeof(*debug), &num_debug);
2492
2493 err = module_finalize(hdr, sechdrs, mod);
2494 if (err < 0)
2495 goto cleanup;
2496 2440
2497 /* flush the icache in correct context */ 2441 /* flush the icache in correct context */
2498 old_fs = get_fs(); 2442 old_fs = get_fs();
@@ -2511,11 +2455,160 @@ static noinline struct module *load_module(void __user *umod,
2511 (unsigned long)mod->module_core + mod->core_size); 2455 (unsigned long)mod->module_core + mod->core_size);
2512 2456
2513 set_fs(old_fs); 2457 set_fs(old_fs);
2458}
2514 2459
2515 mod->args = args; 2460static struct module *layout_and_allocate(struct load_info *info)
2516 if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) 2461{
2517 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2462 /* Module within temporary copy. */
2518 mod->name); 2463 struct module *mod;
2464 Elf_Shdr *pcpusec;
2465 int err;
2466
2467 mod = setup_load_info(info);
2468 if (IS_ERR(mod))
2469 return mod;
2470
2471 err = check_modinfo(mod, info);
2472 if (err)
2473 return ERR_PTR(err);
2474
2475 /* Allow arches to frob section contents and sizes. */
2476 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2477 info->secstrings, mod);
2478 if (err < 0)
2479 goto out;
2480
2481 pcpusec = &info->sechdrs[info->index.pcpu];
2482 if (pcpusec->sh_size) {
2483 /* We have a special allocation for this section. */
2484 err = percpu_modalloc(mod,
2485 pcpusec->sh_size, pcpusec->sh_addralign);
2486 if (err)
2487 goto out;
2488 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2489 }
2490
2491 /* Determine total sizes, and put offsets in sh_entsize. For now
2492 this is done generically; there doesn't appear to be any
2493 special cases for the architectures. */
2494 layout_sections(mod, info);
2495
2496 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2497 * sizeof(long), GFP_KERNEL);
2498 if (!info->strmap) {
2499 err = -ENOMEM;
2500 goto free_percpu;
2501 }
2502 layout_symtab(mod, info);
2503
2504 /* Allocate and move to the final place */
2505 err = move_module(mod, info);
2506 if (err)
2507 goto free_strmap;
2508
2509 /* Module has been copied to its final place now: return it. */
2510 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2511 kmemleak_load_module(mod, info);
2512 return mod;
2513
2514free_strmap:
2515 kfree(info->strmap);
2516free_percpu:
2517 percpu_modfree(mod);
2518out:
2519 return ERR_PTR(err);
2520}
2521
2522/* mod is no longer valid after this! */
2523static void module_deallocate(struct module *mod, struct load_info *info)
2524{
2525 kfree(info->strmap);
2526 percpu_modfree(mod);
2527 module_free(mod, mod->module_init);
2528 module_free(mod, mod->module_core);
2529}
2530
2531static int post_relocation(struct module *mod, const struct load_info *info)
2532{
2533 /* Sort exception table now relocations are done. */
2534 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2535
2536 /* Copy relocated percpu area over. */
2537 percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
2538 info->sechdrs[info->index.pcpu].sh_size);
2539
2540 /* Setup kallsyms-specific fields. */
2541 add_kallsyms(mod, info);
2542
2543 /* Arch-specific module finalizing. */
2544 return module_finalize(info->hdr, info->sechdrs, mod);
2545}
2546
2547/* Allocate and load the module: note that size of section 0 is always
2548 zero, and we rely on this for optional sections. */
2549static struct module *load_module(void __user *umod,
2550 unsigned long len,
2551 const char __user *uargs)
2552{
2553 struct load_info info = { NULL, };
2554 struct module *mod;
2555 long err;
2556
2557 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2558 umod, len, uargs);
2559
2560 /* Copy in the blobs from userspace, check they are vaguely sane. */
2561 err = copy_and_check(&info, umod, len, uargs);
2562 if (err)
2563 return ERR_PTR(err);
2564
2565 /* Figure out module layout, and allocate all the memory. */
2566 mod = layout_and_allocate(&info);
2567 if (IS_ERR(mod)) {
2568 err = PTR_ERR(mod);
2569 goto free_copy;
2570 }
2571
2572 /* Now module is in final location, initialize linked lists, etc. */
2573 err = module_unload_init(mod);
2574 if (err)
2575 goto free_module;
2576
2577 /* Now we've got everything in the final locations, we can
2578 * find optional sections. */
2579 find_module_sections(mod, &info);
2580
2581 err = check_module_license_and_versions(mod);
2582 if (err)
2583 goto free_unload;
2584
2585 /* Set up MODINFO_ATTR fields */
2586 setup_modinfo(mod, &info);
2587
2588 /* Fix up syms, so that st_value is a pointer to location. */
2589 err = simplify_symbols(mod, &info);
2590 if (err < 0)
2591 goto free_modinfo;
2592
2593 err = apply_relocations(mod, &info);
2594 if (err < 0)
2595 goto free_modinfo;
2596
2597 err = post_relocation(mod, &info);
2598 if (err < 0)
2599 goto free_modinfo;
2600
2601 flush_module_icache(mod);
2602
2603 /* Now copy in args */
2604 mod->args = strndup_user(uargs, ~0UL >> 1);
2605 if (IS_ERR(mod->args)) {
2606 err = PTR_ERR(mod->args);
2607 goto free_arch_cleanup;
2608 }
2609
2610 /* Mark state as coming so strong_try_module_get() ignores us. */
2611 mod->state = MODULE_STATE_COMING;
2519 2612
2520 /* Now sew it into the lists so we can get lockdep and oops 2613 /* Now sew it into the lists so we can get lockdep and oops
2521 * info during argument parsing. No one should access us, since 2614
@@ -2530,70 +2623,61 @@ static noinline struct module *load_module(void __user *umod,
2530 goto unlock; 2623 goto unlock;
2531 } 2624 }
2532 2625
2533 if (debug) 2626 /* This has to be done once we're sure module name is unique. */
2534 dynamic_debug_setup(debug, num_debug); 2627 if (!mod->taints)
2628 dynamic_debug_setup(info.debug, info.num_debug);
2535 2629
2536 /* Find duplicate symbols */ 2630 /* Find duplicate symbols */
2537 err = verify_export_symbols(mod); 2631 err = verify_export_symbols(mod);
2538 if (err < 0) 2632 if (err < 0)
2539 goto ddebug; 2633 goto ddebug;
2540 2634
2635 module_bug_finalize(info.hdr, info.sechdrs, mod);
2541 list_add_rcu(&mod->list, &modules); 2636 list_add_rcu(&mod->list, &modules);
2542 mutex_unlock(&module_mutex); 2637 mutex_unlock(&module_mutex);
2543 2638
2639 /* Module is ready to execute: parsing args may do that. */
2544 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2640 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2545 if (err < 0) 2641 if (err < 0)
2546 goto unlink; 2642 goto unlink;
2547 2643
2548 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2644 /* Link in to sysfs. */
2645 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
2549 if (err < 0) 2646 if (err < 0)
2550 goto unlink; 2647 goto unlink;
2551 2648
2552 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2649 /* Get rid of temporary copy and strmap. */
2553 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2650 kfree(info.strmap);
2554 2651 free_copy(&info);
2555 /* Get rid of temporary copy */
2556 vfree(hdr);
2557
2558 trace_module_load(mod);
2559 2652
2560 /* Done! */ 2653 /* Done! */
2654 trace_module_load(mod);
2561 return mod; 2655 return mod;
2562 2656
2563 unlink: 2657 unlink:
2564 mutex_lock(&module_mutex); 2658 mutex_lock(&module_mutex);
2565 /* Unlink carefully: kallsyms could be walking list. */ 2659 /* Unlink carefully: kallsyms could be walking list. */
2566 list_del_rcu(&mod->list); 2660 list_del_rcu(&mod->list);
2661 module_bug_cleanup(mod);
2662
2567 ddebug: 2663 ddebug:
2568 dynamic_debug_remove(debug); 2664 if (!mod->taints)
2665 dynamic_debug_remove(info.debug);
2569 unlock: 2666 unlock:
2570 mutex_unlock(&module_mutex); 2667 mutex_unlock(&module_mutex);
2571 synchronize_sched(); 2668 synchronize_sched();
2669 kfree(mod->args);
2670 free_arch_cleanup:
2572 module_arch_cleanup(mod); 2671 module_arch_cleanup(mod);
2573 cleanup: 2672 free_modinfo:
2574 free_modinfo(mod); 2673 free_modinfo(mod);
2674 free_unload:
2575 module_unload_free(mod); 2675 module_unload_free(mod);
2576#if defined(CONFIG_MODULE_UNLOAD) 2676 free_module:
2577 free_percpu(mod->refptr); 2677 module_deallocate(mod, &info);
2578 free_init: 2678 free_copy:
2579#endif 2679 free_copy(&info);
2580 module_free(mod, mod->module_init);
2581 free_core:
2582 module_free(mod, mod->module_core);
2583 /* mod will be freed with core. Don't access it beyond this line! */
2584 free_percpu:
2585 free_percpu(percpu);
2586 free_mod:
2587 kfree(args);
2588 kfree(strmap);
2589 free_hdr:
2590 vfree(hdr);
2591 return ERR_PTR(err); 2680 return ERR_PTR(err);
2592
2593 truncated:
2594 printk(KERN_ERR "Module len %lu truncated\n", len);
2595 err = -ENOEXEC;
2596 goto free_hdr;
2597} 2681}
2598 2682
2599/* Call module constructors. */ 2683/* Call module constructors. */
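
The module.c diff above splits the old monolithic load_module() into staged helpers (copy_and_check(), layout_and_allocate(), find_module_sections(), post_relocation(), flush_module_icache(), and so on), with each failing stage unwinding through a chain of goto labels in reverse order of setup. Below is a minimal userspace sketch of that unwind pattern only, with hypothetical stage names; it is not kernel code and not part of the patch.

/* Staged pipeline with goto-based unwinding, modelled after the new
 * load_module() structure.  Each stage acquires one resource; on failure
 * control jumps to the label that releases everything acquired so far. */
#include <stdio.h>
#include <stdlib.h>

struct image {
	void *copy;	/* stands in for the vmalloc'd user copy */
	void *layout;	/* stands in for the laid-out module memory */
	char *args;	/* stands in for the duplicated argument string */
};

static int copy_stage(struct image *img)   { img->copy = malloc(16);   return img->copy ? 0 : -1; }
static int layout_stage(struct image *img) { img->layout = malloc(16); return img->layout ? 0 : -1; }
static int args_stage(struct image *img)   { img->args = malloc(16);   return img->args ? 0 : -1; }

static int load(struct image *img)
{
	int err;

	err = copy_stage(img);
	if (err)
		goto out;		/* nothing to unwind yet */

	err = layout_stage(img);
	if (err)
		goto free_copy;		/* undo the copy stage only */

	err = args_stage(img);
	if (err)
		goto free_layout;	/* undo layout, then the copy */

	return 0;			/* success: keep everything */

free_layout:
	free(img->layout);
free_copy:
	free(img->copy);
out:
	return err;
}

int main(void)
{
	struct image img = { 0 };

	printf("load() -> %d\n", load(&img));
	return 0;
}
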
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4c0b7b3e6d2e..200407c1502f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -36,15 +36,6 @@
36# include <asm/mutex.h> 36# include <asm/mutex.h>
37#endif 37#endif
38 38
39/***
40 * mutex_init - initialize the mutex
41 * @lock: the mutex to be initialized
42 * @key: the lock_class_key for the class; used by mutex lock debugging
43 *
44 * Initialize the mutex to unlocked state.
45 *
46 * It is not allowed to initialize an already locked mutex.
47 */
48void 39void
49__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 40__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
50{ 41{
@@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
68static __used noinline void __sched 59static __used noinline void __sched
69__mutex_lock_slowpath(atomic_t *lock_count); 60__mutex_lock_slowpath(atomic_t *lock_count);
70 61
71/*** 62/**
72 * mutex_lock - acquire the mutex 63 * mutex_lock - acquire the mutex
73 * @lock: the mutex to be acquired 64 * @lock: the mutex to be acquired
74 * 65 *
@@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock);
105 96
106static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 97static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
107 98
108/*** 99/**
109 * mutex_unlock - release the mutex 100 * mutex_unlock - release the mutex
110 * @lock: the mutex to be released 101 * @lock: the mutex to be released
111 * 102 *
@@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count);
364static noinline int __sched 355static noinline int __sched
365__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 356__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
366 357
367/*** 358/**
368 * mutex_lock_interruptible - acquire the mutex, interruptable 359 * mutex_lock_interruptible - acquire the mutex, interruptible
369 * @lock: the mutex to be acquired 360 * @lock: the mutex to be acquired
370 * 361 *
371 * Lock the mutex like mutex_lock(), and return 0 if the mutex has 362 * Lock the mutex like mutex_lock(), and return 0 if the mutex has
@@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
456 return prev == 1; 447 return prev == 1;
457} 448}
458 449
459/*** 450/**
460 * mutex_trylock - try acquire the mutex, without waiting 451 * mutex_trylock - try to acquire the mutex, without waiting
461 * @lock: the mutex to be acquired 452 * @lock: the mutex to be acquired
462 * 453 *
463 * Try to acquire the mutex atomically. Returns 1 if the mutex 454 * Try to acquire the mutex atomically. Returns 1 if the mutex
464 * has been acquired successfully, and 0 on contention. 455 * has been acquired successfully, and 0 on contention.
465 * 456 *
466 * NOTE: this function follows the spin_trylock() convention, so 457 * NOTE: this function follows the spin_trylock() convention, so
467 * it is negated to the down_trylock() return values! Be careful 458 * it is negated from the down_trylock() return values! Be careful
468 * about this when converting semaphore users to mutexes. 459 * about this when converting semaphore users to mutexes.
469 * 460 *
470 * This function must not be used in interrupt context. The 461 * This function must not be used in interrupt context. The
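
The kerneldoc fix above restates that mutex_trylock() returns 1 on success while down_trylock() returns 0 on success. The userspace model below only illustrates that inverted convention; the two model_* helpers are made up for the example and are not the kernel primitives.

/* Userspace model (not kernel code) of the two trylock return conventions. */
#include <stdio.h>
#include <stdbool.h>

static bool taken;

static int model_mutex_trylock(void)	/* 1 = acquired, 0 = contended */
{
	if (taken)
		return 0;
	taken = true;
	return 1;
}

static int model_down_trylock(void)	/* 0 = acquired, non-zero = contended */
{
	if (taken)
		return 1;
	taken = true;
	return 0;
}

int main(void)
{
	if (model_mutex_trylock())
		printf("mutex-style: acquired (non-zero means success)\n");
	taken = false;
	if (model_down_trylock() == 0)
		printf("semaphore-style: acquired (zero means success)\n");
	return 0;
}
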
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec8efe0..2c98ad94ba0e 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
85 return ERR_PTR(-EPERM); 85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current)) 86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM); 87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
88 96
89 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
90 if (!ns_cgroup) 98 if (!ns_cgroup)
diff --git a/kernel/padata.c b/kernel/padata.c
index fdd8ae609ce3..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -26,18 +26,19 @@
26#include <linux/mutex.h> 26#include <linux/mutex.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sysfs.h>
29#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
30 31
31#define MAX_SEQ_NR INT_MAX - NR_CPUS 32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
32#define MAX_OBJ_NUM 1000 33#define MAX_OBJ_NUM 1000
33 34
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{ 36{
36 int cpu, target_cpu; 37 int cpu, target_cpu;
37 38
38 target_cpu = cpumask_first(pd->cpumask); 39 target_cpu = cpumask_first(pd->cpumask.pcpu);
39 for (cpu = 0; cpu < cpu_index; cpu++) 40 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask); 41 target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
41 42
42 return target_cpu; 43 return target_cpu;
43} 44}
@@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
53 * Hash the sequence numbers to the cpus by taking 54 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use. 55 * seq_nr mod. number of cpus in use.
55 */ 56 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); 57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
57 58
58 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
59} 60}
60 61
61static void padata_parallel_worker(struct work_struct *work) 62static void padata_parallel_worker(struct work_struct *parallel_work)
62{ 63{
63 struct padata_queue *queue; 64 struct padata_parallel_queue *pqueue;
64 struct parallel_data *pd; 65 struct parallel_data *pd;
65 struct padata_instance *pinst; 66 struct padata_instance *pinst;
66 LIST_HEAD(local_list); 67 LIST_HEAD(local_list);
67 68
68 local_bh_disable(); 69 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork); 70 pqueue = container_of(parallel_work,
70 pd = queue->pd; 71 struct padata_parallel_queue, work);
72 pd = pqueue->pd;
71 pinst = pd->pinst; 73 pinst = pd->pinst;
72 74
73 spin_lock(&queue->parallel.lock); 75 spin_lock(&pqueue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list); 76 list_replace_init(&pqueue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock); 77 spin_unlock(&pqueue->parallel.lock);
76 78
77 while (!list_empty(&local_list)) { 79 while (!list_empty(&local_list)) {
78 struct padata_priv *padata; 80 struct padata_priv *padata;
@@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work)
94 * @pinst: padata instance 96 * @pinst: padata instance
95 * @padata: object to be parallelized 97 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on, 98 * @cb_cpu: cpu the serialization callback function will run on,
97 * must be in the cpumask of padata. 99 * must be in the serial cpumask of padata (i.e. cpumask.cbcpu).
98 * 100 *
99 * The parallelization callback function will run with BHs off. 101 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel 102 * Note: Every object which is parallelized by padata_do_parallel
@@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu) 106 struct padata_priv *padata, int cb_cpu)
105{ 107{
106 int target_cpu, err; 108 int target_cpu, err;
107 struct padata_queue *queue; 109 struct padata_parallel_queue *queue;
108 struct parallel_data *pd; 110 struct parallel_data *pd;
109 111
110 rcu_read_lock_bh(); 112 rcu_read_lock_bh();
111 113
112 pd = rcu_dereference(pinst->pd); 114 pd = rcu_dereference(pinst->pd);
113 115
114 err = 0; 116 err = -EINVAL;
115 if (!(pinst->flags & PADATA_INIT)) 117 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
118 goto out;
119
120 if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
116 goto out; 121 goto out;
117 122
118 err = -EBUSY; 123 err = -EBUSY;
@@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst,
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) 127 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out; 128 goto out;
124 129
125 err = -EINVAL; 130 err = 0;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt); 131 atomic_inc(&pd->refcnt);
131 padata->pd = pd; 132 padata->pd = pd;
132 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
@@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst,
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr); 138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138 139
139 target_cpu = padata_cpu_hash(padata); 140 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu); 141 queue = per_cpu_ptr(pd->pqueue, target_cpu);
141 142
142 spin_lock(&queue->parallel.lock); 143 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list); 144 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock); 145 spin_unlock(&queue->parallel.lock);
145 146
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork); 147 queue_work_on(target_cpu, pinst->wq, &queue->work);
147 148
148out: 149out:
149 rcu_read_unlock_bh(); 150 rcu_read_unlock_bh();
@@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel);
171 */ 172 */
172static struct padata_priv *padata_get_next(struct parallel_data *pd) 173static struct padata_priv *padata_get_next(struct parallel_data *pd)
173{ 174{
174 int cpu, num_cpus, empty, calc_seq_nr; 175 int cpu, num_cpus;
175 int seq_nr, next_nr, overrun, next_overrun; 176 int next_nr, next_index;
176 struct padata_queue *queue, *next_queue; 177 struct padata_parallel_queue *queue, *next_queue;
177 struct padata_priv *padata; 178 struct padata_priv *padata;
178 struct padata_list *reorder; 179 struct padata_list *reorder;
179 180
180 empty = 0; 181 num_cpus = cpumask_weight(pd->cpumask.pcpu);
181 next_nr = -1;
182 next_overrun = 0;
183 next_queue = NULL;
184
185 num_cpus = cpumask_weight(pd->cpumask);
186
187 for_each_cpu(cpu, pd->cpumask) {
188 queue = per_cpu_ptr(pd->queue, cpu);
189 reorder = &queue->reorder;
190
191 /*
192 * Calculate the seq_nr of the object that should be
193 * next in this reorder queue.
194 */
195 overrun = 0;
196 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
197 + queue->cpu_index;
198 182
199 if (unlikely(calc_seq_nr > pd->max_seq_nr)) { 183 /*
200 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; 184 * Calculate the percpu reorder queue and the sequence
201 overrun = 1; 185 * number of the next object.
202 } 186 */
203 187 next_nr = pd->processed;
204 if (!list_empty(&reorder->list)) { 188 next_index = next_nr % num_cpus;
205 padata = list_entry(reorder->list.next, 189 cpu = padata_index_to_cpu(pd, next_index);
206 struct padata_priv, list); 190 next_queue = per_cpu_ptr(pd->pqueue, cpu);
207 191
208 seq_nr = padata->seq_nr; 192 if (unlikely(next_nr > pd->max_seq_nr)) {
209 BUG_ON(calc_seq_nr != seq_nr); 193 next_nr = next_nr - pd->max_seq_nr - 1;
210 } else { 194 next_index = next_nr % num_cpus;
211 seq_nr = calc_seq_nr; 195 cpu = padata_index_to_cpu(pd, next_index);
212 empty++; 196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
213 } 197 pd->processed = 0;
214
215 if (next_nr < 0 || seq_nr < next_nr
216 || (next_overrun && !overrun)) {
217 next_nr = seq_nr;
218 next_overrun = overrun;
219 next_queue = queue;
220 }
221 } 198 }
222 199
223 padata = NULL; 200 padata = NULL;
224 201
225 if (empty == num_cpus)
226 goto out;
227
228 reorder = &next_queue->reorder; 202 reorder = &next_queue->reorder;
229 203
230 if (!list_empty(&reorder->list)) { 204 if (!list_empty(&reorder->list)) {
231 padata = list_entry(reorder->list.next, 205 padata = list_entry(reorder->list.next,
232 struct padata_priv, list); 206 struct padata_priv, list);
233 207
234 if (unlikely(next_overrun)) { 208 BUG_ON(next_nr != padata->seq_nr);
235 for_each_cpu(cpu, pd->cpumask) {
236 queue = per_cpu_ptr(pd->queue, cpu);
237 atomic_set(&queue->num_obj, 0);
238 }
239 }
240 209
241 spin_lock(&reorder->lock); 210 spin_lock(&reorder->lock);
242 list_del_init(&padata->list); 211 list_del_init(&padata->list);
243 atomic_dec(&pd->reorder_objects); 212 atomic_dec(&pd->reorder_objects);
244 spin_unlock(&reorder->lock); 213 spin_unlock(&reorder->lock);
245 214
246 atomic_inc(&next_queue->num_obj); 215 pd->processed++;
247 216
248 goto out; 217 goto out;
249 } 218 }
250 219
251 queue = per_cpu_ptr(pd->queue, smp_processor_id()); 220 queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
252 if (queue->cpu_index == next_queue->cpu_index) { 221 if (queue->cpu_index == next_queue->cpu_index) {
253 padata = ERR_PTR(-ENODATA); 222 padata = ERR_PTR(-ENODATA);
254 goto out; 223 goto out;
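
The rewritten padata_get_next() above drops the per-queue num_obj counters and the scan over every reorder queue: because sequence numbers are dealt out round-robin over the parallel cpumask, the queue holding the next in-order object is simply pd->processed modulo the number of CPUs in use (with a wrap at max_seq_nr). The userspace snippet below only demonstrates that index calculation; it is illustrative, not kernel code.

/* Round-robin mapping from "objects processed so far" to the reorder
 * queue that must hold the next in-order object. */
#include <stdio.h>

int main(void)
{
	unsigned int num_cpus = 4;	/* weight of the parallel cpumask */
	unsigned int processed;

	for (processed = 0; processed < 8; processed++)
		printf("object #%u -> reorder queue index %u\n",
		       processed, processed % num_cpus);
	return 0;
}
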
@@ -262,7 +231,7 @@ out:
262static void padata_reorder(struct parallel_data *pd) 231static void padata_reorder(struct parallel_data *pd)
263{ 232{
264 struct padata_priv *padata; 233 struct padata_priv *padata;
265 struct padata_queue *queue; 234 struct padata_serial_queue *squeue;
266 struct padata_instance *pinst = pd->pinst; 235 struct padata_instance *pinst = pd->pinst;
267 236
268 /* 237 /*
@@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd)
301 return; 270 return;
302 } 271 }
303 272
304 queue = per_cpu_ptr(pd->queue, padata->cb_cpu); 273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
305 274
306 spin_lock(&queue->serial.lock); 275 spin_lock(&squeue->serial.lock);
307 list_add_tail(&padata->list, &queue->serial.list); 276 list_add_tail(&padata->list, &squeue->serial.list);
308 spin_unlock(&queue->serial.lock); 277 spin_unlock(&squeue->serial.lock);
309 278
310 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); 279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
311 } 280 }
312 281
313 spin_unlock_bh(&pd->lock); 282 spin_unlock_bh(&pd->lock);
@@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg)
333 padata_reorder(pd); 302 padata_reorder(pd);
334} 303}
335 304
336static void padata_serial_worker(struct work_struct *work) 305static void padata_serial_worker(struct work_struct *serial_work)
337{ 306{
338 struct padata_queue *queue; 307 struct padata_serial_queue *squeue;
339 struct parallel_data *pd; 308 struct parallel_data *pd;
340 LIST_HEAD(local_list); 309 LIST_HEAD(local_list);
341 310
342 local_bh_disable(); 311 local_bh_disable();
343 queue = container_of(work, struct padata_queue, swork); 312 squeue = container_of(serial_work, struct padata_serial_queue, work);
344 pd = queue->pd; 313 pd = squeue->pd;
345 314
346 spin_lock(&queue->serial.lock); 315 spin_lock(&squeue->serial.lock);
347 list_replace_init(&queue->serial.list, &local_list); 316 list_replace_init(&squeue->serial.list, &local_list);
348 spin_unlock(&queue->serial.lock); 317 spin_unlock(&squeue->serial.lock);
349 318
350 while (!list_empty(&local_list)) { 319 while (!list_empty(&local_list)) {
351 struct padata_priv *padata; 320 struct padata_priv *padata;
@@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work)
372void padata_do_serial(struct padata_priv *padata) 341void padata_do_serial(struct padata_priv *padata)
373{ 342{
374 int cpu; 343 int cpu;
375 struct padata_queue *queue; 344 struct padata_parallel_queue *pqueue;
376 struct parallel_data *pd; 345 struct parallel_data *pd;
377 346
378 pd = padata->pd; 347 pd = padata->pd;
379 348
380 cpu = get_cpu(); 349 cpu = get_cpu();
381 queue = per_cpu_ptr(pd->queue, cpu); 350 pqueue = per_cpu_ptr(pd->pqueue, cpu);
382 351
383 spin_lock(&queue->reorder.lock); 352 spin_lock(&pqueue->reorder.lock);
384 atomic_inc(&pd->reorder_objects); 353 atomic_inc(&pd->reorder_objects);
385 list_add_tail(&padata->list, &queue->reorder.list); 354 list_add_tail(&padata->list, &pqueue->reorder.list);
386 spin_unlock(&queue->reorder.lock); 355 spin_unlock(&pqueue->reorder.lock);
387 356
388 put_cpu(); 357 put_cpu();
389 358
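
For context, the two entry points updated above are used as a pair: a client embeds struct padata_priv in its request, submits it with padata_do_parallel(), does the heavy work in its parallel callback, and hands the object back with padata_do_serial() so completion runs in submission order on cb_cpu. The sketch below assumes the ->parallel and ->serial callback members of struct padata_priv, which are not visible in this hunk; treat it as an illustration rather than part of the patch.

#include <linux/kernel.h>
#include <linux/padata.h>

struct my_request {
	struct padata_priv padata;	/* recovered via container_of() in the callbacks */
	/* ... caller-private state ... */
};

/* Runs with BHs off on a CPU from the parallel (pcpu) cpumask. */
static void my_parallel(struct padata_priv *padata)
{
	struct my_request *req = container_of(padata, struct my_request, padata);

	pr_debug("processing request %p\n", req);	/* stand-in for the real work */
	padata_do_serial(padata);			/* queue for in-order completion */
}

/* Runs on the requested cb_cpu, in the original submission order. */
static void my_serial(struct padata_priv *padata)
{
	/* ... complete the request ... */
}

static int my_submit(struct padata_instance *pinst,
		     struct my_request *req, int cb_cpu)
{
	req->padata.parallel = my_parallel;
	req->padata.serial = my_serial;
	/* cb_cpu must lie in the serial (cbcpu) cpumask, as checked above. */
	return padata_do_parallel(pinst, &req->padata, cb_cpu);
}
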
@@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata)
391} 360}
392EXPORT_SYMBOL(padata_do_serial); 361EXPORT_SYMBOL(padata_do_serial);
393 362
394/* Allocate and initialize the internal cpumask dependent resources. */ 363static int padata_setup_cpumasks(struct parallel_data *pd,
395static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, 364 const struct cpumask *pcpumask,
396 const struct cpumask *cpumask) 365 const struct cpumask *cbcpumask)
397{ 366{
398 int cpu, cpu_index, num_cpus; 367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
399 struct padata_queue *queue; 368 return -ENOMEM;
400 struct parallel_data *pd;
401
402 cpu_index = 0;
403 369
404 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); 370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
405 if (!pd) 371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
406 goto err; 372 free_cpumask_var(pd->cpumask.cbcpu);
373 return -ENOMEM;
374 }
407 375
408 pd->queue = alloc_percpu(struct padata_queue); 376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
409 if (!pd->queue) 377 return 0;
410 goto err_free_pd; 378}
411 379
412 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) 380static void __padata_list_init(struct padata_list *pd_list)
413 goto err_free_queue; 381{
382 INIT_LIST_HEAD(&pd_list->list);
383 spin_lock_init(&pd_list->lock);
384}
414 385
415 cpumask_and(pd->cpumask, cpumask, cpu_active_mask); 386/* Initialize all percpu queues used by serial workers */
387static void padata_init_squeues(struct parallel_data *pd)
388{
389 int cpu;
390 struct padata_serial_queue *squeue;
416 391
417 for_each_cpu(cpu, pd->cpumask) { 392 for_each_cpu(cpu, pd->cpumask.cbcpu) {
418 queue = per_cpu_ptr(pd->queue, cpu); 393 squeue = per_cpu_ptr(pd->squeue, cpu);
394 squeue->pd = pd;
395 __padata_list_init(&squeue->serial);
396 INIT_WORK(&squeue->work, padata_serial_worker);
397 }
398}
419 399
420 queue->pd = pd; 400/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd)
402{
403 int cpu_index, num_cpus, cpu;
404 struct padata_parallel_queue *pqueue;
421 405
422 queue->cpu_index = cpu_index; 406 cpu_index = 0;
407 for_each_cpu(cpu, pd->cpumask.pcpu) {
408 pqueue = per_cpu_ptr(pd->pqueue, cpu);
409 pqueue->pd = pd;
410 pqueue->cpu_index = cpu_index;
423 cpu_index++; 411 cpu_index++;
424 412
425 INIT_LIST_HEAD(&queue->reorder.list); 413 __padata_list_init(&pqueue->reorder);
426 INIT_LIST_HEAD(&queue->parallel.list); 414 __padata_list_init(&pqueue->parallel);
427 INIT_LIST_HEAD(&queue->serial.list); 415 INIT_WORK(&pqueue->work, padata_parallel_worker);
428 spin_lock_init(&queue->reorder.lock); 416 atomic_set(&pqueue->num_obj, 0);
429 spin_lock_init(&queue->parallel.lock);
430 spin_lock_init(&queue->serial.lock);
431
432 INIT_WORK(&queue->pwork, padata_parallel_worker);
433 INIT_WORK(&queue->swork, padata_serial_worker);
434 atomic_set(&queue->num_obj, 0);
435 } 417 }
436 418
437 num_cpus = cpumask_weight(pd->cpumask); 419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
438 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; 420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421}
422
423/* Allocate and initialize the internal cpumask dependent resources. */
424static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
425 const struct cpumask *pcpumask,
426 const struct cpumask *cbcpumask)
427{
428 struct parallel_data *pd;
439 429
430 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
431 if (!pd)
432 goto err;
433
434 pd->pqueue = alloc_percpu(struct padata_parallel_queue);
435 if (!pd->pqueue)
436 goto err_free_pd;
437
438 pd->squeue = alloc_percpu(struct padata_serial_queue);
439 if (!pd->squeue)
440 goto err_free_pqueue;
441 if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
442 goto err_free_squeue;
443
444 padata_init_pqueues(pd);
445 padata_init_squeues(pd);
440 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
441 atomic_set(&pd->seq_nr, -1); 447 atomic_set(&pd->seq_nr, -1);
442 atomic_set(&pd->reorder_objects, 0); 448 atomic_set(&pd->reorder_objects, 0);
@@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
446 452
447 return pd; 453 return pd;
448 454
449err_free_queue: 455err_free_squeue:
450 free_percpu(pd->queue); 456 free_percpu(pd->squeue);
457err_free_pqueue:
458 free_percpu(pd->pqueue);
451err_free_pd: 459err_free_pd:
452 kfree(pd); 460 kfree(pd);
453err: 461err:
@@ -456,8 +464,10 @@ err:
456 464
457static void padata_free_pd(struct parallel_data *pd) 465static void padata_free_pd(struct parallel_data *pd)
458{ 466{
459 free_cpumask_var(pd->cpumask); 467 free_cpumask_var(pd->cpumask.pcpu);
460 free_percpu(pd->queue); 468 free_cpumask_var(pd->cpumask.cbcpu);
469 free_percpu(pd->pqueue);
470 free_percpu(pd->squeue);
461 kfree(pd); 471 kfree(pd);
462} 472}
463 473
@@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd)
465static void padata_flush_queues(struct parallel_data *pd) 475static void padata_flush_queues(struct parallel_data *pd)
466{ 476{
467 int cpu; 477 int cpu;
468 struct padata_queue *queue; 478 struct padata_parallel_queue *pqueue;
479 struct padata_serial_queue *squeue;
469 480
470 for_each_cpu(cpu, pd->cpumask) { 481 for_each_cpu(cpu, pd->cpumask.pcpu) {
471 queue = per_cpu_ptr(pd->queue, cpu); 482 pqueue = per_cpu_ptr(pd->pqueue, cpu);
472 flush_work(&queue->pwork); 483 flush_work(&pqueue->work);
473 } 484 }
474 485
475 del_timer_sync(&pd->timer); 486 del_timer_sync(&pd->timer);
@@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd)
477 if (atomic_read(&pd->reorder_objects)) 488 if (atomic_read(&pd->reorder_objects))
478 padata_reorder(pd); 489 padata_reorder(pd);
479 490
480 for_each_cpu(cpu, pd->cpumask) { 491 for_each_cpu(cpu, pd->cpumask.cbcpu) {
481 queue = per_cpu_ptr(pd->queue, cpu); 492 squeue = per_cpu_ptr(pd->squeue, cpu);
482 flush_work(&queue->swork); 493 flush_work(&squeue->work);
483 } 494 }
484 495
485 BUG_ON(atomic_read(&pd->refcnt) != 0); 496 BUG_ON(atomic_read(&pd->refcnt) != 0);
486} 497}
487 498
499static void __padata_start(struct padata_instance *pinst)
500{
501 pinst->flags |= PADATA_INIT;
502}
503
504static void __padata_stop(struct padata_instance *pinst)
505{
506 if (!(pinst->flags & PADATA_INIT))
507 return;
508
509 pinst->flags &= ~PADATA_INIT;
510
511 synchronize_rcu();
512
513 get_online_cpus();
514 padata_flush_queues(pinst->pd);
515 put_online_cpus();
516}
517
488/* Replace the internal control structure with a new one. */ 518/* Replace the internal control structure with a new one. */
489static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
490 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
491{ 521{
492 struct parallel_data *pd_old = pinst->pd; 522 struct parallel_data *pd_old = pinst->pd;
523 int notification_mask = 0;
493 524
494 pinst->flags |= PADATA_RESET; 525 pinst->flags |= PADATA_RESET;
495 526
@@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst,
497 528
498 synchronize_rcu(); 529 synchronize_rcu();
499 530
531 if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
532 notification_mask |= PADATA_CPU_PARALLEL;
533 if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
534 notification_mask |= PADATA_CPU_SERIAL;
535
500 padata_flush_queues(pd_old); 536 padata_flush_queues(pd_old);
501 padata_free_pd(pd_old); 537 padata_free_pd(pd_old);
502 538
539 if (notification_mask)
540 blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
541 notification_mask,
542 &pd_new->cpumask);
543
503 pinst->flags &= ~PADATA_RESET; 544 pinst->flags &= ~PADATA_RESET;
504} 545}
505 546
506/** 547/**
507 * padata_set_cpumask - set the cpumask that padata should use 548 * padata_register_cpumask_notifier - Registers a notifier that will be called
549 * if either pcpu or cbcpu or both cpumasks change.
508 * 550 *
509 * @pinst: padata instance 551 * @pinst: A pointer to padata instance
510 * @cpumask: the cpumask to use 552 * @nblock: A pointer to notifier block.
511 */ 553 */
512int padata_set_cpumask(struct padata_instance *pinst, 554int padata_register_cpumask_notifier(struct padata_instance *pinst,
513 cpumask_var_t cpumask) 555 struct notifier_block *nblock)
514{ 556{
557 return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
558 nblock);
559}
560EXPORT_SYMBOL(padata_register_cpumask_notifier);
561
562/**
563 * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
564 * registered earlier using padata_register_cpumask_notifier
565 *
566 * @pinst: A pointer to the padata instance.
567 * @nblock: A pointer to the notifier block.
568 */
569int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
570 struct notifier_block *nblock)
571{
572 return blocking_notifier_chain_unregister(
573 &pinst->cpumask_change_notifier,
574 nblock);
575}
576EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
577
578
579/* If cpumask contains no active cpu, we mark the instance as invalid. */
580static bool padata_validate_cpumask(struct padata_instance *pinst,
581 const struct cpumask *cpumask)
582{
583 if (!cpumask_intersects(cpumask, cpu_active_mask)) {
584 pinst->flags |= PADATA_INVALID;
585 return false;
586 }
587
588 pinst->flags &= ~PADATA_INVALID;
589 return true;
590}
591
592static int __padata_set_cpumasks(struct padata_instance *pinst,
593 cpumask_var_t pcpumask,
594 cpumask_var_t cbcpumask)
595{
596 int valid;
515 struct parallel_data *pd; 597 struct parallel_data *pd;
516 int err = 0; 598
599 valid = padata_validate_cpumask(pinst, pcpumask);
600 if (!valid) {
601 __padata_stop(pinst);
602 goto out_replace;
603 }
604
605 valid = padata_validate_cpumask(pinst, cbcpumask);
606 if (!valid)
607 __padata_stop(pinst);
608
609out_replace:
610 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
611 if (!pd)
612 return -ENOMEM;
613
614 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
615 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
616
617 padata_replace(pinst, pd);
618
619 if (valid)
620 __padata_start(pinst);
621
622 return 0;
623}
624
625/**
626 * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
627 * one is used by parallel workers and the second one
628 * by the workers doing serialization.
629 *
630 * @pinst: padata instance
631 * @pcpumask: the cpumask to use for parallel workers
632 * @cbcpumask: the cpumask to use for serial workers
633 */
634int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
635 cpumask_var_t cbcpumask)
636{
637 int err;
517 638
518 mutex_lock(&pinst->lock); 639 mutex_lock(&pinst->lock);
640 get_online_cpus();
519 641
642 err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
643
644 put_online_cpus();
645 mutex_unlock(&pinst->lock);
646
647 return err;
648
649}
650EXPORT_SYMBOL(padata_set_cpumasks);
651
652/**
653 * padata_set_cpumask: Set the cpumask specified by @cpumask_type to the
654 * value of @cpumask.
655 *
656 * @pinst: padata instance
657 * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
658 * to parallel and serial cpumasks respectively.
659 * @cpumask: the cpumask to use
660 */
661int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
662 cpumask_var_t cpumask)
663{
664 struct cpumask *serial_mask, *parallel_mask;
665 int err = -EINVAL;
666
667 mutex_lock(&pinst->lock);
520 get_online_cpus(); 668 get_online_cpus();
521 669
522 pd = padata_alloc_pd(pinst, cpumask); 670 switch (cpumask_type) {
523 if (!pd) { 671 case PADATA_CPU_PARALLEL:
524 err = -ENOMEM; 672 serial_mask = pinst->cpumask.cbcpu;
525 goto out; 673 parallel_mask = cpumask;
674 break;
675 case PADATA_CPU_SERIAL:
676 parallel_mask = pinst->cpumask.pcpu;
677 serial_mask = cpumask;
678 break;
679 default:
680 goto out;
526 } 681 }
527 682
528 cpumask_copy(pinst->cpumask, cpumask); 683 err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
529
530 padata_replace(pinst, pd);
531 684
532out: 685out:
533 put_online_cpus(); 686 put_online_cpus();
534
535 mutex_unlock(&pinst->lock); 687 mutex_unlock(&pinst->lock);
536 688
537 return err; 689 return err;
@@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
543 struct parallel_data *pd; 695 struct parallel_data *pd;
544 696
545 if (cpumask_test_cpu(cpu, cpu_active_mask)) { 697 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
546 pd = padata_alloc_pd(pinst, pinst->cpumask); 698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
699 pinst->cpumask.cbcpu);
547 if (!pd) 700 if (!pd)
548 return -ENOMEM; 701 return -ENOMEM;
549 702
550 padata_replace(pinst, pd); 703 padata_replace(pinst, pd);
704
705 if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
706 padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
707 __padata_start(pinst);
551 } 708 }
552 709
553 return 0; 710 return 0;
554} 711}
555 712
556/** 713/**
557 * padata_add_cpu - add a cpu to the padata cpumask 714 * padata_add_cpu - add a cpu to one or both (parallel and serial)
715 * padata cpumasks.
558 * 716 *
559 * @pinst: padata instance 717 * @pinst: padata instance
560 * @cpu: cpu to add 718 * @cpu: cpu to add
719 * @mask: bitmask of flags specifying to which cpumask @cpu should be added.
720 * The @mask may be any combination of the following flags:
721 * PADATA_CPU_SERIAL - serial cpumask
722 * PADATA_CPU_PARALLEL - parallel cpumask
561 */ 723 */
562int padata_add_cpu(struct padata_instance *pinst, int cpu) 724
725int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
563{ 726{
564 int err; 727 int err;
565 728
729 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
730 return -EINVAL;
731
566 mutex_lock(&pinst->lock); 732 mutex_lock(&pinst->lock);
567 733
568 get_online_cpus(); 734 get_online_cpus();
569 cpumask_set_cpu(cpu, pinst->cpumask); 735 if (mask & PADATA_CPU_SERIAL)
736 cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
737 if (mask & PADATA_CPU_PARALLEL)
738 cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
739
570 err = __padata_add_cpu(pinst, cpu); 740 err = __padata_add_cpu(pinst, cpu);
571 put_online_cpus(); 741 put_online_cpus();
572 742
@@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu);
578 748
579static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) 749static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
580{ 750{
581 struct parallel_data *pd; 751 struct parallel_data *pd = NULL;
582 752
583 if (cpumask_test_cpu(cpu, cpu_online_mask)) { 753 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
584 pd = padata_alloc_pd(pinst, pinst->cpumask); 754
755 if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
756 !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
757 __padata_stop(pinst);
758
759 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
760 pinst->cpumask.cbcpu);
585 if (!pd) 761 if (!pd)
586 return -ENOMEM; 762 return -ENOMEM;
587 763
@@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
591 return 0; 767 return 0;
592} 768}
593 769
594/** 770/**
595 * padata_remove_cpu - remove a cpu from the padata cpumask 771 * padata_remove_cpu - remove a cpu from one or both (serial and parallel)
772 * padata cpumasks.
596 * 773 *
597 * @pinst: padata instance 774 * @pinst: padata instance
598 * @cpu: cpu to remove 775 * @cpu: cpu to remove
776 * @mask: bitmask specifying from which cpumask @cpu should be removed
777 * The @mask may be any combination of the following flags:
778 * PADATA_CPU_SERIAL - serial cpumask
779 * PADATA_CPU_PARALLEL - parallel cpumask
599 */ 780 */
600int padata_remove_cpu(struct padata_instance *pinst, int cpu) 781int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
601{ 782{
602 int err; 783 int err;
603 784
785 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
786 return -EINVAL;
787
604 mutex_lock(&pinst->lock); 788 mutex_lock(&pinst->lock);
605 789
606 get_online_cpus(); 790 get_online_cpus();
607 cpumask_clear_cpu(cpu, pinst->cpumask); 791 if (mask & PADATA_CPU_SERIAL)
792 cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
793 if (mask & PADATA_CPU_PARALLEL)
794 cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
795
608 err = __padata_remove_cpu(pinst, cpu); 796 err = __padata_remove_cpu(pinst, cpu);
609 put_online_cpus(); 797 put_online_cpus();
610 798
@@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu);
619 * 807 *
620 * @pinst: padata instance to start 808 * @pinst: padata instance to start
621 */ 809 */
622void padata_start(struct padata_instance *pinst) 810int padata_start(struct padata_instance *pinst)
623{ 811{
812 int err = 0;
813
624 mutex_lock(&pinst->lock); 814 mutex_lock(&pinst->lock);
625 pinst->flags |= PADATA_INIT; 815
816 if (pinst->flags & PADATA_INVALID)
817 err = -EINVAL;
818
819 __padata_start(pinst);
820
626 mutex_unlock(&pinst->lock); 821 mutex_unlock(&pinst->lock);
822
823 return err;
627} 824}
628EXPORT_SYMBOL(padata_start); 825EXPORT_SYMBOL(padata_start);
629 826
@@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start);
635void padata_stop(struct padata_instance *pinst) 832void padata_stop(struct padata_instance *pinst)
636{ 833{
637 mutex_lock(&pinst->lock); 834 mutex_lock(&pinst->lock);
638 pinst->flags &= ~PADATA_INIT; 835 __padata_stop(pinst);
639 mutex_unlock(&pinst->lock); 836 mutex_unlock(&pinst->lock);
640} 837}
641EXPORT_SYMBOL(padata_stop); 838EXPORT_SYMBOL(padata_stop);
642 839
643#ifdef CONFIG_HOTPLUG_CPU 840#ifdef CONFIG_HOTPLUG_CPU
841
842static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
843{
844 return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
845 cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
846}
847
848
644static int padata_cpu_callback(struct notifier_block *nfb, 849static int padata_cpu_callback(struct notifier_block *nfb,
645 unsigned long action, void *hcpu) 850 unsigned long action, void *hcpu)
646{ 851{
@@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
653 switch (action) { 858 switch (action) {
654 case CPU_ONLINE: 859 case CPU_ONLINE:
655 case CPU_ONLINE_FROZEN: 860 case CPU_ONLINE_FROZEN:
656 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 861 if (!pinst_has_cpu(pinst, cpu))
657 break; 862 break;
658 mutex_lock(&pinst->lock); 863 mutex_lock(&pinst->lock);
659 err = __padata_add_cpu(pinst, cpu); 864 err = __padata_add_cpu(pinst, cpu);
@@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
664 869
665 case CPU_DOWN_PREPARE: 870 case CPU_DOWN_PREPARE:
666 case CPU_DOWN_PREPARE_FROZEN: 871 case CPU_DOWN_PREPARE_FROZEN:
667 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 872 if (!pinst_has_cpu(pinst, cpu))
668 break; 873 break;
669 mutex_lock(&pinst->lock); 874 mutex_lock(&pinst->lock);
670 err = __padata_remove_cpu(pinst, cpu); 875 err = __padata_remove_cpu(pinst, cpu);
@@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
675 880
676 case CPU_UP_CANCELED: 881 case CPU_UP_CANCELED:
677 case CPU_UP_CANCELED_FROZEN: 882 case CPU_UP_CANCELED_FROZEN:
678 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 883 if (!pinst_has_cpu(pinst, cpu))
679 break; 884 break;
680 mutex_lock(&pinst->lock); 885 mutex_lock(&pinst->lock);
681 __padata_remove_cpu(pinst, cpu); 886 __padata_remove_cpu(pinst, cpu);
@@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
683 888
684 case CPU_DOWN_FAILED: 889 case CPU_DOWN_FAILED:
685 case CPU_DOWN_FAILED_FROZEN: 890 case CPU_DOWN_FAILED_FROZEN:
686 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 891 if (!pinst_has_cpu(pinst, cpu))
687 break; 892 break;
688 mutex_lock(&pinst->lock); 893 mutex_lock(&pinst->lock);
689 __padata_add_cpu(pinst, cpu); 894 __padata_add_cpu(pinst, cpu);
@@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb,
694} 899}
695#endif 900#endif
696 901
902static void __padata_free(struct padata_instance *pinst)
903{
904#ifdef CONFIG_HOTPLUG_CPU
905 unregister_hotcpu_notifier(&pinst->cpu_notifier);
906#endif
907
908 padata_stop(pinst);
909 padata_free_pd(pinst->pd);
910 free_cpumask_var(pinst->cpumask.pcpu);
911 free_cpumask_var(pinst->cpumask.cbcpu);
912 kfree(pinst);
913}
914
915#define kobj2pinst(_kobj) \
916 container_of(_kobj, struct padata_instance, kobj)
917#define attr2pentry(_attr) \
918 container_of(_attr, struct padata_sysfs_entry, attr)
919
920static void padata_sysfs_release(struct kobject *kobj)
921{
922 struct padata_instance *pinst = kobj2pinst(kobj);
923 __padata_free(pinst);
924}
925
926struct padata_sysfs_entry {
927 struct attribute attr;
928 ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
929 ssize_t (*store)(struct padata_instance *, struct attribute *,
930 const char *, size_t);
931};
932
933static ssize_t show_cpumask(struct padata_instance *pinst,
934 struct attribute *attr, char *buf)
935{
936 struct cpumask *cpumask;
937 ssize_t len;
938
939 mutex_lock(&pinst->lock);
940 if (!strcmp(attr->name, "serial_cpumask"))
941 cpumask = pinst->cpumask.cbcpu;
942 else
943 cpumask = pinst->cpumask.pcpu;
944
945 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
946 nr_cpu_ids);
947 if (PAGE_SIZE - len < 2)
948 len = -EINVAL;
949 else
950 len += sprintf(buf + len, "\n");
951
952 mutex_unlock(&pinst->lock);
953 return len;
954}
955
956static ssize_t store_cpumask(struct padata_instance *pinst,
957 struct attribute *attr,
958 const char *buf, size_t count)
959{
960 cpumask_var_t new_cpumask;
961 ssize_t ret;
962 int mask_type;
963
964 if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
965 return -ENOMEM;
966
967 ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
968 nr_cpumask_bits);
969 if (ret < 0)
970 goto out;
971
972 mask_type = !strcmp(attr->name, "serial_cpumask") ?
973 PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
974 ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
975 if (!ret)
976 ret = count;
977
978out:
979 free_cpumask_var(new_cpumask);
980 return ret;
981}
982
983#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
984 static struct padata_sysfs_entry _name##_attr = \
985 __ATTR(_name, 0644, _show_name, _store_name)
986#define PADATA_ATTR_RO(_name, _show_name) \
987 static struct padata_sysfs_entry _name##_attr = \
988 __ATTR(_name, 0400, _show_name, NULL)
989
990PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
991PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
992
993/*
994 * Padata sysfs provides the following objects:
995 * serial_cpumask [RW] - cpumask for serial workers
996 * parallel_cpumask [RW] - cpumask for parallel workers
997 */
998static struct attribute *padata_default_attrs[] = {
999 &serial_cpumask_attr.attr,
1000 &parallel_cpumask_attr.attr,
1001 NULL,
1002};
1003
1004static ssize_t padata_sysfs_show(struct kobject *kobj,
1005 struct attribute *attr, char *buf)
1006{
1007 struct padata_instance *pinst;
1008 struct padata_sysfs_entry *pentry;
1009 ssize_t ret = -EIO;
1010
1011 pinst = kobj2pinst(kobj);
1012 pentry = attr2pentry(attr);
1013 if (pentry->show)
1014 ret = pentry->show(pinst, attr, buf);
1015
1016 return ret;
1017}
1018
1019static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
1020 const char *buf, size_t count)
1021{
1022 struct padata_instance *pinst;
1023 struct padata_sysfs_entry *pentry;
1024 ssize_t ret = -EIO;
1025
1026 pinst = kobj2pinst(kobj);
1027 pentry = attr2pentry(attr);
1028 if (pentry->store)
1029 ret = pentry->store(pinst, attr, buf, count);
1030
1031 return ret;
1032}
1033
1034static const struct sysfs_ops padata_sysfs_ops = {
1035 .show = padata_sysfs_show,
1036 .store = padata_sysfs_store,
1037};
1038
1039static struct kobj_type padata_attr_type = {
1040 .sysfs_ops = &padata_sysfs_ops,
1041 .default_attrs = padata_default_attrs,
1042 .release = padata_sysfs_release,
1043};
1044
697/** 1045/**
698 * padata_alloc - allocate and initialize a padata instance 1046 * padata_alloc_possible - Allocate and initialize padata instance.
1047 * Use the cpu_possible_mask for serial and
1048 * parallel workers.
699 * 1049 *
700 * @cpumask: cpumask that padata uses for parallelization
701 * @wq: workqueue to use for the allocated padata instance 1050 * @wq: workqueue to use for the allocated padata instance
702 */ 1051 */
703struct padata_instance *padata_alloc(const struct cpumask *cpumask, 1052struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
704 struct workqueue_struct *wq) 1053{
1054 return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
1055}
1056EXPORT_SYMBOL(padata_alloc_possible);
1057
1058/**
1059 * padata_alloc - allocate and initialize a padata instance and specify
1060 * cpumasks for serial and parallel workers.
1061 *
1062 * @wq: workqueue to use for the allocated padata instance
1063 * @pcpumask: cpumask that will be used for padata parallelization
1064 * @cbcpumask: cpumask that will be used for padata serialization
1065 */
1066struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1067 const struct cpumask *pcpumask,
1068 const struct cpumask *cbcpumask)
705{ 1069{
706 struct padata_instance *pinst; 1070 struct padata_instance *pinst;
707 struct parallel_data *pd; 1071 struct parallel_data *pd = NULL;
708 1072
709 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); 1073 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
710 if (!pinst) 1074 if (!pinst)
711 goto err; 1075 goto err;
712 1076
713 get_online_cpus(); 1077 get_online_cpus();
714 1078 if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
715 pd = padata_alloc_pd(pinst, cpumask);
716 if (!pd)
717 goto err_free_inst; 1079 goto err_free_inst;
1080 if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
1081 free_cpumask_var(pinst->cpumask.pcpu);
1082 goto err_free_inst;
1083 }
1084 if (!padata_validate_cpumask(pinst, pcpumask) ||
1085 !padata_validate_cpumask(pinst, cbcpumask))
1086 goto err_free_masks;
718 1087
719 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) 1088 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
720 goto err_free_pd; 1089 if (!pd)
1090 goto err_free_masks;
721 1091
722 rcu_assign_pointer(pinst->pd, pd); 1092 rcu_assign_pointer(pinst->pd, pd);
723 1093
724 pinst->wq = wq; 1094 pinst->wq = wq;
725 1095
726 cpumask_copy(pinst->cpumask, cpumask); 1096 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
1097 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
727 1098
728 pinst->flags = 0; 1099 pinst->flags = 0;
729 1100
@@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
735 1106
736 put_online_cpus(); 1107 put_online_cpus();
737 1108
1109 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1110 kobject_init(&pinst->kobj, &padata_attr_type);
738 mutex_init(&pinst->lock); 1111 mutex_init(&pinst->lock);
739 1112
740 return pinst; 1113 return pinst;
741 1114
742err_free_pd: 1115err_free_masks:
743 padata_free_pd(pd); 1116 free_cpumask_var(pinst->cpumask.pcpu);
1117 free_cpumask_var(pinst->cpumask.cbcpu);
744err_free_inst: 1118err_free_inst:
745 kfree(pinst); 1119 kfree(pinst);
746 put_online_cpus(); 1120 put_online_cpus();
@@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc);
756 */ 1130 */
757void padata_free(struct padata_instance *pinst) 1131void padata_free(struct padata_instance *pinst)
758{ 1132{
759 padata_stop(pinst); 1133 kobject_put(&pinst->kobj);
760
761 synchronize_rcu();
762
763#ifdef CONFIG_HOTPLUG_CPU
764 unregister_hotcpu_notifier(&pinst->cpu_notifier);
765#endif
766 get_online_cpus();
767 padata_flush_queues(pinst->pd);
768 put_online_cpus();
769
770 padata_free_pd(pinst->pd);
771 free_cpumask_var(pinst->cpumask);
772 kfree(pinst);
773} 1134}
774EXPORT_SYMBOL(padata_free); 1135EXPORT_SYMBOL(padata_free);
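
To make the reworked padata interface above concrete, here is a minimal usage sketch against the post-patch API: separate parallel/serial cpumasks at allocation time, padata_set_cpumask() with a PADATA_CPU_* selector, padata_start() returning an error for invalid masks, and the cpumask-change notifier. The workqueue name, module names and the notifier body are illustrative assumptions, not taken from this patch.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/padata.h>

static struct padata_instance *pinst;
static struct workqueue_struct *wq;

/* Called with PADATA_CPU_PARALLEL and/or PADATA_CPU_SERIAL in 'mask'
 * whenever padata_replace() installs a parallel_data with new cpumasks. */
static int my_cpumask_change(struct notifier_block *nb,
			     unsigned long mask, void *data)
{
	pr_info("padata cpumasks changed, mask=0x%lx\n", mask);
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call = my_cpumask_change,
};

static int __init my_init(void)
{
	cpumask_var_t pmask;
	int err;

	wq = create_workqueue("my_padata_wq");	/* name is made up */
	if (!wq)
		return -ENOMEM;

	/* Both worker cpumasks default to cpu_possible_mask. */
	pinst = padata_alloc_possible(wq);
	if (!pinst) {
		destroy_workqueue(wq);
		return -ENOMEM;
	}

	padata_register_cpumask_notifier(pinst, &my_nb);

	err = padata_start(pinst);	/* -EINVAL if the instance is marked invalid */
	if (err)
		goto out_free;

	/* Example of narrowing the parallel workers to CPU 0 at runtime. */
	if (alloc_cpumask_var(&pmask, GFP_KERNEL)) {
		cpumask_clear(pmask);
		cpumask_set_cpu(0, pmask);
		padata_set_cpumask(pinst, PADATA_CPU_PARALLEL, pmask);
		free_cpumask_var(pmask);
	}
	return 0;

out_free:
	padata_unregister_cpumask_notifier(pinst, &my_nb);
	padata_free(pinst);
	destroy_workqueue(wq);
	return err;
}

static void __exit my_exit(void)
{
	padata_stop(pinst);
	padata_unregister_cpumask_notifier(pinst, &my_nb);
	padata_free(pinst);
	destroy_workqueue(wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

Note that after this change padata_free() only drops the sysfs kobject reference; the actual teardown happens in the kobject release callback shown in the hunk above.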
diff --git a/kernel/panic.c b/kernel/panic.c
index 3b16cd93fa7d..4c13b1a88ebb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,9 @@
24#include <linux/nmi.h> 24#include <linux/nmi.h>
25#include <linux/dmi.h> 25#include <linux/dmi.h>
26 26
27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18
29
27int panic_on_oops; 30int panic_on_oops;
28static unsigned long tainted_mask; 31static unsigned long tainted_mask;
29static int pause_on_oops; 32static int pause_on_oops;
@@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 39
37EXPORT_SYMBOL(panic_notifier_list); 40EXPORT_SYMBOL(panic_notifier_list);
38 41
39/* Returns how long it waited in ms */ 42static long no_blink(int state)
40long (*panic_blink)(long time);
41EXPORT_SYMBOL(panic_blink);
42
43static void panic_blink_one_second(void)
44{ 43{
45 static long i = 0, end; 44 return 0;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67} 45}
68 46
47/* Returns how long it waited in ms */
48long (*panic_blink)(int state);
49EXPORT_SYMBOL(panic_blink);
50
69/** 51/**
70 * panic - halt the system 52 * panic - halt the system
71 * @fmt: The text string to print 53 * @fmt: The text string to print
@@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...)
78{ 60{
79 static char buf[1024]; 61 static char buf[1024];
80 va_list args; 62 va_list args;
81 long i; 63 long i, i_next = 0;
64 int state = 0;
82 65
83 /* 66 /*
84 * It's possible to come here directly from a panic-assertion and 67 * It's possible to come here directly from a panic-assertion and
@@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...)
117 100
118 bust_spinlocks(0); 101 bust_spinlocks(0);
119 102
103 if (!panic_blink)
104 panic_blink = no_blink;
105
120 if (panic_timeout > 0) { 106 if (panic_timeout > 0) {
121 /* 107 /*
122 * Delay timeout seconds before rebooting the machine. 108 * Delay timeout seconds before rebooting the machine.
@@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...)
124 */ 110 */
125 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 111 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
126 112
127 for (i = 0; i < panic_timeout; i++) { 113 for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
128 touch_nmi_watchdog(); 114 touch_nmi_watchdog();
129 panic_blink_one_second(); 115 if (i >= i_next) {
116 i += panic_blink(state ^= 1);
117 i_next = i + 3600 / PANIC_BLINK_SPD;
118 }
119 mdelay(PANIC_TIMER_STEP);
130 } 120 }
131 /* 121 /*
132 * This will not be a clean reboot, with everything 122 * This will not be a clean reboot, with everything
@@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...)
152 } 142 }
153#endif 143#endif
154 local_irq_enable(); 144 local_irq_enable();
155 while (1) { 145 for (i = 0; ; i += PANIC_TIMER_STEP) {
156 touch_softlockup_watchdog(); 146 touch_softlockup_watchdog();
157 panic_blink_one_second(); 147 if (i >= i_next) {
148 i += panic_blink(state ^= 1);
149 i_next = i + 3600 / PANIC_BLINK_SPD;
150 }
151 mdelay(PANIC_TIMER_STEP);
158 } 152 }
159} 153}
160 154
@@ -344,7 +338,7 @@ static int init_oops_id(void)
344} 338}
345late_initcall(init_oops_id); 339late_initcall(init_oops_id);
346 340
347static void print_oops_end_marker(void) 341void print_oops_end_marker(void)
348{ 342{
349 init_oops_id(); 343 init_oops_id();
350 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 344 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
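
The panic_blink change above alters the hook's contract: it now receives an alternating on/off state and returns the time (in ms) it spent, which panic() credits against its PANIC_TIMER_STEP-paced timeout loop. A minimal sketch of a provider under the new prototype follows; my_board_set_led() is a hypothetical platform helper stubbed out here, not something this patch defines.

#include <linux/kernel.h>	/* declares: long (*panic_blink)(int state); */
#include <linux/init.h>

/* Hypothetical LED helper; a real board would poke a GPIO or LED register. */
static void my_board_set_led(int on)
{
}

/* New contract: 'state' toggles 0/1 roughly PANIC_BLINK_SPD times per
 * second, and the return value is how many ms the hook itself consumed. */
static long my_panic_blink(int state)
{
	my_board_set_led(state);
	return 0;	/* we burned no measurable time */
}

static int __init my_blink_init(void)
{
	panic_blink = my_panic_blink;
	return 0;
}
late_initcall(my_blink_init);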
diff --git a/kernel/params.c b/kernel/params.c
index 0b30ecd53a52..08107d181758 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,6 +31,42 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock);
36
37/* This just allows us to keep track of which parameters are kmalloced. */
38struct kmalloced_param {
39 struct list_head list;
40 char val[];
41};
42static LIST_HEAD(kmalloced_params);
43
44static void *kmalloc_parameter(unsigned int size)
45{
46 struct kmalloced_param *p;
47
48 p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
49 if (!p)
50 return NULL;
51
52 list_add(&p->list, &kmalloced_params);
53 return p->val;
54}
55
56/* Does nothing if parameter wasn't kmalloced above. */
57static void maybe_kfree_parameter(void *param)
58{
59 struct kmalloced_param *p;
60
61 list_for_each_entry(p, &kmalloced_params, list) {
62 if (p->val == param) {
63 list_del(&p->list);
64 kfree(p);
65 break;
66 }
67 }
68}
69
34static inline char dash2underscore(char c) 70static inline char dash2underscore(char c)
35{ 71{
36 if (c == '-') 72 if (c == '-')
@@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname)
49 85
50static int parse_one(char *param, 86static int parse_one(char *param,
51 char *val, 87 char *val,
52 struct kernel_param *params, 88 const struct kernel_param *params,
53 unsigned num_params, 89 unsigned num_params,
54 int (*handle_unknown)(char *param, char *val)) 90 int (*handle_unknown)(char *param, char *val))
55{ 91{
56 unsigned int i; 92 unsigned int i;
93 int err;
57 94
58 /* Find parameter */ 95 /* Find parameter */
59 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
60 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* Noone handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL;
61 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
62 params[i].set); 102 params[i].ops->set);
63 return params[i].set(val, &params[i]); 103 mutex_lock(&param_lock);
104 err = params[i].ops->set(val, &params[i]);
105 mutex_unlock(&param_lock);
106 return err;
64 } 107 }
65 } 108 }
66 109
@@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val)
128/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 171/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
129int parse_args(const char *name, 172int parse_args(const char *name,
130 char *args, 173 char *args,
131 struct kernel_param *params, 174 const struct kernel_param *params,
132 unsigned num, 175 unsigned num,
133 int (*unknown)(char *param, char *val)) 176 int (*unknown)(char *param, char *val))
134{ 177{
@@ -176,22 +219,29 @@ int parse_args(const char *name,
176 219
177/* Lazy bastard, eh? */ 220/* Lazy bastard, eh? */
178#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 221#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
179 int param_set_##name(const char *val, struct kernel_param *kp) \ 222 int param_set_##name(const char *val, const struct kernel_param *kp) \
180 { \ 223 { \
181 tmptype l; \ 224 tmptype l; \
182 int ret; \ 225 int ret; \
183 \ 226 \
184 if (!val) return -EINVAL; \
185 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
186 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret == -EINVAL || ((type)l != l)) \
187 return -EINVAL; \ 229 return -EINVAL; \
188 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
189 return 0; \ 231 return 0; \
190 } \ 232 } \
191 int param_get_##name(char *buffer, struct kernel_param *kp) \ 233 int param_get_##name(char *buffer, const struct kernel_param *kp) \
192 { \ 234 { \
193 return sprintf(buffer, format, *((type *)kp->arg)); \ 235 return sprintf(buffer, format, *((type *)kp->arg)); \
194 } 236 } \
237 struct kernel_param_ops param_ops_##name = { \
238 .set = param_set_##name, \
239 .get = param_get_##name, \
240 }; \
241 EXPORT_SYMBOL(param_set_##name); \
242 EXPORT_SYMBOL(param_get_##name); \
243 EXPORT_SYMBOL(param_ops_##name)
244
195 245
196STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); 246STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
197STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 247STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
@@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
201STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 251STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
202STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 252STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
203 253
204int param_set_charp(const char *val, struct kernel_param *kp) 254int param_set_charp(const char *val, const struct kernel_param *kp)
205{ 255{
206 if (!val) {
207 printk(KERN_ERR "%s: string parameter expected\n",
208 kp->name);
209 return -EINVAL;
210 }
211
212 if (strlen(val) > 1024) { 256 if (strlen(val) > 1024) {
213 printk(KERN_ERR "%s: string parameter too long\n", 257 printk(KERN_ERR "%s: string parameter too long\n",
214 kp->name); 258 kp->name);
215 return -ENOSPC; 259 return -ENOSPC;
216 } 260 }
217 261
218 /* This is a hack. We can't need to strdup in early boot, and we 262 maybe_kfree_parameter(*(char **)kp->arg);
263
264 /* This is a hack. We can't kmalloc in early boot, and we
219 * don't need to; this mangled commandline is preserved. */ 265 * don't need to; this mangled commandline is preserved. */
220 if (slab_is_available()) { 266 if (slab_is_available()) {
221 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 267 *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
222 if (!*(char **)kp->arg) 268 if (!*(char **)kp->arg)
223 return -ENOMEM; 269 return -ENOMEM;
270 strcpy(*(char **)kp->arg, val);
224 } else 271 } else
225 *(const char **)kp->arg = val; 272 *(const char **)kp->arg = val;
226 273
227 return 0; 274 return 0;
228} 275}
276EXPORT_SYMBOL(param_set_charp);
229 277
230int param_get_charp(char *buffer, struct kernel_param *kp) 278int param_get_charp(char *buffer, const struct kernel_param *kp)
231{ 279{
232 return sprintf(buffer, "%s", *((char **)kp->arg)); 280 return sprintf(buffer, "%s", *((char **)kp->arg));
233} 281}
282EXPORT_SYMBOL(param_get_charp);
283
284static void param_free_charp(void *arg)
285{
286 maybe_kfree_parameter(*((char **)arg));
287}
288
289struct kernel_param_ops param_ops_charp = {
290 .set = param_set_charp,
291 .get = param_get_charp,
292 .free = param_free_charp,
293};
294EXPORT_SYMBOL(param_ops_charp);
234 295
235/* Actually could be a bool or an int, for historical reasons. */ 296/* Actually could be a bool or an int, for historical reasons. */
236int param_set_bool(const char *val, struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
237{ 298{
238 bool v; 299 bool v;
239 300
@@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp)
258 *(int *)kp->arg = v; 319 *(int *)kp->arg = v;
259 return 0; 320 return 0;
260} 321}
322EXPORT_SYMBOL(param_set_bool);
261 323
262int param_get_bool(char *buffer, struct kernel_param *kp) 324int param_get_bool(char *buffer, const struct kernel_param *kp)
263{ 325{
264 bool val; 326 bool val;
265 if (kp->flags & KPARAM_ISBOOL) 327 if (kp->flags & KPARAM_ISBOOL)
@@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp)
270 /* Y and N chosen as being relatively non-coder friendly */ 332 /* Y and N chosen as being relatively non-coder friendly */
271 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 333 return sprintf(buffer, "%c", val ? 'Y' : 'N');
272} 334}
335EXPORT_SYMBOL(param_get_bool);
336
337struct kernel_param_ops param_ops_bool = {
338 .set = param_set_bool,
339 .get = param_get_bool,
340};
341EXPORT_SYMBOL(param_ops_bool);
273 342
274/* This one must be bool. */ 343/* This one must be bool. */
275int param_set_invbool(const char *val, struct kernel_param *kp) 344int param_set_invbool(const char *val, const struct kernel_param *kp)
276{ 345{
277 int ret; 346 int ret;
278 bool boolval; 347 bool boolval;
@@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp)
285 *(bool *)kp->arg = !boolval; 354 *(bool *)kp->arg = !boolval;
286 return ret; 355 return ret;
287} 356}
357EXPORT_SYMBOL(param_set_invbool);
288 358
289int param_get_invbool(char *buffer, struct kernel_param *kp) 359int param_get_invbool(char *buffer, const struct kernel_param *kp)
290{ 360{
291 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); 361 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
292} 362}
363EXPORT_SYMBOL(param_get_invbool);
364
365struct kernel_param_ops param_ops_invbool = {
366 .set = param_set_invbool,
367 .get = param_get_invbool,
368};
369EXPORT_SYMBOL(param_ops_invbool);
293 370
294/* We break the rule and mangle the string. */ 371/* We break the rule and mangle the string. */
295static int param_array(const char *name, 372static int param_array(const char *name,
296 const char *val, 373 const char *val,
297 unsigned int min, unsigned int max, 374 unsigned int min, unsigned int max,
298 void *elem, int elemsize, 375 void *elem, int elemsize,
299 int (*set)(const char *, struct kernel_param *kp), 376 int (*set)(const char *, const struct kernel_param *kp),
300 u16 flags, 377 u16 flags,
301 unsigned int *num) 378 unsigned int *num)
302{ 379{
@@ -309,12 +386,6 @@ static int param_array(const char *name,
309 kp.arg = elem; 386 kp.arg = elem;
310 kp.flags = flags; 387 kp.flags = flags;
311 388
312 /* No equals sign? */
313 if (!val) {
314 printk(KERN_ERR "%s: expects arguments\n", name);
315 return -EINVAL;
316 }
317
318 *num = 0; 389 *num = 0;
319 /* We expect a comma-separated list of values. */ 390 /* We expect a comma-separated list of values. */
320 do { 391 do {
@@ -330,6 +401,7 @@ static int param_array(const char *name,
330 /* nul-terminate and parse */ 401 /* nul-terminate and parse */
331 save = val[len]; 402 save = val[len];
332 ((char *)val)[len] = '\0'; 403 ((char *)val)[len] = '\0';
404 BUG_ON(!mutex_is_locked(&param_lock));
333 ret = set(val, &kp); 405 ret = set(val, &kp);
334 406
335 if (ret != 0) 407 if (ret != 0)
@@ -347,17 +419,17 @@ static int param_array(const char *name,
347 return 0; 419 return 0;
348} 420}
349 421
350int param_array_set(const char *val, struct kernel_param *kp) 422static int param_array_set(const char *val, const struct kernel_param *kp)
351{ 423{
352 const struct kparam_array *arr = kp->arr; 424 const struct kparam_array *arr = kp->arr;
353 unsigned int temp_num; 425 unsigned int temp_num;
354 426
355 return param_array(kp->name, val, 1, arr->max, arr->elem, 427 return param_array(kp->name, val, 1, arr->max, arr->elem,
356 arr->elemsize, arr->set, kp->flags, 428 arr->elemsize, arr->ops->set, kp->flags,
357 arr->num ?: &temp_num); 429 arr->num ?: &temp_num);
358} 430}
359 431
360int param_array_get(char *buffer, struct kernel_param *kp) 432static int param_array_get(char *buffer, const struct kernel_param *kp)
361{ 433{
362 int i, off, ret; 434 int i, off, ret;
363 const struct kparam_array *arr = kp->arr; 435 const struct kparam_array *arr = kp->arr;
@@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp)
368 if (i) 440 if (i)
369 buffer[off++] = ','; 441 buffer[off++] = ',';
370 p.arg = arr->elem + arr->elemsize * i; 442 p.arg = arr->elem + arr->elemsize * i;
371 ret = arr->get(buffer + off, &p); 443 BUG_ON(!mutex_is_locked(&param_lock));
444 ret = arr->ops->get(buffer + off, &p);
372 if (ret < 0) 445 if (ret < 0)
373 return ret; 446 return ret;
374 off += ret; 447 off += ret;
@@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp)
377 return off; 450 return off;
378} 451}
379 452
380int param_set_copystring(const char *val, struct kernel_param *kp) 453static void param_array_free(void *arg)
454{
455 unsigned int i;
456 const struct kparam_array *arr = arg;
457
458 if (arr->ops->free)
459 for (i = 0; i < (arr->num ? *arr->num : arr->max); i++)
460 arr->ops->free(arr->elem + arr->elemsize * i);
461}
462
463struct kernel_param_ops param_array_ops = {
464 .set = param_array_set,
465 .get = param_array_get,
466 .free = param_array_free,
467};
468EXPORT_SYMBOL(param_array_ops);
469
470int param_set_copystring(const char *val, const struct kernel_param *kp)
381{ 471{
382 const struct kparam_string *kps = kp->str; 472 const struct kparam_string *kps = kp->str;
383 473
384 if (!val) {
385 printk(KERN_ERR "%s: missing param set value\n", kp->name);
386 return -EINVAL;
387 }
388 if (strlen(val)+1 > kps->maxlen) { 474 if (strlen(val)+1 > kps->maxlen) {
389 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", 475 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
390 kp->name, kps->maxlen-1); 476 kp->name, kps->maxlen-1);
@@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
393 strcpy(kps->string, val); 479 strcpy(kps->string, val);
394 return 0; 480 return 0;
395} 481}
482EXPORT_SYMBOL(param_set_copystring);
396 483
397int param_get_string(char *buffer, struct kernel_param *kp) 484int param_get_string(char *buffer, const struct kernel_param *kp)
398{ 485{
399 const struct kparam_string *kps = kp->str; 486 const struct kparam_string *kps = kp->str;
400 return strlcpy(buffer, kps->string, kps->maxlen); 487 return strlcpy(buffer, kps->string, kps->maxlen);
401} 488}
489EXPORT_SYMBOL(param_get_string);
490
491struct kernel_param_ops param_ops_string = {
492 .set = param_set_copystring,
493 .get = param_get_string,
494};
495EXPORT_SYMBOL(param_ops_string);
402 496
403/* sysfs output in /sys/modules/XYZ/parameters/ */ 497/* sysfs output in /sys/modules/XYZ/parameters/ */
404#define to_module_attr(n) container_of(n, struct module_attribute, attr) 498#define to_module_attr(n) container_of(n, struct module_attribute, attr)
@@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[];
409struct param_attribute 503struct param_attribute
410{ 504{
411 struct module_attribute mattr; 505 struct module_attribute mattr;
412 struct kernel_param *param; 506 const struct kernel_param *param;
413}; 507};
414 508
415struct module_param_attrs 509struct module_param_attrs
@@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
428 int count; 522 int count;
429 struct param_attribute *attribute = to_param_attr(mattr); 523 struct param_attribute *attribute = to_param_attr(mattr);
430 524
431 if (!attribute->param->get) 525 if (!attribute->param->ops->get)
432 return -EPERM; 526 return -EPERM;
433 527
434 count = attribute->param->get(buf, attribute->param); 528 mutex_lock(&param_lock);
529 count = attribute->param->ops->get(buf, attribute->param);
530 mutex_unlock(&param_lock);
435 if (count > 0) { 531 if (count > 0) {
436 strcat(buf, "\n"); 532 strcat(buf, "\n");
437 ++count; 533 ++count;
@@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
447 int err; 543 int err;
448 struct param_attribute *attribute = to_param_attr(mattr); 544 struct param_attribute *attribute = to_param_attr(mattr);
449 545
450 if (!attribute->param->set) 546 if (!attribute->param->ops->set)
451 return -EPERM; 547 return -EPERM;
452 548
453 err = attribute->param->set(buf, attribute->param); 549 mutex_lock(&param_lock);
550 err = attribute->param->ops->set(buf, attribute->param);
551 mutex_unlock(&param_lock);
454 if (!err) 552 if (!err)
455 return len; 553 return len;
456 return err; 554 return err;
@@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
464#endif 562#endif
465 563
466#ifdef CONFIG_SYSFS 564#ifdef CONFIG_SYSFS
565void __kernel_param_lock(void)
566{
567 mutex_lock(&param_lock);
568}
569EXPORT_SYMBOL(__kernel_param_lock);
570
571void __kernel_param_unlock(void)
572{
573 mutex_unlock(&param_lock);
574}
575EXPORT_SYMBOL(__kernel_param_unlock);
576
467/* 577/*
468 * add_sysfs_param - add a parameter to sysfs 578 * add_sysfs_param - add a parameter to sysfs
469 * @mk: struct module_kobject 579 * @mk: struct module_kobject
@@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
475 * if there's an error. 585 * if there's an error.
476 */ 586 */
477static __modinit int add_sysfs_param(struct module_kobject *mk, 587static __modinit int add_sysfs_param(struct module_kobject *mk,
478 struct kernel_param *kp, 588 const struct kernel_param *kp,
479 const char *name) 589 const char *name)
480{ 590{
481 struct module_param_attrs *new; 591 struct module_param_attrs *new;
@@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk)
557 * /sys/module/[mod->name]/parameters/ 667 * /sys/module/[mod->name]/parameters/
558 */ 668 */
559int module_param_sysfs_setup(struct module *mod, 669int module_param_sysfs_setup(struct module *mod,
560 struct kernel_param *kparam, 670 const struct kernel_param *kparam,
561 unsigned int num_params) 671 unsigned int num_params)
562{ 672{
563 int i, err; 673 int i, err;
@@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod)
602 712
603void destroy_params(const struct kernel_param *params, unsigned num) 713void destroy_params(const struct kernel_param *params, unsigned num)
604{ 714{
605 /* FIXME: This should free kmalloced charp parameters. It doesn't. */ 715 unsigned int i;
716
717 for (i = 0; i < num; i++)
718 if (params[i].ops->free)
719 params[i].ops->free(params[i].arg);
606} 720}
607 721
608static void __init kernel_add_sysfs_param(const char *name, 722static void __init kernel_add_sysfs_param(const char *name,
@@ -768,28 +882,3 @@ static int __init param_sysfs_init(void)
768subsys_initcall(param_sysfs_init); 882subsys_initcall(param_sysfs_init);
769 883
770#endif /* CONFIG_SYSFS */ 884#endif /* CONFIG_SYSFS */
771
772EXPORT_SYMBOL(param_set_byte);
773EXPORT_SYMBOL(param_get_byte);
774EXPORT_SYMBOL(param_set_short);
775EXPORT_SYMBOL(param_get_short);
776EXPORT_SYMBOL(param_set_ushort);
777EXPORT_SYMBOL(param_get_ushort);
778EXPORT_SYMBOL(param_set_int);
779EXPORT_SYMBOL(param_get_int);
780EXPORT_SYMBOL(param_set_uint);
781EXPORT_SYMBOL(param_get_uint);
782EXPORT_SYMBOL(param_set_long);
783EXPORT_SYMBOL(param_get_long);
784EXPORT_SYMBOL(param_set_ulong);
785EXPORT_SYMBOL(param_get_ulong);
786EXPORT_SYMBOL(param_set_charp);
787EXPORT_SYMBOL(param_get_charp);
788EXPORT_SYMBOL(param_set_bool);
789EXPORT_SYMBOL(param_get_bool);
790EXPORT_SYMBOL(param_set_invbool);
791EXPORT_SYMBOL(param_get_invbool);
792EXPORT_SYMBOL(param_array_set);
793EXPORT_SYMBOL(param_array_get);
794EXPORT_SYMBOL(param_set_copystring);
795EXPORT_SYMBOL(param_get_string);
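
The params.c rework above moves every parameter type behind a struct kernel_param_ops (set/get/free) and serializes the handlers under param_lock, which is what finally lets destroy_params() free kmalloced charp values. Below is a hedged sketch of a custom ops implementation against that structure; it assumes a module_param_cb()-style registration macro accompanies this series, and all names are illustrative.

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static unsigned int my_threshold = 10;

/* Custom setter: accept the value only if it stays below 100.
 * Called with param_lock held after this change. */
static int my_set_threshold(const char *val, const struct kernel_param *kp)
{
	unsigned long v;
	int err = strict_strtoul(val, 0, &v);

	if (err || v >= 100)
		return -EINVAL;
	*(unsigned int *)kp->arg = v;
	return 0;
}

static int my_get_threshold(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%u", *(unsigned int *)kp->arg);
}

static struct kernel_param_ops my_threshold_ops = {
	.set = my_set_threshold,
	.get = my_get_threshold,
};

/* Assumed macro: module_param_cb(name, ops, arg, perm). */
module_param_cb(threshold, &my_threshold_ops, &my_threshold, 0644);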
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c558af4c..517d827f4982 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
35 34
36#include <asm/irq_regs.h> 35#include <asm/irq_regs.h>
37 36
38/* 37atomic_t perf_task_events __read_mostly;
39 * Each CPU has a list of per CPU events:
40 */
41static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
42
43int perf_max_events __read_mostly = 1;
44static int perf_reserved_percpu __read_mostly;
45static int perf_overcommit __read_mostly = 1;
46
47static atomic_t nr_events __read_mostly;
48static atomic_t nr_mmap_events __read_mostly; 38static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 39static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 40static atomic_t nr_task_events __read_mostly;
51 41
42static LIST_HEAD(pmus);
43static DEFINE_MUTEX(pmus_lock);
44static struct srcu_struct pmus_srcu;
45
52/* 46/*
53 * perf event paranoia level: 47 * perf event paranoia level:
54 * -1 - not paranoid at all 48 * -1 - not paranoid at all
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
67 61
68static atomic64_t perf_event_id; 62static atomic64_t perf_event_id;
69 63
70/* 64void __weak perf_event_print_debug(void) { }
71 * Lock for (sysadmin-configurable) event reservations:
72 */
73static DEFINE_SPINLOCK(perf_resource_lock);
74 65
75/* 66extern __weak const char *perf_pmu_name(void)
76 * Architecture provided APIs - weak aliases:
77 */
78extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
79{ 67{
80 return NULL; 68 return "pmu";
81} 69}
82 70
83void __weak hw_perf_disable(void) { barrier(); } 71void perf_pmu_disable(struct pmu *pmu)
84void __weak hw_perf_enable(void) { barrier(); } 72{
85 73 int *count = this_cpu_ptr(pmu->pmu_disable_count);
86void __weak perf_event_print_debug(void) { } 74 if (!(*count)++)
87 75 pmu->pmu_disable(pmu);
88static DEFINE_PER_CPU(int, perf_disable_count); 76}
89 77
90void perf_disable(void) 78void perf_pmu_enable(struct pmu *pmu)
91{ 79{
92 if (!__get_cpu_var(perf_disable_count)++) 80 int *count = this_cpu_ptr(pmu->pmu_disable_count);
93 hw_perf_disable(); 81 if (!--(*count))
82 pmu->pmu_enable(pmu);
94} 83}
95 84
96void perf_enable(void) 85static DEFINE_PER_CPU(struct list_head, rotation_list);
86
87/*
88 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
89 * because they're strictly cpu affine and rotate_start is called with IRQs
90 * disabled, while rotate_context is called from IRQ context.
91 */
92static void perf_pmu_rotate_start(struct pmu *pmu)
97{ 93{
98 if (!--__get_cpu_var(perf_disable_count)) 94 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
99 hw_perf_enable(); 95 struct list_head *head = &__get_cpu_var(rotation_list);
96
97 WARN_ON(!irqs_disabled());
98
99 if (list_empty(&cpuctx->rotation_list))
100 list_add(&cpuctx->rotation_list, head);
100} 101}
101 102
102static void get_ctx(struct perf_event_context *ctx) 103static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
151 * the context could get moved to another task. 152 * the context could get moved to another task.
152 */ 153 */
153static struct perf_event_context * 154static struct perf_event_context *
154perf_lock_task_context(struct task_struct *task, unsigned long *flags) 155perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
155{ 156{
156 struct perf_event_context *ctx; 157 struct perf_event_context *ctx;
157 158
158 rcu_read_lock(); 159 rcu_read_lock();
159 retry: 160retry:
160 ctx = rcu_dereference(task->perf_event_ctxp); 161 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
161 if (ctx) { 162 if (ctx) {
162 /* 163 /*
163 * If this context is a clone of another, it might 164 * If this context is a clone of another, it might
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
170 * can't get swapped on us any more. 171 * can't get swapped on us any more.
171 */ 172 */
172 raw_spin_lock_irqsave(&ctx->lock, *flags); 173 raw_spin_lock_irqsave(&ctx->lock, *flags);
173 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 174 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
174 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 175 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
175 goto retry; 176 goto retry;
176 } 177 }
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 * can't get swapped to another task. This also increments its 190 * can't get swapped to another task. This also increments its
190 * reference count so that the context can't get freed. 191 * reference count so that the context can't get freed.
191 */ 192 */
192static struct perf_event_context *perf_pin_task_context(struct task_struct *task) 193static struct perf_event_context *
194perf_pin_task_context(struct task_struct *task, int ctxn)
193{ 195{
194 struct perf_event_context *ctx; 196 struct perf_event_context *ctx;
195 unsigned long flags; 197 unsigned long flags;
196 198
197 ctx = perf_lock_task_context(task, &flags); 199 ctx = perf_lock_task_context(task, ctxn, &flags);
198 if (ctx) { 200 if (ctx) {
199 ++ctx->pin_count; 201 ++ctx->pin_count;
200 raw_spin_unlock_irqrestore(&ctx->lock, flags); 202 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -214,7 +216,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
214 216
215static inline u64 perf_clock(void) 217static inline u64 perf_clock(void)
216{ 218{
217 return cpu_clock(raw_smp_processor_id()); 219 return local_clock();
218} 220}
219 221
220/* 222/*
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
302 } 304 }
303 305
304 list_add_rcu(&event->event_entry, &ctx->event_list); 306 list_add_rcu(&event->event_entry, &ctx->event_list);
307 if (!ctx->nr_events)
308 perf_pmu_rotate_start(ctx->pmu);
305 ctx->nr_events++; 309 ctx->nr_events++;
306 if (event->attr.inherit_stat) 310 if (event->attr.inherit_stat)
307 ctx->nr_stat++; 311 ctx->nr_stat++;
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
311{ 315{
312 struct perf_event *group_leader = event->group_leader; 316 struct perf_event *group_leader = event->group_leader;
313 317
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); 318 /*
319 * We can have double attach due to group movement in perf_event_open.
320 */
321 if (event->attach_state & PERF_ATTACH_GROUP)
322 return;
323
315 event->attach_state |= PERF_ATTACH_GROUP; 324 event->attach_state |= PERF_ATTACH_GROUP;
316 325
317 if (group_leader == event) 326 if (group_leader == event)
@@ -402,11 +411,31 @@ static void perf_group_detach(struct perf_event *event)
402 } 411 }
403} 412}
404 413
414static inline int
415event_filter_match(struct perf_event *event)
416{
417 return event->cpu == -1 || event->cpu == smp_processor_id();
418}
419
405static void 420static void
406event_sched_out(struct perf_event *event, 421event_sched_out(struct perf_event *event,
407 struct perf_cpu_context *cpuctx, 422 struct perf_cpu_context *cpuctx,
408 struct perf_event_context *ctx) 423 struct perf_event_context *ctx)
409{ 424{
425 u64 delta;
426 /*
427 * An event which could not be activated because of
428 * filter mismatch still needs to have its timings
429 * maintained, otherwise bogus information is returned
430 * via read() for time_enabled, time_running:
431 */
432 if (event->state == PERF_EVENT_STATE_INACTIVE
433 && !event_filter_match(event)) {
434 delta = ctx->time - event->tstamp_stopped;
435 event->tstamp_running += delta;
436 event->tstamp_stopped = ctx->time;
437 }
438
410 if (event->state != PERF_EVENT_STATE_ACTIVE) 439 if (event->state != PERF_EVENT_STATE_ACTIVE)
411 return; 440 return;
412 441
@@ -416,7 +445,7 @@ event_sched_out(struct perf_event *event,
416 event->state = PERF_EVENT_STATE_OFF; 445 event->state = PERF_EVENT_STATE_OFF;
417 } 446 }
418 event->tstamp_stopped = ctx->time; 447 event->tstamp_stopped = ctx->time;
419 event->pmu->disable(event); 448 event->pmu->del(event, 0);
420 event->oncpu = -1; 449 event->oncpu = -1;
421 450
422 if (!is_software_event(event)) 451 if (!is_software_event(event))
@@ -432,9 +461,7 @@ group_sched_out(struct perf_event *group_event,
432 struct perf_event_context *ctx) 461 struct perf_event_context *ctx)
433{ 462{
434 struct perf_event *event; 463 struct perf_event *event;
435 464 int state = group_event->state;
436 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
437 return;
438 465
439 event_sched_out(group_event, cpuctx, ctx); 466 event_sched_out(group_event, cpuctx, ctx);
440 467
@@ -444,10 +471,16 @@ group_sched_out(struct perf_event *group_event,
444 list_for_each_entry(event, &group_event->sibling_list, group_entry) 471 list_for_each_entry(event, &group_event->sibling_list, group_entry)
445 event_sched_out(event, cpuctx, ctx); 472 event_sched_out(event, cpuctx, ctx);
446 473
447 if (group_event->attr.exclusive) 474 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
448 cpuctx->exclusive = 0; 475 cpuctx->exclusive = 0;
449} 476}
450 477
478static inline struct perf_cpu_context *
479__get_cpu_context(struct perf_event_context *ctx)
480{
481 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
482}
483
451/* 484/*
452 * Cross CPU call to remove a performance event 485 * Cross CPU call to remove a performance event
453 * 486 *
@@ -456,9 +489,9 @@ group_sched_out(struct perf_event *group_event,
456 */ 489 */
457static void __perf_event_remove_from_context(void *info) 490static void __perf_event_remove_from_context(void *info)
458{ 491{
459 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
460 struct perf_event *event = info; 492 struct perf_event *event = info;
461 struct perf_event_context *ctx = event->ctx; 493 struct perf_event_context *ctx = event->ctx;
494 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
462 495
463 /* 496 /*
464 * If this is a task context, we need to check whether it is 497 * If this is a task context, we need to check whether it is
@@ -469,27 +502,11 @@ static void __perf_event_remove_from_context(void *info)
469 return; 502 return;
470 503
471 raw_spin_lock(&ctx->lock); 504 raw_spin_lock(&ctx->lock);
472 /*
473 * Protect the list operation against NMI by disabling the
474 * events on a global level.
475 */
476 perf_disable();
477 505
478 event_sched_out(event, cpuctx, ctx); 506 event_sched_out(event, cpuctx, ctx);
479 507
480 list_del_event(event, ctx); 508 list_del_event(event, ctx);
481 509
482 if (!ctx->task) {
483 /*
484 * Allow more per task events with respect to the
485 * reservation:
486 */
487 cpuctx->max_pertask =
488 min(perf_max_events - ctx->nr_events,
489 perf_max_events - perf_reserved_percpu);
490 }
491
492 perf_enable();
493 raw_spin_unlock(&ctx->lock); 510 raw_spin_unlock(&ctx->lock);
494} 511}
495 512
@@ -554,8 +571,8 @@ retry:
554static void __perf_event_disable(void *info) 571static void __perf_event_disable(void *info)
555{ 572{
556 struct perf_event *event = info; 573 struct perf_event *event = info;
557 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
558 struct perf_event_context *ctx = event->ctx; 574 struct perf_event_context *ctx = event->ctx;
575 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
559 576
560 /* 577 /*
561 * If this is a per-task event, need to check whether this 578 * If this is a per-task event, need to check whether this
@@ -610,7 +627,7 @@ void perf_event_disable(struct perf_event *event)
610 return; 627 return;
611 } 628 }
612 629
613 retry: 630retry:
614 task_oncpu_function_call(task, __perf_event_disable, event); 631 task_oncpu_function_call(task, __perf_event_disable, event);
615 632
616 raw_spin_lock_irq(&ctx->lock); 633 raw_spin_lock_irq(&ctx->lock);
@@ -649,7 +666,7 @@ event_sched_in(struct perf_event *event,
649 */ 666 */
650 smp_wmb(); 667 smp_wmb();
651 668
652 if (event->pmu->enable(event)) { 669 if (event->pmu->add(event, PERF_EF_START)) {
653 event->state = PERF_EVENT_STATE_INACTIVE; 670 event->state = PERF_EVENT_STATE_INACTIVE;
654 event->oncpu = -1; 671 event->oncpu = -1;
655 return -EAGAIN; 672 return -EAGAIN;
@@ -673,23 +690,17 @@ group_sched_in(struct perf_event *group_event,
673 struct perf_event_context *ctx) 690 struct perf_event_context *ctx)
674{ 691{
675 struct perf_event *event, *partial_group = NULL; 692 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu; 693 struct pmu *pmu = group_event->pmu;
677 bool txn = false; 694 u64 now = ctx->time;
678 int ret; 695 bool simulate = false;
679 696
680 if (group_event->state == PERF_EVENT_STATE_OFF) 697 if (group_event->state == PERF_EVENT_STATE_OFF)
681 return 0; 698 return 0;
682 699
683 /* Check if group transaction availabe */ 700 pmu->start_txn(pmu);
684 if (pmu->start_txn)
685 txn = true;
686
687 if (txn)
688 pmu->start_txn(pmu);
689 701
690 if (event_sched_in(group_event, cpuctx, ctx)) { 702 if (event_sched_in(group_event, cpuctx, ctx)) {
691 if (txn) 703 pmu->cancel_txn(pmu);
692 pmu->cancel_txn(pmu);
693 return -EAGAIN; 704 return -EAGAIN;
694 } 705 }
695 706
@@ -703,29 +714,38 @@ group_sched_in(struct perf_event *group_event,
703 } 714 }
704 } 715 }
705 716
706 if (!txn) 717 if (!pmu->commit_txn(pmu))
707 return 0; 718 return 0;
708 719
709 ret = pmu->commit_txn(pmu);
710 if (!ret) {
711 pmu->cancel_txn(pmu);
712 return 0;
713 }
714
715group_error: 720group_error:
716 /* 721 /*
717 * Groups can be scheduled in as one unit only, so undo any 722 * Groups can be scheduled in as one unit only, so undo any
718 * partial group before returning: 723 * partial group before returning:
724 * The events up to the failed event are scheduled out normally,
725 * tstamp_stopped will be updated.
726 *
727 * The failed events and the remaining siblings need to have
728 * their timings updated as if they had gone thru event_sched_in()
729 * and event_sched_out(). This is required to get consistent timings
730 * across the group. This also takes care of the case where the group
731 * could never be scheduled by ensuring tstamp_stopped is set to mark
732 * the time the event was actually stopped, such that time delta
733 * calculation in update_event_times() is correct.
719 */ 734 */
720 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 735 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
721 if (event == partial_group) 736 if (event == partial_group)
722 break; 737 simulate = true;
723 event_sched_out(event, cpuctx, ctx); 738
739 if (simulate) {
740 event->tstamp_running += now - event->tstamp_stopped;
741 event->tstamp_stopped = now;
742 } else {
743 event_sched_out(event, cpuctx, ctx);
744 }
724 } 745 }
725 event_sched_out(group_event, cpuctx, ctx); 746 event_sched_out(group_event, cpuctx, ctx);
726 747
727 if (txn) 748 pmu->cancel_txn(pmu);
728 pmu->cancel_txn(pmu);
729 749
730 return -EAGAIN; 750 return -EAGAIN;
731} 751}
@@ -778,10 +798,10 @@ static void add_event_to_ctx(struct perf_event *event,
778 */ 798 */
779static void __perf_install_in_context(void *info) 799static void __perf_install_in_context(void *info)
780{ 800{
781 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
782 struct perf_event *event = info; 801 struct perf_event *event = info;
783 struct perf_event_context *ctx = event->ctx; 802 struct perf_event_context *ctx = event->ctx;
784 struct perf_event *leader = event->group_leader; 803 struct perf_event *leader = event->group_leader;
804 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
785 int err; 805 int err;
786 806
787 /* 807 /*
@@ -801,12 +821,6 @@ static void __perf_install_in_context(void *info)
801 ctx->is_active = 1; 821 ctx->is_active = 1;
802 update_context_time(ctx); 822 update_context_time(ctx);
803 823
804 /*
805 * Protect the list operation against NMI by disabling the
806 * events on a global level. NOP for non NMI based events.
807 */
808 perf_disable();
809
810 add_event_to_ctx(event, ctx); 824 add_event_to_ctx(event, ctx);
811 825
812 if (event->cpu != -1 && event->cpu != smp_processor_id()) 826 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -844,12 +858,7 @@ static void __perf_install_in_context(void *info)
844 } 858 }
845 } 859 }
846 860
847 if (!err && !ctx->task && cpuctx->max_pertask) 861unlock:
848 cpuctx->max_pertask--;
849
850 unlock:
851 perf_enable();
852
853 raw_spin_unlock(&ctx->lock); 862 raw_spin_unlock(&ctx->lock);
854} 863}
855 864
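Throughout this patch, &__get_cpu_var(perf_cpu_context) becomes __get_cpu_context(ctx): with one struct perf_cpu_context per pmu instead of a single global per-cpu instance, the cpu context has to be reached through the context's pmu. The helper is not part of this hunk; it presumably amounts to:

static struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
        /* each pmu carries its own per-cpu contexts; pick this cpu's */
        return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

Note also that perf_install_in_context() now sets event->ctx = ctx itself, which matches the "if (event->ctx)" guard added to free_event() further down: the context is attached at install time rather than at allocation time.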
@@ -872,6 +881,8 @@ perf_install_in_context(struct perf_event_context *ctx,
872{ 881{
873 struct task_struct *task = ctx->task; 882 struct task_struct *task = ctx->task;
874 883
884 event->ctx = ctx;
885
875 if (!task) { 886 if (!task) {
876 /* 887 /*
877 * Per cpu events are installed via an smp call and 888 * Per cpu events are installed via an smp call and
@@ -920,10 +931,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
920 931
921 event->state = PERF_EVENT_STATE_INACTIVE; 932 event->state = PERF_EVENT_STATE_INACTIVE;
922 event->tstamp_enabled = ctx->time - event->total_time_enabled; 933 event->tstamp_enabled = ctx->time - event->total_time_enabled;
923 list_for_each_entry(sub, &event->sibling_list, group_entry) 934 list_for_each_entry(sub, &event->sibling_list, group_entry) {
924 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
925 sub->tstamp_enabled = 936 sub->tstamp_enabled =
926 ctx->time - sub->total_time_enabled; 937 ctx->time - sub->total_time_enabled;
938 }
939 }
927} 940}
928 941
929/* 942/*
@@ -932,9 +945,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
932static void __perf_event_enable(void *info) 945static void __perf_event_enable(void *info)
933{ 946{
934 struct perf_event *event = info; 947 struct perf_event *event = info;
935 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
936 struct perf_event_context *ctx = event->ctx; 948 struct perf_event_context *ctx = event->ctx;
937 struct perf_event *leader = event->group_leader; 949 struct perf_event *leader = event->group_leader;
950 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
938 int err; 951 int err;
939 952
940 /* 953 /*
@@ -968,12 +981,10 @@ static void __perf_event_enable(void *info)
968 if (!group_can_go_on(event, cpuctx, 1)) { 981 if (!group_can_go_on(event, cpuctx, 1)) {
969 err = -EEXIST; 982 err = -EEXIST;
970 } else { 983 } else {
971 perf_disable();
972 if (event == leader) 984 if (event == leader)
973 err = group_sched_in(event, cpuctx, ctx); 985 err = group_sched_in(event, cpuctx, ctx);
974 else 986 else
975 err = event_sched_in(event, cpuctx, ctx); 987 err = event_sched_in(event, cpuctx, ctx);
976 perf_enable();
977 } 988 }
978 989
979 if (err) { 990 if (err) {
@@ -989,7 +1000,7 @@ static void __perf_event_enable(void *info)
989 } 1000 }
990 } 1001 }
991 1002
992 unlock: 1003unlock:
993 raw_spin_unlock(&ctx->lock); 1004 raw_spin_unlock(&ctx->lock);
994} 1005}
995 1006
@@ -1030,7 +1041,7 @@ void perf_event_enable(struct perf_event *event)
1030 if (event->state == PERF_EVENT_STATE_ERROR) 1041 if (event->state == PERF_EVENT_STATE_ERROR)
1031 event->state = PERF_EVENT_STATE_OFF; 1042 event->state = PERF_EVENT_STATE_OFF;
1032 1043
1033 retry: 1044retry:
1034 raw_spin_unlock_irq(&ctx->lock); 1045 raw_spin_unlock_irq(&ctx->lock);
1035 task_oncpu_function_call(task, __perf_event_enable, event); 1046 task_oncpu_function_call(task, __perf_event_enable, event);
1036 1047
@@ -1050,7 +1061,7 @@ void perf_event_enable(struct perf_event *event)
1050 if (event->state == PERF_EVENT_STATE_OFF) 1061 if (event->state == PERF_EVENT_STATE_OFF)
1051 __perf_event_mark_enabled(event, ctx); 1062 __perf_event_mark_enabled(event, ctx);
1052 1063
1053 out: 1064out:
1054 raw_spin_unlock_irq(&ctx->lock); 1065 raw_spin_unlock_irq(&ctx->lock);
1055} 1066}
1056 1067
@@ -1081,26 +1092,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1081 struct perf_event *event; 1092 struct perf_event *event;
1082 1093
1083 raw_spin_lock(&ctx->lock); 1094 raw_spin_lock(&ctx->lock);
1095 perf_pmu_disable(ctx->pmu);
1084 ctx->is_active = 0; 1096 ctx->is_active = 0;
1085 if (likely(!ctx->nr_events)) 1097 if (likely(!ctx->nr_events))
1086 goto out; 1098 goto out;
1087 update_context_time(ctx); 1099 update_context_time(ctx);
1088 1100
1089 perf_disable();
1090 if (!ctx->nr_active) 1101 if (!ctx->nr_active)
1091 goto out_enable; 1102 goto out;
1092 1103
1093 if (event_type & EVENT_PINNED) 1104 if (event_type & EVENT_PINNED) {
1094 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1105 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1095 group_sched_out(event, cpuctx, ctx); 1106 group_sched_out(event, cpuctx, ctx);
1107 }
1096 1108
1097 if (event_type & EVENT_FLEXIBLE) 1109 if (event_type & EVENT_FLEXIBLE) {
1098 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1110 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1099 group_sched_out(event, cpuctx, ctx); 1111 group_sched_out(event, cpuctx, ctx);
1100 1112 }
1101 out_enable: 1113out:
1102 perf_enable(); 1114 perf_pmu_enable(ctx->pmu);
1103 out:
1104 raw_spin_unlock(&ctx->lock); 1115 raw_spin_unlock(&ctx->lock);
1105} 1116}
1106 1117
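perf_disable()/perf_enable() used to freeze every pmu in the system; ctx_sched_out() now only quiesces the pmu that owns this context. The perf_pmu_disable()/perf_pmu_enable() pair is introduced elsewhere in the series; a sketch consistent with its use here is a per-cpu nesting count wrapped around the pmu's own hooks (pmu_disable_count is assumed to be a per-cpu int allocated when the pmu registers):

void perf_pmu_disable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);

        if (!(*count)++)                /* only the outermost caller touches the hardware */
                pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);

        if (!--(*count))
                pmu->pmu_enable(pmu);
}

Because the disable/enable pair now brackets the whole function, the separate out_enable: label collapses into the common out: exit path.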
@@ -1155,9 +1166,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
1155 * In order to keep per-task stats reliable we need to flip the event 1166 * In order to keep per-task stats reliable we need to flip the event
1156 * values when we flip the contexts. 1167 * values when we flip the contexts.
1157 */ 1168 */
1158 value = atomic64_read(&next_event->count); 1169 value = local64_read(&next_event->count);
1159 value = atomic64_xchg(&event->count, value); 1170 value = local64_xchg(&event->count, value);
1160 atomic64_set(&next_event->count, value); 1171 local64_set(&next_event->count, value);
1161 1172
1162 swap(event->total_time_enabled, next_event->total_time_enabled); 1173 swap(event->total_time_enabled, next_event->total_time_enabled);
1163 swap(event->total_time_running, next_event->total_time_running); 1174 swap(event->total_time_running, next_event->total_time_running);
@@ -1198,34 +1209,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1198 } 1209 }
1199} 1210}
1200 1211
1201/* 1212void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1202 * Called from scheduler to remove the events of the current task, 1213 struct task_struct *next)
1203 * with interrupts disabled.
1204 *
1205 * We stop each event and update the event value in event->count.
1206 *
1207 * This does not protect us against NMI, but disable()
1208 * sets the disabled bit in the control field of event _before_
1209 * accessing the event control register. If a NMI hits, then it will
1210 * not restart the event.
1211 */
1212void perf_event_task_sched_out(struct task_struct *task,
1213 struct task_struct *next)
1214{ 1214{
1215 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1215 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1216 struct perf_event_context *ctx = task->perf_event_ctxp;
1217 struct perf_event_context *next_ctx; 1216 struct perf_event_context *next_ctx;
1218 struct perf_event_context *parent; 1217 struct perf_event_context *parent;
1218 struct perf_cpu_context *cpuctx;
1219 int do_switch = 1; 1219 int do_switch = 1;
1220 1220
1221 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1221 if (likely(!ctx))
1222 return;
1222 1223
1223 if (likely(!ctx || !cpuctx->task_ctx)) 1224 cpuctx = __get_cpu_context(ctx);
1225 if (!cpuctx->task_ctx)
1224 return; 1226 return;
1225 1227
1226 rcu_read_lock(); 1228 rcu_read_lock();
1227 parent = rcu_dereference(ctx->parent_ctx); 1229 parent = rcu_dereference(ctx->parent_ctx);
1228 next_ctx = next->perf_event_ctxp; 1230 next_ctx = next->perf_event_ctxp[ctxn];
1229 if (parent && next_ctx && 1231 if (parent && next_ctx &&
1230 rcu_dereference(next_ctx->parent_ctx) == parent) { 1232 rcu_dereference(next_ctx->parent_ctx) == parent) {
1231 /* 1233 /*
@@ -1244,8 +1246,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1244 * XXX do we need a memory barrier of sorts 1246 * XXX do we need a memory barrier of sorts
1245 * wrt to rcu_dereference() of perf_event_ctxp 1247 * wrt to rcu_dereference() of perf_event_ctxp
1246 */ 1248 */
1247 task->perf_event_ctxp = next_ctx; 1249 task->perf_event_ctxp[ctxn] = next_ctx;
1248 next->perf_event_ctxp = ctx; 1250 next->perf_event_ctxp[ctxn] = ctx;
1249 ctx->task = next; 1251 ctx->task = next;
1250 next_ctx->task = task; 1252 next_ctx->task = task;
1251 do_switch = 0; 1253 do_switch = 0;
@@ -1263,10 +1265,35 @@ void perf_event_task_sched_out(struct task_struct *task,
1263 } 1265 }
1264} 1266}
1265 1267
1268#define for_each_task_context_nr(ctxn) \
1269 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1270
1271/*
1272 * Called from scheduler to remove the events of the current task,
1273 * with interrupts disabled.
1274 *
1275 * We stop each event and update the event value in event->count.
1276 *
1277 * This does not protect us against NMI, but disable()
1278 * sets the disabled bit in the control field of event _before_
1279 * accessing the event control register. If a NMI hits, then it will
1280 * not restart the event.
1281 */
1282void __perf_event_task_sched_out(struct task_struct *task,
1283 struct task_struct *next)
1284{
1285 int ctxn;
1286
1287 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1288
1289 for_each_task_context_nr(ctxn)
1290 perf_event_context_sched_out(task, ctxn, next);
1291}
1292
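The scheduler hook is split in two: perf_event_context_sched_out() handles one context, and __perf_event_task_sched_out() walks every context number a task can have. This relies on task_struct carrying an array of context pointers indexed by context class rather than the old single perf_event_ctxp pointer; the companion header change is assumed to look roughly like:

/* Which task context a pmu's events live in; sketch of the header side. */
enum perf_event_task_context {
        perf_invalid_context = -1,      /* pmu has no per-task context */
        perf_hw_context = 0,            /* hardware pmus share one task context */
        perf_sw_context,                /* software/tracepoint events share another */
        perf_nr_task_contexts,
};

/* in struct task_struct, replacing the old single pointer: */
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];

The PERF_COUNT_SW_CONTEXT_SWITCHES software event stays in the outer function, so it still fires exactly once per context switch.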
1266static void task_ctx_sched_out(struct perf_event_context *ctx, 1293static void task_ctx_sched_out(struct perf_event_context *ctx,
1267 enum event_type_t event_type) 1294 enum event_type_t event_type)
1268{ 1295{
1269 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1296 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1270 1297
1271 if (!cpuctx->task_ctx) 1298 if (!cpuctx->task_ctx)
1272 return; 1299 return;
@@ -1281,14 +1308,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1281/* 1308/*
1282 * Called with IRQs disabled 1309 * Called with IRQs disabled
1283 */ 1310 */
1284static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1285{
1286 task_ctx_sched_out(ctx, EVENT_ALL);
1287}
1288
1289/*
1290 * Called with IRQs disabled
1291 */
1292static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 1311static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1293 enum event_type_t event_type) 1312 enum event_type_t event_type)
1294{ 1313{
@@ -1339,9 +1358,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1339 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1358 if (event->cpu != -1 && event->cpu != smp_processor_id())
1340 continue; 1359 continue;
1341 1360
1342 if (group_can_go_on(event, cpuctx, can_add_hw)) 1361 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1343 if (group_sched_in(event, cpuctx, ctx)) 1362 if (group_sched_in(event, cpuctx, ctx))
1344 can_add_hw = 0; 1363 can_add_hw = 0;
1364 }
1345 } 1365 }
1346} 1366}
1347 1367
@@ -1357,8 +1377,6 @@ ctx_sched_in(struct perf_event_context *ctx,
1357 1377
1358 ctx->timestamp = perf_clock(); 1378 ctx->timestamp = perf_clock();
1359 1379
1360 perf_disable();
1361
1362 /* 1380 /*
1363 * First go through the list and put on any pinned groups 1381 * First go through the list and put on any pinned groups
1364 * in order to give them the best chance of going on. 1382 * in order to give them the best chance of going on.
@@ -1370,8 +1388,7 @@ ctx_sched_in(struct perf_event_context *ctx,
1370 if (event_type & EVENT_FLEXIBLE) 1388 if (event_type & EVENT_FLEXIBLE)
1371 ctx_flexible_sched_in(ctx, cpuctx); 1389 ctx_flexible_sched_in(ctx, cpuctx);
1372 1390
1373 perf_enable(); 1391out:
1374 out:
1375 raw_spin_unlock(&ctx->lock); 1392 raw_spin_unlock(&ctx->lock);
1376} 1393}
1377 1394
@@ -1383,43 +1400,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1383 ctx_sched_in(ctx, cpuctx, event_type); 1400 ctx_sched_in(ctx, cpuctx, event_type);
1384} 1401}
1385 1402
1386static void task_ctx_sched_in(struct task_struct *task, 1403static void task_ctx_sched_in(struct perf_event_context *ctx,
1387 enum event_type_t event_type) 1404 enum event_type_t event_type)
1388{ 1405{
1389 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1406 struct perf_cpu_context *cpuctx;
1390 struct perf_event_context *ctx = task->perf_event_ctxp;
1391 1407
1392 if (likely(!ctx)) 1408 cpuctx = __get_cpu_context(ctx);
1393 return;
1394 if (cpuctx->task_ctx == ctx) 1409 if (cpuctx->task_ctx == ctx)
1395 return; 1410 return;
1411
1396 ctx_sched_in(ctx, cpuctx, event_type); 1412 ctx_sched_in(ctx, cpuctx, event_type);
1397 cpuctx->task_ctx = ctx; 1413 cpuctx->task_ctx = ctx;
1398} 1414}
1399/*
1400 * Called from scheduler to add the events of the current task
1401 * with interrupts disabled.
1402 *
1403 * We restore the event value and then enable it.
1404 *
1405 * This does not protect us against NMI, but enable()
1406 * sets the enabled bit in the control field of event _before_
1407 * accessing the event control register. If a NMI hits, then it will
1408 * keep the event running.
1409 */
1410void perf_event_task_sched_in(struct task_struct *task)
1411{
1412 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1413 struct perf_event_context *ctx = task->perf_event_ctxp;
1414 1415
1415 if (likely(!ctx)) 1416void perf_event_context_sched_in(struct perf_event_context *ctx)
1416 return; 1417{
1418 struct perf_cpu_context *cpuctx;
1417 1419
1420 cpuctx = __get_cpu_context(ctx);
1418 if (cpuctx->task_ctx == ctx) 1421 if (cpuctx->task_ctx == ctx)
1419 return; 1422 return;
1420 1423
1421 perf_disable(); 1424 perf_pmu_disable(ctx->pmu);
1422
1423 /* 1425 /*
1424 * We want to keep the following priority order: 1426 * We want to keep the following priority order:
1425 * cpu pinned (that don't need to move), task pinned, 1427 * cpu pinned (that don't need to move), task pinned,
@@ -1433,7 +1435,37 @@ void perf_event_task_sched_in(struct task_struct *task)
1433 1435
1434 cpuctx->task_ctx = ctx; 1436 cpuctx->task_ctx = ctx;
1435 1437
1436 perf_enable(); 1438 /*
1439 * Since these rotations are per-cpu, we need to ensure the
1440 * cpu-context we got scheduled on is actually rotating.
1441 */
1442 perf_pmu_rotate_start(ctx->pmu);
1443 perf_pmu_enable(ctx->pmu);
1444}
1445
1446/*
1447 * Called from scheduler to add the events of the current task
1448 * with interrupts disabled.
1449 *
1450 * We restore the event value and then enable it.
1451 *
1452 * This does not protect us against NMI, but enable()
1453 * sets the enabled bit in the control field of event _before_
1454 * accessing the event control register. If a NMI hits, then it will
1455 * keep the event running.
1456 */
1457void __perf_event_task_sched_in(struct task_struct *task)
1458{
1459 struct perf_event_context *ctx;
1460 int ctxn;
1461
1462 for_each_task_context_nr(ctxn) {
1463 ctx = task->perf_event_ctxp[ctxn];
1464 if (likely(!ctx))
1465 continue;
1466
1467 perf_event_context_sched_in(ctx);
1468 }
1437} 1469}
1438 1470
1439#define MAX_INTERRUPTS (~0ULL) 1471#define MAX_INTERRUPTS (~0ULL)
@@ -1513,22 +1545,6 @@ do { \
1513 return div64_u64(dividend, divisor); 1545 return div64_u64(dividend, divisor);
1514} 1546}
1515 1547
1516static void perf_event_stop(struct perf_event *event)
1517{
1518 if (!event->pmu->stop)
1519 return event->pmu->disable(event);
1520
1521 return event->pmu->stop(event);
1522}
1523
1524static int perf_event_start(struct perf_event *event)
1525{
1526 if (!event->pmu->start)
1527 return event->pmu->enable(event);
1528
1529 return event->pmu->start(event);
1530}
1531
1532static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1548static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1533{ 1549{
1534 struct hw_perf_event *hwc = &event->hw; 1550 struct hw_perf_event *hwc = &event->hw;
@@ -1547,16 +1563,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1547 1563
1548 hwc->sample_period = sample_period; 1564 hwc->sample_period = sample_period;
1549 1565
1550 if (atomic64_read(&hwc->period_left) > 8*sample_period) { 1566 if (local64_read(&hwc->period_left) > 8*sample_period) {
1551 perf_disable(); 1567 event->pmu->stop(event, PERF_EF_UPDATE);
1552 perf_event_stop(event); 1568 local64_set(&hwc->period_left, 0);
1553 atomic64_set(&hwc->period_left, 0); 1569 event->pmu->start(event, PERF_EF_RELOAD);
1554 perf_event_start(event);
1555 perf_enable();
1556 } 1570 }
1557} 1571}
1558 1572
1559static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1573static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1560{ 1574{
1561 struct perf_event *event; 1575 struct perf_event *event;
1562 struct hw_perf_event *hwc; 1576 struct hw_perf_event *hwc;
@@ -1581,23 +1595,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1581 */ 1595 */
1582 if (interrupts == MAX_INTERRUPTS) { 1596 if (interrupts == MAX_INTERRUPTS) {
1583 perf_log_throttle(event, 1); 1597 perf_log_throttle(event, 1);
1584 perf_disable(); 1598 event->pmu->start(event, 0);
1585 event->pmu->unthrottle(event);
1586 perf_enable();
1587 } 1599 }
1588 1600
1589 if (!event->attr.freq || !event->attr.sample_freq) 1601 if (!event->attr.freq || !event->attr.sample_freq)
1590 continue; 1602 continue;
1591 1603
1592 perf_disable();
1593 event->pmu->read(event); 1604 event->pmu->read(event);
1594 now = atomic64_read(&event->count); 1605 now = local64_read(&event->count);
1595 delta = now - hwc->freq_count_stamp; 1606 delta = now - hwc->freq_count_stamp;
1596 hwc->freq_count_stamp = now; 1607 hwc->freq_count_stamp = now;
1597 1608
1598 if (delta > 0) 1609 if (delta > 0)
1599 perf_adjust_period(event, TICK_NSEC, delta); 1610 perf_adjust_period(event, period, delta);
1600 perf_enable();
1601 } 1611 }
1602 raw_spin_unlock(&ctx->lock); 1612 raw_spin_unlock(&ctx->lock);
1603} 1613}
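perf_adjust_period() and the unthrottle path now use the pmu's ->stop()/->start() callbacks directly; the perf_event_stop()/perf_event_start() wrappers and the ->unthrottle() method are gone. The PERF_EF_* values are flags for those callbacks, defined in the companion header change; the exact values below are an assumption:

/* Flags for pmu->start()/->stop() (and ->add()/->del()); sketch. */
#define PERF_EF_START   0x01    /* ->add(): start the event right away */
#define PERF_EF_RELOAD  0x02    /* ->start(): reload the sample period */
#define PERF_EF_UPDATE  0x04    /* ->stop(): fold the hw count into event->count first */

So stop(event, PERF_EF_UPDATE) + local64_set(&hwc->period_left, 0) + start(event, PERF_EF_RELOAD) reprograms the counter with the new period, while start(event, 0) is the unthrottle case: resume without touching the saved state.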
@@ -1615,32 +1625,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
1615 raw_spin_unlock(&ctx->lock); 1625 raw_spin_unlock(&ctx->lock);
1616} 1626}
1617 1627
1618void perf_event_task_tick(struct task_struct *curr) 1628/*
1629 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
1630 * because they're strictly cpu affine and rotate_start is called with IRQs
1631 * disabled, while rotate_context is called from IRQ context.
1632 */
1633static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1619{ 1634{
1620 struct perf_cpu_context *cpuctx; 1635 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1621 struct perf_event_context *ctx; 1636 struct perf_event_context *ctx = NULL;
1622 int rotate = 0; 1637 int rotate = 0, remove = 1;
1623 1638
1624 if (!atomic_read(&nr_events)) 1639 if (cpuctx->ctx.nr_events) {
1625 return; 1640 remove = 0;
1626 1641 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1627 cpuctx = &__get_cpu_var(perf_cpu_context); 1642 rotate = 1;
1628 if (cpuctx->ctx.nr_events && 1643 }
1629 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1630 rotate = 1;
1631 1644
1632 ctx = curr->perf_event_ctxp; 1645 ctx = cpuctx->task_ctx;
1633 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 1646 if (ctx && ctx->nr_events) {
1634 rotate = 1; 1647 remove = 0;
1648 if (ctx->nr_events != ctx->nr_active)
1649 rotate = 1;
1650 }
1635 1651
1636 perf_ctx_adjust_freq(&cpuctx->ctx); 1652 perf_pmu_disable(cpuctx->ctx.pmu);
1653 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1637 if (ctx) 1654 if (ctx)
1638 perf_ctx_adjust_freq(ctx); 1655 perf_ctx_adjust_freq(ctx, interval);
1639 1656
1640 if (!rotate) 1657 if (!rotate)
1641 return; 1658 goto done;
1642 1659
1643 perf_disable();
1644 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1660 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1645 if (ctx) 1661 if (ctx)
1646 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1662 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1651,8 +1667,27 @@ void perf_event_task_tick(struct task_struct *curr)
1651 1667
1652 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1668 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1653 if (ctx) 1669 if (ctx)
1654 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1670 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1655 perf_enable(); 1671
1672done:
1673 if (remove)
1674 list_del_init(&cpuctx->rotation_list);
1675
1676 perf_pmu_enable(cpuctx->ctx.pmu);
1677}
1678
1679void perf_event_task_tick(void)
1680{
1681 struct list_head *head = &__get_cpu_var(rotation_list);
1682 struct perf_cpu_context *cpuctx, *tmp;
1683
1684 WARN_ON(!irqs_disabled());
1685
1686 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
1687 if (cpuctx->jiffies_interval == 1 ||
1688 !(jiffies % cpuctx->jiffies_interval))
1689 perf_rotate_context(cpuctx);
1690 }
1656} 1691}
1657 1692
1658static int event_enable_on_exec(struct perf_event *event, 1693static int event_enable_on_exec(struct perf_event *event,
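Rotation no longer hangs off the current task: perf_event_task_tick() walks a per-cpu rotation_list of cpu contexts, each rotating at its own jiffies_interval, and perf_rotate_context() takes a context off the list ("remove") once it has no events left, so idle pmus stop being visited on every tick. For that list to be populated, perf_pmu_rotate_start(), called from perf_event_context_sched_in() above, has to enqueue the pmu's cpu context on the current cpu; a sketch consistent with that usage:

static void perf_pmu_rotate_start(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
        struct list_head *head = &__get_cpu_var(rotation_list);

        WARN_ON(!irqs_disabled());      /* rotation_list is only touched with IRQs off */

        if (list_empty(&cpuctx->rotation_list))
                list_add(&cpuctx->rotation_list, head);
}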
@@ -1674,20 +1709,18 @@ static int event_enable_on_exec(struct perf_event *event,
1674 * Enable all of a task's events that have been marked enable-on-exec. 1709 * Enable all of a task's events that have been marked enable-on-exec.
1675 * This expects task == current. 1710 * This expects task == current.
1676 */ 1711 */
1677static void perf_event_enable_on_exec(struct task_struct *task) 1712static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1678{ 1713{
1679 struct perf_event_context *ctx;
1680 struct perf_event *event; 1714 struct perf_event *event;
1681 unsigned long flags; 1715 unsigned long flags;
1682 int enabled = 0; 1716 int enabled = 0;
1683 int ret; 1717 int ret;
1684 1718
1685 local_irq_save(flags); 1719 local_irq_save(flags);
1686 ctx = task->perf_event_ctxp;
1687 if (!ctx || !ctx->nr_events) 1720 if (!ctx || !ctx->nr_events)
1688 goto out; 1721 goto out;
1689 1722
1690 __perf_event_task_sched_out(ctx); 1723 task_ctx_sched_out(ctx, EVENT_ALL);
1691 1724
1692 raw_spin_lock(&ctx->lock); 1725 raw_spin_lock(&ctx->lock);
1693 1726
@@ -1711,8 +1744,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1711 1744
1712 raw_spin_unlock(&ctx->lock); 1745 raw_spin_unlock(&ctx->lock);
1713 1746
1714 perf_event_task_sched_in(task); 1747 perf_event_context_sched_in(ctx);
1715 out: 1748out:
1716 local_irq_restore(flags); 1749 local_irq_restore(flags);
1717} 1750}
1718 1751
@@ -1721,9 +1754,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1721 */ 1754 */
1722static void __perf_event_read(void *info) 1755static void __perf_event_read(void *info)
1723{ 1756{
1724 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1725 struct perf_event *event = info; 1757 struct perf_event *event = info;
1726 struct perf_event_context *ctx = event->ctx; 1758 struct perf_event_context *ctx = event->ctx;
1759 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1727 1760
1728 /* 1761 /*
1729 * If this is a task context, we need to check whether it is 1762 * If this is a task context, we need to check whether it is
@@ -1743,6 +1776,11 @@ static void __perf_event_read(void *info)
1743 event->pmu->read(event); 1776 event->pmu->read(event);
1744} 1777}
1745 1778
1779static inline u64 perf_event_count(struct perf_event *event)
1780{
1781 return local64_read(&event->count) + atomic64_read(&event->child_count);
1782}
1783
1746static u64 perf_event_read(struct perf_event *event) 1784static u64 perf_event_read(struct perf_event *event)
1747{ 1785{
1748 /* 1786 /*
@@ -1757,20 +1795,234 @@ static u64 perf_event_read(struct perf_event *event)
1757 unsigned long flags; 1795 unsigned long flags;
1758 1796
1759 raw_spin_lock_irqsave(&ctx->lock, flags); 1797 raw_spin_lock_irqsave(&ctx->lock, flags);
1760 update_context_time(ctx); 1798 /*
1799 * may read while context is not active
1800 * (e.g., thread is blocked), in that case
1801 * we cannot update context time
1802 */
1803 if (ctx->is_active)
1804 update_context_time(ctx);
1761 update_event_times(event); 1805 update_event_times(event);
1762 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1806 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1763 } 1807 }
1764 1808
1765 return atomic64_read(&event->count); 1809 return perf_event_count(event);
1766} 1810}
1767 1811
1768/* 1812/*
1769 * Initialize the perf_event context in a task_struct: 1813 * Callchain support
1770 */ 1814 */
1815
1816struct callchain_cpus_entries {
1817 struct rcu_head rcu_head;
1818 struct perf_callchain_entry *cpu_entries[0];
1819};
1820
1821static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
1822static atomic_t nr_callchain_events;
1823static DEFINE_MUTEX(callchain_mutex);
1824struct callchain_cpus_entries *callchain_cpus_entries;
1825
1826
1827__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
1828 struct pt_regs *regs)
1829{
1830}
1831
1832__weak void perf_callchain_user(struct perf_callchain_entry *entry,
1833 struct pt_regs *regs)
1834{
1835}
1836
1837static void release_callchain_buffers_rcu(struct rcu_head *head)
1838{
1839 struct callchain_cpus_entries *entries;
1840 int cpu;
1841
1842 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
1843
1844 for_each_possible_cpu(cpu)
1845 kfree(entries->cpu_entries[cpu]);
1846
1847 kfree(entries);
1848}
1849
1850static void release_callchain_buffers(void)
1851{
1852 struct callchain_cpus_entries *entries;
1853
1854 entries = callchain_cpus_entries;
1855 rcu_assign_pointer(callchain_cpus_entries, NULL);
1856 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
1857}
1858
1859static int alloc_callchain_buffers(void)
1860{
1861 int cpu;
1862 int size;
1863 struct callchain_cpus_entries *entries;
1864
1865 /*
1866 * We can't use the percpu allocation API for data that can be
1867 * accessed from NMI. Use a temporary manual per cpu allocation
1868 * until that gets sorted out.
1869 */
1870 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
1871 num_possible_cpus();
1872
1873 entries = kzalloc(size, GFP_KERNEL);
1874 if (!entries)
1875 return -ENOMEM;
1876
1877 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
1878
1879 for_each_possible_cpu(cpu) {
1880 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
1881 cpu_to_node(cpu));
1882 if (!entries->cpu_entries[cpu])
1883 goto fail;
1884 }
1885
1886 rcu_assign_pointer(callchain_cpus_entries, entries);
1887
1888 return 0;
1889
1890fail:
1891 for_each_possible_cpu(cpu)
1892 kfree(entries->cpu_entries[cpu]);
1893 kfree(entries);
1894
1895 return -ENOMEM;
1896}
1897
1898static int get_callchain_buffers(void)
1899{
1900 int err = 0;
1901 int count;
1902
1903 mutex_lock(&callchain_mutex);
1904
1905 count = atomic_inc_return(&nr_callchain_events);
1906 if (WARN_ON_ONCE(count < 1)) {
1907 err = -EINVAL;
1908 goto exit;
1909 }
1910
1911 if (count > 1) {
1912 /* If the allocation failed, give up */
1913 if (!callchain_cpus_entries)
1914 err = -ENOMEM;
1915 goto exit;
1916 }
1917
1918 err = alloc_callchain_buffers();
1919 if (err)
1920 release_callchain_buffers();
1921exit:
1922 mutex_unlock(&callchain_mutex);
1923
1924 return err;
1925}
1926
1927static void put_callchain_buffers(void)
1928{
1929 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
1930 release_callchain_buffers();
1931 mutex_unlock(&callchain_mutex);
1932 }
1933}
1934
1935static int get_recursion_context(int *recursion)
1936{
1937 int rctx;
1938
1939 if (in_nmi())
1940 rctx = 3;
1941 else if (in_irq())
1942 rctx = 2;
1943 else if (in_softirq())
1944 rctx = 1;
1945 else
1946 rctx = 0;
1947
1948 if (recursion[rctx])
1949 return -1;
1950
1951 recursion[rctx]++;
1952 barrier();
1953
1954 return rctx;
1955}
1956
1957static inline void put_recursion_context(int *recursion, int rctx)
1958{
1959 barrier();
1960 recursion[rctx]--;
1961}
1962
1963static struct perf_callchain_entry *get_callchain_entry(int *rctx)
1964{
1965 int cpu;
1966 struct callchain_cpus_entries *entries;
1967
1968 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
1969 if (*rctx == -1)
1970 return NULL;
1971
1972 entries = rcu_dereference(callchain_cpus_entries);
1973 if (!entries)
1974 return NULL;
1975
1976 cpu = smp_processor_id();
1977
1978 return &entries->cpu_entries[cpu][*rctx];
1979}
1980
1771static void 1981static void
1772__perf_event_init_context(struct perf_event_context *ctx, 1982put_callchain_entry(int rctx)
1773 struct task_struct *task) 1983{
1984 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
1985}
1986
1987static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1988{
1989 int rctx;
1990 struct perf_callchain_entry *entry;
1991
1992
1993 entry = get_callchain_entry(&rctx);
1994 if (rctx == -1)
1995 return NULL;
1996
1997 if (!entry)
1998 goto exit_put;
1999
2000 entry->nr = 0;
2001
2002 if (!user_mode(regs)) {
2003 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2004 perf_callchain_kernel(entry, regs);
2005 if (current->mm)
2006 regs = task_pt_regs(current);
2007 else
2008 regs = NULL;
2009 }
2010
2011 if (regs) {
2012 perf_callchain_store(entry, PERF_CONTEXT_USER);
2013 perf_callchain_user(entry, regs);
2014 }
2015
2016exit_put:
2017 put_callchain_entry(rctx);
2018
2019 return entry;
2020}
2021
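The callchain code that used to live in each architecture is now generic: an arch only fills in perf_callchain_kernel()/perf_callchain_user() (the __weak stubs above are the fallback), while buffer management and recursion protection are handled here. Two pieces referenced above come from the companion header change; sketches of what they are assumed to be:

/* one recursion slot per context level: task, softirq, hardirq, NMI */
#define PERF_NR_CONTEXTS        4

static inline void
perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
        /* append one frame, bounded by the fixed-size entry */
        if (entry->nr < PERF_MAX_STACK_DEPTH)
                entry->ip[entry->nr++] = ip;
}

get_recursion_context() maps the preemption state to one of the four slots, so an NMI that interrupts an IRQ that interrupted a task gets its own perf_callchain_entry and cannot trample the ones below it. The per-cpu entries are published with rcu_assign_pointer() and freed via call_rcu(), so the last user can tear them down in put_callchain_buffers() without racing against NMI readers.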
2022/*
2023 * Initialize the perf_event context in a task_struct:
2024 */
2025static void __perf_event_init_context(struct perf_event_context *ctx)
1774{ 2026{
1775 raw_spin_lock_init(&ctx->lock); 2027 raw_spin_lock_init(&ctx->lock);
1776 mutex_init(&ctx->mutex); 2028 mutex_init(&ctx->mutex);
@@ -1778,45 +2030,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
1778 INIT_LIST_HEAD(&ctx->flexible_groups); 2030 INIT_LIST_HEAD(&ctx->flexible_groups);
1779 INIT_LIST_HEAD(&ctx->event_list); 2031 INIT_LIST_HEAD(&ctx->event_list);
1780 atomic_set(&ctx->refcount, 1); 2032 atomic_set(&ctx->refcount, 1);
1781 ctx->task = task;
1782} 2033}
1783 2034
1784static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2035static struct perf_event_context *
2036alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1785{ 2037{
1786 struct perf_event_context *ctx; 2038 struct perf_event_context *ctx;
1787 struct perf_cpu_context *cpuctx;
1788 struct task_struct *task;
1789 unsigned long flags;
1790 int err;
1791 2039
1792 if (pid == -1 && cpu != -1) { 2040 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1793 /* Must be root to operate on a CPU event: */ 2041 if (!ctx)
1794 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2042 return NULL;
1795 return ERR_PTR(-EACCES);
1796
1797 if (cpu < 0 || cpu >= nr_cpumask_bits)
1798 return ERR_PTR(-EINVAL);
1799 2043
1800 /* 2044 __perf_event_init_context(ctx);
1801 * We could be clever and allow to attach a event to an 2045 if (task) {
1802 * offline CPU and activate it when the CPU comes up, but 2046 ctx->task = task;
1803 * that's for later. 2047 get_task_struct(task);
1804 */ 2048 }
1805 if (!cpu_online(cpu)) 2049 ctx->pmu = pmu;
1806 return ERR_PTR(-ENODEV);
1807 2050
1808 cpuctx = &per_cpu(perf_cpu_context, cpu); 2051 return ctx;
1809 ctx = &cpuctx->ctx; 2052}
1810 get_ctx(ctx);
1811 2053
1812 return ctx; 2054static struct task_struct *
1813 } 2055find_lively_task_by_vpid(pid_t vpid)
2056{
2057 struct task_struct *task;
2058 int err;
1814 2059
1815 rcu_read_lock(); 2060 rcu_read_lock();
1816 if (!pid) 2061 if (!vpid)
1817 task = current; 2062 task = current;
1818 else 2063 else
1819 task = find_task_by_vpid(pid); 2064 task = find_task_by_vpid(vpid);
1820 if (task) 2065 if (task)
1821 get_task_struct(task); 2066 get_task_struct(task);
1822 rcu_read_unlock(); 2067 rcu_read_unlock();
@@ -1836,36 +2081,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1836 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2081 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1837 goto errout; 2082 goto errout;
1838 2083
1839 retry: 2084 return task;
1840 ctx = perf_lock_task_context(task, &flags); 2085errout:
2086 put_task_struct(task);
2087 return ERR_PTR(err);
2088
2089}
2090
2091static struct perf_event_context *
2092find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2093{
2094 struct perf_event_context *ctx;
2095 struct perf_cpu_context *cpuctx;
2096 unsigned long flags;
2097 int ctxn, err;
2098
2099 if (!task && cpu != -1) {
2100 /* Must be root to operate on a CPU event: */
2101 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2102 return ERR_PTR(-EACCES);
2103
2104 if (cpu < 0 || cpu >= nr_cpumask_bits)
2105 return ERR_PTR(-EINVAL);
2106
2107 /*
2108 * We could be clever and allow to attach a event to an
2109 * offline CPU and activate it when the CPU comes up, but
2110 * that's for later.
2111 */
2112 if (!cpu_online(cpu))
2113 return ERR_PTR(-ENODEV);
2114
2115 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2116 ctx = &cpuctx->ctx;
2117 get_ctx(ctx);
2118
2119 return ctx;
2120 }
2121
2122 err = -EINVAL;
2123 ctxn = pmu->task_ctx_nr;
2124 if (ctxn < 0)
2125 goto errout;
2126
2127retry:
2128 ctx = perf_lock_task_context(task, ctxn, &flags);
1841 if (ctx) { 2129 if (ctx) {
1842 unclone_ctx(ctx); 2130 unclone_ctx(ctx);
1843 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1844 } 2132 }
1845 2133
1846 if (!ctx) { 2134 if (!ctx) {
1847 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2135 ctx = alloc_perf_context(pmu, task);
1848 err = -ENOMEM; 2136 err = -ENOMEM;
1849 if (!ctx) 2137 if (!ctx)
1850 goto errout; 2138 goto errout;
1851 __perf_event_init_context(ctx, task); 2139
1852 get_ctx(ctx); 2140 get_ctx(ctx);
1853 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2141
2142 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
1854 /* 2143 /*
1855 * We raced with some other task; use 2144 * We raced with some other task; use
1856 * the context they set. 2145 * the context they set.
1857 */ 2146 */
2147 put_task_struct(task);
1858 kfree(ctx); 2148 kfree(ctx);
1859 goto retry; 2149 goto retry;
1860 } 2150 }
1861 get_task_struct(task);
1862 } 2151 }
1863 2152
1864 put_task_struct(task);
1865 return ctx; 2153 return ctx;
1866 2154
1867 errout: 2155errout:
1868 put_task_struct(task);
1869 return ERR_PTR(err); 2156 return ERR_PTR(err);
1870} 2157}
1871 2158
@@ -1882,32 +2169,36 @@ static void free_event_rcu(struct rcu_head *head)
1882 kfree(event); 2169 kfree(event);
1883} 2170}
1884 2171
1885static void perf_pending_sync(struct perf_event *event); 2172static void perf_buffer_put(struct perf_buffer *buffer);
1886static void perf_mmap_data_put(struct perf_mmap_data *data);
1887 2173
1888static void free_event(struct perf_event *event) 2174static void free_event(struct perf_event *event)
1889{ 2175{
1890 perf_pending_sync(event); 2176 irq_work_sync(&event->pending);
1891 2177
1892 if (!event->parent) { 2178 if (!event->parent) {
1893 atomic_dec(&nr_events); 2179 if (event->attach_state & PERF_ATTACH_TASK)
1894 if (event->attr.mmap) 2180 jump_label_dec(&perf_task_events);
2181 if (event->attr.mmap || event->attr.mmap_data)
1895 atomic_dec(&nr_mmap_events); 2182 atomic_dec(&nr_mmap_events);
1896 if (event->attr.comm) 2183 if (event->attr.comm)
1897 atomic_dec(&nr_comm_events); 2184 atomic_dec(&nr_comm_events);
1898 if (event->attr.task) 2185 if (event->attr.task)
1899 atomic_dec(&nr_task_events); 2186 atomic_dec(&nr_task_events);
2187 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2188 put_callchain_buffers();
1900 } 2189 }
1901 2190
1902 if (event->data) { 2191 if (event->buffer) {
1903 perf_mmap_data_put(event->data); 2192 perf_buffer_put(event->buffer);
1904 event->data = NULL; 2193 event->buffer = NULL;
1905 } 2194 }
1906 2195
1907 if (event->destroy) 2196 if (event->destroy)
1908 event->destroy(event); 2197 event->destroy(event);
1909 2198
1910 put_ctx(event->ctx); 2199 if (event->ctx)
2200 put_ctx(event->ctx);
2201
1911 call_rcu(&event->rcu_head, free_event_rcu); 2202 call_rcu(&event->rcu_head, free_event_rcu);
1912} 2203}
1913 2204
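free_event() now waits for deferred work with irq_work_sync() instead of the removed perf_pending_sync() (the hand-rolled pending-list machinery is deleted further down). That assumes event->pending has become a struct irq_work initialized at allocation time, something like:

        /* sketch: in perf_event_alloc(), with event->pending now a struct irq_work */
        init_irq_work(&event->pending, perf_pending_event);

The jump_label_dec(&perf_task_events) presumably pairs with a jump_label_inc() on the allocation side: it lets the scheduler entry points be patched out entirely while no task events exist, instead of checking a counter on every context switch. The new "if (event->ctx)" guard before put_ctx() matches the earlier change that attaches the context in perf_install_in_context(), so an event that fails early can be freed before it ever had a context.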
@@ -2126,13 +2417,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2126static unsigned int perf_poll(struct file *file, poll_table *wait) 2417static unsigned int perf_poll(struct file *file, poll_table *wait)
2127{ 2418{
2128 struct perf_event *event = file->private_data; 2419 struct perf_event *event = file->private_data;
2129 struct perf_mmap_data *data; 2420 struct perf_buffer *buffer;
2130 unsigned int events = POLL_HUP; 2421 unsigned int events = POLL_HUP;
2131 2422
2132 rcu_read_lock(); 2423 rcu_read_lock();
2133 data = rcu_dereference(event->data); 2424 buffer = rcu_dereference(event->buffer);
2134 if (data) 2425 if (buffer)
2135 events = atomic_xchg(&data->poll, 0); 2426 events = atomic_xchg(&buffer->poll, 0);
2136 rcu_read_unlock(); 2427 rcu_read_unlock();
2137 2428
2138 poll_wait(file, &event->waitq, wait); 2429 poll_wait(file, &event->waitq, wait);
@@ -2143,7 +2434,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
2143static void perf_event_reset(struct perf_event *event) 2434static void perf_event_reset(struct perf_event *event)
2144{ 2435{
2145 (void)perf_event_read(event); 2436 (void)perf_event_read(event);
2146 atomic64_set(&event->count, 0); 2437 local64_set(&event->count, 0);
2147 perf_event_update_userpage(event); 2438 perf_event_update_userpage(event);
2148} 2439}
2149 2440
@@ -2186,15 +2477,13 @@ static void perf_event_for_each(struct perf_event *event,
2186static int perf_event_period(struct perf_event *event, u64 __user *arg) 2477static int perf_event_period(struct perf_event *event, u64 __user *arg)
2187{ 2478{
2188 struct perf_event_context *ctx = event->ctx; 2479 struct perf_event_context *ctx = event->ctx;
2189 unsigned long size;
2190 int ret = 0; 2480 int ret = 0;
2191 u64 value; 2481 u64 value;
2192 2482
2193 if (!event->attr.sample_period) 2483 if (!event->attr.sample_period)
2194 return -EINVAL; 2484 return -EINVAL;
2195 2485
2196 size = copy_from_user(&value, arg, sizeof(value)); 2486 if (copy_from_user(&value, arg, sizeof(value)))
2197 if (size != sizeof(value))
2198 return -EFAULT; 2487 return -EFAULT;
2199 2488
2200 if (!value) 2489 if (!value)
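The perf_event_period() change is a bug fix, not just a cleanup: copy_from_user() returns the number of bytes it could not copy, so the old "size != sizeof(value)" test treated a fully successful copy (return value 0) as a fault. The idiomatic form is the one the patch switches to:

        if (copy_from_user(&value, arg, sizeof(value)))
                return -EFAULT;         /* any nonzero return means bytes were left uncopied */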
@@ -2328,6 +2617,9 @@ int perf_event_task_disable(void)
2328 2617
2329static int perf_event_index(struct perf_event *event) 2618static int perf_event_index(struct perf_event *event)
2330{ 2619{
2620 if (event->hw.state & PERF_HES_STOPPED)
2621 return 0;
2622
2331 if (event->state != PERF_EVENT_STATE_ACTIVE) 2623 if (event->state != PERF_EVENT_STATE_ACTIVE)
2332 return 0; 2624 return 0;
2333 2625
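perf_event_index() now also returns 0 while the hardware is stopped, so the index exported through the mmap'ed user page never points user space at a counter that is not currently live. PERF_HES_STOPPED is one of the new hw_perf_event::state flags that mirror the PERF_EF_* start/stop flags; only the STOPPED bit is visible in this hunk, and the values below are an assumption:

/* hw_perf_event::state flags; sketch, values assumed. */
#define PERF_HES_STOPPED        0x01    /* the counter is not currently counting */
#define PERF_HES_UPTODATE       0x02    /* event->count already reflects the hw count */
#define PERF_HES_ARCH           0x04    /* arch-private state */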
@@ -2342,14 +2634,14 @@ static int perf_event_index(struct perf_event *event)
2342void perf_event_update_userpage(struct perf_event *event) 2634void perf_event_update_userpage(struct perf_event *event)
2343{ 2635{
2344 struct perf_event_mmap_page *userpg; 2636 struct perf_event_mmap_page *userpg;
2345 struct perf_mmap_data *data; 2637 struct perf_buffer *buffer;
2346 2638
2347 rcu_read_lock(); 2639 rcu_read_lock();
2348 data = rcu_dereference(event->data); 2640 buffer = rcu_dereference(event->buffer);
2349 if (!data) 2641 if (!buffer)
2350 goto unlock; 2642 goto unlock;
2351 2643
2352 userpg = data->user_page; 2644 userpg = buffer->user_page;
2353 2645
2354 /* 2646 /*
2355 * Disable preemption so as to not let the corresponding user-space 2647 * Disable preemption so as to not let the corresponding user-space
@@ -2359,9 +2651,9 @@ void perf_event_update_userpage(struct perf_event *event)
2359 ++userpg->lock; 2651 ++userpg->lock;
2360 barrier(); 2652 barrier();
2361 userpg->index = perf_event_index(event); 2653 userpg->index = perf_event_index(event);
2362 userpg->offset = atomic64_read(&event->count); 2654 userpg->offset = perf_event_count(event);
2363 if (event->state == PERF_EVENT_STATE_ACTIVE) 2655 if (event->state == PERF_EVENT_STATE_ACTIVE)
2364 userpg->offset -= atomic64_read(&event->hw.prev_count); 2656 userpg->offset -= local64_read(&event->hw.prev_count);
2365 2657
2366 userpg->time_enabled = event->total_time_enabled + 2658 userpg->time_enabled = event->total_time_enabled +
2367 atomic64_read(&event->child_total_time_enabled); 2659 atomic64_read(&event->child_total_time_enabled);
@@ -2376,6 +2668,25 @@ unlock:
2376 rcu_read_unlock(); 2668 rcu_read_unlock();
2377} 2669}
2378 2670
2671static unsigned long perf_data_size(struct perf_buffer *buffer);
2672
2673static void
2674perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
2675{
2676 long max_size = perf_data_size(buffer);
2677
2678 if (watermark)
2679 buffer->watermark = min(max_size, watermark);
2680
2681 if (!buffer->watermark)
2682 buffer->watermark = max_size / 2;
2683
2684 if (flags & PERF_BUFFER_WRITABLE)
2685 buffer->writable = 1;
2686
2687 atomic_set(&buffer->refcount, 1);
2688}
2689
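Alongside the perf_mmap_data -> perf_buffer rename, the watermark/writable/refcount setup is factored into perf_buffer_init(), shared by both the page-based and the vmalloc-based perf_buffer_alloc() below, and the allocator no longer needs the event at all, only (nr_pages, watermark, cpu, flags). The only flag used in this file is the writable bit, passed by perf_mmap() further down when the vma is mapped VM_WRITE; its definition is assumed to be simply:

/* flags for perf_buffer_alloc(); sketch, value assumed */
#define PERF_BUFFER_WRITABLE    0x01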
2379#ifndef CONFIG_PERF_USE_VMALLOC 2690#ifndef CONFIG_PERF_USE_VMALLOC
2380 2691
2381/* 2692/*
@@ -2383,15 +2694,15 @@ unlock:
2383 */ 2694 */
2384 2695
2385static struct page * 2696static struct page *
2386perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2697perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2387{ 2698{
2388 if (pgoff > data->nr_pages) 2699 if (pgoff > buffer->nr_pages)
2389 return NULL; 2700 return NULL;
2390 2701
2391 if (pgoff == 0) 2702 if (pgoff == 0)
2392 return virt_to_page(data->user_page); 2703 return virt_to_page(buffer->user_page);
2393 2704
2394 return virt_to_page(data->data_pages[pgoff - 1]); 2705 return virt_to_page(buffer->data_pages[pgoff - 1]);
2395} 2706}
2396 2707
2397static void *perf_mmap_alloc_page(int cpu) 2708static void *perf_mmap_alloc_page(int cpu)
@@ -2407,42 +2718,44 @@ static void *perf_mmap_alloc_page(int cpu)
2407 return page_address(page); 2718 return page_address(page);
2408} 2719}
2409 2720
2410static struct perf_mmap_data * 2721static struct perf_buffer *
2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2722perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2412{ 2723{
2413 struct perf_mmap_data *data; 2724 struct perf_buffer *buffer;
2414 unsigned long size; 2725 unsigned long size;
2415 int i; 2726 int i;
2416 2727
2417 size = sizeof(struct perf_mmap_data); 2728 size = sizeof(struct perf_buffer);
2418 size += nr_pages * sizeof(void *); 2729 size += nr_pages * sizeof(void *);
2419 2730
2420 data = kzalloc(size, GFP_KERNEL); 2731 buffer = kzalloc(size, GFP_KERNEL);
2421 if (!data) 2732 if (!buffer)
2422 goto fail; 2733 goto fail;
2423 2734
2424 data->user_page = perf_mmap_alloc_page(event->cpu); 2735 buffer->user_page = perf_mmap_alloc_page(cpu);
2425 if (!data->user_page) 2736 if (!buffer->user_page)
2426 goto fail_user_page; 2737 goto fail_user_page;
2427 2738
2428 for (i = 0; i < nr_pages; i++) { 2739 for (i = 0; i < nr_pages; i++) {
2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu); 2740 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2430 if (!data->data_pages[i]) 2741 if (!buffer->data_pages[i])
2431 goto fail_data_pages; 2742 goto fail_data_pages;
2432 } 2743 }
2433 2744
2434 data->nr_pages = nr_pages; 2745 buffer->nr_pages = nr_pages;
2435 2746
2436 return data; 2747 perf_buffer_init(buffer, watermark, flags);
2748
2749 return buffer;
2437 2750
2438fail_data_pages: 2751fail_data_pages:
2439 for (i--; i >= 0; i--) 2752 for (i--; i >= 0; i--)
2440 free_page((unsigned long)data->data_pages[i]); 2753 free_page((unsigned long)buffer->data_pages[i]);
2441 2754
2442 free_page((unsigned long)data->user_page); 2755 free_page((unsigned long)buffer->user_page);
2443 2756
2444fail_user_page: 2757fail_user_page:
2445 kfree(data); 2758 kfree(buffer);
2446 2759
2447fail: 2760fail:
2448 return NULL; 2761 return NULL;
@@ -2456,17 +2769,17 @@ static void perf_mmap_free_page(unsigned long addr)
2456 __free_page(page); 2769 __free_page(page);
2457} 2770}
2458 2771
2459static void perf_mmap_data_free(struct perf_mmap_data *data) 2772static void perf_buffer_free(struct perf_buffer *buffer)
2460{ 2773{
2461 int i; 2774 int i;
2462 2775
2463 perf_mmap_free_page((unsigned long)data->user_page); 2776 perf_mmap_free_page((unsigned long)buffer->user_page);
2464 for (i = 0; i < data->nr_pages; i++) 2777 for (i = 0; i < buffer->nr_pages; i++)
2465 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2778 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
2466 kfree(data); 2779 kfree(buffer);
2467} 2780}
2468 2781
2469static inline int page_order(struct perf_mmap_data *data) 2782static inline int page_order(struct perf_buffer *buffer)
2470{ 2783{
2471 return 0; 2784 return 0;
2472} 2785}
@@ -2479,18 +2792,18 @@ static inline int page_order(struct perf_mmap_data *data)
2479 * Required for architectures that have d-cache aliasing issues. 2792 * Required for architectures that have d-cache aliasing issues.
2480 */ 2793 */
2481 2794
2482static inline int page_order(struct perf_mmap_data *data) 2795static inline int page_order(struct perf_buffer *buffer)
2483{ 2796{
2484 return data->page_order; 2797 return buffer->page_order;
2485} 2798}
2486 2799
2487static struct page * 2800static struct page *
2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2801perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2489{ 2802{
2490 if (pgoff > (1UL << page_order(data))) 2803 if (pgoff > (1UL << page_order(buffer)))
2491 return NULL; 2804 return NULL;
2492 2805
2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2806 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2494} 2807}
2495 2808
2496static void perf_mmap_unmark_page(void *addr) 2809static void perf_mmap_unmark_page(void *addr)
@@ -2500,57 +2813,59 @@ static void perf_mmap_unmark_page(void *addr)
2500 page->mapping = NULL; 2813 page->mapping = NULL;
2501} 2814}
2502 2815
2503static void perf_mmap_data_free_work(struct work_struct *work) 2816static void perf_buffer_free_work(struct work_struct *work)
2504{ 2817{
2505 struct perf_mmap_data *data; 2818 struct perf_buffer *buffer;
2506 void *base; 2819 void *base;
2507 int i, nr; 2820 int i, nr;
2508 2821
2509 data = container_of(work, struct perf_mmap_data, work); 2822 buffer = container_of(work, struct perf_buffer, work);
2510 nr = 1 << page_order(data); 2823 nr = 1 << page_order(buffer);
2511 2824
2512 base = data->user_page; 2825 base = buffer->user_page;
2513 for (i = 0; i < nr + 1; i++) 2826 for (i = 0; i < nr + 1; i++)
2514 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2827 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2515 2828
2516 vfree(base); 2829 vfree(base);
2517 kfree(data); 2830 kfree(buffer);
2518} 2831}
2519 2832
2520static void perf_mmap_data_free(struct perf_mmap_data *data) 2833static void perf_buffer_free(struct perf_buffer *buffer)
2521{ 2834{
2522 schedule_work(&data->work); 2835 schedule_work(&buffer->work);
2523} 2836}
2524 2837
2525static struct perf_mmap_data * 2838static struct perf_buffer *
2526perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2839perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2527{ 2840{
2528 struct perf_mmap_data *data; 2841 struct perf_buffer *buffer;
2529 unsigned long size; 2842 unsigned long size;
2530 void *all_buf; 2843 void *all_buf;
2531 2844
2532 size = sizeof(struct perf_mmap_data); 2845 size = sizeof(struct perf_buffer);
2533 size += sizeof(void *); 2846 size += sizeof(void *);
2534 2847
2535 data = kzalloc(size, GFP_KERNEL); 2848 buffer = kzalloc(size, GFP_KERNEL);
2536 if (!data) 2849 if (!buffer)
2537 goto fail; 2850 goto fail;
2538 2851
2539 INIT_WORK(&data->work, perf_mmap_data_free_work); 2852 INIT_WORK(&buffer->work, perf_buffer_free_work);
2540 2853
2541 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 2854 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2542 if (!all_buf) 2855 if (!all_buf)
2543 goto fail_all_buf; 2856 goto fail_all_buf;
2544 2857
2545 data->user_page = all_buf; 2858 buffer->user_page = all_buf;
2546 data->data_pages[0] = all_buf + PAGE_SIZE; 2859 buffer->data_pages[0] = all_buf + PAGE_SIZE;
2547 data->page_order = ilog2(nr_pages); 2860 buffer->page_order = ilog2(nr_pages);
2548 data->nr_pages = 1; 2861 buffer->nr_pages = 1;
2862
2863 perf_buffer_init(buffer, watermark, flags);
2549 2864
2550 return data; 2865 return buffer;
2551 2866
2552fail_all_buf: 2867fail_all_buf:
2553 kfree(data); 2868 kfree(buffer);
2554 2869
2555fail: 2870fail:
2556 return NULL; 2871 return NULL;
@@ -2558,15 +2873,15 @@ fail:
2558 2873
2559#endif 2874#endif
2560 2875
2561static unsigned long perf_data_size(struct perf_mmap_data *data) 2876static unsigned long perf_data_size(struct perf_buffer *buffer)
2562{ 2877{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data)); 2878 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2564} 2879}
2565 2880
2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2881static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2567{ 2882{
2568 struct perf_event *event = vma->vm_file->private_data; 2883 struct perf_event *event = vma->vm_file->private_data;
2569 struct perf_mmap_data *data; 2884 struct perf_buffer *buffer;
2570 int ret = VM_FAULT_SIGBUS; 2885 int ret = VM_FAULT_SIGBUS;
2571 2886
2572 if (vmf->flags & FAULT_FLAG_MKWRITE) { 2887 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -2576,14 +2891,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2576 } 2891 }
2577 2892
2578 rcu_read_lock(); 2893 rcu_read_lock();
2579 data = rcu_dereference(event->data); 2894 buffer = rcu_dereference(event->buffer);
2580 if (!data) 2895 if (!buffer)
2581 goto unlock; 2896 goto unlock;
2582 2897
2583 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 2898 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2584 goto unlock; 2899 goto unlock;
2585 2900
2586 vmf->page = perf_mmap_to_page(data, vmf->pgoff); 2901 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
2587 if (!vmf->page) 2902 if (!vmf->page)
2588 goto unlock; 2903 goto unlock;
2589 2904
@@ -2598,52 +2913,35 @@ unlock:
2598 return ret; 2913 return ret;
2599} 2914}
2600 2915
2601static void 2916static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
2602perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2603{
2604 long max_size = perf_data_size(data);
2605
2606 if (event->attr.watermark) {
2607 data->watermark = min_t(long, max_size,
2608 event->attr.wakeup_watermark);
2609 }
2610
2611 if (!data->watermark)
2612 data->watermark = max_size / 2;
2613
2614 atomic_set(&data->refcount, 1);
2615 rcu_assign_pointer(event->data, data);
2616}
2617
2618static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2619{ 2917{
2620 struct perf_mmap_data *data; 2918 struct perf_buffer *buffer;
2621 2919
2622 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2920 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
2623 perf_mmap_data_free(data); 2921 perf_buffer_free(buffer);
2624} 2922}
2625 2923
2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) 2924static struct perf_buffer *perf_buffer_get(struct perf_event *event)
2627{ 2925{
2628 struct perf_mmap_data *data; 2926 struct perf_buffer *buffer;
2629 2927
2630 rcu_read_lock(); 2928 rcu_read_lock();
2631 data = rcu_dereference(event->data); 2929 buffer = rcu_dereference(event->buffer);
2632 if (data) { 2930 if (buffer) {
2633 if (!atomic_inc_not_zero(&data->refcount)) 2931 if (!atomic_inc_not_zero(&buffer->refcount))
2634 data = NULL; 2932 buffer = NULL;
2635 } 2933 }
2636 rcu_read_unlock(); 2934 rcu_read_unlock();
2637 2935
2638 return data; 2936 return buffer;
2639} 2937}
2640 2938
2641static void perf_mmap_data_put(struct perf_mmap_data *data) 2939static void perf_buffer_put(struct perf_buffer *buffer)
2642{ 2940{
2643 if (!atomic_dec_and_test(&data->refcount)) 2941 if (!atomic_dec_and_test(&buffer->refcount))
2644 return; 2942 return;
2645 2943
2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2944 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
2647} 2945}
2648 2946
2649static void perf_mmap_open(struct vm_area_struct *vma) 2947static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2658,16 +2956,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2658 struct perf_event *event = vma->vm_file->private_data; 2956 struct perf_event *event = vma->vm_file->private_data;
2659 2957
2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2958 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2661 unsigned long size = perf_data_size(event->data); 2959 unsigned long size = perf_data_size(event->buffer);
2662 struct user_struct *user = event->mmap_user; 2960 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data; 2961 struct perf_buffer *buffer = event->buffer;
2664 2962
2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2963 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2666 vma->vm_mm->locked_vm -= event->mmap_locked; 2964 vma->vm_mm->locked_vm -= event->mmap_locked;
2667 rcu_assign_pointer(event->data, NULL); 2965 rcu_assign_pointer(event->buffer, NULL);
2668 mutex_unlock(&event->mmap_mutex); 2966 mutex_unlock(&event->mmap_mutex);
2669 2967
2670 perf_mmap_data_put(data); 2968 perf_buffer_put(buffer);
2671 free_uid(user); 2969 free_uid(user);
2672 } 2970 }
2673} 2971}
@@ -2685,11 +2983,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2685 unsigned long user_locked, user_lock_limit; 2983 unsigned long user_locked, user_lock_limit;
2686 struct user_struct *user = current_user(); 2984 struct user_struct *user = current_user();
2687 unsigned long locked, lock_limit; 2985 unsigned long locked, lock_limit;
2688 struct perf_mmap_data *data; 2986 struct perf_buffer *buffer;
2689 unsigned long vma_size; 2987 unsigned long vma_size;
2690 unsigned long nr_pages; 2988 unsigned long nr_pages;
2691 long user_extra, extra; 2989 long user_extra, extra;
2692 int ret = 0; 2990 int ret = 0, flags = 0;
2693 2991
2694 /* 2992 /*
2695 * Don't allow mmap() of inherited per-task counters. This would 2993 * Don't allow mmap() of inherited per-task counters. This would
@@ -2706,7 +3004,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2706 nr_pages = (vma_size / PAGE_SIZE) - 1; 3004 nr_pages = (vma_size / PAGE_SIZE) - 1;
2707 3005
2708 /* 3006 /*
2709 * If we have data pages ensure they're a power-of-two number, so we 3007 * If we have buffer pages ensure they're a power-of-two number, so we
2710 * can do bitmasks instead of modulo. 3008 * can do bitmasks instead of modulo.
2711 */ 3009 */
2712 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 3010 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -2720,9 +3018,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2720 3018
2721 WARN_ON_ONCE(event->ctx->parent_ctx); 3019 WARN_ON_ONCE(event->ctx->parent_ctx);
2722 mutex_lock(&event->mmap_mutex); 3020 mutex_lock(&event->mmap_mutex);
2723 if (event->data) { 3021 if (event->buffer) {
2724 if (event->data->nr_pages == nr_pages) 3022 if (event->buffer->nr_pages == nr_pages)
2725 atomic_inc(&event->data->refcount); 3023 atomic_inc(&event->buffer->refcount);
2726 else 3024 else
2727 ret = -EINVAL; 3025 ret = -EINVAL;
2728 goto unlock; 3026 goto unlock;
@@ -2752,17 +3050,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2752 goto unlock; 3050 goto unlock;
2753 } 3051 }
2754 3052
2755 WARN_ON(event->data); 3053 WARN_ON(event->buffer);
3054
3055 if (vma->vm_flags & VM_WRITE)
3056 flags |= PERF_BUFFER_WRITABLE;
2756 3057
2757 data = perf_mmap_data_alloc(event, nr_pages); 3058 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
2758 if (!data) { 3059 event->cpu, flags);
3060 if (!buffer) {
2759 ret = -ENOMEM; 3061 ret = -ENOMEM;
2760 goto unlock; 3062 goto unlock;
2761 } 3063 }
2762 3064 rcu_assign_pointer(event->buffer, buffer);
2763 perf_mmap_data_init(event, data);
2764 if (vma->vm_flags & VM_WRITE)
2765 event->data->writable = 1;
2766 3065
2767 atomic_long_add(user_extra, &user->locked_vm); 3066 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra; 3067 event->mmap_locked = extra;
@@ -2824,16 +3123,7 @@ void perf_event_wakeup(struct perf_event *event)
2824 } 3123 }
2825} 3124}
2826 3125
2827/* 3126static void perf_pending_event(struct irq_work *entry)
2828 * Pending wakeups
2829 *
2830 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2831 *
2832 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2833 * single linked list and use cmpxchg() to add entries lockless.
2834 */
2835
2836static void perf_pending_event(struct perf_pending_entry *entry)
2837{ 3127{
2838 struct perf_event *event = container_of(entry, 3128 struct perf_event *event = container_of(entry,
2839 struct perf_event, pending); 3129 struct perf_event, pending);
@@ -2849,104 +3139,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
2849 } 3139 }
2850} 3140}
2851 3141
2852#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2853
2854static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2855 PENDING_TAIL,
2856};
2857
2858static void perf_pending_queue(struct perf_pending_entry *entry,
2859 void (*func)(struct perf_pending_entry *))
2860{
2861 struct perf_pending_entry **head;
2862
2863 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2864 return;
2865
2866 entry->func = func;
2867
2868 head = &get_cpu_var(perf_pending_head);
2869
2870 do {
2871 entry->next = *head;
2872 } while (cmpxchg(head, entry->next, entry) != entry->next);
2873
2874 set_perf_event_pending();
2875
2876 put_cpu_var(perf_pending_head);
2877}
2878
2879static int __perf_pending_run(void)
2880{
2881 struct perf_pending_entry *list;
2882 int nr = 0;
2883
2884 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2885 while (list != PENDING_TAIL) {
2886 void (*func)(struct perf_pending_entry *);
2887 struct perf_pending_entry *entry = list;
2888
2889 list = list->next;
2890
2891 func = entry->func;
2892 entry->next = NULL;
2893 /*
2894 * Ensure we observe the unqueue before we issue the wakeup,
2895 * so that we won't be waiting forever.
2896 * -- see perf_not_pending().
2897 */
2898 smp_wmb();
2899
2900 func(entry);
2901 nr++;
2902 }
2903
2904 return nr;
2905}
2906
2907static inline int perf_not_pending(struct perf_event *event)
2908{
2909 /*
2910 * If we flush on whatever cpu we run, there is a chance we don't
2911 * need to wait.
2912 */
2913 get_cpu();
2914 __perf_pending_run();
2915 put_cpu();
2916
2917 /*
2918 * Ensure we see the proper queue state before going to sleep
2919 * so that we do not miss the wakeup. -- see perf_pending_handle()
2920 */
2921 smp_rmb();
2922 return event->pending.next == NULL;
2923}
2924
2925static void perf_pending_sync(struct perf_event *event)
2926{
2927 wait_event(event->waitq, perf_not_pending(event));
2928}
2929
2930void perf_event_do_pending(void)
2931{
2932 __perf_pending_run();
2933}
2934
2935/*
2936 * Callchain support -- arch specific
2937 */
2938
2939__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2940{
2941 return NULL;
2942}
2943
2944__weak
2945void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2946{
2947}
2948
2949
2950/* 3142/*
2951 * We assume there is only KVM supporting the callbacks. 3143 * We assume there is only KVM supporting the callbacks.
2952 * Later on, we might change it to a list if there is 3144 * Later on, we might change it to a list if there is
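The block removed above implemented deferred wakeups as a lock-free singly linked list pushed with cmpxchg() and drained with xchg(); the replacement simply queues an irq_work. A stand-alone sketch of the old push/drain pattern, assuming C11 atomics in place of the kernel primitives and using illustrative names throughout:

#include <stdatomic.h>
#include <stdio.h>

struct pending_entry {
        struct pending_entry *next;
        void (*func)(struct pending_entry *);
};

#define PENDING_TAIL ((struct pending_entry *)-1UL)

static _Atomic(struct pending_entry *) pending_head = PENDING_TAIL;

static void pending_queue(struct pending_entry *entry,
                          void (*func)(struct pending_entry *))
{
        struct pending_entry *head;

        entry->func = func;
        head = atomic_load(&pending_head);
        do {
                entry->next = head;     /* link onto the current head    */
        } while (!atomic_compare_exchange_weak(&pending_head, &head, entry));
}

static void pending_run(void)
{
        /* steal the whole list in one atomic exchange */
        struct pending_entry *list = atomic_exchange(&pending_head, PENDING_TAIL);

        while (list != PENDING_TAIL) {
                struct pending_entry *entry = list;

                list = list->next;
                entry->func(entry);
        }
}

static void say_hello(struct pending_entry *entry)
{
        printf("ran entry %p\n", (void *)entry);
}

int main(void)
{
        struct pending_entry a, b;

        pending_queue(&a, say_hello);
        pending_queue(&b, say_hello);
        pending_run();
        return 0;
}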
@@ -2971,15 +3163,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2971/* 3163/*
2972 * Output 3164 * Output
2973 */ 3165 */
2974static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 3166static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
2975 unsigned long offset, unsigned long head) 3167 unsigned long offset, unsigned long head)
2976{ 3168{
2977 unsigned long mask; 3169 unsigned long mask;
2978 3170
2979 if (!data->writable) 3171 if (!buffer->writable)
2980 return true; 3172 return true;
2981 3173
2982 mask = perf_data_size(data) - 1; 3174 mask = perf_data_size(buffer) - 1;
2983 3175
2984 offset = (offset - tail) & mask; 3176 offset = (offset - tail) & mask;
2985 head = (head - tail) & mask; 3177 head = (head - tail) & mask;
@@ -2992,12 +3184,11 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2992 3184
2993static void perf_output_wakeup(struct perf_output_handle *handle) 3185static void perf_output_wakeup(struct perf_output_handle *handle)
2994{ 3186{
2995 atomic_set(&handle->data->poll, POLL_IN); 3187 atomic_set(&handle->buffer->poll, POLL_IN);
2996 3188
2997 if (handle->nmi) { 3189 if (handle->nmi) {
2998 handle->event->pending_wakeup = 1; 3190 handle->event->pending_wakeup = 1;
2999 perf_pending_queue(&handle->event->pending, 3191 irq_work_queue(&handle->event->pending);
3000 perf_pending_event);
3001 } else 3192 } else
3002 perf_event_wakeup(handle->event); 3193 perf_event_wakeup(handle->event);
3003} 3194}
@@ -3012,48 +3203,48 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3012 */ 3203 */
3013static void perf_output_get_handle(struct perf_output_handle *handle) 3204static void perf_output_get_handle(struct perf_output_handle *handle)
3014{ 3205{
3015 struct perf_mmap_data *data = handle->data; 3206 struct perf_buffer *buffer = handle->buffer;
3016 3207
3017 preempt_disable(); 3208 preempt_disable();
3018 local_inc(&data->nest); 3209 local_inc(&buffer->nest);
3019 handle->wakeup = local_read(&data->wakeup); 3210 handle->wakeup = local_read(&buffer->wakeup);
3020} 3211}
3021 3212
3022static void perf_output_put_handle(struct perf_output_handle *handle) 3213static void perf_output_put_handle(struct perf_output_handle *handle)
3023{ 3214{
3024 struct perf_mmap_data *data = handle->data; 3215 struct perf_buffer *buffer = handle->buffer;
3025 unsigned long head; 3216 unsigned long head;
3026 3217
3027again: 3218again:
3028 head = local_read(&data->head); 3219 head = local_read(&buffer->head);
3029 3220
3030 /* 3221 /*
3031 * IRQ/NMI can happen here, which means we can miss a head update. 3222 * IRQ/NMI can happen here, which means we can miss a head update.
3032 */ 3223 */
3033 3224
3034 if (!local_dec_and_test(&data->nest)) 3225 if (!local_dec_and_test(&buffer->nest))
3035 goto out; 3226 goto out;
3036 3227
3037 /* 3228 /*
3038 * Publish the known good head. Rely on the full barrier implied 3229 * Publish the known good head. Rely on the full barrier implied
3039 * by atomic_dec_and_test() to order the data->head read and this 3230 * by atomic_dec_and_test() to order the buffer->head read and this
3040 * write. 3231 * write.
3041 */ 3232 */
3042 data->user_page->data_head = head; 3233 buffer->user_page->data_head = head;
3043 3234
3044 /* 3235 /*
3045 * Now check if we missed an update, rely on the (compiler) 3236 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in atomic_dec_and_test() to re-read data->head. 3237 * barrier in atomic_dec_and_test() to re-read buffer->head.
3047 */ 3238 */
3048 if (unlikely(head != local_read(&data->head))) { 3239 if (unlikely(head != local_read(&buffer->head))) {
3049 local_inc(&data->nest); 3240 local_inc(&buffer->nest);
3050 goto again; 3241 goto again;
3051 } 3242 }
3052 3243
3053 if (handle->wakeup != local_read(&data->wakeup)) 3244 if (handle->wakeup != local_read(&buffer->wakeup))
3054 perf_output_wakeup(handle); 3245 perf_output_wakeup(handle);
3055 3246
3056 out: 3247out:
3057 preempt_enable(); 3248 preempt_enable();
3058} 3249}
3059 3250
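perf_output_put_handle() above only publishes data_head from the outermost writer, tracked by the nest count, and re-checks head afterwards in case an IRQ/NMI writer moved it between the read and the publish. A single-threaded sketch of that shape, where a recursive call stands in for an interrupt and all names are made up for illustration:

#include <stdio.h>

static unsigned long head;        /* writer's private cursor            */
static unsigned long data_head;   /* the value a reader would see       */
static int nest;                  /* how deeply nested the writers are  */

static void output_end(void);

static void write_record(unsigned long size, int simulate_irq)
{
        nest++;                           /* perf_output_get_handle()    */
        head += size;                     /* reserve and copy the record */
        if (simulate_irq)
                write_record(16, 0);      /* an "interrupt" logs its own record */
        output_end();
}

static void output_end(void)              /* perf_output_put_handle()    */
{
        unsigned long observed;

again:
        observed = head;
        if (--nest)                       /* nested writer: leave publishing */
                return;                   /* to the outermost one            */

        data_head = observed;             /* publish the known-good head     */

        /*
         * The real code must recheck: an interrupt may have advanced head
         * after we read it.  A synchronous demo can never hit this path.
         */
        if (observed != head) {
                nest++;
                goto again;
        }
}

int main(void)
{
        write_record(64, 1);
        printf("head=%lu, published data_head=%lu\n", head, data_head);
        return 0;
}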
@@ -3070,12 +3261,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3070 buf += size; 3261 buf += size;
3071 handle->size -= size; 3262 handle->size -= size;
3072 if (!handle->size) { 3263 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data; 3264 struct perf_buffer *buffer = handle->buffer;
3074 3265
3075 handle->page++; 3266 handle->page++;
3076 handle->page &= data->nr_pages - 1; 3267 handle->page &= buffer->nr_pages - 1;
3077 handle->addr = data->data_pages[handle->page]; 3268 handle->addr = buffer->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data); 3269 handle->size = PAGE_SIZE << page_order(buffer);
3079 } 3270 }
3080 } while (len); 3271 } while (len);
3081} 3272}
@@ -3084,7 +3275,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3084 struct perf_event *event, unsigned int size, 3275 struct perf_event *event, unsigned int size,
3085 int nmi, int sample) 3276 int nmi, int sample)
3086{ 3277{
3087 struct perf_mmap_data *data; 3278 struct perf_buffer *buffer;
3088 unsigned long tail, offset, head; 3279 unsigned long tail, offset, head;
3089 int have_lost; 3280 int have_lost;
3090 struct { 3281 struct {
@@ -3100,19 +3291,19 @@ int perf_output_begin(struct perf_output_handle *handle,
3100 if (event->parent) 3291 if (event->parent)
3101 event = event->parent; 3292 event = event->parent;
3102 3293
3103 data = rcu_dereference(event->data); 3294 buffer = rcu_dereference(event->buffer);
3104 if (!data) 3295 if (!buffer)
3105 goto out; 3296 goto out;
3106 3297
3107 handle->data = data; 3298 handle->buffer = buffer;
3108 handle->event = event; 3299 handle->event = event;
3109 handle->nmi = nmi; 3300 handle->nmi = nmi;
3110 handle->sample = sample; 3301 handle->sample = sample;
3111 3302
3112 if (!data->nr_pages) 3303 if (!buffer->nr_pages)
3113 goto out; 3304 goto out;
3114 3305
3115 have_lost = local_read(&data->lost); 3306 have_lost = local_read(&buffer->lost);
3116 if (have_lost) 3307 if (have_lost)
3117 size += sizeof(lost_event); 3308 size += sizeof(lost_event);
3118 3309
@@ -3124,30 +3315,30 @@ int perf_output_begin(struct perf_output_handle *handle,
3124 * tail pointer. So that all reads will be completed before the 3315 * tail pointer. So that all reads will be completed before the
3125 * write is issued. 3316 * write is issued.
3126 */ 3317 */
3127 tail = ACCESS_ONCE(data->user_page->data_tail); 3318 tail = ACCESS_ONCE(buffer->user_page->data_tail);
3128 smp_rmb(); 3319 smp_rmb();
3129 offset = head = local_read(&data->head); 3320 offset = head = local_read(&buffer->head);
3130 head += size; 3321 head += size;
3131 if (unlikely(!perf_output_space(data, tail, offset, head))) 3322 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
3132 goto fail; 3323 goto fail;
3133 } while (local_cmpxchg(&data->head, offset, head) != offset); 3324 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
3134 3325
3135 if (head - local_read(&data->wakeup) > data->watermark) 3326 if (head - local_read(&buffer->wakeup) > buffer->watermark)
3136 local_add(data->watermark, &data->wakeup); 3327 local_add(buffer->watermark, &buffer->wakeup);
3137 3328
3138 handle->page = offset >> (PAGE_SHIFT + page_order(data)); 3329 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
3139 handle->page &= data->nr_pages - 1; 3330 handle->page &= buffer->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); 3331 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
3141 handle->addr = data->data_pages[handle->page]; 3332 handle->addr = buffer->data_pages[handle->page];
3142 handle->addr += handle->size; 3333 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size; 3334 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3144 3335
3145 if (have_lost) { 3336 if (have_lost) {
3146 lost_event.header.type = PERF_RECORD_LOST; 3337 lost_event.header.type = PERF_RECORD_LOST;
3147 lost_event.header.misc = 0; 3338 lost_event.header.misc = 0;
3148 lost_event.header.size = sizeof(lost_event); 3339 lost_event.header.size = sizeof(lost_event);
3149 lost_event.id = event->id; 3340 lost_event.id = event->id;
3150 lost_event.lost = local_xchg(&data->lost, 0); 3341 lost_event.lost = local_xchg(&buffer->lost, 0);
3151 3342
3152 perf_output_put(handle, lost_event); 3343 perf_output_put(handle, lost_event);
3153 } 3344 }
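The loop above reserves space by snapshotting the consumer's data_tail, computing the new head, checking that the record still fits, and claiming the bytes with a compare-and-swap so a concurrent (for example NMI) writer cannot take the same range. A rough user-space sketch of the same reservation scheme, using C11 atomics and invented helper names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BUF_SIZE 4096UL                 /* must be a power of two */

static _Atomic unsigned long rb_head;   /* producer cursor        */
static _Atomic unsigned long rb_tail;   /* consumer cursor        */

static bool rb_reserve(unsigned long size, unsigned long *offset)
{
        unsigned long tail, old_head, new_head;

        old_head = atomic_load(&rb_head);
        do {
                /* acquire pairs with the consumer publishing its tail */
                tail = atomic_load_explicit(&rb_tail, memory_order_acquire);
                new_head = old_head + size;
                /* would the record overwrite bytes not yet consumed? */
                if (new_head - tail > BUF_SIZE)
                        return false;   /* caller accounts a "lost" record */
        } while (!atomic_compare_exchange_weak(&rb_head, &old_head, new_head));

        *offset = old_head & (BUF_SIZE - 1);
        return true;
}

int main(void)
{
        unsigned long off;

        if (rb_reserve(128, &off))
                printf("reserved 128 bytes at offset %lu\n", off);
        return 0;
}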
@@ -3155,7 +3346,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3155 return 0; 3346 return 0;
3156 3347
3157fail: 3348fail:
3158 local_inc(&data->lost); 3349 local_inc(&buffer->lost);
3159 perf_output_put_handle(handle); 3350 perf_output_put_handle(handle);
3160out: 3351out:
3161 rcu_read_unlock(); 3352 rcu_read_unlock();
@@ -3166,15 +3357,15 @@ out:
3166void perf_output_end(struct perf_output_handle *handle) 3357void perf_output_end(struct perf_output_handle *handle)
3167{ 3358{
3168 struct perf_event *event = handle->event; 3359 struct perf_event *event = handle->event;
3169 struct perf_mmap_data *data = handle->data; 3360 struct perf_buffer *buffer = handle->buffer;
3170 3361
3171 int wakeup_events = event->attr.wakeup_events; 3362 int wakeup_events = event->attr.wakeup_events;
3172 3363
3173 if (handle->sample && wakeup_events) { 3364 if (handle->sample && wakeup_events) {
3174 int events = local_inc_return(&data->events); 3365 int events = local_inc_return(&buffer->events);
3175 if (events >= wakeup_events) { 3366 if (events >= wakeup_events) {
3176 local_sub(wakeup_events, &data->events); 3367 local_sub(wakeup_events, &buffer->events);
3177 local_inc(&data->wakeup); 3368 local_inc(&buffer->wakeup);
3178 } 3369 }
3179 } 3370 }
3180 3371
@@ -3211,7 +3402,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3211 u64 values[4]; 3402 u64 values[4];
3212 int n = 0; 3403 int n = 0;
3213 3404
3214 values[n++] = atomic64_read(&event->count); 3405 values[n++] = perf_event_count(event);
3215 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3406 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3216 values[n++] = event->total_time_enabled + 3407 values[n++] = event->total_time_enabled +
3217 atomic64_read(&event->child_total_time_enabled); 3408 atomic64_read(&event->child_total_time_enabled);
@@ -3248,7 +3439,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3248 if (leader != event) 3439 if (leader != event)
3249 leader->pmu->read(leader); 3440 leader->pmu->read(leader);
3250 3441
3251 values[n++] = atomic64_read(&leader->count); 3442 values[n++] = perf_event_count(leader);
3252 if (read_format & PERF_FORMAT_ID) 3443 if (read_format & PERF_FORMAT_ID)
3253 values[n++] = primary_event_id(leader); 3444 values[n++] = primary_event_id(leader);
3254 3445
@@ -3260,7 +3451,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3260 if (sub != event) 3451 if (sub != event)
3261 sub->pmu->read(sub); 3452 sub->pmu->read(sub);
3262 3453
3263 values[n++] = atomic64_read(&sub->count); 3454 values[n++] = perf_event_count(sub);
3264 if (read_format & PERF_FORMAT_ID) 3455 if (read_format & PERF_FORMAT_ID)
3265 values[n++] = primary_event_id(sub); 3456 values[n++] = primary_event_id(sub);
3266 3457
@@ -3441,14 +3632,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3441 struct perf_output_handle handle; 3632 struct perf_output_handle handle;
3442 struct perf_event_header header; 3633 struct perf_event_header header;
3443 3634
3635 /* protect the callchain buffers */
3636 rcu_read_lock();
3637
3444 perf_prepare_sample(&header, data, event, regs); 3638 perf_prepare_sample(&header, data, event, regs);
3445 3639
3446 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 3640 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3447 return; 3641 goto exit;
3448 3642
3449 perf_output_sample(&handle, &header, data, event); 3643 perf_output_sample(&handle, &header, data, event);
3450 3644
3451 perf_output_end(&handle); 3645 perf_output_end(&handle);
3646
3647exit:
3648 rcu_read_unlock();
3452} 3649}
3453 3650
3454/* 3651/*
@@ -3491,7 +3688,7 @@ perf_event_read_event(struct perf_event *event,
3491/* 3688/*
3492 * task tracking -- fork/exit 3689 * task tracking -- fork/exit
3493 * 3690 *
3494 * enabled by: attr.comm | attr.mmap | attr.task 3691 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
3495 */ 3692 */
3496 3693
3497struct perf_task_event { 3694struct perf_task_event {
@@ -3541,7 +3738,8 @@ static int perf_event_task_match(struct perf_event *event)
3541 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3738 if (event->cpu != -1 && event->cpu != smp_processor_id())
3542 return 0; 3739 return 0;
3543 3740
3544 if (event->attr.comm || event->attr.mmap || event->attr.task) 3741 if (event->attr.comm || event->attr.mmap ||
3742 event->attr.mmap_data || event->attr.task)
3545 return 1; 3743 return 1;
3546 3744
3547 return 0; 3745 return 0;
@@ -3561,16 +3759,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3561static void perf_event_task_event(struct perf_task_event *task_event) 3759static void perf_event_task_event(struct perf_task_event *task_event)
3562{ 3760{
3563 struct perf_cpu_context *cpuctx; 3761 struct perf_cpu_context *cpuctx;
3564 struct perf_event_context *ctx = task_event->task_ctx; 3762 struct perf_event_context *ctx;
3763 struct pmu *pmu;
3764 int ctxn;
3565 3765
3566 rcu_read_lock(); 3766 rcu_read_lock();
3567 cpuctx = &get_cpu_var(perf_cpu_context); 3767 list_for_each_entry_rcu(pmu, &pmus, entry) {
3568 perf_event_task_ctx(&cpuctx->ctx, task_event); 3768 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3569 if (!ctx) 3769 perf_event_task_ctx(&cpuctx->ctx, task_event);
3570 ctx = rcu_dereference(current->perf_event_ctxp); 3770
3571 if (ctx) 3771 ctx = task_event->task_ctx;
3572 perf_event_task_ctx(ctx, task_event); 3772 if (!ctx) {
3573 put_cpu_var(perf_cpu_context); 3773 ctxn = pmu->task_ctx_nr;
3774 if (ctxn < 0)
3775 goto next;
3776 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3777 }
3778 if (ctx)
3779 perf_event_task_ctx(ctx, task_event);
3780next:
3781 put_cpu_ptr(pmu->pmu_cpu_context);
3782 }
3574 rcu_read_unlock(); 3783 rcu_read_unlock();
3575} 3784}
3576 3785
@@ -3675,8 +3884,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3675{ 3884{
3676 struct perf_cpu_context *cpuctx; 3885 struct perf_cpu_context *cpuctx;
3677 struct perf_event_context *ctx; 3886 struct perf_event_context *ctx;
3678 unsigned int size;
3679 char comm[TASK_COMM_LEN]; 3887 char comm[TASK_COMM_LEN];
3888 unsigned int size;
3889 struct pmu *pmu;
3890 int ctxn;
3680 3891
3681 memset(comm, 0, sizeof(comm)); 3892 memset(comm, 0, sizeof(comm));
3682 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 3893 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3688,21 +3899,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3688 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3899 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3689 3900
3690 rcu_read_lock(); 3901 rcu_read_lock();
3691 cpuctx = &get_cpu_var(perf_cpu_context); 3902 list_for_each_entry_rcu(pmu, &pmus, entry) {
3692 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3903 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3693 ctx = rcu_dereference(current->perf_event_ctxp); 3904 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3694 if (ctx) 3905
3695 perf_event_comm_ctx(ctx, comm_event); 3906 ctxn = pmu->task_ctx_nr;
3696 put_cpu_var(perf_cpu_context); 3907 if (ctxn < 0)
3908 goto next;
3909
3910 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3911 if (ctx)
3912 perf_event_comm_ctx(ctx, comm_event);
3913next:
3914 put_cpu_ptr(pmu->pmu_cpu_context);
3915 }
3697 rcu_read_unlock(); 3916 rcu_read_unlock();
3698} 3917}
3699 3918
3700void perf_event_comm(struct task_struct *task) 3919void perf_event_comm(struct task_struct *task)
3701{ 3920{
3702 struct perf_comm_event comm_event; 3921 struct perf_comm_event comm_event;
3922 struct perf_event_context *ctx;
3923 int ctxn;
3703 3924
3704 if (task->perf_event_ctxp) 3925 for_each_task_context_nr(ctxn) {
3705 perf_event_enable_on_exec(task); 3926 ctx = task->perf_event_ctxp[ctxn];
3927 if (!ctx)
3928 continue;
3929
3930 perf_event_enable_on_exec(ctx);
3931 }
3706 3932
3707 if (!atomic_read(&nr_comm_events)) 3933 if (!atomic_read(&nr_comm_events))
3708 return; 3934 return;
@@ -3766,7 +3992,8 @@ static void perf_event_mmap_output(struct perf_event *event,
3766} 3992}
3767 3993
3768static int perf_event_mmap_match(struct perf_event *event, 3994static int perf_event_mmap_match(struct perf_event *event,
3769 struct perf_mmap_event *mmap_event) 3995 struct perf_mmap_event *mmap_event,
3996 int executable)
3770{ 3997{
3771 if (event->state < PERF_EVENT_STATE_INACTIVE) 3998 if (event->state < PERF_EVENT_STATE_INACTIVE)
3772 return 0; 3999 return 0;
@@ -3774,19 +4001,21 @@ static int perf_event_mmap_match(struct perf_event *event,
3774 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4001 if (event->cpu != -1 && event->cpu != smp_processor_id())
3775 return 0; 4002 return 0;
3776 4003
3777 if (event->attr.mmap) 4004 if ((!executable && event->attr.mmap_data) ||
4005 (executable && event->attr.mmap))
3778 return 1; 4006 return 1;
3779 4007
3780 return 0; 4008 return 0;
3781} 4009}
3782 4010
3783static void perf_event_mmap_ctx(struct perf_event_context *ctx, 4011static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3784 struct perf_mmap_event *mmap_event) 4012 struct perf_mmap_event *mmap_event,
4013 int executable)
3785{ 4014{
3786 struct perf_event *event; 4015 struct perf_event *event;
3787 4016
3788 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4017 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3789 if (perf_event_mmap_match(event, mmap_event)) 4018 if (perf_event_mmap_match(event, mmap_event, executable))
3790 perf_event_mmap_output(event, mmap_event); 4019 perf_event_mmap_output(event, mmap_event);
3791 } 4020 }
3792} 4021}
@@ -3801,6 +4030,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3801 char tmp[16]; 4030 char tmp[16];
3802 char *buf = NULL; 4031 char *buf = NULL;
3803 const char *name; 4032 const char *name;
4033 struct pmu *pmu;
4034 int ctxn;
3804 4035
3805 memset(tmp, 0, sizeof(tmp)); 4036 memset(tmp, 0, sizeof(tmp));
3806 4037
@@ -3830,6 +4061,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3830 if (!vma->vm_mm) { 4061 if (!vma->vm_mm) {
3831 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 4062 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3832 goto got_name; 4063 goto got_name;
4064 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4065 vma->vm_end >= vma->vm_mm->brk) {
4066 name = strncpy(tmp, "[heap]", sizeof(tmp));
4067 goto got_name;
4068 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4069 vma->vm_end >= vma->vm_mm->start_stack) {
4070 name = strncpy(tmp, "[stack]", sizeof(tmp));
4071 goto got_name;
3833 } 4072 }
3834 4073
3835 name = strncpy(tmp, "//anon", sizeof(tmp)); 4074 name = strncpy(tmp, "//anon", sizeof(tmp));
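The new branches above label anonymous mappings as [heap] or [stack] by comparing the vma range against the mm's brk and initial stack bounds. A small sketch of that classification with the mm fields mocked as plain variables (the struct and function names are illustrative):

#include <stdio.h>

struct mm_bounds {
        unsigned long start_brk, brk;
        unsigned long start_stack;
};

static const char *anon_vma_name(unsigned long vm_start, unsigned long vm_end,
                                 const struct mm_bounds *mm)
{
        if (vm_start <= mm->start_brk && vm_end >= mm->brk)
                return "[heap]";
        if (vm_start <= mm->start_stack && vm_end >= mm->start_stack)
                return "[stack]";
        return "//anon";
}

int main(void)
{
        struct mm_bounds mm = {
                .start_brk   = 0x1000000, .brk = 0x1008000,
                .start_stack = 0x7fff0000,
        };

        printf("%s\n", anon_vma_name(0x1000000, 0x1010000, &mm));  /* [heap]  */
        printf("%s\n", anon_vma_name(0x7ffe0000, 0x7fff1000, &mm)); /* [stack] */
        return 0;
}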
@@ -3845,18 +4084,29 @@ got_name:
3845 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4084 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3846 4085
3847 rcu_read_lock(); 4086 rcu_read_lock();
3848 cpuctx = &get_cpu_var(perf_cpu_context); 4087 list_for_each_entry_rcu(pmu, &pmus, entry) {
3849 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 4088 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3850 ctx = rcu_dereference(current->perf_event_ctxp); 4089 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3851 if (ctx) 4090 vma->vm_flags & VM_EXEC);
3852 perf_event_mmap_ctx(ctx, mmap_event); 4091
3853 put_cpu_var(perf_cpu_context); 4092 ctxn = pmu->task_ctx_nr;
4093 if (ctxn < 0)
4094 goto next;
4095
4096 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4097 if (ctx) {
4098 perf_event_mmap_ctx(ctx, mmap_event,
4099 vma->vm_flags & VM_EXEC);
4100 }
4101next:
4102 put_cpu_ptr(pmu->pmu_cpu_context);
4103 }
3854 rcu_read_unlock(); 4104 rcu_read_unlock();
3855 4105
3856 kfree(buf); 4106 kfree(buf);
3857} 4107}
3858 4108
3859void __perf_event_mmap(struct vm_area_struct *vma) 4109void perf_event_mmap(struct vm_area_struct *vma)
3860{ 4110{
3861 struct perf_mmap_event mmap_event; 4111 struct perf_mmap_event mmap_event;
3862 4112
@@ -3932,8 +4182,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3932 struct hw_perf_event *hwc = &event->hw; 4182 struct hw_perf_event *hwc = &event->hw;
3933 int ret = 0; 4183 int ret = 0;
3934 4184
3935 throttle = (throttle && event->pmu->unthrottle != NULL);
3936
3937 if (!throttle) { 4185 if (!throttle) {
3938 hwc->interrupts++; 4186 hwc->interrupts++;
3939 } else { 4187 } else {
@@ -3976,8 +4224,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3976 event->pending_kill = POLL_HUP; 4224 event->pending_kill = POLL_HUP;
3977 if (nmi) { 4225 if (nmi) {
3978 event->pending_disable = 1; 4226 event->pending_disable = 1;
3979 perf_pending_queue(&event->pending, 4227 irq_work_queue(&event->pending);
3980 perf_pending_event);
3981 } else 4228 } else
3982 perf_event_disable(event); 4229 perf_event_disable(event);
3983 } 4230 }
@@ -4001,6 +4248,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4001 * Generic software event infrastructure 4248 * Generic software event infrastructure
4002 */ 4249 */
4003 4250
4251struct swevent_htable {
4252 struct swevent_hlist *swevent_hlist;
4253 struct mutex hlist_mutex;
4254 int hlist_refcount;
4255
4256 /* Recursion avoidance in each contexts */
4257 int recursion[PERF_NR_CONTEXTS];
4258};
4259
4260static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4261
4004/* 4262/*
4005 * We directly increment event->count and keep a second value in 4263 * We directly increment event->count and keep a second value in
4006 * event->hw.period_left to count intervals. This period event 4264 * event->hw.period_left to count intervals. This period event
@@ -4018,14 +4276,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
4018 hwc->last_period = hwc->sample_period; 4276 hwc->last_period = hwc->sample_period;
4019 4277
4020again: 4278again:
4021 old = val = atomic64_read(&hwc->period_left); 4279 old = val = local64_read(&hwc->period_left);
4022 if (val < 0) 4280 if (val < 0)
4023 return 0; 4281 return 0;
4024 4282
4025 nr = div64_u64(period + val, period); 4283 nr = div64_u64(period + val, period);
4026 offset = nr * period; 4284 offset = nr * period;
4027 val -= offset; 4285 val -= offset;
4028 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) 4286 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4029 goto again; 4287 goto again;
4030 4288
4031 return nr; 4289 return nr;
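perf_swevent_set_period() lets period_left climb from -sample_period toward zero; once it is non-negative, (period + left) / period whole periods have elapsed and the remainder is carried forward. A worked single-threaded sketch of that arithmetic, dropping the cmpxchg retry and using plain integers in place of local64_t:

#include <stdio.h>

static long long period_left;

static unsigned long long swevent_set_period(unsigned long long period)
{
        long long val = period_left;
        unsigned long long nr;

        if (val < 0)
                return 0;                             /* no overflow yet      */

        nr = (period + val) / period;                 /* full periods elapsed */
        period_left = val - (long long)(nr * period); /* negative remainder   */
        return nr;
}

int main(void)
{
        unsigned long long period = 100;
        unsigned long long nr;

        period_left = -(long long)period;             /* arm the first period */
        period_left += 250;                           /* 250 events arrive    */

        nr = swevent_set_period(period);
        printf("overflows=%llu, new period_left=%lld\n", nr, period_left);
        /* prints: overflows=2, new period_left=-50 */
        return 0;
}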
@@ -4058,13 +4316,13 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4058 } 4316 }
4059} 4317}
4060 4318
4061static void perf_swevent_add(struct perf_event *event, u64 nr, 4319static void perf_swevent_event(struct perf_event *event, u64 nr,
4062 int nmi, struct perf_sample_data *data, 4320 int nmi, struct perf_sample_data *data,
4063 struct pt_regs *regs) 4321 struct pt_regs *regs)
4064{ 4322{
4065 struct hw_perf_event *hwc = &event->hw; 4323 struct hw_perf_event *hwc = &event->hw;
4066 4324
4067 atomic64_add(nr, &event->count); 4325 local64_add(nr, &event->count);
4068 4326
4069 if (!regs) 4327 if (!regs)
4070 return; 4328 return;
@@ -4075,7 +4333,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4075 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4333 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4076 return perf_swevent_overflow(event, 1, nmi, data, regs); 4334 return perf_swevent_overflow(event, 1, nmi, data, regs);
4077 4335
4078 if (atomic64_add_negative(nr, &hwc->period_left)) 4336 if (local64_add_negative(nr, &hwc->period_left))
4079 return; 4337 return;
4080 4338
4081 perf_swevent_overflow(event, 0, nmi, data, regs); 4339 perf_swevent_overflow(event, 0, nmi, data, regs);
@@ -4084,6 +4342,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4084static int perf_exclude_event(struct perf_event *event, 4342static int perf_exclude_event(struct perf_event *event,
4085 struct pt_regs *regs) 4343 struct pt_regs *regs)
4086{ 4344{
4345 if (event->hw.state & PERF_HES_STOPPED)
4346 return 0;
4347
4087 if (regs) { 4348 if (regs) {
4088 if (event->attr.exclude_user && user_mode(regs)) 4349 if (event->attr.exclude_user && user_mode(regs))
4089 return 1; 4350 return 1;
@@ -4130,11 +4391,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4130 4391
4131/* For the read side: events when they trigger */ 4392/* For the read side: events when they trigger */
4132static inline struct hlist_head * 4393static inline struct hlist_head *
4133find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4394find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4134{ 4395{
4135 struct swevent_hlist *hlist; 4396 struct swevent_hlist *hlist;
4136 4397
4137 hlist = rcu_dereference(ctx->swevent_hlist); 4398 hlist = rcu_dereference(swhash->swevent_hlist);
4138 if (!hlist) 4399 if (!hlist)
4139 return NULL; 4400 return NULL;
4140 4401
@@ -4143,7 +4404,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4143 4404
4144/* For the event head insertion and removal in the hlist */ 4405/* For the event head insertion and removal in the hlist */
4145static inline struct hlist_head * 4406static inline struct hlist_head *
4146find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 4407find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4147{ 4408{
4148 struct swevent_hlist *hlist; 4409 struct swevent_hlist *hlist;
4149 u32 event_id = event->attr.config; 4410 u32 event_id = event->attr.config;
@@ -4154,7 +4415,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4154 * and release. Which makes the protected version suitable here. 4415 * and release. Which makes the protected version suitable here.
4155 * The context lock guarantees that. 4416 * The context lock guarantees that.
4156 */ 4417 */
4157 hlist = rcu_dereference_protected(ctx->swevent_hlist, 4418 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4158 lockdep_is_held(&event->ctx->lock)); 4419 lockdep_is_held(&event->ctx->lock));
4159 if (!hlist) 4420 if (!hlist)
4160 return NULL; 4421 return NULL;
@@ -4167,23 +4428,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4167 struct perf_sample_data *data, 4428 struct perf_sample_data *data,
4168 struct pt_regs *regs) 4429 struct pt_regs *regs)
4169{ 4430{
4170 struct perf_cpu_context *cpuctx; 4431 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4171 struct perf_event *event; 4432 struct perf_event *event;
4172 struct hlist_node *node; 4433 struct hlist_node *node;
4173 struct hlist_head *head; 4434 struct hlist_head *head;
4174 4435
4175 cpuctx = &__get_cpu_var(perf_cpu_context);
4176
4177 rcu_read_lock(); 4436 rcu_read_lock();
4178 4437 head = find_swevent_head_rcu(swhash, type, event_id);
4179 head = find_swevent_head_rcu(cpuctx, type, event_id);
4180
4181 if (!head) 4438 if (!head)
4182 goto end; 4439 goto end;
4183 4440
4184 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4441 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4185 if (perf_swevent_match(event, type, event_id, data, regs)) 4442 if (perf_swevent_match(event, type, event_id, data, regs))
4186 perf_swevent_add(event, nr, nmi, data, regs); 4443 perf_swevent_event(event, nr, nmi, data, regs);
4187 } 4444 }
4188end: 4445end:
4189 rcu_read_unlock(); 4446 rcu_read_unlock();
@@ -4191,36 +4448,18 @@ end:
4191 4448
4192int perf_swevent_get_recursion_context(void) 4449int perf_swevent_get_recursion_context(void)
4193{ 4450{
4194 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4451 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4195 int rctx;
4196
4197 if (in_nmi())
4198 rctx = 3;
4199 else if (in_irq())
4200 rctx = 2;
4201 else if (in_softirq())
4202 rctx = 1;
4203 else
4204 rctx = 0;
4205 4452
4206 if (cpuctx->recursion[rctx]) 4453 return get_recursion_context(swhash->recursion);
4207 return -1;
4208
4209 cpuctx->recursion[rctx]++;
4210 barrier();
4211
4212 return rctx;
4213} 4454}
4214EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4455EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4215 4456
4216void perf_swevent_put_recursion_context(int rctx) 4457void inline perf_swevent_put_recursion_context(int rctx)
4217{ 4458{
4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4459 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4219 barrier();
4220 cpuctx->recursion[rctx]--;
4221}
4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4223 4460
4461 put_recursion_context(swhash->recursion, rctx);
4462}
4224 4463
4225void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4464void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4226 struct pt_regs *regs, u64 addr) 4465 struct pt_regs *regs, u64 addr)
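The recursion protection being moved above keeps one counter per context level (task, softirq, hardirq, NMI) so a software event raised while the same level is already handling one is dropped rather than recursing. A minimal sketch of that guard, with a plain array standing in for the per-cpu state and the level passed in by hand since user space has no in_nmi()/in_irq():

#include <stdio.h>

#define NR_CONTEXTS 4

static int recursion[NR_CONTEXTS];

static int get_recursion_context(int level)
{
        if (recursion[level])
                return -1;              /* already inside this context: bail */
        recursion[level]++;
        return level;                   /* "rctx" handed back to the caller  */
}

static void put_recursion_context(int rctx)
{
        recursion[rctx]--;
}

int main(void)
{
        int rctx = get_recursion_context(0);

        printf("first entry: %d\n", rctx);                          /*  0 */
        printf("recursive entry: %d\n", get_recursion_context(0));  /* -1 */
        if (rctx >= 0)
                put_recursion_context(rctx);
        return 0;
}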
@@ -4245,20 +4484,20 @@ static void perf_swevent_read(struct perf_event *event)
4245{ 4484{
4246} 4485}
4247 4486
4248static int perf_swevent_enable(struct perf_event *event) 4487static int perf_swevent_add(struct perf_event *event, int flags)
4249{ 4488{
4489 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4250 struct hw_perf_event *hwc = &event->hw; 4490 struct hw_perf_event *hwc = &event->hw;
4251 struct perf_cpu_context *cpuctx;
4252 struct hlist_head *head; 4491 struct hlist_head *head;
4253 4492
4254 cpuctx = &__get_cpu_var(perf_cpu_context);
4255
4256 if (hwc->sample_period) { 4493 if (hwc->sample_period) {
4257 hwc->last_period = hwc->sample_period; 4494 hwc->last_period = hwc->sample_period;
4258 perf_swevent_set_period(event); 4495 perf_swevent_set_period(event);
4259 } 4496 }
4260 4497
4261 head = find_swevent_head(cpuctx, event); 4498 hwc->state = !(flags & PERF_EF_START);
4499
4500 head = find_swevent_head(swhash, event);
4262 if (WARN_ON_ONCE(!head)) 4501 if (WARN_ON_ONCE(!head))
4263 return -EINVAL; 4502 return -EINVAL;
4264 4503
@@ -4267,202 +4506,27 @@ static int perf_swevent_enable(struct perf_event *event)
4267 return 0; 4506 return 0;
4268} 4507}
4269 4508
4270static void perf_swevent_disable(struct perf_event *event) 4509static void perf_swevent_del(struct perf_event *event, int flags)
4271{ 4510{
4272 hlist_del_rcu(&event->hlist_entry); 4511 hlist_del_rcu(&event->hlist_entry);
4273} 4512}
4274 4513
4275static void perf_swevent_void(struct perf_event *event) 4514static void perf_swevent_start(struct perf_event *event, int flags)
4276{
4277}
4278
4279static int perf_swevent_int(struct perf_event *event)
4280{
4281 return 0;
4282}
4283
4284static const struct pmu perf_ops_generic = {
4285 .enable = perf_swevent_enable,
4286 .disable = perf_swevent_disable,
4287 .start = perf_swevent_int,
4288 .stop = perf_swevent_void,
4289 .read = perf_swevent_read,
4290 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4291};
4292
4293/*
4294 * hrtimer based swevent callback
4295 */
4296
4297static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4298{ 4515{
4299 enum hrtimer_restart ret = HRTIMER_RESTART; 4516 event->hw.state = 0;
4300 struct perf_sample_data data;
4301 struct pt_regs *regs;
4302 struct perf_event *event;
4303 u64 period;
4304
4305 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4306 event->pmu->read(event);
4307
4308 perf_sample_data_init(&data, 0);
4309 data.period = event->hw.last_period;
4310 regs = get_irq_regs();
4311
4312 if (regs && !perf_exclude_event(event, regs)) {
4313 if (!(event->attr.exclude_idle && current->pid == 0))
4314 if (perf_event_overflow(event, 0, &data, regs))
4315 ret = HRTIMER_NORESTART;
4316 }
4317
4318 period = max_t(u64, 10000, event->hw.sample_period);
4319 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4320
4321 return ret;
4322} 4517}
4323 4518
4324static void perf_swevent_start_hrtimer(struct perf_event *event) 4519static void perf_swevent_stop(struct perf_event *event, int flags)
4325{ 4520{
4326 struct hw_perf_event *hwc = &event->hw; 4521 event->hw.state = PERF_HES_STOPPED;
4327
4328 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4329 hwc->hrtimer.function = perf_swevent_hrtimer;
4330 if (hwc->sample_period) {
4331 u64 period;
4332
4333 if (hwc->remaining) {
4334 if (hwc->remaining < 0)
4335 period = 10000;
4336 else
4337 period = hwc->remaining;
4338 hwc->remaining = 0;
4339 } else {
4340 period = max_t(u64, 10000, hwc->sample_period);
4341 }
4342 __hrtimer_start_range_ns(&hwc->hrtimer,
4343 ns_to_ktime(period), 0,
4344 HRTIMER_MODE_REL, 0);
4345 }
4346}
4347
4348static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4349{
4350 struct hw_perf_event *hwc = &event->hw;
4351
4352 if (hwc->sample_period) {
4353 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4354 hwc->remaining = ktime_to_ns(remaining);
4355
4356 hrtimer_cancel(&hwc->hrtimer);
4357 }
4358}
4359
4360/*
4361 * Software event: cpu wall time clock
4362 */
4363
4364static void cpu_clock_perf_event_update(struct perf_event *event)
4365{
4366 int cpu = raw_smp_processor_id();
4367 s64 prev;
4368 u64 now;
4369
4370 now = cpu_clock(cpu);
4371 prev = atomic64_xchg(&event->hw.prev_count, now);
4372 atomic64_add(now - prev, &event->count);
4373}
4374
4375static int cpu_clock_perf_event_enable(struct perf_event *event)
4376{
4377 struct hw_perf_event *hwc = &event->hw;
4378 int cpu = raw_smp_processor_id();
4379
4380 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4381 perf_swevent_start_hrtimer(event);
4382
4383 return 0;
4384}
4385
4386static void cpu_clock_perf_event_disable(struct perf_event *event)
4387{
4388 perf_swevent_cancel_hrtimer(event);
4389 cpu_clock_perf_event_update(event);
4390}
4391
4392static void cpu_clock_perf_event_read(struct perf_event *event)
4393{
4394 cpu_clock_perf_event_update(event);
4395}
4396
4397static const struct pmu perf_ops_cpu_clock = {
4398 .enable = cpu_clock_perf_event_enable,
4399 .disable = cpu_clock_perf_event_disable,
4400 .read = cpu_clock_perf_event_read,
4401};
4402
4403/*
4404 * Software event: task time clock
4405 */
4406
4407static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4408{
4409 u64 prev;
4410 s64 delta;
4411
4412 prev = atomic64_xchg(&event->hw.prev_count, now);
4413 delta = now - prev;
4414 atomic64_add(delta, &event->count);
4415}
4416
4417static int task_clock_perf_event_enable(struct perf_event *event)
4418{
4419 struct hw_perf_event *hwc = &event->hw;
4420 u64 now;
4421
4422 now = event->ctx->time;
4423
4424 atomic64_set(&hwc->prev_count, now);
4425
4426 perf_swevent_start_hrtimer(event);
4427
4428 return 0;
4429}
4430
4431static void task_clock_perf_event_disable(struct perf_event *event)
4432{
4433 perf_swevent_cancel_hrtimer(event);
4434 task_clock_perf_event_update(event, event->ctx->time);
4435
4436}
4437
4438static void task_clock_perf_event_read(struct perf_event *event)
4439{
4440 u64 time;
4441
4442 if (!in_nmi()) {
4443 update_context_time(event->ctx);
4444 time = event->ctx->time;
4445 } else {
4446 u64 now = perf_clock();
4447 u64 delta = now - event->ctx->timestamp;
4448 time = event->ctx->time + delta;
4449 }
4450
4451 task_clock_perf_event_update(event, time);
4452} 4522}
4453 4523
4454static const struct pmu perf_ops_task_clock = {
4455 .enable = task_clock_perf_event_enable,
4456 .disable = task_clock_perf_event_disable,
4457 .read = task_clock_perf_event_read,
4458};
4459
4460/* Deref the hlist from the update side */ 4524/* Deref the hlist from the update side */
4461static inline struct swevent_hlist * 4525static inline struct swevent_hlist *
4462swevent_hlist_deref(struct perf_cpu_context *cpuctx) 4526swevent_hlist_deref(struct swevent_htable *swhash)
4463{ 4527{
4464 return rcu_dereference_protected(cpuctx->swevent_hlist, 4528 return rcu_dereference_protected(swhash->swevent_hlist,
4465 lockdep_is_held(&cpuctx->hlist_mutex)); 4529 lockdep_is_held(&swhash->hlist_mutex));
4466} 4530}
4467 4531
4468static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4532static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4473,27 +4537,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4473 kfree(hlist); 4537 kfree(hlist);
4474} 4538}
4475 4539
4476static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4540static void swevent_hlist_release(struct swevent_htable *swhash)
4477{ 4541{
4478 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 4542 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4479 4543
4480 if (!hlist) 4544 if (!hlist)
4481 return; 4545 return;
4482 4546
4483 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4547 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4484 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4548 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4485} 4549}
4486 4550
4487static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4551static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4488{ 4552{
4489 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4553 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4490 4554
4491 mutex_lock(&cpuctx->hlist_mutex); 4555 mutex_lock(&swhash->hlist_mutex);
4492 4556
4493 if (!--cpuctx->hlist_refcount) 4557 if (!--swhash->hlist_refcount)
4494 swevent_hlist_release(cpuctx); 4558 swevent_hlist_release(swhash);
4495 4559
4496 mutex_unlock(&cpuctx->hlist_mutex); 4560 mutex_unlock(&swhash->hlist_mutex);
4497} 4561}
4498 4562
4499static void swevent_hlist_put(struct perf_event *event) 4563static void swevent_hlist_put(struct perf_event *event)
@@ -4511,12 +4575,12 @@ static void swevent_hlist_put(struct perf_event *event)
4511 4575
4512static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 4576static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4513{ 4577{
4514 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4578 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4515 int err = 0; 4579 int err = 0;
4516 4580
4517 mutex_lock(&cpuctx->hlist_mutex); 4581 mutex_lock(&swhash->hlist_mutex);
4518 4582
4519 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 4583 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4520 struct swevent_hlist *hlist; 4584 struct swevent_hlist *hlist;
4521 4585
4522 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4586 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4524,11 +4588,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4524 err = -ENOMEM; 4588 err = -ENOMEM;
4525 goto exit; 4589 goto exit;
4526 } 4590 }
4527 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 4591 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4528 } 4592 }
4529 cpuctx->hlist_refcount++; 4593 swhash->hlist_refcount++;
4530 exit: 4594exit:
4531 mutex_unlock(&cpuctx->hlist_mutex); 4595 mutex_unlock(&swhash->hlist_mutex);
4532 4596
4533 return err; 4597 return err;
4534} 4598}
@@ -4552,7 +4616,7 @@ static int swevent_hlist_get(struct perf_event *event)
4552 put_online_cpus(); 4616 put_online_cpus();
4553 4617
4554 return 0; 4618 return 0;
4555 fail: 4619fail:
4556 for_each_possible_cpu(cpu) { 4620 for_each_possible_cpu(cpu) {
4557 if (cpu == failed_cpu) 4621 if (cpu == failed_cpu)
4558 break; 4622 break;
@@ -4563,17 +4627,64 @@ static int swevent_hlist_get(struct perf_event *event)
4563 return err; 4627 return err;
4564} 4628}
4565 4629
4566#ifdef CONFIG_EVENT_TRACING 4630atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4631
4632static void sw_perf_event_destroy(struct perf_event *event)
4633{
4634 u64 event_id = event->attr.config;
4635
4636 WARN_ON(event->parent);
4637
4638 jump_label_dec(&perf_swevent_enabled[event_id]);
4639 swevent_hlist_put(event);
4640}
4641
4642static int perf_swevent_init(struct perf_event *event)
4643{
4644 int event_id = event->attr.config;
4645
4646 if (event->attr.type != PERF_TYPE_SOFTWARE)
4647 return -ENOENT;
4648
4649 switch (event_id) {
4650 case PERF_COUNT_SW_CPU_CLOCK:
4651 case PERF_COUNT_SW_TASK_CLOCK:
4652 return -ENOENT;
4653
4654 default:
4655 break;
4656 }
4657
4658 if (event_id > PERF_COUNT_SW_MAX)
4659 return -ENOENT;
4567 4660
4568static const struct pmu perf_ops_tracepoint = { 4661 if (!event->parent) {
4569 .enable = perf_trace_enable, 4662 int err;
4570 .disable = perf_trace_disable, 4663
4571 .start = perf_swevent_int, 4664 err = swevent_hlist_get(event);
4572 .stop = perf_swevent_void, 4665 if (err)
4666 return err;
4667
4668 jump_label_inc(&perf_swevent_enabled[event_id]);
4669 event->destroy = sw_perf_event_destroy;
4670 }
4671
4672 return 0;
4673}
4674
4675static struct pmu perf_swevent = {
4676 .task_ctx_nr = perf_sw_context,
4677
4678 .event_init = perf_swevent_init,
4679 .add = perf_swevent_add,
4680 .del = perf_swevent_del,
4681 .start = perf_swevent_start,
4682 .stop = perf_swevent_stop,
4573 .read = perf_swevent_read, 4683 .read = perf_swevent_read,
4574 .unthrottle = perf_swevent_void,
4575}; 4684};
4576 4685
4686#ifdef CONFIG_EVENT_TRACING
4687
4577static int perf_tp_filter_match(struct perf_event *event, 4688static int perf_tp_filter_match(struct perf_event *event,
4578 struct perf_sample_data *data) 4689 struct perf_sample_data *data)
4579{ 4690{
@@ -4601,7 +4712,7 @@ static int perf_tp_event_match(struct perf_event *event,
4601} 4712}
4602 4713
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 4714void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head) 4715 struct pt_regs *regs, struct hlist_head *head, int rctx)
4605{ 4716{
4606 struct perf_sample_data data; 4717 struct perf_sample_data data;
4607 struct perf_event *event; 4718 struct perf_event *event;
@@ -4615,12 +4726,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4615 perf_sample_data_init(&data, addr); 4726 perf_sample_data_init(&data, addr);
4616 data.raw = &raw; 4727 data.raw = &raw;
4617 4728
4618 rcu_read_lock();
4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4729 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4620 if (perf_tp_event_match(event, &data, regs)) 4730 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs); 4731 perf_swevent_event(event, count, 1, &data, regs);
4622 } 4732 }
4623 rcu_read_unlock(); 4733
4734 perf_swevent_put_recursion_context(rctx);
4624} 4735}
4625EXPORT_SYMBOL_GPL(perf_tp_event); 4736EXPORT_SYMBOL_GPL(perf_tp_event);
4626 4737
@@ -4629,10 +4740,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
4629 perf_trace_destroy(event); 4740 perf_trace_destroy(event);
4630} 4741}
4631 4742
4632static const struct pmu *tp_perf_event_init(struct perf_event *event) 4743static int perf_tp_event_init(struct perf_event *event)
4633{ 4744{
4634 int err; 4745 int err;
4635 4746
4747 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4748 return -ENOENT;
4749
4636 /* 4750 /*
4637 * Raw tracepoint data is a severe data leak, only allow root to 4751 * Raw tracepoint data is a severe data leak, only allow root to
4638 * have these. 4752 * have these.
@@ -4640,15 +4754,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4640 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 4754 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4641 perf_paranoid_tracepoint_raw() && 4755 perf_paranoid_tracepoint_raw() &&
4642 !capable(CAP_SYS_ADMIN)) 4756 !capable(CAP_SYS_ADMIN))
4643 return ERR_PTR(-EPERM); 4757 return -EPERM;
4644 4758
4645 err = perf_trace_init(event); 4759 err = perf_trace_init(event);
4646 if (err) 4760 if (err)
4647 return NULL; 4761 return err;
4648 4762
4649 event->destroy = tp_perf_event_destroy; 4763 event->destroy = tp_perf_event_destroy;
4650 4764
4651 return &perf_ops_tracepoint; 4765 return 0;
4766}
4767
4768static struct pmu perf_tracepoint = {
4769 .task_ctx_nr = perf_sw_context,
4770
4771 .event_init = perf_tp_event_init,
4772 .add = perf_trace_add,
4773 .del = perf_trace_del,
4774 .start = perf_swevent_start,
4775 .stop = perf_swevent_stop,
4776 .read = perf_swevent_read,
4777};
4778
4779static inline void perf_tp_register(void)
4780{
4781 perf_pmu_register(&perf_tracepoint);
4652} 4782}
4653 4783
4654static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4784static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4676,9 +4806,8 @@ static void perf_event_free_filter(struct perf_event *event)
4676 4806
4677#else 4807#else
4678 4808
4679static const struct pmu *tp_perf_event_init(struct perf_event *event) 4809static inline void perf_tp_register(void)
4680{ 4810{
4681 return NULL;
4682} 4811}
4683 4812
4684static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4813static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4693,105 +4822,389 @@ static void perf_event_free_filter(struct perf_event *event)
4693#endif /* CONFIG_EVENT_TRACING */ 4822#endif /* CONFIG_EVENT_TRACING */
4694 4823
4695#ifdef CONFIG_HAVE_HW_BREAKPOINT 4824#ifdef CONFIG_HAVE_HW_BREAKPOINT
4696static void bp_perf_event_destroy(struct perf_event *event) 4825void perf_bp_event(struct perf_event *bp, void *data)
4697{ 4826{
4698 release_bp_slot(event); 4827 struct perf_sample_data sample;
4828 struct pt_regs *regs = data;
4829
4830 perf_sample_data_init(&sample, bp->attr.bp_addr);
4831
4832 if (!bp->hw.state && !perf_exclude_event(bp, regs))
4833 perf_swevent_event(bp, 1, 1, &sample, regs);
4699} 4834}
4835#endif
4700 4836
4701static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4837/*
4838 * hrtimer based swevent callback
4839 */
4840
4841static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4702{ 4842{
4703 int err; 4843 enum hrtimer_restart ret = HRTIMER_RESTART;
4844 struct perf_sample_data data;
4845 struct pt_regs *regs;
4846 struct perf_event *event;
4847 u64 period;
4704 4848
4705 err = register_perf_hw_breakpoint(bp); 4849 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4706 if (err) 4850 event->pmu->read(event);
4707 return ERR_PTR(err);
4708 4851
4709 bp->destroy = bp_perf_event_destroy; 4852 perf_sample_data_init(&data, 0);
4853 data.period = event->hw.last_period;
4854 regs = get_irq_regs();
4710 4855
4711 return &perf_ops_bp; 4856 if (regs && !perf_exclude_event(event, regs)) {
4857 if (!(event->attr.exclude_idle && current->pid == 0))
4858 if (perf_event_overflow(event, 0, &data, regs))
4859 ret = HRTIMER_NORESTART;
4860 }
4861
4862 period = max_t(u64, 10000, event->hw.sample_period);
4863 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4864
4865 return ret;
4712} 4866}
4713 4867
4714void perf_bp_event(struct perf_event *bp, void *data) 4868static void perf_swevent_start_hrtimer(struct perf_event *event)
4715{ 4869{
4716 struct perf_sample_data sample; 4870 struct hw_perf_event *hwc = &event->hw;
4717 struct pt_regs *regs = data;
4718 4871
4719 perf_sample_data_init(&sample, bp->attr.bp_addr); 4872 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4873 hwc->hrtimer.function = perf_swevent_hrtimer;
4874 if (hwc->sample_period) {
4875 s64 period = local64_read(&hwc->period_left);
4720 4876
4721 if (!perf_exclude_event(bp, regs)) 4877 if (period) {
4722 perf_swevent_add(bp, 1, 1, &sample, regs); 4878 if (period < 0)
4879 period = 10000;
4880
4881 local64_set(&hwc->period_left, 0);
4882 } else {
4883 period = max_t(u64, 10000, hwc->sample_period);
4884 }
4885 __hrtimer_start_range_ns(&hwc->hrtimer,
4886 ns_to_ktime(period), 0,
4887 HRTIMER_MODE_REL_PINNED, 0);
4888 }
4723} 4889}
4724#else 4890
4725static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4891static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4726{ 4892{
4727 return NULL; 4893 struct hw_perf_event *hwc = &event->hw;
4894
4895 if (hwc->sample_period) {
4896 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4897 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4898
4899 hrtimer_cancel(&hwc->hrtimer);
4900 }
4728} 4901}
4729 4902
4730void perf_bp_event(struct perf_event *bp, void *regs) 4903/*
4904 * Software event: cpu wall time clock
4905 */
4906
4907static void cpu_clock_event_update(struct perf_event *event)
4731{ 4908{
4909 s64 prev;
4910 u64 now;
4911
4912 now = local_clock();
4913 prev = local64_xchg(&event->hw.prev_count, now);
4914 local64_add(now - prev, &event->count);
4732} 4915}
4733#endif
4734 4916
4735atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4917static void cpu_clock_event_start(struct perf_event *event, int flags)
4918{
4919 local64_set(&event->hw.prev_count, local_clock());
4920 perf_swevent_start_hrtimer(event);
4921}
4736 4922
4737static void sw_perf_event_destroy(struct perf_event *event) 4923static void cpu_clock_event_stop(struct perf_event *event, int flags)
4738{ 4924{
4739 u64 event_id = event->attr.config; 4925 perf_swevent_cancel_hrtimer(event);
4926 cpu_clock_event_update(event);
4927}
4740 4928
4741 WARN_ON(event->parent); 4929static int cpu_clock_event_add(struct perf_event *event, int flags)
4930{
4931 if (flags & PERF_EF_START)
4932 cpu_clock_event_start(event, flags);
4742 4933
4743 atomic_dec(&perf_swevent_enabled[event_id]); 4934 return 0;
4744 swevent_hlist_put(event);
4745} 4935}
4746 4936
4747static const struct pmu *sw_perf_event_init(struct perf_event *event) 4937static void cpu_clock_event_del(struct perf_event *event, int flags)
4748{ 4938{
4749 const struct pmu *pmu = NULL; 4939 cpu_clock_event_stop(event, flags);
4750 u64 event_id = event->attr.config; 4940}
4941
4942static void cpu_clock_event_read(struct perf_event *event)
4943{
4944 cpu_clock_event_update(event);
4945}
4946
4947static int cpu_clock_event_init(struct perf_event *event)
4948{
4949 if (event->attr.type != PERF_TYPE_SOFTWARE)
4950 return -ENOENT;
4951
4952 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
4953 return -ENOENT;
4954
4955 return 0;
4956}
4957
4958static struct pmu perf_cpu_clock = {
4959 .task_ctx_nr = perf_sw_context,
4960
4961 .event_init = cpu_clock_event_init,
4962 .add = cpu_clock_event_add,
4963 .del = cpu_clock_event_del,
4964 .start = cpu_clock_event_start,
4965 .stop = cpu_clock_event_stop,
4966 .read = cpu_clock_event_read,
4967};
4968
4969/*
4970 * Software event: task time clock
4971 */
4972
4973static void task_clock_event_update(struct perf_event *event, u64 now)
4974{
4975 u64 prev;
4976 s64 delta;
4977
4978 prev = local64_xchg(&event->hw.prev_count, now);
4979 delta = now - prev;
4980 local64_add(delta, &event->count);
4981}
4982
4983static void task_clock_event_start(struct perf_event *event, int flags)
4984{
4985 local64_set(&event->hw.prev_count, event->ctx->time);
4986 perf_swevent_start_hrtimer(event);
4987}
4988
4989static void task_clock_event_stop(struct perf_event *event, int flags)
4990{
4991 perf_swevent_cancel_hrtimer(event);
4992 task_clock_event_update(event, event->ctx->time);
4993}
4994
4995static int task_clock_event_add(struct perf_event *event, int flags)
4996{
4997 if (flags & PERF_EF_START)
4998 task_clock_event_start(event, flags);
4999
5000 return 0;
5001}
5002
5003static void task_clock_event_del(struct perf_event *event, int flags)
5004{
5005 task_clock_event_stop(event, PERF_EF_UPDATE);
5006}
5007
5008static void task_clock_event_read(struct perf_event *event)
5009{
5010 u64 time;
5011
5012 if (!in_nmi()) {
5013 update_context_time(event->ctx);
5014 time = event->ctx->time;
5015 } else {
5016 u64 now = perf_clock();
5017 u64 delta = now - event->ctx->timestamp;
5018 time = event->ctx->time + delta;
5019 }
5020
5021 task_clock_event_update(event, time);
5022}
5023
5024static int task_clock_event_init(struct perf_event *event)
5025{
5026 if (event->attr.type != PERF_TYPE_SOFTWARE)
5027 return -ENOENT;
5028
5029 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5030 return -ENOENT;
5031
5032 return 0;
5033}
5034
5035static struct pmu perf_task_clock = {
5036 .task_ctx_nr = perf_sw_context,
5037
5038 .event_init = task_clock_event_init,
5039 .add = task_clock_event_add,
5040 .del = task_clock_event_del,
5041 .start = task_clock_event_start,
5042 .stop = task_clock_event_stop,
5043 .read = task_clock_event_read,
5044};
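Both clock events above accumulate time by swapping prev_count for the current clock and adding the difference to count, so no interval is counted twice even when start, stop and read interleave. A user-space sketch of that accounting, assuming C11 atomics and CLOCK_MONOTONIC in place of local64_t and the kernel clocks:

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static _Atomic unsigned long long prev_count;
static _Atomic unsigned long long count;

static unsigned long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void clock_event_update(void)
{
        unsigned long long now  = now_ns();
        unsigned long long prev = atomic_exchange(&prev_count, now);

        atomic_fetch_add(&count, now - prev);   /* credit the elapsed slice */
}

int main(void)
{
        atomic_store(&prev_count, now_ns());    /* ...event_start()         */
        for (volatile int i = 0; i < 1000000; i++)
                ;                               /* some "work"              */
        clock_event_update();                   /* ...event_read()/stop()   */
        printf("accumulated %llu ns\n",
               (unsigned long long)atomic_load(&count));
        return 0;
}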
5045
5046static void perf_pmu_nop_void(struct pmu *pmu)
5047{
5048}
5049
5050static int perf_pmu_nop_int(struct pmu *pmu)
5051{
5052 return 0;
5053}
5054
5055static void perf_pmu_start_txn(struct pmu *pmu)
5056{
5057 perf_pmu_disable(pmu);
5058}
5059
5060static int perf_pmu_commit_txn(struct pmu *pmu)
5061{
5062 perf_pmu_enable(pmu);
5063 return 0;
5064}
5065
5066static void perf_pmu_cancel_txn(struct pmu *pmu)
5067{
5068 perf_pmu_enable(pmu);
5069}
5070
5071/*
5072 * Ensures all contexts with the same task_ctx_nr have the same
5073 * pmu_cpu_context too.
5074 */
5075static void *find_pmu_context(int ctxn)
5076{
5077 struct pmu *pmu;
5078
5079 if (ctxn < 0)
5080 return NULL;
5081
5082 list_for_each_entry(pmu, &pmus, entry) {
5083 if (pmu->task_ctx_nr == ctxn)
5084 return pmu->pmu_cpu_context;
5085 }
5086
5087 return NULL;
5088}
5089
5090static void free_pmu_context(void * __percpu cpu_context)
5091{
5092 struct pmu *pmu;
4751 5093
5094 mutex_lock(&pmus_lock);
4752 /* 5095 /*
4753 * Software events (currently) can't in general distinguish 5096 * Like a real lame refcount.
4754 * between user, kernel and hypervisor events.
4755 * However, context switches and cpu migrations are considered
4756 * to be kernel events, and page faults are never hypervisor
4757 * events.
4758 */ 5097 */
4759 switch (event_id) { 5098 list_for_each_entry(pmu, &pmus, entry) {
4760 case PERF_COUNT_SW_CPU_CLOCK: 5099 if (pmu->pmu_cpu_context == cpu_context)
4761 pmu = &perf_ops_cpu_clock; 5100 goto out;
5101 }
4762 5102
4763 break; 5103 free_percpu(cpu_context);
4764 case PERF_COUNT_SW_TASK_CLOCK: 5104out:
4765 /* 5105 mutex_unlock(&pmus_lock);
4766 * If the user instantiates this as a per-cpu event, 5106}
4767 * use the cpu_clock event instead.
4768 */
4769 if (event->ctx->task)
4770 pmu = &perf_ops_task_clock;
4771 else
4772 pmu = &perf_ops_cpu_clock;
4773 5107
4774 break; 5108int perf_pmu_register(struct pmu *pmu)
4775 case PERF_COUNT_SW_PAGE_FAULTS: 5109{
4776 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 5110 int cpu, ret;
4777 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4778 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4779 case PERF_COUNT_SW_CPU_MIGRATIONS:
4780 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4781 case PERF_COUNT_SW_EMULATION_FAULTS:
4782 if (!event->parent) {
4783 int err;
4784
4785 err = swevent_hlist_get(event);
4786 if (err)
4787 return ERR_PTR(err);
4788 5111
4789 atomic_inc(&perf_swevent_enabled[event_id]); 5112 mutex_lock(&pmus_lock);
4790 event->destroy = sw_perf_event_destroy; 5113 ret = -ENOMEM;
5114 pmu->pmu_disable_count = alloc_percpu(int);
5115 if (!pmu->pmu_disable_count)
5116 goto unlock;
5117
5118 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5119 if (pmu->pmu_cpu_context)
5120 goto got_cpu_context;
5121
5122 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5123 if (!pmu->pmu_cpu_context)
5124 goto free_pdc;
5125
5126 for_each_possible_cpu(cpu) {
5127 struct perf_cpu_context *cpuctx;
5128
5129 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5130 __perf_event_init_context(&cpuctx->ctx);
5131 cpuctx->ctx.type = cpu_context;
5132 cpuctx->ctx.pmu = pmu;
5133 cpuctx->jiffies_interval = 1;
5134 INIT_LIST_HEAD(&cpuctx->rotation_list);
5135 }
5136
5137got_cpu_context:
5138 if (!pmu->start_txn) {
5139 if (pmu->pmu_enable) {
5140 /*
5141 * If we have pmu_enable/pmu_disable calls, install
5142 * transaction stubs that use that to try and batch
5143 * hardware accesses.
5144 */
5145 pmu->start_txn = perf_pmu_start_txn;
5146 pmu->commit_txn = perf_pmu_commit_txn;
5147 pmu->cancel_txn = perf_pmu_cancel_txn;
5148 } else {
5149 pmu->start_txn = perf_pmu_nop_void;
5150 pmu->commit_txn = perf_pmu_nop_int;
5151 pmu->cancel_txn = perf_pmu_nop_void;
5152 }
5153 }
5154
5155 if (!pmu->pmu_enable) {
5156 pmu->pmu_enable = perf_pmu_nop_void;
5157 pmu->pmu_disable = perf_pmu_nop_void;
5158 }
5159
5160 list_add_rcu(&pmu->entry, &pmus);
5161 ret = 0;
5162unlock:
5163 mutex_unlock(&pmus_lock);
5164
5165 return ret;
5166
5167free_pdc:
5168 free_percpu(pmu->pmu_disable_count);
5169 goto unlock;
5170}
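
A driver that leaves start_txn/commit_txn/cancel_txn (and even pmu_enable/pmu_disable) unset gets the stub callbacks above installed for free by perf_pmu_register(). A minimal registration sketch under that assumption; the demo_* names are hypothetical and not part of this patch:

    #include <linux/perf_event.h>
    #include <linux/module.h>

    /* Hypothetical callbacks, for illustration only. */
    static int demo_event_init(struct perf_event *event)
    {
            return -ENOENT;         /* a real driver matches its own event type here */
    }
    static int  demo_event_add(struct perf_event *event, int flags)   { return 0; }
    static void demo_event_del(struct perf_event *event, int flags)   { }
    static void demo_event_start(struct perf_event *event, int flags) { }
    static void demo_event_stop(struct perf_event *event, int flags)  { }
    static void demo_event_read(struct perf_event *event)             { }

    static struct pmu demo_pmu = {
            .task_ctx_nr = perf_sw_context,
            .event_init  = demo_event_init,
            .add         = demo_event_add,
            .del         = demo_event_del,
            .start       = demo_event_start,
            .stop        = demo_event_stop,
            .read        = demo_event_read,
            /* start_txn/commit_txn/cancel_txn and pmu_enable/pmu_disable are
             * left NULL on purpose: perf_pmu_register() fills in the stubs. */
    };

    static int __init demo_pmu_init(void)
    {
            return perf_pmu_register(&demo_pmu);
    }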
5171
5172void perf_pmu_unregister(struct pmu *pmu)
5173{
5174 mutex_lock(&pmus_lock);
5175 list_del_rcu(&pmu->entry);
5176 mutex_unlock(&pmus_lock);
5177
5178 /*
5179 * We dereference the pmu list under both SRCU and regular RCU, so
5180 * synchronize against both of those.
5181 */
5182 synchronize_srcu(&pmus_srcu);
5183 synchronize_rcu();
5184
5185 free_percpu(pmu->pmu_disable_count);
5186 free_pmu_context(pmu->pmu_cpu_context);
5187}
5188
5189struct pmu *perf_init_event(struct perf_event *event)
5190{
5191 struct pmu *pmu = NULL;
5192 int idx;
5193
5194 idx = srcu_read_lock(&pmus_srcu);
5195 list_for_each_entry_rcu(pmu, &pmus, entry) {
5196 int ret = pmu->event_init(event);
5197 if (!ret)
5198 goto unlock;
5199
5200 if (ret != -ENOENT) {
5201 pmu = ERR_PTR(ret);
5202 goto unlock;
4791 } 5203 }
4792 pmu = &perf_ops_generic;
4793 break;
4794 } 5204 }
5205 pmu = ERR_PTR(-ENOENT);
5206unlock:
5207 srcu_read_unlock(&pmus_srcu, idx);
4795 5208
4796 return pmu; 5209 return pmu;
4797} 5210}
@@ -4800,20 +5213,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4800 * Allocate and initialize an event structure 5213 * Allocate and initialize an event structure
4801 */ 5214 */
4802static struct perf_event * 5215static struct perf_event *
4803perf_event_alloc(struct perf_event_attr *attr, 5216perf_event_alloc(struct perf_event_attr *attr, int cpu,
4804 int cpu, 5217 struct task_struct *task,
4805 struct perf_event_context *ctx, 5218 struct perf_event *group_leader,
4806 struct perf_event *group_leader, 5219 struct perf_event *parent_event,
4807 struct perf_event *parent_event, 5220 perf_overflow_handler_t overflow_handler)
4808 perf_overflow_handler_t overflow_handler, 5221{
4809 gfp_t gfpflags) 5222 struct pmu *pmu;
4810{
4811 const struct pmu *pmu;
4812 struct perf_event *event; 5223 struct perf_event *event;
4813 struct hw_perf_event *hwc; 5224 struct hw_perf_event *hwc;
4814 long err; 5225 long err;
4815 5226
4816 event = kzalloc(sizeof(*event), gfpflags); 5227 event = kzalloc(sizeof(*event), GFP_KERNEL);
4817 if (!event) 5228 if (!event)
4818 return ERR_PTR(-ENOMEM); 5229 return ERR_PTR(-ENOMEM);
4819 5230
@@ -4831,6 +5242,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4831 INIT_LIST_HEAD(&event->event_entry); 5242 INIT_LIST_HEAD(&event->event_entry);
4832 INIT_LIST_HEAD(&event->sibling_list); 5243 INIT_LIST_HEAD(&event->sibling_list);
4833 init_waitqueue_head(&event->waitq); 5244 init_waitqueue_head(&event->waitq);
5245 init_irq_work(&event->pending, perf_pending_event);
4834 5246
4835 mutex_init(&event->mmap_mutex); 5247 mutex_init(&event->mmap_mutex);
4836 5248
@@ -4838,7 +5250,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4838 event->attr = *attr; 5250 event->attr = *attr;
4839 event->group_leader = group_leader; 5251 event->group_leader = group_leader;
4840 event->pmu = NULL; 5252 event->pmu = NULL;
4841 event->ctx = ctx;
4842 event->oncpu = -1; 5253 event->oncpu = -1;
4843 5254
4844 event->parent = parent_event; 5255 event->parent = parent_event;
@@ -4848,6 +5259,17 @@ perf_event_alloc(struct perf_event_attr *attr,
4848 5259
4849 event->state = PERF_EVENT_STATE_INACTIVE; 5260 event->state = PERF_EVENT_STATE_INACTIVE;
4850 5261
5262 if (task) {
5263 event->attach_state = PERF_ATTACH_TASK;
5264#ifdef CONFIG_HAVE_HW_BREAKPOINT
5265 /*
5266 * hw_breakpoint is a bit difficult here..
5267 */
5268 if (attr->type == PERF_TYPE_BREAKPOINT)
5269 event->hw.bp_target = task;
5270#endif
5271 }
5272
4851 if (!overflow_handler && parent_event) 5273 if (!overflow_handler && parent_event)
4852 overflow_handler = parent_event->overflow_handler; 5274 overflow_handler = parent_event->overflow_handler;
4853 5275
@@ -4864,7 +5286,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4864 hwc->sample_period = 1; 5286 hwc->sample_period = 1;
4865 hwc->last_period = hwc->sample_period; 5287 hwc->last_period = hwc->sample_period;
4866 5288
4867 atomic64_set(&hwc->period_left, hwc->sample_period); 5289 local64_set(&hwc->period_left, hwc->sample_period);
4868 5290
4869 /* 5291 /*
4870 * we currently do not support PERF_FORMAT_GROUP on inherited events 5292 * we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -4872,29 +5294,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4872 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 5294 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4873 goto done; 5295 goto done;
4874 5296
4875 switch (attr->type) { 5297 pmu = perf_init_event(event);
4876 case PERF_TYPE_RAW:
4877 case PERF_TYPE_HARDWARE:
4878 case PERF_TYPE_HW_CACHE:
4879 pmu = hw_perf_event_init(event);
4880 break;
4881
4882 case PERF_TYPE_SOFTWARE:
4883 pmu = sw_perf_event_init(event);
4884 break;
4885 5298
4886 case PERF_TYPE_TRACEPOINT:
4887 pmu = tp_perf_event_init(event);
4888 break;
4889
4890 case PERF_TYPE_BREAKPOINT:
4891 pmu = bp_perf_event_init(event);
4892 break;
4893
4894
4895 default:
4896 break;
4897 }
4898done: 5299done:
4899 err = 0; 5300 err = 0;
4900 if (!pmu) 5301 if (!pmu)
@@ -4912,13 +5313,21 @@ done:
4912 event->pmu = pmu; 5313 event->pmu = pmu;
4913 5314
4914 if (!event->parent) { 5315 if (!event->parent) {
4915 atomic_inc(&nr_events); 5316 if (event->attach_state & PERF_ATTACH_TASK)
4916 if (event->attr.mmap) 5317 jump_label_inc(&perf_task_events);
5318 if (event->attr.mmap || event->attr.mmap_data)
4917 atomic_inc(&nr_mmap_events); 5319 atomic_inc(&nr_mmap_events);
4918 if (event->attr.comm) 5320 if (event->attr.comm)
4919 atomic_inc(&nr_comm_events); 5321 atomic_inc(&nr_comm_events);
4920 if (event->attr.task) 5322 if (event->attr.task)
4921 atomic_inc(&nr_task_events); 5323 atomic_inc(&nr_task_events);
5324 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5325 err = get_callchain_buffers();
5326 if (err) {
5327 free_event(event);
5328 return ERR_PTR(err);
5329 }
5330 }
4922 } 5331 }
4923 5332
4924 return event; 5333 return event;
@@ -5007,7 +5416,7 @@ err_size:
5007static int 5416static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5417perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5009{ 5418{
5010 struct perf_mmap_data *data = NULL, *old_data = NULL; 5419 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
5011 int ret = -EINVAL; 5420 int ret = -EINVAL;
5012 5421
5013 if (!output_event) 5422 if (!output_event)
@@ -5037,19 +5446,19 @@ set:
5037 5446
5038 if (output_event) { 5447 if (output_event) {
5039 /* get the buffer we want to redirect to */ 5448 /* get the buffer we want to redirect to */
5040 data = perf_mmap_data_get(output_event); 5449 buffer = perf_buffer_get(output_event);
5041 if (!data) 5450 if (!buffer)
5042 goto unlock; 5451 goto unlock;
5043 } 5452 }
5044 5453
5045 old_data = event->data; 5454 old_buffer = event->buffer;
5046 rcu_assign_pointer(event->data, data); 5455 rcu_assign_pointer(event->buffer, buffer);
5047 ret = 0; 5456 ret = 0;
5048unlock: 5457unlock:
5049 mutex_unlock(&event->mmap_mutex); 5458 mutex_unlock(&event->mmap_mutex);
5050 5459
5051 if (old_data) 5460 if (old_buffer)
5052 perf_mmap_data_put(old_data); 5461 perf_buffer_put(old_buffer);
5053out: 5462out:
5054 return ret; 5463 return ret;
5055} 5464}
@@ -5066,12 +5475,16 @@ SYSCALL_DEFINE5(perf_event_open,
5066 struct perf_event_attr __user *, attr_uptr, 5475 struct perf_event_attr __user *, attr_uptr,
5067 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5476 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5068{ 5477{
5069 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 5478 struct perf_event *group_leader = NULL, *output_event = NULL;
5479 struct perf_event *event, *sibling;
5070 struct perf_event_attr attr; 5480 struct perf_event_attr attr;
5071 struct perf_event_context *ctx; 5481 struct perf_event_context *ctx;
5072 struct file *event_file = NULL; 5482 struct file *event_file = NULL;
5073 struct file *group_file = NULL; 5483 struct file *group_file = NULL;
5484 struct task_struct *task = NULL;
5485 struct pmu *pmu;
5074 int event_fd; 5486 int event_fd;
5487 int move_group = 0;
5075 int fput_needed = 0; 5488 int fput_needed = 0;
5076 int err; 5489 int err;
5077 5490
@@ -5097,20 +5510,11 @@ SYSCALL_DEFINE5(perf_event_open,
5097 if (event_fd < 0) 5510 if (event_fd < 0)
5098 return event_fd; 5511 return event_fd;
5099 5512
5100 /*
5101 * Get the target context (task or percpu):
5102 */
5103 ctx = find_get_context(pid, cpu);
5104 if (IS_ERR(ctx)) {
5105 err = PTR_ERR(ctx);
5106 goto err_fd;
5107 }
5108
5109 if (group_fd != -1) { 5513 if (group_fd != -1) {
5110 group_leader = perf_fget_light(group_fd, &fput_needed); 5514 group_leader = perf_fget_light(group_fd, &fput_needed);
5111 if (IS_ERR(group_leader)) { 5515 if (IS_ERR(group_leader)) {
5112 err = PTR_ERR(group_leader); 5516 err = PTR_ERR(group_leader);
5113 goto err_put_context; 5517 goto err_fd;
5114 } 5518 }
5115 group_file = group_leader->filp; 5519 group_file = group_leader->filp;
5116 if (flags & PERF_FLAG_FD_OUTPUT) 5520 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5119,6 +5523,58 @@ SYSCALL_DEFINE5(perf_event_open,
5119 group_leader = NULL; 5523 group_leader = NULL;
5120 } 5524 }
5121 5525
5526 if (pid != -1) {
5527 task = find_lively_task_by_vpid(pid);
5528 if (IS_ERR(task)) {
5529 err = PTR_ERR(task);
5530 goto err_group_fd;
5531 }
5532 }
5533
5534 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
5535 if (IS_ERR(event)) {
5536 err = PTR_ERR(event);
5537 goto err_task;
5538 }
5539
5540 /*
5541 * Special case software events and allow them to be part of
5542 * any hardware group.
5543 */
5544 pmu = event->pmu;
5545
5546 if (group_leader &&
5547 (is_software_event(event) != is_software_event(group_leader))) {
5548 if (is_software_event(event)) {
5549 /*
5550 * If event and group_leader are not both a software
5551 * event, and event is, then group leader is not.
5552 *
5553 * Allow the addition of software events to !software
5554 * groups; this is safe because software events never
5555 * fail to schedule.
5556 */
5557 pmu = group_leader->pmu;
5558 } else if (is_software_event(group_leader) &&
5559 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
5560 /*
5561 * In case the group is a pure software group, and we
5562 * try to add a hardware event, move the whole group to
5563 * the hardware context.
5564 */
5565 move_group = 1;
5566 }
5567 }
5568
5569 /*
5570 * Get the target context (task or percpu):
5571 */
5572 ctx = find_get_context(pmu, task, cpu);
5573 if (IS_ERR(ctx)) {
5574 err = PTR_ERR(ctx);
5575 goto err_alloc;
5576 }
5577
5122 /* 5578 /*
5123 * Look up the group leader (we will attach this event to it): 5579 * Look up the group leader (we will attach this event to it):
5124 */ 5580 */
@@ -5130,42 +5586,66 @@ SYSCALL_DEFINE5(perf_event_open,
5130 * becoming part of another group-sibling): 5586 * becoming part of another group-sibling):
5131 */ 5587 */
5132 if (group_leader->group_leader != group_leader) 5588 if (group_leader->group_leader != group_leader)
5133 goto err_put_context; 5589 goto err_context;
5134 /* 5590 /*
5135 * Do not allow attaching to a group in a different 5591 * Do not allow attaching to a group in a different
5136 * task or CPU context: 5592 * task or CPU context:
5137 */ 5593 */
5138 if (group_leader->ctx != ctx) 5594 if (move_group) {
5139 goto err_put_context; 5595 if (group_leader->ctx->type != ctx->type)
5596 goto err_context;
5597 } else {
5598 if (group_leader->ctx != ctx)
5599 goto err_context;
5600 }
5601
5140 /* 5602 /*
5141 * Only a group leader can be exclusive or pinned 5603 * Only a group leader can be exclusive or pinned
5142 */ 5604 */
5143 if (attr.exclusive || attr.pinned) 5605 if (attr.exclusive || attr.pinned)
5144 goto err_put_context; 5606 goto err_context;
5145 }
5146
5147 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5148 NULL, NULL, GFP_KERNEL);
5149 if (IS_ERR(event)) {
5150 err = PTR_ERR(event);
5151 goto err_put_context;
5152 } 5607 }
5153 5608
5154 if (output_event) { 5609 if (output_event) {
5155 err = perf_event_set_output(event, output_event); 5610 err = perf_event_set_output(event, output_event);
5156 if (err) 5611 if (err)
5157 goto err_free_put_context; 5612 goto err_context;
5158 } 5613 }
5159 5614
5160 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5615 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5161 if (IS_ERR(event_file)) { 5616 if (IS_ERR(event_file)) {
5162 err = PTR_ERR(event_file); 5617 err = PTR_ERR(event_file);
5163 goto err_free_put_context; 5618 goto err_context;
5619 }
5620
5621 if (move_group) {
5622 struct perf_event_context *gctx = group_leader->ctx;
5623
5624 mutex_lock(&gctx->mutex);
5625 perf_event_remove_from_context(group_leader);
5626 list_for_each_entry(sibling, &group_leader->sibling_list,
5627 group_entry) {
5628 perf_event_remove_from_context(sibling);
5629 put_ctx(gctx);
5630 }
5631 mutex_unlock(&gctx->mutex);
5632 put_ctx(gctx);
5164 } 5633 }
5165 5634
5166 event->filp = event_file; 5635 event->filp = event_file;
5167 WARN_ON_ONCE(ctx->parent_ctx); 5636 WARN_ON_ONCE(ctx->parent_ctx);
5168 mutex_lock(&ctx->mutex); 5637 mutex_lock(&ctx->mutex);
5638
5639 if (move_group) {
5640 perf_install_in_context(ctx, group_leader, cpu);
5641 get_ctx(ctx);
5642 list_for_each_entry(sibling, &group_leader->sibling_list,
5643 group_entry) {
5644 perf_install_in_context(ctx, sibling, cpu);
5645 get_ctx(ctx);
5646 }
5647 }
5648
5169 perf_install_in_context(ctx, event, cpu); 5649 perf_install_in_context(ctx, event, cpu);
5170 ++ctx->generation; 5650 ++ctx->generation;
5171 mutex_unlock(&ctx->mutex); 5651 mutex_unlock(&ctx->mutex);
@@ -5186,11 +5666,15 @@ SYSCALL_DEFINE5(perf_event_open,
5186 fd_install(event_fd, event_file); 5666 fd_install(event_fd, event_file);
5187 return event_fd; 5667 return event_fd;
5188 5668
5189err_free_put_context: 5669err_context:
5670 put_ctx(ctx);
5671err_alloc:
5190 free_event(event); 5672 free_event(event);
5191err_put_context: 5673err_task:
5674 if (task)
5675 put_task_struct(task);
5676err_group_fd:
5192 fput_light(group_file, fput_needed); 5677 fput_light(group_file, fput_needed);
5193 put_ctx(ctx);
5194err_fd: 5678err_fd:
5195 put_unused_fd(event_fd); 5679 put_unused_fd(event_fd);
5196 return err; 5680 return err;
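
For reference, the grouping rules above are what make the following userspace pattern work: a software sibling can always join a hardware-led group, and with move_group a hardware event can now pull a pure-software group over into the hardware context. A hedged sketch with error handling trimmed:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int perf_open(struct perf_event_attr *attr, int group_fd)
    {
            return syscall(__NR_perf_event_open, attr, 0 /* this task */,
                           -1 /* any cpu */, group_fd, 0);
    }

    int main(void)
    {
            struct perf_event_attr hw = {
                    .type   = PERF_TYPE_HARDWARE,
                    .size   = sizeof(struct perf_event_attr),
                    .config = PERF_COUNT_HW_CPU_CYCLES,
            };
            struct perf_event_attr sw = {
                    .type   = PERF_TYPE_SOFTWARE,
                    .size   = sizeof(struct perf_event_attr),
                    .config = PERF_COUNT_SW_CONTEXT_SWITCHES,
            };
            int leader  = perf_open(&hw, -1);       /* hardware group leader */
            int sibling = perf_open(&sw, leader);   /* software sibling, always accepted */

            return (leader < 0 || sibling < 0) ? 1 : 0;
    }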
@@ -5201,32 +5685,31 @@ err_fd:
5201 * 5685 *
5202 * @attr: attributes of the counter to create 5686 * @attr: attributes of the counter to create
5203 * @cpu: cpu in which the counter is bound 5687 * @cpu: cpu in which the counter is bound
5204 * @pid: task to profile 5688 * @task: task to profile (NULL for percpu)
5205 */ 5689 */
5206struct perf_event * 5690struct perf_event *
5207perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 5691perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5208 pid_t pid, 5692 struct task_struct *task,
5209 perf_overflow_handler_t overflow_handler) 5693 perf_overflow_handler_t overflow_handler)
5210{ 5694{
5211 struct perf_event *event;
5212 struct perf_event_context *ctx; 5695 struct perf_event_context *ctx;
5696 struct perf_event *event;
5213 int err; 5697 int err;
5214 5698
5215 /* 5699 /*
5216 * Get the target context (task or percpu): 5700 * Get the target context (task or percpu):
5217 */ 5701 */
5218 5702
5219 ctx = find_get_context(pid, cpu); 5703 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
5220 if (IS_ERR(ctx)) {
5221 err = PTR_ERR(ctx);
5222 goto err_exit;
5223 }
5224
5225 event = perf_event_alloc(attr, cpu, ctx, NULL,
5226 NULL, overflow_handler, GFP_KERNEL);
5227 if (IS_ERR(event)) { 5704 if (IS_ERR(event)) {
5228 err = PTR_ERR(event); 5705 err = PTR_ERR(event);
5229 goto err_put_context; 5706 goto err;
5707 }
5708
5709 ctx = find_get_context(event->pmu, task, cpu);
5710 if (IS_ERR(ctx)) {
5711 err = PTR_ERR(ctx);
5712 goto err_free;
5230 } 5713 }
5231 5714
5232 event->filp = NULL; 5715 event->filp = NULL;
@@ -5244,112 +5727,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5244 5727
5245 return event; 5728 return event;
5246 5729
5247 err_put_context: 5730err_free:
5248 put_ctx(ctx); 5731 free_event(event);
5249 err_exit: 5732err:
5250 return ERR_PTR(err); 5733 return ERR_PTR(err);
5251} 5734}
5252EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 5735EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
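
The exported helper now takes a task_struct pointer instead of a pid, so in-kernel users pass the task directly (or NULL for a per-cpu counter). A rough caller sketch under the new signature; demo_task_counter and the attribute values are illustrative only, and the overflow handler is omitted:

    static struct perf_event *demo_task_counter(struct task_struct *task)
    {
            struct perf_event_attr attr = {
                    .type   = PERF_TYPE_SOFTWARE,
                    .size   = sizeof(struct perf_event_attr),
                    .config = PERF_COUNT_SW_TASK_CLOCK,
            };

            /* cpu == -1: the counter follows the task rather than a cpu */
            return perf_event_create_kernel_counter(&attr, -1, task, NULL);
    }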
5253 5736
5254/*
5255 * inherit a event from parent task to child task:
5256 */
5257static struct perf_event *
5258inherit_event(struct perf_event *parent_event,
5259 struct task_struct *parent,
5260 struct perf_event_context *parent_ctx,
5261 struct task_struct *child,
5262 struct perf_event *group_leader,
5263 struct perf_event_context *child_ctx)
5264{
5265 struct perf_event *child_event;
5266
5267 /*
5268 * Instead of creating recursive hierarchies of events,
5269 * we link inherited events back to the original parent,
5270 * which has a filp for sure, which we use as the reference
5271 * count:
5272 */
5273 if (parent_event->parent)
5274 parent_event = parent_event->parent;
5275
5276 child_event = perf_event_alloc(&parent_event->attr,
5277 parent_event->cpu, child_ctx,
5278 group_leader, parent_event,
5279 NULL, GFP_KERNEL);
5280 if (IS_ERR(child_event))
5281 return child_event;
5282 get_ctx(child_ctx);
5283
5284 /*
5285 * Make the child state follow the state of the parent event,
5286 * not its attr.disabled bit. We hold the parent's mutex,
5287 * so we won't race with perf_event_{en, dis}able_family.
5288 */
5289 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5290 child_event->state = PERF_EVENT_STATE_INACTIVE;
5291 else
5292 child_event->state = PERF_EVENT_STATE_OFF;
5293
5294 if (parent_event->attr.freq) {
5295 u64 sample_period = parent_event->hw.sample_period;
5296 struct hw_perf_event *hwc = &child_event->hw;
5297
5298 hwc->sample_period = sample_period;
5299 hwc->last_period = sample_period;
5300
5301 atomic64_set(&hwc->period_left, sample_period);
5302 }
5303
5304 child_event->overflow_handler = parent_event->overflow_handler;
5305
5306 /*
5307 * Link it up in the child's context:
5308 */
5309 add_event_to_ctx(child_event, child_ctx);
5310
5311 /*
5312 * Get a reference to the parent filp - we will fput it
5313 * when the child event exits. This is safe to do because
5314 * we are in the parent and we know that the filp still
5315 * exists and has a nonzero count:
5316 */
5317 atomic_long_inc(&parent_event->filp->f_count);
5318
5319 /*
5320 * Link this into the parent event's child list
5321 */
5322 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5323 mutex_lock(&parent_event->child_mutex);
5324 list_add_tail(&child_event->child_list, &parent_event->child_list);
5325 mutex_unlock(&parent_event->child_mutex);
5326
5327 return child_event;
5328}
5329
5330static int inherit_group(struct perf_event *parent_event,
5331 struct task_struct *parent,
5332 struct perf_event_context *parent_ctx,
5333 struct task_struct *child,
5334 struct perf_event_context *child_ctx)
5335{
5336 struct perf_event *leader;
5337 struct perf_event *sub;
5338 struct perf_event *child_ctr;
5339
5340 leader = inherit_event(parent_event, parent, parent_ctx,
5341 child, NULL, child_ctx);
5342 if (IS_ERR(leader))
5343 return PTR_ERR(leader);
5344 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5345 child_ctr = inherit_event(sub, parent, parent_ctx,
5346 child, leader, child_ctx);
5347 if (IS_ERR(child_ctr))
5348 return PTR_ERR(child_ctr);
5349 }
5350 return 0;
5351}
5352
5353static void sync_child_event(struct perf_event *child_event, 5737static void sync_child_event(struct perf_event *child_event,
5354 struct task_struct *child) 5738 struct task_struct *child)
5355{ 5739{
@@ -5359,12 +5743,12 @@ static void sync_child_event(struct perf_event *child_event,
5359 if (child_event->attr.inherit_stat) 5743 if (child_event->attr.inherit_stat)
5360 perf_event_read_event(child_event, child); 5744 perf_event_read_event(child_event, child);
5361 5745
5362 child_val = atomic64_read(&child_event->count); 5746 child_val = perf_event_count(child_event);
5363 5747
5364 /* 5748 /*
5365 * Add back the child's count to the parent's count: 5749 * Add back the child's count to the parent's count:
5366 */ 5750 */
5367 atomic64_add(child_val, &parent_event->count); 5751 atomic64_add(child_val, &parent_event->child_count);
5368 atomic64_add(child_event->total_time_enabled, 5752 atomic64_add(child_event->total_time_enabled,
5369 &parent_event->child_total_time_enabled); 5753 &parent_event->child_total_time_enabled);
5370 atomic64_add(child_event->total_time_running, 5754 atomic64_add(child_event->total_time_running,
@@ -5406,16 +5790,13 @@ __perf_event_exit_task(struct perf_event *child_event,
5406 } 5790 }
5407} 5791}
5408 5792
5409/* 5793static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5410 * When a child task exits, feed back event values to parent events.
5411 */
5412void perf_event_exit_task(struct task_struct *child)
5413{ 5794{
5414 struct perf_event *child_event, *tmp; 5795 struct perf_event *child_event, *tmp;
5415 struct perf_event_context *child_ctx; 5796 struct perf_event_context *child_ctx;
5416 unsigned long flags; 5797 unsigned long flags;
5417 5798
5418 if (likely(!child->perf_event_ctxp)) { 5799 if (likely(!child->perf_event_ctxp[ctxn])) {
5419 perf_event_task(child, NULL, 0); 5800 perf_event_task(child, NULL, 0);
5420 return; 5801 return;
5421 } 5802 }
@@ -5427,8 +5808,8 @@ void perf_event_exit_task(struct task_struct *child)
5427 * scheduled, so we are now safe from rescheduling changing 5808 * scheduled, so we are now safe from rescheduling changing
5428 * our context. 5809 * our context.
5429 */ 5810 */
5430 child_ctx = child->perf_event_ctxp; 5811 child_ctx = child->perf_event_ctxp[ctxn];
5431 __perf_event_task_sched_out(child_ctx); 5812 task_ctx_sched_out(child_ctx, EVENT_ALL);
5432 5813
5433 /* 5814 /*
5434 * Take the context lock here so that if find_get_context is 5815 * Take the context lock here so that if find_get_context is
@@ -5436,7 +5817,7 @@ void perf_event_exit_task(struct task_struct *child)
5436 * incremented the context's refcount before we do put_ctx below. 5817 * incremented the context's refcount before we do put_ctx below.
5437 */ 5818 */
5438 raw_spin_lock(&child_ctx->lock); 5819 raw_spin_lock(&child_ctx->lock);
5439 child->perf_event_ctxp = NULL; 5820 child->perf_event_ctxp[ctxn] = NULL;
5440 /* 5821 /*
5441 * If this context is a clone; unclone it so it can't get 5822 * If this context is a clone; unclone it so it can't get
5442 * swapped to another process while we're removing all 5823 * swapped to another process while we're removing all
@@ -5489,6 +5870,17 @@ again:
5489 put_ctx(child_ctx); 5870 put_ctx(child_ctx);
5490} 5871}
5491 5872
5873/*
5874 * When a child task exits, feed back event values to parent events.
5875 */
5876void perf_event_exit_task(struct task_struct *child)
5877{
5878 int ctxn;
5879
5880 for_each_task_context_nr(ctxn)
5881 perf_event_exit_task_context(child, ctxn);
5882}
5883
5492static void perf_free_event(struct perf_event *event, 5884static void perf_free_event(struct perf_event *event,
5493 struct perf_event_context *ctx) 5885 struct perf_event_context *ctx)
5494{ 5886{
@@ -5510,48 +5902,166 @@ static void perf_free_event(struct perf_event *event,
5510 5902
5511/* 5903/*
5512 * free an unexposed, unused context as created by inheritance by 5904 * free an unexposed, unused context as created by inheritance by
5513 * init_task below, used by fork() in case of failure. 5905 * perf_event_init_task below, used by fork() in case of failure.
5514 */ 5906 */
5515void perf_event_free_task(struct task_struct *task) 5907void perf_event_free_task(struct task_struct *task)
5516{ 5908{
5517 struct perf_event_context *ctx = task->perf_event_ctxp; 5909 struct perf_event_context *ctx;
5518 struct perf_event *event, *tmp; 5910 struct perf_event *event, *tmp;
5911 int ctxn;
5519 5912
5520 if (!ctx) 5913 for_each_task_context_nr(ctxn) {
5521 return; 5914 ctx = task->perf_event_ctxp[ctxn];
5915 if (!ctx)
5916 continue;
5522 5917
5523 mutex_lock(&ctx->mutex); 5918 mutex_lock(&ctx->mutex);
5524again: 5919again:
5525 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5920 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5526 perf_free_event(event, ctx); 5921 group_entry)
5922 perf_free_event(event, ctx);
5527 5923
5528 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 5924 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5529 group_entry) 5925 group_entry)
5530 perf_free_event(event, ctx); 5926 perf_free_event(event, ctx);
5531 5927
5532 if (!list_empty(&ctx->pinned_groups) || 5928 if (!list_empty(&ctx->pinned_groups) ||
5533 !list_empty(&ctx->flexible_groups)) 5929 !list_empty(&ctx->flexible_groups))
5534 goto again; 5930 goto again;
5535 5931
5536 mutex_unlock(&ctx->mutex); 5932 mutex_unlock(&ctx->mutex);
5537 5933
5538 put_ctx(ctx); 5934 put_ctx(ctx);
5935 }
5936}
5937
5938void perf_event_delayed_put(struct task_struct *task)
5939{
5940 int ctxn;
5941
5942 for_each_task_context_nr(ctxn)
5943 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
5944}
5945
5946/*
5947 * inherit an event from parent task to child task:
5948 */
5949static struct perf_event *
5950inherit_event(struct perf_event *parent_event,
5951 struct task_struct *parent,
5952 struct perf_event_context *parent_ctx,
5953 struct task_struct *child,
5954 struct perf_event *group_leader,
5955 struct perf_event_context *child_ctx)
5956{
5957 struct perf_event *child_event;
5958 unsigned long flags;
5959
5960 /*
5961 * Instead of creating recursive hierarchies of events,
5962 * we link inherited events back to the original parent,
5963 * which has a filp for sure, which we use as the reference
5964 * count:
5965 */
5966 if (parent_event->parent)
5967 parent_event = parent_event->parent;
5968
5969 child_event = perf_event_alloc(&parent_event->attr,
5970 parent_event->cpu,
5971 child,
5972 group_leader, parent_event,
5973 NULL);
5974 if (IS_ERR(child_event))
5975 return child_event;
5976 get_ctx(child_ctx);
5977
5978 /*
5979 * Make the child state follow the state of the parent event,
5980 * not its attr.disabled bit. We hold the parent's mutex,
5981 * so we won't race with perf_event_{en, dis}able_family.
5982 */
5983 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5984 child_event->state = PERF_EVENT_STATE_INACTIVE;
5985 else
5986 child_event->state = PERF_EVENT_STATE_OFF;
5987
5988 if (parent_event->attr.freq) {
5989 u64 sample_period = parent_event->hw.sample_period;
5990 struct hw_perf_event *hwc = &child_event->hw;
5991
5992 hwc->sample_period = sample_period;
5993 hwc->last_period = sample_period;
5994
5995 local64_set(&hwc->period_left, sample_period);
5996 }
5997
5998 child_event->ctx = child_ctx;
5999 child_event->overflow_handler = parent_event->overflow_handler;
6000
6001 /*
6002 * Link it up in the child's context:
6003 */
6004 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6005 add_event_to_ctx(child_event, child_ctx);
6006 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6007
6008 /*
6009 * Get a reference to the parent filp - we will fput it
6010 * when the child event exits. This is safe to do because
6011 * we are in the parent and we know that the filp still
6012 * exists and has a nonzero count:
6013 */
6014 atomic_long_inc(&parent_event->filp->f_count);
6015
6016 /*
6017 * Link this into the parent event's child list
6018 */
6019 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6020 mutex_lock(&parent_event->child_mutex);
6021 list_add_tail(&child_event->child_list, &parent_event->child_list);
6022 mutex_unlock(&parent_event->child_mutex);
6023
6024 return child_event;
6025}
6026
6027static int inherit_group(struct perf_event *parent_event,
6028 struct task_struct *parent,
6029 struct perf_event_context *parent_ctx,
6030 struct task_struct *child,
6031 struct perf_event_context *child_ctx)
6032{
6033 struct perf_event *leader;
6034 struct perf_event *sub;
6035 struct perf_event *child_ctr;
6036
6037 leader = inherit_event(parent_event, parent, parent_ctx,
6038 child, NULL, child_ctx);
6039 if (IS_ERR(leader))
6040 return PTR_ERR(leader);
6041 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6042 child_ctr = inherit_event(sub, parent, parent_ctx,
6043 child, leader, child_ctx);
6044 if (IS_ERR(child_ctr))
6045 return PTR_ERR(child_ctr);
6046 }
6047 return 0;
5539} 6048}
5540 6049
5541static int 6050static int
5542inherit_task_group(struct perf_event *event, struct task_struct *parent, 6051inherit_task_group(struct perf_event *event, struct task_struct *parent,
5543 struct perf_event_context *parent_ctx, 6052 struct perf_event_context *parent_ctx,
5544 struct task_struct *child, 6053 struct task_struct *child, int ctxn,
5545 int *inherited_all) 6054 int *inherited_all)
5546{ 6055{
5547 int ret; 6056 int ret;
5548 struct perf_event_context *child_ctx = child->perf_event_ctxp; 6057 struct perf_event_context *child_ctx;
5549 6058
5550 if (!event->attr.inherit) { 6059 if (!event->attr.inherit) {
5551 *inherited_all = 0; 6060 *inherited_all = 0;
5552 return 0; 6061 return 0;
5553 } 6062 }
5554 6063
6064 child_ctx = child->perf_event_ctxp[ctxn];
5555 if (!child_ctx) { 6065 if (!child_ctx) {
5556 /* 6066 /*
5557 * This is executed from the parent task context, so 6067 * This is executed from the parent task context, so
@@ -5560,14 +6070,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5560 * child. 6070 * child.
5561 */ 6071 */
5562 6072
5563 child_ctx = kzalloc(sizeof(struct perf_event_context), 6073 child_ctx = alloc_perf_context(event->pmu, child);
5564 GFP_KERNEL);
5565 if (!child_ctx) 6074 if (!child_ctx)
5566 return -ENOMEM; 6075 return -ENOMEM;
5567 6076
5568 __perf_event_init_context(child_ctx, child); 6077 child->perf_event_ctxp[ctxn] = child_ctx;
5569 child->perf_event_ctxp = child_ctx;
5570 get_task_struct(child);
5571 } 6078 }
5572 6079
5573 ret = inherit_group(event, parent, parent_ctx, 6080 ret = inherit_group(event, parent, parent_ctx,
@@ -5579,11 +6086,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5579 return ret; 6086 return ret;
5580} 6087}
5581 6088
5582
5583/* 6089/*
5584 * Initialize the perf_event context in task_struct 6090 * Initialize the perf_event context in task_struct
5585 */ 6091 */
5586int perf_event_init_task(struct task_struct *child) 6092int perf_event_init_context(struct task_struct *child, int ctxn)
5587{ 6093{
5588 struct perf_event_context *child_ctx, *parent_ctx; 6094 struct perf_event_context *child_ctx, *parent_ctx;
5589 struct perf_event_context *cloned_ctx; 6095 struct perf_event_context *cloned_ctx;
@@ -5592,19 +6098,19 @@ int perf_event_init_task(struct task_struct *child)
5592 int inherited_all = 1; 6098 int inherited_all = 1;
5593 int ret = 0; 6099 int ret = 0;
5594 6100
5595 child->perf_event_ctxp = NULL; 6101 child->perf_event_ctxp[ctxn] = NULL;
5596 6102
5597 mutex_init(&child->perf_event_mutex); 6103 mutex_init(&child->perf_event_mutex);
5598 INIT_LIST_HEAD(&child->perf_event_list); 6104 INIT_LIST_HEAD(&child->perf_event_list);
5599 6105
5600 if (likely(!parent->perf_event_ctxp)) 6106 if (likely(!parent->perf_event_ctxp[ctxn]))
5601 return 0; 6107 return 0;
5602 6108
5603 /* 6109 /*
5604 * If the parent's context is a clone, pin it so it won't get 6110 * If the parent's context is a clone, pin it so it won't get
5605 * swapped under us. 6111 * swapped under us.
5606 */ 6112 */
5607 parent_ctx = perf_pin_task_context(parent); 6113 parent_ctx = perf_pin_task_context(parent, ctxn);
5608 6114
5609 /* 6115 /*
5610 * No need to check if parent_ctx != NULL here; since we saw 6116 * No need to check if parent_ctx != NULL here; since we saw
@@ -5624,20 +6130,20 @@ int perf_event_init_task(struct task_struct *child)
5624 * the list, not manipulating it: 6130 * the list, not manipulating it:
5625 */ 6131 */
5626 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 6132 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5627 ret = inherit_task_group(event, parent, parent_ctx, child, 6133 ret = inherit_task_group(event, parent, parent_ctx,
5628 &inherited_all); 6134 child, ctxn, &inherited_all);
5629 if (ret) 6135 if (ret)
5630 break; 6136 break;
5631 } 6137 }
5632 6138
5633 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6139 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5634 ret = inherit_task_group(event, parent, parent_ctx, child, 6140 ret = inherit_task_group(event, parent, parent_ctx,
5635 &inherited_all); 6141 child, ctxn, &inherited_all);
5636 if (ret) 6142 if (ret)
5637 break; 6143 break;
5638 } 6144 }
5639 6145
5640 child_ctx = child->perf_event_ctxp; 6146 child_ctx = child->perf_event_ctxp[ctxn];
5641 6147
5642 if (child_ctx && inherited_all) { 6148 if (child_ctx && inherited_all) {
5643 /* 6149 /*
@@ -5666,63 +6172,98 @@ int perf_event_init_task(struct task_struct *child)
5666 return ret; 6172 return ret;
5667} 6173}
5668 6174
6175/*
6176 * Initialize the perf_event context in task_struct
6177 */
6178int perf_event_init_task(struct task_struct *child)
6179{
6180 int ctxn, ret;
6181
6182 for_each_task_context_nr(ctxn) {
6183 ret = perf_event_init_context(child, ctxn);
6184 if (ret)
6185 return ret;
6186 }
6187
6188 return 0;
6189}
6190
5669static void __init perf_event_init_all_cpus(void) 6191static void __init perf_event_init_all_cpus(void)
5670{ 6192{
6193 struct swevent_htable *swhash;
5671 int cpu; 6194 int cpu;
5672 struct perf_cpu_context *cpuctx;
5673 6195
5674 for_each_possible_cpu(cpu) { 6196 for_each_possible_cpu(cpu) {
5675 cpuctx = &per_cpu(perf_cpu_context, cpu); 6197 swhash = &per_cpu(swevent_htable, cpu);
5676 mutex_init(&cpuctx->hlist_mutex); 6198 mutex_init(&swhash->hlist_mutex);
5677 __perf_event_init_context(&cpuctx->ctx, NULL); 6199 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5678 } 6200 }
5679} 6201}
5680 6202
5681static void __cpuinit perf_event_init_cpu(int cpu) 6203static void __cpuinit perf_event_init_cpu(int cpu)
5682{ 6204{
5683 struct perf_cpu_context *cpuctx; 6205 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5684
5685 cpuctx = &per_cpu(perf_cpu_context, cpu);
5686 6206
5687 spin_lock(&perf_resource_lock); 6207 mutex_lock(&swhash->hlist_mutex);
5688 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 6208 if (swhash->hlist_refcount > 0) {
5689 spin_unlock(&perf_resource_lock);
5690
5691 mutex_lock(&cpuctx->hlist_mutex);
5692 if (cpuctx->hlist_refcount > 0) {
5693 struct swevent_hlist *hlist; 6209 struct swevent_hlist *hlist;
5694 6210
5695 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 6211 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5696 WARN_ON_ONCE(!hlist); 6212 WARN_ON(!hlist);
5697 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 6213 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5698 } 6214 }
5699 mutex_unlock(&cpuctx->hlist_mutex); 6215 mutex_unlock(&swhash->hlist_mutex);
5700} 6216}
5701 6217
5702#ifdef CONFIG_HOTPLUG_CPU 6218#ifdef CONFIG_HOTPLUG_CPU
5703static void __perf_event_exit_cpu(void *info) 6219static void perf_pmu_rotate_stop(struct pmu *pmu)
5704{ 6220{
5705 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 6221 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5706 struct perf_event_context *ctx = &cpuctx->ctx; 6222
6223 WARN_ON(!irqs_disabled());
6224
6225 list_del_init(&cpuctx->rotation_list);
6226}
6227
6228static void __perf_event_exit_context(void *__info)
6229{
6230 struct perf_event_context *ctx = __info;
5707 struct perf_event *event, *tmp; 6231 struct perf_event *event, *tmp;
5708 6232
6233 perf_pmu_rotate_stop(ctx->pmu);
6234
5709 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6235 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5710 __perf_event_remove_from_context(event); 6236 __perf_event_remove_from_context(event);
5711 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 6237 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5712 __perf_event_remove_from_context(event); 6238 __perf_event_remove_from_context(event);
5713} 6239}
6240
6241static void perf_event_exit_cpu_context(int cpu)
6242{
6243 struct perf_event_context *ctx;
6244 struct pmu *pmu;
6245 int idx;
6246
6247 idx = srcu_read_lock(&pmus_srcu);
6248 list_for_each_entry_rcu(pmu, &pmus, entry) {
6249 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
6250
6251 mutex_lock(&ctx->mutex);
6252 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6253 mutex_unlock(&ctx->mutex);
6254 }
6255 srcu_read_unlock(&pmus_srcu, idx);
6256}
6257
5714static void perf_event_exit_cpu(int cpu) 6258static void perf_event_exit_cpu(int cpu)
5715{ 6259{
5716 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 6260 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5717 struct perf_event_context *ctx = &cpuctx->ctx;
5718 6261
5719 mutex_lock(&cpuctx->hlist_mutex); 6262 mutex_lock(&swhash->hlist_mutex);
5720 swevent_hlist_release(cpuctx); 6263 swevent_hlist_release(swhash);
5721 mutex_unlock(&cpuctx->hlist_mutex); 6264 mutex_unlock(&swhash->hlist_mutex);
5722 6265
5723 mutex_lock(&ctx->mutex); 6266 perf_event_exit_cpu_context(cpu);
5724 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5725 mutex_unlock(&ctx->mutex);
5726} 6267}
5727#else 6268#else
5728static inline void perf_event_exit_cpu(int cpu) { } 6269static inline void perf_event_exit_cpu(int cpu) { }
@@ -5733,15 +6274,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5733{ 6274{
5734 unsigned int cpu = (long)hcpu; 6275 unsigned int cpu = (long)hcpu;
5735 6276
5736 switch (action) { 6277 switch (action & ~CPU_TASKS_FROZEN) {
5737 6278
5738 case CPU_UP_PREPARE: 6279 case CPU_UP_PREPARE:
5739 case CPU_UP_PREPARE_FROZEN: 6280 case CPU_DOWN_FAILED:
5740 perf_event_init_cpu(cpu); 6281 perf_event_init_cpu(cpu);
5741 break; 6282 break;
5742 6283
6284 case CPU_UP_CANCELED:
5743 case CPU_DOWN_PREPARE: 6285 case CPU_DOWN_PREPARE:
5744 case CPU_DOWN_PREPARE_FROZEN:
5745 perf_event_exit_cpu(cpu); 6286 perf_event_exit_cpu(cpu);
5746 break; 6287 break;
5747 6288
@@ -5752,118 +6293,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5752 return NOTIFY_OK; 6293 return NOTIFY_OK;
5753} 6294}
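
The switch now masks out CPU_TASKS_FROZEN so the suspend/resume variants share the non-frozen cases. The same folding pattern shown standalone; demo_cpu_notify is an illustrative name, not from this patch:

    static int demo_cpu_notify(struct notifier_block *self,
                               unsigned long action, void *hcpu)
    {
            switch (action & ~CPU_TASKS_FROZEN) {
            case CPU_UP_PREPARE:            /* also covers CPU_UP_PREPARE_FROZEN */
                    /* allocate per-cpu state here */
                    break;
            case CPU_DOWN_PREPARE:          /* also covers CPU_DOWN_PREPARE_FROZEN */
                    /* tear down per-cpu state here */
                    break;
            }
            return NOTIFY_OK;
    }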
5754 6295
5755/*
5756 * This has to have a higher priority than migration_notifier in sched.c.
5757 */
5758static struct notifier_block __cpuinitdata perf_cpu_nb = {
5759 .notifier_call = perf_cpu_notify,
5760 .priority = 20,
5761};
5762
5763void __init perf_event_init(void) 6296void __init perf_event_init(void)
5764{ 6297{
5765 perf_event_init_all_cpus(); 6298 perf_event_init_all_cpus();
5766 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 6299 init_srcu_struct(&pmus_srcu);
5767 (void *)(long)smp_processor_id()); 6300 perf_pmu_register(&perf_swevent);
5768 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 6301 perf_pmu_register(&perf_cpu_clock);
5769 (void *)(long)smp_processor_id()); 6302 perf_pmu_register(&perf_task_clock);
5770 register_cpu_notifier(&perf_cpu_nb); 6303 perf_tp_register();
5771} 6304 perf_cpu_notifier(perf_cpu_notify);
5772
5773static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5774 struct sysdev_class_attribute *attr,
5775 char *buf)
5776{
5777 return sprintf(buf, "%d\n", perf_reserved_percpu);
5778}
5779
5780static ssize_t
5781perf_set_reserve_percpu(struct sysdev_class *class,
5782 struct sysdev_class_attribute *attr,
5783 const char *buf,
5784 size_t count)
5785{
5786 struct perf_cpu_context *cpuctx;
5787 unsigned long val;
5788 int err, cpu, mpt;
5789
5790 err = strict_strtoul(buf, 10, &val);
5791 if (err)
5792 return err;
5793 if (val > perf_max_events)
5794 return -EINVAL;
5795
5796 spin_lock(&perf_resource_lock);
5797 perf_reserved_percpu = val;
5798 for_each_online_cpu(cpu) {
5799 cpuctx = &per_cpu(perf_cpu_context, cpu);
5800 raw_spin_lock_irq(&cpuctx->ctx.lock);
5801 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5802 perf_max_events - perf_reserved_percpu);
5803 cpuctx->max_pertask = mpt;
5804 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5805 }
5806 spin_unlock(&perf_resource_lock);
5807
5808 return count;
5809}
5810
5811static ssize_t perf_show_overcommit(struct sysdev_class *class,
5812 struct sysdev_class_attribute *attr,
5813 char *buf)
5814{
5815 return sprintf(buf, "%d\n", perf_overcommit);
5816}
5817
5818static ssize_t
5819perf_set_overcommit(struct sysdev_class *class,
5820 struct sysdev_class_attribute *attr,
5821 const char *buf, size_t count)
5822{
5823 unsigned long val;
5824 int err;
5825
5826 err = strict_strtoul(buf, 10, &val);
5827 if (err)
5828 return err;
5829 if (val > 1)
5830 return -EINVAL;
5831
5832 spin_lock(&perf_resource_lock);
5833 perf_overcommit = val;
5834 spin_unlock(&perf_resource_lock);
5835
5836 return count;
5837}
5838
5839static SYSDEV_CLASS_ATTR(
5840 reserve_percpu,
5841 0644,
5842 perf_show_reserve_percpu,
5843 perf_set_reserve_percpu
5844 );
5845
5846static SYSDEV_CLASS_ATTR(
5847 overcommit,
5848 0644,
5849 perf_show_overcommit,
5850 perf_set_overcommit
5851 );
5852
5853static struct attribute *perfclass_attrs[] = {
5854 &attr_reserve_percpu.attr,
5855 &attr_overcommit.attr,
5856 NULL
5857};
5858
5859static struct attribute_group perfclass_attr_group = {
5860 .attrs = perfclass_attrs,
5861 .name = "perf_events",
5862};
5863
5864static int __init perf_event_sysfs_init(void)
5865{
5866 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5867 &perfclass_attr_group);
5868} 6305}
5869device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c132d26..39b65b69584f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid)
122 atomic_inc(&map->nr_free); 122 atomic_inc(&map->nr_free);
123} 123}
124 124
125/*
126 * If we started walking pids at 'base', is 'a' seen before 'b'?
127 */
128static int pid_before(int base, int a, int b)
129{
130 /*
131 * This is the same as saying
132 *
133 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
134 * and that mapping orders 'a' and 'b' with respect to 'base'.
135 */
136 return (unsigned)(a - base) < (unsigned)(b - base);
137}
138
139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid.
141 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately.
143 *
144 * Since pids rollover, it is not sufficient to just pick the bigger
145 * value. We have to consider where we started counting from.
146 *
147 * 'base' is the value of pid_ns->last_pid that we observed when
148 * we started looking for a pid.
149 *
150 * 'pid' is the pid that we eventually found.
151 */
152static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
153{
154 int prev;
155 int last_write = base;
156 do {
157 prev = last_write;
158 last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
159 } while ((prev != last_write) && (pid_before(base, last_write, pid)));
160}
161
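
To see the wraparound ordering concretely, here is a small standalone check of the helper above; the pid values are made up for illustration:

    #include <assert.h>

    static int pid_before(int base, int a, int b)   /* copy of the helper above */
    {
            return (unsigned)(a - base) < (unsigned)(b - base);
    }

    int main(void)
    {
            int base = 32760;       /* last_pid observed when the walk started */
            int a = 32765;          /* allocated before the counter wrapped */
            int b = 300;            /* allocated after wrapping past pid_max */

            assert(pid_before(base, a, b));         /* a is seen first in this walk */
            assert(!pid_before(base, b, a));
            return 0;
    }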
125static int alloc_pidmap(struct pid_namespace *pid_ns) 162static int alloc_pidmap(struct pid_namespace *pid_ns)
126{ 163{
127 int i, offset, max_scan, pid, last = pid_ns->last_pid; 164 int i, offset, max_scan, pid, last = pid_ns->last_pid;
@@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
132 pid = RESERVED_PIDS; 169 pid = RESERVED_PIDS;
133 offset = pid & BITS_PER_PAGE_MASK; 170 offset = pid & BITS_PER_PAGE_MASK;
134 map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; 171 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
135 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 172 /*
173 * If last_pid points into the middle of the map->page we
174 * want to scan this bitmap block twice, the second time
175 * we start with offset == 0 (or RESERVED_PIDS).
176 */
177 max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
136 for (i = 0; i <= max_scan; ++i) { 178 for (i = 0; i <= max_scan; ++i) {
137 if (unlikely(!map->page)) { 179 if (unlikely(!map->page)) {
138 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); 180 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
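
Worked example of the new max_scan bound (values assumed for illustration): with pid_max = 32768 and BITS_PER_PAGE = 32768, DIV_ROUND_UP(32768, 32768) = 1, so a walk that starts mid-block (offset != 0) gets max_scan = 1 and scans the single bitmap block twice, the second pass starting from offset 0, while a walk that starts at offset 0 gets max_scan = 0 and a single pass.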
@@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
154 do { 196 do {
155 if (!test_and_set_bit(offset, map->page)) { 197 if (!test_and_set_bit(offset, map->page)) {
156 atomic_dec(&map->nr_free); 198 atomic_dec(&map->nr_free);
157 pid_ns->last_pid = pid; 199 set_last_pid(pid_ns, last, pid);
158 return pid; 200 return pid;
159 } 201 }
160 offset = find_next_offset(map, offset); 202 offset = find_next_offset(map, offset);
161 pid = mk_pid(pid_ns, map, offset); 203 pid = mk_pid(pid_ns, map, offset);
162 /* 204 } while (offset < BITS_PER_PAGE && pid < pid_max);
163 * find_next_offset() found a bit, the pid from it
164 * is in-bounds, and if we fell back to the last
165 * bitmap block and the final block was the same
166 * as the starting point, pid is before last_pid.
167 */
168 } while (offset < BITS_PER_PAGE && pid < pid_max &&
169 (i != max_scan || pid < last ||
170 !((last+1) & BITS_PER_PAGE_MASK)));
171 } 205 }
172 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 206 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
173 ++map; 207 ++map;
@@ -367,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 401 struct task_struct *result = NULL;
368 if (pid) { 402 if (pid) {
369 struct hlist_node *first; 403 struct hlist_node *first;
370 first = rcu_dereference_check(pid->tasks[type].first, 404 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
371 rcu_read_lock_held() || 405 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held()); 406 lockdep_tasklist_lock_is_held());
373 if (first) 407 if (first)
@@ -382,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
382 */ 416 */
383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
384{ 418{
419 rcu_lockdep_assert(rcu_read_lock_held());
385 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 420 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
386} 421}
387 422
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index f42d3f737a33..c7a8f453919e 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -48,59 +48,49 @@
48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
49 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
50 */ 50 */
51struct pm_qos_request_list { 51enum pm_qos_type {
52 struct list_head list; 52 PM_QOS_MAX, /* return the largest value */
53 union { 53 PM_QOS_MIN /* return the smallest value */
54 s32 value;
55 s32 usec;
56 s32 kbps;
57 };
58 int pm_qos_class;
59}; 54};
60 55
61static s32 max_compare(s32 v1, s32 v2);
62static s32 min_compare(s32 v1, s32 v2);
63
64struct pm_qos_object { 56struct pm_qos_object {
65 struct pm_qos_request_list requests; 57 struct plist_head requests;
66 struct blocking_notifier_head *notifiers; 58 struct blocking_notifier_head *notifiers;
67 struct miscdevice pm_qos_power_miscdev; 59 struct miscdevice pm_qos_power_miscdev;
68 char *name; 60 char *name;
69 s32 default_value; 61 s32 default_value;
70 atomic_t target_value; 62 enum pm_qos_type type;
71 s32 (*comparitor)(s32, s32);
72}; 63};
73 64
65static DEFINE_SPINLOCK(pm_qos_lock);
66
74static struct pm_qos_object null_pm_qos; 67static struct pm_qos_object null_pm_qos;
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 68static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 69static struct pm_qos_object cpu_dma_pm_qos = {
77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, 70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
78 .notifiers = &cpu_dma_lat_notifier, 71 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency", 72 .name = "cpu_dma_latency",
80 .default_value = 2000 * USEC_PER_SEC, 73 .default_value = 2000 * USEC_PER_SEC,
81 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 74 .type = PM_QOS_MIN,
82 .comparitor = min_compare
83}; 75};
84 76
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 77static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 78static struct pm_qos_object network_lat_pm_qos = {
87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, 79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
88 .notifiers = &network_lat_notifier, 80 .notifiers = &network_lat_notifier,
89 .name = "network_latency", 81 .name = "network_latency",
90 .default_value = 2000 * USEC_PER_SEC, 82 .default_value = 2000 * USEC_PER_SEC,
91 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 83 .type = PM_QOS_MIN
92 .comparitor = min_compare
93}; 84};
94 85
95 86
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 87static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 88static struct pm_qos_object network_throughput_pm_qos = {
98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, 89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
99 .notifiers = &network_throughput_notifier, 90 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 91 .name = "network_throughput",
101 .default_value = 0, 92 .default_value = 0,
102 .target_value = ATOMIC_INIT(0), 93 .type = PM_QOS_MAX,
103 .comparitor = max_compare
104}; 94};
105 95
106 96
@@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
111 &network_throughput_pm_qos 101 &network_throughput_pm_qos
112}; 102};
113 103
114static DEFINE_SPINLOCK(pm_qos_lock);
115
116static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
117 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
118static int pm_qos_power_open(struct inode *inode, struct file *filp); 106static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -122,48 +110,58 @@ static const struct file_operations pm_qos_power_fops = {
122 .write = pm_qos_power_write, 110 .write = pm_qos_power_write,
123 .open = pm_qos_power_open, 111 .open = pm_qos_power_open,
124 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
113 .llseek = noop_llseek,
125}; 114};
126 115
127/* static helper functions */ 116/* unlocked internal variant */
128static s32 max_compare(s32 v1, s32 v2) 117static inline int pm_qos_get_value(struct pm_qos_object *o)
129{ 118{
130 return max(v1, v2); 119 if (plist_head_empty(&o->requests))
131} 120 return o->default_value;
132 121
133static s32 min_compare(s32 v1, s32 v2) 122 switch (o->type) {
134{ 123 case PM_QOS_MIN:
135 return min(v1, v2); 124 return plist_last(&o->requests)->prio;
136} 125
126 case PM_QOS_MAX:
127 return plist_first(&o->requests)->prio;
137 128
129 default:
130 /* runtime check for not using enum */
131 BUG();
132 }
133}
138 134
139static void update_target(int pm_qos_class) 135static void update_target(struct pm_qos_object *o, struct plist_node *node,
136 int del, int value)
140{ 137{
141 s32 extreme_value;
142 struct pm_qos_request_list *node;
143 unsigned long flags; 138 unsigned long flags;
144 int call_notifier = 0; 139 int prev_value, curr_value;
145 140
146 spin_lock_irqsave(&pm_qos_lock, flags); 141 spin_lock_irqsave(&pm_qos_lock, flags);
147 extreme_value = pm_qos_array[pm_qos_class]->default_value; 142 prev_value = pm_qos_get_value(o);
148 list_for_each_entry(node, 143 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
149 &pm_qos_array[pm_qos_class]->requests.list, list) { 144 if (value != PM_QOS_DEFAULT_VALUE) {
150 extreme_value = pm_qos_array[pm_qos_class]->comparitor( 145 /*
151 extreme_value, node->value); 146 * to change the list, we atomically remove, reinit
152 } 147 * with new value and add, then see if the extremal
153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != 148 * changed
154 extreme_value) { 149 */
155 call_notifier = 1; 150 plist_del(node, &o->requests);
156 atomic_set(&pm_qos_array[pm_qos_class]->target_value, 151 plist_node_init(node, value);
157 extreme_value); 152 plist_add(node, &o->requests);
158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, 153 } else if (del) {
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value)); 154 plist_del(node, &o->requests);
155 } else {
156 plist_add(node, &o->requests);
160 } 157 }
158 curr_value = pm_qos_get_value(o);
161 spin_unlock_irqrestore(&pm_qos_lock, flags); 159 spin_unlock_irqrestore(&pm_qos_lock, flags);
162 160
163 if (call_notifier) 161 if (prev_value != curr_value)
164 blocking_notifier_call_chain( 162 blocking_notifier_call_chain(o->notifiers,
165 pm_qos_array[pm_qos_class]->notifiers, 163 (unsigned long)curr_value,
166 (unsigned long) extreme_value, NULL); 164 NULL);
167} 165}
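
The remove/re-init/re-add sequence above is the usual way to change a plist node's priority, since plist has no in-place reprioritize operation. A generic sketch of that idiom; replist is an illustrative name and the caller is assumed to hold the appropriate lock:

    #include <linux/plist.h>

    /* Move @node to priority @new_prio on @head. */
    static void replist(struct plist_head *head, struct plist_node *node, int new_prio)
    {
            plist_del(node, head);                  /* unlink from the old position */
            plist_node_init(node, new_prio);        /* reset with the new priority */
            plist_add(node, head);                  /* reinsert in sorted order */
    }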
168 166
169static int register_pm_qos_misc(struct pm_qos_object *qos) 167static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -196,42 +194,53 @@ static int find_pm_qos_object_by_minor(int minor)
196 */ 194 */
197int pm_qos_request(int pm_qos_class) 195int pm_qos_request(int pm_qos_class)
198{ 196{
199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 197 unsigned long flags;
198 int value;
199
200 spin_lock_irqsave(&pm_qos_lock, flags);
201 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
202 spin_unlock_irqrestore(&pm_qos_lock, flags);
203
204 return value;
200} 205}
201EXPORT_SYMBOL_GPL(pm_qos_request); 206EXPORT_SYMBOL_GPL(pm_qos_request);
202 207
208int pm_qos_request_active(struct pm_qos_request_list *req)
209{
210 return req->pm_qos_class != 0;
211}
212EXPORT_SYMBOL_GPL(pm_qos_request_active);
213
203/** 214/**
204 * pm_qos_add_request - inserts new qos request into the list 215 * pm_qos_add_request - inserts new qos request into the list
205 * @pm_qos_class: identifies which list of qos request to us 216 * @dep: pointer to a preallocated handle
217 * @pm_qos_class: identifies which list of qos request to use
206 * @value: defines the qos request 218 * @value: defines the qos request
207 * 219 *
208 * This function inserts a new entry in the pm_qos_class list of requested qos 220 * This function inserts a new entry in the pm_qos_class list of requested qos
209 * performance characteristics. It recomputes the aggregate QoS expectations 221 * performance characteristics. It recomputes the aggregate QoS expectations
210 * for the pm_qos_class of parameters, and returns the pm_qos_request list 222 * for the pm_qos_class of parameters and initializes the pm_qos_request_list
211 * element as a handle for use in updating and removal. Call needs to save 223 * handle. Caller needs to save this handle for later use in updates and
212 * this handle for later use. 224 * removal.
213 */ 225 */
214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) 226
227void pm_qos_add_request(struct pm_qos_request_list *dep,
228 int pm_qos_class, s32 value)
215{ 229{
216 struct pm_qos_request_list *dep; 230 struct pm_qos_object *o = pm_qos_array[pm_qos_class];
217 unsigned long flags; 231 int new_value;
218 232
219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); 233 if (pm_qos_request_active(dep)) {
220 if (dep) { 234 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
221 if (value == PM_QOS_DEFAULT_VALUE) 235 return;
222 dep->value = pm_qos_array[pm_qos_class]->default_value;
223 else
224 dep->value = value;
225 dep->pm_qos_class = pm_qos_class;
226
227 spin_lock_irqsave(&pm_qos_lock, flags);
228 list_add(&dep->list,
229 &pm_qos_array[pm_qos_class]->requests.list);
230 spin_unlock_irqrestore(&pm_qos_lock, flags);
231 update_target(pm_qos_class);
232 } 236 }
233 237 if (value == PM_QOS_DEFAULT_VALUE)
234 return dep; 238 new_value = o->default_value;
239 else
240 new_value = value;
241 plist_node_init(&dep->list, new_value);
242 dep->pm_qos_class = pm_qos_class;
243 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
235} 244}
236EXPORT_SYMBOL_GPL(pm_qos_add_request); 245EXPORT_SYMBOL_GPL(pm_qos_add_request);
237 246
@@ -246,27 +255,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
246 * Attempts are made to make this code callable on hot code paths. 255 * Attempts are made to make this code callable on hot code paths.
247 */ 256 */
248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 257void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value) 258 s32 new_value)
250{ 259{
251 unsigned long flags;
252 int pending_update = 0;
253 s32 temp; 260 s32 temp;
261 struct pm_qos_object *o;
262
263 if (!pm_qos_req) /*guard against callers passing in null */
264 return;
254 265
255 if (pm_qos_req) { /*guard against callers passing in null */ 266 if (!pm_qos_request_active(pm_qos_req)) {
256 spin_lock_irqsave(&pm_qos_lock, flags); 267 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
257 if (new_value == PM_QOS_DEFAULT_VALUE) 268 return;
258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
259 else
260 temp = new_value;
261
262 if (temp != pm_qos_req->value) {
263 pending_update = 1;
264 pm_qos_req->value = temp;
265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
269 } 269 }
270
271 o = pm_qos_array[pm_qos_req->pm_qos_class];
272
273 if (new_value == PM_QOS_DEFAULT_VALUE)
274 temp = o->default_value;
275 else
276 temp = new_value;
277
278 if (temp != pm_qos_req->list.prio)
279 update_target(o, &pm_qos_req->list, 0, temp);
270} 280}
271EXPORT_SYMBOL_GPL(pm_qos_update_request); 281EXPORT_SYMBOL_GPL(pm_qos_update_request);
272 282
@@ -280,19 +290,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request);
280 */ 290 */
281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 291void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
282{ 292{
283 unsigned long flags; 293 struct pm_qos_object *o;
284 int qos_class;
285 294
286 if (pm_qos_req == NULL) 295 if (pm_qos_req == NULL)
287 return; 296 return;
288 /* silent return to keep pcm code cleaner */ 297 /* silent return to keep pcm code cleaner */
289 298
290 qos_class = pm_qos_req->pm_qos_class; 299 if (!pm_qos_request_active(pm_qos_req)) {
291 spin_lock_irqsave(&pm_qos_lock, flags); 300 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
292 list_del(&pm_qos_req->list); 301 return;
293 kfree(pm_qos_req); 302 }
294 spin_unlock_irqrestore(&pm_qos_lock, flags); 303
295 update_target(qos_class); 304 o = pm_qos_array[pm_qos_req->pm_qos_class];
305 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
306 memset(pm_qos_req, 0, sizeof(*pm_qos_req));
296} 307}
297EXPORT_SYMBOL_GPL(pm_qos_remove_request); 308EXPORT_SYMBOL_GPL(pm_qos_remove_request);
298 309
@@ -340,8 +351,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
340 351
341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 352 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
342 if (pm_qos_class >= 0) { 353 if (pm_qos_class >= 0) {
343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class, 354 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
344 PM_QOS_DEFAULT_VALUE); 355 if (!req)
356 return -ENOMEM;
357
358 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
359 filp->private_data = req;
345 360
346 if (filp->private_data) 361 if (filp->private_data)
347 return 0; 362 return 0;
@@ -353,8 +368,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
353{ 368{
354 struct pm_qos_request_list *req; 369 struct pm_qos_request_list *req;
355 370
356 req = (struct pm_qos_request_list *)filp->private_data; 371 req = filp->private_data;
357 pm_qos_remove_request(req); 372 pm_qos_remove_request(req);
373 kfree(req);
358 374
359 return 0; 375 return 0;
360} 376}
@@ -374,14 +390,16 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
374 } else if (count == 11) { /* len('0x12345678/0') */ 390 } else if (count == 11) { /* len('0x12345678/0') */
375 if (copy_from_user(ascii_value, buf, 11)) 391 if (copy_from_user(ascii_value, buf, 11))
376 return -EFAULT; 392 return -EFAULT;
393 if (strlen(ascii_value) != 10)
394 return -EINVAL;
377 x = sscanf(ascii_value, "%x", &value); 395 x = sscanf(ascii_value, "%x", &value);
378 if (x != 1) 396 if (x != 1)
379 return -EINVAL; 397 return -EINVAL;
380 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); 398 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
381 } else 399 } else
382 return -EINVAL; 400 return -EINVAL;
383 401
384 pm_qos_req = (struct pm_qos_request_list *)filp->private_data; 402 pm_qos_req = filp->private_data;
385 pm_qos_update_request(pm_qos_req, value); 403 pm_qos_update_request(pm_qos_req, value);
386 404
387 return count; 405 return count;
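A minimal sketch of how a caller would use the reworked interface above, where the request handle is now allocated by the caller instead of being returned by pm_qos_add_request(); the PM_QOS_CPU_DMA_LATENCY class and the latency figures are illustrative assumptions, not part of this patch:

#include <linux/pm_qos_params.h>

/* hypothetical driver-owned handle; must stay valid while registered */
static struct pm_qos_request_list my_qos_req;

static void my_driver_start(void)
{
	/* register a 20 usec CPU/DMA latency constraint */
	pm_qos_add_request(&my_qos_req, PM_QOS_CPU_DMA_LATENCY, 20);
}

static void my_driver_busy(void)
{
	/* tighten or relax the constraint as the workload changes */
	pm_qos_update_request(&my_qos_req, 10);
}

static void my_driver_stop(void)
{
	/* drop the constraint; the core zeroes the handle on removal */
	pm_qos_remove_request(&my_qos_req);
}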
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9829646d399c..6842eeba5879 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -16,13 +16,13 @@
16 * siglock protection since other code may update expiration cache as 16 * siglock protection since other code may update expiration cache as
17 * well. 17 * well.
18 */ 18 */
19void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
20{ 20{
21 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
22 22
23 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&task->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&task->sighand->siglock);
26} 26}
27 27
28static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
232 232
233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
234{ 234{
235 struct sighand_struct *sighand; 235 struct signal_struct *sig = tsk->signal;
236 struct signal_struct *sig;
237 struct task_struct *t; 236 struct task_struct *t;
238 237
239 *times = INIT_CPUTIME; 238 times->utime = sig->utime;
239 times->stime = sig->stime;
240 times->sum_exec_runtime = sig->sum_sched_runtime;
240 241
241 rcu_read_lock(); 242 rcu_read_lock();
242 sighand = rcu_dereference(tsk->sighand); 243 /* make sure we can trust tsk->thread_group list */
243 if (!sighand) 244 if (!likely(pid_alive(tsk)))
244 goto out; 245 goto out;
245 246
246 sig = tsk->signal;
247
248 t = tsk; 247 t = tsk;
249 do { 248 do {
250 times->utime = cputime_add(times->utime, t->utime); 249 times->utime = cputime_add(times->utime, t->utime);
251 times->stime = cputime_add(times->stime, t->stime); 250 times->stime = cputime_add(times->stime, t->stime);
252 times->sum_exec_runtime += t->se.sum_exec_runtime; 251 times->sum_exec_runtime += t->se.sum_exec_runtime;
253 252 } while_each_thread(tsk, t);
254 t = next_thread(t);
255 } while (t != tsk);
256
257 times->utime = cputime_add(times->utime, sig->utime);
258 times->stime = cputime_add(times->stime, sig->stime);
259 times->sum_exec_runtime += sig->sum_sched_runtime;
260out: 253out:
261 rcu_read_unlock(); 254 rcu_read_unlock();
262} 255}
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1279{ 1272{
1280 struct signal_struct *sig; 1273 struct signal_struct *sig;
1281 1274
1282 /* tsk == current, ensure it is safe to use ->signal/sighand */
1283 if (unlikely(tsk->exit_state))
1284 return 0;
1285
1286 if (!task_cputime_zero(&tsk->cputime_expires)) { 1275 if (!task_cputime_zero(&tsk->cputime_expires)) {
1287 struct task_cputime task_sample = { 1276 struct task_cputime task_sample = {
1288 .utime = tsk->utime, 1277 .utime = tsk->utime,
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1298 if (sig->cputimer.running) { 1287 if (sig->cputimer.running) {
1299 struct task_cputime group_sample; 1288 struct task_cputime group_sample;
1300 1289
1301 thread_group_cputimer(tsk, &group_sample); 1290 spin_lock(&sig->cputimer.lock);
1291 group_sample = sig->cputimer.cputime;
1292 spin_unlock(&sig->cputimer.lock);
1293
1302 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1294 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1303 return 1; 1295 return 1;
1304 } 1296 }
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1315{ 1307{
1316 LIST_HEAD(firing); 1308 LIST_HEAD(firing);
1317 struct k_itimer *timer, *next; 1309 struct k_itimer *timer, *next;
1310 unsigned long flags;
1318 1311
1319 BUG_ON(!irqs_disabled()); 1312 BUG_ON(!irqs_disabled());
1320 1313
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1325 if (!fastpath_timer_check(tsk)) 1318 if (!fastpath_timer_check(tsk))
1326 return; 1319 return;
1327 1320
1328 spin_lock(&tsk->sighand->siglock); 1321 if (!lock_task_sighand(tsk, &flags))
1322 return;
1329 /* 1323 /*
1330 * Here we take off tsk->signal->cpu_timers[N] and 1324 * Here we take off tsk->signal->cpu_timers[N] and
1331 * tsk->cpu_timers[N] all the timers that are firing, and 1325 * tsk->cpu_timers[N] all the timers that are firing, and
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1347 * that gets the timer lock before we do will give it up and 1341 * that gets the timer lock before we do will give it up and
1348 * spin until we've taken care of that timer below. 1342 * spin until we've taken care of that timer below.
1349 */ 1343 */
1350 spin_unlock(&tsk->sighand->siglock); 1344 unlock_task_sighand(tsk, &flags);
1351 1345
1352 /* 1346 /*
1353 * Now that all the timers on our list have the firing flag, 1347 * Now that all the timers on our list have the firing flag,
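A brief sketch of the locking idiom the hunks above switch to; lock_task_sighand() fails for a task that has already dropped its sighand, so the caller can bail out instead of taking tsk->sighand->siglock on a possibly exiting task (fragment only, tsk is the target task):

	unsigned long flags;

	if (!lock_task_sighand(tsk, &flags))
		return;		/* task is exiting, no timers to run */

	/* ... collect the firing timers under siglock ... */

	unlock_task_sighand(tsk, &flags);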
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ad723420acc3..9ca4973f736d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 562
563 if (copy_to_user(created_timer_id,
564 &new_timer_id, sizeof (new_timer_id))) {
565 error = -EFAULT;
566 goto out;
567 }
568 if (timer_event_spec) { 563 if (timer_event_spec) {
569 if (copy_from_user(&event, timer_event_spec, sizeof (event))) { 564 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
570 error = -EFAULT; 565 error = -EFAULT;
@@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
590 new_timer->sigq->info.si_tid = new_timer->it_id; 585 new_timer->sigq->info.si_tid = new_timer->it_id;
591 new_timer->sigq->info.si_code = SI_TIMER; 586 new_timer->sigq->info.si_code = SI_TIMER;
592 587
588 if (copy_to_user(created_timer_id,
589 &new_timer_id, sizeof (new_timer_id))) {
590 error = -EFAULT;
591 goto out;
592 }
593
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 594 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error) 595 if (error)
595 goto out; 596 goto out;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ca6066a6952e..29bff6117abc 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -86,6 +86,7 @@ config PM_SLEEP_SMP
86 depends on SMP 86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE 87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP 88 depends on PM_SLEEP
89 select HOTPLUG
89 select HOTPLUG_CPU 90 select HOTPLUG_CPU
90 default y 91 default y
91 92
@@ -137,6 +138,8 @@ config SUSPEND_FREEZER
137config HIBERNATION 138config HIBERNATION
138 bool "Hibernation (aka 'suspend to disk')" 139 bool "Hibernation (aka 'suspend to disk')"
139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS
142 select LZO_DECOMPRESS
140 select SUSPEND_NVS if HAS_IOMEM 143 select SUSPEND_NVS if HAS_IOMEM
141 ---help--- 144 ---help---
142 Enable the suspend to disk (STD) functionality, which is usually 145 Enable the suspend to disk (STD) functionality, which is usually
@@ -242,3 +245,17 @@ config PM_OPS
242 bool 245 bool
243 depends on PM_SLEEP || PM_RUNTIME 246 depends on PM_SLEEP || PM_RUNTIME
244 default y 247 default y
248
249config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM
252 ---help---
253 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This
255 is called Operating Performance Point or OPP. The actual definitions
256 of OPP varies over silicon within the same family of devices.
257
258 OPP layer organizes the data internally using device pointers
259 representing individual voltage domains and provides SOC
260 implementations a ready to use framework to manage OPPs.
261 For more information, read <file:Documentation/power/opp.txt>
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 97024fd40cd5..83bbc7c02df9 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..657272e91d0a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -29,6 +29,7 @@
29#include "power.h" 29#include "power.h"
30 30
31 31
32static int nocompress = 0;
32static int noresume = 0; 33static int noresume = 0;
33static char resume_file[256] = CONFIG_PM_STD_PARTITION; 34static char resume_file[256] = CONFIG_PM_STD_PARTITION;
34dev_t swsusp_resume_device; 35dev_t swsusp_resume_device;
@@ -277,7 +278,7 @@ static int create_image(int platform_mode)
277 goto Enable_irqs; 278 goto Enable_irqs;
278 } 279 }
279 280
280 if (hibernation_test(TEST_CORE)) 281 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
281 goto Power_up; 282 goto Power_up;
282 283
283 in_suspend = 1; 284 in_suspend = 1;
@@ -288,8 +289,10 @@ static int create_image(int platform_mode)
288 error); 289 error);
289 /* Restore control flow magically appears here */ 290 /* Restore control flow magically appears here */
290 restore_processor_state(); 291 restore_processor_state();
291 if (!in_suspend) 292 if (!in_suspend) {
293 events_check_enabled = false;
292 platform_leave(platform_mode); 294 platform_leave(platform_mode);
295 }
293 296
294 Power_up: 297 Power_up:
295 sysdev_resume(); 298 sysdev_resume();
@@ -328,7 +331,7 @@ int hibernation_snapshot(int platform_mode)
328 331
329 error = platform_begin(platform_mode); 332 error = platform_begin(platform_mode);
330 if (error) 333 if (error)
331 return error; 334 goto Close;
332 335
333 /* Preallocate image memory before shutting down devices. */ 336 /* Preallocate image memory before shutting down devices. */
334 error = hibernate_preallocate_memory(); 337 error = hibernate_preallocate_memory();
@@ -511,18 +514,24 @@ int hibernation_platform_enter(void)
511 514
512 local_irq_disable(); 515 local_irq_disable();
513 sysdev_suspend(PMSG_HIBERNATE); 516 sysdev_suspend(PMSG_HIBERNATE);
517 if (!pm_check_wakeup_events()) {
518 error = -EAGAIN;
519 goto Power_up;
520 }
521
514 hibernation_ops->enter(); 522 hibernation_ops->enter();
515 /* We should never get here */ 523 /* We should never get here */
516 while (1); 524 while (1);
517 525
518 /* 526 Power_up:
519 * We don't need to reenable the nonboot CPUs or resume consoles, since 527 sysdev_resume();
520 * the system is going to be halted anyway. 528 local_irq_enable();
521 */ 529 enable_nonboot_cpus();
530
522 Platform_finish: 531 Platform_finish:
523 hibernation_ops->finish(); 532 hibernation_ops->finish();
524 533
525 dpm_suspend_noirq(PMSG_RESTORE); 534 dpm_resume_noirq(PMSG_RESTORE);
526 535
527 Resume_devices: 536 Resume_devices:
528 entering_platform_hibernation = false; 537 entering_platform_hibernation = false;
@@ -630,6 +639,8 @@ int hibernate(void)
630 639
631 if (hibernation_mode == HIBERNATION_PLATFORM) 640 if (hibernation_mode == HIBERNATION_PLATFORM)
632 flags |= SF_PLATFORM_MODE; 641 flags |= SF_PLATFORM_MODE;
642 if (nocompress)
643 flags |= SF_NOCOMPRESS_MODE;
633 pr_debug("PM: writing image.\n"); 644 pr_debug("PM: writing image.\n");
634 error = swsusp_write(flags); 645 error = swsusp_write(flags);
635 swsusp_free(); 646 swsusp_free();
@@ -697,7 +708,7 @@ static int software_resume(void)
697 goto Unlock; 708 goto Unlock;
698 } 709 }
699 710
700 pr_debug("PM: Checking image partition %s\n", resume_file); 711 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
701 712
702 /* Check if the device is there */ 713 /* Check if the device is there */
703 swsusp_resume_device = name_to_dev_t(resume_file); 714 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -722,10 +733,10 @@ static int software_resume(void)
722 } 733 }
723 734
724 Check_image: 735 Check_image:
725 pr_debug("PM: Resume from partition %d:%d\n", 736 pr_debug("PM: Hibernation image partition %d:%d present\n",
726 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); 737 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
727 738
728 pr_debug("PM: Checking hibernation image.\n"); 739 pr_debug("PM: Looking for hibernation image.\n");
729 error = swsusp_check(); 740 error = swsusp_check();
730 if (error) 741 if (error)
731 goto Unlock; 742 goto Unlock;
@@ -757,14 +768,14 @@ static int software_resume(void)
757 goto Done; 768 goto Done;
758 } 769 }
759 770
760 pr_debug("PM: Reading hibernation image.\n"); 771 pr_debug("PM: Loading hibernation image.\n");
761 772
762 error = swsusp_read(&flags); 773 error = swsusp_read(&flags);
763 swsusp_close(FMODE_READ); 774 swsusp_close(FMODE_READ);
764 if (!error) 775 if (!error)
765 hibernation_restore(flags & SF_PLATFORM_MODE); 776 hibernation_restore(flags & SF_PLATFORM_MODE);
766 777
767 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 778 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
768 swsusp_free(); 779 swsusp_free();
769 thaw_processes(); 780 thaw_processes();
770 Done: 781 Done:
@@ -777,7 +788,7 @@ static int software_resume(void)
777 /* For success case, the suspend path will release the lock */ 788 /* For success case, the suspend path will release the lock */
778 Unlock: 789 Unlock:
779 mutex_unlock(&pm_mutex); 790 mutex_unlock(&pm_mutex);
780 pr_debug("PM: Resume from disk failed.\n"); 791 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
781 return error; 792 return error;
782close_finish: 793close_finish:
783 swsusp_close(FMODE_READ); 794 swsusp_close(FMODE_READ);
@@ -996,6 +1007,15 @@ static int __init resume_offset_setup(char *str)
996 return 1; 1007 return 1;
997} 1008}
998 1009
1010static int __init hibernate_setup(char *str)
1011{
1012 if (!strncmp(str, "noresume", 8))
1013 noresume = 1;
1014 else if (!strncmp(str, "nocompress", 10))
1015 nocompress = 1;
1016 return 1;
1017}
1018
999static int __init noresume_setup(char *str) 1019static int __init noresume_setup(char *str)
1000{ 1020{
1001 noresume = 1; 1021 noresume = 1;
@@ -1005,3 +1025,4 @@ static int __init noresume_setup(char *str)
1005__setup("noresume", noresume_setup); 1025__setup("noresume", noresume_setup);
1006__setup("resume_offset=", resume_offset_setup); 1026__setup("resume_offset=", resume_offset_setup);
1007__setup("resume=", resume_setup); 1027__setup("resume=", resume_setup);
1028__setup("hibernate=", hibernate_setup);
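As a usage note, the handler above lets both options be passed through a single kernel command-line parameter, for example (the resume device below is only illustrative):

	hibernate=nocompress resume=/dev/sda2
	hibernate=noresume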
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..7b5db6a8561e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
204 204
205power_attr(state); 205power_attr(state);
206 206
207#ifdef CONFIG_PM_SLEEP
208/*
209 * The 'wakeup_count' attribute, along with the functions defined in
210 * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
211 * handled in a non-racy way.
212 *
213 * If a wakeup event occurs when the system is in a sleep state, it simply is
214 * woken up. In turn, if an event that would wake the system up from a sleep
215 * state occurs when it is undergoing a transition to that sleep state, the
216 * transition should be aborted. Moreover, if such an event occurs when the
217 * system is in the working state, an attempt to start a transition to the
 218 * given sleep state should fail during a certain period after the detection of
219 * the event. Using the 'state' attribute alone is not sufficient to satisfy
220 * these requirements, because a wakeup event may occur exactly when 'state'
221 * is being written to and may be delivered to user space right before it is
222 * frozen, so the event will remain only partially processed until the system is
223 * woken up by another event. In particular, it won't cause the transition to
224 * a sleep state to be aborted.
225 *
226 * This difficulty may be overcome if user space uses 'wakeup_count' before
227 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to
 230 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to.
234 */
235
236static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr,
238 char *buf)
239{
240 unsigned int val;
241
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
243}
244
245static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr,
247 const char *buf, size_t n)
248{
249 unsigned int val;
250
251 if (sscanf(buf, "%u", &val) == 1) {
252 if (pm_save_wakeup_count(val))
253 return n;
254 }
255 return -EINVAL;
256}
257
258power_attr(wakeup_count);
259#endif /* CONFIG_PM_SLEEP */
260
207#ifdef CONFIG_PM_TRACE 261#ifdef CONFIG_PM_TRACE
208int pm_trace_enabled; 262int pm_trace_enabled;
209 263
@@ -227,15 +281,34 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
227} 281}
228 282
229power_attr(pm_trace); 283power_attr(pm_trace);
284
285static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
286 struct kobj_attribute *attr,
287 char *buf)
288{
289 return show_trace_dev_match(buf, PAGE_SIZE);
290}
291
292static ssize_t
293pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
294 const char *buf, size_t n)
295{
296 return -EINVAL;
297}
298
299power_attr(pm_trace_dev_match);
300
230#endif /* CONFIG_PM_TRACE */ 301#endif /* CONFIG_PM_TRACE */
231 302
232static struct attribute * g[] = { 303static struct attribute * g[] = {
233 &state_attr.attr, 304 &state_attr.attr,
234#ifdef CONFIG_PM_TRACE 305#ifdef CONFIG_PM_TRACE
235 &pm_trace_attr.attr, 306 &pm_trace_attr.attr,
307 &pm_trace_dev_match_attr.attr,
236#endif 308#endif
237#ifdef CONFIG_PM_SLEEP 309#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr, 310 &pm_async_attr.attr,
311 &wakeup_count_attr.attr,
239#ifdef CONFIG_PM_DEBUG 312#ifdef CONFIG_PM_DEBUG
240 &pm_test_attr.attr, 313 &pm_test_attr.attr,
241#endif 314#endif
@@ -253,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
253 326
254static int __init pm_start_workqueue(void) 327static int __init pm_start_workqueue(void)
255{ 328{
256 pm_wq = create_freezeable_workqueue("pm"); 329 pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0);
257 330
258 return pm_wq ? 0 : -ENOMEM; 331 return pm_wq ? 0 : -ENOMEM;
259} 332}
@@ -266,6 +339,7 @@ static int __init pm_init(void)
266 int error = pm_start_workqueue(); 339 int error = pm_start_workqueue();
267 if (error) 340 if (error)
268 return error; 341 return error;
342 hibernate_image_size_init();
269 power_kobj = kobject_create_and_add("power", NULL); 343 power_kobj = kobject_create_and_add("power", NULL);
270 if (!power_kobj) 344 if (!power_kobj)
271 return -ENOMEM; 345 return -ENOMEM;
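A minimal user-space sketch of the handshake described in the wakeup_count comment above: read the count, finish the caller's own pre-suspend work, write the count back, and only then write to 'state'. The paths are the standard sysfs locations created by this patch; error handling is abbreviated:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	unsigned int count;
	ssize_t n;
	int fd;

	fd = open("/sys/power/wakeup_count", O_RDWR);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n <= 0)
		return 1;	/* read fails while a wakeup event is being processed */
	buf[n] = '\0';
	if (sscanf(buf, "%u", &count) != 1)
		return 1;

	/* ... carry out user space's own suspend preparations here ... */

	lseek(fd, 0, SEEK_SET);
	if (write(fd, buf, strlen(buf)) < 0)
		return 1;	/* wakeup events occurred since the read: do not suspend */
	close(fd);

	fd = open("/sys/power/state", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "mem", 3) < 0)
		return 1;	/* the transition was aborted by a late wakeup event */
	close(fd);
	return 0;
}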
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 006270fe382d..03634be55f62 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,6 +14,9 @@ struct swsusp_info {
14} __attribute__((aligned(PAGE_SIZE))); 14} __attribute__((aligned(PAGE_SIZE)));
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */
18extern void __init hibernate_image_size_init(void);
19
17#ifdef CONFIG_ARCH_HIBERNATION_HEADER 20#ifdef CONFIG_ARCH_HIBERNATION_HEADER
18/* Maximum size of architecture specific data in a hibernation header */ 21/* Maximum size of architecture specific data in a hibernation header */
19#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) 22#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
@@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info)
49extern int hibernation_snapshot(int platform_mode); 52extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 53extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 54extern int hibernation_platform_enter(void);
52#endif 55
56#else /* !CONFIG_HIBERNATION */
57
58static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */
53 60
54extern int pfn_is_nosave(unsigned long); 61extern int pfn_is_nosave(unsigned long);
55 62
@@ -134,6 +141,7 @@ extern int swsusp_swap_in_use(void);
134 * the image header. 141 * the image header.
135 */ 142 */
136#define SF_PLATFORM_MODE 1 143#define SF_PLATFORM_MODE 1
144#define SF_NOCOMPRESS_MODE 2
137 145
138/* kernel/power/hibernate.c */ 146/* kernel/power/hibernate.c */
139extern int swsusp_check(void); 147extern int swsusp_check(void);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index e8b337006276..d52359374e85 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy)
24 24
25static DECLARE_WORK(poweroff_work, do_poweroff); 25static DECLARE_WORK(poweroff_work, do_poweroff);
26 26
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 71ae29052ab6..e50b4c1b2a0f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,7 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h>
18 19
19/* 20/*
20 * Timeout for stopping processes 21 * Timeout for stopping processes
@@ -35,13 +36,19 @@ static int try_to_freeze_tasks(bool sig_only)
35 struct task_struct *g, *p; 36 struct task_struct *g, *p;
36 unsigned long end_time; 37 unsigned long end_time;
37 unsigned int todo; 38 unsigned int todo;
39 bool wq_busy = false;
38 struct timeval start, end; 40 struct timeval start, end;
39 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
40 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
43 bool wakeup = false;
41 44
42 do_gettimeofday(&start); 45 do_gettimeofday(&start);
43 46
44 end_time = jiffies + TIMEOUT; 47 end_time = jiffies + TIMEOUT;
48
49 if (!sig_only)
50 freeze_workqueues_begin();
51
45 while (true) { 52 while (true) {
46 todo = 0; 53 todo = 0;
47 read_lock(&tasklist_lock); 54 read_lock(&tasklist_lock);
@@ -63,9 +70,20 @@ static int try_to_freeze_tasks(bool sig_only)
63 todo++; 70 todo++;
64 } while_each_thread(g, p); 71 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 72 read_unlock(&tasklist_lock);
73
74 if (!sig_only) {
75 wq_busy = freeze_workqueues_busy();
76 todo += wq_busy;
77 }
78
66 if (!todo || time_after(jiffies, end_time)) 79 if (!todo || time_after(jiffies, end_time))
67 break; 80 break;
68 81
82 if (!pm_check_wakeup_events()) {
83 wakeup = true;
84 break;
85 }
86
69 /* 87 /*
70 * We need to retry, but first give the freezing tasks some 88 * We need to retry, but first give the freezing tasks some
 71 * time to enter the refrigerator. 89
@@ -85,13 +103,18 @@ static int try_to_freeze_tasks(bool sig_only)
85 * but it cleans up leftover PF_FREEZE requests. 103 * but it cleans up leftover PF_FREEZE requests.
86 */ 104 */
87 printk("\n"); 105 printk("\n");
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 106 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 107 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 108 wakeup ? "aborted" : "failed",
109 elapsed_csecs / 100, elapsed_csecs % 100,
110 todo - wq_busy, wq_busy);
111
112 thaw_workqueues();
113
91 read_lock(&tasklist_lock); 114 read_lock(&tasklist_lock);
92 do_each_thread(g, p) { 115 do_each_thread(g, p) {
93 task_lock(p); 116 task_lock(p);
94 if (freezing(p) && !freezer_should_skip(p)) 117 if (!wakeup && freezing(p) && !freezer_should_skip(p))
95 sched_show_task(p); 118 sched_show_task(p);
96 cancel_freezing(p); 119 cancel_freezing(p);
97 task_unlock(p); 120 task_unlock(p);
@@ -157,6 +180,7 @@ void thaw_processes(void)
157 oom_killer_enable(); 180 oom_killer_enable();
158 181
159 printk("Restarting tasks ... "); 182 printk("Restarting tasks ... ");
183 thaw_workqueues();
160 thaw_tasks(true); 184 thaw_tasks(true);
161 thaw_tasks(false); 185 thaw_tasks(false);
162 schedule(); 186 schedule();
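Worth noting for the hunks above: only workqueues created with the WQ_FREEZEABLE flag take part in the freeze_workqueues_begin()/freeze_workqueues_busy()/thaw_workqueues() handshake; a minimal sketch of such a workqueue (the name is illustrative):

	struct workqueue_struct *wq;

	wq = alloc_workqueue("mydrv", WQ_FREEZEABLE, 0);
	if (!wq)
		return -ENOMEM;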
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 25ce010e9f8b..0dac75ea4456 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * This file provides system snapshot/restore functionality for swsusp. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *);
46 * size will not exceed N bytes, but if that is impossible, it will 46 * size will not exceed N bytes, but if that is impossible, it will
47 * try to create the smallest image possible. 47 * try to create the smallest image possible.
48 */ 48 */
49unsigned long image_size = 500 * 1024 * 1024; 49unsigned long image_size;
50
51void __init hibernate_image_size_init(void)
52{
53 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
54}
50 55
51/* List of PBEs needed for restoring the pages that were allocated before 56/* List of PBEs needed for restoring the pages that were allocated before
52 * the suspend and included in the suspend image, but have also been 57 * the suspend and included in the suspend image, but have also been
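For a rough sense of the new default (figures are illustrative): on a machine with 2 GiB of RAM and 4 KiB pages, totalram_pages is about 524288, so image_size becomes ((524288 * 2) / 5) * 4096, roughly 819 MiB, i.e. two fifths of RAM, instead of the previous fixed 500 MiB.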
@@ -979,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
979 src = kmap_atomic(s_page, KM_USER0); 984 src = kmap_atomic(s_page, KM_USER0);
980 dst = kmap_atomic(d_page, KM_USER1); 985 dst = kmap_atomic(d_page, KM_USER1);
981 do_copy_page(dst, src); 986 do_copy_page(dst, src);
982 kunmap_atomic(src, KM_USER0);
983 kunmap_atomic(dst, KM_USER1); 987 kunmap_atomic(dst, KM_USER1);
988 kunmap_atomic(src, KM_USER0);
984 } else { 989 } else {
985 if (PageHighMem(d_page)) { 990 if (PageHighMem(d_page)) {
986 /* Page pointed to by src may contain some kernel 991 /* Page pointed to by src may contain some kernel
@@ -988,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
988 */ 993 */
989 safe_copy_page(buffer, s_page); 994 safe_copy_page(buffer, s_page);
990 dst = kmap_atomic(d_page, KM_USER0); 995 dst = kmap_atomic(d_page, KM_USER0);
991 memcpy(dst, buffer, PAGE_SIZE); 996 copy_page(dst, buffer);
992 kunmap_atomic(dst, KM_USER0); 997 kunmap_atomic(dst, KM_USER0);
993 } else { 998 } else {
994 safe_copy_page(page_address(d_page), s_page); 999 safe_copy_page(page_address(d_page), s_page);
@@ -1121,9 +1126,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1121 return nr_alloc; 1126 return nr_alloc;
1122} 1127}
1123 1128
1124static unsigned long preallocate_image_memory(unsigned long nr_pages) 1129static unsigned long preallocate_image_memory(unsigned long nr_pages,
1130 unsigned long avail_normal)
1125{ 1131{
1126 return preallocate_image_pages(nr_pages, GFP_IMAGE); 1132 unsigned long alloc;
1133
1134 if (avail_normal <= alloc_normal)
1135 return 0;
1136
1137 alloc = avail_normal - alloc_normal;
1138 if (nr_pages < alloc)
1139 alloc = nr_pages;
1140
1141 return preallocate_image_pages(alloc, GFP_IMAGE);
1127} 1142}
1128 1143
1129#ifdef CONFIG_HIGHMEM 1144#ifdef CONFIG_HIGHMEM
@@ -1169,15 +1184,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1169 */ 1184 */
1170static void free_unnecessary_pages(void) 1185static void free_unnecessary_pages(void)
1171{ 1186{
1172 unsigned long save_highmem, to_free_normal, to_free_highmem; 1187 unsigned long save, to_free_normal, to_free_highmem;
1173 1188
1174 to_free_normal = alloc_normal - count_data_pages(); 1189 save = count_data_pages();
1175 save_highmem = count_highmem_pages(); 1190 if (alloc_normal >= save) {
1176 if (alloc_highmem > save_highmem) { 1191 to_free_normal = alloc_normal - save;
1177 to_free_highmem = alloc_highmem - save_highmem; 1192 save = 0;
1193 } else {
1194 to_free_normal = 0;
1195 save -= alloc_normal;
1196 }
1197 save += count_highmem_pages();
1198 if (alloc_highmem >= save) {
1199 to_free_highmem = alloc_highmem - save;
1178 } else { 1200 } else {
1179 to_free_highmem = 0; 1201 to_free_highmem = 0;
1180 to_free_normal -= save_highmem - alloc_highmem; 1202 to_free_normal -= save - alloc_highmem;
1181 } 1203 }
1182 1204
1183 memory_bm_position_reset(&copy_bm); 1205 memory_bm_position_reset(&copy_bm);
@@ -1258,7 +1280,7 @@ int hibernate_preallocate_memory(void)
1258{ 1280{
1259 struct zone *zone; 1281 struct zone *zone;
1260 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1282 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1261 unsigned long alloc, save_highmem, pages_highmem; 1283 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1262 struct timeval start, stop; 1284 struct timeval start, stop;
1263 int error; 1285 int error;
1264 1286
@@ -1295,26 +1317,38 @@ int hibernate_preallocate_memory(void)
1295 else 1317 else
1296 count += zone_page_state(zone, NR_FREE_PAGES); 1318 count += zone_page_state(zone, NR_FREE_PAGES);
1297 } 1319 }
1320 avail_normal = count;
1298 count += highmem; 1321 count += highmem;
1299 count -= totalreserve_pages; 1322 count -= totalreserve_pages;
1300 1323
1301 /* Compute the maximum number of saveable pages to leave in memory. */ 1324 /* Compute the maximum number of saveable pages to leave in memory. */
1302 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1326 /* Compute the desired number of image pages specified by image_size. */
1303 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1304 if (size > max_size) 1328 if (size > max_size)
1305 size = max_size; 1329 size = max_size;
1306 /* 1330 /*
1307 * If the maximum is not less than the current number of saveable pages 1331 * If the desired number of image pages is at least as large as the
1308 * in memory, allocate page frames for the image and we're done. 1332 * current number of saveable pages in memory, allocate page frames for
1333 * the image and we're done.
1309 */ 1334 */
1310 if (size >= saveable) { 1335 if (size >= saveable) {
1311 pages = preallocate_image_highmem(save_highmem); 1336 pages = preallocate_image_highmem(save_highmem);
1312 pages += preallocate_image_memory(saveable - pages); 1337 pages += preallocate_image_memory(saveable - pages, avail_normal);
1313 goto out; 1338 goto out;
1314 } 1339 }
1315 1340
1316 /* Estimate the minimum size of the image. */ 1341 /* Estimate the minimum size of the image. */
1317 pages = minimum_image_size(saveable); 1342 pages = minimum_image_size(saveable);
1343 /*
1344 * To avoid excessive pressure on the normal zone, leave room in it to
1345 * accommodate an image of the minimum size (unless it's already too
1346 * small, in which case don't preallocate pages from it at all).
1347 */
1348 if (avail_normal > pages)
1349 avail_normal -= pages;
1350 else
1351 avail_normal = 0;
1318 if (size < pages) 1352 if (size < pages)
1319 size = min_t(unsigned long, pages, max_size); 1353 size = min_t(unsigned long, pages, max_size);
1320 1354
@@ -1335,16 +1369,34 @@ int hibernate_preallocate_memory(void)
1335 */ 1369 */
1336 pages_highmem = preallocate_image_highmem(highmem / 2); 1370 pages_highmem = preallocate_image_highmem(highmem / 2);
1337 alloc = (count - max_size) - pages_highmem; 1371 alloc = (count - max_size) - pages_highmem;
1338 pages = preallocate_image_memory(alloc); 1372 pages = preallocate_image_memory(alloc, avail_normal);
1339 if (pages < alloc) 1373 if (pages < alloc) {
1340 goto err_out; 1374 /* We have exhausted non-highmem pages, try highmem. */
1341 size = max_size - size; 1375 alloc -= pages;
1342 alloc = size; 1376 pages += pages_highmem;
1343 size = preallocate_highmem_fraction(size, highmem, count); 1377 pages_highmem = preallocate_image_highmem(alloc);
1344 pages_highmem += size; 1378 if (pages_highmem < alloc)
1345 alloc -= size; 1379 goto err_out;
1346 pages += preallocate_image_memory(alloc); 1380 pages += pages_highmem;
1347 pages += pages_highmem; 1381 /*
1382 * size is the desired number of saveable pages to leave in
1383 * memory, so try to preallocate (all memory - size) pages.
1384 */
1385 alloc = (count - pages) - size;
1386 pages += preallocate_image_highmem(alloc);
1387 } else {
1388 /*
1389 * There are approximately max_size saveable pages at this point
1390 * and we want to reduce this number down to size.
1391 */
1392 alloc = max_size - size;
1393 size = preallocate_highmem_fraction(alloc, highmem, count);
1394 pages_highmem += size;
1395 alloc -= size;
1396 size = preallocate_image_memory(alloc, avail_normal);
1397 pages_highmem += preallocate_image_highmem(alloc - size);
1398 pages += pages_highmem + size;
1399 }
1348 1400
1349 /* 1401 /*
1350 * We only need as many page frames for the image as there are saveable 1402 * We only need as many page frames for the image as there are saveable
@@ -1635,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1635 memory_bm_position_reset(&orig_bm); 1687 memory_bm_position_reset(&orig_bm);
1636 memory_bm_position_reset(&copy_bm); 1688 memory_bm_position_reset(&copy_bm);
1637 } else if (handle->cur <= nr_meta_pages) { 1689 } else if (handle->cur <= nr_meta_pages) {
1638 memset(buffer, 0, PAGE_SIZE); 1690 clear_page(buffer);
1639 pack_pfns(buffer, &orig_bm); 1691 pack_pfns(buffer, &orig_bm);
1640 } else { 1692 } else {
1641 struct page *page; 1693 struct page *page;
@@ -1649,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1649 void *kaddr; 1701 void *kaddr;
1650 1702
1651 kaddr = kmap_atomic(page, KM_USER0); 1703 kaddr = kmap_atomic(page, KM_USER0);
1652 memcpy(buffer, kaddr, PAGE_SIZE); 1704 copy_page(buffer, kaddr);
1653 kunmap_atomic(kaddr, KM_USER0); 1705 kunmap_atomic(kaddr, KM_USER0);
1654 handle->buffer = buffer; 1706 handle->buffer = buffer;
1655 } else { 1707 } else {
@@ -1932,7 +1984,7 @@ static void copy_last_highmem_page(void)
1932 void *dst; 1984 void *dst;
1933 1985
1934 dst = kmap_atomic(last_highmem_page, KM_USER0); 1986 dst = kmap_atomic(last_highmem_page, KM_USER0);
1935 memcpy(dst, buffer, PAGE_SIZE); 1987 copy_page(dst, buffer);
1936 kunmap_atomic(dst, KM_USER0); 1988 kunmap_atomic(dst, KM_USER0);
1937 last_highmem_page = NULL; 1989 last_highmem_page = NULL;
1938 } 1990 }
@@ -2218,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2218 2270
2219 kaddr1 = kmap_atomic(p1, KM_USER0); 2271 kaddr1 = kmap_atomic(p1, KM_USER0);
2220 kaddr2 = kmap_atomic(p2, KM_USER1); 2272 kaddr2 = kmap_atomic(p2, KM_USER1);
2221 memcpy(buf, kaddr1, PAGE_SIZE); 2273 copy_page(buf, kaddr1);
2222 memcpy(kaddr1, kaddr2, PAGE_SIZE); 2274 copy_page(kaddr1, kaddr2);
2223 memcpy(kaddr2, buf, PAGE_SIZE); 2275 copy_page(kaddr2, buf);
2224 kunmap_atomic(kaddr1, KM_USER0);
2225 kunmap_atomic(kaddr2, KM_USER1); 2276 kunmap_atomic(kaddr2, KM_USER1);
2277 kunmap_atomic(kaddr1, KM_USER0);
2226} 2278}
2227 2279
2228/** 2280/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f37cb7dd4402..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
136 if (suspend_ops->prepare) { 136 if (suspend_ops->prepare) {
137 error = suspend_ops->prepare(); 137 error = suspend_ops->prepare();
138 if (error) 138 if (error)
139 return error; 139 goto Platform_finish;
140 } 140 }
141 141
142 error = dpm_suspend_noirq(PMSG_SUSPEND); 142 error = dpm_suspend_noirq(PMSG_SUSPEND);
143 if (error) { 143 if (error) {
144 printk(KERN_ERR "PM: Some devices failed to power down\n"); 144 printk(KERN_ERR "PM: Some devices failed to power down\n");
145 goto Platfrom_finish; 145 goto Platform_finish;
146 } 146 }
147 147
148 if (suspend_ops->prepare_late) { 148 if (suspend_ops->prepare_late) {
149 error = suspend_ops->prepare_late(); 149 error = suspend_ops->prepare_late();
150 if (error) 150 if (error)
151 goto Power_up_devices; 151 goto Platform_wake;
152 } 152 }
153 153
154 if (suspend_test(TEST_PLATFORM)) 154 if (suspend_test(TEST_PLATFORM))
@@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
163 163
164 error = sysdev_suspend(PMSG_SUSPEND); 164 error = sysdev_suspend(PMSG_SUSPEND);
165 if (!error) { 165 if (!error) {
166 if (!suspend_test(TEST_CORE)) 166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
167 error = suspend_ops->enter(state); 167 error = suspend_ops->enter(state);
168 events_check_enabled = false;
169 }
168 sysdev_resume(); 170 sysdev_resume();
169 } 171 }
170 172
@@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
178 if (suspend_ops->wake) 180 if (suspend_ops->wake)
179 suspend_ops->wake(); 181 suspend_ops->wake();
180 182
181 Power_up_devices:
182 dpm_resume_noirq(PMSG_RESUME); 183 dpm_resume_noirq(PMSG_RESUME);
183 184
184 Platfrom_finish: 185 Platform_finish:
185 if (suspend_ops->finish) 186 if (suspend_ops->finish)
186 suspend_ops->finish(); 187 suspend_ops->finish();
187 188
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b0bb21778391..a0e4a86ccf94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
4 * This file provides functions for reading the suspend image from 4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition. 5 * and writing it to a swap partition.
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * 9 *
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
@@ -24,15 +24,17 @@
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/lzo.h>
28#include <linux/vmalloc.h>
27 29
28#include "power.h" 30#include "power.h"
29 31
30#define SWSUSP_SIG "S1SUSPEND" 32#define HIBERNATE_SIG "LINHIB0001"
31 33
32/* 34/*
33 * The swap map is a data structure used for keeping track of each page 35 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page 36 * written to a swap partition. It consists of many swap_map_page
35 * structures that contain each an array of MAP_PAGE_SIZE swap entries. 37 * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the 38 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member. 39 * help of the .next_swap member.
38 * 40 *
@@ -148,7 +150,7 @@ sector_t alloc_swapdev_block(int swap)
148 150
149/** 151/**
150 * free_all_swap_pages - free swap pages allocated for saving image data. 152 * free_all_swap_pages - free swap pages allocated for saving image data.
 151 * It also frees the extents used to register which swap entries had been 153
152 * allocated. 154 * allocated.
153 */ 155 */
154 156
@@ -193,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 195 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 196 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 197 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 198 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
197 swsusp_header->image = handle->first_sector; 199 swsusp_header->image = handle->first_sector;
198 swsusp_header->flags = flags; 200 swsusp_header->flags = flags;
199 error = hib_bio_write_page(swsusp_resume_block, 201 error = hib_bio_write_page(swsusp_resume_block,
@@ -249,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
249 if (bio_chain) { 251 if (bio_chain) {
250 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 252 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
251 if (src) { 253 if (src) {
252 memcpy(src, buf, PAGE_SIZE); 254 copy_page(src, buf);
253 } else { 255 } else {
254 WARN_ON_ONCE(1); 256 WARN_ON_ONCE(1);
255 bio_chain = NULL; /* Go synchronous */ 257 bio_chain = NULL; /* Go synchronous */
@@ -323,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
323 error = write_page(handle->cur, handle->cur_swap, NULL); 325 error = write_page(handle->cur, handle->cur_swap, NULL);
324 if (error) 326 if (error)
325 goto out; 327 goto out;
326 memset(handle->cur, 0, PAGE_SIZE); 328 clear_page(handle->cur);
327 handle->cur_swap = offset; 329 handle->cur_swap = offset;
328 handle->k = 0; 330 handle->k = 0;
329 } 331 }
@@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle,
357 return error; 359 return error;
358} 360}
359 361
362/* We need to remember how much compressed data we need to read. */
363#define LZO_HEADER sizeof(size_t)
364
365/* Number of pages/bytes we'll compress at one time. */
366#define LZO_UNC_PAGES 32
367#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
368
369/* Number of pages/bytes we need for compressed data (worst case). */
370#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
371 LZO_HEADER, PAGE_SIZE)
372#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
373
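A quick check of these constants, assuming 4 KiB pages, an 8-byte size_t and the usual lzo1x_worst_compress(x) = x + x/16 + 64 + 3 bound: LZO_UNC_SIZE is 32 * 4096 = 131072 bytes, the worst-case compressed size is 131072 + 8192 + 67 = 139331 bytes, adding the 8-byte header gives 139339 bytes, which rounds up to LZO_CMP_PAGES = 35, so each 32-page chunk may occupy at most 35 pages on disk.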
360/** 374/**
361 * save_image - save the suspend image data 375 * save_image - save the suspend image data
362 */ 376 */
@@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle,
404 return ret; 418 return ret;
405} 419}
406 420
421
422/**
423 * save_image_lzo - Save the suspend image data compressed with LZO.
424 * @handle: Swap mam handle to use for saving the image.
425 * @snapshot: Image to read data from.
426 * @nr_to_write: Number of pages to save.
427 */
428static int save_image_lzo(struct swap_map_handle *handle,
429 struct snapshot_handle *snapshot,
430 unsigned int nr_to_write)
431{
432 unsigned int m;
433 int ret = 0;
434 int nr_pages;
435 int err2;
436 struct bio *bio;
437 struct timeval start;
438 struct timeval stop;
439 size_t off, unc_len, cmp_len;
440 unsigned char *unc, *cmp, *wrk, *page;
441
442 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
443 if (!page) {
444 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
445 return -ENOMEM;
446 }
447
448 wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
449 if (!wrk) {
450 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
451 free_page((unsigned long)page);
452 return -ENOMEM;
453 }
454
455 unc = vmalloc(LZO_UNC_SIZE);
456 if (!unc) {
457 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
458 vfree(wrk);
459 free_page((unsigned long)page);
460 return -ENOMEM;
461 }
462
463 cmp = vmalloc(LZO_CMP_SIZE);
464 if (!cmp) {
465 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
466 vfree(unc);
467 vfree(wrk);
468 free_page((unsigned long)page);
469 return -ENOMEM;
470 }
471
472 printk(KERN_INFO
473 "PM: Compressing and saving image data (%u pages) ... ",
474 nr_to_write);
475 m = nr_to_write / 100;
476 if (!m)
477 m = 1;
478 nr_pages = 0;
479 bio = NULL;
480 do_gettimeofday(&start);
481 for (;;) {
482 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
483 ret = snapshot_read_next(snapshot);
484 if (ret < 0)
485 goto out_finish;
486
487 if (!ret)
488 break;
489
490 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
491
492 if (!(nr_pages % m))
493 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
494 nr_pages++;
495 }
496
497 if (!off)
498 break;
499
500 unc_len = off;
501 ret = lzo1x_1_compress(unc, unc_len,
502 cmp + LZO_HEADER, &cmp_len, wrk);
503 if (ret < 0) {
504 printk(KERN_ERR "PM: LZO compression failed\n");
505 break;
506 }
507
508 if (unlikely(!cmp_len ||
509 cmp_len > lzo1x_worst_compress(unc_len))) {
510 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
511 ret = -1;
512 break;
513 }
514
515 *(size_t *)cmp = cmp_len;
516
517 /*
518 * Given we are writing one page at a time to disk, we copy
519 * that much from the buffer, although the last bit will likely
 520 * be smaller than a full page. This is OK - we saved the length
521 * of the compressed data, so any garbage at the end will be
522 * discarded when we read it.
523 */
524 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
525 memcpy(page, cmp + off, PAGE_SIZE);
526
527 ret = swap_write_page(handle, page, &bio);
528 if (ret)
529 goto out_finish;
530 }
531 }
532
533out_finish:
534 err2 = hib_wait_on_bio_chain(&bio);
535 do_gettimeofday(&stop);
536 if (!ret)
537 ret = err2;
538 if (!ret)
539 printk(KERN_CONT "\b\b\b\bdone\n");
540 else
541 printk(KERN_CONT "\n");
542 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
543
544 vfree(cmp);
545 vfree(unc);
546 vfree(wrk);
547 free_page((unsigned long)page);
548
549 return ret;
550}
551
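To summarize the on-disk layout the function above produces (implied by the code rather than documented elsewhere in this patch): each chunk of up to LZO_UNC_PAGES snapshot pages becomes one record, a size_t length header followed by the compressed data, padded to a page boundary because swap_write_page() always writes whole pages:

/*
 *   +----------+------------------------------+----------------+
 *   | size_t   | cmp_len bytes of LZO data    | padding up to  |
 *   | cmp_len  |                              | page boundary  |
 *   +----------+------------------------------+----------------+
 *
 * load_image_lzo() reads the stored cmp_len back first and uses it
 * to ignore the padding.
 */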
407/** 552/**
408 * enough_swap - Make sure we have enough swap to save the image. 553 * enough_swap - Make sure we have enough swap to save the image.
409 * 554 *
@@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle,
 411 * space available from the resume partition. 556
412 */ 557 */
413 558
414static int enough_swap(unsigned int nr_pages) 559static int enough_swap(unsigned int nr_pages, unsigned int flags)
415{ 560{
416 unsigned int free_swap = count_swap_pages(root_swap, 1); 561 unsigned int free_swap = count_swap_pages(root_swap, 1);
562 unsigned int required;
417 563
418 pr_debug("PM: Free swap pages: %u\n", free_swap); 564 pr_debug("PM: Free swap pages: %u\n", free_swap);
419 return free_swap > nr_pages + PAGES_FOR_IO; 565
566 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
567 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
568 return free_swap > required;
420} 569}
421 570
422/** 571/**
@@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags)
443 printk(KERN_ERR "PM: Cannot get swap writer\n"); 592 printk(KERN_ERR "PM: Cannot get swap writer\n");
444 return error; 593 return error;
445 } 594 }
446 if (!enough_swap(pages)) { 595 if (!enough_swap(pages, flags)) {
447 printk(KERN_ERR "PM: Not enough free swap\n"); 596 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC; 597 error = -ENOSPC;
449 goto out_finish; 598 goto out_finish;
@@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags)
458 } 607 }
459 header = (struct swsusp_info *)data_of(snapshot); 608 header = (struct swsusp_info *)data_of(snapshot);
460 error = swap_write_page(&handle, header, NULL); 609 error = swap_write_page(&handle, header, NULL);
461 if (!error) 610 if (!error) {
462 error = save_image(&handle, &snapshot, pages - 1); 611 error = (flags & SF_NOCOMPRESS_MODE) ?
612 save_image(&handle, &snapshot, pages - 1) :
613 save_image_lzo(&handle, &snapshot, pages - 1);
614 }
463out_finish: 615out_finish:
464 error = swap_writer_finish(&handle, flags, error); 616 error = swap_writer_finish(&handle, flags, error);
465 return error; 617 return error;
@@ -590,6 +742,127 @@ static int load_image(struct swap_map_handle *handle,
590} 742}
591 743
592/** 744/**
745 * load_image_lzo - Load compressed image data and decompress them with LZO.
746 * @handle: Swap map handle to use for loading data.
747 * @snapshot: Image to copy uncompressed data into.
748 * @nr_to_read: Number of pages to load.
749 */
750static int load_image_lzo(struct swap_map_handle *handle,
751 struct snapshot_handle *snapshot,
752 unsigned int nr_to_read)
753{
754 unsigned int m;
755 int error = 0;
756 struct timeval start;
757 struct timeval stop;
758 unsigned nr_pages;
759 size_t off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page;
761
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
763 if (!page) {
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
765 return -ENOMEM;
766 }
767
768 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page);
772 return -ENOMEM;
773 }
774
775 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
778 vfree(unc);
779 free_page((unsigned long)page);
780 return -ENOMEM;
781 }
782
783 printk(KERN_INFO
784 "PM: Loading and decompressing image data (%u pages) ... ",
785 nr_to_read);
786 m = nr_to_read / 100;
787 if (!m)
788 m = 1;
789 nr_pages = 0;
790 do_gettimeofday(&start);
791
792 error = snapshot_write_next(snapshot);
793 if (error <= 0)
794 goto out_finish;
795
796 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */
798 if (error)
799 break;
800
801 cmp_len = *(size_t *)page;
802 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
805 error = -1;
806 break;
807 }
808
809 memcpy(cmp, page, PAGE_SIZE);
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
811 error = swap_read_page(handle, page, NULL); /* sync */
812 if (error)
813 goto out_finish;
814
815 memcpy(cmp + off, page, PAGE_SIZE);
816 }
817
818 unc_len = LZO_UNC_SIZE;
819 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
820 unc, &unc_len);
821 if (error < 0) {
822 printk(KERN_ERR "PM: LZO decompression failed\n");
823 break;
824 }
825
826 if (unlikely(!unc_len ||
827 unc_len > LZO_UNC_SIZE ||
828 unc_len & (PAGE_SIZE - 1))) {
829 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
830 error = -1;
831 break;
832 }
833
834 for (off = 0; off < unc_len; off += PAGE_SIZE) {
835 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
836
837 if (!(nr_pages % m))
838 printk("\b\b\b\b%3d%%", nr_pages / m);
839 nr_pages++;
840
841 error = snapshot_write_next(snapshot);
842 if (error <= 0)
843 goto out_finish;
844 }
845 }
846
847out_finish:
848 do_gettimeofday(&stop);
849 if (!error) {
850 printk("\b\b\b\bdone\n");
851 snapshot_write_finalize(snapshot);
852 if (!snapshot_image_loaded(snapshot))
853 error = -ENODATA;
854 } else
855 printk("\n");
856 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
857
858 vfree(cmp);
859 vfree(unc);
860 free_page((unsigned long)page);
861
862 return error;
863}
864
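
On the read side each chunk is validated twice: cmp_len must be non-zero and no larger than lzo1x_worst_compress(LZO_UNC_SIZE), and the decompressed length must be a non-zero, page-aligned value no larger than LZO_UNC_SIZE. The second check as a standalone sketch (parameter names are illustrative):

    /* Sketch of the uncompressed-length sanity check used above. */
    static int unc_len_valid(size_t unc_len, size_t unc_size, size_t page_size)
    {
            return unc_len != 0 &&
                   unc_len <= unc_size &&
                   (unc_len & (page_size - 1)) == 0;   /* whole pages only */
    }
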
865/**
593 * swsusp_read - read the hibernation image. 866 * swsusp_read - read the hibernation image.
594 * @flags_p: flags passed by the "frozen" kernel in the image header should 867 * @flags_p: flags passed by the "frozen" kernel in the image header should
595 * be written into this memory location 868
@@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p)
612 goto end; 885 goto end;
613 if (!error) 886 if (!error)
614 error = swap_read_page(&handle, header, NULL); 887 error = swap_read_page(&handle, header, NULL);
615 if (!error) 888 if (!error) {
616 error = load_image(&handle, &snapshot, header->pages - 1); 889 error = (*flags_p & SF_NOCOMPRESS_MODE) ?
890 load_image(&handle, &snapshot, header->pages - 1) :
891 load_image_lzo(&handle, &snapshot, header->pages - 1);
892 }
617 swap_reader_finish(&handle); 893 swap_reader_finish(&handle);
618end: 894end:
619 if (!error) 895 if (!error)
@@ -634,13 +910,13 @@ int swsusp_check(void)
634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
635 if (!IS_ERR(hib_resume_bdev)) { 911 if (!IS_ERR(hib_resume_bdev)) {
636 set_blocksize(hib_resume_bdev, PAGE_SIZE); 912 set_blocksize(hib_resume_bdev, PAGE_SIZE);
637 memset(swsusp_header, 0, PAGE_SIZE); 913 clear_page(swsusp_header);
638 error = hib_bio_read_page(swsusp_resume_block, 914 error = hib_bio_read_page(swsusp_resume_block,
639 swsusp_header, NULL); 915 swsusp_header, NULL);
640 if (error) 916 if (error)
641 goto put; 917 goto put;
642 918
643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 919 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 920 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
645 /* Reset swap signature now */ 921 /* Reset swap signature now */
646 error = hib_bio_write_page(swsusp_resume_block, 922 error = hib_bio_write_page(swsusp_resume_block,
@@ -653,13 +929,13 @@ put:
653 if (error) 929 if (error)
654 blkdev_put(hib_resume_bdev, FMODE_READ); 930 blkdev_put(hib_resume_bdev, FMODE_READ);
655 else 931 else
656 pr_debug("PM: Signature found, resuming\n"); 932 pr_debug("PM: Image signature found, resuming\n");
657 } else { 933 } else {
658 error = PTR_ERR(hib_resume_bdev); 934 error = PTR_ERR(hib_resume_bdev);
659 } 935 }
660 936
661 if (error) 937 if (error)
662 pr_debug("PM: Error %d checking image file\n", error); 938 pr_debug("PM: Image not found (code %d)\n", error);
663 939
664 return error; 940 return error;
665} 941}
diff --git a/kernel/printk.c b/kernel/printk.c
index 444b770c9595..b2ebaee8c377 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,8 @@
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h>
41#include <linux/notifier.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42 44
@@ -83,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
83 * provides serialisation for access to the entire console 85 * provides serialisation for access to the entire console
84 * driver system. 86 * driver system.
85 */ 87 */
86static DECLARE_MUTEX(console_sem); 88static DEFINE_SEMAPHORE(console_sem);
87struct console *console_drivers; 89struct console *console_drivers;
88EXPORT_SYMBOL_GPL(console_drivers); 90EXPORT_SYMBOL_GPL(console_drivers);
89 91
@@ -208,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup);
208 210
209#ifdef CONFIG_BOOT_PRINTK_DELAY 211#ifdef CONFIG_BOOT_PRINTK_DELAY
210 212
211static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 213static int boot_delay; /* msecs delay after each printk during bootup */
212static unsigned long long loops_per_msec; /* based on boot_delay */ 214static unsigned long long loops_per_msec; /* based on boot_delay */
213 215
214static int __init boot_delay_setup(char *str) 216static int __init boot_delay_setup(char *str)
@@ -554,7 +556,7 @@ static void zap_locks(void)
554 /* If a crash is occurring, make sure we can't deadlock */ 556 /* If a crash is occurring, make sure we can't deadlock */
555 spin_lock_init(&logbuf_lock); 557 spin_lock_init(&logbuf_lock);
556 /* And make sure that we print immediately */ 558 /* And make sure that we print immediately */
557 init_MUTEX(&console_sem); 559 sema_init(&console_sem, 1);
558} 560}
559 561
560#if defined(CONFIG_PRINTK_TIME) 562#if defined(CONFIG_PRINTK_TIME)
@@ -645,6 +647,7 @@ static inline int can_use_console(unsigned int cpu)
645 * released but interrupts still disabled. 647 * released but interrupts still disabled.
646 */ 648 */
647static int acquire_console_semaphore_for_printk(unsigned int cpu) 649static int acquire_console_semaphore_for_printk(unsigned int cpu)
650 __releases(&logbuf_lock)
648{ 651{
649 int retval = 0; 652 int retval = 0;
650 653
@@ -985,6 +988,32 @@ void resume_console(void)
985} 988}
986 989
987/** 990/**
991 * console_cpu_notify - print deferred console messages after CPU hotplug
992 * @self: notifier struct
993 * @action: CPU hotplug event
994 * @hcpu: unused
995 *
996 * If printk() is called from a CPU that is not online yet, the messages
997 * will be spooled but will not show up on the console. This function is
998 * called when a new CPU comes online (or fails to come up), and ensures
999 * that any such output gets printed.
1000 */
1001static int __cpuinit console_cpu_notify(struct notifier_block *self,
1002 unsigned long action, void *hcpu)
1003{
1004 switch (action) {
1005 case CPU_ONLINE:
1006 case CPU_DEAD:
1007 case CPU_DYING:
1008 case CPU_DOWN_FAILED:
1009 case CPU_UP_CANCELED:
1010 acquire_console_sem();
1011 release_console_sem();
1012 }
1013 return NOTIFY_OK;
1014}
1015
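
console_cpu_notify() relies on the fact that taking and then dropping console_sem replays any output that was spooled while no CPU could print it. A minimal sketch of wiring a similar hook elsewhere; my_console_kick is a made-up name, and the registration mirrors the hotcpu_notifier() call added to printk_late_init() below:

    /* Sketch only: nudge the console when a CPU hotplug event completes. */
    static int my_console_kick(struct notifier_block *nb,
                               unsigned long action, void *hcpu)
    {
            acquire_console_sem();  /* waits until the console is free ... */
            release_console_sem();  /* ... and flushes any pending messages */
            return NOTIFY_OK;
    }

    /* from some late initcall:
     *      hotcpu_notifier(my_console_kick, 0);
     */
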
1016/**
988 * acquire_console_sem - lock the console system for exclusive use. 1017 * acquire_console_sem - lock the console system for exclusive use.
989 * 1018 *
990 * Acquires a semaphore which guarantees that the caller has 1019 * Acquires a semaphore which guarantees that the caller has
@@ -1371,7 +1400,7 @@ int unregister_console(struct console *console)
1371} 1400}
1372EXPORT_SYMBOL(unregister_console); 1401EXPORT_SYMBOL(unregister_console);
1373 1402
1374static int __init disable_boot_consoles(void) 1403static int __init printk_late_init(void)
1375{ 1404{
1376 struct console *con; 1405 struct console *con;
1377 1406
@@ -1382,9 +1411,10 @@ static int __init disable_boot_consoles(void)
1382 unregister_console(con); 1411 unregister_console(con);
1383 } 1412 }
1384 } 1413 }
1414 hotcpu_notifier(console_cpu_notify, 0);
1385 return 0; 1415 return 0;
1386} 1416}
1387late_initcall(disable_boot_consoles); 1417late_initcall(printk_late_init);
1388 1418
1389#if defined CONFIG_PRINTK 1419#if defined CONFIG_PRINTK
1390 1420
@@ -1482,7 +1512,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1482} 1512}
1483EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1513EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1484 1514
1485static const char const *kmsg_reasons[] = { 1515static const char * const kmsg_reasons[] = {
1486 [KMSG_DUMP_OOPS] = "oops", 1516 [KMSG_DUMP_OOPS] = "oops",
1487 [KMSG_DUMP_PANIC] = "panic", 1517 [KMSG_DUMP_PANIC] = "panic",
1488 [KMSG_DUMP_KEXEC] = "kexec", 1518 [KMSG_DUMP_KEXEC] = "kexec",
@@ -1520,9 +1550,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1520 chars = logged_chars; 1550 chars = logged_chars;
1521 spin_unlock_irqrestore(&logbuf_lock, flags); 1551 spin_unlock_irqrestore(&logbuf_lock, flags);
1522 1552
1523 if (logged_chars > end) { 1553 if (chars > end) {
1524 s1 = log_buf + log_buf_len - logged_chars + end; 1554 s1 = log_buf + log_buf_len - chars + end;
1525 l1 = logged_chars - end; 1555 l1 = chars - end;
1526 1556
1527 s2 = log_buf; 1557 s2 = log_buf;
1528 l2 = end; 1558 l2 = end;
@@ -1530,8 +1560,8 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1530 s1 = ""; 1560 s1 = "";
1531 l1 = 0; 1561 l1 = 0;
1532 1562
1533 s2 = log_buf + end - logged_chars; 1563 s2 = log_buf + end - chars;
1534 l2 = logged_chars; 1564 l2 = chars;
1535 } 1565 }
1536 1566
1537 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1567 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
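The kmsg_dump() fix above consistently uses the snapshotted chars value when splitting the circular log buffer into (s1, l1) and (s2, l2). The same split as a standalone helper, with buf/len/end/chars standing in for log_buf, log_buf_len, end and the snapshot of logged_chars:

    /* Sketch: last 'chars' bytes of a ring buffer that currently ends at 'end'. */
    static void ring_last_chars(const char *buf, size_t len, size_t end, size_t chars,
                                const char **s1, size_t *l1,
                                const char **s2, size_t *l2)
    {
            if (chars > end) {              /* the span wraps past the buffer end */
                    *s1 = buf + len - chars + end;
                    *l1 = chars - end;
                    *s2 = buf;
                    *l2 = end;
            } else {                        /* contiguous span ending at 'end' */
                    *s1 = "";
                    *l1 = 0;
                    *s2 = buf + end - chars;
                    *l2 = chars;
            }
    }
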
diff --git a/kernel/profile.c b/kernel/profile.c
index b22a899934cc..66f841b7fbd3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
555static const struct file_operations proc_profile_operations = { 555static const struct file_operations proc_profile_operations = {
556 .read = read_profile, 556 .read = read_profile,
557 .write = write_profile, 557 .write = write_profile,
558 .llseek = default_llseek,
558}; 559};
559 560
560#ifdef CONFIG_SMP 561#ifdef CONFIG_SMP
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 74a3d693c196..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
181 * under ptrace. 181 * under ptrace.
182 */ 182 */
183 retval = -ERESTARTNOINTR; 183 retval = -ERESTARTNOINTR;
184 if (mutex_lock_interruptible(&task->cred_guard_mutex)) 184 if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
185 goto out; 185 goto out;
186 186
187 task_lock(task); 187 task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
208unlock_tasklist: 208unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 209 write_unlock_irq(&tasklist_lock);
210unlock_creds: 210unlock_creds:
211 mutex_unlock(&task->cred_guard_mutex); 211 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 212out:
213 return retval; 213 return retval;
214} 214}
@@ -324,26 +324,34 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
324} 324}
325 325
326/* 326/*
327 * Detach all tasks we were using ptrace on. 327 * Detach all tasks we were using ptrace on. Called with tasklist held
328 * for writing, and returns with it held too. But note it can release
329 * and reacquire the lock.
328 */ 330 */
329void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
332 __releases(&tasklist_lock)
333 __acquires(&tasklist_lock)
330{ 334{
331 struct task_struct *p, *n; 335 struct task_struct *p, *n;
332 LIST_HEAD(ptrace_dead); 336 LIST_HEAD(ptrace_dead);
333 337
334 write_lock_irq(&tasklist_lock); 338 if (likely(list_empty(&tracer->ptraced)))
339 return;
340
335 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 341 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
336 if (__ptrace_detach(tracer, p)) 342 if (__ptrace_detach(tracer, p))
337 list_add(&p->ptrace_entry, &ptrace_dead); 343 list_add(&p->ptrace_entry, &ptrace_dead);
338 } 344 }
339 write_unlock_irq(&tasklist_lock);
340 345
346 write_unlock_irq(&tasklist_lock);
341 BUG_ON(!list_empty(&tracer->ptraced)); 347 BUG_ON(!list_empty(&tracer->ptraced));
342 348
343 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { 349 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
344 list_del_init(&p->ptrace_entry); 350 list_del_init(&p->ptrace_entry);
345 release_task(p); 351 release_task(p);
346 } 352 }
353
354 write_lock_irq(&tasklist_lock);
347} 355}
348 356
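
The new __releases()/__acquires() annotations on exit_ptrace() document, for sparse, that the function temporarily drops a lock its caller holds. The same annotation pattern on a generic helper, sketched here with a made-up spinlock:

    /* Sketch: annotate a helper that drops and retakes a caller-held lock. */
    static DEFINE_SPINLOCK(my_lock);

    static void do_work_dropping_lock(void)
            __releases(&my_lock)
            __acquires(&my_lock)
    {
            spin_unlock(&my_lock);
            /* ... work that must not run under the lock ... */
            spin_lock(&my_lock);
    }
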
349int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 357int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
@@ -396,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
396 return copied; 404 return copied;
397} 405}
398 406
399static int ptrace_setoptions(struct task_struct *child, long data) 407static int ptrace_setoptions(struct task_struct *child, unsigned long data)
400{ 408{
401 child->ptrace &= ~PT_TRACE_MASK; 409 child->ptrace &= ~PT_TRACE_MASK;
402 410
@@ -475,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
475#define is_sysemu_singlestep(request) 0 483#define is_sysemu_singlestep(request) 0
476#endif 484#endif
477 485
478static int ptrace_resume(struct task_struct *child, long request, long data) 486static int ptrace_resume(struct task_struct *child, long request,
487 unsigned long data)
479{ 488{
480 if (!valid_signal(data)) 489 if (!valid_signal(data))
481 return -EIO; 490 return -EIO;
@@ -552,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
552#endif 561#endif
553 562
554int ptrace_request(struct task_struct *child, long request, 563int ptrace_request(struct task_struct *child, long request,
555 long addr, long data) 564 unsigned long addr, unsigned long data)
556{ 565{
557 int ret = -EIO; 566 int ret = -EIO;
558 siginfo_t siginfo; 567 siginfo_t siginfo;
568 void __user *datavp = (void __user *) data;
569 unsigned long __user *datalp = datavp;
559 570
560 switch (request) { 571 switch (request) {
561 case PTRACE_PEEKTEXT: 572 case PTRACE_PEEKTEXT:
@@ -572,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,
572 ret = ptrace_setoptions(child, data); 583 ret = ptrace_setoptions(child, data);
573 break; 584 break;
574 case PTRACE_GETEVENTMSG: 585 case PTRACE_GETEVENTMSG:
575 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 586 ret = put_user(child->ptrace_message, datalp);
576 break; 587 break;
577 588
578 case PTRACE_GETSIGINFO: 589 case PTRACE_GETSIGINFO:
579 ret = ptrace_getsiginfo(child, &siginfo); 590 ret = ptrace_getsiginfo(child, &siginfo);
580 if (!ret) 591 if (!ret)
581 ret = copy_siginfo_to_user((siginfo_t __user *) data, 592 ret = copy_siginfo_to_user(datavp, &siginfo);
582 &siginfo);
583 break; 593 break;
584 594
585 case PTRACE_SETSIGINFO: 595 case PTRACE_SETSIGINFO:
586 if (copy_from_user(&siginfo, (siginfo_t __user *) data, 596 if (copy_from_user(&siginfo, datavp, sizeof siginfo))
587 sizeof siginfo))
588 ret = -EFAULT; 597 ret = -EFAULT;
589 else 598 else
590 ret = ptrace_setsiginfo(child, &siginfo); 599 ret = ptrace_setsiginfo(child, &siginfo);
@@ -615,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,
615 } 624 }
616 mmput(mm); 625 mmput(mm);
617 626
618 ret = put_user(tmp, (unsigned long __user *) data); 627 ret = put_user(tmp, datalp);
619 break; 628 break;
620 } 629 }
621#endif 630#endif
@@ -644,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,
644 case PTRACE_SETREGSET: 653 case PTRACE_SETREGSET:
645 { 654 {
646 struct iovec kiov; 655 struct iovec kiov;
647 struct iovec __user *uiov = (struct iovec __user *) data; 656 struct iovec __user *uiov = datavp;
648 657
649 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) 658 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
650 return -EFAULT; 659 return -EFAULT;
@@ -685,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
685#define arch_ptrace_attach(child) do { } while (0) 694#define arch_ptrace_attach(child) do { } while (0)
686#endif 695#endif
687 696
688SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) 697SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
698 unsigned long, data)
689{ 699{
690 struct task_struct *child; 700 struct task_struct *child;
691 long ret; 701 long ret;
@@ -726,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
726 return ret; 736 return ret;
727} 737}
728 738
729int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 739int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
740 unsigned long data)
730{ 741{
731 unsigned long tmp; 742 unsigned long tmp;
732 int copied; 743 int copied;
@@ -737,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
737 return put_user(tmp, (unsigned long __user *)data); 748 return put_user(tmp, (unsigned long __user *)data);
738} 749}
739 750
740int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) 751int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
752 unsigned long data)
741{ 753{
742 int copied; 754 int copied;
743 755
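
With data now an unsigned long, requests such as PTRACE_GETEVENTMSG still interpret it as a user-space pointer (datalp above) that the kernel fills in. A minimal user-space sketch of that request, error handling omitted:

    /* User-space sketch: read the event message after a ptrace stop. */
    #include <sys/ptrace.h>
    #include <sys/types.h>

    static unsigned long get_event_msg(pid_t child)
    {
            unsigned long msg = 0;

            ptrace(PTRACE_GETEVENTMSG, child, 0, &msg); /* kernel writes via 'data' */
            return msg;
    }
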
diff --git a/kernel/range.c b/kernel/range.c
index 74e2e6114927..471b66acabb5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -7,10 +7,6 @@
7 7
8#include <linux/range.h> 8#include <linux/range.h>
9 9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) 10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{ 11{
16 if (start >= end) 12 if (start >= end)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 72a8dc9567f5..a23a57a976d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74 74
75/** 75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? 76 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
77 * 77 *
78 * Check for bottom half being disabled, which covers both the 78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses 79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) 80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation. 81 * will show the situation. This is useful for debug checks in functions
82 * that require that they be called within an RCU read-side critical
83 * section.
82 * 84 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */ 86 */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
86{ 88{
87 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
88 return 1; 90 return 1;
89 return in_softirq(); 91 return in_softirq() || irqs_disabled();
90} 92}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92 94
@@ -114,3 +116,163 @@ int rcu_my_thread_group_empty(void)
114} 116}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); 117EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */ 118#endif /* #ifdef CONFIG_PROVE_RCU */
119
120#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
121static inline void debug_init_rcu_head(struct rcu_head *head)
122{
123 debug_object_init(head, &rcuhead_debug_descr);
124}
125
126static inline void debug_rcu_head_free(struct rcu_head *head)
127{
128 debug_object_free(head, &rcuhead_debug_descr);
129}
130
131/*
132 * fixup_init is called when:
133 * - an active object is initialized
134 */
135static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
136{
137 struct rcu_head *head = addr;
138
139 switch (state) {
140 case ODEBUG_STATE_ACTIVE:
141 /*
142 * Ensure that queued callbacks are all executed.
143 * If we detect that we are nested in a RCU read-side critical
144 * section, we should simply fail, otherwise we would deadlock.
145 */
146 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
147 irqs_disabled()) {
148 WARN_ON(1);
149 return 0;
150 }
151 rcu_barrier();
152 rcu_barrier_sched();
153 rcu_barrier_bh();
154 debug_object_init(head, &rcuhead_debug_descr);
155 return 1;
156 default:
157 return 0;
158 }
159}
160
161/*
162 * fixup_activate is called when:
163 * - an active object is activated
164 * - an unknown object is activated (might be a statically initialized object)
165 * Activation is performed internally by call_rcu().
166 */
167static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
168{
169 struct rcu_head *head = addr;
170
171 switch (state) {
172
173 case ODEBUG_STATE_NOTAVAILABLE:
174 /*
175 * This is not really a fixup. We just make sure that it is
176 * tracked in the object tracker.
177 */
178 debug_object_init(head, &rcuhead_debug_descr);
179 debug_object_activate(head, &rcuhead_debug_descr);
180 return 0;
181
182 case ODEBUG_STATE_ACTIVE:
183 /*
184 * Ensure that queued callbacks are all executed.
185 * If we detect that we are nested in a RCU read-side critical
186 * section, we should simply fail, otherwise we would deadlock.
187 */
188 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
189 irqs_disabled()) {
190 WARN_ON(1);
191 return 0;
192 }
193 rcu_barrier();
194 rcu_barrier_sched();
195 rcu_barrier_bh();
196 debug_object_activate(head, &rcuhead_debug_descr);
197 return 1;
198 default:
199 return 0;
200 }
201}
202
203/*
204 * fixup_free is called when:
205 * - an active object is freed
206 */
207static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
208{
209 struct rcu_head *head = addr;
210
211 switch (state) {
212 case ODEBUG_STATE_ACTIVE:
213 /*
214 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock.
217 */
218#ifndef CONFIG_PREEMPT
219 WARN_ON(1);
220 return 0;
221#else
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) {
224 WARN_ON(1);
225 return 0;
226 }
227 rcu_barrier();
228 rcu_barrier_sched();
229 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr);
231 return 1;
232#endif
233 default:
234 return 0;
235 }
236}
237
238/**
239 * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
240 * @head: pointer to rcu_head structure to be initialized
241 *
242 * This function informs debugobjects of a new rcu_head structure that
243 * has been allocated as an auto variable on the stack. This function
244 * is not required for rcu_head structures that are statically defined or
245 * that are dynamically allocated on the heap. This function has no
246 * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
247 */
248void init_rcu_head_on_stack(struct rcu_head *head)
249{
250 debug_object_init_on_stack(head, &rcuhead_debug_descr);
251}
252EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
253
254/**
255 * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
256 * @head: pointer to rcu_head structure to be initialized
257 *
258 * This function informs debugobjects that an on-stack rcu_head structure
259 * is about to go out of scope. As with init_rcu_head_on_stack(), this
260 * function is not required for rcu_head structures that are statically
261 * defined or that are dynamically allocated on the heap. Also as with
262 * init_rcu_head_on_stack(), this function has no effect for
263 * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
264 */
265void destroy_rcu_head_on_stack(struct rcu_head *head)
266{
267 debug_object_free(head, &rcuhead_debug_descr);
268}
269EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
270
271struct debug_obj_descr rcuhead_debug_descr = {
272 .name = "rcu_head",
273 .fixup_init = rcuhead_fixup_init,
274 .fixup_activate = rcuhead_fixup_activate,
275 .fixup_free = rcuhead_fixup_free,
276};
277EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
278#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
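
The two on-stack helpers bracket the lifetime of an rcu_head auto variable for debugobjects. The rcu_barrier()/wakeme_after_rcu pattern appearing later in this series (see rcutiny_plugin.h below) is the canonical use, condensed here as a sketch:

    /* Sketch: wait for a grace period using an on-stack rcu_head. */
    static void wait_for_gp(void)
    {
            struct rcu_synchronize rcu;

            init_rcu_head_on_stack(&rcu.head);      /* announce the stack object */
            init_completion(&rcu.completion);
            call_rcu(&rcu.head, wakeme_after_rcu);  /* fires after a grace period */
            wait_for_completion(&rcu.completion);
            destroy_rcu_head_on_stack(&rcu.head);   /* about to go out of scope */
    }
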
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 38729d3cd236..d806735342ac 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 61
62/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
64static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp);
67
68#include "rcutiny_plugin.h"
69
62#ifdef CONFIG_NO_HZ 70#ifdef CONFIG_NO_HZ
63 71
64static long rcu_dynticks_nesting = 1; 72static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
140 rcu_sched_qs(cpu); 148 rcu_sched_qs(cpu);
141 else if (!in_softirq()) 149 else if (!in_softirq())
142 rcu_bh_qs(cpu); 150 rcu_bh_qs(cpu);
151 rcu_preempt_check_callbacks();
143} 152}
144 153
145/* 154/*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
163 if (rcp->curtail == rcp->donetail) 172 if (rcp->curtail == rcp->donetail)
164 rcp->curtail = &rcp->rcucblist; 173 rcp->curtail = &rcp->rcucblist;
174 rcu_preempt_remove_callbacks(rcp);
165 rcp->donetail = &rcp->rcucblist; 175 rcp->donetail = &rcp->rcucblist;
166 local_irq_restore(flags); 176 local_irq_restore(flags);
167 177
@@ -169,6 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
169 while (list) { 179 while (list) {
170 next = list->next; 180 next = list->next;
171 prefetch(next); 181 prefetch(next);
182 debug_rcu_head_unqueue(list);
172 list->func(list); 183 list->func(list);
173 list = next; 184 list = next;
174 } 185 }
@@ -181,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
181{ 192{
182 __rcu_process_callbacks(&rcu_sched_ctrlblk); 193 __rcu_process_callbacks(&rcu_sched_ctrlblk);
183 __rcu_process_callbacks(&rcu_bh_ctrlblk); 194 __rcu_process_callbacks(&rcu_bh_ctrlblk);
195 rcu_preempt_process_callbacks();
184} 196}
185 197
186/* 198/*
@@ -211,6 +223,7 @@ static void __call_rcu(struct rcu_head *head,
211{ 223{
212 unsigned long flags; 224 unsigned long flags;
213 225
226 debug_rcu_head_queue(head);
214 head->func = func; 227 head->func = func;
215 head->next = NULL; 228 head->next = NULL;
216 229
@@ -221,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
221} 234}
222 235
223/* 236/*
224 * Post an RCU callback to be invoked after the end of an RCU grace 237 * Post an RCU callback to be invoked after the end of an RCU-sched grace
225 * period. But since we have but one CPU, that would be after any 238 * period. But since we have but one CPU, that would be after any
226 * quiescent state. 239 * quiescent state.
227 */ 240 */
228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 241void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
229{ 242{
230 __call_rcu(head, func, &rcu_sched_ctrlblk); 243 __call_rcu(head, func, &rcu_sched_ctrlblk);
231} 244}
232EXPORT_SYMBOL_GPL(call_rcu); 245EXPORT_SYMBOL_GPL(call_rcu_sched);
233 246
234/* 247/*
235 * Post an RCU bottom-half callback to be invoked after any subsequent 248 * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -241,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
241} 254}
242EXPORT_SYMBOL_GPL(call_rcu_bh); 255EXPORT_SYMBOL_GPL(call_rcu_bh);
243 256
244void rcu_barrier(void)
245{
246 struct rcu_synchronize rcu;
247
248 init_rcu_head_on_stack(&rcu.head);
249 init_completion(&rcu.completion);
250 /* Will wake me after RCU finished. */
251 call_rcu(&rcu.head, wakeme_after_rcu);
252 /* Wait for it. */
253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
255}
256EXPORT_SYMBOL_GPL(rcu_barrier);
257
258void rcu_barrier_bh(void) 257void rcu_barrier_bh(void)
259{ 258{
260 struct rcu_synchronize rcu; 259 struct rcu_synchronize rcu;
@@ -287,5 +286,3 @@ void __init rcu_init(void)
287{ 286{
288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
289} 288}
290
291#include "rcutiny_plugin.h"
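
The rcu_ctrlblk callback lists manipulated above are singly linked with tail pointers that address the last ->next field, so whole segments move by pointer assignment instead of list walks. A standalone sketch of that idiom (names are illustrative, not the kernel's):

    /* Sketch of the tail-pointer list idiom behind curtail/donetail/nexttail. */
    struct cb {
            struct cb *next;
    };

    struct cb_list {
            struct cb *head;
            struct cb **tail;       /* &head while empty, else &last->next */
    };

    static void cb_enqueue(struct cb_list *l, struct cb *new)
    {
            new->next = NULL;
            *l->tail = new;         /* append in O(1), no walking */
            l->tail = &new->next;
    }
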
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc742..6ceca4f745ff 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 19 *
20 * Copyright IBM Corporation, 2009 20 * Copyright (c) 2010 Linaro
21 * 21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#ifdef CONFIG_TINY_PREEMPT_RCU
26
27#include <linux/delay.h>
28
29/* Global control variables for preemptible RCU. */
30struct rcu_preempt_ctrlblk {
31 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
32 struct rcu_head **nexttail;
33 /* Tasks blocked in a preemptible RCU */
34 /* read-side critical section while a */
35 /* preemptible-RCU grace period is in */
36 /* progress must wait for a later grace */
37 /* period. This pointer points to the */
38 /* ->next pointer of the last task that */
39 /* must wait for a later grace period, or */
40 /* to &->rcb.rcucblist if there is no */
41 /* such task. */
42 struct list_head blkd_tasks;
43 /* Tasks blocked in RCU read-side critical */
44 /* section. Tasks are placed at the head */
45 /* of this list and age towards the tail. */
46 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */
49 /* is no such task. */
50 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */
54 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */
56 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */
60};
61
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
63 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
64 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
65 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
66 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
67};
68
69static int rcu_preempted_readers_exp(void);
70static void rcu_report_exp_done(void);
71
72/*
73 * Return true if the CPU has not yet responded to the current grace period.
74 */
75static int rcu_cpu_blocking_cur_gp(void)
76{
77 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
78}
79
80/*
81 * Check for a running RCU reader. Because there is only one CPU,
82 * there can be but one running RCU reader at a time. ;-)
83 */
84static int rcu_preempt_running_reader(void)
85{
86 return current->rcu_read_lock_nesting;
87}
88
89/*
90 * Check for preempted RCU readers blocking any grace period.
91 * If the caller needs a reliable answer, it must disable hard irqs.
92 */
93static int rcu_preempt_blocked_readers_any(void)
94{
95 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
96}
97
98/*
99 * Check for preempted RCU readers blocking the current grace period.
100 * If the caller needs a reliable answer, it must disable hard irqs.
101 */
102static int rcu_preempt_blocked_readers_cgp(void)
103{
104 return rcu_preempt_ctrlblk.gp_tasks != NULL;
105}
106
107/*
108 * Return true if another preemptible-RCU grace period is needed.
109 */
110static int rcu_preempt_needs_another_gp(void)
111{
112 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
113}
114
115/*
116 * Return true if a preemptible-RCU grace period is in progress.
117 * The caller must disable hardirqs.
118 */
119static int rcu_preempt_gp_in_progress(void)
120{
121 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
122}
123
124/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked
128 * while in an RCU read-side critical section.
129 *
130 * Unlike the other rcu_*_qs() functions, callers to this function
131 * must disable irqs in order to protect the assignment to
132 * ->rcu_read_unlock_special.
133 *
134 * Because this is a single-CPU implementation, the only way a grace
135 * period can end is if the CPU is in a quiescent state. The reason is
136 * that a blocked preemptible-RCU reader can exit its critical section
137 * only if the CPU is running it at the time. Therefore, when the
138 * last task blocking the current grace period exits its RCU read-side
139 * critical section, neither the CPU nor blocked tasks will be stopping
140 * the current grace period. (In contrast, SMP implementations
141 * might have CPUs running in RCU read-side critical sections that
142 * block later grace periods -- but this is not possible given only
143 * one CPU.)
144 */
145static void rcu_preempt_cpu_qs(void)
146{
147 /* Record both CPU and task as having responded to current GP. */
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150
151 /*
152 * If there is no GP, or if blocked readers are still blocking GP,
153 * then there is nothing more to do.
154 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
156 return;
157
158 /* Advance callbacks. */
159 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
160 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
161 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
162
163 /* If there are no blocked readers, next GP is done instantly. */
164 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ);
170}
171
172/*
173 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
174 */
175static void rcu_preempt_start_gp(void)
176{
177 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
178
179 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++;
181
182 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next;
186
187 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs();
190 }
191}
192
193/*
194 * We have entered the scheduler, and the current task might soon be
195 * context-switched away from. If this task is in an RCU read-side
196 * critical section, we will no longer be able to rely on the CPU to
197 * record that fact, so we enqueue the task on the blkd_tasks list.
198 * If the task started after the current grace period began, as recorded
199 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
200 * before the element referenced by ->gp_tasks (or at the tail if
201 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
202 * The task will dequeue itself when it exits the outermost enclosing
203 * RCU read-side critical section. Therefore, the current grace period
204 * cannot be permitted to complete until the ->gp_tasks pointer becomes
205 * NULL.
206 *
207 * Caller must disable preemption.
208 */
209void rcu_preempt_note_context_switch(void)
210{
211 struct task_struct *t = current;
212 unsigned long flags;
213
214 local_irq_save(flags); /* must exclude scheduler_tick(). */
215 if (rcu_preempt_running_reader() &&
216 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
217
218 /* Possibly blocking in an RCU read-side critical section. */
219 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
220
221 /*
222 * If this CPU has already checked in, then this task
223 * will hold up the next grace period rather than the
224 * current grace period. Queue the task accordingly.
225 * If the task is queued for the current grace period
226 * (i.e., this CPU has not yet passed through a quiescent
227 * state for the current grace period), then as long
228 * as that task remains queued, the current grace period
229 * cannot end.
230 */
231 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
232 if (rcu_cpu_blocking_cur_gp())
233 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
234 }
235
236 /*
237 * Either we were not in an RCU read-side critical section to
238 * begin with, or we have now recorded that critical section
239 * globally. Either way, we can now note a quiescent state
240 * for this CPU. Again, if we were in an RCU read-side critical
241 * section, and if that critical section was blocking the current
242 * grace period, then the fact that the task has been enqueued
243 * means that current grace period continues to be blocked.
244 */
245 rcu_preempt_cpu_qs();
246 local_irq_restore(flags);
247}
248
249/*
250 * Tiny-preemptible RCU implementation for rcu_read_lock().
251 * Just increment ->rcu_read_lock_nesting, shared state will be updated
252 * if we block.
253 */
254void __rcu_read_lock(void)
255{
256 current->rcu_read_lock_nesting++;
257 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
258}
259EXPORT_SYMBOL_GPL(__rcu_read_lock);
260
261/*
262 * Handle special cases during rcu_read_unlock(), such as needing to
263 * notify RCU core processing or task having blocked during the RCU
264 * read-side critical section.
265 */
266static void rcu_read_unlock_special(struct task_struct *t)
267{
268 int empty;
269 int empty_exp;
270 unsigned long flags;
271 struct list_head *np;
272 int special;
273
274 /*
275 * NMI handlers cannot block and cannot safely manipulate state.
276 * They therefore cannot possibly be special, so just leave.
277 */
278 if (in_nmi())
279 return;
280
281 local_irq_save(flags);
282
283 /*
284 * If RCU core is waiting for this CPU to exit critical section,
285 * let it know that we have done so.
286 */
287 special = t->rcu_read_unlock_special;
288 if (special & RCU_READ_UNLOCK_NEED_QS)
289 rcu_preempt_cpu_qs();
290
291 /* Hardware IRQ handlers cannot block. */
292 if (in_irq()) {
293 local_irq_restore(flags);
294 return;
295 }
296
297 /* Clean up if blocked during RCU read-side critical section. */
298 if (special & RCU_READ_UNLOCK_BLOCKED) {
299 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
300
301 /*
302 * Remove this task from the ->blkd_tasks list and adjust
303 * any pointers that might have been referencing it.
304 */
305 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next;
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np;
315 INIT_LIST_HEAD(&t->rcu_node_entry);
316
317 /*
318 * If this was the last task on the current list, and if
319 * we aren't waiting on the CPU, report the quiescent state
320 * and start a new grace period if needed.
321 */
322 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
323 rcu_preempt_cpu_qs();
324 rcu_preempt_start_gp();
325 }
326
327 /*
328 * If this was the last task on the expedited lists,
329 * then we need to wake up the waiting task.
330 */
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done();
333 }
334 local_irq_restore(flags);
335}
336
337/*
338 * Tiny-preemptible RCU implementation for rcu_read_unlock().
339 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
340 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
341 * invoke rcu_read_unlock_special() to clean up after a context switch
342 * in an RCU read-side critical section and other special cases.
343 */
344void __rcu_read_unlock(void)
345{
346 struct task_struct *t = current;
347
348 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
349 --t->rcu_read_lock_nesting;
350 barrier(); /* decrement before load of ->rcu_read_unlock_special */
351 if (t->rcu_read_lock_nesting == 0 &&
352 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
353 rcu_read_unlock_special(t);
354#ifdef CONFIG_PROVE_LOCKING
355 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
356#endif /* #ifdef CONFIG_PROVE_LOCKING */
357}
358EXPORT_SYMBOL_GPL(__rcu_read_unlock);
359
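
__rcu_read_lock()/__rcu_read_unlock() above only maintain the per-task nesting count; callers use the ordinary reader pattern. A minimal reader-side sketch, where gp and struct foo are illustrative names:

    /* Reader-side sketch; 'gp' and 'struct foo' are made-up names. */
    struct foo {
            int a;
    };
    static struct foo __rcu *gp;

    static int read_a(void)
    {
            struct foo *p;
            int a = -1;

            rcu_read_lock();                /* bumps ->rcu_read_lock_nesting */
            p = rcu_dereference(gp);        /* fetch the protected pointer */
            if (p)
                    a = p->a;
            rcu_read_unlock();              /* may enter rcu_read_unlock_special() */
            return a;
    }
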
360/*
361 * Check for a quiescent state from the current CPU. When a task blocks,
362 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
363 * checked elsewhere. This is called from the scheduling-clock interrupt.
364 *
365 * Caller must disable hard irqs.
366 */
367static void rcu_preempt_check_callbacks(void)
368{
369 struct task_struct *t = current;
370
371 if (rcu_preempt_gp_in_progress() &&
372 (!rcu_preempt_running_reader() ||
373 !rcu_cpu_blocking_cur_gp()))
374 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ);
378 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader())
381 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
382}
383
384/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there
390 * is no need for an explicit check.
391 */
392static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
393{
394 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
395 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
396}
397
398/*
399 * Process callbacks for preemptible RCU.
400 */
401static void rcu_preempt_process_callbacks(void)
402{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404}
405
406/*
407 * Queue a preemptible-RCU callback for invocation after a grace period.
408 */
409void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
410{
411 unsigned long flags;
412
413 debug_rcu_head_queue(head);
414 head->func = func;
415 head->next = NULL;
416
417 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next;
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags);
422}
423EXPORT_SYMBOL_GPL(call_rcu);
424
425void rcu_barrier(void)
426{
427 struct rcu_synchronize rcu;
428
429 init_rcu_head_on_stack(&rcu.head);
430 init_completion(&rcu.completion);
431 /* Will wake me after RCU finished. */
432 call_rcu(&rcu.head, wakeme_after_rcu);
433 /* Wait for it. */
434 wait_for_completion(&rcu.completion);
435 destroy_rcu_head_on_stack(&rcu.head);
436}
437EXPORT_SYMBOL_GPL(rcu_barrier);
438
439/*
440 * synchronize_rcu - wait until a grace period has elapsed.
441 *
442 * Control will return to the caller some time after a full grace
443 * period has elapsed, in other words after all currently executing RCU
444 * read-side critical sections have completed. RCU read-side critical
445 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
446 * and may be nested.
447 */
448void synchronize_rcu(void)
449{
450#ifdef CONFIG_DEBUG_LOCK_ALLOC
451 if (!rcu_scheduler_active)
452 return;
453#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
454
455 WARN_ON_ONCE(rcu_preempt_running_reader());
456 if (!rcu_preempt_blocked_readers_any())
457 return;
458
459 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
460 rcu_barrier();
461}
462EXPORT_SYMBOL_GPL(synchronize_rcu);
463
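
synchronize_rcu() above returns only after all pre-existing readers are done, so the classic update pattern publishes the new version with rcu_assign_pointer() and frees the old one afterwards. A sketch using the same illustrative gp and struct foo as the reader sketch earlier:

    /* Updater-side sketch; pairs with the reader sketch above. */
    static void update_a(struct foo *newp)
    {
            struct foo *old;

            old = rcu_dereference_protected(gp, 1); /* caller serializes updates */
            rcu_assign_pointer(gp, newp);           /* publish the new version */
            synchronize_rcu();                      /* wait out existing readers */
            kfree(old);                             /* now nobody can still see it */
    }
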
464static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
465static unsigned long sync_rcu_preempt_exp_count;
466static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
467
468/*
469 * Return non-zero if there are any tasks in RCU read-side critical
470 * sections blocking the current preemptible-RCU expedited grace period.
471 * If there is no preemptible-RCU expedited grace period currently in
472 * progress, returns zero unconditionally.
473 */
474static int rcu_preempted_readers_exp(void)
475{
476 return rcu_preempt_ctrlblk.exp_tasks != NULL;
477}
478
479/*
480 * Report the exit from RCU read-side critical section for the last task
481 * that queued itself during or before the current expedited preemptible-RCU
482 * grace period.
483 */
484static void rcu_report_exp_done(void)
485{
486 wake_up(&sync_rcu_preempt_exp_wq);
487}
488
489/*
490 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
491 * is to rely on the fact that there is but one CPU, and that it is
492 * illegal for a task to invoke synchronize_rcu_expedited() while in a
493 * preemptible-RCU read-side critical section. Therefore, any such
494 * critical sections must correspond to blocked tasks, which must therefore
495 * be on the ->blkd_tasks list. So just record the current head of the
496 * list in the ->exp_tasks pointer, and wait for all tasks including and
497 * after the task pointed to by ->exp_tasks to drain.
498 */
499void synchronize_rcu_expedited(void)
500{
501 unsigned long flags;
502 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
503 unsigned long snap;
504
505 barrier(); /* ensure prior action seen before grace period. */
506
507 WARN_ON_ONCE(rcu_preempt_running_reader());
508
509 /*
510 * Acquire lock so that there is only one preemptible RCU grace
511 * period in flight. Of course, if someone does the expedited
512 * grace period for us while we are acquiring the lock, just leave.
513 */
514 snap = sync_rcu_preempt_exp_count + 1;
515 mutex_lock(&sync_rcu_preempt_exp_mutex);
516 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
517 goto unlock_mb_ret; /* Others did our work for us. */
518
519 local_irq_save(flags);
520
521 /*
522 * All RCU readers have to already be on blkd_tasks because
523 * we cannot legally be executing in an RCU read-side critical
524 * section.
525 */
526
527 /* Snapshot current head of ->blkd_tasks list. */
528 rpcp->exp_tasks = rpcp->blkd_tasks.next;
529 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
530 rpcp->exp_tasks = NULL;
531 local_irq_restore(flags);
532
533 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp())
535 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp());
537
538 /* Clean up and exit. */
539 barrier(); /* ensure expedited GP seen before counter increment. */
540 sync_rcu_preempt_exp_count++;
541unlock_mb_ret:
542 mutex_unlock(&sync_rcu_preempt_exp_mutex);
543 barrier(); /* ensure subsequent action seen after grace period. */
544}
545EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
546
547/*
548 * Does preemptible RCU need the CPU to stay out of dynticks mode?
549 */
550int rcu_preempt_needs_cpu(void)
551{
552 if (!rcu_preempt_running_reader())
553 rcu_preempt_cpu_qs();
554 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
555}
556
557/*
558 * Check for a task exiting while in a preemptible-RCU read-side
559 * critical section, clean up if so. No need to issue warnings,
560 * as debug_check_no_locks_held() already does this if lockdep
561 * is enabled.
562 */
563void exit_rcu(void)
564{
565 struct task_struct *t = current;
566
567 if (t->rcu_read_lock_nesting == 0)
568 return;
569 t->rcu_read_lock_nesting = 1;
570 rcu_read_unlock();
571}
572
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574
575/*
576 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check.
578 */
579static void rcu_preempt_check_callbacks(void)
580{
581}
582
583/*
584 * Because preemptible RCU does not exist, it never has any callbacks
585 * to remove.
586 */
587static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
588{
589}
590
591/*
592 * Because preemptible RCU does not exist, it never has any callbacks
593 * to process.
594 */
595static void rcu_preempt_process_callbacks(void)
596{
597}
598
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600
25#ifdef CONFIG_DEBUG_LOCK_ALLOC 601#ifdef CONFIG_DEBUG_LOCK_ALLOC
26 602
27#include <linux/kernel_stat.h> 603#include <linux/kernel_stat.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..9d8e8fb2515f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
120}; 120};
121 121
122static LIST_HEAD(rcu_torture_freelist); 122static LIST_HEAD(rcu_torture_freelist);
123static struct rcu_torture *rcu_torture_current; 123static struct rcu_torture __rcu *rcu_torture_current;
124static long rcu_torture_current_version; 124static long rcu_torture_current_version;
125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
126static DEFINE_SPINLOCK(rcu_torture_lock); 126static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
155static int fullstop = FULLSTOP_RMMOD; 155static int fullstop = FULLSTOP_RMMOD;
156DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ 156/*
157 /* of kthreads. */ 157 * Protect fullstop transitions and spawning of kthreads.
158 */
159static DEFINE_MUTEX(fullstop_mutex);
158 160
159/* 161/*
160 * Detect and respond to a system shutdown. 162 * Detect and respond to a system shutdown.
@@ -239,8 +241,7 @@ static unsigned long
239rcu_random(struct rcu_random_state *rrsp) 241rcu_random(struct rcu_random_state *rrsp)
240{ 242{
241 if (--rrsp->rrs_count < 0) { 243 if (--rrsp->rrs_count < 0) {
242 rrsp->rrs_state += 244 rrsp->rrs_state += (unsigned long)local_clock();
243 (unsigned long)cpu_clock(raw_smp_processor_id());
244 rrsp->rrs_count = RCU_RANDOM_REFRESH; 245 rrsp->rrs_count = RCU_RANDOM_REFRESH;
245 } 246 }
246 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 247 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
@@ -304,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
304 mdelay(longdelay_ms); 305 mdelay(longdelay_ms);
305 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 306 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
306 udelay(shortdelay_us); 307 udelay(shortdelay_us);
308#ifdef CONFIG_PREEMPT
309 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
310 preempt_schedule(); /* No QS if preempt_disable() in effect */
311#endif
307} 312}
308 313
309static void rcu_torture_read_unlock(int idx) __releases(RCU) 314static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -537,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
537 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 542 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
538 if (!delay) 543 if (!delay)
539 schedule_timeout_interruptible(longdelay); 544 schedule_timeout_interruptible(longdelay);
545 else
546 rcu_read_delay(rrsp);
540} 547}
541 548
542static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 549static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -732,7 +739,8 @@ rcu_torture_writer(void *arg)
732 continue; 739 continue;
733 rp->rtort_pipe_count = 0; 740 rp->rtort_pipe_count = 0;
734 udelay(rcu_random(&rand) & 0x3ff); 741 udelay(rcu_random(&rand) & 0x3ff);
735 old_rp = rcu_torture_current; 742 old_rp = rcu_dereference_check(rcu_torture_current,
743 current == writer_task);
736 rp->rtort_mbtest = 1; 744 rp->rtort_mbtest = 1;
737 rcu_assign_pointer(rcu_torture_current, rp); 745 rcu_assign_pointer(rcu_torture_current, rp);
738 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 746 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
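
The __rcu annotation plus rcu_dereference_check() lets sparse and lockdep-RCU verify accesses: readers go through rcu_dereference(), while the writer justifies its read with an explicit condition (current == writer_task above). A minimal sketch of the same pattern with a lock-based condition (names illustrative):

    /* Sketch: reading an __rcu pointer outside rcu_read_lock(), justified to
     * lockdep-RCU by an explicit condition (here: holding the update lock). */
    struct foo;
    static DEFINE_SPINLOCK(my_lock);
    static struct foo __rcu *my_ptr;

    static struct foo *current_foo_locked(void)
    {
            return rcu_dereference_check(my_ptr, lockdep_is_held(&my_lock));
    }
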
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d4437345706f..ccdc04c47981 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
143module_param(qhimark, int, 0); 143module_param(qhimark, int, 0);
144module_param(qlowmark, int, 0); 144module_param(qlowmark, int, 0);
145 145
146#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
147int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
148module_param(rcu_cpu_stall_suppress, int, 0644);
149#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
150
146static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 151static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
147static int rcu_pending(int cpu); 152static int rcu_pending(int cpu);
148 153
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 455
451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 456#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
452 457
453int rcu_cpu_stall_panicking __read_mostly; 458int rcu_cpu_stall_suppress __read_mostly;
454 459
455static void record_gp_stall_check_time(struct rcu_state *rsp) 460static void record_gp_stall_check_time(struct rcu_state *rsp)
456{ 461{
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
482 rcu_print_task_stall(rnp); 487 rcu_print_task_stall(rnp);
483 raw_spin_unlock_irqrestore(&rnp->lock, flags); 488 raw_spin_unlock_irqrestore(&rnp->lock, flags);
484 489
485 /* OK, time to rat on our buddy... */ 490 /*
486 491 * OK, time to rat on our buddy...
492 * See Documentation/RCU/stallwarn.txt for info on how to debug
493 * RCU CPU stall warnings.
494 */
487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 495 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name); 496 rsp->name);
489 rcu_for_each_leaf_node(rsp, rnp) { 497 rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
512 unsigned long flags; 520 unsigned long flags;
513 struct rcu_node *rnp = rcu_get_root(rsp); 521 struct rcu_node *rnp = rcu_get_root(rsp);
514 522
523 /*
524 * OK, time to rat on ourselves...
525 * See Documentation/RCU/stallwarn.txt for info on how to debug
526 * RCU CPU stall warnings.
527 */
515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 528 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 529 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
517 trigger_all_cpu_backtrace(); 530 trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
530 long delta; 543 long delta;
531 struct rcu_node *rnp; 544 struct rcu_node *rnp;
532 545
533 if (rcu_cpu_stall_panicking) 546 if (rcu_cpu_stall_suppress)
534 return; 547 return;
535 delta = jiffies - rsp->jiffies_stall; 548 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
536 rnp = rdp->mynode; 549 rnp = rdp->mynode;
537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 550 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
538 551
539 /* We haven't checked in, so go dump stack. */ 552 /* We haven't checked in, so go dump stack. */
540 print_cpu_stall(rsp); 553 print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
548 561
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 562static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{ 563{
551 rcu_cpu_stall_panicking = 1; 564 rcu_cpu_stall_suppress = 1;
552 return NOTIFY_DONE; 565 return NOTIFY_DONE;
553} 566}
554 567
568/**
569 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
570 *
571 * Set the stall-warning timeout way off into the future, thus preventing
572 * any RCU CPU stall-warning messages from appearing in the current set of
573 * RCU grace periods.
574 *
575 * The caller must disable hard irqs.
576 */
577void rcu_cpu_stall_reset(void)
578{
579 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
580 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
581 rcu_preempt_stall_reset();
582}
583
555static struct notifier_block rcu_panic_block = { 584static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic, 585 .notifier_call = rcu_panic,
557}; 586};
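A quick illustration of why rcu_cpu_stall_reset() above works: check_cpu_stall() compares jiffies against jiffies_stall with a signed difference, so pushing jiffies_stall out by ULONG_MAX / 2 keeps that difference negative for roughly half the counter's range, far longer than any grace period. The following stand-alone sketch only reproduces that idiom (the helper name stall_due() and the sample values are made up; this is not code from the patch):

#include <limits.h>
#include <stdio.h>

/* Same idiom as check_cpu_stall(): a signed difference of unsigned
 * counters handles wraparound of jiffies.
 */
static int stall_due(unsigned long now, unsigned long jiffies_stall)
{
	long delta = now - jiffies_stall;

	return delta >= 0;
}

int main(void)
{
	unsigned long jiffies = 4294937296UL;		/* arbitrary "now" */
	unsigned long soon  = jiffies + 10 * 250;	/* ~10 s at HZ=250 */
	unsigned long reset = jiffies + ULONG_MAX / 2;	/* rcu_cpu_stall_reset() */

	printf("%d\n", stall_due(jiffies + 3000, soon));	/* 1: deadline passed */
	printf("%d\n", stall_due(jiffies + 3000, reset));	/* 0: pushed far away */
	return 0;
}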
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
571{ 600{
572} 601}
573 602
603void rcu_cpu_stall_reset(void)
604{
605}
606
574static void __init check_cpu_stall_init(void) 607static void __init check_cpu_stall_init(void)
575{ 608{
576} 609}
@@ -712,7 +745,7 @@ static void
712rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 745rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
713 __releases(rcu_get_root(rsp)->lock) 746 __releases(rcu_get_root(rsp)->lock)
714{ 747{
715 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 748 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
716 struct rcu_node *rnp = rcu_get_root(rsp); 749 struct rcu_node *rnp = rcu_get_root(rsp);
717 750
718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 751 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
960static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
961{ 994{
962 int i; 995 int i;
963 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
964 997
965 if (rdp->nxtlist == NULL) 998 if (rdp->nxtlist == NULL)
966 return; /* irqs disabled, so comparison is stable. */ 999 return; /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
971 for (i = 0; i < RCU_NEXT_SIZE; i++) 1004 for (i = 0; i < RCU_NEXT_SIZE; i++)
972 rdp->nxttail[i] = &rdp->nxtlist; 1005 rdp->nxttail[i] = &rdp->nxtlist;
973 rsp->orphan_qlen += rdp->qlen; 1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
974 rdp->qlen = 0; 1008 rdp->qlen = 0;
975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
976} 1010}
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
984 struct rcu_data *rdp; 1018 struct rcu_data *rdp;
985 1019
986 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
987 rdp = rsp->rda[smp_processor_id()]; 1021 rdp = this_cpu_ptr(rsp->rda);
988 if (rsp->orphan_cbs_list == NULL) { 1022 if (rsp->orphan_cbs_list == NULL) {
989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
990 return; 1024 return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
993 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; 1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
994 rdp->qlen += rsp->orphan_qlen; 1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
995 rsp->orphan_cbs_list = NULL; 1030 rsp->orphan_cbs_list = NULL;
996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
997 rsp->orphan_qlen = 0; 1032 rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1007 unsigned long flags; 1042 unsigned long flags;
1008 unsigned long mask; 1043 unsigned long mask;
1009 int need_report = 0; 1044 int need_report = 0;
1010 struct rcu_data *rdp = rsp->rda[cpu]; 1045 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1011 struct rcu_node *rnp; 1046 struct rcu_node *rnp;
1012 1047
1013 /* Exclude any attempts to start a new grace period. */ 1048 /* Exclude any attempts to start a new grace period. */
@@ -1112,6 +1147,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1112 while (list) { 1147 while (list) {
1113 next = list->next; 1148 next = list->next;
1114 prefetch(next); 1149 prefetch(next);
1150 debug_rcu_head_unqueue(list);
1115 list->func(list); 1151 list->func(list);
1116 list = next; 1152 list = next;
1117 if (++count >= rdp->blimit) 1153 if (++count >= rdp->blimit)
@@ -1122,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1122 1158
1123 /* Update count, and requeue any remaining callbacks. */ 1159 /* Update count, and requeue any remaining callbacks. */
1124 rdp->qlen -= count; 1160 rdp->qlen -= count;
1161 rdp->n_cbs_invoked += count;
1125 if (list != NULL) { 1162 if (list != NULL) {
1126 *tail = rdp->nxtlist; 1163 *tail = rdp->nxtlist;
1127 rdp->nxtlist = list; 1164 rdp->nxtlist = list;
@@ -1225,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1225 cpu = rnp->grplo; 1262 cpu = rnp->grplo;
1226 bit = 1; 1263 bit = 1;
1227 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 1264 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1228 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1265 if ((rnp->qsmask & bit) != 0 &&
1266 f(per_cpu_ptr(rsp->rda, cpu)))
1229 mask |= bit; 1267 mask |= bit;
1230 } 1268 }
1231 if (mask != 0) { 1269 if (mask != 0) {
@@ -1388,6 +1426,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1388 unsigned long flags; 1426 unsigned long flags;
1389 struct rcu_data *rdp; 1427 struct rcu_data *rdp;
1390 1428
1429 debug_rcu_head_queue(head);
1391 head->func = func; 1430 head->func = func;
1392 head->next = NULL; 1431 head->next = NULL;
1393 1432
@@ -1400,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1400 * a quiescent state betweentimes. 1439 * a quiescent state betweentimes.
1401 */ 1440 */
1402 local_irq_save(flags); 1441 local_irq_save(flags);
1403 rdp = rsp->rda[smp_processor_id()]; 1442 rdp = this_cpu_ptr(rsp->rda);
1404 rcu_process_gp_end(rsp, rdp); 1443 rcu_process_gp_end(rsp, rdp);
1405 check_for_new_grace_period(rsp, rdp); 1444 check_for_new_grace_period(rsp, rdp);
1406 1445
@@ -1699,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1699{ 1738{
1700 unsigned long flags; 1739 unsigned long flags;
1701 int i; 1740 int i;
1702 struct rcu_data *rdp = rsp->rda[cpu]; 1741 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1703 struct rcu_node *rnp = rcu_get_root(rsp); 1742 struct rcu_node *rnp = rcu_get_root(rsp);
1704 1743
1705 /* Set up local state, ensuring consistent view of global state. */ 1744 /* Set up local state, ensuring consistent view of global state. */
@@ -1727,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1727{ 1766{
1728 unsigned long flags; 1767 unsigned long flags;
1729 unsigned long mask; 1768 unsigned long mask;
1730 struct rcu_data *rdp = rsp->rda[cpu]; 1769 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1731 struct rcu_node *rnp = rcu_get_root(rsp); 1770 struct rcu_node *rnp = rcu_get_root(rsp);
1732 1771
1733 /* Set up local state, ensuring consistent view of global state. */ 1772 /* Set up local state, ensuring consistent view of global state. */
@@ -1863,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1863/* 1902/*
1864 * Helper function for rcu_init() that initializes one rcu_state structure. 1903 * Helper function for rcu_init() that initializes one rcu_state structure.
1865 */ 1904 */
1866static void __init rcu_init_one(struct rcu_state *rsp) 1905static void __init rcu_init_one(struct rcu_state *rsp,
1906 struct rcu_data __percpu *rda)
1867{ 1907{
1868 static char *buf[] = { "rcu_node_level_0", 1908 static char *buf[] = { "rcu_node_level_0",
1869 "rcu_node_level_1", 1909 "rcu_node_level_1",
@@ -1916,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1916 } 1956 }
1917 } 1957 }
1918 1958
1959 rsp->rda = rda;
1919 rnp = rsp->level[NUM_RCU_LVLS - 1]; 1960 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) { 1961 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi) 1962 while (i > rnp->grphi)
1922 rnp++; 1963 rnp++;
1923 rsp->rda[i]->mynode = rnp; 1964 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp); 1965 rcu_boot_init_percpu_data(i, rsp);
1925 } 1966 }
1926} 1967}
1927 1968
1928/*
1929 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1930 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1931 * structure.
1932 */
1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1934do { \
1935 int i; \
1936 \
1937 for_each_possible_cpu(i) { \
1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1939 } \
1940 rcu_init_one(rsp); \
1941} while (0)
1942
1943void __init rcu_init(void) 1969void __init rcu_init(void)
1944{ 1970{
1945 int cpu; 1971 int cpu;
1946 1972
1947 rcu_bootup_announce(); 1973 rcu_bootup_announce();
1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1974 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1975 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1950 __rcu_init_preempt(); 1976 __rcu_init_preempt();
1951 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1977 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1952 1978
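For readers following the rsp->rda conversion in this file: the per-flavor NR_CPUS-sized array of rcu_data pointers becomes a single __percpu pointer, initialized from the DEFINE_PER_CPU variable and dereferenced through per_cpu_ptr()/this_cpu_ptr(). A minimal kernel-style sketch of that pattern (the demo_* names are invented for illustration and are not part of the patch):

#include <linux/percpu.h>
#include <linux/smp.h>

struct demo_data {
	int val;
};

/* One instance per CPU, analogous to rcu_sched_data, rcu_bh_data, ... */
static DEFINE_PER_CPU(struct demo_data, demo_data);

/* Analogous to rsp->rda after the conversion. */
static struct demo_data __percpu *demo_rda = &demo_data;

static void demo_init_all(void)
{
	int cpu;

	for_each_possible_cpu(cpu)			/* was: rsp->rda[cpu]-> */
		per_cpu_ptr(demo_rda, cpu)->val = cpu;
}

static void demo_touch_local(void)
{
	/* was: rsp->rda[smp_processor_id()]->; real callers run with
	 * preemption or interrupts disabled, as the RCU code does.
	 */
	this_cpu_ptr(demo_rda)->val++;
}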
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed0..91d4170c5c13 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
202 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check; 203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 204 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
205 unsigned long n_force_qs_snap; 208 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */ 209 /* did other CPU force QS recently? */
207 long blimit; /* Upper limit on a processed batch */ 210 long blimit; /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
254#define RCU_STALL_DELAY_DELTA 0 257#define RCU_STALL_DELAY_DELTA 0
255#endif 258#endif
256 259
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) 260#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
261 RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */ 262 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) 263#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
260 /* for rsp->jiffies_stall */ 264 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 265#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */ 266 /* to take at least one */
263 /* scheduling clock irq */ 267 /* scheduling clock irq */
264 /* before ratting on them. */ 268 /* before ratting on them. */
265 269
266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 270#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
271#define RCU_CPU_STALL_SUPPRESS_INIT 0
272#else
273#define RCU_CPU_STALL_SUPPRESS_INIT 1
274#endif
267 275
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
270 277
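A quick spot check of the new timeout arithmetic, assuming (for illustration only) HZ=250, CONFIG_RCU_CPU_STALL_TIMEOUT=60 and RCU_STALL_DELAY_DELTA=0: the first warning fires after 60 * 250 = 15000 jiffies (60 s), and rechecks happen every 3 * 15000 + 30 = 45030 jiffies (about 180 s). The same numbers, computed:

#include <stdio.h>

#define HZ				250	/* assumed for this example */
#define CONFIG_RCU_CPU_STALL_TIMEOUT	60	/* assumed Kconfig value */
#define RCU_STALL_DELAY_DELTA		0
#define RCU_SECONDS_TILL_STALL_CHECK	(CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
					 RCU_STALL_DELAY_DELTA)
#define RCU_SECONDS_TILL_STALL_RECHECK	(3 * RCU_SECONDS_TILL_STALL_CHECK + 30)

int main(void)
{
	printf("check   = %d jiffies (%d s)\n", RCU_SECONDS_TILL_STALL_CHECK,
	       RCU_SECONDS_TILL_STALL_CHECK / HZ);	/* 15000 jiffies, 60 s */
	printf("recheck = %d jiffies (~%d s)\n", RCU_SECONDS_TILL_STALL_RECHECK,
	       RCU_SECONDS_TILL_STALL_RECHECK / HZ);	/* 45030 jiffies, ~180 s */
	return 0;
}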
271/* 278/*
272 * RCU global state, including node hierarchy. This hierarchy is 279 * RCU global state, including node hierarchy. This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
283 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 290 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
284 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 291 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
285 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 292 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
286 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ 293 struct rcu_data __percpu *rda; /* pointer to percpu rcu_data. */
287 294
288 /* The following fields are guarded by the root rcu_node's lock. */ 295 /* The following fields are guarded by the root rcu_node's lock. */
289 296
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 372#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp); 373static void rcu_print_detail_task_stall(struct rcu_state *rsp);
367static void rcu_print_task_stall(struct rcu_node *rnp); 374static void rcu_print_task_stall(struct rcu_node *rnp);
375static void rcu_preempt_stall_reset(void);
368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 376#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 377static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
370#ifdef CONFIG_HOTPLUG_CPU 378#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d9..71a4147473f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
57 printk(KERN_INFO 57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n"); 58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif 59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE 60#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif 62#endif
63#if NUM_RCU_LVL_4 != 0 63#if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
155 155
156 /* Possibly blocking in an RCU read-side critical section. */ 156 /* Possibly blocking in an RCU read-side critical section. */
157 rdp = rcu_preempt_state.rda[cpu]; 157 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
158 rnp = rdp->mynode; 158 rnp = rdp->mynode;
159 raw_spin_lock_irqsave(&rnp->lock, flags); 159 raw_spin_lock_irqsave(&rnp->lock, flags);
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
201 */ 201 */
202void __rcu_read_lock(void) 202void __rcu_read_lock(void)
203{ 203{
204 ACCESS_ONCE(current->rcu_read_lock_nesting)++; 204 current->rcu_read_lock_nesting++;
205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ 205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
206} 206}
207EXPORT_SYMBOL_GPL(__rcu_read_lock); 207EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
344 struct task_struct *t = current; 344 struct task_struct *t = current;
345 345
346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 347 --t->rcu_read_lock_nesting;
348 barrier(); /* decrement before load of ->rcu_read_unlock_special */
349 if (t->rcu_read_lock_nesting == 0 &&
348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 350 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
349 rcu_read_unlock_special(t); 351 rcu_read_unlock_special(t);
350#ifdef CONFIG_PROVE_LOCKING 352#ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
417 } 419 }
418} 420}
419 421
422/*
423 * Suppress preemptible RCU's CPU stall warnings by pushing the
424 * time of the next stall-warning message comfortably far into the
425 * future.
426 */
427static void rcu_preempt_stall_reset(void)
428{
429 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
430}
431
420#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 432#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
421 433
422/* 434/*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
546 * 558 *
547 * Control will return to the caller some time after a full grace 559 * Control will return to the caller some time after a full grace
548 * period has elapsed, in other words after all currently executing RCU 560 * period has elapsed, in other words after all currently executing RCU
549 * read-side critical sections have completed. RCU read-side critical 561 * read-side critical sections have completed. Note, however, that
550 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 562 * upon return from synchronize_rcu(), the caller might well be executing
551 * and may be nested. 563 * concurrently with new RCU read-side critical sections that began while
564 * synchronize_rcu() was waiting. RCU read-side critical sections are
565 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
552 */ 566 */
553void synchronize_rcu(void) 567void synchronize_rcu(void)
554{ 568{
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
771 */ 785 */
772static void __init __rcu_init_preempt(void) 786static void __init __rcu_init_preempt(void)
773{ 787{
774 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); 788 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
775} 789}
776 790
777/* 791/*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
865{ 879{
866} 880}
867 881
882/*
883 * Because preemptible RCU does not exist, there is no need to suppress
884 * its CPU stall warnings.
885 */
886static void rcu_preempt_stall_reset(void)
887{
888}
889
868#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 890#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
869 891
870/* 892/*
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void)
919} 941}
920 942
921/* 943/*
922 * In classic RCU, call_rcu() is just call_rcu_sched().
923 */
924void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
925{
926 call_rcu_sched(head, func);
927}
928EXPORT_SYMBOL_GPL(call_rcu);
929
930/*
931 * Wait for an rcu-preempt grace period, but make it happen quickly. 944 * Wait for an rcu-preempt grace period, but make it happen quickly.
932 * But because preemptable RCU does not exist, map to rcu-sched. 945 * But because preemptable RCU does not exist, map to rcu-sched.
933 */ 946 */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738e..d15430b9d122 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
64 rdp->dynticks_fqs); 64 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 65#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); 67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
68} 70}
69 71
70#define PRINT_RCU_DATA(name, func, m) \ 72#define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
119 rdp->dynticks_fqs); 121 rdp->dynticks_fqs);
120#endif /* #ifdef CONFIG_NO_HZ */ 122#endif /* #ifdef CONFIG_NO_HZ */
121 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
122 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); 124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
123} 127}
124 128
125static int show_rcudata_csv(struct seq_file *m, void *unused) 129static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
128#ifdef CONFIG_NO_HZ 132#ifdef CONFIG_NO_HZ
129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
130#endif /* #ifdef CONFIG_NO_HZ */ 134#endif /* #ifdef CONFIG_NO_HZ */
131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
132#ifdef CONFIG_TREE_PREEMPT_RCU 136#ifdef CONFIG_TREE_PREEMPT_RCU
133 seq_puts(m, "\"rcu_preempt:\"\n"); 137 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 138 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
262 struct rcu_data *rdp; 266 struct rcu_data *rdp;
263 267
264 for_each_possible_cpu(cpu) { 268 for_each_possible_cpu(cpu) {
265 rdp = rsp->rda[cpu]; 269 rdp = per_cpu_ptr(rsp->rda, cpu);
266 if (rdp->beenonline) 270 if (rdp->beenonline)
267 print_one_rcu_pending(m, rdp); 271 print_one_rcu_pending(m, rdp);
268 } 272 }
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..9fad33efd0db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 60static void *r_next(struct seq_file *m, void *v, loff_t *pos)
44{ 61{
45 struct resource *p = v; 62 struct resource *p = v;
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn)
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358} 375}
359 376
377static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail,
379 resource_size_t size,
380 resource_size_t align)
381{
382 return avail->start;
383}
384
385static void resource_clip(struct resource *res, resource_size_t min,
386 resource_size_t max)
387{
388 if (res->start < min)
389 res->start = min;
390 if (res->end > max)
391 res->end = max;
392}
393
394static bool resource_contains(struct resource *res1, struct resource *res2)
395{
396 return res1->start <= res2->start && res1->end >= res2->end;
397}
398
399/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
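Both allocators now share the same acceptance test: clip the free window to [min, max], align its start, let alignf pick a start inside the window, and only accept the allocation if the aligned candidate is still contained in the window. A small user-space model of those steps (all values and helper names below are made up for illustration; this is not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct res {
	unsigned long start, end;
};

static void clip(struct res *r, unsigned long min, unsigned long max)
{
	if (r->start < min)
		r->start = min;
	if (r->end > max)
		r->end = max;
}

static bool contains(const struct res *outer, const struct res *inner)
{
	return outer->start <= inner->start && outer->end >= inner->end;
}

int main(void)
{
	struct res gap = { 0x9f000, 0xfffff };	/* free window found in the tree */
	struct res avail = gap, alloc;
	unsigned long size = 0x20000, align = 0x10000;

	clip(&avail, 0xa0000, 0xffffffff);			/* honour min/max */
	avail.start = (avail.start + align - 1) & ~(align - 1);	/* ALIGN() */
	alloc.start = avail.start;			/* simple_align_resource() */
	alloc.end = alloc.start + size - 1;
	printf("fits: %d\n", contains(&avail, &alloc));	/* prints 1 */
	return 0;
}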
360/* 465/*
361 * Find empty slot in the resource tree given range and alignment. 466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
362 */ 468 */
363static int find_resource(struct resource *root, struct resource *new, 469static int find_resource(struct resource *root, struct resource *new,
364 resource_size_t size, resource_size_t min, 470 resource_size_t size, resource_size_t min,
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new,
370 void *alignf_data) 476 void *alignf_data)
371{ 477{
372 struct resource *this = root->child; 478 struct resource *this = root->child;
373 struct resource tmp = *new; 479 struct resource tmp = *new, avail, alloc;
374 480
375 tmp.start = root->start; 481 tmp.start = root->start;
376 /* 482 /*
377 * Skip past an allocated resource that starts at 0, since the assignment 483 * Skip past an allocated resource that starts at 0, since the
378 * of this->start - 1 to tmp->end below would cause an underflow. 484 * assignment of this->start - 1 to tmp->end below would cause an
485 * underflow.
379 */ 486 */
380 if (this && this->start == 0) { 487 if (this && this->start == 0) {
381 tmp.start = this->end + 1; 488 tmp.start = this->end + 1;
382 this = this->sibling; 489 this = this->sibling;
383 } 490 }
384 for(;;) { 491 for (;;) {
385 if (this) 492 if (this)
386 tmp.end = this->start - 1; 493 tmp.end = this->start - 1;
387 else 494 else
388 tmp.end = root->end; 495 tmp.end = root->end;
389 if (tmp.start < min) 496
390 tmp.start = min; 497 resource_clip(&tmp, min, max);
391 if (tmp.end > max) 498
392 tmp.end = max; 499 /* Check for overflow after ALIGN() */
393 tmp.start = ALIGN(tmp.start, align); 500 avail = *new;
394 if (alignf) 501 avail.start = ALIGN(tmp.start, align);
395 tmp.start = alignf(alignf_data, &tmp, size, align); 502 avail.end = tmp.end;
396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 503 if (avail.start >= tmp.start) {
397 new->start = tmp.start; 504 alloc.start = alignf(alignf_data, &avail, size, align);
398 new->end = tmp.start + size - 1; 505 alloc.end = alloc.start + size - 1;
399 return 0; 506 if (resource_contains(&avail, &alloc)) {
507 new->start = alloc.start;
508 new->end = alloc.end;
509 return 0;
510 }
400 } 511 }
512
401 if (!this) 513 if (!this)
402 break; 514 break;
515
403 tmp.start = this->end + 1; 516 tmp.start = this->end + 1;
404 this = this->sibling; 517 this = this->sibling;
405 } 518 }
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new,
428{ 541{
429 int err; 542 int err;
430 543
544 if (!alignf)
545 alignf = simple_align_resource;
546
431 write_lock(&resource_lock); 547 write_lock(&resource_lock);
432 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 548 if (resource_alloc_from_bottom)
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
433 if (err >= 0 && __request_resource(root, new)) 552 if (err >= 0 && __request_resource(root, new))
434 err = -EBUSY; 553 err = -EBUSY;
435 write_unlock(&resource_lock); 554 write_unlock(&resource_lock);
@@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
453 572
454 if (first == parent) 573 if (first == parent)
455 return first; 574 return first;
575 if (WARN_ON(first == new)) /* duplicated insertion */
576 return first;
456 577
457 if ((first->start > new->start) || (first->end < new->end)) 578 if ((first->start > new->start) || (first->end < new->end))
458 break; 579 break;
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index a56f629b057a..66cb89bc5ef1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
76 } 76 }
77 77
78 if (!lockwakeup && td->bkl == 4) { 78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
79 unlock_kernel(); 80 unlock_kernel();
81#endif
80 td->bkl = 0; 82 td->bkl = 0;
81 } 83 }
82 return 0; 84 return 0;
@@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
133 if (td->bkl) 135 if (td->bkl)
134 return 0; 136 return 0;
135 td->bkl = 1; 137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
136 lock_kernel(); 139 lock_kernel();
140#endif
137 td->bkl = 4; 141 td->bkl = 4;
138 return 0; 142 return 0;
139 143
140 case RTTEST_UNLOCKBKL: 144 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4) 145 if (td->bkl != 4)
142 break; 146 break;
147#ifdef CONFIG_LOCK_KERNEL
143 unlock_kernel(); 148 unlock_kernel();
149#endif
144 td->bkl = 0; 150 td->bkl = 0;
145 return 0; 151 return 0;
146 152
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -425,9 +426,7 @@ struct root_domain {
425 */ 426 */
426 cpumask_var_t rto_mask; 427 cpumask_var_t rto_mask;
427 atomic_t rto_count; 428 atomic_t rto_count;
428#ifdef CONFIG_SMP
429 struct cpupri cpupri; 429 struct cpupri cpupri;
430#endif
431}; 430};
432 431
433/* 432/*
@@ -436,7 +435,7 @@ struct root_domain {
436 */ 435 */
437static struct root_domain def_root_domain; 436static struct root_domain def_root_domain;
438 437
439#endif 438#endif /* CONFIG_SMP */
440 439
441/* 440/*
442 * This is the main, per-CPU runqueue data structure. 441 * This is the main, per-CPU runqueue data structure.
@@ -456,9 +455,10 @@ struct rq {
456 unsigned long nr_running; 455 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5 456 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 457 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
458 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ 459#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp; 460 u64 nohz_stamp;
461 unsigned char in_nohz_recently; 461 unsigned char nohz_balance_kick;
462#endif 462#endif
463 unsigned int skip_clock_update; 463 unsigned int skip_clock_update;
464 464
@@ -486,11 +486,12 @@ struct rq {
486 */ 486 */
487 unsigned long nr_uninterruptible; 487 unsigned long nr_uninterruptible;
488 488
489 struct task_struct *curr, *idle; 489 struct task_struct *curr, *idle, *stop;
490 unsigned long next_balance; 490 unsigned long next_balance;
491 struct mm_struct *prev_mm; 491 struct mm_struct *prev_mm;
492 492
493 u64 clock; 493 u64 clock;
494 u64 clock_task;
494 495
495 atomic_t nr_iowait; 496 atomic_t nr_iowait;
496 497
@@ -518,6 +519,10 @@ struct rq {
518 u64 avg_idle; 519 u64 avg_idle;
519#endif 520#endif
520 521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
521 /* calc_load related fields */ 526 /* calc_load related fields */
522 unsigned long calc_load_update; 527 unsigned long calc_load_update;
523 long calc_load_active; 528 long calc_load_active;
@@ -641,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
641 646
642#endif /* CONFIG_CGROUP_SCHED */ 647#endif /* CONFIG_CGROUP_SCHED */
643 648
649static u64 irq_time_cpu(int cpu);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651
644inline void update_rq_clock(struct rq *rq) 652inline void update_rq_clock(struct rq *rq)
645{ 653{
646 if (!rq->skip_clock_update) 654 if (!rq->skip_clock_update) {
647 rq->clock = sched_clock_cpu(cpu_of(rq)); 655 int cpu = cpu_of(rq);
656 u64 irq_time;
657
658 rq->clock = sched_clock_cpu(cpu);
659 irq_time = irq_time_cpu(cpu);
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662
663 sched_irq_time_avg_update(rq, irq_time);
664 }
648} 665}
649 666
650/* 667/*
@@ -721,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
721 size_t cnt, loff_t *ppos) 738 size_t cnt, loff_t *ppos)
722{ 739{
723 char buf[64]; 740 char buf[64];
724 char *cmp = buf; 741 char *cmp;
725 int neg = 0; 742 int neg = 0;
726 int i; 743 int i;
727 744
@@ -732,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
732 return -EFAULT; 749 return -EFAULT;
733 750
734 buf[cnt] = 0; 751 buf[cnt] = 0;
752 cmp = strstrip(buf);
735 753
736 if (strncmp(buf, "NO_", 3) == 0) { 754 if (strncmp(buf, "NO_", 3) == 0) {
737 neg = 1; 755 neg = 1;
@@ -739,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
739 } 757 }
740 758
741 for (i = 0; sched_feat_names[i]; i++) { 759 for (i = 0; sched_feat_names[i]; i++) {
742 int len = strlen(sched_feat_names[i]); 760 if (strcmp(cmp, sched_feat_names[i]) == 0) {
743
744 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
745 if (neg) 761 if (neg)
746 sysctl_sched_features &= ~(1UL << i); 762 sysctl_sched_features &= ~(1UL << i);
747 else 763 else
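The feature parser now strips the buffer and requires an exact name match, so a trailing newline from "echo FOO > sched_features" no longer matters and a name can no longer match on a mere prefix. A user-space sketch of that parsing (strstrip_demo() is a rough stand-in for the kernel's strstrip(), and the feature names are just examples):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static const char *feat_names[] = { "GENTLE_FAIR_SLEEPERS", "START_DEBIT", NULL };

static char *strstrip_demo(char *s)	/* rough model of strstrip() */
{
	char *end = s + strlen(s);

	while (end > s && isspace((unsigned char)end[-1]))
		*--end = '\0';
	while (*s && isspace((unsigned char)*s))
		s++;
	return s;
}

int main(void)
{
	char buf[64] = "START_DEBIT\n";		/* echo appends the newline */
	char *cmp = strstrip_demo(buf);
	int i;

	for (i = 0; feat_names[i]; i++)
		if (strcmp(cmp, feat_names[i]) == 0)
			printf("matched %s\n", feat_names[i]);
	return 0;
}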
@@ -1193,6 +1209,27 @@ static void resched_cpu(int cpu)
1193 1209
1194#ifdef CONFIG_NO_HZ 1210#ifdef CONFIG_NO_HZ
1195/* 1211/*
1212 * In the semi-idle case, use the nearest busy cpu for migrating timers
1213 * from an idle cpu. This is good for power savings.
1214 *
1215 * We don't do a similar optimization for a completely idle system, as
1216 * selecting an idle cpu would add more delay to the timers than intended
1217 * (as that cpu's timer base may not be up to date with respect to jiffies etc.).
1218 */
1219int get_nohz_timer_target(void)
1220{
1221 int cpu = smp_processor_id();
1222 int i;
1223 struct sched_domain *sd;
1224
1225 for_each_domain(cpu, sd) {
1226 for_each_cpu(i, sched_domain_span(sd))
1227 if (!idle_cpu(i))
1228 return i;
1229 }
1230 return cpu;
1231}
1232/*
1196 * When add_timer_on() enqueues a timer into the timer wheel of an 1233 * When add_timer_on() enqueues a timer into the timer wheel of an
1197 * idle CPU then this timer might expire before the next timer event 1234 * idle CPU then this timer might expire before the next timer event
1198 * which is scheduled to wake up that CPU. In case of a completely 1235 * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1269,6 @@ void wake_up_idle_cpu(int cpu)
1232 smp_send_reschedule(cpu); 1269 smp_send_reschedule(cpu);
1233} 1270}
1234 1271
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1245#endif /* CONFIG_NO_HZ */ 1272#endif /* CONFIG_NO_HZ */
1246 1273
1247static u64 sched_avg_period(void) 1274static u64 sched_avg_period(void)
@@ -1281,6 +1308,10 @@ static void resched_task(struct task_struct *p)
1281static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1308static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1282{ 1309{
1283} 1310}
1311
1312static void sched_avg_update(struct rq *rq)
1313{
1314}
1284#endif /* CONFIG_SMP */ 1315#endif /* CONFIG_SMP */
1285 1316
1286#if BITS_PER_LONG == 32 1317#if BITS_PER_LONG == 32
@@ -1652,7 +1683,7 @@ static void update_shares(struct sched_domain *sd)
1652 if (root_task_group_empty()) 1683 if (root_task_group_empty())
1653 return; 1684 return;
1654 1685
1655 now = cpu_clock(raw_smp_processor_id()); 1686 now = local_clock();
1656 elapsed = now - sd->last_update; 1687 elapsed = now - sd->last_update;
1657 1688
1658 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1689 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1836,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805static void calc_load_account_idle(struct rq *this_rq); 1836static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1837static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1838static int get_update_sysctl_factor(void);
1839static void update_cpu_load(struct rq *this_rq);
1808 1840
1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1841static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810{ 1842{
@@ -1822,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1822 1854
1823static const struct sched_class rt_sched_class; 1855static const struct sched_class rt_sched_class;
1824 1856
1825#define sched_class_highest (&rt_sched_class) 1857#define sched_class_highest (&stop_sched_class)
1826#define for_each_class(class) \ 1858#define for_each_class(class) \
1827 for (class = sched_class_highest; class; class = class->next) 1859 for (class = sched_class_highest; class; class = class->next)
1828 1860
@@ -1840,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
1840 1872
1841static void set_load_weight(struct task_struct *p) 1873static void set_load_weight(struct task_struct *p)
1842{ 1874{
1843 if (task_has_rt_policy(p)) {
1844 p->se.load.weight = 0;
1845 p->se.load.inv_weight = WMULT_CONST;
1846 return;
1847 }
1848
1849 /* 1875 /*
1850 * SCHED_IDLE tasks get minimal weight: 1876 * SCHED_IDLE tasks get minimal weight:
1851 */ 1877 */
@@ -1899,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1899 dec_nr_running(rq); 1925 dec_nr_running(rq);
1900} 1926}
1901 1927
1928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1929
1930/*
1931 * There are no locks covering percpu hardirq/softirq time.
1932 * They are only modified in account_system_vtime(), on the corresponding CPU
1933 * with interrupts disabled, so writes are safe.
1934 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This means another CPU may read this CPU's irq time and race with
1936 * irq/account_system_vtime() on this CPU. We would then see either the old
1937 * or the new value (or a half-updated value on 32 bit), with the side effect
1938 * of accounting a slice of irq time to the wrong task when an irq is in
1939 * progress while we read rq->clock. That is a worthy compromise compared
1940 * with taking locks on each irq in account_system_time().
1941 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time);
1944
1945static DEFINE_PER_CPU(u64, irq_start_time);
1946static int sched_clock_irqtime;
1947
1948void enable_sched_clock_irqtime(void)
1949{
1950 sched_clock_irqtime = 1;
1951}
1952
1953void disable_sched_clock_irqtime(void)
1954{
1955 sched_clock_irqtime = 0;
1956}
1957
1958static u64 irq_time_cpu(int cpu)
1959{
1960 if (!sched_clock_irqtime)
1961 return 0;
1962
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964}
1965
1966void account_system_vtime(struct task_struct *curr)
1967{
1968 unsigned long flags;
1969 int cpu;
1970 u64 now, delta;
1971
1972 if (!sched_clock_irqtime)
1973 return;
1974
1975 local_irq_save(flags);
1976
1977 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu);
1979 delta = now - per_cpu(irq_start_time, cpu);
1980 per_cpu(irq_start_time, cpu) = now;
1981 /*
1982 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to the ksoftirqd thread
1984 * in that case, so as not to confuse the scheduler with a special task
1985 * that does not consume any time but still wants to run.
1986 */
1987 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta;
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta;
1991
1992 local_irq_restore(flags);
1993}
1994EXPORT_SYMBOL_GPL(account_system_vtime);
1995
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
1997{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time;
2000 rq->prev_irq_time = curr_irq_time;
2001 sched_rt_avg_update(rq, delta_irq);
2002 }
2003}
2004
2005#else
2006
2007static u64 irq_time_cpu(int cpu)
2008{
2009 return 0;
2010}
2011
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
2013
2014#endif
2015
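What the new clock_task buys: update_rq_clock() above subtracts the accumulated per-cpu irq time from the raw clock, and clock_task only ever moves forward, so later hunks (for example do_task_delta_exec()) stop charging hardirq/softirq time to whichever task happened to be interrupted. A stand-alone model of just that arithmetic (simplified and hypothetical, not the kernel code):

#include <stdint.h>
#include <stdio.h>

struct rq_model {
	uint64_t clock, clock_task;
};

static void update_rq_clock_model(struct rq_model *rq, uint64_t now,
				  uint64_t irq_time)
{
	rq->clock = now;
	/* clock_task advances by (clock delta minus irq delta), never backwards */
	if (rq->clock - irq_time > rq->clock_task)
		rq->clock_task = rq->clock - irq_time;
}

int main(void)
{
	struct rq_model rq = { 0, 0 };

	update_rq_clock_model(&rq, 1000, 0);	/* no irq time yet */
	update_rq_clock_model(&rq, 2000, 600);	/* 600 ns spent in irqs */
	printf("clock=%llu clock_task=%llu\n",
	       (unsigned long long)rq.clock,
	       (unsigned long long)rq.clock_task);	/* 2000, 1400 */
	return 0;
}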
1902#include "sched_idletask.c" 2016#include "sched_idletask.c"
1903#include "sched_fair.c" 2017#include "sched_fair.c"
1904#include "sched_rt.c" 2018#include "sched_rt.c"
2019#include "sched_stoptask.c"
1905#ifdef CONFIG_SCHED_DEBUG 2020#ifdef CONFIG_SCHED_DEBUG
1906# include "sched_debug.c" 2021# include "sched_debug.c"
1907#endif 2022#endif
1908 2023
2024void sched_set_stop_task(int cpu, struct task_struct *stop)
2025{
2026 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2027 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2028
2029 if (stop) {
2030 /*
2031 * Make it appear like a SCHED_FIFO task; it's something
2032 * userspace knows about and won't get confused about.
2033 *
2034 * Also, it will make PI more or less work without too
2035 * much confusion -- but then, stop work should not
2036 * rely on PI working anyway.
2037 */
2038 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2039
2040 stop->sched_class = &stop_sched_class;
2041 }
2042
2043 cpu_rq(cpu)->stop = stop;
2044
2045 if (old_stop) {
2046 /*
2047 * Reset it back to a normal scheduling class so that
2048 * it can die in pieces.
2049 */
2050 old_stop->sched_class = &rt_sched_class;
2051 }
2052}
2053
1909/* 2054/*
1910 * __normal_prio - return the priority that is based on the static prio 2055 * __normal_prio - return the priority that is based on the static prio
1911 */ 2056 */
@@ -1985,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1985 if (p->sched_class != &fair_sched_class) 2130 if (p->sched_class != &fair_sched_class)
1986 return 0; 2131 return 0;
1987 2132
2133 if (unlikely(p->policy == SCHED_IDLE))
2134 return 0;
2135
1988 /* 2136 /*
1989 * Buddy candidates are cache hot: 2137 * Buddy candidates are cache hot:
1990 */ 2138 */
@@ -2267,11 +2415,55 @@ static void update_avg(u64 *avg, u64 sample)
2267} 2415}
2268#endif 2416#endif
2269 2417
2270/*** 2418static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2419 bool is_sync, bool is_migrate, bool is_local,
2420 unsigned long en_flags)
2421{
2422 schedstat_inc(p, se.statistics.nr_wakeups);
2423 if (is_sync)
2424 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2425 if (is_migrate)
2426 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2427 if (is_local)
2428 schedstat_inc(p, se.statistics.nr_wakeups_local);
2429 else
2430 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2431
2432 activate_task(rq, p, en_flags);
2433}
2434
2435static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2436 int wake_flags, bool success)
2437{
2438 trace_sched_wakeup(p, success);
2439 check_preempt_curr(rq, p, wake_flags);
2440
2441 p->state = TASK_RUNNING;
2442#ifdef CONFIG_SMP
2443 if (p->sched_class->task_woken)
2444 p->sched_class->task_woken(rq, p);
2445
2446 if (unlikely(rq->idle_stamp)) {
2447 u64 delta = rq->clock - rq->idle_stamp;
2448 u64 max = 2*sysctl_sched_migration_cost;
2449
2450 if (delta > max)
2451 rq->avg_idle = max;
2452 else
2453 update_avg(&rq->avg_idle, delta);
2454 rq->idle_stamp = 0;
2455 }
2456#endif
2457 /* if a worker is waking up, notify workqueue */
2458 if ((p->flags & PF_WQ_WORKER) && success)
2459 wq_worker_waking_up(p, cpu_of(rq));
2460}
2461
2462/**
2271 * try_to_wake_up - wake up a thread 2463 * try_to_wake_up - wake up a thread
2272 * @p: the to-be-woken-up thread 2464 * @p: the thread to be awakened
2273 * @state: the mask of task states that can be woken 2465 * @state: the mask of task states that can be woken
2274 * @sync: do a synchronous wakeup? 2466 * @wake_flags: wake modifier flags (WF_*)
2275 * 2467 *
2276 * Put it on the run-queue if it's not already there. The "current" 2468 * Put it on the run-queue if it's not already there. The "current"
2277 * thread is always on the run-queue (except when the actual 2469 * thread is always on the run-queue (except when the actual
@@ -2279,7 +2471,8 @@ static void update_avg(u64 *avg, u64 sample)
2279 * the simpler "current->state = TASK_RUNNING" to mark yourself 2471 * the simpler "current->state = TASK_RUNNING" to mark yourself
2280 * runnable without the overhead of this. 2472 * runnable without the overhead of this.
2281 * 2473 *
2282 * returns failure only if the task is already active. 2474 * Returns %true if @p was woken up, %false if it was already running
2475 * or @state didn't match @p's state.
2283 */ 2476 */
2284static int try_to_wake_up(struct task_struct *p, unsigned int state, 2477static int try_to_wake_up(struct task_struct *p, unsigned int state,
2285 int wake_flags) 2478 int wake_flags)
@@ -2359,38 +2552,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359 2552
2360out_activate: 2553out_activate:
2361#endif /* CONFIG_SMP */ 2554#endif /* CONFIG_SMP */
2362 schedstat_inc(p, se.statistics.nr_wakeups); 2555 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2363 if (wake_flags & WF_SYNC) 2556 cpu == this_cpu, en_flags);
2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2365 if (orig_cpu != cpu)
2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2367 if (cpu == this_cpu)
2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2369 else
2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2371 activate_task(rq, p, en_flags);
2372 success = 1; 2557 success = 1;
2373
2374out_running: 2558out_running:
2375 trace_sched_wakeup(p, success); 2559 ttwu_post_activation(p, rq, wake_flags, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394out: 2560out:
2395 task_rq_unlock(rq, &flags); 2561 task_rq_unlock(rq, &flags);
2396 put_cpu(); 2562 put_cpu();
@@ -2399,6 +2565,37 @@ out:
2399} 2565}
2400 2566
2401/** 2567/**
2568 * try_to_wake_up_local - try to wake up a local task with rq lock held
2569 * @p: the thread to be awakened
2570 *
2571 * Put @p on the run-queue if it's not already there. The caller must
2572 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2573 * the current task. this_rq() stays locked over invocation.
2574 */
2575static void try_to_wake_up_local(struct task_struct *p)
2576{
2577 struct rq *rq = task_rq(p);
2578 bool success = false;
2579
2580 BUG_ON(rq != this_rq());
2581 BUG_ON(p == current);
2582 lockdep_assert_held(&rq->lock);
2583
2584 if (!(p->state & TASK_NORMAL))
2585 return;
2586
2587 if (!p->se.on_rq) {
2588 if (likely(!task_running(rq, p))) {
2589 schedstat_inc(rq, ttwu_count);
2590 schedstat_inc(rq, ttwu_local);
2591 }
2592 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2593 success = true;
2594 }
2595 ttwu_post_activation(p, rq, 0, success);
2596}
2597
2598/**
2402 * wake_up_process - Wake up a specific process 2599 * wake_up_process - Wake up a specific process
2403 * @p: The process to be woken up. 2600 * @p: The process to be woken up.
2404 * 2601 *
@@ -2785,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2785 */ 2982 */
2786 arch_start_context_switch(prev); 2983 arch_start_context_switch(prev);
2787 2984
2788 if (likely(!mm)) { 2985 if (!mm) {
2789 next->active_mm = oldmm; 2986 next->active_mm = oldmm;
2790 atomic_inc(&oldmm->mm_count); 2987 atomic_inc(&oldmm->mm_count);
2791 enter_lazy_tlb(oldmm, next); 2988 enter_lazy_tlb(oldmm, next);
2792 } else 2989 } else
2793 switch_mm(oldmm, mm, next); 2990 switch_mm(oldmm, mm, next);
2794 2991
2795 if (likely(!prev->mm)) { 2992 if (!prev->mm) {
2796 prev->active_mm = NULL; 2993 prev->active_mm = NULL;
2797 rq->prev_mm = oldmm; 2994 rq->prev_mm = oldmm;
2798 } 2995 }
@@ -3012,23 +3209,102 @@ static void calc_load_account_active(struct rq *this_rq)
3012} 3209}
3013 3210
3014/* 3211/*
3212 * The exact cpuload at various idx values, calculated at every tick would be
3213 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3214 *
3215 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3216 * on nth tick when cpu may be busy, then we have:
3217 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3218 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3219 *
3220 * decay_load_missed() below does efficient calculation of
3221 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3222 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3223 *
3224 * The calculation is approximated on a 128 point scale.
3225 * degrade_zero_ticks is the number of ticks after which load at any
3226 * particular idx is approximated to be zero.
3227 * degrade_factor is a precomputed table, a row for each load idx.
3228 * Each column corresponds to degradation factor for a power of two ticks,
3229 * based on 128 point scale.
3230 * Example:
3231 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3232 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3233 *
3234 * With these power-of-2 load factors, we can degrade the load n times
3235 * by looking at the 1 bits in n and doing as many mult/shifts, instead of
3236 * the n mult/shifts needed by the exact degradation.
3237 */
3238#define DEGRADE_SHIFT 7
3239static const unsigned char
3240 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3241static const unsigned char
3242 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3243 {0, 0, 0, 0, 0, 0, 0, 0},
3244 {64, 32, 8, 0, 0, 0, 0, 0},
3245 {96, 72, 40, 12, 1, 0, 0},
3246 {112, 98, 75, 43, 15, 1, 0},
3247 {120, 112, 98, 76, 45, 16, 2} };
3248
3249/*
3250 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3251 * would be when CPU is idle and so we just decay the old load without
3252 * adding any new load.
3253 */
3254static unsigned long
3255decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3256{
3257 int j = 0;
3258
3259 if (!missed_updates)
3260 return load;
3261
3262 if (missed_updates >= degrade_zero_ticks[idx])
3263 return 0;
3264
3265 if (idx == 1)
3266 return load >> missed_updates;
3267
3268 while (missed_updates) {
3269 if (missed_updates % 2)
3270 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3271
3272 missed_updates >>= 1;
3273 j++;
3274 }
3275 return load;
3276}
3277
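The degrade_factor table can be sanity-checked against the exact formula in the comment above. At idx 2 the per-tick factor is 3/4, so after 8 missed ticks the exact decay is (3/4)^8, roughly 12.8/128, which the table rounds down to 12/128 (row 2, column 3). A stand-alone spot check of that claim (a simplified copy for illustration that drops the missed_updates == 0 and degrade_zero_ticks short-circuits; not the kernel code itself):

#include <stdio.h>

#define DEGRADE_SHIFT	7
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2},
};

static unsigned long decay_missed(unsigned long load, unsigned long missed, int idx)
{
	int j = 0;

	if (idx == 1)
		return load >> missed;
	while (missed) {
		if (missed % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long load = 1280, exact = load;
	int n;

	for (n = 0; n < 8; n++)		/* exact decay over 8 missed ticks */
		exact = exact * 3 / 4;
	printf("table: %lu  exact: %lu\n",
	       decay_missed(1280, 8, 2), exact);	/* table: 120  exact: 127 */
	return 0;
}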
3278/*
3015 * Update rq->cpu_load[] statistics. This function is usually called every 3279 * Update rq->cpu_load[] statistics. This function is usually called every
3016 * scheduler tick (TICK_NSEC). 3280 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3281 * every tick. We fix it up based on jiffies.
3017 */ 3282 */
3018static void update_cpu_load(struct rq *this_rq) 3283static void update_cpu_load(struct rq *this_rq)
3019{ 3284{
3020 unsigned long this_load = this_rq->load.weight; 3285 unsigned long this_load = this_rq->load.weight;
3286 unsigned long curr_jiffies = jiffies;
3287 unsigned long pending_updates;
3021 int i, scale; 3288 int i, scale;
3022 3289
3023 this_rq->nr_load_updates++; 3290 this_rq->nr_load_updates++;
3024 3291
3292 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3293 if (curr_jiffies == this_rq->last_load_update_tick)
3294 return;
3295
3296 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3297 this_rq->last_load_update_tick = curr_jiffies;
3298
3025 /* Update our load: */ 3299 /* Update our load: */
3026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3300 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3301 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3027 unsigned long old_load, new_load; 3302 unsigned long old_load, new_load;
3028 3303
3029 /* scale is effectively 1 << i now, and >> i divides by scale */ 3304 /* scale is effectively 1 << i now, and >> i divides by scale */
3030 3305
3031 old_load = this_rq->cpu_load[i]; 3306 old_load = this_rq->cpu_load[i];
3307 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3032 new_load = this_load; 3308 new_load = this_load;
3033 /* 3309 /*
3034 * Round up the averaging division if load is increasing. This 3310 * Round up the averaging division if load is increasing. This
@@ -3036,10 +3312,18 @@ static void update_cpu_load(struct rq *this_rq)
3036 * example. 3312 * example.
3037 */ 3313 */
3038 if (new_load > old_load) 3314 if (new_load > old_load)
3039 new_load += scale-1; 3315 new_load += scale - 1;
3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3316
3317 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3041 } 3318 }
3042 3319
3320 sched_avg_update(this_rq);
3321}
3322
3323static void update_cpu_load_active(struct rq *this_rq)
3324{
3325 update_cpu_load(this_rq);
3326
3043 calc_load_account_active(this_rq); 3327 calc_load_account_active(this_rq);
3044} 3328}
3045 3329
@@ -3094,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3094 3378
3095 if (task_current(rq, p)) { 3379 if (task_current(rq, p)) {
3096 update_rq_clock(rq); 3380 update_rq_clock(rq);
3097 ns = rq->clock - p->se.exec_start; 3381 ns = rq->clock_task - p->se.exec_start;
3098 if ((s64)ns < 0) 3382 if ((s64)ns < 0)
3099 ns = 0; 3383 ns = 0;
3100 } 3384 }
@@ -3243,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3243 tmp = cputime_to_cputime64(cputime); 3527 tmp = cputime_to_cputime64(cputime);
3244 if (hardirq_count() - hardirq_offset) 3528 if (hardirq_count() - hardirq_offset)
3245 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3529 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3246 else if (softirq_count()) 3530 else if (in_serving_softirq())
3247 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3531 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3248 else 3532 else
3249 cpustat->system = cputime64_add(cpustat->system, tmp); 3533 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3359,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3359 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3643 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3360 3644
3361 if (total) { 3645 if (total) {
3362 u64 temp; 3646 u64 temp = rtime;
3363 3647
3364 temp = (u64)(rtime * utime); 3648 temp *= utime;
3365 do_div(temp, total); 3649 do_div(temp, total);
3366 utime = (cputime_t)temp; 3650 utime = (cputime_t)temp;
3367 } else 3651 } else
@@ -3392,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3392 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3676 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3393 3677
3394 if (total) { 3678 if (total) {
3395 u64 temp; 3679 u64 temp = rtime;
3396 3680
3397 temp = (u64)(rtime * cputime.utime); 3681 temp *= cputime.utime;
3398 do_div(temp, total); 3682 do_div(temp, total);
3399 utime = (cputime_t)temp; 3683 utime = (cputime_t)temp;
3400 } else 3684 } else
@@ -3426,11 +3710,11 @@ void scheduler_tick(void)
3426 3710
3427 raw_spin_lock(&rq->lock); 3711 raw_spin_lock(&rq->lock);
3428 update_rq_clock(rq); 3712 update_rq_clock(rq);
3429 update_cpu_load(rq); 3713 update_cpu_load_active(rq);
3430 curr->sched_class->task_tick(rq, curr, 0); 3714 curr->sched_class->task_tick(rq, curr, 0);
3431 raw_spin_unlock(&rq->lock); 3715 raw_spin_unlock(&rq->lock);
3432 3716
3433 perf_event_task_tick(curr); 3717 perf_event_task_tick();
3434 3718
3435#ifdef CONFIG_SMP 3719#ifdef CONFIG_SMP
3436 rq->idle_at_tick = idle_cpu(cpu); 3720 rq->idle_at_tick = idle_cpu(cpu);
@@ -3569,17 +3853,13 @@ pick_next_task(struct rq *rq)
3569 return p; 3853 return p;
3570 } 3854 }
3571 3855
3572 class = sched_class_highest; 3856 for_each_class(class) {
3573 for ( ; ; ) {
3574 p = class->pick_next_task(rq); 3857 p = class->pick_next_task(rq);
3575 if (p) 3858 if (p)
3576 return p; 3859 return p;
3577 /*
3578 * Will never be NULL as the idle class always
3579 * returns a non-NULL p:
3580 */
3581 class = class->next;
3582 } 3860 }
3861
3862 BUG(); /* the idle class will always have a runnable task */
3583} 3863}
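The rewritten walk relies on the scheduling classes forming a priority-ordered, NULL-terminated list that is visited highest first, with the idle class guaranteed to supply a task, so the BUG() is only a backstop. A self-contained sketch of that shape; the class names and the for_each_class definition below are illustrative stand-ins, not copied from the kernel:

#include <assert.h>
#include <stdio.h>

struct task { const char *comm; };

struct sched_class {
	const struct sched_class *next;
	struct task *(*pick_next_task)(void);
};

static struct task idle_task = { "swapper" };

static struct task *pick_rt(void)   { return NULL; }		/* no RT tasks queued */
static struct task *pick_fair(void) { return NULL; }		/* no CFS tasks queued */
static struct task *pick_idle(void) { return &idle_task; }	/* always runnable */

static const struct sched_class idle_class = { NULL, pick_idle };
static const struct sched_class fair_class = { &idle_class, pick_fair };
static const struct sched_class rt_class   = { &fair_class, pick_rt };

/* walk the classes highest priority first, like the kernel's for_each_class() */
#define for_each_class(class) \
	for (class = &rt_class; class; class = class->next)

int main(void)
{
	const struct sched_class *class;
	struct task *p;

	for_each_class(class) {
		p = class->pick_next_task();
		if (p) {
			printf("picked %s\n", p->comm);
			return 0;
		}
	}
	assert(0 && "the idle class must always have a runnable task");
	return 1;
}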
3584 3864
3585/* 3865/*
@@ -3598,7 +3878,6 @@ need_resched:
3598 rq = cpu_rq(cpu); 3878 rq = cpu_rq(cpu);
3599 rcu_note_context_switch(cpu); 3879 rcu_note_context_switch(cpu);
3600 prev = rq->curr; 3880 prev = rq->curr;
3601 switch_count = &prev->nivcsw;
3602 3881
3603 release_kernel_lock(prev); 3882 release_kernel_lock(prev);
3604need_resched_nonpreemptible: 3883need_resched_nonpreemptible:
@@ -3611,11 +3890,26 @@ need_resched_nonpreemptible:
3611 raw_spin_lock_irq(&rq->lock); 3890 raw_spin_lock_irq(&rq->lock);
3612 clear_tsk_need_resched(prev); 3891 clear_tsk_need_resched(prev);
3613 3892
3893 switch_count = &prev->nivcsw;
3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3894 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3615 if (unlikely(signal_pending_state(prev->state, prev))) 3895 if (unlikely(signal_pending_state(prev->state, prev))) {
3616 prev->state = TASK_RUNNING; 3896 prev->state = TASK_RUNNING;
3617 else 3897 } else {
3898 /*
3899 * If a worker is going to sleep, notify and
3900 * ask workqueue whether it wants to wake up a
3901 * task to maintain concurrency. If so, wake
3902 * up the task.
3903 */
3904 if (prev->flags & PF_WQ_WORKER) {
3905 struct task_struct *to_wakeup;
3906
3907 to_wakeup = wq_worker_sleeping(prev, cpu);
3908 if (to_wakeup)
3909 try_to_wake_up_local(to_wakeup);
3910 }
3618 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3911 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3912 }
3619 switch_count = &prev->nvcsw; 3913 switch_count = &prev->nvcsw;
3620 } 3914 }
3621 3915
@@ -3637,8 +3931,10 @@ need_resched_nonpreemptible:
3637 3931
3638 context_switch(rq, prev, next); /* unlocks the rq */ 3932 context_switch(rq, prev, next); /* unlocks the rq */
3639 /* 3933 /*
3640 * the context switch might have flipped the stack from under 3934 * The context switch has flipped the stack from under us
3641 * us, hence refresh the local variables. 3935 * and restored the local variables which were saved when
3936 * this task called schedule() in the past. prev == current
3937 * is still correct, but it can be moved to another cpu/rq.
3642 */ 3938 */
3643 cpu = smp_processor_id(); 3939 cpu = smp_processor_id();
3644 rq = cpu_rq(cpu); 3940 rq = cpu_rq(cpu);
@@ -3647,11 +3943,8 @@ need_resched_nonpreemptible:
3647 3943
3648 post_schedule(rq); 3944 post_schedule(rq);
3649 3945
3650 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3946 if (unlikely(reacquire_kernel_lock(prev)))
3651 prev = rq->curr;
3652 switch_count = &prev->nivcsw;
3653 goto need_resched_nonpreemptible; 3947 goto need_resched_nonpreemptible;
3654 }
3655 3948
3656 preempt_enable_no_resched(); 3949 preempt_enable_no_resched();
3657 if (need_resched()) 3950 if (need_resched())
@@ -3704,8 +3997,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3704 /* 3997 /*
3705 * Owner changed, break to re-assess state. 3998 * Owner changed, break to re-assess state.
3706 */ 3999 */
3707 if (lock->owner != owner) 4000 if (lock->owner != owner) {
4001 /*
4002 * If the lock has switched to a different owner,
4003 * we likely have heavy contention. Return 0 to quit
4004 * optimistic spinning and not contend further:
4005 */
4006 if (lock->owner)
4007 return 0;
3708 break; 4008 break;
4009 }
3709 4010
3710 /* 4011 /*
3711 * Is that owner really running on that cpu? 4012 * Is that owner really running on that cpu?
@@ -3726,7 +4027,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3726 * off of preempt_enable. Kernel preemptions off return from interrupt 4027 * off of preempt_enable. Kernel preemptions off return from interrupt
3727 * occur there and call schedule directly. 4028 * occur there and call schedule directly.
3728 */ 4029 */
3729asmlinkage void __sched preempt_schedule(void) 4030asmlinkage void __sched notrace preempt_schedule(void)
3730{ 4031{
3731 struct thread_info *ti = current_thread_info(); 4032 struct thread_info *ti = current_thread_info();
3732 4033
@@ -3738,9 +4039,9 @@ asmlinkage void __sched preempt_schedule(void)
3738 return; 4039 return;
3739 4040
3740 do { 4041 do {
3741 add_preempt_count(PREEMPT_ACTIVE); 4042 add_preempt_count_notrace(PREEMPT_ACTIVE);
3742 schedule(); 4043 schedule();
3743 sub_preempt_count(PREEMPT_ACTIVE); 4044 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3744 4045
3745 /* 4046 /*
3746 * Check again in case we missed a preemption opportunity 4047 * Check again in case we missed a preemption opportunity
@@ -4183,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4183 4484
4184 rq = task_rq_lock(p, &flags); 4485 rq = task_rq_lock(p, &flags);
4185 4486
4487 trace_sched_pi_setprio(p, prio);
4186 oldprio = p->prio; 4488 oldprio = p->prio;
4187 prev_class = p->sched_class; 4489 prev_class = p->sched_class;
4188 on_rq = p->se.on_rq; 4490 on_rq = p->se.on_rq;
@@ -4441,12 +4743,8 @@ recheck:
4441 */ 4743 */
4442 if (user && !capable(CAP_SYS_NICE)) { 4744 if (user && !capable(CAP_SYS_NICE)) {
4443 if (rt_policy(policy)) { 4745 if (rt_policy(policy)) {
4444 unsigned long rlim_rtprio; 4746 unsigned long rlim_rtprio =
4445 4747 task_rlimit(p, RLIMIT_RTPRIO);
4446 if (!lock_task_sighand(p, &flags))
4447 return -ESRCH;
4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4449 unlock_task_sighand(p, &flags);
4450 4748
4451 /* can't set/change the rt policy */ 4749 /* can't set/change the rt policy */
4452 if (policy != p->policy && !rlim_rtprio) 4750 if (policy != p->policy && !rlim_rtprio)
@@ -4474,7 +4772,7 @@ recheck:
4474 } 4772 }
4475 4773
4476 if (user) { 4774 if (user) {
4477 retval = security_task_setscheduler(p, policy, param); 4775 retval = security_task_setscheduler(p);
4478 if (retval) 4776 if (retval)
4479 return retval; 4777 return retval;
4480 } 4778 }
@@ -4490,6 +4788,15 @@ recheck:
4490 */ 4788 */
4491 rq = __task_rq_lock(p); 4789 rq = __task_rq_lock(p);
4492 4790
4791 /*
4792 * Changing the policy of the stop threads is a very bad idea
4793 */
4794 if (p == rq->stop) {
4795 __task_rq_unlock(rq);
4796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4797 return -EINVAL;
4798 }
4799
4493#ifdef CONFIG_RT_GROUP_SCHED 4800#ifdef CONFIG_RT_GROUP_SCHED
4494 if (user) { 4801 if (user) {
4495 /* 4802 /*
@@ -4716,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4716 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5023 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4717 goto out_unlock; 5024 goto out_unlock;
4718 5025
4719 retval = security_task_setscheduler(p, 0, NULL); 5026 retval = security_task_setscheduler(p);
4720 if (retval) 5027 if (retval)
4721 goto out_unlock; 5028 goto out_unlock;
4722 5029
4723 cpuset_cpus_allowed(p, cpus_allowed); 5030 cpuset_cpus_allowed(p, cpus_allowed);
4724 cpumask_and(new_mask, in_mask, cpus_allowed); 5031 cpumask_and(new_mask, in_mask, cpus_allowed);
4725 again: 5032again:
4726 retval = set_cpus_allowed_ptr(p, new_mask); 5033 retval = set_cpus_allowed_ptr(p, new_mask);
4727 5034
4728 if (!retval) { 5035 if (!retval) {
@@ -5166,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5166 idle->se.exec_start = sched_clock(); 5473 idle->se.exec_start = sched_clock();
5167 5474
5168 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5475 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5476 /*
5477 * We're having a chicken and egg problem, even though we are
5478 * holding rq->lock, the cpu isn't yet set to this cpu so the
5479 * lockdep check in task_group() will fail.
5480 *
5481 * Similar case to sched_fork(). / Alternatively we could
5482 * use task_rq_lock() here and obtain the other rq->lock.
5483 *
5484 * Silence PROVE_RCU
5485 */
5486 rcu_read_lock();
5169 __set_task_cpu(idle, cpu); 5487 __set_task_cpu(idle, cpu);
5488 rcu_read_unlock();
5170 5489
5171 rq->curr = rq->idle = idle; 5490 rq->curr = rq->idle = idle;
5172#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5491#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -5816,20 +6135,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5816 */ 6135 */
5817static struct notifier_block __cpuinitdata migration_notifier = { 6136static struct notifier_block __cpuinitdata migration_notifier = {
5818 .notifier_call = migration_call, 6137 .notifier_call = migration_call,
5819 .priority = 10 6138 .priority = CPU_PRI_MIGRATION,
5820}; 6139};
5821 6140
6141static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6142 unsigned long action, void *hcpu)
6143{
6144 switch (action & ~CPU_TASKS_FROZEN) {
6145 case CPU_ONLINE:
6146 case CPU_DOWN_FAILED:
6147 set_cpu_active((long)hcpu, true);
6148 return NOTIFY_OK;
6149 default:
6150 return NOTIFY_DONE;
6151 }
6152}
6153
6154static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6155 unsigned long action, void *hcpu)
6156{
6157 switch (action & ~CPU_TASKS_FROZEN) {
6158 case CPU_DOWN_PREPARE:
6159 set_cpu_active((long)hcpu, false);
6160 return NOTIFY_OK;
6161 default:
6162 return NOTIFY_DONE;
6163 }
6164}
6165
5822static int __init migration_init(void) 6166static int __init migration_init(void)
5823{ 6167{
5824 void *cpu = (void *)(long)smp_processor_id(); 6168 void *cpu = (void *)(long)smp_processor_id();
5825 int err; 6169 int err;
5826 6170
5827 /* Start one for the boot CPU: */ 6171 /* Initialize migration for the boot CPU */
5828 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6172 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5829 BUG_ON(err == NOTIFY_BAD); 6173 BUG_ON(err == NOTIFY_BAD);
5830 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6174 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5831 register_cpu_notifier(&migration_notifier); 6175 register_cpu_notifier(&migration_notifier);
5832 6176
6177 /* Register cpu active notifiers */
6178 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6179 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6180
5833 return 0; 6181 return 0;
5834} 6182}
5835early_initcall(migration_init); 6183early_initcall(migration_init);
@@ -6064,23 +6412,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6064 free_rootdomain(old_rd); 6412 free_rootdomain(old_rd);
6065} 6413}
6066 6414
6067static int init_rootdomain(struct root_domain *rd, bool bootmem) 6415static int init_rootdomain(struct root_domain *rd)
6068{ 6416{
6069 gfp_t gfp = GFP_KERNEL;
6070
6071 memset(rd, 0, sizeof(*rd)); 6417 memset(rd, 0, sizeof(*rd));
6072 6418
6073 if (bootmem) 6419 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6074 gfp = GFP_NOWAIT;
6075
6076 if (!alloc_cpumask_var(&rd->span, gfp))
6077 goto out; 6420 goto out;
6078 if (!alloc_cpumask_var(&rd->online, gfp)) 6421 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6079 goto free_span; 6422 goto free_span;
6080 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6423 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6081 goto free_online; 6424 goto free_online;
6082 6425
6083 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6426 if (cpupri_init(&rd->cpupri) != 0)
6084 goto free_rto_mask; 6427 goto free_rto_mask;
6085 return 0; 6428 return 0;
6086 6429
@@ -6096,7 +6439,7 @@ out:
6096 6439
6097static void init_defrootdomain(void) 6440static void init_defrootdomain(void)
6098{ 6441{
6099 init_rootdomain(&def_root_domain, true); 6442 init_rootdomain(&def_root_domain);
6100 6443
6101 atomic_set(&def_root_domain.refcount, 1); 6444 atomic_set(&def_root_domain.refcount, 1);
6102} 6445}
@@ -6109,7 +6452,7 @@ static struct root_domain *alloc_rootdomain(void)
6109 if (!rd) 6452 if (!rd)
6110 return NULL; 6453 return NULL;
6111 6454
6112 if (init_rootdomain(rd, false) != 0) { 6455 if (init_rootdomain(rd) != 0) {
6113 kfree(rd); 6456 kfree(rd);
6114 return NULL; 6457 return NULL;
6115 } 6458 }
@@ -6319,6 +6662,7 @@ struct s_data {
6319 cpumask_var_t nodemask; 6662 cpumask_var_t nodemask;
6320 cpumask_var_t this_sibling_map; 6663 cpumask_var_t this_sibling_map;
6321 cpumask_var_t this_core_map; 6664 cpumask_var_t this_core_map;
6665 cpumask_var_t this_book_map;
6322 cpumask_var_t send_covered; 6666 cpumask_var_t send_covered;
6323 cpumask_var_t tmpmask; 6667 cpumask_var_t tmpmask;
6324 struct sched_group **sched_group_nodes; 6668 struct sched_group **sched_group_nodes;
@@ -6330,6 +6674,7 @@ enum s_alloc {
6330 sa_rootdomain, 6674 sa_rootdomain,
6331 sa_tmpmask, 6675 sa_tmpmask,
6332 sa_send_covered, 6676 sa_send_covered,
6677 sa_this_book_map,
6333 sa_this_core_map, 6678 sa_this_core_map,
6334 sa_this_sibling_map, 6679 sa_this_sibling_map,
6335 sa_nodemask, 6680 sa_nodemask,
@@ -6365,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6365#ifdef CONFIG_SCHED_MC 6710#ifdef CONFIG_SCHED_MC
6366static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6711static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6367static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6712static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6368#endif /* CONFIG_SCHED_MC */
6369 6713
6370#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6371static int 6714static int
6372cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6715cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6373 struct sched_group **sg, struct cpumask *mask) 6716 struct sched_group **sg, struct cpumask *mask)
6374{ 6717{
6375 int group; 6718 int group;
6376 6719#ifdef CONFIG_SCHED_SMT
6377 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6720 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6378 group = cpumask_first(mask); 6721 group = cpumask_first(mask);
6722#else
6723 group = cpu;
6724#endif
6379 if (sg) 6725 if (sg)
6380 *sg = &per_cpu(sched_group_core, group).sg; 6726 *sg = &per_cpu(sched_group_core, group).sg;
6381 return group; 6727 return group;
6382} 6728}
6383#elif defined(CONFIG_SCHED_MC) 6729#endif /* CONFIG_SCHED_MC */
6730
6731/*
6732 * book sched-domains:
6733 */
6734#ifdef CONFIG_SCHED_BOOK
6735static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6736static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6737
6384static int 6738static int
6385cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6739cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6386 struct sched_group **sg, struct cpumask *unused) 6740 struct sched_group **sg, struct cpumask *mask)
6387{ 6741{
6742 int group = cpu;
6743#ifdef CONFIG_SCHED_MC
6744 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6745 group = cpumask_first(mask);
6746#elif defined(CONFIG_SCHED_SMT)
6747 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6748 group = cpumask_first(mask);
6749#endif
6388 if (sg) 6750 if (sg)
6389 *sg = &per_cpu(sched_group_core, cpu).sg; 6751 *sg = &per_cpu(sched_group_book, group).sg;
6390 return cpu; 6752 return group;
6391} 6753}
6392#endif 6754#endif /* CONFIG_SCHED_BOOK */
6393 6755
6394static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6756static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6395static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6757static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6399,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6399 struct sched_group **sg, struct cpumask *mask) 6761 struct sched_group **sg, struct cpumask *mask)
6400{ 6762{
6401 int group; 6763 int group;
6402#ifdef CONFIG_SCHED_MC 6764#ifdef CONFIG_SCHED_BOOK
6765 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6766 group = cpumask_first(mask);
6767#elif defined(CONFIG_SCHED_MC)
6403 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6768 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6404 group = cpumask_first(mask); 6769 group = cpumask_first(mask);
6405#elif defined(CONFIG_SCHED_SMT) 6770#elif defined(CONFIG_SCHED_SMT)
@@ -6660,6 +7025,9 @@ SD_INIT_FUNC(CPU)
6660#ifdef CONFIG_SCHED_MC 7025#ifdef CONFIG_SCHED_MC
6661 SD_INIT_FUNC(MC) 7026 SD_INIT_FUNC(MC)
6662#endif 7027#endif
7028#ifdef CONFIG_SCHED_BOOK
7029 SD_INIT_FUNC(BOOK)
7030#endif
6663 7031
6664static int default_relax_domain_level = -1; 7032static int default_relax_domain_level = -1;
6665 7033
@@ -6709,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6709 free_cpumask_var(d->tmpmask); /* fall through */ 7077 free_cpumask_var(d->tmpmask); /* fall through */
6710 case sa_send_covered: 7078 case sa_send_covered:
6711 free_cpumask_var(d->send_covered); /* fall through */ 7079 free_cpumask_var(d->send_covered); /* fall through */
7080 case sa_this_book_map:
7081 free_cpumask_var(d->this_book_map); /* fall through */
6712 case sa_this_core_map: 7082 case sa_this_core_map:
6713 free_cpumask_var(d->this_core_map); /* fall through */ 7083 free_cpumask_var(d->this_core_map); /* fall through */
6714 case sa_this_sibling_map: 7084 case sa_this_sibling_map:
@@ -6755,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6755 return sa_nodemask; 7125 return sa_nodemask;
6756 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7126 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6757 return sa_this_sibling_map; 7127 return sa_this_sibling_map;
6758 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7128 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6759 return sa_this_core_map; 7129 return sa_this_core_map;
7130 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7131 return sa_this_book_map;
6760 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7132 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6761 return sa_send_covered; 7133 return sa_send_covered;
6762 d->rd = alloc_rootdomain(); 7134 d->rd = alloc_rootdomain();
@@ -6814,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
6814 return sd; 7186 return sd;
6815} 7187}
6816 7188
7189static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7190 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7191 struct sched_domain *parent, int i)
7192{
7193 struct sched_domain *sd = parent;
7194#ifdef CONFIG_SCHED_BOOK
7195 sd = &per_cpu(book_domains, i).sd;
7196 SD_INIT(sd, BOOK);
7197 set_domain_attribute(sd, attr);
7198 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7199 sd->parent = parent;
7200 parent->child = sd;
7201 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7202#endif
7203 return sd;
7204}
7205
6817static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7206static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
6818 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7207 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6819 struct sched_domain *parent, int i) 7208 struct sched_domain *parent, int i)
@@ -6871,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
6871 d->send_covered, d->tmpmask); 7260 d->send_covered, d->tmpmask);
6872 break; 7261 break;
6873#endif 7262#endif
7263#ifdef CONFIG_SCHED_BOOK
7264 case SD_LV_BOOK: /* set up book groups */
7265 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7266 if (cpu == cpumask_first(d->this_book_map))
7267 init_sched_build_groups(d->this_book_map, cpu_map,
7268 &cpu_to_book_group,
7269 d->send_covered, d->tmpmask);
7270 break;
7271#endif
6874 case SD_LV_CPU: /* set up physical groups */ 7272 case SD_LV_CPU: /* set up physical groups */
6875 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7273 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
6876 if (!cpumask_empty(d->nodemask)) 7274 if (!cpumask_empty(d->nodemask))
@@ -6918,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
6918 7316
6919 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7317 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
6920 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7318 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7319 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
6921 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7320 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
6922 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7321 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
6923 } 7322 }
6924 7323
6925 for_each_cpu(i, cpu_map) { 7324 for_each_cpu(i, cpu_map) {
6926 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7325 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7326 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
6927 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7327 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
6928 } 7328 }
6929 7329
@@ -6954,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
6954 init_sched_groups_power(i, sd); 7354 init_sched_groups_power(i, sd);
6955 } 7355 }
6956#endif 7356#endif
7357#ifdef CONFIG_SCHED_BOOK
7358 for_each_cpu(i, cpu_map) {
7359 sd = &per_cpu(book_domains, i).sd;
7360 init_sched_groups_power(i, sd);
7361 }
7362#endif
6957 7363
6958 for_each_cpu(i, cpu_map) { 7364 for_each_cpu(i, cpu_map) {
6959 sd = &per_cpu(phys_domains, i).sd; 7365 sd = &per_cpu(phys_domains, i).sd;
@@ -6979,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
6979 sd = &per_cpu(cpu_domains, i).sd; 7385 sd = &per_cpu(cpu_domains, i).sd;
6980#elif defined(CONFIG_SCHED_MC) 7386#elif defined(CONFIG_SCHED_MC)
6981 sd = &per_cpu(core_domains, i).sd; 7387 sd = &per_cpu(core_domains, i).sd;
7388#elif defined(CONFIG_SCHED_BOOK)
7389 sd = &per_cpu(book_domains, i).sd;
6982#else 7390#else
6983 sd = &per_cpu(phys_domains, i).sd; 7391 sd = &per_cpu(phys_domains, i).sd;
6984#endif 7392#endif
@@ -7288,29 +7696,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7288} 7696}
7289#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7697#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7290 7698
7291#ifndef CONFIG_CPUSETS
7292/* 7699/*
7293 * Add online and remove offline CPUs from the scheduler domains. 7700 * Update cpusets according to cpu_active mask. If cpusets are
7294 * When cpusets are enabled they take over this function. 7701 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7702 * around partition_sched_domains().
7295 */ 7703 */
7296static int update_sched_domains(struct notifier_block *nfb, 7704static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7297 unsigned long action, void *hcpu) 7705 void *hcpu)
7298{ 7706{
7299 switch (action) { 7707 switch (action & ~CPU_TASKS_FROZEN) {
7300 case CPU_ONLINE: 7708 case CPU_ONLINE:
7301 case CPU_ONLINE_FROZEN:
7302 case CPU_DOWN_PREPARE:
7303 case CPU_DOWN_PREPARE_FROZEN:
7304 case CPU_DOWN_FAILED: 7709 case CPU_DOWN_FAILED:
7305 case CPU_DOWN_FAILED_FROZEN: 7710 cpuset_update_active_cpus();
7306 partition_sched_domains(1, NULL, NULL);
7307 return NOTIFY_OK; 7711 return NOTIFY_OK;
7712 default:
7713 return NOTIFY_DONE;
7714 }
7715}
7308 7716
7717static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7718 void *hcpu)
7719{
7720 switch (action & ~CPU_TASKS_FROZEN) {
7721 case CPU_DOWN_PREPARE:
7722 cpuset_update_active_cpus();
7723 return NOTIFY_OK;
7309 default: 7724 default:
7310 return NOTIFY_DONE; 7725 return NOTIFY_DONE;
7311 } 7726 }
7312} 7727}
7313#endif
7314 7728
7315static int update_runtime(struct notifier_block *nfb, 7729static int update_runtime(struct notifier_block *nfb,
7316 unsigned long action, void *hcpu) 7730 unsigned long action, void *hcpu)
@@ -7356,10 +7770,8 @@ void __init sched_init_smp(void)
7356 mutex_unlock(&sched_domains_mutex); 7770 mutex_unlock(&sched_domains_mutex);
7357 put_online_cpus(); 7771 put_online_cpus();
7358 7772
7359#ifndef CONFIG_CPUSETS 7773 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7360 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7774 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7361 hotcpu_notifier(update_sched_domains, 0);
7362#endif
7363 7775
7364 /* RT runtime code needs to handle some hotplug events */ 7776 /* RT runtime code needs to handle some hotplug events */
7365 hotcpu_notifier(update_runtime, 0); 7777 hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +8016,9 @@ void __init sched_init(void)
7604 8016
7605 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8017 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7606 rq->cpu_load[j] = 0; 8018 rq->cpu_load[j] = 0;
8019
8020 rq->last_load_update_tick = jiffies;
8021
7607#ifdef CONFIG_SMP 8022#ifdef CONFIG_SMP
7608 rq->sd = NULL; 8023 rq->sd = NULL;
7609 rq->rd = NULL; 8024 rq->rd = NULL;
@@ -7617,6 +8032,10 @@ void __init sched_init(void)
7617 rq->idle_stamp = 0; 8032 rq->idle_stamp = 0;
7618 rq->avg_idle = 2*sysctl_sched_migration_cost; 8033 rq->avg_idle = 2*sysctl_sched_migration_cost;
7619 rq_attach_root(rq, &def_root_domain); 8034 rq_attach_root(rq, &def_root_domain);
8035#ifdef CONFIG_NO_HZ
8036 rq->nohz_balance_kick = 0;
8037 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8038#endif
7620#endif 8039#endif
7621 init_rq_hrtick(rq); 8040 init_rq_hrtick(rq);
7622 atomic_set(&rq->nr_iowait, 0); 8041 atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +8080,11 @@ void __init sched_init(void)
7661 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8080 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7662#ifdef CONFIG_SMP 8081#ifdef CONFIG_SMP
7663#ifdef CONFIG_NO_HZ 8082#ifdef CONFIG_NO_HZ
7664 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 8083 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7665 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 8084 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8085 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8086 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8087 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7666#endif 8088#endif
7667 /* May be allocated at isolcpus cmdline parse time */ 8089 /* May be allocated at isolcpus cmdline parse time */
7668 if (cpu_isolated_map == NULL) 8090 if (cpu_isolated_map == NULL)
@@ -7869,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7869 8291
7870 return 1; 8292 return 1;
7871 8293
7872 err_free_rq: 8294err_free_rq:
7873 kfree(cfs_rq); 8295 kfree(cfs_rq);
7874 err: 8296err:
7875 return 0; 8297 return 0;
7876} 8298}
7877 8299
@@ -7959,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
7959 8381
7960 return 1; 8382 return 1;
7961 8383
7962 err_free_rq: 8384err_free_rq:
7963 kfree(rt_rq); 8385 kfree(rt_rq);
7964 err: 8386err:
7965 return 0; 8387 return 0;
7966} 8388}
7967 8389
@@ -8319,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8319 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8741 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8320 } 8742 }
8321 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8743 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8322 unlock: 8744unlock:
8323 read_unlock(&tasklist_lock); 8745 read_unlock(&tasklist_lock);
8324 mutex_unlock(&rt_constraints_mutex); 8746 mutex_unlock(&rt_constraints_mutex);
8325 8747
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 906a0f718cb3..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
10 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 *
14 * - gtod 14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
42 * - GTOD (clock monotonic)
15 * - sched_clock() 43 * - sched_clock()
16 * - explicit idle events 44 * - explicit idle events
17 * 45 *
18 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
19 * making it monotonic and keeping it within an expected window. 47 * deltas are filtered to provide monotonicity and to keep it within an
48 * expected window.
20 * 49 *
21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
22 * that is otherwise invisible (TSC gets stopped). 51 * that is otherwise invisible (TSC gets stopped).
23 * 52 *
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 53 *
25 * consistent between cpus (never more than 2 jiffies difference). 54 * Notes:
55 *
56 * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
26 */ 62 */
27#include <linux/spinlock.h> 63#include <linux/spinlock.h>
28#include <linux/hardirq.h> 64#include <linux/hardirq.h>
@@ -170,6 +206,11 @@ again:
170 return val; 206 return val;
171} 207}
172 208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
173u64 sched_clock_cpu(int cpu) 214u64 sched_clock_cpu(int cpu)
174{ 215{
175 struct sched_clock_data *scd; 216 struct sched_clock_data *scd;
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
237} 278}
238EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
239 280
240unsigned long long cpu_clock(int cpu) 281/*
282 * As outlined at the top, provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
241{ 292{
242 unsigned long long clock; 293 u64 clock;
243 unsigned long flags; 294 unsigned long flags;
244 295
245 local_irq_save(flags); 296 local_irq_save(flags);
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
249 return clock; 300 return clock;
250} 301}
251 302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
305 * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
321
252#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
253 323
254void sched_clock_init(void) 324void sched_clock_init(void)
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
264 return sched_clock(); 334 return sched_clock();
265} 335}
266 336
267 337u64 cpu_clock(int cpu)
268unsigned long long cpu_clock(int cpu)
269{ 338{
270 return sched_clock_cpu(cpu); 339 return sched_clock_cpu(cpu);
271} 340}
272 341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
273#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
274 348
275EXPORT_SYMBOL_GPL(cpu_clock); 349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
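A usage sketch for the interfaces documented above, in kernel context and assuming the local_clock()/cpu_clock() declarations exported through <linux/sched.h> by this series: deltas taken on one CPU are safe to subtract, while cross-CPU comparisons can go backwards, as the warning says.

#include <linux/kernel.h>
#include <linux/sched.h>

static void time_section(void)
{
	u64 t0, t1;

	t0 = local_clock();		/* nanoseconds, usable from any context */
	/* ... the work being measured ... */
	t1 = local_clock();

	pr_info("section took %llu ns\n", (unsigned long long)(t1 - t0));
}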
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
166 * 166 *
167 * Returns: -ENOMEM if memory fails. 167 * Returns: -ENOMEM if memory fails.
168 */ 168 */
169int cpupri_init(struct cpupri *cp, bool bootmem) 169int cpupri_init(struct cpupri *cp)
170{ 170{
171 gfp_t gfp = GFP_KERNEL;
172 int i; 171 int i;
173 172
174 if (bootmem)
175 gfp = GFP_NOWAIT;
176
177 memset(cp, 0, sizeof(*cp)); 173 memset(cp, 0, sizeof(*cp));
178 174
179 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
181 177
182 raw_spin_lock_init(&vec->lock); 178 raw_spin_lock_init(&vec->lock);
183 vec->count = 0; 179 vec->count = 0;
184 if (!zalloc_cpumask_var(&vec->mask, gfp)) 180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
185 goto cleanup; 181 goto cleanup;
186 } 182 }
187 183
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, struct cpumask *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
32#else 32#else
33#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 35565395d00d..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
332 PN(sysctl_sched_latency); 332 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 333 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 334 PN(sysctl_sched_wakeup_granularity);
335 PN(sysctl_sched_child_runs_first); 335 P(sysctl_sched_child_runs_first);
336 P(sysctl_sched_features); 336 P(sysctl_sched_features);
337#undef PN 337#undef PN
338#undef P 338#undef P
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..933f3d1b62ea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
25 25
26/* 26/*
27 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 28 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 29 *
30 * NOTE: this latency value is not the same as the concept of 30 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 31 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 2000000ULL; 57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 3; 63static unsigned int sched_nr_latency = 8;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
519static void update_curr(struct cfs_rq *cfs_rq) 519static void update_curr(struct cfs_rq *cfs_rq)
520{ 520{
521 struct sched_entity *curr = cfs_rq->curr; 521 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 522 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 523 unsigned long delta_exec;
524 524
525 if (unlikely(!curr)) 525 if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 602 /*
603 * We are starting a new run period: 603 * We are starting a new run period:
604 */ 604 */
605 se->exec_start = rq_of(cfs_rq)->clock; 605 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 606}
607 607
608/************************************************** 608/**************************************************
@@ -1313,7 +1313,7 @@ static struct sched_group *
1313find_idlest_group(struct sched_domain *sd, struct task_struct *p, 1313find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1314 int this_cpu, int load_idx) 1314 int this_cpu, int load_idx)
1315{ 1315{
1316 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1316 struct sched_group *idlest = NULL, *group = sd->groups;
1317 unsigned long min_load = ULONG_MAX, this_load = 0; 1317 unsigned long min_load = ULONG_MAX, this_load = 0;
1318 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1318 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1319 1319
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1348 1348
1349 if (local_group) { 1349 if (local_group) {
1350 this_load = avg_load; 1350 this_load = avg_load;
1351 this = group;
1352 } else if (avg_load < min_load) { 1351 } else if (avg_load < min_load) {
1353 min_load = avg_load; 1352 min_load = avg_load;
1354 idlest = group; 1353 idlest = group;
@@ -1765,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1765 set_task_cpu(p, this_cpu); 1764 set_task_cpu(p, this_cpu);
1766 activate_task(this_rq, p, 0); 1765 activate_task(this_rq, p, 0);
1767 check_preempt_curr(this_rq, p, 0); 1766 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1768} 1771}
1769 1772
1770/* 1773/*
@@ -1799,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1799 * 2) too many balance attempts have failed. 1802 * 2) too many balance attempts have failed.
1800 */ 1803 */
1801 1804
1802 tsk_cache_hot = task_hot(p, rq->clock, sd); 1805 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1803 if (!tsk_cache_hot || 1806 if (!tsk_cache_hot ||
1804 sd->nr_balance_failed > sd->cache_nice_tries) { 1807 sd->nr_balance_failed > sd->cache_nice_tries) {
1805#ifdef CONFIG_SCHEDSTATS 1808#ifdef CONFIG_SCHEDSTATS
@@ -2031,12 +2034,14 @@ struct sd_lb_stats {
2031 unsigned long this_load; 2034 unsigned long this_load;
2032 unsigned long this_load_per_task; 2035 unsigned long this_load_per_task;
2033 unsigned long this_nr_running; 2036 unsigned long this_nr_running;
2037 unsigned long this_has_capacity;
2034 2038
2035 /* Statistics of the busiest group */ 2039 /* Statistics of the busiest group */
2036 unsigned long max_load; 2040 unsigned long max_load;
2037 unsigned long busiest_load_per_task; 2041 unsigned long busiest_load_per_task;
2038 unsigned long busiest_nr_running; 2042 unsigned long busiest_nr_running;
2039 unsigned long busiest_group_capacity; 2043 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity;
2040 2045
2041 int group_imb; /* Is there imbalance in this sd */ 2046 int group_imb; /* Is there imbalance in this sd */
2042#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2059,6 +2064,7 @@ struct sg_lb_stats {
2059 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2060 unsigned long group_capacity; 2065 unsigned long group_capacity;
2061 int group_imb; /* Is there an imbalance in the group ? */ 2066 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */
2062}; 2068};
2063 2069
2064/** 2070/**
@@ -2268,10 +2274,14 @@ unsigned long scale_rt_power(int cpu)
2268 struct rq *rq = cpu_rq(cpu); 2274 struct rq *rq = cpu_rq(cpu);
2269 u64 total, available; 2275 u64 total, available;
2270 2276
2271 sched_avg_update(rq);
2272
2273 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2277 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2274 available = total - rq->rt_avg; 2278
2279 if (unlikely(total < rq->rt_avg)) {
2280 /* Ensures that power won't end up being negative */
2281 available = 0;
2282 } else {
2283 available = total - rq->rt_avg;
2284 }
2275 2285
2276 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2286 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2277 total = SCHED_LOAD_SCALE; 2287 total = SCHED_LOAD_SCALE;
@@ -2287,13 +2297,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2287 unsigned long power = SCHED_LOAD_SCALE; 2297 unsigned long power = SCHED_LOAD_SCALE;
2288 struct sched_group *sdg = sd->groups; 2298 struct sched_group *sdg = sd->groups;
2289 2299
2290 if (sched_feat(ARCH_POWER))
2291 power *= arch_scale_freq_power(sd, cpu);
2292 else
2293 power *= default_scale_freq_power(sd, cpu);
2294
2295 power >>= SCHED_LOAD_SHIFT;
2296
2297 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2300 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2298 if (sched_feat(ARCH_POWER)) 2301 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_smt_power(sd, cpu); 2302 power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2306,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2303 power >>= SCHED_LOAD_SHIFT; 2306 power >>= SCHED_LOAD_SHIFT;
2304 } 2307 }
2305 2308
2309 sdg->cpu_power_orig = power;
2310
2311 if (sched_feat(ARCH_POWER))
2312 power *= arch_scale_freq_power(sd, cpu);
2313 else
2314 power *= default_scale_freq_power(sd, cpu);
2315
2316 power >>= SCHED_LOAD_SHIFT;
2317
2306 power *= scale_rt_power(cpu); 2318 power *= scale_rt_power(cpu);
2307 power >>= SCHED_LOAD_SHIFT; 2319 power >>= SCHED_LOAD_SHIFT;
2308 2320
@@ -2335,6 +2347,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2335 sdg->cpu_power = power; 2347 sdg->cpu_power = power;
2336} 2348}
2337 2349
2350/*
2351 * Try and fix up capacity for tiny siblings; this is needed when
2352 * things like SD_ASYM_PACKING need f_b_g to select another sibling
2353 * which on its own isn't powerful enough.
2354 *
2355 * See update_sd_pick_busiest() and check_asym_packing().
2356 */
2357static inline int
2358fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2359{
2360 /*
2361 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2362 */
2363 if (sd->level != SD_LV_SIBLING)
2364 return 0;
2365
2366 /*
2367 * If ~90% of the cpu_power is still there, we're good.
2368 */
2369 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2370 return 1;
2371
2372 return 0;
2373}
2374
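The 32/29 comparison above is a fixed-point way of asking whether at least 29/32 (about 90.6%) of the group's original cpu_power survived scaling. A tiny standalone check with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long orig = 1024;	/* cpu_power_orig of the sibling group */
	unsigned long now  = 930;	/* cpu_power left after rt/freq scaling */

	/* 29/32 == 0.90625, so this only passes above ~90.6% of orig */
	if (now * 32 > orig * 29)
		printf("capacity fixed up to 1\n");
	else
		printf("capacity stays 0\n");
	return 0;
}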
2338/** 2375/**
2339 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 2376 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2340 * @sd: The sched_domain whose statistics are to be updated. 2377 * @sd: The sched_domain whose statistics are to be updated.
@@ -2354,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2354 int local_group, const struct cpumask *cpus, 2391 int local_group, const struct cpumask *cpus,
2355 int *balance, struct sg_lb_stats *sgs) 2392 int *balance, struct sg_lb_stats *sgs)
2356{ 2393{
2357 unsigned long load, max_cpu_load, min_cpu_load; 2394 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2358 int i; 2395 int i;
2359 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2396 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2360 unsigned long avg_load_per_task = 0; 2397 unsigned long avg_load_per_task = 0;
@@ -2365,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2365 /* Tally up the load of all CPUs in the group */ 2402 /* Tally up the load of all CPUs in the group */
2366 max_cpu_load = 0; 2403 max_cpu_load = 0;
2367 min_cpu_load = ~0UL; 2404 min_cpu_load = ~0UL;
2405 max_nr_running = 0;
2368 2406
2369 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2407 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2370 struct rq *rq = cpu_rq(i); 2408 struct rq *rq = cpu_rq(i);
@@ -2382,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2382 load = target_load(i, load_idx); 2420 load = target_load(i, load_idx);
2383 } else { 2421 } else {
2384 load = source_load(i, load_idx); 2422 load = source_load(i, load_idx);
2385 if (load > max_cpu_load) 2423 if (load > max_cpu_load) {
2386 max_cpu_load = load; 2424 max_cpu_load = load;
2425 max_nr_running = rq->nr_running;
2426 }
2387 if (min_cpu_load > load) 2427 if (min_cpu_load > load)
2388 min_cpu_load = load; 2428 min_cpu_load = load;
2389 } 2429 }
@@ -2400,14 +2440,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2400 * domains. In the newly idle case, we will allow all the cpu's 2440 * domains. In the newly idle case, we will allow all the cpu's
2401 * to do the newly idle load balance. 2441 * to do the newly idle load balance.
2402 */ 2442 */
2403 if (idle != CPU_NEWLY_IDLE && local_group && 2443 if (idle != CPU_NEWLY_IDLE && local_group) {
2404 balance_cpu != this_cpu) { 2444 if (balance_cpu != this_cpu) {
2405 *balance = 0; 2445 *balance = 0;
2406 return; 2446 return;
2447 }
2448 update_group_power(sd, this_cpu);
2407 } 2449 }
2408 2450
2409 update_group_power(sd, this_cpu);
2410
2411 /* Adjust by relative CPU power of the group */ 2451 /* Adjust by relative CPU power of the group */
2412 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2452 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2413 2453
@@ -2423,11 +2463,58 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2423 if (sgs->sum_nr_running) 2463 if (sgs->sum_nr_running)
2424 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2464 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2425 2465
2426 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2466 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
2427 sgs->group_imb = 1; 2467 sgs->group_imb = 1;
2428 2468
2429 sgs->group_capacity = 2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2430 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2470 if (!sgs->group_capacity)
2471 sgs->group_capacity = fix_small_capacity(sd, group);
2472
2473 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1;
2475}
2476
2477/**
2478 * update_sd_pick_busiest - return 1 on busiest group
2479 * @sd: sched_domain whose statistics are to be checked
2480 * @sds: sched_domain statistics
2481 * @sg: sched_group candidate to be checked for being the busiest
2482 * @sgs: sched_group statistics
2483 * @this_cpu: the current cpu
2484 *
2485 * Determine if @sg is a busier group than the previously selected
2486 * busiest group.
2487 */
2488static bool update_sd_pick_busiest(struct sched_domain *sd,
2489 struct sd_lb_stats *sds,
2490 struct sched_group *sg,
2491 struct sg_lb_stats *sgs,
2492 int this_cpu)
2493{
2494 if (sgs->avg_load <= sds->max_load)
2495 return false;
2496
2497 if (sgs->sum_nr_running > sgs->group_capacity)
2498 return true;
2499
2500 if (sgs->group_imb)
2501 return true;
2502
2503 /*
2504 * ASYM_PACKING needs to move all the work to the lowest
2505 * numbered CPUs in the group, therefore mark all groups
2506 * higher than ourself as busy.
2507 */
2508 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2509 this_cpu < group_first_cpu(sg)) {
2510 if (!sds->busiest)
2511 return true;
2512
2513 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2514 return true;
2515 }
2516
2517 return false;
2431} 2518}
2432 2519
2433/** 2520/**
@@ -2435,7 +2522,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2435 * @sd: sched_domain whose statistics are to be updated. 2522 * @sd: sched_domain whose statistics are to be updated.
2436 * @this_cpu: Cpu for which load balance is currently performed. 2523 * @this_cpu: Cpu for which load balance is currently performed.
2437 * @idle: Idle status of this_cpu 2524 * @idle: Idle status of this_cpu
2438 * @sd_idle: Idle status of the sched_domain containing group. 2525 * @sd_idle: Idle status of the sched_domain containing sg.
2439 * @cpus: Set of cpus considered for load balancing. 2526 * @cpus: Set of cpus considered for load balancing.
2440 * @balance: Should we balance. 2527 * @balance: Should we balance.
2441 * @sds: variable to hold the statistics for this sched_domain. 2528 * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2533,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2446 struct sd_lb_stats *sds) 2533 struct sd_lb_stats *sds)
2447{ 2534{
2448 struct sched_domain *child = sd->child; 2535 struct sched_domain *child = sd->child;
2449 struct sched_group *group = sd->groups; 2536 struct sched_group *sg = sd->groups;
2450 struct sg_lb_stats sgs; 2537 struct sg_lb_stats sgs;
2451 int load_idx, prefer_sibling = 0; 2538 int load_idx, prefer_sibling = 0;
2452 2539
@@ -2459,45 +2546,100 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2459 do { 2546 do {
2460 int local_group; 2547 int local_group;
2461 2548
2462 local_group = cpumask_test_cpu(this_cpu, 2549 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2463 sched_group_cpus(group));
2464 memset(&sgs, 0, sizeof(sgs)); 2550 memset(&sgs, 0, sizeof(sgs));
2465 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 2551 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2466 local_group, cpus, balance, &sgs); 2552 local_group, cpus, balance, &sgs);
2467 2553
2468 if (local_group && !(*balance)) 2554 if (local_group && !(*balance))
2469 return; 2555 return;
2470 2556
2471 sds->total_load += sgs.group_load; 2557 sds->total_load += sgs.group_load;
2472 sds->total_pwr += group->cpu_power; 2558 sds->total_pwr += sg->cpu_power;
2473 2559
2474 /* 2560 /*
2475 * In case the child domain prefers tasks go to siblings 2561 * In case the child domain prefers tasks go to siblings
2476 * first, lower the group capacity to one so that we'll try 2562 * first, lower the sg capacity to one so that we'll try
2477 * and move all the excess tasks away. 2563 * and move all the excess tasks away. We lower the capacity
2564 * of a group only if the local group has the capacity to fit
2565 * these excess tasks, i.e. nr_running < group_capacity. The
2566 * extra check prevents the case where you always pull from the
2567 * heaviest group when it is already under-utilized (possible
2568 * with a large weight task outweighs the tasks on the system).
2478 */ 2569 */
2479 if (prefer_sibling) 2570 if (prefer_sibling && !local_group && sds->this_has_capacity)
2480 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2571 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2481 2572
2482 if (local_group) { 2573 if (local_group) {
2483 sds->this_load = sgs.avg_load; 2574 sds->this_load = sgs.avg_load;
2484 sds->this = group; 2575 sds->this = sg;
2485 sds->this_nr_running = sgs.sum_nr_running; 2576 sds->this_nr_running = sgs.sum_nr_running;
2486 sds->this_load_per_task = sgs.sum_weighted_load; 2577 sds->this_load_per_task = sgs.sum_weighted_load;
2487 } else if (sgs.avg_load > sds->max_load && 2578 sds->this_has_capacity = sgs.group_has_capacity;
2488 (sgs.sum_nr_running > sgs.group_capacity || 2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2489 sgs.group_imb)) {
2490 sds->max_load = sgs.avg_load; 2580 sds->max_load = sgs.avg_load;
2491 sds->busiest = group; 2581 sds->busiest = sg;
2492 sds->busiest_nr_running = sgs.sum_nr_running; 2582 sds->busiest_nr_running = sgs.sum_nr_running;
2493 sds->busiest_group_capacity = sgs.group_capacity; 2583 sds->busiest_group_capacity = sgs.group_capacity;
2494 sds->busiest_load_per_task = sgs.sum_weighted_load; 2584 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity;
2495 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2496 } 2587 }
2497 2588
2498 update_sd_power_savings_stats(group, sds, local_group, &sgs); 2589 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2499 group = group->next; 2590 sg = sg->next;
2500 } while (group != sd->groups); 2591 } while (sg != sd->groups);
2592}
2593
2594int __weak arch_sd_sibling_asym_packing(void)
2595{
2596 return 0*SD_ASYM_PACKING;
2597}
2598
2599/**
2600 * check_asym_packing - Check to see if the group is packed into the
 2601 * sched domain.
2602 *
 2603 * This is primarily intended to be used at the sibling level. Some
2604 * cores like POWER7 prefer to use lower numbered SMT threads. In the
2605 * case of POWER7, it can move to lower SMT modes only when higher
2606 * threads are idle. When in lower SMT modes, the threads will
 2607 * perform better since they share fewer core resources. Hence when we
2608 * have idle threads, we want them to be the higher ones.
2609 *
2610 * This packing function is run on idle threads. It checks to see if
2611 * the busiest CPU in this domain (core in the P7 case) has a higher
 2612 * CPU number than the CPU the packing function is being run on. Here we are
 2613 * assuming a lower CPU number will be equivalent to a lower SMT thread
2614 * number.
2615 *
2616 * Returns 1 when packing is required and a task should be moved to
2617 * this CPU. The amount of the imbalance is returned in *imbalance.
2618 *
2619 * @sd: The sched_domain whose packing is to be checked.
2620 * @sds: Statistics of the sched_domain which is to be packed
2621 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 2622 * @imbalance: returns amount of imbalance due to packing.
2623 */
2624static int check_asym_packing(struct sched_domain *sd,
2625 struct sd_lb_stats *sds,
2626 int this_cpu, unsigned long *imbalance)
2627{
2628 int busiest_cpu;
2629
2630 if (!(sd->flags & SD_ASYM_PACKING))
2631 return 0;
2632
2633 if (!sds->busiest)
2634 return 0;
2635
2636 busiest_cpu = group_first_cpu(sds->busiest);
2637 if (this_cpu > busiest_cpu)
2638 return 0;
2639
2640 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2641 SCHED_LOAD_SCALE);
2642 return 1;
2501} 2643}
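
check_asym_packing() above only fires when this CPU is numbered lower than the first CPU of the busiest group, and it sizes the imbalance as max_load scaled by the group's cpu_power. Below is a minimal user-space sketch of just that arithmetic; SCHED_LOAD_SCALE is assumed to be 1024, the SD_ASYM_PACKING flag and NULL checks are omitted, and the struct and names are illustrative rather than the kernel's own.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

struct busiest_stats {
	unsigned long max_load;   /* avg load of the busiest group */
	unsigned long cpu_power;  /* capacity of the busiest group */
	int first_cpu;            /* lowest CPU number in that group */
};

/* Return 1 (and the imbalance) when a lower-numbered idle CPU should pull. */
static int asym_pack_check(int this_cpu, const struct busiest_stats *b,
			   unsigned long *imbalance)
{
	if (this_cpu > b->first_cpu)
		return 0;	/* only lower-numbered CPUs pack */

	*imbalance = DIV_ROUND_CLOSEST(b->max_load * b->cpu_power,
				       SCHED_LOAD_SCALE);
	return 1;
}

int main(void)
{
	struct busiest_stats b = { .max_load = 1536, .cpu_power = 1024,
				   .first_cpu = 3 };
	unsigned long imb;

	if (asym_pack_check(0, &b, &imb))
		printf("pack: move ~%lu load units to CPU0\n", imb);
	return 0;
}
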
2502 2644
2503/** 2645/**
@@ -2637,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2637 return fix_small_imbalance(sds, this_cpu, imbalance); 2779 return fix_small_imbalance(sds, this_cpu, imbalance);
2638 2780
2639} 2781}
2782
2640/******* find_busiest_group() helpers end here *********************/ 2783/******* find_busiest_group() helpers end here *********************/
2641 2784
2642/** 2785/**
@@ -2688,13 +2831,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 2688 * 4) This group is more busy than the avg busyness at this 2831 * 4) This group is more busy than the avg busyness at this
2689 * sched_domain. 2832 * sched_domain.
2690 * 5) The imbalance is within the specified limit. 2833 * 5) The imbalance is within the specified limit.
2834 *
2835 * Note: when doing newidle balance, if the local group has excess
2836 * capacity (i.e. nr_running < group_capacity) and the busiest group
2837 * does not have any capacity, we force a load balance to pull tasks
2838 * to the local group. In this case, we skip past checks 3, 4 and 5.
2691 */ 2839 */
2692 if (!(*balance)) 2840 if (!(*balance))
2693 goto ret; 2841 goto ret;
2694 2842
2843 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
2844 check_asym_packing(sd, &sds, this_cpu, imbalance))
2845 return sds.busiest;
2846
2695 if (!sds.busiest || sds.busiest_nr_running == 0) 2847 if (!sds.busiest || sds.busiest_nr_running == 0)
2696 goto out_balanced; 2848 goto out_balanced;
2697 2849
2850 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
2851 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
2852 !sds.busiest_has_capacity)
2853 goto force_balance;
2854
2698 if (sds.this_load >= sds.max_load) 2855 if (sds.this_load >= sds.max_load)
2699 goto out_balanced; 2856 goto out_balanced;
2700 2857
@@ -2706,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2706 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2707 goto out_balanced; 2864 goto out_balanced;
2708 2865
2866force_balance:
2709 /* Looks like there is an imbalance. Compute it */ 2867 /* Looks like there is an imbalance. Compute it */
2710 calculate_imbalance(&sds, this_cpu, imbalance); 2868 calculate_imbalance(&sds, this_cpu, imbalance);
2711 return sds.busiest; 2869 return sds.busiest;
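
The force_balance path above lets a newly-idle CPU pull even when the usual load thresholds would report "balanced", provided the local group still has capacity and the busiest group has none. A compact user-space model of that ordering follows; the *balance and asym-packing checks are left out, and the struct is only a stand-in for sd_lb_stats.

#include <stdio.h>

enum { CPU_NOT_IDLE, CPU_IDLE, CPU_NEWLY_IDLE };

struct lb_stats {
	unsigned long this_load, max_load;
	unsigned int busiest_nr_running;
	int this_has_capacity, busiest_has_capacity, have_busiest;
};

/* Return 1 if we should pull from the busiest group. */
static int should_balance(const struct lb_stats *s, int idle,
			  unsigned int imbalance_pct)
{
	if (!s->have_busiest || !s->busiest_nr_running)
		return 0;

	/* newidle balance trumps the thresholds when the local group
	 * still has room and the busiest group has none. */
	if (idle == CPU_NEWLY_IDLE && s->this_has_capacity &&
	    !s->busiest_has_capacity)
		return 1;

	if (s->this_load >= s->max_load)
		return 0;

	/* the busiest group must exceed us by imbalance_pct/100 */
	if (100 * s->max_load <= imbalance_pct * s->this_load)
		return 0;

	return 1;
}

int main(void)
{
	struct lb_stats s = { .this_load = 800, .max_load = 900,
			      .busiest_nr_running = 3, .this_has_capacity = 1,
			      .busiest_has_capacity = 0, .have_busiest = 1 };

	/* pulls despite 900 being under the 125% threshold of 800 */
	printf("balance: %d\n", should_balance(&s, CPU_NEWLY_IDLE, 125));
	return 0;
}
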
@@ -2726,8 +2884,9 @@ ret:
2726 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2884 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2727 */ 2885 */
2728static struct rq * 2886static struct rq *
2729find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2887find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2730 unsigned long imbalance, const struct cpumask *cpus) 2888 enum cpu_idle_type idle, unsigned long imbalance,
2889 const struct cpumask *cpus)
2731{ 2890{
2732 struct rq *busiest = NULL, *rq; 2891 struct rq *busiest = NULL, *rq;
2733 unsigned long max_load = 0; 2892 unsigned long max_load = 0;
@@ -2738,6 +2897,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2738 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 2897 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2739 unsigned long wl; 2898 unsigned long wl;
2740 2899
2900 if (!capacity)
2901 capacity = fix_small_capacity(sd, group);
2902
2741 if (!cpumask_test_cpu(i, cpus)) 2903 if (!cpumask_test_cpu(i, cpus))
2742 continue; 2904 continue;
2743 2905
@@ -2777,9 +2939,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2777/* Working cpumask for load_balance and load_balance_newidle. */ 2939/* Working cpumask for load_balance and load_balance_newidle. */
2778static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 2940static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2779 2941
2780static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) 2942static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2943 int busiest_cpu, int this_cpu)
2781{ 2944{
2782 if (idle == CPU_NEWLY_IDLE) { 2945 if (idle == CPU_NEWLY_IDLE) {
2946
2947 /*
2948 * ASYM_PACKING needs to force migrate tasks from busy but
2949 * higher numbered CPUs in order to pack all tasks in the
2950 * lowest numbered CPUs.
2951 */
2952 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
2953 return 1;
2954
2783 /* 2955 /*
2784 * The only task running in a non-idle cpu can be moved to this 2956 * The only task running in a non-idle cpu can be moved to this
2785 * cpu in an attempt to completely freeup the other CPU 2957 * cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +3026,7 @@ redo:
2854 goto out_balanced; 3026 goto out_balanced;
2855 } 3027 }
2856 3028
2857 busiest = find_busiest_queue(group, idle, imbalance, cpus); 3029 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
2858 if (!busiest) { 3030 if (!busiest) {
2859 schedstat_inc(sd, lb_nobusyq[idle]); 3031 schedstat_inc(sd, lb_nobusyq[idle]);
2860 goto out_balanced; 3032 goto out_balanced;
@@ -2896,9 +3068,17 @@ redo:
2896 3068
2897 if (!ld_moved) { 3069 if (!ld_moved) {
2898 schedstat_inc(sd, lb_failed[idle]); 3070 schedstat_inc(sd, lb_failed[idle]);
2899 sd->nr_balance_failed++; 3071 /*
3072 * Increment the failure counter only on periodic balance.
3073 * We do not want newidle balance, which can be very
 3074 * frequent, to pollute the failure counter, causing
3075 * excessive cache_hot migrations and active balances.
3076 */
3077 if (idle != CPU_NEWLY_IDLE)
3078 sd->nr_balance_failed++;
2900 3079
2901 if (need_active_balance(sd, sd_idle, idle)) { 3080 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3081 this_cpu)) {
2902 raw_spin_lock_irqsave(&busiest->lock, flags); 3082 raw_spin_lock_irqsave(&busiest->lock, flags);
2903 3083
2904 /* don't kick the active_load_balance_cpu_stop, 3084 /* don't kick the active_load_balance_cpu_stop,
@@ -3017,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3017 interval = msecs_to_jiffies(sd->balance_interval); 3197 interval = msecs_to_jiffies(sd->balance_interval);
3018 if (time_after(next_balance, sd->last_balance + interval)) 3198 if (time_after(next_balance, sd->last_balance + interval))
3019 next_balance = sd->last_balance + interval; 3199 next_balance = sd->last_balance + interval;
3020 if (pulled_task) { 3200 if (pulled_task)
3021 this_rq->idle_stamp = 0;
3022 break; 3201 break;
3023 }
3024 } 3202 }
3025 3203
3026 raw_spin_lock(&this_rq->lock); 3204 raw_spin_lock(&this_rq->lock);
@@ -3093,13 +3271,40 @@ out_unlock:
3093} 3271}
3094 3272
3095#ifdef CONFIG_NO_HZ 3273#ifdef CONFIG_NO_HZ
3274
3275static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3276
3277static void trigger_sched_softirq(void *data)
3278{
3279 raise_softirq_irqoff(SCHED_SOFTIRQ);
3280}
3281
3282static inline void init_sched_softirq_csd(struct call_single_data *csd)
3283{
3284 csd->func = trigger_sched_softirq;
3285 csd->info = NULL;
3286 csd->flags = 0;
3287 csd->priv = 0;
3288}
3289
3290/*
3291 * idle load balancing details
3292 * - One of the idle CPUs nominates itself as idle load_balancer, while
3293 * entering idle.
3294 * - This idle load balancer CPU will also go into tickless mode when
 3295 * it is idle, just like all other idle CPUs.
 3296 * - When one of the busy CPUs notices that idle rebalancing may be
 3297 * needed, it will kick the idle load balancer, which then does idle
3298 * load balancing for all the idle CPUs.
3299 */
3096static struct { 3300static struct {
3097 atomic_t load_balancer; 3301 atomic_t load_balancer;
3098 cpumask_var_t cpu_mask; 3302 atomic_t first_pick_cpu;
3099 cpumask_var_t ilb_grp_nohz_mask; 3303 atomic_t second_pick_cpu;
3100} nohz ____cacheline_aligned = { 3304 cpumask_var_t idle_cpus_mask;
3101 .load_balancer = ATOMIC_INIT(-1), 3305 cpumask_var_t grp_idle_mask;
3102}; 3306 unsigned long next_balance; /* in jiffy units */
3307} nohz ____cacheline_aligned;
3103 3308
3104int get_nohz_load_balancer(void) 3309int get_nohz_load_balancer(void)
3105{ 3310{
@@ -3153,17 +3358,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3153 */ 3358 */
3154static inline int is_semi_idle_group(struct sched_group *ilb_group) 3359static inline int is_semi_idle_group(struct sched_group *ilb_group)
3155{ 3360{
3156 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 3361 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3157 sched_group_cpus(ilb_group)); 3362 sched_group_cpus(ilb_group));
3158 3363
3159 /* 3364 /*
 3160 * A sched_group is semi-idle when it has at least one busy cpu 3365 * A sched_group is semi-idle when it has at least one busy cpu
 3161 * and at least one idle cpu. 3366 * and at least one idle cpu.
3162 */ 3367 */
3163 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 3368 if (cpumask_empty(nohz.grp_idle_mask))
3164 return 0; 3369 return 0;
3165 3370
3166 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 3371 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3167 return 0; 3372 return 0;
3168 3373
3169 return 1; 3374 return 1;
@@ -3196,7 +3401,7 @@ static int find_new_ilb(int cpu)
3196 * Optimize for the case when we have no idle CPUs or only one 3401 * Optimize for the case when we have no idle CPUs or only one
3197 * idle CPU. Don't walk the sched_domain hierarchy in such cases 3402 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3198 */ 3403 */
3199 if (cpumask_weight(nohz.cpu_mask) < 2) 3404 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3200 goto out_done; 3405 goto out_done;
3201 3406
3202 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3407 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3409,7 @@ static int find_new_ilb(int cpu)
3204 3409
3205 do { 3410 do {
3206 if (is_semi_idle_group(ilb_group)) 3411 if (is_semi_idle_group(ilb_group))
3207 return cpumask_first(nohz.ilb_grp_nohz_mask); 3412 return cpumask_first(nohz.grp_idle_mask);
3208 3413
3209 ilb_group = ilb_group->next; 3414 ilb_group = ilb_group->next;
3210 3415
@@ -3212,98 +3417,116 @@ static int find_new_ilb(int cpu)
3212 } 3417 }
3213 3418
3214out_done: 3419out_done:
3215 return cpumask_first(nohz.cpu_mask); 3420 return nr_cpu_ids;
3216} 3421}
3217#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3422#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3218static inline int find_new_ilb(int call_cpu) 3423static inline int find_new_ilb(int call_cpu)
3219{ 3424{
3220 return cpumask_first(nohz.cpu_mask); 3425 return nr_cpu_ids;
3221} 3426}
3222#endif 3427#endif
3223 3428
3224/* 3429/*
3430 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 3431 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
3432 * CPU (if there is one).
3433 */
3434static void nohz_balancer_kick(int cpu)
3435{
3436 int ilb_cpu;
3437
3438 nohz.next_balance++;
3439
3440 ilb_cpu = get_nohz_load_balancer();
3441
3442 if (ilb_cpu >= nr_cpu_ids) {
3443 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3444 if (ilb_cpu >= nr_cpu_ids)
3445 return;
3446 }
3447
3448 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3449 struct call_single_data *cp;
3450
3451 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3452 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3453 __smp_call_function_single(ilb_cpu, cp, 0);
3454 }
3455 return;
3456}
3457
3458/*
3225 * This routine will try to nominate the ilb (idle load balancing) 3459 * This routine will try to nominate the ilb (idle load balancing)
3226 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 3460 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3227 * load balancing on behalf of all those cpus. If all the cpus in the system 3461 * load balancing on behalf of all those cpus.
3228 * go into this tickless mode, then there will be no ilb owner (as there is
3229 * no need for one) and all the cpus will sleep till the next wakeup event
3230 * arrives...
3231 *
3232 * For the ilb owner, tick is not stopped. And this tick will be used
3233 * for idle load balancing. ilb owner will still be part of
3234 * nohz.cpu_mask..
3235 * 3462 *
3236 * While stopping the tick, this cpu will become the ilb owner if there 3463 * When the ilb owner becomes busy, we will not have new ilb owner until some
3237 * is no other owner. And will be the owner till that cpu becomes busy 3464 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3238 * or if all cpus in the system stop their ticks at which point 3465 * idle load balancing by kicking one of the idle CPUs.
3239 * there is no need for ilb owner.
3240 * 3466 *
 3241 * When the ilb owner becomes busy, it nominates another owner, during the 3467 * Ticks are stopped for the ilb owner as well, with a busy CPU kicking this
 3242 * next busy scheduler_tick() 3468 * ilb owner CPU in the future (when there is a need for idle load balancing on
3469 * behalf of all idle CPUs).
3243 */ 3470 */
3244int select_nohz_load_balancer(int stop_tick) 3471void select_nohz_load_balancer(int stop_tick)
3245{ 3472{
3246 int cpu = smp_processor_id(); 3473 int cpu = smp_processor_id();
3247 3474
3248 if (stop_tick) { 3475 if (stop_tick) {
3249 cpu_rq(cpu)->in_nohz_recently = 1;
3250
3251 if (!cpu_active(cpu)) { 3476 if (!cpu_active(cpu)) {
3252 if (atomic_read(&nohz.load_balancer) != cpu) 3477 if (atomic_read(&nohz.load_balancer) != cpu)
3253 return 0; 3478 return;
3254 3479
3255 /* 3480 /*
3256 * If we are going offline and still the leader, 3481 * If we are going offline and still the leader,
3257 * give up! 3482 * give up!
3258 */ 3483 */
3259 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3484 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3485 nr_cpu_ids) != cpu)
3260 BUG(); 3486 BUG();
3261 3487
3262 return 0; 3488 return;
3263 } 3489 }
3264 3490
3265 cpumask_set_cpu(cpu, nohz.cpu_mask); 3491 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3266 3492
3267 /* time for ilb owner also to sleep */ 3493 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3268 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 3494 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3269 if (atomic_read(&nohz.load_balancer) == cpu) 3495 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3270 atomic_set(&nohz.load_balancer, -1); 3496 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3271 return 0;
3272 }
3273 3497
3274 if (atomic_read(&nohz.load_balancer) == -1) { 3498 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3275 /* make me the ilb owner */
3276 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3277 return 1;
3278 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3279 int new_ilb; 3499 int new_ilb;
3280 3500
3281 if (!(sched_smt_power_savings || 3501 /* make me the ilb owner */
3282 sched_mc_power_savings)) 3502 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3283 return 1; 3503 cpu) != nr_cpu_ids)
3504 return;
3505
3284 /* 3506 /*
3285 * Check to see if there is a more power-efficient 3507 * Check to see if there is a more power-efficient
3286 * ilb. 3508 * ilb.
3287 */ 3509 */
3288 new_ilb = find_new_ilb(cpu); 3510 new_ilb = find_new_ilb(cpu);
3289 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 3511 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3290 atomic_set(&nohz.load_balancer, -1); 3512 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3291 resched_cpu(new_ilb); 3513 resched_cpu(new_ilb);
3292 return 0; 3514 return;
3293 } 3515 }
3294 return 1; 3516 return;
3295 } 3517 }
3296 } else { 3518 } else {
3297 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 3519 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3298 return 0; 3520 return;
3299 3521
3300 cpumask_clear_cpu(cpu, nohz.cpu_mask); 3522 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3301 3523
3302 if (atomic_read(&nohz.load_balancer) == cpu) 3524 if (atomic_read(&nohz.load_balancer) == cpu)
3303 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3525 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3526 nr_cpu_ids) != cpu)
3304 BUG(); 3527 BUG();
3305 } 3528 }
3306 return 0; 3529 return;
3307} 3530}
3308#endif 3531#endif
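
select_nohz_load_balancer() now uses nr_cpu_ids rather than -1 as the "no owner" value and hands the role off with atomic_cmpxchg(). Below is a stand-alone sketch of that handoff using C11 atomics; NR_CPU_IDS is an arbitrary stand-in for the kernel's nr_cpu_ids and the functions are illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPU_IDS 8			/* sentinel meaning "no ilb owner" */

static atomic_int load_balancer = NR_CPU_IDS;

/* Called by a CPU that is stopping its tick: try to become the owner. */
static void nominate_ilb(int cpu)
{
	int none = NR_CPU_IDS;

	/* only one idle CPU wins; the rest just stay in the idle mask */
	atomic_compare_exchange_strong(&load_balancer, &none, cpu);
}

/* Called when the owner goes busy or offline: give the role up. */
static void resign_ilb(int cpu)
{
	int me = cpu;

	atomic_compare_exchange_strong(&load_balancer, &me, NR_CPU_IDS);
}

int main(void)
{
	nominate_ilb(2);
	nominate_ilb(5);	/* loses the race, CPU2 stays owner */
	printf("ilb owner: %d\n", atomic_load(&load_balancer));
	resign_ilb(2);
	printf("ilb owner: %d (none)\n", atomic_load(&load_balancer));
	return 0;
}
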
3309 3532
@@ -3385,11 +3608,102 @@ out:
3385 rq->next_balance = next_balance; 3608 rq->next_balance = next_balance;
3386} 3609}
3387 3610
3611#ifdef CONFIG_NO_HZ
3388/* 3612/*
3389 * run_rebalance_domains is triggered when needed from the scheduler tick. 3613 * In CONFIG_NO_HZ case, the idle balance kickee will do the
3390 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3391 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3614 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3392 */ 3615 */
3616static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3617{
3618 struct rq *this_rq = cpu_rq(this_cpu);
3619 struct rq *rq;
3620 int balance_cpu;
3621
3622 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3623 return;
3624
3625 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3626 if (balance_cpu == this_cpu)
3627 continue;
3628
3629 /*
3630 * If this cpu gets work to do, stop the load balancing
3631 * work being done for other cpus. Next load
3632 * balancing owner will pick it up.
3633 */
3634 if (need_resched()) {
3635 this_rq->nohz_balance_kick = 0;
3636 break;
3637 }
3638
3639 raw_spin_lock_irq(&this_rq->lock);
3640 update_rq_clock(this_rq);
3641 update_cpu_load(this_rq);
3642 raw_spin_unlock_irq(&this_rq->lock);
3643
3644 rebalance_domains(balance_cpu, CPU_IDLE);
3645
3646 rq = cpu_rq(balance_cpu);
3647 if (time_after(this_rq->next_balance, rq->next_balance))
3648 this_rq->next_balance = rq->next_balance;
3649 }
3650 nohz.next_balance = this_rq->next_balance;
3651 this_rq->nohz_balance_kick = 0;
3652}
3653
3654/*
3655 * Current heuristic for kicking the idle load balancer
 3656 * - first_pick_cpu is one of the busy CPUs. It will kick the
 3657 * idle load balancer when it has more than one process active. This
3658 * eliminates the need for idle load balancing altogether when we have
3659 * only one running process in the system (common case).
 3660 * - If there is more than one busy CPU, the idle load balancer may have
3661 * to run for active_load_balance to happen (i.e., two busy CPUs are
3662 * SMT or core siblings and can run better if they move to different
3663 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
 3664 * which will kick the idle load balancer as soon as it has any load.
3665 */
3666static inline int nohz_kick_needed(struct rq *rq, int cpu)
3667{
3668 unsigned long now = jiffies;
3669 int ret;
3670 int first_pick_cpu, second_pick_cpu;
3671
3672 if (time_before(now, nohz.next_balance))
3673 return 0;
3674
3675 if (rq->idle_at_tick)
3676 return 0;
3677
3678 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3679 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3680
3681 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3682 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3683 return 0;
3684
3685 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3686 if (ret == nr_cpu_ids || ret == cpu) {
3687 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3688 if (rq->nr_running > 1)
3689 return 1;
3690 } else {
3691 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3692 if (ret == nr_cpu_ids || ret == cpu) {
3693 if (rq->nr_running)
3694 return 1;
3695 }
3696 }
3697 return 0;
3698}
3699#else
3700static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3701#endif
3702
3703/*
3704 * run_rebalance_domains is triggered when needed from the scheduler tick.
 3705 * Also triggered for nohz idle balancing (with nohz_balance_kick set).
3706 */
3393static void run_rebalance_domains(struct softirq_action *h) 3707static void run_rebalance_domains(struct softirq_action *h)
3394{ 3708{
3395 int this_cpu = smp_processor_id(); 3709 int this_cpu = smp_processor_id();
@@ -3399,37 +3713,12 @@ static void run_rebalance_domains(struct softirq_action *h)
3399 3713
3400 rebalance_domains(this_cpu, idle); 3714 rebalance_domains(this_cpu, idle);
3401 3715
3402#ifdef CONFIG_NO_HZ
3403 /* 3716 /*
3404 * If this cpu is the owner for idle load balancing, then do the 3717 * If this cpu has a pending nohz_balance_kick, then do the
3405 * balancing on behalf of the other idle cpus whose ticks are 3718 * balancing on behalf of the other idle cpus whose ticks are
3406 * stopped. 3719 * stopped.
3407 */ 3720 */
3408 if (this_rq->idle_at_tick && 3721 nohz_idle_balance(this_cpu, idle);
3409 atomic_read(&nohz.load_balancer) == this_cpu) {
3410 struct rq *rq;
3411 int balance_cpu;
3412
3413 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3414 if (balance_cpu == this_cpu)
3415 continue;
3416
3417 /*
3418 * If this cpu gets work to do, stop the load balancing
3419 * work being done for other cpus. Next load
3420 * balancing owner will pick it up.
3421 */
3422 if (need_resched())
3423 break;
3424
3425 rebalance_domains(balance_cpu, CPU_IDLE);
3426
3427 rq = cpu_rq(balance_cpu);
3428 if (time_after(this_rq->next_balance, rq->next_balance))
3429 this_rq->next_balance = rq->next_balance;
3430 }
3431 }
3432#endif
3433} 3722}
3434 3723
3435static inline int on_null_domain(int cpu) 3724static inline int on_null_domain(int cpu)
@@ -3439,57 +3728,17 @@ static inline int on_null_domain(int cpu)
3439 3728
3440/* 3729/*
3441 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3730 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3442 *
3443 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3444 * idle load balancing owner or decide to stop the periodic load balancing,
3445 * if the whole system is idle.
3446 */ 3731 */
3447static inline void trigger_load_balance(struct rq *rq, int cpu) 3732static inline void trigger_load_balance(struct rq *rq, int cpu)
3448{ 3733{
3449#ifdef CONFIG_NO_HZ
3450 /*
3451 * If we were in the nohz mode recently and busy at the current
3452 * scheduler tick, then check if we need to nominate new idle
3453 * load balancer.
3454 */
3455 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3456 rq->in_nohz_recently = 0;
3457
3458 if (atomic_read(&nohz.load_balancer) == cpu) {
3459 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3460 atomic_set(&nohz.load_balancer, -1);
3461 }
3462
3463 if (atomic_read(&nohz.load_balancer) == -1) {
3464 int ilb = find_new_ilb(cpu);
3465
3466 if (ilb < nr_cpu_ids)
3467 resched_cpu(ilb);
3468 }
3469 }
3470
3471 /*
3472 * If this cpu is idle and doing idle load balancing for all the
3473 * cpus with ticks stopped, is it time for that to stop?
3474 */
3475 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3476 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3477 resched_cpu(cpu);
3478 return;
3479 }
3480
3481 /*
3482 * If this cpu is idle and the idle load balancing is done by
3483 * someone else, then no need raise the SCHED_SOFTIRQ
3484 */
3485 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3486 cpumask_test_cpu(cpu, nohz.cpu_mask))
3487 return;
3488#endif
3489 /* Don't need to rebalance while attached to NULL domain */ 3734 /* Don't need to rebalance while attached to NULL domain */
3490 if (time_after_eq(jiffies, rq->next_balance) && 3735 if (time_after_eq(jiffies, rq->next_balance) &&
3491 likely(!on_null_domain(cpu))) 3736 likely(!on_null_domain(cpu)))
3492 raise_softirq(SCHED_SOFTIRQ); 3737 raise_softirq(SCHED_SOFTIRQ);
3738#ifdef CONFIG_NO_HZ
3739 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3740 nohz_balancer_kick(cpu);
3741#endif
3493} 3742}
3494 3743
3495static void rq_online_fair(struct rq *rq) 3744static void rq_online_fair(struct rq *rq)
@@ -3542,8 +3791,13 @@ static void task_fork_fair(struct task_struct *p)
3542 3791
3543 raw_spin_lock_irqsave(&rq->lock, flags); 3792 raw_spin_lock_irqsave(&rq->lock, flags);
3544 3793
3545 if (unlikely(task_cpu(p) != this_cpu)) 3794 update_rq_clock(rq);
3795
3796 if (unlikely(task_cpu(p) != this_cpu)) {
3797 rcu_read_lock();
3546 __set_task_cpu(p, this_cpu); 3798 __set_task_cpu(p, this_cpu);
3799 rcu_read_unlock();
3800 }
3547 3801
3548 update_curr(cfs_rq); 3802 update_curr(cfs_rq);
3549 3803
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..185f920ec1a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
61 * release the lock. Decreases scheduling overhead. 61 * release the lock. Decreases scheduling overhead.
62 */ 62 */
63SCHED_FEAT(OWNER_SPIN, 1) 63SCHED_FEAT(OWNER_SPIN, 1)
64
65/*
66 * Decrement CPU power based on irq activity
67 */
68SCHED_FEAT(NONIRQ_POWER, 1)
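
NONIRQ_POWER appears alongside the rq->clock_task changes in kernel/sched_rt.c below: time a CPU spends in interrupt handlers is discounted from the capacity it advertises to the load balancer. The calculation below is only a rough illustration of that idea, not the kernel's actual scaling code; SCHED_LOAD_SCALE of 1024 is assumed.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL		/* assumed nominal capacity */

static unsigned long scale_power_by_irq(unsigned long power,
					unsigned long long wall_ns,
					unsigned long long irq_ns)
{
	if (!wall_ns || irq_ns >= wall_ns)
		return 1;	/* never report zero capacity */

	return power * (wall_ns - irq_ns) / wall_ns;
}

int main(void)
{
	/* 10ms window with 2ms of irq time: ~80% of nominal power left */
	printf("power: %lu\n",
	       scale_power_by_irq(SCHED_LOAD_SCALE, 10000000ULL, 2000000ULL));
	return 0;
}
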
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..bea7d79f7e9c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
609 if (!task_has_rt_policy(curr)) 609 if (!task_has_rt_policy(curr))
610 return; 610 return;
611 611
612 delta_exec = rq->clock - curr->se.exec_start; 612 delta_exec = rq->clock_task - curr->se.exec_start;
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
620 620
621 curr->se.exec_start = rq->clock; 621 curr->se.exec_start = rq->clock_task;
622 cpuacct_charge(curr, delta_exec); 622 cpuacct_charge(curr, delta_exec);
623 623
624 sched_rt_avg_update(rq, delta_exec); 624 sched_rt_avg_update(rq, delta_exec);
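
The update_curr_rt() change above charges runtime against rq->clock_task rather than rq->clock, i.e. against a clock that does not advance while the CPU handles interrupts. A tiny model of that accounting, with made-up numbers and simplified stand-in structs:

#include <stdio.h>

struct rq_clk {
	unsigned long long clock;	/* wall clock, ns */
	unsigned long long clock_task;	/* wall clock minus irq time, ns */
};

struct se_clk {
	unsigned long long exec_start;
	unsigned long long sum_exec_runtime;
};

static void update_curr(struct rq_clk *rq, struct se_clk *se)
{
	/* charging against clock_task keeps irq time off the task's bill */
	unsigned long long delta = rq->clock_task - se->exec_start;

	se->sum_exec_runtime += delta;
	se->exec_start = rq->clock_task;
}

int main(void)
{
	struct rq_clk rq = { .clock = 3000000, .clock_task = 2600000 };
	struct se_clk se = { .exec_start = 2000000 };

	update_curr(&rq, &se);	/* 600us charged to the task */
	printf("runtime: %lluns\n", se.sum_exec_runtime);
	return 0;
}
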
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
960 * runqueue. Otherwise simply start this RT task 960 * runqueue. Otherwise simply start this RT task
961 * on its current runqueue. 961 * on its current runqueue.
962 * 962 *
963 * We want to avoid overloading runqueues. Even if 963 * We want to avoid overloading runqueues. If the woken
 964 * the RT task is of higher priority than the current RT task. 964 * task is of higher priority, then it will stay on this CPU
965 * RT tasks behave differently than other tasks. If 965 * and the lower prio task should be moved to another CPU.
966 * one gets preempted, we try to push it off to another queue. 966 * Even though this will probably make the lower prio task
 967 * So trying to keep a preempting RT task on the same 967 * lose its cache, we do not want to bounce a higher priority task
968 * cache hot CPU will force the running RT task to 968 * around just because it gave up its CPU, perhaps for a
969 * a cold CPU. So we waste all the cache for the lower 969 * lock?
970 * RT task in hopes of saving some of a RT task 970 *
971 * that is just being woken and probably will have 971 * For equal prio tasks, we just let the scheduler sort it out.
972 * cold cache anyway.
973 */ 972 */
974 if (unlikely(rt_task(rq->curr)) && 973 if (unlikely(rt_task(rq->curr)) &&
974 (rq->curr->rt.nr_cpus_allowed < 2 ||
975 rq->curr->prio < p->prio) &&
975 (p->rt.nr_cpus_allowed > 1)) { 976 (p->rt.nr_cpus_allowed > 1)) {
976 int cpu = find_lowest_rq(p); 977 int cpu = find_lowest_rq(p);
977 978
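
The new wakeup test above only pushes the woken task away when the currently running RT task either cannot migrate (affinity covers fewer than 2 CPUs) or outranks it. A stand-alone sketch of that predicate, with simplified types and the kernel convention that a lower ->prio value means higher priority:

#include <stdbool.h>
#include <stdio.h>

struct rt_task {
	int prio;		/* lower value = more important */
	int nr_cpus_allowed;
	bool is_rt;
};

/* Should the woken task p be placed on another CPU instead of curr's? */
static bool push_woken_task(const struct rt_task *curr,
			    const struct rt_task *p)
{
	return curr->is_rt &&
	       (curr->nr_cpus_allowed < 2 || curr->prio < p->prio) &&
	       p->nr_cpus_allowed > 1;
}

int main(void)
{
	struct rt_task curr = { .prio = 10, .nr_cpus_allowed = 4, .is_rt = true };
	struct rt_task woken = { .prio = 20, .nr_cpus_allowed = 4 };

	/* curr is higher priority, so the woken task goes elsewhere */
	printf("push: %d\n", push_woken_task(&curr, &woken));
	return 0;
}
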
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1074 } while (rt_rq); 1075 } while (rt_rq);
1075 1076
1076 p = rt_task_of(rt_se); 1077 p = rt_task_of(rt_se);
1077 p->se.exec_start = rq->clock; 1078 p->se.exec_start = rq->clock_task;
1078 1079
1079 return p; 1080 return p;
1080} 1081}
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1139 for_each_leaf_rt_rq(rt_rq, rq) { 1140 for_each_leaf_rt_rq(rt_rq, rq) {
1140 array = &rt_rq->active; 1141 array = &rt_rq->active;
1141 idx = sched_find_first_bit(array->bitmap); 1142 idx = sched_find_first_bit(array->bitmap);
1142 next_idx: 1143next_idx:
1143 if (idx >= MAX_RT_PRIO) 1144 if (idx >= MAX_RT_PRIO)
1144 continue; 1145 continue;
1145 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
1315 if (!next_task) 1316 if (!next_task)
1316 return 0; 1317 return 0;
1317 1318
1318 retry: 1319retry:
1319 if (unlikely(next_task == rq->curr)) { 1320 if (unlikely(next_task == rq->curr)) {
1320 WARN_ON(1); 1321 WARN_ON(1);
1321 return 0; 1322 return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
1463 * but possible) 1464 * but possible)
1464 */ 1465 */
1465 } 1466 }
1466 skip: 1467skip:
1467 double_unlock_balance(this_rq, src_rq); 1468 double_unlock_balance(this_rq, src_rq);
1468 } 1469 }
1469 1470
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1491 if (!task_running(rq, p) && 1492 if (!task_running(rq, p) &&
1492 !test_tsk_need_resched(rq->curr) && 1493 !test_tsk_need_resched(rq->curr) &&
1493 has_pushable_tasks(rq) && 1494 has_pushable_tasks(rq) &&
1494 p->rt.nr_cpus_allowed > 1) 1495 p->rt.nr_cpus_allowed > 1 &&
1496 rt_task(rq->curr) &&
1497 (rq->curr->rt.nr_cpus_allowed < 2 ||
1498 rq->curr->prio < p->prio))
1495 push_rt_tasks(rq); 1499 push_rt_tasks(rq);
1496} 1500}
1497 1501
@@ -1663,9 +1667,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1663{ 1667{
1664 unsigned long soft, hard; 1668 unsigned long soft, hard;
1665 1669
1666 if (!p->signal)
1667 return;
1668
1669 /* max may change after cur was read, this will be fixed next tick */ 1670 /* max may change after cur was read, this will be fixed next tick */
1670 soft = task_rlimit(p, RLIMIT_RTTIME); 1671 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME); 1672 hard = task_rlimit_max(p, RLIMIT_RTTIME);
@@ -1712,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1712{ 1713{
1713 struct task_struct *p = rq->curr; 1714 struct task_struct *p = rq->curr;
1714 1715
1715 p->se.exec_start = rq->clock; 1716 p->se.exec_start = rq->clock_task;
1716 1717
1717 /* The running task is never eligible for pushing */ 1718 /* The running task is never eligible for pushing */
1718 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
295static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime) 296 cputime_t cputime)
297{ 297{
298 struct thread_group_cputimer *cputimer; 298 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
299
300 /* tsk == current, ensure it is safe to use ->signal */
301 if (unlikely(tsk->exit_state))
302 return;
303
304 cputimer = &tsk->signal->cputimer;
305 299
306 if (!cputimer->running) 300 if (!cputimer->running)
307 return; 301 return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 319static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 320 cputime_t cputime)
327{ 321{
328 struct thread_group_cputimer *cputimer; 322 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
329
330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state))
332 return;
333
334 cputimer = &tsk->signal->cputimer;
335 323
336 if (!cputimer->running) 324 if (!cputimer->running)
337 return; 325 return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
355static inline void account_group_exec_runtime(struct task_struct *tsk, 343static inline void account_group_exec_runtime(struct task_struct *tsk,
356 unsigned long long ns) 344 unsigned long long ns)
357{ 345{
358 struct thread_group_cputimer *cputimer; 346 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
359 struct signal_struct *sig;
360
361 sig = tsk->signal;
362 /* see __exit_signal()->task_rq_unlock_wait() */
363 barrier();
364 if (unlikely(!sig))
365 return;
366
367 cputimer = &sig->cputimer;
368 347
369 if (!cputimer->running) 348 if (!cputimer->running)
370 return; 349 return;
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..45bddc0c1048
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
1/*
2 * stop-task scheduling class.
3 *
 4 * The stop task is the highest priority task in the system; it preempts
5 * everything and will be preempted by nothing.
6 *
7 * See kernel/stop_machine.c
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p,
13 int sd_flag, int flags)
14{
 15 return task_cpu(p); /* stop tasks never migrate */
16}
17#endif /* CONFIG_SMP */
18
19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{
22 resched_task(rq->curr); /* we preempt everything */
23}
24
25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{
27 struct task_struct *stop = rq->stop;
28
29 if (stop && stop->state == TASK_RUNNING)
30 return stop;
31
32 return NULL;
33}
34
35static void
36enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
37{
38}
39
40static void
41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
42{
43}
44
45static void yield_task_stop(struct rq *rq)
46{
 47 BUG(); /* the stop task should never yield, it's pointless. */
48}
49
50static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
51{
52}
53
54static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_stop(struct rq *rq)
59{
60}
61
62static void switched_to_stop(struct rq *rq, struct task_struct *p,
63 int running)
64{
 65 BUG(); /* it's impossible to change to this class */
66}
67
68static void prio_changed_stop(struct rq *rq, struct task_struct *p,
69 int oldprio, int running)
70{
 71 BUG(); /* how!? what priority? */
72}
73
74static unsigned int
75get_rr_interval_stop(struct rq *rq, struct task_struct *task)
76{
77 return 0;
78}
79
80/*
81 * Simple, special scheduling class for the per-CPU stop tasks:
82 */
83static const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class,
85
86 .enqueue_task = enqueue_task_stop,
87 .dequeue_task = dequeue_task_stop,
88 .yield_task = yield_task_stop,
89
90 .check_preempt_curr = check_preempt_curr_stop,
91
92 .pick_next_task = pick_next_task_stop,
93 .put_prev_task = put_prev_task_stop,
94
95#ifdef CONFIG_SMP
96 .select_task_rq = select_task_rq_stop,
97#endif
98
99 .set_curr_task = set_curr_task_stop,
100 .task_tick = task_tick_stop,
101
102 .get_rr_interval = get_rr_interval_stop,
103
104 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108};
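
The stop class above slots in ahead of rt_sched_class via its .next pointer, so the per-CPU stop task wins every pick. Below is a user-space model of that highest-first class walk; the structs, task names, and helpers are stand-ins for illustration, not the kernel's definitions.

#include <stdio.h>
#include <stddef.h>

struct task { const char *name; };

struct sched_class {
	const struct sched_class *next;
	struct task *(*pick_next_task)(void);
};

static struct task stop_task = { "migration/0" };
static struct task rt_task   = { "rtprio50" };

/* a real class returns NULL when it has nothing runnable */
static struct task *pick_stop(void) { return &stop_task; }
static struct task *pick_rt(void)   { return &rt_task; }

static const struct sched_class rt_class   = { NULL, pick_rt };
static const struct sched_class stop_class = { &rt_class, pick_stop };

static struct task *pick_next_task(const struct sched_class *top)
{
	const struct sched_class *class;

	for (class = top; class; class = class->next) {
		struct task *p = class->pick_next_task();
		if (p)
			return p;
	}
	return NULL;
}

int main(void)
{
	printf("next: %s\n", pick_next_task(&stop_class)->name);
	return 0;
}
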
diff --git a/kernel/signal.c b/kernel/signal.c
index 906ae5a1779c..4e3cff10fdce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info)
637 637
638/* 638/*
639 * Bad permissions for sending the signal 639 * Bad permissions for sending the signal
640 * - the caller must hold at least the RCU read lock 640 * - the caller must hold the RCU read lock
641 */ 641 */
642static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 643 struct task_struct *t)
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)
1105 return count; 1105 return count;
1106} 1106}
1107 1107
1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1109 unsigned long *flags)
1109{ 1110{
1110 struct sighand_struct *sighand; 1111 struct sighand_struct *sighand;
1111 1112
@@ -1127,11 +1128,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1127 1128
1128/* 1129/*
1129 * send signal info to all the members of a group 1130 * send signal info to all the members of a group
1130 * - the caller must hold the RCU read lock at least
1131 */ 1131 */
1132int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1132int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1133{ 1133{
1134 int ret = check_kill_permission(sig, info, p); 1134 int ret;
1135
1136 rcu_read_lock();
1137 ret = check_kill_permission(sig, info, p);
1138 rcu_read_unlock();
1135 1139
1136 if (!ret && sig) 1140 if (!ret && sig)
1137 ret = do_send_sig_info(sig, info, p, true); 1141 ret = do_send_sig_info(sig, info, p, true);
@@ -1614,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)
1614 * is gone, we keep current->exit_code unless clear_code. 1618 * is gone, we keep current->exit_code unless clear_code.
1615 */ 1619 */
1616static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1620static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1621 __releases(&current->sighand->siglock)
1622 __acquires(&current->sighand->siglock)
1617{ 1623{
1618 if (arch_ptrace_stop_needed(exit_code, info)) { 1624 if (arch_ptrace_stop_needed(exit_code, info)) {
1619 /* 1625 /*
@@ -2212,6 +2218,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2212#ifdef __ARCH_SI_TRAPNO 2218#ifdef __ARCH_SI_TRAPNO
2213 err |= __put_user(from->si_trapno, &to->si_trapno); 2219 err |= __put_user(from->si_trapno, &to->si_trapno);
2214#endif 2220#endif
2221#ifdef BUS_MCEERR_AO
2222 /*
2223 * Other callers might not initialize the si_lsb field,
 2224 * so check explicitly for the right codes here.
2225 */
2226 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2227 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2228#endif
2215 break; 2229 break;
2216 case __SI_CHLD: 2230 case __SI_CHLD:
2217 err |= __put_user(from->si_pid, &to->si_pid); 2231 err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
deleted file mode 100644
index e45c43645298..000000000000
--- a/kernel/slow-work-debugfs.c
+++ /dev/null
@@ -1,227 +0,0 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
deleted file mode 100644
index 7d3f4fa9ef4f..000000000000
--- a/kernel/slow-work.c
+++ /dev/null
@@ -1,1068 +0,0 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19#include <linux/debugfs.h>
20#include "slow-work.h"
21
22static void slow_work_cull_timeout(unsigned long);
23static void slow_work_oom_timeout(unsigned long);
24
25#ifdef CONFIG_SYSCTL
26static int slow_work_min_threads_sysctl(struct ctl_table *, int,
27 void __user *, size_t *, loff_t *);
28
29static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
30 void __user *, size_t *, loff_t *);
31#endif
32
33/*
34 * The pool of threads has at least min threads in it as long as someone is
35 * using the facility, and may have as many as max.
36 *
37 * A portion of the pool may be processing very slow operations.
38 */
39static unsigned slow_work_min_threads = 2;
40static unsigned slow_work_max_threads = 4;
41static unsigned vslow_work_proportion = 50; /* % of threads that may process
42 * very slow work */
43
44#ifdef CONFIG_SYSCTL
45static const int slow_work_min_min_threads = 2;
46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
47static const int slow_work_min_vslow = 1;
48static const int slow_work_max_vslow = 99;
49
50ctl_table slow_work_sysctls[] = {
51 {
52 .procname = "min-threads",
53 .data = &slow_work_min_threads,
54 .maxlen = sizeof(unsigned),
55 .mode = 0644,
56 .proc_handler = slow_work_min_threads_sysctl,
57 .extra1 = (void *) &slow_work_min_min_threads,
58 .extra2 = &slow_work_max_threads,
59 },
60 {
61 .procname = "max-threads",
62 .data = &slow_work_max_threads,
63 .maxlen = sizeof(unsigned),
64 .mode = 0644,
65 .proc_handler = slow_work_max_threads_sysctl,
66 .extra1 = &slow_work_min_threads,
67 .extra2 = (void *) &slow_work_max_max_threads,
68 },
69 {
70 .procname = "vslow-percentage",
71 .data = &vslow_work_proportion,
72 .maxlen = sizeof(unsigned),
73 .mode = 0644,
74 .proc_handler = proc_dointvec_minmax,
75 .extra1 = (void *) &slow_work_min_vslow,
76 .extra2 = (void *) &slow_work_max_vslow,
77 },
78 {}
79};
80#endif
81
82/*
83 * The active state of the thread pool
84 */
85static atomic_t slow_work_thread_count;
86static atomic_t vslow_work_executing_count;
87
88static bool slow_work_may_not_start_new_thread;
89static bool slow_work_cull; /* cull a thread due to lack of activity */
90static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
91static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
92static struct slow_work slow_work_new_thread; /* new thread starter */
93
94/*
95 * slow work ID allocation (use slow_work_queue_lock)
96 */
97static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
98
99/*
100 * Unregistration tracking to prevent put_ref() from disappearing during module
101 * unload
102 */
103#ifdef CONFIG_MODULES
104static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
105static struct module *slow_work_unreg_module;
106static struct slow_work *slow_work_unreg_work_item;
107static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
108static DEFINE_MUTEX(slow_work_unreg_sync_lock);
109
110static void slow_work_set_thread_processing(int id, struct slow_work *work)
111{
112 if (work)
113 slow_work_thread_processing[id] = work->owner;
114}
115static void slow_work_done_thread_processing(int id, struct slow_work *work)
116{
117 struct module *module = slow_work_thread_processing[id];
118
119 slow_work_thread_processing[id] = NULL;
120 smp_mb();
121 if (slow_work_unreg_work_item == work ||
122 slow_work_unreg_module == module)
123 wake_up_all(&slow_work_unreg_wq);
124}
125static void slow_work_clear_thread_processing(int id)
126{
127 slow_work_thread_processing[id] = NULL;
128}
129#else
130static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
131static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
132static void slow_work_clear_thread_processing(int id) {}
133#endif
134
135/*
136 * Data for tracking currently executing items for indication through /proc
137 */
138#ifdef CONFIG_SLOW_WORK_DEBUG
139struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
140pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
141DEFINE_RWLOCK(slow_work_execs_lock);
142#endif
143
144/*
145 * The queues of work items and the lock governing access to them. These are
146 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
147 * as the number of threads bears no relation to the number of CPUs.
148 *
149 * There are two queues of work items: one for slow work items, and one for
150 * very slow work items.
151 */
152LIST_HEAD(slow_work_queue);
153LIST_HEAD(vslow_work_queue);
154DEFINE_SPINLOCK(slow_work_queue_lock);
155
156/*
157 * The following are two wait queues that get pinged when a work item is placed
158 * on an empty queue. These allow work items that are hogging a thread by
159 * sleeping in a way that could be deferred to yield their thread and enqueue
160 * themselves.
161 */
162static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
163static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
164
165/*
166 * The thread controls. A variable used to signal to the threads that they
167 * should exit when the queue is empty, a waitqueue used by the threads to wait
168 * for signals, and a completion set by the last thread to exit.
169 */
170static bool slow_work_threads_should_exit;
171static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
172static DECLARE_COMPLETION(slow_work_last_thread_exited);
173
174/*
175 * The number of users of the thread pool and its lock. Whilst this is zero we
176 * have no threads hanging around, and when this reaches zero, we wait for all
177 * active or queued work items to complete and kill all the threads we do have.
178 */
179static int slow_work_user_count;
180static DEFINE_MUTEX(slow_work_user_lock);
181
182static inline int slow_work_get_ref(struct slow_work *work)
183{
184 if (work->ops->get_ref)
185 return work->ops->get_ref(work);
186
187 return 0;
188}
189
190static inline void slow_work_put_ref(struct slow_work *work)
191{
192 if (work->ops->put_ref)
193 work->ops->put_ref(work);
194}
195
196/*
197 * Calculate the maximum number of active threads in the pool that are
198 * permitted to process very slow work items.
199 *
200 * The answer is rounded up to at least 1, but may not equal or exceed the
201 * maximum number of the threads in the pool. This means we always have at
202 * least one thread that can process slow work items, and we always have at
203 * least one thread that won't get tied up doing so.
204 */
205static unsigned slow_work_calc_vsmax(void)
206{
207 unsigned vsmax;
208
209 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
210 vsmax /= 100;
211 vsmax = max(vsmax, 1U);
212 return min(vsmax, slow_work_max_threads - 1);
213}
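
The vsmax calculation in the (now removed) slow-work pool clamps the very-slow share between 1 and max_threads - 1, so at least one thread always stays free of very slow items. The same arithmetic in isolation, with example values:

#include <stdio.h>

static unsigned calc_vsmax(unsigned threads, unsigned pct, unsigned max_threads)
{
	unsigned vsmax = threads * pct / 100;

	if (vsmax < 1)
		vsmax = 1;
	if (vsmax > max_threads - 1)
		vsmax = max_threads - 1;
	return vsmax;
}

int main(void)
{
	printf("vsmax: %u\n", calc_vsmax(4, 50, 4));	/* prints 2 */
	return 0;
}
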
214
215/*
216 * Attempt to execute stuff queued on a slow thread. Return true if we managed
217 * it, false if there was nothing to do.
218 */
219static noinline bool slow_work_execute(int id)
220{
221 struct slow_work *work = NULL;
222 unsigned vsmax;
223 bool very_slow;
224
225 vsmax = slow_work_calc_vsmax();
226
227 /* see if we can schedule a new thread to be started if we're not
228 * keeping up with the work */
229 if (!waitqueue_active(&slow_work_thread_wq) &&
230 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
231 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
232 !slow_work_may_not_start_new_thread)
233 slow_work_enqueue(&slow_work_new_thread);
234
235 /* find something to execute */
236 spin_lock_irq(&slow_work_queue_lock);
237 if (!list_empty(&vslow_work_queue) &&
238 atomic_read(&vslow_work_executing_count) < vsmax) {
239 work = list_entry(vslow_work_queue.next,
240 struct slow_work, link);
241 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
242 BUG();
243 list_del_init(&work->link);
244 atomic_inc(&vslow_work_executing_count);
245 very_slow = true;
246 } else if (!list_empty(&slow_work_queue)) {
247 work = list_entry(slow_work_queue.next,
248 struct slow_work, link);
249 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
250 BUG();
251 list_del_init(&work->link);
252 very_slow = false;
253 } else {
254 very_slow = false; /* avoid the compiler warning */
255 }
256
257 slow_work_set_thread_processing(id, work);
258 if (work) {
259 slow_work_mark_time(work);
260 slow_work_begin_exec(id, work);
261 }
262
263 spin_unlock_irq(&slow_work_queue_lock);
264
265 if (!work)
266 return false;
267
268 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
269 BUG();
270
271 /* don't execute if the work is in the process of being cancelled */
272 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
273 work->ops->execute(work);
274
275 if (very_slow)
276 atomic_dec(&vslow_work_executing_count);
277 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
278
279 /* wake up anyone waiting for this work to be complete */
280 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
281
282 slow_work_end_exec(id, work);
283
284 /* if someone tried to enqueue the item whilst we were executing it,
285 * then it'll be left unenqueued to avoid multiple threads trying to
286 * execute it simultaneously
287 *
288 * there is, however, a race between us testing the pending flag and
289 * getting the spinlock, and between the enqueuer setting the pending
290 * flag and getting the spinlock, so we use a deferral bit to tell us
291 * if the enqueuer got there first
292 */
293 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
294 spin_lock_irq(&slow_work_queue_lock);
295
296 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
297 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
298 goto auto_requeue;
299
300 spin_unlock_irq(&slow_work_queue_lock);
301 }
302
303 /* sort out the race between module unloading and put_ref() */
304 slow_work_put_ref(work);
305 slow_work_done_thread_processing(id, work);
306
307 return true;
308
309auto_requeue:
310 /* we must complete the enqueue operation
311 * - we transfer our ref on the item back to the appropriate queue
312 * - don't wake another thread up as we're awake already
313 */
314 slow_work_mark_time(work);
315 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
316 list_add_tail(&work->link, &vslow_work_queue);
317 else
318 list_add_tail(&work->link, &slow_work_queue);
319 spin_unlock_irq(&slow_work_queue_lock);
320 slow_work_clear_thread_processing(id);
321 return true;
322}
323
324/**
325 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
326 * @work: The work item under execution that wants to sleep
327 * @_timeout: Scheduler sleep timeout
328 *
329 * Allow a requeueable work item to sleep on a slow-work processor thread until
330 * that thread is needed to do some other work or the sleep is interrupted by
331 * some other event.
332 *
333 * The caller must set up a wake up event before calling this and must have set
334 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
335 * condition before calling this function as no test is made here.
336 *
337 * False is returned if there is nothing on the queue; true is returned if the
338 * work item should be requeued
339 */
340bool slow_work_sleep_till_thread_needed(struct slow_work *work,
341 signed long *_timeout)
342{
343 wait_queue_head_t *wfo_wq;
344 struct list_head *queue;
345
346 DEFINE_WAIT(wait);
347
348 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
349 wfo_wq = &vslow_work_queue_waits_for_occupation;
350 queue = &vslow_work_queue;
351 } else {
352 wfo_wq = &slow_work_queue_waits_for_occupation;
353 queue = &slow_work_queue;
354 }
355
356 if (!list_empty(queue))
357 return true;
358
359 add_wait_queue_exclusive(wfo_wq, &wait);
360 if (list_empty(queue))
361 *_timeout = schedule_timeout(*_timeout);
362 finish_wait(wfo_wq, &wait);
363
364 return !list_empty(queue);
365}
366EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
367
368/**
369 * slow_work_enqueue - Schedule a slow work item for processing
370 * @work: The work item to queue
371 *
372 * Schedule a slow work item for processing. If the item is already undergoing
373 * execution, this guarantees not to re-enter the execution routine until the
374 * first execution finishes.
375 *
376 * The item is pinned by this function as it retains a reference to it, managed
377 * through the item operations. The item is unpinned once it has been
378 * executed.
379 *
380 * An item may hog the thread that is running it for a relatively large amount
381 * of time, sufficient, for example, to perform several lookup, mkdir, create
382 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
383 *
384 * Conversely, if a number of items are awaiting processing, it may take some
385 * time before any given item is given attention. The number of threads in the
386 * pool may be increased to deal with demand, but only up to a limit.
387 *
388 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
389 * the very slow queue, from which only a portion of the threads will be
390 * allowed to pick items to execute. This ensures that very slow items won't
391 * overly block ones that are just ordinarily slow.
392 *
393 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if an attempt is made
394 * to queue cancelled work)
395 */
396int slow_work_enqueue(struct slow_work *work)
397{
398 wait_queue_head_t *wfo_wq;
399 struct list_head *queue;
400 unsigned long flags;
401 int ret;
402
403 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
404 return -ECANCELED;
405
406 BUG_ON(slow_work_user_count <= 0);
407 BUG_ON(!work);
408 BUG_ON(!work->ops);
409
410 /* when honouring an enqueue request, we only promise that we will run
411 * the work function in the future; we do not promise to run it once
412 * per enqueue request
413 *
414 * we use the PENDING bit to merge together repeat requests without
415 * having to disable IRQs and take the spinlock, whilst still
416 * maintaining our promise
417 */
418 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
419 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
420 wfo_wq = &vslow_work_queue_waits_for_occupation;
421 queue = &vslow_work_queue;
422 } else {
423 wfo_wq = &slow_work_queue_waits_for_occupation;
424 queue = &slow_work_queue;
425 }
426
427 spin_lock_irqsave(&slow_work_queue_lock, flags);
428
429 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
430 goto cancelled;
431
432 /* we promise that we will not attempt to execute the work
433 * function in more than one thread simultaneously
434 *
435 * this, however, leaves us with a problem if we're asked to
436 * enqueue the work whilst someone is executing the work
437 * function as simply queueing the work immediately means that
438 * another thread may try executing it whilst it is already
439 * under execution
440 *
441 * to deal with this, we set the ENQ_DEFERRED bit instead of
442 * enqueueing, and the thread currently executing the work
443 * function will enqueue the work item when the work function
444 * returns and it has cleared the EXECUTING bit
445 */
446 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
447 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
448 } else {
449 ret = slow_work_get_ref(work);
450 if (ret < 0)
451 goto failed;
452 slow_work_mark_time(work);
453 list_add_tail(&work->link, queue);
454 wake_up(&slow_work_thread_wq);
455
456 /* if someone who could be requeued is sleeping on a
457 * thread, then ask them to yield their thread */
458 if (work->link.prev == queue)
459 wake_up(wfo_wq);
460 }
461
462 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
463 }
464 return 0;
465
466cancelled:
467 ret = -ECANCELED;
468failed:
469 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
470 return ret;
471}
472EXPORT_SYMBOL(slow_work_enqueue);
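For readers skimming the removal, a minimal caller sketch of the API documented above may help. It is not taken from this patch: the myfs_* names are invented, and the slow_work_ops layout (owner, get_ref, put_ref, execute) and slow_work_init() are assumed from the facility's own header and documentation.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slow-work.h>

/* hypothetical caller-side object with an embedded work item */
struct myfs_object {
	struct slow_work	work;
	/* ... caller state ... */
};

static int myfs_get_ref(struct slow_work *work)
{
	/* pin the containing object here; return 0 on success */
	return 0;
}

static void myfs_put_ref(struct slow_work *work)
{
	/* drop the reference taken in myfs_get_ref() */
}

static void myfs_execute(struct slow_work *work)
{
	struct myfs_object *obj = container_of(work, struct myfs_object, work);

	/* long-running, sleepable processing of obj goes here */
	pr_debug("slow work on %p\n", obj);
}

static const struct slow_work_ops myfs_slow_work_ops = {
	.owner		= THIS_MODULE,
	.get_ref	= myfs_get_ref,
	.put_ref	= myfs_put_ref,
	.execute	= myfs_execute,
};

static int myfs_kick(struct myfs_object *obj)
{
	slow_work_init(&obj->work, &myfs_slow_work_ops);

	/* repeat calls made while the item is still PENDING are merged,
	 * as the comment in slow_work_enqueue() explains */
	return slow_work_enqueue(&obj->work);
}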
473
474static int slow_work_wait(void *word)
475{
476 schedule();
477 return 0;
478}
479
480/**
481 * slow_work_cancel - Cancel a slow work item
482 * @work: The work item to cancel
483 *
484 * This function will cancel a previously enqueued work item. If we cannot
485 * cancel the work item, it is guaranteed to have run when this function
486 * returns.
487 */
488void slow_work_cancel(struct slow_work *work)
489{
490 bool wait = true, put = false;
491
492 set_bit(SLOW_WORK_CANCELLING, &work->flags);
493 smp_mb();
494
495 /* if the work item is a delayed work item with an active timer, we
496 * need to wait for the timer to finish _before_ getting the spinlock,
497 * lest we deadlock against the timer routine
498 *
499 * the timer routine will leave DELAYED set if it notices the
500 * CANCELLING flag in time
501 */
502 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
503 struct delayed_slow_work *dwork =
504 container_of(work, struct delayed_slow_work, work);
505 del_timer_sync(&dwork->timer);
506 }
507
508 spin_lock_irq(&slow_work_queue_lock);
509
510 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
511 /* the timer routine aborted or never happened, so we are left
512 * holding the timer's reference on the item and should just
513 * drop the pending flag and wait for any ongoing execution to
514 * finish */
515 struct delayed_slow_work *dwork =
516 container_of(work, struct delayed_slow_work, work);
517
518 BUG_ON(timer_pending(&dwork->timer));
519 BUG_ON(!list_empty(&work->link));
520
521 clear_bit(SLOW_WORK_DELAYED, &work->flags);
522 put = true;
523 clear_bit(SLOW_WORK_PENDING, &work->flags);
524
525 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
526 !list_empty(&work->link)) {
527 /* the link in the pending queue holds a reference on the item
528 * that we will need to release */
529 list_del_init(&work->link);
530 wait = false;
531 put = true;
532 clear_bit(SLOW_WORK_PENDING, &work->flags);
533
534 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
535 /* the executor is holding our only reference on the item, so
536 * we merely need to wait for it to finish executing */
537 clear_bit(SLOW_WORK_PENDING, &work->flags);
538 }
539
540 spin_unlock_irq(&slow_work_queue_lock);
541
542 /* the EXECUTING flag is set by the executor whilst the spinlock is set
543 * and before the item is dequeued - so assuming the above doesn't
544 * actually dequeue it, simply waiting for the EXECUTING flag to be
545 * released here should be sufficient */
546 if (wait)
547 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
548 TASK_UNINTERRUPTIBLE);
549
550 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
551 if (put)
552 slow_work_put_ref(work);
553}
554EXPORT_SYMBOL(slow_work_cancel);
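Continuing the hypothetical myfs sketch above, teardown is a single call that captures the contract in the comment block: the item is dequeued if still pending, otherwise the caller is held until the EXECUTING bit clears.

static void myfs_stop_work(struct myfs_object *obj)
{
	/* dequeue if still pending, otherwise wait for execution to end */
	slow_work_cancel(&obj->work);
}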
555
556/*
557 * Handle expiry of the delay timer, indicating that a delayed slow work item
558 * should now be queued if not cancelled
559 */
560static void delayed_slow_work_timer(unsigned long data)
561{
562 wait_queue_head_t *wfo_wq;
563 struct list_head *queue;
564 struct slow_work *work = (struct slow_work *) data;
565 unsigned long flags;
566 bool queued = false, put = false, first = false;
567
568 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
569 wfo_wq = &vslow_work_queue_waits_for_occupation;
570 queue = &vslow_work_queue;
571 } else {
572 wfo_wq = &slow_work_queue_waits_for_occupation;
573 queue = &slow_work_queue;
574 }
575
576 spin_lock_irqsave(&slow_work_queue_lock, flags);
577 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
578 clear_bit(SLOW_WORK_DELAYED, &work->flags);
579
580 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
581 /* we discard the reference the timer was holding in
582 * favour of the one the executor holds */
583 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
584 put = true;
585 } else {
586 slow_work_mark_time(work);
587 list_add_tail(&work->link, queue);
588 queued = true;
589 if (work->link.prev == queue)
590 first = true;
591 }
592 }
593
594 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
595 if (put)
596 slow_work_put_ref(work);
597 if (first)
598 wake_up(wfo_wq);
599 if (queued)
600 wake_up(&slow_work_thread_wq);
601}
602
603/**
604 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
605 * @dwork: The delayed work item to queue
606 * @delay: When to start executing the work, in jiffies from now
607 *
608 * This is similar to slow_work_enqueue(), but it adds a delay before the work
609 * is actually queued for processing.
610 *
611 * The item can have delayed processing requested on it whilst it is being
612 * executed. The delay will begin immediately, and if it expires before the
613 * item finishes executing, the item will be placed back on the queue when it
614 * has finished executing.
615 */
616int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
617 unsigned long delay)
618{
619 struct slow_work *work = &dwork->work;
620 unsigned long flags;
621 int ret;
622
623 if (delay == 0)
624 return slow_work_enqueue(&dwork->work);
625
626 BUG_ON(slow_work_user_count <= 0);
627 BUG_ON(!work);
628 BUG_ON(!work->ops);
629
630 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
631 return -ECANCELED;
632
633 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
634 spin_lock_irqsave(&slow_work_queue_lock, flags);
635
636 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
637 goto cancelled;
638
639 /* the timer holds a reference whilst it is pending */
640 ret = slow_work_get_ref(work);
641 if (ret < 0)
642 goto cant_get_ref;
643
644 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
645 BUG();
646 dwork->timer.expires = jiffies + delay;
647 dwork->timer.data = (unsigned long) work;
648 dwork->timer.function = delayed_slow_work_timer;
649 add_timer(&dwork->timer);
650
651 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
652 }
653
654 return 0;
655
656cancelled:
657 ret = -ECANCELED;
658cant_get_ref:
659 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
660 return ret;
661}
662EXPORT_SYMBOL(delayed_slow_work_enqueue);
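A similarly hypothetical sketch for the delayed variant. delayed_slow_work_init() and the myfs_gc_ops table are assumptions standing in for a real caller's one-time setup; HZ gives a delay of roughly one second.

/* assumed to have been set up once, e.g. in module init, with
 * delayed_slow_work_init(&myfs_gc_work, &myfs_gc_ops) */
static struct delayed_slow_work myfs_gc_work;

static void myfs_schedule_gc(void)
{
	/* fire the ops' execute callback about a second from now;
	 * a zero delay would fall straight through to slow_work_enqueue() */
	delayed_slow_work_enqueue(&myfs_gc_work, HZ);
}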
663
664/*
665 * Schedule a cull of the thread pool at some time in the near future
666 */
667static void slow_work_schedule_cull(void)
668{
669 mod_timer(&slow_work_cull_timer,
670 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
671}
672
673/*
674 * Worker thread culling algorithm
675 */
676static bool slow_work_cull_thread(void)
677{
678 unsigned long flags;
679 bool do_cull = false;
680
681 spin_lock_irqsave(&slow_work_queue_lock, flags);
682
683 if (slow_work_cull) {
684 slow_work_cull = false;
685
686 if (list_empty(&slow_work_queue) &&
687 list_empty(&vslow_work_queue) &&
688 atomic_read(&slow_work_thread_count) >
689 slow_work_min_threads) {
690 slow_work_schedule_cull();
691 do_cull = true;
692 }
693 }
694
695 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
696 return do_cull;
697}
698
699/*
700 * Determine if there is slow work available for dispatch
701 */
702static inline bool slow_work_available(int vsmax)
703{
704 return !list_empty(&slow_work_queue) ||
705 (!list_empty(&vslow_work_queue) &&
706 atomic_read(&vslow_work_executing_count) < vsmax);
707}
708
709/*
710 * Worker thread dispatcher
711 */
712static int slow_work_thread(void *_data)
713{
714 int vsmax, id;
715
716 DEFINE_WAIT(wait);
717
718 set_freezable();
719 set_user_nice(current, -5);
720
721 /* allocate ourselves an ID */
722 spin_lock_irq(&slow_work_queue_lock);
723 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
724 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
725 __set_bit(id, slow_work_ids);
726 slow_work_set_thread_pid(id, current->pid);
727 spin_unlock_irq(&slow_work_queue_lock);
728
729 sprintf(current->comm, "kslowd%03u", id);
730
731 for (;;) {
732 vsmax = vslow_work_proportion;
733 vsmax *= atomic_read(&slow_work_thread_count);
734 vsmax /= 100;
735
736 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
737 TASK_INTERRUPTIBLE);
738 if (!freezing(current) &&
739 !slow_work_threads_should_exit &&
740 !slow_work_available(vsmax) &&
741 !slow_work_cull)
742 schedule();
743 finish_wait(&slow_work_thread_wq, &wait);
744
745 try_to_freeze();
746
747 vsmax = vslow_work_proportion;
748 vsmax *= atomic_read(&slow_work_thread_count);
749 vsmax /= 100;
750
751 if (slow_work_available(vsmax) && slow_work_execute(id)) {
752 cond_resched();
753 if (list_empty(&slow_work_queue) &&
754 list_empty(&vslow_work_queue) &&
755 atomic_read(&slow_work_thread_count) >
756 slow_work_min_threads)
757 slow_work_schedule_cull();
758 continue;
759 }
760
761 if (slow_work_threads_should_exit)
762 break;
763
764 if (slow_work_cull && slow_work_cull_thread())
765 break;
766 }
767
768 spin_lock_irq(&slow_work_queue_lock);
769 slow_work_set_thread_pid(id, 0);
770 __clear_bit(id, slow_work_ids);
771 spin_unlock_irq(&slow_work_queue_lock);
772
773 if (atomic_dec_and_test(&slow_work_thread_count))
774 complete_and_exit(&slow_work_last_thread_exited, 0);
775 return 0;
776}
777
778/*
779 * Handle thread cull timer expiration
780 */
781static void slow_work_cull_timeout(unsigned long data)
782{
783 slow_work_cull = true;
784 wake_up(&slow_work_thread_wq);
785}
786
787/*
788 * Start a new slow work thread
789 */
790static void slow_work_new_thread_execute(struct slow_work *work)
791{
792 struct task_struct *p;
793
794 if (slow_work_threads_should_exit)
795 return;
796
797 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
798 return;
799
800 if (!mutex_trylock(&slow_work_user_lock))
801 return;
802
803 slow_work_may_not_start_new_thread = true;
804 atomic_inc(&slow_work_thread_count);
805 p = kthread_run(slow_work_thread, NULL, "kslowd");
806 if (IS_ERR(p)) {
807 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
808 if (atomic_dec_and_test(&slow_work_thread_count))
809 BUG(); /* we're running on a slow work thread... */
810 mod_timer(&slow_work_oom_timer,
811 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
812 } else {
813 /* ratelimit the starting of new threads */
814 mod_timer(&slow_work_oom_timer, jiffies + 1);
815 }
816
817 mutex_unlock(&slow_work_user_lock);
818}
819
820static const struct slow_work_ops slow_work_new_thread_ops = {
821 .owner = THIS_MODULE,
822 .execute = slow_work_new_thread_execute,
823#ifdef CONFIG_SLOW_WORK_DEBUG
824 .desc = slow_work_new_thread_desc,
825#endif
826};
827
828/*
829 * post-OOM new thread start suppression expiration
830 */
831static void slow_work_oom_timeout(unsigned long data)
832{
833 slow_work_may_not_start_new_thread = false;
834}
835
836#ifdef CONFIG_SYSCTL
837/*
838 * Handle adjustment of the minimum number of threads
839 */
840static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
841 void __user *buffer,
842 size_t *lenp, loff_t *ppos)
843{
844 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
845 int n;
846
847 if (ret == 0) {
848 mutex_lock(&slow_work_user_lock);
849 if (slow_work_user_count > 0) {
850 /* see if we need to start or stop threads */
851 n = atomic_read(&slow_work_thread_count) -
852 slow_work_min_threads;
853
854 if (n < 0 && !slow_work_may_not_start_new_thread)
855 slow_work_enqueue(&slow_work_new_thread);
856 else if (n > 0)
857 slow_work_schedule_cull();
858 }
859 mutex_unlock(&slow_work_user_lock);
860 }
861
862 return ret;
863}
864
865/*
866 * Handle adjustment of the maximum number of threads
867 */
868static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
869 void __user *buffer,
870 size_t *lenp, loff_t *ppos)
871{
872 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
873 int n;
874
875 if (ret == 0) {
876 mutex_lock(&slow_work_user_lock);
877 if (slow_work_user_count > 0) {
878 /* see if we need to stop threads */
879 n = slow_work_max_threads -
880 atomic_read(&slow_work_thread_count);
881
882 if (n < 0)
883 slow_work_schedule_cull();
884 }
885 mutex_unlock(&slow_work_user_lock);
886 }
887
888 return ret;
889}
890#endif /* CONFIG_SYSCTL */
891
892/**
893 * slow_work_register_user - Register a user of the facility
894 * @module: The module about to make use of the facility
895 *
896 * Register a user of the facility, starting up the initial threads if there
897 * aren't any other users at this point. This will return 0 if successful, or
898 * an error if not.
899 */
900int slow_work_register_user(struct module *module)
901{
902 struct task_struct *p;
903 int loop;
904
905 mutex_lock(&slow_work_user_lock);
906
907 if (slow_work_user_count == 0) {
908 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
909 init_completion(&slow_work_last_thread_exited);
910
911 slow_work_threads_should_exit = false;
912 slow_work_init(&slow_work_new_thread,
913 &slow_work_new_thread_ops);
914 slow_work_may_not_start_new_thread = false;
915 slow_work_cull = false;
916
917 /* start the minimum number of threads */
918 for (loop = 0; loop < slow_work_min_threads; loop++) {
919 atomic_inc(&slow_work_thread_count);
920 p = kthread_run(slow_work_thread, NULL, "kslowd");
921 if (IS_ERR(p))
922 goto error;
923 }
924 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
925 }
926
927 slow_work_user_count++;
928 mutex_unlock(&slow_work_user_lock);
929 return 0;
930
931error:
932 if (atomic_dec_and_test(&slow_work_thread_count))
933 complete(&slow_work_last_thread_exited);
934 if (loop > 0) {
935 printk(KERN_ERR "Slow work thread pool:"
936 " Aborting startup on ENOMEM\n");
937 slow_work_threads_should_exit = true;
938 wake_up_all(&slow_work_thread_wq);
939 wait_for_completion(&slow_work_last_thread_exited);
940 printk(KERN_ERR "Slow work thread pool: Aborted\n");
941 }
942 mutex_unlock(&slow_work_user_lock);
943 return PTR_ERR(p);
944}
945EXPORT_SYMBOL(slow_work_register_user);
946
947/*
948 * wait for all outstanding items from the calling module to complete
949 * - note that more items may be queued whilst we're waiting
950 */
951static void slow_work_wait_for_items(struct module *module)
952{
953#ifdef CONFIG_MODULES
954 DECLARE_WAITQUEUE(myself, current);
955 struct slow_work *work;
956 int loop;
957
958 mutex_lock(&slow_work_unreg_sync_lock);
959 add_wait_queue(&slow_work_unreg_wq, &myself);
960
961 for (;;) {
962 spin_lock_irq(&slow_work_queue_lock);
963
964 /* first of all, we wait for the last queued item in each list
965 * to be processed */
966 list_for_each_entry_reverse(work, &vslow_work_queue, link) {
967 if (work->owner == module) {
968 set_current_state(TASK_UNINTERRUPTIBLE);
969 slow_work_unreg_work_item = work;
970 goto do_wait;
971 }
972 }
973 list_for_each_entry_reverse(work, &slow_work_queue, link) {
974 if (work->owner == module) {
975 set_current_state(TASK_UNINTERRUPTIBLE);
976 slow_work_unreg_work_item = work;
977 goto do_wait;
978 }
979 }
980
981 /* then we wait for the items being processed to finish */
982 slow_work_unreg_module = module;
983 smp_mb();
984 for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
985 if (slow_work_thread_processing[loop] == module)
986 goto do_wait;
987 }
988 spin_unlock_irq(&slow_work_queue_lock);
989 break; /* okay, we're done */
990
991 do_wait:
992 spin_unlock_irq(&slow_work_queue_lock);
993 schedule();
994 slow_work_unreg_work_item = NULL;
995 slow_work_unreg_module = NULL;
996 }
997
998 remove_wait_queue(&slow_work_unreg_wq, &myself);
999 mutex_unlock(&slow_work_unreg_sync_lock);
1000#endif /* CONFIG_MODULES */
1001}
1002
1003/**
1004 * slow_work_unregister_user - Unregister a user of the facility
1005 * @module: The module whose items should be cleared
1006 *
1007 * Unregister a user of the facility, killing all the threads if this was the
1008 * last one.
1009 *
1010 * This waits for all the work items belonging to the nominated module to go
1011 * away before proceeding.
1012 */
1013void slow_work_unregister_user(struct module *module)
1014{
1015 /* first of all, wait for all outstanding items from the calling module
1016 * to complete */
1017 if (module)
1018 slow_work_wait_for_items(module);
1019
1020 /* then we can actually go about shutting down the facility if need
1021 * be */
1022 mutex_lock(&slow_work_user_lock);
1023
1024 BUG_ON(slow_work_user_count <= 0);
1025
1026 slow_work_user_count--;
1027 if (slow_work_user_count == 0) {
1028 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
1029 slow_work_threads_should_exit = true;
1030 del_timer_sync(&slow_work_cull_timer);
1031 del_timer_sync(&slow_work_oom_timer);
1032 wake_up_all(&slow_work_thread_wq);
1033 wait_for_completion(&slow_work_last_thread_exited);
1034 printk(KERN_NOTICE "Slow work thread pool:"
1035 " Shut down complete\n");
1036 }
1037
1038 mutex_unlock(&slow_work_user_lock);
1039}
1040EXPORT_SYMBOL(slow_work_unregister_user);
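Putting the two registration calls together, a user of the (now removed) facility would bracket its lifetime roughly as below; the module_init/module_exit pairing is ordinary kernel idiom rather than anything this file mandates.

static int __init myfs_init(void)
{
	int ret;

	/* starts the initial kslowd threads if we are the first user */
	ret = slow_work_register_user(THIS_MODULE);
	if (ret < 0)
		return ret;
	/* ... rest of module initialisation ... */
	return 0;
}

static void __exit myfs_exit(void)
{
	/* waits for this module's outstanding items, then, if we were the
	 * last user, shuts the thread pool down */
	slow_work_unregister_user(THIS_MODULE);
}

module_init(myfs_init);
module_exit(myfs_exit);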
1041
1042/*
1043 * Initialise the slow work facility
1044 */
1045static int __init init_slow_work(void)
1046{
1047 unsigned nr_cpus = num_possible_cpus();
1048
1049 if (slow_work_max_threads < nr_cpus)
1050 slow_work_max_threads = nr_cpus;
1051#ifdef CONFIG_SYSCTL
1052 if (slow_work_max_max_threads < nr_cpus * 2)
1053 slow_work_max_max_threads = nr_cpus * 2;
1054#endif
1055#ifdef CONFIG_SLOW_WORK_DEBUG
1056 {
1057 struct dentry *dbdir;
1058
1059 dbdir = debugfs_create_dir("slow_work", NULL);
1060 if (dbdir && !IS_ERR(dbdir))
1061 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1062 NULL, &slow_work_runqueue_fops);
1063 }
1064#endif
1065 return 0;
1066}
1067
1068subsys_initcall(init_slow_work);
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
deleted file mode 100644
index a29ebd1ef41d..000000000000
--- a/kernel/slow-work.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid;
48#endif
49}
50
51static inline void slow_work_mark_time(struct slow_work *work)
52{
53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME;
55#endif
56}
57
58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{
60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work;
62#endif
63}
64
65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{
67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock);
71#endif
72}
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c970c715d3..12ed8b013e2d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
267 * 267 *
268 * Returns 0 on success, else a negative status code. 268 * Returns 0 on success, else a negative status code.
269 */ 269 */
270int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
271 int wait) 271 int wait)
272{ 272{
273 struct call_single_data d = { 273 struct call_single_data d = {
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single);
336 * 3) any other online cpu in @mask 336 * 3) any other online cpu in @mask
337 */ 337 */
338int smp_call_function_any(const struct cpumask *mask, 338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait) 339 smp_call_func_t func, void *info, int wait)
340{ 340{
341 unsigned int cpu; 341 unsigned int cpu;
342 const struct cpumask *nodemask; 342 const struct cpumask *nodemask;
@@ -365,9 +365,10 @@ call:
365EXPORT_SYMBOL_GPL(smp_call_function_any); 365EXPORT_SYMBOL_GPL(smp_call_function_any);
366 366
367/** 367/**
368 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on a specific CPU
369 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
370 * @data: Pre-allocated and setup data structure 370 * @data: Pre-allocated and setup data structure
371 * @wait: If true, wait until function has completed on specified CPU.
371 * 372 *
372 * Like smp_call_function_single(), but allow caller to pass in a 373 * Like smp_call_function_single(), but allow caller to pass in a
373 * pre-allocated data structure. Useful for embedding @data inside 374 * pre-allocated data structure. Useful for embedding @data inside
@@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
376void __smp_call_function_single(int cpu, struct call_single_data *data, 377void __smp_call_function_single(int cpu, struct call_single_data *data,
377 int wait) 378 int wait)
378{ 379{
379 csd_lock(data); 380 unsigned int this_cpu;
381 unsigned long flags;
380 382
383 this_cpu = get_cpu();
381 /* 384 /*
382 * Can deadlock when called with interrupts disabled. 385 * Can deadlock when called with interrupts disabled.
383 * We allow cpu's that are not yet online though, as no one else can 386 * We allow cpu's that are not yet online though, as no one else can
@@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
387 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() 390 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
388 && !oops_in_progress); 391 && !oops_in_progress);
389 392
390 generic_exec_single(cpu, data, wait); 393 if (cpu == this_cpu) {
394 local_irq_save(flags);
395 data->func(data->info);
396 local_irq_restore(flags);
397 } else {
398 csd_lock(data);
399 generic_exec_single(cpu, data, wait);
400 }
401 put_cpu();
391} 402}
392 403
393/** 404/**
@@ -405,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
405 * must be disabled when calling this function. 416 * must be disabled when calling this function.
406 */ 417 */
407void smp_call_function_many(const struct cpumask *mask, 418void smp_call_function_many(const struct cpumask *mask,
408 void (*func)(void *), void *info, bool wait) 419 smp_call_func_t func, void *info, bool wait)
409{ 420{
410 struct call_function_data *data; 421 struct call_function_data *data;
411 unsigned long flags; 422 unsigned long flags;
@@ -489,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many);
489 * You must not call this function with disabled interrupts or from a 500 * You must not call this function with disabled interrupts or from a
490 * hardware interrupt handler or from a bottom half handler. 501 * hardware interrupt handler or from a bottom half handler.
491 */ 502 */
492int smp_call_function(void (*func)(void *), void *info, int wait) 503int smp_call_function(smp_call_func_t func, void *info, int wait)
493{ 504{
494 preempt_disable(); 505 preempt_disable();
495 smp_call_function_many(cpu_online_mask, func, info, wait); 506 smp_call_function_many(cpu_online_mask, func, info, wait);
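The smp.c hunks above mostly switch the open-coded function-pointer parameters over to the smp_call_func_t typedef and teach __smp_call_function_single() to run the callback locally when the target is the current CPU. As a reminder of the calling convention, here is a small, purely illustrative caller (the CPU number and callback are made up):

static void record_cpu(void *info)
{
	/* runs on the target CPU, in IPI context with interrupts off */
	*(unsigned int *)info = smp_processor_id();
}

static void example_cross_call(void)
{
	unsigned int where = 0;

	/* run record_cpu() on CPU 1 and wait for it to complete */
	if (smp_call_function_single(1, record_cpu, &where, 1) == 0)
		pr_info("callback ran on cpu %u\n", where);
}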
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..18f4be0d5fe0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
67 * to the pending events, so lets the scheduler to balance 67 * to the pending events, so lets the scheduler to balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
77} 77}
78 78
79/* 79/*
80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled.
87 */
88
89/*
80 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
81 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
82 */ 92 */
83#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
84static void __local_bh_disable(unsigned long ip) 94static void __local_bh_disable(unsigned long ip, unsigned int cnt)
85{ 95{
86 unsigned long flags; 96 unsigned long flags;
87 97
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
95 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
96 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
97 */ 107 */
98 preempt_count() += SOFTIRQ_OFFSET; 108 preempt_count() += cnt;
99 /* 109 /*
100 * Were softirqs turned off above: 110 * Were softirqs turned off above:
101 */ 111 */
102 if (softirq_count() == SOFTIRQ_OFFSET) 112 if (softirq_count() == cnt)
103 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
104 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
105 115
106 if (preempt_count() == SOFTIRQ_OFFSET) 116 if (preempt_count() == cnt)
107 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
108} 118}
109#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
110static inline void __local_bh_disable(unsigned long ip) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
111{ 121{
112 add_preempt_count(SOFTIRQ_OFFSET); 122 add_preempt_count(cnt);
113 barrier(); 123 barrier();
114} 124}
115#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
116 126
117void local_bh_disable(void) 127void local_bh_disable(void)
118{ 128{
119 __local_bh_disable((unsigned long)__builtin_return_address(0)); 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET);
120} 131}
121 132
122EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
123 134
135static void __local_bh_enable(unsigned int cnt)
136{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled());
139
140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt);
143}
144
124/* 145/*
125 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
126 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
128 */ 149 */
129void _local_bh_enable(void) 150void _local_bh_enable(void)
130{ 151{
131 WARN_ON_ONCE(in_irq()); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
132 WARN_ON_ONCE(!irqs_disabled());
133
134 if (softirq_count() == SOFTIRQ_OFFSET)
135 trace_softirqs_on((unsigned long)__builtin_return_address(0));
136 sub_preempt_count(SOFTIRQ_OFFSET);
137} 153}
138 154
139EXPORT_SYMBOL(_local_bh_enable); 155EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
147 /* 163 /*
148 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
149 */ 165 */
150 if (softirq_count() == SOFTIRQ_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
151 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
152 /* 168 /*
153 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
154 * softirq processing: 170 * softirq processing:
155 */ 171 */
156 sub_preempt_count(SOFTIRQ_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
157 173
158 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
159 do_softirq(); 175 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
198 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
199 account_system_vtime(current); 215 account_system_vtime(current);
200 216
201 __local_bh_disable((unsigned long)__builtin_return_address(0)); 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET);
202 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
203 220
204 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
@@ -212,18 +229,20 @@ restart:
212 229
213 do { 230 do {
214 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
215 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
217 234
218 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
219 h->action(h); 238 h->action(h);
220 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
221 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
222 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
223 "with preempt_count %08x," 242 "with preempt_count %08x,"
224 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
225 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
226 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
227 preempt_count() = prev_count; 246 preempt_count() = prev_count;
228 } 247 }
229 248
@@ -245,7 +264,7 @@ restart:
245 lockdep_softirq_exit(); 264 lockdep_softirq_exit();
246 265
247 account_system_vtime(current); 266 account_system_vtime(current);
248 _local_bh_enable(); 267 __local_bh_enable(SOFTIRQ_OFFSET);
249} 268}
250 269
251#ifndef __ARCH_HAS_DO_SOFTIRQ 270#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +298,16 @@ void irq_enter(void)
279 298
280 rcu_irq_enter(); 299 rcu_irq_enter();
281 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (idle_cpu(cpu) && !in_interrupt()) {
282 __irq_enter(); 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt.
304 */
305 local_bh_disable();
283 tick_check_idle(cpu); 306 tick_check_idle(cpu);
284 } else 307 _local_bh_enable();
285 __irq_enter(); 308 }
309
310 __irq_enter();
286} 311}
287 312
288#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +721,7 @@ static int run_ksoftirqd(void * __bind_cpu)
696{ 721{
697 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
698 723
724 current->flags |= PF_KSOFTIRQD;
699 while (!kthread_should_stop()) { 725 while (!kthread_should_stop()) {
700 preempt_disable(); 726 preempt_disable();
701 if (!local_softirq_pending()) { 727 if (!local_softirq_pending()) {
@@ -886,17 +912,14 @@ int __init __weak early_irq_init(void)
886 return 0; 912 return 0;
887} 913}
888 914
915#ifdef CONFIG_GENERIC_HARDIRQS
889int __init __weak arch_probe_nr_irqs(void) 916int __init __weak arch_probe_nr_irqs(void)
890{ 917{
891 return 0; 918 return NR_IRQS_LEGACY;
892} 919}
893 920
894int __init __weak arch_early_irq_init(void) 921int __init __weak arch_early_irq_init(void)
895{ 922{
896 return 0; 923 return 0;
897} 924}
898 925#endif
899int __weak arch_init_chip_data(struct irq_desc *desc, int node)
900{
901 return 0;
902}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
deleted file mode 100644
index 4b493f67dcb5..000000000000
--- a/kernel/softlockup.c
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 *
6 * this code detects soft lockups: incidents where the kernel does not
7 * reschedule on a CPU for 10 seconds or more.
8 */
9#include <linux/mm.h>
10#include <linux/cpu.h>
11#include <linux/nmi.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/freezer.h>
15#include <linux/kthread.h>
16#include <linux/lockdep.h>
17#include <linux/notifier.h>
18#include <linux/module.h>
19#include <linux/sysctl.h>
20
21#include <asm/irq_regs.h>
22
23static DEFINE_SPINLOCK(print_lock);
24
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
29
30static int __read_mostly did_panic;
31int __read_mostly softlockup_thresh = 60;
32
33/*
34 * Should we panic (and reboot, if panic_timeout= is set) when a
35 * soft-lockup occurs:
36 */
37unsigned int __read_mostly softlockup_panic =
38 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
39
40static int __init softlockup_panic_setup(char *str)
41{
42 softlockup_panic = simple_strtoul(str, NULL, 0);
43
44 return 1;
45}
46__setup("softlockup_panic=", softlockup_panic_setup);
47
48static int
49softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
50{
51 did_panic = 1;
52
53 return NOTIFY_DONE;
54}
55
56static struct notifier_block panic_block = {
57 .notifier_call = softlock_panic,
58};
59
60/*
61 * Returns seconds, approximately. We don't need nanosecond
62 * resolution, and we don't need to waste time with a big divide when
63 * 2^30ns == 1.074s.
64 */
65static unsigned long get_timestamp(int this_cpu)
66{
67 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
68}
69
70static void __touch_softlockup_watchdog(void)
71{
72 int this_cpu = raw_smp_processor_id();
73
74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
75}
76
77void touch_softlockup_watchdog(void)
78{
79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
80}
81EXPORT_SYMBOL(touch_softlockup_watchdog);
82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
89void touch_all_softlockup_watchdogs(void)
90{
91 int cpu;
92
93 /* Cause each CPU to re-update its timestamp rather than complain */
94 for_each_online_cpu(cpu)
95 per_cpu(softlockup_touch_ts, cpu) = 0;
96}
97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
98
99int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
100 void __user *buffer,
101 size_t *lenp, loff_t *ppos)
102{
103 touch_all_softlockup_watchdogs();
104 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
105}
106
107/*
108 * This callback runs from the timer interrupt, and checks
109 * whether the watchdog thread has hung or not:
110 */
111void softlockup_tick(void)
112{
113 int this_cpu = smp_processor_id();
114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
115 unsigned long print_ts;
116 struct pt_regs *regs = get_irq_regs();
117 unsigned long now;
118
119 /* Is detection switched off? */
120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
121 /* Be sure we don't false trigger if switched back on */
122 if (touch_ts)
123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
124 return;
125 }
126
127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
136 __touch_softlockup_watchdog();
137 return;
138 }
139
140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
141
142 /* report at most once a second */
143 if (print_ts == touch_ts || did_panic)
144 return;
145
146 /* do not print during early bootup: */
147 if (unlikely(system_state != SYSTEM_RUNNING)) {
148 __touch_softlockup_watchdog();
149 return;
150 }
151
152 now = get_timestamp(this_cpu);
153
154 /*
155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan.
157 */
158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160
161 /* Warn about unreasonable delays: */
162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return;
164
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
166
167 spin_lock(&print_lock);
168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
169 this_cpu, now - touch_ts,
170 current->comm, task_pid_nr(current));
171 print_modules();
172 print_irqtrace_events(current);
173 if (regs)
174 show_regs(regs);
175 else
176 dump_stack();
177 spin_unlock(&print_lock);
178
179 if (softlockup_panic)
180 panic("softlockup: hung tasks");
181}
182
183/*
184 * The watchdog thread - runs every second and touches the timestamp.
185 */
186static int watchdog(void *__bind_cpu)
187{
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
189
190 sched_setscheduler(current, SCHED_FIFO, &param);
191
192 /* initialize timestamp */
193 __touch_softlockup_watchdog();
194
195 set_current_state(TASK_INTERRUPTIBLE);
196 /*
197 * Run briefly once per second to reset the softlockup timestamp.
198 * If this gets delayed for more than 60 seconds then the
199 * debug-printout triggers in softlockup_tick().
200 */
201 while (!kthread_should_stop()) {
202 __touch_softlockup_watchdog();
203 schedule();
204
205 if (kthread_should_stop())
206 break;
207
208 set_current_state(TASK_INTERRUPTIBLE);
209 }
210 __set_current_state(TASK_RUNNING);
211
212 return 0;
213}
214
215/*
216 * Create/destroy watchdog threads as CPUs come and go:
217 */
218static int __cpuinit
219cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
220{
221 int hotcpu = (unsigned long)hcpu;
222 struct task_struct *p;
223
224 switch (action) {
225 case CPU_UP_PREPARE:
226 case CPU_UP_PREPARE_FROZEN:
227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
229 if (IS_ERR(p)) {
230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
231 return NOTIFY_BAD;
232 }
233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
234 per_cpu(softlockup_watchdog, hotcpu) = p;
235 kthread_bind(p, hotcpu);
236 break;
237 case CPU_ONLINE:
238 case CPU_ONLINE_FROZEN:
239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
240 break;
241#ifdef CONFIG_HOTPLUG_CPU
242 case CPU_UP_CANCELED:
243 case CPU_UP_CANCELED_FROZEN:
244 if (!per_cpu(softlockup_watchdog, hotcpu))
245 break;
246 /* Unbind so it can run. Fall thru. */
247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
248 cpumask_any(cpu_online_mask));
249 case CPU_DEAD:
250 case CPU_DEAD_FROZEN:
251 p = per_cpu(softlockup_watchdog, hotcpu);
252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
253 kthread_stop(p);
254 break;
255#endif /* CONFIG_HOTPLUG_CPU */
256 }
257 return NOTIFY_OK;
258}
259
260static struct notifier_block __cpuinitdata cpu_nfb = {
261 .notifier_call = cpu_callback
262};
263
264static int __initdata nosoftlockup;
265
266static int __init nosoftlockup_setup(char *str)
267{
268 nosoftlockup = 1;
269 return 1;
270}
271__setup("nosoftlockup", nosoftlockup_setup);
272
273static int __init spawn_softlockup_task(void)
274{
275 void *cpu = (void *)(long)smp_processor_id();
276 int err;
277
278 if (nosoftlockup)
279 return 0;
280
281 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
282 if (err == NOTIFY_BAD) {
283 BUG();
284 return 1;
285 }
286 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
287 register_cpu_notifier(&cpu_nfb);
288
289 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
290
291 return 0;
292}
293early_initcall(spawn_softlockup_task);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd509..c71e07500536 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
46int __init_srcu_struct(struct srcu_struct *sp, const char *name, 46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key) 47 struct lock_class_key *key)
48{ 48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */ 49 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp)); 50 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0); 51 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp); 52 return init_srcu_struct_fields(sp);
55} 53}
56EXPORT_SYMBOL_GPL(__init_srcu_struct); 54EXPORT_SYMBOL_GPL(__init_srcu_struct);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 70f8d90331e9..2df820b03beb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,9 +35,9 @@ struct cpu_stop_done {
35/* the actual stopper, one per every possible cpu, enabled on online cpus */ 35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper { 36struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */
38 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */ 40 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
@@ -262,7 +262,7 @@ repeat:
262 cpu_stop_fn_t fn = work->fn; 262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg; 263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done; 264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN]; 265 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
266 266
267 __set_current_state(TASK_RUNNING); 267 __set_current_state(TASK_RUNNING);
268 268
@@ -287,11 +287,12 @@ repeat:
287 goto repeat; 287 goto repeat;
288} 288}
289 289
290extern void sched_set_stop_task(int cpu, struct task_struct *stop);
291
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 292/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 293static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu) 294 unsigned long action, void *hcpu)
293{ 295{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 297 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p; 298 struct task_struct *p;
@@ -303,14 +304,14 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu); 305 cpu);
305 if (IS_ERR(p)) 306 if (IS_ERR(p))
306 return NOTIFY_BAD; 307 return notifier_from_errno(PTR_ERR(p));
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p);
309 stopper->thread = p; 311 stopper->thread = p;
310 break; 312 break;
311 313
312 case CPU_ONLINE: 314 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */ 315 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread); 316 wake_up_process(stopper->thread);
316 /* mark enabled */ 317 /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
325 { 326 {
326 struct cpu_stop_work *work; 327 struct cpu_stop_work *work;
327 328
329 sched_set_stop_task(cpu, NULL);
328 /* kill the stopper */ 330 /* kill the stopper */
329 kthread_stop(stopper->thread); 331 kthread_stop(stopper->thread);
330 /* drain remaining works */ 332 /* drain remaining works */
@@ -370,7 +372,7 @@ static int __init cpu_stop_init(void)
370 /* start one for the boot cpu */ 372 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, 373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu); 374 bcpu);
373 BUG_ON(err == NOTIFY_BAD); 375 BUG_ON(err != NOTIFY_OK);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier); 377 register_cpu_notifier(&cpu_stop_cpu_notifier);
376 378
diff --git a/kernel/sys.c b/kernel/sys.c
index e83ddbbaf89d..7f5a0cd296a9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
931 pgid = pid; 931 pgid = pid;
932 if (pgid < 0) 932 if (pgid < 0)
933 return -EINVAL; 933 return -EINVAL;
934 rcu_read_lock();
934 935
935 /* From this point forward we keep holding onto the tasklist lock 936 /* From this point forward we keep holding onto the tasklist lock
936 * so that our parent does not change from under us. -DaveM 937 * so that our parent does not change from under us. -DaveM
@@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
984out: 985out:
985 /* All paths lead to here, thus we are safe. -DaveM */ 986 /* All paths lead to here, thus we are safe. -DaveM */
986 write_unlock_irq(&tasklist_lock); 987 write_unlock_irq(&tasklist_lock);
988 rcu_read_unlock();
987 return err; 989 return err;
988} 990}
989 991
@@ -1236,15 +1238,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1236 1238
1237SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1239SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1238{ 1240{
1239 if (resource >= RLIM_NLIMITS) 1241 struct rlimit value;
1240 return -EINVAL; 1242 int ret;
1241 else { 1243
1242 struct rlimit value; 1244 ret = do_prlimit(current, resource, NULL, &value);
1243 task_lock(current->group_leader); 1245 if (!ret)
1244 value = current->signal->rlim[resource]; 1246 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1245 task_unlock(current->group_leader); 1247
1246 return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1248 return ret;
1247 }
1248} 1249}
1249 1250
1250#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1251#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
@@ -1272,44 +1273,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1272 1273
1273#endif 1274#endif
1274 1275
1275SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1276static inline bool rlim64_is_infinity(__u64 rlim64)
1276{ 1277{
1277 struct rlimit new_rlim, *old_rlim; 1278#if BITS_PER_LONG < 64
1278 int retval; 1279 return rlim64 >= ULONG_MAX;
1280#else
1281 return rlim64 == RLIM64_INFINITY;
1282#endif
1283}
1284
1285static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
1286{
1287 if (rlim->rlim_cur == RLIM_INFINITY)
1288 rlim64->rlim_cur = RLIM64_INFINITY;
1289 else
1290 rlim64->rlim_cur = rlim->rlim_cur;
1291 if (rlim->rlim_max == RLIM_INFINITY)
1292 rlim64->rlim_max = RLIM64_INFINITY;
1293 else
1294 rlim64->rlim_max = rlim->rlim_max;
1295}
1296
1297static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
1298{
1299 if (rlim64_is_infinity(rlim64->rlim_cur))
1300 rlim->rlim_cur = RLIM_INFINITY;
1301 else
1302 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
1303 if (rlim64_is_infinity(rlim64->rlim_max))
1304 rlim->rlim_max = RLIM_INFINITY;
1305 else
1306 rlim->rlim_max = (unsigned long)rlim64->rlim_max;
1307}
1308
1309/* make sure you are allowed to change @tsk limits before calling this */
1310int do_prlimit(struct task_struct *tsk, unsigned int resource,
1311 struct rlimit *new_rlim, struct rlimit *old_rlim)
1312{
1313 struct rlimit *rlim;
1314 int retval = 0;
1279 1315
1280 if (resource >= RLIM_NLIMITS) 1316 if (resource >= RLIM_NLIMITS)
1281 return -EINVAL; 1317 return -EINVAL;
1282 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1318 if (new_rlim) {
1283 return -EFAULT; 1319 if (new_rlim->rlim_cur > new_rlim->rlim_max)
1284 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1320 return -EINVAL;
1285 return -EINVAL; 1321 if (resource == RLIMIT_NOFILE &&
1286 old_rlim = current->signal->rlim + resource; 1322 new_rlim->rlim_max > sysctl_nr_open)
1287 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1323 return -EPERM;
1288 !capable(CAP_SYS_RESOURCE))
1289 return -EPERM;
1290 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1291 return -EPERM;
1292
1293 retval = security_task_setrlimit(resource, &new_rlim);
1294 if (retval)
1295 return retval;
1296
1297 if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
1298 /*
1299 * The caller is asking for an immediate RLIMIT_CPU
1300 * expiry. But we use the zero value to mean "it was
1301 * never set". So let's cheat and make it one second
1302 * instead
1303 */
1304 new_rlim.rlim_cur = 1;
1305 } 1324 }
1306 1325
1307 task_lock(current->group_leader); 1326 /* protect tsk->signal and tsk->sighand from disappearing */
1308 *old_rlim = new_rlim; 1327 read_lock(&tasklist_lock);
1309 task_unlock(current->group_leader); 1328 if (!tsk->sighand) {
1310 1329 retval = -ESRCH;
1311 if (resource != RLIMIT_CPU)
1312 goto out; 1330 goto out;
1331 }
1332
1333 rlim = tsk->signal->rlim + resource;
1334 task_lock(tsk->group_leader);
1335 if (new_rlim) {
1336 if (new_rlim->rlim_max > rlim->rlim_max &&
1337 !capable(CAP_SYS_RESOURCE))
1338 retval = -EPERM;
1339 if (!retval)
1340 retval = security_task_setrlimit(tsk->group_leader,
1341 resource, new_rlim);
1342 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1343 /*
1344 * The caller is asking for an immediate RLIMIT_CPU
1345 * expiry. But we use the zero value to mean "it was
1346 * never set". So let's cheat and make it one second
1347 * instead
1348 */
1349 new_rlim->rlim_cur = 1;
1350 }
1351 }
1352 if (!retval) {
1353 if (old_rlim)
1354 *old_rlim = *rlim;
1355 if (new_rlim)
1356 *rlim = *new_rlim;
1357 }
1358 task_unlock(tsk->group_leader);
1313 1359
1314 /* 1360 /*
1315 * RLIMIT_CPU handling. Note that the kernel fails to return an error 1361 * RLIMIT_CPU handling. Note that the kernel fails to return an error
@@ -1317,14 +1363,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1317 * very long-standing error, and fixing it now risks breakage of 1363 * very long-standing error, and fixing it now risks breakage of
1318 * applications, so we live with it 1364 * applications, so we live with it
1319 */ 1365 */
1320 if (new_rlim.rlim_cur == RLIM_INFINITY) 1366 if (!retval && new_rlim && resource == RLIMIT_CPU &&
1321 goto out; 1367 new_rlim->rlim_cur != RLIM_INFINITY)
1322 1368 update_rlimit_cpu(tsk, new_rlim->rlim_cur);
1323 update_rlimit_cpu(new_rlim.rlim_cur);
1324out: 1369out:
1370 read_unlock(&tasklist_lock);
1371 return retval;
1372}
1373
1374/* rcu lock must be held */
1375static int check_prlimit_permission(struct task_struct *task)
1376{
1377 const struct cred *cred = current_cred(), *tcred;
1378
1379 tcred = __task_cred(task);
1380 if ((cred->uid != tcred->euid ||
1381 cred->uid != tcred->suid ||
1382 cred->uid != tcred->uid ||
1383 cred->gid != tcred->egid ||
1384 cred->gid != tcred->sgid ||
1385 cred->gid != tcred->gid) &&
1386 !capable(CAP_SYS_RESOURCE)) {
1387 return -EPERM;
1388 }
1389
1325 return 0; 1390 return 0;
1326} 1391}
1327 1392
1393SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1394 const struct rlimit64 __user *, new_rlim,
1395 struct rlimit64 __user *, old_rlim)
1396{
1397 struct rlimit64 old64, new64;
1398 struct rlimit old, new;
1399 struct task_struct *tsk;
1400 int ret;
1401
1402 if (new_rlim) {
1403 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1404 return -EFAULT;
1405 rlim64_to_rlim(&new64, &new);
1406 }
1407
1408 rcu_read_lock();
1409 tsk = pid ? find_task_by_vpid(pid) : current;
1410 if (!tsk) {
1411 rcu_read_unlock();
1412 return -ESRCH;
1413 }
1414 ret = check_prlimit_permission(tsk);
1415 if (ret) {
1416 rcu_read_unlock();
1417 return ret;
1418 }
1419 get_task_struct(tsk);
1420 rcu_read_unlock();
1421
1422 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
1423 old_rlim ? &old : NULL);
1424
1425 if (!ret && old_rlim) {
1426 rlim_to_rlim64(&old, &old64);
1427 if (copy_to_user(old_rlim, &old64, sizeof(old64)))
1428 ret = -EFAULT;
1429 }
1430
1431 put_task_struct(tsk);
1432 return ret;
1433}
1434
1435SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1436{
1437 struct rlimit new_rlim;
1438
1439 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1440 return -EFAULT;
1441 return do_prlimit(current, resource, &new_rlim, NULL);
1442}
1443
1328/* 1444/*
1329 * It would make sense to put struct rusage in the task_struct, 1445 * It would make sense to put struct rusage in the task_struct,
1330 * except that would make the task_struct be *really big*. After 1446 * except that would make the task_struct be *really big*. After
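
The kernel/sys.c hunks above replace the old setrlimit-only path with do_prlimit(), which can read and write another task's limits under tasklist_lock, and expose it through the new prlimit64 syscall. Below is a minimal userspace sketch of calling it; this is illustrative only and assumes headers that already define SYS_prlimit64 (a glibc prlimit() wrapper came later), a reachable target pid of 1234, and a local struct whose layout mirrors the kernel's struct rlimit64 (two 64-bit fields). Passing NULL for the new-limit pointer turns the call into a pure query.

    #include <stdio.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <sys/resource.h>

    /* same layout as the kernel's struct rlimit64 */
    struct rlimit64_sketch {
            uint64_t rlim_cur;
            uint64_t rlim_max;
    };

    int main(void)
    {
            struct rlimit64_sketch newlim = { 4096, 8192 };  /* soft, hard */
            struct rlimit64_sketch oldlim;
            pid_t target = 1234;                             /* illustrative pid */

            /* set RLIMIT_NOFILE on the target and read back its old value */
            if (syscall(SYS_prlimit64, target, RLIMIT_NOFILE, &newlim, &oldlim)) {
                    perror("prlimit64");
                    return 1;
            }
            printf("previous soft=%llu hard=%llu\n",
                   (unsigned long long)oldlim.rlim_cur,
                   (unsigned long long)oldlim.rlim_max);
            return 0;
    }
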
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..c782fe9924c7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -50,6 +50,7 @@ cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 51cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 52cond_syscall(compat_sys_recvmsg);
53cond_syscall(compat_sys_recv);
53cond_syscall(compat_sys_recvfrom); 54cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg); 55cond_syscall(compat_sys_recvmmsg);
55cond_syscall(sys_socketcall); 56cond_syscall(sys_socketcall);
@@ -181,3 +182,7 @@ cond_syscall(sys_eventfd2);
181 182
182/* performance counters: */ 183/* performance counters: */
183cond_syscall(sys_perf_event_open); 184cond_syscall(sys_perf_event_open);
185
186/* fanotify! */
187cond_syscall(sys_fanotify_init);
188cond_syscall(sys_fanotify_mark);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d24f761f4876..c33a1edb799f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,16 +44,17 @@
44#include <linux/times.h> 44#include <linux/times.h>
45#include <linux/limits.h> 45#include <linux/limits.h>
46#include <linux/dcache.h> 46#include <linux/dcache.h>
47#include <linux/dnotify.h>
47#include <linux/syscalls.h> 48#include <linux/syscalls.h>
48#include <linux/vmstat.h> 49#include <linux/vmstat.h>
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/acpi.h> 51#include <linux/acpi.h>
51#include <linux/reboot.h> 52#include <linux/reboot.h>
52#include <linux/ftrace.h> 53#include <linux/ftrace.h>
53#include <linux/slow-work.h>
54#include <linux/perf_event.h> 54#include <linux/perf_event.h>
55#include <linux/kprobes.h> 55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
57 58
58#include <asm/uaccess.h> 59#include <asm/uaccess.h>
59#include <asm/processor.h> 60#include <asm/processor.h>
@@ -76,15 +77,16 @@
76#include <scsi/sg.h> 77#include <scsi/sg.h>
77#endif 78#endif
78 79
80#ifdef CONFIG_LOCKUP_DETECTOR
81#include <linux/nmi.h>
82#endif
83
79 84
80#if defined(CONFIG_SYSCTL) 85#if defined(CONFIG_SYSCTL)
81 86
82/* External variables not in a header file. */ 87/* External variables not in a header file. */
83extern int sysctl_overcommit_memory; 88extern int sysctl_overcommit_memory;
84extern int sysctl_overcommit_ratio; 89extern int sysctl_overcommit_ratio;
85extern int sysctl_panic_on_oom;
86extern int sysctl_oom_kill_allocating_task;
87extern int sysctl_oom_dump_tasks;
88extern int max_threads; 90extern int max_threads;
89extern int core_uses_pid; 91extern int core_uses_pid;
90extern int suid_dumpable; 92extern int suid_dumpable;
@@ -106,7 +108,7 @@ extern int blk_iopoll_enabled;
106#endif 108#endif
107 109
108/* Constants used for minimum and maximum */ 110/* Constants used for minimum and maximum */
109#ifdef CONFIG_DETECT_SOFTLOCKUP 111#ifdef CONFIG_LOCKUP_DETECTOR
110static int sixty = 60; 112static int sixty = 60;
111static int neg_one = -1; 113static int neg_one = -1;
112#endif 114#endif
@@ -130,6 +132,9 @@ static int min_percpu_pagelist_fract = 8;
130 132
131static int ngroups_max = NGROUPS_MAX; 133static int ngroups_max = NGROUPS_MAX;
132 134
135#ifdef CONFIG_INOTIFY_USER
136#include <linux/inotify.h>
137#endif
133#ifdef CONFIG_SPARC 138#ifdef CONFIG_SPARC
134#include <asm/system.h> 139#include <asm/system.h>
135#endif 140#endif
@@ -156,8 +161,6 @@ extern int no_unaligned_warning;
156extern int unaligned_dump_stack; 161extern int unaligned_dump_stack;
157#endif 162#endif
158 163
159extern struct ratelimit_state printk_ratelimit_state;
160
161#ifdef CONFIG_PROC_SYSCTL 164#ifdef CONFIG_PROC_SYSCTL
162static int proc_do_cad_pid(struct ctl_table *table, int write, 165static int proc_do_cad_pid(struct ctl_table *table, int write,
163 void __user *buffer, size_t *lenp, loff_t *ppos); 166 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -206,9 +209,6 @@ static struct ctl_table fs_table[];
206static struct ctl_table debug_table[]; 209static struct ctl_table debug_table[];
207static struct ctl_table dev_table[]; 210static struct ctl_table dev_table[];
208extern struct ctl_table random_table[]; 211extern struct ctl_table random_table[];
209#ifdef CONFIG_INOTIFY_USER
210extern struct ctl_table inotify_table[];
211#endif
212#ifdef CONFIG_EPOLL 212#ifdef CONFIG_EPOLL
213extern struct ctl_table epoll_table[]; 213extern struct ctl_table epoll_table[];
214#endif 214#endif
@@ -562,7 +562,7 @@ static struct ctl_table kern_table[] = {
562 .extra2 = &one, 562 .extra2 = &one,
563 }, 563 },
564#endif 564#endif
565#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 565#ifdef CONFIG_HOTPLUG
566 { 566 {
567 .procname = "hotplug", 567 .procname = "hotplug",
568 .data = &uevent_helper, 568 .data = &uevent_helper,
@@ -710,7 +710,34 @@ static struct ctl_table kern_table[] = {
710 .mode = 0444, 710 .mode = 0444,
711 .proc_handler = proc_dointvec, 711 .proc_handler = proc_dointvec,
712 }, 712 },
713#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 713#if defined(CONFIG_LOCKUP_DETECTOR)
714 {
715 .procname = "watchdog",
716 .data = &watchdog_enabled,
717 .maxlen = sizeof (int),
718 .mode = 0644,
719 .proc_handler = proc_dowatchdog_enabled,
720 },
721 {
722 .procname = "watchdog_thresh",
723 .data = &softlockup_thresh,
724 .maxlen = sizeof(int),
725 .mode = 0644,
726 .proc_handler = proc_dowatchdog_thresh,
727 .extra1 = &neg_one,
728 .extra2 = &sixty,
729 },
730 {
731 .procname = "softlockup_panic",
732 .data = &softlockup_panic,
733 .maxlen = sizeof(int),
734 .mode = 0644,
735 .proc_handler = proc_dointvec_minmax,
736 .extra1 = &zero,
737 .extra2 = &one,
738 },
739#endif
740#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
714 { 741 {
715 .procname = "unknown_nmi_panic", 742 .procname = "unknown_nmi_panic",
716 .data = &unknown_nmi_panic, 743 .data = &unknown_nmi_panic,
@@ -813,26 +840,6 @@ static struct ctl_table kern_table[] = {
813 .proc_handler = proc_dointvec, 840 .proc_handler = proc_dointvec,
814 }, 841 },
815#endif 842#endif
816#ifdef CONFIG_DETECT_SOFTLOCKUP
817 {
818 .procname = "softlockup_panic",
819 .data = &softlockup_panic,
820 .maxlen = sizeof(int),
821 .mode = 0644,
822 .proc_handler = proc_dointvec_minmax,
823 .extra1 = &zero,
824 .extra2 = &one,
825 },
826 {
827 .procname = "softlockup_thresh",
828 .data = &softlockup_thresh,
829 .maxlen = sizeof(int),
830 .mode = 0644,
831 .proc_handler = proc_dosoftlockup_thresh,
832 .extra1 = &neg_one,
833 .extra2 = &sixty,
834 },
835#endif
836#ifdef CONFIG_DETECT_HUNG_TASK 843#ifdef CONFIG_DETECT_HUNG_TASK
837 { 844 {
838 .procname = "hung_task_panic", 845 .procname = "hung_task_panic",
@@ -906,13 +913,6 @@ static struct ctl_table kern_table[] = {
906 .proc_handler = proc_dointvec, 913 .proc_handler = proc_dointvec,
907 }, 914 },
908#endif 915#endif
909#ifdef CONFIG_SLOW_WORK
910 {
911 .procname = "slow-work",
912 .mode = 0555,
913 .child = slow_work_sysctls,
914 },
915#endif
916#ifdef CONFIG_PERF_EVENTS 916#ifdef CONFIG_PERF_EVENTS
917 { 917 {
918 .procname = "perf_event_paranoid", 918 .procname = "perf_event_paranoid",
@@ -1338,28 +1338,28 @@ static struct ctl_table fs_table[] = {
1338 .data = &inodes_stat, 1338 .data = &inodes_stat,
1339 .maxlen = 2*sizeof(int), 1339 .maxlen = 2*sizeof(int),
1340 .mode = 0444, 1340 .mode = 0444,
1341 .proc_handler = proc_dointvec, 1341 .proc_handler = proc_nr_inodes,
1342 }, 1342 },
1343 { 1343 {
1344 .procname = "inode-state", 1344 .procname = "inode-state",
1345 .data = &inodes_stat, 1345 .data = &inodes_stat,
1346 .maxlen = 7*sizeof(int), 1346 .maxlen = 7*sizeof(int),
1347 .mode = 0444, 1347 .mode = 0444,
1348 .proc_handler = proc_dointvec, 1348 .proc_handler = proc_nr_inodes,
1349 }, 1349 },
1350 { 1350 {
1351 .procname = "file-nr", 1351 .procname = "file-nr",
1352 .data = &files_stat, 1352 .data = &files_stat,
1353 .maxlen = 3*sizeof(int), 1353 .maxlen = sizeof(files_stat),
1354 .mode = 0444, 1354 .mode = 0444,
1355 .proc_handler = proc_nr_files, 1355 .proc_handler = proc_nr_files,
1356 }, 1356 },
1357 { 1357 {
1358 .procname = "file-max", 1358 .procname = "file-max",
1359 .data = &files_stat.max_files, 1359 .data = &files_stat.max_files,
1360 .maxlen = sizeof(int), 1360 .maxlen = sizeof(files_stat.max_files),
1361 .mode = 0644, 1361 .mode = 0644,
1362 .proc_handler = proc_dointvec, 1362 .proc_handler = proc_doulongvec_minmax,
1363 }, 1363 },
1364 { 1364 {
1365 .procname = "nr_open", 1365 .procname = "nr_open",
@@ -1375,7 +1375,7 @@ static struct ctl_table fs_table[] = {
1375 .data = &dentry_stat, 1375 .data = &dentry_stat,
1376 .maxlen = 6*sizeof(int), 1376 .maxlen = 6*sizeof(int),
1377 .mode = 0444, 1377 .mode = 0444,
1378 .proc_handler = proc_dointvec, 1378 .proc_handler = proc_nr_dentry,
1379 }, 1379 },
1380 { 1380 {
1381 .procname = "overflowuid", 1381 .procname = "overflowuid",
@@ -1711,10 +1711,7 @@ static __init int sysctl_init(void)
1711{ 1711{
1712 sysctl_set_parent(NULL, root_table); 1712 sysctl_set_parent(NULL, root_table);
1713#ifdef CONFIG_SYSCTL_SYSCALL_CHECK 1713#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1714 { 1714 sysctl_check_table(current->nsproxy, root_table);
1715 int err;
1716 err = sysctl_check_table(current->nsproxy, root_table);
1717 }
1718#endif 1715#endif
1719 return 0; 1716 return 0;
1720} 1717}
@@ -2486,7 +2483,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2486 kbuf[left] = 0; 2483 kbuf[left] = 0;
2487 } 2484 }
2488 2485
2489 for (; left && vleft--; i++, min++, max++, first=0) { 2486 for (; left && vleft--; i++, first = 0) {
2490 unsigned long val; 2487 unsigned long val;
2491 2488
2492 if (write) { 2489 if (write) {
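
In the fs_table hunk above, file-max switches from an int handler to proc_doulongvec_minmax with maxlen sized to the unsigned long it actually exports, so large values are no longer truncated to int. A minimal sketch of wiring up a tunable the same way from a module follows; every name here (the "example" directory, the variable, the bounds) is hypothetical, and the extra1/extra2 pointers must be unsigned long because that is what proc_doulongvec_minmax compares against.

    #include <linux/module.h>
    #include <linux/sysctl.h>

    static unsigned long example_max = 1024;         /* hypothetical tunable */
    static unsigned long example_lo;                 /* lower bound: 0 */
    static unsigned long example_hi = 1 << 20;       /* upper bound */

    static struct ctl_table example_table[] = {
            {
                    .procname     = "example-max",
                    .data         = &example_max,
                    .maxlen       = sizeof(example_max),  /* unsigned long, not int */
                    .mode         = 0644,
                    .proc_handler = proc_doulongvec_minmax,
                    .extra1       = &example_lo,
                    .extra2       = &example_hi,
            },
            { }
    };

    static struct ctl_table example_dir[] = {
            {
                    .procname = "example",
                    .mode     = 0555,
                    .child    = example_table,
            },
            { }
    };

    static struct ctl_table_header *example_header;

    static int __init example_sysctl_init(void)
    {
            example_header = register_sysctl_table(example_dir);
            return example_header ? 0 : -ENOMEM;
    }

    static void __exit example_sysctl_exit(void)
    {
            unregister_sysctl_table(example_header);
    }

    module_init(example_sysctl_init);
    module_exit(example_sysctl_exit);
    MODULE_LICENSE("GPL");
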
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 04cdcf72c827..10b90d8a03c4 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
143 if (!table->maxlen) 143 if (!table->maxlen)
144 set_fail(&fail, table, "No maxlen"); 144 set_fail(&fail, table, "No maxlen");
145 } 145 }
146 if ((table->proc_handler == proc_doulongvec_minmax) ||
147 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
148 if (table->maxlen > sizeof (unsigned long)) {
149 if (!table->extra1)
150 set_fail(&fail, table, "No min");
151 if (!table->extra2)
152 set_fail(&fail, table, "No max");
153 }
154 }
155#ifdef CONFIG_PROC_SYSCTL 146#ifdef CONFIG_PROC_SYSCTL
156 if (table->procname && !table->proc_handler) 147 if (table->procname && !table->proc_handler)
157 set_fail(&fail, table, "No proc_handler"); 148 set_fail(&fail, table, "No proc_handler");
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..c8231fb15708 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 175 up_write(&listeners->sem);
176} 176}
177 177
178static int fill_pid(pid_t pid, struct task_struct *tsk, 178static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct taskstats *stats)
180{ 179{
181 int rc = 0;
182
183 if (!tsk) {
184 rcu_read_lock();
185 tsk = find_task_by_vpid(pid);
186 if (tsk)
187 get_task_struct(tsk);
188 rcu_read_unlock();
189 if (!tsk)
190 return -ESRCH;
191 } else
192 get_task_struct(tsk);
193
194 memset(stats, 0, sizeof(*stats)); 180 memset(stats, 0, sizeof(*stats));
195 /* 181 /*
196 * Each accounting subsystem adds calls to its functions to 182 * Each accounting subsystem adds calls to its functions to
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
209 195
210 /* fill in extended acct fields */ 196 /* fill in extended acct fields */
211 xacct_add_tsk(stats, tsk); 197 xacct_add_tsk(stats, tsk);
198}
212 199
213 /* Define err: label here if needed */ 200static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
214 put_task_struct(tsk); 201{
215 return rc; 202 struct task_struct *tsk;
216 203
204 rcu_read_lock();
205 tsk = find_task_by_vpid(pid);
206 if (tsk)
207 get_task_struct(tsk);
208 rcu_read_unlock();
209 if (!tsk)
210 return -ESRCH;
211 fill_stats(tsk, stats);
212 put_task_struct(tsk);
213 return 0;
217} 214}
218 215
219static int fill_tgid(pid_t tgid, struct task_struct *first, 216static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
220 struct taskstats *stats)
221{ 217{
222 struct task_struct *tsk; 218 struct task_struct *tsk, *first;
223 unsigned long flags; 219 unsigned long flags;
224 int rc = -ESRCH; 220 int rc = -ESRCH;
225 221
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 * leaders who are already counted with the dead tasks 224 * leaders who are already counted with the dead tasks
229 */ 225 */
230 rcu_read_lock(); 226 rcu_read_lock();
231 if (!first) 227 first = find_task_by_vpid(tgid);
232 first = find_task_by_vpid(tgid);
233 228
234 if (!first || !lock_task_sighand(first, &flags)) 229 if (!first || !lock_task_sighand(first, &flags))
235 goto out; 230 goto out;
@@ -268,7 +263,6 @@ out:
268 return rc; 263 return rc;
269} 264}
270 265
271
272static void fill_tgid_exit(struct task_struct *tsk) 266static void fill_tgid_exit(struct task_struct *tsk)
273{ 267{
274 unsigned long flags; 268 unsigned long flags;
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
360 struct nlattr *na, *ret; 354 struct nlattr *na, *ret;
361 int aggr; 355 int aggr;
362 356
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 363 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 364 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 365 : TASKSTATS_TYPE_AGGR_TGID;
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
367 na = nla_nest_start(skb, aggr); 367 na = nla_nest_start(skb, aggr);
368 if (!na) 368 if (!na)
369 goto err; 369 goto err;
370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 370 if (nla_put(skb, type, pid_size, pids) < 0)
371 goto err; 371 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 373 if (!ret)
@@ -424,39 +424,46 @@ err:
424 return rc; 424 return rc;
425} 425}
426 426
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 427static int cmd_attr_register_cpumask(struct genl_info *info)
428{ 428{
429 int rc;
430 struct sk_buff *rep_skb;
431 struct taskstats *stats;
432 size_t size;
433 cpumask_var_t mask; 429 cpumask_var_t mask;
430 int rc;
434 431
435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 432 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436 return -ENOMEM; 433 return -ENOMEM;
437
438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 434 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439 if (rc < 0) 435 if (rc < 0)
440 goto free_return_rc; 436 goto out;
441 if (rc == 0) { 437 rc = add_del_listener(info->snd_pid, mask, REGISTER);
442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 438out:
443 goto free_return_rc; 439 free_cpumask_var(mask);
444 } 440 return rc;
441}
442
443static int cmd_attr_deregister_cpumask(struct genl_info *info)
444{
445 cpumask_var_t mask;
446 int rc;
445 447
448 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
449 return -ENOMEM;
446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 450 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447 if (rc < 0) 451 if (rc < 0)
448 goto free_return_rc; 452 goto out;
449 if (rc == 0) { 453 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 454out:
451free_return_rc:
452 free_cpumask_var(mask);
453 return rc;
454 }
455 free_cpumask_var(mask); 455 free_cpumask_var(mask);
456 return rc;
457}
458
459static int cmd_attr_pid(struct genl_info *info)
460{
461 struct taskstats *stats;
462 struct sk_buff *rep_skb;
463 size_t size;
464 u32 pid;
465 int rc;
456 466
457 /*
458 * Size includes space for nested attributes
459 */
460 size = nla_total_size(sizeof(u32)) + 467 size = nla_total_size(sizeof(u32)) +
461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
462 469
@@ -465,33 +472,64 @@ free_return_rc:
465 return rc; 472 return rc;
466 473
467 rc = -EINVAL; 474 rc = -EINVAL;
468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 475 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 476 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 477 if (!stats)
471 if (!stats) 478 goto err;
472 goto err; 479
473 480 rc = fill_stats_for_pid(pid, stats);
474 rc = fill_pid(pid, NULL, stats); 481 if (rc < 0)
475 if (rc < 0) 482 goto err;
476 goto err; 483 return send_reply(rep_skb, info);
477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 484err:
478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 485 nlmsg_free(rep_skb);
479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 486 return rc;
480 if (!stats) 487}
481 goto err; 488
482 489static int cmd_attr_tgid(struct genl_info *info)
483 rc = fill_tgid(tgid, NULL, stats); 490{
484 if (rc < 0) 491 struct taskstats *stats;
485 goto err; 492 struct sk_buff *rep_skb;
486 } else 493 size_t size;
494 u32 tgid;
495 int rc;
496
497 size = nla_total_size(sizeof(u32)) +
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0)
502 return rc;
503
504 rc = -EINVAL;
505 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
506 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
507 if (!stats)
487 goto err; 508 goto err;
488 509
510 rc = fill_stats_for_tgid(tgid, stats);
511 if (rc < 0)
512 goto err;
489 return send_reply(rep_skb, info); 513 return send_reply(rep_skb, info);
490err: 514err:
491 nlmsg_free(rep_skb); 515 nlmsg_free(rep_skb);
492 return rc; 516 return rc;
493} 517}
494 518
519static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
520{
521 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
522 return cmd_attr_register_cpumask(info);
523 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
524 return cmd_attr_deregister_cpumask(info);
525 else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
526 return cmd_attr_pid(info);
527 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
528 return cmd_attr_tgid(info);
529 else
530 return -EINVAL;
531}
532
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 533static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{ 534{
497 struct signal_struct *sig = tsk->signal; 535 struct signal_struct *sig = tsk->signal;
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
555 if (!stats) 593 if (!stats)
556 goto err; 594 goto err;
557 595
558 rc = fill_pid(-1, tsk, stats); 596 fill_stats(tsk, stats);
559 if (rc < 0)
560 goto err;
561 597
562 /* 598 /*
563 * Doesn't matter if tsk is the leader or the last group member leaving 599 * Doesn't matter if tsk is the leader or the last group member leaving
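
The mk_reply() change above stops nla_put()ing a bare 4-byte pid and instead pads the attribute to a long-sized boundary, so the struct taskstats attribute that follows lands 8-byte aligned on 64-bit machines. A small userspace sketch of the same rounding, with a hand-rolled equivalent of the kernel's ALIGN() macro and an arbitrary pid, just to show where the extra 4 bytes come from:

    #include <stdio.h>
    #include <stdint.h>

    /* same rounding the kernel's ALIGN() macro performs (power-of-2 alignment) */
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((size_t)(a) - 1))

    int main(void)
    {
            uint32_t pid = 4242;                    /* illustrative pid */
            uint32_t pids[2] = { pid, 0 };          /* second word is padding */
            size_t pid_size = ALIGN_UP(sizeof(pid), sizeof(long));

            /* on a 64-bit box this prints 8, so the stats attribute that
             * follows starts on an 8-byte boundary instead of a 4-byte one */
            printf("attribute payload length = %zu bytes (buffer holds %zu)\n",
                   pid_size, sizeof(pids));
            return 0;
    }
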
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19b..f8b11a283171 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
115 int ret; 115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2}; 116 struct kprobe *kps[2] = {&kp, &kp2};
117 117
 118 kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 118 /* addr and flags should be cleared for reusing kprobe. */
119 kp.addr = NULL;
120 kp.flags = 0;
119 ret = register_kprobes(kps, 2); 121 ret = register_kprobes(kps, 2);
120 if (ret < 0) { 122 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: " 123 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
210 int ret; 212 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2}; 213 struct jprobe *jps[2] = {&jp, &jp2};
212 214
 213 jp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 215 /* addr and flags should be cleared for reusing kprobe. */
216 jp.kp.addr = NULL;
217 jp.kp.flags = 0;
214 ret = register_jprobes(jps, 2); 218 ret = register_jprobes(jps, 2);
215 if (ret < 0) { 219 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: " 220 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
323 int ret; 327 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2}; 328 struct kretprobe *rps[2] = {&rp, &rp2};
325 329
 326 rp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 330 /* addr and flags should be cleared for reusing kprobe. */
331 rp.kp.addr = NULL;
332 rp.kp.flags = 0;
327 ret = register_kretprobes(rps, 2); 333 ret = register_kretprobes(rps, 2);
328 if (ret < 0) { 334 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: " 335 printk(KERN_ERR "Kprobe smoke test failed: "
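
The smoke-test hunks above now clear both kp.addr and kp.flags before a kprobe object is registered a second time. Any code that recycles a static struct kprobe has to do the same, since registration resolves symbol_name into addr and unregistration leaves internal flags behind. A minimal sketch, with an illustrative probe target and error handling trimmed to the essentials:

    #include <linux/module.h>
    #include <linux/kprobes.h>

    static int example_pre(struct kprobe *p, struct pt_regs *regs)
    {
            return 0;       /* let the probed instruction run normally */
    }

    static struct kprobe example_kp = {
            .symbol_name = "do_fork",       /* illustrative target */
            .pre_handler = example_pre,
    };

    static int __init example_kp_init(void)
    {
            int ret = register_kprobe(&example_kp);

            if (ret)
                    return ret;
            unregister_kprobe(&example_kp);

            /* reuse the same struct: clear what the first round left behind */
            example_kp.addr = NULL;
            example_kp.flags = 0;
            return register_kprobe(&example_kp);
    }

    static void __exit example_kp_exit(void)
    {
            unregister_kprobe(&example_kp);
    }

    module_init(example_kp_init);
    module_exit(example_kp_exit);
    MODULE_LICENSE("GPL");
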
diff --git a/kernel/time.c b/kernel/time.c
index 848b1c2ab09a..ba9b338d1835 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
300} 300}
301EXPORT_SYMBOL(timespec_trunc); 301EXPORT_SYMBOL(timespec_trunc);
302 302
303#ifndef CONFIG_GENERIC_TIME
304/*
305 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
306 * and therefore only yields usec accuracy
307 */
308void getnstimeofday(struct timespec *tv)
309{
310 struct timeval x;
311
312 do_gettimeofday(&x);
313 tv->tv_sec = x.tv_sec;
314 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
315}
316EXPORT_SYMBOL_GPL(getnstimeofday);
317#endif
318
319/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 303/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
320 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 304 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
321 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. 305 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 95ed42951e0a..f06a8a365648 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -6,7 +6,7 @@ config TICK_ONESHOT
6 6
7config NO_HZ 7config NO_HZ
8 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
9 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 9 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
10 select TICK_ONESHOT 10 select TICK_ONESHOT
11 help 11 help
12 This option enables a tickless system: timer interrupts will 12 This option enables a tickless system: timer interrupts will
@@ -15,7 +15,7 @@ config NO_HZ
15 15
16config HIGH_RES_TIMERS 16config HIGH_RES_TIMERS
17 bool "High Resolution Timer Support" 17 bool "High Resolution Timer Support"
18 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 18 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
19 select TICK_ONESHOT 19 select TICK_ONESHOT
20 help 20 help
21 This option enables high resolution timer support. If your 21 This option enables high resolution timer support. If your
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f08e99c1d561..c18d7efa1b4b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 return max_nsecs - (max_nsecs >> 5); 531 return max_nsecs - (max_nsecs >> 5);
532} 532}
533 533
534#ifdef CONFIG_GENERIC_TIME 534#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
535 535
536/** 536/**
537 * clocksource_select - Select the best clocksource available 537 * clocksource_select - Select the best clocksource available
@@ -577,7 +577,7 @@ static void clocksource_select(void)
577 } 577 }
578} 578}
579 579
580#else /* CONFIG_GENERIC_TIME */ 580#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
581 581
582static inline void clocksource_select(void) { } 582static inline void clocksource_select(void) { }
583 583
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs)
639#define MAX_UPDATE_LENGTH 5 /* Seconds */ 639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640 640
641/** 641/**
 642 * __clocksource_register_scale - Used to install new clocksources 642 * __clocksource_updatefreq_scale - Used to update clocksource with new freq
643 * @t: clocksource to be registered 643 * @t: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz 644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale 645 * @freq: clocksource frequency (cycles per second) divided by scale
646 * 646 *
647 * Returns -EBUSY if registration fails, zero otherwise. 647 * This should only be called from the clocksource->enable() method.
648 * 648 *
649 * This *SHOULD NOT* be called directly! Please use the 649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz helper functions. 650 * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
651 */ 651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 652void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653{ 653{
654
655 /* 654 /*
656 * Ideally we want to use some of the limits used in 655 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed 656 * clocksource_max_deferment, to provide a more informed
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
662 NSEC_PER_SEC/scale, 661 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale); 662 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 663 cs->max_idle_ns = clocksource_max_deferment(cs);
664}
665EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
666
667/**
668 * __clocksource_register_scale - Used to install new clocksources
669 * @t: clocksource to be registered
670 * @scale: Scale factor multiplied against freq to get clocksource hz
671 * @freq: clocksource frequency (cycles per second) divided by scale
672 *
673 * Returns -EBUSY if registration fails, zero otherwise.
674 *
675 * This *SHOULD NOT* be called directly! Please use the
676 * clocksource_register_hz() or clocksource_register_khz helper functions.
677 */
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{
680
 681 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq);
665 683
 684 /* Add clocksource to the clocksource list */
666 mutex_lock(&clocksource_mutex); 685 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs); 686 clocksource_enqueue(cs);
668 clocksource_select(); 687 clocksource_select();
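
The clocksource.c hunk splits the mult/shift setup out of registration: __clocksource_updatefreq_scale() (normally reached through the clocksource_updatefreq_hz()/clocksource_updatefreq_khz() wrappers it mentions) recomputes the conversion factors from a new frequency, while clocksource_register_hz() still performs the initial enqueue and selection. A hedged driver-side sketch of how the two might be used together; the MMIO counter, its mapping, and the 32 MHz rate are made up:

    #include <linux/clocksource.h>
    #include <linux/io.h>

    static void __iomem *example_counter;   /* mapped elsewhere; hypothetical */

    static cycle_t example_read(struct clocksource *cs)
    {
            return (cycle_t)readl(example_counter);
    }

    static int example_enable(struct clocksource *cs)
    {
            /* the rate may have changed while the device was off: recompute
             * mult/shift/max_idle_ns without re-registering the clocksource */
            clocksource_updatefreq_hz(cs, 32000000);        /* 32 MHz, illustrative */
            return 0;
    }

    static struct clocksource example_cs = {
            .name   = "example-timer",
            .rating = 200,
            .read   = example_read,
            .mask   = CLOCKSOURCE_MASK(32),
            .enable = example_enable,
            .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
    };

    static int __init example_cs_init(void)
    {
            /* initial registration computes mult/shift from the same rate */
            return clocksource_register_hz(&example_cs, 32000000);
    }
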
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c63116863a80..d2321891538f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset)
149 time_reftime = get_seconds(); 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = ntp_update_offset_fll(offset64, secs);
153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
154 153
155 freq_adj += ntp_update_offset_fll(offset64, secs); 154 /*
155 * Clamp update interval to reduce PLL gain with low
156 * sampling rate (e.g. intermittent network connection)
157 * to avoid instability.
158 */
159 if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
160 secs = 1 << (SHIFT_PLL + 1 + time_constant);
161
162 freq_adj += (offset64 * secs) <<
163 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
156 164
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); 165 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158 166
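
Rewritten this way, ntp_update_offset() computes the FLL term first, clamps the sample interval, and only then adds the PLL term. Reading the shifts back into ordinary arithmetic (a sketch using the kernel's own symbol names, with the usual SHIFT_PLL = 4 and NTP_SCALE_SHIFT = 32), the PLL contribution and the new clamp amount to roughly:

    \Delta f_{\mathrm{PLL}} = \mathrm{offset}\cdot\mathrm{secs}\cdot
        2^{\,\mathrm{NTP\_SCALE\_SHIFT} - 2(\mathrm{SHIFT\_PLL} + 2 + \mathrm{time\_constant})},
    \qquad
    \mathrm{secs} \le 2^{\,\mathrm{SHIFT\_PLL} + 1 + \mathrm{time\_constant}}

so an unusually long gap between samples (an intermittent network link, say) can no longer inflate the PLL gain and destabilize the loop.
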
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..48b2761b5668 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
188 /* 188 /*
189 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
190 * periodic mode. We read dev->next_event first and add to it 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event() 191 * when the event already expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really 192 * sets dev->next_event only when the event is really
193 * programmed to the device. 193 * programmed to the device.
194 */ 194 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..3e216e01bbd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
325 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
326 326
327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { 328 arch_needs_cpu(cpu)) {
329 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
330 delta_jiffies = 1; 330 delta_jiffies = 1;
331 } else { 331 } else {
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
405 * the scheduler tick in nohz_restart_sched_tick. 405 * the scheduler tick in nohz_restart_sched_tick.
406 */ 406 */
407 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
408 if (select_nohz_load_balancer(1)) { 408 select_nohz_load_balancer(1);
409 /*
410 * sched tick not stopped!
411 */
412 cpumask_clear_cpu(cpu, nohz_cpu_mask);
413 goto out;
414 }
415 409
416 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
417 ts->tick_stopped = 1; 411 ts->tick_stopped = 1;
@@ -780,7 +774,6 @@ void tick_setup_sched_timer(void)
780{ 774{
781 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 775 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
782 ktime_t now = ktime_get(); 776 ktime_t now = ktime_get();
783 u64 offset;
784 777
785 /* 778 /*
786 * Emulate tick processing via per-CPU hrtimers: 779 * Emulate tick processing via per-CPU hrtimers:
@@ -790,10 +783,6 @@ void tick_setup_sched_timer(void)
790 783
791 /* Get the next period (per cpu) */ 784 /* Get the next period (per cpu) */
792 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 785 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
793 offset = ktime_to_ns(tick_period) >> 1;
794 do_div(offset, num_possible_cpus());
795 offset *= smp_processor_id();
796 hrtimer_add_expires_ns(&ts->sched_timer, offset);
797 786
798 for (;;) { 787 for (;;) {
799 hrtimer_forward(&ts->sched_timer, now, tick_period); 788 hrtimer_forward(&ts->sched_timer, now, tick_period);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index caf8d4d4f5c8..49010d822f72 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
153 * - wall_to_monotonic is no longer the boot time, getboottime must be 153 * - wall_to_monotonic is no longer the boot time, getboottime must be
154 * used instead. 154 * used instead.
155 */ 155 */
156struct timespec xtime __attribute__ ((aligned (16))); 156static struct timespec xtime __attribute__ ((aligned (16)));
157struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 157static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
158static struct timespec total_sleep_time; 158static struct timespec total_sleep_time;
159 159
160/* 160/*
@@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond)
170{ 170{
171 xtime.tv_sec += leapsecond; 171 xtime.tv_sec += leapsecond;
172 wall_to_monotonic.tv_sec -= leapsecond; 172 wall_to_monotonic.tv_sec -= leapsecond;
173 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 173 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
174 timekeeper.mult);
174} 175}
175 176
176#ifdef CONFIG_GENERIC_TIME
177
178/** 177/**
179 * timekeeping_forward_now - update clock to the current time 178 * timekeeping_forward_now - update clock to the current time
180 * 179 *
@@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv)
328 timekeeper.ntp_error = 0; 327 timekeeper.ntp_error = 0;
329 ntp_clear(); 328 ntp_clear();
330 329
331 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 330 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
331 timekeeper.mult);
332 332
333 write_sequnlock_irqrestore(&xtime_lock, flags); 333 write_sequnlock_irqrestore(&xtime_lock, flags);
334 334
@@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock)
376 tick_clock_notify(); 376 tick_clock_notify();
377} 377}
378 378
379#else /* GENERIC_TIME */
380
381static inline void timekeeping_forward_now(void) { }
382
383/**
384 * ktime_get - get the monotonic time in ktime_t format
385 *
386 * returns the time in ktime_t format
387 */
388ktime_t ktime_get(void)
389{
390 struct timespec now;
391
392 ktime_get_ts(&now);
393
394 return timespec_to_ktime(now);
395}
396EXPORT_SYMBOL_GPL(ktime_get);
397
398/**
399 * ktime_get_ts - get the monotonic clock in timespec format
400 * @ts: pointer to timespec variable
401 *
402 * The function calculates the monotonic clock from the realtime
403 * clock and the wall_to_monotonic offset and stores the result
404 * in normalized timespec format in the variable pointed to by @ts.
405 */
406void ktime_get_ts(struct timespec *ts)
407{
408 struct timespec tomono;
409 unsigned long seq;
410
411 do {
412 seq = read_seqbegin(&xtime_lock);
413 getnstimeofday(ts);
414 tomono = wall_to_monotonic;
415
416 } while (read_seqretry(&xtime_lock, seq));
417
418 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
419 ts->tv_nsec + tomono.tv_nsec);
420}
421EXPORT_SYMBOL_GPL(ktime_get_ts);
422
423#endif /* !GENERIC_TIME */
424
425/** 379/**
426 * ktime_get_real - get the real (wall-) time in ktime_t format 380 * ktime_get_real - get the real (wall-) time in ktime_t format
427 * 381 *
@@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev)
579 533
580 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 534 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
581 ts = timespec_sub(ts, timekeeping_suspend_time); 535 ts = timespec_sub(ts, timekeeping_suspend_time);
582 xtime = timespec_add_safe(xtime, ts); 536 xtime = timespec_add(xtime, ts);
583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 537 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
584 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 538 total_sleep_time = timespec_add(total_sleep_time, ts);
585 } 539 }
586 /* re-base the last cycle value */ 540 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 541 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -736,6 +690,7 @@ static void timekeeping_adjust(s64 offset)
736static cycle_t logarithmic_accumulation(cycle_t offset, int shift) 690static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
737{ 691{
738 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 692 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
693 u64 raw_nsecs;
739 694
740 /* If the offset is smaller then a shifted interval, do nothing */ 695 /* If the offset is smaller then a shifted interval, do nothing */
741 if (offset < timekeeper.cycle_interval<<shift) 696 if (offset < timekeeper.cycle_interval<<shift)
@@ -752,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
752 second_overflow(); 707 second_overflow();
753 } 708 }
754 709
755 /* Accumulate into raw time */ 710 /* Accumulate raw time */
756 raw_time.tv_nsec += timekeeper.raw_interval << shift;; 711 raw_nsecs = timekeeper.raw_interval << shift;
757 while (raw_time.tv_nsec >= NSEC_PER_SEC) { 712 raw_nsecs += raw_time.tv_nsec;
758 raw_time.tv_nsec -= NSEC_PER_SEC; 713 if (raw_nsecs >= NSEC_PER_SEC) {
759 raw_time.tv_sec++; 714 u64 raw_secs = raw_nsecs;
715 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
716 raw_time.tv_sec += raw_secs;
760 } 717 }
718 raw_time.tv_nsec = raw_nsecs;
761 719
762 /* Accumulate error between NTP and clock interval */ 720 /* Accumulate error between NTP and clock interval */
763 timekeeper.ntp_error += tick_length << shift; 721 timekeeper.ntp_error += tick_length << shift;
@@ -784,10 +742,11 @@ void update_wall_time(void)
784 return; 742 return;
785 743
786 clock = timekeeper.clock; 744 clock = timekeeper.clock;
787#ifdef CONFIG_GENERIC_TIME 745
788 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 746#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
789#else
790 offset = timekeeper.cycle_interval; 747 offset = timekeeper.cycle_interval;
748#else
749 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
791#endif 750#endif
792 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 751 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
793 752
@@ -856,7 +815,8 @@ void update_wall_time(void)
856 } 815 }
857 816
858 /* check to see if there is a new clocksource to use */ 817 /* check to see if there is a new clocksource to use */
859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 818 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
819 timekeeper.mult);
860} 820}
861 821
862/** 822/**
@@ -887,7 +847,7 @@ EXPORT_SYMBOL_GPL(getboottime);
887 */ 847 */
888void monotonic_to_bootbased(struct timespec *ts) 848void monotonic_to_bootbased(struct timespec *ts)
889{ 849{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 850 *ts = timespec_add(*ts, total_sleep_time);
891} 851}
892EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 852EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
893 853
@@ -902,6 +862,11 @@ struct timespec __current_kernel_time(void)
902 return xtime; 862 return xtime;
903} 863}
904 864
865struct timespec __get_wall_to_monotonic(void)
866{
867 return wall_to_monotonic;
868}
869
905struct timespec current_kernel_time(void) 870struct timespec current_kernel_time(void)
906{ 871{
907 struct timespec now; 872 struct timespec now;
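
logarithmic_accumulation() now gathers the shifted raw interval into a 64-bit nanosecond count and folds whole seconds out of it with do_div(), which divides a u64 in place and hands back the 32-bit remainder, instead of looping one second at a time. The same idiom, pulled out into a standalone helper as a sketch (the function name is invented):

    #include <linux/types.h>
    #include <linux/time.h>
    #include <asm/div64.h>

    /* split a 64-bit nanosecond count into whole seconds plus leftover
     * nanoseconds, the way logarithmic_accumulation() now does for raw_time */
    static void example_split_ns(u64 nsecs, u64 *secs, u32 *rem_ns)
    {
            u64 tmp = nsecs;

            /* do_div() turns tmp into the quotient and returns the remainder,
             * so the 64-by-32 division stays cheap on 32-bit machines */
            *rem_ns = do_div(tmp, NSEC_PER_SEC);
            *secs = tmp;
    }
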
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..68a9ae7679b7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43 43
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/* 91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of 92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB for 93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * the new flag to indicate whether the timer is deferrable 94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
95 */ 100 */
96#define TBASE_DEFERRABLE_FLAG (0x1) 101#define TBASE_DEFERRABLE_FLAG (0x1)
97 102
@@ -321,6 +326,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
321 326
322/** 327/**
323 * set_timer_slack - set the allowed slack for a timer 328 * set_timer_slack - set the allowed slack for a timer
329 * @timer: the timer to be modified
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding 330 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 * 331 *
326 * Set the amount of time, in jiffies, that a certain timer has 332 * Set the amount of time, in jiffies, that a certain timer has
@@ -577,6 +583,19 @@ static void __init_timer(struct timer_list *timer,
577 lockdep_init_map(&timer->lockdep_map, name, key, 0); 583 lockdep_init_map(&timer->lockdep_map, name, key, 0);
578} 584}
579 585
586void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
587 const char *name,
588 struct lock_class_key *key,
589 void (*function)(unsigned long),
590 unsigned long data)
591{
592 timer->function = function;
593 timer->data = data;
594 init_timer_on_stack_key(timer, name, key);
595 timer_set_deferrable(timer);
596}
597EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
598
580/** 599/**
581 * init_timer_key - initialize a timer 600 * init_timer_key - initialize a timer
582 * @timer: the timer to be initialized 601 * @timer: the timer to be initialized
@@ -679,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
679 cpu = smp_processor_id(); 698 cpu = smp_processor_id();
680 699
681#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 700#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
682 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { 701 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
683 int preferred_cpu = get_nohz_load_balancer(); 702 cpu = get_nohz_timer_target();
684
685 if (preferred_cpu >= 0)
686 cpu = preferred_cpu;
687 }
688#endif 703#endif
689 new_base = per_cpu(tvec_bases, cpu); 704 new_base = per_cpu(tvec_bases, cpu);
690 705
@@ -1264,7 +1279,10 @@ void update_process_times(int user_tick)
1264 run_local_timers(); 1279 run_local_timers();
1265 rcu_check_callbacks(cpu, user_tick); 1280 rcu_check_callbacks(cpu, user_tick);
1266 printk_tick(); 1281 printk_tick();
1267 perf_event_do_pending(); 1282#ifdef CONFIG_IRQ_WORK
1283 if (in_irq())
1284 irq_work_run();
1285#endif
1268 scheduler_tick(); 1286 scheduler_tick();
1269 run_posix_cpu_timers(p); 1287 run_posix_cpu_timers(p);
1270} 1288}
@@ -1289,7 +1307,6 @@ void run_local_timers(void)
1289{ 1307{
1290 hrtimer_run_queues(); 1308 hrtimer_run_queues();
1291 raise_softirq(TIMER_SOFTIRQ); 1309 raise_softirq(TIMER_SOFTIRQ);
1292 softlockup_tick();
1293} 1310}
1294 1311
1295/* 1312/*
@@ -1750,3 +1767,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
1750} 1767}
1751 1768
1752EXPORT_SYMBOL(msleep_interruptible); 1769EXPORT_SYMBOL(msleep_interruptible);
1770
1771static int __sched do_usleep_range(unsigned long min, unsigned long max)
1772{
1773 ktime_t kmin;
1774 unsigned long delta;
1775
1776 kmin = ktime_set(0, min * NSEC_PER_USEC);
1777 delta = (max - min) * NSEC_PER_USEC;
1778 return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
1779}
1780
1781/**
 1782 * usleep_range - Drop-in replacement for udelay where wakeup is flexible
1783 * @min: Minimum time in usecs to sleep
1784 * @max: Maximum time in usecs to sleep
1785 */
1786void usleep_range(unsigned long min, unsigned long max)
1787{
1788 __set_current_state(TASK_UNINTERRUPTIBLE);
1789 do_usleep_range(min, max);
1790}
1791EXPORT_SYMBOL(usleep_range);
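
The new usleep_range() gives the timer code a window instead of a point deadline, so its hrtimer expiry can be coalesced with neighbouring wakeups; it is meant for non-atomic contexts where udelay() would just spin. A hedged sketch of a typical register-poll user built on this API; the status register, ready bit, retry count and 100-200 microsecond window are all illustrative:

    #include <linux/delay.h>
    #include <linux/errno.h>
    #include <linux/io.h>

    #define EXAMPLE_STATUS_READY    0x1     /* hypothetical status bit */

    static int example_wait_ready(void __iomem *status_reg)
    {
            int tries;

            for (tries = 0; tries < 50; tries++) {
                    if (readl(status_reg) & EXAMPLE_STATUS_READY)
                            return 0;
                    /* sleeps at least 100us, may be stretched to 200us so the
                     * wakeup can share an hrtimer with other sleepers */
                    usleep_range(100, 200);
            }
            return -ETIMEDOUT;
    }
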
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..e04b8bcdef88 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_C_RECORDMCOUNT
53 bool
54 help
55 C version of recordmcount available?
56
52config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
53 bool 58 bool
54 59
@@ -121,7 +126,7 @@ if FTRACE
121config FUNCTION_TRACER 126config FUNCTION_TRACER
122 bool "Kernel Function Tracer" 127 bool "Kernel Function Tracer"
123 depends on HAVE_FUNCTION_TRACER 128 depends on HAVE_FUNCTION_TRACER
124 select FRAME_POINTER 129 select FRAME_POINTER if (!ARM_UNWIND)
125 select KALLSYMS 130 select KALLSYMS
126 select GENERIC_TRACER 131 select GENERIC_TRACER
127 select CONTEXT_SWITCH_TRACER 132 select CONTEXT_SWITCH_TRACER
@@ -153,7 +158,7 @@ config IRQSOFF_TRACER
153 bool "Interrupts-off Latency Tracer" 158 bool "Interrupts-off Latency Tracer"
154 default n 159 default n
155 depends on TRACE_IRQFLAGS_SUPPORT 160 depends on TRACE_IRQFLAGS_SUPPORT
156 depends on GENERIC_TIME 161 depends on !ARCH_USES_GETTIMEOFFSET
157 select TRACE_IRQFLAGS 162 select TRACE_IRQFLAGS
158 select GENERIC_TRACER 163 select GENERIC_TRACER
159 select TRACER_MAX_TRACE 164 select TRACER_MAX_TRACE
@@ -175,7 +180,7 @@ config IRQSOFF_TRACER
175config PREEMPT_TRACER 180config PREEMPT_TRACER
176 bool "Preemption-off Latency Tracer" 181 bool "Preemption-off Latency Tracer"
177 default n 182 default n
178 depends on GENERIC_TIME 183 depends on !ARCH_USES_GETTIMEOFFSET
179 depends on PREEMPT 184 depends on PREEMPT
180 select GENERIC_TRACER 185 select GENERIC_TRACER
181 select TRACER_MAX_TRACE 186 select TRACER_MAX_TRACE
@@ -194,15 +199,6 @@ config PREEMPT_TRACER
194 enabled. This option and the irqs-off timing option can be 199 enabled. This option and the irqs-off timing option can be
195 used together or separately.) 200 used together or separately.)
196 201
197config SYSPROF_TRACER
198 bool "Sysprof Tracer"
199 depends on X86
200 select GENERIC_TRACER
201 select CONTEXT_SWITCH_TRACER
202 help
203 This tracer provides the trace needed by the 'Sysprof' userspace
204 tool.
205
206config SCHED_TRACER 202config SCHED_TRACER
207 bool "Scheduling Latency Tracer" 203 bool "Scheduling Latency Tracer"
208 select GENERIC_TRACER 204 select GENERIC_TRACER
@@ -229,23 +225,6 @@ config FTRACE_SYSCALLS
229 help 225 help
230 Basic tracer to catch the syscall entry and exit events. 226 Basic tracer to catch the syscall entry and exit events.
231 227
232config BOOT_TRACER
233 bool "Trace boot initcalls"
234 select GENERIC_TRACER
235 select CONTEXT_SWITCH_TRACER
236 help
237 This tracer helps developers to optimize boot times: it records
238 the timings of the initcalls and traces key events and the identity
239 of tasks that can cause boot delays, such as context-switches.
240
241 Its aim is to be parsed by the scripts/bootgraph.pl tool to
242 produce pretty graphics about boot inefficiencies, giving a visual
243 representation of the delays during initcalls - but the raw
244 /debug/tracing/trace text output is readable too.
245
246 You must pass in initcall_debug and ftrace=initcall to the kernel
247 command line to enable this on bootup.
248
249config TRACE_BRANCH_PROFILING 228config TRACE_BRANCH_PROFILING
250 bool 229 bool
251 select GENERIC_TRACER 230 select GENERIC_TRACER
@@ -325,28 +304,6 @@ config BRANCH_TRACER
325 304
326 Say N if unsure. 305 Say N if unsure.
327 306
328config KSYM_TRACER
329 bool "Trace read and write access on kernel memory locations"
330 depends on HAVE_HW_BREAKPOINT
331 select TRACING
332 help
333 This tracer helps find read and write operations on any given kernel
334 symbol i.e. /proc/kallsyms.
335
336config PROFILE_KSYM_TRACER
337 bool "Profile all kernel memory accesses on 'watched' variables"
338 depends on KSYM_TRACER
339 help
340 This tracer profiles kernel accesses on variables watched through the
341 ksym tracer ftrace plugin. Depending upon the hardware, all read
342 and write operations on kernel variables can be monitored for
343 accesses.
344
345 The results will be displayed in:
346 /debugfs/tracing/profile_ksym
347
348 Say N if unsure.
349
350config STACK_TRACER 307config STACK_TRACER
351 bool "Trace max stack" 308 bool "Trace max stack"
352 depends on HAVE_FUNCTION_TRACER 309 depends on HAVE_FUNCTION_TRACER
@@ -371,37 +328,6 @@ config STACK_TRACER
371 328
372 Say N if unsure. 329 Say N if unsure.
373 330
374config KMEMTRACE
375 bool "Trace SLAB allocations"
376 select GENERIC_TRACER
377 help
378 kmemtrace provides tracing for slab allocator functions, such as
379 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
380 data is then fed to the userspace application in order to analyse
381 allocation hotspots, internal fragmentation and so on, making it
382 possible to see how well an allocator performs, as well as debug
383 and profile kernel code.
384
385 This requires an userspace application to use. See
386 Documentation/trace/kmemtrace.txt for more information.
387
388 Saying Y will make the kernel somewhat larger and slower. However,
389 if you disable kmemtrace at run-time or boot-time, the performance
390 impact is minimal (depending on the arch the kernel is built for).
391
392 If unsure, say N.
393
394config WORKQUEUE_TRACER
395 bool "Trace workqueues"
396 select GENERIC_TRACER
397 help
398 The workqueue tracer provides some statistical information
399 about each cpu workqueue thread such as the number of the
400 works inserted and executed since their creation. It can help
401 to evaluate the amount of work each of them has to perform.
402 For example it can help a developer to decide whether he should
403 choose a per-cpu workqueue instead of a singlethreaded one.
404
405config BLK_DEV_IO_TRACE 331config BLK_DEV_IO_TRACE
406 bool "Support for tracing block IO actions" 332 bool "Support for tracing block IO actions"
407 depends on SYSFS 333 depends on SYSFS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ffb1a5b0550e..53f338190b26 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
30obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
31obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
33obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
34obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 33obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
35obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 34obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
36obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 35obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
@@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
38obj-$(CONFIG_NOP_TRACER) += trace_nop.o 37obj-$(CONFIG_NOP_TRACER) += trace_nop.o
39obj-$(CONFIG_STACK_TRACER) += trace_stack.o 38obj-$(CONFIG_STACK_TRACER) += trace_stack.o
40obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 42obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 43obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
47ifeq ($(CONFIG_BLOCK),y) 44ifeq ($(CONFIG_BLOCK),y)
@@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
55endif 52endif
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
58obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
59obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_EVENT_TRACING) += power-traces.o
56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif
60 59
61libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 638711c17504..bc251ed66724 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/smp_lock.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
29 28
@@ -169,9 +168,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
170 BLK_TC_ACT(BLK_TC_WRITE) }; 169 BLK_TC_ACT(BLK_TC_WRITE) };
171 170
171#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
172#define BLK_TC_RAHEAD BLK_TC_AHEAD
173
172/* The ilog2() calls fall out because they're constant */ 174/* The ilog2() calls fall out because they're constant */
173#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ 175#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
174 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) 176 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
175 177
176/* 178/*
177 * The worker for the various blk_add_trace*() types. Fills out a 179 * The worker for the various blk_add_trace*() types. Fills out a
@@ -194,9 +196,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
194 return; 196 return;
195 197
196 what |= ddir_act[rw & WRITE]; 198 what |= ddir_act[rw & WRITE];
197 what |= MASK_TC_BIT(rw, BARRIER); 199 what |= MASK_TC_BIT(rw, HARDBARRIER);
198 what |= MASK_TC_BIT(rw, SYNCIO); 200 what |= MASK_TC_BIT(rw, SYNC);
199 what |= MASK_TC_BIT(rw, AHEAD); 201 what |= MASK_TC_BIT(rw, RAHEAD);
200 what |= MASK_TC_BIT(rw, META); 202 what |= MASK_TC_BIT(rw, META);
201 what |= MASK_TC_BIT(rw, DISCARD); 203 what |= MASK_TC_BIT(rw, DISCARD);
202 204
@@ -323,6 +325,7 @@ static const struct file_operations blk_dropped_fops = {
323 .owner = THIS_MODULE, 325 .owner = THIS_MODULE,
324 .open = blk_dropped_open, 326 .open = blk_dropped_open,
325 .read = blk_dropped_read, 327 .read = blk_dropped_read,
328 .llseek = default_llseek,
326}; 329};
327 330
328static int blk_msg_open(struct inode *inode, struct file *filp) 331static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -362,6 +365,7 @@ static const struct file_operations blk_msg_fops = {
362 .owner = THIS_MODULE, 365 .owner = THIS_MODULE,
363 .open = blk_msg_open, 366 .open = blk_msg_open,
364 .write = blk_msg_write, 367 .write = blk_msg_write,
368 .llseek = noop_llseek,
365}; 369};
366 370
367/* 371/*
@@ -549,6 +553,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
549} 553}
550EXPORT_SYMBOL_GPL(blk_trace_setup); 554EXPORT_SYMBOL_GPL(blk_trace_setup);
551 555
556#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
557static int compat_blk_trace_setup(struct request_queue *q, char *name,
558 dev_t dev, struct block_device *bdev,
559 char __user *arg)
560{
561 struct blk_user_trace_setup buts;
562 struct compat_blk_user_trace_setup cbuts;
563 int ret;
564
565 if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
566 return -EFAULT;
567
568 buts = (struct blk_user_trace_setup) {
569 .act_mask = cbuts.act_mask,
570 .buf_size = cbuts.buf_size,
571 .buf_nr = cbuts.buf_nr,
572 .start_lba = cbuts.start_lba,
573 .end_lba = cbuts.end_lba,
574 .pid = cbuts.pid,
575 };
576 memcpy(&buts.name, &cbuts.name, 32);
577
578 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
579 if (ret)
580 return ret;
581
582 if (copy_to_user(arg, &buts.name, 32)) {
583 blk_trace_remove(q);
584 return -EFAULT;
585 }
586
587 return 0;
588}
589#endif
590
552int blk_trace_startstop(struct request_queue *q, int start) 591int blk_trace_startstop(struct request_queue *q, int start)
553{ 592{
554 int ret; 593 int ret;
@@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
608 bdevname(bdev, b); 647 bdevname(bdev, b);
609 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); 648 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
610 break; 649 break;
650#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
651 case BLKTRACESETUP32:
652 bdevname(bdev, b);
653 ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
654 break;
655#endif
611 case BLKTRACESTART: 656 case BLKTRACESTART:
612 start = 1; 657 start = 1;
613 case BLKTRACESTOP: 658 case BLKTRACESTOP:
@@ -661,10 +706,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
661 if (likely(!bt)) 706 if (likely(!bt))
662 return; 707 return;
663 708
664 if (blk_discard_rq(rq)) 709 if (rq->cmd_flags & REQ_DISCARD)
665 rw |= (1 << BIO_RW_DISCARD); 710 rw |= REQ_DISCARD;
711
712 if (rq->cmd_flags & REQ_SECURE)
713 rw |= REQ_SECURE;
666 714
667 if (blk_pc_request(rq)) { 715 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
668 what |= BLK_TC_ACT(BLK_TC_PC); 716 what |= BLK_TC_ACT(BLK_TC_PC);
669 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 717 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
670 what, rq->errors, rq->cmd_len, rq->cmd); 718 what, rq->errors, rq->cmd_len, rq->cmd);
@@ -925,7 +973,7 @@ void blk_add_driver_data(struct request_queue *q,
925 if (likely(!bt)) 973 if (likely(!bt))
926 return; 974 return;
927 975
928 if (blk_pc_request(rq)) 976 if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
929 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 977 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
930 BLK_TA_DRV_DATA, rq->errors, len, data); 978 BLK_TA_DRV_DATA, rq->errors, len, data);
931 else 979 else
@@ -1603,10 +1651,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1603 struct block_device *bdev; 1651 struct block_device *bdev;
1604 ssize_t ret = -ENXIO; 1652 ssize_t ret = -ENXIO;
1605 1653
1606 lock_kernel();
1607 bdev = bdget(part_devt(p)); 1654 bdev = bdget(part_devt(p));
1608 if (bdev == NULL) 1655 if (bdev == NULL)
1609 goto out_unlock_kernel; 1656 goto out;
1610 1657
1611 q = blk_trace_get_queue(bdev); 1658 q = blk_trace_get_queue(bdev);
1612 if (q == NULL) 1659 if (q == NULL)
@@ -1634,8 +1681,7 @@ out_unlock_bdev:
1634 mutex_unlock(&bdev->bd_mutex); 1681 mutex_unlock(&bdev->bd_mutex);
1635out_bdput: 1682out_bdput:
1636 bdput(bdev); 1683 bdput(bdev);
1637out_unlock_kernel: 1684out:
1638 unlock_kernel();
1639 return ret; 1685 return ret;
1640} 1686}
1641 1687
@@ -1665,11 +1711,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1665 1711
1666 ret = -ENXIO; 1712 ret = -ENXIO;
1667 1713
1668 lock_kernel();
1669 p = dev_to_part(dev); 1714 p = dev_to_part(dev);
1670 bdev = bdget(part_devt(p)); 1715 bdev = bdget(part_devt(p));
1671 if (bdev == NULL) 1716 if (bdev == NULL)
1672 goto out_unlock_kernel; 1717 goto out;
1673 1718
1674 q = blk_trace_get_queue(bdev); 1719 q = blk_trace_get_queue(bdev);
1675 if (q == NULL) 1720 if (q == NULL)
@@ -1704,8 +1749,6 @@ out_unlock_bdev:
1704 mutex_unlock(&bdev->bd_mutex); 1749 mutex_unlock(&bdev->bd_mutex);
1705out_bdput: 1750out_bdput:
1706 bdput(bdev); 1751 bdput(bdev);
1707out_unlock_kernel:
1708 unlock_kernel();
1709out: 1752out:
1710 return ret ? ret : count; 1753 return ret ? ret : count;
1711} 1754}
@@ -1730,7 +1773,7 @@ void blk_dump_cmd(char *buf, struct request *rq)
1730 int len = rq->cmd_len; 1773 int len = rq->cmd_len;
1731 unsigned char *cmd = rq->cmd; 1774 unsigned char *cmd = rq->cmd;
1732 1775
1733 if (!blk_pc_request(rq)) { 1776 if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
1734 buf[0] = '\0'; 1777 buf[0] = '\0';
1735 return; 1778 return;
1736 } 1779 }
@@ -1755,21 +1798,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1755 1798
1756 if (rw & WRITE) 1799 if (rw & WRITE)
1757 rwbs[i++] = 'W'; 1800 rwbs[i++] = 'W';
1758 else if (rw & 1 << BIO_RW_DISCARD) 1801 else if (rw & REQ_DISCARD)
1759 rwbs[i++] = 'D'; 1802 rwbs[i++] = 'D';
1760 else if (bytes) 1803 else if (bytes)
1761 rwbs[i++] = 'R'; 1804 rwbs[i++] = 'R';
1762 else 1805 else
1763 rwbs[i++] = 'N'; 1806 rwbs[i++] = 'N';
1764 1807
1765 if (rw & 1 << BIO_RW_AHEAD) 1808 if (rw & REQ_RAHEAD)
1766 rwbs[i++] = 'A'; 1809 rwbs[i++] = 'A';
1767 if (rw & 1 << BIO_RW_BARRIER) 1810 if (rw & REQ_HARDBARRIER)
1768 rwbs[i++] = 'B'; 1811 rwbs[i++] = 'B';
1769 if (rw & 1 << BIO_RW_SYNCIO) 1812 if (rw & REQ_SYNC)
1770 rwbs[i++] = 'S'; 1813 rwbs[i++] = 'S';
1771 if (rw & 1 << BIO_RW_META) 1814 if (rw & REQ_META)
1772 rwbs[i++] = 'M'; 1815 rwbs[i++] = 'M';
1816 if (rw & REQ_SECURE)
1817 rwbs[i++] = 'E';
1773 1818
1774 rwbs[i] = '\0'; 1819 rwbs[i] = '\0';
1775} 1820}
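
blk_fill_rwbs() now decodes the REQ_* flags directly and gains an 'E' for secure discard requests. The mapping is positional: one primary letter (W, D, R or N), then optional modifiers in a fixed order. A small stand-alone sketch of the same decode, with placeholder flag values rather than the real REQ_* constants:

#include <stdio.h>

/* Placeholder flag bits; the real values live in <linux/blk_types.h>. */
#define F_WRITE    (1u << 0)
#define F_DISCARD  (1u << 1)
#define F_RAHEAD   (1u << 2)
#define F_BARRIER  (1u << 3)
#define F_SYNC     (1u << 4)
#define F_META     (1u << 5)
#define F_SECURE   (1u << 6)

static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
{
	int i = 0;

	if (rw & F_WRITE)
		rwbs[i++] = 'W';
	else if (rw & F_DISCARD)
		rwbs[i++] = 'D';
	else if (bytes)
		rwbs[i++] = 'R';
	else
		rwbs[i++] = 'N';

	if (rw & F_RAHEAD)  rwbs[i++] = 'A';
	if (rw & F_BARRIER) rwbs[i++] = 'B';
	if (rw & F_SYNC)    rwbs[i++] = 'S';
	if (rw & F_META)    rwbs[i++] = 'M';
	if (rw & F_SECURE)  rwbs[i++] = 'E';
	rwbs[i] = '\0';
}

int main(void)
{
	char rwbs[8];

	fill_rwbs(rwbs, F_WRITE | F_SYNC, 4096);      /* prints "WS" */
	printf("%s\n", rwbs);
	fill_rwbs(rwbs, F_DISCARD | F_SECURE, 4096);  /* prints "DE" */
	printf("%s\n", rwbs);
	return 0;
}
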
@@ -1779,8 +1824,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1779 int rw = rq->cmd_flags & 0x03; 1824 int rw = rq->cmd_flags & 0x03;
1780 int bytes; 1825 int bytes;
1781 1826
1782 if (blk_discard_rq(rq)) 1827 if (rq->cmd_flags & REQ_DISCARD)
1783 rw |= (1 << BIO_RW_DISCARD); 1828 rw |= REQ_DISCARD;
1829
1830 if (rq->cmd_flags & REQ_SECURE)
1831 rw |= REQ_SECURE;
1784 1832
1785 bytes = blk_rq_bytes(rq); 1833 bytes = blk_rq_bytes(rq);
1786 1834
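
The compat handler is needed because struct blk_user_trace_setup carries 64-bit members, and the i386 ABI aligns a u64 to 4 bytes where x86_64 aligns it to 8, so a 32-bit blktrace binary hands BLKTRACESETUP a structure with different padding and size than the native one. A quick host-side demonstration of that layout difference on a 64-bit build, emulating the 32-bit alignment with the same aligned(4) trick the kernel's compat_u64 typedef uses; the struct definitions below are illustrative stand-ins, not the kernel's:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* 64-bit member with the 4-byte alignment it has in the i386 ABI. */
typedef uint64_t u64_align4 __attribute__((aligned(4)));

struct native_setup {            /* layout seen by 64-bit callers */
	uint16_t act_mask;
	uint32_t buf_size;
	uint32_t buf_nr;
	uint64_t start_lba;
	uint64_t end_lba;
	uint32_t pid;
	char     name[32];
};

struct compat_setup {            /* layout a 32-bit caller passes in */
	uint16_t   act_mask;
	uint32_t   buf_size;
	uint32_t   buf_nr;
	u64_align4 start_lba;
	u64_align4 end_lba;
	uint32_t   pid;
	char       name[32];
};

int main(void)
{
	/* On x86-64 this prints different offsets and sizes (16 vs 12 for
	 * start_lba), which is why the fields are copied one by one. */
	printf("native: sizeof=%zu start_lba@%zu\n",
	       sizeof(struct native_setup), offsetof(struct native_setup, start_lba));
	printf("compat: sizeof=%zu start_lba@%zu\n",
	       sizeof(struct compat_setup), offsetof(struct compat_setup, start_lba));
	return 0;
}
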
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d2cb14f9449..f3dadae83883 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v)
381{ 381{
382 struct ftrace_profile *rec = v; 382 struct ftrace_profile *rec = v;
383 char str[KSYM_SYMBOL_LEN]; 383 char str[KSYM_SYMBOL_LEN];
384 int ret = 0;
384#ifdef CONFIG_FUNCTION_GRAPH_TRACER 385#ifdef CONFIG_FUNCTION_GRAPH_TRACER
385 static DEFINE_MUTEX(mutex);
386 static struct trace_seq s; 386 static struct trace_seq s;
387 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev; 388 unsigned long long stddev;
389#endif 389#endif
390 mutex_lock(&ftrace_profile_lock);
391
392 /* we raced with function_profile_reset() */
393 if (unlikely(rec->counter == 0)) {
394 ret = -EBUSY;
395 goto out;
396 }
390 397
391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 398 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
392 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 399 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
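
function_stat_show() now holds ftrace_profile_lock across the whole record and treats a zeroed counter as a race with function_profile_reset(), returning -EBUSY instead of printing stale numbers or dividing by a zero counter in the avg/stddev math; the per-call static mutex around the trace_seq goes away because the profile lock already serializes readers. A reduced user-space sketch of that check-under-lock shape, using a pthread mutex and illustrative names:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t profile_lock = PTHREAD_MUTEX_INITIALIZER;

struct record {
	unsigned long counter;
	unsigned long long time;
};

/* Reader: show one record, but bail out if a concurrent reset
 * already zeroed it (counter == 0 marks "reset happened"). */
static int record_show(const struct record *rec)
{
	int ret = 0;

	pthread_mutex_lock(&profile_lock);
	if (rec->counter == 0) {
		ret = -EBUSY;          /* raced with record_reset() */
		goto out;
	}
	printf("count=%lu avg=%llu\n", rec->counter, rec->time / rec->counter);
out:
	pthread_mutex_unlock(&profile_lock);
	return ret;
}

/* Writer: reset under the same lock so readers never see half-cleared state. */
static void record_reset(struct record *rec)
{
	pthread_mutex_lock(&profile_lock);
	rec->counter = 0;
	rec->time = 0;
	pthread_mutex_unlock(&profile_lock);
}

int main(void)
{
	struct record rec = { .counter = 3, .time = 300 };

	record_show(&rec);     /* prints count=3 avg=100 */
	record_reset(&rec);
	record_show(&rec);     /* returns -EBUSY, prints nothing */
	return 0;
}
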
@@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v)
408 do_div(stddev, (rec->counter - 1) * 1000); 415 do_div(stddev, (rec->counter - 1) * 1000);
409 } 416 }
410 417
411 mutex_lock(&mutex);
412 trace_seq_init(&s); 418 trace_seq_init(&s);
413 trace_print_graph_duration(rec->time, &s); 419 trace_print_graph_duration(rec->time, &s);
414 trace_seq_puts(&s, " "); 420 trace_seq_puts(&s, " ");
@@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v)
416 trace_seq_puts(&s, " "); 422 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s); 423 trace_print_graph_duration(stddev, &s);
418 trace_print_seq(m, &s); 424 trace_print_seq(m, &s);
419 mutex_unlock(&mutex);
420#endif 425#endif
421 seq_putc(m, '\n'); 426 seq_putc(m, '\n');
427out:
428 mutex_unlock(&ftrace_profile_lock);
422 429
423 return 0; 430 return ret;
424} 431}
425 432
426static void ftrace_profile_reset(struct ftrace_profile_stat *stat) 433static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -793,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = {
793 .open = tracing_open_generic, 800 .open = tracing_open_generic,
794 .read = ftrace_profile_read, 801 .read = ftrace_profile_read,
795 .write = ftrace_profile_write, 802 .write = ftrace_profile_write,
803 .llseek = default_llseek,
796}; 804};
797 805
798/* used to initialize the real stat files */ 806/* used to initialize the real stat files */
@@ -877,10 +885,8 @@ enum {
877 FTRACE_ENABLE_CALLS = (1 << 0), 885 FTRACE_ENABLE_CALLS = (1 << 0),
878 FTRACE_DISABLE_CALLS = (1 << 1), 886 FTRACE_DISABLE_CALLS = (1 << 1),
879 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 887 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
880 FTRACE_ENABLE_MCOUNT = (1 << 3), 888 FTRACE_START_FUNC_RET = (1 << 3),
881 FTRACE_DISABLE_MCOUNT = (1 << 4), 889 FTRACE_STOP_FUNC_RET = (1 << 4),
882 FTRACE_START_FUNC_RET = (1 << 5),
883 FTRACE_STOP_FUNC_RET = (1 << 6),
884}; 890};
885 891
886static int ftrace_filtered; 892static int ftrace_filtered;
@@ -1219,8 +1225,6 @@ static void ftrace_shutdown(int command)
1219 1225
1220static void ftrace_startup_sysctl(void) 1226static void ftrace_startup_sysctl(void)
1221{ 1227{
1222 int command = FTRACE_ENABLE_MCOUNT;
1223
1224 if (unlikely(ftrace_disabled)) 1228 if (unlikely(ftrace_disabled))
1225 return; 1229 return;
1226 1230
@@ -1228,23 +1232,17 @@ static void ftrace_startup_sysctl(void)
1228 saved_ftrace_func = NULL; 1232 saved_ftrace_func = NULL;
1229 /* ftrace_start_up is true if we want ftrace running */ 1233 /* ftrace_start_up is true if we want ftrace running */
1230 if (ftrace_start_up) 1234 if (ftrace_start_up)
1231 command |= FTRACE_ENABLE_CALLS; 1235 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1232
1233 ftrace_run_update_code(command);
1234} 1236}
1235 1237
1236static void ftrace_shutdown_sysctl(void) 1238static void ftrace_shutdown_sysctl(void)
1237{ 1239{
1238 int command = FTRACE_DISABLE_MCOUNT;
1239
1240 if (unlikely(ftrace_disabled)) 1240 if (unlikely(ftrace_disabled))
1241 return; 1241 return;
1242 1242
1243 /* ftrace_start_up is true if ftrace is running */ 1243 /* ftrace_start_up is true if ftrace is running */
1244 if (ftrace_start_up) 1244 if (ftrace_start_up)
1245 command |= FTRACE_DISABLE_CALLS; 1245 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
1246
1247 ftrace_run_update_code(command);
1248} 1246}
1249 1247
1250static cycle_t ftrace_update_time; 1248static cycle_t ftrace_update_time;
@@ -1361,24 +1359,29 @@ enum {
1361#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1359#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1362 1360
1363struct ftrace_iterator { 1361struct ftrace_iterator {
1364 struct ftrace_page *pg; 1362 loff_t pos;
1365 int hidx; 1363 loff_t func_pos;
1366 int idx; 1364 struct ftrace_page *pg;
1367 unsigned flags; 1365 struct dyn_ftrace *func;
1368 struct trace_parser parser; 1366 struct ftrace_func_probe *probe;
1367 struct trace_parser parser;
1368 int hidx;
1369 int idx;
1370 unsigned flags;
1369}; 1371};
1370 1372
1371static void * 1373static void *
1372t_hash_next(struct seq_file *m, void *v, loff_t *pos) 1374t_hash_next(struct seq_file *m, loff_t *pos)
1373{ 1375{
1374 struct ftrace_iterator *iter = m->private; 1376 struct ftrace_iterator *iter = m->private;
1375 struct hlist_node *hnd = v; 1377 struct hlist_node *hnd = NULL;
1376 struct hlist_head *hhd; 1378 struct hlist_head *hhd;
1377 1379
1378 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
1379
1380 (*pos)++; 1380 (*pos)++;
1381 iter->pos = *pos;
1381 1382
1383 if (iter->probe)
1384 hnd = &iter->probe->node;
1382 retry: 1385 retry:
1383 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 1386 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
1384 return NULL; 1387 return NULL;
@@ -1401,7 +1404,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
1401 } 1404 }
1402 } 1405 }
1403 1406
1404 return hnd; 1407 if (WARN_ON_ONCE(!hnd))
1408 return NULL;
1409
1410 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
1411
1412 return iter;
1405} 1413}
1406 1414
1407static void *t_hash_start(struct seq_file *m, loff_t *pos) 1415static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1410,26 +1418,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1410 void *p = NULL; 1418 void *p = NULL;
1411 loff_t l; 1419 loff_t l;
1412 1420
1413 if (!(iter->flags & FTRACE_ITER_HASH)) 1421 if (iter->func_pos > *pos)
1414 *pos = 0; 1422 return NULL;
1415
1416 iter->flags |= FTRACE_ITER_HASH;
1417 1423
1418 iter->hidx = 0; 1424 iter->hidx = 0;
1419 for (l = 0; l <= *pos; ) { 1425 for (l = 0; l <= (*pos - iter->func_pos); ) {
1420 p = t_hash_next(m, p, &l); 1426 p = t_hash_next(m, &l);
1421 if (!p) 1427 if (!p)
1422 break; 1428 break;
1423 } 1429 }
1424 return p; 1430 if (!p)
1431 return NULL;
1432
1433 /* Only set this if we have an item */
1434 iter->flags |= FTRACE_ITER_HASH;
1435
1436 return iter;
1425} 1437}
1426 1438
1427static int t_hash_show(struct seq_file *m, void *v) 1439static int
1440t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
1428{ 1441{
1429 struct ftrace_func_probe *rec; 1442 struct ftrace_func_probe *rec;
1430 struct hlist_node *hnd = v;
1431 1443
1432 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1444 rec = iter->probe;
1445 if (WARN_ON_ONCE(!rec))
1446 return -EIO;
1433 1447
1434 if (rec->ops->print) 1448 if (rec->ops->print)
1435 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1449 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1450,12 +1464,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1450 struct dyn_ftrace *rec = NULL; 1464 struct dyn_ftrace *rec = NULL;
1451 1465
1452 if (iter->flags & FTRACE_ITER_HASH) 1466 if (iter->flags & FTRACE_ITER_HASH)
1453 return t_hash_next(m, v, pos); 1467 return t_hash_next(m, pos);
1454 1468
1455 (*pos)++; 1469 (*pos)++;
1470 iter->pos = *pos;
1456 1471
1457 if (iter->flags & FTRACE_ITER_PRINTALL) 1472 if (iter->flags & FTRACE_ITER_PRINTALL)
1458 return NULL; 1473 return t_hash_start(m, pos);
1459 1474
1460 retry: 1475 retry:
1461 if (iter->idx >= iter->pg->index) { 1476 if (iter->idx >= iter->pg->index) {
@@ -1484,7 +1499,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1484 } 1499 }
1485 } 1500 }
1486 1501
1487 return rec; 1502 if (!rec)
1503 return t_hash_start(m, pos);
1504
1505 iter->func_pos = *pos;
1506 iter->func = rec;
1507
1508 return iter;
1509}
1510
1511static void reset_iter_read(struct ftrace_iterator *iter)
1512{
1513 iter->pos = 0;
1514 iter->func_pos = 0;
1515 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
1488} 1516}
1489 1517
1490static void *t_start(struct seq_file *m, loff_t *pos) 1518static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1495,6 +1523,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1495 1523
1496 mutex_lock(&ftrace_lock); 1524 mutex_lock(&ftrace_lock);
1497 /* 1525 /*
1526 * If an lseek was done, then reset and start from beginning.
1527 */
1528 if (*pos < iter->pos)
1529 reset_iter_read(iter);
1530
1531 /*
1498 * For set_ftrace_filter reading, if we have the filter 1532 * For set_ftrace_filter reading, if we have the filter
1499 * off, we can short cut and just print out that all 1533 * off, we can short cut and just print out that all
1500 * functions are enabled. 1534 * functions are enabled.
@@ -1503,12 +1537,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1503 if (*pos > 0) 1537 if (*pos > 0)
1504 return t_hash_start(m, pos); 1538 return t_hash_start(m, pos);
1505 iter->flags |= FTRACE_ITER_PRINTALL; 1539 iter->flags |= FTRACE_ITER_PRINTALL;
1540 /* reset in case of seek/pread */
1541 iter->flags &= ~FTRACE_ITER_HASH;
1506 return iter; 1542 return iter;
1507 } 1543 }
1508 1544
1509 if (iter->flags & FTRACE_ITER_HASH) 1545 if (iter->flags & FTRACE_ITER_HASH)
1510 return t_hash_start(m, pos); 1546 return t_hash_start(m, pos);
1511 1547
1548 /*
1549 * Unfortunately, we need to restart at ftrace_pages_start
 1550 * every time we let go of the ftrace_lock. This is because

1551 * those pointers can change without the lock.
1552 */
1512 iter->pg = ftrace_pages_start; 1553 iter->pg = ftrace_pages_start;
1513 iter->idx = 0; 1554 iter->idx = 0;
1514 for (l = 0; l <= *pos; ) { 1555 for (l = 0; l <= *pos; ) {
@@ -1517,10 +1558,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1517 break; 1558 break;
1518 } 1559 }
1519 1560
1520 if (!p && iter->flags & FTRACE_ITER_FILTER) 1561 if (!p) {
1521 return t_hash_start(m, pos); 1562 if (iter->flags & FTRACE_ITER_FILTER)
1563 return t_hash_start(m, pos);
1522 1564
1523 return p; 1565 return NULL;
1566 }
1567
1568 return iter;
1524} 1569}
1525 1570
1526static void t_stop(struct seq_file *m, void *p) 1571static void t_stop(struct seq_file *m, void *p)
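
Together these hunks make the set_ftrace_filter seq_file safe against lseek() and pread(): the iterator records the position it has handed out (iter->pos, iter->func_pos), t_start() resets its state when asked for an earlier offset, and t_next() chains from the function list into the probe hash instead of returning raw hlist nodes. The core idea, position caching with a rewind on backwards movement, in a small stand-alone form (types and names are illustrative, not the kernel's):

#include <stdio.h>

struct demo_iter {
	long pos;        /* last position handed out */
	int  idx;        /* progress through the backing array */
};

static const int items[] = { 10, 20, 30, 40 };
#define N_ITEMS (int)(sizeof(items) / sizeof(items[0]))

static void iter_reset(struct demo_iter *it)
{
	it->pos = 0;
	it->idx = 0;
}

/* start(): like t_start(), rewind when the requested position moved
 * backwards, then walk forward to the requested offset. */
static const int *iter_start(struct demo_iter *it, long pos)
{
	if (pos < it->pos)
		iter_reset(it);

	while (it->pos < pos && it->idx < N_ITEMS) {
		it->idx++;
		it->pos++;
	}
	return it->idx < N_ITEMS ? &items[it->idx] : NULL;
}

/* next(): advance one element, remembering the new position. */
static const int *iter_next(struct demo_iter *it, long *pos)
{
	(*pos)++;
	it->pos = *pos;
	it->idx++;
	return it->idx < N_ITEMS ? &items[it->idx] : NULL;
}

int main(void)
{
	struct demo_iter it = { 0, 0 };
	long pos = 2;
	const int *p;

	for (p = iter_start(&it, pos); p; p = iter_next(&it, &pos))
		printf("%d ", *p);              /* 30 40 */
	printf("\n");

	pos = 0;                                /* a pread() from offset 0 */
	for (p = iter_start(&it, pos); p; p = iter_next(&it, &pos))
		printf("%d ", *p);              /* 10 20 30 40 */
	printf("\n");
	return 0;
}
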
@@ -1531,16 +1576,18 @@ static void t_stop(struct seq_file *m, void *p)
1531static int t_show(struct seq_file *m, void *v) 1576static int t_show(struct seq_file *m, void *v)
1532{ 1577{
1533 struct ftrace_iterator *iter = m->private; 1578 struct ftrace_iterator *iter = m->private;
1534 struct dyn_ftrace *rec = v; 1579 struct dyn_ftrace *rec;
1535 1580
1536 if (iter->flags & FTRACE_ITER_HASH) 1581 if (iter->flags & FTRACE_ITER_HASH)
1537 return t_hash_show(m, v); 1582 return t_hash_show(m, iter);
1538 1583
1539 if (iter->flags & FTRACE_ITER_PRINTALL) { 1584 if (iter->flags & FTRACE_ITER_PRINTALL) {
1540 seq_printf(m, "#### all functions enabled ####\n"); 1585 seq_printf(m, "#### all functions enabled ####\n");
1541 return 0; 1586 return 0;
1542 } 1587 }
1543 1588
1589 rec = iter->func;
1590
1544 if (!rec) 1591 if (!rec)
1545 return 0; 1592 return 0;
1546 1593
@@ -1592,8 +1639,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
1592 1639
1593 ret = ftrace_avail_open(inode, file); 1640 ret = ftrace_avail_open(inode, file);
1594 if (!ret) { 1641 if (!ret) {
1595 m = (struct seq_file *)file->private_data; 1642 m = file->private_data;
1596 iter = (struct ftrace_iterator *)m->private; 1643 iter = m->private;
1597 iter->flags = FTRACE_ITER_FAILURES; 1644 iter->flags = FTRACE_ITER_FAILURES;
1598 } 1645 }
1599 1646
@@ -1883,7 +1930,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1883 struct hlist_head *hhd; 1930 struct hlist_head *hhd;
1884 struct hlist_node *n; 1931 struct hlist_node *n;
1885 unsigned long key; 1932 unsigned long key;
1886 int resched;
1887 1933
1888 key = hash_long(ip, FTRACE_HASH_BITS); 1934 key = hash_long(ip, FTRACE_HASH_BITS);
1889 1935
@@ -1897,12 +1943,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1897 * period. This syncs the hash iteration and freeing of items 1943 * period. This syncs the hash iteration and freeing of items
1898 * on the hash. rcu_read_lock is too dangerous here. 1944 * on the hash. rcu_read_lock is too dangerous here.
1899 */ 1945 */
1900 resched = ftrace_preempt_disable(); 1946 preempt_disable_notrace();
1901 hlist_for_each_entry_rcu(entry, n, hhd, node) { 1947 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1902 if (entry->ip == ip) 1948 if (entry->ip == ip)
1903 entry->ops->func(ip, parent_ip, &entry->data); 1949 entry->ops->func(ip, parent_ip, &entry->data);
1904 } 1950 }
1905 ftrace_preempt_enable(resched); 1951 preempt_enable_notrace();
1906} 1952}
1907 1953
1908static struct ftrace_ops trace_probe_ops __read_mostly = 1954static struct ftrace_ops trace_probe_ops __read_mostly =
@@ -2624,6 +2670,7 @@ static const struct file_operations ftrace_graph_fops = {
2624 .read = seq_read, 2670 .read = seq_read,
2625 .write = ftrace_graph_write, 2671 .write = ftrace_graph_write,
2626 .release = ftrace_graph_release, 2672 .release = ftrace_graph_release,
2673 .llseek = seq_lseek,
2627}; 2674};
2628#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2675#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2629 2676
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index bbfc1bb1660b..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,529 +0,0 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
11#include <linux/debugfs.h>
12#include <linux/dcache.h>
13#include <linux/fs.h>
14
15#include <linux/kmemtrace.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20/* Select an alternative, minimalistic output than the original one */
21#define TRACE_KMEM_OPT_MINIMAL 0x1
22
23static struct tracer_opt kmem_opts[] = {
24 /* Default disable the minimalistic output */
25 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
26 { }
27};
28
29static struct tracer_flags kmem_tracer_flags = {
30 .val = 0,
31 .opts = kmem_opts
32};
33
34static struct trace_array *kmemtrace_array;
35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
46 struct trace_array *tr = kmemtrace_array;
47 struct kmemtrace_alloc_entry *entry;
48 struct ring_buffer_event *event;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
51 if (!event)
52 return;
53
54 entry = ring_buffer_event_data(event);
55 tracing_generic_entry_update(&entry->ent, 0, 0);
56
57 entry->ent.type = TRACE_KMEM_ALLOC;
58 entry->type_id = type_id;
59 entry->call_site = call_site;
60 entry->ptr = ptr;
61 entry->bytes_req = bytes_req;
62 entry->bytes_alloc = bytes_alloc;
63 entry->gfp_flags = gfp_flags;
64 entry->node = node;
65
66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
68
69 trace_wake_up();
70}
71
72static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
73 unsigned long call_site,
74 const void *ptr)
75{
76 struct ftrace_event_call *call = &event_kmem_free;
77 struct trace_array *tr = kmemtrace_array;
78 struct kmemtrace_free_entry *entry;
79 struct ring_buffer_event *event;
80
81 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
82 if (!event)
83 return;
84 entry = ring_buffer_event_data(event);
85 tracing_generic_entry_update(&entry->ent, 0, 0);
86
87 entry->ent.type = TRACE_KMEM_FREE;
88 entry->type_id = type_id;
89 entry->call_site = call_site;
90 entry->ptr = ptr;
91
92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
94
95 trace_wake_up();
96}
97
98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
100 const void *ptr,
101 size_t bytes_req,
102 size_t bytes_alloc,
103 gfp_t gfp_flags)
104{
105 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
106 bytes_req, bytes_alloc, gfp_flags, -1);
107}
108
109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
111 const void *ptr,
112 size_t bytes_req,
113 size_t bytes_alloc,
114 gfp_t gfp_flags)
115{
116 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
117 bytes_req, bytes_alloc, gfp_flags, -1);
118}
119
120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
122 const void *ptr,
123 size_t bytes_req,
124 size_t bytes_alloc,
125 gfp_t gfp_flags,
126 int node)
127{
128 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
129 bytes_req, bytes_alloc, gfp_flags, node);
130}
131
132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
134 const void *ptr,
135 size_t bytes_req,
136 size_t bytes_alloc,
137 gfp_t gfp_flags,
138 int node)
139{
140 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
141 bytes_req, bytes_alloc, gfp_flags, node);
142}
143
144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
146{
147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
148}
149
150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
152{
153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
154}
155
156static int kmemtrace_start_probes(void)
157{
158 int err;
159
160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
161 if (err)
162 return err;
163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
164 if (err)
165 return err;
166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
167 if (err)
168 return err;
169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
170 if (err)
171 return err;
172 err = register_trace_kfree(kmemtrace_kfree, NULL);
173 if (err)
174 return err;
175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
176
177 return err;
178}
179
180static void kmemtrace_stop_probes(void)
181{
182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
186 unregister_trace_kfree(kmemtrace_kfree, NULL);
187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
188}
189
190static int kmem_trace_init(struct trace_array *tr)
191{
192 kmemtrace_array = tr;
193
194 tracing_reset_online_cpus(tr);
195
196 kmemtrace_start_probes();
197
198 return 0;
199}
200
201static void kmem_trace_reset(struct trace_array *tr)
202{
203 kmemtrace_stop_probes();
204}
205
206static void kmemtrace_headers(struct seq_file *s)
207{
208 /* Don't need headers for the original kmemtrace output */
209 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
210 return;
211
212 seq_printf(s, "#\n");
213 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
214 " POINTER NODE CALLER\n");
215 seq_printf(s, "# FREE | | | | "
216 " | | | |\n");
217 seq_printf(s, "# |\n\n");
218}
219
220/*
221 * The following functions give the original output from kmemtrace,
222 * plus the origin CPU, since reordering occurs in-kernel now.
223 */
224
225#define KMEMTRACE_USER_ALLOC 0
226#define KMEMTRACE_USER_FREE 1
227
228struct kmemtrace_user_event {
229 u8 event_id;
230 u8 type_id;
231 u16 event_size;
232 u32 cpu;
233 u64 timestamp;
234 unsigned long call_site;
235 unsigned long ptr;
236};
237
238struct kmemtrace_user_event_alloc {
239 size_t bytes_req;
240 size_t bytes_alloc;
241 unsigned gfp_flags;
242 int node;
243};
244
245static enum print_line_t
246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
248{
249 struct trace_seq *s = &iter->seq;
250 struct kmemtrace_alloc_entry *entry;
251 int ret;
252
253 trace_assign_type(entry, iter->ent);
254
255 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
256 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
257 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
258 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
259 (unsigned long)entry->gfp_flags, entry->node);
260
261 if (!ret)
262 return TRACE_TYPE_PARTIAL_LINE;
263 return TRACE_TYPE_HANDLED;
264}
265
266static enum print_line_t
267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
269{
270 struct trace_seq *s = &iter->seq;
271 struct kmemtrace_free_entry *entry;
272 int ret;
273
274 trace_assign_type(entry, iter->ent);
275
276 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
277 entry->type_id, (void *)entry->call_site,
278 (unsigned long)entry->ptr);
279
280 if (!ret)
281 return TRACE_TYPE_PARTIAL_LINE;
282 return TRACE_TYPE_HANDLED;
283}
284
285static enum print_line_t
286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
288{
289 struct trace_seq *s = &iter->seq;
290 struct kmemtrace_alloc_entry *entry;
291 struct kmemtrace_user_event *ev;
292 struct kmemtrace_user_event_alloc *ev_alloc;
293
294 trace_assign_type(entry, iter->ent);
295
296 ev = trace_seq_reserve(s, sizeof(*ev));
297 if (!ev)
298 return TRACE_TYPE_PARTIAL_LINE;
299
300 ev->event_id = KMEMTRACE_USER_ALLOC;
301 ev->type_id = entry->type_id;
302 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
303 ev->cpu = iter->cpu;
304 ev->timestamp = iter->ts;
305 ev->call_site = entry->call_site;
306 ev->ptr = (unsigned long)entry->ptr;
307
308 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
309 if (!ev_alloc)
310 return TRACE_TYPE_PARTIAL_LINE;
311
312 ev_alloc->bytes_req = entry->bytes_req;
313 ev_alloc->bytes_alloc = entry->bytes_alloc;
314 ev_alloc->gfp_flags = entry->gfp_flags;
315 ev_alloc->node = entry->node;
316
317 return TRACE_TYPE_HANDLED;
318}
319
320static enum print_line_t
321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
323{
324 struct trace_seq *s = &iter->seq;
325 struct kmemtrace_free_entry *entry;
326 struct kmemtrace_user_event *ev;
327
328 trace_assign_type(entry, iter->ent);
329
330 ev = trace_seq_reserve(s, sizeof(*ev));
331 if (!ev)
332 return TRACE_TYPE_PARTIAL_LINE;
333
334 ev->event_id = KMEMTRACE_USER_FREE;
335 ev->type_id = entry->type_id;
336 ev->event_size = sizeof(*ev);
337 ev->cpu = iter->cpu;
338 ev->timestamp = iter->ts;
339 ev->call_site = entry->call_site;
340 ev->ptr = (unsigned long)entry->ptr;
341
342 return TRACE_TYPE_HANDLED;
343}
344
345/* The two other following provide a more minimalistic output */
346static enum print_line_t
347kmemtrace_print_alloc_compress(struct trace_iterator *iter)
348{
349 struct kmemtrace_alloc_entry *entry;
350 struct trace_seq *s = &iter->seq;
351 int ret;
352
353 trace_assign_type(entry, iter->ent);
354
355 /* Alloc entry */
356 ret = trace_seq_printf(s, " + ");
357 if (!ret)
358 return TRACE_TYPE_PARTIAL_LINE;
359
360 /* Type */
361 switch (entry->type_id) {
362 case KMEMTRACE_TYPE_KMALLOC:
363 ret = trace_seq_printf(s, "K ");
364 break;
365 case KMEMTRACE_TYPE_CACHE:
366 ret = trace_seq_printf(s, "C ");
367 break;
368 case KMEMTRACE_TYPE_PAGES:
369 ret = trace_seq_printf(s, "P ");
370 break;
371 default:
372 ret = trace_seq_printf(s, "? ");
373 }
374
375 if (!ret)
376 return TRACE_TYPE_PARTIAL_LINE;
377
378 /* Requested */
379 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
380 if (!ret)
381 return TRACE_TYPE_PARTIAL_LINE;
382
383 /* Allocated */
384 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
385 if (!ret)
386 return TRACE_TYPE_PARTIAL_LINE;
387
388 /* Flags
389 * TODO: would be better to see the name of the GFP flag names
390 */
391 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
392 if (!ret)
393 return TRACE_TYPE_PARTIAL_LINE;
394
395 /* Pointer to allocated */
396 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
397 if (!ret)
398 return TRACE_TYPE_PARTIAL_LINE;
399
400 /* Node and call site*/
401 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
402 (void *)entry->call_site);
403 if (!ret)
404 return TRACE_TYPE_PARTIAL_LINE;
405
406 return TRACE_TYPE_HANDLED;
407}
408
409static enum print_line_t
410kmemtrace_print_free_compress(struct trace_iterator *iter)
411{
412 struct kmemtrace_free_entry *entry;
413 struct trace_seq *s = &iter->seq;
414 int ret;
415
416 trace_assign_type(entry, iter->ent);
417
418 /* Free entry */
419 ret = trace_seq_printf(s, " - ");
420 if (!ret)
421 return TRACE_TYPE_PARTIAL_LINE;
422
423 /* Type */
424 switch (entry->type_id) {
425 case KMEMTRACE_TYPE_KMALLOC:
426 ret = trace_seq_printf(s, "K ");
427 break;
428 case KMEMTRACE_TYPE_CACHE:
429 ret = trace_seq_printf(s, "C ");
430 break;
431 case KMEMTRACE_TYPE_PAGES:
432 ret = trace_seq_printf(s, "P ");
433 break;
434 default:
435 ret = trace_seq_printf(s, "? ");
436 }
437
438 if (!ret)
439 return TRACE_TYPE_PARTIAL_LINE;
440
441 /* Skip requested/allocated/flags */
442 ret = trace_seq_printf(s, " ");
443 if (!ret)
444 return TRACE_TYPE_PARTIAL_LINE;
445
446 /* Pointer to allocated */
447 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
448 if (!ret)
449 return TRACE_TYPE_PARTIAL_LINE;
450
451 /* Skip node and print call site*/
452 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
453 if (!ret)
454 return TRACE_TYPE_PARTIAL_LINE;
455
456 return TRACE_TYPE_HANDLED;
457}
458
459static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
460{
461 struct trace_entry *entry = iter->ent;
462
463 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
464 return TRACE_TYPE_UNHANDLED;
465
466 switch (entry->type) {
467 case TRACE_KMEM_ALLOC:
468 return kmemtrace_print_alloc_compress(iter);
469 case TRACE_KMEM_FREE:
470 return kmemtrace_print_free_compress(iter);
471 default:
472 return TRACE_TYPE_UNHANDLED;
473 }
474}
475
476static struct trace_event_functions kmem_trace_alloc_funcs = {
477 .trace = kmemtrace_print_alloc,
478 .binary = kmemtrace_print_alloc_user,
479};
480
481static struct trace_event kmem_trace_alloc = {
482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
487 .trace = kmemtrace_print_free,
488 .binary = kmemtrace_print_free_user,
489};
490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
496static struct tracer kmem_tracer __read_mostly = {
497 .name = "kmemtrace",
498 .init = kmem_trace_init,
499 .reset = kmem_trace_reset,
500 .print_line = kmemtrace_print_line,
501 .print_header = kmemtrace_headers,
502 .flags = &kmem_tracer_flags
503};
504
505void kmemtrace_init(void)
506{
507 /* earliest opportunity to start kmem tracing */
508}
509
510static int __init init_kmem_tracer(void)
511{
512 if (!register_ftrace_event(&kmem_trace_alloc)) {
513 pr_warning("Warning: could not register kmem events\n");
514 return 1;
515 }
516
517 if (!register_ftrace_event(&kmem_trace_free)) {
518 pr_warning("Warning: could not register kmem events\n");
519 return 1;
520 }
521
522 if (register_tracer(&kmem_tracer) != 0) {
523 pr_warning("Warning: could not register the kmem tracer\n");
524 return 1;
525 }
526
527 return 0;
528}
529device_initcall(init_kmem_tracer);
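
With the kmemtrace tracer removed, the allocator tracepoints it hooked (kmalloc, kmem_cache_alloc, kfree and friends) remain exported as ordinary trace events, so the data stays reachable without a dedicated tracer. One detail worth noting in the deleted kmemtrace_start_probes(): it returned on the first registration failure while the probes registered earlier stayed attached; the usual idiom is to unwind the ones already in place. A stand-alone sketch of that rollback pattern, with hypothetical probe names and register/unregister stubs:

#include <stdio.h>

/* Hypothetical probe table: each entry knows how to attach and detach. */
struct probe {
	const char *name;
	int  (*reg)(void);
	void (*unreg)(void);
};

static int ok_reg(void)      { return 0; }
static int bad_reg(void)     { return -1; }   /* simulate a failure */
static void noop_unreg(void) { }

static struct probe probes[] = {
	{ "kmalloc",          ok_reg,  noop_unreg },
	{ "kmem_cache_alloc", ok_reg,  noop_unreg },
	{ "kfree",            bad_reg, noop_unreg },
};
#define N_PROBES (int)(sizeof(probes) / sizeof(probes[0]))

/* Register every probe; on failure, detach the ones already attached
 * so the caller is left in a clean state. */
static int start_probes(void)
{
	int i, err = 0;

	for (i = 0; i < N_PROBES; i++) {
		err = probes[i].reg();
		if (err)
			goto unwind;
	}
	return 0;

unwind:
	fprintf(stderr, "failed to register %s\n", probes[i].name);
	while (--i >= 0)
		probes[i].unreg();
	return err;
}

int main(void)
{
	return start_probes() ? 1 : 0;
}
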
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1da7b6ea8b85..9ed509a015d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 224 RB_LEN_TIME_STAMP = 16,
225}; 225};
226 226
227#define skip_time_extend(event) \
228 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
229
227static inline int rb_null_event(struct ring_buffer_event *event) 230static inline int rb_null_event(struct ring_buffer_event *event)
228{ 231{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 232 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 251 return length + RB_EVNT_HDR_SIZE;
249} 252}
250 253
251/* inline for ring buffer fast paths */ 254/*
252static unsigned 255 * Return the length of the given event. Will return
256 * the length of the time extend if the event is a
257 * time extend.
258 */
259static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 260rb_event_length(struct ring_buffer_event *event)
254{ 261{
255 switch (event->type_len) { 262 switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 281 return 0;
275} 282}
276 283
284/*
285 * Return total length of time extend and data,
286 * or just the event length for all other events.
287 */
288static inline unsigned
289rb_event_ts_length(struct ring_buffer_event *event)
290{
291 unsigned len = 0;
292
293 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
294 /* time extends include the data event after it */
295 len = RB_LEN_TIME_EXTEND;
296 event = skip_time_extend(event);
297 }
298 return len + rb_event_length(event);
299}
300
277/** 301/**
278 * ring_buffer_event_length - return the length of the event 302 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 303 * @event: the event to get the length of
304 *
305 * Returns the size of the data load of a data event.
306 * If the event is something other than a data event, it
307 * returns the size of the event itself. With the exception
308 * of a TIME EXTEND, where it still returns the size of the
309 * data load of the data event after it.
280 */ 310 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 311unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 312{
283 unsigned length = rb_event_length(event); 313 unsigned length;
314
315 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
316 event = skip_time_extend(event);
317
318 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 319 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 320 return length;
286 length -= RB_EVNT_HDR_SIZE; 321 length -= RB_EVNT_HDR_SIZE;
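
After this change a time extend is no longer a free-standing record: it is written directly in front of its data event inside the same reservation. The helpers therefore answer different questions: rb_event_length() reports just the record it is handed, rb_event_ts_length() reports a time extend plus the data event stored behind it, and ring_buffer_event_length() steps over the extend before sizing the payload. A toy model of that bookkeeping, with invented record sizes rather than the real RB_LEN_* constants:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Toy event stream: an optional fixed-size time-extend record may sit
 * directly in front of a data event. */
enum { EV_DATA, EV_TIME_EXTEND };

struct ev {
	int    type;
	size_t len;                 /* length of this record only */
};

#define TIME_EXTEND_LEN 8       /* invented size, for illustration */

static const struct ev *skip_time_extend(const struct ev *e)
{
	return e + 1;               /* the data event is stored right behind it */
}

static size_t ev_length(const struct ev *e)
{
	return e->type == EV_TIME_EXTEND ? TIME_EXTEND_LEN : e->len;
}

/* Counterpart of rb_event_ts_length(): extend + the data event behind it. */
static size_t ev_ts_length(const struct ev *e)
{
	size_t len = 0;

	if (e->type == EV_TIME_EXTEND) {
		len = TIME_EXTEND_LEN;
		e = skip_time_extend(e);
	}
	return len + ev_length(e);
}

int main(void)
{
	struct ev stream[2] = {
		{ EV_TIME_EXTEND, TIME_EXTEND_LEN },
		{ EV_DATA,        24 },
	};

	assert(ev_length(&stream[0]) == 8);      /* the extend alone        */
	assert(ev_ts_length(&stream[0]) == 32);  /* extend + its data event */
	assert(ev_ts_length(&stream[1]) == 24);  /* plain data event        */
	printf("ok\n");
	return 0;
}
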
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 329static void *
295rb_event_data(struct ring_buffer_event *event) 330rb_event_data(struct ring_buffer_event *event)
296{ 331{
332 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
333 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 334 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 335 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 336 if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 441/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 442#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 443
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 444int ring_buffer_print_page_header(struct trace_seq *s)
411{ 445{
412 struct buffer_data_page field; 446 struct buffer_data_page field;
@@ -443,6 +477,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
443 */ 477 */
444struct ring_buffer_per_cpu { 478struct ring_buffer_per_cpu {
445 int cpu; 479 int cpu;
480 atomic_t record_disabled;
446 struct ring_buffer *buffer; 481 struct ring_buffer *buffer;
447 spinlock_t reader_lock; /* serialize readers */ 482 spinlock_t reader_lock; /* serialize readers */
448 arch_spinlock_t lock; 483 arch_spinlock_t lock;
@@ -462,7 +497,6 @@ struct ring_buffer_per_cpu {
462 unsigned long read; 497 unsigned long read;
463 u64 write_stamp; 498 u64 write_stamp;
464 u64 read_stamp; 499 u64 read_stamp;
465 atomic_t record_disabled;
466}; 500};
467 501
468struct ring_buffer { 502struct ring_buffer {
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1580 iter->head = 0;
1547} 1581}
1548 1582
1583/* Slow path, do not inline */
1584static noinline struct ring_buffer_event *
1585rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1586{
1587 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1588
1589 /* Not the first event on the page? */
1590 if (rb_event_index(event)) {
1591 event->time_delta = delta & TS_MASK;
1592 event->array[0] = delta >> TS_SHIFT;
1593 } else {
1594 /* nope, just zero it */
1595 event->time_delta = 0;
1596 event->array[0] = 0;
1597 }
1598
1599 return skip_time_extend(event);
1600}
1601
1549/** 1602/**
1550 * ring_buffer_update_event - update event type and data 1603 * ring_buffer_update_event - update event type and data
 1551 * @event: the event to update 1604 * @event: the event to update
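
rb_add_time_stamp() splits the oversized delta across the two fields of the extend record: the low TS_SHIFT bits land in time_delta and the remainder in array[0]; the rb_update_write_stamp() hunk later in this patch reassembles it as (array[0] << TS_SHIFT) + time_delta. A worked round trip of that packing; the 27-bit width is an assumption about TS_SHIFT, and the struct below is a stand-in, not the real event layout:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_TS_SHIFT 27                        /* assumed value of TS_SHIFT */
#define DEMO_TS_MASK  ((1ULL << DEMO_TS_SHIFT) - 1)

struct demo_extend {
	uint32_t time_delta;                    /* low TS_SHIFT bits */
	uint32_t array0;                        /* remaining high bits */
};

static void pack_delta(struct demo_extend *e, uint64_t delta)
{
	e->time_delta = delta & DEMO_TS_MASK;
	e->array0     = delta >> DEMO_TS_SHIFT;
}

static uint64_t unpack_delta(const struct demo_extend *e)
{
	return ((uint64_t)e->array0 << DEMO_TS_SHIFT) + e->time_delta;
}

int main(void)
{
	struct demo_extend e;
	uint64_t delta = (1ULL << 40) + 12345;  /* too big for 27 bits */

	pack_delta(&e, delta);
	assert(unpack_delta(&e) == delta);
	printf("delta=%llu low=%u high=%u\n", (unsigned long long)delta,
	       (unsigned)e.time_delta, (unsigned)e.array0);
	return 0;
}
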
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1611 * data field.
1559 */ 1612 */
1560static void 1613static void
1561rb_update_event(struct ring_buffer_event *event, 1614rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1615 struct ring_buffer_event *event, unsigned length,
1616 int add_timestamp, u64 delta)
1563{ 1617{
1564 event->type_len = type; 1618 /* Only a commit updates the timestamp */
1565 1619 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1620 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1621
1573 case 0: 1622 /*
1574 length -= RB_EVNT_HDR_SIZE; 1623 * If we need to add a timestamp, then we
 1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1624 * add it to the start of the reserved space.
1576 event->array[0] = length; 1625 */
1577 else 1626 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1627 event = rb_add_time_stamp(event, delta);
1579 break; 1628 length -= RB_LEN_TIME_EXTEND;
1580 default: 1629 delta = 0;
1581 BUG();
1582 } 1630 }
1631
1632 event->time_delta = delta;
1633 length -= RB_EVNT_HDR_SIZE;
1634 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1635 event->type_len = 0;
1636 event->array[0] = length;
1637 } else
1638 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1639}
1584 1640
1585/* 1641/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1879 local_sub(length, &tail_page->write);
1824} 1880}
1825 1881
1826static struct ring_buffer_event * 1882/*
1883 * This is the slow path, force gcc not to inline it.
1884 */
1885static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1886rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1887 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1888 struct buffer_page *tail_page, u64 ts)
1830{ 1889{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1890 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1891 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1968 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1969 * just reread the time stamp
1911 */ 1970 */
1912 *ts = rb_time_stamp(buffer); 1971 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1972 next_page->page->time_stamp = ts;
1914 } 1973 }
1915 1974
1916 out_again: 1975 out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1988
1930static struct ring_buffer_event * 1989static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1990__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 1991 unsigned long length, u64 ts,
1992 u64 delta, int add_timestamp)
1933{ 1993{
1934 struct buffer_page *tail_page; 1994 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 1995 struct ring_buffer_event *event;
1936 unsigned long tail, write; 1996 unsigned long tail, write;
1937 1997
1998 /*
1999 * If the time delta since the last event is too big to
2000 * hold in the time field of the event, then we append a
2001 * TIME EXTEND event ahead of the data event.
2002 */
2003 if (unlikely(add_timestamp))
2004 length += RB_LEN_TIME_EXTEND;
2005
1938 tail_page = cpu_buffer->tail_page; 2006 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2007 write = local_add_return(length, &tail_page->write);
1940 2008
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2011 tail = write - length;
1944 2012
1945 /* See if we shot pass the end of this buffer page */ 2013 /* See if we shot pass the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2014 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2015 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2016 tail_page, ts);
1949 2017
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2019
1952 event = __rb_page_index(tail_page, tail); 2020 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2021 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2022 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2023
1956 /* The passed in type is zero for DATA */ 2024 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2025
1960 /* 2026 /*
1961 * If this is the first commit on the page, then update 2027 * If this is the first commit on the page, then update
1962 * its timestamp. 2028 * its timestamp.
1963 */ 2029 */
1964 if (!tail) 2030 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2031 tail_page->page->time_stamp = ts;
1966 2032
1967 return event; 2033 return event;
1968} 2034}
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2043 unsigned long addr;
1978 2044
1979 new_index = rb_event_index(event); 2045 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2046 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2047 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2048 addr &= PAGE_MASK;
1983 2049
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2069 return 0;
2004} 2070}
2005 2071
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2072static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2073{
2071 local_inc(&cpu_buffer->committing); 2074 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2075 local_inc(&cpu_buffer->commits);
2073} 2076}
2074 2077
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2078static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2079{
2077 unsigned long commits; 2080 unsigned long commits;
2078 2081
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2113 unsigned long length)
2111{ 2114{
2112 struct ring_buffer_event *event; 2115 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2116 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2117 int nr_loops = 0;
2118 int add_timestamp;
2119 u64 diff;
2116 2120
2117 rb_start_commit(cpu_buffer); 2121 rb_start_commit(cpu_buffer);
2118 2122
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2137
2134 length = rb_calculate_event_length(length); 2138 length = rb_calculate_event_length(length);
2135 again: 2139 again:
2140 add_timestamp = 0;
2141 delta = 0;
2142
2136 /* 2143 /*
2137 * We allow for interrupts to reenter here and do a trace. 2144 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2145 * If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2153 goto out_fail;
2147 2154
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2155 ts = rb_time_stamp(cpu_buffer->buffer);
2156 diff = ts - cpu_buffer->write_stamp;
2149 2157
2150 /* 2158 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2159 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2160
2161 /* Did the write stamp get updated already? */
2162 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2163 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2164 if (unlikely(test_time_stamp(delta))) {
2174 2165 WARN_ONCE(delta > (1ULL << 59),
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2176 if (commit == -EBUSY) 2167 (unsigned long long)delta,
2177 goto out_fail; 2168 (unsigned long long)ts,
2178 2169 (unsigned long long)cpu_buffer->write_stamp);
2179 if (commit == -EAGAIN) 2170 add_timestamp = 1;
2180 goto again;
2181
2182 RB_WARN_ON(cpu_buffer, commit < 0);
2183 } 2171 }
2184 } 2172 }
2185 2173
2186 get_event: 2174 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2175 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2176 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2177 goto again;
2190 2178
2191 if (!event) 2179 if (!event)
2192 goto out_fail; 2180 goto out_fail;
2193 2181
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2182 return event;
2200 2183
2201 out_fail: 2184 out_fail:
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2190
2208#define TRACE_RECURSIVE_DEPTH 16 2191#define TRACE_RECURSIVE_DEPTH 16
2209 2192
2210static int trace_recursive_lock(void) 2193/* Keep this code out of the fast path cache */
2194static noinline void trace_recursive_fail(void)
2211{ 2195{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2196 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2197 tracing_off_permanent();
2219 2198
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
2225 in_nmi()); 2204 in_nmi());
2226 2205
2227 WARN_ON_ONCE(1); 2206 WARN_ON_ONCE(1);
2207}
2208
2209static inline int trace_recursive_lock(void)
2210{
2211 current->trace_recursion++;
2212
2213 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2214 return 0;
2215
2216 trace_recursive_fail();
2217
2228 return -1; 2218 return -1;
2229} 2219}
2230 2220
2231static void trace_recursive_unlock(void) 2221static inline void trace_recursive_unlock(void)
2232{ 2222{
2233 WARN_ON_ONCE(!current->trace_recursion); 2223 WARN_ON_ONCE(!current->trace_recursion);
2234 2224
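
Splitting trace_recursive_lock() this way keeps the hot path to a counter increment and a bounds check that the compiler can inline, while the verbose reporting sits in a noinline function that stays out of the fast path's cache lines. The same split in a generic, user-space form (GCC/Clang attribute syntax; the depth limit and names are illustrative):

#include <stdio.h>

#define MAX_DEPTH 16

static __thread int recursion;     /* per-thread, like current->trace_recursion */

/* Cold path: never inlined, so the fast path stays tiny. */
static __attribute__((noinline)) void recursion_fail(void)
{
	fprintf(stderr, "recursion limit (%d) hit\n", MAX_DEPTH);
}

/* Hot path: cheap check, calls the cold helper only when things go wrong. */
static inline int recursion_lock(void)
{
	if (++recursion < MAX_DEPTH)
		return 0;
	recursion_fail();
	return -1;
}

static inline void recursion_unlock(void)
{
	recursion--;
}

int main(void)
{
	if (!recursion_lock()) {
		puts("in the guarded section");
		recursion_unlock();
	}
	return 0;
}
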
@@ -2242,8 +2232,6 @@ static void trace_recursive_unlock(void)
2242 2232
2243#endif 2233#endif
2244 2234
2245static DEFINE_PER_CPU(int, rb_need_resched);
2246
2247/** 2235/**
2248 * ring_buffer_lock_reserve - reserve a part of the buffer 2236 * ring_buffer_lock_reserve - reserve a part of the buffer
2249 * @buffer: the ring buffer to reserve from 2237 * @buffer: the ring buffer to reserve from
@@ -2264,13 +2252,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2264{ 2252{
2265 struct ring_buffer_per_cpu *cpu_buffer; 2253 struct ring_buffer_per_cpu *cpu_buffer;
2266 struct ring_buffer_event *event; 2254 struct ring_buffer_event *event;
2267 int cpu, resched; 2255 int cpu;
2268 2256
2269 if (ring_buffer_flags != RB_BUFFERS_ON) 2257 if (ring_buffer_flags != RB_BUFFERS_ON)
2270 return NULL; 2258 return NULL;
2271 2259
2272 /* If we are tracing schedule, we don't want to recurse */ 2260 /* If we are tracing schedule, we don't want to recurse */
2273 resched = ftrace_preempt_disable(); 2261 preempt_disable_notrace();
2274 2262
2275 if (atomic_read(&buffer->record_disabled)) 2263 if (atomic_read(&buffer->record_disabled))
2276 goto out_nocheck; 2264 goto out_nocheck;
@@ -2295,21 +2283,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2295 if (!event) 2283 if (!event)
2296 goto out; 2284 goto out;
2297 2285
2298 /*
2299 * Need to store resched state on this cpu.
2300 * Only the first needs to.
2301 */
2302
2303 if (preempt_count() == 1)
2304 per_cpu(rb_need_resched, cpu) = resched;
2305
2306 return event; 2286 return event;
2307 2287
2308 out: 2288 out:
2309 trace_recursive_unlock(); 2289 trace_recursive_unlock();
2310 2290
2311 out_nocheck: 2291 out_nocheck:
2312 ftrace_preempt_enable(resched); 2292 preempt_enable_notrace();
2313 return NULL; 2293 return NULL;
2314} 2294}
2315EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2295EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2318,12 +2298,28 @@ static void
2318rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2298rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2319 struct ring_buffer_event *event) 2299 struct ring_buffer_event *event)
2320{ 2300{
2301 u64 delta;
2302
2321 /* 2303 /*
2322 * The event first in the commit queue updates the 2304 * The event first in the commit queue updates the
2323 * time stamp. 2305 * time stamp.
2324 */ 2306 */
2325 if (rb_event_is_commit(cpu_buffer, event)) 2307 if (rb_event_is_commit(cpu_buffer, event)) {
2326 cpu_buffer->write_stamp += event->time_delta; 2308 /*
2309 * A commit event that is first on a page
2310 * updates the write timestamp with the page stamp
2311 */
2312 if (!rb_event_index(event))
2313 cpu_buffer->write_stamp =
2314 cpu_buffer->commit_page->page->time_stamp;
2315 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2316 delta = event->array[0];
2317 delta <<= TS_SHIFT;
2318 delta += event->time_delta;
2319 cpu_buffer->write_stamp += delta;
2320 } else
2321 cpu_buffer->write_stamp += event->time_delta;
2322 }
2327} 2323}
2328 2324
2329static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2325static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2355,13 +2351,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2355 2351
2356 trace_recursive_unlock(); 2352 trace_recursive_unlock();
2357 2353
2358 /* 2354 preempt_enable_notrace();
2359 * Only the last preempt count needs to restore preemption.
2360 */
2361 if (preempt_count() == 1)
2362 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2363 else
2364 preempt_enable_no_resched_notrace();
2365 2355
2366 return 0; 2356 return 0;
2367} 2357}
@@ -2369,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2369 2359
2370static inline void rb_event_discard(struct ring_buffer_event *event) 2360static inline void rb_event_discard(struct ring_buffer_event *event)
2371{ 2361{
2362 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2363 event = skip_time_extend(event);
2364
2372 /* array[0] holds the actual length for the discarded event */ 2365 /* array[0] holds the actual length for the discarded event */
2373 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2366 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2374 event->type_len = RINGBUF_TYPE_PADDING; 2367 event->type_len = RINGBUF_TYPE_PADDING;
@@ -2469,13 +2462,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
2469 2462
2470 trace_recursive_unlock(); 2463 trace_recursive_unlock();
2471 2464
2472 /* 2465 preempt_enable_notrace();
2473 * Only the last preempt count needs to restore preemption.
2474 */
2475 if (preempt_count() == 1)
2476 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2477 else
2478 preempt_enable_no_resched_notrace();
2479 2466
2480} 2467}
2481EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); 2468EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2501,12 +2488,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
2501 struct ring_buffer_event *event; 2488 struct ring_buffer_event *event;
2502 void *body; 2489 void *body;
2503 int ret = -EBUSY; 2490 int ret = -EBUSY;
2504 int cpu, resched; 2491 int cpu;
2505 2492
2506 if (ring_buffer_flags != RB_BUFFERS_ON) 2493 if (ring_buffer_flags != RB_BUFFERS_ON)
2507 return -EBUSY; 2494 return -EBUSY;
2508 2495
2509 resched = ftrace_preempt_disable(); 2496 preempt_disable_notrace();
2510 2497
2511 if (atomic_read(&buffer->record_disabled)) 2498 if (atomic_read(&buffer->record_disabled))
2512 goto out; 2499 goto out;
@@ -2536,7 +2523,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
2536 2523
2537 ret = 0; 2524 ret = 0;
2538 out: 2525 out:
2539 ftrace_preempt_enable(resched); 2526 preempt_enable_notrace();
2540 2527
2541 return ret; 2528 return ret;
2542} 2529}
@@ -2628,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2628} 2615}
2629EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2616EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2630 2617
2618/*
2619 * The total entries in the ring buffer is the running counter
2620 * of entries entered into the ring buffer, minus the sum of
2621 * the entries read from the ring buffer and the number of
2622 * entries that were overwritten.
2623 */
2624static inline unsigned long
2625rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2626{
2627 return local_read(&cpu_buffer->entries) -
2628 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2629}
2630
2631/** 2631/**
2632 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2632 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2633 * @buffer: The ring buffer 2633 * @buffer: The ring buffer
@@ -2636,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2636unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2636unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2637{ 2637{
2638 struct ring_buffer_per_cpu *cpu_buffer; 2638 struct ring_buffer_per_cpu *cpu_buffer;
2639 unsigned long ret;
2640 2639
2641 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2640 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2642 return 0; 2641 return 0;
2643 2642
2644 cpu_buffer = buffer->buffers[cpu]; 2643 cpu_buffer = buffer->buffers[cpu];
2645 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2646 - cpu_buffer->read;
2647 2644
2648 return ret; 2645 return rb_num_of_entries(cpu_buffer);
2649} 2646}
2650EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2647EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2651 2648
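
rb_num_of_entries() centralizes the accounting that ring_buffer_entries_cpu() and ring_buffer_entries() previously open-coded: live entries are everything written minus everything already read or lost to overwrite. The same arithmetic with plain counters standing in for the local_t fields:

#include <stdio.h>

/* Illustrative counters only; the kernel uses local_t for entries/overrun. */
struct cpu_buf_stats {
        unsigned long entries;   /* total events ever written  */
        unsigned long overrun;   /* events overwritten (lost)  */
        unsigned long read;      /* events already consumed    */
};

static unsigned long num_entries(const struct cpu_buf_stats *s)
{
        return s->entries - (s->overrun + s->read);
}

int main(void)
{
        struct cpu_buf_stats s = { .entries = 1000, .overrun = 40, .read = 900 };

        printf("entries still in the buffer: %lu\n", num_entries(&s)); /* 60 */
        return 0;
}
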
@@ -2706,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2706 /* if you care about this being correct, lock the buffer */ 2703 /* if you care about this being correct, lock the buffer */
2707 for_each_buffer_cpu(buffer, cpu) { 2704 for_each_buffer_cpu(buffer, cpu) {
2708 cpu_buffer = buffer->buffers[cpu]; 2705 cpu_buffer = buffer->buffers[cpu];
2709 entries += (local_read(&cpu_buffer->entries) - 2706 entries += rb_num_of_entries(cpu_buffer);
2710 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2711 } 2707 }
2712 2708
2713 return entries; 2709 return entries;
@@ -3007,13 +3003,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
3007 3003
3008static void rb_advance_iter(struct ring_buffer_iter *iter) 3004static void rb_advance_iter(struct ring_buffer_iter *iter)
3009{ 3005{
3010 struct ring_buffer *buffer;
3011 struct ring_buffer_per_cpu *cpu_buffer; 3006 struct ring_buffer_per_cpu *cpu_buffer;
3012 struct ring_buffer_event *event; 3007 struct ring_buffer_event *event;
3013 unsigned length; 3008 unsigned length;
3014 3009
3015 cpu_buffer = iter->cpu_buffer; 3010 cpu_buffer = iter->cpu_buffer;
3016 buffer = cpu_buffer->buffer;
3017 3011
3018 /* 3012 /*
3019 * Check if we are at the end of the buffer. 3013 * Check if we are at the end of the buffer.
@@ -3064,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3064 3058
3065 again: 3059 again:
3066 /* 3060 /*
3067 * We repeat when a timestamp is encountered. It is possible 3061 * We repeat when a time extend is encountered.
3068 * to get multiple timestamps from an interrupt entering just 3062 * Since the time extend is always attached to a data event,
3069 * as one timestamp is about to be written, or from discarded 3063 * we should never loop more than once.
3070 * commits. The most that we can have is the number on a single page. 3064 * (We never hit the following condition more than twice).
3071 */ 3065 */
3072 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3066 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3073 return NULL; 3067 return NULL;
3074 3068
3075 reader = rb_get_reader_page(cpu_buffer); 3069 reader = rb_get_reader_page(cpu_buffer);
@@ -3145,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3145 return NULL; 3139 return NULL;
3146 3140
3147 /* 3141 /*
3148 * We repeat when a timestamp is encountered. 3142 * We repeat when a time extend is encountered.
3149 * We can get multiple timestamps by nested interrupts or also 3143 * Since the time extend is always attached to a data event,
3150 * if filtering is on (discarding commits). Since discarding 3144 * we should never loop more than once.
3151 * commits can be frequent we can get a lot of timestamps. 3145 * (We never hit the following condition more than twice).
3152 * But we limit them by not adding timestamps if they begin
3153 * at the start of a page.
3154 */ 3146 */
3155 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3147 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3156 return NULL; 3148 return NULL;
3157 3149
3158 if (rb_per_cpu_empty(cpu_buffer)) 3150 if (rb_per_cpu_empty(cpu_buffer))
@@ -3850,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3850 if (len > (commit - read)) 3842 if (len > (commit - read))
3851 len = (commit - read); 3843 len = (commit - read);
3852 3844
3853 size = rb_event_length(event); 3845 /* Always keep the time extend and data together */
3846 size = rb_event_ts_length(event);
3854 3847
3855 if (len < size) 3848 if (len < size)
3856 goto out_unlock; 3849 goto out_unlock;
@@ -3868,8 +3861,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3868 rpos = reader->read; 3861 rpos = reader->read;
3869 pos += size; 3862 pos += size;
3870 3863
3864 if (rpos >= commit)
3865 break;
3866
3871 event = rb_reader_event(cpu_buffer); 3867 event = rb_reader_event(cpu_buffer);
3872 size = rb_event_length(event); 3868 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event);
3873 } while (len > size); 3870 } while (len > size);
3874 3871
3875 /* update bpage */ 3872 /* update bpage */
@@ -3986,6 +3983,7 @@ static const struct file_operations rb_simple_fops = {
3986 .open = tracing_open_generic, 3983 .open = tracing_open_generic,
3987 .read = rb_simple_read, 3984 .read = rb_simple_read,
3988 .write = rb_simple_write, 3985 .write = rb_simple_write,
3986 .llseek = default_llseek,
3989}; 3987};
3990 3988
3991 3989
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..82d9b8106cd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
101 preempt_enable(); 101 preempt_enable();
102} 102}
103 103
104static cpumask_var_t __read_mostly tracing_buffer_mask; 104cpumask_var_t __read_mostly tracing_buffer_mask;
105
106#define for_each_tracing_cpu(cpu) \
107 for_each_cpu(cpu, tracing_buffer_mask)
108 105
109/* 106/*
110 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 107 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344/* trace_flags holds trace_options default values */ 341/* trace_flags holds trace_options default values */
345unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 342unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
346 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 343 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
347 TRACE_ITER_GRAPH_TIME; 344 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
348 345
349static int trace_stop_count; 346static int trace_stop_count;
350static DEFINE_SPINLOCK(tracing_start_lock); 347static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +425,7 @@ static const char *trace_options[] = {
428 "latency-format", 425 "latency-format",
429 "sleep-time", 426 "sleep-time",
430 "graph-time", 427 "graph-time",
428 "record-cmd",
431 NULL 429 NULL
432}; 430};
433 431
@@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
659 return; 657 return;
660 658
661 WARN_ON_ONCE(!irqs_disabled()); 659 WARN_ON_ONCE(!irqs_disabled());
660 if (!current_trace->use_max_tr) {
661 WARN_ON_ONCE(1);
662 return;
663 }
662 arch_spin_lock(&ftrace_max_lock); 664 arch_spin_lock(&ftrace_max_lock);
663 665
664 tr->buffer = max_tr.buffer; 666 tr->buffer = max_tr.buffer;
@@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
685 return; 687 return;
686 688
687 WARN_ON_ONCE(!irqs_disabled()); 689 WARN_ON_ONCE(!irqs_disabled());
690 if (!current_trace->use_max_tr) {
691 WARN_ON_ONCE(1);
692 return;
693 }
694
688 arch_spin_lock(&ftrace_max_lock); 695 arch_spin_lock(&ftrace_max_lock);
689 696
690 ftrace_disable_cpu(); 697 ftrace_disable_cpu();
@@ -729,18 +736,11 @@ __acquires(kernel_lock)
729 return -1; 736 return -1;
730 } 737 }
731 738
732 if (strlen(type->name) > MAX_TRACER_SIZE) { 739 if (strlen(type->name) >= MAX_TRACER_SIZE) {
733 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); 740 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
734 return -1; 741 return -1;
735 } 742 }
736 743
737 /*
738 * When this gets called we hold the BKL which means that
739 * preemption is disabled. Various trace selftests however
740 * need to disable and enable preemption for successful tests.
741 * So we drop the BKL here and grab it after the tests again.
742 */
743 unlock_kernel();
744 mutex_lock(&trace_types_lock); 744 mutex_lock(&trace_types_lock);
745 745
746 tracing_selftest_running = true; 746 tracing_selftest_running = true;
@@ -822,7 +822,6 @@ __acquires(kernel_lock)
822#endif 822#endif
823 823
824 out_unlock: 824 out_unlock:
825 lock_kernel();
826 return ret; 825 return ret;
827} 826}
828 827
@@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1331 1330
1332#endif /* CONFIG_STACKTRACE */ 1331#endif /* CONFIG_STACKTRACE */
1333 1332
1334static void
1335ftrace_trace_special(void *__tr,
1336 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1337 int pc)
1338{
1339 struct ftrace_event_call *call = &event_special;
1340 struct ring_buffer_event *event;
1341 struct trace_array *tr = __tr;
1342 struct ring_buffer *buffer = tr->buffer;
1343 struct special_entry *entry;
1344
1345 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1346 sizeof(*entry), 0, pc);
1347 if (!event)
1348 return;
1349 entry = ring_buffer_event_data(event);
1350 entry->arg1 = arg1;
1351 entry->arg2 = arg2;
1352 entry->arg3 = arg3;
1353
1354 if (!filter_check_discard(call, entry, buffer, event))
1355 trace_buffer_unlock_commit(buffer, event, 0, pc);
1356}
1357
1358void
1359__trace_special(void *__tr, void *__data,
1360 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1361{
1362 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1363}
1364
1365void
1366ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1367{
1368 struct trace_array *tr = &global_trace;
1369 struct trace_array_cpu *data;
1370 unsigned long flags;
1371 int cpu;
1372 int pc;
1373
1374 if (tracing_disabled)
1375 return;
1376
1377 pc = preempt_count();
1378 local_irq_save(flags);
1379 cpu = raw_smp_processor_id();
1380 data = tr->data[cpu];
1381
1382 if (likely(atomic_inc_return(&data->disabled) == 1))
1383 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1384
1385 atomic_dec(&data->disabled);
1386 local_irq_restore(flags);
1387}
1388
1389/** 1333/**
1390 * trace_vbprintk - write binary msg to tracing buffer 1334 * trace_vbprintk - write binary msg to tracing buffer
1391 * 1335 *
@@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1404 struct bprint_entry *entry; 1348 struct bprint_entry *entry;
1405 unsigned long flags; 1349 unsigned long flags;
1406 int disable; 1350 int disable;
1407 int resched;
1408 int cpu, len = 0, size, pc; 1351 int cpu, len = 0, size, pc;
1409 1352
1410 if (unlikely(tracing_selftest_running || tracing_disabled)) 1353 if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1414 pause_graph_tracing(); 1357 pause_graph_tracing();
1415 1358
1416 pc = preempt_count(); 1359 pc = preempt_count();
1417 resched = ftrace_preempt_disable(); 1360 preempt_disable_notrace();
1418 cpu = raw_smp_processor_id(); 1361 cpu = raw_smp_processor_id();
1419 data = tr->data[cpu]; 1362 data = tr->data[cpu];
1420 1363
@@ -1452,7 +1395,7 @@ out_unlock:
1452 1395
1453out: 1396out:
1454 atomic_dec_return(&data->disabled); 1397 atomic_dec_return(&data->disabled);
1455 ftrace_preempt_enable(resched); 1398 preempt_enable_notrace();
1456 unpause_graph_tracing(); 1399 unpause_graph_tracing();
1457 1400
1458 return len; 1401 return len;
@@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1539} 1482}
1540EXPORT_SYMBOL_GPL(trace_vprintk); 1483EXPORT_SYMBOL_GPL(trace_vprintk);
1541 1484
1542enum trace_file_type {
1543 TRACE_FILE_LAT_FMT = 1,
1544 TRACE_FILE_ANNOTATE = 2,
1545};
1546
1547static void trace_iterator_increment(struct trace_iterator *iter) 1485static void trace_iterator_increment(struct trace_iterator *iter)
1548{ 1486{
1549 /* Don't allow ftrace to trace into the ring buffers */ 1487 /* Don't allow ftrace to trace into the ring buffers */
@@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1641} 1579}
1642 1580
1643/* Find the next real entry, and increment the iterator to the next entry */ 1581/* Find the next real entry, and increment the iterator to the next entry */
1644static void *find_next_entry_inc(struct trace_iterator *iter) 1582void *trace_find_next_entry_inc(struct trace_iterator *iter)
1645{ 1583{
1646 iter->ent = __find_next_entry(iter, &iter->cpu, 1584 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts); 1585 &iter->lost_events, &iter->ts);
@@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1676 return NULL; 1614 return NULL;
1677 1615
1678 if (iter->idx < 0) 1616 if (iter->idx < 0)
1679 ent = find_next_entry_inc(iter); 1617 ent = trace_find_next_entry_inc(iter);
1680 else 1618 else
1681 ent = iter; 1619 ent = iter;
1682 1620
1683 while (ent && iter->idx < i) 1621 while (ent && iter->idx < i)
1684 ent = find_next_entry_inc(iter); 1622 ent = trace_find_next_entry_inc(iter);
1685 1623
1686 iter->pos = *pos; 1624 iter->pos = *pos;
1687 1625
1688 return ent; 1626 return ent;
1689} 1627}
1690 1628
1691static void tracing_iter_reset(struct trace_iterator *iter, int cpu) 1629void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1692{ 1630{
1693 struct trace_array *tr = iter->tr; 1631 struct trace_array *tr = iter->tr;
1694 struct ring_buffer_event *event; 1632 struct ring_buffer_event *event;
@@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter)
2049} 1987}
2050 1988
2051/* Called with trace_event_read_lock() held. */ 1989/* Called with trace_event_read_lock() held. */
2052static enum print_line_t print_trace_line(struct trace_iterator *iter) 1990enum print_line_t print_trace_line(struct trace_iterator *iter)
2053{ 1991{
2054 enum print_line_t ret; 1992 enum print_line_t ret;
2055 1993
@@ -2258,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2258 2196
2259static int tracing_release(struct inode *inode, struct file *file) 2197static int tracing_release(struct inode *inode, struct file *file)
2260{ 2198{
2261 struct seq_file *m = (struct seq_file *)file->private_data; 2199 struct seq_file *m = file->private_data;
2262 struct trace_iterator *iter; 2200 struct trace_iterator *iter;
2263 int cpu; 2201 int cpu;
2264 2202
@@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = {
2394 .open = show_traces_open, 2332 .open = show_traces_open,
2395 .read = seq_read, 2333 .read = seq_read,
2396 .release = seq_release, 2334 .release = seq_release,
2335 .llseek = seq_lseek,
2397}; 2336};
2398 2337
2399/* 2338/*
@@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = {
2487 .open = tracing_open_generic, 2426 .open = tracing_open_generic,
2488 .read = tracing_cpumask_read, 2427 .read = tracing_cpumask_read,
2489 .write = tracing_cpumask_write, 2428 .write = tracing_cpumask_write,
2429 .llseek = generic_file_llseek,
2490}; 2430};
2491 2431
2492static int tracing_trace_options_show(struct seq_file *m, void *v) 2432static int tracing_trace_options_show(struct seq_file *m, void *v)
@@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2562 trace_flags |= mask; 2502 trace_flags |= mask;
2563 else 2503 else
2564 trace_flags &= ~mask; 2504 trace_flags &= ~mask;
2505
2506 if (mask == TRACE_ITER_RECORD_CMD)
2507 trace_event_enable_cmd_record(enabled);
2565} 2508}
2566 2509
2567static ssize_t 2510static ssize_t
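
set_tracer_flags() is a plain bit toggle with one side effect hooked onto the record-cmd bit. A minimal model of masking the flag word and reacting to that particular bit (the flag value matches the enum added in the trace.h hunk further down; the hook itself is mocked):

#include <stdio.h>
#include <stdbool.h>

#define TRACE_ITER_RECORD_CMD 0x100000UL   /* value from the trace.h hunk */

static unsigned long trace_flags;

static void enable_cmd_record(bool enable)  /* stand-in for the real hook */
{
        printf("cmdline recording %s\n", enable ? "on" : "off");
}

static void set_tracer_flags(unsigned long mask, int enabled)
{
        if (enabled)
                trace_flags |= mask;
        else
                trace_flags &= ~mask;

        if (mask == TRACE_ITER_RECORD_CMD)
                enable_cmd_record(enabled);
}

int main(void)
{
        set_tracer_flags(TRACE_ITER_RECORD_CMD, 1);
        set_tracer_flags(TRACE_ITER_RECORD_CMD, 0);
        printf("flags now: %#lx\n", trace_flags);
        return 0;
}
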
@@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2653static const struct file_operations tracing_readme_fops = { 2596static const struct file_operations tracing_readme_fops = {
2654 .open = tracing_open_generic, 2597 .open = tracing_open_generic,
2655 .read = tracing_readme_read, 2598 .read = tracing_readme_read,
2599 .llseek = generic_file_llseek,
2656}; 2600};
2657 2601
2658static ssize_t 2602static ssize_t
@@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2703static const struct file_operations tracing_saved_cmdlines_fops = { 2647static const struct file_operations tracing_saved_cmdlines_fops = {
2704 .open = tracing_open_generic, 2648 .open = tracing_open_generic,
2705 .read = tracing_saved_cmdlines_read, 2649 .read = tracing_saved_cmdlines_read,
2650 .llseek = generic_file_llseek,
2706}; 2651};
2707 2652
2708static ssize_t 2653static ssize_t
@@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
2798 if (ret < 0) 2743 if (ret < 0)
2799 return ret; 2744 return ret;
2800 2745
2746 if (!current_trace->use_max_tr)
2747 goto out;
2748
2801 ret = ring_buffer_resize(max_tr.buffer, size); 2749 ret = ring_buffer_resize(max_tr.buffer, size);
2802 if (ret < 0) { 2750 if (ret < 0) {
2803 int r; 2751 int r;
@@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
2825 return ret; 2773 return ret;
2826 } 2774 }
2827 2775
2776 max_tr.entries = size;
2777 out:
2828 global_trace.entries = size; 2778 global_trace.entries = size;
2829 2779
2830 return ret; 2780 return ret;
2831} 2781}
2832 2782
2783
2833/** 2784/**
2834 * tracing_update_buffers - used by tracing facility to expand ring buffers 2785 * tracing_update_buffers - used by tracing facility to expand ring buffers
2835 * 2786 *
@@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf)
2890 trace_branch_disable(); 2841 trace_branch_disable();
2891 if (current_trace && current_trace->reset) 2842 if (current_trace && current_trace->reset)
2892 current_trace->reset(tr); 2843 current_trace->reset(tr);
2893 2844 if (current_trace && current_trace->use_max_tr) {
2845 /*
2846 * We don't free the ring buffer. instead, resize it because
2847 * The max_tr ring buffer has some state (e.g. ring->clock) and
2848 * we want preserve it.
2849 */
2850 ring_buffer_resize(max_tr.buffer, 1);
2851 max_tr.entries = 1;
2852 }
2894 destroy_trace_option_files(topts); 2853 destroy_trace_option_files(topts);
2895 2854
2896 current_trace = t; 2855 current_trace = t;
2897 2856
2898 topts = create_trace_option_files(current_trace); 2857 topts = create_trace_option_files(current_trace);
2858 if (current_trace->use_max_tr) {
2859 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
2860 if (ret < 0)
2861 goto out;
2862 max_tr.entries = global_trace.entries;
2863 }
2899 2864
2900 if (t->init) { 2865 if (t->init) {
2901 ret = tracer_init(t, tr); 2866 ret = tracer_init(t, tr);
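
The use_max_tr logic keeps the snapshot buffer allocated but shrunk to a single entry while no latency tracer needs it, and resizes it back to match the live buffer when one does. A compact model of that switch, with simplified buffer objects in place of the real ring buffers:

#include <stdio.h>
#include <stdbool.h>

struct fake_buffer { unsigned long entries; };

static struct fake_buffer live_buf = { .entries = 4096 };
static struct fake_buffer max_buf  = { .entries = 1 };

static int resize(struct fake_buffer *b, unsigned long entries)
{
        b->entries = entries;   /* the real code calls ring_buffer_resize() */
        return 0;
}

/* Mimics the switch: old tracer may have used max_tr, new one may or may not. */
static void switch_tracer(bool old_uses_max, bool new_uses_max)
{
        if (old_uses_max)
                resize(&max_buf, 1);                 /* keep it allocated, just tiny */
        if (new_uses_max)
                resize(&max_buf, live_buf.entries);  /* mirror the live buffer size  */
}

int main(void)
{
        switch_tracer(true, false);
        printf("after switch to non-latency tracer: max=%lu\n", max_buf.entries);
        switch_tracer(false, true);
        printf("after switch to latency tracer:     max=%lu\n", max_buf.entries);
        return 0;
}
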
@@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3032 if (iter->trace->pipe_open) 2997 if (iter->trace->pipe_open)
3033 iter->trace->pipe_open(iter); 2998 iter->trace->pipe_open(iter);
3034 2999
3000 nonseekable_open(inode, filp);
3035out: 3001out:
3036 mutex_unlock(&trace_types_lock); 3002 mutex_unlock(&trace_types_lock);
3037 return ret; 3003 return ret;
@@ -3211,7 +3177,7 @@ waitagain:
3211 3177
3212 trace_event_read_lock(); 3178 trace_event_read_lock();
3213 trace_access_lock(iter->cpu_file); 3179 trace_access_lock(iter->cpu_file);
3214 while (find_next_entry_inc(iter) != NULL) { 3180 while (trace_find_next_entry_inc(iter) != NULL) {
3215 enum print_line_t ret; 3181 enum print_line_t ret;
3216 int len = iter->seq.len; 3182 int len = iter->seq.len;
3217 3183
@@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3294 if (ret != TRACE_TYPE_NO_CONSUME) 3260 if (ret != TRACE_TYPE_NO_CONSUME)
3295 trace_consume(iter); 3261 trace_consume(iter);
3296 rem -= count; 3262 rem -= count;
3297 if (!find_next_entry_inc(iter)) { 3263 if (!trace_find_next_entry_inc(iter)) {
3298 rem = 0; 3264 rem = 0;
3299 iter->ent = NULL; 3265 iter->ent = NULL;
3300 break; 3266 break;
@@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3350 if (ret <= 0) 3316 if (ret <= 0)
3351 goto out_err; 3317 goto out_err;
3352 3318
3353 if (!iter->ent && !find_next_entry_inc(iter)) { 3319 if (!iter->ent && !trace_find_next_entry_inc(iter)) {
3354 ret = -EFAULT; 3320 ret = -EFAULT;
3355 goto out_err; 3321 goto out_err;
3356 } 3322 }
@@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3477 } 3443 }
3478 3444
3479 tracing_start(); 3445 tracing_start();
3480 max_tr.entries = global_trace.entries;
3481 mutex_unlock(&trace_types_lock); 3446 mutex_unlock(&trace_types_lock);
3482 3447
3483 return cnt; 3448 return cnt;
@@ -3498,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3498 size_t cnt, loff_t *fpos) 3463 size_t cnt, loff_t *fpos)
3499{ 3464{
3500 char *buf; 3465 char *buf;
3466 size_t written;
3501 3467
3502 if (tracing_disabled) 3468 if (tracing_disabled)
3503 return -EINVAL; 3469 return -EINVAL;
@@ -3519,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3519 } else 3485 } else
3520 buf[cnt] = '\0'; 3486 buf[cnt] = '\0';
3521 3487
3522 cnt = mark_printk("%s", buf); 3488 written = mark_printk("%s", buf);
3523 kfree(buf); 3489 kfree(buf);
3524 *fpos += cnt; 3490 *fpos += written;
3525 3491
3526 return cnt; 3492 /* don't tell userspace we wrote more - it might confuse them */
3493 if (written > cnt)
3494 written = cnt;
3495
3496 return written;
3527} 3497}
3528 3498
3529static int tracing_clock_show(struct seq_file *m, void *v) 3499static int tracing_clock_show(struct seq_file *m, void *v)
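
The rewritten tail of tracing_mark_write() exists because a write() handler must never report more bytes than the caller handed in, even if the internal formatting consumed more. A small illustration of clamping the reported count (the mark_printk() stand-in here just pretends to add a prefix):

#include <stdio.h>
#include <string.h>

/* Stand-in for mark_printk(): returns how many bytes the backend consumed. */
static size_t fake_mark_printk(const char *buf)
{
        return strlen(buf) + 16;   /* pretend the backend added a prefix */
}

static size_t mark_write(const char *ubuf, size_t cnt)
{
        size_t written = fake_mark_printk(ubuf);

        /* don't tell userspace we wrote more - it might confuse them */
        if (written > cnt)
                written = cnt;
        return written;
}

int main(void)
{
        const char *msg = "hello ring buffer";

        printf("user asked for %zu, we report %zu\n",
               strlen(msg), mark_write(msg, strlen(msg)));
        return 0;
}
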
@@ -3590,18 +3560,21 @@ static const struct file_operations tracing_max_lat_fops = {
3590 .open = tracing_open_generic, 3560 .open = tracing_open_generic,
3591 .read = tracing_max_lat_read, 3561 .read = tracing_max_lat_read,
3592 .write = tracing_max_lat_write, 3562 .write = tracing_max_lat_write,
3563 .llseek = generic_file_llseek,
3593}; 3564};
3594 3565
3595static const struct file_operations tracing_ctrl_fops = { 3566static const struct file_operations tracing_ctrl_fops = {
3596 .open = tracing_open_generic, 3567 .open = tracing_open_generic,
3597 .read = tracing_ctrl_read, 3568 .read = tracing_ctrl_read,
3598 .write = tracing_ctrl_write, 3569 .write = tracing_ctrl_write,
3570 .llseek = generic_file_llseek,
3599}; 3571};
3600 3572
3601static const struct file_operations set_tracer_fops = { 3573static const struct file_operations set_tracer_fops = {
3602 .open = tracing_open_generic, 3574 .open = tracing_open_generic,
3603 .read = tracing_set_trace_read, 3575 .read = tracing_set_trace_read,
3604 .write = tracing_set_trace_write, 3576 .write = tracing_set_trace_write,
3577 .llseek = generic_file_llseek,
3605}; 3578};
3606 3579
3607static const struct file_operations tracing_pipe_fops = { 3580static const struct file_operations tracing_pipe_fops = {
@@ -3610,17 +3583,20 @@ static const struct file_operations tracing_pipe_fops = {
3610 .read = tracing_read_pipe, 3583 .read = tracing_read_pipe,
3611 .splice_read = tracing_splice_read_pipe, 3584 .splice_read = tracing_splice_read_pipe,
3612 .release = tracing_release_pipe, 3585 .release = tracing_release_pipe,
3586 .llseek = no_llseek,
3613}; 3587};
3614 3588
3615static const struct file_operations tracing_entries_fops = { 3589static const struct file_operations tracing_entries_fops = {
3616 .open = tracing_open_generic, 3590 .open = tracing_open_generic,
3617 .read = tracing_entries_read, 3591 .read = tracing_entries_read,
3618 .write = tracing_entries_write, 3592 .write = tracing_entries_write,
3593 .llseek = generic_file_llseek,
3619}; 3594};
3620 3595
3621static const struct file_operations tracing_mark_fops = { 3596static const struct file_operations tracing_mark_fops = {
3622 .open = tracing_open_generic, 3597 .open = tracing_open_generic,
3623 .write = tracing_mark_write, 3598 .write = tracing_mark_write,
3599 .llseek = generic_file_llseek,
3624}; 3600};
3625 3601
3626static const struct file_operations trace_clock_fops = { 3602static const struct file_operations trace_clock_fops = {
@@ -3926,6 +3902,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3926static const struct file_operations tracing_stats_fops = { 3902static const struct file_operations tracing_stats_fops = {
3927 .open = tracing_open_generic, 3903 .open = tracing_open_generic,
3928 .read = tracing_stats_read, 3904 .read = tracing_stats_read,
3905 .llseek = generic_file_llseek,
3929}; 3906};
3930 3907
3931#ifdef CONFIG_DYNAMIC_FTRACE 3908#ifdef CONFIG_DYNAMIC_FTRACE
@@ -3962,6 +3939,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3962static const struct file_operations tracing_dyn_info_fops = { 3939static const struct file_operations tracing_dyn_info_fops = {
3963 .open = tracing_open_generic, 3940 .open = tracing_open_generic,
3964 .read = tracing_read_dyn_info, 3941 .read = tracing_read_dyn_info,
3942 .llseek = generic_file_llseek,
3965}; 3943};
3966#endif 3944#endif
3967 3945
@@ -4018,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)
4018{ 3996{
4019 struct dentry *d_percpu = tracing_dentry_percpu(); 3997 struct dentry *d_percpu = tracing_dentry_percpu();
4020 struct dentry *d_cpu; 3998 struct dentry *d_cpu;
4021 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 3999 char cpu_dir[30]; /* 30 characters should be more than enough */
4022 char cpu_dir[7];
4023 4000
4024 if (cpu > 999 || cpu < 0) 4001 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4025 return;
4026
4027 sprintf(cpu_dir, "cpu%ld", cpu);
4028 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4002 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4029 if (!d_cpu) { 4003 if (!d_cpu) {
4030 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4004 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
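
The old code sized cpu_dir for at most three digits and rejected larger CPU numbers; the replacement over-sizes the buffer and relies on snprintf() truncating safely. A minimal demonstration of the safer pattern (directory creation is mocked with a printf):

#include <stdio.h>

static void make_cpu_dir(long cpu)
{
        char cpu_dir[30];   /* 30 characters should be more than enough */

        /* snprintf() can never overrun the buffer, unlike sprintf(). */
        snprintf(cpu_dir, sizeof(cpu_dir), "cpu%ld", cpu);
        printf("would create debugfs dir '%s'\n", cpu_dir);
}

int main(void)
{
        make_cpu_dir(3);
        make_cpu_dir(123456);   /* no longer rejected, still safe */
        return 0;
}
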
@@ -4115,6 +4089,7 @@ static const struct file_operations trace_options_fops = {
4115 .open = tracing_open_generic, 4089 .open = tracing_open_generic,
4116 .read = trace_options_read, 4090 .read = trace_options_read,
4117 .write = trace_options_write, 4091 .write = trace_options_write,
4092 .llseek = generic_file_llseek,
4118}; 4093};
4119 4094
4120static ssize_t 4095static ssize_t
@@ -4166,6 +4141,7 @@ static const struct file_operations trace_options_core_fops = {
4166 .open = tracing_open_generic, 4141 .open = tracing_open_generic,
4167 .read = trace_options_core_read, 4142 .read = trace_options_core_read,
4168 .write = trace_options_core_write, 4143 .write = trace_options_core_write,
4144 .llseek = generic_file_llseek,
4169}; 4145};
4170 4146
4171struct dentry *trace_create_file(const char *name, 4147struct dentry *trace_create_file(const char *name,
@@ -4355,9 +4331,6 @@ static __init int tracer_init_debugfs(void)
4355 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4331 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4356 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4332 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4357#endif 4333#endif
4358#ifdef CONFIG_SYSPROF_TRACER
4359 init_tracer_sysprof_debugfs(d_tracer);
4360#endif
4361 4334
4362 create_trace_options_dir(); 4335 create_trace_options_dir();
4363 4336
@@ -4414,7 +4387,7 @@ static struct notifier_block trace_die_notifier = {
4414 */ 4387 */
4415#define KERN_TRACE KERN_EMERG 4388#define KERN_TRACE KERN_EMERG
4416 4389
4417static void 4390void
4418trace_printk_seq(struct trace_seq *s) 4391trace_printk_seq(struct trace_seq *s)
4419{ 4392{
4420 /* Probably should print a warning here. */ 4393 /* Probably should print a warning here. */
@@ -4429,6 +4402,13 @@ trace_printk_seq(struct trace_seq *s)
4429 trace_seq_init(s); 4402 trace_seq_init(s);
4430} 4403}
4431 4404
4405void trace_init_global_iter(struct trace_iterator *iter)
4406{
4407 iter->tr = &global_trace;
4408 iter->trace = current_trace;
4409 iter->cpu_file = TRACE_PIPE_ALL_CPU;
4410}
4411
4432static void 4412static void
4433__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) 4413__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4434{ 4414{
@@ -4454,8 +4434,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4454 if (disable_tracing) 4434 if (disable_tracing)
4455 ftrace_kill(); 4435 ftrace_kill();
4456 4436
4437 trace_init_global_iter(&iter);
4438
4457 for_each_tracing_cpu(cpu) { 4439 for_each_tracing_cpu(cpu) {
4458 atomic_inc(&global_trace.data[cpu]->disabled); 4440 atomic_inc(&iter.tr->data[cpu]->disabled);
4459 } 4441 }
4460 4442
4461 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 4443 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4504,7 +4486,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4504 iter.iter_flags |= TRACE_FILE_LAT_FMT; 4486 iter.iter_flags |= TRACE_FILE_LAT_FMT;
4505 iter.pos = -1; 4487 iter.pos = -1;
4506 4488
4507 if (find_next_entry_inc(&iter) != NULL) { 4489 if (trace_find_next_entry_inc(&iter) != NULL) {
4508 int ret; 4490 int ret;
4509 4491
4510 ret = print_trace_line(&iter); 4492 ret = print_trace_line(&iter);
@@ -4526,7 +4508,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4526 trace_flags |= old_userobj; 4508 trace_flags |= old_userobj;
4527 4509
4528 for_each_tracing_cpu(cpu) { 4510 for_each_tracing_cpu(cpu) {
4529 atomic_dec(&global_trace.data[cpu]->disabled); 4511 atomic_dec(&iter.tr->data[cpu]->disabled);
4530 } 4512 }
4531 tracing_on(); 4513 tracing_on();
4532 } 4514 }
@@ -4575,16 +4557,14 @@ __init static int tracer_alloc_buffers(void)
4575 4557
4576 4558
4577#ifdef CONFIG_TRACER_MAX_TRACE 4559#ifdef CONFIG_TRACER_MAX_TRACE
4578 max_tr.buffer = ring_buffer_alloc(ring_buf_size, 4560 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
4579 TRACE_BUFFER_FLAGS);
4580 if (!max_tr.buffer) { 4561 if (!max_tr.buffer) {
4581 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4562 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4582 WARN_ON(1); 4563 WARN_ON(1);
4583 ring_buffer_free(global_trace.buffer); 4564 ring_buffer_free(global_trace.buffer);
4584 goto out_free_cpumask; 4565 goto out_free_cpumask;
4585 } 4566 }
4586 max_tr.entries = ring_buffer_size(max_tr.buffer); 4567 max_tr.entries = 1;
4587 WARN_ON(max_tr.entries != global_trace.entries);
4588#endif 4568#endif
4589 4569
4590 /* Allocate the first page for all buffers */ 4570 /* Allocate the first page for all buffers */
@@ -4597,9 +4577,6 @@ __init static int tracer_alloc_buffers(void)
4597 4577
4598 register_tracer(&nop_trace); 4578 register_tracer(&nop_trace);
4599 current_trace = &nop_trace; 4579 current_trace = &nop_trace;
4600#ifdef CONFIG_BOOT_TRACER
4601 register_tracer(&boot_tracer);
4602#endif
4603 /* All seems OK, enable tracing */ 4580 /* All seems OK, enable tracing */
4604 tracing_disabled = 0; 4581 tracing_disabled = 0;
4605 4582
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..9021f8c0c0c3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,10 +9,7 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h> 10#include <linux/tracepoint.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h>
13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h> 12#include <linux/hw_breakpoint.h>
15
16#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
17#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
18 15
@@ -25,30 +22,17 @@ enum trace_type {
25 TRACE_STACK, 22 TRACE_STACK,
26 TRACE_PRINT, 23 TRACE_PRINT,
27 TRACE_BPRINT, 24 TRACE_BPRINT,
28 TRACE_SPECIAL,
29 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
30 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
31 TRACE_BRANCH, 27 TRACE_BRANCH,
32 TRACE_BOOT_CALL,
33 TRACE_BOOT_RET,
34 TRACE_GRAPH_RET, 28 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 29 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 30 TRACE_USER_STACK,
37 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE,
39 TRACE_BLK, 31 TRACE_BLK,
40 TRACE_KSYM,
41 32
42 __TRACE_LAST_TYPE, 33 __TRACE_LAST_TYPE,
43}; 34};
44 35
45enum kmemtrace_type_id {
46 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49};
50
51extern struct tracer boot_tracer;
52 36
53#undef __field 37#undef __field
54#define __field(type, item) type item; 38#define __field(type, item) type item;
@@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void);
204 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 188 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
205 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 189 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
206 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 190 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
207 IF_ASSIGN(var, ent, struct special_entry, 0); \
208 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 191 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
209 TRACE_MMIO_RW); \ 192 TRACE_MMIO_RW); \
210 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 193 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
211 TRACE_MMIO_MAP); \ 194 TRACE_MMIO_MAP); \
212 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
213 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
214 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ 195 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
215 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ 196 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
216 TRACE_GRAPH_ENT); \ 197 TRACE_GRAPH_ENT); \
217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 198 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
218 TRACE_GRAPH_RET); \ 199 TRACE_GRAPH_RET); \
219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
220 TRACE_KMEM_ALLOC); \
221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
222 TRACE_KMEM_FREE); \
223 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
224 __ftrace_bad_type(); \ 200 __ftrace_bad_type(); \
225 } while (0) 201 } while (0)
226 202
@@ -298,6 +274,7 @@ struct tracer {
298 struct tracer *next; 274 struct tracer *next;
299 int print_max; 275 int print_max;
300 struct tracer_flags *flags; 276 struct tracer_flags *flags;
277 int use_max_tr;
301}; 278};
302 279
303 280
@@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name,
318 const struct file_operations *fops); 295 const struct file_operations *fops);
319 296
320struct dentry *tracing_init_dentry(void); 297struct dentry *tracing_init_dentry(void);
321void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
322 298
323struct ring_buffer_event; 299struct ring_buffer_event;
324 300
@@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
338struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 314struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
339 int *ent_cpu, u64 *ent_ts); 315 int *ent_cpu, u64 *ent_ts);
340 316
317int trace_empty(struct trace_iterator *iter);
318
319void *trace_find_next_entry_inc(struct trace_iterator *iter);
320
321void trace_init_global_iter(struct trace_iterator *iter);
322
323void tracing_iter_reset(struct trace_iterator *iter, int cpu);
324
341void default_wait_pipe(struct trace_iterator *iter); 325void default_wait_pipe(struct trace_iterator *iter);
342void poll_wait_pipe(struct trace_iterator *iter); 326void poll_wait_pipe(struct trace_iterator *iter);
343 327
@@ -355,15 +339,14 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
355 struct task_struct *wakee, 339 struct task_struct *wakee,
356 struct task_struct *cur, 340 struct task_struct *cur,
357 unsigned long flags, int pc); 341 unsigned long flags, int pc);
358void trace_special(struct trace_array *tr,
359 struct trace_array_cpu *data,
360 unsigned long arg1,
361 unsigned long arg2,
362 unsigned long arg3, int pc);
363void trace_function(struct trace_array *tr, 342void trace_function(struct trace_array *tr,
364 unsigned long ip, 343 unsigned long ip,
365 unsigned long parent_ip, 344 unsigned long parent_ip,
366 unsigned long flags, int pc); 345 unsigned long flags, int pc);
346void trace_graph_function(struct trace_array *tr,
347 unsigned long ip,
348 unsigned long parent_ip,
349 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m); 350void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 351void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter); 352int trace_empty(struct trace_iterator *iter);
@@ -380,8 +363,15 @@ void tracing_start_sched_switch_record(void);
380int register_tracer(struct tracer *type); 363int register_tracer(struct tracer *type);
381void unregister_tracer(struct tracer *type); 364void unregister_tracer(struct tracer *type);
382int is_tracing_stopped(void); 365int is_tracing_stopped(void);
366enum trace_file_type {
367 TRACE_FILE_LAT_FMT = 1,
368 TRACE_FILE_ANNOTATE = 2,
369};
370
371extern cpumask_var_t __read_mostly tracing_buffer_mask;
383 372
384extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); 373#define for_each_tracing_cpu(cpu) \
374 for_each_cpu(cpu, tracing_buffer_mask)
385 375
386extern unsigned long nsecs_to_usecs(unsigned long nsecs); 376extern unsigned long nsecs_to_usecs(unsigned long nsecs);
387 377
@@ -452,12 +442,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
452 struct trace_array *tr); 442 struct trace_array *tr);
453extern int trace_selftest_startup_sched_switch(struct tracer *trace, 443extern int trace_selftest_startup_sched_switch(struct tracer *trace,
454 struct trace_array *tr); 444 struct trace_array *tr);
455extern int trace_selftest_startup_sysprof(struct tracer *trace,
456 struct trace_array *tr);
457extern int trace_selftest_startup_branch(struct tracer *trace, 445extern int trace_selftest_startup_branch(struct tracer *trace,
458 struct trace_array *tr); 446 struct trace_array *tr);
459extern int trace_selftest_startup_ksym(struct tracer *trace,
460 struct trace_array *tr);
461#endif /* CONFIG_FTRACE_STARTUP_TEST */ 447#endif /* CONFIG_FTRACE_STARTUP_TEST */
462 448
463extern void *head_page(struct trace_array_cpu *data); 449extern void *head_page(struct trace_array_cpu *data);
@@ -471,6 +457,8 @@ trace_array_vprintk(struct trace_array *tr,
471 unsigned long ip, const char *fmt, va_list args); 457 unsigned long ip, const char *fmt, va_list args);
472int trace_array_printk(struct trace_array *tr, 458int trace_array_printk(struct trace_array *tr,
473 unsigned long ip, const char *fmt, ...); 459 unsigned long ip, const char *fmt, ...);
460void trace_printk_seq(struct trace_seq *s);
461enum print_line_t print_trace_line(struct trace_iterator *iter);
474 462
475extern unsigned long trace_flags; 463extern unsigned long trace_flags;
476 464
@@ -617,6 +605,7 @@ enum trace_iterator_flags {
617 TRACE_ITER_LATENCY_FMT = 0x20000, 605 TRACE_ITER_LATENCY_FMT = 0x20000,
618 TRACE_ITER_SLEEP_TIME = 0x40000, 606 TRACE_ITER_SLEEP_TIME = 0x40000,
619 TRACE_ITER_GRAPH_TIME = 0x80000, 607 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000,
620}; 609};
621 610
622/* 611/*
@@ -628,54 +617,6 @@ enum trace_iterator_flags {
628 617
629extern struct tracer nop_trace; 618extern struct tracer nop_trace;
630 619
631/**
632 * ftrace_preempt_disable - disable preemption scheduler safe
633 *
634 * When tracing can happen inside the scheduler, there exists
635 * cases that the tracing might happen before the need_resched
636 * flag is checked. If this happens and the tracer calls
637 * preempt_enable (after a disable), a schedule might take place
638 * causing an infinite recursion.
639 *
640 * To prevent this, we read the need_resched flag before
641 * disabling preemption. When we want to enable preemption we
642 * check the flag, if it is set, then we call preempt_enable_no_resched.
643 * Otherwise, we call preempt_enable.
644 *
645 * The rational for doing the above is that if need_resched is set
646 * and we have yet to reschedule, we are either in an atomic location
647 * (where we do not need to check for scheduling) or we are inside
648 * the scheduler and do not want to resched.
649 */
650static inline int ftrace_preempt_disable(void)
651{
652 int resched;
653
654 resched = need_resched();
655 preempt_disable_notrace();
656
657 return resched;
658}
659
660/**
661 * ftrace_preempt_enable - enable preemption scheduler safe
662 * @resched: the return value from ftrace_preempt_disable
663 *
664 * This is a scheduler safe way to enable preemption and not miss
665 * any preemption checks. The disabled saved the state of preemption.
666 * If resched is set, then we are either inside an atomic or
667 * are inside the scheduler (we would have already scheduled
668 * otherwise). In this case, we do not want to call normal
669 * preempt_enable, but preempt_enable_no_resched instead.
670 */
671static inline void ftrace_preempt_enable(int resched)
672{
673 if (resched)
674 preempt_enable_no_resched_notrace();
675 else
676 preempt_enable_notrace();
677}
678
679#ifdef CONFIG_BRANCH_TRACER 620#ifdef CONFIG_BRANCH_TRACER
680extern int enable_branch_tracing(struct trace_array *tr); 621extern int enable_branch_tracing(struct trace_array *tr);
681extern void disable_branch_tracing(void); 622extern void disable_branch_tracing(void);
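
The ftrace_preempt_disable()/ftrace_preempt_enable() pair deleted above made every caller carry the saved need_resched state through its fast path; after this series each pair collapses to plain preempt_disable_notrace()/preempt_enable_notrace(). A userspace caricature of the old calling pattern next to the new one (the preempt primitives are mocked; only the control flow is the point):

#include <stdio.h>
#include <stdbool.h>

/* Mocked primitives - stand-ins for the kernel's notrace preempt API. */
static bool need_resched_flag;
static bool need_resched(void)                       { return need_resched_flag; }
static void preempt_disable_notrace(void)            { puts("preempt off"); }
static void preempt_enable_notrace(void)             { puts("preempt on"); }
static void preempt_enable_no_resched_notrace(void)  { puts("preempt on, no resched"); }

/* Old pattern: the caller threads the saved need_resched state around. */
static int old_disable(void)
{
        int resched = need_resched();

        preempt_disable_notrace();
        return resched;
}

static void old_enable(int resched)
{
        if (resched)
                preempt_enable_no_resched_notrace();
        else
                preempt_enable_notrace();
}

int main(void)
{
        /* old style: extra state carried through every trace path */
        int resched = old_disable();
        old_enable(resched);

        /* new style after the patch: no saved state at all */
        preempt_disable_notrace();
        preempt_enable_notrace();
        return 0;
}
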
@@ -766,6 +707,8 @@ struct filter_pred {
766 int pop_n; 707 int pop_n;
767}; 708};
768 709
710extern struct list_head ftrace_common_fields;
711
769extern enum regex_type 712extern enum regex_type
770filter_parse_regex(char *buff, int len, char **search, int *not); 713filter_parse_regex(char *buff, int len, char **search, int *not);
771extern void print_event_filter(struct ftrace_event_call *call, 714extern void print_event_filter(struct ftrace_event_call *call,
@@ -795,6 +738,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
795 return 0; 738 return 0;
796} 739}
797 740
741extern void trace_event_enable_cmd_record(bool enable);
742
798extern struct mutex event_mutex; 743extern struct mutex event_mutex;
799extern struct list_head ftrace_events; 744extern struct list_head ftrace_events;
800 745
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index c21d5f3956ad..000000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12#include <linux/time.h>
13
14#include "trace.h"
15#include "trace_output.h"
16
17static struct trace_array *boot_trace;
18static bool pre_initcalls_finished;
19
20/* Tells the boot tracer that the pre_smp_initcalls are finished.
21 * So we are ready .
22 * It doesn't enable sched events tracing however.
23 * You have to call enable_boot_trace to do so.
24 */
25void start_boot_trace(void)
26{
27 pre_initcalls_finished = true;
28}
29
30void enable_boot_trace(void)
31{
32 if (boot_trace && pre_initcalls_finished)
33 tracing_start_sched_switch_record();
34}
35
36void disable_boot_trace(void)
37{
38 if (boot_trace && pre_initcalls_finished)
39 tracing_stop_sched_switch_record();
40}
41
42static int boot_trace_init(struct trace_array *tr)
43{
44 boot_trace = tr;
45
46 if (!tr)
47 return 0;
48
49 tracing_reset_online_cpus(tr);
50
51 tracing_sched_switch_assign_trace(tr);
52 return 0;
53}
54
55static enum print_line_t
56initcall_call_print_line(struct trace_iterator *iter)
57{
58 struct trace_entry *entry = iter->ent;
59 struct trace_seq *s = &iter->seq;
60 struct trace_boot_call *field;
61 struct boot_trace_call *call;
62 u64 ts;
63 unsigned long nsec_rem;
64 int ret;
65
66 trace_assign_type(field, entry);
67 call = &field->boot_call;
68 ts = iter->ts;
69 nsec_rem = do_div(ts, NSEC_PER_SEC);
70
71 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
72 (unsigned long)ts, nsec_rem, call->func, call->caller);
73
74 if (!ret)
75 return TRACE_TYPE_PARTIAL_LINE;
76 else
77 return TRACE_TYPE_HANDLED;
78}
79
80static enum print_line_t
81initcall_ret_print_line(struct trace_iterator *iter)
82{
83 struct trace_entry *entry = iter->ent;
84 struct trace_seq *s = &iter->seq;
85 struct trace_boot_ret *field;
86 struct boot_trace_ret *init_ret;
87 u64 ts;
88 unsigned long nsec_rem;
89 int ret;
90
91 trace_assign_type(field, entry);
92 init_ret = &field->boot_ret;
93 ts = iter->ts;
94 nsec_rem = do_div(ts, NSEC_PER_SEC);
95
96 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
97 "returned %d after %llu msecs\n",
98 (unsigned long) ts,
99 nsec_rem,
100 init_ret->func, init_ret->result, init_ret->duration);
101
102 if (!ret)
103 return TRACE_TYPE_PARTIAL_LINE;
104 else
105 return TRACE_TYPE_HANDLED;
106}
107
108static enum print_line_t initcall_print_line(struct trace_iterator *iter)
109{
110 struct trace_entry *entry = iter->ent;
111
112 switch (entry->type) {
113 case TRACE_BOOT_CALL:
114 return initcall_call_print_line(iter);
115 case TRACE_BOOT_RET:
116 return initcall_ret_print_line(iter);
117 default:
118 return TRACE_TYPE_UNHANDLED;
119 }
120}
121
122struct tracer boot_tracer __read_mostly =
123{
124 .name = "initcall",
125 .init = boot_trace_init,
126 .reset = tracing_reset_online_cpus,
127 .print_line = initcall_print_line,
128};
129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{
132 struct ftrace_event_call *call = &event_boot_call;
133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace;
137
138 if (!tr || !pre_initcalls_finished)
139 return;
140
141 /* Get its name now since this function could
142 * disappear because it is in the .init section.
143 */
144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable();
146
147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
149 sizeof(*entry), 0, 0);
150 if (!event)
151 goto out;
152 entry = ring_buffer_event_data(event);
153 entry->boot_call = *bt;
154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
156 out:
157 preempt_enable();
158}
159
160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
161{
162 struct ftrace_event_call *call = &event_boot_ret;
163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
165 struct trace_boot_ret *entry;
166 struct trace_array *tr = boot_trace;
167
168 if (!tr || !pre_initcalls_finished)
169 return;
170
171 sprint_symbol(bt->func, (unsigned long)fn);
172 preempt_disable();
173
174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
176 sizeof(*entry), 0, 0);
177 if (!event)
178 goto out;
179 entry = ring_buffer_event_data(event);
180 entry->boot_ret = *bt;
181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
183 out:
184 preempt_enable();
185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
32u64 notrace trace_clock_local(void) 32u64 notrace trace_clock_local(void)
33{ 33{
34 u64 clock; 34 u64 clock;
35 int resched;
36 35
37 /* 36 /*
38 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
39 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
40 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
41 */ 40 */
42 resched = ftrace_preempt_disable(); 41 preempt_disable_notrace();
43 clock = sched_clock(); 42 clock = sched_clock();
44 ftrace_preempt_enable(resched); 43 preempt_enable_notrace();
45 44
46 return clock; 45 return clock;
47} 46}
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void)
56 */ 55 */
57u64 notrace trace_clock(void) 56u64 notrace trace_clock(void)
58{ 57{
59 return cpu_clock(raw_smp_processor_id()); 58 return local_clock();
60} 59}
61 60
62 61
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index dc008c1240da..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
151); 151);
152 152
153/* 153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry: 154 * Stack-trace entry:
172 */ 155 */
173 156
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
271 __entry->map_id, __entry->opcode) 254 __entry->map_id, __entry->opcode)
272); 255);
273 256
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301 257
302#define TRACE_FUNC_SIZE 30 258#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20 259#define TRACE_FILE_SIZE 20
@@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 274 __entry->func, __entry->file, __entry->correct)
319); 275);
320 276
321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
322
323 TRACE_KMEM_ALLOC,
324
325 F_STRUCT(
326 __field( enum kmemtrace_type_id, type_id )
327 __field( unsigned long, call_site )
328 __field( const void *, ptr )
329 __field( size_t, bytes_req )
330 __field( size_t, bytes_alloc )
331 __field( gfp_t, gfp_flags )
332 __field( int, node )
333 ),
334
335 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
336 " flags:%x node:%d",
337 __entry->type_id, __entry->call_site, __entry->ptr,
338 __entry->bytes_req, __entry->bytes_alloc,
339 __entry->gfp_flags, __entry->node)
340);
341
342FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
343
344 TRACE_KMEM_FREE,
345
346 F_STRUCT(
347 __field( enum kmemtrace_type_id, type_id )
348 __field( unsigned long, call_site )
349 __field( const void *, ptr )
350 ),
351
352 F_printk("type:%u call_site:%lx ptr:%p",
353 __entry->type_id, __entry->call_site, __entry->ptr)
354);
355
356FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
357
358 TRACE_KSYM,
359
360 F_STRUCT(
361 __field( unsigned long, ip )
362 __field( unsigned char, type )
363 __array( char , cmd, TASK_COMM_LEN )
364 __field( unsigned long, addr )
365 ),
366
367 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
368 (void *)__entry->ip, (unsigned int)__entry->type,
369 (void *)__entry->addr, __entry->cmd)
370);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 8a2b73f7c068..39c059ca670e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,9 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); 12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13
14static char *perf_trace_buf[4];
15 13
16/* 14/*
17 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -26,7 +24,7 @@ static int total_ref_count;
26static int perf_trace_event_init(struct ftrace_event_call *tp_event, 24static int perf_trace_event_init(struct ftrace_event_call *tp_event,
27 struct perf_event *p_event) 25 struct perf_event *p_event)
28{ 26{
29 struct hlist_head *list; 27 struct hlist_head __percpu *list;
30 int ret = -ENOMEM; 28 int ret = -ENOMEM;
31 int cpu; 29 int cpu;
32 30
@@ -44,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
44 tp_event->perf_events = list; 42 tp_event->perf_events = list;
45 43
46 if (!total_ref_count) { 44 if (!total_ref_count) {
47 char *buf; 45 char __percpu *buf;
48 int i; 46 int i;
49 47
50 for (i = 0; i < 4; i++) { 48 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
51 buf = (char *)alloc_percpu(perf_trace_t); 49 buf = (char __percpu *)alloc_percpu(perf_trace_t);
52 if (!buf) 50 if (!buf)
53 goto fail; 51 goto fail;
54 52
@@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
56 } 54 }
57 } 55 }
58 56
59 if (tp_event->class->reg) 57 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret) 58 if (ret)
67 goto fail; 59 goto fail;
68 60
@@ -73,7 +65,7 @@ fail:
73 if (!total_ref_count) { 65 if (!total_ref_count) {
74 int i; 66 int i;
75 67
76 for (i = 0; i < 4; i++) { 68 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
77 free_percpu(perf_trace_buf[i]); 69 free_percpu(perf_trace_buf[i]);
78 perf_trace_buf[i] = NULL; 70 perf_trace_buf[i] = NULL;
79 } 71 }
@@ -96,11 +88,11 @@ int perf_trace_init(struct perf_event *p_event)
96 mutex_lock(&event_mutex); 88 mutex_lock(&event_mutex);
97 list_for_each_entry(tp_event, &ftrace_events, list) { 89 list_for_each_entry(tp_event, &ftrace_events, list) {
98 if (tp_event->event.type == event_id && 90 if (tp_event->event.type == event_id &&
99 tp_event->class && 91 tp_event->class && tp_event->class->reg &&
100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
102 try_module_get(tp_event->mod)) { 92 try_module_get(tp_event->mod)) {
103 ret = perf_trace_event_init(tp_event, p_event); 93 ret = perf_trace_event_init(tp_event, p_event);
94 if (ret)
95 module_put(tp_event->mod);
104 break; 96 break;
105 } 97 }
106 } 98 }
@@ -109,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
109 return ret; 101 return ret;
110} 102}
111 103
112int perf_trace_enable(struct perf_event *p_event) 104int perf_trace_add(struct perf_event *p_event, int flags)
113{ 105{
114 struct ftrace_event_call *tp_event = p_event->tp_event; 106 struct ftrace_event_call *tp_event = p_event->tp_event;
107 struct hlist_head __percpu *pcpu_list;
115 struct hlist_head *list; 108 struct hlist_head *list;
116 109
117 list = tp_event->perf_events; 110 pcpu_list = tp_event->perf_events;
118 if (WARN_ON_ONCE(!list)) 111 if (WARN_ON_ONCE(!pcpu_list))
119 return -EINVAL; 112 return -EINVAL;
120 113
121 list = this_cpu_ptr(list); 114 if (!(flags & PERF_EF_START))
115 p_event->hw.state = PERF_HES_STOPPED;
116
117 list = this_cpu_ptr(pcpu_list);
122 hlist_add_head_rcu(&p_event->hlist_entry, list); 118 hlist_add_head_rcu(&p_event->hlist_entry, list);
123 119
124 return 0; 120 return 0;
125} 121}
126 122
127void perf_trace_disable(struct perf_event *p_event) 123void perf_trace_del(struct perf_event *p_event, int flags)
128{ 124{
129 hlist_del_rcu(&p_event->hlist_entry); 125 hlist_del_rcu(&p_event->hlist_entry);
130} 126}
@@ -138,29 +134,25 @@ void perf_trace_destroy(struct perf_event *p_event)
138 if (--tp_event->perf_refcount > 0) 134 if (--tp_event->perf_refcount > 0)
139 goto out; 135 goto out;
140 136
141 if (tp_event->class->reg) 137 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
143 else
144 tracepoint_probe_unregister(tp_event->name,
145 tp_event->class->perf_probe,
146 tp_event);
147 138
148 /* 139 /*
149 * Ensure our callback won't be called anymore. See 140 * Ensure our callback won't be called anymore. The buffers
150 * tracepoint_probe_unregister() and __DO_TRACE(). 141 * will be freed after that.
151 */ 142 */
152 synchronize_sched(); 143 tracepoint_synchronize_unregister();
153 144
154 free_percpu(tp_event->perf_events); 145 free_percpu(tp_event->perf_events);
155 tp_event->perf_events = NULL; 146 tp_event->perf_events = NULL;
156 147
157 if (!--total_ref_count) { 148 if (!--total_ref_count) {
158 for (i = 0; i < 4; i++) { 149 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
159 free_percpu(perf_trace_buf[i]); 150 free_percpu(perf_trace_buf[i]);
160 perf_trace_buf[i] = NULL; 151 perf_trace_buf[i] = NULL;
161 } 152 }
162 } 153 }
163out: 154out:
155 module_put(tp_event->mod);
164 mutex_unlock(&event_mutex); 156 mutex_unlock(&event_mutex);
165} 157}
166 158
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..0725eeab1937 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -28,6 +28,7 @@
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields);
31 32
32struct list_head * 33struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call) 34trace_get_fields(struct ftrace_event_call *event_call)
@@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call)
37 return event_call->class->get_fields(event_call); 38 return event_call->class->get_fields(event_call);
38} 39}
39 40
40int trace_define_field(struct ftrace_event_call *call, const char *type, 41static int __trace_define_field(struct list_head *head, const char *type,
41 const char *name, int offset, int size, int is_signed, 42 const char *name, int offset, int size,
42 int filter_type) 43 int is_signed, int filter_type)
43{ 44{
44 struct ftrace_event_field *field; 45 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
49 46
50 field = kzalloc(sizeof(*field), GFP_KERNEL); 47 field = kzalloc(sizeof(*field), GFP_KERNEL);
51 if (!field) 48 if (!field)
@@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
68 field->size = size; 65 field->size = size;
69 field->is_signed = is_signed; 66 field->is_signed = is_signed;
70 67
71 head = trace_get_fields(call);
72 list_add(&field->link, head); 68 list_add(&field->link, head);
73 69
74 return 0; 70 return 0;
@@ -80,17 +76,32 @@ err:
80 76
81 return -ENOMEM; 77 return -ENOMEM;
82} 78}
79
80int trace_define_field(struct ftrace_event_call *call, const char *type,
81 const char *name, int offset, int size, int is_signed,
82 int filter_type)
83{
84 struct list_head *head;
85
86 if (WARN_ON(!call->class))
87 return 0;
88
89 head = trace_get_fields(call);
90 return __trace_define_field(head, type, name, offset, size,
91 is_signed, filter_type);
92}
83EXPORT_SYMBOL_GPL(trace_define_field); 93EXPORT_SYMBOL_GPL(trace_define_field);
84 94
85#define __common_field(type, item) \ 95#define __common_field(type, item) \
86 ret = trace_define_field(call, #type, "common_" #item, \ 96 ret = __trace_define_field(&ftrace_common_fields, #type, \
87 offsetof(typeof(ent), item), \ 97 "common_" #item, \
88 sizeof(ent.item), \ 98 offsetof(typeof(ent), item), \
89 is_signed_type(type), FILTER_OTHER); \ 99 sizeof(ent.item), \
100 is_signed_type(type), FILTER_OTHER); \
90 if (ret) \ 101 if (ret) \
91 return ret; 102 return ret;
92 103
93static int trace_define_common_fields(struct ftrace_event_call *call) 104static int trace_define_common_fields(void)
94{ 105{
95 int ret; 106 int ret;
96 struct trace_entry ent; 107 struct trace_entry ent;
@@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call)
130} 141}
131EXPORT_SYMBOL_GPL(trace_event_raw_init); 142EXPORT_SYMBOL_GPL(trace_event_raw_init);
132 143
144int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
145{
146 switch (type) {
147 case TRACE_REG_REGISTER:
148 return tracepoint_probe_register(call->name,
149 call->class->probe,
150 call);
151 case TRACE_REG_UNREGISTER:
152 tracepoint_probe_unregister(call->name,
153 call->class->probe,
154 call);
155 return 0;
156
157#ifdef CONFIG_PERF_EVENTS
158 case TRACE_REG_PERF_REGISTER:
159 return tracepoint_probe_register(call->name,
160 call->class->perf_probe,
161 call);
162 case TRACE_REG_PERF_UNREGISTER:
163 tracepoint_probe_unregister(call->name,
164 call->class->perf_probe,
165 call);
166 return 0;
167#endif
168 }
169 return 0;
170}
171EXPORT_SYMBOL_GPL(ftrace_event_reg);
172
173void trace_event_enable_cmd_record(bool enable)
174{
175 struct ftrace_event_call *call;
176
177 mutex_lock(&event_mutex);
178 list_for_each_entry(call, &ftrace_events, list) {
179 if (!(call->flags & TRACE_EVENT_FL_ENABLED))
180 continue;
181
182 if (enable) {
183 tracing_start_cmdline_record();
184 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
185 } else {
186 tracing_stop_cmdline_record();
187 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
188 }
189 }
190 mutex_unlock(&event_mutex);
191}
192
133static int ftrace_event_enable_disable(struct ftrace_event_call *call, 193static int ftrace_event_enable_disable(struct ftrace_event_call *call,
134 int enable) 194 int enable)
135{ 195{
@@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
139 case 0: 199 case 0:
140 if (call->flags & TRACE_EVENT_FL_ENABLED) { 200 if (call->flags & TRACE_EVENT_FL_ENABLED) {
141 call->flags &= ~TRACE_EVENT_FL_ENABLED; 201 call->flags &= ~TRACE_EVENT_FL_ENABLED;
142 tracing_stop_cmdline_record(); 202 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
143 if (call->class->reg) 203 tracing_stop_cmdline_record();
144 call->class->reg(call, TRACE_REG_UNREGISTER); 204 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
145 else 205 }
146 tracepoint_probe_unregister(call->name, 206 call->class->reg(call, TRACE_REG_UNREGISTER);
147 call->class->probe,
148 call);
149 } 207 }
150 break; 208 break;
151 case 1: 209 case 1:
152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 210 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
153 tracing_start_cmdline_record(); 211 if (trace_flags & TRACE_ITER_RECORD_CMD) {
154 if (call->class->reg) 212 tracing_start_cmdline_record();
155 ret = call->class->reg(call, TRACE_REG_REGISTER); 213 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
156 else 214 }
157 ret = tracepoint_probe_register(call->name, 215 ret = call->class->reg(call, TRACE_REG_REGISTER);
158 call->class->probe,
159 call);
160 if (ret) { 216 if (ret) {
161 tracing_stop_cmdline_record(); 217 tracing_stop_cmdline_record();
162 pr_info("event trace: Could not enable event " 218 pr_info("event trace: Could not enable event "
@@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
194 mutex_lock(&event_mutex); 250 mutex_lock(&event_mutex);
195 list_for_each_entry(call, &ftrace_events, list) { 251 list_for_each_entry(call, &ftrace_events, list) {
196 252
197 if (!call->name || !call->class || 253 if (!call->name || !call->class || !call->class->reg)
198 (!call->class->probe && !call->class->reg))
199 continue; 254 continue;
200 255
201 if (match && 256 if (match &&
@@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
321 * The ftrace subsystem is for showing formats only. 376 * The ftrace subsystem is for showing formats only.
322 * They can not be enabled or disabled via the event files. 377 * They can not be enabled or disabled via the event files.
323 */ 378 */
324 if (call->class && (call->class->probe || call->class->reg)) 379 if (call->class && call->class->reg)
325 return call; 380 return call;
326 } 381 }
327 382
@@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
474 529
475 mutex_lock(&event_mutex); 530 mutex_lock(&event_mutex);
476 list_for_each_entry(call, &ftrace_events, list) { 531 list_for_each_entry(call, &ftrace_events, list) {
477 if (!call->name || !call->class || 532 if (!call->name || !call->class || !call->class->reg)
478 (!call->class->probe && !call->class->reg))
479 continue; 533 continue;
480 534
481 if (system && strcmp(call->class->system, system) != 0) 535 if (system && strcmp(call->class->system, system) != 0)
@@ -544,85 +598,146 @@ out:
544 return ret; 598 return ret;
545} 599}
546 600
547static ssize_t 601enum {
548event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 602 FORMAT_HEADER = 1,
549 loff_t *ppos) 603 FORMAT_FIELD_SEPERATOR = 2,
604 FORMAT_PRINTFMT = 3,
605};
606
607static void *f_next(struct seq_file *m, void *v, loff_t *pos)
550{ 608{
551 struct ftrace_event_call *call = filp->private_data; 609 struct ftrace_event_call *call = m->private;
552 struct ftrace_event_field *field; 610 struct ftrace_event_field *field;
553 struct list_head *head; 611 struct list_head *common_head = &ftrace_common_fields;
554 struct trace_seq *s; 612 struct list_head *head = trace_get_fields(call);
555 int common_field_count = 5;
556 char *buf;
557 int r = 0;
558 613
559 if (*ppos) 614 (*pos)++;
560 return 0;
561 615
562 s = kmalloc(sizeof(*s), GFP_KERNEL); 616 switch ((unsigned long)v) {
563 if (!s) 617 case FORMAT_HEADER:
564 return -ENOMEM; 618 if (unlikely(list_empty(common_head)))
619 return NULL;
565 620
566 trace_seq_init(s); 621 field = list_entry(common_head->prev,
622 struct ftrace_event_field, link);
623 return field;
567 624
568 trace_seq_printf(s, "name: %s\n", call->name); 625 case FORMAT_FIELD_SEPERATOR:
569 trace_seq_printf(s, "ID: %d\n", call->event.type); 626 if (unlikely(list_empty(head)))
570 trace_seq_printf(s, "format:\n"); 627 return NULL;
571 628
572 head = trace_get_fields(call); 629 field = list_entry(head->prev, struct ftrace_event_field, link);
573 list_for_each_entry_reverse(field, head, link) { 630 return field;
574 /*
575 * Smartly shows the array type(except dynamic array).
576 * Normal:
577 * field:TYPE VAR
578 * If TYPE := TYPE[LEN], it is shown:
579 * field:TYPE VAR[LEN]
580 */
581 const char *array_descriptor = strchr(field->type, '[');
582 631
583 if (!strncmp(field->type, "__data_loc", 10)) 632 case FORMAT_PRINTFMT:
584 array_descriptor = NULL; 633 /* all done */
634 return NULL;
635 }
585 636
586 if (!array_descriptor) { 637 field = v;
587 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" 638 if (field->link.prev == common_head)
588 "\tsize:%u;\tsigned:%d;\n", 639 return (void *)FORMAT_FIELD_SEPERATOR;
589 field->type, field->name, field->offset, 640 else if (field->link.prev == head)
590 field->size, !!field->is_signed); 641 return (void *)FORMAT_PRINTFMT;
591 } else {
592 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
593 "\tsize:%u;\tsigned:%d;\n",
594 (int)(array_descriptor - field->type),
595 field->type, field->name,
596 array_descriptor, field->offset,
597 field->size, !!field->is_signed);
598 }
599 642
600 if (--common_field_count == 0) 643 field = list_entry(field->link.prev, struct ftrace_event_field, link);
601 r = trace_seq_printf(s, "\n");
602 644
603 if (!r) 645 return field;
604 break; 646}
605 }
606 647
607 if (r) 648static void *f_start(struct seq_file *m, loff_t *pos)
608 r = trace_seq_printf(s, "\nprint fmt: %s\n", 649{
609 call->print_fmt); 650 loff_t l = 0;
651 void *p;
610 652
611 if (!r) { 653 /* Start by showing the header */
612 /* 654 if (!*pos)
613 * ug! The format output is bigger than a PAGE!! 655 return (void *)FORMAT_HEADER;
614 */ 656
615 buf = "FORMAT TOO BIG\n"; 657 p = (void *)FORMAT_HEADER;
616 r = simple_read_from_buffer(ubuf, cnt, ppos, 658 do {
617 buf, strlen(buf)); 659 p = f_next(m, p, &l);
618 goto out; 660 } while (p && l < *pos);
661
662 return p;
663}
664
665static int f_show(struct seq_file *m, void *v)
666{
667 struct ftrace_event_call *call = m->private;
668 struct ftrace_event_field *field;
669 const char *array_descriptor;
670
671 switch ((unsigned long)v) {
672 case FORMAT_HEADER:
673 seq_printf(m, "name: %s\n", call->name);
674 seq_printf(m, "ID: %d\n", call->event.type);
675 seq_printf(m, "format:\n");
676 return 0;
677
678 case FORMAT_FIELD_SEPERATOR:
679 seq_putc(m, '\n');
680 return 0;
681
682 case FORMAT_PRINTFMT:
683 seq_printf(m, "\nprint fmt: %s\n",
684 call->print_fmt);
685 return 0;
619 } 686 }
620 687
621 r = simple_read_from_buffer(ubuf, cnt, ppos, 688 field = v;
622 s->buffer, s->len); 689
623 out: 690 /*
624 kfree(s); 691 * Smartly shows the array type(except dynamic array).
625 return r; 692 * Normal:
693 * field:TYPE VAR
694 * If TYPE := TYPE[LEN], it is shown:
695 * field:TYPE VAR[LEN]
696 */
697 array_descriptor = strchr(field->type, '[');
698
699 if (!strncmp(field->type, "__data_loc", 10))
700 array_descriptor = NULL;
701
702 if (!array_descriptor)
703 seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
704 field->type, field->name, field->offset,
705 field->size, !!field->is_signed);
706 else
707 seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
708 (int)(array_descriptor - field->type),
709 field->type, field->name,
710 array_descriptor, field->offset,
711 field->size, !!field->is_signed);
712
713 return 0;
714}
715
716static void f_stop(struct seq_file *m, void *p)
717{
718}
719
720static const struct seq_operations trace_format_seq_ops = {
721 .start = f_start,
722 .next = f_next,
723 .stop = f_stop,
724 .show = f_show,
725};
726
727static int trace_format_open(struct inode *inode, struct file *file)
728{
729 struct ftrace_event_call *call = inode->i_private;
730 struct seq_file *m;
731 int ret;
732
733 ret = seq_open(file, &trace_format_seq_ops);
734 if (ret < 0)
735 return ret;
736
737 m = file->private_data;
738 m->private = call;
739
740 return 0;
626} 741}
627 742
628static ssize_t 743static ssize_t
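
The format-file rework above drops the single trace_seq buffer (and its "FORMAT TOO BIG" fallback once output exceeded a page) in favour of a seq_file iterator that emits the header, the common fields, the event fields and the print format as separate records. A minimal, self-contained sketch of the same start/next/stop/show shape for an arbitrary read-only file; the my_* names and the fixed item array are stand-ins for the real field lists:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/fs.h>

/* Iterate a small fixed array; real code walks the common and event field lists. */
static const char *const my_items[] = { "header", "field_a", "field_b" };

static void *my_start(struct seq_file *m, loff_t *pos)
{
	return *pos < ARRAY_SIZE(my_items) ? (void *)&my_items[*pos] : NULL;
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return my_start(m, pos);
}

static void my_stop(struct seq_file *m, void *v)
{
}

static int my_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char *const *)v);
	return 0;
}

static const struct seq_operations my_seq_ops = {
	.start	= my_start,
	.next	= my_next,
	.stop	= my_stop,
	.show	= my_show,
};

static int my_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &my_seq_ops);
}

static const struct file_operations my_fops = {
	.owner		= THIS_MODULE,
	.open		= my_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

Because seq_read() resumes the iterator across successive read() calls, arbitrarily long output no longer has to fit in a single page-sized buffer.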
@@ -817,39 +932,47 @@ static const struct file_operations ftrace_enable_fops = {
817 .open = tracing_open_generic, 932 .open = tracing_open_generic,
818 .read = event_enable_read, 933 .read = event_enable_read,
819 .write = event_enable_write, 934 .write = event_enable_write,
935 .llseek = default_llseek,
820}; 936};
821 937
822static const struct file_operations ftrace_event_format_fops = { 938static const struct file_operations ftrace_event_format_fops = {
823 .open = tracing_open_generic, 939 .open = trace_format_open,
824 .read = event_format_read, 940 .read = seq_read,
941 .llseek = seq_lseek,
942 .release = seq_release,
825}; 943};
826 944
827static const struct file_operations ftrace_event_id_fops = { 945static const struct file_operations ftrace_event_id_fops = {
828 .open = tracing_open_generic, 946 .open = tracing_open_generic,
829 .read = event_id_read, 947 .read = event_id_read,
948 .llseek = default_llseek,
830}; 949};
831 950
832static const struct file_operations ftrace_event_filter_fops = { 951static const struct file_operations ftrace_event_filter_fops = {
833 .open = tracing_open_generic, 952 .open = tracing_open_generic,
834 .read = event_filter_read, 953 .read = event_filter_read,
835 .write = event_filter_write, 954 .write = event_filter_write,
955 .llseek = default_llseek,
836}; 956};
837 957
838static const struct file_operations ftrace_subsystem_filter_fops = { 958static const struct file_operations ftrace_subsystem_filter_fops = {
839 .open = tracing_open_generic, 959 .open = tracing_open_generic,
840 .read = subsystem_filter_read, 960 .read = subsystem_filter_read,
841 .write = subsystem_filter_write, 961 .write = subsystem_filter_write,
962 .llseek = default_llseek,
842}; 963};
843 964
844static const struct file_operations ftrace_system_enable_fops = { 965static const struct file_operations ftrace_system_enable_fops = {
845 .open = tracing_open_generic, 966 .open = tracing_open_generic,
846 .read = system_enable_read, 967 .read = system_enable_read,
847 .write = system_enable_write, 968 .write = system_enable_write,
969 .llseek = default_llseek,
848}; 970};
849 971
850static const struct file_operations ftrace_show_header_fops = { 972static const struct file_operations ftrace_show_header_fops = {
851 .open = tracing_open_generic, 973 .open = tracing_open_generic,
852 .read = show_header, 974 .read = show_header,
975 .llseek = default_llseek,
853}; 976};
854 977
855static struct dentry *event_trace_events_dir(void) 978static struct dentry *event_trace_events_dir(void)
@@ -963,35 +1086,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
963 return -1; 1086 return -1;
964 } 1087 }
965 1088
966 if (call->class->probe || call->class->reg) 1089 if (call->class->reg)
967 trace_create_file("enable", 0644, call->dir, call, 1090 trace_create_file("enable", 0644, call->dir, call,
968 enable); 1091 enable);
969 1092
970#ifdef CONFIG_PERF_EVENTS 1093#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg)) 1094 if (call->event.type && call->class->reg)
972 trace_create_file("id", 0444, call->dir, call, 1095 trace_create_file("id", 0444, call->dir, call,
973 id); 1096 id);
974#endif 1097#endif
975 1098
976 if (call->class->define_fields) { 1099 /*
977 /* 1100 * Other events may have the same class. Only update
978 * Other events may have the same class. Only update 1101 * the fields if they are not already defined.
979 * the fields if they are not already defined. 1102 */
980 */ 1103 head = trace_get_fields(call);
981 head = trace_get_fields(call); 1104 if (list_empty(head)) {
982 if (list_empty(head)) { 1105 ret = call->class->define_fields(call);
983 ret = trace_define_common_fields(call); 1106 if (ret < 0) {
984 if (!ret) 1107 pr_warning("Could not initialize trace point"
985 ret = call->class->define_fields(call); 1108 " events/%s\n", call->name);
986 if (ret < 0) { 1109 return ret;
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
991 } 1110 }
992 trace_create_file("filter", 0644, call->dir, call,
993 filter);
994 } 1111 }
1112 trace_create_file("filter", 0644, call->dir, call,
1113 filter);
995 1114
996 trace_create_file("format", 0444, call->dir, call, 1115 trace_create_file("format", 0444, call->dir, call,
997 format); 1116 format);
@@ -999,11 +1118,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
999 return 0; 1118 return 0;
1000} 1119}
1001 1120
1002static int __trace_add_event_call(struct ftrace_event_call *call) 1121static int
1122__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1123 const struct file_operations *id,
1124 const struct file_operations *enable,
1125 const struct file_operations *filter,
1126 const struct file_operations *format)
1003{ 1127{
1004 struct dentry *d_events; 1128 struct dentry *d_events;
1005 int ret; 1129 int ret;
1006 1130
1131 /* The linker may leave blanks */
1007 if (!call->name) 1132 if (!call->name)
1008 return -EINVAL; 1133 return -EINVAL;
1009 1134
@@ -1011,8 +1136,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1011 ret = call->class->raw_init(call); 1136 ret = call->class->raw_init(call);
1012 if (ret < 0) { 1137 if (ret < 0) {
1013 if (ret != -ENOSYS) 1138 if (ret != -ENOSYS)
1014 pr_warning("Could not initialize trace " 1139 pr_warning("Could not initialize trace events/%s\n",
1015 "events/%s\n", call->name); 1140 call->name);
1016 return ret; 1141 return ret;
1017 } 1142 }
1018 } 1143 }
@@ -1021,11 +1146,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1021 if (!d_events) 1146 if (!d_events)
1022 return -ENOENT; 1147 return -ENOENT;
1023 1148
1024 ret = event_create_dir(call, d_events, &ftrace_event_id_fops, 1149 ret = event_create_dir(call, d_events, id, enable, filter, format);
1025 &ftrace_enable_fops, &ftrace_event_filter_fops,
1026 &ftrace_event_format_fops);
1027 if (!ret) 1150 if (!ret)
1028 list_add(&call->list, &ftrace_events); 1151 list_add(&call->list, &ftrace_events);
1152 call->mod = mod;
1029 1153
1030 return ret; 1154 return ret;
1031} 1155}
@@ -1035,7 +1159,10 @@ int trace_add_event_call(struct ftrace_event_call *call)
1035{ 1159{
1036 int ret; 1160 int ret;
1037 mutex_lock(&event_mutex); 1161 mutex_lock(&event_mutex);
1038 ret = __trace_add_event_call(call); 1162 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1163 &ftrace_enable_fops,
1164 &ftrace_event_filter_fops,
1165 &ftrace_event_format_fops);
1039 mutex_unlock(&event_mutex); 1166 mutex_unlock(&event_mutex);
1040 return ret; 1167 return ret;
1041} 1168}
@@ -1152,8 +1279,6 @@ static void trace_module_add_events(struct module *mod)
1152{ 1279{
1153 struct ftrace_module_file_ops *file_ops = NULL; 1280 struct ftrace_module_file_ops *file_ops = NULL;
1154 struct ftrace_event_call *call, *start, *end; 1281 struct ftrace_event_call *call, *start, *end;
1155 struct dentry *d_events;
1156 int ret;
1157 1282
1158 start = mod->trace_events; 1283 start = mod->trace_events;
1159 end = mod->trace_events + mod->num_trace_events; 1284 end = mod->trace_events + mod->num_trace_events;
@@ -1161,38 +1286,14 @@ static void trace_module_add_events(struct module *mod)
1161 if (start == end) 1286 if (start == end)
1162 return; 1287 return;
1163 1288
1164 d_events = event_trace_events_dir(); 1289 file_ops = trace_create_file_ops(mod);
1165 if (!d_events) 1290 if (!file_ops)
1166 return; 1291 return;
1167 1292
1168 for_each_event(call, start, end) { 1293 for_each_event(call, start, end) {
1169 /* The linker may leave blanks */ 1294 __trace_add_event_call(call, mod,
1170 if (!call->name)
1171 continue;
1172 if (call->class->raw_init) {
1173 ret = call->class->raw_init(call);
1174 if (ret < 0) {
1175 if (ret != -ENOSYS)
1176 pr_warning("Could not initialize trace "
1177 "point events/%s\n", call->name);
1178 continue;
1179 }
1180 }
1181 /*
1182 * This module has events, create file ops for this module
1183 * if not already done.
1184 */
1185 if (!file_ops) {
1186 file_ops = trace_create_file_ops(mod);
1187 if (!file_ops)
1188 return;
1189 }
1190 call->mod = mod;
1191 ret = event_create_dir(call, d_events,
1192 &file_ops->id, &file_ops->enable, 1295 &file_ops->id, &file_ops->enable,
1193 &file_ops->filter, &file_ops->format); 1296 &file_ops->filter, &file_ops->format);
1194 if (!ret)
1195 list_add(&call->list, &ftrace_events);
1196 } 1297 }
1197} 1298}
1198 1299
@@ -1319,25 +1420,14 @@ static __init int event_trace_init(void)
1319 trace_create_file("enable", 0644, d_events, 1420 trace_create_file("enable", 0644, d_events,
1320 NULL, &ftrace_system_enable_fops); 1421 NULL, &ftrace_system_enable_fops);
1321 1422
1423 if (trace_define_common_fields())
1424 pr_warning("tracing: Failed to allocate common fields");
1425
1322 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1426 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1323 /* The linker may leave blanks */ 1427 __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1324 if (!call->name)
1325 continue;
1326 if (call->class->raw_init) {
1327 ret = call->class->raw_init(call);
1328 if (ret < 0) {
1329 if (ret != -ENOSYS)
1330 pr_warning("Could not initialize trace "
1331 "point events/%s\n", call->name);
1332 continue;
1333 }
1334 }
1335 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1336 &ftrace_enable_fops, 1428 &ftrace_enable_fops,
1337 &ftrace_event_filter_fops, 1429 &ftrace_event_filter_fops,
1338 &ftrace_event_format_fops); 1430 &ftrace_event_format_fops);
1339 if (!ret)
1340 list_add(&call->list, &ftrace_events);
1341 } 1431 }
1342 1432
1343 while (true) { 1433 while (true) {
@@ -1524,12 +1614,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1524 struct ftrace_entry *entry; 1614 struct ftrace_entry *entry;
1525 unsigned long flags; 1615 unsigned long flags;
1526 long disabled; 1616 long disabled;
1527 int resched;
1528 int cpu; 1617 int cpu;
1529 int pc; 1618 int pc;
1530 1619
1531 pc = preempt_count(); 1620 pc = preempt_count();
1532 resched = ftrace_preempt_disable(); 1621 preempt_disable_notrace();
1533 cpu = raw_smp_processor_id(); 1622 cpu = raw_smp_processor_id();
1534 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); 1623 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1535 1624
@@ -1551,7 +1640,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1551 1640
1552 out: 1641 out:
1553 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1642 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1554 ftrace_preempt_enable(resched); 1643 preempt_enable_notrace();
1555} 1644}
1556 1645
1557static struct ftrace_ops trace_ops __initdata = 1646static struct ftrace_ops trace_ops __initdata =
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 57bb1bb32999..36d40104b17f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system,
497} 497}
498 498
499static struct ftrace_event_field * 499static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500__find_event_field(struct list_head *head, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
504 503
505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) { 504 list_for_each_entry(field, head, link) {
507 if (!strcmp(field->name, name)) 505 if (!strcmp(field->name, name))
508 return field; 506 return field;
@@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
511 return NULL; 509 return NULL;
512} 510}
513 511
512static struct ftrace_event_field *
513find_event_field(struct ftrace_event_call *call, char *name)
514{
515 struct ftrace_event_field *field;
516 struct list_head *head;
517
518 field = __find_event_field(&ftrace_common_fields, name);
519 if (field)
520 return field;
521
522 head = trace_get_fields(call);
523 return __find_event_field(head, name);
524}
525
514static void filter_free_pred(struct filter_pred *pred) 526static void filter_free_pred(struct filter_pred *pred)
515{ 527{
516 if (!pred) 528 if (!pred)
@@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system)
627 int err; 639 int err;
628 640
629 list_for_each_entry(call, &ftrace_events, list) { 641 list_for_each_entry(call, &ftrace_events, list) {
630 if (!call->class || !call->class->define_fields)
631 continue;
632
633 if (strcmp(call->class->system, system->name) != 0) 642 if (strcmp(call->class->system, system->name) != 0)
634 continue; 643 continue;
635 644
@@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
646 struct ftrace_event_call *call; 655 struct ftrace_event_call *call;
647 656
648 list_for_each_entry(call, &ftrace_events, list) { 657 list_for_each_entry(call, &ftrace_events, list) {
649 if (!call->class || !call->class->define_fields)
650 continue;
651
652 if (strcmp(call->class->system, system->name) != 0) 658 if (strcmp(call->class->system, system->name) != 0)
653 continue; 659 continue;
654 660
@@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system,
1251 list_for_each_entry(call, &ftrace_events, list) { 1257 list_for_each_entry(call, &ftrace_events, list) {
1252 struct event_filter *filter = call->filter; 1258 struct event_filter *filter = call->filter;
1253 1259
1254 if (!call->class || !call->class->define_fields)
1255 continue;
1256
1257 if (strcmp(call->class->system, system->name) != 0) 1260 if (strcmp(call->class->system, system->name) != 0)
1258 continue; 1261 continue;
1259 1262
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 8536e2a65969..4ba44deaac25 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
125 125
126#include "trace_entries.h" 126#include "trace_entries.h"
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{
130 INIT_LIST_HEAD(&call->class->fields);
131 return 0;
132}
133
134#undef __entry 128#undef __entry
135#define __entry REC 129#define __entry REC
136 130
@@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
158struct ftrace_event_class event_class_ftrace_##call = { \ 152struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 153 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 154 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \ 155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
162}; \ 156}; \
163 \ 157 \
164struct ftrace_event_call __used \ 158struct ftrace_event_call __used \
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
54 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
55 unsigned long flags; 55 unsigned long flags;
56 long disabled; 56 long disabled;
57 int cpu, resched; 57 int cpu;
58 int pc; 58 int pc;
59 59
60 if (unlikely(!ftrace_function_enabled)) 60 if (unlikely(!ftrace_function_enabled))
61 return; 61 return;
62 62
63 pc = preempt_count(); 63 pc = preempt_count();
64 resched = ftrace_preempt_disable(); 64 preempt_disable_notrace();
65 local_save_flags(flags); 65 local_save_flags(flags);
66 cpu = raw_smp_processor_id(); 66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu]; 67 data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
71 trace_function(tr, ip, parent_ip, flags, pc); 71 trace_function(tr, ip, parent_ip, flags, pc);
72 72
73 atomic_dec(&data->disabled); 73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched); 74 preempt_enable_notrace();
75} 75}
76 76
77static void 77static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 79f4bac99a94..76b05980225c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs;
20
18struct fgraph_cpu_data { 21struct fgraph_cpu_data {
19 pid_t last_pid; 22 pid_t last_pid;
20 int depth; 23 int depth;
24 int depth_irq;
21 int ignore; 25 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; 26 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23}; 27};
24 28
25struct fgraph_data { 29struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data; 30 struct fgraph_cpu_data __percpu *cpu_data;
27 31
28 /* Place to preserve last processed entry. */ 32 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent; 33 struct ftrace_graph_ent_entry ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
41#define TRACE_GRAPH_PRINT_PROC 0x8 45#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 46#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
44 49
45static struct tracer_opt trace_opts[] = { 50static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 51 /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
55 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, 60 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
56 /* Display absolute time of an entry */ 61 /* Display absolute time of an entry */
57 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 62 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
63 /* Display interrupts */
64 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 { } /* Empty entry */ 65 { } /* Empty entry */
59}; 66};
60 67
61static struct tracer_flags tracer_flags = { 68static struct tracer_flags tracer_flags = {
62 /* Don't display overruns and proc by default */ 69 /* Don't display overruns and proc by default */
63 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 70 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
64 TRACE_GRAPH_PRINT_DURATION, 71 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
65 .opts = trace_opts 72 .opts = trace_opts
66}; 73};
67 74
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
204 return 1; 211 return 1;
205} 212}
206 213
214static inline int ftrace_graph_ignore_irqs(void)
215{
216 if (!ftrace_graph_skip_irqs)
217 return 0;
218
219 return in_irq();
220}
221
207int trace_graph_entry(struct ftrace_graph_ent *trace) 222int trace_graph_entry(struct ftrace_graph_ent *trace)
208{ 223{
209 struct trace_array *tr = graph_array; 224 struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
218 return 0; 233 return 0;
219 234
220 /* trace it when it is-nested-in or is a function enabled. */ 235 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func))) 236 if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
237 ftrace_graph_ignore_irqs())
222 return 0; 238 return 0;
223 239
224 local_irq_save(flags); 240 local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 262 return trace_graph_entry(trace);
247} 263}
248 264
265static void
266__trace_graph_function(struct trace_array *tr,
267 unsigned long ip, unsigned long flags, int pc)
268{
269 u64 time = trace_clock_local();
270 struct ftrace_graph_ent ent = {
271 .func = ip,
272 .depth = 0,
273 };
274 struct ftrace_graph_ret ret = {
275 .func = ip,
276 .depth = 0,
277 .calltime = time,
278 .rettime = time,
279 };
280
281 __trace_graph_entry(tr, &ent, flags, pc);
282 __trace_graph_return(tr, &ret, flags, pc);
283}
284
285void
286trace_graph_function(struct trace_array *tr,
287 unsigned long ip, unsigned long parent_ip,
288 unsigned long flags, int pc)
289{
290 __trace_graph_function(tr, ip, flags, pc);
291}
292
249void __trace_graph_return(struct trace_array *tr, 293void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 294 struct ftrace_graph_ret *trace,
251 unsigned long flags, 295 unsigned long flags,
@@ -507,7 +551,15 @@ get_return_for_leaf(struct trace_iterator *iter,
507 * if the output fails. 551 * if the output fails.
508 */ 552 */
509 data->ent = *curr; 553 data->ent = *curr;
510 data->ret = *next; 554 /*
555 * If the next event is not a return type, then
556 * we only care about what type it is. Otherwise we can
557 * safely copy the entire event.
558 */
559 if (next->ent.type == TRACE_GRAPH_RET)
560 data->ret = *next;
561 else
562 data->ret.ent.type = next->ent.type;
511 } 563 }
512 } 564 }
513 565
@@ -641,7 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
641 693
642 /* Print nsecs (we don't want to exceed 7 numbers) */ 694 /* Print nsecs (we don't want to exceed 7 numbers) */
643 if (len < 7) { 695 if (len < 7) {
644 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); 696 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
697
698 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
645 ret = trace_seq_printf(s, ".%s", nsecs_str); 699 ret = trace_seq_printf(s, ".%s", nsecs_str);
646 if (!ret) 700 if (!ret)
647 return TRACE_TYPE_PARTIAL_LINE; 701 return TRACE_TYPE_PARTIAL_LINE;
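
The duration hunk above guards the nanosecond formatting: the old call always asked snprintf() for 8 - len bytes, which can exceed the small nsecs_str buffer when len is small, so the size is now clamped to sizeof(nsecs_str) with min_t(). A standalone illustration of the same clamp, with an arbitrary buffer size and values:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/*
 * Illustrative only: print up to three digits of a nanosecond remainder
 * without ever asking snprintf() for more room than the buffer has.
 * Buffer size and values here are arbitrary.
 */
int main(void)
{
	char nsecs_str[5];
	unsigned long nsecs_rem = 123;
	size_t len = 0;		/* digits already used by the integer part */

	/* Clamp the requested size to what nsecs_str can actually hold. */
	size_t slen = MIN(sizeof(nsecs_str), 8UL - len);

	snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
	printf(".%s\n", nsecs_str);
	return 0;
}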
@@ -846,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
846 return 0; 900 return 0;
847} 901}
848 902
903/*
904 * Entry check for irq code
905 *
906 * returns 1 if
907 * - we are inside irq code
 908 * - we just entered irq code
909 *
 910 * returns 0 if
911 * - funcgraph-interrupts option is set
912 * - we are not inside irq code
913 */
914static int
915check_irq_entry(struct trace_iterator *iter, u32 flags,
916 unsigned long addr, int depth)
917{
918 int cpu = iter->cpu;
919 int *depth_irq;
920 struct fgraph_data *data = iter->private;
921
922 /*
923 * If we are either displaying irqs, or we got called as
924 * a graph event and private data does not exist,
925 * then we bypass the irq check.
926 */
927 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
928 (!data))
929 return 0;
930
931 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
932
933 /*
934 * We are inside the irq code
935 */
936 if (*depth_irq >= 0)
937 return 1;
938
939 if ((addr < (unsigned long)__irqentry_text_start) ||
940 (addr >= (unsigned long)__irqentry_text_end))
941 return 0;
942
943 /*
944 * We are entering irq code.
945 */
946 *depth_irq = depth;
947 return 1;
948}
949
950/*
951 * Return check for irq code
952 *
953 * returns 1 if
954 * - we are inside irq code
955 * - we just left irq code
956 *
957 * returns 0 if
958 * - funcgraph-interrupts option is set
959 * - we are not inside irq code
960 */
961static int
962check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
963{
964 int cpu = iter->cpu;
965 int *depth_irq;
966 struct fgraph_data *data = iter->private;
967
968 /*
969 * If we are either displaying irqs, or we got called as
970 * a graph event and private data does not exist,
971 * then we bypass the irq check.
972 */
973 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
974 (!data))
975 return 0;
976
977 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
978
979 /*
980 * We are not inside the irq code.
981 */
982 if (*depth_irq == -1)
983 return 0;
984
985 /*
986 * We are inside the irq code, and this is returning entry.
987 * Let's not trace it and clear the entry depth, since
988 * we are out of irq code.
989 *
990 * This condition ensures that we 'leave the irq code' once
991 * we are out of the entry depth. Thus protecting us from
992 * the RETURN entry loss.
993 */
994 if (*depth_irq >= depth) {
995 *depth_irq = -1;
996 return 1;
997 }
998
999 /*
1000 * We are inside the irq code, and this is not the entry.
1001 */
1002 return 1;
1003}
1004
849static enum print_line_t 1005static enum print_line_t
850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 1006print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
851 struct trace_iterator *iter, u32 flags) 1007 struct trace_iterator *iter, u32 flags)
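
check_irq_entry() and check_irq_return() above latch, per CPU, the call depth at which the graph tracer first hit the irq entry text, so every nested call is skipped until the matching return unwinds past that depth. A standalone sketch of the same depth-latch logic; the address range and all names are made up for illustration:

#include <stdbool.h>
#include <stdio.h>

/* -1 means "not currently inside irq code"; otherwise the entry depth. */
static int depth_irq = -1;

/* Hypothetical predicate: does this address live in irq entry code? */
static bool addr_is_irq_entry(unsigned long addr)
{
	return addr >= 0x1000 && addr < 0x2000;	/* made-up range */
}

static bool skip_on_entry(unsigned long addr, int depth)
{
	if (depth_irq >= 0)		/* already inside irq code */
		return true;
	if (!addr_is_irq_entry(addr))
		return false;
	depth_irq = depth;		/* latch the depth where irq code began */
	return true;
}

static bool skip_on_return(int depth)
{
	if (depth_irq == -1)		/* not inside irq code */
		return false;
	if (depth_irq >= depth)		/* returning out of the irq entry point */
		depth_irq = -1;
	return true;
}

int main(void)
{
	/* Enter normal code, then irq code at depth 3, then unwind. */
	printf("%d\n", skip_on_entry(0x0500, 2));	/* 0: keep tracing */
	printf("%d\n", skip_on_entry(0x1234, 3));	/* 1: latched, skip */
	printf("%d\n", skip_on_entry(0x0600, 4));	/* 1: nested, skip */
	printf("%d\n", skip_on_return(4));		/* 1: still inside irq */
	printf("%d\n", skip_on_return(3));		/* 1: leaving, unlatch */
	printf("%d\n", skip_on_return(2));		/* 0: back to normal */
	return 0;
}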
@@ -856,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
856 static enum print_line_t ret; 1012 static enum print_line_t ret;
857 int cpu = iter->cpu; 1013 int cpu = iter->cpu;
858 1014
1015 if (check_irq_entry(iter, flags, call->func, call->depth))
1016 return TRACE_TYPE_HANDLED;
1017
859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1018 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
860 return TRACE_TYPE_PARTIAL_LINE; 1019 return TRACE_TYPE_PARTIAL_LINE;
861 1020
@@ -893,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
893 int ret; 1052 int ret;
894 int i; 1053 int i;
895 1054
1055 if (check_irq_return(iter, flags, trace->depth))
1056 return TRACE_TYPE_HANDLED;
1057
896 if (data) { 1058 if (data) {
897 struct fgraph_cpu_data *cpu_data; 1059 struct fgraph_cpu_data *cpu_data;
898 int cpu = iter->cpu; 1060 int cpu = iter->cpu;
@@ -1045,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1045 1207
1046 1208
1047enum print_line_t 1209enum print_line_t
1048print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1049{ 1211{
1050 struct ftrace_graph_ent_entry *field; 1212 struct ftrace_graph_ent_entry *field;
1051 struct fgraph_data *data = iter->private; 1213 struct fgraph_data *data = iter->private;
@@ -1108,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1108static enum print_line_t 1270static enum print_line_t
1109print_graph_function(struct trace_iterator *iter) 1271print_graph_function(struct trace_iterator *iter)
1110{ 1272{
1111 return print_graph_function_flags(iter, tracer_flags.val); 1273 return __print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1112} 1285}
1113 1286
1114static enum print_line_t 1287static enum print_line_t
@@ -1140,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1140 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1141} 1314}
1142 1315
1143void print_graph_headers_flags(struct seq_file *s, u32 flags) 1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1144{ 1317{
1145 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1318 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1146 1319
@@ -1181,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
1181 print_graph_headers_flags(s, tracer_flags.val); 1354 print_graph_headers_flags(s, tracer_flags.val);
1182} 1355}
1183 1356
1357void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{
1359 struct trace_iterator *iter = s->private;
1360
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter))
1364 return;
1365
1366 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION;
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370
1371 __print_graph_headers_flags(s, flags);
1372}
1373
1184void graph_trace_open(struct trace_iterator *iter) 1374void graph_trace_open(struct trace_iterator *iter)
1185{ 1375{
1186 /* pid and depth on the last trace processed */ 1376 /* pid and depth on the last trace processed */
@@ -1201,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
1201 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 1391 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1202 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 1392 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1203 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); 1393 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1394 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
1395
1204 *pid = -1; 1396 *pid = -1;
1205 *depth = 0; 1397 *depth = 0;
1206 *ignore = 0; 1398 *ignore = 0;
1399 *depth_irq = -1;
1207 } 1400 }
1208 1401
1209 iter->private = data; 1402 iter->private = data;
@@ -1226,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
1226 } 1419 }
1227} 1420}
1228 1421
1422static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
1423{
1424 if (bit == TRACE_GRAPH_PRINT_IRQS)
1425 ftrace_graph_skip_irqs = !set;
1426
1427 return 0;
1428}
1429
1229static struct trace_event_functions graph_functions = { 1430static struct trace_event_functions graph_functions = {
1230 .trace = print_graph_function_event, 1431 .trace = print_graph_function_event,
1231}; 1432};
@@ -1252,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
1252 .print_line = print_graph_function, 1453 .print_line = print_graph_function,
1253 .print_header = print_graph_headers, 1454 .print_header = print_graph_headers,
1254 .flags = &tracer_flags, 1455 .flags = &tracer_flags,
1456 .set_flag = func_graph_set_flag,
1255#ifdef CONFIG_FTRACE_SELFTEST 1457#ifdef CONFIG_FTRACE_SELFTEST
1256 .selftest = trace_selftest_startup_function_graph, 1458 .selftest = trace_selftest_startup_function_graph,
1257#endif 1459#endif
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6fd486e0cef4..5cf8c602b880 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence;
87 87
88#ifdef CONFIG_FUNCTION_TRACER 88#ifdef CONFIG_FUNCTION_TRACER
89/* 89/*
90 * irqsoff uses its own tracer function to keep the overhead down: 90 * Prologue for the preempt and irqs off function tracers.
91 *
92 * Returns 1 if it is OK to continue, and data->disabled is
93 * incremented.
94 * 0 if the trace is to be ignored, and data->disabled
95 * is kept the same.
96 *
97 * Note, this function is also used outside this ifdef but
98 * inside the #ifdef of the function graph tracer below.
99 * This is OK, since the function graph tracer is
100 * dependent on the function tracer.
91 */ 101 */
92static void 102static int func_prolog_dec(struct trace_array *tr,
93irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) 103 struct trace_array_cpu **data,
104 unsigned long *flags)
94{ 105{
95 struct trace_array *tr = irqsoff_trace;
96 struct trace_array_cpu *data;
97 unsigned long flags;
98 long disabled; 106 long disabled;
99 int cpu; 107 int cpu;
100 108
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
106 */ 114 */
107 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
108 if (likely(!per_cpu(tracing_cpu, cpu))) 116 if (likely(!per_cpu(tracing_cpu, cpu)))
109 return; 117 return 0;
110 118
111 local_save_flags(flags); 119 local_save_flags(*flags);
112 /* slight chance to get a false positive on tracing_cpu */ 120 /* slight chance to get a false positive on tracing_cpu */
113 if (!irqs_disabled_flags(flags)) 121 if (!irqs_disabled_flags(*flags))
114 return; 122 return 0;
115 123
116 data = tr->data[cpu]; 124 *data = tr->data[cpu];
117 disabled = atomic_inc_return(&data->disabled); 125 disabled = atomic_inc_return(&(*data)->disabled);
118 126
119 if (likely(disabled == 1)) 127 if (likely(disabled == 1))
120 trace_function(tr, ip, parent_ip, flags, preempt_count()); 128 return 1;
129
130 atomic_dec(&(*data)->disabled);
131
132 return 0;
133}
134
135/*
136 * irqsoff uses its own tracer function to keep the overhead down:
137 */
138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140{
141 struct trace_array *tr = irqsoff_trace;
142 struct trace_array_cpu *data;
143 unsigned long flags;
144
145 if (!func_prolog_dec(tr, &data, &flags))
146 return;
147
148 trace_function(tr, ip, parent_ip, flags, preempt_count());
121 149
122 atomic_dec(&data->disabled); 150 atomic_dec(&data->disabled);
123} 151}
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
155 struct trace_array *tr = irqsoff_trace; 183 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data; 184 struct trace_array_cpu *data;
157 unsigned long flags; 185 unsigned long flags;
158 long disabled;
159 int ret; 186 int ret;
160 int cpu;
161 int pc; 187 int pc;
162 188
163 cpu = raw_smp_processor_id(); 189 if (!func_prolog_dec(tr, &data, &flags))
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0; 190 return 0;
166 191
167 local_save_flags(flags); 192 pc = preempt_count();
168 /* slight chance to get a false positive on tracing_cpu */ 193 ret = __trace_graph_entry(tr, trace, flags, pc);
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled); 194 atomic_dec(&data->disabled);
195
182 return ret; 196 return ret;
183} 197}
184 198
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
187 struct trace_array *tr = irqsoff_trace; 201 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data; 202 struct trace_array_cpu *data;
189 unsigned long flags; 203 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc; 204 int pc;
193 205
194 cpu = raw_smp_processor_id(); 206 if (!func_prolog_dec(tr, &data, &flags))
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return; 207 return;
197 208
198 local_save_flags(flags); 209 pc = preempt_count();
199 /* slight chance to get a false positive on tracing_cpu */ 210 __trace_graph_return(tr, trace, flags, pc);
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled); 211 atomic_dec(&data->disabled);
212} 212}
213 213
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
229 229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /* 232 /*
240 * In graph mode call the graph tracer output function, 233 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler 234 * otherwise go with the TRACE_FN event handler
242 */ 235 */
243 if (is_graph()) 236 if (is_graph())
244 return print_graph_function_flags(iter, flags); 237 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
245 238
246 return TRACE_TYPE_UNHANDLED; 239 return TRACE_TYPE_UNHANDLED;
247} 240}
248 241
249static void irqsoff_print_header(struct seq_file *s) 242static void irqsoff_print_header(struct seq_file *s)
250{ 243{
251 if (is_graph()) { 244 if (is_graph())
252 struct trace_iterator *iter = s->private; 245 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
253 u32 flags = GRAPH_TRACER_FLAGS; 246 else
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s); 247 trace_default_header(s);
268} 248}
269 249
270static void 250static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr, 251__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip, 252 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc) 253 unsigned long flags, int pc)
294{ 254{
295 if (!is_graph()) 255 if (is_graph())
256 trace_graph_function(tr, ip, parent_ip, flags, pc);
257 else
296 trace_function(tr, ip, parent_ip, flags, pc); 258 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301} 259}
302 260
303#else 261#else
@@ -649,6 +607,7 @@ static struct tracer irqsoff_tracer __read_mostly =
649#endif 607#endif
650 .open = irqsoff_trace_open, 608 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close, 609 .close = irqsoff_trace_close,
610 .use_max_tr = 1,
652}; 611};
653# define register_irqsoff(trace) register_tracer(&trace) 612# define register_irqsoff(trace) register_tracer(&trace)
654#else 613#else
@@ -681,6 +640,7 @@ static struct tracer preemptoff_tracer __read_mostly =
681#endif 640#endif
682 .open = irqsoff_trace_open, 641 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close, 642 .close = irqsoff_trace_close,
643 .use_max_tr = 1,
684}; 644};
685# define register_preemptoff(trace) register_tracer(&trace) 645# define register_preemptoff(trace) register_tracer(&trace)
686#else 646#else
@@ -715,6 +675,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
715#endif 675#endif
716 .open = irqsoff_trace_open, 676 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close, 677 .close = irqsoff_trace_close,
678 .use_max_tr = 1,
718}; 679};
719 680
720# define register_preemptirqsoff(trace) register_tracer(&trace) 681# define register_preemptirqsoff(trace) register_tracer(&trace)
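
The trace_irqsoff.c hunks above pull the shared prologue of the function and graph callbacks into func_prolog_dec(), which either returns 1 with data->disabled held (the caller decrements it when done) or 0 without touching it. A hedged, userspace sketch of just that increment-or-back-off contract, leaving out the tracing_cpu and irqs-off checks:

#include <stdatomic.h>
#include <stdio.h>

struct cpu_data {
	atomic_int disabled;
};

static struct cpu_data d;	/* stands in for one CPU's trace data */

/*
 * Returns 1 with d.disabled incremented (caller must decrement later),
 * or 0 with the counter left untouched -- the same contract as
 * func_prolog_dec() above, minus the per-CPU and irqs-off checks.
 */
static int prolog_dec(struct cpu_data *data)
{
	int disabled = atomic_fetch_add(&data->disabled, 1) + 1;

	if (disabled == 1)
		return 1;		/* first entry: go ahead and trace */

	atomic_fetch_sub(&data->disabled, 1);
	return 0;			/* re-entered: undo and bail out */
}

static void tracer_callback(void)
{
	if (!prolog_dec(&d))
		return;

	puts("trace this call");

	atomic_fetch_sub(&d.disabled, 1);	/* matching decrement */
}

int main(void)
{
	tracer_callback();
	return 0;
}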
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..3c5c5dfea0b3
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,135 @@
1/*
2 * kdb helper for dumping the ftrace buffer
3 *
4 * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
5 *
6 * ftrace_dump_buf based on ftrace_dump:
7 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
8 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
9 *
10 */
11#include <linux/init.h>
12#include <linux/kgdb.h>
13#include <linux/kdb.h>
14#include <linux/ftrace.h>
15
16#include "trace.h"
17#include "trace_output.h"
18
19static void ftrace_dump_buf(int skip_lines, long cpu_file)
20{
21 /* use static because iter can be a bit big for the stack */
22 static struct trace_iterator iter;
23 unsigned int old_userobj;
24 int cnt = 0, cpu;
25
26 trace_init_global_iter(&iter);
27
28 for_each_tracing_cpu(cpu) {
29 atomic_inc(&iter.tr->data[cpu]->disabled);
30 }
31
32 old_userobj = trace_flags;
33
34 /* don't look at user memory in panic mode */
35 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
36
37 kdb_printf("Dumping ftrace buffer:\n");
38
39 /* reset all but tr, trace, and overruns */
40 memset(&iter.seq, 0,
41 sizeof(struct trace_iterator) -
42 offsetof(struct trace_iterator, seq));
43 iter.iter_flags |= TRACE_FILE_LAT_FMT;
44 iter.pos = -1;
45
46 if (cpu_file == TRACE_PIPE_ALL_CPU) {
47 for_each_tracing_cpu(cpu) {
48 iter.buffer_iter[cpu] =
49 ring_buffer_read_prepare(iter.tr->buffer, cpu);
50 ring_buffer_read_start(iter.buffer_iter[cpu]);
51 tracing_iter_reset(&iter, cpu);
52 }
53 } else {
54 iter.cpu_file = cpu_file;
55 iter.buffer_iter[cpu_file] =
56 ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file);
59 }
60 if (!trace_empty(&iter))
61 trace_find_next_entry_inc(&iter);
62 while (!trace_empty(&iter)) {
63 if (!cnt)
64 kdb_printf("---------------------------------\n");
65 cnt++;
66
67 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
68 print_trace_line(&iter);
69 if (!skip_lines)
70 trace_printk_seq(&iter.seq);
71 else
72 skip_lines--;
73 if (KDB_FLAG(CMD_INTERRUPT))
74 goto out;
75 }
76
77 if (!cnt)
78 kdb_printf(" (ftrace buffer empty)\n");
79 else
80 kdb_printf("---------------------------------\n");
81
82out:
83 trace_flags = old_userobj;
84
85 for_each_tracing_cpu(cpu) {
86 atomic_dec(&iter.tr->data[cpu]->disabled);
87 }
88
89 for_each_tracing_cpu(cpu)
90 if (iter.buffer_iter[cpu])
91 ring_buffer_read_finish(iter.buffer_iter[cpu]);
92}
93
94/*
95 * kdb_ftdump - Dump the ftrace log buffer
96 */
97static int kdb_ftdump(int argc, const char **argv)
98{
99 int skip_lines = 0;
100 long cpu_file;
101 char *cp;
102
103 if (argc > 2)
104 return KDB_ARGCOUNT;
105
106 if (argc) {
107 skip_lines = simple_strtol(argv[1], &cp, 0);
108 if (*cp)
109 skip_lines = 0;
110 }
111
112 if (argc == 2) {
113 cpu_file = simple_strtol(argv[2], &cp, 0);
114 if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
115 !cpu_online(cpu_file))
116 return KDB_BADINT;
117 } else {
118 cpu_file = TRACE_PIPE_ALL_CPU;
119 }
120
121 kdb_trap_printk++;
122 ftrace_dump_buf(skip_lines, cpu_file);
123 kdb_trap_printk--;
124
125 return 0;
126}
127
128static __init int kdb_ftrace_register(void)
129{
130 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
131 "Dump ftrace log", 0, KDB_REPEAT_NONE);
132 return 0;
133}
134
135late_initcall(kdb_ftrace_register);
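As a usage sketch for the command registered above (illustrative only, not part of the patch), the kdb prompt accepts up to two optional arguments matching the "[skip_#lines] [cpu]" usage string:

	ftdump          - dump the ftrace buffer for every tracing CPU
	ftdump 25       - same, but skip the first 25 lines of output
	ftdump 0 2      - dump only CPU 2 (the CPU must be online and < NR_CPUS)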
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f52b5f50299d..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,6 +30,7 @@
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h>
33#include <asm/bitsperlong.h> 34#include <asm/bitsperlong.h>
34 35
35#include "trace.h" 36#include "trace.h"
@@ -38,6 +39,7 @@
38#define MAX_TRACE_ARGS 128 39#define MAX_TRACE_ARGS 128
39#define MAX_ARGSTR_LEN 63 40#define MAX_ARGSTR_LEN 63
40#define MAX_EVENT_NAME_LEN 64 41#define MAX_EVENT_NAME_LEN 64
42#define MAX_STRING_SIZE PATH_MAX
41#define KPROBE_EVENT_SYSTEM "kprobes" 43#define KPROBE_EVENT_SYSTEM "kprobes"
42 44
43/* Reserved field names */ 45/* Reserved field names */
@@ -58,14 +60,16 @@ const char *reserved_field_names[] = {
58}; 60};
59 61
60/* Printing function type */ 62/* Printing function type */
61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); 63typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
64 void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type 65#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type 66#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64 67
65/* Printing in basic type function template */ 68/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ 69#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 70static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\ 71 const char *name, \
72 void *data, void *ent)\
69{ \ 73{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ 74 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \ 75} \
@@ -80,6 +84,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) 84DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) 85DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82 86
87/* data_rloc: data relative location, compatible with u32 */
88#define make_data_rloc(len, roffs) \
89 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
90#define get_rloc_len(dl) ((u32)(dl) >> 16)
91#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
92
93static inline void *get_rloc_data(u32 *dl)
94{
95 return (u8 *)dl + get_rloc_offs(*dl);
96}
97
98/* For data_loc conversion */
99static inline void *get_loc_data(u32 *dl, void *ent)
100{
101 return (u8 *)ent + get_rloc_offs(*dl);
102}
103
104/*
105 * Convert data_rloc to data_loc:
106 * data_rloc stores the offset from data_rloc itself, but data_loc
107 * stores the offset from event entry.
108 */
109#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
110
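For illustration only (a standalone fragment, not part of the patch, assuming the macros defined just above), the data_rloc packing works out as follows:

	u32 dl   = make_data_rloc(64, 12); /* (64 << 16) | 12              */
	int len  = get_rloc_len(dl);       /* 64: maximum/actual length    */
	int offs = get_rloc_offs(dl);      /* 12: offset from &dl itself   */

	/*
	 * convert_rloc_to_loc(dl, offs) later adds the entry-relative
	 * offset of the u32 slot, turning the rloc into the data_loc
	 * ("__data_loc") form that the ftrace event format expects.
	 */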
111/* For defining macros, define string/string_size types */
112typedef u32 string;
113typedef u32 string_size;
114
115/* Print type function for string type */
116static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
117 const char *name,
118 void *data, void *ent)
119{
120 int len = *(u32 *)data >> 16;
121
122 if (!len)
123 return trace_seq_printf(s, " %s=(fault)", name);
124 else
125 return trace_seq_printf(s, " %s=\"%s\"", name,
126 (const char *)get_loc_data(data, ent));
127}
128static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
129
83/* Data fetch function type */ 130/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); 131typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85 132
@@ -94,32 +141,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm,
94 return fprm->fn(regs, fprm->data, dest); 141 return fprm->fn(regs, fprm->data, dest);
95} 142}
96 143
97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type 144#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
98/* 145/*
99 * Define macro for basic types - we don't need to define s* types, because 146 * Define macro for basic types - we don't need to define s* types, because
100 * we have to care only about bitwidth at recording time. 147 * we have to care only about bitwidth at recording time.
101 */ 148 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \ 149#define DEFINE_BASIC_FETCH_FUNCS(method) \
103DEFINE_FETCH_##kind(u8) \ 150DEFINE_FETCH_##method(u8) \
104DEFINE_FETCH_##kind(u16) \ 151DEFINE_FETCH_##method(u16) \
105DEFINE_FETCH_##kind(u32) \ 152DEFINE_FETCH_##method(u32) \
106DEFINE_FETCH_##kind(u64) 153DEFINE_FETCH_##method(u64)
107 154
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ 155#define CHECK_FETCH_FUNCS(method, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \ 156 (((FETCH_FUNC_NAME(method, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \ 157 (FETCH_FUNC_NAME(method, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \ 158 (FETCH_FUNC_NAME(method, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn)) 159 (FETCH_FUNC_NAME(method, u64) == fn) || \
160 (FETCH_FUNC_NAME(method, string) == fn) || \
161 (FETCH_FUNC_NAME(method, string_size) == fn)) \
162 && (fn != NULL))
113 163
114/* Data fetch function templates */ 164/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \ 165#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 166static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \ 167 void *offset, void *dest) \
118{ \ 168{ \
119 *(type *)dest = (type)regs_get_register(regs, \ 169 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \ 170 (unsigned int)((unsigned long)offset)); \
121} 171}
122DEFINE_BASIC_FETCH_FUNCS(reg) 172DEFINE_BASIC_FETCH_FUNCS(reg)
173/* No string on the register */
174#define fetch_reg_string NULL
175#define fetch_reg_string_size NULL
123 176
124#define DEFINE_FETCH_stack(type) \ 177#define DEFINE_FETCH_stack(type) \
125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 178static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
@@ -129,6 +182,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
129 (unsigned int)((unsigned long)offset)); \ 182 (unsigned int)((unsigned long)offset)); \
130} 183}
131DEFINE_BASIC_FETCH_FUNCS(stack) 184DEFINE_BASIC_FETCH_FUNCS(stack)
185/* No string on the stack entry */
186#define fetch_stack_string NULL
187#define fetch_stack_string_size NULL
132 188
133#define DEFINE_FETCH_retval(type) \ 189#define DEFINE_FETCH_retval(type) \
134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ 190static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
@@ -137,6 +193,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
137 *(type *)dest = (type)regs_return_value(regs); \ 193 *(type *)dest = (type)regs_return_value(regs); \
138} 194}
139DEFINE_BASIC_FETCH_FUNCS(retval) 195DEFINE_BASIC_FETCH_FUNCS(retval)
196/* No string on the retval */
197#define fetch_retval_string NULL
198#define fetch_retval_string_size NULL
140 199
141#define DEFINE_FETCH_memory(type) \ 200#define DEFINE_FETCH_memory(type) \
142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 201static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
@@ -149,6 +208,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
149 *(type *)dest = retval; \ 208 *(type *)dest = retval; \
150} 209}
151DEFINE_BASIC_FETCH_FUNCS(memory) 210DEFINE_BASIC_FETCH_FUNCS(memory)
211/*
212 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
213 * length and relative data location.
214 */
215static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
216 void *addr, void *dest)
217{
218 long ret;
219 int maxlen = get_rloc_len(*(u32 *)dest);
220 u8 *dst = get_rloc_data(dest);
221 u8 *src = addr;
222 mm_segment_t old_fs = get_fs();
223 if (!maxlen)
224 return;
225 /*
226 * Try to get string again, since the string can be changed while
227 * probing.
228 */
229 set_fs(KERNEL_DS);
230 pagefault_disable();
231 do
232 ret = __copy_from_user_inatomic(dst++, src++, 1);
233 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
234 dst[-1] = '\0';
235 pagefault_enable();
236 set_fs(old_fs);
237
238 if (ret < 0) { /* Failed to fetch string */
239 ((u8 *)get_rloc_data(dest))[0] = '\0';
240 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
241 } else
242 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
243 get_rloc_offs(*(u32 *)dest));
244}
 245/* Return the length of the string, including the terminating null byte */

246static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
247 void *addr, void *dest)
248{
249 int ret, len = 0;
250 u8 c;
251 mm_segment_t old_fs = get_fs();
252
253 set_fs(KERNEL_DS);
254 pagefault_disable();
255 do {
256 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
257 len++;
258 } while (c && ret == 0 && len < MAX_STRING_SIZE);
259 pagefault_enable();
260 set_fs(old_fs);
261
262 if (ret < 0) /* Failed to check the length */
263 *(u32 *)dest = 0;
264 else
265 *(u32 *)dest = len;
266}
152 267
153/* Memory fetching by symbol */ 268/* Memory fetching by symbol */
154struct symbol_cache { 269struct symbol_cache {
@@ -203,6 +318,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
203 *(type *)dest = 0; \ 318 *(type *)dest = 0; \
204} 319}
205DEFINE_BASIC_FETCH_FUNCS(symbol) 320DEFINE_BASIC_FETCH_FUNCS(symbol)
321DEFINE_FETCH_symbol(string)
322DEFINE_FETCH_symbol(string_size)
206 323
207/* Dereference memory access function */ 324/* Dereference memory access function */
208struct deref_fetch_param { 325struct deref_fetch_param {
@@ -224,12 +341,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
224 *(type *)dest = 0; \ 341 *(type *)dest = 0; \
225} 342}
226DEFINE_BASIC_FETCH_FUNCS(deref) 343DEFINE_BASIC_FETCH_FUNCS(deref)
344DEFINE_FETCH_deref(string)
345DEFINE_FETCH_deref(string_size)
227 346
228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 347static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
229{ 348{
230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) 349 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
231 free_deref_fetch_param(data->orig.data); 350 free_deref_fetch_param(data->orig.data);
232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) 351 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
233 free_symbol_cache(data->orig.data); 352 free_symbol_cache(data->orig.data);
234 kfree(data); 353 kfree(data);
235} 354}
@@ -240,23 +359,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) 359#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) 360#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242 361
243#define ASSIGN_FETCH_FUNC(kind, type) \ 362/* Fetch types */
244 .kind = FETCH_FUNC_NAME(kind, type) 363enum {
245 364 FETCH_MTD_reg = 0,
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ 365 FETCH_MTD_stack,
247 {.name = #ptype, \ 366 FETCH_MTD_retval,
248 .size = sizeof(ftype), \ 367 FETCH_MTD_memory,
249 .is_signed = sign, \ 368 FETCH_MTD_symbol,
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \ 369 FETCH_MTD_deref,
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \ 370 FETCH_MTD_END,
252ASSIGN_FETCH_FUNC(reg, ftype), \ 371};
253ASSIGN_FETCH_FUNC(stack, ftype), \ 372
254ASSIGN_FETCH_FUNC(retval, ftype), \ 373#define ASSIGN_FETCH_FUNC(method, type) \
255ASSIGN_FETCH_FUNC(memory, ftype), \ 374 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
256ASSIGN_FETCH_FUNC(symbol, ftype), \ 375
257ASSIGN_FETCH_FUNC(deref, ftype), \ 376#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
377 {.name = _name, \
378 .size = _size, \
379 .is_signed = sign, \
380 .print = PRINT_TYPE_FUNC_NAME(ptype), \
381 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
382 .fmttype = _fmttype, \
383 .fetch = { \
384ASSIGN_FETCH_FUNC(reg, ftype), \
385ASSIGN_FETCH_FUNC(stack, ftype), \
386ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \
390 } \
258 } 391 }
259 392
393#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
394 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
395
396#define FETCH_TYPE_STRING 0
397#define FETCH_TYPE_STRSIZE 1
398
260/* Fetch type information table */ 399/* Fetch type information table */
261static const struct fetch_type { 400static const struct fetch_type {
262 const char *name; /* Name of type */ 401 const char *name; /* Name of type */
@@ -264,14 +403,16 @@ static const struct fetch_type {
264 int is_signed; /* Signed flag */ 403 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */ 404 print_type_func_t print; /* Print functions */
266 const char *fmt; /* Fromat string */ 405 const char *fmt; /* Fromat string */
406 const char *fmttype; /* Name in format file */
267 /* Fetch functions */ 407 /* Fetch functions */
268 fetch_func_t reg; 408 fetch_func_t fetch[FETCH_MTD_END];
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = { 409} fetch_type_table[] = {
410 /* Special types */
411 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
412 sizeof(u32), 1, "__data_loc char[]"),
413 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
414 string_size, sizeof(u32), 0, "u32"),
415 /* Basic types */
275 ASSIGN_FETCH_TYPE(u8, u8, 0), 416 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0), 417 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0), 418 ASSIGN_FETCH_TYPE(u32, u32, 0),
@@ -302,12 +443,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs,
302 *(unsigned long *)dest = kernel_stack_pointer(regs); 443 *(unsigned long *)dest = kernel_stack_pointer(regs);
303} 444}
304 445
446static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
447 fetch_func_t orig_fn)
448{
449 int i;
450
451 if (type != &fetch_type_table[FETCH_TYPE_STRING])
452 return NULL; /* Only string type needs size function */
453 for (i = 0; i < FETCH_MTD_END; i++)
454 if (type->fetch[i] == orig_fn)
455 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
456
457 WARN_ON(1); /* This should not happen */
458 return NULL;
459}
460
305/** 461/**
306 * Kprobe event core functions 462 * Kprobe event core functions
307 */ 463 */
308 464
309struct probe_arg { 465struct probe_arg {
310 struct fetch_param fetch; 466 struct fetch_param fetch;
467 struct fetch_param fetch_size;
311 unsigned int offset; /* Offset from argument entry */ 468 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */ 469 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */ 470 const char *comm; /* Command of this argument */
@@ -356,8 +513,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
356static int kretprobe_dispatcher(struct kretprobe_instance *ri, 513static int kretprobe_dispatcher(struct kretprobe_instance *ri,
357 struct pt_regs *regs); 514 struct pt_regs *regs);
358 515
359/* Check the name is good for event/group */ 516/* Check the name is good for event/group/fields */
360static int check_event_name(const char *name) 517static int is_good_name(const char *name)
361{ 518{
362 if (!isalpha(*name) && *name != '_') 519 if (!isalpha(*name) && *name != '_')
363 return 0; 520 return 0;
@@ -399,7 +556,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
399 else 556 else
400 tp->rp.kp.pre_handler = kprobe_dispatcher; 557 tp->rp.kp.pre_handler = kprobe_dispatcher;
401 558
402 if (!event || !check_event_name(event)) { 559 if (!event || !is_good_name(event)) {
403 ret = -EINVAL; 560 ret = -EINVAL;
404 goto error; 561 goto error;
405 } 562 }
@@ -409,7 +566,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
409 if (!tp->call.name) 566 if (!tp->call.name)
410 goto error; 567 goto error;
411 568
412 if (!group || !check_event_name(group)) { 569 if (!group || !is_good_name(group)) {
413 ret = -EINVAL; 570 ret = -EINVAL;
414 goto error; 571 goto error;
415 } 572 }
@@ -429,9 +586,9 @@ error:
429 586
430static void free_probe_arg(struct probe_arg *arg) 587static void free_probe_arg(struct probe_arg *arg)
431{ 588{
432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) 589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data); 590 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) 591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
435 free_symbol_cache(arg->fetch.data); 592 free_symbol_cache(arg->fetch.data);
436 kfree(arg->name); 593 kfree(arg->name);
437 kfree(arg->comm); 594 kfree(arg->comm);
@@ -490,7 +647,7 @@ static int register_trace_probe(struct trace_probe *tp)
490 } 647 }
491 ret = register_probe_event(tp); 648 ret = register_probe_event(tp);
492 if (ret) { 649 if (ret) {
493 pr_warning("Faild to register probe event(%d)\n", ret); 650 pr_warning("Failed to register probe event(%d)\n", ret);
494 goto end; 651 goto end;
495 } 652 }
496 653
@@ -548,7 +705,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
548 705
549 if (strcmp(arg, "retval") == 0) { 706 if (strcmp(arg, "retval") == 0) {
550 if (is_return) 707 if (is_return)
551 f->fn = t->retval; 708 f->fn = t->fetch[FETCH_MTD_retval];
552 else 709 else
553 ret = -EINVAL; 710 ret = -EINVAL;
554 } else if (strncmp(arg, "stack", 5) == 0) { 711 } else if (strncmp(arg, "stack", 5) == 0) {
@@ -562,7 +719,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
562 if (ret || param > PARAM_MAX_STACK) 719 if (ret || param > PARAM_MAX_STACK)
563 ret = -EINVAL; 720 ret = -EINVAL;
564 else { 721 else {
565 f->fn = t->stack; 722 f->fn = t->fetch[FETCH_MTD_stack];
566 f->data = (void *)param; 723 f->data = (void *)param;
567 } 724 }
568 } else 725 } else
@@ -588,7 +745,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
588 case '%': /* named register */ 745 case '%': /* named register */
589 ret = regs_query_register_offset(arg + 1); 746 ret = regs_query_register_offset(arg + 1);
590 if (ret >= 0) { 747 if (ret >= 0) {
591 f->fn = t->reg; 748 f->fn = t->fetch[FETCH_MTD_reg];
592 f->data = (void *)(unsigned long)ret; 749 f->data = (void *)(unsigned long)ret;
593 ret = 0; 750 ret = 0;
594 } 751 }
@@ -598,7 +755,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
598 ret = strict_strtoul(arg + 1, 0, &param); 755 ret = strict_strtoul(arg + 1, 0, &param);
599 if (ret) 756 if (ret)
600 break; 757 break;
601 f->fn = t->memory; 758 f->fn = t->fetch[FETCH_MTD_memory];
602 f->data = (void *)param; 759 f->data = (void *)param;
603 } else { 760 } else {
604 ret = split_symbol_offset(arg + 1, &offset); 761 ret = split_symbol_offset(arg + 1, &offset);
@@ -606,7 +763,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
606 break; 763 break;
607 f->data = alloc_symbol_cache(arg + 1, offset); 764 f->data = alloc_symbol_cache(arg + 1, offset);
608 if (f->data) 765 if (f->data)
609 f->fn = t->symbol; 766 f->fn = t->fetch[FETCH_MTD_symbol];
610 } 767 }
611 break; 768 break;
612 case '+': /* deref memory */ 769 case '+': /* deref memory */
@@ -636,14 +793,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
636 if (ret) 793 if (ret)
637 kfree(dprm); 794 kfree(dprm);
638 else { 795 else {
639 f->fn = t->deref; 796 f->fn = t->fetch[FETCH_MTD_deref];
640 f->data = (void *)dprm; 797 f->data = (void *)dprm;
641 } 798 }
642 } 799 }
643 break; 800 break;
644 } 801 }
 645 if (!ret && !f->fn) 802 if (!ret && !f->fn) { /* Parsed, but no fetch method found */
803 pr_info("%s type has no corresponding fetch method.\n",
804 t->name);
646 ret = -EINVAL; 805 ret = -EINVAL;
806 }
647 return ret; 807 return ret;
648} 808}
649 809
@@ -652,6 +812,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return) 812 struct probe_arg *parg, int is_return)
653{ 813{
654 const char *t; 814 const char *t;
815 int ret;
655 816
656 if (strlen(arg) > MAX_ARGSTR_LEN) { 817 if (strlen(arg) > MAX_ARGSTR_LEN) {
657 pr_info("Argument is too long.: %s\n", arg); 818 pr_info("Argument is too long.: %s\n", arg);
@@ -674,7 +835,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
674 } 835 }
675 parg->offset = tp->size; 836 parg->offset = tp->size;
676 tp->size += parg->type->size; 837 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
839 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn);
842 parg->fetch_size.data = parg->fetch.data;
843 }
844 return ret;
678} 845}
679 846
680/* Return 1 if name is reserved or already used by another argument */ 847/* Return 1 if name is reserved or already used by another argument */
@@ -715,7 +882,7 @@ static int create_trace_probe(int argc, char **argv)
715 int i, ret = 0; 882 int i, ret = 0;
716 int is_return = 0, is_delete = 0; 883 int is_return = 0, is_delete = 0;
717 char *symbol = NULL, *event = NULL, *group = NULL; 884 char *symbol = NULL, *event = NULL, *group = NULL;
718 char *arg, *tmp; 885 char *arg;
719 unsigned long offset = 0; 886 unsigned long offset = 0;
720 void *addr = NULL; 887 void *addr = NULL;
721 char buf[MAX_EVENT_NAME_LEN]; 888 char buf[MAX_EVENT_NAME_LEN];
@@ -757,14 +924,17 @@ static int create_trace_probe(int argc, char **argv)
757 pr_info("Delete command needs an event name.\n"); 924 pr_info("Delete command needs an event name.\n");
758 return -EINVAL; 925 return -EINVAL;
759 } 926 }
927 mutex_lock(&probe_lock);
760 tp = find_probe_event(event, group); 928 tp = find_probe_event(event, group);
761 if (!tp) { 929 if (!tp) {
930 mutex_unlock(&probe_lock);
762 pr_info("Event %s/%s doesn't exist.\n", group, event); 931 pr_info("Event %s/%s doesn't exist.\n", group, event);
763 return -ENOENT; 932 return -ENOENT;
764 } 933 }
765 /* delete an event */ 934 /* delete an event */
766 unregister_trace_probe(tp); 935 unregister_trace_probe(tp);
767 free_trace_probe(tp); 936 free_trace_probe(tp);
937 mutex_unlock(&probe_lock);
768 return 0; 938 return 0;
769 } 939 }
770 940
@@ -821,26 +991,36 @@ static int create_trace_probe(int argc, char **argv)
821 /* parse arguments */ 991 /* parse arguments */
822 ret = 0; 992 ret = 0;
823 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 993 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
994 /* Increment count for freeing args in error case */
995 tp->nr_args++;
996
824 /* Parse argument name */ 997 /* Parse argument name */
825 arg = strchr(argv[i], '='); 998 arg = strchr(argv[i], '=');
826 if (arg) 999 if (arg) {
827 *arg++ = '\0'; 1000 *arg++ = '\0';
828 else 1001 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
1002 } else {
829 arg = argv[i]; 1003 arg = argv[i];
1004 /* If argument name is omitted, set "argN" */
1005 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
1006 tp->args[i].name = kstrdup(buf, GFP_KERNEL);
1007 }
830 1008
831 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
832 if (!tp->args[i].name) { 1009 if (!tp->args[i].name) {
833 pr_info("Failed to allocate argument%d name '%s'.\n", 1010 pr_info("Failed to allocate argument[%d] name.\n", i);
834 i, argv[i]);
835 ret = -ENOMEM; 1011 ret = -ENOMEM;
836 goto error; 1012 goto error;
837 } 1013 }
838 tmp = strchr(tp->args[i].name, ':'); 1014
839 if (tmp) 1015 if (!is_good_name(tp->args[i].name)) {
840 *tmp = '_'; /* convert : to _ */ 1016 pr_info("Invalid argument[%d] name: %s\n",
1017 i, tp->args[i].name);
1018 ret = -EINVAL;
1019 goto error;
1020 }
841 1021
842 if (conflict_field_name(tp->args[i].name, tp->args, i)) { 1022 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
843 pr_info("Argument%d name '%s' conflicts with " 1023 pr_info("Argument[%d] name '%s' conflicts with "
844 "another field.\n", i, argv[i]); 1024 "another field.\n", i, argv[i]);
845 ret = -EINVAL; 1025 ret = -EINVAL;
846 goto error; 1026 goto error;
@@ -849,12 +1029,9 @@ static int create_trace_probe(int argc, char **argv)
849 /* Parse fetch argument */ 1029 /* Parse fetch argument */
850 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); 1030 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
851 if (ret) { 1031 if (ret) {
852 pr_info("Parse error at argument%d. (%d)\n", i, ret); 1032 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
853 kfree(tp->args[i].name);
854 goto error; 1033 goto error;
855 } 1034 }
856
857 tp->nr_args++;
858 } 1035 }
859 1036
860 ret = register_trace_probe(tp); 1037 ret = register_trace_probe(tp);
@@ -1043,6 +1220,54 @@ static const struct file_operations kprobe_profile_ops = {
1043 .release = seq_release, 1220 .release = seq_release,
1044}; 1221};
1045 1222
 1223/* Sum up total data length for dynamic arrays (strings) */
1224static __kprobes int __get_data_size(struct trace_probe *tp,
1225 struct pt_regs *regs)
1226{
1227 int i, ret = 0;
1228 u32 len;
1229
1230 for (i = 0; i < tp->nr_args; i++)
1231 if (unlikely(tp->args[i].fetch_size.fn)) {
1232 call_fetch(&tp->args[i].fetch_size, regs, &len);
1233 ret += len;
1234 }
1235
1236 return ret;
1237}
1238
1239/* Store the value of each argument */
1240static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
1241 struct pt_regs *regs,
1242 u8 *data, int maxlen)
1243{
1244 int i;
1245 u32 end = tp->size;
1246 u32 *dl; /* Data (relative) location */
1247
1248 for (i = 0; i < tp->nr_args; i++) {
1249 if (unlikely(tp->args[i].fetch_size.fn)) {
1250 /*
1251 * First, we set the relative location and
1252 * maximum data length to *dl
1253 */
1254 dl = (u32 *)(data + tp->args[i].offset);
1255 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
1256 /* Then try to fetch string or dynamic array data */
1257 call_fetch(&tp->args[i].fetch, regs, dl);
1258 /* Reduce maximum length */
1259 end += get_rloc_len(*dl);
1260 maxlen -= get_rloc_len(*dl);
1261 /* Trick here, convert data_rloc to data_loc */
1262 *dl = convert_rloc_to_loc(*dl,
1263 ent_size + tp->args[i].offset);
1264 } else
1265 /* Just fetching data normally */
1266 call_fetch(&tp->args[i].fetch, regs,
1267 data + tp->args[i].offset);
1268 }
1269}
1270
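To make the two-pass scheme above concrete, a rough sketch (not from the patch; names and sizes are illustrative) of a recorded entry for a probe with one fixed-size argument and one string argument:

	/*
	 *  +---------------------+---------------------+----------------+
	 *  | entry header        | fixed argument area | dynamic area   |
	 *  | (ip / func, ret_ip) | arg0: u32 value     | "some text\0"  |
	 *  |                     | arg1: u32 data_loc  |                |
	 *  +---------------------+---------------------+----------------+
	 *
	 * __get_data_size() sums the lengths reported by the string_size
	 * fetch functions, so the ring-buffer event is reserved with
	 * sizeof(*entry) + tp->size + dsize bytes.  store_trace_args()
	 * then fills the fixed slots and appends the string bytes,
	 * rewriting each data_rloc into a data_loc whose offset is
	 * measured from the start of the entry.
	 */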
1046/* Kprobe handler */ 1271/* Kprobe handler */
1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1272static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1048{ 1273{
@@ -1050,8 +1275,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1050 struct kprobe_trace_entry_head *entry; 1275 struct kprobe_trace_entry_head *entry;
1051 struct ring_buffer_event *event; 1276 struct ring_buffer_event *event;
1052 struct ring_buffer *buffer; 1277 struct ring_buffer *buffer;
1053 u8 *data; 1278 int size, dsize, pc;
1054 int size, i, pc;
1055 unsigned long irq_flags; 1279 unsigned long irq_flags;
1056 struct ftrace_event_call *call = &tp->call; 1280 struct ftrace_event_call *call = &tp->call;
1057 1281
@@ -1060,7 +1284,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1060 local_save_flags(irq_flags); 1284 local_save_flags(irq_flags);
1061 pc = preempt_count(); 1285 pc = preempt_count();
1062 1286
1063 size = sizeof(*entry) + tp->size; 1287 dsize = __get_data_size(tp, regs);
1288 size = sizeof(*entry) + tp->size + dsize;
1064 1289
1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1290 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1066 size, irq_flags, pc); 1291 size, irq_flags, pc);
@@ -1069,9 +1294,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1069 1294
1070 entry = ring_buffer_event_data(event); 1295 entry = ring_buffer_event_data(event);
1071 entry->ip = (unsigned long)kp->addr; 1296 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1]; 1297 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1073 for (i = 0; i < tp->nr_args; i++)
1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1075 1298
1076 if (!filter_current_check_discard(buffer, call, entry, event)) 1299 if (!filter_current_check_discard(buffer, call, entry, event))
1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1300 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1085,15 +1308,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1085 struct kretprobe_trace_entry_head *entry; 1308 struct kretprobe_trace_entry_head *entry;
1086 struct ring_buffer_event *event; 1309 struct ring_buffer_event *event;
1087 struct ring_buffer *buffer; 1310 struct ring_buffer *buffer;
1088 u8 *data; 1311 int size, pc, dsize;
1089 int size, i, pc;
1090 unsigned long irq_flags; 1312 unsigned long irq_flags;
1091 struct ftrace_event_call *call = &tp->call; 1313 struct ftrace_event_call *call = &tp->call;
1092 1314
1093 local_save_flags(irq_flags); 1315 local_save_flags(irq_flags);
1094 pc = preempt_count(); 1316 pc = preempt_count();
1095 1317
1096 size = sizeof(*entry) + tp->size; 1318 dsize = __get_data_size(tp, regs);
1319 size = sizeof(*entry) + tp->size + dsize;
1097 1320
1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1321 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1099 size, irq_flags, pc); 1322 size, irq_flags, pc);
@@ -1103,9 +1326,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1103 entry = ring_buffer_event_data(event); 1326 entry = ring_buffer_event_data(event);
1104 entry->func = (unsigned long)tp->rp.kp.addr; 1327 entry->func = (unsigned long)tp->rp.kp.addr;
1105 entry->ret_ip = (unsigned long)ri->ret_addr; 1328 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1]; 1329 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1107 for (i = 0; i < tp->nr_args; i++)
1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1109 1330
1110 if (!filter_current_check_discard(buffer, call, entry, event)) 1331 if (!filter_current_check_discard(buffer, call, entry, event))
1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1332 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1137,7 +1358,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1137 data = (u8 *)&field[1]; 1358 data = (u8 *)&field[1];
1138 for (i = 0; i < tp->nr_args; i++) 1359 for (i = 0; i < tp->nr_args; i++)
1139 if (!tp->args[i].type->print(s, tp->args[i].name, 1360 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset)) 1361 data + tp->args[i].offset, field))
1141 goto partial; 1362 goto partial;
1142 1363
1143 if (!trace_seq_puts(s, "\n")) 1364 if (!trace_seq_puts(s, "\n"))
@@ -1179,7 +1400,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1179 data = (u8 *)&field[1]; 1400 data = (u8 *)&field[1];
1180 for (i = 0; i < tp->nr_args; i++) 1401 for (i = 0; i < tp->nr_args; i++)
1181 if (!tp->args[i].type->print(s, tp->args[i].name, 1402 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset)) 1403 data + tp->args[i].offset, field))
1183 goto partial; 1404 goto partial;
1184 1405
1185 if (!trace_seq_puts(s, "\n")) 1406 if (!trace_seq_puts(s, "\n"))
@@ -1214,11 +1435,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1214 } 1435 }
1215} 1436}
1216 1437
1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1218{
1219 return 0;
1220}
1221
1222#undef DEFINE_FIELD 1438#undef DEFINE_FIELD
1223#define DEFINE_FIELD(type, item, name, is_signed) \ 1439#define DEFINE_FIELD(type, item, name, is_signed) \
1224 do { \ 1440 do { \
@@ -1239,7 +1455,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1455 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1240 /* Set argument names as fields */ 1456 /* Set argument names as fields */
1241 for (i = 0; i < tp->nr_args; i++) { 1457 for (i = 0; i < tp->nr_args; i++) {
1242 ret = trace_define_field(event_call, tp->args[i].type->name, 1458 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1243 tp->args[i].name, 1459 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset, 1460 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size, 1461 tp->args[i].type->size,
@@ -1261,7 +1477,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1477 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1262 /* Set argument names as fields */ 1478 /* Set argument names as fields */
1263 for (i = 0; i < tp->nr_args; i++) { 1479 for (i = 0; i < tp->nr_args; i++) {
1264 ret = trace_define_field(event_call, tp->args[i].type->name, 1480 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1265 tp->args[i].name, 1481 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset, 1482 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size, 1483 tp->args[i].type->size,
@@ -1301,8 +1517,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1517 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1302 1518
1303 for (i = 0; i < tp->nr_args; i++) { 1519 for (i = 0; i < tp->nr_args; i++) {
1304 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", 1520 if (strcmp(tp->args[i].type->name, "string") == 0)
1305 tp->args[i].name); 1521 pos += snprintf(buf + pos, LEN_OR_ZERO,
1522 ", __get_str(%s)",
1523 tp->args[i].name);
1524 else
1525 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1526 tp->args[i].name);
1306 } 1527 }
1307 1528
1308#undef LEN_OR_ZERO 1529#undef LEN_OR_ZERO
@@ -1339,11 +1560,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1339 struct ftrace_event_call *call = &tp->call; 1560 struct ftrace_event_call *call = &tp->call;
1340 struct kprobe_trace_entry_head *entry; 1561 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head; 1562 struct hlist_head *head;
1342 u8 *data; 1563 int size, __size, dsize;
1343 int size, __size, i;
1344 int rctx; 1564 int rctx;
1345 1565
1346 __size = sizeof(*entry) + tp->size; 1566 dsize = __get_data_size(tp, regs);
1567 __size = sizeof(*entry) + tp->size + dsize;
1347 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1568 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1348 size -= sizeof(u32); 1569 size -= sizeof(u32);
1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1570 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1355,9 +1576,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1355 return; 1576 return;
1356 1577
1357 entry->ip = (unsigned long)kp->addr; 1578 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1]; 1579 memset(&entry[1], 0, dsize);
1359 for (i = 0; i < tp->nr_args; i++) 1580 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1361 1581
1362 head = this_cpu_ptr(call->perf_events); 1582 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1583 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
@@ -1371,11 +1591,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1371 struct ftrace_event_call *call = &tp->call; 1591 struct ftrace_event_call *call = &tp->call;
1372 struct kretprobe_trace_entry_head *entry; 1592 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head; 1593 struct hlist_head *head;
1374 u8 *data; 1594 int size, __size, dsize;
1375 int size, __size, i;
1376 int rctx; 1595 int rctx;
1377 1596
1378 __size = sizeof(*entry) + tp->size; 1597 dsize = __get_data_size(tp, regs);
1598 __size = sizeof(*entry) + tp->size + dsize;
1379 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1599 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1380 size -= sizeof(u32); 1600 size -= sizeof(u32);
1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1601 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1388,9 +1608,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1388 1608
1389 entry->func = (unsigned long)tp->rp.kp.addr; 1609 entry->func = (unsigned long)tp->rp.kp.addr;
1390 entry->ret_ip = (unsigned long)ri->ret_addr; 1610 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1]; 1611 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1392 for (i = 0; i < tp->nr_args; i++)
1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1394 1612
1395 head = this_cpu_ptr(call->perf_events); 1613 head = this_cpu_ptr(call->perf_events);
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1614 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
@@ -1486,15 +1704,12 @@ static int register_probe_event(struct trace_probe *tp)
1486 int ret; 1704 int ret;
1487 1705
1488 /* Initialize ftrace_event_call */ 1706 /* Initialize ftrace_event_call */
1707 INIT_LIST_HEAD(&call->class->fields);
1489 if (probe_is_return(tp)) { 1708 if (probe_is_return(tp)) {
1490 INIT_LIST_HEAD(&call->class->fields);
1491 call->event.funcs = &kretprobe_funcs; 1709 call->event.funcs = &kretprobe_funcs;
1492 call->class->raw_init = probe_event_raw_init;
1493 call->class->define_fields = kretprobe_event_define_fields; 1710 call->class->define_fields = kretprobe_event_define_fields;
1494 } else { 1711 } else {
1495 INIT_LIST_HEAD(&call->class->fields);
1496 call->event.funcs = &kprobe_funcs; 1712 call->event.funcs = &kprobe_funcs;
1497 call->class->raw_init = probe_event_raw_init;
1498 call->class->define_fields = kprobe_event_define_fields; 1713 call->class->define_fields = kprobe_event_define_fields;
1499 } 1714 }
1500 if (set_print_fmt(tp) < 0) 1715 if (set_print_fmt(tp) < 0)
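A hedged usage sketch of the new string type (the probed symbol, register, and debugfs mount point below are illustrative assumptions, not taken from the patch):

	echo 'p:myprobe do_sys_open path=+0(%si):string' \
		>> /sys/kernel/debug/tracing/kprobe_events
	cat /sys/kernel/debug/tracing/trace

With a ':string' suffix the argument is fetched through the string fetch methods added above and printed as path="..." by PRINT_TYPE_FUNC_NAME(string), instead of being recorded as a raw integer.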
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
deleted file mode 100644
index 8eaf00749b65..000000000000
--- a/kernel/trace/trace_ksym.c
+++ /dev/null
@@ -1,508 +0,0 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/fs.h>
28
29#include "trace_output.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35#include <asm/atomic.h>
36
37#define KSYM_TRACER_OP_LEN 3 /* rw- */
38
39struct trace_ksym {
40 struct perf_event **ksym_hbp;
41 struct perf_event_attr attr;
42#ifdef CONFIG_PROFILE_KSYM_TRACER
43 atomic64_t counter;
44#endif
45 struct hlist_node ksym_hlist;
46};
47
48static struct trace_array *ksym_trace_array;
49
50static unsigned int ksym_tracing_enabled;
51
52static HLIST_HEAD(ksym_filter_head);
53
54static DEFINE_MUTEX(ksym_tracer_mutex);
55
56#ifdef CONFIG_PROFILE_KSYM_TRACER
57
58#define MAX_UL_INT 0xffffffff
59
60void ksym_collect_stats(unsigned long hbp_hit_addr)
61{
62 struct hlist_node *node;
63 struct trace_ksym *entry;
64
65 rcu_read_lock();
66 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
67 if (entry->attr.bp_addr == hbp_hit_addr) {
68 atomic64_inc(&entry->counter);
69 break;
70 }
71 }
72 rcu_read_unlock();
73}
74#endif /* CONFIG_PROFILE_KSYM_TRACER */
75
76void ksym_hbp_handler(struct perf_event *hbp, int nmi,
77 struct perf_sample_data *data,
78 struct pt_regs *regs)
79{
80 struct ring_buffer_event *event;
81 struct ksym_trace_entry *entry;
82 struct ring_buffer *buffer;
83 int pc;
84
85 if (!ksym_tracing_enabled)
86 return;
87
88 buffer = ksym_trace_array->buffer;
89
90 pc = preempt_count();
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
93 sizeof(*entry), 0, pc);
94 if (!event)
95 return;
96
97 entry = ring_buffer_event_data(event);
98 entry->ip = instruction_pointer(regs);
99 entry->type = hw_breakpoint_type(hbp);
100 entry->addr = hw_breakpoint_addr(hbp);
101 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
102
103#ifdef CONFIG_PROFILE_KSYM_TRACER
104 ksym_collect_stats(hw_breakpoint_addr(hbp));
105#endif /* CONFIG_PROFILE_KSYM_TRACER */
106
107 trace_buffer_unlock_commit(buffer, event, 0, pc);
108}
109
110/* Valid access types are represented as
111 *
112 * rw- : Set Read/Write Access Breakpoint
113 * -w- : Set Write Access Breakpoint
114 * --- : Clear Breakpoints
115 * --x : Set Execution Break points (Not available yet)
116 *
117 */
118static int ksym_trace_get_access_type(char *str)
119{
120 int access = 0;
121
122 if (str[0] == 'r')
123 access |= HW_BREAKPOINT_R;
124
125 if (str[1] == 'w')
126 access |= HW_BREAKPOINT_W;
127
128 if (str[2] == 'x')
129 access |= HW_BREAKPOINT_X;
130
131 switch (access) {
132 case HW_BREAKPOINT_R:
133 case HW_BREAKPOINT_W:
134 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
135 return access;
136 default:
137 return -EINVAL;
138 }
139}
140
141/*
142 * There can be several possible malformed requests and we attempt to capture
143 * all of them. We enumerate some of the rules
144 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
145 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
146 * <module>:<ksym_name>:<op>.
147 * 2. No delimiter symbol ':' in the input string
148 * 3. Spurious operator symbols or symbols not in their respective positions
149 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
150 * 5. Kernel symbol not a part of /proc/kallsyms
151 * 6. Duplicate requests
152 */
153static int parse_ksym_trace_str(char *input_string, char **ksymname,
154 unsigned long *addr)
155{
156 int ret;
157
158 *ksymname = strsep(&input_string, ":");
159 *addr = kallsyms_lookup_name(*ksymname);
160
161 /* Check for malformed request: (2), (1) and (5) */
162 if ((!input_string) ||
163 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
164 (*addr == 0))
165 return -EINVAL;;
166
167 ret = ksym_trace_get_access_type(input_string);
168
169 return ret;
170}
171
172int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
173{
174 struct trace_ksym *entry;
175 int ret = -ENOMEM;
176
177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
178 if (!entry)
179 return -ENOMEM;
180
181 hw_breakpoint_init(&entry->attr);
182
183 entry->attr.bp_type = op;
184 entry->attr.bp_addr = addr;
185 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
186
187 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
188 ksym_hbp_handler);
189
190 if (IS_ERR(entry->ksym_hbp)) {
191 ret = PTR_ERR(entry->ksym_hbp);
192 if (ret == -ENOSPC) {
193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
199 goto err;
200 }
201
202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
203
204 return 0;
205
206err:
207 kfree(entry);
208
209 return ret;
210}
211
212static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
213 size_t count, loff_t *ppos)
214{
215 struct trace_ksym *entry;
216 struct hlist_node *node;
217 struct trace_seq *s;
218 ssize_t cnt = 0;
219 int ret;
220
221 s = kmalloc(sizeof(*s), GFP_KERNEL);
222 if (!s)
223 return -ENOMEM;
224 trace_seq_init(s);
225
226 mutex_lock(&ksym_tracer_mutex);
227
228 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
229 ret = trace_seq_printf(s, "%pS:",
230 (void *)(unsigned long)entry->attr.bp_addr);
231 if (entry->attr.bp_type == HW_BREAKPOINT_R)
232 ret = trace_seq_puts(s, "r--\n");
233 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
234 ret = trace_seq_puts(s, "-w-\n");
235 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
236 ret = trace_seq_puts(s, "rw-\n");
237 WARN_ON_ONCE(!ret);
238 }
239
240 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
241
242 mutex_unlock(&ksym_tracer_mutex);
243
244 kfree(s);
245
246 return cnt;
247}
248
249static void __ksym_trace_reset(void)
250{
251 struct trace_ksym *entry;
252 struct hlist_node *node, *node1;
253
254 mutex_lock(&ksym_tracer_mutex);
255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
256 ksym_hlist) {
257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
258 hlist_del_rcu(&(entry->ksym_hlist));
259 synchronize_rcu();
260 kfree(entry);
261 }
262 mutex_unlock(&ksym_tracer_mutex);
263}
264
265static ssize_t ksym_trace_filter_write(struct file *file,
266 const char __user *buffer,
267 size_t count, loff_t *ppos)
268{
269 struct trace_ksym *entry;
270 struct hlist_node *node;
271 char *buf, *input_string, *ksymname = NULL;
272 unsigned long ksym_addr = 0;
273 int ret, op, changed = 0;
274
275 buf = kzalloc(count + 1, GFP_KERNEL);
276 if (!buf)
277 return -ENOMEM;
278
279 ret = -EFAULT;
280 if (copy_from_user(buf, buffer, count))
281 goto out;
282
283 buf[count] = '\0';
284 input_string = strstrip(buf);
285
286 /*
287 * Clear all breakpoints if:
288 * 1: echo > ksym_trace_filter
289 * 2: echo 0 > ksym_trace_filter
290 * 3: echo "*:---" > ksym_trace_filter
291 */
292 if (!input_string[0] || !strcmp(input_string, "0") ||
293 !strcmp(input_string, "*:---")) {
294 __ksym_trace_reset();
295 ret = 0;
296 goto out;
297 }
298
299 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
300 if (ret < 0)
301 goto out;
302
303 mutex_lock(&ksym_tracer_mutex);
304
305 ret = -EINVAL;
306 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
307 if (entry->attr.bp_addr == ksym_addr) {
308 /* Check for malformed request: (6) */
309 if (entry->attr.bp_type != op)
310 changed = 1;
311 else
312 goto out_unlock;
313 break;
314 }
315 }
316 if (changed) {
317 unregister_wide_hw_breakpoint(entry->ksym_hbp);
318 entry->attr.bp_type = op;
319 ret = 0;
320 if (op > 0) {
321 entry->ksym_hbp =
322 register_wide_hw_breakpoint(&entry->attr,
323 ksym_hbp_handler);
324 if (IS_ERR(entry->ksym_hbp))
325 ret = PTR_ERR(entry->ksym_hbp);
326 else
327 goto out_unlock;
328 }
329 /* Error or "symbol:---" case: drop it */
330 hlist_del_rcu(&(entry->ksym_hlist));
331 synchronize_rcu();
332 kfree(entry);
333 goto out_unlock;
334 } else {
335 /* Check for malformed request: (4) */
336 if (op)
337 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
338 }
339out_unlock:
340 mutex_unlock(&ksym_tracer_mutex);
341out:
342 kfree(buf);
343 return !ret ? count : ret;
344}
345
346static const struct file_operations ksym_tracing_fops = {
347 .open = tracing_open_generic,
348 .read = ksym_trace_filter_read,
349 .write = ksym_trace_filter_write,
350};
351
352static void ksym_trace_reset(struct trace_array *tr)
353{
354 ksym_tracing_enabled = 0;
355 __ksym_trace_reset();
356}
357
358static int ksym_trace_init(struct trace_array *tr)
359{
360 int cpu, ret = 0;
361
362 for_each_online_cpu(cpu)
363 tracing_reset(tr, cpu);
364 ksym_tracing_enabled = 1;
365 ksym_trace_array = tr;
366
367 return ret;
368}
369
370static void ksym_trace_print_header(struct seq_file *m)
371{
372 seq_puts(m,
373 "# TASK-PID CPU# Symbol "
374 "Type Function\n");
375 seq_puts(m,
376 "# | | | "
377 " | |\n");
378}
379
380static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
381{
382 struct trace_entry *entry = iter->ent;
383 struct trace_seq *s = &iter->seq;
384 struct ksym_trace_entry *field;
385 char str[KSYM_SYMBOL_LEN];
386 int ret;
387
388 if (entry->type != TRACE_KSYM)
389 return TRACE_TYPE_UNHANDLED;
390
391 trace_assign_type(field, entry);
392
393 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
394 entry->pid, iter->cpu, (char *)field->addr);
395 if (!ret)
396 return TRACE_TYPE_PARTIAL_LINE;
397
398 switch (field->type) {
399 case HW_BREAKPOINT_R:
400 ret = trace_seq_printf(s, " R ");
401 break;
402 case HW_BREAKPOINT_W:
403 ret = trace_seq_printf(s, " W ");
404 break;
405 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
406 ret = trace_seq_printf(s, " RW ");
407 break;
408 default:
409 return TRACE_TYPE_PARTIAL_LINE;
410 }
411
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 sprint_symbol(str, field->ip);
416 ret = trace_seq_printf(s, "%s\n", str);
417 if (!ret)
418 return TRACE_TYPE_PARTIAL_LINE;
419
420 return TRACE_TYPE_HANDLED;
421}
422
423struct tracer ksym_tracer __read_mostly =
424{
425 .name = "ksym_tracer",
426 .init = ksym_trace_init,
427 .reset = ksym_trace_reset,
428#ifdef CONFIG_FTRACE_SELFTEST
429 .selftest = trace_selftest_startup_ksym,
430#endif
431 .print_header = ksym_trace_print_header,
432 .print_line = ksym_trace_output
433};
434
435#ifdef CONFIG_PROFILE_KSYM_TRACER
436static int ksym_profile_show(struct seq_file *m, void *v)
437{
438 struct hlist_node *node;
439 struct trace_ksym *entry;
440 int access_type = 0;
441 char fn_name[KSYM_NAME_LEN];
442
443 seq_puts(m, " Access Type ");
444 seq_puts(m, " Symbol Counter\n");
445 seq_puts(m, " ----------- ");
446 seq_puts(m, " ------ -------\n");
447
448 rcu_read_lock();
449 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
450
451 access_type = entry->attr.bp_type;
452
453 switch (access_type) {
454 case HW_BREAKPOINT_R:
455 seq_puts(m, " R ");
456 break;
457 case HW_BREAKPOINT_W:
458 seq_puts(m, " W ");
459 break;
460 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
461 seq_puts(m, " RW ");
462 break;
463 default:
464 seq_puts(m, " NA ");
465 }
466
467 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
468 seq_printf(m, " %-36s", fn_name);
469 else
470 seq_printf(m, " %-36s", "<NA>");
471 seq_printf(m, " %15llu\n",
472 (unsigned long long)atomic64_read(&entry->counter));
473 }
474 rcu_read_unlock();
475
476 return 0;
477}
478
479static int ksym_profile_open(struct inode *node, struct file *file)
480{
481 return single_open(file, ksym_profile_show, NULL);
482}
483
484static const struct file_operations ksym_profile_fops = {
485 .open = ksym_profile_open,
486 .read = seq_read,
487 .llseek = seq_lseek,
488 .release = single_release,
489};
490#endif /* CONFIG_PROFILE_KSYM_TRACER */
491
492__init static int init_ksym_trace(void)
493{
494 struct dentry *d_tracer;
495
496 d_tracer = tracing_init_dentry();
497
498 trace_create_file("ksym_trace_filter", 0644, d_tracer,
499 NULL, &ksym_tracing_fops);
500
501#ifdef CONFIG_PROFILE_KSYM_TRACER
502 trace_create_file("ksym_profile", 0444, d_tracer,
503 NULL, &ksym_profile_fops);
504#endif
505
506 return register_tracer(&ksym_tracer);
507}
508device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..02272baa2206 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18 18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 20
24static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = {
1069 .funcs = &trace_wake_funcs, 1066 .funcs = &trace_wake_funcs,
1070}; 1067};
1071 1068
1072/* TRACE_SPECIAL */
1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1074 int flags, struct trace_event *event)
1075{
1076 struct special_entry *field;
1077
1078 trace_assign_type(field, iter->ent);
1079
1080 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
1081 field->arg1,
1082 field->arg2,
1083 field->arg3))
1084 return TRACE_TYPE_PARTIAL_LINE;
1085
1086 return TRACE_TYPE_HANDLED;
1087}
1088
1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1090 int flags, struct trace_event *event)
1091{
1092 struct special_entry *field;
1093 struct trace_seq *s = &iter->seq;
1094
1095 trace_assign_type(field, iter->ent);
1096
1097 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
1098 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
1099 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1100
1101 return TRACE_TYPE_HANDLED;
1102}
1103
1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1105 int flags, struct trace_event *event)
1106{
1107 struct special_entry *field;
1108 struct trace_seq *s = &iter->seq;
1109
1110 trace_assign_type(field, iter->ent);
1111
1112 SEQ_PUT_FIELD_RET(s, field->arg1);
1113 SEQ_PUT_FIELD_RET(s, field->arg2);
1114 SEQ_PUT_FIELD_RET(s, field->arg3);
1115
1116 return TRACE_TYPE_HANDLED;
1117}
1118
1119static struct trace_event_functions trace_special_funcs = {
1120 .trace = trace_special_print,
1121 .raw = trace_special_print,
1122 .hex = trace_special_hex,
1123 .binary = trace_special_bin,
1124};
1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1131/* TRACE_STACK */ 1069/* TRACE_STACK */
1132 1070
1133static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1071static enum print_line_t trace_stack_print(struct trace_iterator *iter,
@@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1161 1099
1162static struct trace_event_functions trace_stack_funcs = { 1100static struct trace_event_functions trace_stack_funcs = {
1163 .trace = trace_stack_print, 1101 .trace = trace_stack_print,
1164 .raw = trace_special_print,
1165 .hex = trace_special_hex,
1166 .binary = trace_special_bin,
1167}; 1102};
1168 1103
1169static struct trace_event trace_stack_event = { 1104static struct trace_event trace_stack_event = {
@@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1194 1129
1195static struct trace_event_functions trace_user_stack_funcs = { 1130static struct trace_event_functions trace_user_stack_funcs = {
1196 .trace = trace_user_stack_print, 1131 .trace = trace_user_stack_print,
1197 .raw = trace_special_print,
1198 .hex = trace_special_hex,
1199 .binary = trace_special_bin,
1200}; 1132};
1201 1133
1202static struct trace_event trace_user_stack_event = { 1134static struct trace_event trace_user_stack_event = {
@@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = {
1314 &trace_fn_event, 1246 &trace_fn_event,
1315 &trace_ctx_event, 1247 &trace_ctx_event,
1316 &trace_wake_event, 1248 &trace_wake_event,
1317 &trace_special_event,
1318 &trace_stack_event, 1249 &trace_stack_event,
1319 &trace_user_stack_event, 1250 &trace_user_stack_event,
1320 &trace_bprint_event, 1251 &trace_bprint_event,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0e73bc2ef8c5..7319559ed59f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,50 +31,99 @@ static int wakeup_rt;
31static arch_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void wakeup_reset(struct trace_array *tr);
34static void __wakeup_reset(struct trace_array *tr); 35static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
35 38
36static int save_lat_flag; 39static int save_lat_flag;
37 40
41#define TRACE_DISPLAY_GRAPH 1
42
43static struct tracer_opt trace_opts[] = {
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45 /* display latency trace as call graph */
46 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
47#endif
48 { } /* Empty entry */
49};
50
51static struct tracer_flags tracer_flags = {
52 .val = 0,
53 .opts = trace_opts,
54};
55
56#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
57
38#ifdef CONFIG_FUNCTION_TRACER 58#ifdef CONFIG_FUNCTION_TRACER
59
39/* 60/*
40 * irqsoff uses its own tracer function to keep the overhead down: 61 * Prologue for the wakeup function tracers.
62 *
 63 * Returns 1 if it is OK to continue; in that case preemption
 64 * has been disabled and data->disabled has been incremented.
 65 * Returns 0 if the trace is to be ignored; in that case
 66 * preemption is left enabled and data->disabled is
 67 * kept the same.
68 *
69 * Note, this function is also used outside this ifdef but
70 * inside the #ifdef of the function graph tracer below.
71 * This is OK, since the function graph tracer is
72 * dependent on the function tracer.
41 */ 73 */
42static void 74static int
43wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) 75func_prolog_preempt_disable(struct trace_array *tr,
76 struct trace_array_cpu **data,
77 int *pc)
44{ 78{
45 struct trace_array *tr = wakeup_trace;
46 struct trace_array_cpu *data;
47 unsigned long flags;
48 long disabled; 79 long disabled;
49 int resched;
50 int cpu; 80 int cpu;
51 int pc;
52 81
53 if (likely(!wakeup_task)) 82 if (likely(!wakeup_task))
54 return; 83 return 0;
55 84
56 pc = preempt_count(); 85 *pc = preempt_count();
57 resched = ftrace_preempt_disable(); 86 preempt_disable_notrace();
58 87
59 cpu = raw_smp_processor_id(); 88 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu) 89 if (cpu != wakeup_current_cpu)
61 goto out_enable; 90 goto out_enable;
62 91
63 data = tr->data[cpu]; 92 *data = tr->data[cpu];
64 disabled = atomic_inc_return(&data->disabled); 93 disabled = atomic_inc_return(&(*data)->disabled);
65 if (unlikely(disabled != 1)) 94 if (unlikely(disabled != 1))
66 goto out; 95 goto out;
67 96
68 local_irq_save(flags); 97 return 1;
69 98
70 trace_function(tr, ip, parent_ip, flags, pc); 99out:
100 atomic_dec(&(*data)->disabled);
101
102out_enable:
103 preempt_enable_notrace();
104 return 0;
105}
71 106
107/*
108 * wakeup uses its own tracer function to keep the overhead down:
109 */
110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112{
113 struct trace_array *tr = wakeup_trace;
114 struct trace_array_cpu *data;
115 unsigned long flags;
116 int pc;
117
118 if (!func_prolog_preempt_disable(tr, &data, &pc))
119 return;
120
121 local_irq_save(flags);
122 trace_function(tr, ip, parent_ip, flags, pc);
72 local_irq_restore(flags); 123 local_irq_restore(flags);
73 124
74 out:
75 atomic_dec(&data->disabled); 125 atomic_dec(&data->disabled);
76 out_enable: 126 preempt_enable_notrace();
77 ftrace_preempt_enable(resched);
78} 127}
79 128
80static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
@@ -83,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
83}; 132};
84#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
85 134
135static int start_func_tracer(int graph)
136{
137 int ret;
138
139 if (!graph)
140 ret = register_ftrace_function(&trace_ops);
141 else
142 ret = register_ftrace_graph(&wakeup_graph_return,
143 &wakeup_graph_entry);
144
145 if (!ret && tracing_is_enabled())
146 tracer_enabled = 1;
147 else
148 tracer_enabled = 0;
149
150 return ret;
151}
152
153static void stop_func_tracer(int graph)
154{
155 tracer_enabled = 0;
156
157 if (!graph)
158 unregister_ftrace_function(&trace_ops);
159 else
160 unregister_ftrace_graph();
161}
162
163#ifdef CONFIG_FUNCTION_GRAPH_TRACER
164static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
165{
166
167 if (!(bit & TRACE_DISPLAY_GRAPH))
168 return -EINVAL;
169
170 if (!(is_graph() ^ set))
171 return 0;
172
173 stop_func_tracer(!set);
174
175 wakeup_reset(wakeup_trace);
176 tracing_max_latency = 0;
177
178 return start_func_tracer(set);
179}
180
181static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
182{
183 struct trace_array *tr = wakeup_trace;
184 struct trace_array_cpu *data;
185 unsigned long flags;
186 int pc, ret = 0;
187
188 if (!func_prolog_preempt_disable(tr, &data, &pc))
189 return 0;
190
191 local_save_flags(flags);
192 ret = __trace_graph_entry(tr, trace, flags, pc);
193 atomic_dec(&data->disabled);
194 preempt_enable_notrace();
195
196 return ret;
197}
198
199static void wakeup_graph_return(struct ftrace_graph_ret *trace)
200{
201 struct trace_array *tr = wakeup_trace;
202 struct trace_array_cpu *data;
203 unsigned long flags;
204 int pc;
205
206 if (!func_prolog_preempt_disable(tr, &data, &pc))
207 return;
208
209 local_save_flags(flags);
210 __trace_graph_return(tr, trace, flags, pc);
211 atomic_dec(&data->disabled);
212
213 preempt_enable_notrace();
214 return;
215}
216
217static void wakeup_trace_open(struct trace_iterator *iter)
218{
219 if (is_graph())
220 graph_trace_open(iter);
221}
222
223static void wakeup_trace_close(struct trace_iterator *iter)
224{
225 if (iter->private)
226 graph_trace_close(iter);
227}
228
229#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
230
231static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
232{
233 /*
234 * In graph mode call the graph tracer output function,
235 * otherwise go with the TRACE_FN event handler
236 */
237 if (is_graph())
238 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
239
240 return TRACE_TYPE_UNHANDLED;
241}
242
243static void wakeup_print_header(struct seq_file *s)
244{
245 if (is_graph())
246 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
247 else
248 trace_default_header(s);
249}
250
251static void
252__trace_function(struct trace_array *tr,
253 unsigned long ip, unsigned long parent_ip,
254 unsigned long flags, int pc)
255{
256 if (is_graph())
257 trace_graph_function(tr, ip, parent_ip, flags, pc);
258 else
259 trace_function(tr, ip, parent_ip, flags, pc);
260}
261#else
262#define __trace_function trace_function
263
264static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
265{
266 return -EINVAL;
267}
268
269static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
270{
271 return -1;
272}
273
274static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
275{
276 return TRACE_TYPE_UNHANDLED;
277}
278
279static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
280static void wakeup_print_header(struct seq_file *s) { }
281static void wakeup_trace_open(struct trace_iterator *iter) { }
282static void wakeup_trace_close(struct trace_iterator *iter) { }
283#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
284
86/* 285/*
87 * Should this new latency be reported/recorded? 286 * Should this new latency be reported/recorded?
88 */ 287 */
@@ -153,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
153 /* The task we are waiting for is waking up */ 352 /* The task we are waiting for is waking up */
154 data = wakeup_trace->data[wakeup_cpu]; 353 data = wakeup_trace->data[wakeup_cpu];
155 354
156 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 355 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
157 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 356 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
158 357
159 T0 = data->preempt_timestamp; 358 T0 = data->preempt_timestamp;
@@ -253,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 253 * is not called by an assembly function (whereas schedule is) 452
254 * it should be safe to use it here. 453 * it should be safe to use it here.
255 */ 454 */
256 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 455 __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
257 456
258out_locked: 457out_locked:
259 arch_spin_unlock(&wakeup_lock); 458 arch_spin_unlock(&wakeup_lock);
@@ -304,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
304 */ 503 */
305 smp_wmb(); 504 smp_wmb();
306 505
307 register_ftrace_function(&trace_ops); 506 if (start_func_tracer(is_graph()))
308 507 printk(KERN_ERR "failed to start wakeup tracer\n");
309 if (tracing_is_enabled())
310 tracer_enabled = 1;
311 else
312 tracer_enabled = 0;
313 508
314 return; 509 return;
315fail_deprobe_wake_new: 510fail_deprobe_wake_new:
@@ -321,7 +516,7 @@ fail_deprobe:
321static void stop_wakeup_tracer(struct trace_array *tr) 516static void stop_wakeup_tracer(struct trace_array *tr)
322{ 517{
323 tracer_enabled = 0; 518 tracer_enabled = 0;
324 unregister_ftrace_function(&trace_ops); 519 stop_func_tracer(is_graph());
325 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 520 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
326 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 521 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
327 unregister_trace_sched_wakeup(probe_wakeup, NULL); 522 unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -380,9 +575,16 @@ static struct tracer wakeup_tracer __read_mostly =
380 .start = wakeup_tracer_start, 575 .start = wakeup_tracer_start,
381 .stop = wakeup_tracer_stop, 576 .stop = wakeup_tracer_stop,
382 .print_max = 1, 577 .print_max = 1,
578 .print_header = wakeup_print_header,
579 .print_line = wakeup_print_line,
580 .flags = &tracer_flags,
581 .set_flag = wakeup_set_flag,
383#ifdef CONFIG_FTRACE_SELFTEST 582#ifdef CONFIG_FTRACE_SELFTEST
384 .selftest = trace_selftest_startup_wakeup, 583 .selftest = trace_selftest_startup_wakeup,
385#endif 584#endif
585 .open = wakeup_trace_open,
586 .close = wakeup_trace_close,
587 .use_max_tr = 1,
386}; 588};
387 589
388static struct tracer wakeup_rt_tracer __read_mostly = 590static struct tracer wakeup_rt_tracer __read_mostly =
@@ -394,9 +596,16 @@ static struct tracer wakeup_rt_tracer __read_mostly =
394 .stop = wakeup_tracer_stop, 596 .stop = wakeup_tracer_stop,
395 .wait_pipe = poll_wait_pipe, 597 .wait_pipe = poll_wait_pipe,
396 .print_max = 1, 598 .print_max = 1,
599 .print_header = wakeup_print_header,
600 .print_line = wakeup_print_line,
601 .flags = &tracer_flags,
602 .set_flag = wakeup_set_flag,
397#ifdef CONFIG_FTRACE_SELFTEST 603#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 604 .selftest = trace_selftest_startup_wakeup,
399#endif 605#endif
606 .open = wakeup_trace_open,
607 .close = wakeup_trace_close,
608 .use_max_tr = 1,
400}; 609};
401 610
402__init static int init_wakeup_tracer(void) 611__init static int init_wakeup_tracer(void)
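The trace_sched_wakeup.c hunk above folds the duplicated "may I trace right now?" checks of the wakeup function tracer and the two graph callbacks into one prologue, func_prolog_preempt_disable(). Below is a minimal user-space sketch of that shape only; the struct, the prolog()/tracer_call()/graph_entry() names and the plain int standing in for the atomic disabled counter are illustrative, not kernel APIs.

/*
 * Minimal user-space sketch (toy types, not kernel code): a shared
 * prologue decides whether tracing may proceed; each callback bails
 * out early on failure and drops the "disabled" count when done.
 */
#include <stdio.h>

struct fake_cpu_data {
	int disabled;			/* models trace_array_cpu->disabled */
};

static struct fake_cpu_data cpu_data;
static int wakeup_task_set = 1;		/* models "wakeup_task != NULL" */

/* Returns 1 if the caller may trace; 0 means skip and touch nothing. */
static int prolog(struct fake_cpu_data **data)
{
	if (!wakeup_task_set)
		return 0;

	*data = &cpu_data;
	if (++(*data)->disabled != 1) {	/* already inside a trace callback */
		(*data)->disabled--;
		return 0;
	}
	return 1;
}

static void tracer_call(void)		/* models wakeup_tracer_call() */
{
	struct fake_cpu_data *data;

	if (!prolog(&data))
		return;
	printf("trace_function()\n");	/* the real tracing work */
	data->disabled--;		/* mirrors atomic_dec() in the hunk */
}

static int graph_entry(void)		/* models wakeup_graph_entry() */
{
	struct fake_cpu_data *data;

	if (!prolog(&data))
		return 0;
	printf("__trace_graph_entry()\n");
	data->disabled--;
	return 1;
}

int main(void)
{
	tracer_call();
	return graph_entry() ? 0 : 1;
}

The point of the refactor is that every caller takes the same early-exit path, and the bookkeeping (preemption off, disabled count held) lives in exactly one place.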
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 250e7f9bd2f0..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_WAKE: 13 case TRACE_WAKE:
14 case TRACE_STACK: 14 case TRACE_STACK:
15 case TRACE_PRINT: 15 case TRACE_PRINT:
16 case TRACE_SPECIAL:
17 case TRACE_BRANCH: 16 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
20 case TRACE_KSYM:
21 return 1; 19 return 1;
22 } 20 }
23 return 0; 21 return 0;
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
691} 689}
692#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 690#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
693 691
694#ifdef CONFIG_SYSPROF_TRACER
695int
696trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
697{
698 unsigned long count;
699 int ret;
700
701 /* start the tracing */
702 ret = tracer_init(trace, tr);
703 if (ret) {
704 warn_failed_init_tracer(trace, ret);
705 return ret;
706 }
707
708 /* Sleep for a 1/10 of a second */
709 msleep(100);
710 /* stop the tracing. */
711 tracing_stop();
712 /* check the trace buffer */
713 ret = trace_test_buffer(tr, &count);
714 trace->reset(tr);
715 tracing_start();
716
717 if (!ret && !count) {
718 printk(KERN_CONT ".. no entries found ..");
719 ret = -1;
720 }
721
722 return ret;
723}
724#endif /* CONFIG_SYSPROF_TRACER */
725
726#ifdef CONFIG_BRANCH_TRACER 692#ifdef CONFIG_BRANCH_TRACER
727int 693int
728trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) 694trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
@@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 721}
756#endif /* CONFIG_BRANCH_TRACER */ 722#endif /* CONFIG_BRANCH_TRACER */
757 723
758#ifdef CONFIG_KSYM_TRACER
759static int ksym_selftest_dummy;
760
761int
762trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
763{
764 unsigned long count;
765 int ret;
766
767 /* start the tracing */
768 ret = tracer_init(trace, tr);
769 if (ret) {
770 warn_failed_init_tracer(trace, ret);
771 return ret;
772 }
773
774 ksym_selftest_dummy = 0;
775 /* Register the read-write tracing request */
776
777 ret = process_new_ksym_entry("ksym_selftest_dummy",
778 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
779 (unsigned long)(&ksym_selftest_dummy));
780
781 if (ret < 0) {
782 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
783 goto ret_path;
784 }
785 /* Perform a read and a write operation over the dummy variable to
786 * trigger the tracer
787 */
788 if (ksym_selftest_dummy == 0)
789 ksym_selftest_dummy++;
790
791 /* stop the tracing. */
792 tracing_stop();
793 /* check the trace buffer */
794 ret = trace_test_buffer(tr, &count);
795 trace->reset(tr);
796 tracing_start();
797
798 /* read & write operations - one each is performed on the dummy variable
799 * triggering two entries in the trace buffer
800 */
801 if (!ret && count != 2) {
802 printk(KERN_CONT "Ksym tracer startup test failed");
803 ret = -1;
804 }
805
806ret_path:
807 return ret;
808}
809#endif /* CONFIG_KSYM_TRACER */
810
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..4c5dead0c239 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
110static void 110static void
111stack_trace_call(unsigned long ip, unsigned long parent_ip) 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
112{ 112{
113 int cpu, resched; 113 int cpu;
114 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return; 116 return;
117 117
118 resched = ftrace_preempt_disable(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
121 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
127 out: 127 out:
128 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
129 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
130 ftrace_preempt_enable(resched); 130 preempt_enable_notrace();
131} 131}
132 132
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = {
195 .open = tracing_open_generic, 195 .open = tracing_open_generic,
196 .read = stack_max_size_read, 196 .read = stack_max_size_read,
197 .write = stack_max_size_write, 197 .write = stack_max_size_write,
198 .llseek = default_llseek,
198}; 199};
199 200
200static void * 201static void *
@@ -249,7 +250,7 @@ static int trace_lookup_stack(struct seq_file *m, long i)
249{ 250{
250 unsigned long addr = stack_dump_trace[i]; 251 unsigned long addr = stack_dump_trace[i];
251 252
252 return seq_printf(m, "%pF\n", (void *)addr); 253 return seq_printf(m, "%pS\n", (void *)addr);
253} 254}
254 255
255static void print_disabled(struct seq_file *m) 256static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 34e35804304b..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
26static struct list_head * 29static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call) 30syscall_get_enter_fields(struct ftrace_event_call *call)
28{ 31{
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34static struct list_head * 37static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call) 38syscall_get_exit_fields(struct ftrace_event_call *call)
36{ 39{
37 struct syscall_metadata *entry = call->data; 40 return &syscall_exit_fields;
38
39 return &entry->exit_fields;
40} 41}
41 42
42struct trace_event_functions enter_syscall_print_funcs = { 43struct trace_event_functions enter_syscall_print_funcs = {
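The trace_syscalls.c hunk replaces the per-metadata exit_fields list with one shared static list, on the grounds (per the added comment) that every syscall exit event has the same fields. A rough user-space sketch of that "one shared descriptor" shape follows; the types are toys, and "nr, ret" is an assumed field set used only for illustration.

/*
 * Toy model of "one shared field list for every syscall exit event".
 * Only the shape (every event resolves to the same static descriptor)
 * mirrors the hunk.
 */
#include <stdio.h>

struct field_list {
	const char *fields;
};

/* Shared by all exit events, as the hunk's static LIST_HEAD is. */
static struct field_list syscall_exit_fields = { "nr, ret" };

struct event {
	const char *name;
};

static struct field_list *get_exit_fields(struct event *ev)
{
	(void)ev;			/* every exit event maps here */
	return &syscall_exit_fields;
}

int main(void)
{
	struct event open_exit = { "sys_exit_open" };
	struct event read_exit = { "sys_exit_read" };

	printf("%s -> %s\n", open_exit.name,
	       get_exit_fields(&open_exit)->fields);
	printf("shared descriptor: %s\n",
	       get_exit_fields(&open_exit) == get_exit_fields(&read_exit) ?
	       "yes" : "no");
	return 0;
}

Returning a pointer to the single shared list lets the field description be shared rather than duplicated per syscall.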
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index a7974a552ca9..000000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,329 +0,0 @@
1/*
2 * trace stack traces
3 *
4 * Copyright (C) 2004-2008, Soeren Sandmann
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/kallsyms.h>
9#include <linux/debugfs.h>
10#include <linux/hrtimer.h>
11#include <linux/uaccess.h>
12#include <linux/ftrace.h>
13#include <linux/module.h>
14#include <linux/irq.h>
15#include <linux/fs.h>
16
17#include <asm/stacktrace.h>
18
19#include "trace.h"
20
21static struct trace_array *sysprof_trace;
22static int __read_mostly tracer_enabled;
23
24/*
25 * 1 msec sample interval by default:
26 */
27static unsigned long sample_period = 1000000;
28static const unsigned int sample_max_depth = 512;
29
30static DEFINE_MUTEX(sample_timer_lock);
31/*
32 * Per CPU hrtimers that do the profiling:
33 */
34static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
35
36struct stack_frame {
37 const void __user *next_fp;
38 unsigned long return_address;
39};
40
41static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
42{
43 int ret;
44
45 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
46 return 0;
47
48 ret = 1;
49 pagefault_disable();
50 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
51 ret = 0;
52 pagefault_enable();
53
54 return ret;
55}
56
57struct backtrace_info {
58 struct trace_array_cpu *data;
59 struct trace_array *tr;
60 int pos;
61};
62
63static void
64backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
65{
66 /* Ignore warnings */
67}
68
69static void backtrace_warning(void *data, char *msg)
70{
71 /* Ignore warnings */
72}
73
74static int backtrace_stack(void *data, char *name)
75{
76 /* Don't bother with IRQ stacks for now */
77 return -1;
78}
79
80static void backtrace_address(void *data, unsigned long addr, int reliable)
81{
82 struct backtrace_info *info = data;
83
84 if (info->pos < sample_max_depth && reliable) {
85 __trace_special(info->tr, info->data, 1, addr, 0);
86
87 info->pos++;
88 }
89}
90
91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack,
95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
97};
98
99static int
100trace_kernel(struct pt_regs *regs, struct trace_array *tr,
101 struct trace_array_cpu *data)
102{
103 struct backtrace_info info;
104 unsigned long bp;
105 char *stack;
106
107 info.tr = tr;
108 info.data = data;
109 info.pos = 1;
110
111 __trace_special(info.tr, info.data, 1, regs->ip, 0);
112
113 stack = ((char *)regs + sizeof(struct pt_regs));
114#ifdef CONFIG_FRAME_POINTER
115 bp = regs->bp;
116#else
117 bp = 0;
118#endif
119
120 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
121
122 return info.pos;
123}
124
125static void timer_notify(struct pt_regs *regs, int cpu)
126{
127 struct trace_array_cpu *data;
128 struct stack_frame frame;
129 struct trace_array *tr;
130 const void __user *fp;
131 int is_user;
132 int i;
133
134 if (!regs)
135 return;
136
137 tr = sysprof_trace;
138 data = tr->data[cpu];
139 is_user = user_mode(regs);
140
141 if (!current || current->pid == 0)
142 return;
143
144 if (is_user && current->state != TASK_RUNNING)
145 return;
146
147 __trace_special(tr, data, 0, 0, current->pid);
148
149 if (!is_user)
150 i = trace_kernel(regs, tr, data);
151 else
152 i = 0;
153
154 /*
155 * Trace user stack if we are not a kernel thread
156 */
157 if (current->mm && i < sample_max_depth) {
158 regs = (struct pt_regs *)current->thread.sp0 - 1;
159
160 fp = (void __user *)regs->bp;
161
162 __trace_special(tr, data, 2, regs->ip, 0);
163
164 while (i < sample_max_depth) {
165 frame.next_fp = NULL;
166 frame.return_address = 0;
167 if (!copy_stack_frame(fp, &frame))
168 break;
169 if ((unsigned long)fp < regs->sp)
170 break;
171
172 __trace_special(tr, data, 2, frame.return_address,
173 (unsigned long)fp);
174 fp = frame.next_fp;
175
176 i++;
177 }
178
179 }
180
181 /*
182 * Special trace entry if we overflow the max depth:
183 */
184 if (i == sample_max_depth)
185 __trace_special(tr, data, -1, -1, -1);
186
187 __trace_special(tr, data, 3, current->pid, i);
188}
189
190static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
191{
192 /* trace here */
193 timer_notify(get_irq_regs(), smp_processor_id());
194
195 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
196
197 return HRTIMER_RESTART;
198}
199
200static void start_stack_timer(void *unused)
201{
202 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
203
204 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 hrtimer->function = stack_trace_timer_fn;
206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
208 HRTIMER_MODE_REL_PINNED);
209}
210
211static void start_stack_timers(void)
212{
213 on_each_cpu(start_stack_timer, NULL, 1);
214}
215
216static void stop_stack_timer(int cpu)
217{
218 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
219
220 hrtimer_cancel(hrtimer);
221}
222
223static void stop_stack_timers(void)
224{
225 int cpu;
226
227 for_each_online_cpu(cpu)
228 stop_stack_timer(cpu);
229}
230
231static void stop_stack_trace(struct trace_array *tr)
232{
233 mutex_lock(&sample_timer_lock);
234 stop_stack_timers();
235 tracer_enabled = 0;
236 mutex_unlock(&sample_timer_lock);
237}
238
239static int stack_trace_init(struct trace_array *tr)
240{
241 sysprof_trace = tr;
242
243 tracing_start_cmdline_record();
244
245 mutex_lock(&sample_timer_lock);
246 start_stack_timers();
247 tracer_enabled = 1;
248 mutex_unlock(&sample_timer_lock);
249 return 0;
250}
251
252static void stack_trace_reset(struct trace_array *tr)
253{
254 tracing_stop_cmdline_record();
255 stop_stack_trace(tr);
256}
257
258static struct tracer stack_trace __read_mostly =
259{
260 .name = "sysprof",
261 .init = stack_trace_init,
262 .reset = stack_trace_reset,
263#ifdef CONFIG_FTRACE_SELFTEST
264 .selftest = trace_selftest_startup_sysprof,
265#endif
266};
267
268__init static int init_stack_trace(void)
269{
270 return register_tracer(&stack_trace);
271}
272device_initcall(init_stack_trace);
273
274#define MAX_LONG_DIGITS 22
275
276static ssize_t
277sysprof_sample_read(struct file *filp, char __user *ubuf,
278 size_t cnt, loff_t *ppos)
279{
280 char buf[MAX_LONG_DIGITS];
281 int r;
282
283 r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
284
285 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
286}
287
288static ssize_t
289sysprof_sample_write(struct file *filp, const char __user *ubuf,
290 size_t cnt, loff_t *ppos)
291{
292 char buf[MAX_LONG_DIGITS];
293 unsigned long val;
294
295 if (cnt > MAX_LONG_DIGITS-1)
296 cnt = MAX_LONG_DIGITS-1;
297
298 if (copy_from_user(&buf, ubuf, cnt))
299 return -EFAULT;
300
301 buf[cnt] = 0;
302
303 val = simple_strtoul(buf, NULL, 10);
304 /*
305 * Enforce a minimum sample period of 100 usecs:
306 */
307 if (val < 100)
308 val = 100;
309
310 mutex_lock(&sample_timer_lock);
311 stop_stack_timers();
312 sample_period = val * 1000;
313 start_stack_timers();
314 mutex_unlock(&sample_timer_lock);
315
316 return cnt;
317}
318
319static const struct file_operations sysprof_sample_fops = {
320 .read = sysprof_sample_read,
321 .write = sysprof_sample_write,
322};
323
324void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
325{
326
327 trace_create_file("sysprof_sample_period", 0644,
328 d_tracer, NULL, &sysprof_sample_fops);
329}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
263{ 263{
264 int ret, cpu; 264 int ret, cpu;
265 265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); 271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
267 if (ret) 272 if (ret)
268 goto out; 273 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
279 if (ret) 284 if (ret)
280 goto no_creation; 285 goto no_creation;
281 286
282 for_each_possible_cpu(cpu) {
283 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
284 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
285 }
286
287 return 0; 287 return 0;
288 288
289no_creation: 289no_creation:
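The trace_workqueue.c hunk moves the per-CPU spinlock and list initialization ahead of the tracepoint registrations. A small user-space sketch of why that ordering matters, assuming a toy callback registry in place of register_trace_workqueue_insertion() and friends:

/*
 * Toy model (not kernel code): per-CPU state is initialized before the
 * probe is published, so an event that fires right after registration
 * already finds valid state.
 */
#include <stdio.h>

#define NR_CPUS 4

struct cpu_stat {
	int initialized;
	int count;
};

static struct cpu_stat stats[NR_CPUS];
static void (*probe)(int cpu);		/* stands in for the tracepoint */

static void insertion_probe(int cpu)
{
	if (!stats[cpu].initialized) {
		fprintf(stderr, "probe ran against uninitialized state\n");
		return;
	}
	stats[cpu].count++;
}

static void register_probe(void (*fn)(int))
{
	probe = fn;
	probe(0);			/* simulate an immediate event */
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)	/* init first, as the hunk now does */
		stats[cpu].initialized = 1;

	register_probe(insertion_probe);	/* then publish the callback */

	printf("cpu0 saw %d insertion(s)\n", stats[0].count);
	return 0;
}

Once a probe is published it may fire immediately, so any state it touches has to be ready beforehand.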
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..e95ee7f31d43 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h>
28 29
29extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint __start___tracepoints[];
30extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
263 * is used. 264 * is used.
264 */ 265 */
265 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
266 elem->state = active; 267 if (!elem->state && active) {
268 jump_label_enable(&elem->state);
269 elem->state = active;
270 } else if (elem->state && !active) {
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
267} 274}
268 275
269/* 276/*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
277 if (elem->unregfunc && elem->state) 284 if (elem->unregfunc && elem->state)
278 elem->unregfunc(); 285 elem->unregfunc();
279 286
280 elem->state = 0; 287 if (elem->state) {
288 jump_label_disable(&elem->state);
289 elem->state = 0;
290 }
281 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
282} 292}
283 293
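The tracepoint.c hunk only flips the jump label when a tracepoint actually changes state, instead of unconditionally rewriting elem->state. A user-space sketch of that transition-only pattern; enable_patch()/disable_patch() are stand-ins for jump_label_enable()/jump_label_disable(), and struct tp is a toy.

/*
 * Toy model: the "patch" operation only happens on a real off->on or
 * on->off transition.
 */
#include <stdio.h>

static void enable_patch(int *key)  { printf("patch in,  key=%p\n", (void *)key); }
static void disable_patch(int *key) { printf("patch out, key=%p\n", (void *)key); }

struct tp {
	int state;
};

static void set_tp(struct tp *tp, int active)
{
	if (!tp->state && active) {
		enable_patch(&tp->state);	/* off -> on */
		tp->state = active;
	} else if (tp->state && !active) {
		disable_patch(&tp->state);	/* on -> off */
		tp->state = active;
	}
	/* same state: nothing to patch */
}

int main(void)
{
	struct tp tp = { 0 };

	set_tp(&tp, 1);		/* patches once */
	set_tp(&tp, 1);		/* no-op */
	set_tp(&tp, 0);		/* unpatches */
	return 0;
}

Guarding on the old state keeps repeated enable/disable requests from patching the same site twice.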
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
63 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
65 rcu_read_unlock(); 65 rcu_read_unlock();
66 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
68 stats->ac_utimescaled = 68 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
69 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; 69 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
70 stats->ac_stimescaled =
71 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
72 stats->ac_minflt = tsk->min_flt; 70 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 71 stats->ac_majflt = tsk->maj_flt;
74 72
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..2c7d8d5914b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
91 * upon function exit. 91 * upon function exit.
92 */ 92 */
93static void free_user(struct user_struct *up, unsigned long flags) 93static void free_user(struct user_struct *up, unsigned long flags)
94 __releases(&uidhash_lock)
94{ 95{
95 uid_hash_remove(up); 96 uid_hash_remove(up);
96 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b2d70d38dff4..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/highuid.h>
12#include <linux/cred.h> 13#include <linux/cred.h>
13 14
14/* 15/*
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
82 schedule_work(&ns->destroyer); 83 schedule_work(&ns->destroyer);
83} 84}
84EXPORT_SYMBOL(free_user_ns); 85EXPORT_SYMBOL(free_user_ns);
86
87uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
88{
89 struct user_namespace *tmp;
90
91 if (likely(to == cred->user->user_ns))
92 return uid;
93
94
95 /* Is cred->user the creator of the target user_ns
 96 * or the creator of one of its parents?
97 */
98 for ( tmp = to; tmp != &init_user_ns;
99 tmp = tmp->creator->user_ns ) {
100 if (cred->user == tmp->creator) {
101 return (uid_t)0;
102 }
103 }
104
105 /* No useful relationship so no mapping */
106 return overflowuid;
107}
108
109gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
110{
111 struct user_namespace *tmp;
112
113 if (likely(to == cred->user->user_ns))
114 return gid;
115
116 /* Is cred->user the creator of the target user_ns
 117 * or the creator of one of its parents?
118 */
119 for ( tmp = to; tmp != &init_user_ns;
120 tmp = tmp->creator->user_ns ) {
121 if (cred->user == tmp->creator) {
122 return (gid_t)0;
123 }
124 }
125
126 /* No useful relationship so no mapping */
127 return overflowgid;
128}
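The user_namespace.c hunk adds user_ns_map_uid()/user_ns_map_gid(), which walk from the target namespace up through each creator until init_user_ns. The sketch below models only that walk with toy structs; the creator_ns field and the OVERFLOWUID constant stand in for the kernel's creator->user_ns chain and overflowuid, and are assumptions for illustration.

/*
 * Toy model of the creator-chain walk; not the kernel's definitions.
 */
#include <stdio.h>

#define OVERFLOWUID 65534u

struct user;

struct user_namespace {
	struct user *creator;			/* who created this ns */
	struct user_namespace *creator_ns;	/* models creator->user_ns */
};

struct user {
	struct user_namespace *user_ns;		/* ns this user lives in */
};

static struct user_namespace init_user_ns;	/* the root of every chain */

static unsigned int map_uid(struct user_namespace *to,
			    struct user *caller, unsigned int uid)
{
	struct user_namespace *tmp;

	if (to == caller->user_ns)		/* same namespace: identity */
		return uid;

	/* Did the caller create 'to' or one of its ancestors? */
	for (tmp = to; tmp != &init_user_ns; tmp = tmp->creator_ns)
		if (caller == tmp->creator)
			return 0;

	return OVERFLOWUID;			/* unrelated: overflow uid */
}

int main(void)
{
	struct user root  = { .user_ns = &init_user_ns };
	struct user other = { .user_ns = &init_user_ns };
	struct user_namespace child = {
		.creator = &root, .creator_ns = &init_user_ns
	};
	struct user alice = { .user_ns = &child };

	printf("creator maps to   %u\n", map_uid(&child, &root, 1000));
	printf("same ns maps to   %u\n", map_uid(&child, &alice, 1000));
	printf("unrelated maps to %u\n", map_uid(&child, &other, 1000));
	return 0;
}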
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..b0310eb6cc1e 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95/* 95/**
96 * finish_wait - clean up after waiting in a queue 96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 97 * @q: waitqueue waited on
98 * @wait: wait descriptor 98 * @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
127} 127}
128EXPORT_SYMBOL(finish_wait); 128EXPORT_SYMBOL(finish_wait);
129 129
130/* 130/**
131 * abort_exclusive_wait - abort exclusive waiting in a queue 131 * abort_exclusive_wait - abort exclusive waiting in a queue
132 * @q: waitqueue waited on 132 * @q: waitqueue waited on
133 * @wait: wait descriptor 133 * @wait: wait descriptor
134 * @state: runstate of the waiter to be woken 134 * @mode: runstate of the waiter to be woken
135 * @key: key to identify a wait bit queue or %NULL 135 * @key: key to identify a wait bit queue or %NULL
136 * 136 *
137 * Sets current thread back to running state and removes 137 * Sets current thread back to running state and removes
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000000..bafba687a6d8
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,566 @@
1/*
2 * Detect hard and soft lockups on a system
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
 6 * this code detects hard lockups: incidents where on a CPU
7 * the kernel does not respond to anything except NMI.
8 *
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well.
13 */
14
15#include <linux/mm.h>
16#include <linux/cpu.h>
17#include <linux/nmi.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/freezer.h>
21#include <linux/kthread.h>
22#include <linux/lockdep.h>
23#include <linux/notifier.h>
24#include <linux/module.h>
25#include <linux/sysctl.h>
26
27#include <asm/irq_regs.h>
28#include <linux/perf_event.h>
29
30int watchdog_enabled;
31int __read_mostly softlockup_thresh = 60;
32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif
45
46static int __initdata no_watchdog;
47
48
49/* boot commands */
50/*
51 * Should we panic when a soft-lockup or hard-lockup occurs:
52 */
53#ifdef CONFIG_HARDLOCKUP_DETECTOR
54static int hardlockup_panic;
55
56static int __init hardlockup_panic_setup(char *str)
57{
58 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1;
60 return 1;
61}
62__setup("nmi_watchdog=", hardlockup_panic_setup);
63#endif
64
65unsigned int __read_mostly softlockup_panic =
66 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
67
68static int __init softlockup_panic_setup(char *str)
69{
70 softlockup_panic = simple_strtoul(str, NULL, 0);
71
72 return 1;
73}
74__setup("softlockup_panic=", softlockup_panic_setup);
75
76static int __init nowatchdog_setup(char *str)
77{
78 no_watchdog = 1;
79 return 1;
80}
81__setup("nowatchdog", nowatchdog_setup);
82
83/* deprecated */
84static int __init nosoftlockup_setup(char *str)
85{
86 no_watchdog = 1;
87 return 1;
88}
89__setup("nosoftlockup", nosoftlockup_setup);
90/* */
91
92
93/*
94 * Returns seconds, approximately. We don't need nanosecond
95 * resolution, and we don't need to waste time with a big divide when
96 * 2^30ns == 1.074s.
97 */
98static unsigned long get_timestamp(int this_cpu)
99{
100 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
101}
102
103static unsigned long get_sample_period(void)
104{
105 /*
106 * convert softlockup_thresh from seconds to ns
107 * the divide by 5 is to give hrtimer 5 chances to
108 * increment before the hardlockup detector generates
109 * a warning
110 */
111 return softlockup_thresh / 5 * NSEC_PER_SEC;
112}
113
114/* Commands for resetting the watchdog */
115static void __touch_watchdog(void)
116{
117 int this_cpu = smp_processor_id();
118
119 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
120}
121
122void touch_softlockup_watchdog(void)
123{
124 __raw_get_cpu_var(watchdog_touch_ts) = 0;
125}
126EXPORT_SYMBOL(touch_softlockup_watchdog);
127
128void touch_all_softlockup_watchdogs(void)
129{
130 int cpu;
131
132 /*
133 * this is done lockless
134 * do we care if a 0 races with a timestamp?
135 * all it means is the softlock check starts one cycle later
136 */
137 for_each_online_cpu(cpu)
138 per_cpu(watchdog_touch_ts, cpu) = 0;
139}
140
141#ifdef CONFIG_HARDLOCKUP_DETECTOR
142void touch_nmi_watchdog(void)
143{
144 if (watchdog_enabled) {
145 unsigned cpu;
146
147 for_each_present_cpu(cpu) {
148 if (per_cpu(watchdog_nmi_touch, cpu) != true)
149 per_cpu(watchdog_nmi_touch, cpu) = true;
150 }
151 }
152 touch_softlockup_watchdog();
153}
154EXPORT_SYMBOL(touch_nmi_watchdog);
155
156#endif
157
158void touch_softlockup_watchdog_sync(void)
159{
160 __raw_get_cpu_var(softlockup_touch_sync) = true;
161 __raw_get_cpu_var(watchdog_touch_ts) = 0;
162}
163
164#ifdef CONFIG_HARDLOCKUP_DETECTOR
165/* watchdog detector functions */
166static int is_hardlockup(void)
167{
168 unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
169
170 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
171 return 1;
172
173 __get_cpu_var(hrtimer_interrupts_saved) = hrint;
174 return 0;
175}
176#endif
177
178static int is_softlockup(unsigned long touch_ts)
179{
180 unsigned long now = get_timestamp(smp_processor_id());
181
182 /* Warn about unreasonable delays: */
183 if (time_after(now, touch_ts + softlockup_thresh))
184 return now - touch_ts;
185
186 return 0;
187}
188
189#ifdef CONFIG_HARDLOCKUP_DETECTOR
190static struct perf_event_attr wd_hw_attr = {
191 .type = PERF_TYPE_HARDWARE,
192 .config = PERF_COUNT_HW_CPU_CYCLES,
193 .size = sizeof(struct perf_event_attr),
194 .pinned = 1,
195 .disabled = 1,
196};
197
198/* Callback function for perf event subsystem */
199static void watchdog_overflow_callback(struct perf_event *event, int nmi,
200 struct perf_sample_data *data,
201 struct pt_regs *regs)
202{
203 /* Ensure the watchdog never gets throttled */
204 event->hw.interrupts = 0;
205
206 if (__get_cpu_var(watchdog_nmi_touch) == true) {
207 __get_cpu_var(watchdog_nmi_touch) = false;
208 return;
209 }
210
211 /* check for a hardlockup
212 * This is done by making sure our timer interrupt
213 * is incrementing. The timer interrupt should have
214 * fired multiple times before we overflow'd. If it hasn't
215 * then this is a good indication the cpu is stuck
216 */
217 if (is_hardlockup()) {
218 int this_cpu = smp_processor_id();
219
220 /* only print hardlockups once */
221 if (__get_cpu_var(hard_watchdog_warn) == true)
222 return;
223
224 if (hardlockup_panic)
225 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
226 else
227 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
228
229 __get_cpu_var(hard_watchdog_warn) = true;
230 return;
231 }
232
233 __get_cpu_var(hard_watchdog_warn) = false;
234 return;
235}
236static void watchdog_interrupt_count(void)
237{
238 __get_cpu_var(hrtimer_interrupts)++;
239}
240#else
241static inline void watchdog_interrupt_count(void) { return; }
242#endif /* CONFIG_HARDLOCKUP_DETECTOR */
243
244/* watchdog kicker functions */
245static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
246{
247 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
248 struct pt_regs *regs = get_irq_regs();
249 int duration;
250
251 /* kick the hardlockup detector */
252 watchdog_interrupt_count();
253
254 /* kick the softlockup detector */
255 wake_up_process(__get_cpu_var(softlockup_watchdog));
256
257 /* .. and repeat */
258 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
259
260 if (touch_ts == 0) {
261 if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
262 /*
263 * If the time stamp was touched atomically
264 * make sure the scheduler tick is up to date.
265 */
266 __get_cpu_var(softlockup_touch_sync) = false;
267 sched_clock_tick();
268 }
269 __touch_watchdog();
270 return HRTIMER_RESTART;
271 }
272
273 /* check for a softlockup
274 * This is done by making sure a high priority task is
275 * being scheduled. The task touches the watchdog to
276 * indicate it is getting cpu time. If it hasn't then
277 * this is a good indication some task is hogging the cpu
278 */
279 duration = is_softlockup(touch_ts);
280 if (unlikely(duration)) {
281 /* only warn once */
282 if (__get_cpu_var(soft_watchdog_warn) == true)
283 return HRTIMER_RESTART;
284
285 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
286 smp_processor_id(), duration,
287 current->comm, task_pid_nr(current));
288 print_modules();
289 print_irqtrace_events(current);
290 if (regs)
291 show_regs(regs);
292 else
293 dump_stack();
294
295 if (softlockup_panic)
296 panic("softlockup: hung tasks");
297 __get_cpu_var(soft_watchdog_warn) = true;
298 } else
299 __get_cpu_var(soft_watchdog_warn) = false;
300
301 return HRTIMER_RESTART;
302}
303
304
305/*
306 * The watchdog thread - touches the timestamp.
307 */
308static int watchdog(void *unused)
309{
310 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
312
313 sched_setscheduler(current, SCHED_FIFO, &param);
314
315 /* initialize timestamp */
316 __touch_watchdog();
317
318 /* kick off the timer for the hardlockup detector */
319 /* done here because hrtimer_start can only pin to smp_processor_id() */
320 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
321 HRTIMER_MODE_REL_PINNED);
322
323 set_current_state(TASK_INTERRUPTIBLE);
324 /*
325 * Run briefly once per second to reset the softlockup timestamp.
326 * If this gets delayed for more than 60 seconds then the
327 * debug-printout triggers in watchdog_timer_fn().
328 */
329 while (!kthread_should_stop()) {
330 __touch_watchdog();
331 schedule();
332
333 if (kthread_should_stop())
334 break;
335
336 set_current_state(TASK_INTERRUPTIBLE);
337 }
338 __set_current_state(TASK_RUNNING);
339
340 return 0;
341}
342
343
344#ifdef CONFIG_HARDLOCKUP_DETECTOR
345static int watchdog_nmi_enable(int cpu)
346{
347 struct perf_event_attr *wd_attr;
348 struct perf_event *event = per_cpu(watchdog_ev, cpu);
349
350 /* is it already setup and enabled? */
351 if (event && event->state > PERF_EVENT_STATE_OFF)
352 goto out;
353
354 /* it is setup but not enabled */
355 if (event != NULL)
356 goto out_enable;
357
358 /* Try to register using hardware perf events */
359 wd_attr = &wd_hw_attr;
360 wd_attr->sample_period = hw_nmi_get_sample_period();
361 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
362 if (!IS_ERR(event)) {
363 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
364 goto out_save;
365 }
366
367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
368 return PTR_ERR(event);
369
370 /* success path */
371out_save:
372 per_cpu(watchdog_ev, cpu) = event;
373out_enable:
374 perf_event_enable(per_cpu(watchdog_ev, cpu));
375out:
376 return 0;
377}
378
379static void watchdog_nmi_disable(int cpu)
380{
381 struct perf_event *event = per_cpu(watchdog_ev, cpu);
382
383 if (event) {
384 perf_event_disable(event);
385 per_cpu(watchdog_ev, cpu) = NULL;
386
387 /* should be in cleanup, but blocks oprofile */
388 perf_event_release_kernel(event);
389 }
390 return;
391}
392#else
393static int watchdog_nmi_enable(int cpu) { return 0; }
394static void watchdog_nmi_disable(int cpu) { return; }
395#endif /* CONFIG_HARDLOCKUP_DETECTOR */
396
397/* prepare/enable/disable routines */
398static int watchdog_prepare_cpu(int cpu)
399{
400 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
401
402 WARN_ON(per_cpu(softlockup_watchdog, cpu));
403 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
404 hrtimer->function = watchdog_timer_fn;
405
406 return 0;
407}
408
409static int watchdog_enable(int cpu)
410{
411 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
412 int err;
413
414 /* enable the perf event */
415 err = watchdog_nmi_enable(cpu);
416 if (err)
417 return err;
418
419 /* create the watchdog thread */
420 if (!p) {
421 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
422 if (IS_ERR(p)) {
423 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
424 return PTR_ERR(p);
425 }
426 kthread_bind(p, cpu);
427 per_cpu(watchdog_touch_ts, cpu) = 0;
428 per_cpu(softlockup_watchdog, cpu) = p;
429 wake_up_process(p);
430 }
431
432 /* if any cpu succeeds, watchdog is considered enabled for the system */
433 watchdog_enabled = 1;
434
435 return 0;
436}
437
438static void watchdog_disable(int cpu)
439{
440 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
441 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
442
443 /*
444 * cancel the timer first to stop incrementing the stats
445 * and waking up the kthread
446 */
447 hrtimer_cancel(hrtimer);
448
449 /* disable the perf event */
450 watchdog_nmi_disable(cpu);
451
452 /* stop the watchdog thread */
453 if (p) {
454 per_cpu(softlockup_watchdog, cpu) = NULL;
455 kthread_stop(p);
456 }
457}
458
459static void watchdog_enable_all_cpus(void)
460{
461 int cpu;
462 int result = 0;
463
464 for_each_online_cpu(cpu)
465 result += watchdog_enable(cpu);
466
467 if (result)
468 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
469
470}
471
472static void watchdog_disable_all_cpus(void)
473{
474 int cpu;
475
476 if (no_watchdog)
477 return;
478
479 for_each_online_cpu(cpu)
480 watchdog_disable(cpu);
481
482 /* if all watchdogs are disabled, then they are disabled for the system */
483 watchdog_enabled = 0;
484}
485
486
487/* sysctl functions */
488#ifdef CONFIG_SYSCTL
489/*
490 * proc handler for /proc/sys/kernel/nmi_watchdog
491 */
492
493int proc_dowatchdog_enabled(struct ctl_table *table, int write,
494 void __user *buffer, size_t *length, loff_t *ppos)
495{
496 proc_dointvec(table, write, buffer, length, ppos);
497
498 if (watchdog_enabled)
499 watchdog_enable_all_cpus();
500 else
501 watchdog_disable_all_cpus();
502 return 0;
503}
504
505int proc_dowatchdog_thresh(struct ctl_table *table, int write,
506 void __user *buffer,
507 size_t *lenp, loff_t *ppos)
508{
509 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
510}
511#endif /* CONFIG_SYSCTL */
512
513
514/*
515 * Create/destroy watchdog threads as CPUs come and go:
516 */
517static int __cpuinit
518cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
519{
520 int hotcpu = (unsigned long)hcpu;
521 int err = 0;
522
523 switch (action) {
524 case CPU_UP_PREPARE:
525 case CPU_UP_PREPARE_FROZEN:
526 err = watchdog_prepare_cpu(hotcpu);
527 break;
528 case CPU_ONLINE:
529 case CPU_ONLINE_FROZEN:
530 err = watchdog_enable(hotcpu);
531 break;
532#ifdef CONFIG_HOTPLUG_CPU
533 case CPU_UP_CANCELED:
534 case CPU_UP_CANCELED_FROZEN:
535 watchdog_disable(hotcpu);
536 break;
537 case CPU_DEAD:
538 case CPU_DEAD_FROZEN:
539 watchdog_disable(hotcpu);
540 break;
541#endif /* CONFIG_HOTPLUG_CPU */
542 }
543 return notifier_from_errno(err);
544}
545
546static struct notifier_block __cpuinitdata cpu_nfb = {
547 .notifier_call = cpu_callback
548};
549
550static int __init spawn_watchdog_task(void)
551{
552 void *cpu = (void *)(long)smp_processor_id();
553 int err;
554
555 if (no_watchdog)
556 return 0;
557
558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
559 WARN_ON(notifier_to_errno(err));
560
561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
562 register_cpu_notifier(&cpu_nfb);
563
564 return 0;
565}
566early_initcall(spawn_watchdog_task);
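The new kernel/watchdog.c detects soft lockups by comparing a per-CPU "touch" timestamp, refreshed by the watchdog thread, against the current time in the hrtimer callback. Below is a minimal user-space model of just that comparison, using time() in place of the kernel's cpu_clock() >> 30 second counter; all names here are illustrative.

/*
 * Toy model, user space only: the threshold matches the default in the
 * new file, the clock is plain wall time.
 */
#include <stdio.h>
#include <time.h>

static unsigned long softlockup_thresh = 60;	/* seconds */
static unsigned long watchdog_touch_ts;

static void touch_watchdog(void)		/* what the watchdog thread does */
{
	watchdog_touch_ts = (unsigned long)time(NULL);
}

/* Returns 0, or how many seconds the "CPU" appears stuck. */
static unsigned long is_softlockup(unsigned long touch_ts)
{
	unsigned long now = (unsigned long)time(NULL);

	if (now > touch_ts + softlockup_thresh)
		return now - touch_ts;
	return 0;
}

int main(void)
{
	unsigned long stale, duration;

	touch_watchdog();

	/* Pretend the watchdog thread was starved for 75 seconds. */
	stale = watchdog_touch_ts - 75;
	duration = is_softlockup(stale);

	if (duration)
		printf("BUG: soft lockup - stuck for %lus\n", duration);
	return 0;
}

In the kernel the same idea is split across __touch_watchdog(), is_softlockup() and watchdog_timer_fn(), while the hard-lockup side instead checks that hrtimer_interrupts keeps advancing between NMI samples.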
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 327d2deb4451..90db1bd1a978 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,19 +1,26 @@
1/* 1/*
2 * linux/kernel/workqueue.c 2 * kernel/workqueue.c - generic async execution with shared worker pool
3 * 3 *
4 * Generic mechanism for defining kernel helper threads for running 4 * Copyright (C) 2002 Ingo Molnar
5 * arbitrary tasks in process context.
6 * 5 *
7 * Started by Ingo Molnar, Copyright (C) 2002 6 * Derived from the taskqueue/keventd code by:
7 * David Woodhouse <dwmw2@infradead.org>
8 * Andrew Morton
9 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
10 * Theodore Ts'o <tytso@mit.edu>
8 * 11 *
9 * Derived from the taskqueue/keventd code by: 12 * Made to use alloc_percpu by Christoph Lameter.
10 * 13 *
11 * David Woodhouse <dwmw2@infradead.org> 14 * Copyright (C) 2010 SUSE Linux Products GmbH
12 * Andrew Morton 15 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu>
15 * 16 *
 16 * Made to use alloc_percpu by Christoph Lameter. 17 * This is the generic async execution mechanism. Work items are
18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and
20 * one extra for works which are better served by workers which are
21 * not bound to any specific CPU.
22 *
23 * Please read Documentation/workqueue.txt for details.
17 */ 24 */
18 25
19#include <linux/module.h> 26#include <linux/module.h>
@@ -33,41 +40,276 @@
33#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 42#include <linux/lockdep.h>
36#define CREATE_TRACE_POINTS 43#include <linux/idr.h>
37#include <trace/events/workqueue.h> 44
45#include "workqueue_sched.h"
46
47enum {
48 /* global_cwq flags */
49 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
50 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
51 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
52 GCWQ_FREEZING = 1 << 3, /* freeze in progress */
53 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
54
55 /* worker flags */
56 WORKER_STARTED = 1 << 0, /* started */
57 WORKER_DIE = 1 << 1, /* die die die */
58 WORKER_IDLE = 1 << 2, /* is idle */
59 WORKER_PREP = 1 << 3, /* preparing to run works */
60 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
61 WORKER_REBIND = 1 << 5, /* mom is home, come back */
62 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
63 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
64
65 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
66 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
67
68 /* gcwq->trustee_state */
69 TRUSTEE_START = 0, /* start */
70 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
71 TRUSTEE_BUTCHER = 2, /* butcher workers */
72 TRUSTEE_RELEASE = 3, /* release workers */
73 TRUSTEE_DONE = 4, /* trustee is done */
74
75 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
76 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
77 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
78
79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
81
82 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
83 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
 84 CREATE_COOLDOWN = HZ, /* time to breathe after fail */
85 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
86
87 /*
88 * Rescue workers are used only on emergencies and shared by
89 * all cpus. Give -20.
90 */
91 RESCUER_NICE_LEVEL = -20,
92};
38 93
39/* 94/*
40 * The per-CPU workqueue (if single thread, we always use the first 95 * Structure fields follow one of the following exclusion rules.
41 * possible cpu). 96 *
97 * I: Modifiable by initialization/destruction paths and read-only for
98 * everyone else.
99 *
100 * P: Preemption protected. Disabling preemption is enough and should
101 * only be modified and accessed from the local cpu.
102 *
103 * L: gcwq->lock protected. Access with gcwq->lock held.
104 *
105 * X: During normal operation, modification requires gcwq->lock and
106 * should be done only from local cpu. Either disabling preemption
107 * on local cpu or grabbing gcwq->lock is enough for read access.
108 * If GCWQ_DISASSOCIATED is set, it's identical to L.
109 *
110 * F: wq->flush_mutex protected.
111 *
112 * W: workqueue_lock protected.
42 */ 113 */
43struct cpu_workqueue_struct {
44 114
45 spinlock_t lock; 115struct global_cwq;
46 116
47 struct list_head worklist; 117/*
48 wait_queue_head_t more_work; 118 * The poor guys doing the actual heavy lifting. All on-duty workers
49 struct work_struct *current_work; 119 * are either serving the manager role, on idle list or on busy hash.
120 */
121struct worker {
122 /* on idle list while idle, on busy hash table while busy */
123 union {
124 struct list_head entry; /* L: while idle */
125 struct hlist_node hentry; /* L: while busy */
126 };
50 127
51 struct workqueue_struct *wq; 128 struct work_struct *current_work; /* L: work being processed */
52 struct task_struct *thread; 129 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
53} ____cacheline_aligned; 130 struct list_head scheduled; /* L: scheduled works */
131 struct task_struct *task; /* I: worker task */
132 struct global_cwq *gcwq; /* I: the associated gcwq */
133 /* 64 bytes boundary on 64bit, 32 on 32bit */
134 unsigned long last_active; /* L: last active timestamp */
135 unsigned int flags; /* X: flags */
136 int id; /* I: worker id */
137 struct work_struct rebind_work; /* L: rebind worker to cpu */
138};
139
140/*
141 * Global per-cpu workqueue. There's one and only one for each cpu
142 * and all works are queued and processed here regardless of their
143 * target workqueues.
144 */
145struct global_cwq {
146 spinlock_t lock; /* the gcwq lock */
147 struct list_head worklist; /* L: list of pending works */
148 unsigned int cpu; /* I: the associated cpu */
149 unsigned int flags; /* L: GCWQ_* flags */
150
151 int nr_workers; /* L: total number of workers */
152 int nr_idle; /* L: currently idle ones */
153
154 /* workers are chained either in the idle_list or busy_hash */
155 struct list_head idle_list; /* X: list of idle workers */
156 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
157 /* L: hash of busy workers */
158
159 struct timer_list idle_timer; /* L: worker idle timeout */
160 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
161
162 struct ida worker_ida; /* L: for worker IDs */
163
164 struct task_struct *trustee; /* L: for gcwq shutdown */
165 unsigned int trustee_state; /* L: trustee state */
166 wait_queue_head_t trustee_wait; /* trustee wait */
167 struct worker *first_idle; /* L: first idle worker */
168} ____cacheline_aligned_in_smp;
169
170/*
171 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
172 * work_struct->data are used for flags and thus cwqs need to be
173 * aligned to 1 << WORK_STRUCT_FLAG_BITS, i.e. a power of two.
174 */
175struct cpu_workqueue_struct {
176 struct global_cwq *gcwq; /* I: the associated gcwq */
177 struct workqueue_struct *wq; /* I: the owning workqueue */
178 int work_color; /* L: current color */
179 int flush_color; /* L: flushing color */
180 int nr_in_flight[WORK_NR_COLORS];
181 /* L: nr of in_flight works */
182 int nr_active; /* L: nr of active works */
183 int max_active; /* L: max active works */
184 struct list_head delayed_works; /* L: delayed works */
185};
186
187/*
188 * Structure used to wait for workqueue flush.
189 */
190struct wq_flusher {
191 struct list_head list; /* F: list of flushers */
192 int flush_color; /* F: flush color waiting for */
193 struct completion done; /* flush completion */
194};
195
196/*
197 * All cpumasks are assumed to be always set on UP and thus can't be
198 * used to determine whether there's something to be done.
199 */
200#ifdef CONFIG_SMP
201typedef cpumask_var_t mayday_mask_t;
202#define mayday_test_and_set_cpu(cpu, mask) \
203 cpumask_test_and_set_cpu((cpu), (mask))
204#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
205#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
206#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
207#define free_mayday_mask(mask) free_cpumask_var((mask))
208#else
209typedef unsigned long mayday_mask_t;
210#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
211#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
212#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
213#define alloc_mayday_mask(maskp, gfp) true
214#define free_mayday_mask(mask) do { } while (0)
215#endif
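
The UP variants above lean on the comma operator: for_each_mayday_cpu() expands to an if whose condition first assigns 0 to cpu and then evaluates mask, so the body runs at most once and only when the single mayday bit is set. A minimal user-space sketch of that pattern (the names below are illustrative, not kernel API):

    #include <stdio.h>

    typedef unsigned long mayday_mask_t;            /* UP: one bit is enough */

    /* mirrors the UP definition above, renamed to avoid claiming kernel API */
    #define up_for_each_mayday_cpu(cpu, mask)       if ((cpu) = 0, (mask))

    int main(void)
    {
            mayday_mask_t mask = 1;                 /* the only cpu asked for help */
            unsigned int cpu;

            up_for_each_mayday_cpu(cpu, mask)
                    printf("rescue cpu %u\n", cpu); /* runs once, cpu == 0 */

            mask = 0;
            up_for_each_mayday_cpu(cpu, mask)
                    printf("never printed\n");      /* mask clear: body skipped */

            return 0;
    }
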
54 216
55/* 217/*
56 * The externally visible workqueue abstraction is an array of 218 * The externally visible workqueue abstraction is an array of
57 * per-CPU workqueues: 219 * per-CPU workqueues:
58 */ 220 */
59struct workqueue_struct { 221struct workqueue_struct {
60 struct cpu_workqueue_struct *cpu_wq; 222 unsigned int flags; /* I: WQ_* flags */
61 struct list_head list; 223 union {
62 const char *name; 224 struct cpu_workqueue_struct __percpu *pcpu;
63 int singlethread; 225 struct cpu_workqueue_struct *single;
64 int freezeable; /* Freeze threads during suspend */ 226 unsigned long v;
65 int rt; 227 } cpu_wq; /* I: cwq's */
228 struct list_head list; /* W: list of all workqueues */
229
230 struct mutex flush_mutex; /* protects wq flushing */
231 int work_color; /* F: current work color */
232 int flush_color; /* F: current flush color */
233 atomic_t nr_cwqs_to_flush; /* flush in progress */
234 struct wq_flusher *first_flusher; /* F: first flusher */
235 struct list_head flusher_queue; /* F: flush waiters */
236 struct list_head flusher_overflow; /* F: flush overflow list */
237
238 mayday_mask_t mayday_mask; /* cpus requesting rescue */
239 struct worker *rescuer; /* I: rescue worker */
240
241 int saved_max_active; /* W: saved cwq max_active */
242 const char *name; /* I: workqueue name */
66#ifdef CONFIG_LOCKDEP 243#ifdef CONFIG_LOCKDEP
67 struct lockdep_map lockdep_map; 244 struct lockdep_map lockdep_map;
68#endif 245#endif
69}; 246};
70 247
248struct workqueue_struct *system_wq __read_mostly;
249struct workqueue_struct *system_long_wq __read_mostly;
250struct workqueue_struct *system_nrt_wq __read_mostly;
251struct workqueue_struct *system_unbound_wq __read_mostly;
252EXPORT_SYMBOL_GPL(system_wq);
253EXPORT_SYMBOL_GPL(system_long_wq);
254EXPORT_SYMBOL_GPL(system_nrt_wq);
255EXPORT_SYMBOL_GPL(system_unbound_wq);
256
257#define CREATE_TRACE_POINTS
258#include <trace/events/workqueue.h>
259
260#define for_each_busy_worker(worker, i, pos, gcwq) \
261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
262 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
263
264static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
265 unsigned int sw)
266{
267 if (cpu < nr_cpu_ids) {
268 if (sw & 1) {
269 cpu = cpumask_next(cpu, mask);
270 if (cpu < nr_cpu_ids)
271 return cpu;
272 }
273 if (sw & 2)
274 return WORK_CPU_UNBOUND;
275 }
276 return WORK_CPU_NONE;
277}
278
279static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
280 struct workqueue_struct *wq)
281{
282 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
283}
284
285/*
286 * CPU iterators
287 *
288 * An extra gcwq is defined for an invalid cpu number
289 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
290 * specific CPU. The following iterators are similar to
291 * for_each_*_cpu() iterators but also considers the unbound gcwq.
292 *
293 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
294 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
295 * for_each_cwq_cpu() : possible CPUs for bound workqueues,
296 * WORK_CPU_UNBOUND for unbound workqueues
297 */
298#define for_each_gcwq_cpu(cpu) \
299 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
300 (cpu) < WORK_CPU_NONE; \
301 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
302
303#define for_each_online_gcwq_cpu(cpu) \
304 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
305 (cpu) < WORK_CPU_NONE; \
306 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
307
308#define for_each_cwq_cpu(cpu, wq) \
309 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
310 (cpu) < WORK_CPU_NONE; \
311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
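
As a rough usage sketch of these iterators (the helper name is hypothetical; the real initialization code in this file follows the same loop shape), a walk over every gcwq including the unbound one looks like:

    /* hypothetical helper -- shown only to illustrate the iterator */
    static void __init example_init_gcwqs(void)
    {
            unsigned int cpu;

            for_each_gcwq_cpu(cpu) {
                    struct global_cwq *gcwq = get_gcwq(cpu);

                    /* the final iteration visits WORK_CPU_UNBOUND */
                    spin_lock_init(&gcwq->lock);
                    INIT_LIST_HEAD(&gcwq->worklist);
            }
    }
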
312
71#ifdef CONFIG_DEBUG_OBJECTS_WORK 313#ifdef CONFIG_DEBUG_OBJECTS_WORK
72 314
73static struct debug_obj_descr work_debug_descr; 315static struct debug_obj_descr work_debug_descr;
@@ -107,7 +349,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
107 * statically initialized. We just make sure that it 349 * statically initialized. We just make sure that it
108 * is tracked in the object tracker. 350 * is tracked in the object tracker.
109 */ 351 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { 352 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr); 353 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr); 354 debug_object_activate(work, &work_debug_descr);
113 return 0; 355 return 0;
@@ -181,94 +423,586 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
181/* Serializes the accesses to the list of workqueues. */ 423/* Serializes the accesses to the list of workqueues. */
182static DEFINE_SPINLOCK(workqueue_lock); 424static DEFINE_SPINLOCK(workqueue_lock);
183static LIST_HEAD(workqueues); 425static LIST_HEAD(workqueues);
426static bool workqueue_freezing; /* W: have wqs started freezing? */
184 427
185static int singlethread_cpu __read_mostly;
186static const struct cpumask *cpu_singlethread_map __read_mostly;
187/* 428/*
188 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 429 * The almighty global cpu workqueues. nr_running is the only field
189 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 430 * which is expected to be used frequently by other cpus via
190 * which comes in between can't use for_each_online_cpu(). We could 431 * try_to_wake_up(). Put it in a separate cacheline.
191 * use cpu_possible_map, the cpumask below is more a documentation
192 * than optimization.
193 */ 432 */
194static cpumask_var_t cpu_populated_map __read_mostly; 433static DEFINE_PER_CPU(struct global_cwq, global_cwq);
434static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
195 435
196/* If it's single threaded, it isn't in the list of workqueues. */ 436/*
197static inline int is_wq_single_threaded(struct workqueue_struct *wq) 437 * Global cpu workqueue and nr_running counter for unbound gcwq. The
438 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
439 * workers have WORKER_UNBOUND set.
440 */
441static struct global_cwq unbound_global_cwq;
442static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
443
444static int worker_thread(void *__worker);
445
446static struct global_cwq *get_gcwq(unsigned int cpu)
447{
448 if (cpu != WORK_CPU_UNBOUND)
449 return &per_cpu(global_cwq, cpu);
450 else
451 return &unbound_global_cwq;
452}
453
454static atomic_t *get_gcwq_nr_running(unsigned int cpu)
455{
456 if (cpu != WORK_CPU_UNBOUND)
457 return &per_cpu(gcwq_nr_running, cpu);
458 else
459 return &unbound_gcwq_nr_running;
460}
461
462static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
463 struct workqueue_struct *wq)
464{
465 if (!(wq->flags & WQ_UNBOUND)) {
466 if (likely(cpu < nr_cpu_ids)) {
467#ifdef CONFIG_SMP
468 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
469#else
470 return wq->cpu_wq.single;
471#endif
472 }
473 } else if (likely(cpu == WORK_CPU_UNBOUND))
474 return wq->cpu_wq.single;
475 return NULL;
476}
477
478static unsigned int work_color_to_flags(int color)
198{ 479{
199 return wq->singlethread; 480 return color << WORK_STRUCT_COLOR_SHIFT;
200} 481}
201 482
202static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) 483static int get_work_color(struct work_struct *work)
203{ 484{
204 return is_wq_single_threaded(wq) 485 return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
205 ? cpu_singlethread_map : cpu_populated_map; 486 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
206} 487}
207 488
208static 489static int work_next_color(int color)
209struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
210{ 490{
211 if (unlikely(is_wq_single_threaded(wq))) 491 return (color + 1) % WORK_NR_COLORS;
212 cpu = singlethread_cpu;
213 return per_cpu_ptr(wq->cpu_wq, cpu);
214} 492}
215 493
216/* 494/*
217 * Set the workqueue on which a work item is to be run 495 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
218 * - Must *only* be called if the pending flag is set 496 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
497 * cleared and the work data contains the cpu number it was last on.
498 *
499 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
500 * cwq, cpu or clear work->data. These functions should only be
501 * called while the work is owned - ie. while the PENDING bit is set.
502 *
503 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
504 * corresponding to a work. gcwq is available once the work has been
505 * queued anywhere after initialization. cwq is available only from
506 * queueing until execution starts.
219 */ 507 */
220static inline void set_wq_data(struct work_struct *work, 508static inline void set_work_data(struct work_struct *work, unsigned long data,
221 struct cpu_workqueue_struct *cwq) 509 unsigned long flags)
222{ 510{
223 unsigned long new;
224
225 BUG_ON(!work_pending(work)); 511 BUG_ON(!work_pending(work));
512 atomic_long_set(&work->data, data | flags | work_static(work));
513}
514
515static void set_work_cwq(struct work_struct *work,
516 struct cpu_workqueue_struct *cwq,
517 unsigned long extra_flags)
518{
519 set_work_data(work, (unsigned long)cwq,
520 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
521}
522
523static void set_work_cpu(struct work_struct *work, unsigned int cpu)
524{
525 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
526}
527
528static void clear_work_data(struct work_struct *work)
529{
530 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
531}
532
533static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
534{
535 unsigned long data = atomic_long_read(&work->data);
536
537 if (data & WORK_STRUCT_CWQ)
538 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
539 else
540 return NULL;
541}
542
543static struct global_cwq *get_work_gcwq(struct work_struct *work)
544{
545 unsigned long data = atomic_long_read(&work->data);
546 unsigned int cpu;
547
548 if (data & WORK_STRUCT_CWQ)
549 return ((struct cpu_workqueue_struct *)
550 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
551
552 cpu = data >> WORK_STRUCT_FLAG_BITS;
553 if (cpu == WORK_CPU_NONE)
554 return NULL;
555
556 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
557 return get_gcwq(cpu);
558}
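
To make the encoding above concrete, here is an illustrative round trip through the helpers. It assumes the caller owns the PENDING bit, as the comment block requires; the function itself is hypothetical and only demonstrates what each helper leaves in work->data:

    /* hypothetical illustration -- caller must own the PENDING bit */
    static void example_work_data_round_trip(struct work_struct *work,
                                             struct cpu_workqueue_struct *cwq)
    {
            /* queueing: data = cwq pointer | WORK_STRUCT_CWQ | WORK_STRUCT_PENDING */
            set_work_cwq(work, cwq, 0);
            WARN_ON(get_work_cwq(work) != cwq);
            WARN_ON(get_work_gcwq(work) != cwq->gcwq);

            /* execution start: data = cpu << WORK_STRUCT_FLAG_BITS | PENDING */
            set_work_cpu(work, cwq->gcwq->cpu);
            WARN_ON(get_work_cwq(work) != NULL);            /* cwq is gone */
            WARN_ON(get_work_gcwq(work) != cwq->gcwq);      /* gcwq still findable */
    }
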
559
560/*
561 * Policy functions. These define the policies on how the global
562 * worker pool is managed. Unless noted otherwise, these functions
563 * assume that they're being called with gcwq->lock held.
564 */
565
566static bool __need_more_worker(struct global_cwq *gcwq)
567{
568 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
569 gcwq->flags & GCWQ_HIGHPRI_PENDING;
570}
571
572/*
573 * Need to wake up a worker? Called from anything but currently
574 * running workers.
575 */
576static bool need_more_worker(struct global_cwq *gcwq)
577{
578 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
579}
580
581/* Can I start working? Called from busy but !running workers. */
582static bool may_start_working(struct global_cwq *gcwq)
583{
584 return gcwq->nr_idle;
585}
586
587/* Do I need to keep working? Called from currently running workers. */
588static bool keep_working(struct global_cwq *gcwq)
589{
590 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
591
592 return !list_empty(&gcwq->worklist) &&
593 (atomic_read(nr_running) <= 1 ||
594 gcwq->flags & GCWQ_HIGHPRI_PENDING);
595}
596
597/* Do we need a new worker? Called from manager. */
598static bool need_to_create_worker(struct global_cwq *gcwq)
599{
600 return need_more_worker(gcwq) && !may_start_working(gcwq);
601}
602
603/* Do I need to be the manager? */
604static bool need_to_manage_workers(struct global_cwq *gcwq)
605{
606 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
607}
226 608
227 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); 609/* Do we have too many workers and should some go away? */
228 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); 610static bool too_many_workers(struct global_cwq *gcwq)
229 atomic_long_set(&work->data, new); 611{
612 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
613 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
614 int nr_busy = gcwq->nr_workers - nr_idle;
615
616 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
230} 617}
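
A quick worked check of the ratio above, ignoring the managing-worker adjustment for brevity and assuming MAX_IDLE_WORKERS_RATIO is 4 as in the enum earlier in the file (treat that value as an assumption of this note):

    #include <assert.h>
    #include <stdbool.h>

    #define MAX_IDLE_WORKERS_RATIO  4       /* assumed value, see the enum above */

    static bool example_too_many(int nr_workers, int nr_idle)
    {
            int nr_busy = nr_workers - nr_idle;

            return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
    }

    int main(void)
    {
            assert(!example_too_many(10, 2));   /* only the two "free" idle workers */
            assert(!example_too_many(15, 4));   /* 2 extra idle vs 11 busy: 8 < 11 */
            assert(example_too_many(10, 4));    /* 2 extra idle vs 6 busy: 8 >= 6 */
            return 0;
    }
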
231 618
232/* 619/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. 620 * Wake up functions.
621 */
622
623/* Return the first worker. Safe with preemption disabled */
624static struct worker *first_worker(struct global_cwq *gcwq)
625{
626 if (unlikely(list_empty(&gcwq->idle_list)))
627 return NULL;
628
629 return list_first_entry(&gcwq->idle_list, struct worker, entry);
630}
631
632/**
633 * wake_up_worker - wake up an idle worker
634 * @gcwq: gcwq to wake worker for
635 *
636 * Wake up the first idle worker of @gcwq.
637 *
638 * CONTEXT:
639 * spin_lock_irq(gcwq->lock).
640 */
641static void wake_up_worker(struct global_cwq *gcwq)
642{
643 struct worker *worker = first_worker(gcwq);
644
645 if (likely(worker))
646 wake_up_process(worker->task);
647}
648
649/**
650 * wq_worker_waking_up - a worker is waking up
651 * @task: task waking up
652 * @cpu: CPU @task is waking up to
653 *
654 * This function is called during try_to_wake_up() when a worker is
655 * being awoken.
656 *
657 * CONTEXT:
658 * spin_lock_irq(rq->lock)
659 */
660void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
661{
662 struct worker *worker = kthread_data(task);
663
664 if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
665 atomic_inc(get_gcwq_nr_running(cpu));
666}
667
668/**
669 * wq_worker_sleeping - a worker is going to sleep
670 * @task: task going to sleep
671 * @cpu: CPU in question, must be the current CPU number
672 *
673 * This function is called during schedule() when a busy worker is
674 * going to sleep. Worker on the same cpu can be woken up by
675 * returning pointer to its task.
676 *
677 * CONTEXT:
678 * spin_lock_irq(rq->lock)
679 *
680 * RETURNS:
681 * Worker task on @cpu to wake up, %NULL if none.
682 */
683struct task_struct *wq_worker_sleeping(struct task_struct *task,
684 unsigned int cpu)
685{
686 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
687 struct global_cwq *gcwq = get_gcwq(cpu);
688 atomic_t *nr_running = get_gcwq_nr_running(cpu);
689
690 if (unlikely(worker->flags & WORKER_NOT_RUNNING))
691 return NULL;
692
693 /* this can only happen on the local cpu */
694 BUG_ON(cpu != raw_smp_processor_id());
695
696 /*
697 * The counterpart of the following dec_and_test, implied mb,
698 * worklist not empty test sequence is in insert_work().
699 * Please read comment there.
700 *
701 * NOT_RUNNING is clear. This means that trustee is not in
702 * charge and we're running on the local cpu w/ rq lock held
703 * and preemption disabled, which in turn means that no one else
704 * could be manipulating idle_list, so dereferencing idle_list
705 * without gcwq lock is safe.
706 */
707 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
708 to_wakeup = first_worker(gcwq);
709 return to_wakeup ? to_wakeup->task : NULL;
710}
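
These two functions are the scheduler-facing half of concurrency management. Roughly, and only as a sketch (the authoritative wiring lives in kernel/sched.c and may differ in detail; try_to_wake_up_local() is assumed here to be the rq-lock-held wakeup primitive added alongside these hooks), schedule() is expected to notice a blocking PF_WQ_WORKER task and hand the cpu to another worker:

    /* sketch of the expected caller inside schedule(), not verbatim */
    if (prev->flags & PF_WQ_WORKER) {
            struct task_struct *to_wakeup;

            to_wakeup = wq_worker_sleeping(prev, cpu);
            if (to_wakeup)
                    try_to_wake_up_local(to_wakeup);   /* keep the cpu busy */
    }
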
711
712/**
713 * worker_set_flags - set worker flags and adjust nr_running accordingly
714 * @worker: self
715 * @flags: flags to set
716 * @wakeup: wakeup an idle worker if necessary
717 *
718 * Set @flags in @worker->flags and adjust nr_running accordingly. If
719 * nr_running becomes zero and @wakeup is %true, an idle worker is
720 * woken up.
721 *
722 * CONTEXT:
723 * spin_lock_irq(gcwq->lock)
234 */ 724 */
235static inline void clear_wq_data(struct work_struct *work) 725static inline void worker_set_flags(struct worker *worker, unsigned int flags,
726 bool wakeup)
236{ 727{
237 unsigned long flags = *work_data_bits(work) & 728 struct global_cwq *gcwq = worker->gcwq;
238 (1UL << WORK_STRUCT_STATIC); 729
239 atomic_long_set(&work->data, flags); 730 WARN_ON_ONCE(worker->task != current);
731
732 /*
733 * If transitioning into NOT_RUNNING, adjust nr_running and
734 * wake up an idle worker as necessary if requested by
735 * @wakeup.
736 */
737 if ((flags & WORKER_NOT_RUNNING) &&
738 !(worker->flags & WORKER_NOT_RUNNING)) {
739 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
740
741 if (wakeup) {
742 if (atomic_dec_and_test(nr_running) &&
743 !list_empty(&gcwq->worklist))
744 wake_up_worker(gcwq);
745 } else
746 atomic_dec(nr_running);
747 }
748
749 worker->flags |= flags;
240} 750}
241 751
242static inline 752/**
243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 753 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
754 * @worker: self
755 * @flags: flags to clear
756 *
757 * Clear @flags in @worker->flags and adjust nr_running accordingly.
758 *
759 * CONTEXT:
760 * spin_lock_irq(gcwq->lock)
761 */
762static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
244{ 763{
245 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 764 struct global_cwq *gcwq = worker->gcwq;
765 unsigned int oflags = worker->flags;
766
767 WARN_ON_ONCE(worker->task != current);
768
769 worker->flags &= ~flags;
770
771 /* if transitioning out of NOT_RUNNING, increment nr_running */
772 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
773 if (!(worker->flags & WORKER_NOT_RUNNING))
774 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
246} 775}
247 776
777/**
778 * busy_worker_head - return the busy hash head for a work
779 * @gcwq: gcwq of interest
780 * @work: work to be hashed
781 *
782 * Return hash head of @gcwq for @work.
783 *
784 * CONTEXT:
785 * spin_lock_irq(gcwq->lock).
786 *
787 * RETURNS:
788 * Pointer to the hash head.
789 */
790static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
791 struct work_struct *work)
792{
793 const int base_shift = ilog2(sizeof(struct work_struct));
794 unsigned long v = (unsigned long)work;
795
796 /* simple shift and fold hash, do we need something better? */
797 v >>= base_shift;
798 v += v >> BUSY_WORKER_HASH_ORDER;
799 v &= BUSY_WORKER_HASH_MASK;
800
801 return &gcwq->busy_hash[v];
802}
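
A small stand-alone rendition of the shift-and-fold hash, useful for convincing yourself that consecutively allocated work_structs land in different buckets. BUSY_WORKER_HASH_ORDER is assumed here to be 6, matching the enum earlier in the file; treat both constants and the 64-byte spacing as assumptions:

    #include <stdio.h>

    #define BUSY_WORKER_HASH_ORDER  6       /* assumed, see the enum above */
    #define BUSY_WORKER_HASH_SIZE   (1 << BUSY_WORKER_HASH_ORDER)
    #define BUSY_WORKER_HASH_MASK   (BUSY_WORKER_HASH_SIZE - 1)

    static unsigned long busy_hash(unsigned long work_addr, int base_shift)
    {
            unsigned long v = work_addr >> base_shift;

            v += v >> BUSY_WORKER_HASH_ORDER;
            return v & BUSY_WORKER_HASH_MASK;
    }

    int main(void)
    {
            /* pretend work_structs sit 64 bytes apart, so base_shift == 6 */
            for (unsigned long a = 0x1000; a < 0x1000 + 8 * 64; a += 64)
                    printf("%#lx -> bucket %lu\n", a, busy_hash(a, 6));
            return 0;
    }
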
803
804/**
805 * __find_worker_executing_work - find worker which is executing a work
806 * @gcwq: gcwq of interest
807 * @bwh: hash head as returned by busy_worker_head()
808 * @work: work to find worker for
809 *
810 * Find a worker which is executing @work on @gcwq. @bwh should be
811 * the hash head obtained by calling busy_worker_head() with the same
812 * work.
813 *
814 * CONTEXT:
815 * spin_lock_irq(gcwq->lock).
816 *
817 * RETURNS:
818 * Pointer to worker which is executing @work if found, NULL
819 * otherwise.
820 */
821static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
822 struct hlist_head *bwh,
823 struct work_struct *work)
824{
825 struct worker *worker;
826 struct hlist_node *tmp;
827
828 hlist_for_each_entry(worker, tmp, bwh, hentry)
829 if (worker->current_work == work)
830 return worker;
831 return NULL;
832}
833
834/**
835 * find_worker_executing_work - find worker which is executing a work
836 * @gcwq: gcwq of interest
837 * @work: work to find worker for
838 *
839 * Find a worker which is executing @work on @gcwq. This function is
840 * identical to __find_worker_executing_work() except that this
841 * function calculates @bwh itself.
842 *
843 * CONTEXT:
844 * spin_lock_irq(gcwq->lock).
845 *
846 * RETURNS:
847 * Pointer to worker which is executing @work if found, NULL
848 * otherwise.
849 */
850static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
851 struct work_struct *work)
852{
853 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
854 work);
855}
856
857/**
858 * gcwq_determine_ins_pos - find insertion position
859 * @gcwq: gcwq of interest
860 * @cwq: cwq a work is being queued for
861 *
862 * A work for @cwq is about to be queued on @gcwq, determine insertion
863 * position for the work. If @cwq is for HIGHPRI wq, the work is
864 * queued at the head of the queue but in FIFO order with respect to
865 * other HIGHPRI works; otherwise, at the end of the queue. This
866 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
867 * there are HIGHPRI works pending.
868 *
869 * CONTEXT:
870 * spin_lock_irq(gcwq->lock).
871 *
872 * RETURNS:
873 * Pointer to insertion position.
874 */
875static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
876 struct cpu_workqueue_struct *cwq)
877{
878 struct work_struct *twork;
879
880 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
881 return &gcwq->worklist;
882
883 list_for_each_entry(twork, &gcwq->worklist, entry) {
884 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
885
886 if (!(tcwq->wq->flags & WQ_HIGHPRI))
887 break;
888 }
889
890 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
891 return &twork->entry;
892}
893
894/**
895 * insert_work - insert a work into gcwq
896 * @cwq: cwq @work belongs to
897 * @work: work to insert
898 * @head: insertion point
899 * @extra_flags: extra WORK_STRUCT_* flags to set
900 *
901 * Insert @work which belongs to @cwq into @gcwq after @head.
902 * @extra_flags is or'd to work_struct flags.
903 *
904 * CONTEXT:
905 * spin_lock_irq(gcwq->lock).
906 */
248static void insert_work(struct cpu_workqueue_struct *cwq, 907static void insert_work(struct cpu_workqueue_struct *cwq,
249 struct work_struct *work, struct list_head *head) 908 struct work_struct *work, struct list_head *head,
909 unsigned int extra_flags)
250{ 910{
251 trace_workqueue_insertion(cwq->thread, work); 911 struct global_cwq *gcwq = cwq->gcwq;
912
913 /* we own @work, set data and link */
914 set_work_cwq(work, cwq, extra_flags);
252 915
253 set_wq_data(work, cwq);
254 /* 916 /*
255 * Ensure that we get the right work->data if we see the 917 * Ensure that we get the right work->data if we see the
256 * result of list_add() below, see try_to_grab_pending(). 918 * result of list_add() below, see try_to_grab_pending().
257 */ 919 */
258 smp_wmb(); 920 smp_wmb();
921
259 list_add_tail(&work->entry, head); 922 list_add_tail(&work->entry, head);
260 wake_up(&cwq->more_work); 923
924 /*
925 * Ensure either wq_worker_sleeping() sees the above
926 * list_add_tail() or we see zero nr_running to avoid workers
927 * lying around lazily while there are works to be processed.
928 */
929 smp_mb();
930
931 if (__need_more_worker(gcwq))
932 wake_up_worker(gcwq);
261} 933}
262 934
263static void __queue_work(struct cpu_workqueue_struct *cwq, 935static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
264 struct work_struct *work) 936 struct work_struct *work)
265{ 937{
938 struct global_cwq *gcwq;
939 struct cpu_workqueue_struct *cwq;
940 struct list_head *worklist;
941 unsigned int work_flags;
266 unsigned long flags; 942 unsigned long flags;
267 943
268 debug_work_activate(work); 944 debug_work_activate(work);
269 spin_lock_irqsave(&cwq->lock, flags); 945
270 insert_work(cwq, work, &cwq->worklist); 946 if (WARN_ON_ONCE(wq->flags & WQ_DYING))
271 spin_unlock_irqrestore(&cwq->lock, flags); 947 return;
948
949 /* determine gcwq to use */
950 if (!(wq->flags & WQ_UNBOUND)) {
951 struct global_cwq *last_gcwq;
952
953 if (unlikely(cpu == WORK_CPU_UNBOUND))
954 cpu = raw_smp_processor_id();
955
956 /*
957 * It's multi cpu. If @wq is non-reentrant and @work
958 * was previously on a different cpu, it might still
959 * be running there, in which case the work needs to
960 * be queued on that cpu to guarantee non-reentrance.
961 */
962 gcwq = get_gcwq(cpu);
963 if (wq->flags & WQ_NON_REENTRANT &&
964 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
965 struct worker *worker;
966
967 spin_lock_irqsave(&last_gcwq->lock, flags);
968
969 worker = find_worker_executing_work(last_gcwq, work);
970
971 if (worker && worker->current_cwq->wq == wq)
972 gcwq = last_gcwq;
973 else {
974 /* meh... not running there, queue here */
975 spin_unlock_irqrestore(&last_gcwq->lock, flags);
976 spin_lock_irqsave(&gcwq->lock, flags);
977 }
978 } else
979 spin_lock_irqsave(&gcwq->lock, flags);
980 } else {
981 gcwq = get_gcwq(WORK_CPU_UNBOUND);
982 spin_lock_irqsave(&gcwq->lock, flags);
983 }
984
985 /* gcwq determined, get cwq and queue */
986 cwq = get_cwq(gcwq->cpu, wq);
987 trace_workqueue_queue_work(cpu, cwq, work);
988
989 BUG_ON(!list_empty(&work->entry));
990
991 cwq->nr_in_flight[cwq->work_color]++;
992 work_flags = work_color_to_flags(cwq->work_color);
993
994 if (likely(cwq->nr_active < cwq->max_active)) {
995 trace_workqueue_activate_work(work);
996 cwq->nr_active++;
997 worklist = gcwq_determine_ins_pos(gcwq, cwq);
998 } else {
999 work_flags |= WORK_STRUCT_DELAYED;
1000 worklist = &cwq->delayed_works;
1001 }
1002
1003 insert_work(cwq, work, worklist, work_flags);
1004
1005 spin_unlock_irqrestore(&gcwq->lock, flags);
272} 1006}
273 1007
274/** 1008/**
@@ -308,9 +1042,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
308{ 1042{
309 int ret = 0; 1043 int ret = 0;
310 1044
311 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1045 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
312 BUG_ON(!list_empty(&work->entry)); 1046 __queue_work(cpu, wq, work);
313 __queue_work(wq_per_cpu(wq, cpu), work);
314 ret = 1; 1047 ret = 1;
315 } 1048 }
316 return ret; 1049 return ret;
@@ -320,10 +1053,9 @@ EXPORT_SYMBOL_GPL(queue_work_on);
320static void delayed_work_timer_fn(unsigned long __data) 1053static void delayed_work_timer_fn(unsigned long __data)
321{ 1054{
322 struct delayed_work *dwork = (struct delayed_work *)__data; 1055 struct delayed_work *dwork = (struct delayed_work *)__data;
323 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); 1056 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
324 struct workqueue_struct *wq = cwq->wq;
325 1057
326 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); 1058 __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
327} 1059}
328 1060
329/** 1061/**
@@ -360,14 +1092,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
360 struct timer_list *timer = &dwork->timer; 1092 struct timer_list *timer = &dwork->timer;
361 struct work_struct *work = &dwork->work; 1093 struct work_struct *work = &dwork->work;
362 1094
363 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1095 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1096 unsigned int lcpu;
1097
364 BUG_ON(timer_pending(timer)); 1098 BUG_ON(timer_pending(timer));
365 BUG_ON(!list_empty(&work->entry)); 1099 BUG_ON(!list_empty(&work->entry));
366 1100
367 timer_stats_timer_set_start_info(&dwork->timer); 1101 timer_stats_timer_set_start_info(&dwork->timer);
368 1102
369 /* This stores cwq for the moment, for the timer_fn */ 1103 /*
370 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 1104 * This stores cwq for the moment, for the timer_fn.
1105 * Note that the work's gcwq is preserved to allow
1106 * reentrance detection for delayed works.
1107 */
1108 if (!(wq->flags & WQ_UNBOUND)) {
1109 struct global_cwq *gcwq = get_work_gcwq(work);
1110
1111 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1112 lcpu = gcwq->cpu;
1113 else
1114 lcpu = raw_smp_processor_id();
1115 } else
1116 lcpu = WORK_CPU_UNBOUND;
1117
1118 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1119
371 timer->expires = jiffies + delay; 1120 timer->expires = jiffies + delay;
372 timer->data = (unsigned long)dwork; 1121 timer->data = (unsigned long)dwork;
373 timer->function = delayed_work_timer_fn; 1122 timer->function = delayed_work_timer_fn;
@@ -382,80 +1131,889 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
382} 1131}
383EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1132EXPORT_SYMBOL_GPL(queue_delayed_work_on);
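
For callers, the queueing entry points keep their familiar shape. A minimal, hypothetical user (all driver-side names below are invented) still looks like this:

    #include <linux/workqueue.h>

    static void my_work_fn(struct work_struct *work)
    {
            /* runs in process context on whichever gcwq worker picks it up */
    }

    static DECLARE_WORK(my_work, my_work_fn);
    static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);

    static void example_kick(void)
    {
            schedule_work(&my_work);                           /* goes to system_wq */
            queue_delayed_work(system_long_wq, &my_dwork, HZ); /* ~1s from now */
    }
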
384 1133
385static void run_workqueue(struct cpu_workqueue_struct *cwq) 1134/**
1135 * worker_enter_idle - enter idle state
1136 * @worker: worker which is entering idle state
1137 *
1138 * @worker is entering idle state. Update stats and idle timer if
1139 * necessary.
1140 *
1141 * LOCKING:
1142 * spin_lock_irq(gcwq->lock).
1143 */
1144static void worker_enter_idle(struct worker *worker)
386{ 1145{
387 spin_lock_irq(&cwq->lock); 1146 struct global_cwq *gcwq = worker->gcwq;
388 while (!list_empty(&cwq->worklist)) { 1147
389 struct work_struct *work = list_entry(cwq->worklist.next, 1148 BUG_ON(worker->flags & WORKER_IDLE);
390 struct work_struct, entry); 1149 BUG_ON(!list_empty(&worker->entry) &&
391 work_func_t f = work->func; 1150 (worker->hentry.next || worker->hentry.pprev));
392#ifdef CONFIG_LOCKDEP 1151
1152 /* can't use worker_set_flags(), also called from start_worker() */
1153 worker->flags |= WORKER_IDLE;
1154 gcwq->nr_idle++;
1155 worker->last_active = jiffies;
1156
1157 /* idle_list is LIFO */
1158 list_add(&worker->entry, &gcwq->idle_list);
1159
1160 if (likely(!(worker->flags & WORKER_ROGUE))) {
1161 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1162 mod_timer(&gcwq->idle_timer,
1163 jiffies + IDLE_WORKER_TIMEOUT);
1164 } else
1165 wake_up_all(&gcwq->trustee_wait);
1166
1167 /* sanity check nr_running */
1168 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1169 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1170}
1171
1172/**
1173 * worker_leave_idle - leave idle state
1174 * @worker: worker which is leaving idle state
1175 *
1176 * @worker is leaving idle state. Update stats.
1177 *
1178 * LOCKING:
1179 * spin_lock_irq(gcwq->lock).
1180 */
1181static void worker_leave_idle(struct worker *worker)
1182{
1183 struct global_cwq *gcwq = worker->gcwq;
1184
1185 BUG_ON(!(worker->flags & WORKER_IDLE));
1186 worker_clr_flags(worker, WORKER_IDLE);
1187 gcwq->nr_idle--;
1188 list_del_init(&worker->entry);
1189}
1190
1191/**
1192 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1193 * @worker: self
1194 *
1195 * Works which are scheduled while the cpu is online must at least be
1196 * scheduled to a worker which is bound to the cpu so that if they are
1197 * flushed from cpu callbacks while cpu is going down, they are
1198 * guaranteed to execute on the cpu.
1199 *
1200 * This function is to be used by rogue workers and rescuers to bind
1201 * themselves to the target cpu and may race with cpu going down or
1202 * coming online. kthread_bind() can't be used because it may put the
1203 * worker on an already dead cpu and set_cpus_allowed_ptr() can't be used
1204 * verbatim as it's best effort and blocking and gcwq may be
1205 * [dis]associated in the meantime.
1206 *
1207 * This function tries set_cpus_allowed() and locks gcwq and verifies
1208 * the binding against GCWQ_DISASSOCIATED which is set during
1209 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1210 * idle state or fetches works without dropping lock, it can guarantee
1211 * the scheduling requirement described in the first paragraph.
1212 *
1213 * CONTEXT:
1214 * Might sleep. Called without any lock but returns with gcwq->lock
1215 * held.
1216 *
1217 * RETURNS:
1218 * %true if the associated gcwq is online (@worker is successfully
1219 * bound), %false if offline.
1220 */
1221static bool worker_maybe_bind_and_lock(struct worker *worker)
1222__acquires(&gcwq->lock)
1223{
1224 struct global_cwq *gcwq = worker->gcwq;
1225 struct task_struct *task = worker->task;
1226
1227 while (true) {
393 /* 1228 /*
394 * It is permissible to free the struct work_struct 1229 * The following call may fail, succeed or succeed
395 * from inside the function that is called from it, 1230 * without actually migrating the task to the cpu if
396 * this we need to take into account for lockdep too. 1231 * it races with cpu hotunplug operation. Verify
397 * To avoid bogus "held lock freed" warnings as well 1232 * against GCWQ_DISASSOCIATED.
398 * as problems when looking into work->lockdep_map,
399 * make a copy and use that here.
400 */ 1233 */
401 struct lockdep_map lockdep_map = work->lockdep_map; 1234 if (!(gcwq->flags & GCWQ_DISASSOCIATED))
402#endif 1235 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
403 trace_workqueue_execution(cwq->thread, work); 1236
404 debug_work_deactivate(work); 1237 spin_lock_irq(&gcwq->lock);
405 cwq->current_work = work; 1238 if (gcwq->flags & GCWQ_DISASSOCIATED)
406 list_del_init(cwq->worklist.next); 1239 return false;
407 spin_unlock_irq(&cwq->lock); 1240 if (task_cpu(task) == gcwq->cpu &&
408 1241 cpumask_equal(&current->cpus_allowed,
409 BUG_ON(get_wq_data(work) != cwq); 1242 get_cpu_mask(gcwq->cpu)))
410 work_clear_pending(work); 1243 return true;
411 lock_map_acquire(&cwq->wq->lockdep_map); 1244 spin_unlock_irq(&gcwq->lock);
412 lock_map_acquire(&lockdep_map); 1245
413 f(work); 1246 /* CPU has come up in between, retry migration */
414 lock_map_release(&lockdep_map); 1247 cpu_relax();
415 lock_map_release(&cwq->wq->lockdep_map); 1248 }
416 1249}
417 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 1250
418 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 1251/*
419 "%s/0x%08x/%d\n", 1252 * Function for worker->rebind_work used to rebind rogue busy workers
420 current->comm, preempt_count(), 1253 * to the associated cpu which is coming back online. This is
421 task_pid_nr(current)); 1254 * scheduled by cpu up but can race with other cpu hotplug operations
422 printk(KERN_ERR " last function: "); 1255 * and may be executed twice without intervening cpu down.
423 print_symbol("%s\n", (unsigned long)f); 1256 */
424 debug_show_held_locks(current); 1257static void worker_rebind_fn(struct work_struct *work)
425 dump_stack(); 1258{
1259 struct worker *worker = container_of(work, struct worker, rebind_work);
1260 struct global_cwq *gcwq = worker->gcwq;
1261
1262 if (worker_maybe_bind_and_lock(worker))
1263 worker_clr_flags(worker, WORKER_REBIND);
1264
1265 spin_unlock_irq(&gcwq->lock);
1266}
1267
1268static struct worker *alloc_worker(void)
1269{
1270 struct worker *worker;
1271
1272 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1273 if (worker) {
1274 INIT_LIST_HEAD(&worker->entry);
1275 INIT_LIST_HEAD(&worker->scheduled);
1276 INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1277 /* on creation a worker is in !idle && prep state */
1278 worker->flags = WORKER_PREP;
1279 }
1280 return worker;
1281}
1282
1283/**
1284 * create_worker - create a new workqueue worker
1285 * @gcwq: gcwq the new worker will belong to
1286 * @bind: whether to set affinity to @cpu or not
1287 *
1288 * Create a new worker which is bound to @gcwq. The returned worker
1289 * can be started by calling start_worker() or destroyed using
1290 * destroy_worker().
1291 *
1292 * CONTEXT:
1293 * Might sleep. Does GFP_KERNEL allocations.
1294 *
1295 * RETURNS:
1296 * Pointer to the newly created worker.
1297 */
1298static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1299{
1300 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1301 struct worker *worker = NULL;
1302 int id = -1;
1303
1304 spin_lock_irq(&gcwq->lock);
1305 while (ida_get_new(&gcwq->worker_ida, &id)) {
1306 spin_unlock_irq(&gcwq->lock);
1307 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1308 goto fail;
1309 spin_lock_irq(&gcwq->lock);
1310 }
1311 spin_unlock_irq(&gcwq->lock);
1312
1313 worker = alloc_worker();
1314 if (!worker)
1315 goto fail;
1316
1317 worker->gcwq = gcwq;
1318 worker->id = id;
1319
1320 if (!on_unbound_cpu)
1321 worker->task = kthread_create(worker_thread, worker,
1322 "kworker/%u:%d", gcwq->cpu, id);
1323 else
1324 worker->task = kthread_create(worker_thread, worker,
1325 "kworker/u:%d", id);
1326 if (IS_ERR(worker->task))
1327 goto fail;
1328
1329 /*
1330 * A rogue worker will become a regular one if CPU comes
1331 * online later on. Make sure every worker has
1332 * PF_THREAD_BOUND set.
1333 */
1334 if (bind && !on_unbound_cpu)
1335 kthread_bind(worker->task, gcwq->cpu);
1336 else {
1337 worker->task->flags |= PF_THREAD_BOUND;
1338 if (on_unbound_cpu)
1339 worker->flags |= WORKER_UNBOUND;
1340 }
1341
1342 return worker;
1343fail:
1344 if (id >= 0) {
1345 spin_lock_irq(&gcwq->lock);
1346 ida_remove(&gcwq->worker_ida, id);
1347 spin_unlock_irq(&gcwq->lock);
1348 }
1349 kfree(worker);
1350 return NULL;
1351}
1352
1353/**
1354 * start_worker - start a newly created worker
1355 * @worker: worker to start
1356 *
1357 * Make the gcwq aware of @worker and start it.
1358 *
1359 * CONTEXT:
1360 * spin_lock_irq(gcwq->lock).
1361 */
1362static void start_worker(struct worker *worker)
1363{
1364 worker->flags |= WORKER_STARTED;
1365 worker->gcwq->nr_workers++;
1366 worker_enter_idle(worker);
1367 wake_up_process(worker->task);
1368}
1369
1370/**
1371 * destroy_worker - destroy a workqueue worker
1372 * @worker: worker to be destroyed
1373 *
1374 * Destroy @worker and adjust @gcwq stats accordingly.
1375 *
1376 * CONTEXT:
1377 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1378 */
1379static void destroy_worker(struct worker *worker)
1380{
1381 struct global_cwq *gcwq = worker->gcwq;
1382 int id = worker->id;
1383
1384 /* sanity check frenzy */
1385 BUG_ON(worker->current_work);
1386 BUG_ON(!list_empty(&worker->scheduled));
1387
1388 if (worker->flags & WORKER_STARTED)
1389 gcwq->nr_workers--;
1390 if (worker->flags & WORKER_IDLE)
1391 gcwq->nr_idle--;
1392
1393 list_del_init(&worker->entry);
1394 worker->flags |= WORKER_DIE;
1395
1396 spin_unlock_irq(&gcwq->lock);
1397
1398 kthread_stop(worker->task);
1399 kfree(worker);
1400
1401 spin_lock_irq(&gcwq->lock);
1402 ida_remove(&gcwq->worker_ida, id);
1403}
1404
1405static void idle_worker_timeout(unsigned long __gcwq)
1406{
1407 struct global_cwq *gcwq = (void *)__gcwq;
1408
1409 spin_lock_irq(&gcwq->lock);
1410
1411 if (too_many_workers(gcwq)) {
1412 struct worker *worker;
1413 unsigned long expires;
1414
1415 /* idle_list is kept in LIFO order, check the last one */
1416 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1417 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1418
1419 if (time_before(jiffies, expires))
1420 mod_timer(&gcwq->idle_timer, expires);
1421 else {
1422 /* it's been idle for too long, wake up manager */
1423 gcwq->flags |= GCWQ_MANAGE_WORKERS;
1424 wake_up_worker(gcwq);
426 } 1425 }
1426 }
1427
1428 spin_unlock_irq(&gcwq->lock);
1429}
1430
1431static bool send_mayday(struct work_struct *work)
1432{
1433 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1434 struct workqueue_struct *wq = cwq->wq;
1435 unsigned int cpu;
1436
1437 if (!(wq->flags & WQ_RESCUER))
1438 return false;
1439
1440 /* mayday mayday mayday */
1441 cpu = cwq->gcwq->cpu;
1442 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1443 if (cpu == WORK_CPU_UNBOUND)
1444 cpu = 0;
1445 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1446 wake_up_process(wq->rescuer->task);
1447 return true;
1448}
1449
1450static void gcwq_mayday_timeout(unsigned long __gcwq)
1451{
1452 struct global_cwq *gcwq = (void *)__gcwq;
1453 struct work_struct *work;
1454
1455 spin_lock_irq(&gcwq->lock);
427 1456
428 spin_lock_irq(&cwq->lock); 1457 if (need_to_create_worker(gcwq)) {
429 cwq->current_work = NULL; 1458 /*
1459 * We've been trying to create a new worker but
1460 * haven't been successful. We might be hitting an
1461 * allocation deadlock. Send distress signals to
1462 * rescuers.
1463 */
1464 list_for_each_entry(work, &gcwq->worklist, entry)
1465 send_mayday(work);
430 } 1466 }
431 spin_unlock_irq(&cwq->lock); 1467
1468 spin_unlock_irq(&gcwq->lock);
1469
1470 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
432} 1471}
433 1472
434static int worker_thread(void *__cwq) 1473/**
1474 * maybe_create_worker - create a new worker if necessary
1475 * @gcwq: gcwq to create a new worker for
1476 *
1477 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
1478 * have at least one idle worker on return from this function. If
1479 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1480 * sent to all rescuers with works scheduled on @gcwq to resolve
1481 * possible allocation deadlock.
1482 *
1483 * On return, need_to_create_worker() is guaranteed to be false and
1484 * may_start_working() true.
1485 *
1486 * LOCKING:
1487 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1488 * multiple times. Does GFP_KERNEL allocations. Called only from
1489 * manager.
1490 *
1491 * RETURNS:
1492 * false if no action was taken and gcwq->lock stayed locked, true
1493 * otherwise.
1494 */
1495static bool maybe_create_worker(struct global_cwq *gcwq)
1496__releases(&gcwq->lock)
1497__acquires(&gcwq->lock)
435{ 1498{
436 struct cpu_workqueue_struct *cwq = __cwq; 1499 if (!need_to_create_worker(gcwq))
437 DEFINE_WAIT(wait); 1500 return false;
1501restart:
1502 spin_unlock_irq(&gcwq->lock);
1503
1504 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1505 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1506
1507 while (true) {
1508 struct worker *worker;
1509
1510 worker = create_worker(gcwq, true);
1511 if (worker) {
1512 del_timer_sync(&gcwq->mayday_timer);
1513 spin_lock_irq(&gcwq->lock);
1514 start_worker(worker);
1515 BUG_ON(need_to_create_worker(gcwq));
1516 return true;
1517 }
1518
1519 if (!need_to_create_worker(gcwq))
1520 break;
1521
1522 __set_current_state(TASK_INTERRUPTIBLE);
1523 schedule_timeout(CREATE_COOLDOWN);
1524
1525 if (!need_to_create_worker(gcwq))
1526 break;
1527 }
1528
1529 del_timer_sync(&gcwq->mayday_timer);
1530 spin_lock_irq(&gcwq->lock);
1531 if (need_to_create_worker(gcwq))
1532 goto restart;
1533 return true;
1534}
438 1535
439 if (cwq->wq->freezeable) 1536/**
440 set_freezable(); 1537 * maybe_destroy_worker - destroy workers which have been idle for a while
1538 * @gcwq: gcwq to destroy workers for
1539 *
1540 * Destroy @gcwq workers which have been idle for longer than
1541 * IDLE_WORKER_TIMEOUT.
1542 *
1543 * LOCKING:
1544 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1545 * multiple times. Called only from manager.
1546 *
1547 * RETURNS:
1548 * false if no action was taken and gcwq->lock stayed locked, true
1549 * otherwise.
1550 */
1551static bool maybe_destroy_workers(struct global_cwq *gcwq)
1552{
1553 bool ret = false;
441 1554
442 for (;;) { 1555 while (too_many_workers(gcwq)) {
443 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 1556 struct worker *worker;
444 if (!freezing(current) && 1557 unsigned long expires;
445 !kthread_should_stop() &&
446 list_empty(&cwq->worklist))
447 schedule();
448 finish_wait(&cwq->more_work, &wait);
449 1558
450 try_to_freeze(); 1559 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1560 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
451 1561
452 if (kthread_should_stop()) 1562 if (time_before(jiffies, expires)) {
1563 mod_timer(&gcwq->idle_timer, expires);
453 break; 1564 break;
1565 }
454 1566
455 run_workqueue(cwq); 1567 destroy_worker(worker);
1568 ret = true;
456 } 1569 }
457 1570
458 return 0; 1571 return ret;
1572}
1573
1574/**
1575 * manage_workers - manage worker pool
1576 * @worker: self
1577 *
1578 * Assume the manager role and manage gcwq worker pool @worker belongs
1579 * to. At any given time, there can be only zero or one manager per
1580 * gcwq. The exclusion is handled automatically by this function.
1581 *
1582 * The caller can safely start processing works on false return. On
1583 * true return, it's guaranteed that need_to_create_worker() is false
1584 * and may_start_working() is true.
1585 *
1586 * CONTEXT:
1587 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1588 * multiple times. Does GFP_KERNEL allocations.
1589 *
1590 * RETURNS:
1591 * false if no action was taken and gcwq->lock stayed locked, true if
1592 * some action was taken.
1593 */
1594static bool manage_workers(struct worker *worker)
1595{
1596 struct global_cwq *gcwq = worker->gcwq;
1597 bool ret = false;
1598
1599 if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1600 return ret;
1601
1602 gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1603 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1604
1605 /*
1606 * Destroy and then create so that may_start_working() is true
1607 * on return.
1608 */
1609 ret |= maybe_destroy_workers(gcwq);
1610 ret |= maybe_create_worker(gcwq);
1611
1612 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1613
1614 /*
1615 * The trustee might be waiting to take over the manager
1616 * position, tell it we're done.
1617 */
1618 if (unlikely(gcwq->trustee))
1619 wake_up_all(&gcwq->trustee_wait);
1620
1621 return ret;
1622}
1623
1624/**
1625 * move_linked_works - move linked works to a list
1626 * @work: start of series of works to be scheduled
1627 * @head: target list to append @work to
1628 * @nextp: out parameter for nested worklist walking
1629 *
1630 * Schedule linked works starting from @work to @head. Work series to
1631 * be scheduled starts at @work and includes any consecutive work with
1632 * WORK_STRUCT_LINKED set in its predecessor.
1633 *
1634 * If @nextp is not NULL, it's updated to point to the next work of
1635 * the last scheduled work. This allows move_linked_works() to be
1636 * nested inside outer list_for_each_entry_safe().
1637 *
1638 * CONTEXT:
1639 * spin_lock_irq(gcwq->lock).
1640 */
1641static void move_linked_works(struct work_struct *work, struct list_head *head,
1642 struct work_struct **nextp)
1643{
1644 struct work_struct *n;
1645
1646 /*
1647 * Linked worklist will always end before the end of the list,
1648 * use NULL for list head.
1649 */
1650 list_for_each_entry_safe_from(work, n, NULL, entry) {
1651 list_move_tail(&work->entry, head);
1652 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1653 break;
1654 }
1655
1656 /*
1657 * If we're already inside safe list traversal and have moved
1658 * multiple works to the scheduled queue, the next position
1659 * needs to be updated.
1660 */
1661 if (nextp)
1662 *nextp = n;
1663}
1664
1665static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1666{
1667 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1668 struct work_struct, entry);
1669 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1670
1671 trace_workqueue_activate_work(work);
1672 move_linked_works(work, pos, NULL);
1673 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1674 cwq->nr_active++;
1675}
1676
1677/**
1678 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1679 * @cwq: cwq of interest
1680 * @color: color of work which left the queue
1681 * @delayed: for a delayed work
1682 *
1683 * A work either has completed or is removed from pending queue,
1684 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1685 *
1686 * CONTEXT:
1687 * spin_lock_irq(gcwq->lock).
1688 */
1689static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1690 bool delayed)
1691{
1692 /* ignore uncolored works */
1693 if (color == WORK_NO_COLOR)
1694 return;
1695
1696 cwq->nr_in_flight[color]--;
1697
1698 if (!delayed) {
1699 cwq->nr_active--;
1700 if (!list_empty(&cwq->delayed_works)) {
1701 /* one down, submit a delayed one */
1702 if (cwq->nr_active < cwq->max_active)
1703 cwq_activate_first_delayed(cwq);
1704 }
1705 }
1706
1707 /* is flush in progress and are we at the flushing tip? */
1708 if (likely(cwq->flush_color != color))
1709 return;
1710
1711 /* are there still in-flight works? */
1712 if (cwq->nr_in_flight[color])
1713 return;
1714
1715 /* this cwq is done, clear flush_color */
1716 cwq->flush_color = -1;
1717
1718 /*
1719 * If this was the last cwq, wake up the first flusher. It
1720 * will handle the rest.
1721 */
1722 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1723 complete(&cwq->wq->first_flusher->done);
1724}
1725
1726/**
1727 * process_one_work - process single work
1728 * @worker: self
1729 * @work: work to process
1730 *
1731 * Process @work. This function contains all the logic necessary to
1732 * process a single work including synchronization against and
1733 * interaction with other workers on the same cpu, queueing and
1734 * flushing. As long as context requirement is met, any worker can
1735 * call this function to process a work.
1736 *
1737 * CONTEXT:
1738 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1739 */
1740static void process_one_work(struct worker *worker, struct work_struct *work)
1741__releases(&gcwq->lock)
1742__acquires(&gcwq->lock)
1743{
1744 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1745 struct global_cwq *gcwq = cwq->gcwq;
1746 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1747 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1748 work_func_t f = work->func;
1749 int work_color;
1750 struct worker *collision;
1751#ifdef CONFIG_LOCKDEP
1752 /*
1753 * It is permissible to free the struct work_struct from
1754 * inside the function that is called from it, this we need to
1755 * take into account for lockdep too. To avoid bogus "held
1756 * lock freed" warnings as well as problems when looking into
1757 * work->lockdep_map, make a copy and use that here.
1758 */
1759 struct lockdep_map lockdep_map = work->lockdep_map;
1760#endif
1761 /*
1762 * A single work shouldn't be executed concurrently by
1763 * multiple workers on a single cpu. Check whether anyone is
1764 * already processing the work. If so, defer the work to the
1765 * currently executing one.
1766 */
1767 collision = __find_worker_executing_work(gcwq, bwh, work);
1768 if (unlikely(collision)) {
1769 move_linked_works(work, &collision->scheduled, NULL);
1770 return;
1771 }
1772
1773 /* claim and process */
1774 debug_work_deactivate(work);
1775 hlist_add_head(&worker->hentry, bwh);
1776 worker->current_work = work;
1777 worker->current_cwq = cwq;
1778 work_color = get_work_color(work);
1779
1780 /* record the current cpu number in the work data and dequeue */
1781 set_work_cpu(work, gcwq->cpu);
1782 list_del_init(&work->entry);
1783
1784 /*
1785 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1786 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1787 */
1788 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1789 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1790 struct work_struct, entry);
1791
1792 if (!list_empty(&gcwq->worklist) &&
1793 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1794 wake_up_worker(gcwq);
1795 else
1796 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1797 }
1798
1799 /*
1800 * CPU intensive works don't participate in concurrency
1801 * management. They're the scheduler's responsibility.
1802 */
1803 if (unlikely(cpu_intensive))
1804 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1805
1806 spin_unlock_irq(&gcwq->lock);
1807
1808 work_clear_pending(work);
1809 lock_map_acquire(&cwq->wq->lockdep_map);
1810 lock_map_acquire(&lockdep_map);
1811 trace_workqueue_execute_start(work);
1812 f(work);
1813 /*
1814 * While we must be careful to not use "work" after this, the trace
1815 * point will only record its address.
1816 */
1817 trace_workqueue_execute_end(work);
1818 lock_map_release(&lockdep_map);
1819 lock_map_release(&cwq->wq->lockdep_map);
1820
1821 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1822 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1823 "%s/0x%08x/%d\n",
1824 current->comm, preempt_count(), task_pid_nr(current));
1825 printk(KERN_ERR " last function: ");
1826 print_symbol("%s\n", (unsigned long)f);
1827 debug_show_held_locks(current);
1828 dump_stack();
1829 }
1830
1831 spin_lock_irq(&gcwq->lock);
1832
1833 /* clear cpu intensive status */
1834 if (unlikely(cpu_intensive))
1835 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1836
1837 /* we're done with it, release */
1838 hlist_del_init(&worker->hentry);
1839 worker->current_work = NULL;
1840 worker->current_cwq = NULL;
1841 cwq_dec_nr_in_flight(cwq, work_color, false);
1842}
1843
1844/**
1845 * process_scheduled_works - process scheduled works
1846 * @worker: self
1847 *
1848 * Process all scheduled works. Please note that the scheduled list
1849 * may change while processing a work, so this function repeatedly
1850 * fetches a work from the top and executes it.
1851 *
1852 * CONTEXT:
1853 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1854 * multiple times.
1855 */
1856static void process_scheduled_works(struct worker *worker)
1857{
1858 while (!list_empty(&worker->scheduled)) {
1859 struct work_struct *work = list_first_entry(&worker->scheduled,
1860 struct work_struct, entry);
1861 process_one_work(worker, work);
1862 }
1863}
1864
1865/**
1866 * worker_thread - the worker thread function
1867 * @__worker: self
1868 *
1869 * The gcwq worker thread function. There's a single dynamic pool of
1870 * these per each cpu. These workers process all works regardless of
1871 * their specific target workqueue. The only exception is works which
1872 * belong to workqueues with a rescuer which will be explained in
1873 * rescuer_thread().
1874 */
1875static int worker_thread(void *__worker)
1876{
1877 struct worker *worker = __worker;
1878 struct global_cwq *gcwq = worker->gcwq;
1879
1880 /* tell the scheduler that this is a workqueue worker */
1881 worker->task->flags |= PF_WQ_WORKER;
1882woke_up:
1883 spin_lock_irq(&gcwq->lock);
1884
1885 /* DIE can be set only while we're idle, checking here is enough */
1886 if (worker->flags & WORKER_DIE) {
1887 spin_unlock_irq(&gcwq->lock);
1888 worker->task->flags &= ~PF_WQ_WORKER;
1889 return 0;
1890 }
1891
1892 worker_leave_idle(worker);
1893recheck:
1894 /* no more worker necessary? */
1895 if (!need_more_worker(gcwq))
1896 goto sleep;
1897
1898 /* do we need to manage? */
1899 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1900 goto recheck;
1901
1902 /*
1903 * ->scheduled list can only be filled while a worker is
1904 * preparing to process a work or actually processing it.
1905 * Make sure nobody diddled with it while I was sleeping.
1906 */
1907 BUG_ON(!list_empty(&worker->scheduled));
1908
1909 /*
1910 * When control reaches this point, we're guaranteed to have
1911 * at least one idle worker or that someone else has already
1912 * assumed the manager role.
1913 */
1914 worker_clr_flags(worker, WORKER_PREP);
1915
1916 do {
1917 struct work_struct *work =
1918 list_first_entry(&gcwq->worklist,
1919 struct work_struct, entry);
1920
1921 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1922 /* optimization path, not strictly necessary */
1923 process_one_work(worker, work);
1924 if (unlikely(!list_empty(&worker->scheduled)))
1925 process_scheduled_works(worker);
1926 } else {
1927 move_linked_works(work, &worker->scheduled, NULL);
1928 process_scheduled_works(worker);
1929 }
1930 } while (keep_working(gcwq));
1931
1932 worker_set_flags(worker, WORKER_PREP, false);
1933sleep:
1934 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1935 goto recheck;
1936
1937 /*
1938 * gcwq->lock is held and there's no work to process and no
1939 * need to manage, sleep. Workers are woken up only while
1940 * holding gcwq->lock or from local cpu, so setting the
1941 * current state before releasing gcwq->lock is enough to
1942 * prevent losing any event.
1943 */
1944 worker_enter_idle(worker);
1945 __set_current_state(TASK_INTERRUPTIBLE);
1946 spin_unlock_irq(&gcwq->lock);
1947 schedule();
1948 goto woke_up;
1949}
1950
1951/**
1952 * rescuer_thread - the rescuer thread function
1953 * @__wq: the associated workqueue
1954 *
1955 * Workqueue rescuer thread function. There's one rescuer for each
1956 * workqueue which has WQ_RESCUER set.
1957 *
1958 * Regular work processing on a gcwq may block trying to create a new
1959 * worker, which uses a GFP_KERNEL allocation that has a slight chance of
1960 * developing into a deadlock if some works currently on the same queue
1961 * need to be processed to satisfy the GFP_KERNEL allocation. This is
1962 * the problem the rescuer solves.
1963 *
1964 * When such a condition is possible, the gcwq summons the rescuers of all
1965 * workqueues which have works queued on the gcwq and lets them process
1966 * those works so that forward progress can be guaranteed.
1967 *
1968 * This should happen rarely.
1969 */
1970static int rescuer_thread(void *__wq)
1971{
1972 struct workqueue_struct *wq = __wq;
1973 struct worker *rescuer = wq->rescuer;
1974 struct list_head *scheduled = &rescuer->scheduled;
1975 bool is_unbound = wq->flags & WQ_UNBOUND;
1976 unsigned int cpu;
1977
1978 set_user_nice(current, RESCUER_NICE_LEVEL);
1979repeat:
1980 set_current_state(TASK_INTERRUPTIBLE);
1981
1982 if (kthread_should_stop())
1983 return 0;
1984
1985 /*
1986 * See whether any cpu is asking for help. Unbound
1987 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
1988 */
1989 for_each_mayday_cpu(cpu, wq->mayday_mask) {
1990 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
1991 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
1992 struct global_cwq *gcwq = cwq->gcwq;
1993 struct work_struct *work, *n;
1994
1995 __set_current_state(TASK_RUNNING);
1996 mayday_clear_cpu(cpu, wq->mayday_mask);
1997
1998 /* migrate to the target cpu if possible */
1999 rescuer->gcwq = gcwq;
2000 worker_maybe_bind_and_lock(rescuer);
2001
2002 /*
2003 * Slurp in all works issued via this workqueue and
2004 * process'em.
2005 */
2006 BUG_ON(!list_empty(&rescuer->scheduled));
2007 list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2008 if (get_work_cwq(work) == cwq)
2009 move_linked_works(work, scheduled, &n);
2010
2011 process_scheduled_works(rescuer);
2012 spin_unlock_irq(&gcwq->lock);
2013 }
2014
2015 schedule();
2016 goto repeat;
459} 2017}
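A brief usage sketch (illustrative only, not part of this patch): a hypothetical driver whose work items sit in the memory-reclaim path would create its workqueue with WQ_MEM_RECLAIM, which this patch maps to WQ_RESCUER, so the rescuer described above exists. All foo_* names below are invented.

/* Sketch: a reclaim-safe workqueue for a hypothetical block driver. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct foo_dev {
        struct workqueue_struct *wb_wq; /* gets a rescuer via WQ_MEM_RECLAIM */
        struct work_struct wb_work;
};

static void foo_writeback_fn(struct work_struct *work)
{
        struct foo_dev *dev = container_of(work, struct foo_dev, wb_work);

        /* ... write back dirty data for @dev ... */
        (void)dev;
}

static int foo_dev_init(struct foo_dev *dev)
{
        /*
         * With WQ_MEM_RECLAIM a rescuer thread is guaranteed, so works
         * queued below can make progress even when new workers can't be
         * created under memory pressure.
         */
        dev->wb_wq = alloc_workqueue("foo_writeback", WQ_MEM_RECLAIM, 1);
        if (!dev->wb_wq)
                return -ENOMEM;
        INIT_WORK(&dev->wb_work, foo_writeback_fn);
        return 0;
}

Works queued on wb_wq are exactly the ones the loop above would move onto the rescuer's scheduled list if the mayday path fires.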
460 2018
461struct wq_barrier { 2019struct wq_barrier {
@@ -469,44 +2027,137 @@ static void wq_barrier_func(struct work_struct *work)
469 complete(&barr->done); 2027 complete(&barr->done);
470} 2028}
471 2029
2030/**
2031 * insert_wq_barrier - insert a barrier work
2032 * @cwq: cwq to insert barrier into
2033 * @barr: wq_barrier to insert
2034 * @target: target work to attach @barr to
2035 * @worker: worker currently executing @target, NULL if @target is not executing
2036 *
2037 * @barr is linked to @target such that @barr is completed only after
2038 * @target finishes execution. Please note that the ordering
2039 * guarantee is observed only with respect to @target and on the local
2040 * cpu.
2041 *
2042 * Currently, a queued barrier can't be canceled. This is because
2043 * try_to_grab_pending() can't determine whether the work to be
2044 * grabbed is at the head of the queue and thus can't clear the LINKED
2045 * flag of the previous work, while there must be a valid next work
2046 * after a work with the LINKED flag set.
2047 *
2048 * Note that when @worker is non-NULL, @target may be modified
2049 * underneath us, so we can't reliably determine cwq from @target.
2050 *
2051 * CONTEXT:
2052 * spin_lock_irq(gcwq->lock).
2053 */
472static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2054static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
473 struct wq_barrier *barr, struct list_head *head) 2055 struct wq_barrier *barr,
2056 struct work_struct *target, struct worker *worker)
474{ 2057{
2058 struct list_head *head;
2059 unsigned int linked = 0;
2060
475 /* 2061 /*
476 * debugobject calls are safe here even with cwq->lock locked 2062 * debugobject calls are safe here even with gcwq->lock locked
477 * as we know for sure that this will not trigger any of the 2063 * as we know for sure that this will not trigger any of the
478 * checks and call back into the fixup functions where we 2064 * checks and call back into the fixup functions where we
479 * might deadlock. 2065 * might deadlock.
480 */ 2066 */
481 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2067 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
482 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 2068 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
483
484 init_completion(&barr->done); 2069 init_completion(&barr->done);
485 2070
2071 /*
2072 * If @target is currently being executed, schedule the
2073 * barrier to the worker; otherwise, put it after @target.
2074 */
2075 if (worker)
2076 head = worker->scheduled.next;
2077 else {
2078 unsigned long *bits = work_data_bits(target);
2079
2080 head = target->entry.next;
2081 /* there can already be other linked works, inherit and set */
2082 linked = *bits & WORK_STRUCT_LINKED;
2083 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2084 }
2085
486 debug_work_activate(&barr->work); 2086 debug_work_activate(&barr->work);
487 insert_work(cwq, &barr->work, head); 2087 insert_work(cwq, &barr->work, head,
2088 work_color_to_flags(WORK_NO_COLOR) | linked);
488} 2089}
489 2090
490static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 2091/**
2092 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2093 * @wq: workqueue being flushed
2094 * @flush_color: new flush color, < 0 for no-op
2095 * @work_color: new work color, < 0 for no-op
2096 *
2097 * Prepare cwqs for workqueue flushing.
2098 *
2099 * If @flush_color is non-negative, flush_color on all cwqs should be
2100 * -1. If no cwq has in-flight commands at the specified color, all
2101 * cwq->flush_color's stay at -1 and %false is returned. If any cwq
2102 * has in flight commands, its cwq->flush_color is set to
2103 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2104 * wakeup logic is armed and %true is returned.
2105 *
2106 * The caller should have initialized @wq->first_flusher prior to
2107 * calling this function with non-negative @flush_color. If
2108 * @flush_color is negative, no flush color update is done and %false
2109 * is returned.
2110 *
2111 * If @work_color is non-negative, all cwqs should have the same
2112 * work_color which is previous to @work_color and all will be
2113 * advanced to @work_color.
2114 *
2115 * CONTEXT:
2116 * mutex_lock(wq->flush_mutex).
2117 *
2118 * RETURNS:
2119 * %true if @flush_color >= 0 and there's something to flush. %false
2120 * otherwise.
2121 */
2122static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2123 int flush_color, int work_color)
491{ 2124{
492 int active = 0; 2125 bool wait = false;
493 struct wq_barrier barr; 2126 unsigned int cpu;
494
495 WARN_ON(cwq->thread == current);
496 2127
497 spin_lock_irq(&cwq->lock); 2128 if (flush_color >= 0) {
498 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 2129 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
499 insert_wq_barrier(cwq, &barr, &cwq->worklist); 2130 atomic_set(&wq->nr_cwqs_to_flush, 1);
500 active = 1;
501 } 2131 }
502 spin_unlock_irq(&cwq->lock);
503 2132
504 if (active) { 2133 for_each_cwq_cpu(cpu, wq) {
505 wait_for_completion(&barr.done); 2134 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
506 destroy_work_on_stack(&barr.work); 2135 struct global_cwq *gcwq = cwq->gcwq;
2136
2137 spin_lock_irq(&gcwq->lock);
2138
2139 if (flush_color >= 0) {
2140 BUG_ON(cwq->flush_color != -1);
2141
2142 if (cwq->nr_in_flight[flush_color]) {
2143 cwq->flush_color = flush_color;
2144 atomic_inc(&wq->nr_cwqs_to_flush);
2145 wait = true;
2146 }
2147 }
2148
2149 if (work_color >= 0) {
2150 BUG_ON(work_color != work_next_color(cwq->work_color));
2151 cwq->work_color = work_color;
2152 }
2153
2154 spin_unlock_irq(&gcwq->lock);
507 } 2155 }
508 2156
509 return active; 2157 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2158 complete(&wq->first_flusher->done);
2159
2160 return wait;
510} 2161}
511 2162
512/** 2163/**
@@ -518,158 +2169,340 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
518 * 2169 *
519 * We sleep until all works which were queued on entry have been handled, 2170 * We sleep until all works which were queued on entry have been handled,
520 * but we are not livelocked by new incoming ones. 2171 * but we are not livelocked by new incoming ones.
521 *
522 * This function used to run the workqueues itself. Now we just wait for the
523 * helper threads to do it.
524 */ 2172 */
525void flush_workqueue(struct workqueue_struct *wq) 2173void flush_workqueue(struct workqueue_struct *wq)
526{ 2174{
527 const struct cpumask *cpu_map = wq_cpu_map(wq); 2175 struct wq_flusher this_flusher = {
528 int cpu; 2176 .list = LIST_HEAD_INIT(this_flusher.list),
2177 .flush_color = -1,
2178 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2179 };
2180 int next_color;
529 2181
530 might_sleep();
531 lock_map_acquire(&wq->lockdep_map); 2182 lock_map_acquire(&wq->lockdep_map);
532 lock_map_release(&wq->lockdep_map); 2183 lock_map_release(&wq->lockdep_map);
533 for_each_cpu(cpu, cpu_map) 2184
534 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 2185 mutex_lock(&wq->flush_mutex);
2186
2187 /*
2188 * Start-to-wait phase
2189 */
2190 next_color = work_next_color(wq->work_color);
2191
2192 if (next_color != wq->flush_color) {
2193 /*
2194 * Color space is not full. The current work_color
2195 * becomes our flush_color and work_color is advanced
2196 * by one.
2197 */
2198 BUG_ON(!list_empty(&wq->flusher_overflow));
2199 this_flusher.flush_color = wq->work_color;
2200 wq->work_color = next_color;
2201
2202 if (!wq->first_flusher) {
2203 /* no flush in progress, become the first flusher */
2204 BUG_ON(wq->flush_color != this_flusher.flush_color);
2205
2206 wq->first_flusher = &this_flusher;
2207
2208 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2209 wq->work_color)) {
2210 /* nothing to flush, done */
2211 wq->flush_color = next_color;
2212 wq->first_flusher = NULL;
2213 goto out_unlock;
2214 }
2215 } else {
2216 /* wait in queue */
2217 BUG_ON(wq->flush_color == this_flusher.flush_color);
2218 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2219 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2220 }
2221 } else {
2222 /*
2223 * Oops, color space is full, wait on overflow queue.
2224 * The next flush completion will assign us
2225 * flush_color and transfer to flusher_queue.
2226 */
2227 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2228 }
2229
2230 mutex_unlock(&wq->flush_mutex);
2231
2232 wait_for_completion(&this_flusher.done);
2233
2234 /*
2235 * Wake-up-and-cascade phase
2236 *
2237 * First flushers are responsible for cascading flushes and
2238 * handling overflow. Non-first flushers can simply return.
2239 */
2240 if (wq->first_flusher != &this_flusher)
2241 return;
2242
2243 mutex_lock(&wq->flush_mutex);
2244
2245 /* we might have raced, check again with mutex held */
2246 if (wq->first_flusher != &this_flusher)
2247 goto out_unlock;
2248
2249 wq->first_flusher = NULL;
2250
2251 BUG_ON(!list_empty(&this_flusher.list));
2252 BUG_ON(wq->flush_color != this_flusher.flush_color);
2253
2254 while (true) {
2255 struct wq_flusher *next, *tmp;
2256
2257 /* complete all the flushers sharing the current flush color */
2258 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2259 if (next->flush_color != wq->flush_color)
2260 break;
2261 list_del_init(&next->list);
2262 complete(&next->done);
2263 }
2264
2265 BUG_ON(!list_empty(&wq->flusher_overflow) &&
2266 wq->flush_color != work_next_color(wq->work_color));
2267
2268 /* this flush_color is finished, advance by one */
2269 wq->flush_color = work_next_color(wq->flush_color);
2270
2271 /* one color has been freed, handle overflow queue */
2272 if (!list_empty(&wq->flusher_overflow)) {
2273 /*
2274 * Assign the same color to all overflowed
2275 * flushers, advance work_color and append to
2276 * flusher_queue. This is the start-to-wait
2277 * phase for these overflowed flushers.
2278 */
2279 list_for_each_entry(tmp, &wq->flusher_overflow, list)
2280 tmp->flush_color = wq->work_color;
2281
2282 wq->work_color = work_next_color(wq->work_color);
2283
2284 list_splice_tail_init(&wq->flusher_overflow,
2285 &wq->flusher_queue);
2286 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2287 }
2288
2289 if (list_empty(&wq->flusher_queue)) {
2290 BUG_ON(wq->flush_color != wq->work_color);
2291 break;
2292 }
2293
2294 /*
2295 * Need to flush more colors. Make the next flusher
2296 * the new first flusher and arm cwqs.
2297 */
2298 BUG_ON(wq->flush_color == wq->work_color);
2299 BUG_ON(wq->flush_color != next->flush_color);
2300
2301 list_del_init(&next->list);
2302 wq->first_flusher = next;
2303
2304 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2305 break;
2306
2307 /*
2308 * Meh... this color is already done, clear first
2309 * flusher and repeat cascading.
2310 */
2311 wq->first_flusher = NULL;
2312 }
2313
2314out_unlock:
2315 mutex_unlock(&wq->flush_mutex);
535} 2316}
536EXPORT_SYMBOL_GPL(flush_workqueue); 2317EXPORT_SYMBOL_GPL(flush_workqueue);
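A minimal sketch of the caller side (hypothetical foo_wq, assumed to have been created elsewhere): every work queued before the call belongs to the flush color being drained, so it has finished by the time flush_workqueue() returns.

#include <linux/workqueue.h>

static struct workqueue_struct *foo_wq;     /* assumed: created elsewhere */

static void foo_shutdown(void)
{
        /*
         * Waits for all works queued on foo_wq up to this point; works
         * queued afterwards are not waited for, so there is no livelock.
         */
        flush_workqueue(foo_wq);
        /* now safe to release data those works may have referenced */
}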
537 2318
538/** 2319static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
539 * flush_work - block until a work_struct's callback has terminated 2320 bool wait_executing)
540 * @work: the work which is to be flushed
541 *
542 * Returns false if @work has already terminated.
543 *
544 * It is expected that, prior to calling flush_work(), the caller has
545 * arranged for the work to not be requeued, otherwise it doesn't make
546 * sense to use this function.
547 */
548int flush_work(struct work_struct *work)
549{ 2321{
2322 struct worker *worker = NULL;
2323 struct global_cwq *gcwq;
550 struct cpu_workqueue_struct *cwq; 2324 struct cpu_workqueue_struct *cwq;
551 struct list_head *prev;
552 struct wq_barrier barr;
553 2325
554 might_sleep(); 2326 might_sleep();
555 cwq = get_wq_data(work); 2327 gcwq = get_work_gcwq(work);
556 if (!cwq) 2328 if (!gcwq)
557 return 0; 2329 return false;
558
559 lock_map_acquire(&cwq->wq->lockdep_map);
560 lock_map_release(&cwq->wq->lockdep_map);
561 2330
562 prev = NULL; 2331 spin_lock_irq(&gcwq->lock);
563 spin_lock_irq(&cwq->lock);
564 if (!list_empty(&work->entry)) { 2332 if (!list_empty(&work->entry)) {
565 /* 2333 /*
566 * See the comment near try_to_grab_pending()->smp_rmb(). 2334 * See the comment near try_to_grab_pending()->smp_rmb().
567 * If it was re-queued under us we are not going to wait. 2335 * If it was re-queued to a different gcwq under us, we
2336 * are not going to wait.
568 */ 2337 */
569 smp_rmb(); 2338 smp_rmb();
570 if (unlikely(cwq != get_wq_data(work))) 2339 cwq = get_work_cwq(work);
571 goto out; 2340 if (unlikely(!cwq || gcwq != cwq->gcwq))
572 prev = &work->entry; 2341 goto already_gone;
573 } else { 2342 } else if (wait_executing) {
574 if (cwq->current_work != work) 2343 worker = find_worker_executing_work(gcwq, work);
575 goto out; 2344 if (!worker)
576 prev = &cwq->worklist; 2345 goto already_gone;
577 } 2346 cwq = worker->current_cwq;
578 insert_wq_barrier(cwq, &barr, prev->next); 2347 } else
579out: 2348 goto already_gone;
580 spin_unlock_irq(&cwq->lock); 2349
581 if (!prev) 2350 insert_wq_barrier(cwq, barr, work, worker);
582 return 0; 2351 spin_unlock_irq(&gcwq->lock);
583 2352
584 wait_for_completion(&barr.done); 2353 lock_map_acquire(&cwq->wq->lockdep_map);
585 destroy_work_on_stack(&barr.work); 2354 lock_map_release(&cwq->wq->lockdep_map);
586 return 1; 2355 return true;
2356already_gone:
2357 spin_unlock_irq(&gcwq->lock);
2358 return false;
2359}
2360
2361/**
2362 * flush_work - wait for a work to finish executing the last queueing instance
2363 * @work: the work to flush
2364 *
2365 * Wait until @work has finished execution. This function considers
2366 * only the last queueing instance of @work. If @work has been
2367 * enqueued across different CPUs on a non-reentrant workqueue or on
2368 * multiple workqueues, @work might still be executing on return on
2369 * some of the CPUs from earlier queueing.
2370 *
2371 * If @work was queued only on a non-reentrant, ordered or unbound
2372 * workqueue, @work is guaranteed to be idle on return if it hasn't
2373 * been requeued since flush started.
2374 *
2375 * RETURNS:
2376 * %true if flush_work() waited for the work to finish execution,
2377 * %false if it was already idle.
2378 */
2379bool flush_work(struct work_struct *work)
2380{
2381 struct wq_barrier barr;
2382
2383 if (start_flush_work(work, &barr, true)) {
2384 wait_for_completion(&barr.done);
2385 destroy_work_on_stack(&barr.work);
2386 return true;
2387 } else
2388 return false;
587} 2389}
588EXPORT_SYMBOL_GPL(flush_work); 2390EXPORT_SYMBOL_GPL(flush_work);
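A usage sketch under the semantics documented above, with an invented foo_update work item; only the last queueing instance is waited for.

#include <linux/workqueue.h>

static void foo_update_fn(struct work_struct *work)
{
        /* ... recompute cached state ... */
}
static DECLARE_WORK(foo_update, foo_update_fn);

static void foo_wait_for_update(void)
{
        schedule_work(&foo_update);     /* no-op if already pending */
        flush_work(&foo_update);        /* wait for that queueing instance */
}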
589 2391
2392static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2393{
2394 struct wq_barrier barr;
2395 struct worker *worker;
2396
2397 spin_lock_irq(&gcwq->lock);
2398
2399 worker = find_worker_executing_work(gcwq, work);
2400 if (unlikely(worker))
2401 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2402
2403 spin_unlock_irq(&gcwq->lock);
2404
2405 if (unlikely(worker)) {
2406 wait_for_completion(&barr.done);
2407 destroy_work_on_stack(&barr.work);
2408 return true;
2409 } else
2410 return false;
2411}
2412
2413static bool wait_on_work(struct work_struct *work)
2414{
2415 bool ret = false;
2416 int cpu;
2417
2418 might_sleep();
2419
2420 lock_map_acquire(&work->lockdep_map);
2421 lock_map_release(&work->lockdep_map);
2422
2423 for_each_gcwq_cpu(cpu)
2424 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2425 return ret;
2426}
2427
2428/**
2429 * flush_work_sync - wait until a work has finished execution
2430 * @work: the work to flush
2431 *
2432 * Wait until @work has finished execution. On return, it's
2433 * guaranteed that all queueing instances of @work which happened
2434 * before this function is called are finished. In other words, if
2435 * @work hasn't been requeued since this function was called, @work is
2436 * guaranteed to be idle on return.
2437 *
2438 * RETURNS:
2439 * %true if flush_work_sync() waited for the work to finish execution,
2440 * %false if it was already idle.
2441 */
2442bool flush_work_sync(struct work_struct *work)
2443{
2444 struct wq_barrier barr;
2445 bool pending, waited;
2446
2447 /* we'll wait for executions separately, queue barr only if pending */
2448 pending = start_flush_work(work, &barr, false);
2449
2450 /* wait for executions to finish */
2451 waited = wait_on_work(work);
2452
2453 /* wait for the pending one */
2454 if (pending) {
2455 wait_for_completion(&barr.done);
2456 destroy_work_on_stack(&barr.work);
2457 }
2458
2459 return pending || waited;
2460}
2461EXPORT_SYMBOL_GPL(flush_work_sync);
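By contrast, a hedged fragment using the flush_work_sync() added here: it also waits for executions started by earlier queueings on other CPUs, at the cost of scanning every gcwq via wait_on_work().

#include <linux/workqueue.h>

/* Stronger guarantee when @work may have been queued on several CPUs. */
static bool foo_quiesce(struct work_struct *work)
{
        /* waits for the pending instance and any executing instances */
        return flush_work_sync(work);
}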
2462
590/* 2463/*
591 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 2464 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
592 * so this work can't be re-armed in any way. 2465 * so this work can't be re-armed in any way.
593 */ 2466 */
594static int try_to_grab_pending(struct work_struct *work) 2467static int try_to_grab_pending(struct work_struct *work)
595{ 2468{
596 struct cpu_workqueue_struct *cwq; 2469 struct global_cwq *gcwq;
597 int ret = -1; 2470 int ret = -1;
598 2471
599 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) 2472 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
600 return 0; 2473 return 0;
601 2474
602 /* 2475 /*
603 * The queueing is in progress, or it is already queued. Try to 2476 * The queueing is in progress, or it is already queued. Try to
604 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 2477 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
605 */ 2478 */
606 2479 gcwq = get_work_gcwq(work);
607 cwq = get_wq_data(work); 2480 if (!gcwq)
608 if (!cwq)
609 return ret; 2481 return ret;
610 2482
611 spin_lock_irq(&cwq->lock); 2483 spin_lock_irq(&gcwq->lock);
612 if (!list_empty(&work->entry)) { 2484 if (!list_empty(&work->entry)) {
613 /* 2485 /*
614 * This work is queued, but perhaps we locked the wrong cwq. 2486 * This work is queued, but perhaps we locked the wrong gcwq.
615 * In that case we must see the new value after rmb(), see 2487 * In that case we must see the new value after rmb(), see
616 * insert_work()->wmb(). 2488 * insert_work()->wmb().
617 */ 2489 */
618 smp_rmb(); 2490 smp_rmb();
619 if (cwq == get_wq_data(work)) { 2491 if (gcwq == get_work_gcwq(work)) {
620 debug_work_deactivate(work); 2492 debug_work_deactivate(work);
621 list_del_init(&work->entry); 2493 list_del_init(&work->entry);
2494 cwq_dec_nr_in_flight(get_work_cwq(work),
2495 get_work_color(work),
2496 *work_data_bits(work) & WORK_STRUCT_DELAYED);
622 ret = 1; 2497 ret = 1;
623 } 2498 }
624 } 2499 }
625 spin_unlock_irq(&cwq->lock); 2500 spin_unlock_irq(&gcwq->lock);
626 2501
627 return ret; 2502 return ret;
628} 2503}
629 2504
630static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, 2505static bool __cancel_work_timer(struct work_struct *work,
631 struct work_struct *work)
632{
633 struct wq_barrier barr;
634 int running = 0;
635
636 spin_lock_irq(&cwq->lock);
637 if (unlikely(cwq->current_work == work)) {
638 insert_wq_barrier(cwq, &barr, cwq->worklist.next);
639 running = 1;
640 }
641 spin_unlock_irq(&cwq->lock);
642
643 if (unlikely(running)) {
644 wait_for_completion(&barr.done);
645 destroy_work_on_stack(&barr.work);
646 }
647}
648
649static void wait_on_work(struct work_struct *work)
650{
651 struct cpu_workqueue_struct *cwq;
652 struct workqueue_struct *wq;
653 const struct cpumask *cpu_map;
654 int cpu;
655
656 might_sleep();
657
658 lock_map_acquire(&work->lockdep_map);
659 lock_map_release(&work->lockdep_map);
660
661 cwq = get_wq_data(work);
662 if (!cwq)
663 return;
664
665 wq = cwq->wq;
666 cpu_map = wq_cpu_map(wq);
667
668 for_each_cpu(cpu, cpu_map)
669 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
670}
671
672static int __cancel_work_timer(struct work_struct *work,
673 struct timer_list* timer) 2506 struct timer_list* timer)
674{ 2507{
675 int ret; 2508 int ret;
@@ -681,54 +2514,91 @@ static int __cancel_work_timer(struct work_struct *work,
681 wait_on_work(work); 2514 wait_on_work(work);
682 } while (unlikely(ret < 0)); 2515 } while (unlikely(ret < 0));
683 2516
684 clear_wq_data(work); 2517 clear_work_data(work);
685 return ret; 2518 return ret;
686} 2519}
687 2520
688/** 2521/**
689 * cancel_work_sync - block until a work_struct's callback has terminated 2522 * cancel_work_sync - cancel a work and wait for it to finish
690 * @work: the work which is to be flushed 2523 * @work: the work to cancel
691 *
692 * Returns true if @work was pending.
693 * 2524 *
694 * cancel_work_sync() will cancel the work if it is queued. If the work's 2525 * Cancel @work and wait for its execution to finish. This function
695 * callback appears to be running, cancel_work_sync() will block until it 2526 * can be used even if the work re-queues itself or migrates to
696 * has completed. 2527 * another workqueue. On return from this function, @work is
2528 * guaranteed to be not pending or executing on any CPU.
697 * 2529 *
698 * It is possible to use this function if the work re-queues itself. It can 2530 * cancel_work_sync(&delayed_work->work) must not be used for
699 * cancel the work even if it migrates to another workqueue, however in that 2531 * delayed_work's. Use cancel_delayed_work_sync() instead.
700 * case it only guarantees that work->func() has completed on the last queued
701 * workqueue.
702 *
703 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
704 * pending, otherwise it goes into a busy-wait loop until the timer expires.
705 * 2532 *
706 * The caller must ensure that workqueue_struct on which this work was last 2533 * The caller must ensure that the workqueue on which @work was last
707 * queued can't be destroyed before this function returns. 2534 * queued can't be destroyed before this function returns.
2535 *
2536 * RETURNS:
2537 * %true if @work was pending, %false otherwise.
708 */ 2538 */
709int cancel_work_sync(struct work_struct *work) 2539bool cancel_work_sync(struct work_struct *work)
710{ 2540{
711 return __cancel_work_timer(work, NULL); 2541 return __cancel_work_timer(work, NULL);
712} 2542}
713EXPORT_SYMBOL_GPL(cancel_work_sync); 2543EXPORT_SYMBOL_GPL(cancel_work_sync);
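A sketch of the canonical teardown pattern for cancel_work_sync(), with invented foo_* names: the work may re-queue itself, and the call still guarantees it is neither pending nor running afterwards.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static void foo_poll_fn(struct work_struct *work)
{
        /* ... poll hardware, possibly schedule_work(work) again ... */
}
static DECLARE_WORK(foo_poll, foo_poll_fn);

static int __init foo_init(void)
{
        schedule_work(&foo_poll);
        return 0;
}
module_init(foo_init);

static void __exit foo_exit(void)
{
        /* safe even though foo_poll_fn may re-queue itself */
        cancel_work_sync(&foo_poll);
}
module_exit(foo_exit);

MODULE_LICENSE("GPL");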
714 2544
715/** 2545/**
716 * cancel_delayed_work_sync - reliably kill off a delayed work. 2546 * flush_delayed_work - wait for a dwork to finish executing the last queueing
717 * @dwork: the delayed work struct 2547 * @dwork: the delayed work to flush
718 * 2548 *
719 * Returns true if @dwork was pending. 2549 * Delayed timer is cancelled and the pending work is queued for
2550 * immediate execution. Like flush_work(), this function only
2551 * considers the last queueing instance of @dwork.
720 * 2552 *
721 * It is possible to use this function if @dwork rearms itself via queue_work() 2553 * RETURNS:
722 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 2554 * %true if flush_work() waited for the work to finish execution,
2555 * %false if it was already idle.
723 */ 2556 */
724int cancel_delayed_work_sync(struct delayed_work *dwork) 2557bool flush_delayed_work(struct delayed_work *dwork)
2558{
2559 if (del_timer_sync(&dwork->timer))
2560 __queue_work(raw_smp_processor_id(),
2561 get_work_cwq(&dwork->work)->wq, &dwork->work);
2562 return flush_work(&dwork->work);
2563}
2564EXPORT_SYMBOL(flush_delayed_work);
2565
2566/**
2567 * flush_delayed_work_sync - wait for a dwork to finish
2568 * @dwork: the delayed work to flush
2569 *
2570 * Delayed timer is cancelled and the pending work is queued for
2571 * execution immediately. Other than timer handling, its behavior
2572 * is identical to flush_work_sync().
2573 *
2574 * RETURNS:
2575 * %true if flush_work_sync() waited for the work to finish execution,
2576 * %false if it was already idle.
2577 */
2578bool flush_delayed_work_sync(struct delayed_work *dwork)
2579{
2580 if (del_timer_sync(&dwork->timer))
2581 __queue_work(raw_smp_processor_id(),
2582 get_work_cwq(&dwork->work)->wq, &dwork->work);
2583 return flush_work_sync(&dwork->work);
2584}
2585EXPORT_SYMBOL(flush_delayed_work_sync);
2586
2587/**
2588 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2589 * @dwork: the delayed work to cancel
2590 *
2591 * This is cancel_work_sync() for delayed works.
2592 *
2593 * RETURNS:
2594 * %true if @dwork was pending, %false otherwise.
2595 */
2596bool cancel_delayed_work_sync(struct delayed_work *dwork)
725{ 2597{
726 return __cancel_work_timer(&dwork->work, &dwork->timer); 2598 return __cancel_work_timer(&dwork->work, &dwork->timer);
727} 2599}
728EXPORT_SYMBOL(cancel_delayed_work_sync); 2600EXPORT_SYMBOL(cancel_delayed_work_sync);
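A sketch tying the delayed-work helpers above together, using an invented foo_dwork: flush_delayed_work() pulls the timer forward and waits for that instance, while cancel_delayed_work_sync() kills both the timer and any execution.

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void foo_refresh_fn(struct work_struct *work)
{
        /* ... refresh a cache ... */
}
static DECLARE_DELAYED_WORK(foo_dwork, foo_refresh_fn);

static void foo_start(void)
{
        schedule_delayed_work(&foo_dwork, msecs_to_jiffies(500));
}

static void foo_force_refresh(void)
{
        /* cancel the timer, queue the work now, wait for it */
        flush_delayed_work(&foo_dwork);
}

static void foo_stop(void)
{
        /* neither the timer nor the work survives this call */
        cancel_delayed_work_sync(&foo_dwork);
}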
729 2601
730static struct workqueue_struct *keventd_wq __read_mostly;
731
732/** 2602/**
733 * schedule_work - put work task in global workqueue 2603 * schedule_work - put work task in global workqueue
734 * @work: job to be done 2604 * @work: job to be done
@@ -742,7 +2612,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
742 */ 2612 */
743int schedule_work(struct work_struct *work) 2613int schedule_work(struct work_struct *work)
744{ 2614{
745 return queue_work(keventd_wq, work); 2615 return queue_work(system_wq, work);
746} 2616}
747EXPORT_SYMBOL(schedule_work); 2617EXPORT_SYMBOL(schedule_work);
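Illustrative fragment (invented foo_work): with keventd_wq gone, schedule_work() is simply queue_work() on system_wq.

#include <linux/workqueue.h>

static void foo_fn(struct work_struct *work)
{
        /* ... */
}
static DECLARE_WORK(foo_work, foo_fn);

static void foo_kick(void)
{
        /* equivalent to queue_work(system_wq, &foo_work) after this patch */
        schedule_work(&foo_work);
}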
748 2618
@@ -755,7 +2625,7 @@ EXPORT_SYMBOL(schedule_work);
755 */ 2625 */
756int schedule_work_on(int cpu, struct work_struct *work) 2626int schedule_work_on(int cpu, struct work_struct *work)
757{ 2627{
758 return queue_work_on(cpu, keventd_wq, work); 2628 return queue_work_on(cpu, system_wq, work);
759} 2629}
760EXPORT_SYMBOL(schedule_work_on); 2630EXPORT_SYMBOL(schedule_work_on);
761 2631
@@ -770,29 +2640,11 @@ EXPORT_SYMBOL(schedule_work_on);
770int schedule_delayed_work(struct delayed_work *dwork, 2640int schedule_delayed_work(struct delayed_work *dwork,
771 unsigned long delay) 2641 unsigned long delay)
772{ 2642{
773 return queue_delayed_work(keventd_wq, dwork, delay); 2643 return queue_delayed_work(system_wq, dwork, delay);
774} 2644}
775EXPORT_SYMBOL(schedule_delayed_work); 2645EXPORT_SYMBOL(schedule_delayed_work);
776 2646
777/** 2647/**
778 * flush_delayed_work - block until a dwork_struct's callback has terminated
779 * @dwork: the delayed work which is to be flushed
780 *
781 * Any timeout is cancelled, and any pending work is run immediately.
782 */
783void flush_delayed_work(struct delayed_work *dwork)
784{
785 if (del_timer_sync(&dwork->timer)) {
786 struct cpu_workqueue_struct *cwq;
787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
788 __queue_work(cwq, &dwork->work);
789 put_cpu();
790 }
791 flush_work(&dwork->work);
792}
793EXPORT_SYMBOL(flush_delayed_work);
794
795/**
796 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 2648 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
797 * @cpu: cpu to use 2649 * @cpu: cpu to use
798 * @dwork: job to be done 2650 * @dwork: job to be done
@@ -804,24 +2656,25 @@ EXPORT_SYMBOL(flush_delayed_work);
804int schedule_delayed_work_on(int cpu, 2656int schedule_delayed_work_on(int cpu,
805 struct delayed_work *dwork, unsigned long delay) 2657 struct delayed_work *dwork, unsigned long delay)
806{ 2658{
807 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 2659 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
808} 2660}
809EXPORT_SYMBOL(schedule_delayed_work_on); 2661EXPORT_SYMBOL(schedule_delayed_work_on);
810 2662
811/** 2663/**
812 * schedule_on_each_cpu - call a function on each online CPU from keventd 2664 * schedule_on_each_cpu - execute a function synchronously on each online CPU
813 * @func: the function to call 2665 * @func: the function to call
814 * 2666 *
815 * Returns zero on success. 2667 * schedule_on_each_cpu() executes @func on each online CPU using the
816 * Returns -ve errno on failure. 2668 * system workqueue and blocks until all CPUs have completed.
817 *
818 * schedule_on_each_cpu() is very slow. 2669 * schedule_on_each_cpu() is very slow.
2670 *
2671 * RETURNS:
2672 * 0 on success, -errno on failure.
819 */ 2673 */
820int schedule_on_each_cpu(work_func_t func) 2674int schedule_on_each_cpu(work_func_t func)
821{ 2675{
822 int cpu; 2676 int cpu;
823 int orig = -1; 2677 struct work_struct __percpu *works;
824 struct work_struct *works;
825 2678
826 works = alloc_percpu(struct work_struct); 2679 works = alloc_percpu(struct work_struct);
827 if (!works) 2680 if (!works)
@@ -829,23 +2682,12 @@ int schedule_on_each_cpu(work_func_t func)
829 2682
830 get_online_cpus(); 2683 get_online_cpus();
831 2684
832 /*
833 * When running in keventd don't schedule a work item on
834 * itself. Can just call directly because the work queue is
835 * already bound. This also is faster.
836 */
837 if (current_is_keventd())
838 orig = raw_smp_processor_id();
839
840 for_each_online_cpu(cpu) { 2685 for_each_online_cpu(cpu) {
841 struct work_struct *work = per_cpu_ptr(works, cpu); 2686 struct work_struct *work = per_cpu_ptr(works, cpu);
842 2687
843 INIT_WORK(work, func); 2688 INIT_WORK(work, func);
844 if (cpu != orig) 2689 schedule_work_on(cpu, work);
845 schedule_work_on(cpu, work);
846 } 2690 }
847 if (orig >= 0)
848 func(per_cpu_ptr(works, orig));
849 2691
850 for_each_online_cpu(cpu) 2692 for_each_online_cpu(cpu)
851 flush_work(per_cpu_ptr(works, cpu)); 2693 flush_work(per_cpu_ptr(works, cpu));
@@ -881,7 +2723,7 @@ int schedule_on_each_cpu(work_func_t func)
881 */ 2723 */
882void flush_scheduled_work(void) 2724void flush_scheduled_work(void)
883{ 2725{
884 flush_workqueue(keventd_wq); 2726 flush_workqueue(system_wq);
885} 2727}
886EXPORT_SYMBOL(flush_scheduled_work); 2728EXPORT_SYMBOL(flush_scheduled_work);
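For schedule_on_each_cpu() above, a sketch with an invented per-CPU counter: the call queues one work per online CPU and sleeps until all of them have run.

#include <linux/percpu.h>
#include <linux/workqueue.h>

static DEFINE_PER_CPU(unsigned long, foo_hits);

static void foo_reset_fn(struct work_struct *unused)
{
        /* runs on the CPU the work was queued for */
        this_cpu_write(foo_hits, 0);
}

static int foo_reset_all(void)
{
        /* slow path: one work per online CPU, waits for every one */
        return schedule_on_each_cpu(foo_reset_fn);
}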
887 2729
@@ -913,170 +2755,178 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
913 2755
914int keventd_up(void) 2756int keventd_up(void)
915{ 2757{
916 return keventd_wq != NULL; 2758 return system_wq != NULL;
917} 2759}
918 2760
919int current_is_keventd(void) 2761static int alloc_cwqs(struct workqueue_struct *wq)
920{ 2762{
921 struct cpu_workqueue_struct *cwq; 2763 /*
922 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ 2764 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
923 int ret = 0; 2765 * Make sure that the alignment isn't lower than that of
924 2766 * unsigned long long.
925 BUG_ON(!keventd_wq); 2767 */
2768 const size_t size = sizeof(struct cpu_workqueue_struct);
2769 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2770 __alignof__(unsigned long long));
2771#ifdef CONFIG_SMP
2772 bool percpu = !(wq->flags & WQ_UNBOUND);
2773#else
2774 bool percpu = false;
2775#endif
926 2776
927 cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); 2777 if (percpu)
928 if (current == cwq->thread) 2778 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
929 ret = 1; 2779 else {
2780 void *ptr;
930 2781
931 return ret; 2782 /*
2783 * Allocate enough room to align cwq and put an extra
2784 * pointer at the end pointing back to the originally
2785 * allocated pointer which will be used for free.
2786 */
2787 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2788 if (ptr) {
2789 wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2790 *(void **)(wq->cpu_wq.single + 1) = ptr;
2791 }
2792 }
932 2793
2794 /* just in case, make sure it's actually aligned
2795 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2796 */
2797 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2798 return wq->cpu_wq.v ? 0 : -ENOMEM;
933} 2799}
934 2800
935static struct cpu_workqueue_struct * 2801static void free_cwqs(struct workqueue_struct *wq)
936init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
937{ 2802{
938 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 2803#ifdef CONFIG_SMP
939 2804 bool percpu = !(wq->flags & WQ_UNBOUND);
940 cwq->wq = wq; 2805#else
941 spin_lock_init(&cwq->lock); 2806 bool percpu = false;
942 INIT_LIST_HEAD(&cwq->worklist); 2807#endif
943 init_waitqueue_head(&cwq->more_work);
944 2808
945 return cwq; 2809 if (percpu)
2810 free_percpu(wq->cpu_wq.pcpu);
2811 else if (wq->cpu_wq.single) {
2812 /* the pointer to free is stored right after the cwq */
2813 kfree(*(void **)(wq->cpu_wq.single + 1));
2814 }
946} 2815}
947 2816
948static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2817static int wq_clamp_max_active(int max_active, unsigned int flags,
2818 const char *name)
949{ 2819{
950 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 2820 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
951 struct workqueue_struct *wq = cwq->wq;
952 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
953 struct task_struct *p;
954 2821
955 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); 2822 if (max_active < 1 || max_active > lim)
956 /* 2823 printk(KERN_WARNING "workqueue: max_active %d requested for %s "
957 * Nobody can add the work_struct to this cwq, 2824 "is out of range, clamping between %d and %d\n",
958 * if (caller is __create_workqueue) 2825 max_active, name, 1, lim);
959 * nobody should see this wq
960 * else // caller is CPU_UP_PREPARE
961 * cpu is not on cpu_online_map
962 * so we can abort safely.
963 */
964 if (IS_ERR(p))
965 return PTR_ERR(p);
966 if (cwq->wq->rt)
967 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
968 cwq->thread = p;
969 2826
970 trace_workqueue_creation(cwq->thread, cpu); 2827 return clamp_val(max_active, 1, lim);
971
972 return 0;
973} 2828}
974 2829
975static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2830struct workqueue_struct *__alloc_workqueue_key(const char *name,
2831 unsigned int flags,
2832 int max_active,
2833 struct lock_class_key *key,
2834 const char *lock_name)
976{ 2835{
977 struct task_struct *p = cwq->thread; 2836 struct workqueue_struct *wq;
2837 unsigned int cpu;
978 2838
979 if (p != NULL) { 2839 /*
980 if (cpu >= 0) 2840 * Workqueues which may be used during memory reclaim should
981 kthread_bind(p, cpu); 2841 * have a rescuer to guarantee forward progress.
982 wake_up_process(p); 2842 */
983 } 2843 if (flags & WQ_MEM_RECLAIM)
984} 2844 flags |= WQ_RESCUER;
985 2845
986struct workqueue_struct *__create_workqueue_key(const char *name, 2846 /*
987 int singlethread, 2847 * Unbound workqueues aren't concurrency managed and should be
988 int freezeable, 2848 * dispatched to workers immediately.
989 int rt, 2849 */
990 struct lock_class_key *key, 2850 if (flags & WQ_UNBOUND)
991 const char *lock_name) 2851 flags |= WQ_HIGHPRI;
992{ 2852
993 struct workqueue_struct *wq; 2853 max_active = max_active ?: WQ_DFL_ACTIVE;
994 struct cpu_workqueue_struct *cwq; 2854 max_active = wq_clamp_max_active(max_active, flags, name);
995 int err = 0, cpu;
996 2855
997 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 2856 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
998 if (!wq) 2857 if (!wq)
999 return NULL; 2858 goto err;
1000 2859
1001 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 2860 wq->flags = flags;
1002 if (!wq->cpu_wq) { 2861 wq->saved_max_active = max_active;
1003 kfree(wq); 2862 mutex_init(&wq->flush_mutex);
1004 return NULL; 2863 atomic_set(&wq->nr_cwqs_to_flush, 0);
1005 } 2864 INIT_LIST_HEAD(&wq->flusher_queue);
2865 INIT_LIST_HEAD(&wq->flusher_overflow);
1006 2866
1007 wq->name = name; 2867 wq->name = name;
1008 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 2868 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
1009 wq->singlethread = singlethread;
1010 wq->freezeable = freezeable;
1011 wq->rt = rt;
1012 INIT_LIST_HEAD(&wq->list); 2869 INIT_LIST_HEAD(&wq->list);
1013 2870
1014 if (singlethread) { 2871 if (alloc_cwqs(wq) < 0)
1015 cwq = init_cpu_workqueue(wq, singlethread_cpu); 2872 goto err;
1016 err = create_workqueue_thread(cwq, singlethread_cpu); 2873
1017 start_workqueue_thread(cwq, -1); 2874 for_each_cwq_cpu(cpu, wq) {
1018 } else { 2875 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
1019 cpu_maps_update_begin(); 2876 struct global_cwq *gcwq = get_gcwq(cpu);
1020 /* 2877
1021 * We must place this wq on list even if the code below fails. 2878 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
1022 * cpu_down(cpu) can remove cpu from cpu_populated_map before 2879 cwq->gcwq = gcwq;
1023 * destroy_workqueue() takes the lock, in that case we leak 2880 cwq->wq = wq;
1024 * cwq[cpu]->thread. 2881 cwq->flush_color = -1;
1025 */ 2882 cwq->max_active = max_active;
1026 spin_lock(&workqueue_lock); 2883 INIT_LIST_HEAD(&cwq->delayed_works);
1027 list_add(&wq->list, &workqueues);
1028 spin_unlock(&workqueue_lock);
1029 /*
1030 * We must initialize cwqs for each possible cpu even if we
1031 * are going to call destroy_workqueue() finally. Otherwise
1032 * cpu_up() can hit the uninitialized cwq once we drop the
1033 * lock.
1034 */
1035 for_each_possible_cpu(cpu) {
1036 cwq = init_cpu_workqueue(wq, cpu);
1037 if (err || !cpu_online(cpu))
1038 continue;
1039 err = create_workqueue_thread(cwq, cpu);
1040 start_workqueue_thread(cwq, cpu);
1041 }
1042 cpu_maps_update_done();
1043 } 2884 }
1044 2885
1045 if (err) { 2886 if (flags & WQ_RESCUER) {
1046 destroy_workqueue(wq); 2887 struct worker *rescuer;
1047 wq = NULL; 2888
2889 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
2890 goto err;
2891
2892 wq->rescuer = rescuer = alloc_worker();
2893 if (!rescuer)
2894 goto err;
2895
2896 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
2897 if (IS_ERR(rescuer->task))
2898 goto err;
2899
2900 rescuer->task->flags |= PF_THREAD_BOUND;
2901 wake_up_process(rescuer->task);
1048 } 2902 }
1049 return wq;
1050}
1051EXPORT_SYMBOL_GPL(__create_workqueue_key);
1052 2903
1053static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1054{
1055 /* 2904 /*
1056 * Our caller is either destroy_workqueue() or CPU_POST_DEAD, 2905 * workqueue_lock protects global freeze state and workqueues
1057 * cpu_add_remove_lock protects cwq->thread. 2906 * list. Grab it, set max_active accordingly and add the new
2907 * workqueue to workqueues list.
1058 */ 2908 */
1059 if (cwq->thread == NULL) 2909 spin_lock(&workqueue_lock);
1060 return;
1061 2910
1062 lock_map_acquire(&cwq->wq->lockdep_map); 2911 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
1063 lock_map_release(&cwq->wq->lockdep_map); 2912 for_each_cwq_cpu(cpu, wq)
2913 get_cwq(cpu, wq)->max_active = 0;
1064 2914
1065 flush_cpu_workqueue(cwq); 2915 list_add(&wq->list, &workqueues);
1066 /* 2916
1067 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, 2917 spin_unlock(&workqueue_lock);
1068 * a concurrent flush_workqueue() can insert a barrier after us. 2918
1069 * However, in that case run_workqueue() won't return and check 2919 return wq;
1070 * kthread_should_stop() until it flushes all work_struct's. 2920err:
1071 * When ->worklist becomes empty it is safe to exit because no 2921 if (wq) {
1072 * more work_structs can be queued on this cwq: flush_workqueue 2922 free_cwqs(wq);
1073 * checks list_empty(), and a "normal" queue_work() can't use 2923 free_mayday_mask(wq->mayday_mask);
1074 * a dead CPU. 2924 kfree(wq->rescuer);
1075 */ 2925 kfree(wq);
1076 trace_workqueue_destruction(cwq->thread); 2926 }
1077 kthread_stop(cwq->thread); 2927 return NULL;
1078 cwq->thread = NULL;
1079} 2928}
2929EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
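A sketch of the new allocation interface (flags and names chosen for illustration): callers normally reach __alloc_workqueue_key() through the alloc_workqueue() wrapper and pair it with destroy_workqueue().

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *foo_wq;

static int foo_create_wq(void)
{
        /*
         * WQ_UNBOUND: not bound to any particular CPU; max_active of 4
         * limits the number of in-flight works.  Passing 0 would select
         * WQ_DFL_ACTIVE, and out-of-range values are clamped by
         * wq_clamp_max_active() above.
         */
        foo_wq = alloc_workqueue("foo", WQ_UNBOUND, 4);
        if (!foo_wq)
                return -ENOMEM;
        return 0;
}

static void foo_destroy_wq(void)
{
        /* flushes the workqueue, then frees cwqs and any rescuer */
        destroy_workqueue(foo_wq);
}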
1080 2930
1081/** 2931/**
1082 * destroy_workqueue - safely terminate a workqueue 2932 * destroy_workqueue - safely terminate a workqueue
@@ -1086,72 +2936,520 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1086 */ 2936 */
1087void destroy_workqueue(struct workqueue_struct *wq) 2937void destroy_workqueue(struct workqueue_struct *wq)
1088{ 2938{
1089 const struct cpumask *cpu_map = wq_cpu_map(wq); 2939 unsigned int cpu;
1090 int cpu; 2940
2941 wq->flags |= WQ_DYING;
2942 flush_workqueue(wq);
1091 2943
1092 cpu_maps_update_begin(); 2944 /*
2945 * wq list is used to freeze wq, remove from list after
2946 * flushing is complete in case freeze races us.
2947 */
1093 spin_lock(&workqueue_lock); 2948 spin_lock(&workqueue_lock);
1094 list_del(&wq->list); 2949 list_del(&wq->list);
1095 spin_unlock(&workqueue_lock); 2950 spin_unlock(&workqueue_lock);
1096 2951
1097 for_each_cpu(cpu, cpu_map) 2952 /* sanity check */
1098 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 2953 for_each_cwq_cpu(cpu, wq) {
1099 cpu_maps_update_done(); 2954 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2955 int i;
2956
2957 for (i = 0; i < WORK_NR_COLORS; i++)
2958 BUG_ON(cwq->nr_in_flight[i]);
2959 BUG_ON(cwq->nr_active);
2960 BUG_ON(!list_empty(&cwq->delayed_works));
2961 }
2962
2963 if (wq->flags & WQ_RESCUER) {
2964 kthread_stop(wq->rescuer->task);
2965 free_mayday_mask(wq->mayday_mask);
2966 kfree(wq->rescuer);
2967 }
1100 2968
1101 free_percpu(wq->cpu_wq); 2969 free_cwqs(wq);
1102 kfree(wq); 2970 kfree(wq);
1103} 2971}
1104EXPORT_SYMBOL_GPL(destroy_workqueue); 2972EXPORT_SYMBOL_GPL(destroy_workqueue);
1105 2973
2974/**
2975 * workqueue_set_max_active - adjust max_active of a workqueue
2976 * @wq: target workqueue
2977 * @max_active: new max_active value.
2978 *
2979 * Set max_active of @wq to @max_active.
2980 *
2981 * CONTEXT:
2982 * Don't call from IRQ context.
2983 */
2984void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2985{
2986 unsigned int cpu;
2987
2988 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
2989
2990 spin_lock(&workqueue_lock);
2991
2992 wq->saved_max_active = max_active;
2993
2994 for_each_cwq_cpu(cpu, wq) {
2995 struct global_cwq *gcwq = get_gcwq(cpu);
2996
2997 spin_lock_irq(&gcwq->lock);
2998
2999 if (!(wq->flags & WQ_FREEZEABLE) ||
3000 !(gcwq->flags & GCWQ_FREEZING))
3001 get_cwq(gcwq->cpu, wq)->max_active = max_active;
3002
3003 spin_unlock_irq(&gcwq->lock);
3004 }
3005
3006 spin_unlock(&workqueue_lock);
3007}
3008EXPORT_SYMBOL_GPL(workqueue_set_max_active);
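A short tuning fragment (hypothetical policy): throttle a workqueue while a device is degraded and restore the default afterwards; the value is clamped exactly as in wq_clamp_max_active().

#include <linux/workqueue.h>

static void foo_throttle(struct workqueue_struct *wq, bool slow)
{
        workqueue_set_max_active(wq, slow ? 1 : WQ_DFL_ACTIVE);
}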
3009
3010/**
3011 * workqueue_congested - test whether a workqueue is congested
3012 * @cpu: CPU in question
3013 * @wq: target workqueue
3014 *
3015 * Test whether @wq's cpu workqueue for @cpu is congested. There is
3016 * no synchronization around this function and the test result is
3017 * unreliable and only useful as advisory hints or for debugging.
3018 *
3019 * RETURNS:
3020 * %true if congested, %false otherwise.
3021 */
3022bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3023{
3024 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3025
3026 return !list_empty(&cwq->delayed_works);
3027}
3028EXPORT_SYMBOL_GPL(workqueue_congested);
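Since the result is advisory only, a hypothetical producer might use workqueue_congested() to shed optional work, never for correctness:

#include <linux/smp.h>
#include <linux/workqueue.h>

/* Drop an optional, best-effort item if the local cwq is backed up. */
static bool foo_queue_optional(struct workqueue_struct *wq,
                               struct work_struct *work)
{
        /* raw id is fine: the answer is a racy hint either way */
        if (workqueue_congested(raw_smp_processor_id(), wq))
                return false;
        return queue_work(wq, work);
}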
3029
3030/**
3031 * work_cpu - return the last known associated cpu for @work
3032 * @work: the work of interest
3033 *
3034 * RETURNS:
3035 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
3036 */
3037unsigned int work_cpu(struct work_struct *work)
3038{
3039 struct global_cwq *gcwq = get_work_gcwq(work);
3040
3041 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3042}
3043EXPORT_SYMBOL_GPL(work_cpu);
3044
3045/**
3046 * work_busy - test whether a work is currently pending or running
3047 * @work: the work to be tested
3048 *
3049 * Test whether @work is currently pending or running. There is no
3050 * synchronization around this function and the test result is
3051 * unreliable and only useful as advisory hints or for debugging.
3052 * Especially for reentrant wqs, the pending state might hide the
3053 * running state.
3054 *
3055 * RETURNS:
3056 * OR'd bitmask of WORK_BUSY_* bits.
3057 */
3058unsigned int work_busy(struct work_struct *work)
3059{
3060 struct global_cwq *gcwq = get_work_gcwq(work);
3061 unsigned long flags;
3062 unsigned int ret = 0;
3063
3064 if (!gcwq)
3065 return false;
3066
3067 spin_lock_irqsave(&gcwq->lock, flags);
3068
3069 if (work_pending(work))
3070 ret |= WORK_BUSY_PENDING;
3071 if (find_worker_executing_work(gcwq, work))
3072 ret |= WORK_BUSY_RUNNING;
3073
3074 spin_unlock_irqrestore(&gcwq->lock, flags);
3075
3076 return ret;
3077}
3078EXPORT_SYMBOL_GPL(work_busy);
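A debugging-oriented sketch for work_busy() and work_cpu() above (invented names); since the result is unreliable by design, it is only reported.

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void foo_report(struct work_struct *work)
{
        unsigned int busy = work_busy(work);

        pr_info("foo work: last cpu=%u%s%s\n", work_cpu(work),
                (busy & WORK_BUSY_PENDING) ? " pending" : "",
                (busy & WORK_BUSY_RUNNING) ? " running" : "");
}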
3079
3080/*
3081 * CPU hotplug.
3082 *
3083 * There are two challenges in supporting CPU hotplug. Firstly, there
3084 * are a lot of assumptions on strong associations among work, cwq and
3085 * gcwq which make migrating pending and scheduled works very
3086 * difficult to implement without impacting hot paths. Secondly,
3087 * gcwqs serve a mix of short, long and very long running works making
3088 * blocked draining impractical.
3089 *
3090 * This is solved by allowing a gcwq to be detached from CPU, running
3091 * it with unbound (rogue) workers and allowing it to be reattached
3092 * later if the cpu comes back online. A separate thread is created
3093 * to govern a gcwq in such state and is called the trustee of the
3094 * gcwq.
3095 *
3096 * Trustee states and their descriptions.
3097 *
3098 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3099 * new trustee is started with this state.
3100 *
3101 * IN_CHARGE Once started, trustee will enter this state after
3102 * assuming the manager role and making all existing
3103 * workers rogue. DOWN_PREPARE waits for trustee to
3104 * enter this state. After reaching IN_CHARGE, trustee
3105 * tries to execute the pending worklist until it's empty
3106 * and the state is set to BUTCHER, or the state is set
3107 * to RELEASE.
3108 *
3109 * BUTCHER Command state which is set by the cpu callback after
3110 * the cpu has gone down. Once this state is set, the trustee
3111 * knows that there will be no new works on the worklist
3112 * and once the worklist is empty it can proceed to
3113 * killing idle workers.
3114 *
3115 * RELEASE Command state which is set by the cpu callback if the
3116 * cpu down has been canceled or it has come online
3117 * again. After recognizing this state, trustee stops
3118 * trying to drain or butcher and clears ROGUE, rebinds
3119 * all remaining workers back to the cpu and releases
3120 * manager role.
3121 *
3122 * DONE Trustee will enter this state after BUTCHER or RELEASE
3123 * is complete.
3124 *
3125 * trustee CPU draining
3126 * took over down complete
3127 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3128 * | | ^
3129 * | CPU is back online v return workers |
3130 * ----------------> RELEASE --------------
3131 */
3132
3133/**
3134 * trustee_wait_event_timeout - timed event wait for trustee
3135 * @cond: condition to wait for
3136 * @timeout: timeout in jiffies
3137 *
3138 * wait_event_timeout() for trustee to use. Handles locking and
3139 * checks for RELEASE request.
3140 *
3141 * CONTEXT:
3142 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3143 * multiple times. To be used by trustee.
3144 *
3145 * RETURNS:
3146 * Positive indicating left time if @cond is satisfied, 0 if timed
3147 * out, -1 if canceled.
3148 */
3149#define trustee_wait_event_timeout(cond, timeout) ({ \
3150 long __ret = (timeout); \
3151 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3152 __ret) { \
3153 spin_unlock_irq(&gcwq->lock); \
3154 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3155 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3156 __ret); \
3157 spin_lock_irq(&gcwq->lock); \
3158 } \
3159 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3160})
3161
3162/**
3163 * trustee_wait_event - event wait for trustee
3164 * @cond: condition to wait for
3165 *
3166 * wait_event() for trustee to use. Automatically handles locking and
3167 * checks for CANCEL request.
3168 *
3169 * CONTEXT:
3170 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3171 * multiple times. To be used by trustee.
3172 *
3173 * RETURNS:
3174 * 0 if @cond is satisfied, -1 if canceled.
3175 */
3176#define trustee_wait_event(cond) ({ \
3177 long __ret1; \
3178 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3179 __ret1 < 0 ? -1 : 0; \
3180})
3181
3182static int __cpuinit trustee_thread(void *__gcwq)
3183{
3184 struct global_cwq *gcwq = __gcwq;
3185 struct worker *worker;
3186 struct work_struct *work;
3187 struct hlist_node *pos;
3188 long rc;
3189 int i;
3190
3191 BUG_ON(gcwq->cpu != smp_processor_id());
3192
3193 spin_lock_irq(&gcwq->lock);
3194 /*
3195 * Claim the manager position and make all workers rogue.
3196 * Trustee must be bound to the target cpu and can't be
3197 * cancelled.
3198 */
3199 BUG_ON(gcwq->cpu != smp_processor_id());
3200 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3201 BUG_ON(rc < 0);
3202
3203 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3204
3205 list_for_each_entry(worker, &gcwq->idle_list, entry)
3206 worker->flags |= WORKER_ROGUE;
3207
3208 for_each_busy_worker(worker, i, pos, gcwq)
3209 worker->flags |= WORKER_ROGUE;
3210
3211 /*
3212 * Call schedule() so that we cross rq->lock and thus can
3213 * guarantee sched callbacks see the rogue flag. This is
3214 * necessary as scheduler callbacks may be invoked from other
3215 * cpus.
3216 */
3217 spin_unlock_irq(&gcwq->lock);
3218 schedule();
3219 spin_lock_irq(&gcwq->lock);
3220
3221 /*
3222 * Sched callbacks are disabled now. Zap nr_running. After
3223 * this, nr_running stays zero and need_more_worker() and
3224 * keep_working() are always true as long as the worklist is
3225 * not empty.
3226 */
3227 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3228
3229 spin_unlock_irq(&gcwq->lock);
3230 del_timer_sync(&gcwq->idle_timer);
3231 spin_lock_irq(&gcwq->lock);
3232
3233 /*
3234 * We're now in charge. Notify and proceed to drain. We need
3235 * to keep the gcwq running during the whole CPU down
3236 * procedure as other cpu hotunplug callbacks may need to
3237 * flush currently running tasks.
3238 */
3239 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3240 wake_up_all(&gcwq->trustee_wait);
3241
3242 /*
3243 * The original cpu is in the process of dying and may go away
3244 * anytime now. When that happens, we and all workers would
3245 * be migrated to other cpus. Try draining any left work. We
3246 * want to get it over with ASAP - spam rescuers, wake up as
3247 * many idlers as necessary and create new ones till the
3248 * worklist is empty. Note that if the gcwq is frozen, there
3249 * may be frozen works in freezeable cwqs. Don't declare
3250 * completion while frozen.
3251 */
3252 while (gcwq->nr_workers != gcwq->nr_idle ||
3253 gcwq->flags & GCWQ_FREEZING ||
3254 gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3255 int nr_works = 0;
3256
3257 list_for_each_entry(work, &gcwq->worklist, entry) {
3258 send_mayday(work);
3259 nr_works++;
3260 }
3261
3262 list_for_each_entry(worker, &gcwq->idle_list, entry) {
3263 if (!nr_works--)
3264 break;
3265 wake_up_process(worker->task);
3266 }
3267
3268 if (need_to_create_worker(gcwq)) {
3269 spin_unlock_irq(&gcwq->lock);
3270 worker = create_worker(gcwq, false);
3271 spin_lock_irq(&gcwq->lock);
3272 if (worker) {
3273 worker->flags |= WORKER_ROGUE;
3274 start_worker(worker);
3275 }
3276 }
3277
3278 /* give a breather */
3279 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3280 break;
3281 }
3282
3283 /*
3284 * Either all works have been scheduled and cpu is down, or
3285 * cpu down has already been canceled. Wait for and butcher
3286 * all workers till we're canceled.
3287 */
3288 do {
3289 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3290 while (!list_empty(&gcwq->idle_list))
3291 destroy_worker(list_first_entry(&gcwq->idle_list,
3292 struct worker, entry));
3293 } while (gcwq->nr_workers && rc >= 0);
3294
3295 /*
3296 * At this point, either draining has completed and no worker
3297 * is left, or cpu down has been canceled or the cpu is being
3298 * brought back up. There shouldn't be any idle one left.
3299 * Tell the remaining busy ones to rebind once they finish the
3300 * currently scheduled works by scheduling the rebind_work.
3301 */
3302 WARN_ON(!list_empty(&gcwq->idle_list));
3303
3304 for_each_busy_worker(worker, i, pos, gcwq) {
3305 struct work_struct *rebind_work = &worker->rebind_work;
3306
3307 /*
3308 * Rebind_work may race with future cpu hotplug
3309 * operations. Use a separate flag to mark that
3310 * rebinding is scheduled.
3311 */
3312 worker->flags |= WORKER_REBIND;
3313 worker->flags &= ~WORKER_ROGUE;
3314
3315 /* queue rebind_work, wq doesn't matter, use the default one */
3316 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3317 work_data_bits(rebind_work)))
3318 continue;
3319
3320 debug_work_activate(rebind_work);
3321 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3322 worker->scheduled.next,
3323 work_color_to_flags(WORK_NO_COLOR));
3324 }
3325
3326 /* relinquish manager role */
3327 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3328
3329 /* notify completion */
3330 gcwq->trustee = NULL;
3331 gcwq->trustee_state = TRUSTEE_DONE;
3332 wake_up_all(&gcwq->trustee_wait);
3333 spin_unlock_irq(&gcwq->lock);
3334 return 0;
3335}
3336
3337/**
3338 * wait_trustee_state - wait for trustee to enter the specified state
3339 * @gcwq: gcwq the trustee of interest belongs to
3340 * @state: target state to wait for
3341 *
3342 * Wait for the trustee to reach @state. DONE is already matched.
3343 *
3344 * CONTEXT:
3345 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3346 * multiple times. To be used by cpu_callback.
3347 */
3348static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3349__releases(&gcwq->lock)
3350__acquires(&gcwq->lock)
3351{
3352 if (!(gcwq->trustee_state == state ||
3353 gcwq->trustee_state == TRUSTEE_DONE)) {
3354 spin_unlock_irq(&gcwq->lock);
3355 __wait_event(gcwq->trustee_wait,
3356 gcwq->trustee_state == state ||
3357 gcwq->trustee_state == TRUSTEE_DONE);
3358 spin_lock_irq(&gcwq->lock);
3359 }
3360}
3361
1106static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1107						unsigned long action,
1108						void *hcpu)
1109{
1110	unsigned int cpu = (unsigned long)hcpu;
1111	struct cpu_workqueue_struct *cwq;
1112	struct workqueue_struct *wq;
1113	int err = 0;
1114
1115	action &= ~CPU_TASKS_FROZEN;
1116
1117	switch (action) {
1118	case CPU_UP_PREPARE:
1119		cpumask_set_cpu(cpu, cpu_populated_map);
1120	}
1121undo:
1122	list_for_each_entry(wq, &workqueues, list) {
1123		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
1124
1125		switch (action) {
1126		case CPU_UP_PREPARE:
1127			err = create_workqueue_thread(cwq, cpu);
1128			if (!err)
1129				break;
1130			printk(KERN_ERR "workqueue [%s] for %i failed\n",
1131				wq->name, cpu);
1132			action = CPU_UP_CANCELED;
1133			err = -ENOMEM;
1134			goto undo;
1135
1136		case CPU_ONLINE:
1137			start_workqueue_thread(cwq, cpu);
1138			break;
1139
1140		case CPU_UP_CANCELED:
1141			start_workqueue_thread(cwq, -1);
1142		case CPU_POST_DEAD:
1143			cleanup_workqueue_thread(cwq);
1144			break;
1145		}
1146	}
1147
1148	switch (action) {
1149	case CPU_UP_CANCELED:
1150	case CPU_POST_DEAD:
1151		cpumask_clear_cpu(cpu, cpu_populated_map);
1152	}
1153
1154	return notifier_from_errno(err);
1155}
1156
1157#ifdef CONFIG_SMP
3362static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3363						unsigned long action,
3364						void *hcpu)
3365{
3366	unsigned int cpu = (unsigned long)hcpu;
3367	struct global_cwq *gcwq = get_gcwq(cpu);
3368	struct task_struct *new_trustee = NULL;
3369	struct worker *uninitialized_var(new_worker);
3370	unsigned long flags;
3371
3372	action &= ~CPU_TASKS_FROZEN;
3373
3374	switch (action) {
3375	case CPU_DOWN_PREPARE:
3376		new_trustee = kthread_create(trustee_thread, gcwq,
3377					     "workqueue_trustee/%d\n", cpu);
3378		if (IS_ERR(new_trustee))
3379			return notifier_from_errno(PTR_ERR(new_trustee));
3380		kthread_bind(new_trustee, cpu);
3381		/* fall through */
3382	case CPU_UP_PREPARE:
3383		BUG_ON(gcwq->first_idle);
3384		new_worker = create_worker(gcwq, false);
3385		if (!new_worker) {
3386			if (new_trustee)
3387				kthread_stop(new_trustee);
3388			return NOTIFY_BAD;
3389		}
3390	}
3391
3392	/* some are called w/ irq disabled, don't disturb irq status */
3393	spin_lock_irqsave(&gcwq->lock, flags);
3394
3395	switch (action) {
3396	case CPU_DOWN_PREPARE:
3397		/* initialize trustee and tell it to acquire the gcwq */
3398		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3399		gcwq->trustee = new_trustee;
3400		gcwq->trustee_state = TRUSTEE_START;
3401		wake_up_process(gcwq->trustee);
3402		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3403		/* fall through */
3404	case CPU_UP_PREPARE:
3405		BUG_ON(gcwq->first_idle);
3406		gcwq->first_idle = new_worker;
3407		break;
3408
3409	case CPU_DYING:
3410		/*
3411		 * Before this, the trustee and all workers except for
3412		 * the ones which are still executing works from
3413		 * before the last CPU down must be on the cpu.  After
3414		 * this, they'll all be diasporas.
3415		 */
3416		gcwq->flags |= GCWQ_DISASSOCIATED;
3417		break;
3418
3419	case CPU_POST_DEAD:
3420		gcwq->trustee_state = TRUSTEE_BUTCHER;
3421		/* fall through */
3422	case CPU_UP_CANCELED:
3423		destroy_worker(gcwq->first_idle);
3424		gcwq->first_idle = NULL;
3425		break;
3426
3427	case CPU_DOWN_FAILED:
3428	case CPU_ONLINE:
3429		gcwq->flags &= ~GCWQ_DISASSOCIATED;
3430		if (gcwq->trustee_state != TRUSTEE_DONE) {
3431			gcwq->trustee_state = TRUSTEE_RELEASE;
3432			wake_up_process(gcwq->trustee);
3433			wait_trustee_state(gcwq, TRUSTEE_DONE);
3434		}
3435
3436		/*
3437		 * Trustee is done and there might be no worker left.
3438		 * Put the first_idle in and request a real manager to
3439		 * take a look.
3440		 */
3441		spin_unlock_irq(&gcwq->lock);
3442		kthread_bind(gcwq->first_idle->task, cpu);
3443		spin_lock_irq(&gcwq->lock);
3444		gcwq->flags |= GCWQ_MANAGE_WORKERS;
3445		start_worker(gcwq->first_idle);
3446		gcwq->first_idle = NULL;
3447		break;
3448	}
3449
3450	spin_unlock_irqrestore(&gcwq->lock, flags);
3451
3452	return notifier_from_errno(0);
3453}
3454
3455#ifdef CONFIG_SMP
@@ -1201,14 +3499,200 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
1201EXPORT_SYMBOL_GPL(work_on_cpu);
1202#endif /* CONFIG_SMP */
1203
1204void __init init_workqueues(void)
1205{
1206	alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
1207
1208	cpumask_copy(cpu_populated_map, cpu_online_mask);
1209	singlethread_cpu = cpumask_first(cpu_possible_mask);
1210	cpu_singlethread_map = cpumask_of(singlethread_cpu);
1211	hotcpu_notifier(workqueue_cpu_callback, 0);
1212	keventd_wq = create_workqueue("events");
1213	BUG_ON(!keventd_wq);
1214}
3499EXPORT_SYMBOL_GPL(work_on_cpu);
3500#endif /* CONFIG_SMP */
3501
3502#ifdef CONFIG_FREEZER
3503
3504/**
3505 * freeze_workqueues_begin - begin freezing workqueues
3506 *
3507 * Start freezing workqueues. After this function returns, all
3508 * freezeable workqueues will queue new works to their delayed_works
3509 * list instead of gcwq->worklist.
3510 *
3511 * CONTEXT:
3512 * Grabs and releases workqueue_lock and gcwq->lock's.
3513 */
3514void freeze_workqueues_begin(void)
3515{
3516 unsigned int cpu;
3517
3518 spin_lock(&workqueue_lock);
3519
3520 BUG_ON(workqueue_freezing);
3521 workqueue_freezing = true;
3522
3523 for_each_gcwq_cpu(cpu) {
3524 struct global_cwq *gcwq = get_gcwq(cpu);
3525 struct workqueue_struct *wq;
3526
3527 spin_lock_irq(&gcwq->lock);
3528
3529 BUG_ON(gcwq->flags & GCWQ_FREEZING);
3530 gcwq->flags |= GCWQ_FREEZING;
3531
3532 list_for_each_entry(wq, &workqueues, list) {
3533 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3534
3535 if (cwq && wq->flags & WQ_FREEZEABLE)
3536 cwq->max_active = 0;
3537 }
3538
3539 spin_unlock_irq(&gcwq->lock);
3540 }
3541
3542 spin_unlock(&workqueue_lock);
3543}
3544
3545/**
3546 * freeze_workqueues_busy - are freezeable workqueues still busy?
3547 *
3548 * Check whether freezing is complete. This function must be called
3549 * between freeze_workqueues_begin() and thaw_workqueues().
3550 *
3551 * CONTEXT:
3552 * Grabs and releases workqueue_lock.
3553 *
3554 * RETURNS:
3555 * %true if some freezeable workqueues are still busy. %false if
3556 * freezing is complete.
3557 */
3558bool freeze_workqueues_busy(void)
3559{
3560 unsigned int cpu;
3561 bool busy = false;
3562
3563 spin_lock(&workqueue_lock);
3564
3565 BUG_ON(!workqueue_freezing);
3566
3567 for_each_gcwq_cpu(cpu) {
3568 struct workqueue_struct *wq;
3569 /*
3570 * nr_active is monotonically decreasing. It's safe
3571 * to peek without lock.
3572 */
3573 list_for_each_entry(wq, &workqueues, list) {
3574 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3575
3576 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3577 continue;
3578
3579 BUG_ON(cwq->nr_active < 0);
3580 if (cwq->nr_active) {
3581 busy = true;
3582 goto out_unlock;
3583 }
3584 }
3585 }
3586out_unlock:
3587 spin_unlock(&workqueue_lock);
3588 return busy;
3589}
3590
3591/**
3592 * thaw_workqueues - thaw workqueues
3593 *
3594 * Thaw workqueues. Normal queueing is restored and all collected
3595 * frozen works are transferred to their respective gcwq worklists.
3596 *
3597 * CONTEXT:
3598 * Grabs and releases workqueue_lock and gcwq->lock's.
3599 */
3600void thaw_workqueues(void)
3601{
3602	unsigned int cpu;
3603
3604	spin_lock(&workqueue_lock);
3605
3606	if (!workqueue_freezing)
3607		goto out_unlock;
3608
3609	for_each_gcwq_cpu(cpu) {
3610 struct global_cwq *gcwq = get_gcwq(cpu);
3611 struct workqueue_struct *wq;
3612
3613 spin_lock_irq(&gcwq->lock);
3614
3615 BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3616 gcwq->flags &= ~GCWQ_FREEZING;
3617
3618 list_for_each_entry(wq, &workqueues, list) {
3619 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3620
3621 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3622 continue;
3623
3624 /* restore max_active and repopulate worklist */
3625 cwq->max_active = wq->saved_max_active;
3626
3627 while (!list_empty(&cwq->delayed_works) &&
3628 cwq->nr_active < cwq->max_active)
3629 cwq_activate_first_delayed(cwq);
3630 }
3631
3632 wake_up_worker(gcwq);
3633
3634 spin_unlock_irq(&gcwq->lock);
3635 }
3636
3637 workqueue_freezing = false;
3638out_unlock:
3639 spin_unlock(&workqueue_lock);
3640}
3641#endif /* CONFIG_FREEZER */
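The three entry points above are designed to be driven as a begin / poll / thaw sequence by the freezer. A minimal sketch of such a caller, assuming kernel context; the function name, retry budget and message text are made up for illustration and this is not the actual kernel/power code:

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/delay.h>

static int demo_freeze_workqueues(void)
{
	int tries = 100;

	freeze_workqueues_begin();

	/* nr_active only drains; poll until every freezeable cwq is idle */
	while (freeze_workqueues_busy()) {
		if (!--tries) {
			pr_err("workqueues did not freeze in time\n");
			thaw_workqueues();	/* roll back the partial freeze */
			return -EBUSY;
		}
		msleep(10);
	}
	return 0;
}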
3642
3643static int __init init_workqueues(void)
3644{
3645 unsigned int cpu;
3646 int i;
3647
3648 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
3649
3650 /* initialize gcwqs */
3651 for_each_gcwq_cpu(cpu) {
3652 struct global_cwq *gcwq = get_gcwq(cpu);
3653
3654 spin_lock_init(&gcwq->lock);
3655 INIT_LIST_HEAD(&gcwq->worklist);
3656 gcwq->cpu = cpu;
3657 gcwq->flags |= GCWQ_DISASSOCIATED;
3658
3659 INIT_LIST_HEAD(&gcwq->idle_list);
3660 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3661 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3662
3663 init_timer_deferrable(&gcwq->idle_timer);
3664 gcwq->idle_timer.function = idle_worker_timeout;
3665 gcwq->idle_timer.data = (unsigned long)gcwq;
3666
3667 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3668 (unsigned long)gcwq);
3669
3670 ida_init(&gcwq->worker_ida);
3671
3672 gcwq->trustee_state = TRUSTEE_DONE;
3673 init_waitqueue_head(&gcwq->trustee_wait);
3674 }
3675
3676 /* create the initial worker */
3677 for_each_online_gcwq_cpu(cpu) {
3678 struct global_cwq *gcwq = get_gcwq(cpu);
3679 struct worker *worker;
3680
3681 if (cpu != WORK_CPU_UNBOUND)
3682 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3683 worker = create_worker(gcwq, true);
3684 BUG_ON(!worker);
3685 spin_lock_irq(&gcwq->lock);
3686 start_worker(worker);
3687 spin_unlock_irq(&gcwq->lock);
3688 }
3689
3690 system_wq = alloc_workqueue("events", 0, 0);
3691 system_long_wq = alloc_workqueue("events_long", 0, 0);
3692 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3693 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3694 WQ_UNBOUND_MAX_ACTIVE);
3695	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || !system_unbound_wq);
3696 return 0;
3697}
3698early_initcall(init_workqueues);
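For context, a minimal sketch of how other kernel code consumes the system workqueues created above; the work item and function names here are hypothetical:

#include <linux/kernel.h>
#include <linux/workqueue.h>

/* hypothetical deferred handler; runs in worker (process) context */
static void demo_work_fn(struct work_struct *work)
{
	pr_info("demo work ran\n");
}

static DECLARE_WORK(demo_work, demo_work_fn);

static void demo_kick_work(void)
{
	/*
	 * schedule_work() queues on system_wq, the "events" queue created
	 * in init_workqueues() above.  Work that may sleep for long periods
	 * could instead be queued with queue_work(system_unbound_wq, ...).
	 */
	schedule_work(&demo_work);
}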
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..2d10fc98dc79
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,9 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);
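These two hooks are meant to be called by the scheduler when a workqueue worker blocks or wakes up, so the per-gcwq count of running workers stays accurate. A simplified sketch of such call sites; the demo_* wrappers are invented, PF_WQ_WORKER is the task flag this series introduces, and the real sched.c integration wakes the returned task on the local CPU rather than via a plain wake_up_process():

#include <linux/sched.h>
#include "workqueue_sched.h"

/* called, conceptually, when @prev is about to go to sleep on @cpu */
static void demo_worker_sleeping(struct task_struct *prev, unsigned int cpu)
{
	struct task_struct *to_wakeup;

	if (!(prev->flags & PF_WQ_WORKER))
		return;

	/* ask the workqueue code whether another worker should take over */
	to_wakeup = wq_worker_sleeping(prev, cpu);
	if (to_wakeup)
		wake_up_process(to_wakeup);	/* simplified wake-up path */
}

/* called, conceptually, when @p is woken back up on @cpu */
static void demo_worker_waking(struct task_struct *p, unsigned int cpu)
{
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu);
}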